인공지능 공부/NLP 연구

(NLP 연구) The Long-Document Transformer 03.28

LSH (Locality-Sensitive Hashing) 구현: shingling → one-hot 인코딩 → MinHash 서명

def shingle(text: str, k: int):
    """Return the set of all length-``k`` substrings (shingles) of ``text``.

    A window of width ``k`` slides over ``text`` one character at a time.
    Repeated shingles collapse because the result is a set. When ``text`` is
    shorter than ``k`` there is no complete window and the result is empty.
    """
    window_starts = range(len(text) - k + 1)
    return {text[start:start + k] for start in window_starts}

# Shingle width (characters per shingle), shared by all three documents.
k = 2

# NOTE(review): a, b and c are not defined in the visible part of this file;
# presumably they hold the raw input strings -- confirm they are set earlier.
# Each name is rebound from the raw text to its set of k-shingles.
a = shingle(a, k)
b = shingle(b, k)
c = shingle(c, k)  # same k as a and b so the three shingle sets are comparable

# Shared vocabulary: every shingle that occurs in at least one document.
# The list order comes from set iteration, so it is arbitrary but fixed for
# the rest of the script; every one-hot vector below is aligned to it.
vocab = list(a.union(b).union(c))

# One-hot encode each document over the vocabulary: 1 where the shingle at
# that vocab position occurs in the document, 0 otherwise.
a_1hot = [1 if x in a else 0 for x in vocab]
b_1hot = [1 if x in b else 0 for x in vocab]
c_1hot = [1 if x in c else 0 for x in vocab]

# Demo of a single MinHash function: the integers 1..len(vocab), one per
# vocab position, shuffled below into a random permutation.
hash_ex = list(range(1,len(vocab)+1))

from random import shuffle
shuffle(hash_ex)

# Walk the permuted values in increasing order (1, 2, ...). For each value i,
# idx is the vocab position that the permutation assigned it to. The first i
# whose position holds a 1 in a_1hot is the signature value for this
# permutation, so the loop stops there.
for i in range(1, len(vocab)+1):
    idx = hash_ex.index(i)
    signature_val = a_1hot[idx]
    print(f"{i} -> {idx} -> {signature_val}")

    if signature_val == 1:
        print("match!")
        break

def create_hash_func(size: int):
    """Return a random permutation of the integers ``1..size``.

    The permutation serves as one MinHash function: element ``idx`` of the
    returned list is the permuted value assigned to vector position ``idx``.

    Args:
        size: Number of positions to permute (the vocabulary length).

    Returns:
        A list containing each integer from 1 to ``size`` exactly once, in
        random order. Empty when ``size`` is 0.
    """
    # Build from ``size`` rather than the module-level ``vocab`` so the
    # function honours its argument and works without that global.
    hash_ex = list(range(1, size + 1))
    shuffle(hash_ex)
    return hash_ex

def build_minhash_func(vocab_size: int, nbits: int):
    """Return ``nbits`` hash vectors, each produced by ``create_hash_func``.

    Every entry is the result of a separate ``create_hash_func(vocab_size)``
    call, so a signature built from the returned list has up to ``nbits``
    values.
    """
    return [create_hash_func(vocab_size) for _ in range(nbits)]

# 20 hash vectors over the vocabulary; create_hash() reads this global.
minhash_func = build_minhash_func(vocab_size=len(vocab), nbits=20)

def create_hash(vector: list):
    """Build the MinHash signature of a one-hot ``vector``.

    For each permutation in the module-level ``minhash_func``, the signature
    records the smallest permuted value assigned to any position where
    ``vector`` holds a 1.

    Args:
        vector: One-hot list whose positions line up with the positions of
            each permutation in ``minhash_func``.

    Returns:
        A list of ints, one per permutation. A permutation contributes no
        entry when ``vector`` contains no 1, so an all-zero vector yields an
        empty signature.
    """
    signature = []
    for func in minhash_func:
        # func[idx] is the value the permutation gives to position idx, so
        # the minimum over the set positions is exactly the first value that
        # a scan of 1, 2, 3, ... would hit. This avoids a list.index() call
        # per candidate value and removes the dependency on the global vocab.
        first_rank = min(
            (rank for rank, bit in zip(func, vector) if bit == 1),
            default=None,
        )
        if first_rank is not None:
            signature.append(first_rank)
    return signature


# MinHash signatures for documents a and b, built from their one-hot vectors.
a_sig = create_hash(a_1hot)
b_sig = create_hash(b_1hot)

# Show both signatures, one per line.
print(a_sig, b_sig, sep="\n")

def jaccard(x, y):
    """Return the Jaccard similarity ``|x ∩ y| / |x ∪ y|`` of two sets.

    Args:
        x: A set (anything providing ``intersection`` and ``union``).
        y: The set or iterable to compare against ``x``.

    Returns:
        A float in ``[0.0, 1.0]``. Two empty sets are treated as identical
        and give ``1.0`` instead of raising ``ZeroDivisionError``.
    """
    union = x.union(y)
    if not union:
        # Both inputs are empty: the ratio is 0/0, so define it as 1.0.
        return 1.0
    return len(x.intersection(y)) / len(union)


# Jaccard similarity of the two signatures, each reduced to its set of values.
# NOTE(review): this is a bare expression, so the result is only displayed in
# a REPL/notebook; a plain script run discards it.
# NOTE(review): set() drops the signature positions; the usual MinHash
# estimate compares signatures position by position -- confirm that this
# set-based comparison is intended.
jaccard(set(a_sig), set(b_sig))