# LSH (locality-sensitive hashing) implementation
def shingle(text: str, k: int) -> set:
    """Return the set of all length-``k`` substrings (shingles) of ``text``.

    Args:
        text: String to split into overlapping shingles.
        k: Length of each shingle.

    Returns:
        The distinct substrings ``text[i:i + k]`` for every start index
        ``i`` in ``range(len(text) - k + 1)``. For a positive ``k`` the
        set is empty when ``text`` is shorter than ``k``.
    """
    return {text[i:i + k] for i in range(len(text) - k + 1)}
# Shingle size shared by all three documents.
k = 2

# a, b and c must already be defined at this point (presumably the raw
# document strings -- confirm against the preceding code); each one is
# rebound to its set of k-shingles.
a = shingle(a, k)
b = shingle(b, k)
c = shingle(c, k)  # use k rather than a second hard-coded 2

# Vocabulary: every distinct shingle that occurs in any of the documents.
vocab = list(a.union(b).union(c))

# One-hot encode each document against the vocabulary (1 = shingle present).
a_1hot = [1 if x in a else 0 for x in vocab]
b_1hot = [1 if x in b else 0 for x in vocab]
c_1hot = [1 if x in c else 0 for x in vocab]

# Example hash vector: the integers 1..len(vocab), one per vocabulary slot.
hash_ex = list(range(1, len(vocab) + 1))
from random import shuffle
# Randomly permute the values 1..len(vocab) in place.
shuffle(hash_ex)

# Walk the permuted values in ascending order. The first value whose
# position holds a 1 in a_1hot is the signature value for this permutation.
for i in range(1, len(vocab) + 1):
    idx = hash_ex.index(i)  # position of value i inside the permutation
    signature_val = a_1hot[idx]
    print(f"{i} -> {idx} -> {signature_val}")
    if signature_val == 1:
        print("match!")
        break
def create_hash_func(size: int) -> list:
    """Create one hash vector: a random permutation of ``1..size``.

    Args:
        size: Number of entries in the permutation.

    Returns:
        A list holding each integer from 1 to ``size`` exactly once, in
        random order (empty when ``size`` is 0).
    """
    # Use the ``size`` argument instead of reaching for the global vocab,
    # so the function honours the length the caller asked for.
    hash_ex = list(range(1, size + 1))
    shuffle(hash_ex)
    return hash_ex
def build_minhash_func(vocab_size: int, nbits: int) -> list:
    """Build a list of ``nbits`` hash vectors for MinHash.

    Args:
        vocab_size: Length passed to ``create_hash_func`` for every vector.
        nbits: Number of hash vectors to create; this is also the length
            of a signature produced from them.

    Returns:
        A list containing ``nbits`` results of
        ``create_hash_func(vocab_size)``.
    """
    return [create_hash_func(vocab_size) for _ in range(nbits)]
# Build 20 shuffled hash vectors, each spanning the whole vocabulary.
minhash_func = build_minhash_func(vocab_size=len(vocab), nbits=20)
def create_hash(vector: list, hash_funcs=None) -> list:
    """Compute the MinHash signature of a one-hot ``vector``.

    For every hash vector (a permutation of ``1..n``) the signature entry
    is the smallest permuted value found at a position where ``vector``
    holds a 1.

    Args:
        vector: One-hot list of 0/1 values, aligned position by position
            with the hash vectors.
        hash_funcs: Iterable of hash vectors to use. Defaults to the
            module-level ``minhash_func``.

    Returns:
        A list with one value per hash vector. A hash vector contributes
        nothing when ``vector`` contains no 1, so an all-zero vector
        yields an empty signature.
    """
    if hash_funcs is None:
        hash_funcs = minhash_func

    signature = []
    for perm in hash_funcs:
        # Single pass over the permutation instead of calling
        # perm.index(i) for every candidate value i.
        hits = [rank for rank, bit in zip(perm, vector) if bit == 1]
        if hits:
            signature.append(min(hits))
    return signature
# MinHash signatures of documents a and b, printed one per line.
a_sig, b_sig = create_hash(a_1hot), create_hash(b_1hot)
print(a_sig, b_sig, sep="\n")
def jaccard(x, y) -> float:
    """Return the Jaccard similarity ``|x ∩ y| / |x ∪ y|`` of two sets.

    Args:
        x: A set.
        y: A set (or any iterable accepted by ``set.union``).

    Returns:
        A float between 0.0 (no shared elements) and 1.0 (identical).
        Two empty inputs are treated as identical and give 1.0 instead
        of raising ``ZeroDivisionError``.
    """
    union = x.union(y)
    if not union:
        # Both inputs are empty: avoid dividing by zero.
        return 1.0
    return len(x.intersection(y)) / len(union)
# Jaccard similarity of the two signatures, compared as sets of values.
# NOTE: this is a bare expression, so the result is only displayed in an
# interactive session (REPL/notebook); a plain script run discards it.
jaccard(set(a_sig), set(b_sig))
'인공지능 공부 > NLP 연구' 카테고리의 다른 글
(NLP 연구) The Long-Document Transformer 04.01 (데이터셋 LSH 코딩) (0) | 2022.04.01 |
---|---|
(NLP 연구) The Long-Document Transformer 03.31 (LSH) (0) | 2022.04.01 |
(NLP 연구) The Long-Document Transformer 03.24 (0) | 2022.04.01 |
(NLP 연구) The Long-Document Transformer 03.21 (0) | 2022.03.29 |
(NLP 연구) The Long-Document Transformer 03.18 (0) | 2022.03.29 |