M bbkh.py => bbkh.py +49 -32
@@ 16,15 16,15 @@ chars = string.ascii_lowercase + string.digits + "$ "
# TODO: maybe extend to trigram
BIGRAM = [''.join(x) for x in itertools.product(chars, chars)]
TRIGRAM = [''.join(x) for x in itertools.product(chars, chars, chars)]
-ONE_HOT_ENCODER = sorted(BIGRAM + TRIGRAM)
-BITS_COUNT = 2**16
+ONE_HOT_ENCODER = sorted(BIGRAM)
+BITS_COUNT = 2**11
# BITS_COUNT must be the first power of two that is bigger than
# ONE_HOT_ENCODER.
assert len(ONE_HOT_ENCODER) <= BITS_COUNT
# That is related to the merkletree serialization.
-BYTES_COUNT = (2 * BITS_COUNT) // 8
+BYTES_COUNT = BITS_COUNT // 8
def ngram(string, n):
@@ 43,7 43,7 @@ def chunks(l, n):
def merkletree(booleans):
length = (2 * len(booleans) - 1)
- out = [False] * length
+ out = [None] * length
index = length - 1
booleans = list(reversed(booleans))
while len(booleans) > 1:
@@ 56,24 56,35 @@ def merkletree(booleans):
new.append(value)
booleans = new
assert index == 0
+ out[0] = booleans[0]
return out
+def rotate(strg, n):
+ return strg[n:] + strg[:n]
+
+
def bbkh(string):
integer = 0
string = ' '.join("$" + token + "$" for token in string.split())
- for n in (2, 3):
+ for n in [2]:
for gram in ngram(string, n):
hotbit = ONE_HOT_ENCODER.index(gram)
hotinteger = 1 << hotbit
integer = integer | hotinteger
booleans = integer2booleans(integer)
- tree = merkletree(booleans)
- fuzz = ''.join('1' if x else '0' for x in tree)
- buzz = int(fuzz, 2)
- assert buzz <= 2 ** (BYTES_COUNT * 8)
- hash = buzz.to_bytes(BYTES_COUNT, 'little')
- return hash
+
+ out = []
+ for i, op in enumerate([lambda x: x, lambda x: list(reversed(x))]):
+ r = 2
+ for j in range(r):
+ bits = rotate(op(booleans), BITS_COUNT // r * j)
+ fuzz = ''.join('1' if x else '0' for x in bits)
+ buzz = int(fuzz, 2)
+ hash = buzz.to_bytes(BYTES_COUNT, 'little')
+ out.append(hash)
+
+ return out
def strinc(key):
@@ 86,27 97,33 @@ def strinc(key):
def search(db, space, query, distance, limit=10):
- hash = bbkh(query)
- near = lexode.pack((space, hash, query))
-
scores = Counter()
-
- # select candidates foward
- candidates = db.iterator(start=near, stop=strinc(lexode.pack((space,))))
- for index, (key, _) in enumerate(candidates):
- if index == (limit * 10):
- break
- _, _, other = lexode.unpack(key)
- score = distance(query, other)
- scores[other] = score
-
- # select candidates backward
- candidates = db.iterator(stop=near, start=lexode.pack((space,)), reverse=True)
- for index, (key, _) in enumerate(candidates):
- if index == (limit * 10):
- break
- _, _, other = lexode.unpack(key)
- score = distance(query, other)
- scores[other] = score
+ effort = 10
+ keys = bbkh(query)
+ for key in keys:
+
+ near = lexode.pack((space, key, query))
+
+ # select candidates forward
+ candidates = db.iterator(start=near, stop=strinc(lexode.pack((space,))))
+ for index, (key, _) in enumerate(candidates):
+ if index == (limit * effort):
+ break
+ _, _, other = lexode.unpack(key)
+ score = distance(query, other)
+ if score > 0:
+ if other not in scores:
+ scores[other] = score
+
+ # select candidates backward
+ candidates = db.iterator(stop=near, start=lexode.pack((space,)), reverse=True)
+ for index, (key, _) in enumerate(candidates):
+ if index == (limit * effort):
+ break
+ _, _, other = lexode.unpack(key)
+ score = distance(query, other)
+ if score > 0:
+ if other not in scores:
+ scores[other] = score
return scores.most_common(limit)
M benchmark-typofix.py => benchmark-typofix.py +23 -16
@@ 25,6 25,10 @@ def score(a, b):
return 0
return fuzzywuzzy.fuzz.ratio(a, b)
+def score2(a, b):
+ return fuzzywuzzy.fuzz.ratio(a, b)
+
+
with open('pypi-index.html') as f:
index = lxml.html.parse(f)
@@ 37,14 41,17 @@ start = time()
scores = Counter()
for name in names:
- scores[name] = score(name, query)
+ scores[name] = score2(name, query)
top = scores.most_common(10)
print(time() - start)
+total = 0
for name, value in top:
+ total += value
print(value, name)
+print(total)
# typofix over neighboor
@@ 65,9 72,9 @@ def index(name):
if string.isspace():
return None, None
- key = bbkh.bbkh(string)
+ keys = bbkh.bbkh(string)
- return name, key
+ return name, keys
async def pool_for_each_par_map(loop, pool, f, p, iterator):
@@ 106,22 113,19 @@ db = plyvel.DB('typofix.okvslite', create_if_missing=True)
def progress(args):
global total, size
- name, key = args
+ name, keys = args
total += 1
if name is None:
return
- key = bbkh.lexode.pack((b'foobar', key, name))
- if len(key) > size:
- print("new max key", len(key))
- size = len(key)
-
- db.put(key, b'')
+ for key in keys:
+ key = bbkh.lexode.pack((b'foobar', key, name))
+ db.put(key, b'')
- if (total % 1_000) == 0:
- print(total, name, size, len(key), int(time() - start))
+ if (total % 10_000) == 0:
+ print(total, name)
async def main(loop):
@@ 131,13 135,16 @@ async def main(loop):
loop, pool, progress, index, names
)
-loop = asyncio.get_event_loop()
-loop.run_until_complete(main(loop))
-loop.close()
+# loop = asyncio.get_event_loop()
+# loop.run_until_complete(main(loop))
+# loop.close()
start = time()
-top = bbkh.search(db, b'foobar', query, score)
+top = bbkh.search(db, b'foobar', query, score2)
print(time() - start)
+total = 0
for name, value in top:
+ total += value
print(value, name)
+print(total)