~amirouche/sudopython

069bf15efaef57ad21a7a0cbfe7d3588ba3e2629 — Amirouche 1 year, 1 month ago c38c499 archive/001
wip
2 files changed, 72 insertions(+), 48 deletions(-)

M bbkh.py
M benchmark-typofix.py
M bbkh.py => bbkh.py +49 -32
@@ 16,15 16,15 @@ chars = string.ascii_lowercase + string.digits + "$ "
# TODO: maybe extend to trigram
BIGRAM = [''.join(x) for x in itertools.product(chars, chars)]
TRIGRAM = [''.join(x) for x in itertools.product(chars, chars, chars)]
ONE_HOT_ENCODER = sorted(BIGRAM + TRIGRAM)
BITS_COUNT = 2**16
ONE_HOT_ENCODER = sorted(BIGRAM)
BITS_COUNT = 2**11

# BITS_COUNT must be the smallest power of two that is greater than or
# equal to len(ONE_HOT_ENCODER).
assert len(ONE_HOT_ENCODER) <= BITS_COUNT

# That is related to the merkletree serialization.
BYTES_COUNT = (2 * BITS_COUNT) // 8
BYTES_COUNT = BITS_COUNT // 8


def ngram(string, n):


@@ 43,7 43,7 @@ def chunks(l, n):

def merkletree(booleans):
    length = (2 * len(booleans) - 1)
    out = [False] * length
    out = [None] * length
    index = length - 1
    booleans = list(reversed(booleans))
    while len(booleans) > 1:


@@ 56,24 56,35 @@ def merkletree(booleans):
            new.append(value)
        booleans = new
    assert index == 0
    out[0] = booleans[0]
    return out


def rotate(strg, n):
    """Return ``strg`` with its first ``n`` items moved to the end.

    Works on any sequence that supports slicing and ``+`` (strings,
    lists, ...). The result is the slice from ``n`` onward followed by
    the slice before ``n``.
    """
    head = strg[:n]
    tail = strg[n:]
    return tail + head


def bbkh(string):
    integer = 0
    string = ' '.join("$" + token + "$" for token in string.split())
    for n in (2, 3):
    for n in [2]:
        for gram in ngram(string, n):
            hotbit = ONE_HOT_ENCODER.index(gram)
            hotinteger = 1 << hotbit
            integer = integer | hotinteger
    booleans = integer2booleans(integer)
    tree = merkletree(booleans)
    fuzz = ''.join('1' if x else '0' for x in tree)
    buzz = int(fuzz, 2)
    assert buzz <= 2 ** (BYTES_COUNT * 8)
    hash = buzz.to_bytes(BYTES_COUNT, 'little')
    return hash

    out = []
    for i, op in enumerate([lambda x: x, lambda x: list(reversed(x))]):
        r = 2
        for j in range(r):
            bits = rotate(op(booleans), BITS_COUNT // r * j)
            fuzz = ''.join('1' if x else '0' for x in bits)
            buzz = int(fuzz, 2)
            hash = buzz.to_bytes(BYTES_COUNT, 'little')
            out.append(hash)

    return out


def strinc(key):


@@ 86,27 97,33 @@ def strinc(key):


def search(db, space, query, distance, limit=10):
    hash = bbkh(query)
    near = lexode.pack((space, hash, query))

    scores = Counter()

    # select candidates forward
    candidates = db.iterator(start=near, stop=strinc(lexode.pack((space,))))
    for index, (key, _) in enumerate(candidates):
        if index == (limit * 10):
            break
        _, _, other = lexode.unpack(key)
        score = distance(query, other)
        scores[other] = score

    # select candidates backward
    candidates = db.iterator(stop=near, start=lexode.pack((space,)), reverse=True)
    for index, (key, _) in enumerate(candidates):
        if index == (limit * 10):
            break
        _, _, other = lexode.unpack(key)
        score = distance(query, other)
        scores[other] = score
    effort = 10
    keys = bbkh(query)
    for key in keys:

        near = lexode.pack((space, key, query))

        # select candidates forward
        candidates = db.iterator(start=near, stop=strinc(lexode.pack((space,))))
        for index, (key, _) in enumerate(candidates):
            if index == (limit * effort):
                break
            _, _, other = lexode.unpack(key)
            score = distance(query, other)
            if score > 0:
                if other not in scores:
                    scores[other] = score

        # select candidates backward
        candidates = db.iterator(stop=near, start=lexode.pack((space,)), reverse=True)
        for index, (key, _) in enumerate(candidates):
            if index == (limit * effort):
                break
            _, _, other = lexode.unpack(key)
            score = distance(query, other)
            if score > 0:
                if other not in scores:
                    scores[other] = score

    return scores.most_common(limit)

M benchmark-typofix.py => benchmark-typofix.py +23 -16
@@ 25,6 25,10 @@ def score(a, b):
        return 0
    return fuzzywuzzy.fuzz.ratio(a, b)

def score2(a, b):
    """Return ``fuzzywuzzy.fuzz.ratio(a, b)`` unchanged.

    Thin wrapper that forwards both arguments to the fuzzywuzzy ratio
    function and returns its result as-is.
    """
    ratio = fuzzywuzzy.fuzz.ratio
    similarity = ratio(a, b)
    return similarity


with open('pypi-index.html') as f:
    index = lxml.html.parse(f)



@@ 37,14 41,17 @@ start = time()
scores = Counter()

for name in names:
    scores[name] = score(name, query)
    scores[name] = score2(name, query)

top = scores.most_common(10)

print(time() - start)

total = 0
for name, value in top:
    total += value
    print(value, name)
print(total)

# typofix over neighbor



@@ 65,9 72,9 @@ def index(name):
    if string.isspace():
        return None, None

    key = bbkh.bbkh(string)
    keys = bbkh.bbkh(string)

    return name, key
    return name, keys


async def pool_for_each_par_map(loop, pool, f, p, iterator):


@@ 106,22 113,19 @@ db = plyvel.DB('typofix.okvslite', create_if_missing=True)
def progress(args):
    global total, size

    name, key = args
    name, keys = args

    total += 1

    if name is None:
        return

    key = bbkh.lexode.pack((b'foobar', key, name))
    if len(key) > size:
        print("new max key", len(key))
        size = len(key)

    db.put(key, b'')
    for key in keys:
        key = bbkh.lexode.pack((b'foobar', key, name))
        db.put(key, b'')

    if (total % 1_000) == 0:
        print(total, name, size, len(key), int(time() - start))
    if (total % 10_000) == 0:
        print(total, name)


async def main(loop):


@@ 131,13 135,16 @@ async def main(loop):
            loop, pool, progress, index, names
        )

loop = asyncio.get_event_loop()
loop.run_until_complete(main(loop))
loop.close()
# loop = asyncio.get_event_loop()
# loop.run_until_complete(main(loop))
# loop.close()

start = time()
top = bbkh.search(db, b'foobar', query, score)
top = bbkh.search(db, b'foobar', query, score2)
print(time() - start)

total = 0
for name, value in top:
    total += value
    print(value, name)
print(total)