~amirouche/sudopython

6583e106fb5b0567e85a8f6bfcfeb57eecc63326 — Amirouche 1 year, 5 months ago fb6af91
index: replace lsm-db with plyvel...

... except for the pypi.okvslite part, because I am lazy.
1 files changed, 16 insertions(+), 7 deletions(-)

M sudopython-index.py
M sudopython-index.py => sudopython-index.py +16 -7
@@ 8,7 8,7 @@ from html2text import HTML2Text
import ulid
import Stemmer
import bbkh

import plyvel

SUBSPACE_PREVIEW = -1
SUBSPACE_BACKWARD = 0


@@ 22,7 22,7 @@ handler.images_to_alt = True
html2text = handler.handle

pypi = LSM('pypi.okvslite')
db = LSM('db.okvslite')
db = plyvel.DB('sudopython.leveldb', create_if_missing=True)

RE_WORD = re.compile(r"[a-z]+")
RE_TOKENS = re.compile(r"[a-z0-9._-]+")


@@ 57,14 57,14 @@ for index, (key, value) in enumerate(pypi):
    # forward index
    counter_words = Counter(words)
    counter_words = tuple(counter_words.items())
    db[lexode.pack((SUBSPACE_FOWARD, uid))] = lexode.pack((document, counter_words))
    db.put(lexode.pack((SUBSPACE_FOWARD, uid)), lexode.pack((document, counter_words)))
    # Store stems with backward index
    for stem in stems:
        db[lexode.pack((SUBSPACE_BACKWARD, stem, uid))] = b''
        # One empty-valued key per (stem, uid) pair. A plyvel.DB object is
        # not callable, so the write must go through DB.put(key, value).
        db.put(lexode.pack((SUBSPACE_BACKWARD, stem, uid)), b'')

    # store preview
    preview = ' '.join(document.split())[:1024]
    db[lexode.pack((SUBSPACE_PREVIEW, uid))] = preview.encode('utf8')
    db.put(lexode.pack((SUBSPACE_PREVIEW, uid)), preview.encode('utf8'))

    # update stem counter
    counter += Counter(stems)


@@ 72,9 72,18 @@ for index, (key, value) in enumerate(pypi):
    # Store "tokens" with bbkh
    tokens = set(unidecode(x) for x in RE_TOKENS.findall(document) if 3 <= len(x) <= 255)
    for token in tokens:
        bbkh.index(db, SUBSPACE_BBKH, token)
        name = name.lower()
        # Keep only characters known to bbkh, then de-duplicate and sort the
        # resulting pieces so the joined string is order-independent.
        tokens = sorted(set(''.join(x if x in bbkh.chars else ' ' for x in name).split()))
        string = ' '.join(token for token in tokens if len(token) > 1)

        # The join above yields either '' or text with non-space characters,
        # and ''.isspace() is False, so emptiness must be tested explicitly
        # to skip names that have nothing left to index.
        if not string or string.isspace():
            continue

        key = bbkh.bbkh(string)
        key = lexode.pack((b'foobar', key, name))
        db.put(key, b'')

db[lexode.pack((SUBSPACE_STEM_DOCUMENT_COUNTER,))] = lexode.pack(tuple(counter.items()))
db.put(lexode.pack((SUBSPACE_STEM_DOCUMENT_COUNTER,)), lexode.pack(tuple(counter.items())))


db.close()