~marnold128/web2text

5a4425b6447d99693717be8c01418f58eb5e1457 — Matt Arnold 3 years ago 43ce9c5
Add crawler api code
1 files changed, 127 insertions(+), 0 deletions(-)

A snarfbot/crawlerapi.py
A snarfbot/crawlerapi.py => snarfbot/crawlerapi.py +127 -0
@@ 0,0 1,127 @@
import logging
import threading
import time
import tldextract
import os, os.path
import codecs
from snarfbot.snarf3k import slugify, snarf
from queue import Queue
from snarfbot.linkview import linkview

"""
Module contains the bulk of the crawler code this is done with two types of thread
One that performs the actual crawling through links and one that extracts and saves data 
to disk. These are implemented in crawl, and extract functions
"""

class StateBox:
    """
    Statebox is a thread-safe (i hope ), data structure for communication between 
    the crawler and the extractor threads. This holds both shared metadata, and the set of
    sites which have already been visited, and parsed. So that infinate crawls queues are avoided.
    Note that this is most likely a bad design; and proper typed message queues would 
    be the computer sciencey way of handling this. So this api might want to change
    or be deleted in the future. If this becomes a thing beyond saving all the fanfiction.
    """

    def __init__(self, origin, initial_list=None, sameorigin=True):
        self.starturl = tldextract.extract(origin)
        self.origin = origin
        self.norecursive = sameorigin
        self.lock = threading.Lock()
        # Set of URIs that have already been visited and extracted.
        self.seen = set(initial_list or [])

    def add(self, uri):
        """Record a URI as seen."""
        with self.lock:
            self.seen.add(uri)

    def delete(self, uri):
        """Remove a URI from the seen set if it is present."""
        with self.lock:
            self.seen.discard(uri)

    def seenthis(self, uri):
        """Return True if the URI has already been extracted."""
        return uri in self.seen

    def okcrawl(self, uri):
        """
        Return True if the URI may be crawled. When sameorigin was requested,
        only URIs whose registered domain matches the start URL are allowed;
        otherwise every URI is allowed.
        """
        if not self.norecursive:
            return True
        ext = tldextract.extract(uri)
        return ext.registered_domain == self.starturl.registered_domain
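
    # Illustration only (hypothetical URLs): with origin "https://example.com/stories"
    # and the default sameorigin=True, okcrawl("https://blog.example.com/x") is True
    # because the registered domain ("example.com") matches the start URL, while
    # okcrawl("https://other.net/y") is False. With sameorigin=False, every URI passes.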


# Sentinel object: the crawler puts this on the queue to tell the extractor to stop.
_end = object()

def crawler(q, sb):
    """Walk links starting from sb.origin and feed them onto the work queue."""
    links = linkview(sb.origin)
    print("Nlinks stage 1: " + str(len(links)))
    for i in links:
        q.put(i)
    if sb.norecursive:
        q.put(_end)
    else:
        ## FIXME: Replace with a proper recursive algorithm when
        ## feature complete.
        for i in links:
            print(str(q.qsize()))
            nthdegree = linkview(i)
            for x in nthdegree:
                q.put(x)
    q.put(_end)  # The extractor should not need this, but we do it anyway.


def extractor(q, sb):
    """Pull URIs off the queue, extract their text, and save it to disk."""
    basedir = os.getcwd()
    while True:
        # Block until the crawler hands over a URI; _end signals that crawling is done.
        task = q.get()
        if task is _end:
            q.task_done()
            break

        if sb.seenthis(task) or not sb.okcrawl(task):
            q.task_done()
            continue

        # Save each page under a directory named after its registered domain.
        etd = tldextract.extract(task)
        dumppath = os.path.join(basedir, etd.registered_domain)
        if not os.path.isdir(dumppath):
            os.mkdir(dumppath)

        pack = snarf(task)
        svsname = slugify(pack[0]) + '.txt'
        with codecs.open(os.path.join(dumppath, svsname), "w", "utf-8") as fp:
            fp.write(pack[1])
        sb.add(task)
        q.task_done()
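

# A minimal wiring sketch (the start URL below is a placeholder): the crawler and
# extractor threads share a Queue of links and a StateBox of crawl state.
if __name__ == "__main__":
    start = "https://example.com/stories"  # hypothetical start URL
    q = Queue()
    sb = StateBox(start)
    producer = threading.Thread(target=crawler, args=(q, sb))
    consumer = threading.Thread(target=extractor, args=(q, sb))
    producer.start()
    consumer.start()
    producer.join()
    consumer.join()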