@@ -0,0 +1,127 @@
+"""
+Crawler module for snarfbot.
+
+The bulk of the crawler lives here and is built around two kinds of worker thread:
+one that walks pages and pushes the links it finds onto a shared queue (crawler),
+and one that pops links off that queue, extracts their text, and saves it to disk
+(extractor). A usage sketch at the bottom of the module shows one way to wire the
+two together.
+"""
+
+import codecs
+import logging
+import os
+import threading
+import time
+from queue import Queue
+
+import tldextract
+
+from snarfbot.linkview import linkview
+from snarfbot.snarf3k import slugify, snarf
+
+log = logging.getLogger(__name__)
+
+
+class StateBox:
+    """
+    StateBox is a (hopefully) thread-safe data structure for communication between
+    the crawler and extractor threads. It holds shared metadata as well as the set
+    of pages that have already been visited and parsed, so that infinite crawl
+    queues are avoided. This is most likely a bad design; properly typed message
+    queues would be the more principled way of handling it, so this API may change
+    or be deleted in the future if this becomes a thing beyond saving all the
+    fanfiction. A short example of the intended usage follows the class definition.
+    """
+
+    def __init__(self, origin, initial_list=None, sameorigin=True):
+        self.starturl = tldextract.extract(origin)
+        self.origin = origin
+        # When sameorigin is set, the crawl stays within the start URL's registered
+        # domain and only the start page's links are followed.
+        self.norecursive = sameorigin
+        self.lock = threading.Lock()
+        # Copy into a fresh set to avoid the mutable-default-argument pitfall.
+        self.seen = set(initial_list or [])
+
+    def add(self, uri):
+        """Mark uri as seen."""
+        with self.lock:
+            self.seen.add(uri)
+
+    def delete(self, uri):
+        """Forget uri so it can be crawled and saved again."""
+        with self.lock:
+            # seen is a set, not a callable; discard() avoids a KeyError if absent.
+            self.seen.discard(uri)
+
+    def seenthis(self, uri):
+        """Return True if uri has already been extracted."""
+        with self.lock:
+            return uri in self.seen
+
+    def okcrawl(self, uri):
+        """
+        Return True if uri is within the scope of this crawl.
+
+        When the crawl is pinned to the start site, a link is in scope only if its
+        registered domain matches the start URL's; tldextract treats subdomains
+        such as forums.news.cnn.com as belonging to cnn.com, so subdomains of the
+        start site still count.
+        """
+        if not self.norecursive:
+            return True
+        ext = tldextract.extract(uri)
+        return ext.registered_domain == self.starturl.registered_domain
+
+
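+# A short illustration of the intended StateBox usage (the URLs are hypothetical):
+#
+#     sb = StateBox("https://example.com/stories")
+#     sb.okcrawl("https://www.example.com/page01")   # True: same registered domain
+#     sb.okcrawl("https://elsewhere.net/page01")     # False while sameorigin is set
+#     sb.add("https://example.com/page01")
+#     sb.seenthis("https://example.com/page01")      # True
+
+# Sentinel pushed onto the queue so the extractor knows the crawl is finished.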
+_end = object()
+
+def crawler(q, sb):
+    """
+    Crawl outward from sb.origin and push every discovered URL onto the queue q,
+    followed by the _end sentinel.
+    """
+    links = linkview(sb.origin)
+    log.info("Links found on the start page: %d", len(links))
+    for i in links:
+        q.put(i)
+    if sb.norecursive:
+        q.put(_end)
+    else:
+        ## FIXME: Replace with a proper recursive algorithm when feature complete
+        ## (a breadth-first sketch follows this function).
+        for i in links:
+            log.info("Queue size: %d", q.qsize())
+            nthdegree = linkview(i)
+            for x in nthdegree:
+                q.put(x)
+        q.put(_end)  # The extractor should not need this, but do it anyway.
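+
+
+# A hedged sketch of the breadth-first crawl that the FIXME above alludes to; it is
+# not wired into anything yet. The name crawler_bfs and the max_pages bound are
+# assumptions rather than part of the existing crawler; it only relies on linkview,
+# StateBox, and the _end sentinel as they are already used in this module.
+def crawler_bfs(q, sb, max_pages=500):
+    from collections import deque
+
+    frontier = deque([sb.origin])
+    expanded = set()
+    while frontier and len(expanded) < max_pages:
+        url = frontier.popleft()
+        # Skip pages already expanded or outside the crawl scope.
+        if url in expanded or not sb.okcrawl(url):
+            continue
+        expanded.add(url)
+        for link in linkview(url):
+            # Every discovered link goes to the extractor; unexpanded ones are
+            # also queued for further crawling.
+            q.put(link)
+            if link not in expanded:
+                frontier.append(link)
+    q.put(_end)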
+
+
+def extractor(q, sb):
+    """
+    Pull URLs off the queue, scrape each one with snarf, and save the extracted
+    text under a directory named after the page's registered domain. The loop
+    runs until the _end sentinel arrives from the crawler.
+    """
+    basedir = os.getcwd()
+    while True:
+        # Block on the queue rather than polling q.empty(), which would race with
+        # a crawler thread that is still fetching links.
+        task = q.get()
+        if task is _end:
+            break
+
+        if sb.seenthis(task) or not sb.okcrawl(task):
+            q.task_done()
+            continue
+
+        # One dump directory per registered domain, created on first use.
+        etd = tldextract.extract(task)
+        dumppath = os.path.join(basedir, etd.registered_domain)
+        os.makedirs(dumppath, exist_ok=True)
+
+        # pack[0] is used as the page title for the filename and pack[1] as the
+        # body text. Write via an absolute path instead of chdir(), because the
+        # working directory is shared by every thread in the process.
+        pack = snarf(task)
+        svsname = slugify(pack[0]) + '.txt'
+        with codecs.open(os.path.join(dumppath, svsname), "w", "utf-8") as fp:
+            fp.write(pack[1])
+
+        sb.add(task)
+        q.task_done()
+
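+
+# A hedged sketch of how the two thread types are meant to be wired together; it is
+# only illustrative and not invoked anywhere else in snarfbot. The start URL below
+# is a placeholder and the single-producer/single-consumer layout is an assumption.
+if __name__ == "__main__":
+    import sys
+
+    logging.basicConfig(level=logging.INFO)
+    start = sys.argv[1] if len(sys.argv) > 1 else "https://example.com"
+
+    workqueue = Queue()
+    state = StateBox(start)
+
+    producer = threading.Thread(target=crawler, args=(workqueue, state))
+    consumer = threading.Thread(target=extractor, args=(workqueue, state))
+    producer.start()
+    consumer.start()
+    producer.join()
+    consumer.join()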