From 0f6bb97425fe196e36d1609bb0c7afa1cb594fa2 Mon Sep 17 00:00:00 2001
From: Matt Arnold
Date: Thu, 31 Dec 2020 19:18:30 -0500
Subject: [PATCH] Redesign on a more mark/sweep pattern

---
 snarfbot/crawlerapi.py | 77 ++++++++++++++++++++++++++++--------------
 1 file changed, 51 insertions(+), 26 deletions(-)

diff --git a/snarfbot/crawlerapi.py b/snarfbot/crawlerapi.py
index 439549c..0140404 100644
--- a/snarfbot/crawlerapi.py
+++ b/snarfbot/crawlerapi.py
@@ -2,7 +2,8 @@ import logging
 import threading
 import time
 import tldextract
-import os, os.path
+import os
+import os.path
 import codecs
 from snarfbot.snarf3k import slugify, snarf
 from queue import Queue
@@ -14,6 +15,7 @@ One that performs the actual crawling through links and one that extracts and sa
 to disk. These are implemented in crawl, and extract functions
 """
 
+
 class StateBox:
     """
     Statebox is a thread-safe (i hope ), data structure for communication between
@@ -24,29 +26,31 @@ class StateBox:
     or be deleted in the future. If this becomes a thing beyond saving all the fanfiction.
 
     """
-    def __init__(self, origin, inital_list=[], sameorigin=True):
+    def __init__(self, origin, inital_seen=[], sameorigin=True, levels=1):
         self.starturl = tldextract.extract(origin)
         self.origin = origin
         self.norecursive = sameorigin
         self.lock = threading.Lock()
         self.seen = set()
-        for i in inital_list:
+        self.mark = set()
+        self.levels = levels
+        for i in inital_seen:
             self.seen.add(i)
 
-    def add(self, uri):
+    def seen_add(self, uri):
         self.lock.acquire()
         try:
             self.seen.add(uri)
         finally:
             self.lock.release()
 
-    def delete(self, uri):
+    def seen_delete(self, uri):
         """
         docstring
         """
         self.lock.acquire()
         try:
-            if uri in self.seen():
+            if uri in self.seen:
                 self.seen.remove(uri)
         finally:
             self.lock.release()
@@ -54,6 +58,33 @@ class StateBox:
     def seenthis(self, uri):
         return uri in self.seen
 
+    def mark_add(self, uri):
+        """
+        docstring
+        """
+        self.lock.acquire()
+        try:
+            self.mark.add(uri)
+        finally:
+            self.lock.release()
+
+    def mark_delete(self, uri):
+        """
+        docstring
+        """
+        self.lock.acquire()
+        try:
+            if uri in self.mark:
+                self.mark.remove(uri)
+        finally:
+            self.lock.release()
+
+    def marked(self, uri):
+        """
+        docstring
+        """
+        return uri in self.mark
+
     def okcrawl(self, uri):
         """
         docstring
@@ -69,23 +100,25 @@ class StateBox:
 
 
 _end = object()
+
 
 def crawler(q, sb):
+
     links = linkview(sb.origin)
     print("Nlinks stage 1: " + str(len(links)))
     for i in links:
         q.put(i)
-    if sb.norecursive == True:
-        q.put(_end)
-    else:
-        ## FIXME: Replace with proper recursive algorithm when
-        ## feature complete
-        for i in links:
-            print(str(q.qsize()))
-            nthdegree = linkview(i)
-            for x in nthdegree:
+        sb.mark_add(i)
+    # FIXME: Replace with proper recursive algorithm when
+    # feature complete
+    for i in links:
+        print(str(q.qsize()))
+        nthdegree = linkview(i)
+        for x in nthdegree:
+            if sb.okcrawl(x) and not sb.marked(x):
                 q.put(x)
-        q.put(_end) # extractor should not need this but we will do it anyway.
+                sb.mark_add(x)
+    q.put(_end) # extractor should not need this but we will do it anyway.
 
 
 def extractor(q, sb):
@@ -96,7 +129,7 @@ def extractor(q, sb):
         if task is _end:
             os.chdir(basedir)
             break
-        
+
         else:
             if sb.seenthis(task) or not sb.okcrawl(task):
                 q.task_done()
@@ -115,13 +148,5 @@ def extractor(q, sb):
                 fp.write(pack[1])
                 fp.close()
                 os.chdir(basedir)
-                sb.add(task)
+                sb.seen_add(task)
                 q.task_done()
-
-
-
-
-
-
-
-- 
2.45.2
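
A note on the pattern this patch introduces: URL bookkeeping is split into two
sets. `mark` records URLs that have already been queued for fetching, so the
crawler never enqueues the same link twice; `seen` records URLs whose pages
have already been extracted and saved, so the extractor never writes the same
page twice. A condensed sketch of that life cycle using the patched StateBox
API follows — the URL and the inline comments are illustrative, not from the
patch itself:

    from snarfbot.crawlerapi import StateBox

    sb = StateBox("https://example.com")   # hypothetical origin
    url = "https://example.com/story/1"    # hypothetical link

    # Crawler side: queue a link only once, and only if it passes okcrawl().
    if sb.okcrawl(url) and not sb.marked(url):
        sb.mark_add(url)    # marked = queued for fetching

    # Extractor side: save a page only once.
    if not sb.seenthis(url):
        # ... snarf the page and write it to disk ...
        sb.seen_add(url)    # seen = extracted and saved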
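
For completeness, a minimal two-thread driver sketch showing how the
producer/consumer pair is meant to run, assuming only the names visible in
this diff (StateBox, crawler, extractor); the project's actual entry point may
wire these differently:

    import threading
    from queue import Queue

    from snarfbot.crawlerapi import StateBox, crawler, extractor

    def snarf_site(origin):
        q = Queue()
        sb = StateBox(origin)  # sameorigin=True by default: stay on the start domain
        producer = threading.Thread(target=crawler, args=(q, sb))
        consumer = threading.Thread(target=extractor, args=(q, sb))
        producer.start()
        consumer.start()
        producer.join()
        consumer.join()  # extractor exits when it dequeues the _end sentinel

    if __name__ == "__main__":
        snarf_site("https://example.com")  # hypothetical start URL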