~marnold128/web2text

f0caf134705878a5a9f1ec4ee4842a4837ae6de7 — Matt Arnold 2 years ago 0f6bb97
* Set a proper user agent, and add a safety valve so infinite recursion doesn't happen
2 files changed, 10 insertions(+), 5 deletions(-)

M snarfbot/crawlerapi.py
M snarfbot/linkview.py
M snarfbot/crawlerapi.py => snarfbot/crawlerapi.py +6 -2
@@ 33,7 33,7 @@ class StateBox:
         self.lock = threading.Lock()
         self.seen = set()
         self.mark = set()
-        self.levels = levels
+        self.maxlevel = levels
         for i in inital_seen:
             self.seen.add(i)


@@ 103,7 103,7 @@ _end = object()
 
 def crawler(q, sb):
 
-
+    level = 0
     links = linkview(sb.origin)
     print("Nlinks stage 1: " + str(len(links)))
     for i in links:


@@ 111,13 111,17 @@ def crawler(q, sb):
        sb.mark_add(i)
         # FIXME: Replace with proper recursive algorithm when
         # feature complete
+    level += 1
     for i in links:
         print(str(q.qsize()))
+        if not level < sb.maxlevel:
+            break
         nthdegree = linkview(i)
         for x in nthdegree:
             if sb.okcrawl(x) and not sb.marked(x):
                 q.put(x)
                 sb.mark_add(x)
+            level += 1
     q.put(_end)  # extractor should not need this but we will do it anyway.
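
Note how the valve works: level is incremented once per link pulled out of nthdegree, not once per crawl depth, so the outer loop bails out after roughly maxlevel second-stage links rather than maxlevel hops. That is still enough to stop unbounded recursion, and the FIXME above flags the intended follow-up. Below is a minimal sketch of what that depth-tracked traversal could look like, assuming the existing linkview(), the _end sentinel, and the StateBox helpers (okcrawl, marked, mark_add) behave as in this repo; crawl_bfs is a hypothetical name, not code from this commit:

    from collections import deque

    def crawl_bfs(q, sb):
        # Breadth-first walk where each queue entry carries its own depth,
        # so sb.maxlevel caps true link depth instead of total links seen.
        frontier = deque([(sb.origin, 0)])
        while frontier:
            url, depth = frontier.popleft()
            if depth >= sb.maxlevel:
                continue  # safety valve: never descend past maxlevel hops
            for link in linkview(url):
                if sb.okcrawl(link) and not sb.marked(link):
                    q.put(link)
                    sb.mark_add(link)
                    frontier.append((link, depth + 1))
        q.put(_end)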



M snarfbot/linkview.py => snarfbot/linkview.py +4 -3
@@ 1,9 1,10 @@
 from bs4 import BeautifulSoup
 import urllib.request as urllib2
 import re
-
-def linkview(url):
-    html_page = urllib2.urlopen(url)
+DEFAULT_USER_AGENT = "Mozilla/5.0 (X11; SnarfBot; Linux x86_64; rv:84.0) Gecko/20100101 Firefox/84.0"
+def linkview(url, user_agent=DEFAULT_USER_AGENT):
+    req = urllib2.Request(url, data=None, headers={'User-Agent':user_agent})
+    html_page = urllib2.urlopen(req)
     soup = BeautifulSoup(html_page, features="lxml")
     links = []
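
With the new signature, existing callers are unchanged (DEFAULT_USER_AGENT is the default) and the agent can be overridden per call. A quick usage sketch; the URL and the custom agent string are illustrative only:

    from snarfbot.linkview import linkview

    # Default: sends the Firefox-style DEFAULT_USER_AGENT header.
    links = linkview("https://example.com/")

    # Override, e.g. to identify the bot explicitly to site operators.
    links = linkview("https://example.com/", user_agent="snarfbot/0.1")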