M snarfbot/crawlerapi.py => snarfbot/crawlerapi.py +6 -2
@@ 33,7 33,7 @@ class StateBox:
self.lock = threading.Lock()
self.seen = set()
self.mark = set()
- self.levels = levels
+ self.maxlevel = levels
for i in inital_seen:
self.seen.add(i)
@@ 103,7 103,7 @@ _end = object()
def crawler(q, sb):
-
+ level = 0
links = linkview(sb.origin)
print("Nlinks stage 1: " + str(len(links)))
for i in links:
@@ 111,13 111,17 @@ def crawler(q, sb):
sb.mark_add(i)
# FIXME: Replace with proper recursive algorithm when
# feature complete
+ level += 1
for i in links:
print(str(q.qsize()))
+ if not level < sb.maxlevel:
+ break
nthdegree = linkview(i)
for x in nthdegree:
if sb.okcrawl(x) and not sb.marked(x):
q.put(x)
sb.mark_add(x)
+ level += 1
q.put(_end) # extractor should not need this but we will do it anyway.
M snarfbot/linkview.py => snarfbot/linkview.py +4 -3
@@ 1,9 1,10 @@
from bs4 import BeautifulSoup
import urllib.request as urllib2
import re
-
-def linkview(url):
- html_page = urllib2.urlopen(url)
+DEFAULT_USER_AGENT = "Mozilla/5.0 (X11; SnarfBot; Linux x86_64; rv:84.0) Gecko/20100101 Firefox/84.0"
+def linkview(url, user_agent=DEFAULT_USER_AGENT):
+ req = urllib2.Request(url, data=None, headers={'User-Agent':user_agent})
+ html_page = urllib2.urlopen(req)
soup = BeautifulSoup(html_page, features="lxml")
links = []