bb4d0980dad80279523bb17162ea0e7eace4325a — Emil Oppeln-Bronikowski 1 year, 20 days ago ed1f1ae
Faux user-agents and recording of last update
4 files changed, 29 insertions(+), 3 deletions(-)

M __main__.py
M collector.py
M feeds.py
M requirements.txt
M __main__.py => __main__.py +1 -1
@@ 3,7 3,7 @@ import sys
 import collector
 import exceptions
 
-for feed in collector.feeds():
+for feed in collector.get_feeds():
 
     try:
         feed = collector.parse_feed(feed)

M collector.py => collector.py +19 -2
@@ 7,12 7,16 @@ import os
 import progressbar
 import filetype
 import config
+import feeds
+from fake_useragent import UserAgent
 
 widgets = [progressbar.Percentage(), progressbar.Bar()]
 
 configuration = config.Application('application')
 subscribed_feeds = config.Feeds('feeds')
 
+ua = UserAgent()
+
 def parse_feed(feed_uri):
 
     parsed = feedparser.parse(feed_uri)


@@ 22,13 26,21 @@ def parse_feed(feed_uri):
 
     return parsed
 
-def feeds():
+def get_feeds():
     return subscribed_feeds['podcasts'].keys()
 
 def extract_media(feed):
+
     if not feed.entries:
         raise exceptions.NoEntries()
+
     print("→ {}".format(feed.feed.title))
+    last_seen = feeds.last_seen(feed.href)
+    if last_seen:
+        if last_seen == feed.updated_parsed:
+            print("Already saw everything")
+            return True
+
     for entry in feed.entries:
         if not entry.links:
             continue


@@ 36,13 48,18 @@ def extract_media(feed):
             if link['rel'] == 'enclosure' and link['type'].startswith('audio/'):
                 download_media(entry['id'], link['href'], entry['title'])
         break
+    feeds.last_seen(feed.href, feed.updated_parsed)
 
 def download_media(media_id, media_uri, title=None):
 
     hashed_media_id = hashlib.sha1()
     hashed_media_id.update(media_id.encode('utf-8'))
 
-    with requests.get(media_uri, stream=True) as resp:
+    headers = {'User-Agent': ua.random}
+
+    with requests.get(media_uri, stream=True, allow_redirects=True, headers=headers) as resp:
+        if resp.status_code != 200:
+            return False
         bar = progressbar.ProgressBar(
             max_value=int(resp.headers['content-length']),
             widgets=widgets

M feeds.py => feeds.py +8 -0
@@ 1,4 1,5 @@
 import config
+import time
 
 subscribed_feeds = config.Feeds('feeds')
 


@@ 12,3 13,10 @@ def add(feed):
 def remove(feed):
     del subscribed_feeds['podcasts'][feed]
     subscribed_feeds.save()
+
+def last_seen(feed, dt=None):
+    if dt:
+        subscribed_feeds['podcasts'][feed]['last_checked'] = dt
+        subscribed_feeds.save()
+    else:
+        return time.struct_time(subscribed_feeds['podcasts'][feed]['last_checked'])

M requirements.txt => requirements.txt +1 -0
@@ 4,3 4,4 @@ requests
 progressbar2
 filetype
 appdirs
+fake-useragent