bb4d0980dad80279523bb17162ea0e7eace4325a — Emil Oppeln-Bronikowski 11 months ago ed1f1ae
Faux user-agents and recording of last update
4 files changed, 29 insertions(+), 3 deletions(-)

M __main__.py
M collector.py
M feeds.py
M requirements.txt
M __main__.py => __main__.py +1 -1
@@ 3,7 3,7 @@ import collector
  import exceptions
  
- for feed in collector.feeds():
+ for feed in collector.get_feeds():
  
      try:
          feed = collector.parse_feed(feed)

M collector.py => collector.py +19 -2
@@ 7,12 7,16 @@ import progressbar
  import filetype
  import config
+ import feeds
+ from fake_useragent import UserAgent
  
  widgets = [progressbar.Percentage(), progressbar.Bar()]
  
  configuration = config.Application('application')
  subscribed_feeds = config.Feeds('feeds')
  
+ ua = UserAgent()
+ 
  def parse_feed(feed_uri):
  
      parsed = feedparser.parse(feed_uri)


@@ 22,13 26,21 @@       return parsed
  
- def feeds():
+ def get_feeds():
      return subscribed_feeds['podcasts'].keys()
  
  def extract_media(feed):
+ 
      if not feed.entries:
          raise exceptions.NoEntries()
+ 
      print("→ {}".format(feed.feed.title))
+     last_seen = feeds.last_seen(feed.href)
+     if last_seen:
+         if last_seen == feed.updated_parsed:
+             print("Already saw everything")
+             return True
+ 
      for entry in feed.entries:
          if not entry.links:
              continue


@@ 36,13 48,18 @@ if link['rel'] == 'enclosure' and link['type'].startswith('audio/'):
                  download_media(entry['id'], link['href'], entry['title'])
          break
+     feeds.last_seen(feed.href, feed.updated_parsed)
  
  def download_media(media_id, media_uri, title=None):
  
      hashed_media_id = hashlib.sha1()
      hashed_media_id.update(media_id.encode('utf-8'))
  
-     with requests.get(media_uri, stream=True) as resp:
+     headers = {'User-Agent': ua.random}
+ 
+     with requests.get(media_uri, stream=True, allow_redirects=True, headers=headers) as resp:
+         if resp.status_code != 200:
+             return False
          bar = progressbar.ProgressBar(
              max_value=int(resp.headers['content-length']),
              widgets=widgets

M feeds.py => feeds.py +8 -0
@@ 1,4 1,5 @@ import config
+ import time
  
  subscribed_feeds = config.Feeds('feeds')
  


@@ 12,3 13,10 @@ def remove(feed):
      del subscribed_feeds['podcasts'][feed]
      subscribed_feeds.save()
+ 
+ def last_seen(feed, dt=None):
+     """Record or look up the ``last_checked`` value stored for a feed.
+     With a truthy ``dt``: store it under the feed's ``last_checked`` key,
+     save the subscriptions and return ``None``.
+     Without ``dt`` (or with a falsy one): return the stored value wrapped
+     in ``time.struct_time``, or ``None`` when nothing has been recorded.
+     """
+     if dt:
+         subscribed_feeds['podcasts'][feed]['last_checked'] = dt
+         subscribed_feeds.save()
+         return None
+     # A feed that has never been processed has no 'last_checked' entry yet;
+     # report "never seen" instead of raising so callers can test the result.
+     try:
+         recorded = subscribed_feeds['podcasts'][feed]['last_checked']
+     except KeyError:
+         return None
+     if not recorded:
+         return None
+     return time.struct_time(recorded)

M requirements.txt => requirements.txt +1 -0
@@ 4,3 4,4 @@ progressbar2
  filetype
  appdirs
+ fake-useragent