korallenriff-feed/feed.py

from datetime import datetime, timezone
import logging

import requests
from bs4 import BeautifulSoup
from feedgen.feed import FeedGenerator
import boto3

logging.basicConfig()
logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)

s3 = boto3.resource('s3')
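# credentials come from the standard boto3 chain (env vars, shared config, or an IAM role)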


def _get_url():
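    """Return the Korallenriff compact-archive URL for the current year."""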
    # from https://www.delftstack.com/howto/python/python-current-year/
    year = datetime.now(timezone.utc).strftime("%Y")
    return f"https://www.korallenriff.de/archivKompakt/{year}.html"

def _retrieve_articles_from_soup(soup):
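    """Extract the url, title and publication date of every article listed on the archive page."""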
    date_els = soup.select("div.artikel li strong")
    articles = []
    for date_el in date_els:
        date_str = str(date_el.string)
        list_element_el = date_el.parent
        anchor_el = list_element_el.find('a')
        anchor_str = anchor_el.get_text()
        anchor_url = anchor_el['href']
        article_pub_date_raw = datetime.strptime(date_str, '%d.%m.%Y')
        article_pub_date = article_pub_date_raw.replace(tzinfo=timezone.utc)
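        # hrefs on the archive page are site-relative, hence the domain prefix below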
        articles.append({
            'url': f"https://www.korallenriff.de{anchor_url}",
            'txt': anchor_str,
            'date': article_pub_date
        })
    articles.reverse()
    return articles

def _generate_feed(articles):
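    """Build the RSS feed from the scraped articles and return the rendered XML."""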
    fg = FeedGenerator()
    fg.title('Korallenriff Archiv')
    fg.subtitle('Alles deutsche Meeresaquaristik')
    fg.description('Scrape of the Korallenriff articles')
    fg.author({'name':'mehrere'})
    fg.language('de')
    fg.id('https://www.korallenriff.de/archivKompakt/XXXX.html')
    fg.link(href=_get_url(), rel='alternate')
    fg.logo('https://www.korallenriff.de/bilder/korallenriffde.gif')
    fg.link(href='https://travisshears.com/personal/korallenriff-feed/rss.xml', rel='self')

    for article in articles:
        fe = fg.add_entry()
        fe.id(article['url'])
        fe.title(article['txt'])
        fe.link(href=article['url'])
        fe.published(article['date'])
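    # rss_str() returns the rendered feed as bytes, ready to hand to S3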
    return fg.rss_str(pretty=True)

def _upload_to_s3(rss_feed):
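    """Upload the rendered feed to S3 at personal/korallenriff-feed/rss.xml."""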
    try:
        s3.Bucket('travisshears.personal').put_object(Key='personal/korallenriff-feed/rss.xml', Body=rss_feed)
        logger.info('new rss file uploaded to s3')
    except Exception:
        logger.exception('problem uploading rss file to s3')

def _is_new_feed(feed):
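    """Return True if the generated feed differs from the copy already stored on S3, compared by item GUIDs."""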
    old_feed_s3_obj = s3.Object('travisshears.personal', 'personal/korallenriff-feed/rss.xml')
    try:
        old_feed = old_feed_s3_obj.get()['Body'].read()
    except s3.meta.client.exceptions.NoSuchKey:
        # no previous feed in the bucket yet, so the freshly generated one counts as new
        return True
    old_feed_soup = BeautifulSoup(old_feed, "xml")
    old_items = old_feed_soup.find_all('item')
    old_ids = []
    for old_item in old_items:
        old_id_el = old_item.find('guid')
        old_ids.append(old_id_el.get_text())

    new_feed_soup = BeautifulSoup(feed, "xml")
    new_items = new_feed_soup.find_all('item')
    new_ids = []
    for new_item in new_items:
        new_id_el = new_item.find('guid')
        new_ids.append(new_id_el.get_text())

    if len(old_ids) != len(new_ids):
        return True

    for old_id, new_id in zip(old_ids, new_ids):
        if new_id != old_id:
            return True

    return False

def get_feed():
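    """Scrape the current year's archive, build the RSS feed, and upload it to S3 if it changed."""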
    try:
        url = _get_url()
        # a timeout (30 s assumed here) keeps a stalled request from hanging the job
        page = requests.get(url, timeout=30)
        soup = BeautifulSoup(page.content, 'html.parser')
        articles = _retrieve_articles_from_soup(soup)
        feed = _generate_feed(articles)
        if _is_new_feed(feed):
            logger.info('Uploading new feed')
            _upload_to_s3(feed)
        else:
            logger.info('Skipping upload as feed is the same')
    except Exception as e:
        logger.error('Error: %s', e)


if __name__ == '__main__':
    get_feed()