~lown/openorb

2f5ac84670db510365093ef8958343c58f145ecd — Raphael Kabo a month ago c2a3668
feat: better error handling for problematic feed URLs
2 files changed, 71 insertions(+), 38 deletions(-)

M app/crawler.py
M app/templates/base.html
M app/crawler.py => app/crawler.py +38 -28
@@ 17,6 17,7 @@ r = redis.Redis(host=config["redis_host"], port=6379, decode_responses=True)


class FeedParserOutput(TypedDict):
    """Typed view of one parsed feed result handled by the crawler.

    Only the keys the crawler relies on are declared here.
    """

    # Tested for truthiness by insert_feeds; a truthy value makes the
    # crawler skip the feed as a parsing error.
    bozo: int
    # Feed-level metadata handed to get_or_insert_feed, which reads its
    # "link", "links" and "title" keys.
    feed: dict
    # One dict per feed entry.
    entries: list[dict]



@@ 63,35 64,39 @@ def create_db():
def get_or_insert_feed(feed: dict, config_url: str) -> "tuple[int, bool] | None":
    """Return the id of the feed registered under *config_url*, inserting it if new.

    Args:
        feed: Feed-level metadata dict. ``link`` is required for a new feed;
            ``links`` (used to find the ``rel="self"`` URL) and ``title`` are
            optional.
        config_url: The feed URL exactly as listed in the configuration; used
            as the lookup key in the ``feeds`` table.

    Returns:
        ``(feed_id, True)`` when a row for *config_url* already existed,
        ``(feed_id, False)`` when a new row was inserted, or ``None`` when the
        lookup/insert failed (the error is printed and the caller is expected
        to skip the feed).
    """
    conn = sqlite3.connect("./data/index.db")
    try:
        c = conn.cursor()
        existing = c.execute(
            "SELECT * FROM feeds WHERE config_url = ?", (config_url,)).fetchone()
        if existing:
            # The title is optional, so never index it directly just to log.
            print("Feed already exists: " + (feed.get("title") or config_url))
            return (existing[0], True)

        site_url = feed["link"]
        # Prefer the feed's own rel="self" link; fall back to the site link.
        self_links = [link["href"] for link in feed.get("links", [])
                      if link.get("rel") == "self" and link.get("href")]
        feed_url = self_links[0] if self_links else site_url
        # Untitled feeds are labelled with the host name of their site.
        feed_title = feed.get("title") or urlparse(site_url).netloc

        c.execute("INSERT INTO feeds (site_url, feed_url, config_url, title) VALUES (?, ?, ?, ?)",
                  (site_url, feed_url, config_url, feed_title))
        conn.commit()
        feed_id = c.lastrowid
        if not feed_id:
            # Must run while the connection is still open.
            feed_id = c.execute(
                "SELECT id FROM feeds WHERE config_url = ?", (config_url,)).fetchone()[0]
        print("Inserted feed: " + feed_title)
        return (feed_id, False)
    except Exception as e:
        # Deliberately broad: one malformed feed must not abort the whole crawl.
        print(f"Error inserting feed {config_url} {str(e)}")
        return None
    finally:
        # Runs on every path, including the early "already exists" return.
        conn.close()


def clean_content(html_content):
    """
    From https://www.alexmolas.com/2024/02/05/a-search-engine-in-80-lines.html
    """
    soup = BeautifulSoup(html_content, "html.parser")
    for script in soup(["script", "style"]):
        script.extract()


@@ 187,10 192,15 @@ def get_feed_last_updated(feed: FeedParserOutput) -> datetime:

def insert_feeds(feeds: list[tuple[str, FeedParserOutput]]):
    for feed in feeds:
        if "feed" not in feed[1] or not feed[1]["feed"]:
            print("Skipping feed due to parsing error")
        print("==================")
        if feed[1]["bozo"] or "feed" not in feed[1] or not feed[1]["feed"]:
            print(f"Skipping feed {feed[0]} due to parsing error")
            continue
        feed_result = get_or_insert_feed(feed[1]["feed"], feed[0])
        if not feed_result:
            print(f"Skipping feed {feed[0]} due to insertion error")
            continue
        (feed_id, feed_existed) = get_or_insert_feed(feed[1]["feed"], feed[0])
        feed_id, feed_existed = feed_result
        # Check if the feed last updated time is later than our last crawl time for this feed
        # If it is, we should crawl it again
        if feed_existed:

M app/templates/base.html => app/templates/base.html +33 -10
@@ 1,16 1,32 @@
<html lang="en">
    <head>
        <title>OpenOrb</title>
        <meta charset="utf-8">
        <meta name="viewport" content="width=device-width, initial-scale=1">
        <link rel="icon" href="{{ url_for('static', filename='crystal-ball.svg') }}" type="image/svg+xml">
        <meta name="description" content="A curated search engine for Atom and RSS feeds.">
        <link rel="stylesheet" href="{{ url_for('static', filename='style.css') }}">
        <meta charset="utf-8" />
        <meta name="viewport" content="width=device-width, initial-scale=1" />
        <link
            rel="icon"
            href="{{ url_for('static', filename='crystal-ball.svg') }}"
            type="image/svg+xml"
        />
        <meta
            name="description"
            content="A curated search engine for Atom and RSS feeds."
        />
        <link
            rel="stylesheet"
            href="{{ url_for('static', filename='style.css') }}"
        />
        <script src="https://unpkg.com/htmx.org@1.9.11"></script>
    </head>
    <body>
        <header>
            <h1><img id="header__icon" src="{{ url_for('static', filename='crystal-ball.svg') }}" /> OpenOrb</h1>
            <h1>
                <img
                    id="header__icon"
                    src="{{ url_for('static', filename='crystal-ball.svg') }}"
                />
                OpenOrb
            </h1>
            <nav>
                <ul hx-boost="true">
                    <li><a href="{{ url_for('search') }}">Search</a></li>


@@ 20,12 36,19 @@
            </nav>
        </header>
        {% if config.curator %}
            <p>This instance of OpenOrb is curated by {% if config.curator_url %}<a href="{{ config.curator_url }}">{{ config.curator }}</a>{% else %}{{ config.curator }}{% endif %}.</p>
        {% endif %}
        {% block content %}{% endblock %}
        <p>
            This instance of OpenOrb is curated by {% if config.curator_url %}<a
                href="{{ config.curator_url }}"
                >{{ config.curator }}</a
            >{% else %}{{ config.curator }}{% endif %}.
        </p>
        {% endif %} {% block content %}{% endblock %}
        <footer>
            {% block footer %}{% endblock %}
            <p>OpenOrb v1.2.0. View the source at <a href="https://git.sr.ht/~lown/openorb">SourceHut</a>.
            <p>
                OpenOrb v1.2.1. View the source at
                <a href="https://git.sr.ht/~lown/openorb">SourceHut</a>.
            </p>
        </footer>
    </body>
</html>