~mbuechse/gemtext-to-html

425cc06accb87ef4b5efbdd05c186ea7a7be9615 — Matthias Büchse 2 years ago 416e87b
added generation of rss.xml
1 files changed, 95 insertions(+), 20 deletions(-)

M convert_dir.py
M convert_dir.py => convert_dir.py +95 -20
@@ 1,6 1,9 @@
#!/usr/bin/env python
# This is based on work by huntingb. https://github.com/huntingb/gemtext-html-converter
import datetime
from email.utils import format_datetime
import os
import re
import sys

TAGS_DICT = {


@@ 18,17 21,73 @@ r"""<!DOCTYPE HTML>
<head>
  <meta charset="UTF-8">
  <title>%%title%%</title>
  <link rel="alternate" type="application/rss+xml" title="rss" href="rss.xml">
  <meta name=viewport content="width=device-width, initial-scale=1">
  <style>body {font-family:sans-serif;max-width:40em;margin:auto;} html {overflow-y:scroll;}
  @media (max-width:50rem) {body {margin:0px 20px;}}</style>
</head>
<body>
<p>This page is an HTML mirror of the <a href="%%fn%%">original gemini page</a>.</p><hr>
<p>This page is an HTML mirror of the <a href="%%gem_root_url%%%%fn%%">original gemini page</a>.</p><hr>
%%body%%
<hr>
<p>Subscribe to my <a href="rss.xml">RSS feed</a>.</p>
</body>
</html>
"""
# NOTE(review): same value as GEM_ROOT_URL below; looks like the older name for it — confirm before removing
ROOT_URL = "gemini://halfbigdata.eu:47060/"
# time of day and UTC offset that get appended to an ISO date (YYYY-MM-DD) when building RSS item dates
POST_TIME = " 12:00:00 +01:00"
# matches an ISO-style date (four digits, two digits, two digits) at the very start of a string
DATE_REGEX = re.compile(r"^\d\d\d\d-\d\d-\d\d")
# rss is generated from index.gmi
# channel-level template; placeholders: title, http_root_url, items
RSS = \
r"""<rss version="2.0"><channel>
<title>%%title%%</title><link>%%http_root_url%%</link>
<description></description>
%%items%%
</channel></rss>
"""
# per-entry template; placeholders: title, http_root_url, fn, date
RSS_ITEM = \
"""
<item><title>%%title%%</title><link>%%http_root_url%%%%fn%%</link><pubDate>%%date%%</pubDate></item>
"""
# base URLs substituted for the gem_root_url / http_root_url placeholders in the templates
GEM_ROOT_URL = "gemini://halfbigdata.eu:47060/"
HTTP_ROOT_URL = "http://halfbigdata.eu/"


class TemplateProcessor(object):
    """Tiny substitution engine for templates with ``%%name%%`` placeholders.

    The template is split once on the delimiter; afterwards placeholder slots
    are overwritten in place, so values set by one call stay in effect for
    later calls until they are overwritten again.

    Args:
        template: Template text containing ``<delimiter>name<delimiter>`` placeholders.
        defaults: Optional mapping of placeholder name to initial value.
        delimiter: Marker that encloses placeholder names.
    """

    # don't use str.format, because HTML contains so many curly braces
    def __init__(self, template, defaults=None, delimiter="%%"):
        self.components = template.split(delimiter)
        # str.split always yields literal text first (an empty string when the
        # template opens with the delimiter), so the placeholder names sit on
        # the odd indices no matter how the template starts
        self.mapper = {}
        for idx in range(1, len(self.components), 2):
            # a name may occur several times; remember every slot it occupies
            self.mapper.setdefault(self.components[idx], []).append(idx)
        if defaults:
            self.substitute(**defaults)

    def substitute(self, **value_mapping):
        """Store values for the given placeholders; unknown names are ignored."""
        for key, value in value_mapping.items():
            for idx in self.mapper.get(key, ()):
                self.components[idx] = value

    def realize(self, **value_mapping):
        """Apply ``value_mapping`` and return the resulting text.

        Placeholders that were never given a value appear as their bare name.
        """
        # destructively update template (because why not, it's not like we are doing concurrency)
        self.substitute(**value_mapping)
        return "".join(self.components)


def convert_gem_link(meat):
    """Split a gemtext link line into ``(href, label)``.

    Args:
        meat: The gemtext link line without the prefix "=>".

    Returns:
        A tuple ``(href, inner)``. ``href`` has a trailing ``.gmi`` replaced by
        ``.html`` when it is a relative link; ``inner`` is the label, or the
        URL as written when the line carries no label.

    Raises:
        ValueError: If the line contains no URL at all.
    """
    parts = meat.split(maxsplit=1)
    if not parts:
        raise ValueError("gemtext link line has no URL")
    href = parts[0]
    # the label is optional in gemtext; fall back to showing the URL itself
    inner = parts[1].strip() if len(parts) > 1 else href
    # links to local gemini files should be converted to the corresponding html;
    # anything with a scheme (gemini://, http://, https://, ...) or an absolute
    # path is left untouched
    is_relative = "://" not in href and not href.startswith("/")
    if href.endswith(".gmi") and is_relative:
        href = href[:-4] + ".html"
    return href, inner


def convert_gemtext(lines):


@@ 61,12 120,7 @@ def convert_gemtext(lines):
            else:
                inner = gmi_line[2:].strip()
            if "{href}" in pattern:
                href, inner = inner.split(maxsplit=1)
                # links to local gemini files should be converted to the corresponding html
                if href.endswith(".gmi") and not (
                    href.startswith("http://") or href.startswith("gemini://") or href.startswith("/")
                ):
                    href = href[:-4] + ".html"
                href, inner = convert_gem_link(inner)
            if not title and "<h1>" in pattern:
                title = inner
            if ("<li>" in pattern) != in_list:


@@ 81,13 135,28 @@ def convert_gemtext(lines):
    return title, htmllines


def convert_gemtext_to_rss_items(gem_lines, rss_item_template):
    """Yield one rendered RSS item per link line found in ``gem_lines``.

    Args:
        gem_lines: Iterable of gemtext lines; only lines starting with "=>"
            are considered.
        rss_item_template: Object whose ``realize(title=..., date=..., fn=...)``
            returns the rendered item text.

    Yields:
        The text returned by ``rss_item_template.realize`` for each link line.
    """
    # separate processing (redundant, if you will) of index.gmi for rss.xml
    # but: separation of concerns
    # function-scope import so the block does not depend on the file's import list
    from xml.sax.saxutils import escape

    for line in gem_lines:
        if not line.startswith("=>"):
            continue
        href, inner = convert_gem_link(line[2:])
        m = DATE_REGEX.match(inner)
        # links without a leading date get a fixed placeholder date
        iso_date = m.group() if m else "2022-02-22"
        title = inner[m.end():].strip() if m else inner
        # convert ISO date to this nonsense RFC format (why, oh why)
        date = format_datetime(datetime.datetime.fromisoformat(iso_date + POST_TIME))
        # title and link end up as XML character data, so &, < and > must be escaped
        yield rss_item_template.realize(title=escape(title), date=date, fn=escape(href))


def process_dir(path):
    # don't use template.format, because HTML contains so many curly braces
    template = HTML.split("%%")
    indexmapper = {"title": None, "body": None, "fn": None}
    for i, tl in enumerate(template):
        if tl in indexmapper:
            indexmapper[tl] = i
    html_template = TemplateProcessor(HTML)
    rss_template = TemplateProcessor(RSS)
    rss_item_template = TemplateProcessor(RSS_ITEM)
    for template in (html_template, rss_template, rss_item_template):
        template.substitute(gem_root_url=GEM_ROOT_URL, http_root_url=HTTP_ROOT_URL)
    for fn in os.listdir(path):
        if not fn.endswith(".gmi"):
            continue


@@ 95,15 164,21 @@ def process_dir(path):
        # in the old days this would have been sacrilege, but memory abounds and Python instructions are expensive
        with open(os.path.join(path, fn), "r") as fileobj:
            gemtext = fileobj.read()
        title, htmllines = convert_gemtext(gemtext.splitlines())
        # destructively update template (because why not, it's not like we are doing concurrency)
        template[indexmapper["title"]] = title or "(untitled)"
        template[indexmapper["fn"]] = ROOT_URL + fn
        template[indexmapper["body"]] = "\n".join(htmllines)
        gem_lines = gemtext.splitlines()
        title, html_lines = convert_gemtext(gem_lines)
        if not title:
            title = "(untitled)"
        html_text = html_template.realize(title=title, fn=fn, body="\n".join(html_lines))
        # likewise, write out the whole HTML file at once
        with open(os.path.join(path, fn[:-4] + ".html"), "w") as fileobj:
            fileobj.write("".join(template))
            fileobj.write(html_text)
        sys.stderr.write(f"done processing: {fn}\n")
        if fn == "index.gmi":
            rss_items = convert_gemtext_to_rss_items(gem_lines, rss_item_template)
            rss_text = rss_template.realize(title=title, items="\n".join(rss_items))
            with open(os.path.join(path, "rss.xml"), "w") as fileobj:
                fileobj.write(rss_text)
            sys.stderr.write(f"done writing rss.xml\n")


if __name__ == "__main__":