936715f089620389cf0b7913e34e48cf4ae8a8ad — Charles Daniels 2 months ago 6a8db65
add url2bib
3 files changed, 133 insertions(+), 0 deletions(-)

A utils/url2bib/README.md
A utils/url2bib/install.sh
A utils/url2bib/url2bib.py
A utils/url2bib/README.md => utils/url2bib/README.md +3 -0
@@ 0,0 1,3 @@
# url2bib

This is a port of [html2biblatex](https://github.com/dmstern/html2biblatex) to Python, with some additional convenience features.

A utils/url2bib/install.sh => utils/url2bib/install.sh +18 -0
@@ 0,0 1,18 @@

# Abort on the first failing command (-e) and on any reference to an unset
# variable (-u), such as a missing BIN_DIR.
set -eu

# Work from the directory containing this script so that the relative path
# ./url2bib.py resolves regardless of where the installer was invoked from.
cd "$(dirname "$0")"

# Print the current UTC time as an ISO-8601 timestamp, for example
# 2024-01-31T12:34:56Z. Used as the ts= field of the log lines below.
timestamp() {
	date -u +"%Y-%m-%dT%H:%M:%SZ"
}
# Install url2bib.py as an executable named "url2bib" inside $BIN_DIR,
# emitting one key=value log line per step.
src="./url2bib.py"
dest="$BIN_DIR/url2bib"

# Step 1: copy the script into place.
printf 'ts="%s" util=url2bib msg="copying file" operation=copy src="%s" dest="%s"\n' "$(timestamp)" "$src" "$dest"
cp "$src" "$dest"

# Step 2: mark the installed copy as executable.
printf 'ts="%s" util=url2bib msg="making file executable" operation=chmod file="%s"\n' "$(timestamp)" "$dest"
chmod +x "$dest"

# Done.
printf 'ts="%s" util=url2bib msg="finished installation for url2bib"\n' "$(timestamp)"

A utils/url2bib/url2bib.py => utils/url2bib/url2bib.py +112 -0
@@ 0,0 1,112 @@
#!/usr/bin/env python3

import argparse
import datetime
import dateutil
import io
import json
import lxml.html
import re
import requests
import sys
import yaml

# pip3 install --user htmldate
import htmldate

def log(msg):
    """Write a diagnostic message to standard error.

    Diagnostics go to stderr rather than stdout so they do not mix with
    whatever the program prints on standard output.

    :param msg: the message to print; a newline is appended.
    """
    print(msg, file=sys.stderr)

def extract_title(doc):
    """Return the title of a parsed HTML document, or None if it has none.

    The first non-empty <title> element is preferred; if there is none, the
    first non-empty <h1> is used instead.

    :param doc: parsed document exposing ``xpath()``, whose result nodes
        expose ``text_content()`` (as lxml.html elements do).
    :returns: the title with surrounding whitespace removed, or None.
    """
    for query in ("//title", "//h1"):
        for node in doc.xpath(query):
            # text_content() also collects text nested in child elements
            # (e.g. <h1><a>Title</a></h1>), where node.text alone is None.
            text = node.text_content().strip()
            if text:
                return text

    # Neither a usable <title> nor a usable <h1> was found.
    return None

def extract_authors(doc):
    """Collect author names from ``<meta name="author">`` tags.

    :param doc: parsed document exposing ``xpath()``, whose result nodes
        expose a dict-like ``attrib`` (as lxml.html elements do).
    :returns: list of the non-empty ``content`` attribute values, stripped of
        surrounding whitespace, in document order. Empty if none are found.
    """
    authors = []
    for node in doc.xpath("//meta[@name = 'author']"):
        # A meta tag without a content attribute, or with a blank one,
        # carries no usable name.
        content = node.attrib.get("content", "").strip()
        if content:
            authors.append(content)

    return authors

def extract_bib(url):
    """Fetch *url* and build a bibliography record describing the page.

    The publication date is taken from the date htmldate finds in the
    document; if that fails, the HTTP ``Last-Modified`` header is used; if
    both fail, the record has no ``date`` entry. Failures of either
    source are logged and otherwise ignored.

    :param url: address of the page to describe.
    :returns: dict with keys ``title``, ``urldate`` (time of retrieval) and
        ``url``, plus ``date`` and ``authors`` when they could be determined.
    :raises requests.exceptions.RequestException: if the page cannot be
        fetched (including a timeout).
    """
    # A bare `import dateutil` does not load the parser submodule, so import
    # it explicitly instead of relying on another package having done so.
    import dateutil.parser

    # Without a timeout an unresponsive server would block forever.
    response = requests.get(url, timeout=30)
    doc = lxml.html.fromstring(response.text)

    # First guess: the Last-Modified response header, when present.
    pubdate = None
    last_modified = response.headers.get("Last-Modified")
    if last_modified is not None:
        try:
            pubdate = dateutil.parser.parse(last_modified)
        except Exception as e:
            log("caught exception while parsing Last-Modified header '{}': {}".format(last_modified, e))
            pubdate = None

    # Better guess: a date found in the document itself. On any failure the
    # header-derived value (or None) is kept.
    try:
        result = htmldate.core.find_date(doc, url=url)
        pubdate = dateutil.parser.parse(result)
    except Exception as e:
        log("caught exception from htmldate: {}".format(e))

    authors = extract_authors(doc)

    bib = {
        "title": extract_title(doc),
        "urldate": datetime.datetime.now(),
        "url": url,
    }

    # Optional entries are only added when a value was actually found.
    if pubdate is not None:
        bib["date"] = pubdate

    if authors:
        bib["authors"] = authors

    return bib

def bib2bibtex(bib):
    """Render a bibliography record as a BibTeX ``@online`` entry.

    The citation key is the first author's last name (or, without a usable
    author, the title) reduced to ASCII letters and digits, followed by the
    publication year, falling back to the retrieval year.

    :param bib: dict with ``urldate`` (datetime) and ``url``; optionally
        ``title`` (str or None), ``authors`` (list of str) and ``date``
        (datetime).
    :returns: the entry as a string ending in a newline.
    """
    title = bib.get("title") or ""  # a page without a title yields None
    authors = bib.get("authors") or []
    pubdate = bib.get("date")

    key_year = (pubdate if pubdate is not None else bib["urldate"]).year

    keybase = re.sub(r"[^0-9a-zA-Z]", "", title)
    if authors:
        # Prefer the author's last name, sanitised like the title so that
        # commas, braces or quotes cannot end up in the key. A blank or
        # fully non-ASCII name keeps the title-based key.
        name_parts = authors[0].split()
        last_name = re.sub(r"[^0-9a-zA-Z]", "", name_parts[-1]) if name_parts else ""
        if last_name:
            keybase = last_name

    key = "{}{}".format(keybase, key_year)

    fields = [
        ("title", title),
        ("urldate", bib["urldate"].strftime("%Y-%m-%d")),
        ("url", bib["url"]),
    ]
    if authors:
        fields.append(("author", " and ".join(authors)))
    if pubdate is not None:
        fields.append(("date", pubdate.strftime("%Y-%m-%d")))
        fields.append(("year", str(pubdate.year)))

    # BibTeX syntax: "@type{key," followed by comma-separated
    # "name = {value}" fields and a closing brace.
    lines = ["@online{{{},".format(key)]
    lines.extend("\t{} = {{{}}},".format(name, value) for name, value in fields)
    lines.append("}")

    return "\n".join(lines) + "\n"

def main():
    """Command-line entry point: print a BibTeX entry for the given URL.

    Parses a single positional ``url`` argument, builds the bibliography
    record for it and writes the BibTeX rendering to standard output.

    NOTE(review): the description text mentions YAML, but only BibTeX output
    is produced here. Confirm whether a YAML mode is still wanted.
    """
    parser = argparse.ArgumentParser(description="A tool for generating BibTeX / YAML formatted bibliography entries from URLs.")

    parser.add_argument("url", type=str, help="URL to generate bibliography for")

    args = parser.parse_args()

    bib = extract_bib(args.url)

    # bib2bibtex() already terminates the entry with a newline.
    sys.stdout.write(bib2bibtex(bib))

if __name__ == "__main__":