~charles/dotfiles

ref: 42ae9b59869483135b3070a0f00deb773c85adb0 dotfiles/utils/url2bib/url2bib.py -rw-r--r-- 2.9 KiB
42ae9b59Charles Daniels add new-reference 3 months ago
                                                                                
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
#!/usr/bin/env python3

import argparse
import datetime
import dateutil
import io
import json
import lxml.html
import re
import requests
import sys
import yaml

# pip3 install --user htmldate
import htmldate

def log(msg):
    """Write *msg* followed by a newline to standard error.

    Standard output is flushed first so that anything already printed
    appears before the diagnostic when both streams share a terminal.
    """
    sys.stdout.flush()
    stream = sys.stderr
    line = "{}\n".format(msg)
    stream.write(line)
    stream.flush()

def extract_title(doc):
    """Return the title of a parsed HTML document, or None if it has none.

    The first non-empty ``<title>`` element wins; if there is none, the
    first non-empty ``<h1>`` is used instead.

    Args:
        doc: Parsed document exposing ``xpath()`` whose result nodes
            provide ``text_content()`` (as lxml.html elements do).

    Returns:
        The title with runs of whitespace collapsed to single spaces, or
        None when neither a usable ``<title>`` nor ``<h1>`` exists.
    """
    # Queries are tried in priority order: <title> first, <h1> as fallback.
    for query in ("//title", "//h1"):
        for node in doc.xpath(query):
            # text_content() also picks up text inside nested markup such as
            # <h1><a>Title</a></h1>, where node.text alone would be None.
            # split()/join strips the newlines and indentation that often
            # surround the text in the HTML source.
            title = " ".join(node.text_content().split())
            if title:
                return title

    return None

def extract_authors(doc):
    """Collect author names from ``<meta name="author">`` elements.

    Returns the ``content`` attribute of every matching element, in
    document order; elements without a ``content`` attribute are skipped.
    """
    author_metas = doc.xpath("//meta[@name = 'author']")
    return [
        meta.attrib["content"]
        for meta in author_metas
        if "content" in meta.attrib
    ]

def extract_bib(url, *, timeout=30):
    """Fetch *url* and build a bibliography record describing the page.

    Args:
        url: Address of the page to cite.
        timeout: Seconds to wait for the server before giving up. Without
            a timeout ``requests.get`` can block indefinitely.

    Returns:
        A dict with the keys ``title``, ``urldate`` (the time of this
        call) and ``url``. ``date`` is added when a publication date could
        be determined, and ``authors`` when at least one author was found.

    Raises:
        requests.exceptions.RequestException: If the page cannot be
            fetched (including when the timeout expires).
    """
    # The module-level "import dateutil" does not by itself guarantee that
    # the parser submodule is loaded; import it explicitly so that
    # dateutil.parser always resolves here.
    import dateutil.parser

    response = requests.get(url, timeout=timeout)
    doc = lxml.html.fromstring(response.text)

    # First guess at the publication date: the HTTP Last-Modified header.
    pubdate = None
    last_modified = response.headers.get("Last-Modified")
    if last_modified is not None:
        try:
            pubdate = dateutil.parser.parse(last_modified)
        except Exception as e:
            log("caught exception while parsing Last-Modified header '{}': {}".format(last_modified, e))
            pubdate = None

    # A date found in the page itself by htmldate takes precedence over the
    # header. Failures here are best-effort: log and keep whatever we have.
    try:
        result = htmldate.core.find_date(doc, url=url)
        # Only parse an actual result; a None result means no date was
        # found, and the Last-Modified guess (if any) is kept.
        if result is not None:
            pubdate = dateutil.parser.parse(result)
    except Exception as e:
        log("caught exception from htmldate: {}".format(e))

    authors = extract_authors(doc)

    bib = {
        "title": extract_title(doc),
        "urldate": datetime.datetime.now(),
        "url": url,
    }

    if pubdate is not None:
        bib["date"] = pubdate

    if authors:
        bib["authors"] = authors

    return bib

def bib2bibtex(bib):
    """Render a bibliography record as a BibTeX ``@online`` entry.

    Args:
        bib: Dict with ``title`` (str or None), ``urldate`` (datetime) and
            ``url`` (str); optionally ``authors`` (list of str) and
            ``date`` (datetime).

    Returns:
        The BibTeX entry as a string ending in a newline. The citation
        key is the last word of the first author's name (or, when there
        is no usable author, the title reduced to ASCII letters and
        digits) followed by the publication year, falling back to the
        access year when no publication date is known.
    """
    # Year used in the key: publication year if known, else access year.
    if "date" in bib:
        keydate = str(bib["date"].year)
    else:
        keydate = str(bib["urldate"].year)

    # The title may be None when the page has neither <title> nor <h1>;
    # treat that as an empty title instead of crashing in re.sub below.
    title = bib.get("title") or ""

    # Prefer the first author's last name for the key. An empty or
    # whitespace-only author string has no words, so fall back to the
    # title rather than indexing into an empty list.
    authors = bib.get("authors") or []
    first_author_words = authors[0].split() if authors else []
    if first_author_words:
        keybase = first_author_words[-1]
    else:
        keybase = re.sub(r'[^0-9a-zA-Z]', "", title)

    key = keybase + keydate

    lines = [
        "@online {{{},\n".format(key),
        "\ttitle = {{{}}},\n".format(title),
        "\turldate = {{{}}},\n".format(bib["urldate"].strftime("%Y-%m-%d")),
        "\turl = {{{}}},\n".format(bib["url"]),
    ]
    if "authors" in bib:
        lines.append("\tauthor = {{{}}},\n".format(" and ".join(bib["authors"])))
    if "date" in bib:
        lines.append("\tdate = {{{}}},\n".format(bib["date"].strftime("%Y-%m-%d")))
        lines.append("\tyear = {{{}}}\n".format(bib["date"].year))
    lines.append("}\n")

    return "".join(lines)

def main():
    """Command-line entry point: print a BibTeX entry for the given URL.

    Parses the single positional ``url`` argument, builds the bibliography
    record for that page and prints its BibTeX rendering to stdout.
    """
    cli = argparse.ArgumentParser(
        description="A tool for generating BibTeX / YAML formatted bibliography entries from URLs.",
    )
    cli.add_argument("url", type=str, help="URL to generate bibliography for")

    target = cli.parse_args().url
    print(bib2bibtex(extract_bib(target)))

# Run the command-line interface only when executed as a script, so that
# importing this module has no side effects.
if __name__ == "__main__":
    main()