~kvakil/sf-legistar-scrape

6f517e59722f4448ef09051932c40d010cd094cb — Keyhan Vakil 1 year, 5 months ago
initial commit
7 files changed, 82 insertions(+), 0 deletions(-)

A .gitignore
A clean.py
A get_entities.py
A legistar.db
A make_sqlite.sh
A scrape.py
A templates/table-legistar-legistar.html
A  => .gitignore +2 -0
@@ 1,2 @@
.ipynb_checkpoints/
venv/

A  => clean.py +25 -0
@@ 1,25 @@
import sys
import json

def convert_date_to_europe(date):
    """Reformat a ``M/D/Y`` date string as zero-padded ``YYYY-MM-DD``.

    Falsy input (``None`` or an empty string) is handed back unchanged.
    Raises ``ValueError`` when the string does not split on ``/`` into
    exactly three integer fields.
    """
    if date:
        month, day, year = date.split('/')
        # Convert in year, month, day order so the output sorts chronologically.
        return '{:04d}-{:02d}-{:02d}'.format(int(year), int(month), int(day))
    return date

# Each non-blank line on stdin is a JSON array of raw result rows; one cleaned
# JSON object per row is written to stdout (newline-delimited JSON).
# Iterating sys.stdin directly streams the input line by line instead of
# loading the entire scrape output into memory first.
for line in sys.stdin:
    if not line.strip():
        # Tolerate blank lines (e.g. a stray trailing newline) rather than
        # crashing inside json.loads.
        continue
    for row in json.loads(line):
        new_row = {
            # The incoming keys contain non-breaking spaces (U+00A0), not
            # ordinary spaces; they are renamed to plain identifiers here.
            'FileNumber': row['File\u00a0#'],
            'Type': row['Type'],
            'Status': row['Status'],
            'Introduced': convert_date_to_europe(row['Introduced']),
            'FinalAction': convert_date_to_europe(row['Final\u00a0Action']),
            'Title': row['Title'],
            'url': row['url'],
        }
        print(json.dumps(new_row))

A  => get_entities.py +10 -0
@@ 1,10 @@
import json
import sys
import spacy
# Load the spaCy pipeline once, up front; it is reused for every input line.
nlp = spacy.load('en_core_web_trf')

# Each stdin line is one JSON object; every entity found in its 'Title' is
# printed as its own JSON line, tagged with the originating 'FileNumber'.
for raw_record in sys.stdin:
    record = json.loads(raw_record)
    bill_id = record['FileNumber']
    parsed_title = nlp(record['Title'])
    for ent in parsed_title.ents:
        entity_row = {"FileNumber": bill_id, "Label": ent.label_, "Text": ent.text}
        print(json.dumps(entity_row))

A  => legistar.db +0 -0
A  => make_sqlite.sh +11 -0
@@ 1,11 @@
#!/bin/bash
# Rebuild the SQLite database from scratch: scrape, clean, load, index.
set -Eeuox pipefail

db=legistar.db
table=legistar

# Scrape the raw results, then turn them into newline-delimited JSON rows.
python3 scrape.py > results.json
python3 clean.py < results.json > cleaned.json

# Remove any previous database so the insert starts from an empty file.
rm -f "$db"
sqlite-utils insert --pk FileNumber --nl --alter --analyze "$db" "$table" cleaned.json

# One index per column, created in the same order as before.
for column in Type Status Introduced FinalAction; do
    sqlite-utils create-index "$db" "$table" "$column"
done

# Enable full-text search on the Title column.
sqlite-utils enable-fts "$db" "$table" Title

A  => scrape.py +15 -0
@@ 1,15 @@
import urllib3
urllib3.disable_warnings()

from legistar.bills import LegistarBillScraper
from json import dumps

def scrape_bills():
    """Scrape sfgov.legistar.com legislation and print the results to stdout.

    Points a ``LegistarBillScraper`` at the sfgov Legistar site and, for every
    item yielded by ``searchLegislation()`` (presumably one page of search
    results -- confirm against the legistar package), prints the parsed rows
    as a single JSON array on its own line.
    """
    site_root = 'https://sfgov.legistar.com/'
    scraper = LegistarBillScraper()
    scraper.BASE_URL = site_root
    scraper.LEGISLATION_URL = site_root + 'Legislation.aspx'
    for results_page in scraper.searchLegislation():
        parsed_rows = list(scraper.parseSearchResults(results_page))
        print(dumps(parsed_rows))


# Kick off the scrape whenever this file is executed.
scrape_bills()

A  => templates/table-legistar-legistar.html +19 -0
@@ 1,19 @@
{% extends "default:table.html" %}

{% block content %}
{#
  Overrides the "content" block of default:table.html: a full-viewport-width
  container holding an embedded Vega-Lite v5 spec is emitted first, followed
  by the parent template's original content via super().

  The spec describes a bar chart: x is the "Introduced" field bucketed by
  year, y is the count of records in each bucket.

  NOTE(review): the spec has no "data" entry and nothing in this template
  loads a Vega-Lite runtime; presumably whatever handles the
  application/vnd.vegalite+json script type supplies both. Confirm.
#}
<div style="width: 100vw">
<script type="application/vnd.vegalite+json">
{
    "$schema": "https://vega.github.io/schema/vega-lite/v5.json",
    "width": "container",
    "description": "introduced by date",
    "mark": "bar",
    "encoding": {
        "x": {"timeUnit": "year", "field": "Introduced", "type": "temporal"},
        "y": {"aggregate": "count"}
    }
}
</script>
</div>
{{ super() }}
{% endblock %}