f75cad632265b786e4022e8276c7c0b87c93601f — Timothée Floure 6 months ago b2308ec master
Fetch and generate pages from Rawhide SQLite DB
7 files changed, 281 insertions(+), 118 deletions(-)

A .gitignore
A fetch-repository-dbs.py
A generate-html.py
D generate-html.raku
A templates/index.html.j2
A templates/package.html.j2
A .gitignore => .gitignore +1 -0
@@ 0,0 1,1 @@

A README.md => README.md +5 -0
@@ 0,0 1,5 @@
# fedora-packages-static

* replaces current moshka thingy
* static -> fewer moving parts, less maintenance, simpler; indexing via an external search engine
* see mail thread

A fetch-repository-dbs.py => fetch-repository-dbs.py +172 -0
@@ 0,0 1,172 @@

import requests
import xml.etree.ElementTree as ET
import shutil
import tempfile
import sqlite3
import os
import argparse
import hashlib

# XML namespaces needed to look up elements of repomd.xml with ElementTree.
repomd_xml_namespace = {
    'repo': 'http://linux.duke.edu/metadata/repo',
    'rpm': 'http://linux.duke.edu/metadata/rpm',
}

# Column width used to left-justify the repository name in log lines.
padding = 22

# Base URL of the repositories published by Koji.
KOJI_REPO = 'https://kojipkgs.fedoraproject.org/repos'
# Enforce, or not, checking the SSL certs
DL_VERIFY = True

def needs_update(local_file, remote_sha, sha_type):
    ''' Compare sha of a local and remote file.
    Return True if our local file needs to be updated.

    :param local_file: path of the local copy of the file.
    :param remote_sha: hexadecimal digest advertised for the remote file.
    :param sha_type: name of the hashlib algorithm used for remote_sha;
        the legacy value 'sha' is treated as 'sha1'.
    :return: True when the local file is missing or its digest differs
        from remote_sha, False otherwise.
    '''
    if not os.path.isfile(local_file):
        # If we have never downloaded this before, then obviously it has
        # "changed"
        return True

    # Old old epel5 doesn't even know which sha it is using..
    if sha_type == 'sha':
        sha_type = 'sha1'

    digest = getattr(hashlib, sha_type)()
    with open(local_file, 'rb') as f:
        # Feed the file in chunks so large databases are not loaded into
        # memory in one go.
        for chunk in iter(lambda: f.read(1024 * 1024), b''):
            digest.update(chunk)

    return digest.hexdigest() != remote_sha

def download_db(name, repomd_url, archive):
    ''' Download the file at repomd_url and store it at the archive path.

    :param name: repository name, only used to prefix the log line.
    :param repomd_url: URL of the file to fetch.
    :param archive: local path the response body is written to.
    '''
    print(f'{name.ljust(padding)} Downloading file: {repomd_url} to {archive}')
    response = requests.get(repomd_url, verify=DL_VERIFY)
    with open(archive, 'wb') as stream:
        stream.write(response.content)

def decompress_db(name, archive, location):
    ''' Decompress the given archive at the specified location.

    The format is chosen from the archive's extension: .xz, .tar.gz, .gz
    and .bz2 are supported.

    :param name: repository name, only used to prefix the log line.
    :param archive: path of the compressed file.
    :param location: path the decompressed content is written to (for
        .tar.gz archives, the directory the members are extracted into).
    :raises NotImplementedError: if the extension is not supported.
    '''
    print(f'{name.ljust(padding)} Extracting {archive} to {location}')
    if archive.endswith('.xz'):
        import lzma
        with lzma.open(archive) as inp, open(location, 'wb') as out:
            shutil.copyfileobj(inp, out)
    elif archive.endswith('.tar.gz'):
        # Must be tested before the plain '.gz' case below.
        import tarfile
        with tarfile.open(archive) as tar:
            # NOTE(review): extractall() trusts member paths in the archive;
            # consider an extraction filter if the source is not trusted.
            tar.extractall(path=location)
    elif archive.endswith('.gz'):
        import gzip
        with gzip.open(archive, 'rb') as inp, open(location, 'wb') as out:
            shutil.copyfileobj(inp, out)
    elif archive.endswith('.bz2'):
        import bz2
        with bz2.open(archive) as inp, open(location, 'wb') as out:
            shutil.copyfileobj(inp, out)
    else:
        raise NotImplementedError(archive)

def index_db(name, tempdb):
    ''' Add an index on packages.rpm_sourcerpm to a primary database.

    Files whose name does not end with 'primary.sqlite' are left untouched.

    :param name: repository name, only used to prefix the log line.
    :param tempdb: path of the sqlite database to index.
    '''
    print(f'{name.ljust(padding)} Indexing file: {tempdb}')

    if not tempdb.endswith('primary.sqlite'):
        return

    conn = sqlite3.connect(tempdb)
    try:
        conn.execute('CREATE INDEX packageSource ON packages (rpm_sourcerpm)')
        conn.commit()
    finally:
        # Always release the file handle: the database is moved right after.
        conn.close()

def install_db(name, src, dest):
    ''' Move the database at src to its final path, dest.

    name is the repository name and is only used to prefix the log line.
    '''
    label = name.ljust(padding)
    print(f'{label} Installing {src} to {dest}.')
    shutil.move(src, dest)

def handle(repo, target_dir):
    ''' Synchronise the sqlite databases of one repository into target_dir.

    Fetches <url>/repomd.xml, lists the sqlite databases it references and,
    for each one whose local copy is missing or has a different checksum,
    downloads, decompresses, indexes and installs it as
    <target_dir>/<name>-{primary,filelists,other}.sqlite.

    :param repo: (url, name) tuple; url is the repodata base URL and name
        the prefix used for log lines and installed file names.
    :param target_dir: directory the databases are installed into.
    '''
    url, name = repo
    repomd_url = f'{url}/repomd.xml'
    response = requests.get(repomd_url, verify=DL_VERIFY)
    if not response:
        print(f'{name.ljust(padding)} !! Failed to get {repomd_url!r} {response!r}')
        return

    # Parse the xml doc and get a list of locations and their shasum.
    # The 'open-checksum' is compared against the installed (decompressed)
    # database, so it is presumably the checksum of the uncompressed file.
    files = ((
        node.find('repo:location', repomd_xml_namespace),
        node.find('repo:open-checksum', repomd_xml_namespace),
    ) for node in ET.fromstring(response.text))

    # Extract out the attributes that we're really interested in.
    files = (
        (f.attrib['href'].replace('repodata/', ''), s.text, s.attrib['type'])
        for f, s in files if f is not None and s is not None
    )

    # Filter down to only sqlite dbs
    files = ((f, s, t) for f, s, t in files if '.sqlite' in f)

    # Process the primary db first (False sorts before True).
    files = sorted(files, key=lambda item: 'primary' not in item[0])

    if not files:
        print(f'No sqlite database could be found in {url}')
        return

    for filename, shasum, shatype in files:
        repomd_url = f'{url}/{filename}'

        # Map the remote file name onto the local database name.
        db = None
        if 'primary.sqlite' in filename:
            db = f'{name}-primary.sqlite'
        elif 'filelists.sqlite' in filename:
            db = f'{name}-filelists.sqlite'
        elif 'other.sqlite' in filename:
            db = f'{name}-other.sqlite'

        if db is None:
            # Unknown kind of sqlite file: there is no local name for it.
            print(f'{name.ljust(padding)} Skipping unrecognised {repomd_url}')
            continue

        # Have we downloaded this before?  Did it change?
        destfile = os.path.join(target_dir, db)
        if not needs_update(destfile, shasum, shatype):
            print(f'{name.ljust(padding)} No change of {repomd_url}')
            continue

        # If it has changed, then download it and move it into place.
        tempargs = dict(prefix='mdapi-', dir='/var/tmp')
        with tempfile.TemporaryDirectory(**tempargs) as working_dir:
            tempdb = os.path.join(working_dir, db)
            archive = os.path.join(working_dir, filename)

            download_db(name, repomd_url, archive)
            decompress_db(name, archive, tempdb)
            index_db(name, tempdb)
            install_db(name, tempdb, destfile)

def main():
    ''' Parse the command line and fetch the databases of every repository.

    Requires --target-dir, the directory the databases are installed into.
    '''
    # Handle command-line arguments.
    parser = argparse.ArgumentParser(
            description='Fetch SQL metadata databases of Fedora/EPEL repositories')
    parser.add_argument(
            '--target-dir', dest='target_dir', action='store', required=True)

    args = parser.parse_args()

    # Define repositories to sync with, as (repodata URL, name) tuples.
    repositories = [
        (f'{KOJI_REPO}/rawhide/latest/x86_64/repodata', 'koji'),
    ]

    # Fetch databases.
    for repo in repositories:
        handle(repo, args.target_dir)

# Run the fetcher only when executed as a script, not when imported.
if __name__ == '__main__':
    main()

A generate-html.py => generate-html.py +51 -0
@@ 0,0 1,51 @@

import os
import sqlite3
from jinja2 import Environment, PackageLoader, select_autoescape


def save_to(path, content):
    ''' Write the text in content to the file at path.

    An existing file is overwritten. The file is encoded as UTF-8 so the
    output does not depend on the locale of the machine generating it.
    '''
    with open(path, 'w', encoding='utf-8') as fh:
        fh.write(content)

def main():
    ''' Render the index page and one HTML page per package.

    Packages are read from the 'packages' table of koji-primary.sqlite in
    the current directory. Output goes to OUTPUT_DIR/index.html and
    OUTPUT_DIR/pkgs/<name>.html.
    '''
    db = "koji-primary.sqlite"

    env = Environment(
            loader=PackageLoader('generate-html', 'templates'),
            # The templates are named '*.html.j2', so 'j2' must be listed
            # for autoescaping to apply to them.
            autoescape=select_autoescape(['html', 'xml', 'j2'])
    )
    index = env.get_template('index.html.j2')
    # Looked up once: the same template is used for every package.
    package_template = env.get_template('package.html.j2')

    # NOTE(review): OUTPUT_DIR is expected to be a module-level constant;
    # its definition is not visible in this part of the file -- confirm.
    os.makedirs(os.path.join(OUTPUT_DIR, "pkgs"), exist_ok=True)
    save_to(os.path.join(OUTPUT_DIR, 'index.html'), index.render())

    conn = sqlite3.connect(db)
    try:
        # Rows are accessed by column name below.
        conn.row_factory = sqlite3.Row
        c = conn.cursor()

        c.execute('SELECT COUNT(*) FROM packages')
        package_count = c.fetchone()[0]
        print("Found {} packages in Rawhide.".format(package_count))

        for count, pkg in enumerate(c.execute('SELECT * FROM packages'), start=1):
            html_path = os.path.join(OUTPUT_DIR, 'pkgs', pkg["name"] + ".html")
            # Variables match the placeholders used by package.html.j2;
            # assumes the table has 'summary' and 'description' columns.
            html_content = package_template.render(
                    name=pkg["name"],
                    summary=pkg["summary"],
                    description=pkg["description"])
            save_to(html_path, html_content)

            if count % 100 == 0:
                print("Processed {}/{} packages.".format(count, package_count))
    finally:
        conn.close()

# Generate the pages only when executed as a script, not when imported.
if __name__ == '__main__':
    main()

D generate-html.raku => generate-html.raku +0 -118
@@ 1,118 0,0 @@
# Fetch (basic) package metadata from various sources and store in JSON file.
# This data will be used to generate static pages.
# Complex data such as requires & friends could be either fetched here or
# asynchronously from the client's browser.
# What do we extract, and where:
# PDC: store every package name in Fedora, old and new.
#   - Name
#   - Dist-git URL
# MDAPI: interface to repositories.
#   - Summary
#   - Description
#   - Upstream URL
# Bugzilla and Bodhi URLs can be generated from the PDC name. Where is the
# license? It does not show up by default in MDAPI requests...

use Cro::HTTP::Client;

# Record describing one package. The script below sets name and
# dist_git_url from the PDC response, and summary, description and upstream
# from the mdapi response.
class Package {
	# Package name; the only attribute not declared 'is rw'.
	has $.name;
	has $.dist_git_url is rw;
	has $.summary is rw;
	has $.description is rw;
	has $.upstream is rw;

# Build the HTML of the index page: a header, one <li> link per package
# pointing at "<name>.html", and a footer. The links are joined with
# newlines and the result is returned as a single string.
sub generate_index(@pkgs) {
	my $header = q:to/END/;
	<!DOCTYPE html>
	<h1>Fedora Package Index</h1>

	my @links;
	for @pkgs -> $pkg {
		@links.push("<li><a href=\"{$pkg.name}.html\">{$pkg.name}</a></li>");

	my $footer = q:to/END/;

	return $header ~ join("\n", @links) ~ $footer;

# Build the HTML page of a single package: a header, a title with the
# package name, links to its upstream and dist-git URLs, and a footer.
# The lines are joined with newlines and returned as a single string.
sub generate_package_page($pkg) {
	my $header = q:to/END/;
	<!DOCTYPE html>

	my @lines = [
		"<h1>Package: {$pkg.name}</h1>",
		"<li><a href=\"{$pkg.upstream}\">{$pkg.upstream}</a></li>",
		"<li><a href=\"{$pkg.dist_git_url}\">{$pkg.dist_git_url}</a></li>",

	my $footer = q:to/END/;

	return $header ~ join("\n", @lines) ~ $footer;

# Script body: collect package names from PDC, enrich them with mdapi
# metadata, then write the static pages under html/.
my @pkgs;
# NOTE(review): $client is built with a User-agent header, but the requests
# below call Cro::HTTP::Client.get directly and do not use it.
my $client = Cro::HTTP::Client.new(
	headers => [
		User-agent => 'Cro'

print "Fetching packages from PDC...";

# NOTE(review): only this single response is read; if the endpoint
# paginates, further pages are not fetched -- confirm.
my $pdc_resp = await Cro::HTTP::Client.get('https://pdc.fedoraproject.org/rest_api/v1/global-components/');
my $pdc_body = await $pdc_resp.body;

for $pdc_body<results> -> @entries {
	for @entries -> $entry {
		@pkgs.push(Package.new(name => $entry<name>, dist_git_url => $entry<dist_git_web_url>));

say " extracted {@pkgs.elems} names.";

# One mdapi request per package, issued sequentially.
for @pkgs -> $pkg {
		my $body;
		try {
			say "Fetching {$pkg.name} metadata from mdapi...";
			my $resp = await Cro::HTTP::Client.get("https://mdapi.fedoraproject.org/rawhide/pkg/{$pkg.name}");
			$body = await $resp.body;

			$pkg.summary = $body<summary>;
			$pkg.description = $body<description>;
			$pkg.upstream = $body<url>;

say "Generating HTML...";

# The index and every package page are written side by side in html/.
mkdir "html";
spurt "html/index.html", generate_index(@pkgs);
for @pkgs -> $pkg {
	spurt "html/{$pkg.name}.html", generate_package_page($pkg);

A templates/index.html.j2 => templates/index.html.j2 +30 -0
@@ 0,0 1,30 @@
<!DOCTYPE html>
		<title>Fedora packages</title>

		<h1>Fedora packages</h1>
		<form action="http://yacysearchserver-pkgs-playground.apps.os.fedorainfracloud.org/yacysearch.html" method="get">
			<input type="search" placeholder="Search" name="query" aria-label="Search">
			<button  type="submit">Search</button>
		</form>

		<h2>Useful links</h2>

		<ul>
			<li>Generated on: ???</li>
			<li>Package count: ???</li>
			<li>Releases: ???</li>
		</ul>


A templates/package.html.j2 => templates/package.html.j2 +22 -0
@@ 0,0 1,22 @@
<!DOCTYPE html>
		<title>Package: {{name}}</title>
		<h1>Package: {{name}}</h1>

		<ul>
			<li>Summary: {{summary}}</li>
			<li>description: {{description}}</li>
			<li>maintainer: </li>
			<li>upstream: </li>
			<li>license: </li>
		</ul>

		<h2>Useful links</h2>