~fnux/fedora-pkgs-playground

f75cad632265b786e4022e8276c7c0b87c93601f — Timothée Floure 6 months ago b2308ec master
Fetch and generate pages from Rawhide SQLite DB
7 files changed, 281 insertions(+), 118 deletions(-)

A .gitignore
A README.md
A fetch-repository-dbs.py
A generate-html.py
D generate-html.raku
A templates/index.html.j2
A templates/package.html.j2
A .gitignore => .gitignore +1 -0
@@ 0,0 1,1 @@
output/

A README.md => README.md +5 -0
@@ 0,0 1,5 @@
# fedora-packages-static

* replaces the current Moksha-based Fedora Packages app
* static -> fewer moving parts, less maintenance, simpler; search indexing via an external service
* see mail thread

A fetch-repository-dbs.py => fetch-repository-dbs.py +172 -0
@@ 0,0 1,172 @@
#!/usr/bin/python3

import requests
import xml.etree.ElementTree as ET
import shutil
import tempfile
import sqlite3
import os
import argparse
import hashlib

repomd_xml_namespace = {
    'repo': 'http://linux.duke.edu/metadata/repo',
    'rpm': 'http://linux.duke.edu/metadata/rpm',
}
padding = 22

KOJI_REPO = 'https://kojipkgs.fedoraproject.org/repos'
# Enforce, or not, checking the SSL certs
DL_VERIFY = True

def needs_update(local_file, remote_sha, sha_type):
    ''' Compare sha of a local and remote file.
    Return True if our local file needs to be updated.
    '''

    if not os.path.isfile(local_file):
        # If we have never downloaded this before, then obviously it has
        # "changed"
        return True

    # Old, old epel5 doesn't even know which sha it is using...
    if sha_type == 'sha':
        sha_type = 'sha1'

    # Named 'hasher' to avoid shadowing the hash() builtin.
    hasher = getattr(hashlib, sha_type)()
    with open(local_file, 'rb') as f:
        hasher.update(f.read())

    local_sha = hasher.hexdigest()
    if local_sha != remote_sha:
        return True

    return False
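
# Hypothetical usage of needs_update() (path and digest invented for
# illustration, not taken from this repo):
#   needs_update('repodata/koji-primary.sqlite', 'deadbeef...', 'sha256')
# returns True when the local copy is missing or its digest differs.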

def download_db(name, repomd_url, archive):
    print(f'{name.ljust(padding)} Downloading file: {repomd_url} to {archive}')
    response = requests.get(repomd_url, verify=DL_VERIFY)
    response.raise_for_status()
    with open(archive, 'wb') as stream:
        stream.write(response.content)

def decompress_db(name, archive, location):
    ''' Decompress the given XZ archive at the specified location. '''
    print(f'{name.ljust(padding)} Extracting {archive} to {location}')
    if archive.endswith('.xz'):
        import lzma
        with lzma.open(archive) as inp, open(location, 'wb') as out:
            out.write(inp.read())
    elif archive.endswith('.tar.gz'):
        import tarfile
        with tarfile.open(archive) as tar:
            tar.extractall(path=location)
    elif archive.endswith('.gz'):
        import gzip
        with gzip.open(archive, 'rb') as inp, open(location, 'wb') as out:
            out.write(inp.read())
    elif archive.endswith('.bz2'):
        import bz2
        with bz2.open(archive) as inp, open(location, 'wb') as out:
            out.write(inp.read())
    else:
        raise NotImplementedError(archive)

def index_db(name, tempdb):
    print(f'{name.ljust(padding)} Indexing file: {tempdb}')

    if tempdb.endswith('primary.sqlite'):
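        # Index rpm_sourcerpm, presumably so that generated pages can group
        # binary subpackages by the source rpm they were built from.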
        conn = sqlite3.connect(tempdb)
        conn.execute('CREATE INDEX packageSource ON packages (rpm_sourcerpm)')
        conn.commit()
        conn.close()

def install_db(name, src, dest):
    print(f'{name.ljust(padding)} Installing {src} to {dest}.')
    shutil.move(src, dest)

def handle(repo, target_dir):
    url, name = repo
    repomd_url = f'{url}/repomd.xml'
    response = requests.get(repomd_url, verify=DL_VERIFY)
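    # requests.Response is falsy for 4xx/5xx status codes, so this catches
    # HTTP errors.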
    if not response:
        print(f'{name.ljust(padding)} !! Failed to get {repomd_url!r} {response!r}')
        return

    # Parse the xml doc and get a list of locations and their shasum.
    files = ((
        node.find('repo:location', repomd_xml_namespace),
        node.find('repo:open-checksum', repomd_xml_namespace),
    ) for node in ET.fromstring(response.text))
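
    # For reference, each <data> node in repomd.xml looks roughly like this
    # (values invented for illustration):
    #   <data type="primary_db">
    #     <location href="repodata/abc123-primary.sqlite.xz"/>
    #     <open-checksum type="sha256">abc123...</open-checksum>
    #   </data>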

    # Extract out the attributes that we're really interested in.
    files = (
        (f.attrib['href'].replace('repodata/', ''), s.text, s.attrib['type'])
        for f, s in files if f is not None and s is not None
    )

    # Filter down to only sqlite dbs
    files = ((f, s, t) for f, s, t in files if '.sqlite' in f)

    # Handle the primary db first: sorted() puts False before True, so the
    # key below moves entries containing 'primary' to the front.
    primary_first = lambda item: 'primary' not in item[0]
    files = sorted(files, key=primary_first)

    if not files:
        print(f'No sqlite database could be found in {url}')
        return

    for filename, shasum, shatype in files:
        repomd_url = f'{url}/{filename}'

        # Map the repo filename onto a stable local name for the db.
        if 'primary.sqlite' in filename:
            db = f'{name}-primary.sqlite'
        elif 'filelists.sqlite' in filename:
            db = f'{name}-filelists.sqlite'
        elif 'other.sqlite' in filename:
            db = f'{name}-other.sqlite'
        else:
            # Unrecognized sqlite file; skip it rather than crash on a None
            # path below.
            continue

        # Have we downloaded this before?  Did it change?
        destfile = os.path.join(target_dir, db)
        if not needs_update(destfile, shasum, shatype):
            print(f'{name.ljust(padding)} No change of {repomd_url}')
            continue

        # If it has changed, then download it and move it into place.
        tempargs = dict(prefix='mdapi-', dir='/var/tmp')
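        # /var/tmp rather than the default /tmp, presumably because /tmp is
        # often a size-limited tmpfs on Fedora and the decompressed dbs can
        # be large.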
        with tempfile.TemporaryDirectory(**tempargs) as working_dir:
            tempdb = os.path.join(working_dir, db)
            archive = os.path.join(working_dir, filename)

            download_db(name, repomd_url, archive)
            decompress_db(name, archive, tempdb)
            index_db(name, tempdb)
            install_db(name, tempdb, destfile)

def main():
    # Handle command-line arguments.
    parser = argparse.ArgumentParser(
            description='Fetch SQLite metadata databases of Fedora/EPEL repositories')
    parser.add_argument(
            '--target-dir', dest='target_dir', action='store', required=True,
            help='directory in which the fetched .sqlite files are stored')

    args = parser.parse_args()

    # Define repositories to sync with.
    repositories = []

    repositories.append(
        (f'{KOJI_REPO}/rawhide/latest/x86_64/repodata', 'koji')
    )

    # Fetch databases.
    for repo in repositories:
        handle(repo, args.target_dir)

if __name__ == '__main__':
    main()
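
As a usage sketch (invocation assumed, not documented in the commit): running
python3 fetch-repository-dbs.py --target-dir . drops koji-primary.sqlite (plus
the filelists and other dbs) into the target directory, where generate-html.py
below expects to find the primary db.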

A generate-html.py => generate-html.py +51 -0
@@ 0,0 1,51 @@
#!/usr/bin/python3

import os
import sqlite3
from jinja2 import Environment, FileSystemLoader, select_autoescape

OUTPUT_DIR = 'output/'

def save_to(path, content):
    with open(path, 'w') as fh:
        fh.write(content)

def main():
    db = "koji-primary.sqlite"

    env = Environment(
            loader=PackageLoader('generate-html', 'templates'),
            autoescape=select_autoescape(['html'])
            )
    index = env.get_template('index.html.j2')


    os.makedirs(os.path.join(OUTPUT_DIR, "pkgs"), exist_ok=True)
    save_to(os.path.join(OUTPUT_DIR, 'index.html'), index.render())

    conn = sqlite3.connect('koji-primary.sqlite')
    conn.row_factory = sqlite3.Row
    c = conn.cursor()

    c.execute('SELECT COUNT(*) FROM packages')
    package_count = c.fetchone()[0]
    print("Found {} packages in Rawhide.".format(package_count))

    count = 0
    package_template = env.get_template('package.html.j2')
    for pkg in c.execute('SELECT * FROM packages'):
        html_path = os.path.join(OUTPUT_DIR, 'pkgs', pkg["name"] + ".html")
        html_content = package_template.render(
                name=pkg["name"],
                summary=pkg["summary"],
                description=pkg["description"],
                )
        save_to(html_path, html_content)
        count += 1

        if count % 100 == 0:
            print("Processed {}/{} packages.".format(count, package_count))


if __name__ == '__main__':
    main()
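
For context, the name, summary and description fields rendered above are
standard columns of the packages table in primary.sqlite. A minimal sketch to
inspect the full schema (assumes a fetched koji-primary.sqlite in the working
directory):

    import sqlite3
    conn = sqlite3.connect('koji-primary.sqlite')
    for row in conn.execute('PRAGMA table_info(packages)'):
        print(row[1])  # column name: name, summary, description, url, ...
    conn.close()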

D generate-html.raku => generate-html.raku +0 -118
@@ 1,118 0,0 @@
#!/usr/bin/raku
#
# Fetch (basic) package metadata from various sources and generate static
# HTML pages from it.
#
# Complex data such as requires & friends could be either fetched here or
# asynchronously from the client's browser.
#
# What do we extract, and where:
#
# PDC: store every package name in Fedora, old and new.
#   - Name
#   - Dist-git URL
# MDAPI: interface to repositories.
#   - Summary
#   - Description
#   - Upstream URL
#
# Bugzilla and Bodhi URLs can be generated from the PDC name. Where is the
# license? It does not show up by default in MDAPI requests...

use Cro::HTTP::Client;

class Package {
	has $.name;
	has $.dist_git_url is rw;
	has $.summary is rw;
	has $.description is rw;
	has $.upstream is rw;
}

sub generate_index(@pkgs) {
	my $header = q:to/END/;
	<!DOCTYPE html>
	<html>
	<body>
	<h1>Fedora Package Index</h1>
	<ul>
	END

	my @links;
	for @pkgs -> $pkg {
		@links.push("<li><a href=\"{$pkg.name}.html\">{$pkg.name}</a></li>");
	}

	my $footer = q:to/END/;
	</ul>
	</body>
	</html>
	END

	return $header ~ join("\n", @links) ~ $footer;
}

sub generate_package_page($pkg) {
	my $header = q:to/END/;
	<!DOCTYPE html>
	<html>
	<body>
	END

	my @lines = [
		"<h1>Package: {$pkg.name}</h1>",
		"<ul>",
		"<li>{$pkg.summary}</li>",
		"<li><a href=\"{$pkg.upstream}\">{$pkg.upstream}</a></li>",
		"<li><a href=\"{$pkg.dist_git_url}\">{$pkg.dist_git_url}</a></li>",
		"</ul>",
		"<p>{$pkg.description}</p>"
	];

	my $footer = q:to/END/;
	</body>
	</html>
	END

	return $header ~ join("\n", @lines) ~ $footer;
}

my @pkgs;
my $client = Cro::HTTP::Client.new(
	headers => [
		User-agent => 'Cro'
	]);

print "Fetching packages from PDC...";

my $pdc_resp = await $client.get('https://pdc.fedoraproject.org/rest_api/v1/global-components/');
my $pdc_body = await $pdc_resp.body;

for $pdc_body<results> -> @entries {
	for @entries -> $entry {
		@pkgs.push(Package.new(name => $entry<name>, dist_git_url => $entry<dist_git_web_url>));
	}
}

say " extracted {@pkgs.elems} names.";

for @pkgs -> $pkg {
	my $body;
	try {
		say "Fetching {$pkg.name} metadata from mdapi...";
		my $resp = await $client.get("https://mdapi.fedoraproject.org/rawhide/pkg/{$pkg.name}");
		$body = await $resp.body;

		$pkg.summary = $body<summary>;
		$pkg.description = $body<description>;
		$pkg.upstream = $body<url>;
	}
}

say "Generating HTML...";

mkdir "html";
spurt "html/index.html", generate_index(@pkgs);
for @pkgs -> $pkg {
	spurt "html/{$pkg.name}.html", generate_package_page($pkg);
}

A templates/index.html.j2 => templates/index.html.j2 +30 -0
@@ 0,0 1,30 @@
<!DOCTYPE html>
<html>
	<head>
		<title>Fedora packages</title>
	</head>
	<body>

		<h1>Fedora packages</h1>
		<form action="http://yacysearchserver-pkgs-playground.apps.os.fedorainfracloud.org/yacysearch.html" method="get">
			<input type="search" placeholder="Search" name="query" aria-label="Search">
			<button type="submit">Search</button>
		</form>

		<h2>Useful links</h2>
		<ul>
			<li>Bodhi</li>
			<li>Koji</li>
			<li>Bugzilla</li>
			<li>Dist-git</li>
		</ul>

		<h2>Stats</h2>
		<ul>
			<li>Generated on: ???</li>
			<li>Package count: ???</li>
			<li>Releases: ???</li>

		</ul>
	</body>
</html>

A templates/package.html.j2 => templates/package.html.j2 +22 -0
@@ 0,0 1,22 @@
<!DOCTYPE html>
<html>
	<head>
		<title>Package: {{name}}</title>
	</head>
	<body>
		<h1>Package: {{name}}</h1>

		<ul>
			<li>Summary: {{summary}}</li>
			<li>Description: {{description}}</li>
			<li>Maintainer: </li>
			<li>Upstream: </li>
			<li>License: </li>
		</ul>

		<h2>Useful links</h2>
		<ul>
			<li>Bodhi</li>
		</ul>
	</body>
</html>