~amirouche/babylia

73c44e71fb34a204051ccb4cdb1c32c45b7bac41 — Amirouche 8 months ago
babylia-hn: initial version.
1 files changed, 97 insertions(+), 0 deletions(-)

A babylia-hn.py
A  => babylia-hn.py +97 -0
@@ 1,97 @@
#!/usr/bin/env python3
"""https://stackoverflow.com/a/40548567/140837"""
import sys
import zlib
from urllib.parse import urlparse
from mmap import PAGESIZE
import json

CHUNKSIZE = PAGESIZE


# This is a generator that yields *decompressed* chunks from
# a gzip file. This is also called a stream or lazy list.
# It's done this way to avoid holding the whole file in memory.
# Read more about Python generators to understand how it works
# (cf. the `yield` keyword). A small sanity check sketch appears
# after the function.
def gzip_to_chunks(filename):
    decompressor = zlib.decompressobj(zlib.MAX_WBITS + 16)
    with open(filename, 'rb') as f:
        chunk = f.read(CHUNKSIZE)

        while chunk:
            out = decompressor.decompress(chunk)
            yield out
            chunk = f.read(CHUNKSIZE)

        out = decompressor.flush()

        yield out
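
# Minimal sanity check sketch (the path 'sample.json.gz' below is
# hypothetical): write a tiny gzip file with the stdlib gzip module and
# verify that concatenating the yielded chunks gives back the original
# bytes.
#
#     import gzip
#     payload = b'{"id": 1}\n{"id": 2}\n'
#     with open('sample.json.gz', 'wb') as f:
#         f.write(gzip.compress(payload))
#     assert b''.join(gzip_to_chunks('sample.json.gz')) == payload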


# Again, the following is a generator (see the `yield` keyword).
# What it does is iterate over an *iterable* of byte strings and
# yield the lines they contain.

# (hint: `gzip_to_chunks(filename)` returns a generator of byte strings)
# (hint: a generator is also an iterable)

# You can verify that by calling `chunks_to_lines` with a plain list of
# byte strings, where every byte string is a chunk of the file
# (hint: a list is also an iterable). See the sketch after the function.
def chunks_to_lines(chunks):
    row = b''  # bytes of the current line accumulate in this variable
    for chunk in chunks:  # iterate over the byte chunks yielded by gzip_to_chunks
        for char in chunk:  # iterate over every byte of the chunk
            if char == b'\n'[0]:  # hey! this is the end of the line!
                yield row.decode('utf-8')
                row = b''  # start a new line
            else:
                row += bytes([char])  # otherwise we are in the middle of the line
        # at this point the whole chunk has been read
    # at this point the whole file has been read without ever loading it fully in memory at once
    # That said, there may still be a partial line left in `row`
    if row:
        yield row.decode('utf-8')
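
# A quick illustration of the hint above (made-up chunks): feeding
# `chunks_to_lines` a plain list of byte strings yields the decoded
# lines, even when a line is split across two chunks.
#
#     assert list(chunks_to_lines([b'{"a": 1}\n{"b"', b': 2}\n'])) == ['{"a": 1}', '{"b": 2}']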


def lines_to_json(lines):
    for line in lines:
        yield json.loads(line)


def read(filename):
    return lines_to_json(chunks_to_lines(gzip_to_chunks(filename)))
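
# The expected input (an assumption based on the parsing above) is a gzip
# file containing one JSON object per line, e.g. a line such as:
#     {"type": "story", "url": "https://example.com/post", "score": 42}
# read('stories.json.gz') then yields such dicts one by one
# ('stories.json.gz' is a hypothetical filename).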


counter = dict()
score_sum = 0
total_count = 0

for item in read(sys.argv[1]):
    if item["type"] != "story" or item.get('url') is None:
        continue
    url = item['url']
    url = urlparse(url)
    domain = url.netloc
    if not domain or domain.isspace():  # skip items whose URL has no usable domain
        continue
    scheme = url.scheme
    url = scheme + "://" + domain

    try:
        count = counter[url]
    except KeyError:
        counter[url] = count = dict(score=0, count=0)

    count["score"] += item["score"]
    score_sum += item["score"]
    count["count"] += 1
    total_count += 1
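
# At this point `counter` maps "scheme://domain" to running totals,
# e.g. (with made-up numbers):
#     {'https://example.com': {'score': 30, 'count': 2}, ...}
# so the loop below prints each domain with its average story score
# (30 / 2 = 15.0 in this example), sorted from lowest to highest average.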


for url, v in sorted(counter.items(), key=lambda x: x[1]["score"] / x[1]["count"]):
    print(url, v["score"] / v["count"])