~sircmpwn/hnstats

23c53b246658a5dc5dfe486cf7406e16ea14a4a9 — Drew DeVault 6 years ago 7596422
Improve cleanup script, fix status flipping
2 files changed, 21 insertions(+), 5 deletions(-)

M clean.py
M update.py
M clean.py => clean.py +17 -5
@@ 4,13 4,25 @@ from .update import db, ArticleSample

# Cleanup pass over stored article samples. Everything below runs at module
# level, in order, and issues bulk deletes against ArticleSample.
# NOTE(review): no session commit is visible in this section -- confirm the
# deletes are committed elsewhere.

# First cutoff: 15 days before now (naive UTC timestamp).
cutoff = datetime.utcnow() - timedelta(days=15)

# Bulk-delete samples whose rank is greater than 50 and whose sample_time is
# earlier than the 15-day cutoff.
ArticleSample.query\
    .filter(ArticleSample.rank > 50)\
    .filter(ArticleSample.sample_time < cutoff)\
    .delete()
# Load every article submitted before the 15-day cutoff.
# NOTE(review): the visible import context names only `db` and
# `ArticleSample`; confirm `Article` is imported in this module.
articles = (Article.query
        .filter(Article.submitted < cutoff)
        .all())

# `cutoff` is rebound here: from this point on it means 90 days before now.
cutoff = datetime.utcnow() - timedelta(days=90)
for article in articles:
    # Lowest rank value among this article's samples, ignoring samples whose
    # rank is -1 (presumably "not ranked" -- confirm). The appended 500 keeps
    # min() from failing on an empty list and serves as the fallback value.
    min_rank = min([s.rank for s in article.samples if s.rank != -1] + [500])
    # Stories that have ever hit the front page are considered notable
    # NOTE(review): the test is `< 30` while the filter below is `> 30`, so an
    # article whose best rank is exactly 30 is not skipped, and its rank-30
    # samples are not deleted -- confirm the boundary is intended.
    if min_rank < 30:
        continue
    # Select this article's samples with rank greater than 30 that were taken
    # before the 90-day cutoff.
    samples = (ArticleSample.query
        .filter(ArticleSample.article_id == article.id)
        .filter(ArticleSample.rank > 30)
        .filter(ArticleSample.sample_time < cutoff))
    # count() runs the query once for the log line; delete() then removes the
    # matching rows.
    print("Deleting {} samples for article {}".format(
        samples.count(), article.id))
    samples.delete()

# Delete all samples >90 days
# NOTE(review): this unconditional delete also matches the rows selected by
# the loop above, and it applies to the articles that loop skips as notable
# -- confirm that is intended.
cutoff = datetime.utcnow() - timedelta(days=90)
# NOTE(review): `records` receives the return value of delete() but is not
# used in the visible code.
records = ArticleSample.query\
    .filter(ArticleSample.sample_time < cutoff)\
    .delete()

M update.py => update.py +4 -0
@@ 243,6 243,10 @@ def sample(post):
    else:
        new_status = ArticleStatus.alive
    sample_time = datetime.utcnow()
    if article.status != ArticleStatus.alive and new_status != ArticleStatus.alive:
        # We can't consistently determine the status change in these cases
        # So let's not pollute the logs
        new_status = article.status
    if new_status != article.status:
        print("Article status changed")
        article.updated = datetime.utcnow()