~nova/MAGScrape

6e30052494dc61732ef9332ee6c502dcaeef90d9 — Novalinium 1 year, 2 months ago master
Magnus Archives Statement Scraper
2 files changed, 21 insertions(+), 0 deletions(-)

A cut.pl
A run.sh
A  => cut.pl +13 -0
@@ 1,13 @@
#!/bin/perl
use strict;
use warnings;

# Turn grouped search results into shell commands that cut each statement
# out of its transcript.
#
# Input (STDIN or files named on the command line) is a sequence of records:
#
#     <transcript file name>
#     <line>:<header match text>
#     <line>:<begin match text>
#     <line>:<end match text>
#     <separator line; may be absent after the last record>
#
# For every record one line is printed:
#
#     sed -n '<header line>,<end line>p' '<file>' | tee '<file>.statement'
#
# The output is meant to be piped into sh. Empty input prints nothing;
# a malformed or truncated record aborts with the offending input position
# instead of emitting a nonsensical sed range.

# Quote $word so a POSIX shell treats it as one literal argument.
sub shell_quote {
    my ($word) = @_;
    $word =~ s/'/'\\''/g;
    return "'$word'";
}

# Return the number that prefixes a "LINE:text" match line. Dies when the
# line is missing (input ended mid-record) or does not start with digits
# followed by a colon; $label names the expected match in the message.
sub leading_line_number {
    my ($match, $label) = @_;
    if (defined $match && $match =~ /\A(\d+):/) {
        return $1;
    }
    die "cut.pl: expected 'LINE:text' for statement $label near input line $.\n";
}

# Build the pipeline that copies lines $first..$last of $fn into
# "$fn.statement"; tee also echoes the extracted lines to stdout.
sub statement_command {
    my ($fn, $first, $last) = @_;
    return sprintf "sed -n '%s,%sp' %s | tee %s",
        $first, $last, shell_quote($fn), shell_quote("$fn.statement");
}

while (defined(my $fn = <>)) {
    chomp $fn;
    next if $fn !~ /\S/;    # tolerate stray blank lines between records

    # Read each line into a scalar first: <> inside an argument list would
    # be evaluated in list context and swallow the rest of the input.
    my $header_line = <>;
    my $begin_line  = <>;
    my $end_line    = <>;

    my $first = leading_line_number($header_line, 'header');
    # The begin position is not used for cutting, but checking it catches
    # records that have too few matches before a bad command is emitted.
    leading_line_number($begin_line, 'begin');
    my $last = leading_line_number($end_line, 'end');

    print statement_command($fn, $first, $last), "\n";

    # Skip the separator; its absence means the input is exhausted.
    my $separator = <>;
    last if !defined $separator;
}

A  => run.sh +8 -0
@@ 1,8 @@
#!/bin/sh
# Highest episode number to fetch; defaults to 160, override by exporting
# MAXEPISODE before running this script.
MAXEPISODE=${MAXEPISODE:-160}

# 1. Download every episode transcript page with up to 8 curl processes in
#    parallel; curl -O stores each page under its remote name (NNN.html).
seq 001 1 "$MAXEPISODE" | perl -ne 'printf "https://snarp.github.io/magnus_archives_transcripts/episode/%03d.html\n", $_' | xargs -I {} -P 8 curl -O {}

# 2. Render each downloaded page to plain text in <page>.txt (tee also echoes
#    it). The path is handed to the inner shell as "$1" rather than spliced
#    into the script text, so unusual file names cannot alter the command.
find . -name '*.html' -exec sh -c 'elinks -dump-width 10000 -dump "$1" | tee "$1.txt"' sh {} \;

# 3. Collect the statement marker lines of every transcript. rg -p keeps the
#    per-file headings and line numbers even when piped but also forces
#    colour, so the ANSI escape sequences are stripped again with sed.
rg -pi 'Statement [obe]' [0-9]*.txt | sed -r "s/\x1B\[([0-9]{1,3}(;[0-9]{1,2})?)?[mGK]//g" | tee segments.txt

# 4. Manual fix-up: cut.pl reads each record as a file name line followed by
#    three match lines, so repair any episode that deviates before cutting.
#    This must edit segments.txt; statements.txt is only produced below.
${EDITOR:-vi} segments.txt

# 5. Generate the sed commands and run them; each writes <file>.statement.
perl cut.pl < segments.txt | sh

# 6. Concatenate all extracted statements into the final output file.
cat *statement > statements.txt