~nova/accendo

67457cdc046e49099b86b976437eb2fe150f57ef — Novalinium 2 years ago
Scraper, no converter
1 files changed, 38 insertions(+), 0 deletions(-)

A main.sh
A  => main.sh +38 -0
@@ 1,38 @@
#!/bin/bash
#
# Scrape every page of a glowfic.com post as raw HTML.
#
# Usage: main.sh POST_ID
#
# Creates a directory named POST_ID under the current directory, changes into
# it, and saves page N of the post as "POST_ID.N.rp". Page 1 is fetched first
# and inspected to find the highest page number; the remaining pages are then
# fetched four at a time. The output of `times` is printed when the script
# exits after argument validation.
#
# Requires: curl, elinks, and an xargs that supports -a (GNU findutils).
#
# Exit status: 0 on success, 1 on any download/verification failure,
#              2 when POST_ID is missing.

POST_ID=${1:-}
readonly PAGE_SIZE=100

# Without this guard an empty POST_ID makes `mkdir -p` fail and a bare `cd`
# jump to $HOME, so the downloads would land in the wrong place.
if [[ -z "${POST_ID}" ]]; then
    echo "Usage: ${0##*/} POST_ID" >&2
    exit 2
fi

readonly POST_URL="https://www.glowfic.com/posts/${POST_ID}"

# Report shell/child CPU times on every exit path from here on.
trap times EXIT

if ! mkdir -p -- "${POST_ID}" || ! cd -- "${POST_ID}"; then
    echo "Cannot create or enter directory '${POST_ID}'" >&2
    exit 1
fi

echo -n "Retrieving page 1 ... "
# -f makes curl return non-zero on HTTP errors instead of saving the error
# page; checking the exit status also keeps a stale file from an earlier run
# from being mistaken for a successful download.
if curl -fso "${POST_ID}.1.rp" "${POST_URL}?per_page=${PAGE_SIZE}" \
    && [[ -f "${POST_ID}.1.rp" ]]; then
    echo "Done!"
    echo -n "Generating index ... "
else
    echo "Failed!"
    exit 1
fi

# Extract the largest page number linked from page 1.
# NOTE(review): this presumably relies on elinks' numbered reference list,
# where links in a local file are printed as "N. file://...?page=P&per_page=..."
# -- confirm against real elinks output. The steps are: keep lines mentioning
# per_page, take the text after the first ".", de-duplicate, keep file:// links,
# take what follows the first "=" up to the next "&", then pick the numeric max.
MAX_PAGE=$(
    elinks -dump "${POST_ID}.1.rp" \
        | grep -F per_page \
        | cut -d . -f 2 \
        | sort -u \
        | grep -F file \
        | cut -d = -f 2 \
        | cut -d '&' -f 1 \
        | sort -n \
        | tail -n 1
)

# Only do arithmetic on a value that is actually a number; an empty or
# malformed MAX_PAGE falls through to the "no pages" branch.
if [[ "${MAX_PAGE}" =~ ^[0-9]+$ ]] && (( MAX_PAGE > 1 )); then
    seq 2 "${MAX_PAGE}" > "${POST_ID}.index"
    echo "Done!"
    echo "Retrieving pages ... "
    # One curl per index line, up to 4 in parallel; -t echoes each command
    # to stderr. xargs exits non-zero if any curl invocation failed.
    if ! xargs -a "${POST_ID}.index" -P 4 -I {} -t \
        curl -fso "${POST_ID}.{}.rp" "${POST_URL}?per_page=${PAGE_SIZE}&page={}"; then
        echo "Failed! One or more pages could not be retrieved."
        exit 1
    fi
    echo "Done!"
    echo -n "Verifying pages ..."
    for (( PAGE = 1; PAGE <= MAX_PAGE; PAGE++ )); do
        if [[ -f "${POST_ID}.${PAGE}.rp" ]]; then
            echo -n "."
        else
            echo "Failed! ${POST_ID}.${PAGE}.rp not found,"
            exit 1
        fi
    done
    echo "Done!"
    echo -n "Cleaning index ... "
    rm -- "${POST_ID}.index"
    echo "Done!"
else
    echo "No pages in index! Skipping page retrievals."
fi
echo "Completed retrieval of ${POST_ID}!"