~williamvds/dissertation

5c937e4d9ff2acd8103d107c46389e592f43f7a1 — williamvds 4 years ago 473bbff
ukriDownload: fix downloading single pages and fix some lint warnings
1 file changed, 20 insertions(+), 11 deletions(-)

M scripts/ukriDownload
M scripts/ukriDownload => scripts/ukriDownload +20 -11
@@ 4,6 4,8 @@ PAUSE=4
NUM_PAGES_XPATH='string(/*/@*[local-name()="totalPages"])'
OUTPUT='.'

set -e

# Parse options: https://stackoverflow.com/a/14203146
TYPES=()
while [[ $# -gt 0 ]]; do


@@ 21,11 23,11 @@ done
set -- "${TYPES[@]}"

if [ "${#TYPES[@]}" == 0 ]; then
	echo "No record types specified"
	exit 1
	echo "No types specified: downloading all data"
	TYPES=('org' 'out' 'per' 'pro')
fi

for typ in "$TYPES"; do
for typ in "${TYPES[@]}"; do
	case "$typ" in
		or*) recordType='organisations';;
		ou*) recordType='outcomes';;


@@ 46,30 48,37 @@ for typ in "$TYPES"; do
	}

	# Print the sum of the `count(/*/*)` results that xmllint reports for the
	# files given as arguments; awk adds up the first field of each output line.
	# NOTE(review): this text is a diff rendering, so the two awk lines below
	# look like the before/after pair of a single change — confirm.
	function xmlCount {
		# Unquoted $*: the arguments are re-split on whitespace and glob-expanded.
		awk '{s+=$1} END {print s}' <(xmllint --xpath 'count(/*/*)' $*)
		# "$@": every argument is handed to xmllint as its own word.
		awk '{s+=$1} END {print s}' <(xmllint --xpath 'count(/*/*)' "$@")
	}

	file1="$(getPageFile 1)"
	curl "$(getPageURL 1)" -so "$file1"
	numPages=$(xmllint --xpath "$NUM_PAGES_XPATH" "$file1")
	merged="$OUTPUT/$recordType.xml"

	if [ -z "$numPages" ]; then
		echo "Downloaded $recordType"
		cp "$file1" "$merged"
		continue
	fi

	echo "Downloading $recordType"
	echo "Total pages: $numPages"

	# Download pages 2..numPages, skipping any page whose file already exists
	# and sleeping $PAUSE before each request.
	# NOTE(review): this text is a diff rendering; the doubled `for`, `file=`
	# and `curl` lines look like before/after pairs of a single change — confirm.
	for i in $(seq 2 $numPages); do
		file="$(getPageFile $i)"
	for i in $(seq 2 "$numPages"); do
		file="$(getPageFile "$i")"
		# A page file that is already on disk is not fetched again.
		[ -f "$file" ] && continue
		sleep "$PAUSE"
		echo "Downloading page $i of $numPages"
		curl "$(getPageURL $i)" -so "$file"
		curl "$(getPageURL "$i")" -so "$file"
		# NOTE(review): `[ $? ]` only checks that the expansion is a non-empty
		# string, which is true for every exit status, so `break` never runs.
		[ $? ] || break
	done

	merged="$OUTPUT/$recordType.xml"
	pages="$OUTPUT/$recordType/*.xml"
	pages=("$OUTPUT/$recordType"/*.xml)
	if ! [ -f "$merged" ] || \
	   ! [ "$(xmlCount "$merged")" = "$(xmlCount "$pages")" ]; then
	   ! [ "$(xmlCount "$merged")" = "$(xmlCount "${pages[@]}")" ]; then
		echo "Merging into a single file..."
		"$(basename $0)"/mergeXML "$pages" >"$merged"
		"$(dirname "$0")"/mergeXML "${pages[@]}" >"$merged"
	else
		echo "Merged file contains all records: $merged"
	fi