~q3cpma/misc-tools

0695c6188d0f70c89d8e1757f2463b2e3869a60a — q3cpma 8 months ago 3f76eff
htmldecode: move to root, use genhtab instead of lex
build: sync, clean properly fetched files
10 files changed, 80 insertions(+), 95 deletions(-)

M build.sh
M build_util.sh
A compile_flags.txt
A gen_html_entities_htab.sh
R htmldecode/htmldecode.c => htmldecode.c
D htmldecode/json2lex.sh
D htmldecode/test.sh
R htmldecode/namedref.h => namedref.h
M natsort.c
M test.sh
M build.sh => build.sh +35 -30
@@ 14,7 14,6 @@ then
	done
	exit
fi
[ "$BIN" = htmldecode ] && BIN=htmldecode/htmldecode

# Get configuration from the environment
CC=${CC:-c99}


@@ 41,6 40,21 @@ append_cppflag \
	-DPROG_NAME="$(dquote "$NAME" 2)" \
	-DPROG_VERSION="$(dquote "$gitver" 2)"

# Select the list of C sources (SRC) from the program name held in BIN.
# Every program gets its own "$BIN.c" plus misc.c; some also need utf8.c, and
# htmldecode additionally needs htab.c.
case "$BIN" in
	htmlencode|urldecode|urlencode|wcswidth|genhtab)
		SRC="$BIN.c misc.c"
		;;
	mbcut|natsort)
		SRC="$BIN.c misc.c utf8.c"
		;;
	htmldecode)
		SRC="$BIN.c htab.c misc.c utf8.c"
		;;
	*)
		# Any other name is reported through die (defined outside this hunk;
		# presumably it prints the message and aborts - confirm).
		die "$BIN: unknown program"
		;;
esac

# Argument parsing
if "$PGO"
then


@@ 50,9 64,7 @@ else
	then
		case "$(tolower "$1")" in
			clean)
				cclean \
					$([ "$NAME" = htmldecode ] && pecho entities.json namedref.lex) \
					unicode_tolower.h
				cclean UnicodeData.txt unicode_tolower.h entities.json htab.h htab.c
				;;
			install)
				pb_install -m 755 "$PREFIX"/bin/ "$BIN"


@@ 170,30 182,23 @@ then
	esac
fi

! [ -s UnicodeData.txt ] &&
	pb_fetch https://www.unicode.org/Public/UCD/latest/ucd/UnicodeData.txt
! [ -s unicode_tolower.h ] &&
	sh gen_unicode_tolower_lut.sh UnicodeData.txt >unicode_tolower.h
# Only when utf8.c is one of the words of SRC: make sure UnicodeData.txt and
# unicode_tolower.h exist and are non-empty ([ -s ]), creating whichever is
# missing. unicode_tolower.h is produced by gen_unicode_tolower_lut.sh from
# UnicodeData.txt.
# NOTE(review): match and pb_fetch are defined outside this hunk; match looks
# like a whole-string extended regex test and pb_fetch like a downloader that
# saves the URL under its basename - confirm.
if match "$SRC" '(.* )?utf8.c( .*)?'
then
	! [ -s UnicodeData.txt ] &&
		pb_fetch https://www.unicode.org/Public/UCD/latest/ucd/UnicodeData.txt
	! [ -s unicode_tolower.h ] &&
		sh gen_unicode_tolower_lut.sh UnicodeData.txt >unicode_tolower.h
fi

# Prerequisites specific to htmldecode; each step is skipped when its output
# file already exists and is non-empty ([ -s ]):
#   1. genhtab: obtained by running "$self" with BIN=genhtab in its environment
#      (presumably $self is this build script, re-invoked to build the
#      generator - confirm).
#   2. entities.json: fetched from w3.org through pb_fetch.
#   3. htab.c: entities.json is filtered by gen_html_entities_htab.sh and the
#      result piped into ./genhtab. stdout of genhtab is not redirected here, so
#      it presumably writes htab.c (and htab.h) itself - confirm.
if [ "$BIN" = htmldecode ]
then
	! [ -s genhtab ] &&
		BIN=genhtab "$self"
	! [ -s entities.json ] &&
		pb_fetch https://www.w3.org/TR/html5/entities.json
	! [ -s htab.c ] &&
		./gen_html_entities_htab.sh <entities.json | \
		./genhtab -f namedref_to_codep_pair -i namedref.h -l -t codep_pair
fi

case "$NAME" in
	htmlencode|urldecode|urlencode|wcswidth|genhtab)
		SRC=$(pecho "$BIN".c misc.c)
		pb_make
		;;
	mbcut|natsort)
		SRC=$(pecho "$BIN".c misc.c utf8.c)
		pb_make
		;;
	htmldecode)
		cd htmldecode
		! [ -s entities.json ] &&
			pb_fetch https://www.w3.org/TR/html5/entities.json
		! [ -s namedref.lex ] &&
			./json2lex.sh entities.json >namedref.lex
		SRC=$(pecho htmldecode.c namedref.c ../misc.c ../utf8.c)
		pb_make BIN=htmldecode LDLIBS=-lfl
		;;
	*)
		die "$BIN: unknown program"
		;;
esac
pb_make

M build_util.sh => build_util.sh +11 -3
@@ 704,6 704,13 @@ EOF
	exit $1
}

# Convert a make compatible source list (no whitespace inside names, space
# delimited) to its object counterpart.
#   $1: source list (default: $SRC)
# Prints the list on a single line, each ".c" extension replaced by ".o".
src2obj()
{
	# A ".c" extension is always followed by a blank or ends the line, so the
	# substitution is anchored on that: a ".c" found inside a name is left
	# alone ("x.conf.c" gives "x.conf.o", not "x.oonf.o").
	# paste gets an explicit "-" operand because POSIX requires a file operand.
	pecho "${1:-$SRC}" |
		sed -e 's#\.c\([[:blank:]]\)#.o\1#g' -e 's#\.c$#.o#' |
		paste -sd' ' -
}

# C build.sh cleanup function
# Also remove "$@"
cclean()


@@ 714,11 721,13 @@ cclean()
	do
		rmv -f -- "${i#./}"
	done
	rmv -f -- *.o "$(basename -- "$BIN")" *.profraw default.profdata *.gcda

	rmv -f -- "$(basename -- "$BIN")" $(src2obj) *.profraw default.profdata *.gcda
	if [ $# -ne 0 ]
	then
		rmv -f -- "$@"
	fi
	cd - >/dev/null
}

# Compiler flag testing functions


@@ 804,8 813,7 @@ pb_make()
	[ "${JOBS:-}" = 1 ] && JOBS=
	{ cat <<'EOF'; $_append_stdin && cat; } | \
		"${MAKE:-make}" ${JOBS:+-j $JOBS} -f - \
		OBJ="$(pecho "$SRC" | sed 's#\.c\([[:blank:]]*\)#.o\1#g' | paste -sd' ')" \
		CC="$CC" BIN="$BIN" CFLAGS="${CFLAGS:-}" CPPFLAGS="${CPPFLAGS:-}" \
		OBJ="$(src2obj)" CC="$CC" BIN="$BIN" CFLAGS="${CFLAGS:-}" CPPFLAGS="${CPPFLAGS:-}" \
		LDFLAGS="${LDFLAGS:-}" LDLIBS="${LDLIBS:-}" "$@"
.POSIX:
.SUFFIXES:

A compile_flags.txt => compile_flags.txt +11 -0
@@ 0,0 1,11 @@
-std=c11
-pedantic
-Wall
-Wextra
-Wno-char-subscripts
-funsigned-char
-DNDEBUG
-D_DEFAULT_SOURCE
-pthread
-DPROG_NAME="\"\""
-DPROG_VERSION="\"\""

A gen_html_entities_htab.sh => gen_html_entities_htab.sh +11 -0
@@ 0,0 1,11 @@
#!/bin/sh
# Filter: reads an entities.json style document on stdin and prints, for every
# entity name ending in ";", a line "name<TAB>{0xC1, 0xC2}" (C2 is 0 when the
# entity maps to a single code point). The very first output line is "{0, 0}".
set -eu

# Once the JSON punctuation is deleted, each entry is a plain field list:
#   name; codepoints C1 [C2] characters [text]
tr -d '&":[]{},' | awk '
	BEGIN { print "{0, 0}" }
	{
		# Names without a terminating semicolon are not emitted
		if ($1 !~ /;$/)
			next
		name = substr($1, 1, length($1) - 1)
		# Six fields means two code points were listed
		second = (NF == 6) ? $4 : 0
		printf "%s\t{0x%x, 0x%x}\n", name, $3, second
	}'

R htmldecode/htmldecode.c => htmldecode.c +6 -5
@@ 7,8 7,9 @@
#include <string.h>
#include <inttypes.h>

#include "../misc.h"
#include "../utf8.h"
#include "misc.h"
#include "utf8.h"
#include "htab.h"
#include "namedref.h"




@@ 59,7 60,7 @@ bool htmldecode(char *in, char *out, size_t len)
			{
			    die("Unterminated named character references not supported");
			}
		    codep_pair cdp = lex_namedref_to_codep_pair(fnd, sc - fnd);
			codep_pair cdp = namedref_to_codep_pair(fnd, sc - fnd);
			if (!cdp.c1 && !cdp.c2)
			{
				fprintf(stderr, "%.*s: unknown named character reference\n",


@@ 88,8 89,8 @@ NORETURN void usage(int exit_status)
		"    %s [OPTIONS]\n"
		"\n"
		"DESCRIPTION\n"
		"    Copy stdin to stdout while decoding all HTML escape sequences (for example, \n"
		"    \"&#x26;\", \"&#38;\" and \"&amp;\" map to '&').\n"
		"    Copy stdin to stdout while decoding all HTML escape sequences to UTF-8 (for \n"
		"    example, \"&#x26;\", \"&#38;\" and \"&amp;\" all translate to '&').\n"
		"\n"
		"OPTIONS\n"
		"    -h\n"

D htmldecode/json2lex.sh => htmldecode/json2lex.sh +0 -34
@@ 1,34 0,0 @@
#!/bin/sh
# Portability: POSIX
#
# Print on stdout a lex specification generated from the entities JSON file
# given as $1.
# The heredoc below is unquoted, so its $(...) part is expanded: it emits one
# rule per entity name ending in ";". The pattern is that name without its
# last character, and the action stores the first code point in ret.c1 and,
# for 8-field entries, the second one in ret.c2. Commas serve as temporary
# separators and are turned into tabs by the final tr.
# The trailer defines lex_namedref_to_codep_pair(), which scans the given bytes
# and returns ret, reset to {0, 0} before each scan.

cat <<EOF
%option noinput
%option nounput
%{
#include "namedref.h"
codep_pair ret;
%}

%%
$(tr -d '&":[]' <"$1" | awk '
	$1 ~ /;$/\
	{
		sub(".$", "", $1)
		printf "%s,", $1
		printf "ret.c1 = 0x%x;", $4
		if (NF == 8)
			printf ",ret.c2 = 0x%x;", $5
		printf "\n"
	}' | tr ',' '\t')
. ;
%%

codep_pair lex_namedref_to_codep_pair(const char *p, size_t len)
{
	YY_BUFFER_STATE yy_buf = yy_scan_bytes(p, len);
	ret = (codep_pair){0, 0};
	yylex();
	yy_delete_buffer(yy_buf);
	return ret;
}
EOF

D htmldecode/test.sh => htmldecode/test.sh +0 -14
@@ 1,14 0,0 @@
#!/bin/sh
# Smoke test for ./htmldecode: decode one sample line mixing named, hexadecimal
# and decimal references, then print a colored PASS or FAIL.
set -eu
cd -- "$(dirname -- "$0")"

expect="a&b&c&d║t=fjabc"
out=$(echo 'a&amp;b&#x26;c&#38;d&boxV;t=&fjlig;abc' | ./htmldecode)

# errexit is suspended while reporting, and restored afterwards
set +e
if [ "$out" = "$expect" ]
then
	printf '%bPASS%b\n' "$(tput setaf 2)$(tput bold)" "$(tput sgr0)"
else
	printf '%bFAIL%b\n' "$(tput setaf 1)$(tput bold)" "$(tput sgr0)"
fi
set -e

R htmldecode/namedref.h => namedref.h +2 -6
@@ 2,13 2,9 @@

#include <stdint.h>

//Used to return more than one unicode char (for characters like '⋛')
//If c2 is 0, only c1 contains something.
// Used to return more than one unicode char (for characters like '⋛')
// If c2 is 0, only c1 contains something.
typedef struct codep_pair
{
	uint32_t c1, c2;
} codep_pair;


//Returns {0, 0} if p doesn't match any named character reference
codep_pair lex_namedref_to_codep_pair(const char *p, size_t len);

M natsort.c => natsort.c +1 -0
@@ 58,6 58,7 @@ NORETURN void usage(int exit_status)
		"DESCRIPTION\n"
		"    Sort input lines case insensitively while comparing numbers (as in '[0-9]+')\n"
		"    by their value.\n"
		"    Only supports UTF-8 locales.\n"
		"\n"
		"OPTIONS\n"
		"    -h\n"

M test.sh => test.sh +3 -3
@@ 153,9 153,9 @@ EOF
			check_strcmp "$out" "$expect"
			;;
		htmldecode)
			printf 'htmldecode: '
			htmldecode/test.sh
			continue
			out=$(echo 'a&amp;b&#x26;c&#38;d&boxV;t=&fjlig;abc' | ./$i)
			expect='a&b&c&d║t=fjabc'
			check_strcmp "$out" "$expect"
			;;
		*)
		    echo "$i: unknown test"