~sircmpwn/ctools

f2d583864b4c95633f8897b3e1d44b2fa5352899 — Gabor Koszegi 1 year, 2 days ago 9de1ea7
Implement fold
8 files changed, 459 insertions(+), 1 deletions(-)

M STATUS
M doc/ctools.7.scd
A doc/fold.1.scd
M doc/meson.build
M meson.build
A src/fold.c
A test/fold
M test/meson.build
M STATUS => STATUS +1 -1
@@ 52,7 52,7 @@ T       expr
      N fg
    W   file
T       find
T       fold
  D     fold
    W   fort77
T       fuser
T       gencat

M doc/ctools.7.scd => doc/ctools.7.scd +2 -0
@@ 43,6 43,8 @@ shell environment. These tools are used for tasks such as:
:  Run command with a specified environment
|  *false*(1)
:  Exit with status code 1
|  *fold*(1)
:  Fold lines of input files
|  *head*(1)
:  Print the beginning of files
|  *link*(1)

A doc/fold.1.scd => doc/fold.1.scd +47 -0
@@ 0,0 1,47 @@
fold(1) "ctools"

# NAME

fold - fold lines of input files

# SYNOPSIS

*fold* [-bs] [-w _width_] [_file_...]

# DESCRIPTION

*fold* will break the lines of every input _file_ into segments which will have
at most _width_ columns. By default, the column positions will be calculated in
the sense of characters' display width.

If no input file is given or "-" is listed as a filename, _stdin_ will be used
as input.

# OPTIONS

*-b*
	Count _width_ in bytes.

*-s*
	If a word would be broken at the end of a segment, the segment will end
	after its last blank character instead, if such a character exists.

*-w* _width_
	_width_ specifies the maximum length of the segments. The default width is
	80 columns.

# UNSPECIFIED BEHAVIOR

The POSIX standard does not unambiguously specify the behavior of this command
under certain conditions. Under such conditions, the ctools implementation of
*fold* behaves as follows:

- If *-w* is not a positive integer, fold will print an error and exit with a
  non-zero status code.

# DISCLAIMER

This command is part of ctools and is compatible with POSIX-1.2017, and may
optionally support XSI extensions. This man page is not intended to be a
complete reference, and where it disagrees with the specification, the
specification takes precedence.

M doc/meson.build => doc/meson.build +1 -0
@@ 16,6 16,7 @@ man_files = [
	'echo.1',
	'env.1',
	'false.1',
	'fold.1',
	'head.1',
	'link.1',
	'logname.1',

M meson.build => meson.build +1 -0
@@ 23,6 23,7 @@ oneshots = [
	'echo',
	'env',
	'false',
	'fold',
	'head',
	'logname',
	'nice', # Included in base but only effective under XSI

A src/fold.c => src/fold.c +280 -0
@@ 0,0 1,280 @@
#include <assert.h>
#include <ctype.h>
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
#include <unistd.h>

/* Bit flags that main() ORs together into the `mode` argument of fold(). */
/* set by -b: count the width in bytes */
static const unsigned int MODE_BYTES = 1;
/* set by -s: prefer ending a segment after its last blank character */
static const unsigned int MODE_BLANK = 2;

/* Longest encoded form utf8_len() distinguishes (6 bytes); fold() asserts
 * that its read buffer is larger than this. */
static const size_t UTF8_MAX_LEN = 6;

/* Print the one-line command synopsis to standard error. */
static void
usage(void)
{
	fputs("usage: fold [-bs] [-w width] [file...]\n", stderr);
}

/*
 * Classify the UTF-8 sequence that starts at seq[0], reading at most nbytes
 * bytes from seq.
 *
 *  Ret. val. | Meaning
 * -----------+------------------------------------------------------------
 *     > 0    | valid (actual length)
 *      0     | undecidable (nbytes < encoded length)
 *     < 0    | invalid (-1 * seq. length that is guaranteed to be invalid)
 *
 * For an invalid sequence the magnitude is the lead byte plus the unbroken
 * run of continuation bytes that follows it, so it never covers a byte that
 * could start a new character. Overlong encodings, UTF-16 surrogate halves,
 * code points above Unicode's limit and the 5- and 6-byte forms are all
 * reported as invalid.
 */
static int
utf8_len(const unsigned char *seq, const int nbytes)
{
	/* undecidable */
	if (nbytes < 1) {
		return 0;
	}

	/* ASCII */
	if (seq[0] < 0x80) {
		return 1;
	}

	/* seq begins with continuation byte or
	 * undefined leading byte */
	if (seq[0] < 0xc0 || seq[0] >= 0xfe) {
		return -1;
	}

	/* lower and upper bounds for second byte
	 * which can be modified in special cases;
	 * lb == 0xc0 rejects every possible continuation byte */
	unsigned char lb = 0x80;
	unsigned char ub = 0xc0;
	/* length encoded in leading byte */
	int bmax;

	if (seq[0] < 0xe0) {
		bmax = 2;
		/* these can only start overlong sequences */
		if (seq[0] < 0xc2) {
			lb = 0xc0;
		}
	} else if (seq[0] < 0xf0) {
		bmax = 3;
		/* 0xe0 may start overlong sequences */
		if (seq[0] == 0xe0) {
			lb = 0xa0;
		/* 0xed may start UTF-16 surrogate halves */
		} else if (seq[0] == 0xed) {
			ub = 0xa0;
		}
	} else if (seq[0] < 0xf8) {
		bmax = 4;
		/* 0xf0 may start overlong sequences */
		if (seq[0] == 0xf0) {
			lb = 0x90;
		/* 0xf4 may code codepoints over Unicode's limit */
		} else if (seq[0] == 0xf4) {
			ub = 0x90;
		/* 0xf5..0xf7 always code codepoints over the limit */
		} else if (seq[0] >= 0xf5) {
			lb = 0xc0;
		}
	} else if (seq[0] < 0xfc) {
		/* 5-byte form: never valid */
		bmax = 5;
		lb = 0xc0;
	} else {
		/* 6-byte form: never valid */
		bmax = 6;
		lb = 0xc0;
	}

	/* undecidable */
	if (nbytes < bmax) {
		return 0;
	}

	/* count the lead byte plus the unbroken run of continuation bytes;
	 * stop at the first byte that is not a continuation byte so that
	 * bytes after it are never counted as part of this sequence */
	int vbytes = 1;
	while (vbytes < bmax && seq[vbytes] >= 0x80 && seq[vbytes] < 0xc0) {
		++vbytes;
	}

	/* too few continuation bytes, or a second byte outside the range
	 * allowed for this lead byte */
	if (vbytes != bmax || seq[1] < lb || seq[1] >= ub) {
		return -vbytes;
	}

	return vbytes;
}

/*
 * Write the n bytes starting at buf to standard output, continuing after
 * short writes until every byte has been written.
 *
 * Returns 0 on success, or 1 after reporting a failed write with perror().
 */
static int
write_segment(const unsigned char * const buf, const ssize_t n)
{
	ssize_t offs = 0;
	while (offs < n) {
		/* resume right after the bytes already written; restarting from
		 * buf with the full length would duplicate output after a short
		 * write */
		ssize_t o = write(STDOUT_FILENO, &buf[offs], n - offs);
		if (o < 0) {
			perror("stdout");
			return 1;
		}
		offs += o;
	}

	return 0;
}

/*
 * Fold one input onto standard output so that no output line is wider than
 * w columns.
 *
 * path: file to read; "-" selects standard input
 * w:    maximum width of a segment (main() rejects w <= 0 before calling)
 * mode: bitwise OR of MODE_BYTES (count bytes instead of characters) and
 *       MODE_BLANK (prefer to end a segment after its last blank)
 *
 * Returns 0 on success. Returns 1 after printing a diagnostic when the input
 * cannot be opened or read, when writing fails, or when the input is not
 * valid UTF-8 (only checked without MODE_BYTES).
 */
static int
fold(char * const path, const long w, const unsigned int mode)
{
	assert(BUFSIZ > UTF8_MAX_LEN);

	int fd;
	if (path[0] == '-' && path[1] == '\0') {
		fd = STDIN_FILENO;
	} else {
		if ((fd = open(path, O_RDONLY)) < 0) {
			perror(path);
			return 1;
		}
	}

	ssize_t n;
	unsigned char buf[BUFSIZ];
	/* indices (position in buf) and pointers (position in segment)
	 *   i      - index of the character being examined
	 *   ri     - number of bytes of an incomplete character carried over
	 *            to the front of buf for the next read
	 *   wb, we - begin and end index of the segment to be written
	 *   cp     - column position reached in the current output line
	 *   bi, bp - index and pointer of the last blank; bp == w means that
	 *            there is no blank to break at
	 * NOTE: the index (bi) and the pointer (bp) of the last blank char in the
	 *       segment should point to the position after the char in question */
	ssize_t i, ri = 0, wb, we, cp = 0, bi = 0, bp = w;
	ssize_t cstep;
	int l;
	/* one byte of buf is always left unused by the read */
	while ((n = read(fd, &buf[ri], sizeof(buf) - ri - 1)) > 0) {
		n += ri;
		ri = 0;
		wb = 0;
		for (i = 0; i < n; i += l) {
			/* cstep: number of columns the current character occupies */
			cstep = 0;
			if ((mode & MODE_BYTES) != 0) {
				l = 1;
				if (buf[i] == '\n') {
					cp = 0;
					/* a blank on the finished line is no longer a
					 * possible break point */
					bp = w;
				} else {
					cstep = 1;
				}
			} else if ((l = utf8_len(&buf[i], n - i)) == 1) {
				if (buf[i] == '\b') {
					if (cp > 0) {
						--cp;
					}
				} else if (buf[i] == '\n') {
					cp = 0;
					/* a blank on the finished line is no longer a
					 * possible break point */
					bp = w;
				} else if (buf[i] == '\r') {
					cp = 0;
				} else if (buf[i] == '\t') {
					cstep = 9 - cp % 8;
				} else {
					cstep = 1;
				}
			} else if (l > 1) {
				/* every multi-byte character counts as one column */
				cstep = 1;
			} else if (l == 0) {
				/* the buffer ends inside a character: keep its bytes
				 * for the next read; they are moved to the front of
				 * buf only after the pending output has been written */
				ri = n - i;
				break;
			} else {
				fprintf(stderr, "utf-8: invalid byte sequence\n");
				close(fd);
				return 1;
			}

			ssize_t cp_next = cp + cstep;

			if (cp_next > w) {
				if ((mode & MODE_BLANK) != 0 && bp < w
						&& isblank(buf[i]) == 0) {
					/* break after the last blank and carry the
					 * columns that follow it to the new line */
					we = bi;
					cp = cp_next - bp;
					bp = w;
				} else {
					/* break right before the current character */
					we = i;
					cp = (mode & MODE_BYTES) == 0 && buf[i] == '\t' ? 9
						: cstep;
				}

				/* temporarily put a newline over the first byte of the
				 * next segment so that one write emits the segment
				 * together with its line break */
				unsigned char swap = buf[we];
				buf[we] = '\n';
				if (write_segment(&buf[wb], we - wb + 1) != 0) {
					close(fd);
					return 1;
				}
				buf[we] = swap;

				wb = we;
			} else {
				if ((mode & MODE_BLANK) != 0 && isblank(buf[i]) != 0) {
					bi = i + l;
					bp = cp_next;
				}
				cp = cp_next;
			}
		}

		/* write what is left of the buffer, except the carried bytes */
		if (write_segment(&buf[wb], n - ri - wb) != 0) {
			close(fd);
			return 1;
		}

		if (ri > 0) {
			memmove(buf, &buf[n - ri], ri);
		}

		/* everything up to here has been written and buf is about to be
		 * refilled, so bi no longer refers to a usable position */
		bp = w;
	}

	if (n < 0) {
		/* report before close() gets a chance to change errno */
		perror(fd == STDIN_FILENO ? "stdin" : path);
		close(fd);
		return 1;
	}

	close(fd);

	if (ri > 0) {
		/* the input ended in the middle of a multi-byte character */
		fprintf(stderr, "utf-8: invalid byte sequence\n");
		return 1;
	}

	return 0;
}

/*
 * Entry point: parse -b, -s and -w, then fold every file operand in order,
 * or standard input when there is none.
 *
 * Exits with 0 on success and with 1 on a usage error, on a -w argument that
 * is not a positive integer, or as soon as folding one input fails.
 */
int
main(int argc, char *argv[])
{
	long width = 80;
	unsigned int mode = 0;

	/* getopt() returns an int; kept in a plain char the result would never
	 * compare equal to -1 on platforms where char is unsigned */
	int opt;
	char *end;
	while ((opt = getopt(argc, argv, "bsw:")) != -1) {
		switch (opt) {
		case 'b':
			mode |= MODE_BYTES;
			break;
		case 's':
			mode |= MODE_BLANK;
			break;
		case 'w':
			width = strtol(optarg, &end, 10);
			/* an empty argument or trailing garbage ("12abc") is not
			 * a number: force the value into the range that is
			 * rejected below */
			if (end == optarg || *end != '\0') {
				width = 0;
			}
			break;
		default:
			usage();
			return 1;
		}
	}

	if (optind > argc) {
		usage();
		return 1;
	}

	if (width <= 0) {
		fprintf(stderr, "-w: argument is not a positive integer\n");
		return 1;
	}

	/* without file operands, behave as if "-" (standard input) was given */
	char *dash[] = {"-"};
	if (optind == argc) {
		optind = 0;
		argc = 1;
		argv = dash;
	}

	for (int i = optind; i < argc; ++i) {
		if (fold(argv[i], width, mode) != 0) {
			return 1;
		}
	}

	return 0;
}

A test/fold => test/fold +126 -0
@@ 0,0 1,126 @@
#!/bin/sh
tool="fold"
. "$HARNESS"

# A single file operand is folded at the requested width.
should_handle_one_file() (
	input="$TMPDIR"/test-one-file
	printf "abcdefghijklmnopqrstuvwxyz" >"$input"
	expected="$(printf "abcdefgh\nijklmnop\nqrstuvwx\nyz")"
	actual="$(fold -w 8 "$input")"
	[ "$actual" = "$expected" ]
)

# Several file operands are folded one after the other, in the order in
# which they appear on the command line.
should_handle_two_files() (
	first="$TMPDIR"/test-two-files-1
	second="$TMPDIR"/test-two-files-2
	printf "abcdefghijklmno\n" >"$first"
	printf "pqrstuvwxyz\n" >"$second"
	expected="$(printf "pqrstuv\nwxyz\nabcdefg\nhijklmn\no\n")"
	actual="$(fold -w 7 "$second" "$first")"
	[ "$actual" = "$expected" ]
)

# Standard input is read both when no operand is given and when the
# operand is "-"; the two invocations must produce the same output.
should_handle_stdin() (
	input="STDIN test line1\nstdin TEST line 2"
	expected="$(printf "STDIN tes\nt line1\nstdin TES\nT line 2")"
	implicit="$(printf "$input" | fold -w 9)"
	explicit="$(printf "$input" | fold -w 9 -)"
	[ "$implicit" = "$explicit" ] && [ "$implicit" = "$expected" ]
)

# Multi-byte UTF-8 characters count as one column each and are never split.
should_handle_utf8() (
	expected="$(printf "Ώ¥あ∀𝜔\n∊ℝ 𝜉(\n𝜔)≿⌨")"
	actual="$(printf "Ώ¥あ∀𝜔∊ℝ 𝜉(𝜔)≿⌨" | fold -w 5)"
	[ "$actual" = "$expected" ]
)

# A tab advances the column count to the next tab stop instead of counting
# as a single column.
should_handle_tab() (
	input="$TMPDIR"/test-tab
	printf "テスト\ttest\tтест\tδοκιμή" >"$input"
	expected="$(printf "テスト\ttest\tтес\nт\tδοκιμή")"
	actual="$(fold -w 20 "$input")"

	[ "$actual" = "$expected" ]
)

# A backspace moves the column count back by one.
should_handle_backspace() (
	input="abcdefghij\bklmnopqrst\buvwxyzyxwvut\b\tsr"
	expected="$(printf "abcdefghij\bk\nlmnopqrst\buv\nwxyzyxwvut\b\n\ts\nr")"
	actual="$(printf "$input" | fold -w 10)"
	[ "$actual" = "$expected" ]
)

# A carriage return resets the column count to zero without ending the line.
should_handle_cr() (
	input="abcdef\rghijk\rlmnopqr\rstuv\nwxyzyxwvutsr"
	expected="$(printf "abcdef\rghijk\rlmnopq\nr\rstuv\nwxyzyx\nwvutsr")"
	actual="$(printf "$input" | fold -w 6)"
	[ "$actual" = "$expected" ]
)

# With -b the width is counted in bytes: multi-byte characters may be split
# and \r, \b and \t each count as a single byte. Because the expected output
# contains broken UTF-8 sequences, it is compared byte by byte with cmp.
should_handle_b_flag() (
	printf "テ\rスト\ttest\tтес\bт\tδοκιμή" >"$TMPDIR"/test-b-flag-src
	# The expected bytes as octal escapes for printf: every row is one
	# 5-byte segment followed by a newline (\012), the last row is the
	# remaining 3 bytes. The trailing backslashes join the rows into one line.
	read -r bytes <<-END_BYTES
\343\203\206\015\343\012\
\202\271\343\203\210\012\
\011\164\145\163\164\012\
\011\321\202\320\265\012\
\321\201\010\321\202\012\
\011\316\264\316\277\012\
\316\272\316\271\316\012\
\274\316\256
	END_BYTES
	printf "$bytes" >"$TMPDIR"/test-b-flag-exp
	fold -b -w 5 "$TMPDIR"/test-b-flag-src >"$TMPDIR"/test-b-flag-dst
	cmp -s "$TMPDIR"/test-b-flag-dst "$TMPDIR"/test-b-flag-exp
	# the test passes when cmp found no difference
	[ $? -eq 0 ]
)

# With -s a segment ends after its last blank when there is one, both when
# the width is counted in characters and when it is counted in bytes (-b).
should_handle_s_flag() (
	input="$TMPDIR"/test-s-flag
	printf "ąbč đê fghi j\tķl mnopqrs" >"$input"

	expected="$(printf "ąbč đê \nfghi j\t\nķl mnopqrs")"
	actual="$(fold -s -w 10 "$input")"
	[ "$actual" = "$expected" ] || return 1

	expected="$(printf "ąbč đê\n fghi j\t\nķl \nmnopqrs")"
	actual="$(fold -b -s -w 10 "$input")"
	[ "$actual" = "$expected" ]
)

# Without -w lines are folded at the default width of 80 columns; with -w
# they are folded at the given width.
should_handle_w_flag() (
	# input: the alphabet four times (104 characters) on a single line; the
	# trailing backslash joins the two rows of the here-document
	read src <<-END_STR
abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyz\
abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyz
	END_STR

	# expected output for the default width; the literal \n is kept by
	# read -r and turned into a newline by printf below
	read -r exp_str_default <<-END_STR
abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyz\
abcdefghijklmnopqrstuvwxyzab\ncdefghijklmnopqrstuvwxyz
	END_STR

	# expected output for -w 13
	read -r exp_str_13 <<-END_STR
abcdefghijklm\nnopqrstuvwxyz\nabcdefghijklm\nnopqrstuvwxyz\n\
abcdefghijklm\nnopqrstuvwxyz\nabcdefghijklm\nnopqrstuvwxyz
	END_STR

	exp="$(printf "$exp_str_default")"
	res="$(printf "$src" | fold)"
	[ "$res" = "$exp" ] || return 1

	exp="$(printf "$exp_str_13")"
	res="$(printf "$src" | fold -w 13)"
	[ "$res" = "$exp" ]

	# non-positive or non-numeric -w arguments are not exercised here;
	# fold rejects them with an error message and a non-zero exit status
)

# NOTE(review): should_handle_ddash is not defined in this file; presumably
# the harness sourced at the top provides it and this call sets up the "--"
# test for fold with /dev/null as input — confirm against the harness.
should_handle_ddash fold /dev/null

# Run every test case of this file through the harness.
runtests \
	should_handle_ddash \
	should_handle_w_flag \
	should_handle_one_file \
	should_handle_two_files \
	should_handle_stdin \
	should_handle_utf8 \
	should_handle_tab \
	should_handle_backspace \
	should_handle_cr \
	should_handle_b_flag \
	should_handle_s_flag

M test/meson.build => test/meson.build +1 -0
@@ 12,6 12,7 @@ test_files = [
	'echo',
	'env',
	'false',
	'fold',
	'head',
	'logname',
	'nice',