f2d583864b4c95633f8897b3e1d44b2fa5352899 — Gabor Koszegi a month ago 9de1ea7
Implement fold
8 files changed, 459 insertions(+), 1 deletions(-)

M STATUS
M doc/ctools.7.scd
A doc/fold.1.scd
M doc/meson.build
M meson.build
A src/fold.c
A test/fold
M test/meson.build
M STATUS => STATUS +1 -1
@@ 52,7 52,7 @@ T       expr
       N fg
     W   file
 T       find
-T       fold
+  D     fold
     W   fort77
 T       fuser
 T       gencat

M doc/ctools.7.scd => doc/ctools.7.scd +2 -0
@@ 43,6 43,8 @@ shell environment. These tools are used for tasks such as:
 :  Run command with a specified environment
 |  *false*(1)
 :  Exit with status code 1
+|  *fold*(1)
+:  Fold lines of input files
 |  *head*(1)
 :  Print the beginning of files
 |  *link*(1)

A doc/fold.1.scd => doc/fold.1.scd +47 -0
@@ 0,0 1,47 @@
+fold(1) "ctools"
+
+# NAME
+
+fold - fold lines of input files
+
+# SYNOPSIS
+
+*fold* [-bs] [-w _width_] [_file_...]
+
+# DESCRIPTION
+
+*fold* breaks the lines of every input _file_ into segments that are at most
+_width_ columns wide. By default, column positions are counted in terms of the
+display width of characters rather than in bytes.
+
+If no input file is given or "-" is listed as a filename, _stdin_ will be used
+as input.
+
+# OPTIONS
+
+*-b*
+	Count _width_ in bytes.
+
+*-s*
+	If a segment would otherwise end in the middle of a word, the segment ends
+	after its last blank character instead, if it contains one.
+
+*-w* _width_
+	_width_ specifies the maximum length of the segments. The default width is
+	80 columns.
+
+# UNSPECIFIED BEHAVIOR
+
+The POSIX standard does not unambiguously specify the behavior of this command
+under certain conditions. Under such conditions, the ctools implementation of
+*fold* behaves as follows:
+
+- If the _width_ given to *-w* is not a positive integer, fold will print an
+  error and exit with a non-zero status code.
+
+# DISCLAIMER
+
+This command is part of ctools and is compatible with POSIX-1.2017, and may
+optionally support XSI extensions. This man page is not intended to be a
+complete reference, and where it disagrees with the specification, the
+specification takes precedence.

M doc/meson.build => doc/meson.build +1 -0
@@ 16,6 16,7 @@ man_files = [
 	'echo.1',
 	'env.1',
 	'false.1',
+	'fold.1',
 	'head.1',
 	'link.1',
 	'logname.1',

M meson.build => meson.build +1 -0
@@ 23,6 23,7 @@ oneshots = [
 	'echo',
 	'env',
 	'false',
+	'fold',
 	'head',
 	'logname',
 	'nice', # Included in base but only effective under XSI

A src/fold.c => src/fold.c +280 -0
@@ 0,0 1,280 @@
+#include <assert.h>
+#include <ctype.h>
+#include <fcntl.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <time.h>
+#include <unistd.h>
+
+/* Bit flags that main() combines into the mode argument of fold(). */
+static const unsigned int MODE_BYTES = 1; /* set by -b */
+static const unsigned int MODE_BLANK = 2; /* set by -s */
+
+/* Longest sequence length that utf8_len() derives from a leading byte;
+ * fold() asserts that BUFSIZ is larger than this. */
+static const size_t UTF8_MAX_LEN = 6;
+
+/* Print the command line synopsis of fold to stderr. */
+static void
+usage(void)
+{
+	fputs("usage: fold [-bs] [-w width] [file...]\n", stderr);
+}
+
+/*
+ * Classify the UTF-8 sequence that starts at seq[0], looking at no more than
+ * nbytes bytes. Overlong encodings, UTF-16 surrogate halves, code points
+ * above Unicode's limit and the 5- and 6-byte forms are treated as invalid.
+ *
+ *  Ret. val. | Meaning
+ * -----------+------------------------------------------------------------
+ *     > 0    | valid (actual length)
+ *      0     | undecidable (nbytes < encoded length)
+ *     < 0    | invalid (-1 * seq. length that is guaranteed to be invalid:
+ *            | the leading byte plus the continuation bytes that directly
+ *            | follow it)
+ */
+static int
+utf8_len(const unsigned char *seq, const int nbytes)
+{
+	/* undecidable */
+	if (nbytes < 1) {
+		return 0;
+	}
+
+	/* ASCII */
+	if (seq[0] < 0x80) {
+		return 1;
+	}
+
+	/* seq begins with continuation byte or
+	 * undefined leading byte */
+	if (seq[0] < 0xc0 || seq[0] >= 0xfe) {
+		return -1;
+	}
+
+	/* lower and upper bounds for second byte
+	 * which can be modified in special cases;
+	 * lb == 0xc0 leaves no acceptable second byte at all */
+	unsigned char lb = 0x80;
+	unsigned char ub = 0xc0;
+	/* length encoded in leading byte */
+	int bmax;
+
+	if (seq[0] < 0xe0) {
+		bmax = 2;
+		/* these can only start overlong sequences */
+		if (seq[0] < 0xc2) {
+			lb = 0xc0;
+		}
+	} else if (seq[0] < 0xf0) {
+		bmax = 3;
+		/* 0xe0 may start overlong sequences */
+		if (seq[0] == 0xe0) {
+			lb = 0xa0;
+		/* 0xed may start UTF-16 surrogate halves */
+		} else if (seq[0] == 0xed) {
+			ub = 0xa0;
+		}
+	} else if (seq[0] < 0xf8) {
+		bmax = 4;
+		/* 0xf0 may start overlong sequences */
+		if (seq[0] == 0xf0) {
+			lb = 0x90;
+		/* 0xf4 may code codepoints over Unicode's limit */
+		} else if (seq[0] == 0xf4) {
+			ub = 0x90;
+		/* 0xf5..0xf7 always code codepoints over Unicode's limit */
+		} else if (seq[0] >= 0xf5) {
+			lb = 0xc0;
+		}
+	} else if (seq[0] < 0xfc) {
+		/* 5-byte form: never valid */
+		bmax = 5;
+		lb = 0xc0;
+	} else {
+		/* 6-byte form: never valid */
+		bmax = 6;
+		lb = 0xc0;
+	}
+
+	/* undecidable */
+	if (nbytes < bmax) {
+		return 0;
+	}
+
+	/* count the continuation bytes that directly follow the leading byte;
+	 * stop at the first byte that is not one, so that the length reported
+	 * for an invalid sequence never covers bytes behind the broken spot */
+	int vbytes = 1;
+	while (vbytes < bmax && seq[vbytes] >= 0x80 && seq[vbytes] < 0xc0) {
+		++vbytes;
+	}
+
+	/* bmax >= 2 and nbytes >= bmax, so seq[1] is always readable here */
+	if (vbytes != bmax || seq[1] < lb || seq[1] >= ub) {
+		return -vbytes;
+	}
+
+	return vbytes;
+}
+
+/*
+ * Write the n bytes starting at buf to stdout, calling write() again after a
+ * short write until the whole segment is out. Nothing is written if n <= 0.
+ *
+ * Returns 0 on success, or 1 after reporting the failure with perror() if
+ * write() fails.
+ */
+static int
+write_segment(const unsigned char * const buf, const ssize_t n)
+{
+	ssize_t offs = 0;
+	while (offs < n) {
+		/* continue behind the bytes that are already out; starting over
+		 * at buf with the full length would repeat the head of the
+		 * segment after a short write */
+		ssize_t o = write(STDOUT_FILENO, buf + offs, (size_t)(n - offs));
+		if (o < 0) {
+			perror("stdout");
+			return 1;
+		}
+		offs += o;
+	}
+
+	return 0;
+}
+
+/*
+ * Copy the file named by path to stdout and insert a newline wherever the
+ * next character would make the current output line wider than w columns.
+ * A path of "-" selects stdin.
+ *
+ * mode is a combination of MODE_BYTES and MODE_BLANK:
+ * - With MODE_BYTES every byte counts as one column and only a newline
+ *   resets the column count.
+ * - Without MODE_BYTES the input is decoded as UTF-8: backspace moves one
+ *   column back, newline and carriage return reset the column count, a tab
+ *   adds 9 - cp % 8 columns and every other character adds one column.
+ * - With MODE_BLANK a line that has to be broken in front of a non-blank
+ *   character is broken after its last blank instead, if it has one.
+ *
+ * Returns 0 on success. Returns 1 after printing a diagnostic if the file
+ * cannot be opened or read, if stdout cannot be written, or if the input is
+ * not valid UTF-8 (this includes a sequence that is cut off by end of input).
+ *
+ * Limitation: everything scanned so far is written out at the end of each
+ * read(), so with MODE_BLANK a blank that was seen before the latest read()
+ * can no longer serve as break position; such a line is broken at the width
+ * limit instead.
+ */
+static int
+fold(char * const path, const long w, const unsigned int mode)
+{
+	assert(BUFSIZ > UTF8_MAX_LEN);
+
+	int fd;
+	if (path[0] == '-' && path[1] == '\0') {
+		fd = STDIN_FILENO;
+	} else {
+		if ((fd = open(path, O_RDONLY)) < 0) {
+			perror(path);
+			return 1;
+		}
+	}
+
+	ssize_t n;
+	unsigned char buf[BUFSIZ];
+	/* indices (position in buf) and pointers (position in segment)
+	 * NOTE: the index (bi) and the pointer (bp) of the last blank char in the
+	 *       segment should point to the position after the char in question
+	 *
+	 * i:  index of the current character
+	 * ri: number of bytes of an incomplete sequence kept from the last pass
+	 * wb: index of the first byte that has not been written yet
+	 * we: index at which the current line gets broken
+	 * cp: column reached in the current output line
+	 * bp == w means that no usable blank is known */
+	ssize_t i, ri = 0, wb, we, cp = 0, bi = 0, bp = w;
+	/* columns added by the current character */
+	ssize_t cstep;
+	/* length of the current character in bytes */
+	int l;
+	/* the last byte of buf is never filled by read() */
+	while ((n = read(fd, &buf[ri], sizeof(buf) - ri - 1)) > 0) {
+		n += ri;
+		ri = 0;
+		wb = 0;
+		/* bi refers to the previous contents of buf, which have been
+		 * written already: forget that blank instead of breaking at a
+		 * stale index */
+		bp = w;
+		for (i = 0; i < n; i += l) {
+			cstep = 0;
+			if ((mode & MODE_BYTES) != 0) {
+				l = 1;
+				if (buf[i] == '\n') {
+					cp = 0;
+				} else {
+					cstep = 1;
+				}
+			} else if ((l = utf8_len(&buf[i], n - i)) == 1) {
+				if (buf[i] == '\b') {
+					if (cp > 0) {
+						--cp;
+					}
+				} else if (buf[i] == '\n' || buf[i] == '\r') {
+					cp = 0;
+				} else if (buf[i] == '\t') {
+					cstep = 9 - cp % 8;
+				} else {
+					cstep = 1;
+				}
+			} else if (l > 1) {
+				cstep = 1;
+			} else if (l == 0) {
+				/* the sequence is cut off by the end of the data read so
+				 * far: move it to the front of buf and let the next
+				 * read() complete it; source and destination overlap
+				 * when i is small, hence memmove() */
+				ri = n - i;
+				memmove(buf, &buf[i], ri);
+				break;
+			} else {
+				fprintf(stderr, "utf-8: invalid byte sequence\n");
+				close(fd);
+				return 1;
+			}
+
+			ssize_t cp_next = cp + cstep;
+
+			if (cp_next > w) {
+				if ((mode & MODE_BLANK) != 0 && bp < w
+						&& isblank(buf[i]) == 0) {
+					/* break after the last blank; the columns behind it
+					 * already belong to the new line */
+					we = bi;
+					cp = cp_next - bp;
+					bp = w;
+				} else {
+					/* break in front of the current character, which
+					 * becomes the first one of the new line */
+					we = i;
+					cp = (mode & MODE_BYTES) == 0 && buf[i] == '\t' ? 9
+						: cstep;
+				}
+
+				/* write the line with a newline in place of buf[we], then
+				 * restore that byte: it starts the next segment */
+				unsigned char swap = buf[we];
+				buf[we] = '\n';
+				if (write_segment(&buf[wb], we - wb + 1) != 0) {
+					close(fd);
+					return 1;
+				}
+				buf[we] = swap;
+
+				wb = we;
+			} else {
+				if ((mode & MODE_BLANK) != 0 && isblank(buf[i]) != 0) {
+					bi = i + l;
+					bp = cp_next;
+				}
+				cp = cp_next;
+			}
+		}
+
+		/* write what is left of this pass, except for the bytes kept
+		 * for the next one */
+		if (write_segment(&buf[wb], n - ri - wb) != 0) {
+			close(fd);
+			return 1;
+		}
+	}
+
+	if (n < 0) {
+		/* report before close() gets a chance to overwrite errno */
+		perror(fd == STDIN_FILENO ? "stdin" : path);
+		close(fd);
+		return 1;
+	}
+
+	close(fd);
+
+	/* the input ended in the middle of a multi-byte sequence */
+	if (ri > 0) {
+		fprintf(stderr, "utf-8: invalid byte sequence\n");
+		return 1;
+	}
+
+	return 0;
+}
+
+/*
+ * Entry point: parse the options -b, -s and -w width, then fold every file
+ * operand in the order given. Without operands stdin is folded.
+ *
+ * Returns 0 on success and 1 on a usage error, on a width that is not a
+ * positive integer, or as soon as folding one of the files fails.
+ */
+int
+main(int argc, char *argv[])
+{
+	long width = 80;
+	unsigned int mode = 0;
+	int bad_width = 0;
+
+	/* getopt() returns an int; a plain char may be unsigned, in which
+	 * case the comparison with -1 could never be true */
+	int opt;
+	while ((opt = getopt(argc, argv, "bsw:")) != -1) {
+		switch (opt) {
+		case 'b':
+			mode |= MODE_BYTES;
+			break;
+		case 's':
+			mode |= MODE_BLANK;
+			break;
+		case 'w': {
+			/* the whole argument has to be a number: reject an empty
+			 * argument and trailing garbage such as "12abc" */
+			char *end;
+			width = strtol(optarg, &end, 10);
+			if (end == optarg || *end != '\0') {
+				bad_width = 1;
+			}
+			break;
+		}
+		default:
+			usage();
+			return 1;
+		}
+	}
+
+	if (bad_width != 0 || width <= 0) {
+		fprintf(stderr, "-w: argument is not a positive integer\n");
+		return 1;
+	}
+
+	/* no file operand: fold stdin */
+	char *dash[] = {"-"};
+	if (optind == argc) {
+		optind = 0;
+		argc = 1;
+		argv = dash;
+	}
+
+	for (int i = optind; i < argc; ++i) {
+		if (fold(argv[i], width, mode) != 0) {
+			return 1;
+		}
+	}
+
+	return 0;
+}

A test/fold => test/fold +126 -0
@@ 0,0 1,126 @@
+#!/bin/sh
+tool="fold"
+. "$HARNESS"
+
+# A single file operand without trailing newline, folded at 8 columns: full
+# lines hold eight letters, the rest stays on the last line.
+should_handle_one_file() (
+	input="$TMPDIR"/test-one-file
+	printf "abcdefghijklmnopqrstuvwxyz" >"$input"
+	expected="$(printf "abcdefgh\nijklmnop\nqrstuvwx\nyz")"
+	actual="$(fold -w 8 "$input")"
+	test "$actual" = "$expected"
+)
+
+# Several file operands are folded in command line order: the second fixture
+# is passed first, so its lines have to come first in the output.
+should_handle_two_files() (
+	first="$TMPDIR"/test-two-files-1
+	second="$TMPDIR"/test-two-files-2
+	printf "abcdefghijklmno\n" >"$first"
+	printf "pqrstuvwxyz\n" >"$second"
+	expected="$(printf "pqrstuv\nwxyz\nabcdefg\nhijklmn\no\n")"
+	actual="$(fold -w 7 "$second" "$first")"
+	test "$actual" = "$expected"
+)
+
+# stdin is read both when no operand is given and when the operand is "-";
+# the two invocations have to produce the same folded text.
+should_handle_stdin() (
+	input="STDIN test line1\nstdin TEST line 2"
+	expected="$(printf "STDIN tes\nt line1\nstdin TES\nT line 2")"
+	implicit="$(printf "$input" | fold -w 9)"
+	explicit="$(printf "$input" | fold -w 9 -)"
+	test "$implicit" = "$explicit" && test "$implicit" = "$expected"
+)
+
+# Multi-byte UTF-8 characters count as one column each, whatever their length
+# in bytes is.
+should_handle_utf8() (
+	input="Ώ¥あ∀𝜔∊ℝ 𝜉(𝜔)≿⌨"
+	expected="$(printf "Ώ¥あ∀𝜔\n∊ℝ 𝜉(\n𝜔)≿⌨")"
+	actual="$(printf "$input" | fold -w 5)"
+	test "$actual" = "$expected"
+)
+
+# A tab can take up several columns, so the line is full after fewer
+# characters than the width suggests.
+should_handle_tab() (
+	input="$TMPDIR"/test-tab
+	printf "テスト\ttest\tтест\tδοκιμή" >"$input"
+
+	expected="$(printf "テスト\ttest\tтес\nт\tδοκιμή")"
+	actual="$(fold -w 20 "$input")"
+	test "$actual" = "$expected"
+)
+
+# A backspace moves the column one position back, so the character behind it
+# still fits on the line.
+should_handle_backspace() (
+	input="abcdefghij\bklmnopqrst\buvwxyzyxwvut\b\tsr"
+	expected="$(printf "abcdefghij\bk\nlmnopqrst\buv\nwxyzyxwvut\b\n\ts\nr")"
+	actual="$(printf "$input" | fold -w 10)"
+	test "$actual" = "$expected"
+)
+
+# A carriage return resets the column count without ending the output line.
+should_handle_cr() (
+	input="abcdef\rghijk\rlmnopqr\rstuv\nwxyzyxwvutsr"
+	expected="$(printf "abcdef\rghijk\rlmnopq\nr\rstuv\nwxyzyx\nwvutsr")"
+	actual="$(printf "$input" | fold -w 6)"
+	test "$actual" = "$expected"
+)
+
+# With -b the width is counted in bytes: -w 5 cuts the input after every five
+# bytes, even in the middle of a multi-byte character, and carriage return,
+# backspace and tab count as ordinary bytes.
+#
+# The expected output is spelled out as octal escapes, one folded line (ending
+# in \012) per heredoc line. The trailing backslashes join the heredoc lines
+# into one, and read -r keeps the escapes intact for printf.
+should_handle_b_flag() (
+	printf "テ\rスト\ttest\tтес\bт\tδοκιμή" >"$TMPDIR"/test-b-flag-src
+	read -r bytes <<-END_BYTES
+\343\203\206\015\343\012\
+\202\271\343\203\210\012\
+\011\164\145\163\164\012\
+\011\321\202\320\265\012\
+\321\201\010\321\202\012\
+\011\316\264\316\277\012\
+\316\272\316\271\316\012\
+\274\316\256
+	END_BYTES
+	printf "$bytes" >"$TMPDIR"/test-b-flag-exp
+	fold -b -w 5 "$TMPDIR"/test-b-flag-src >"$TMPDIR"/test-b-flag-dst
+	# compare byte by byte: the output is not valid UTF-8 text
+	cmp -s "$TMPDIR"/test-b-flag-dst "$TMPDIR"/test-b-flag-exp
+	[ $? -eq 0 ]
+)
+
+should_handle_s_flag() (
+	printf "ąbč đê fghi j\tķl mnopqrs" >"$TMPDIR"/test-s-flag
+
+	exp="$(printf "ąbč đê \nfghi j\t\nķl mnopqrs")"
+	res="$(fold -s -w 10 "$TMPDIR"/test-s-flag)"
+	[ "$res" = "$exp" ] || return 1
+
+	exp="$(printf "ąbč đê\n fghi j\t\nķl \nmnopqrs")"
+	res="$(fold -b -s -w 10 "$TMPDIR"/test-s-flag)"
+	[ "$res" = "$exp" ]
+)
+
+# The width defaults to 80 columns and can be changed with -w: a line of 104
+# letters is folded once without -w and into lines of 13 letters with -w 13.
+#
+# The trailing backslashes join the heredoc lines into one; the expected
+# strings are read with -r so that their \n escapes survive until printf.
+should_handle_w_flag() (
+	read src <<-END_STR
+abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyz\
+abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyz
+	END_STR
+
+	read -r exp_str_default <<-END_STR
+abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyz\
+abcdefghijklmnopqrstuvwxyzab\ncdefghijklmnopqrstuvwxyz
+	END_STR
+
+	read -r exp_str_13 <<-END_STR
+abcdefghijklm\nnopqrstuvwxyz\nabcdefghijklm\nnopqrstuvwxyz\n\
+abcdefghijklm\nnopqrstuvwxyz\nabcdefghijklm\nnopqrstuvwxyz
+	END_STR
+
+	# default width
+	exp="$(printf "$exp_str_default")"
+	res="$(printf "$src" | fold)"
+	[ "$res" = "$exp" ] || return 1
+
+	# explicit width
+	exp="$(printf "$exp_str_13")"
+	res="$(printf "$src" | fold -w 13)"
+	[ "$res" = "$exp" ]
+
+	# -w arguments that are not a positive integer are not exercised here;
+	# doc/fold.1.scd describes how ctools' fold treats them
+)
+
+should_handle_ddash fold /dev/null
+
+runtests \
+	should_handle_ddash \
+	should_handle_w_flag \
+	should_handle_one_file \
+	should_handle_two_files \
+	should_handle_stdin \
+	should_handle_utf8 \
+	should_handle_tab \
+	should_handle_backspace \
+	should_handle_cr \
+	should_handle_b_flag \
+	should_handle_s_flag

M test/meson.build => test/meson.build +1 -0
@@ 12,6 12,7 @@ test_files = [
 	'echo',
 	'env',
 	'false',
+	'fold',
 	'head',
 	'logname',
 	'nice',