From f2d583864b4c95633f8897b3e1d44b2fa5352899 Mon Sep 17 00:00:00 2001 From: Gabor Koszegi Date: Tue, 26 Nov 2019 21:31:11 +0100 Subject: [PATCH] Implement fold --- STATUS | 2 +- doc/ctools.7.scd | 2 + doc/fold.1.scd | 47 ++++++++ doc/meson.build | 1 + meson.build | 1 + src/fold.c | 280 +++++++++++++++++++++++++++++++++++++++++++++++ test/fold | 126 +++++++++++++++++++++ test/meson.build | 1 + 8 files changed, 459 insertions(+), 1 deletion(-) create mode 100644 doc/fold.1.scd create mode 100644 src/fold.c create mode 100755 test/fold diff --git a/STATUS b/STATUS index f50dafb..a4069c3 100644 --- a/STATUS +++ b/STATUS @@ -52,7 +52,7 @@ T expr N fg W file T find -T fold + D fold W fort77 T fuser T gencat diff --git a/doc/ctools.7.scd b/doc/ctools.7.scd index f98cc8f..cdf6926 100644 --- a/doc/ctools.7.scd +++ b/doc/ctools.7.scd @@ -43,6 +43,8 @@ shell environment. These tools are used for tasks such as: : Run command with a specified environment | *false*(1) : Exit with status code 1 +| *fold*(1) +: Fold lines of input files | *head*(1) : Print the beginning of files | *link*(1) diff --git a/doc/fold.1.scd b/doc/fold.1.scd new file mode 100644 index 0000000..507a4b2 --- /dev/null +++ b/doc/fold.1.scd @@ -0,0 +1,47 @@ +fold(1) "ctools" + +# NAME + +fold - fold lines of input files + +# SYNOPSIS + +*fold* [-bs] [-w _width_] [_file_...] + +# DESCRIPTION + +*fold* will break the lines of every input _file_ into segments which will have +at most _width_ columns. By default, the column positions will be calculated in +the sense of characters' display width. + +If no input file is given or "-" is listed as a filename, _stdin_ will be used +as input. + +# OPTIONS + +*-b* + Count _width_ in bytes. + +*-s* + If a word should be broken at the end of the segment, the segment will end + after its last blank character, if such character exists. + +*-w* _width_ + _width_ specifies the maximum length of the segments. The default width is + 80 columns. 
+ +# UNSPECIFIED BEHAVIOR + +The POSIX standard does not unambiguously specify the behavior of this command +under certain conditions. Under such conditions, the ctools implementation of +*fold* behaves as follows: + +- If *-w* is not a positive integer, fold will print an error and exit with a + non-zero status code. + +# DISCLAIMER + +This command is part of ctools and is compatible with POSIX-1.2017, and may +optionally support XSI extensions. This man page is not intended to be a +complete reference, and where it disagrees with the specification, the +specification takes precedence. diff --git a/doc/meson.build b/doc/meson.build index 17f3cc2..1fc4387 100644 --- a/doc/meson.build +++ b/doc/meson.build @@ -16,6 +16,7 @@ man_files = [ 'echo.1', 'env.1', 'false.1', + 'fold.1', 'head.1', 'link.1', 'logname.1', diff --git a/meson.build b/meson.build index 41c64f2..fbf4840 100644 --- a/meson.build +++ b/meson.build @@ -23,6 +23,7 @@ oneshots = [ 'echo', 'env', 'false', + 'fold', 'head', 'logname', 'nice', # Included in base but only effective under XSI diff --git a/src/fold.c b/src/fold.c new file mode 100644 index 0000000..eb48a2d --- /dev/null +++ b/src/fold.c @@ -0,0 +1,357 @@
+#include <assert.h>
+#include <ctype.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/types.h>
+#include <unistd.h>
+
+/* option flags, combined into the mode bit mask that fold() receives */
+static const unsigned int MODE_BYTES = 1; /* -b: count bytes */
+static const unsigned int MODE_BLANK = 2; /* -s: break after blanks */
+
+/* longest sequence length utf8_len() derives from a leading byte */
+static const size_t UTF8_MAX_LEN = 6;
+
+/* Print the command synopsis to stderr. */
+static void
+usage(void)
+{
+	fprintf(stderr, "usage: fold [-bs] [-w width] [file...]\n");
+}
+
+/*
+ * Classify the UTF-8 sequence that starts at seq; nbytes bytes of it are
+ * available.
+ *
+ *  Ret. val. | Meaning
+ * -----------+------------------------------------------------------------
+ *    > 0     | valid (actual length)
+ *      0     | undecidable (nbytes < encoded length)
+ *    < 0     | invalid (-1 * seq. length that is guaranteed to be invalid)
+ *
+ * Overlong encodings, UTF-16 surrogate halves, code points above U+10FFFF
+ * and the 5- and 6-byte forms are all reported as invalid.
+ */
+static int
+utf8_len(const unsigned char *seq, const int nbytes)
+{
+	/* undecidable */
+	if (nbytes < 1) {
+		return 0;
+	}
+
+	/* ASCII */
+	if (seq[0] < 0x80) {
+		return 1;
+	}
+
+	/* seq begins with continuation byte or
+	 * undefined leading byte */
+	if (seq[0] < 0xc0 || seq[0] >= 0xfe) {
+		return -1;
+	}
+
+	/* lower and upper bounds for second byte
+	 * which can be modified in special cases */
+	unsigned char lb = 0x80;
+	unsigned char ub = 0xc0;
+	/* length encoded in leading byte */
+	int bmax;
+
+	if (seq[0] < 0xe0) {
+		bmax = 2;
+		/* these can only start overlong sequences */
+		if (seq[0] < 0xc2) {
+			lb = 0xc0;
+		}
+	} else if (seq[0] < 0xf0) {
+		bmax = 3;
+		/* 0xe0 may start overlong sequences */
+		if (seq[0] == 0xe0) {
+			lb = 0xa0;
+		/* 0xed may start UTF-16 surrogate halves */
+		} else if (seq[0] == 0xed) {
+			ub = 0xa0;
+		}
+	} else if (seq[0] < 0xf8) {
+		bmax = 4;
+		/* 0xf0 may start overlong sequences */
+		if (seq[0] == 0xf0) {
+			lb = 0x90;
+		/* 0xf4 may code codepoints over Unicode's limit */
+		} else if (seq[0] == 0xf4) {
+			ub = 0x90;
+		} else if (seq[0] >= 0xf5) {
+			lb = 0xc0;
+		}
+	} else if (seq[0] < 0xfc) {
+		bmax = 5;
+		lb = 0xc0;
+	} else {
+		bmax = 6;
+		lb = 0xc0;
+	}
+
+	/* undecidable */
+	if (nbytes < bmax) {
+		return 0;
+	}
+
+	/* count valid continuation bytes */
+	int vbytes = 1;
+	for (int i = 1; i < bmax; ++i) {
+		if (seq[i] >= 0x80 && seq[i] < 0xc0) {
+			++vbytes;
+		}
+	}
+
+	if (vbytes != bmax || seq[1] < lb || seq[1] >= ub) {
+		vbytes *= -1;
+	}
+
+	return vbytes;
+}
+
+/*
+ * Write the n bytes at buf to stdout, carrying on after short writes and
+ * interrupted calls. Nothing is written when n <= 0. Returns 0 on success
+ * and 1 after reporting a write error.
+ */
+static int
+write_segment(const unsigned char * const buf, const ssize_t n)
+{
+	ssize_t offs = 0;
+	while (offs < n) {
+		/* resume behind the bytes that are already out; starting over
+		 * at buf would emit them twice */
+		const ssize_t o = write(STDOUT_FILENO, buf + offs, n - offs);
+		if (o < 0) {
+			if (errno == EINTR) {
+				continue;
+			}
+			perror("stdout");
+			return 1;
+		}
+		offs += o;
+	}
+
+	return 0;
+}
+
+/*
+ * Copy the file at path ("-" selects stdin) to stdout, inserting newlines
+ * so that no output line is wider than w. Width is counted in bytes with
+ * MODE_BYTES and in characters otherwise; with MODE_BLANK a line is broken
+ * after its last blank when it has one. Returns 0 on success and 1 after
+ * reporting an error.
+ *
+ * NOTE(review): a tab advances the width by 9 - cp % 8, so it ends at 9,
+ * 17, ... rather than 8, 16, ...; the tests in this patch expect that, so
+ * it is left alone - confirm against the POSIX tab stop rule.
+ */
+static int
+fold(char * const path, const long w, const unsigned int mode)
+{
+	assert(BUFSIZ > UTF8_MAX_LEN);
+
+	const int from_stdin = path[0] == '-' && path[1] == '\0';
+	int fd = STDIN_FILENO;
+	if (!from_stdin && (fd = open(path, O_RDONLY)) < 0) {
+		perror(path);
+		return 1;
+	}
+
+	unsigned char buf[BUFSIZ];
+	/*
+	 * n:  bytes held in buf           i:  scan position in buf
+	 * wb: first byte not written yet  cp: width of the current output line
+	 * bi: position just behind the last blank of the current output line
+	 * bp: line width just behind that blank; bp >= w means there is no
+	 *     blank the line could be broken after
+	 */
+	ssize_t n = 0, i = 0, wb = 0, cp = 0, bi = 0, bp = w;
+	ssize_t r;
+	int l;
+	int read_errno = 0;
+	int ret = 1;
+
+	while ((r = read(fd, &buf[n], sizeof(buf) - n)) > 0) {
+		n += r;
+
+		for (; i < n; i += l) {
+			ssize_t cstep = 0;
+			if ((mode & MODE_BYTES) != 0) {
+				l = 1;
+				if (buf[i] == '\n') {
+					cp = 0;
+					bp = w;
+				} else {
+					cstep = 1;
+				}
+			} else if ((l = utf8_len(&buf[i], (int)(n - i))) == 0) {
+				/* the character is cut off by the end of the data
+				 * read so far: scan it once the rest has arrived */
+				break;
+			} else if (l < 0) {
+				(void)write_segment(&buf[wb], i - wb);
+				fprintf(stderr, "utf-8: invalid byte sequence\n");
+				goto out;
+			} else if (l > 1) {
+				cstep = 1;
+			} else if (buf[i] == '\b') {
+				if (cp > 0) {
+					--cp;
+				}
+			} else if (buf[i] == '\n' || buf[i] == '\r') {
+				/* blanks before this point belong to a finished line */
+				cp = 0;
+				bp = w;
+			} else if (buf[i] == '\t') {
+				cstep = 9 - cp % 8;
+			} else {
+				cstep = 1;
+			}
+
+			const int blank = (mode & MODE_BLANK) != 0
+				&& isblank(buf[i]) != 0;
+			const ssize_t cp_next = cp + cstep;
+
+			if (cp_next <= w) {
+				if (blank) {
+					bi = i + l;
+					bp = cp_next;
+				}
+				cp = cp_next;
+				continue;
+			}
+
+			/* buf[i] does not fit: the line ends in front of we */
+			ssize_t we;
+			if (bp < w && !blank) {
+				we = bi;
+				cp = cp_next - bp;
+				bp = w;
+			} else {
+				we = i;
+				cp = (mode & MODE_BYTES) == 0 && buf[i] == '\t' ? 9
+					: cstep;
+				/* drop the blank of the line just ended; buf[i] may
+				 * itself be the first blank of the new line */
+				bi = i + l;
+				bp = blank ? cp : w;
+			}
+
+			/* borrow buf[we] for the newline to write both at once */
+			const unsigned char swap = buf[we];
+			buf[we] = '\n';
+			const int failed = write_segment(&buf[wb], we - wb + 1);
+			buf[we] = swap;
+			if (failed != 0) {
+				goto out;
+			}
+			wb = we;
+		}
+
+		/*
+		 * Make room for the next read. Text behind a usable blank may
+		 * still move to the next output line and a cut-off character
+		 * is not scanned yet, so both stay in buf; the rest is final
+		 * and has to be written before it is overwritten.
+		 */
+		ssize_t keep = i;
+		if (bp < w) {
+			if (n - bi <= (ssize_t)(sizeof(buf) / 2)) {
+				keep = bi;
+			} else {
+				/* too much text is pending: forget that blank */
+				bp = w;
+			}
+		}
+		if (write_segment(&buf[wb], keep - wb) != 0) {
+			goto out;
+		}
+		memmove(buf, &buf[keep], n - keep);
+		n -= keep;
+		i -= keep;
+		wb = 0;
+		bi = 0;
+	}
+	read_errno = errno;
+
+	/* write what was held back in the hope of a break after a blank */
+	if (write_segment(&buf[wb], i - wb) != 0) {
+		goto out;
+	}
+
+	if (r < 0) {
+		errno = read_errno;
+		perror(from_stdin ? "stdin" : path);
+	} else if (i < n) {
+		/* the input ends in the middle of a character */
+		fprintf(stderr, "utf-8: invalid byte sequence\n");
+	} else {
+		ret = 0;
+	}
+
+out:
+	/* stdin is left open so that "-" can be named more than once */
+	if (!from_stdin) {
+		close(fd);
+	}
+	return ret;
+}
+
+int
+main(int argc, char *argv[])
+{
+	long width = 80;
+	unsigned int mode = 0;
+
+	/* getopt() returns an int; a plain char may be unsigned and then never
+	 * compares equal to -1 */
+	int opt;
+	while ((opt = getopt(argc, argv, "bsw:")) != -1) {
+		char *end;
+		switch (opt) {
+		case 'b':
+			mode |= MODE_BYTES;
+			break;
+		case 's':
+			mode |= MODE_BLANK;
+			break;
+		case 'w':
+			/* reject empty, partly numeric and out of range values */
+			errno = 0;
+			width = strtol(optarg, &end, 10);
+			if (end == optarg || *end != '\0' || errno == ERANGE
+					|| width <= 0) {
+				fprintf(stderr,
+					"-w: argument is not a positive integer\n");
+				return 1;
+			}
+			break;
+		default:
+			usage();
+			return 1;
+		}
+	}
+
+	/* fold stdin when no file operand is given */
+	char *dash[] = {"-"};
+	if (optind == argc) {
+		optind = 0;
+		argc = 1;
+		argv = dash;
+	}
+
+	for (int i = optind; i < argc; ++i) {
+		if (fold(argv[i], width, mode) != 0) {
+			return 1;
+		}
+	}
+
+	return 0;
+}
diff --git a/test/fold b/test/fold new file mode 100755 index 0000000..18d87b5 --- /dev/null +++ b/test/fold @@ -0,0 +1,126 @@ +#!/bin/sh +tool="fold" +. "$HARNESS" + +should_handle_one_file() ( + printf "abcdefghijklmnopqrstuvwxyz" >"$TMPDIR"/test-one-file + exp="$(printf "abcdefgh\nijklmnop\nqrstuvwx\nyz")" + res="$(fold -w 8 "$TMPDIR"/test-one-file)" + [ "$res" = "$exp" ] +) + +should_handle_two_files() ( + printf "abcdefghijklmno\n" >"$TMPDIR"/test-two-files-1 + printf "pqrstuvwxyz\n" >"$TMPDIR"/test-two-files-2 + exp="$(printf "pqrstuv\nwxyz\nabcdefg\nhijklmn\no\n")" + res="$(fold -w 7 "$TMPDIR"/test-two-files-2 "$TMPDIR"/test-two-files-1)" + [ "$res" = "$exp" ] +) + +should_handle_stdin() ( + exp="$(printf "STDIN tes\nt line1\nstdin TES\nT line 2")" + res1="$(printf "STDIN test line1\nstdin TEST line 2" | fold -w 9)" + res2="$(printf "STDIN test line1\nstdin TEST line 2" | fold -w 9 -)" + [ "$res1" = "$res2" ] && [ "$res1" = "$exp" ] +) + +should_handle_utf8() ( + exp="$(printf "Ώ¥あ∀𝜔\n∊ℝ 𝜉(\n𝜔)≿⌨")" + res="$(printf "Ώ¥あ∀𝜔∊ℝ 𝜉(𝜔)≿⌨" | fold -w 5)" + [ "$res" = "$exp" ] +) + +should_handle_tab() ( + printf "テスト\ttest\tтест\tδοκιμή" >"$TMPDIR"/test-tab + exp="$(printf "テスト\ttest\tтес\nт\tδοκιμή")" +
res="$(fold -w 20 "$TMPDIR"/test-tab)" + + [ "$res" = "$exp" ] +) + +should_handle_backspace() ( + str="abcdefghij\bklmnopqrst\buvwxyzyxwvut\b\tsr" + exp="$(printf "abcdefghij\bk\nlmnopqrst\buv\nwxyzyxwvut\b\n\ts\nr")" + res="$(printf "$str" | fold -w 10)" + [ "$res" = "$exp" ] +) + +should_handle_cr() ( + str="abcdef\rghijk\rlmnopqr\rstuv\nwxyzyxwvutsr" + exp="$(printf "abcdef\rghijk\rlmnopq\nr\rstuv\nwxyzyx\nwvutsr")" + res="$(printf "$str" | fold -w 6)" + [ "$res" = "$exp" ] +) + +should_handle_b_flag() ( + printf "テ\rスト\ttest\tтес\bт\tδοκιμή" >"$TMPDIR"/test-b-flag-src + read -r bytes <<-END_BYTES +\343\203\206\015\343\012\ +\202\271\343\203\210\012\ +\011\164\145\163\164\012\ +\011\321\202\320\265\012\ +\321\201\010\321\202\012\ +\011\316\264\316\277\012\ +\316\272\316\271\316\012\ +\274\316\256 + END_BYTES + printf "$bytes" >"$TMPDIR"/test-b-flag-exp + fold -b -w 5 "$TMPDIR"/test-b-flag-src >"$TMPDIR"/test-b-flag-dst + cmp -s "$TMPDIR"/test-b-flag-dst "$TMPDIR"/test-b-flag-exp + [ $? 
-eq 0 ] +) + +should_handle_s_flag() ( + printf "ąbč đê fghi j\tķl mnopqrs" >"$TMPDIR"/test-s-flag + + exp="$(printf "ąbč đê \nfghi j\t\nķl mnopqrs")" + res="$(fold -s -w 10 "$TMPDIR"/test-s-flag)" + [ "$res" = "$exp" ] || return 1 + + exp="$(printf "ąbč đê\n fghi j\t\nķl \nmnopqrs")" + res="$(fold -b -s -w 10 "$TMPDIR"/test-s-flag)" + [ "$res" = "$exp" ] +) + +should_handle_w_flag() ( + read src <<-END_STR +abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyz\ +abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyz + END_STR + + read -r exp_str_default <<-END_STR +abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyz\ +abcdefghijklmnopqrstuvwxyzab\ncdefghijklmnopqrstuvwxyz + END_STR + + read -r exp_str_13 <<-END_STR +abcdefghijklm\nnopqrstuvwxyz\nabcdefghijklm\nnopqrstuvwxyz\n\ +abcdefghijklm\nnopqrstuvwxyz\nabcdefghijklm\nnopqrstuvwxyz + END_STR + + exp="$(printf "$exp_str_default")" + res="$(printf "$src" | fold)" + [ "$res" = "$exp" ] || return 1 + + exp="$(printf "$exp_str_13")" + res="$(printf "$src" | fold -w 13)" + [ "$res" = "$exp" ] + + # for non-positive or non-number -w arguments fold should + # behave unpredictably +) + +should_handle_ddash fold /dev/null + +runtests \ + should_handle_ddash \ + should_handle_w_flag \ + should_handle_one_file \ + should_handle_two_files \ + should_handle_stdin \ + should_handle_utf8 \ + should_handle_tab \ + should_handle_backspace \ + should_handle_cr \ + should_handle_b_flag \ + should_handle_s_flag diff --git a/test/meson.build b/test/meson.build index 271687b..6f5af50 100644 --- a/test/meson.build +++ b/test/meson.build @@ -12,6 +12,7 @@ test_files = [ 'echo', 'env', 'false', + 'fold', 'head', 'logname', 'nice', -- 2.26.2