From d15e06d5247bc1ce246d174413fc614eb4fedff6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=A1=D1=82=D1=80=D0=B0=D1=85=D0=B8=D1=9A=D0=B0=20=D0=A0?= =?UTF-8?q?=D0=B0=D0=B4=D0=B8=D1=9B?= Date: Sun, 23 Jun 2024 20:22:02 +0000 Subject: [PATCH] Use sysconf to obtain max line size; realloc line as needed (fix reading sfeed-generated TSV with potentially huge lines); code cleanup --- defs.h | 12 ++--- table.c | 156 +++++++++++++++++++++++++++++++++----------------------- utf8.c | 84 +++++++++++++++--------------- utf8.h | 4 +- 4 files changed, 140 insertions(+), 116 deletions(-) diff --git a/defs.h b/defs.h index 9aed906..7ee3d23 100644 --- a/defs.h +++ b/defs.h @@ -4,18 +4,16 @@ #include "utf8.h" -#define BUFSIZE 4096 -#define SMALL_BUFSIZE 256 +#define BUF_DEFAULT 4096 +#define BUF_DELTA 512 +#define FORMAT_DEFAULT 256 // #define ANSI_SGR_RESET "\033[0m\033[?25h" #define ANSI_SGR_BOLD_ON "\033[1m" #define ANSI_SGR_BOLD_OFF "\033[0m" -#define MIN(a, b) ((a) < (b) ? (a) : (b)) - -typedef unsigned char UBYTE; -typedef unsigned int UINT; -typedef unsigned long ULONG; +#define MIN(a, b) (((a) < (b)) ? (a) : (b)) +#define UNUSED(x) ((void)(x)) typedef enum { CMD_NONE, diff --git a/table.c b/table.c index ec40dc7..9d2702a 100644 --- a/table.c +++ b/table.c @@ -3,6 +3,7 @@ * See the file LICENSE for exact copyright and license details. */ #include +#include #include #include #include @@ -51,26 +52,28 @@ if (ptoken + 2 > token + token_size) \ { \ size_t old_size = token_size; \ - token_size += BUFSIZE; \ + token_size += BUF_DELTA; \ REALLOC(token, char, token_size); \ ptoken = token + old_size - 1; \ } \ *ptoken++ = *parg++; \ } while (0) -#define CHECKSET(format, pformat, format_size, num) \ - do \ - { \ - if (pformat + 2 > format + format_size) \ - { \ - size_t old_size = format_size; \ - format_size += BUFSIZE; \ - REALLOC(format, ULONG, format_size); \ - pformat = format + old_size - 1; \ - } \ - *pformat++ = num; \ +#define CHECKSET(format, pformat, format_size, num) \ + do \ + { \ + if (pformat + 2 > format + format_size) \ + { \ + size_t old_size = format_size; \ + format_size += BUF_DELTA; \ + REALLOC(format, int, format_size); \ + pformat = format + old_size - 1; \ + } \ + *pformat++ = num; \ } while (0) +long line_max; +ssize_t line_size; size_t colno = 0; size_t lineno = 0; int current_symbol_set = TABLE_SYMBOLS_DOUBLE; @@ -81,9 +84,9 @@ size_t table_columns = 0; size_t rune_columns = 80; size_t tab_length = 8; u32 delimiter = L','; -ULONG* format = NULL; +int* format = NULL; size_t format_size = 0; -ULONG format_value = 0; +int format_value = 0; Alignment* align = NULL; int align_header = 0; size_t align_size = 0; @@ -124,7 +127,7 @@ usage(void) void warning(char* fmt, ...) { - char buf[BUFSIZE]; + char buf[line_max]; va_list args; va_start(args, fmt); vsnprintf(buf, sizeof(buf), (const char*)fmt, args); @@ -135,7 +138,7 @@ warning(char* fmt, ...) int error(int code, char* fmt, ...) { - char buf[BUFSIZE]; + char buf[line_max]; va_list args; va_start(args, fmt); vsnprintf(buf, sizeof(buf), (const char*)fmt, args); @@ -320,19 +323,19 @@ set_delimiter(u8* arg, u32* delimiter) } int -set_format(const char* arg, ULONG** format, size_t* format_size) +set_format(const char* arg, int** format, size_t* format_size) { const char* parg = arg; - ULONG* pformat = NULL; + int* pformat = NULL; char* token = NULL; - size_t token_size = BUFSIZE; + size_t token_size = BUF_DEFAULT; char* ptoken = NULL; - ULONG num; + int num; CALLOC(token, char, token_size); ptoken = token; - *format_size = SMALL_BUFSIZE; - CALLOC(*format, ULONG, *format_size); + *format_size = FORMAT_DEFAULT; + CALLOC(*format, int, *format_size); pformat = *format; while (*parg) @@ -578,7 +581,10 @@ number_of_columns(const u8* input, u32 delimiter) while (pinput && *pinput) { - u8_char_to_u32(&uch, pinput, &ch_len); + if (u8_char_to_u32(&uch, pinput, &ch_len)) + warning("Malformed UTF8 at position %td: uch=%02X, " + "pinput=%02X", + pinput - input, uch, *pinput); if (uch == delimiter) result++; pinput += ch_len; @@ -587,8 +593,8 @@ number_of_columns(const u8* input, u32 delimiter) return result; } -UINT -round_div(UINT a, UINT b) +unsigned int +round_div(unsigned int a, unsigned int b) { return (a + (b / 2)) / b; } @@ -596,11 +602,28 @@ round_div(UINT a, UINT b) int main(int argc, char** argv) { + FILE* input = NULL; + Command cmd = CMD_NONE; + u32 uch; + u8* line = NULL; + u8* pline = NULL; + u8* outbuf = NULL; + u8* poutbuf = NULL; + u8* pch_end; + u8 u8ch[7]; char* arg; - Command cmd = CMD_NONE; - char* filename = NULL; - char* sep = NULL; - char* progname = NULL; + char* filename = NULL; + char* sep = NULL; + char* progname = NULL; + char* eol = NULL; + size_t outbufrunelen = 0; + size_t ch_len = 0; + size_t output_lines = 0; + size_t column_start = 0; + int in_quote = 0; + int u8len = 0; + + UNUSED(argc); #ifdef __OpenBSD__ if (pledge("stdio rpath unveil", NULL) < 0) @@ -635,6 +658,16 @@ main(int argc, char** argv) } } + line_max = sysconf(_SC_LINE_MAX); + if (line_max == -1) + { + if (errno) + { + perror(PROGRAMNAME ": sysconf"); + exit(1); + } + line_max = _POSIX2_LINE_MAX; + } do_arg: arg = *++argv; @@ -842,7 +875,6 @@ done_arg: else if (cmd == CMD_VERSION) return version(0); - FILE* input = NULL; if (filename) { #ifdef __OpenBSD__ @@ -864,38 +896,33 @@ done_arg: else input = stdin; - u8* line = NULL; - u8* pline = NULL; - u8* outbuf = NULL; - u8* poutbuf = NULL; - size_t outbufrunelen = 0; - int in_quote = 0; - u32 uch; - u8 u8ch[7]; - size_t u8len = 0; - size_t ch_len = 0; - size_t output_lines = 0; - size_t column_start = 0; - u8* pch_end; - - CALLOC(line, u8, BUFSIZE); - CALLOC(outbuf, u8, BUFSIZE); + line_size = line_max; + CALLOC(line, u8, line_size); + CALLOC(outbuf, u8, line_size); + pline = line; do_input: if (feof(input)) goto done_input; - char* eol = NULL; - if (!fgets((char*)line, BUFSIZE - 1, input)) + if (!fgets((char*)pline, line_size - (pline - line), input)) goto done_input; - eol = strchr((char*)line, '\n'); + eol = strchr((char*)pline, '\n'); if (eol) { - if (msdos && *line != '\n' && *(eol - 1) == '\r') + if (msdos && *pline != '\n' && *(eol - 1) == '\r') *(eol - 1) = 0; *eol = 0; } + else + { + line_size += line_max; + REALLOC(line, u8, line_size); + REALLOC(outbuf, u8, line_size); + pline = line + line_size - line_max - 1; + goto do_input; + } if (!*line && lineno != 0) goto empty_line; @@ -922,8 +949,8 @@ do_input: if (format && !border_mode) { - ULONG* pformat = format; - ULONG format_sum = 0; + int* pformat = format; + int format_sum = 0; while (*pformat) format_sum += *pformat++; pformat = format; @@ -1028,6 +1055,7 @@ empty_line: u8len = u32_char_to_u8(u8ch, *table_symbols[current_symbol_set][5]); u8ch[u8len] = 0; printf("%s\n", u8ch); + pline = line; goto do_input; not_first: @@ -1073,7 +1101,7 @@ continue_line: { in_sgr = 1; if (!no_ansi) - poutbuf += snprintf((char*)poutbuf, BUFSIZE, "%c", + poutbuf += snprintf((char*)poutbuf, BUF_DEFAULT, "%c", *pline); pline++; colno++; @@ -1081,7 +1109,7 @@ continue_line: else if (uch == (u32)L'm') { if (!no_ansi || !in_sgr) - poutbuf += snprintf((char*)poutbuf, BUFSIZE, "%c", + poutbuf += snprintf((char*)poutbuf, BUF_DEFAULT, "%c", *pline); if (!in_sgr) outbufrunelen++; @@ -1095,7 +1123,7 @@ continue_line: { pch_end = pline + ch_len; while (pline != pch_end) - poutbuf += snprintf((char*)poutbuf, BUFSIZE, + poutbuf += snprintf((char*)poutbuf, BUF_DEFAULT, "%c", *pline++); } else @@ -1108,7 +1136,7 @@ continue_line: size_t colwidth = current_column_width(); if (!no_ansi && lineno == 0) - snprintf((char*)poutbuf, BUFSIZE, "%s", + snprintf((char*)poutbuf, BUF_DEFAULT, "%s", ANSI_SGR_BOLD_OFF); print_aligned(outbuf, outbufrunelen, colwidth, @@ -1146,7 +1174,7 @@ continue_line: } if (!no_ansi && lineno == 0) - poutbuf += snprintf((char*)poutbuf, BUFSIZE, "%s", + poutbuf += snprintf((char*)poutbuf, BUF_DEFAULT, "%s", ANSI_SGR_BOLD_ON); pline += ch_len; @@ -1154,7 +1182,7 @@ continue_line: } else if (!ch_len) /* UTF-8 error */ { - snprintf((char*)poutbuf, BUFSIZE, "%c", *pline); + snprintf((char*)poutbuf, BUF_DEFAULT, "%c", *pline); poutbuf++; outbufrunelen++; pline++; @@ -1164,7 +1192,8 @@ continue_line: { while ((current_rune_column + outbufrunelen) % tab_length != 0) { - snprintf((char*)poutbuf, BUFSIZE, "%c", alignment_char); + snprintf((char*)poutbuf, BUF_DEFAULT, "%c", + alignment_char); poutbuf++; outbufrunelen++; } @@ -1175,7 +1204,7 @@ continue_line: { pch_end = pline + ch_len; while (pline != pch_end) - poutbuf += snprintf((char*)poutbuf, BUFSIZE, "%c", + poutbuf += snprintf((char*)poutbuf, BUF_DEFAULT, "%c", *pline++); outbufrunelen++; colno += ch_len; @@ -1193,7 +1222,7 @@ done_line:; : format_value; if (!no_ansi && lineno == 0) - snprintf((char*)poutbuf, BUFSIZE, "%s", + snprintf((char*)poutbuf, BUF_DEFAULT, "%s", ANSI_SGR_BOLD_OFF); print_aligned(outbuf, outbufrunelen, colwidth, @@ -1238,6 +1267,7 @@ done_line:; output_lines++; lineno++; + pline = line; goto do_input; done_input: @@ -1303,11 +1333,11 @@ skip_bottom_border: if (opause) { - char buf[BUFSIZE]; + char buf[line_max]; printf("Press Enter to continue: "); fflush(stdout); clearerr(stdin); - fgets(buf, BUFSIZE, stdin); + fgets(buf, line_max, stdin); } fclose(input); diff --git a/utf8.c b/utf8.c index fc53a54..af1215f 100644 --- a/utf8.c +++ b/utf8.c @@ -20,40 +20,36 @@ * (2^1 = 2 chars) */ -const int -utf_length_table[256] = { - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 32 */ - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 64 */ - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 96 */ - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 128 */ - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 160 */ - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 192 */ - 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, - 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* 224 */ - 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, - 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 1, 1, /* 256 */ +const int utf_length_table[256] = { + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, /* 32 */ + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, /* 64 */ + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, /* 96 */ + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, /* 128 */ + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, /* 160 */ + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, /* 192 */ + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, /* 224 */ + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, + 5, 5, 5, 5, 6, 6, 1, 1, /* 256 */ }; -const int -extract_masks[6] = { - XMASK1, XMASK2, XMASK3, XMASK4, XMASK5, XMASK6 -}; +const int extract_masks[6] = {XMASK1, XMASK2, XMASK3, XMASK4, XMASK5, XMASK6}; int -u8_char_to_u32(u32* to, const u8 *from, size_t* from_delta) +u8_char_to_u32(u32* to, const u8* from, size_t* from_delta) { const u8* pfrom = from; if (!from) return 1; *from_delta = 0; - int len = utf_length_table[*from]; - *to = *from & extract_masks[len-1]; + int len = utf_length_table[*from]; + *to = *from & extract_masks[len - 1]; for (int i = 1; i < len; i++) { pfrom = from + i; @@ -67,43 +63,43 @@ u8_char_to_u32(u32* to, const u8 *from, size_t* from_delta) return 0; } -size_t -u32_char_to_u8(u8 *to, const u32 from) +int +u32_char_to_u8(u8* to, const u32 from) { - size_t len = 0; - u8 start = 0; u32 cfrom = from; + u8 start = 0; + int len = 0; if (from >= BOUND6) { start = START6; - len = 6; + len = 6; } else if (from >= BOUND5) { start = START5; - len = 5; + len = 5; } else if (from >= BOUND4) { start = START4; - len = 4; + len = 4; } else if (from >= BOUND3) { start = START3; - len = 3; + len = 3; } else if (from >= BOUND2) { start = START2; - len = 2; + len = 2; } else { start = START1; - len = 1; + len = 1; } - for (int i = len-1; i > 0; i--) + for (int i = len - 1; i > 0; i--) { to[i] = STARTR | (cfrom & XMASKR); cfrom >>= 6; @@ -113,14 +109,14 @@ u32_char_to_u8(u8 *to, const u32 from) } int -u8_to_u32(u32 *to, const u8 *from, size_t* from_delta) +u8_to_u32(u32* to, const u8* from, size_t* from_delta) { const u8* pfrom = from; - u32* pto = to; + u32* pto = to; if (!from) return 1; size_t delta = 0; - *from_delta = 0; + *from_delta = 0; while (*pfrom) { int result = u8_char_to_u32(pto, pfrom, &delta); @@ -134,12 +130,12 @@ u8_to_u32(u32 *to, const u8 *from, size_t* from_delta) return 0; } -size_t -u32_to_u8(u8 *to, const u32* from) +int +u32_to_u8(u8* to, const u32* from) { - u8* pto = to; const u32* pfrom = from; - size_t len = 0; + u8* pto = to; + int len = 0; if (!from) return 0; while (*pfrom) @@ -152,7 +148,7 @@ u32_to_u8(u8 *to, const u32* from) return len; } -size_t +size_t u32_strlen(const u32* s) { const u32* ps = s; diff --git a/utf8.h b/utf8.h index f4a2588..a3a08ce 100644 --- a/utf8.h +++ b/utf8.h @@ -35,7 +35,7 @@ typedef uint8_t u8; typedef uint32_t u32; int u8_char_to_u32(u32* to, const u8* from, size_t* from_delta); -size_t u32_char_to_u8(u8* to, const u32 from); +int u32_char_to_u8(u8* to, const u32 from); int u8_to_u32(u32* to, const u8* from, size_t* from_delta); -size_t u32_to_u8(u8* to, const u32* from); +int u32_to_u8(u8* to, const u32* from); size_t u32_strlen(const u32* s); -- 2.45.2