~nabijaczleweli/voreutils

ref: 72087d46a1d3f95f4e58bc8d4803aa05ab64b9f0 voreutils/cmd/wc.cpp -rw-r--r-- 9.1 KiB
72087d46наб OpenBSD port 2 months ago
                                                                                
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
// SPDX-License-Identifier: 0BSD


/* TODO: are we just more exhaustive? compare ltrace asdасдasd with asdasd on GNU wc – it does isprint(?) on the raw and only does mbrtowc when it fails(?)
nabijaczleweli@tarta:~/uwu/apt-file$ time zcat Contents-amd64.gz  > /dev/null

real    0m3.247s
user    0m3.189s
sys     0m0.056s
nabijaczleweli@tarta:~/uwu/apt-file$ time zcat Contents-amd64.gz  | wc
6088070 12202264 544222265

real    0m4.364s
user    0m7.755s
sys     0m0.418s
nabijaczleweli@tarta:~/uwu/apt-file$ time zcat Contents-amd64.gz  | ~/code/voreutils/out/cmd/wc
6088070 12202264 544222265

real    0m21.117s
user    0m24.438s
sys     0m0.456s
*/


#include <algorithm>
#include <clocale>
#include <cstring>
#include <cwchar>
#include <cwctype>
#include <initializer_list>
#include <inttypes.h>
#include <optional>
#include <sys/ioctl.h>
#include <tuple>
#include <vector>
#include <vore-file>
#include <vore-getopt>
#include <vore-optarg>
#include <vore-print>
#if __has_include(<linux/fs.h>)  // Linux
#include <linux/fs.h>
#endif
#if __OpenBSD__
#include <sys/disklabel.h>
#include <sys/dkio.h>
#elif __has_include(<sys/disk.h>)  // NetBSD, FreeBSD, Illumos
#include <sys/disk.h>
#include <sys/disklabel.h>
#include <sys/types.h>
#endif


// Expands to the tail of a printf-style argument list: the two-line usage format string
// followed by `self` twice, once for each %s. Intended use: std::fprintf(stderr, USAGE(argv[0])).
#define USAGE(self)                           \
	"usage: %s [-lwmcL] [file]...\n"            \
	"       %s [-lwmcL] --files0-from=files\n", \
	    self, self


// Operand list used when no file operands are given on the command line.
// Its single entry is the address of this very array (cast to const char *), followed by the
// terminating nullptr; comparing a file pointer against *default_stdin therefore tells
// "no operand, read standard input" apart from any name the user actually supplied.
static const char * const default_stdin[] = {reinterpret_cast<const char *>(default_stdin), nullptr};  // sentinel


// One value of type T per quantity wc can report.
// Instantiated with bool as the "which columns were requested" selector, with an unsigned
// integer for the actual counts, and with int for printed column widths.
template <class T>
struct record {
	T newlines, words, characters, bytes, max_width;

	// Member-wise equality. Declared const so that constant records (including temporaries
	// and constexpr objects) can stand on either side of the comparison.
	constexpr bool operator==(const record & rhs) const noexcept {
		return this->newlines == rhs.newlines && this->words == rhs.words && this->characters == rhs.characters && this->bytes == rhs.bytes &&
		       this->max_width == rhs.max_width;
	}
};
// Selector with every column off: the state of the configuration when no counting flag was given.
static constexpr record<bool> record_none = {};
// Selector with only the byte column on: lets the byte count come from metadata instead of reading.
static constexpr record<bool> record_justbytes = {.bytes = true};

// wc: count newlines, words, characters, bytes and the maximum display width of each input.
//
// Flags -l/-w/-m/-c/-L select the columns (default: newlines, words, bytes);
// --files0-from=FILE reads NUL-separated input names from FILE and excludes file operands.
// With no operands standard input is read and printed without a name.
// When more than one result was produced a "total" row is appended and columns are right-aligned.
// Exit status is non-zero if usage was wrong, any input failed, or flushing stdout failed.
int main(int argc, char * const * argv) {
	std::setlocale(LC_ALL, "");  // TODO: locale!

	record<bool> cfg{};
	const char * files_file{};
	for(auto && [arg, val] : vore::opt::get{argc,
	                                        argv,
	                                        "lwcmL",
	                                        {{"lines", no_argument, nullptr, 'l'},
	                                         {"words", no_argument, nullptr, 'w'},
	                                         {"chars", no_argument, nullptr, 'm'},
	                                         {"bytes", no_argument, nullptr, 'c'},
	                                         {"max-line-length", no_argument, nullptr, 'L'},
	                                         {"files0-from", required_argument, nullptr, 'F'}}})
		switch(arg) {
			case 'l':
				cfg.newlines = true;
				break;
			case 'w':
				cfg.words = true;
				break;
			case 'm':
				cfg.characters = true;
				break;
			case 'c':
				cfg.bytes = true;
				break;
			case 'L':
				cfg.max_width = true;
				break;
			case 'F':  // --files0-from
				files_file = val;
				break;
			default:
				return std::fprintf(stderr, USAGE(argv[0])), 1;
		}
	if(cfg == record_none)
		cfg.newlines = cfg.words = cfg.bytes = true;
	// --files0-from and file operands are mutually exclusive
	if(files_file && *(argv + optind))
		return std::fprintf(stderr, USAGE(argv[0])), 1;


	// One entry per input, in order: ((counts if any were obtained, errno or 0), name or nullptr for unnamed stdin)
	std::vector<std::pair<std::pair<std::optional<record<std::uint64_t>>, int>, const char *>> results;

	// Count one input and append its outcome to results.
	auto process = [&](const char * file) {
		const auto name = file == *default_stdin ? nullptr : file;

		vore::file::fd<true> fd{file == *default_stdin ? "-" : file, O_RDONLY | O_CLOEXEC};
		if(fd == -1) {
			results.emplace_back(std::make_pair(std::nullopt, errno), name);
			return;
		}

		record<std::uint64_t> acc{};
		bool done{};

		// Only bytes wanted: try to get the size from metadata instead of reading everything.
		struct stat sb;
		if(cfg == record_justbytes && !fstat(fd, &sb)) {
			// TODO: this probably wants to live elsewhere
			if(S_ISREG(sb.st_mode)) {
				// A zero st_size is not trusted (pseudo-filesystems report 0 for non-empty files);
				// reading a genuinely empty file costs a single read().
				if(sb.st_size > 0) {
					acc.bytes = sb.st_size;
					done      = true;
				}
			} else if(S_ISCHR(sb.st_mode) || S_ISBLK(sb.st_mode)) {
				// Stays empty if this platform has no size query or the query fails: fall back to reading.
				std::optional<std::uint64_t> devsize;
#ifdef BLKGETSIZE64  // Linux
				std::uint64_t sz;
				if(ioctl(fd, BLKGETSIZE64, &sz) != -1)
					devsize = sz;
#elif defined(DIOCGMEDIASIZE)  // NetBSD disk(9), FreeBSD disk(4), Illumos
#if __sun
				std::uint64_t sz;
#else
				off_t sz;
#endif
				if(ioctl(fd, DIOCGMEDIASIZE, &sz) != -1)
					devsize = sz;
#elif __OpenBSD__
				// The label gives sizes in sectors; d_secsize alone is just the sector size.
				struct disklabel dl;
				if(ioctl(fd, DIOCGDINFO, &dl) != -1) {
					const unsigned part = DISKPART(sb.st_rdev);
					if(part < unsigned{dl.d_npartitions})
						devsize = DL_GETPSIZE(&dl.d_partitions[part]) * dl.d_secsize;
				}
#endif

				if(devsize) {
					acc.bytes = *devsize;
					done      = true;
				}
			}
		}

		if(!done) {
			char buf[64 * 1024];
			std::mbstate_t ctx{};           // carries a multibyte character split across read()s
			std::optional<bool> was_space;  // empty until the first character has been decoded
			std::uint64_t cur_width{};      // display width of the line being scanned
			for(ssize_t rd;;) {
				while((rd = read(fd, buf, sizeof(buf))) == -1 && errno == EINTR)
					;
				if(rd == -1) {
					// Report what was counted so far together with the error
					results.emplace_back(std::make_pair(acc, errno), name);
					return;
				}
				if(!rd)
					break;

				if(cfg.newlines)
					acc.newlines += std::count(buf, buf + rd, '\n');

				acc.bytes += rd;

				if(cfg.words || cfg.characters || cfg.max_width) {
					auto cur = buf;
					for(wchar_t c; cur != buf + rd;) {
						// Non-initial state on entry means a previous call already swallowed the start of a character
						const bool carried = !std::mbsinit(&ctx);
						switch(auto r = std::mbrtowc(&c, cur, buf + rd - cur, &ctx)) {
							case static_cast<std::size_t>(-2):  // incomplete
								// All remaining bytes were consumed into ctx; they must not be fed again,
								// the next read() continues the character through ctx.
								cur = buf + rd;
								break;
							case static_cast<std::size_t>(-1):  // EILSEQ: reset, try to go past
								ctx = std::mbstate_t{};  // state is unspecified after an error: start afresh
								// If the broken part was the carried prefix, the current byte may well start
								// a valid character: retry it from the initial state instead of skipping it.
								if(!carried)
									++cur;
								break;
							case 0:  // decoded L'\0', which occupies one byte
								r = 1;
								[[fallthrough]];
							default:
								cur += r;

								if(cfg.words) {
									// A word ends on each non-space -> space transition
									auto sp = std::iswspace(c);
									if(was_space && sp && !*was_space)
										++acc.words;
									was_space = sp;
								}

								++acc.characters;

								if(cfg.max_width) {
									if(c == L'\n' || c == L'\r' || c == L'\f') {
										acc.max_width = std::max(acc.max_width, cur_width);
										cur_width     = 0;
									} else if(c == L'\t')
										cur_width += 8 - cur_width % 8;  // advance to the next multiple of 8
									else
										cur_width += std::max(wcwidth(c), 0);  // non-printable (-1) counts as 0
								}
								break;
						}
					}
				}
			}

			// Input that ends inside a word still ends that word; the last line may lack a terminator
			if(was_space && !*was_space)
				++acc.words;
			acc.max_width = std::max(acc.max_width, cur_width);
		}

		results.emplace_back(std::make_pair(acc, 0), name);
	};

	if(files_file) {
		vore::file::FILE<true> files{files_file, "re"};
		if(!files)
			return std::fprintf(stderr, "%s: %s: %s\n", argv[0], files_file, std::strerror(errno)), 1;

		char * file{};
		std::size_t filecap{};
		while(getdelim(&file, &filecap, '\0', files) != -1)
			process(strdup(file) ?: "(?)");  // results keeps the pointer until exit, so the copy is never freed
		std::free(file);
	} else
		for(auto file : vore::opt::args{*(argv + optind) ? (argv + optind) : default_stdin})
			process(file);


	// Append the "total" row: sums of the counts, maximum of the widths
	if(results.size() > 1) {
		record<std::uint64_t> acc{};
		for(auto && [res, _] : results)
			if(res.first) {
				acc.newlines += res.first->newlines;
				acc.words += res.first->words;
				acc.characters += res.first->characters;
				acc.bytes += res.first->bytes;
				acc.max_width = std::max(acc.max_width, res.first->max_width);
			}
		results.emplace_back(std::make_pair(acc, 0), "total");
	}

	// Column widths: only computed when several rows are printed, otherwise left at 0 (no padding)
	record<int> widths{};
	if(results.size() > 1) {
		// Number of decimal digits of i; 0 for i == 0
		auto log10 = [](auto i) {
			std::uint8_t ret{};
			while(i) {
				i /= 10;
				++ret;
			}
			return ret;
		};

		record<std::uint64_t> acc{};
		for(auto && [res, _] : results)
			if(res.first) {
				acc.newlines   = std::max(acc.newlines, res.first->newlines);
				acc.words      = std::max(acc.words, res.first->words);
				acc.characters = std::max(acc.characters, res.first->characters);
				acc.bytes      = std::max(acc.bytes, res.first->bytes);
				acc.max_width  = std::max(acc.max_width, res.first->max_width);
			}

		// NOTE(review): the 1 + yields one column more than needed when the maximum is 9, 99, ... — confirm whether intended
		widths.newlines   = log10(1 + acc.newlines);
		widths.words      = log10(1 + acc.words);
		widths.characters = log10(1 + acc.characters);
		widths.bytes      = log10(1 + acc.bytes);
		widths.max_width  = log10(1 + acc.max_width);
	}

	for(auto && [res, file] : results) {
		if(res.second)
			std::fprintf(stderr, "%s: %s%s%s\n", argv[0], file ?: "", file ? ": " : "", std::strerror(res.second));

		if(res.first) {
			auto first = true;
			for(auto && [c, w, r] : std::initializer_list<std::tuple<bool, int, std::uint64_t>>{{cfg.newlines, widths.newlines, res.first->newlines},
			                                                                                    {cfg.words, widths.words, res.first->words},
			                                                                                    {cfg.characters, widths.characters, res.first->characters},
			                                                                                    {cfg.bytes, widths.bytes, res.first->bytes},
			                                                                                    {cfg.max_width, widths.max_width, res.first->max_width}})
				if(c) {
					if(!first)
						vore::fputc(' ', stdout);
					std::fprintf(stdout, "%*" PRIu64 "", w, r);
					first = false;
				}
			if(file)
				std::fprintf(stdout, " %s", file);
			vore::fputc('\n', stdout);
		}
	}


	return vore::flush_stdout(argv[0]) || std::any_of(std::begin(results), std::end(results), [](auto && res) { return res.first.second; });
}