~q3cpma/misc-tools

ref: 313cd570e4b81c5f6749356698ce617c65181b61 misc-tools/utf8.c -rw-r--r-- 5.6 KiB
313cd570q3cpma README nit 10 months ago
                                                                                
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
#include <stdint.h>

#include "misc.h"
#include "utf8.h"


/* Copyright (c) 2008-2010 Bjoern Hoehrmann <bjoern@hoehrmann.de>
   See http://bjoern.hoehrmann.de/utf-8/decoder/dfa/ for details. */
/* DFA states used by utf8_decode(). UTF8_ACCEPT is both the start state and
   the state reached once a complete character has been decoded; UTF8_REJECT
   is the state reached on an invalid byte. State values are row offsets into
   the transition part of utf8d, hence multiples of 12. */
#define UTF8_ACCEPT 0
#define UTF8_REJECT 12

/* Combined lookup table for the decoder:
     utf8d[byte]                 -> character class of the byte (0..11)
     utf8d[256 + state + class]  -> next state */
static const uint8_t utf8d[] =
{
	/* The first part of the table (256 entries, indexed by byte value)
	   maps bytes to character classes, to reduce the size of the
	   transition table and create bitmasks. */
	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
	1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,  9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,
	7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,  7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,
	8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2,  2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
	10,3,3,3,3,3,3,3,3,3,3,3,3,4,3,3, 11,6,6,6,5,8,8,8,8,8,8,8,8,8,8,8,

	/* The second part is a transition table that maps a combination
	   of a state of the automaton and a character class to a state.
	   Each group of 12 entries is the row for one state; states are
	   stored pre-multiplied by 12 so that state + class indexes the
	   row directly. */
	0,12,24,36,60,96,84,12,12,12,48,72,  12,12,12,12,12,12,12,12,12,12,12,12,
	12, 0,12,12,12,12,12, 0,12, 0,12,12, 12,24,12,12,12,12,12,24,12,24,12,12,
	12,12,12,12,12,12,12,24,12,12,12,12, 12,24,12,12,12,12,12,12,12,24,12,12,
	12,12,12,12,12,12,12,36,12,36,12,12, 12,36,12,12,12,12,12,36,12,36,12,12,
	12,36,12,12,12,12,12,12,12,12,12,12,
};

/* Feed one byte to the decoder automaton.
   state: in/out automaton state; must be UTF8_ACCEPT before the first byte
          of a character.
   codep: in/out accumulator for the code point; only meaningful once the
          function has returned UTF8_ACCEPT.
   byte:  the next input byte; used as an index into utf8d, so it must be
          in 0..255.
   Returns the new state: UTF8_ACCEPT when a complete character has been
   read, UTF8_REJECT on an invalid sequence, any other value when more
   bytes are needed. */
static inline uint32_t utf8_decode(uint32_t *state, uint32_t *codep,
	uint32_t byte)
{
	const uint32_t cls = utf8d[byte];

	if (*state == UTF8_ACCEPT)
	{
		/* First byte of a character: the class doubles as a shift count
		   that masks off the leading marker bits */
		*codep = (0xff >> cls) & (byte);
	}
	else
	{
		/* Subsequent byte: append its low 6 bits to the accumulator */
		*codep = (byte & 0x3fu) | (*codep << 6);
	}

	*state = utf8d[256 + *state + cls];
	return *state;
}


/* Write "0x" followed by the bytes s[0] through s[len] -- both ends included,
   so len is the index of the last byte printed rather than a byte count --
   to stderr as two-digit lowercase hex. No newline is written. */
static inline void print_str_hex(const char *s, size_t len)
{
	const uint8_t *cur = (const uint8_t *)s;
	const uint8_t *const last = cur + len;

	fprintf(stderr, "0x");
	while (cur <= last)
	{
		fprintf(stderr, "%02x", *cur);
		++cur;
	}
}

/* Decode one UTF-8 character from the NUL-terminated string at *buf.

   Returns:
     - the decoded code point, with *buf advanced past the character;
     - '\0' if *buf already points at the terminator (*buf is unchanged);
     - UTF8_STEP_MALFORMED if an invalid byte is met; *buf is left pointing
       at the offending byte;
     - UTF8_STEP_INCOMPLETE if the terminator is reached in the middle of a
       multi-byte sequence; *buf is left pointing at the terminator.
   In both error cases the bytes consumed so far are dumped in hex on stderr.
   (The UTF8_STEP_* values are not defined in this file -- presumably they
   come from utf8.h.) */
int64_t utf8_decode_step(const char **buf)
{
	uint32_t codep = 0, state = UTF8_ACCEPT;
	const char *p = *buf;

	for (; **buf; ++*buf)
	{
		switch (utf8_decode(&state, &codep, (uint8_t)**buf))
		{
			case UTF8_ACCEPT:
				++*buf;
				return codep;

			case UTF8_REJECT:
				/* print_str_hex() prints indices 0..len inclusive, so
				   this covers the rejected byte as well */
				print_str_hex(p, (size_t)(*buf - p));
				fprintf(stderr, ": malformed UTF-8 sequence\n");
				return UTF8_STEP_MALFORMED;

			default:
				/* Sequence still in progress, keep reading */
				break;
		}
	}
	if (state != UTF8_ACCEPT)
	{
		/* *buf now sits on the terminator. A non-accepting state means at
		   least one byte was consumed, so *buf - p >= 1; pass the index
		   of the last real byte so that the NUL terminator is not dumped
		   as if it were part of the sequence. */
		print_str_hex(p, (size_t)(*buf - p - 1));
		fprintf(stderr, ": incomplete UTF-8 sequence\n");
		return UTF8_STEP_INCOMPLETE;
	}
	return '\0';
}

/* Encode codep as UTF-8 into out and return the number of bytes written
   (1 to 4). No terminator is appended, so out needs room for up to 4 bytes.
   Values above 0x1fffff (more than 21 bits) are reported through die().
   NOTE(review): nothing here rejects surrogates (0xd800-0xdfff) or values in
   0x110000-0x1fffff; they are encoded like any other value. */
uint8_t utf8_encode(uint32_t codep, char *out)
{
	if (codep > 0x1fffff)
	{
		die("Can't handle codepoints bigger than 0x1fffff");
		return 0; // -Wreturn-type
	}

	if (codep < 0x80)
	{
		/* 7 bits: 0xxxxxxx */
		out[0] = (char)codep;
		return 1;
	}

	if (codep < 0x800)
	{
		/* 11 bits: 110xxxxx 10xxxxxx */
		out[0] = (char)(0xc0 | (codep >> 6));
		out[1] = (char)(0x80 | (codep & 0x3f));
		return 2;
	}

	if (codep < 0x10000)
	{
		/* 16 bits: 1110xxxx 10xxxxxx 10xxxxxx */
		out[0] = (char)(0xe0 | (codep >> 12));
		out[1] = (char)(0x80 | ((codep >> 6) & 0x3f));
		out[2] = (char)(0x80 | (codep & 0x3f));
		return 3;
	}

	/* 21 bits: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
	out[0] = (char)(0xf0 | (codep >> 18));
	out[1] = (char)(0x80 | ((codep >> 12) & 0x3f));
	out[2] = (char)(0x80 | ((codep >> 6) & 0x3f));
	out[3] = (char)(0x80 | (codep & 0x3f));
	return 4;
}

/* Sequence length implied by a leading byte, indexed by the byte value.
   0 marks a continuation byte (10xxxxxx). The table does no validation:
   every byte from 0xf0 up to 0xff maps to 4, and 0xc0/0xc1 map to 2. */
static const uint8_t utf8_len_lut[256] =
{
	/* 0x00-0x7f: single byte */
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0x00 */
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0x10 */
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0x20 */
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0x30 */
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0x40 */
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0x50 */
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0x60 */
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0x70 */
	/* 0x80-0xbf: continuation bytes */
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0x80 */
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0x90 */
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0xa0 */
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0xb0 */
	/* 0xc0-0xdf: two bytes */
	2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* 0xc0 */
	2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* 0xd0 */
	/* 0xe0-0xef: three bytes */
	3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, /* 0xe0 */
	/* 0xf0-0xff: four bytes */
	4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4  /* 0xf0 */
};

/* Return the length in bytes of the UTF-8 sequence introduced by firstbyte,
   or 0 if firstbyte is a continuation byte. */
uint8_t utf8_codep_len(char firstbyte)
{
	/* char may be signed; go through uint8_t to get an index in 0..255 */
	const uint8_t idx = (uint8_t)firstbyte;

	return utf8_len_lut[idx];
}

/* Advance *buf by the sequence length that utf8_codep_len() reports for the
   byte it currently points at. The following bytes are not examined, and if
   *buf points at a continuation byte (length 0) it does not move. */
void utf8_step(const char **buf)
{
	const char *cur = *buf;

	*buf = cur + utf8_codep_len(*cur);
}

/* Move *buf backwards to the nearest preceding byte that is not a
   continuation byte, i.e. for which utf8_codep_len() is non-zero. Always
   moves back at least one byte. There is no lower bound check: the caller
   must guarantee such a byte exists before *buf. */
void utf8_unstep(const char **buf)
{
	const char *cur = *buf;

	do
	{
		--cur;
	} while (utf8_codep_len(*cur) == 0);

	*buf = cur;
}


/* Map c through unicode_tolower_lut; values at or beyond the end of that
   table are returned unchanged. Judging by the names, the table holds
   lowercase mappings indexed by code point -- its contents are not visible
   here. */
uint32_t codep_tolower(uint32_t c)
{
	/* Included inside the body so that the table is scoped to this function */
#include "unicode_tolower.h"
	if (c >= ARRAY_SIZE(unicode_tolower_lut))
	{
		return c;
	}
	return unicode_tolower_lut[c];
}

/* Code taken from git with minor cosmetic modifications */
/* This code is originally from http://www.cl.cam.ac.uk/~mgk25/ucs/ */
/* Closed range [first, last] of code points */
struct interval
{
	uint32_t first;
	uint32_t last;
};

/* Auxiliary function for binary search in interval table.
   ucs:   value to look up.
   table: intervals sorted in ascending order and non-overlapping.
   max:   index of the last entry of table (entry count minus one); a
          negative value denotes an empty table.
   Returns 1 if ucs lies inside one of the intervals, 0 otherwise. */
static int bisearch(uint32_t ucs, const struct interval *table, int max)
{
	int min = 0;

	/* Empty table: nothing can match, and table[max] must not be read */
	if (max < 0)
	{
		return 0;
	}
	/* Quick rejection of values outside the range covered by the table */
	if (ucs < table[0].first || ucs > table[max].last)
	{
		return 0;
	}
	while (max >= min)
	{
		/* Written this way rather than (min + max) / 2 to avoid overflow */
		const int mid = min + (max - min) / 2;

		if (ucs > table[mid].last)
		{
			min = mid + 1;
		}
		else if (ucs < table[mid].first)
		{
			max = mid - 1;
		}
		else
		{
			return 1;
		}
	}

	return 0;
}

/* Return the display width, in columns, of code point ch:
     0  for NUL and for anything listed in the zero_width table,
    -1  for control characters (below 32, or 0x7f through 0x9f),
     2  for anything listed in the double_width table,
     1  otherwise. */
int codep_width(uint32_t ch)
{
	/* Provides zero_width and double_width: sorted lists of
	   non-overlapping intervals */
#include "unicode_width.h"

	if (ch == 0)
	{
		return 0;
	}

	/* 8-bit control characters */
	const int is_c0_ctrl = ch < 32;
	const int is_del_or_c1_ctrl = ch >= 0x7f && ch < 0xa0;
	if (is_c0_ctrl || is_del_or_c1_ctrl)
	{
		return -1;
	}

	/* bisearch() takes the index of the last entry, not the entry count */
	const int zero_last = sizeof(zero_width) / sizeof(zero_width[0]) - 1;
	const int double_last = sizeof(double_width) / sizeof(double_width[0]) - 1;

	/* Non-spacing characters */
	if (bisearch(ch, zero_width, zero_last))
	{
		return 0;
	}

	/* Double width characters */
	return bisearch(ch, double_width, double_last) ? 2 : 1;
}