/* Source: ~q3cpma/misc-tools, file misc-tools/utf8.h (-rw-r--r--, 2.8 KiB)
 * ref: 023b3736bfe7bc4caeebda9a5d07d060870f0128
 * Last commit: 023b3736, q3cpma, "Fix gen_unicode_tolower_lut.sh typo",
 * 7 months ago
 */
#pragma once

#include <stdbool.h>
#include <stdint.h>
#include <stdlib.h>

/* Error codes returned by utf8_decode_step() in place of a codepoint. Both are
 * negative, so they are distinct from any codepoint value.
 * NOTE(review): judging by the names, MALFORMED presumably signals an invalid
 * byte sequence and INCOMPLETE a truncated one -- confirm against the
 * implementation. */
#define UTF8_STEP_MALFORMED  -1
#define UTF8_STEP_INCOMPLETE -2

/* Try to decode the UTF-8 sequence pointed to by `*buf`, advancing `*buf` so
 * that it points just past the decoded sequence.
 *
 * buf: the pointer to the string starting with a UTF-8 sequence. It'll be
 *		modified to point to the char after the decoded sequence
 *
 * Return: the corresponding codepoint if everything went fine, or one of
 *		   UTF8_STEP_MALFORMED / UTF8_STEP_INCOMPLETE otherwise.
 *
 * NOTE(review): where `*buf` is left when an error is returned is not visible
 * from this header -- check the implementation before resuming after an error.
 */
int64_t utf8_decode_step(const char **buf);

/* Aborting variant of utf8_decode_step(): forwards `buf` to the decoder and
 * calls abort() instead of returning when the decoder reports
 * UTF8_STEP_MALFORMED or UTF8_STEP_INCOMPLETE.
 *
 * buf: passed unchanged to utf8_decode_step()
 *
 * Return: the decoder's result converted to uint32_t
 */
static inline uint32_t xutf8_decode_step(const char **buf)
{
	const int64_t codep = utf8_decode_step(buf);

	/* Only the two known error codes are fatal; anything else is a result */
	if (codep == UTF8_STEP_MALFORMED || codep == UTF8_STEP_INCOMPLETE)
	{
		abort();
	}
	return (uint32_t)codep;
}

/* Encode the codepoint `codep` as UTF-8 into `out`.
 *
 * codep: the codepoint to encode
 * out:   destination buffer receiving the encoded bytes
 *
 * Return: the number of bytes used
 *
 * NOTE(review): the required capacity of `out` and whether a terminating NUL
 * is written are not visible from this header -- confirm in the implementation.
 */
uint8_t utf8_encode(uint32_t codep, char *out);

/* Get the number of bytes in a UTF-8 sequence from its first byte.
 *
 * firstbyte: the first byte of the sequence
 *
 * Return: the number of bytes in the sequence, or 0 if a continuation byte
 *		   was passed
 */
uint8_t utf8_codep_len(char firstbyte);

/* Step `*buf` one codepoint forward. Assumes that `*buf` is on the beginning
 * of a codepoint. Doesn't check for end of string (a check deemed useless if
 * buf is a well-encoded UTF-8 string) */
void utf8_step(const char **buf);

/* Step `*buf` back so that it points to the previous UTF-8 sequence. Doesn't
 * handle the beginning-of-buffer special case: that check is the caller's
 * responsibility */
void utf8_unstep(const char **buf);

/* Return the lowercase version of titlecase and uppercase codepoints, using a
 * LUT.
 *
 * c: the codepoint to convert
 *
 * NOTE(review): the result for codepoints that are neither uppercase nor
 * titlecase is not stated here; presumably `c` is returned unchanged -- confirm.
 */
uint32_t codep_tolower(uint32_t c);

/* The following function defines the column width of an ISO 10646
 * character as follows:
 *
 *	  - The null character (U+0000) has a column width of 0.
 *
 *	  - Other C0/C1 control characters and DEL will lead to a return
 *		value of -1.
 *
 *	  - Non-spacing and enclosing combining characters (general
 *		category code Mn or Me in the Unicode database) have a
 *		column width of 0.
 *
 *	  - SOFT HYPHEN (U+00AD) has a column width of 1.
 *
 *	  - Other format characters (general category code Cf in the Unicode
 *		database) and ZERO WIDTH SPACE (U+200B) have a column width of 0.
 *
 *	  - Hangul Jamo medial vowels and final consonants (U+1160-U+11FF)
 *		have a column width of 0.
 *
 *	  - Spacing characters in the East Asian Wide (W) or East Asian
 *		Full-width (F) category as defined in Unicode Technical
 *		Report #11 have a column width of 2.
 *
 *	  - All remaining characters (including all printable
 *		ISO 8859-1 and WGL4 characters, Unicode control characters,
 *		etc.) have a column width of 1.
 *
 * This implementation assumes that uint32_t characters are encoded
 * in ISO 10646.
 *
 * ch: the codepoint whose column width is wanted
 *
 * Return: the column width as listed above (-1, 0, 1 or 2)
 */
int codep_width(uint32_t ch);