~andreafeletto/zic

c591d8ae4d85ccfaa311365db22e7d4d5ab09ec3 — Andrea Feletto 5 months ago main
port tokenizer
11 files changed, 1431 insertions(+), 0 deletions(-)

A .editorconfig
A .gitignore
A LICENSE
A README.md
A config.mk
A main.c
A makefile
A tokenizer.c
A tokenizer.h
A unicode.c
A unicode.h
A  => .editorconfig +18 -0
@@ 1,18 @@
root = true

[*]
end_of_line = lf
insert_final_newline = true
charset = utf-8

[*.{c,h}]
trim_trailing_whitespace = true
indent_style = tab
indent_size = 8
max_line_length = 100

[makefile]
trim_trailing_whitespace = true
indent_style = tab
indent_size = 8
max_line_length = 100

A  => .gitignore +2 -0
@@ 1,2 @@
zic
*.o

A  => LICENSE +21 -0
@@ 1,21 @@
MIT License

Copyright (c) Andrea Feletto

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

A  => README.md +32 -0
@@ 1,32 @@

# [Zic]

An alternative [Zig] compiler targeting [C99], written in C99 with no external
dependencies other than libc.
It is currently a work in progress.

## Build

```
git clone https://git.sr.ht/~andreafeletto/zic
cd zic
make
```

## Contributing

Please join the [#andreafeletto] IRC channel to ask for help or to give
feedback.
You are welcome to send patches to the [mailing list] or report bugs on the
[issue tracker].
If you aren't familiar with `git send-email`, you can use the [web interface]
or learn about it by following this excellent [tutorial].

[Zic]: https://sr.ht/~andreafeletto/zic
[zig]: https://ziglang.org
[c99]: https://en.cppreference.com/w/c/99
[#andreafeletto]: ircs://irc.libera.chat/#andreafeletto
[mailing list]: https://lists.sr.ht/~andreafeletto/public-inbox
[issue tracker]: https://todo.sr.ht/~andreafeletto/zic
[web interface]: https://git.sr.ht/~andreafeletto/zic/send-email
[tutorial]: https://git-send-email.io

A  => config.mk +6 -0
@@ 1,6 @@
DESTDIR ?=
PREFIX ?= /usr/local

CC = cc

CFLAGS = -std=c99 -Werror -Wall -Wextra -pedantic -O2

A  => main.c +9 -0
@@ 1,9 @@

#include <stdlib.h>

/* Program entry point: the arguments are accepted but not used yet. */
int
main(int argc, char **argv) {
	/* Silence -Wunused-parameter until the command line is actually parsed. */
	(void)argv;
	(void)argc;

	return EXIT_SUCCESS;
}

A  => makefile +24 -0
@@ 1,24 @@
.POSIX:

# Toolchain and installation settings (CC, CFLAGS, DESTDIR, PREFIX).
include config.mk

SRC := main.c tokenizer.c unicode.c
OBJ := ${SRC:.c=.o}

all: zic

zic: ${OBJ}
	${CC} -o $@ ${OBJ} ${LDFLAGS}

# Every object is rebuilt when one of the project headers changes.
${OBJ}: tokenizer.h unicode.h

# Depend on `all` so that a stale or missing binary is never installed, and
# stick to mkdir/cp/chmod because `install -D ... -t` is a GNU extension.
install: all
	mkdir -p "${DESTDIR}${PREFIX}/bin"
	cp -f zic "${DESTDIR}${PREFIX}/bin/zic"
	chmod 755 "${DESTDIR}${PREFIX}/bin/zic"

uninstall:
	rm -f "${DESTDIR}${PREFIX}/bin/zic"

clean:
	rm -f zic ${OBJ}

.PHONY: all install uninstall clean

A  => tokenizer.c +1054 -0
@@ 1,1054 @@
#include "tokenizer.h"

#include <ctype.h>
#include <string.h>

#include "unicode.h"

/*
 * States of the scanner state machine run by zic_tokenizer_next().
 * Every call starts in ZIC_TOKENIZER_STATE_START; the other states record
 * what has been consumed so far while a multi-character token is being read.
 */
enum zic_tokenizer_state {
	ZIC_TOKENIZER_STATE_START,
	ZIC_TOKENIZER_STATE_IDENTIFIER,
	ZIC_TOKENIZER_STATE_BUILTIN,
	ZIC_TOKENIZER_STATE_STRING_LITERAL,
	ZIC_TOKENIZER_STATE_STRING_LITERAL_BACKSLASH,
	ZIC_TOKENIZER_STATE_MULTILINE_STRING_LITERAL_LINE,
	ZIC_TOKENIZER_STATE_CHAR_LITERAL,
	ZIC_TOKENIZER_STATE_CHAR_LITERAL_BACKSLASH,
	ZIC_TOKENIZER_STATE_CHAR_LITERAL_HEX_ESCAPE,
	ZIC_TOKENIZER_STATE_CHAR_LITERAL_UNICODE_ESCAPE_SAW_U,
	ZIC_TOKENIZER_STATE_CHAR_LITERAL_UNICODE_ESCAPE,
	ZIC_TOKENIZER_STATE_CHAR_LITERAL_UNICODE_INVALID,
	ZIC_TOKENIZER_STATE_CHAR_LITERAL_UNICODE,
	ZIC_TOKENIZER_STATE_CHAR_LITERAL_END,
	ZIC_TOKENIZER_STATE_BACKSLASH,
	ZIC_TOKENIZER_STATE_EQUAL,
	ZIC_TOKENIZER_STATE_BANG,
	ZIC_TOKENIZER_STATE_PIPE,
	ZIC_TOKENIZER_STATE_MINUS,
	ZIC_TOKENIZER_STATE_MINUS_PERCENT,
	ZIC_TOKENIZER_STATE_MINUS_PIPE,
	ZIC_TOKENIZER_STATE_ASTERISK,
	ZIC_TOKENIZER_STATE_ASTERISK_PERCENT,
	ZIC_TOKENIZER_STATE_ASTERISK_PIPE,
	ZIC_TOKENIZER_STATE_SLASH,
	ZIC_TOKENIZER_STATE_LINE_COMMENT_START,
	ZIC_TOKENIZER_STATE_LINE_COMMENT,
	ZIC_TOKENIZER_STATE_DOC_COMMENT_START,
	ZIC_TOKENIZER_STATE_DOC_COMMENT,
	ZIC_TOKENIZER_STATE_INT,
	ZIC_TOKENIZER_STATE_INT_EXPONENT,
	ZIC_TOKENIZER_STATE_INT_PERIOD,
	ZIC_TOKENIZER_STATE_FLOAT,
	ZIC_TOKENIZER_STATE_FLOAT_EXPONENT,
	ZIC_TOKENIZER_STATE_AMPERSAND,
	ZIC_TOKENIZER_STATE_CARET,
	ZIC_TOKENIZER_STATE_PERCENT,
	ZIC_TOKENIZER_STATE_PLUS,
	ZIC_TOKENIZER_STATE_PLUS_PERCENT,
	ZIC_TOKENIZER_STATE_PLUS_PIPE,
	ZIC_TOKENIZER_STATE_ANGLE_BRACKET_LEFT,
	ZIC_TOKENIZER_STATE_ANGLE_BRACKET_ANGLE_BRACKET_LEFT,
	ZIC_TOKENIZER_STATE_ANGLE_BRACKET_ANGLE_BRACKET_LEFT_PIPE,
	ZIC_TOKENIZER_STATE_ANGLE_BRACKET_RIGHT,
	ZIC_TOKENIZER_STATE_ANGLE_BRACKET_ANGLE_BRACKET_RIGHT,
	ZIC_TOKENIZER_STATE_PERIOD,
	ZIC_TOKENIZER_STATE_PERIOD_2,
	ZIC_TOKENIZER_STATE_PERIOD_ASTERISK,
	ZIC_TOKENIZER_STATE_SAW_AT_SIGN,
};

static struct zic_tokenizer_maybe_tag
zic_tokenizer_tag_from_keyword(char *keyword, size_t size) {
	struct zic_tokenizer_maybe_tag result;

	result.flag = true;
	if (strncmp(keyword, "addrspace", size)) {
		result.tag = ZIC_TOKENIZER_TAG_KEYWORD_ADDRSPACE;
	} else if (strncmp(keyword, "align", size)) {
		result.tag = ZIC_TOKENIZER_TAG_KEYWORD_ALIGN;
	} else if (strncmp(keyword, "allowzero", size)) {
		result.tag = ZIC_TOKENIZER_TAG_KEYWORD_ALLOWZERO;
	} else if (strncmp(keyword, "and", size)) {
		result.tag = ZIC_TOKENIZER_TAG_KEYWORD_AND;
	} else if (strncmp(keyword, "anyframe", size)) {
		result.tag = ZIC_TOKENIZER_TAG_KEYWORD_ANYFRAME;
	} else if (strncmp(keyword, "anytype", size)) {
		result.tag = ZIC_TOKENIZER_TAG_KEYWORD_ANYTYPE;
	} else if (strncmp(keyword, "asm", size)) {
		result.tag = ZIC_TOKENIZER_TAG_KEYWORD_ASM;
	} else if (strncmp(keyword, "async", size)) {
		result.tag = ZIC_TOKENIZER_TAG_KEYWORD_ASYNC;
	} else if (strncmp(keyword, "await", size)) {
		result.tag = ZIC_TOKENIZER_TAG_KEYWORD_AWAIT;
	} else if (strncmp(keyword, "break", size)) {
		result.tag = ZIC_TOKENIZER_TAG_KEYWORD_BREAK;
	} else if (strncmp(keyword, "callconv", size)) {
		result.tag = ZIC_TOKENIZER_TAG_KEYWORD_CALLCONV;
	} else if (strncmp(keyword, "catch", size)) {
		result.tag = ZIC_TOKENIZER_TAG_KEYWORD_CATCH;
	} else if (strncmp(keyword, "comptime", size)) {
		result.tag = ZIC_TOKENIZER_TAG_KEYWORD_COMPTIME;
	} else if (strncmp(keyword, "const", size)) {
		result.tag = ZIC_TOKENIZER_TAG_KEYWORD_CONST;
	} else if (strncmp(keyword, "continue", size)) {
		result.tag = ZIC_TOKENIZER_TAG_KEYWORD_CONTINUE;
	} else if (strncmp(keyword, "defer", size)) {
		result.tag = ZIC_TOKENIZER_TAG_KEYWORD_DEFER;
	} else if (strncmp(keyword, "else", size)) {
		result.tag = ZIC_TOKENIZER_TAG_KEYWORD_ELSE;
	} else if (strncmp(keyword, "enum", size)) {
		result.tag = ZIC_TOKENIZER_TAG_KEYWORD_ENUM;
	} else if (strncmp(keyword, "errdefer", size)) {
		result.tag = ZIC_TOKENIZER_TAG_KEYWORD_ERRDEFER;
	} else if (strncmp(keyword, "error", size)) {
		result.tag = ZIC_TOKENIZER_TAG_KEYWORD_ERROR;
	} else if (strncmp(keyword, "export", size)) {
		result.tag = ZIC_TOKENIZER_TAG_KEYWORD_EXPORT;
	} else if (strncmp(keyword, "extern", size)) {
		result.tag = ZIC_TOKENIZER_TAG_KEYWORD_EXTERN;
	} else if (strncmp(keyword, "fn", size)) {
		result.tag = ZIC_TOKENIZER_TAG_KEYWORD_FN;
	} else if (strncmp(keyword, "for", size)) {
		result.tag = ZIC_TOKENIZER_TAG_KEYWORD_FOR;
	} else if (strncmp(keyword, "if", size)) {
		result.tag = ZIC_TOKENIZER_TAG_KEYWORD_IF;
	} else if (strncmp(keyword, "inline", size)) {
		result.tag = ZIC_TOKENIZER_TAG_KEYWORD_INLINE;
	} else if (strncmp(keyword, "noalias", size)) {
		result.tag = ZIC_TOKENIZER_TAG_KEYWORD_NOALIAS;
	} else if (strncmp(keyword, "noinline", size)) {
		result.tag = ZIC_TOKENIZER_TAG_KEYWORD_NOINLINE;
	} else if (strncmp(keyword, "nosuspend", size)) {
		result.tag = ZIC_TOKENIZER_TAG_KEYWORD_NOSUSPEND;
	} else if (strncmp(keyword, "opaque", size)) {
		result.tag = ZIC_TOKENIZER_TAG_KEYWORD_OPAQUE;
	} else if (strncmp(keyword, "or", size)) {
		result.tag = ZIC_TOKENIZER_TAG_KEYWORD_OR;
	} else if (strncmp(keyword, "orelse", size)) {
		result.tag = ZIC_TOKENIZER_TAG_KEYWORD_ORELSE;
	} else if (strncmp(keyword, "packed", size)) {
		result.tag = ZIC_TOKENIZER_TAG_KEYWORD_PACKED;
	} else if (strncmp(keyword, "pub", size)) {
		result.tag = ZIC_TOKENIZER_TAG_KEYWORD_PUB;
	} else if (strncmp(keyword, "resume", size)) {
		result.tag = ZIC_TOKENIZER_TAG_KEYWORD_RESUME;
	} else if (strncmp(keyword, "return", size)) {
		result.tag = ZIC_TOKENIZER_TAG_KEYWORD_RETURN;
	} else if (strncmp(keyword, "linksection", size)) {
		result.tag = ZIC_TOKENIZER_TAG_KEYWORD_LINKSECTION;
	} else if (strncmp(keyword, "struct", size)) {
		result.tag = ZIC_TOKENIZER_TAG_KEYWORD_STRUCT;
	} else if (strncmp(keyword, "suspend", size)) {
		result.tag = ZIC_TOKENIZER_TAG_KEYWORD_SUSPEND;
	} else if (strncmp(keyword, "switch", size)) {
		result.tag = ZIC_TOKENIZER_TAG_KEYWORD_SWITCH;
	} else if (strncmp(keyword, "test", size)) {
		result.tag = ZIC_TOKENIZER_TAG_KEYWORD_TEST;
	} else if (strncmp(keyword, "threadlocal", size)) {
		result.tag = ZIC_TOKENIZER_TAG_KEYWORD_THREADLOCAL;
	} else if (strncmp(keyword, "try", size)) {
		result.tag = ZIC_TOKENIZER_TAG_KEYWORD_TRY;
	} else if (strncmp(keyword, "union", size)) {
		result.tag = ZIC_TOKENIZER_TAG_KEYWORD_UNION;
	} else if (strncmp(keyword, "unreachable", size)) {
		result.tag = ZIC_TOKENIZER_TAG_KEYWORD_UNREACHABLE;
	} else if (strncmp(keyword, "usingnamespace", size)) {
		result.tag = ZIC_TOKENIZER_TAG_KEYWORD_USINGNAMESPACE;
	} else if (strncmp(keyword, "var", size)) {
		result.tag = ZIC_TOKENIZER_TAG_KEYWORD_VAR;
	} else if (strncmp(keyword, "volatile", size)) {
		result.tag = ZIC_TOKENIZER_TAG_KEYWORD_VOLATILE;
	} else if (strncmp(keyword, "while", size)) {
		result.tag = ZIC_TOKENIZER_TAG_KEYWORD_WHILE;
	}

	return result;
}

/*
 * Prepare tokenizer to scan the NUL-terminated source text in buffer.
 *
 * The buffer length is measured with strlen(), scanning starts just after a
 * leading UTF-8 byte order mark when one is present, and any pending invalid
 * token is cleared. The buffer pointer is stored as-is, not copied.
 */
void
zic_tokenizer_init(struct zic_tokenizer *tokenizer, unsigned char *buffer) {
	static const char utf8_bom[] = "\xEF\xBB\xBF";
	const char *text = (char *)buffer;

	tokenizer->pending_invalid_token.flag = false;
	tokenizer->buffer = buffer;
	tokenizer->buffer_size = strlen(text);

	if (strncmp(text, utf8_bom, 3) == 0) {
		tokenizer->index = 3;
	} else {
		tokenizer->index = 0;
	}
}

/*
 * Inspect the character at the current index of a literal or comment.
 *
 * Returns 0 when the character is acceptable; for a multi-byte UTF-8 sequence
 * the index is then advanced to the last byte of that sequence. Otherwise
 * returns the number of bytes that make up the offending character and leaves
 * the index untouched.
 */
static size_t
zic_tokenizer_get_invalid_character_length(struct zic_tokenizer *tokenizer) {
	unsigned char *cursor = tokenizer->buffer + tokenizer->index;
	unsigned char lead = *cursor;
	int sequence_length, decoded;
	size_t following;

	if (lead < 128) {
		if (lead != '\r') {
			/* ASCII control characters are rejected, everything else is fine. */
			return iscntrl(lead) ? 1 : 0;
		}
		/* A carriage return is only accepted as the first half of "\r\n". */
		following = tokenizer->index + 1;
		if (following < tokenizer->buffer_size && tokenizer->buffer[following] == '\n') {
			return 0;
		}
		return 1;
	}

	sequence_length = zic_unicode_utf8_byte_sequence_length(lead);
	if (sequence_length < 0) {
		/* Not a valid UTF-8 lead byte. */
		return 1;
	}
	if (tokenizer->index + sequence_length > tokenizer->buffer_size) {
		/* Sequence is cut off by the end of the buffer: flag what is left. */
		return tokenizer->buffer_size - tokenizer->index;
	}

	if (sequence_length == 2) {
		decoded = zic_unicode_utf8_decode_2(cursor);
		/* 0x85 is rejected in addition to undecodable sequences. */
		if (decoded < 0 || decoded == 0x85) {
			return sequence_length;
		}
	} else if (sequence_length == 3) {
		decoded = zic_unicode_utf8_decode_3(cursor);
		/* 0x2028 and 0x2029 are rejected in addition to undecodable sequences. */
		if (decoded < 0 || decoded == 0x2028 || decoded == 0x2029) {
			return sequence_length;
		}
	} else if (sequence_length == 4) {
		decoded = zic_unicode_utf8_decode_4(cursor);
		if (decoded < 0) {
			return sequence_length;
		}
	}

	/* Skip the continuation bytes; the caller's loop steps over the last one. */
	tokenizer->index += sequence_length - 1;
	return 0;
}

/*
 * Validate the character at the current index and, if it is not acceptable,
 * record an invalid token covering it in tokenizer->pending_invalid_token.
 * Does nothing while an invalid token is already pending.
 */
static void
zic_tokenizer_check_literal_character(struct zic_tokenizer *tokenizer) {
	size_t bad_bytes;

	if (!tokenizer->pending_invalid_token.flag) {
		bad_bytes = zic_tokenizer_get_invalid_character_length(tokenizer);
		if (bad_bytes != 0) {
			tokenizer->pending_invalid_token.token.tag = ZIC_TOKENIZER_TAG_INVALID;
			tokenizer->pending_invalid_token.token.start = tokenizer->index;
			tokenizer->pending_invalid_token.token.end = tokenizer->index + bad_bytes;
			tokenizer->pending_invalid_token.flag = true;
		}
	}
}

struct zic_tokenizer_token
zic_tokenizer_next(struct zic_tokenizer *tokenizer) {
	enum zic_tokenizer_state state;
	struct zic_tokenizer_token result;
	unsigned char c;
	struct zic_tokenizer_maybe_tag keyword;
	size_t seen_escape_digits, remaining_code_units;

	if (tokenizer->pending_invalid_token.flag) {
		tokenizer->pending_invalid_token.flag = false;
		return tokenizer->pending_invalid_token.token;
	}

	state = ZIC_TOKENIZER_STATE_START;
	result.tag = ZIC_TOKENIZER_TAG_EOF;
	result.start = tokenizer->index;

	for (;; tokenizer->index++) {
		c = tokenizer->buffer[tokenizer->index];
		switch (state) {
		case ZIC_TOKENIZER_STATE_START:
			if (('a' <= c && c <= 'z') || ('A' <= c && c <= 'Z') || c == '_') {
				state = ZIC_TOKENIZER_STATE_IDENTIFIER;
				result.tag = ZIC_TOKENIZER_TAG_IDENTIFIER;
				break;
			}
			if ('0' <= c && c <= '9') {
				state = ZIC_TOKENIZER_STATE_INT;
				result.tag = ZIC_TOKENIZER_TAG_NUMBER_LITERAL;
				break;
			}
			switch (c) {
                        case 0:
				if (tokenizer->index != tokenizer->buffer_size) {
					result.tag = ZIC_TOKENIZER_TAG_INVALID;
					result.start = tokenizer->index++;
					result.end = tokenizer->index;
					return result;
				}
				goto finish;
			case ' ':
			case '\n':
			case '\t':
			case '\r':
				result.start = tokenizer->index + 1;
				break;
			case '"':
				state = ZIC_TOKENIZER_STATE_STRING_LITERAL;
				result.tag = ZIC_TOKENIZER_TAG_STRING_LITERAL;
				break;
			case '\'':
				state = ZIC_TOKENIZER_STATE_CHAR_LITERAL;
				break;
			case '@':
				state = ZIC_TOKENIZER_STATE_SAW_AT_SIGN;
				break;
			case '=':
				state = ZIC_TOKENIZER_STATE_EQUAL;
				break;
			case '!':
				state = ZIC_TOKENIZER_STATE_BANG;
				break;
			case '|':
				state = ZIC_TOKENIZER_STATE_PIPE;
				break;
			case '(':
				result.tag = ZIC_TOKENIZER_TAG_L_PAREN;
				tokenizer->index++;
				goto finish;
			case ')':
				result.tag = ZIC_TOKENIZER_TAG_R_PAREN;
				tokenizer->index++;
				goto finish;
			case '[':
				result.tag = ZIC_TOKENIZER_TAG_L_BRACKET;
				tokenizer->index++;
				goto finish;
			case ']':
				result.tag = ZIC_TOKENIZER_TAG_R_BRACKET;
				tokenizer->index++;
				goto finish;
			case ';':
				result.tag = ZIC_TOKENIZER_TAG_SEMICOLON;
				tokenizer->index++;
				goto finish;
			case ',':
				result.tag = ZIC_TOKENIZER_TAG_COMMA;
				tokenizer->index++;
				goto finish;
			case '?':
				result.tag = ZIC_TOKENIZER_TAG_QUESTION_MARK;
				tokenizer->index++;
				goto finish;
			case ':':
				result.tag = ZIC_TOKENIZER_TAG_COLON;
				tokenizer->index++;
				goto finish;
			case '%':
				state = ZIC_TOKENIZER_STATE_PERCENT;
				break;
			case '*':
				state = ZIC_TOKENIZER_STATE_ASTERISK;
				break;
			case '+':
				state = ZIC_TOKENIZER_STATE_PLUS;
				break;
			case '<':
				state = ZIC_TOKENIZER_STATE_ANGLE_BRACKET_LEFT;
				break;
			case '>':
				state = ZIC_TOKENIZER_STATE_ANGLE_BRACKET_RIGHT;
				break;
			case '^':
				state = ZIC_TOKENIZER_STATE_CARET;
				break;
			case '\\':
				state = ZIC_TOKENIZER_STATE_BACKSLASH;
				result.tag = ZIC_TOKENIZER_TAG_MULTILINE_STRING_LITERAL_LINE;
				break;
			case '{':
				result.tag = ZIC_TOKENIZER_TAG_L_BRACE;
				tokenizer->index++;
				goto finish;
			case '}':
				result.tag = ZIC_TOKENIZER_TAG_R_BRACE;
				tokenizer->index++;
				goto finish;
			case '~':
				result.tag = ZIC_TOKENIZER_TAG_TILDE;
				tokenizer->index++;
				goto finish;
			case '.':
				state = ZIC_TOKENIZER_STATE_PERIOD;
				break;
			case '-':
				state = ZIC_TOKENIZER_STATE_MINUS;
				break;
			case '/':
				state = ZIC_TOKENIZER_STATE_SLASH;
				break;
			case '&':
				state = ZIC_TOKENIZER_STATE_AMPERSAND;
				break;
			default:
				result.tag = ZIC_TOKENIZER_TAG_INVALID;
				result.end = tokenizer->index++;
				return result;
			}
			break;
		case ZIC_TOKENIZER_STATE_SAW_AT_SIGN:
			if (c == '"') {
				result.tag = ZIC_TOKENIZER_TAG_IDENTIFIER;
				state = ZIC_TOKENIZER_STATE_STRING_LITERAL;
			} else if (('a' <= c && c <= 'z') || ('A' <= c && c <= 'Z') || c == '_') {
				result.tag = ZIC_TOKENIZER_TAG_BUILTIN;
				state = ZIC_TOKENIZER_STATE_BUILTIN;
			} else {
				result.tag = ZIC_TOKENIZER_TAG_INVALID;
				goto finish;
			}
			break;
		case ZIC_TOKENIZER_STATE_AMPERSAND:
			if (c == '=') {
				result.tag = ZIC_TOKENIZER_TAG_AMPERSAND_EQUAL;
				tokenizer->index++;
				goto finish;
			} else {
				result.tag = ZIC_TOKENIZER_TAG_AMPERSAND;
				goto finish;
			}
			break;
		case ZIC_TOKENIZER_STATE_ASTERISK:
			switch (c) {
			case '=':
				result.tag = ZIC_TOKENIZER_TAG_ASTERISK_EQUAL;
				tokenizer->index++;
				goto finish;
			case '*':
				result.tag = ZIC_TOKENIZER_TAG_ASTERISK_ASTERISK;
				tokenizer->index++;
				goto finish;
			case '%':
				state = ZIC_TOKENIZER_STATE_ASTERISK_PERCENT;
				break;
			case '|':
				state = ZIC_TOKENIZER_STATE_ASTERISK_PIPE;
				break;
			default:
				result.tag = ZIC_TOKENIZER_TAG_ASTERISK;
				goto finish;
			}
			break;
		case ZIC_TOKENIZER_STATE_ASTERISK_PERCENT:
			if (c == '=') {
				result.tag = ZIC_TOKENIZER_TAG_ASTERISK_PERCENT_EQUAL;
				tokenizer->index++;
				goto finish;
			} else {
				result.tag = ZIC_TOKENIZER_TAG_ASTERISK_PERCENT;
				goto finish;
			}
			break;
		case ZIC_TOKENIZER_STATE_ASTERISK_PIPE:
			if (c == '=') {
				result.tag = ZIC_TOKENIZER_TAG_ASTERISK_PIPE_EQUAL;
				tokenizer->index++;
				goto finish;
			} else {
				result.tag = ZIC_TOKENIZER_TAG_ASTERISK_PIPE;
				goto finish;
			}
			break;
		case ZIC_TOKENIZER_STATE_PERCENT:
			if (c == '=') {
				result.tag = ZIC_TOKENIZER_TAG_PERCENT_EQUAL;
				tokenizer->index++;
				goto finish;
			} else {
				result.tag = ZIC_TOKENIZER_TAG_PERCENT;
				goto finish;
			}
			break;
		case ZIC_TOKENIZER_STATE_PLUS:
			switch (c) {
			case '=':
				result.tag = ZIC_TOKENIZER_TAG_PLUS_EQUAL;
				tokenizer->index++;
				goto finish;
			case '+':
				result.tag = ZIC_TOKENIZER_TAG_PLUS_PLUS;
				tokenizer->index++;
				goto finish;
			case '%':
				state = ZIC_TOKENIZER_STATE_PLUS_PERCENT;
				break;
			case '|':
				state = ZIC_TOKENIZER_STATE_PLUS_PIPE;
				break;
			default:
				result.tag = ZIC_TOKENIZER_TAG_PLUS;
				goto finish;
			}
			break;
		case ZIC_TOKENIZER_STATE_PLUS_PERCENT:
			if (c == '=') {
				result.tag = ZIC_TOKENIZER_TAG_PLUS_PERCENT_EQUAL;
				tokenizer->index++;
				goto finish;
			} else {
				result.tag = ZIC_TOKENIZER_TAG_PLUS_PERCENT;
				goto finish;
			}
			break;
		case ZIC_TOKENIZER_STATE_PLUS_PIPE:
			if (c == '=') {
				result.tag = ZIC_TOKENIZER_TAG_PLUS_PIPE_EQUAL;
				tokenizer->index++;
				goto finish;
			} else {
				result.tag = ZIC_TOKENIZER_TAG_PLUS_PIPE;
				goto finish;
			}
			break;
		case ZIC_TOKENIZER_STATE_CARET:
			if (c == '=') {
				result.tag = ZIC_TOKENIZER_TAG_CARET_EQUAL;
				tokenizer->index++;
				goto finish;
			} else {
				result.tag = ZIC_TOKENIZER_TAG_CARET;
				goto finish;
			}
			break;
		case ZIC_TOKENIZER_STATE_IDENTIFIER:
			if (('a' <= c && c <= 'z') || ('A' <= c && c <= 'Z') || ('0' <= c && c <= '9') || c == '_') {
				break;
			}
			keyword = zic_tokenizer_tag_from_keyword(
				(char *)(tokenizer->buffer + result.start),
				tokenizer->index - result.start);
			if (keyword.flag) {
				result.tag = keyword.tag;
			}
			goto finish;
		case ZIC_TOKENIZER_STATE_BUILTIN:
			if (('a' <= c && c <= 'z') || ('A' <= c && c <= 'Z') || ('0' <= c && c <= '9') || c == '_') {
				break;
			}
			goto finish;
		case ZIC_TOKENIZER_STATE_BACKSLASH:
			if (c == '\\') {
				state = ZIC_TOKENIZER_STATE_MULTILINE_STRING_LITERAL_LINE;
			} else {
				result.tag = ZIC_TOKENIZER_TAG_INVALID;
				goto finish;
			}
			break;
		case ZIC_TOKENIZER_STATE_STRING_LITERAL:
			switch (c) {
			case '\\':
				state = ZIC_TOKENIZER_STATE_STRING_LITERAL_BACKSLASH;
				break;
			case '"':
				tokenizer->index++;
				goto finish;
			case 0:
				if (tokenizer->index == tokenizer->buffer_size) {
					result.tag = ZIC_TOKENIZER_TAG_INVALID;
					goto finish;
				} else {
					zic_tokenizer_check_literal_character(tokenizer);
				}
				break;
			case '\n':
				result.tag = ZIC_TOKENIZER_TAG_INVALID;
				goto finish;
			default:
				zic_tokenizer_check_literal_character(tokenizer);
				break;
			}
			break;
		case ZIC_TOKENIZER_STATE_STRING_LITERAL_BACKSLASH:
			if (c == 0 || c == '\n') {
				result.tag = ZIC_TOKENIZER_TAG_INVALID;
				goto finish;
			} else {
				result.tag = ZIC_TOKENIZER_TAG_STRING_LITERAL;
			}
			break;
		case ZIC_TOKENIZER_STATE_CHAR_LITERAL:
			if (c == '\'' || (0x80 <= c && c <= 0xBF) || c >= 0xF8) {
				result.tag = ZIC_TOKENIZER_TAG_INVALID;
				goto finish;
			} else if (0xC0 <= c && c <= 0xDF) {
				remaining_code_units = 1;
				state = ZIC_TOKENIZER_STATE_CHAR_LITERAL_UNICODE;
				break;
			} else if (0xE0 <= c && c <= 0xEF) {
				remaining_code_units = 2;
				state = ZIC_TOKENIZER_STATE_CHAR_LITERAL_UNICODE;
				break;
			} else if (0xF0 <= c && c <= 0xF7) {
				remaining_code_units = 3;
				state = ZIC_TOKENIZER_STATE_CHAR_LITERAL_UNICODE;
				break;
			}
			switch (c) {
			case 0:
				result.tag = ZIC_TOKENIZER_TAG_INVALID;
				goto finish;
			case '\\':
				state = ZIC_TOKENIZER_STATE_CHAR_LITERAL_BACKSLASH;
				break;
			case '\n':
				result.tag = ZIC_TOKENIZER_TAG_INVALID;
				goto finish;
			default:
				state = ZIC_TOKENIZER_STATE_CHAR_LITERAL_END;
				break;
			}
			break;
		case ZIC_TOKENIZER_STATE_CHAR_LITERAL_BACKSLASH:
			switch (c) {
			case '0':
			case '\n':
				result.tag = ZIC_TOKENIZER_TAG_INVALID;
				goto finish;
			case 'x':
				state = ZIC_TOKENIZER_STATE_CHAR_LITERAL_HEX_ESCAPE;
				seen_escape_digits = 0;
				break;
			case 'u':
				state = ZIC_TOKENIZER_STATE_CHAR_LITERAL_UNICODE_ESCAPE_SAW_U;
				break;
			default:
				state = ZIC_TOKENIZER_STATE_CHAR_LITERAL_END;
				break;
			}
			break;
		case ZIC_TOKENIZER_STATE_CHAR_LITERAL_HEX_ESCAPE:
			if (('0' <= c && c <= '9') || ('a' <= c && c <= 'f') || ('A' <= c && c <= 'F')) {
				seen_escape_digits++;
				if (seen_escape_digits == 2) {
					state = ZIC_TOKENIZER_STATE_CHAR_LITERAL_END;
				}
			} else {
				result.tag = ZIC_TOKENIZER_TAG_INVALID;
				goto finish;
			}
			break;
		case ZIC_TOKENIZER_STATE_CHAR_LITERAL_UNICODE_ESCAPE_SAW_U:
			switch (c) {
			case '0':
				result.tag = ZIC_TOKENIZER_TAG_INVALID;
				goto finish;
			case '{':
				state = ZIC_TOKENIZER_STATE_CHAR_LITERAL_UNICODE_ESCAPE;
				break;
			default:
				result.tag = ZIC_TOKENIZER_TAG_INVALID;
				state = ZIC_TOKENIZER_STATE_CHAR_LITERAL_UNICODE_INVALID;
				break;
			}
			break;
		case ZIC_TOKENIZER_STATE_CHAR_LITERAL_UNICODE_ESCAPE:
			if (('0' <= c && c <= '9') || ('a' <= c && c <= 'f') || ('A' <= c && c <= 'F')) {
				break;
			}
			switch (c) {
			case '0':
				result.tag = ZIC_TOKENIZER_TAG_INVALID;
				goto finish;
			case '}':
				state = ZIC_TOKENIZER_STATE_CHAR_LITERAL_END;
				break;
			default:
				result.tag = ZIC_TOKENIZER_TAG_INVALID;
				state = ZIC_TOKENIZER_STATE_CHAR_LITERAL_UNICODE_INVALID;
				break;
			}
			break;
		case ZIC_TOKENIZER_STATE_CHAR_LITERAL_UNICODE_INVALID:
			if (('0' <= c && c <= '9') || ('a' <= c && c <= 'f') || ('A' <= c && c <= 'F')) {
				break;
			}
			goto finish;
		case ZIC_TOKENIZER_STATE_CHAR_LITERAL_END:
			switch (c) {
			case '\'':
				result.tag = ZIC_TOKENIZER_TAG_CHAR_LITERAL;
				tokenizer->index++;
				goto finish;
			default:
				result.tag = ZIC_TOKENIZER_TAG_INVALID;
				goto finish;
			}
			break;
		case ZIC_TOKENIZER_STATE_CHAR_LITERAL_UNICODE:
			if (0x80 <= c && c <= 0xBF) {
				remaining_code_units--;
				if (remaining_code_units == 0) {
					state = ZIC_TOKENIZER_STATE_CHAR_LITERAL_END;
				}
			} else {
				result.tag = ZIC_TOKENIZER_TAG_INVALID;
				goto finish;
			}
			break;
		case ZIC_TOKENIZER_STATE_MULTILINE_STRING_LITERAL_LINE:
			switch (c) {
			case 0:
				goto finish;
			case '\n':
				tokenizer->index++;
				goto finish;
			case '\t':
				break;
			default:
				zic_tokenizer_check_literal_character(tokenizer);
				break;
			}
			break;
		case ZIC_TOKENIZER_STATE_BANG:
			switch (c) {
			case '=':
				result.tag = ZIC_TOKENIZER_TAG_BANG_EQUAL;
				tokenizer->index++;
				goto finish;
			default:
				result.tag = ZIC_TOKENIZER_TAG_BANG;
				goto finish;
			}
			break;
		case ZIC_TOKENIZER_STATE_PIPE:
			switch (c) {
			case '=':
				result.tag = ZIC_TOKENIZER_TAG_PIPE_EQUAL;
				tokenizer->index++;
				goto finish;
			case '|':
				result.tag = ZIC_TOKENIZER_TAG_PIPE_PIPE;
				tokenizer->index++;
				goto finish;
			default:
				result.tag = ZIC_TOKENIZER_TAG_PIPE;
				goto finish;
			}
			break;
		case ZIC_TOKENIZER_STATE_EQUAL:
			switch (c) {
			case '=':
				result.tag = ZIC_TOKENIZER_TAG_EQUAL_EQUAL;
				tokenizer->index++;
				goto finish;
			case '>':
				result.tag = ZIC_TOKENIZER_TAG_EQUAL_ANGLE_BRACKET_RIGHT;
				tokenizer->index++;
				goto finish;
			default:
				result.tag = ZIC_TOKENIZER_TAG_EQUAL;
				goto finish;
			}
			break;
		case ZIC_TOKENIZER_STATE_MINUS:
			switch (c) {
			case '=':
				result.tag = ZIC_TOKENIZER_TAG_MINUS_EQUAL;
				tokenizer->index++;
				goto finish;
			case '>':
				result.tag = ZIC_TOKENIZER_TAG_ARROW;
				tokenizer->index++;
				goto finish;
			case '%':
				state = ZIC_TOKENIZER_STATE_MINUS_PERCENT;
				break;
			case '|':
				state = ZIC_TOKENIZER_STATE_MINUS_PIPE;
				break;
			default:
				result.tag = ZIC_TOKENIZER_TAG_MINUS;
				goto finish;
			}
			break;
		case ZIC_TOKENIZER_STATE_MINUS_PERCENT:
			switch (c) {
			case '=':
				result.tag = ZIC_TOKENIZER_TAG_MINUS_PERCENT_EQUAL;
				tokenizer->index++;
				goto finish;
			default:
				result.tag = ZIC_TOKENIZER_TAG_MINUS_PERCENT;
				goto finish;
			}
			break;
		case ZIC_TOKENIZER_STATE_MINUS_PIPE:
			switch (c) {
			case '=':
				result.tag = ZIC_TOKENIZER_TAG_MINUS_PIPE_EQUAL;
				tokenizer->index++;
				goto finish;
			default:
				result.tag = ZIC_TOKENIZER_TAG_MINUS_PIPE;
				goto finish;
			}
			break;
		case ZIC_TOKENIZER_STATE_ANGLE_BRACKET_LEFT:
			switch (c) {
			case '<':
				state = ZIC_TOKENIZER_STATE_ANGLE_BRACKET_ANGLE_BRACKET_LEFT;
				break;
			case '=':
				result.tag = ZIC_TOKENIZER_TAG_ANGLE_BRACKET_LEFT_EQUAL;
				tokenizer->index++;
				goto finish;
			default:
				result.tag = ZIC_TOKENIZER_TAG_ANGLE_BRACKET_LEFT;
				goto finish;
			}
			break;
		case ZIC_TOKENIZER_STATE_ANGLE_BRACKET_ANGLE_BRACKET_LEFT:
			switch (c) {
			case '=':
				result.tag = ZIC_TOKENIZER_TAG_ANGLE_BRACKET_ANGLE_BRACKET_LEFT_EQUAL;
				tokenizer->index++;
				goto finish;
			case '|':
				state = ZIC_TOKENIZER_STATE_ANGLE_BRACKET_ANGLE_BRACKET_LEFT_PIPE;
				break;
			default:
				result.tag = ZIC_TOKENIZER_TAG_ANGLE_BRACKET_ANGLE_BRACKET_LEFT;
				goto finish;
			}
			break;
		case ZIC_TOKENIZER_STATE_ANGLE_BRACKET_ANGLE_BRACKET_LEFT_PIPE:
			switch (c) {
			case '=':
				result.tag = ZIC_TOKENIZER_TAG_ANGLE_BRACKET_ANGLE_BRACKET_LEFT_PIPE_EQUAL;
				tokenizer->index++;
				goto finish;
			default:
				result.tag = ZIC_TOKENIZER_TAG_ANGLE_BRACKET_ANGLE_BRACKET_LEFT_PIPE;
				goto finish;
			}
			break;
		case ZIC_TOKENIZER_STATE_ANGLE_BRACKET_RIGHT:
			switch (c) {
			case '>':
				state = ZIC_TOKENIZER_STATE_ANGLE_BRACKET_ANGLE_BRACKET_RIGHT;
				break;
			case '=':
				result.tag = ZIC_TOKENIZER_TAG_ANGLE_BRACKET_RIGHT_EQUAL;
				tokenizer->index++;
				goto finish;
			default:
				result.tag = ZIC_TOKENIZER_TAG_ANGLE_BRACKET_RIGHT;
				goto finish;
			}
			break;
		case ZIC_TOKENIZER_STATE_ANGLE_BRACKET_ANGLE_BRACKET_RIGHT:
			switch (c) {
			case '=':
				result.tag = ZIC_TOKENIZER_TAG_ANGLE_BRACKET_ANGLE_BRACKET_RIGHT_EQUAL;
				tokenizer->index++;
				goto finish;
			default:
				result.tag = ZIC_TOKENIZER_TAG_ANGLE_BRACKET_ANGLE_BRACKET_RIGHT;
				goto finish;
			}
			break;
		case ZIC_TOKENIZER_STATE_PERIOD:
			switch (c) {
			case '.':
				state = ZIC_TOKENIZER_STATE_PERIOD_2;
				break;
			case '*':
				state = ZIC_TOKENIZER_STATE_PERIOD_ASTERISK;
				break;
			default:
				result.tag = ZIC_TOKENIZER_TAG_PERIOD;
				goto finish;
			}
			break;
		case ZIC_TOKENIZER_STATE_PERIOD_2:
			switch (c) {
			case '.':
				result.tag = ZIC_TOKENIZER_TAG_ELLIPSIS3;
				tokenizer->index++;
				goto finish;
			default:
				result.tag = ZIC_TOKENIZER_TAG_ELLIPSIS2;
				goto finish;
			}
			break;
		case ZIC_TOKENIZER_STATE_PERIOD_ASTERISK:
			switch (c) {
			case '*':
				result.tag = ZIC_TOKENIZER_TAG_INVALID_PERIODASTERISKS;
				goto finish;
			default:
				result.tag = ZIC_TOKENIZER_TAG_PERIOD_ASTERISK;
				goto finish;
			}
			break;
		case ZIC_TOKENIZER_STATE_SLASH:
			switch (c) {
			case '/':
				state = ZIC_TOKENIZER_STATE_LINE_COMMENT_START;
				break;
			case '=':
				result.tag = ZIC_TOKENIZER_TAG_SLASH_EQUAL;
				tokenizer->index++;
				goto finish;
			default:
				result.tag = ZIC_TOKENIZER_TAG_SLASH;
				goto finish;
			}
			break;
		case ZIC_TOKENIZER_STATE_LINE_COMMENT_START:
			switch (c) {
			case 0:
				if (tokenizer->index != tokenizer->buffer_size) {
					result.tag = ZIC_TOKENIZER_TAG_INVALID;
					tokenizer->index++;
				}
				goto finish;
			case '/':
				state = ZIC_TOKENIZER_STATE_DOC_COMMENT_START;
				break;
			case '!':
				result.tag = ZIC_TOKENIZER_TAG_CONTAINER_DOC_COMMENT;
				state = ZIC_TOKENIZER_STATE_DOC_COMMENT;
				break;
			case '\n':
				state = ZIC_TOKENIZER_STATE_START;
				result.start = tokenizer->index + 1;
				break;
			case '\t':
				state = ZIC_TOKENIZER_STATE_LINE_COMMENT;
				break;
			default:
				state = ZIC_TOKENIZER_STATE_LINE_COMMENT;
				zic_tokenizer_check_literal_character(tokenizer);
				break;
			}
			break;
		case ZIC_TOKENIZER_STATE_DOC_COMMENT_START:
			switch (c) {
			case '/':
				state = ZIC_TOKENIZER_STATE_LINE_COMMENT;
				break;
			case 0:
			case '\n':
				result.tag = ZIC_TOKENIZER_TAG_DOC_COMMENT;
				goto finish;
			case '\t':
				state = ZIC_TOKENIZER_STATE_DOC_COMMENT;
				result.tag = ZIC_TOKENIZER_TAG_DOC_COMMENT;
				break;
			default:
				state = ZIC_TOKENIZER_STATE_DOC_COMMENT;
				result.tag = ZIC_TOKENIZER_TAG_DOC_COMMENT;
				zic_tokenizer_check_literal_character(tokenizer);
				break;
			}
			break;
		case ZIC_TOKENIZER_STATE_LINE_COMMENT:
			switch (c) {
			case 0:
				if (tokenizer->index != tokenizer->buffer_size) {
					result.tag = ZIC_TOKENIZER_TAG_INVALID;
					tokenizer->index++;
				}
				goto finish;
			case '\n':
				state = ZIC_TOKENIZER_STATE_START;
				result.start = tokenizer->index + 1;
				break;
			case '\t':
				break;
			default:
				zic_tokenizer_check_literal_character(tokenizer);
				break;
			}
			break;
		case ZIC_TOKENIZER_STATE_DOC_COMMENT:
			switch (c) {
			case 0:
			case '\n':
				goto finish;
			case '\t':
				break;
			default:
				zic_tokenizer_check_literal_character(tokenizer);
				break;
			}
			break;
		case ZIC_TOKENIZER_STATE_INT:
			if (c == 'e' || c == 'E' || c == 'p' || c == 'P') {
				state = ZIC_TOKENIZER_STATE_INT_EXPONENT;
				break;
			}
			if (('0' <= c && c <= '9') || ('a' <= c && c <= 'z') || ('A' <= c && c <= 'Z') || c == '_') {
				break;
			}
			if (c == '.') {
				state = ZIC_TOKENIZER_STATE_INT_PERIOD;
				break;
			}
			goto finish;
		case ZIC_TOKENIZER_STATE_INT_EXPONENT:
			switch (c) {
			case '-':
			case '+':
				state = ZIC_TOKENIZER_STATE_FLOAT;
				break;
			default:
				tokenizer->index--;
				state = ZIC_TOKENIZER_STATE_INT;
				break;
			}
			break;
		case ZIC_TOKENIZER_STATE_INT_PERIOD:
			if (c == 'e' || c == 'E' || c == 'p' || c == 'P') {
				state = ZIC_TOKENIZER_STATE_FLOAT_EXPONENT;
				break;
			}
			if (('0' <= c && c <= '9') || ('a' <= c && c <= 'z') || ('A' <= c && c <= 'Z') || c == '_') {
				state = ZIC_TOKENIZER_STATE_FLOAT;
				break;
			}
			tokenizer->index--;
			goto finish;
		case ZIC_TOKENIZER_STATE_FLOAT:
			if (c == 'e' || c == 'E' || c == 'p' || c == 'P') {
				state = ZIC_TOKENIZER_STATE_FLOAT_EXPONENT;
				break;
			}
			if (('0' <= c && c <= '9') || ('a' <= c && c <= 'z') || ('A' <= c && c <= 'Z') || c == '_') {
				break;
			}
			goto finish;
		case ZIC_TOKENIZER_STATE_FLOAT_EXPONENT:
			switch (c) {
			case '-':
			case '+':
				state = ZIC_TOKENIZER_STATE_FLOAT;
				break;
			default:
				tokenizer->index--;
				state = ZIC_TOKENIZER_STATE_FLOAT;
				break;
			}
			break;
		}
	}

finish:
	if (result.tag == ZIC_TOKENIZER_TAG_EOF) {
		if (tokenizer->pending_invalid_token.flag) {
			tokenizer->pending_invalid_token.flag = false;
			return tokenizer->pending_invalid_token.token;
		}
		result.end = tokenizer->index;
	}

	result.end = tokenizer->index;
	return result;
}
\ No newline at end of file

A  => tokenizer.h +156 -0
@@ 1,156 @@
#ifndef ZIC_TOKENIZER_H
#define ZIC_TOKENIZER_H

#include <stdbool.h>
#include <stdint.h>
#include <stdlib.h>

/*
 * Kind of a token, stored in struct zic_tokenizer_token.tag.
 *
 * The enumerators carry no explicit values, so their numeric values are
 * defined by declaration order: inserting or reordering entries changes them.
 * NOTE(review): the set and order of names looks like a port of the Tag enum
 * of Zig's own tokenizer -- confirm before relying on a 1:1 mapping.
 */
enum zic_tokenizer_tag {
	/* Error kinds and literal/identifier-like tokens. */
	ZIC_TOKENIZER_TAG_INVALID,
	ZIC_TOKENIZER_TAG_INVALID_PERIODASTERISKS,
	ZIC_TOKENIZER_TAG_IDENTIFIER,
	ZIC_TOKENIZER_TAG_STRING_LITERAL,
	ZIC_TOKENIZER_TAG_MULTILINE_STRING_LITERAL_LINE,
	ZIC_TOKENIZER_TAG_CHAR_LITERAL,
	ZIC_TOKENIZER_TAG_EOF,
	ZIC_TOKENIZER_TAG_BUILTIN,
	/* Operators and punctuation. */
	ZIC_TOKENIZER_TAG_BANG,
	ZIC_TOKENIZER_TAG_PIPE,
	ZIC_TOKENIZER_TAG_PIPE_PIPE,
	ZIC_TOKENIZER_TAG_PIPE_EQUAL,
	ZIC_TOKENIZER_TAG_EQUAL,
	ZIC_TOKENIZER_TAG_EQUAL_EQUAL,
	ZIC_TOKENIZER_TAG_EQUAL_ANGLE_BRACKET_RIGHT,
	ZIC_TOKENIZER_TAG_BANG_EQUAL,
	ZIC_TOKENIZER_TAG_L_PAREN,
	ZIC_TOKENIZER_TAG_R_PAREN,
	ZIC_TOKENIZER_TAG_SEMICOLON,
	ZIC_TOKENIZER_TAG_PERCENT,
	ZIC_TOKENIZER_TAG_PERCENT_EQUAL,
	ZIC_TOKENIZER_TAG_L_BRACE,
	ZIC_TOKENIZER_TAG_R_BRACE,
	ZIC_TOKENIZER_TAG_L_BRACKET,
	ZIC_TOKENIZER_TAG_R_BRACKET,
	ZIC_TOKENIZER_TAG_PERIOD,
	ZIC_TOKENIZER_TAG_PERIOD_ASTERISK,
	ZIC_TOKENIZER_TAG_ELLIPSIS2,
	ZIC_TOKENIZER_TAG_ELLIPSIS3,
	ZIC_TOKENIZER_TAG_CARET,
	ZIC_TOKENIZER_TAG_CARET_EQUAL,
	ZIC_TOKENIZER_TAG_PLUS,
	ZIC_TOKENIZER_TAG_PLUS_PLUS,
	ZIC_TOKENIZER_TAG_PLUS_EQUAL,
	ZIC_TOKENIZER_TAG_PLUS_PERCENT,
	ZIC_TOKENIZER_TAG_PLUS_PERCENT_EQUAL,
	ZIC_TOKENIZER_TAG_PLUS_PIPE,
	ZIC_TOKENIZER_TAG_PLUS_PIPE_EQUAL,
	ZIC_TOKENIZER_TAG_MINUS,
	ZIC_TOKENIZER_TAG_MINUS_EQUAL,
	ZIC_TOKENIZER_TAG_MINUS_PERCENT,
	ZIC_TOKENIZER_TAG_MINUS_PERCENT_EQUAL,
	ZIC_TOKENIZER_TAG_MINUS_PIPE,
	ZIC_TOKENIZER_TAG_MINUS_PIPE_EQUAL,
	ZIC_TOKENIZER_TAG_ASTERISK,
	ZIC_TOKENIZER_TAG_ASTERISK_EQUAL,
	ZIC_TOKENIZER_TAG_ASTERISK_ASTERISK,
	ZIC_TOKENIZER_TAG_ASTERISK_PERCENT,
	ZIC_TOKENIZER_TAG_ASTERISK_PERCENT_EQUAL,
	ZIC_TOKENIZER_TAG_ASTERISK_PIPE,
	ZIC_TOKENIZER_TAG_ASTERISK_PIPE_EQUAL,
	ZIC_TOKENIZER_TAG_ARROW,
	ZIC_TOKENIZER_TAG_COLON,
	ZIC_TOKENIZER_TAG_SLASH,
	ZIC_TOKENIZER_TAG_SLASH_EQUAL,
	ZIC_TOKENIZER_TAG_COMMA,
	ZIC_TOKENIZER_TAG_AMPERSAND,
	ZIC_TOKENIZER_TAG_AMPERSAND_EQUAL,
	ZIC_TOKENIZER_TAG_QUESTION_MARK,
	ZIC_TOKENIZER_TAG_ANGLE_BRACKET_LEFT,
	ZIC_TOKENIZER_TAG_ANGLE_BRACKET_LEFT_EQUAL,
	ZIC_TOKENIZER_TAG_ANGLE_BRACKET_ANGLE_BRACKET_LEFT,
	ZIC_TOKENIZER_TAG_ANGLE_BRACKET_ANGLE_BRACKET_LEFT_EQUAL,
	ZIC_TOKENIZER_TAG_ANGLE_BRACKET_ANGLE_BRACKET_LEFT_PIPE,
	ZIC_TOKENIZER_TAG_ANGLE_BRACKET_ANGLE_BRACKET_LEFT_PIPE_EQUAL,
	ZIC_TOKENIZER_TAG_ANGLE_BRACKET_RIGHT,
	ZIC_TOKENIZER_TAG_ANGLE_BRACKET_RIGHT_EQUAL,
	ZIC_TOKENIZER_TAG_ANGLE_BRACKET_ANGLE_BRACKET_RIGHT,
	ZIC_TOKENIZER_TAG_ANGLE_BRACKET_ANGLE_BRACKET_RIGHT_EQUAL,
	ZIC_TOKENIZER_TAG_TILDE,
	/* Number literals and documentation comments. */
	ZIC_TOKENIZER_TAG_NUMBER_LITERAL,
	ZIC_TOKENIZER_TAG_DOC_COMMENT,
	ZIC_TOKENIZER_TAG_CONTAINER_DOC_COMMENT,
	/*
	 * Keywords. Mostly alphabetical; LINKSECTION sits between RETURN and
	 * STRUCT, and moving it would change the values of the entries around it.
	 */
	ZIC_TOKENIZER_TAG_KEYWORD_ADDRSPACE,
	ZIC_TOKENIZER_TAG_KEYWORD_ALIGN,
	ZIC_TOKENIZER_TAG_KEYWORD_ALLOWZERO,
	ZIC_TOKENIZER_TAG_KEYWORD_AND,
	ZIC_TOKENIZER_TAG_KEYWORD_ANYFRAME,
	ZIC_TOKENIZER_TAG_KEYWORD_ANYTYPE,
	ZIC_TOKENIZER_TAG_KEYWORD_ASM,
	ZIC_TOKENIZER_TAG_KEYWORD_ASYNC,
	ZIC_TOKENIZER_TAG_KEYWORD_AWAIT,
	ZIC_TOKENIZER_TAG_KEYWORD_BREAK,
	ZIC_TOKENIZER_TAG_KEYWORD_CALLCONV,
	ZIC_TOKENIZER_TAG_KEYWORD_CATCH,
	ZIC_TOKENIZER_TAG_KEYWORD_COMPTIME,
	ZIC_TOKENIZER_TAG_KEYWORD_CONST,
	ZIC_TOKENIZER_TAG_KEYWORD_CONTINUE,
	ZIC_TOKENIZER_TAG_KEYWORD_DEFER,
	ZIC_TOKENIZER_TAG_KEYWORD_ELSE,
	ZIC_TOKENIZER_TAG_KEYWORD_ENUM,
	ZIC_TOKENIZER_TAG_KEYWORD_ERRDEFER,
	ZIC_TOKENIZER_TAG_KEYWORD_ERROR,
	ZIC_TOKENIZER_TAG_KEYWORD_EXPORT,
	ZIC_TOKENIZER_TAG_KEYWORD_EXTERN,
	ZIC_TOKENIZER_TAG_KEYWORD_FN,
	ZIC_TOKENIZER_TAG_KEYWORD_FOR,
	ZIC_TOKENIZER_TAG_KEYWORD_IF,
	ZIC_TOKENIZER_TAG_KEYWORD_INLINE,
	ZIC_TOKENIZER_TAG_KEYWORD_NOALIAS,
	ZIC_TOKENIZER_TAG_KEYWORD_NOINLINE,
	ZIC_TOKENIZER_TAG_KEYWORD_NOSUSPEND,
	ZIC_TOKENIZER_TAG_KEYWORD_OPAQUE,
	ZIC_TOKENIZER_TAG_KEYWORD_OR,
	ZIC_TOKENIZER_TAG_KEYWORD_ORELSE,
	ZIC_TOKENIZER_TAG_KEYWORD_PACKED,
	ZIC_TOKENIZER_TAG_KEYWORD_PUB,
	ZIC_TOKENIZER_TAG_KEYWORD_RESUME,
	ZIC_TOKENIZER_TAG_KEYWORD_RETURN,
	ZIC_TOKENIZER_TAG_KEYWORD_LINKSECTION,
	ZIC_TOKENIZER_TAG_KEYWORD_STRUCT,
	ZIC_TOKENIZER_TAG_KEYWORD_SUSPEND,
	ZIC_TOKENIZER_TAG_KEYWORD_SWITCH,
	ZIC_TOKENIZER_TAG_KEYWORD_TEST,
	ZIC_TOKENIZER_TAG_KEYWORD_THREADLOCAL,
	ZIC_TOKENIZER_TAG_KEYWORD_TRY,
	ZIC_TOKENIZER_TAG_KEYWORD_UNION,
	ZIC_TOKENIZER_TAG_KEYWORD_UNREACHABLE,
	ZIC_TOKENIZER_TAG_KEYWORD_USINGNAMESPACE,
	ZIC_TOKENIZER_TAG_KEYWORD_VAR,
	ZIC_TOKENIZER_TAG_KEYWORD_VOLATILE,
	ZIC_TOKENIZER_TAG_KEYWORD_WHILE,
};

/*
 * A tag paired with a presence flag.
 * NOTE(review): presumably `tag` is only meaningful when `flag` is true
 * (an "optional tag"); no user of this type is visible here -- confirm.
 */
struct zic_tokenizer_maybe_tag{
	bool flag;
	enum zic_tokenizer_tag tag;
};

/*
 * A single token: its kind plus a [start, end) style pair of positions.
 * The tokenizer sets `end` from its current `index` when it finishes a token.
 * NOTE(review): `start` is presumably the index of the token's first byte in
 * the tokenizer buffer -- confirm against the code that assigns it.
 */
struct zic_tokenizer_token {
	enum zic_tokenizer_tag tag;
	size_t start;
	size_t end;
};

/*
 * A token paired with a presence flag.
 * As used for zic_tokenizer.pending_invalid_token: when `flag` is set, the
 * tokenizer clears it and returns `token` instead of an EOF token.
 */
struct zic_tokenizer_maybe_token {
	bool flag;
	struct zic_tokenizer_token token;
};

/*
 * Tokenizer state.
 */
struct zic_tokenizer {
	/* NOTE(review): presumably the source bytes being tokenized -- confirm. */
	unsigned char *buffer;
	/* NOTE(review): presumably the length of `buffer` in bytes -- confirm. */
	size_t buffer_size;
	/*
	 * Current position. The state machine decrements it to step back one
	 * character, and a finished token's `end` is copied from it.
	 */
	size_t index;
	/* Deferred token handed out in place of EOF when its flag is set. */
	struct zic_tokenizer_maybe_token pending_invalid_token;
};

#endif

A  => unicode.c +95 -0
@@ 1,95 @@
#include "unicode.h"

#include <assert.h>

/*
 * Report how many bytes a UTF-8 sequence occupies, judged only by its first
 * byte.
 *
 * byte: the first byte of the sequence.
 *
 * Returns 1, 2, 3 or 4 according to the lead-byte bit pattern, or -1 when the
 * byte cannot start a sequence (a continuation byte 10xxxxxx, or 11111xxx).
 */
int
zic_unicode_utf8_byte_sequence_length(uint8_t byte) {
	/* 0xxxxxxx: single-byte (ASCII) */
	if ((byte & 0x80) == 0x00) {
		return 1;
	}
	/* 110xxxxx */
	if ((byte & 0xE0) == 0xC0) {
		return 2;
	}
	/* 1110xxxx */
	if ((byte & 0xF0) == 0xE0) {
		return 3;
	}
	/* 11110xxx */
	if ((byte & 0xF8) == 0xF0) {
		return 4;
	}
	return -1;
}

/*
 * Decode a two-byte UTF-8 sequence.
 *
 * bytes: at least two readable bytes; bytes[0] must be a 110xxxxx lead byte
 *        (enforced with assert).
 *
 * Returns the decoded code point, or -1 when bytes[1] is not a continuation
 * byte or when the result is below 0x80 (overlong encoding).
 */
int
zic_unicode_utf8_decode_2(unsigned char *bytes) {
	unsigned char lead = bytes[0];
	unsigned char trail;
	int codepoint;

	assert((lead & 0xE0) == 0xC0);

	trail = bytes[1];
	if ((trail & 0xC0) != 0x80) {
		return -1;
	}

	/* Five payload bits from the lead byte, six from the continuation. */
	codepoint = ((lead & 0x1F) << 6) | (trail & 0x3F);

	return codepoint < 0x80 ? -1 : codepoint;
}

/*
 * Decode a three-byte UTF-8 sequence.
 *
 * bytes: at least three readable bytes; bytes[0] must be a 1110xxxx lead byte
 *        (enforced with assert).
 *
 * Returns the decoded code point, or -1 when:
 *   - bytes[1] or bytes[2] is not a 10xxxxxx continuation byte,
 *   - the result is below 0x800 (overlong encoding), or
 *   - the result is a UTF-16 surrogate, U+D800..U+DFFF inclusive.
 */
int
zic_unicode_utf8_decode_3(unsigned char *bytes) {
	int value;

	assert((bytes[0] & 0xF0) == 0xE0);
	value = bytes[0] & 0xF;

	if ((bytes[1] & 0xC0) != 0x80) {
		return -1;
	}
	value <<= 6;
	value |= bytes[1] & 0x3F;

	if ((bytes[2] & 0xC0) != 0x80) {
		return -1;
	}
	value <<= 6;
	value |= bytes[2] & 0x3F;

	if (value < 0x800) {
		return -1;
	}
	/*
	 * The surrogate block ends at U+DFFF, so the upper bound must be
	 * inclusive; a strict '<' would let ED BF BF (U+DFFF) through.
	 */
	if (0xD800 <= value && value <= 0xDFFF) {
		return -1;
	}
	return value;
}

/*
 * Decode a four-byte UTF-8 sequence.
 *
 * bytes: at least four readable bytes; bytes[0] must be a 11110xxx lead byte
 *        (enforced with assert).
 *
 * Returns the decoded code point, or -1 when any of bytes[1..3] is not a
 * 10xxxxxx continuation byte, or when the result lies outside
 * 0x10000..0x10FFFF (overlong encoding or beyond the last code point).
 */
int
zic_unicode_utf8_decode_4(unsigned char *bytes) {
	int codepoint;
	int i;

	assert((bytes[0] & 0xF8) == 0xF0);
	codepoint = bytes[0] & 0x7;

	/* Fold in the three continuation bytes, six payload bits each. */
	for (i = 1; i < 4; i++) {
		if ((bytes[i] & 0xC0) != 0x80) {
			return -1;
		}
		codepoint = (codepoint << 6) | (bytes[i] & 0x3F);
	}

	if (codepoint < 0x10000 || codepoint > 0x10FFFF) {
		return -1;
	}
	return codepoint;
}

A  => unicode.h +14 -0
@@ 1,14 @@
#ifndef ZIC_UNICODE_H
#define ZIC_UNICODE_H

#include <stdint.h>

/*
 * Length in bytes (1-4) of the UTF-8 sequence introduced by the lead byte
 * `byte`, or -1 if `byte` cannot start a sequence.
 */
int zic_unicode_utf8_byte_sequence_length(uint8_t byte);

/*
 * Decode the two-byte sequence at `bytes`. bytes[0] must be a 110xxxxx lead
 * byte (asserted). Returns the code point, or -1 for a bad continuation byte
 * or an overlong encoding.
 */
int zic_unicode_utf8_decode_2(unsigned char *bytes);

/*
 * Decode the three-byte sequence at `bytes`. bytes[0] must be a 1110xxxx lead
 * byte (asserted). Returns the code point, or -1 for a bad continuation byte,
 * an overlong encoding, or a value rejected by the surrogate check.
 */
int zic_unicode_utf8_decode_3(unsigned char *bytes);

/*
 * Decode the four-byte sequence at `bytes`. bytes[0] must be a 11110xxx lead
 * byte (asserted). Returns the code point, or -1 for a bad continuation byte,
 * an overlong encoding, or a value above 0x10FFFF.
 */
int zic_unicode_utf8_decode_4(unsigned char *bytes);

#endif