A => .editorconfig +18 -0
@@ 1,18 @@
+root = true
+
+[*]
+end_of_line = lf
+insert_final_newline = true
+charset = utf-8
+
+[*.{c,h}]
+trim_trailing_whitespace = true
+indent_style = tab
+indent_size = 8
+max_line_length = 100
+
+[makefile]
+trim_trailing_whitespace = true
+indent_style = tab
+indent_size = 8
+max_line_length = 100
A => .gitignore +2 -0
A => LICENSE +21 -0
@@ 1,21 @@
+MIT License
+
+Copyright (c) Andrea Feletto
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
A => README.md +32 -0
@@ 1,32 @@
+
+# [Zic]
+
+Alternative [zig] compiler targeting [c99], written in c99 with no external
+dependencies other than libc.
+It is currently a work in progress.
+
+## Build
+
+```
+git clone https://git.sr.ht/~andreafeletto/zic
+cd zic
+make
+```
+
+## Contributing
+
+Please join the [#andreafeletto] IRC channel to ask for help or to give
+feedback.
+You are welcome to send patches to the [mailing list] or report bugs on the
+[issue tracker].
+If you aren't familiar with `git send-email`, you can use the [web interface]
+or learn about it by following this excellent [tutorial].
+
+[Zic]: https://sr.ht/~andreafeletto/zic
+[zig]: https://ziglang.org
+[c99]: https://en.cppreference.com/w/c/99
+[#andreafeletto]: ircs://irc.libera.chat/#andreafeletto
+[mailing list]: https://lists.sr.ht/~andreafeletto/public-inbox
+[issue tracker]: https://todo.sr.ht/~andreafeletto/zic
+[web interface]: https://git.sr.ht/~andreafeletto/zic/send-email
+[tutorial]: https://git-send-email.io
A => config.mk +6 -0
@@ 1,6 @@
+# Installation layout: the makefile installs into ${DESTDIR}${PREFIX}/bin.
+# Both are assigned with ?= so a value that is already defined is kept.
+DESTDIR ?=
+PREFIX ?= /usr/local
+
+# C compiler used by the makefile's link rule.
+CC = cc
+
+# Strict C99 with warnings promoted to errors.
+CFLAGS = -std=c99 -Werror -Wall -Wextra -pedantic -O2
A => main.c +9 -0
@@ 1,9 @@
+
+#include <stdlib.h>
+
+/*
+ * Program entry point. The command-line arguments are not used yet; the
+ * program exits successfully without doing any work.
+ */
+int
+main(int argc, char **argv) {
+	(void)argv;
+	(void)argc;
+
+	return EXIT_SUCCESS;
+}
A => makefile +24 -0
@@ 1,24 @@
+.POSIX:
+
+include config.mk
+
+# Plain `=` is used for the macros; `:=` is an extension to POSIX make.
+SRC = main.c tokenizer.c unicode.c
+OBJ = ${SRC:.c=.o}
+
+all: zic
+
+zic: ${OBJ}
+	${CC} -o $@ ${OBJ} ${LDFLAGS}
+
+# Every object is rebuilt when one of the project headers changes.
+${OBJ}: tokenizer.h unicode.h
+
+# Depends on zic so that `make install` also works from a clean tree.
+# mkdir/cp/chmod are used instead of `install -D ... -t`, which is GNU-only.
+install: zic
+	mkdir -p "${DESTDIR}${PREFIX}/bin"
+	cp -f zic "${DESTDIR}${PREFIX}/bin/zic"
+	chmod 755 "${DESTDIR}${PREFIX}/bin/zic"
+
+uninstall:
+	rm -f "${DESTDIR}${PREFIX}/bin/zic"
+
+clean:
+	rm -f zic ${OBJ}
+
+.PHONY: all install uninstall clean
A => tokenizer.c +1054 -0
@@ 1,1054 @@
+#include "tokenizer.h"
+
+#include <ctype.h>
+#include <string.h>
+
+#include "unicode.h"
+
+/*
+ * States of the scanner in zic_tokenizer_next(). Every token starts in
+ * ZIC_TOKENIZER_STATE_START; each other state records which characters of
+ * the token being scanned have been consumed so far.
+ */
+enum zic_tokenizer_state {
+ ZIC_TOKENIZER_STATE_START,
+ /* Identifiers and @builtin names. */
+ ZIC_TOKENIZER_STATE_IDENTIFIER,
+ ZIC_TOKENIZER_STATE_BUILTIN,
+ /* Quoted strings, multiline string lines and character literals. */
+ ZIC_TOKENIZER_STATE_STRING_LITERAL,
+ ZIC_TOKENIZER_STATE_STRING_LITERAL_BACKSLASH,
+ ZIC_TOKENIZER_STATE_MULTILINE_STRING_LITERAL_LINE,
+ ZIC_TOKENIZER_STATE_CHAR_LITERAL,
+ ZIC_TOKENIZER_STATE_CHAR_LITERAL_BACKSLASH,
+ ZIC_TOKENIZER_STATE_CHAR_LITERAL_HEX_ESCAPE,
+ ZIC_TOKENIZER_STATE_CHAR_LITERAL_UNICODE_ESCAPE_SAW_U,
+ ZIC_TOKENIZER_STATE_CHAR_LITERAL_UNICODE_ESCAPE,
+ ZIC_TOKENIZER_STATE_CHAR_LITERAL_UNICODE_INVALID,
+ ZIC_TOKENIZER_STATE_CHAR_LITERAL_UNICODE,
+ ZIC_TOKENIZER_STATE_CHAR_LITERAL_END,
+ /* A single backslash; a second one starts a multiline string line. */
+ ZIC_TOKENIZER_STATE_BACKSLASH,
+ /* Operators whose meaning depends on the characters that follow. */
+ ZIC_TOKENIZER_STATE_EQUAL,
+ ZIC_TOKENIZER_STATE_BANG,
+ ZIC_TOKENIZER_STATE_PIPE,
+ ZIC_TOKENIZER_STATE_MINUS,
+ ZIC_TOKENIZER_STATE_MINUS_PERCENT,
+ ZIC_TOKENIZER_STATE_MINUS_PIPE,
+ ZIC_TOKENIZER_STATE_ASTERISK,
+ ZIC_TOKENIZER_STATE_ASTERISK_PERCENT,
+ ZIC_TOKENIZER_STATE_ASTERISK_PIPE,
+ /* A slash, which may turn out to start a comment. */
+ ZIC_TOKENIZER_STATE_SLASH,
+ ZIC_TOKENIZER_STATE_LINE_COMMENT_START,
+ ZIC_TOKENIZER_STATE_LINE_COMMENT,
+ ZIC_TOKENIZER_STATE_DOC_COMMENT_START,
+ ZIC_TOKENIZER_STATE_DOC_COMMENT,
+ /* Number literals. */
+ ZIC_TOKENIZER_STATE_INT,
+ ZIC_TOKENIZER_STATE_INT_EXPONENT,
+ ZIC_TOKENIZER_STATE_INT_PERIOD,
+ ZIC_TOKENIZER_STATE_FLOAT,
+ ZIC_TOKENIZER_STATE_FLOAT_EXPONENT,
+ /* More multi-character operators. */
+ ZIC_TOKENIZER_STATE_AMPERSAND,
+ ZIC_TOKENIZER_STATE_CARET,
+ ZIC_TOKENIZER_STATE_PERCENT,
+ ZIC_TOKENIZER_STATE_PLUS,
+ ZIC_TOKENIZER_STATE_PLUS_PERCENT,
+ ZIC_TOKENIZER_STATE_PLUS_PIPE,
+ ZIC_TOKENIZER_STATE_ANGLE_BRACKET_LEFT,
+ ZIC_TOKENIZER_STATE_ANGLE_BRACKET_ANGLE_BRACKET_LEFT,
+ ZIC_TOKENIZER_STATE_ANGLE_BRACKET_ANGLE_BRACKET_LEFT_PIPE,
+ ZIC_TOKENIZER_STATE_ANGLE_BRACKET_RIGHT,
+ ZIC_TOKENIZER_STATE_ANGLE_BRACKET_ANGLE_BRACKET_RIGHT,
+ ZIC_TOKENIZER_STATE_PERIOD,
+ ZIC_TOKENIZER_STATE_PERIOD_2,
+ ZIC_TOKENIZER_STATE_PERIOD_ASTERISK,
+ /* An '@', followed by either a quoted identifier or a builtin name. */
+ ZIC_TOKENIZER_STATE_SAW_AT_SIGN,
+};
+
+/*
+ * Looks up the keyword tag for an identifier.
+ *
+ * keyword points at the first byte of the identifier and size is its length
+ * in bytes; the text does not need to be NUL-terminated at that length.
+ *
+ * Returns a value whose flag is true and whose tag is the matching keyword
+ * tag when the identifier is spelled exactly like a keyword. Otherwise flag
+ * is false and tag is ZIC_TOKENIZER_TAG_INVALID.
+ */
+static struct zic_tokenizer_maybe_tag
+zic_tokenizer_tag_from_keyword(char *keyword, size_t size) {
+	static const struct {
+		const char *name;
+		enum zic_tokenizer_tag tag;
+	} keywords[] = {
+		{ "addrspace", ZIC_TOKENIZER_TAG_KEYWORD_ADDRSPACE },
+		{ "align", ZIC_TOKENIZER_TAG_KEYWORD_ALIGN },
+		{ "allowzero", ZIC_TOKENIZER_TAG_KEYWORD_ALLOWZERO },
+		{ "and", ZIC_TOKENIZER_TAG_KEYWORD_AND },
+		{ "anyframe", ZIC_TOKENIZER_TAG_KEYWORD_ANYFRAME },
+		{ "anytype", ZIC_TOKENIZER_TAG_KEYWORD_ANYTYPE },
+		{ "asm", ZIC_TOKENIZER_TAG_KEYWORD_ASM },
+		{ "async", ZIC_TOKENIZER_TAG_KEYWORD_ASYNC },
+		{ "await", ZIC_TOKENIZER_TAG_KEYWORD_AWAIT },
+		{ "break", ZIC_TOKENIZER_TAG_KEYWORD_BREAK },
+		{ "callconv", ZIC_TOKENIZER_TAG_KEYWORD_CALLCONV },
+		{ "catch", ZIC_TOKENIZER_TAG_KEYWORD_CATCH },
+		{ "comptime", ZIC_TOKENIZER_TAG_KEYWORD_COMPTIME },
+		{ "const", ZIC_TOKENIZER_TAG_KEYWORD_CONST },
+		{ "continue", ZIC_TOKENIZER_TAG_KEYWORD_CONTINUE },
+		{ "defer", ZIC_TOKENIZER_TAG_KEYWORD_DEFER },
+		{ "else", ZIC_TOKENIZER_TAG_KEYWORD_ELSE },
+		{ "enum", ZIC_TOKENIZER_TAG_KEYWORD_ENUM },
+		{ "errdefer", ZIC_TOKENIZER_TAG_KEYWORD_ERRDEFER },
+		{ "error", ZIC_TOKENIZER_TAG_KEYWORD_ERROR },
+		{ "export", ZIC_TOKENIZER_TAG_KEYWORD_EXPORT },
+		{ "extern", ZIC_TOKENIZER_TAG_KEYWORD_EXTERN },
+		{ "fn", ZIC_TOKENIZER_TAG_KEYWORD_FN },
+		{ "for", ZIC_TOKENIZER_TAG_KEYWORD_FOR },
+		{ "if", ZIC_TOKENIZER_TAG_KEYWORD_IF },
+		{ "inline", ZIC_TOKENIZER_TAG_KEYWORD_INLINE },
+		{ "noalias", ZIC_TOKENIZER_TAG_KEYWORD_NOALIAS },
+		{ "noinline", ZIC_TOKENIZER_TAG_KEYWORD_NOINLINE },
+		{ "nosuspend", ZIC_TOKENIZER_TAG_KEYWORD_NOSUSPEND },
+		{ "opaque", ZIC_TOKENIZER_TAG_KEYWORD_OPAQUE },
+		{ "or", ZIC_TOKENIZER_TAG_KEYWORD_OR },
+		{ "orelse", ZIC_TOKENIZER_TAG_KEYWORD_ORELSE },
+		{ "packed", ZIC_TOKENIZER_TAG_KEYWORD_PACKED },
+		{ "pub", ZIC_TOKENIZER_TAG_KEYWORD_PUB },
+		{ "resume", ZIC_TOKENIZER_TAG_KEYWORD_RESUME },
+		{ "return", ZIC_TOKENIZER_TAG_KEYWORD_RETURN },
+		{ "linksection", ZIC_TOKENIZER_TAG_KEYWORD_LINKSECTION },
+		{ "struct", ZIC_TOKENIZER_TAG_KEYWORD_STRUCT },
+		{ "suspend", ZIC_TOKENIZER_TAG_KEYWORD_SUSPEND },
+		{ "switch", ZIC_TOKENIZER_TAG_KEYWORD_SWITCH },
+		{ "test", ZIC_TOKENIZER_TAG_KEYWORD_TEST },
+		{ "threadlocal", ZIC_TOKENIZER_TAG_KEYWORD_THREADLOCAL },
+		{ "try", ZIC_TOKENIZER_TAG_KEYWORD_TRY },
+		{ "union", ZIC_TOKENIZER_TAG_KEYWORD_UNION },
+		{ "unreachable", ZIC_TOKENIZER_TAG_KEYWORD_UNREACHABLE },
+		{ "usingnamespace", ZIC_TOKENIZER_TAG_KEYWORD_USINGNAMESPACE },
+		{ "var", ZIC_TOKENIZER_TAG_KEYWORD_VAR },
+		{ "volatile", ZIC_TOKENIZER_TAG_KEYWORD_VOLATILE },
+		{ "while", ZIC_TOKENIZER_TAG_KEYWORD_WHILE },
+	};
+	struct zic_tokenizer_maybe_tag result;
+	size_t i;
+
+	result.flag = false;
+	result.tag = ZIC_TOKENIZER_TAG_INVALID;
+
+	for (i = 0; i < sizeof keywords / sizeof keywords[0]; i++) {
+		/*
+		 * The lengths must agree as well: comparing only the first `size`
+		 * bytes would also accept prefixes such as "a" for "addrspace".
+		 */
+		if (strlen(keywords[i].name) == size
+				&& memcmp(keyword, keywords[i].name, size) == 0) {
+			result.flag = true;
+			result.tag = keywords[i].tag;
+			break;
+		}
+	}
+
+	return result;
+}
+
+/*
+ * Prepares a tokenizer to scan a NUL-terminated buffer.
+ *
+ * The buffer length is measured with strlen(). Scanning starts at offset 0,
+ * or at offset 3 when the buffer begins with the UTF-8 byte order mark, and
+ * no invalid token is pending.
+ */
+void
+zic_tokenizer_init(struct zic_tokenizer *tokenizer, unsigned char *buffer) {
+	const char *text = (char *)buffer;
+
+	tokenizer->pending_invalid_token.flag = false;
+	tokenizer->buffer = buffer;
+	tokenizer->buffer_size = strlen(text);
+
+	if (strncmp(text, "\xEF\xBB\xBF", 3) == 0) {
+		tokenizer->index = 3;
+	} else {
+		tokenizer->index = 0;
+	}
+}
+
+/*
+ * Checks the character that starts at the tokenizer's current index.
+ *
+ * Returns 0 when the character is acceptable, otherwise the number of bytes
+ * that make up the offending character. A carriage return is accepted only
+ * directly before a line feed, other ASCII control characters are rejected,
+ * and multi-byte sequences are rejected when they fail to decode, run past
+ * the end of the buffer, or decode to U+0085, U+2028 or U+2029.
+ *
+ * Side effect: when a multi-byte sequence is accepted, the index is moved to
+ * its last byte.
+ */
+static size_t
+zic_tokenizer_get_invalid_character_length(struct zic_tokenizer *tokenizer) {
+	unsigned char lead, *sequence;
+	int sequence_length, value;
+	bool followed_by_lf;
+
+	lead = tokenizer->buffer[tokenizer->index];
+
+	if (lead == '\r') {
+		followed_by_lf = tokenizer->index + 1 < tokenizer->buffer_size
+			&& tokenizer->buffer[tokenizer->index + 1] == '\n';
+		return followed_by_lf ? 0 : 1;
+	}
+	if (lead < 128) {
+		return iscntrl(lead) ? 1 : 0;
+	}
+
+	sequence_length = zic_unicode_utf8_byte_sequence_length(lead);
+	if (sequence_length < 0) {
+		return 1;
+	}
+	if (tokenizer->index + sequence_length > tokenizer->buffer_size) {
+		/* Truncated sequence: everything up to the end of the buffer is bad. */
+		return tokenizer->buffer_size - tokenizer->index;
+	}
+
+	sequence = tokenizer->buffer + tokenizer->index;
+	if (sequence_length == 2) {
+		value = zic_unicode_utf8_decode_2(sequence);
+		if (value < 0 || value == 0x85) {
+			return sequence_length;
+		}
+	} else if (sequence_length == 3) {
+		value = zic_unicode_utf8_decode_3(sequence);
+		if (value < 0 || value == 0x2028 || value == 0x2029) {
+			return sequence_length;
+		}
+	} else if (sequence_length == 4) {
+		value = zic_unicode_utf8_decode_4(sequence);
+		if (value < 0) {
+			return sequence_length;
+		}
+	}
+
+	tokenizer->index += sequence_length - 1;
+	return 0;
+}
+
+/*
+ * Validates the character at the current index and, when it is not
+ * acceptable, records an invalid token covering it as the tokenizer's
+ * pending invalid token. Does nothing while an invalid token is already
+ * pending.
+ */
+static void
+zic_tokenizer_check_literal_character(struct zic_tokenizer *tokenizer) {
+	size_t bad_bytes;
+
+	if (!tokenizer->pending_invalid_token.flag) {
+		bad_bytes = zic_tokenizer_get_invalid_character_length(tokenizer);
+		if (bad_bytes != 0) {
+			tokenizer->pending_invalid_token.token.tag = ZIC_TOKENIZER_TAG_INVALID;
+			tokenizer->pending_invalid_token.token.start = tokenizer->index;
+			tokenizer->pending_invalid_token.token.end = tokenizer->index + bad_bytes;
+			tokenizer->pending_invalid_token.flag = true;
+		}
+	}
+}
+
+/* Reports whether c is an ASCII letter or an underscore. */
+static bool
+zic_tokenizer_is_alpha_or_underscore(unsigned char c) {
+	return ('a' <= c && c <= 'z') || ('A' <= c && c <= 'Z') || c == '_';
+}
+
+/* Reports whether c is an ASCII letter, a decimal digit or an underscore. */
+static bool
+zic_tokenizer_is_alnum_or_underscore(unsigned char c) {
+	return zic_tokenizer_is_alpha_or_underscore(c) || ('0' <= c && c <= '9');
+}
+
+/* Reports whether c is a hexadecimal digit of either case. */
+static bool
+zic_tokenizer_is_hex_digit(unsigned char c) {
+	return ('0' <= c && c <= '9') || ('a' <= c && c <= 'f') || ('A' <= c && c <= 'F');
+}
+
+/*
+ * Scans and returns the next token of the tokenizer's buffer.
+ *
+ * The returned token carries its tag and the byte offsets [start, end) it
+ * covers in the buffer. Whitespace and ordinary `//` comments are skipped.
+ * When the end of the buffer is reached the token is tagged
+ * ZIC_TOKENIZER_TAG_EOF with start == end == buffer_size.
+ *
+ * An unacceptable character found inside a string, multiline string line or
+ * comment is remembered as a pending invalid token; it is returned by the
+ * call that follows the token in which it was found.
+ */
+struct zic_tokenizer_token
+zic_tokenizer_next(struct zic_tokenizer *tokenizer) {
+	enum zic_tokenizer_state state;
+	struct zic_tokenizer_token result;
+	unsigned char c;
+	struct zic_tokenizer_maybe_tag keyword;
+	size_t seen_escape_digits, remaining_code_units;
+
+	if (tokenizer->pending_invalid_token.flag) {
+		tokenizer->pending_invalid_token.flag = false;
+		return tokenizer->pending_invalid_token.token;
+	}
+
+	state = ZIC_TOKENIZER_STATE_START;
+	result.tag = ZIC_TOKENIZER_TAG_EOF;
+	result.start = tokenizer->index;
+	seen_escape_digits = 0;
+	remaining_code_units = 0;
+
+	for (;; tokenizer->index++) {
+		c = tokenizer->buffer[tokenizer->index];
+		switch (state) {
+		case ZIC_TOKENIZER_STATE_START:
+			if (zic_tokenizer_is_alpha_or_underscore(c)) {
+				state = ZIC_TOKENIZER_STATE_IDENTIFIER;
+				result.tag = ZIC_TOKENIZER_TAG_IDENTIFIER;
+				break;
+			}
+			if ('0' <= c && c <= '9') {
+				state = ZIC_TOKENIZER_STATE_INT;
+				result.tag = ZIC_TOKENIZER_TAG_NUMBER_LITERAL;
+				break;
+			}
+			switch (c) {
+			case 0:
+				/* A NUL before the end of the buffer is an invalid byte. */
+				if (tokenizer->index != tokenizer->buffer_size) {
+					result.tag = ZIC_TOKENIZER_TAG_INVALID;
+					result.start = tokenizer->index++;
+					result.end = tokenizer->index;
+					return result;
+				}
+				goto finish;
+			case ' ':
+			case '\n':
+			case '\t':
+			case '\r':
+				/* Skip whitespace: the token starts after it. */
+				result.start = tokenizer->index + 1;
+				break;
+			case '"':
+				state = ZIC_TOKENIZER_STATE_STRING_LITERAL;
+				result.tag = ZIC_TOKENIZER_TAG_STRING_LITERAL;
+				break;
+			case '\'':
+				state = ZIC_TOKENIZER_STATE_CHAR_LITERAL;
+				break;
+			case '@':
+				state = ZIC_TOKENIZER_STATE_SAW_AT_SIGN;
+				break;
+			case '=':
+				state = ZIC_TOKENIZER_STATE_EQUAL;
+				break;
+			case '!':
+				state = ZIC_TOKENIZER_STATE_BANG;
+				break;
+			case '|':
+				state = ZIC_TOKENIZER_STATE_PIPE;
+				break;
+			case '(':
+				result.tag = ZIC_TOKENIZER_TAG_L_PAREN;
+				tokenizer->index++;
+				goto finish;
+			case ')':
+				result.tag = ZIC_TOKENIZER_TAG_R_PAREN;
+				tokenizer->index++;
+				goto finish;
+			case '[':
+				result.tag = ZIC_TOKENIZER_TAG_L_BRACKET;
+				tokenizer->index++;
+				goto finish;
+			case ']':
+				result.tag = ZIC_TOKENIZER_TAG_R_BRACKET;
+				tokenizer->index++;
+				goto finish;
+			case ';':
+				result.tag = ZIC_TOKENIZER_TAG_SEMICOLON;
+				tokenizer->index++;
+				goto finish;
+			case ',':
+				result.tag = ZIC_TOKENIZER_TAG_COMMA;
+				tokenizer->index++;
+				goto finish;
+			case '?':
+				result.tag = ZIC_TOKENIZER_TAG_QUESTION_MARK;
+				tokenizer->index++;
+				goto finish;
+			case ':':
+				result.tag = ZIC_TOKENIZER_TAG_COLON;
+				tokenizer->index++;
+				goto finish;
+			case '%':
+				state = ZIC_TOKENIZER_STATE_PERCENT;
+				break;
+			case '*':
+				state = ZIC_TOKENIZER_STATE_ASTERISK;
+				break;
+			case '+':
+				state = ZIC_TOKENIZER_STATE_PLUS;
+				break;
+			case '<':
+				state = ZIC_TOKENIZER_STATE_ANGLE_BRACKET_LEFT;
+				break;
+			case '>':
+				state = ZIC_TOKENIZER_STATE_ANGLE_BRACKET_RIGHT;
+				break;
+			case '^':
+				state = ZIC_TOKENIZER_STATE_CARET;
+				break;
+			case '\\':
+				state = ZIC_TOKENIZER_STATE_BACKSLASH;
+				result.tag = ZIC_TOKENIZER_TAG_MULTILINE_STRING_LITERAL_LINE;
+				break;
+			case '{':
+				result.tag = ZIC_TOKENIZER_TAG_L_BRACE;
+				tokenizer->index++;
+				goto finish;
+			case '}':
+				result.tag = ZIC_TOKENIZER_TAG_R_BRACE;
+				tokenizer->index++;
+				goto finish;
+			case '~':
+				result.tag = ZIC_TOKENIZER_TAG_TILDE;
+				tokenizer->index++;
+				goto finish;
+			case '.':
+				state = ZIC_TOKENIZER_STATE_PERIOD;
+				break;
+			case '-':
+				state = ZIC_TOKENIZER_STATE_MINUS;
+				break;
+			case '/':
+				state = ZIC_TOKENIZER_STATE_SLASH;
+				break;
+			case '&':
+				state = ZIC_TOKENIZER_STATE_AMPERSAND;
+				break;
+			default:
+				/* The end offset is taken before the byte is skipped. */
+				result.tag = ZIC_TOKENIZER_TAG_INVALID;
+				result.end = tokenizer->index++;
+				return result;
+			}
+			break;
+		case ZIC_TOKENIZER_STATE_SAW_AT_SIGN:
+			if (c == '"') {
+				/* @"..." is an identifier spelled as a string. */
+				result.tag = ZIC_TOKENIZER_TAG_IDENTIFIER;
+				state = ZIC_TOKENIZER_STATE_STRING_LITERAL;
+			} else if (zic_tokenizer_is_alpha_or_underscore(c)) {
+				result.tag = ZIC_TOKENIZER_TAG_BUILTIN;
+				state = ZIC_TOKENIZER_STATE_BUILTIN;
+			} else {
+				result.tag = ZIC_TOKENIZER_TAG_INVALID;
+				goto finish;
+			}
+			break;
+		case ZIC_TOKENIZER_STATE_AMPERSAND:
+			if (c == '=') {
+				result.tag = ZIC_TOKENIZER_TAG_AMPERSAND_EQUAL;
+				tokenizer->index++;
+			} else {
+				result.tag = ZIC_TOKENIZER_TAG_AMPERSAND;
+			}
+			goto finish;
+		case ZIC_TOKENIZER_STATE_ASTERISK:
+			switch (c) {
+			case '=':
+				result.tag = ZIC_TOKENIZER_TAG_ASTERISK_EQUAL;
+				tokenizer->index++;
+				goto finish;
+			case '*':
+				result.tag = ZIC_TOKENIZER_TAG_ASTERISK_ASTERISK;
+				tokenizer->index++;
+				goto finish;
+			case '%':
+				state = ZIC_TOKENIZER_STATE_ASTERISK_PERCENT;
+				break;
+			case '|':
+				state = ZIC_TOKENIZER_STATE_ASTERISK_PIPE;
+				break;
+			default:
+				result.tag = ZIC_TOKENIZER_TAG_ASTERISK;
+				goto finish;
+			}
+			break;
+		case ZIC_TOKENIZER_STATE_ASTERISK_PERCENT:
+			if (c == '=') {
+				result.tag = ZIC_TOKENIZER_TAG_ASTERISK_PERCENT_EQUAL;
+				tokenizer->index++;
+			} else {
+				result.tag = ZIC_TOKENIZER_TAG_ASTERISK_PERCENT;
+			}
+			goto finish;
+		case ZIC_TOKENIZER_STATE_ASTERISK_PIPE:
+			if (c == '=') {
+				result.tag = ZIC_TOKENIZER_TAG_ASTERISK_PIPE_EQUAL;
+				tokenizer->index++;
+			} else {
+				result.tag = ZIC_TOKENIZER_TAG_ASTERISK_PIPE;
+			}
+			goto finish;
+		case ZIC_TOKENIZER_STATE_PERCENT:
+			if (c == '=') {
+				result.tag = ZIC_TOKENIZER_TAG_PERCENT_EQUAL;
+				tokenizer->index++;
+			} else {
+				result.tag = ZIC_TOKENIZER_TAG_PERCENT;
+			}
+			goto finish;
+		case ZIC_TOKENIZER_STATE_PLUS:
+			switch (c) {
+			case '=':
+				result.tag = ZIC_TOKENIZER_TAG_PLUS_EQUAL;
+				tokenizer->index++;
+				goto finish;
+			case '+':
+				result.tag = ZIC_TOKENIZER_TAG_PLUS_PLUS;
+				tokenizer->index++;
+				goto finish;
+			case '%':
+				state = ZIC_TOKENIZER_STATE_PLUS_PERCENT;
+				break;
+			case '|':
+				state = ZIC_TOKENIZER_STATE_PLUS_PIPE;
+				break;
+			default:
+				result.tag = ZIC_TOKENIZER_TAG_PLUS;
+				goto finish;
+			}
+			break;
+		case ZIC_TOKENIZER_STATE_PLUS_PERCENT:
+			if (c == '=') {
+				result.tag = ZIC_TOKENIZER_TAG_PLUS_PERCENT_EQUAL;
+				tokenizer->index++;
+			} else {
+				result.tag = ZIC_TOKENIZER_TAG_PLUS_PERCENT;
+			}
+			goto finish;
+		case ZIC_TOKENIZER_STATE_PLUS_PIPE:
+			if (c == '=') {
+				result.tag = ZIC_TOKENIZER_TAG_PLUS_PIPE_EQUAL;
+				tokenizer->index++;
+			} else {
+				result.tag = ZIC_TOKENIZER_TAG_PLUS_PIPE;
+			}
+			goto finish;
+		case ZIC_TOKENIZER_STATE_CARET:
+			if (c == '=') {
+				result.tag = ZIC_TOKENIZER_TAG_CARET_EQUAL;
+				tokenizer->index++;
+			} else {
+				result.tag = ZIC_TOKENIZER_TAG_CARET;
+			}
+			goto finish;
+		case ZIC_TOKENIZER_STATE_IDENTIFIER:
+			if (zic_tokenizer_is_alnum_or_underscore(c)) {
+				break;
+			}
+			/* The identifier is complete; it may be a keyword. */
+			keyword = zic_tokenizer_tag_from_keyword(
+				(char *)(tokenizer->buffer + result.start),
+				tokenizer->index - result.start);
+			if (keyword.flag) {
+				result.tag = keyword.tag;
+			}
+			goto finish;
+		case ZIC_TOKENIZER_STATE_BUILTIN:
+			if (zic_tokenizer_is_alnum_or_underscore(c)) {
+				break;
+			}
+			goto finish;
+		case ZIC_TOKENIZER_STATE_BACKSLASH:
+			if (c == '\\') {
+				state = ZIC_TOKENIZER_STATE_MULTILINE_STRING_LITERAL_LINE;
+			} else {
+				result.tag = ZIC_TOKENIZER_TAG_INVALID;
+				goto finish;
+			}
+			break;
+		case ZIC_TOKENIZER_STATE_STRING_LITERAL:
+			switch (c) {
+			case '\\':
+				state = ZIC_TOKENIZER_STATE_STRING_LITERAL_BACKSLASH;
+				break;
+			case '"':
+				tokenizer->index++;
+				goto finish;
+			case 0:
+				if (tokenizer->index == tokenizer->buffer_size) {
+					/* Unterminated string at the end of the buffer. */
+					result.tag = ZIC_TOKENIZER_TAG_INVALID;
+					goto finish;
+				} else {
+					zic_tokenizer_check_literal_character(tokenizer);
+				}
+				break;
+			case '\n':
+				result.tag = ZIC_TOKENIZER_TAG_INVALID;
+				goto finish;
+			default:
+				zic_tokenizer_check_literal_character(tokenizer);
+				break;
+			}
+			break;
+		case ZIC_TOKENIZER_STATE_STRING_LITERAL_BACKSLASH:
+			if (c == 0 || c == '\n') {
+				result.tag = ZIC_TOKENIZER_TAG_INVALID;
+				goto finish;
+			}
+			/*
+			 * The escaped character is consumed and scanning returns to the
+			 * string body. The tag is left alone so that @"..." keeps its
+			 * identifier tag.
+			 */
+			state = ZIC_TOKENIZER_STATE_STRING_LITERAL;
+			break;
+		case ZIC_TOKENIZER_STATE_CHAR_LITERAL:
+			if (c == '\'' || (0x80 <= c && c <= 0xBF) || c >= 0xF8) {
+				result.tag = ZIC_TOKENIZER_TAG_INVALID;
+				goto finish;
+			} else if (0xC0 <= c && c <= 0xDF) {
+				remaining_code_units = 1;
+				state = ZIC_TOKENIZER_STATE_CHAR_LITERAL_UNICODE;
+				break;
+			} else if (0xE0 <= c && c <= 0xEF) {
+				remaining_code_units = 2;
+				state = ZIC_TOKENIZER_STATE_CHAR_LITERAL_UNICODE;
+				break;
+			} else if (0xF0 <= c && c <= 0xF7) {
+				remaining_code_units = 3;
+				state = ZIC_TOKENIZER_STATE_CHAR_LITERAL_UNICODE;
+				break;
+			}
+			switch (c) {
+			case 0:
+				result.tag = ZIC_TOKENIZER_TAG_INVALID;
+				goto finish;
+			case '\\':
+				state = ZIC_TOKENIZER_STATE_CHAR_LITERAL_BACKSLASH;
+				break;
+			case '\n':
+				result.tag = ZIC_TOKENIZER_TAG_INVALID;
+				goto finish;
+			default:
+				state = ZIC_TOKENIZER_STATE_CHAR_LITERAL_END;
+				break;
+			}
+			break;
+		case ZIC_TOKENIZER_STATE_CHAR_LITERAL_BACKSLASH:
+			switch (c) {
+			case 0: /* NUL terminator: stop here, never scan past the buffer */
+			case '\n':
+				result.tag = ZIC_TOKENIZER_TAG_INVALID;
+				goto finish;
+			case 'x':
+				state = ZIC_TOKENIZER_STATE_CHAR_LITERAL_HEX_ESCAPE;
+				seen_escape_digits = 0;
+				break;
+			case 'u':
+				state = ZIC_TOKENIZER_STATE_CHAR_LITERAL_UNICODE_ESCAPE_SAW_U;
+				break;
+			default:
+				state = ZIC_TOKENIZER_STATE_CHAR_LITERAL_END;
+				break;
+			}
+			break;
+		case ZIC_TOKENIZER_STATE_CHAR_LITERAL_HEX_ESCAPE:
+			if (zic_tokenizer_is_hex_digit(c)) {
+				seen_escape_digits++;
+				if (seen_escape_digits == 2) {
+					state = ZIC_TOKENIZER_STATE_CHAR_LITERAL_END;
+				}
+			} else {
+				result.tag = ZIC_TOKENIZER_TAG_INVALID;
+				goto finish;
+			}
+			break;
+		case ZIC_TOKENIZER_STATE_CHAR_LITERAL_UNICODE_ESCAPE_SAW_U:
+			switch (c) {
+			case 0: /* NUL terminator: stop here, never scan past the buffer */
+				result.tag = ZIC_TOKENIZER_TAG_INVALID;
+				goto finish;
+			case '{':
+				state = ZIC_TOKENIZER_STATE_CHAR_LITERAL_UNICODE_ESCAPE;
+				break;
+			default:
+				result.tag = ZIC_TOKENIZER_TAG_INVALID;
+				state = ZIC_TOKENIZER_STATE_CHAR_LITERAL_UNICODE_INVALID;
+				break;
+			}
+			break;
+		case ZIC_TOKENIZER_STATE_CHAR_LITERAL_UNICODE_ESCAPE:
+			if (zic_tokenizer_is_hex_digit(c)) {
+				break;
+			}
+			switch (c) {
+			case 0: /* NUL terminator: stop here, never scan past the buffer */
+				result.tag = ZIC_TOKENIZER_TAG_INVALID;
+				goto finish;
+			case '}':
+				state = ZIC_TOKENIZER_STATE_CHAR_LITERAL_END;
+				break;
+			default:
+				result.tag = ZIC_TOKENIZER_TAG_INVALID;
+				state = ZIC_TOKENIZER_STATE_CHAR_LITERAL_UNICODE_INVALID;
+				break;
+			}
+			break;
+		case ZIC_TOKENIZER_STATE_CHAR_LITERAL_UNICODE_INVALID:
+			/* Swallow the remaining hex digits of the malformed escape. */
+			if (zic_tokenizer_is_hex_digit(c)) {
+				break;
+			}
+			goto finish;
+		case ZIC_TOKENIZER_STATE_CHAR_LITERAL_END:
+			switch (c) {
+			case '\'':
+				result.tag = ZIC_TOKENIZER_TAG_CHAR_LITERAL;
+				tokenizer->index++;
+				goto finish;
+			default:
+				result.tag = ZIC_TOKENIZER_TAG_INVALID;
+				goto finish;
+			}
+			break;
+		case ZIC_TOKENIZER_STATE_CHAR_LITERAL_UNICODE:
+			/* Continuation bytes of a multi-byte UTF-8 character. */
+			if (0x80 <= c && c <= 0xBF) {
+				remaining_code_units--;
+				if (remaining_code_units == 0) {
+					state = ZIC_TOKENIZER_STATE_CHAR_LITERAL_END;
+				}
+			} else {
+				result.tag = ZIC_TOKENIZER_TAG_INVALID;
+				goto finish;
+			}
+			break;
+		case ZIC_TOKENIZER_STATE_MULTILINE_STRING_LITERAL_LINE:
+			switch (c) {
+			case 0:
+				goto finish;
+			case '\n':
+				tokenizer->index++;
+				goto finish;
+			case '\t':
+				break;
+			default:
+				zic_tokenizer_check_literal_character(tokenizer);
+				break;
+			}
+			break;
+		case ZIC_TOKENIZER_STATE_BANG:
+			switch (c) {
+			case '=':
+				result.tag = ZIC_TOKENIZER_TAG_BANG_EQUAL;
+				tokenizer->index++;
+				goto finish;
+			default:
+				result.tag = ZIC_TOKENIZER_TAG_BANG;
+				goto finish;
+			}
+			break;
+		case ZIC_TOKENIZER_STATE_PIPE:
+			switch (c) {
+			case '=':
+				result.tag = ZIC_TOKENIZER_TAG_PIPE_EQUAL;
+				tokenizer->index++;
+				goto finish;
+			case '|':
+				result.tag = ZIC_TOKENIZER_TAG_PIPE_PIPE;
+				tokenizer->index++;
+				goto finish;
+			default:
+				result.tag = ZIC_TOKENIZER_TAG_PIPE;
+				goto finish;
+			}
+			break;
+		case ZIC_TOKENIZER_STATE_EQUAL:
+			switch (c) {
+			case '=':
+				result.tag = ZIC_TOKENIZER_TAG_EQUAL_EQUAL;
+				tokenizer->index++;
+				goto finish;
+			case '>':
+				result.tag = ZIC_TOKENIZER_TAG_EQUAL_ANGLE_BRACKET_RIGHT;
+				tokenizer->index++;
+				goto finish;
+			default:
+				result.tag = ZIC_TOKENIZER_TAG_EQUAL;
+				goto finish;
+			}
+			break;
+		case ZIC_TOKENIZER_STATE_MINUS:
+			switch (c) {
+			case '=':
+				result.tag = ZIC_TOKENIZER_TAG_MINUS_EQUAL;
+				tokenizer->index++;
+				goto finish;
+			case '>':
+				result.tag = ZIC_TOKENIZER_TAG_ARROW;
+				tokenizer->index++;
+				goto finish;
+			case '%':
+				state = ZIC_TOKENIZER_STATE_MINUS_PERCENT;
+				break;
+			case '|':
+				state = ZIC_TOKENIZER_STATE_MINUS_PIPE;
+				break;
+			default:
+				result.tag = ZIC_TOKENIZER_TAG_MINUS;
+				goto finish;
+			}
+			break;
+		case ZIC_TOKENIZER_STATE_MINUS_PERCENT:
+			switch (c) {
+			case '=':
+				result.tag = ZIC_TOKENIZER_TAG_MINUS_PERCENT_EQUAL;
+				tokenizer->index++;
+				goto finish;
+			default:
+				result.tag = ZIC_TOKENIZER_TAG_MINUS_PERCENT;
+				goto finish;
+			}
+			break;
+		case ZIC_TOKENIZER_STATE_MINUS_PIPE:
+			switch (c) {
+			case '=':
+				result.tag = ZIC_TOKENIZER_TAG_MINUS_PIPE_EQUAL;
+				tokenizer->index++;
+				goto finish;
+			default:
+				result.tag = ZIC_TOKENIZER_TAG_MINUS_PIPE;
+				goto finish;
+			}
+			break;
+		case ZIC_TOKENIZER_STATE_ANGLE_BRACKET_LEFT:
+			switch (c) {
+			case '<':
+				state = ZIC_TOKENIZER_STATE_ANGLE_BRACKET_ANGLE_BRACKET_LEFT;
+				break;
+			case '=':
+				result.tag = ZIC_TOKENIZER_TAG_ANGLE_BRACKET_LEFT_EQUAL;
+				tokenizer->index++;
+				goto finish;
+			default:
+				result.tag = ZIC_TOKENIZER_TAG_ANGLE_BRACKET_LEFT;
+				goto finish;
+			}
+			break;
+		case ZIC_TOKENIZER_STATE_ANGLE_BRACKET_ANGLE_BRACKET_LEFT:
+			switch (c) {
+			case '=':
+				result.tag = ZIC_TOKENIZER_TAG_ANGLE_BRACKET_ANGLE_BRACKET_LEFT_EQUAL;
+				tokenizer->index++;
+				goto finish;
+			case '|':
+				state = ZIC_TOKENIZER_STATE_ANGLE_BRACKET_ANGLE_BRACKET_LEFT_PIPE;
+				break;
+			default:
+				result.tag = ZIC_TOKENIZER_TAG_ANGLE_BRACKET_ANGLE_BRACKET_LEFT;
+				goto finish;
+			}
+			break;
+		case ZIC_TOKENIZER_STATE_ANGLE_BRACKET_ANGLE_BRACKET_LEFT_PIPE:
+			switch (c) {
+			case '=':
+				result.tag = ZIC_TOKENIZER_TAG_ANGLE_BRACKET_ANGLE_BRACKET_LEFT_PIPE_EQUAL;
+				tokenizer->index++;
+				goto finish;
+			default:
+				result.tag = ZIC_TOKENIZER_TAG_ANGLE_BRACKET_ANGLE_BRACKET_LEFT_PIPE;
+				goto finish;
+			}
+			break;
+		case ZIC_TOKENIZER_STATE_ANGLE_BRACKET_RIGHT:
+			switch (c) {
+			case '>':
+				state = ZIC_TOKENIZER_STATE_ANGLE_BRACKET_ANGLE_BRACKET_RIGHT;
+				break;
+			case '=':
+				result.tag = ZIC_TOKENIZER_TAG_ANGLE_BRACKET_RIGHT_EQUAL;
+				tokenizer->index++;
+				goto finish;
+			default:
+				result.tag = ZIC_TOKENIZER_TAG_ANGLE_BRACKET_RIGHT;
+				goto finish;
+			}
+			break;
+		case ZIC_TOKENIZER_STATE_ANGLE_BRACKET_ANGLE_BRACKET_RIGHT:
+			switch (c) {
+			case '=':
+				result.tag = ZIC_TOKENIZER_TAG_ANGLE_BRACKET_ANGLE_BRACKET_RIGHT_EQUAL;
+				tokenizer->index++;
+				goto finish;
+			default:
+				result.tag = ZIC_TOKENIZER_TAG_ANGLE_BRACKET_ANGLE_BRACKET_RIGHT;
+				goto finish;
+			}
+			break;
+		case ZIC_TOKENIZER_STATE_PERIOD:
+			switch (c) {
+			case '.':
+				state = ZIC_TOKENIZER_STATE_PERIOD_2;
+				break;
+			case '*':
+				state = ZIC_TOKENIZER_STATE_PERIOD_ASTERISK;
+				break;
+			default:
+				result.tag = ZIC_TOKENIZER_TAG_PERIOD;
+				goto finish;
+			}
+			break;
+		case ZIC_TOKENIZER_STATE_PERIOD_2:
+			switch (c) {
+			case '.':
+				result.tag = ZIC_TOKENIZER_TAG_ELLIPSIS3;
+				tokenizer->index++;
+				goto finish;
+			default:
+				result.tag = ZIC_TOKENIZER_TAG_ELLIPSIS2;
+				goto finish;
+			}
+			break;
+		case ZIC_TOKENIZER_STATE_PERIOD_ASTERISK:
+			switch (c) {
+			case '*':
+				/* The second asterisk is not consumed. */
+				result.tag = ZIC_TOKENIZER_TAG_INVALID_PERIODASTERISKS;
+				goto finish;
+			default:
+				result.tag = ZIC_TOKENIZER_TAG_PERIOD_ASTERISK;
+				goto finish;
+			}
+			break;
+		case ZIC_TOKENIZER_STATE_SLASH:
+			switch (c) {
+			case '/':
+				state = ZIC_TOKENIZER_STATE_LINE_COMMENT_START;
+				break;
+			case '=':
+				result.tag = ZIC_TOKENIZER_TAG_SLASH_EQUAL;
+				tokenizer->index++;
+				goto finish;
+			default:
+				result.tag = ZIC_TOKENIZER_TAG_SLASH;
+				goto finish;
+			}
+			break;
+		case ZIC_TOKENIZER_STATE_LINE_COMMENT_START:
+			switch (c) {
+			case 0:
+				if (tokenizer->index != tokenizer->buffer_size) {
+					result.tag = ZIC_TOKENIZER_TAG_INVALID;
+					tokenizer->index++;
+				}
+				goto finish;
+			case '/':
+				state = ZIC_TOKENIZER_STATE_DOC_COMMENT_START;
+				break;
+			case '!':
+				result.tag = ZIC_TOKENIZER_TAG_CONTAINER_DOC_COMMENT;
+				state = ZIC_TOKENIZER_STATE_DOC_COMMENT;
+				break;
+			case '\n':
+				/* An empty comment is skipped like whitespace. */
+				state = ZIC_TOKENIZER_STATE_START;
+				result.start = tokenizer->index + 1;
+				break;
+			case '\t':
+				state = ZIC_TOKENIZER_STATE_LINE_COMMENT;
+				break;
+			default:
+				state = ZIC_TOKENIZER_STATE_LINE_COMMENT;
+				zic_tokenizer_check_literal_character(tokenizer);
+				break;
+			}
+			break;
+		case ZIC_TOKENIZER_STATE_DOC_COMMENT_START:
+			switch (c) {
+			case '/':
+				/* Four or more slashes make an ordinary comment. */
+				state = ZIC_TOKENIZER_STATE_LINE_COMMENT;
+				break;
+			case 0:
+			case '\n':
+				result.tag = ZIC_TOKENIZER_TAG_DOC_COMMENT;
+				goto finish;
+			case '\t':
+				state = ZIC_TOKENIZER_STATE_DOC_COMMENT;
+				result.tag = ZIC_TOKENIZER_TAG_DOC_COMMENT;
+				break;
+			default:
+				state = ZIC_TOKENIZER_STATE_DOC_COMMENT;
+				result.tag = ZIC_TOKENIZER_TAG_DOC_COMMENT;
+				zic_tokenizer_check_literal_character(tokenizer);
+				break;
+			}
+			break;
+		case ZIC_TOKENIZER_STATE_LINE_COMMENT:
+			switch (c) {
+			case 0:
+				if (tokenizer->index != tokenizer->buffer_size) {
+					result.tag = ZIC_TOKENIZER_TAG_INVALID;
+					tokenizer->index++;
+				}
+				goto finish;
+			case '\n':
+				/* The comment is skipped; the next token starts after it. */
+				state = ZIC_TOKENIZER_STATE_START;
+				result.start = tokenizer->index + 1;
+				break;
+			case '\t':
+				break;
+			default:
+				zic_tokenizer_check_literal_character(tokenizer);
+				break;
+			}
+			break;
+		case ZIC_TOKENIZER_STATE_DOC_COMMENT:
+			switch (c) {
+			case 0:
+			case '\n':
+				goto finish;
+			case '\t':
+				break;
+			default:
+				zic_tokenizer_check_literal_character(tokenizer);
+				break;
+			}
+			break;
+		case ZIC_TOKENIZER_STATE_INT:
+			if (c == 'e' || c == 'E' || c == 'p' || c == 'P') {
+				state = ZIC_TOKENIZER_STATE_INT_EXPONENT;
+				break;
+			}
+			if (zic_tokenizer_is_alnum_or_underscore(c)) {
+				break;
+			}
+			if (c == '.') {
+				state = ZIC_TOKENIZER_STATE_INT_PERIOD;
+				break;
+			}
+			goto finish;
+		case ZIC_TOKENIZER_STATE_INT_EXPONENT:
+			switch (c) {
+			case '-':
+			case '+':
+				state = ZIC_TOKENIZER_STATE_FLOAT;
+				break;
+			default:
+				/* No sign: examine this character again as part of the number. */
+				tokenizer->index--;
+				state = ZIC_TOKENIZER_STATE_INT;
+				break;
+			}
+			break;
+		case ZIC_TOKENIZER_STATE_INT_PERIOD:
+			if (c == 'e' || c == 'E' || c == 'p' || c == 'P') {
+				state = ZIC_TOKENIZER_STATE_FLOAT_EXPONENT;
+				break;
+			}
+			if (zic_tokenizer_is_alnum_or_underscore(c)) {
+				state = ZIC_TOKENIZER_STATE_FLOAT;
+				break;
+			}
+			/* The period is not part of the number; give it back. */
+			tokenizer->index--;
+			goto finish;
+		case ZIC_TOKENIZER_STATE_FLOAT:
+			if (c == 'e' || c == 'E' || c == 'p' || c == 'P') {
+				state = ZIC_TOKENIZER_STATE_FLOAT_EXPONENT;
+				break;
+			}
+			if (zic_tokenizer_is_alnum_or_underscore(c)) {
+				break;
+			}
+			goto finish;
+		case ZIC_TOKENIZER_STATE_FLOAT_EXPONENT:
+			switch (c) {
+			case '-':
+			case '+':
+				state = ZIC_TOKENIZER_STATE_FLOAT;
+				break;
+			default:
+				/* No sign: examine this character again as part of the number. */
+				tokenizer->index--;
+				state = ZIC_TOKENIZER_STATE_FLOAT;
+				break;
+			}
+			break;
+		}
+	}
+
+finish:
+	if (result.tag == ZIC_TOKENIZER_TAG_EOF) {
+		if (tokenizer->pending_invalid_token.flag) {
+			tokenizer->pending_invalid_token.flag = false;
+			return tokenizer->pending_invalid_token.token;
+		}
+		/* The EOF token is empty and sits at the end of the input. */
+		result.start = tokenizer->index;
+	}
+
+	result.end = tokenizer->index;
+	return result;
+}
\ No newline at end of file
A => tokenizer.h +156 -0
@@ 1,156 @@
+#ifndef ZIC_TOKENIZER_H
+#define ZIC_TOKENIZER_H
+
+#include <stdbool.h>
+#include <stdint.h>
+#include <stdlib.h>
+
+/*
+ * Kind of a token; stored in the `tag` field of struct zic_tokenizer_token.
+ *
+ * No enumerator has an explicit value, so the numeric values follow the
+ * declaration order below: inserting or reordering entries renumbers them.
+ */
+enum zic_tokenizer_tag {
+ ZIC_TOKENIZER_TAG_INVALID,
+ ZIC_TOKENIZER_TAG_INVALID_PERIODASTERISKS,
+ ZIC_TOKENIZER_TAG_IDENTIFIER,
+ ZIC_TOKENIZER_TAG_STRING_LITERAL,
+ ZIC_TOKENIZER_TAG_MULTILINE_STRING_LITERAL_LINE,
+ ZIC_TOKENIZER_TAG_CHAR_LITERAL,
+ /*
+  * When the tokenizer finishes with this tag and a pending invalid token
+  * is flagged, the pending token is returned instead.
+  */
+ ZIC_TOKENIZER_TAG_EOF,
+ ZIC_TOKENIZER_TAG_BUILTIN,
+ ZIC_TOKENIZER_TAG_BANG,
+ ZIC_TOKENIZER_TAG_PIPE,
+ ZIC_TOKENIZER_TAG_PIPE_PIPE,
+ ZIC_TOKENIZER_TAG_PIPE_EQUAL,
+ ZIC_TOKENIZER_TAG_EQUAL,
+ ZIC_TOKENIZER_TAG_EQUAL_EQUAL,
+ ZIC_TOKENIZER_TAG_EQUAL_ANGLE_BRACKET_RIGHT,
+ ZIC_TOKENIZER_TAG_BANG_EQUAL,
+ ZIC_TOKENIZER_TAG_L_PAREN,
+ ZIC_TOKENIZER_TAG_R_PAREN,
+ ZIC_TOKENIZER_TAG_SEMICOLON,
+ ZIC_TOKENIZER_TAG_PERCENT,
+ ZIC_TOKENIZER_TAG_PERCENT_EQUAL,
+ ZIC_TOKENIZER_TAG_L_BRACE,
+ ZIC_TOKENIZER_TAG_R_BRACE,
+ ZIC_TOKENIZER_TAG_L_BRACKET,
+ ZIC_TOKENIZER_TAG_R_BRACKET,
+ ZIC_TOKENIZER_TAG_PERIOD,
+ ZIC_TOKENIZER_TAG_PERIOD_ASTERISK,
+ ZIC_TOKENIZER_TAG_ELLIPSIS2,
+ ZIC_TOKENIZER_TAG_ELLIPSIS3,
+ ZIC_TOKENIZER_TAG_CARET,
+ ZIC_TOKENIZER_TAG_CARET_EQUAL,
+ ZIC_TOKENIZER_TAG_PLUS,
+ ZIC_TOKENIZER_TAG_PLUS_PLUS,
+ ZIC_TOKENIZER_TAG_PLUS_EQUAL,
+ ZIC_TOKENIZER_TAG_PLUS_PERCENT,
+ ZIC_TOKENIZER_TAG_PLUS_PERCENT_EQUAL,
+ ZIC_TOKENIZER_TAG_PLUS_PIPE,
+ ZIC_TOKENIZER_TAG_PLUS_PIPE_EQUAL,
+ ZIC_TOKENIZER_TAG_MINUS,
+ ZIC_TOKENIZER_TAG_MINUS_EQUAL,
+ ZIC_TOKENIZER_TAG_MINUS_PERCENT,
+ ZIC_TOKENIZER_TAG_MINUS_PERCENT_EQUAL,
+ ZIC_TOKENIZER_TAG_MINUS_PIPE,
+ ZIC_TOKENIZER_TAG_MINUS_PIPE_EQUAL,
+ ZIC_TOKENIZER_TAG_ASTERISK,
+ ZIC_TOKENIZER_TAG_ASTERISK_EQUAL,
+ ZIC_TOKENIZER_TAG_ASTERISK_ASTERISK,
+ ZIC_TOKENIZER_TAG_ASTERISK_PERCENT,
+ ZIC_TOKENIZER_TAG_ASTERISK_PERCENT_EQUAL,
+ ZIC_TOKENIZER_TAG_ASTERISK_PIPE,
+ ZIC_TOKENIZER_TAG_ASTERISK_PIPE_EQUAL,
+ ZIC_TOKENIZER_TAG_ARROW,
+ ZIC_TOKENIZER_TAG_COLON,
+ ZIC_TOKENIZER_TAG_SLASH,
+ ZIC_TOKENIZER_TAG_SLASH_EQUAL,
+ ZIC_TOKENIZER_TAG_COMMA,
+ ZIC_TOKENIZER_TAG_AMPERSAND,
+ ZIC_TOKENIZER_TAG_AMPERSAND_EQUAL,
+ ZIC_TOKENIZER_TAG_QUESTION_MARK,
+ ZIC_TOKENIZER_TAG_ANGLE_BRACKET_LEFT,
+ ZIC_TOKENIZER_TAG_ANGLE_BRACKET_LEFT_EQUAL,
+ ZIC_TOKENIZER_TAG_ANGLE_BRACKET_ANGLE_BRACKET_LEFT,
+ ZIC_TOKENIZER_TAG_ANGLE_BRACKET_ANGLE_BRACKET_LEFT_EQUAL,
+ ZIC_TOKENIZER_TAG_ANGLE_BRACKET_ANGLE_BRACKET_LEFT_PIPE,
+ ZIC_TOKENIZER_TAG_ANGLE_BRACKET_ANGLE_BRACKET_LEFT_PIPE_EQUAL,
+ ZIC_TOKENIZER_TAG_ANGLE_BRACKET_RIGHT,
+ ZIC_TOKENIZER_TAG_ANGLE_BRACKET_RIGHT_EQUAL,
+ ZIC_TOKENIZER_TAG_ANGLE_BRACKET_ANGLE_BRACKET_RIGHT,
+ ZIC_TOKENIZER_TAG_ANGLE_BRACKET_ANGLE_BRACKET_RIGHT_EQUAL,
+ ZIC_TOKENIZER_TAG_TILDE,
+ ZIC_TOKENIZER_TAG_NUMBER_LITERAL,
+ ZIC_TOKENIZER_TAG_DOC_COMMENT,
+ ZIC_TOKENIZER_TAG_CONTAINER_DOC_COMMENT,
+ /* NOTE(review): judging by the names, one tag per reserved word -- confirm. */
+ ZIC_TOKENIZER_TAG_KEYWORD_ADDRSPACE,
+ ZIC_TOKENIZER_TAG_KEYWORD_ALIGN,
+ ZIC_TOKENIZER_TAG_KEYWORD_ALLOWZERO,
+ ZIC_TOKENIZER_TAG_KEYWORD_AND,
+ ZIC_TOKENIZER_TAG_KEYWORD_ANYFRAME,
+ ZIC_TOKENIZER_TAG_KEYWORD_ANYTYPE,
+ ZIC_TOKENIZER_TAG_KEYWORD_ASM,
+ ZIC_TOKENIZER_TAG_KEYWORD_ASYNC,
+ ZIC_TOKENIZER_TAG_KEYWORD_AWAIT,
+ ZIC_TOKENIZER_TAG_KEYWORD_BREAK,
+ ZIC_TOKENIZER_TAG_KEYWORD_CALLCONV,
+ ZIC_TOKENIZER_TAG_KEYWORD_CATCH,
+ ZIC_TOKENIZER_TAG_KEYWORD_COMPTIME,
+ ZIC_TOKENIZER_TAG_KEYWORD_CONST,
+ ZIC_TOKENIZER_TAG_KEYWORD_CONTINUE,
+ ZIC_TOKENIZER_TAG_KEYWORD_DEFER,
+ ZIC_TOKENIZER_TAG_KEYWORD_ELSE,
+ ZIC_TOKENIZER_TAG_KEYWORD_ENUM,
+ ZIC_TOKENIZER_TAG_KEYWORD_ERRDEFER,
+ ZIC_TOKENIZER_TAG_KEYWORD_ERROR,
+ ZIC_TOKENIZER_TAG_KEYWORD_EXPORT,
+ ZIC_TOKENIZER_TAG_KEYWORD_EXTERN,
+ ZIC_TOKENIZER_TAG_KEYWORD_FN,
+ ZIC_TOKENIZER_TAG_KEYWORD_FOR,
+ ZIC_TOKENIZER_TAG_KEYWORD_IF,
+ ZIC_TOKENIZER_TAG_KEYWORD_INLINE,
+ ZIC_TOKENIZER_TAG_KEYWORD_NOALIAS,
+ ZIC_TOKENIZER_TAG_KEYWORD_NOINLINE,
+ ZIC_TOKENIZER_TAG_KEYWORD_NOSUSPEND,
+ ZIC_TOKENIZER_TAG_KEYWORD_OPAQUE,
+ ZIC_TOKENIZER_TAG_KEYWORD_OR,
+ ZIC_TOKENIZER_TAG_KEYWORD_ORELSE,
+ ZIC_TOKENIZER_TAG_KEYWORD_PACKED,
+ ZIC_TOKENIZER_TAG_KEYWORD_PUB,
+ ZIC_TOKENIZER_TAG_KEYWORD_RESUME,
+ ZIC_TOKENIZER_TAG_KEYWORD_RETURN,
+ ZIC_TOKENIZER_TAG_KEYWORD_LINKSECTION,
+ ZIC_TOKENIZER_TAG_KEYWORD_STRUCT,
+ ZIC_TOKENIZER_TAG_KEYWORD_SUSPEND,
+ ZIC_TOKENIZER_TAG_KEYWORD_SWITCH,
+ ZIC_TOKENIZER_TAG_KEYWORD_TEST,
+ ZIC_TOKENIZER_TAG_KEYWORD_THREADLOCAL,
+ ZIC_TOKENIZER_TAG_KEYWORD_TRY,
+ ZIC_TOKENIZER_TAG_KEYWORD_UNION,
+ ZIC_TOKENIZER_TAG_KEYWORD_UNREACHABLE,
+ ZIC_TOKENIZER_TAG_KEYWORD_USINGNAMESPACE,
+ ZIC_TOKENIZER_TAG_KEYWORD_VAR,
+ ZIC_TOKENIZER_TAG_KEYWORD_VOLATILE,
+ ZIC_TOKENIZER_TAG_KEYWORD_WHILE,
+};
+
+/*
+ * A token tag paired with a presence flag.
+ * NOTE(review): presumably `tag` is only meaningful while `flag` is true;
+ * no user of this type is visible here -- confirm against the callers.
+ */
+struct zic_tokenizer_maybe_tag{
+ bool flag;
+ enum zic_tokenizer_tag tag;
+};
+
+/* One token as returned by the tokenizer. */
+struct zic_tokenizer_token {
+ /* Kind of the token. */
+ enum zic_tokenizer_tag tag;
+ /* NOTE(review): presumably the buffer index where the token begins -- confirm. */
+ size_t start;
+ /* Copied from the tokenizer's `index` when the token is finished. */
+ size_t end;
+};
+
+/*
+ * A token paired with a presence flag.
+ * Used for the tokenizer's pending invalid token: when `flag` is set at the
+ * point an EOF token would be returned, the flag is cleared and `token` is
+ * returned instead.
+ */
+struct zic_tokenizer_maybe_token {
+ bool flag;
+ struct zic_tokenizer_token token;
+};
+
+/* Tokenizer state carried between calls. */
+struct zic_tokenizer {
+ /* NOTE(review): presumably the source text being tokenized -- confirm. */
+ unsigned char *buffer;
+ /* NOTE(review): presumably the size of `buffer` in bytes -- confirm. */
+ size_t buffer_size;
+ /*
+  * Current position. Some states decrement it before switching state
+  * (apparently to look at the same character again), and its value is
+  * copied into a token's `end` when the token is finished.
+  */
+ size_t index;
+ /* Invalid token held back; returned in place of EOF while its flag is set. */
+ struct zic_tokenizer_maybe_token pending_invalid_token;
+};
+
+#endif
A => unicode.c +95 -0
@@ 1,95 @@
+#include "unicode.h"
+
+#include <assert.h>
+
+/*
+ * Classify the lead byte of a UTF-8 sequence by its high-bit pattern.
+ *
+ * Returns the total number of bytes in the sequence (1 to 4), or -1 when
+ * `byte` cannot start a sequence (continuation bytes 0x80-0xBF and the
+ * values 0xF8-0xFF).
+ */
+int
+zic_unicode_utf8_byte_sequence_length(uint8_t byte) {
+	/* 0xxxxxxx: single-byte (ASCII) */
+	if ((byte & 0x80) == 0x00) {
+		return 1;
+	}
+	/* 110xxxxx: lead byte of a 2-byte sequence */
+	if ((byte & 0xE0) == 0xC0) {
+		return 2;
+	}
+	/* 1110xxxx: lead byte of a 3-byte sequence */
+	if ((byte & 0xF0) == 0xE0) {
+		return 3;
+	}
+	/* 11110xxx: lead byte of a 4-byte sequence */
+	if ((byte & 0xF8) == 0xF0) {
+		return 4;
+	}
+	return -1;
+}
+
+/*
+ * Decode the 2-byte UTF-8 sequence starting at bytes[0].
+ *
+ * bytes[0] must be a 2-byte lead byte (110xxxxx); this is enforced with
+ * assert(). Both bytes[0] and bytes[1] are read.
+ *
+ * Returns the decoded code point, or -1 when bytes[1] is not a continuation
+ * byte or the encoding is overlong (value below 0x80).
+ */
+int
+zic_unicode_utf8_decode_2(unsigned char *bytes) {
+	int code_point;
+
+	assert((bytes[0] & 0xE0) == 0xC0);
+
+	/* A continuation byte always looks like 10xxxxxx. */
+	if ((bytes[1] & 0xC0) != 0x80) {
+		return -1;
+	}
+
+	/* 5 payload bits from the lead byte, 6 from the continuation byte. */
+	code_point = ((bytes[0] & 0x1F) << 6) | (bytes[1] & 0x3F);
+
+	/* Anything below 0x80 fits in one byte, so this form is overlong. */
+	return code_point < 0x80 ? -1 : code_point;
+}
+
+/*
+ * Decode the 3-byte UTF-8 sequence starting at bytes[0].
+ *
+ * bytes must point to at least 3 readable bytes, and bytes[0] must be a
+ * 3-byte lead byte (1110xxxx); the latter is enforced with assert().
+ *
+ * Returns the decoded code point, or -1 when a continuation byte is
+ * malformed, the encoding is overlong (value below 0x800), or the value is
+ * a UTF-16 surrogate (0xD800-0xDFFF inclusive).
+ */
+int
+zic_unicode_utf8_decode_3(unsigned char *bytes) {
+	int value;
+
+	assert((bytes[0] & 0xF0) == 0xE0);
+	value = bytes[0] & 0xF;
+
+	if ((bytes[1] & 0xC0) != 0x80) {
+		return -1;
+	}
+	value <<= 6;
+	value |= bytes[1] & 0x3F;
+
+	if ((bytes[2] & 0xC0) != 0x80) {
+		return -1;
+	}
+	value <<= 6;
+	value |= bytes[2] & 0x3F;
+
+	/* Anything below 0x800 fits in two bytes, so this form is overlong. */
+	if (value < 0x800) {
+		return -1;
+	}
+	/* The surrogate block is inclusive at both ends: U+DFFF is invalid too. */
+	if (0xD800 <= value && value <= 0xDFFF) {
+		return -1;
+	}
+	return value;
+}
+
+/*
+ * Decode the 4-byte UTF-8 sequence starting at bytes[0].
+ *
+ * bytes[0] must be a 4-byte lead byte (11110xxx); this is enforced with
+ * assert(). Up to 4 bytes are read, stopping at the first malformed
+ * continuation byte.
+ *
+ * Returns the decoded code point, or -1 when a continuation byte is
+ * malformed or the value lies outside 0x10000-0x10FFFF (overlong encoding
+ * or beyond the last code point).
+ */
+int
+zic_unicode_utf8_decode_4(unsigned char *bytes) {
+	int code_point;
+	int i;
+
+	assert((bytes[0] & 0xF8) == 0xF0);
+	code_point = bytes[0] & 0x7;
+
+	/* Each continuation byte (10xxxxxx) contributes 6 payload bits. */
+	for (i = 1; i < 4; i++) {
+		if ((bytes[i] & 0xC0) != 0x80) {
+			return -1;
+		}
+		code_point = (code_point << 6) | (bytes[i] & 0x3F);
+	}
+
+	if (code_point < 0x10000 || code_point > 0x10FFFF) {
+		return -1;
+	}
+	return code_point;
+}
A => unicode.h +14 -0
@@ 1,14 @@
+#ifndef ZIC_UNICODE_H
+#define ZIC_UNICODE_H
+
+#include <stdint.h>
+
+/*
+ * Classify the lead byte of a UTF-8 sequence.
+ * Returns the total length of the sequence in bytes (1 to 4), or -1 when
+ * `byte` is not in any of the accepted lead-byte ranges (0x00-0x7F,
+ * 0xC0-0xDF, 0xE0-0xEF, 0xF0-0xF7).
+ */
+int zic_unicode_utf8_byte_sequence_length(uint8_t byte);
+
+/*
+ * Decode the 2-byte UTF-8 sequence at `bytes`.
+ * bytes[0] must be a 2-byte lead byte (checked with assert()).
+ * Returns the code point, or -1 when bytes[1] is not a continuation byte or
+ * the decoded value is below 0x80.
+ */
+int zic_unicode_utf8_decode_2(unsigned char *bytes);
+
+/*
+ * Decode the 3-byte UTF-8 sequence at `bytes`.
+ * bytes[0] must be a 3-byte lead byte (checked with assert()).
+ * Returns the code point, or -1 when a continuation byte is malformed, the
+ * decoded value is below 0x800, or it falls in the rejected surrogate range
+ * that starts at 0xD800.
+ */
+int zic_unicode_utf8_decode_3(unsigned char *bytes);
+
+/*
+ * Decode the 4-byte UTF-8 sequence at `bytes`.
+ * bytes[0] must be a 4-byte lead byte (checked with assert()).
+ * Returns the code point, or -1 when a continuation byte is malformed or the
+ * decoded value lies outside 0x10000-0x10FFFF.
+ */
+int zic_unicode_utf8_decode_4(unsigned char *bytes);
+
+#endif