
83fd4ea55158677a745713acc3f813579df32871 — Andrea Feletto 1 year, 7 months ago 239dc6f
add unicode support

Tokenize on Unicode codepoints instead of single-byte chars.
Do not restrict word content to the Latin alphabet.

closes: https://todo.sr.ht/~andreafeletto/zig-scfg/1
3 files changed, 78 insertions(+), 72 deletions(-)

M scfg.zig
M src/Parser.zig
M src/Tokenizer.zig
M scfg.zig => scfg.zig +1 -1
@@ 14,7 14,7 @@ const Directive = struct {

pub fn parse(allocator: Allocator, source: [:0]const u8) !Block {
    var tokenizer = Tokenizer.init(source);
    var tokenizer = try Tokenizer.init(source);
    var parser: Parser = .{ .allocator = allocator, .source = source };

    while (true) {

M src/Parser.zig => src/Parser.zig +2 -2
@@ 110,7 110,7 @@ test "parser: minimal" {
    var arena = std.heap.ArenaAllocator.init(std.heap.page_allocator);
    defer arena.deinit();

    var tokenizer = Tokenizer.init(source);
    var tokenizer = try Tokenizer.init(source);
    var parser: Parser = .{ .allocator = arena.allocator(), .source = source };

    while (true) {

@@ 140,7 140,7 @@ test "parser: directive with a block" {
    var arena = std.heap.ArenaAllocator.init(std.heap.page_allocator);
    defer arena.deinit();

    var tokenizer = Tokenizer.init(source);
    var tokenizer = try Tokenizer.init(source);
    var parser: Parser = .{ .allocator = arena.allocator(), .source = source };

    while (true) {

M src/Tokenizer.zig => src/Tokenizer.zig +75 -69
@@ 1,5 1,6 @@
const std = @import("std");
const testing = std.testing;
const unicode = std.unicode;

const Tokenizer = @This();

@@ 32,116 33,117 @@ const State = enum {

source: [:0]const u8,
utf8: unicode.Utf8Iterator,
index: usize,
codepoint: u21,

pub fn init(source: [:0]const u8) Tokenizer {
    return Tokenizer{ .source = source, .index = 0 };
pub fn init(source: []const u8) !Tokenizer {
    const utf8_view = try unicode.Utf8View.init(source);
    var utf8_iter = utf8_view.iterator();
    const first = utf8_iter.nextCodepoint() orelse 0;

    return Tokenizer{ .utf8 = utf8_iter, .index = 0, .codepoint = first };

pub fn next(self: *Tokenizer) Token {
    var state: State = .start;
    var token: Token = .{
        .tag = .eof,
        .loc = .{ .start = self.index, .end = undefined },

    while (true) : (self.index += 1) {
        const char = self.source[self.index];
    var tag: ?Token.Tag = null;
    var start = self.index;

    while (true) {
        switch (state) {
            .start => switch (char) {
            .start => switch (self.codepoint) {
                0 => {
                    tag = .eof;
                ' ', '\t', '\r' => {
                    token.loc.start = self.index + 1;
                    start = self.utf8.i;
                '\n' => {
                    state = .newline;
                    token.tag = .newline;
                    tag = .newline;
                '\'' => {
                    state = .squote_string;
                    token.tag = .squote_string;
                    tag = .squote_string;
                '"' => {
                    state = .dquote_string;
                    token.tag = .dquote_string;
                'a'...'z', 'A'...'Z', '0'...'9', '_' => {
                    state = .bare_string;
                    token.tag = .bare_string;
                    tag = .dquote_string;
                '{' => {
                    self.index += 1;
                    token.tag = .l_brace;
                    token.loc.end = self.index;
                    return token;
                    tag = .l_brace;
                '}' => {
                    self.index += 1;
                    token.tag = .r_brace;
                    token.loc.end = self.index;
                    return token;
                    tag = .r_brace;
                else => {
                    token.tag = .invalid;
                    token.loc.end = self.index;
                    self.index += 1;
                    return token;
                    state = .bare_string;
                    tag = .bare_string;
            .bare_string => switch (char) {
            .bare_string => switch (self.codepoint) {
                0, ' ', '\t', '\r', '\n', '{', '}' => {
                    return .{
                        .tag = tag.?,
                        .loc = .{ .start = start, .end = self.index },
                '"', '\'' => {
                    token.tag = .invalid;
                    token.loc.end = self.index;
                    self.index += 1;
                    return token;
                    tag = .invalid;
                else => {},
            .squote_string => switch (char) {
            .squote_string => switch (self.codepoint) {
                '\'' => {
                    self.index += 1;
                0, '\n' => {
                    token.tag = .invalid;
                    token.loc.end = self.index;
                    return token;
                    tag = .invalid;
                else => {},
            .dquote_string => switch (char) {
            .dquote_string => switch (self.codepoint) {
                '"' => {
                    self.index += 1;
                0, '\n' => {
                    token.tag = .invalid;
                    token.loc.end = self.index;
                    return token;
                    tag = .invalid;
                '\\' => {
                    self.index += 1;
                    self.index = self.utf8.i;
                    self.codepoint = self.utf8.nextCodepoint() orelse 0;
                else => {},
            .newline => switch (char) {
                '\n' => {},
            .newline => switch (self.codepoint) {
                '\n', ' ', '\t', '\r' => {},
                else => {
                    return .{
                        .tag = tag.?,
                        .loc = .{ .start = start, .end = self.index },

        self.index = self.utf8.i;
        self.codepoint = self.utf8.nextCodepoint() orelse 0;

    token.loc.end = self.index;
    return token;
    self.index = self.utf8.i;
    self.codepoint = self.utf8.nextCodepoint() orelse 0;

    return .{
        .tag = tag.?,
        .loc = .{ .start = start, .end = self.index },

test "tokenizer: minimal" {

@@ 152,14 154,14 @@ test "tokenizer: minimal" {
        .{ .tag = .eof, .loc = .{ .start = 8, .end = 8 } },

    var tokenizer = Tokenizer.init(source);
    var tokenizer = try Tokenizer.init(source);
    var tokens = std.ArrayList(Token).init(testing.allocator);
    defer tokens.deinit();

    while (true) {
        const token = tokenizer.next();
        try tokens.append(token);
        if (token.tag == .eof) break;
        if (token.tag == .eof or token.tag == .invalid) break;

    try testing.expectEqualSlices(Token, &expected_tokens, tokens.items);

@@ 168,35 170,39 @@ test "tokenizer: minimal" {
test "tokenizer: full" {
    const source =
        \\model "E5" {
        \\   max-speed 320km/h
        \\  max-speed 320km/h
        \\   weight '453.5t' "\""
        \\  weight '453.5t' "\""
        \\  emoji 🙋‍♂️
    const expected_tokens = [_]Token{
        .{ .tag = .bare_string, .loc = .{ .start = 0, .end = 5 } },
        .{ .tag = .dquote_string, .loc = .{ .start = 6, .end = 10 } },
        .{ .tag = .l_brace, .loc = .{ .start = 11, .end = 12 } },
        .{ .tag = .newline, .loc = .{ .start = 12, .end = 13 } },
        .{ .tag = .bare_string, .loc = .{ .start = 16, .end = 25 } },
        .{ .tag = .bare_string, .loc = .{ .start = 26, .end = 33 } },
        .{ .tag = .newline, .loc = .{ .start = 33, .end = 35 } },
        .{ .tag = .bare_string, .loc = .{ .start = 38, .end = 44 } },
        .{ .tag = .squote_string, .loc = .{ .start = 45, .end = 53 } },
        .{ .tag = .dquote_string, .loc = .{ .start = 54, .end = 58 } },
        .{ .tag = .newline, .loc = .{ .start = 58, .end = 59 } },
        .{ .tag = .r_brace, .loc = .{ .start = 59, .end = 60 } },
        .{ .tag = .eof, .loc = .{ .start = 60, .end = 60 } },
        .{ .tag = .newline, .loc = .{ .start = 12, .end = 15 } },
        .{ .tag = .bare_string, .loc = .{ .start = 15, .end = 24 } },
        .{ .tag = .bare_string, .loc = .{ .start = 25, .end = 32 } },
        .{ .tag = .newline, .loc = .{ .start = 32, .end = 36 } },
        .{ .tag = .bare_string, .loc = .{ .start = 36, .end = 42 } },
        .{ .tag = .squote_string, .loc = .{ .start = 43, .end = 51 } },
        .{ .tag = .dquote_string, .loc = .{ .start = 52, .end = 56 } },
        .{ .tag = .newline, .loc = .{ .start = 56, .end = 59 } },
        .{ .tag = .bare_string, .loc = .{ .start = 59, .end = 64 } },
        .{ .tag = .bare_string, .loc = .{ .start = 65, .end = 78 } },
        .{ .tag = .newline, .loc = .{ .start = 78, .end = 79 } },
        .{ .tag = .r_brace, .loc = .{ .start = 79, .end = 80 } },
        .{ .tag = .eof, .loc = .{ .start = 80, .end = 80 } },

    var tokenizer = Tokenizer.init(source);
    var tokenizer = try Tokenizer.init(source);
    var tokens = std.ArrayList(Token).init(testing.allocator);
    defer tokens.deinit();

    while (true) {
        const token = tokenizer.next();
        try tokens.append(token);
        if (token.tag == .eof) break;
        if (token.tag == .eof or token.tag == .invalid) break;

    try testing.expectEqualSlices(Token, &expected_tokens, tokens.items);