~cricket/zckt

f768d1aa791da8c5c10c1b6512574ad419ee1736 — c piapiac 5 months ago
initial commit
4 files changed, 592 insertions(+), 0 deletions(-)

A .gitignore
A LICENSE
A README
A zckt.zig
A  => .gitignore +1 -0
@@ 1,1 @@
zig-cache/

A  => LICENSE +9 -0
@@ 1,9 @@
copyright (c) 2021 cricket <_c@piapiac.org>

permission is hereby granted to modify, distribute, and/or use this software 
in any way you damn well please as long as this entire copyright notice, 
along with the disclaimer below, is kept intact

as far as the law allows, this software comes as is, without any warranty 
or condition, and no contributor will be liable to anyone for any damages 
related to this software or license, under any kind of legal claim

A  => README +44 -0
@@ 1,44 @@
       _   _
 _____| |_| |
|_ / _| / / _|
/__\__|_\_\__|
    zig ckt parser

ABOUT
    this parser will parse a ckt file into a stringhashmap
    ckt is a stupid simple file format for notating tables
    you can view more about it on https://sr.ht/~cricket/ckt

USAGE
    dont really use this yet; it's sort of just bodged together to work,
    and tests are lacking / incomplete.

    that being said,
    ---
    const zckt = @import("zckt").Parser;

    // ...
    zckt.parse(data, allocator);
    // returns a table of key/value pairs you can work with,
    // with values being either strings or tables.

DEVELOPMENT
    contributions are welcome to zckt and ckt itself!
    
    don't worry too much about coding style! worry more about the code itself. 
    that said, please run `zig fmt` on any changed files before contributing
    and try to follow the zig style guide.

    then, send an email with your patch to ~cricket/zckt-devel@lists.sr.ht
    (unsure how? check out git-send-email.io)

LICENSE
    zckt is free software - it is licensed under the CP/PL license

    this means that you're free to use the software in any way you damn well please,
    as long as the copyright notice and warranty disclaimer are kept intact

    read the LICENSE file for more information

AUTHORS
    do `git shortlog -sne` to show a list of all contributors

A  => zckt.zig +538 -0
@@ 1,538 @@
// small, bodged together ckt parser
//
//
const std = @import("std");
const debug = std.debug;
const fmt = std.fmt;
const mem = std.mem;

const Key = []const u8;

const Value = union(enum) {
    string: []const u8,
    table: Table,

    /// Frees whatever this value owns: the string bytes, or the nested
    /// table (recursively). `a` must be the allocator that produced
    /// them (see Parser.parseString / Parser.parseTable).
    pub fn deinit(self: *Value, a: *mem.Allocator) void {
        switch (self.*) {
            .string => |s| a.free(s),
            .table => |*t| t.deinit(),
        }
    }

    /// Pretty-printer hook for std.fmt. Strings print verbatim; tables
    /// are abbreviated as "[ ... ]" to avoid recursing here.
    pub fn format(self: Value, comptime _: []const u8, options: fmt.FormatOptions, writer: anytype) !void {
        switch (self) {
            .string => |str| try writer.print("{s}", .{str}),
            // no capture: the payload is unused, and an unused capture is
            // a compile error on newer zig compilers
            .table => try writer.print("[ ... ]", .{}), //try writer.print("{s}", .{table}),
        }
    }
};

/// An ordered map from string keys to ckt Values. The table owns both
/// its keys and its values: they are freed with allocator `a` on
/// deinit (or when displaced by put).
const Table = struct {
    const Map = std.StringArrayHashMap(Value);

    map: Map,
    // allocator that owns every key/value stored in `map`
    a: *mem.Allocator,

    pub fn init(a: *mem.Allocator) Table {
        return .{
            .map = Map.init(a),
            .a = a,
        };
    }

    // pretty-printer for std.fmt; nested tables render via Value.format
    // NOTE(review): `entry.key` / `entry.value` is the pre-0.9 iterator
    // entry layout (later zig versions use key_ptr/value_ptr)
    pub fn format(self: Table, comptime _: []const u8, options: fmt.FormatOptions, writer: anytype) !void {
        try writer.print("[ ", .{});
        var iter = self.map.iterator();
        while (iter.next()) |entry| try writer.print("{s}={s};", .{ entry.key, entry.value });
        try writer.print(" ]", .{});
    }

    /// Returns a shallow copy of the value stored under `key`, or null.
    /// The table still owns any memory the value points at.
    pub fn get(self: *Table, key: []const u8) ?Value {
        return self.map.get(key);
    }

    /// Inserts `key` -> `value`, taking ownership of both. If the key
    /// already existed, the newly passed `key` bytes are freed (the map
    /// keeps the key it already stored) and the displaced value is
    /// deinited.
    pub fn put(self: *Table, key: []const u8, value: Value) !void {
        var result = try self.map.fetchPut(key, value);

        // free memory if needed
        if (result) |*entry| {
            self.a.free(key);
            entry.value.deinit(self.a);
        }
    }

    /// Frees every key and value, then the map itself.
    pub fn deinit(self: *Table) void {
        var iter = self.map.iterator();

        while (iter.next()) |entry| {
            self.a.free(entry.key);
            entry.value.deinit(self.a);
        }

        self.map.deinit();
    }
};

pub const Token = union(enum) {
    /// A chunk of raw source text, tagged with how it was written so
    /// the parser knows what post-processing (escapes, pipes) it needs.
    pub const String = union(enum) {
        unquoted: []const u8,
        quoted: []const u8, // still includes the surrounding quotes
        multiline: []const u8, // still includes the leading '|' pipes

        pub fn format(self: String, comptime _: []const u8, options: fmt.FormatOptions, writer: anytype) !void {
            switch (self) {
                .unquoted => |str| try writer.print("unquoted: {s}", .{str}),
                .quoted => |str| try writer.print("quoted: {s}", .{str}),
                .multiline => |str| try writer.print("multiline: {s}", .{str}),
            }
        }
    };

    keyeq: String, // key =
    value: String, // value
    table_start, // [
    table_end, // ]

    // just for pretty printing when debugging lol
    pub fn format(self: Token, comptime _: []const u8, options: fmt.FormatOptions, writer: anytype) !void {
        switch (self) {
            .keyeq => |str| try writer.print("key {{{s}}} =", .{str}),
            .value => |str| try writer.print("value {{{s}}};", .{str}),
            // these members carry void payloads; capturing them was
            // meaningless and trips "unused capture" on newer compilers
            .table_start => try writer.print("[", .{}),
            .table_end => try writer.print("]", .{}),
        }
    }
};

pub const Tokenizer = struct {
    const Error = error{
        EmptyKey,
        InvalidChar,
        UnexpectedNewline,
        UnexpectedEof,
    };

    // the full input; every token is a slice into it (no allocation)
    content: []const u8,

    index: usize,
    last: Token, // previously emitted token; eof() uses it to catch unclosed tables
    finished: bool,

    pub fn init(data: []const u8) Tokenizer {
        return Tokenizer{
            .content = data,

            .index = 0,
            // harmless sentinel instead of `undefined`: eof() switches on
            // `last`, so an empty/whitespace-only input used to branch on
            // uninitialized memory. table_end makes eof() simply finish.
            .last = Token.table_end,
            .finished = false,
        };
    }

    /// Handles end of input: EOF right after a `[` (unclosed table) is
    /// an error; anywhere else it just marks the tokenizer finished.
    fn eof(self: *Tokenizer) Error!void {
        switch (self.last) {
            .table_start => return Error.UnexpectedEof,
            else => self.finished = true,
        }
    }

    // leaves index on the newline (or flags eof)
    fn skipToNewline(self: *Tokenizer) Error!void {
        while (self.content[self.index] != '\n') {
            self.index += 1;
            if (self.index >= self.content.len) {
                return self.eof();
            }
        }
    }

    // leaves index on the first char after the newline
    fn skipPastNewline(self: *Tokenizer) Error!void {
        try self.skipToNewline();
        self.index += 1;
    }

    /// Skips spaces/tabs/etc and `#` comments, but stops at newlines.
    fn skipWhitespace(self: *Tokenizer) Error!void {
        while (self.index < self.content.len) : (self.index += 1) {
            switch (self.content[self.index]) {
                // actual whitespace
                ' ', '\t', 0x0B, 0x0C, '\r' => continue,
                '#' => return try self.skipToNewline(),
                else => return,
            }
        }
    }

    /// Like skipWhitespace, but newlines are skipped too.
    fn skipWhitespaceAndNewlines(self: *Tokenizer) Error!void {
        while (self.index < self.content.len) : (self.index += 1) {
            switch (self.content[self.index]) {
                // actual whitespace
                ' ', '\t', 0x0B, 0x0C, '\r' => continue,
                '#' => try self.skipPastNewline(),
                '\n' => continue,
                else => return,
            }
        }
        try self.eof();
    }

    /// Like skipWhitespaceAndNewlines, but also skips the `;` and `,`
    /// value separators.
    fn skipWhitespaceAndNewlinesAndBreaks(self: *Tokenizer) Error!void {
        while (self.index < self.content.len) : (self.index += 1) {
            switch (self.content[self.index]) {
                // actual whitespace
                ' ', '\t', 0x0B, 0x0C, '\r' => continue,
                ';', ',' => continue,
                '#' => try self.skipPastNewline(),
                '\n' => continue,
                else => return,
            }
        }
        try self.eof();
    }

    /// Reads an unquoted string: everything up to a structural char
    /// ('\n' '=' ';' ',' ']') or EOF, with trailing whitespace trimmed
    /// (so `key = value` tokenizes the key as just "key").
    /// Returns error.EmptyKey when only whitespace was read.
    fn readString(self: *Tokenizer) ![]const u8 {
        const start_index = self.index;
        while (self.index < self.content.len) : (self.index += 1) {
            const char = self.content[self.index];
            switch (char) {
                '\n', '=', ';', ',', ']' => {
                    // trim trailing whitespace by walking back from the
                    // delimiter. end_index is *exclusive* here: the old
                    // inclusive version (`self.index - 1` with a `>` guard)
                    // misreported single-character strings (`a = b`) as
                    // error.EmptyKey.
                    var end_index = self.index;
                    while (end_index > start_index) {
                        switch (self.content[end_index - 1]) {
                            ' ', '\t', 0x0B, 0x0C, '\r' => end_index -= 1,
                            else => return self.content[start_index..end_index],
                        }
                    }
                    return error.EmptyKey;
                },
                else => continue,
            }
        }

        // EOF
        try self.eof();
        return self.content[start_index..self.index];
    }

    /// Reads a quoted string, quotes *included*; escapes are resolved
    /// later by Parser.parseQuotedString. `\X` skips over X so an
    /// escaped quote doesn't terminate the string early.
    fn readQuotedString(self: *Tokenizer, comptime quote: u8) Error![]const u8 {
        const start_index = self.index;

        self.index += 1; // opening quote

        while (self.index < self.content.len) : (self.index += 1) {
            const char = self.content[self.index];
            switch (char) {
                quote => {
                    self.index += 1; // include the closing quote
                    return self.content[start_index..self.index];
                },
                '\n' => return error.UnexpectedNewline,
                '\\' => self.index += 1,
                else => continue,
            }
        }

        // EOF before closing quote
        return error.UnexpectedEof;
    }

    /// Reads a `|`-prefixed multiline string: consecutive lines whose
    /// first non-whitespace char is `|`. Returns the raw text, pipes
    /// included; Parser.parseMultilineString strips them later.
    fn readMultilineString(self: *Tokenizer) Error![]const u8 {
        const start_index = self.index;

        const State = enum { root, line };
        var state = State.root;

        while (self.index < self.content.len) : (self.index += 1) {
            switch (state) {
                .root => {
                    const end_index = self.index;
                    try self.skipWhitespace();
                    // skipWhitespace may have run off the end of the input;
                    // without this guard the indexing below read out of bounds
                    if (self.index >= self.content.len) return self.content[start_index..end_index];
                    switch (self.content[self.index]) {
                        '|' => state = State.line,
                        else => return self.content[start_index..end_index],
                    }
                },
                .line => {
                    switch (self.content[self.index]) {
                        '\n' => state = State.root,
                        else => continue,
                    }
                },
            }
        }

        // empty line or EOF
        return self.content[start_index..self.index];
    }

    /// Returns the next token, or null once the input is exhausted.
    pub fn next(self: *Tokenizer) Error!?Token {
        if (!self.finished) {
            self.last = ret: {
                // treat ; and , as whitespace
                try self.skipWhitespaceAndNewlinesAndBreaks();
                if (self.finished) return null;
                const char = self.content[self.index];
                switch (char) {
                    '[' => break :ret Token.table_start,
                    ']' => break :ret Token.table_end,
                    '=' => return error.InvalidChar, // `=` with no key before it
                    else => {
                        var value: Token.String = switch (char) {
                            '"' => Token.String{ .quoted = try self.readQuotedString('"') },
                            '\'' => Token.String{ .quoted = try self.readQuotedString('\'') },
                            '|' => Token.String{ .multiline = try self.readMultilineString() },
                            else => Token.String{ .unquoted = try self.readString() },
                        };
                        // look ahead to decide whether this string was a key
                        // (followed by `=`) or a plain value
                        try self.skipWhitespaceAndNewlines();
                        if (!self.finished) {
                            switch (self.content[self.index]) {
                                ';', ',' => break :ret Token{ .value = value },
                                '=' => break :ret Token{ .keyeq = value },
                                else => {
                                    // not ours to consume (e.g. `]`): back up so
                                    // the `self.index += 1` below re-centers on it
                                    self.index -= 1;
                                    break :ret Token{ .value = value };
                                },
                            }
                        } else break :ret Token{ .value = value };
                    },
                }
            };
            self.index += 1;
            if (self.index >= self.content.len) try self.eof();
            return self.last;
        } else {
            return null;
        }
    }
};

// just a namespace to put parsing-related functions in,
// rather than having them scattered about in the top-level.
pub const Parser = struct {
    const MemError = error{
        OutOfMemory,
    };

    const StringError = error{
        UnexpectedNewline,
        InvalidEscapeChar,
        InvalidHexEscape,
        InvalidUnicodeEscape,
    };

    // advances i.* past spaces/tabs/etc (not newlines)
    fn skipWhitespace(str: []const u8, i: *usize) void {
        while (i.* < str.len) : (i.* += 1) {
            switch (str[i.*]) {
                ' ', '\t', 0x0B, 0x0C, '\r' => continue,
                else => return,
            }
        }
    }

    // utf-8 encodes `code` and appends it to buf; errors on surrogate
    // halves (utf8Encode rejects them)
    fn appendCodepoint(buf: *std.ArrayList(u8), code: u21) (MemError || StringError)!void {
        var utf8: [4]u8 = undefined;
        const len = std.unicode.utf8Encode(code, utf8[0..]) catch return error.InvalidUnicodeEscape;
        try buf.appendSlice(utf8[0..len]);
    }

    /// Turns the raw `| line` text of a multiline token into its actual
    /// contents (pipes stripped, lines joined by '\n').
    /// Caller owns the returned slice.
    fn parseMultilineString(str: []const u8, a: *mem.Allocator) MemError![]const u8 {
        const State = enum { root, line };

        // the leading '|' is consumed up front, so we start inside a line
        var state = State.line;
        var buf = std.ArrayList(u8).init(a);
        defer buf.deinit();

        var i: usize = 0;
        debug.assert(str[i] == '|');
        i += 1; // skip initial pipe

        while (i < str.len) : (i += 1) {
            switch (state) {
                .root => {
                    skipWhitespace(str, &i);
                    switch (str[i]) {
                        '|' => state = State.line,
                        else => return buf.toOwnedSlice(),
                    }
                },
                .line => {
                    const char = str[i];
                    switch (char) {
                        '\\' => {
                            // a lone backslash at end-of-line suppresses the
                            // newline (line continuation); a doubled `\\` is
                            // left for the following iterations to handle
                            const next_is_newline_or_eof = if (i + 1 < str.len) str[i + 1] == '\n' else true;
                            if (next_is_newline_or_eof) {
                                if (str[i - 1] == '\\') continue;
                                i += 1; // skip over newline
                                state = State.root;
                            } else {
                                try buf.append('\\');
                            }
                        },
                        '\n' => {
                            try buf.append('\n');
                            state = State.root;
                        },
                        else => try buf.append(char),
                    }
                },
            }
        }

        // this shouldn't happen; the tokenizer should've caught and errored on this
        return buf.toOwnedSlice();
    }

    // ~~stolen~~ inspired by std/zig/string_literal.zig
    /// Resolves escape sequences in a still-quoted string token
    /// (`str` includes its quotes). Caller owns the returned slice.
    fn parseQuotedString(str: []const u8, a: *mem.Allocator, comptime quote: u8) (MemError || StringError)![]const u8 {
        const State = enum { root, backslash };

        var state = State.root;
        var buf = std.ArrayList(u8).init(a);
        defer buf.deinit();

        var i: usize = 1; // skip the opening quote

        while (i < str.len) : (i += 1) {
            const char = str[i];
            switch (state) {
                .root => switch (char) {
                    '\n' => return error.UnexpectedNewline,
                    '\\' => state = State.backslash,
                    quote => {
                        i += 1;
                        return buf.toOwnedSlice();
                    },
                    else => {
                        try buf.append(char);
                    },
                },
                .backslash => switch (char) {
                    'f' => {
                        try buf.append(0x0C);
                        state = State.root;
                    },
                    'n' => {
                        try buf.append('\n');
                        state = State.root;
                    },
                    'r' => {
                        try buf.append('\r');
                        state = State.root;
                    },
                    't' => {
                        try buf.append('\t');
                        state = State.root;
                    },
                    'v' => {
                        try buf.append(0x0B);
                        state = State.root;
                    },
                    '\\', '"', '\'' => {
                        try buf.append(char);
                        state = State.root;
                    },
                    'x' => {
                        // \xNN - exactly two hex digits, appended as one byte
                        if (str.len < i + 3) return error.InvalidHexEscape;
                        const byte = fmt.parseUnsigned(u8, str[i + 1 .. i + 3], 16) catch return error.InvalidHexEscape;
                        try buf.append(byte);
                        state = State.root;
                        i += 2;
                    },
                    'u' => {
                        // \uNNNN - utf-8 encoded. (previously this appended
                        // mem.toBytes(u16), i.e. the raw native-endian integer
                        // bytes, which is not a text encoding)
                        if (str.len < i + 5) return error.InvalidUnicodeEscape;
                        const code = fmt.parseUnsigned(u16, str[i + 1 .. i + 5], 16) catch return error.InvalidUnicodeEscape;
                        try appendCodepoint(&buf, code);
                        state = State.root;
                        i += 4;
                    },
                    'U' => {
                        // \UNNNNNNNN - utf-8 encoded, same fix as \u
                        if (str.len < i + 9) return error.InvalidUnicodeEscape;
                        const code = fmt.parseUnsigned(u32, str[i + 1 .. i + 9], 16) catch return error.InvalidUnicodeEscape;
                        if (code > 0x10FFFF) return error.InvalidUnicodeEscape;
                        try appendCodepoint(&buf, @intCast(u21, code));
                        state = State.root;
                        i += 8;
                    },
                    else => return error.InvalidEscapeChar,
                },
            }
        }

        // unclosed string
        // the tokenizer should've caught this
        return buf.toOwnedSlice();
    }

    /// Converts a string token into owned bytes, applying whatever
    /// processing its flavor needs. Caller owns the returned slice.
    fn parseString(tok: Token.String, a: *mem.Allocator) (MemError || StringError)![]const u8 {
        switch (tok) {
            .unquoted => |str| return a.dupe(u8, str),
            // quoted tokens still carry their quotes; the first byte
            // tells us which kind we have
            .quoted => |str| return switch (str[0]) {
                '"' => parseQuotedString(str, a, '"'),
                '\'' => parseQuotedString(str, a, '\''),
                else => @panic("unknown quote?"),
            },
            .multiline => |str| return parseMultilineString(str, a),
        }
    }

    /// Parses tokens into a Table until a `]` or EOF. Values with no
    /// `key =` before them get sequential numeric keys ("0", "1", ...).
    /// The returned table owns all of its keys and values.
    pub fn parseTable(tokens: *Tokenizer, a: *mem.Allocator) (MemError || StringError || Tokenizer.Error)!Table {
        var table = Table.init(a);
        // don't leak what we've built so far if a later token errors
        errdefer table.deinit();

        var key: ?Key = null;
        // a pending key is ours until it's handed to table.put
        errdefer if (key) |k| a.free(k);

        var value: ?Value = null;
        var i: usize = 0; // next auto-generated numeric key

        while (try tokens.next()) |token| {
            switch (token) {
                .table_end => return table,
                .keyeq => |str| {
                    // input like `a = b = c` yields two keyeq tokens in a
                    // row; free the overwritten key instead of leaking it
                    if (key) |old| a.free(old);
                    key = try parseString(str, a);
                },
                .table_start => value = .{ .table = try parseTable(tokens, a) },
                .value => |str| value = .{ .string = try parseString(str, a) },
            }

            if (value) |val| {
                const keystr = key orelse key: {
                    defer i += 1;
                    break :key try fmt.allocPrint(a, "{d}", .{i});
                };
                try table.put(keystr, val);
                value = null;
                key = null;
            }
        }

        // if no table end, return the table anyway;
        // any unclosed tables should've been caught by the tokenizer
        return table;
    }

    // aka 'put that shit in a hashmap please'
    /// Parses ckt `data` into a Table allocated with `a`.
    /// Caller owns the result and must call `deinit` on it.
    pub fn parse(data: []const u8, a: *mem.Allocator) !Table {
        var tokenizer = Tokenizer.init(data);
        return parseTable(&tokenizer, a);
    }
};

const testing = std.testing;

// TODO: more tests
test "small test" {
    const alloc = testing.allocator;
    const zckt =
        \\key = value
        \\key = another god damned value
        \\key = [ this is a table :), this is the second value ]
    ;

    var table = try Parser.parse(zckt, alloc);
    defer table.deinit();

    // repeated `key =` lines overwrite: "key" ends up holding the table,
    // whose unkeyed entries are auto-numbered ("0", "1", ...)
    // (std.testing's expect* helpers take the *expected* value first)
    testing.expectEqualStrings("this is the second value", table.get("key").?.table.get("1").?.string);

    std.debug.print("\n{s}\n", .{table});
}