~ntgg/zosh

44e679630f8b4017f5cb6ba23901b82a51c00cb6 — Noah Graff 4 years ago 34b5664
started adding tokenizing, with a Tokenizer
1 files changed, 204 insertions(+), 0 deletions(-)

A src/tokenizer.zig
A src/tokenizer.zig => src/tokenizer.zig +204 -0
@@ 0,0 1,204 @@
const std = @import("std");
const TextBuffer = @import("text_buffer.zig").TextBuffer;

/// A symbol as classified by the tokenizer. At this stage it only carries
/// the kind of symbol (`id`).
pub const Symbol = struct {
    /// The kind of this symbol.
    id: Id,

    /// The kinds of symbol that can be reported. The operator members are
    /// each documented with the exact operator text they stand for.
    pub const Id = enum {
        /// End of file: there is no more input to look at.
        EndOfFile,

        /// A new line character ('\n').
        NewLine,

        /// A token: anything that is not end of file, a new line, or one of
        /// the operators listed below.
        Token,

        /// `&&`
        AndIf,

        /// `||`
        OrIf,

        /// `;;`
        DSemi,

        /// `<<`
        DLess,

        /// `>>`
        DGreat,

        /// `<&`
        LessAnd,

        /// `>&`
        GreatAnd,

        /// `<>`
        LessGreat,

        /// `<<-`
        DLessDash,

        /// `>|`
        Clobber,
    };
};

/// Classifies the upcoming input of a `TextBuffer` into symbol kinds.
///
/// The tokenizer only stores a pointer to the buffer it was given; it does
/// not create the buffer and `deinit` does not release it.
pub const Tokenizer = struct {
    /// Buffer the tokenizer peeks into and reads from.
    text: *TextBuffer,

    /// Create a tokenizer that operates on `text`.
    pub fn init(text: *TextBuffer) Tokenizer {
        return Tokenizer{
            .text = text,
        };
    }

    /// Placeholder: the tokenizer currently holds nothing that needs to be
    /// released.
    pub fn deinit(tokenizer: Tokenizer) void {
        // explicitly discard the receiver; there is nothing to clean up yet.
        _ = tokenizer;
    }

    // see https://pubs.opengroup.org/onlinepubs/9699919799/utilities/V3_chap02.html#tag_18_03

    /// Get the type of the next symbol. Multiple calls without changing the
    /// state of the tokenizer/text will return the same symbol type.
    ///
    /// Blank characters and `#` comments that precede the symbol are read
    /// (skipped) from the text as a side effect; the symbol itself is only
    /// peeked at and is left in the text for the caller to read.
    ///
    /// Returns `.EndOfFile` when no characters are left, `.NewLine` for
    /// '\n', the matching operator id when the text starts with one of the
    /// entries of `operators`, and `.Token` for everything else.
    pub fn nextSymbolId(tokenizer: *Tokenizer) Symbol.Id {
        // A loop rather than recursion: every skipped blank or comment used
        // to add a stack frame, so a long run of blanks could exhaust the
        // stack. `continue` re-peeks the text after each skip instead.
        while (tokenizer.text.peekChar()) |c| {
            if (c == '\n') return .NewLine;

            if (isOperatorStart(c)) {
                // `operators` is ordered long -> short, so the first match
                // is the longest operator present at this position.
                for (operators) |op| {
                    if (std.mem.eql(u8, tokenizer.text.peek(op.text.len), op.text)) {
                        return op.symbol_id;
                    }
                }
                // No listed operator matched: fall through. An operator
                // start character is neither blank nor '#', so this ends up
                // reported as a Token below.
            }

            if (std.ascii.isBlank(c)) {
                // skip blank characters.
                _ = tokenizer.text.readChar();
                continue;
            }

            if (c == '#') {
                // ignore characters until end-of-file or new line, not
                // including the new line character.
                while (tokenizer.text.peekChar()) |comment_char| {
                    if (comment_char == '\n') break;
                    _ = tokenizer.text.readChar();
                }
                continue;
            }

            return .Token;
        }

        // peekChar() yielded nothing: the text is exhausted.
        return .EndOfFile;
    }
};

// Walks nextSymbolId() through comments, new lines, every operator in the
// operator table and blank-separated tokens. Most expectations are repeated
// to check that asking for the next symbol id does not consume the symbol;
// the test itself advances the text with readChar()/read(n) in between
// (presumably consuming one / n characters - see text_buffer.zig).
test "Tokenizer.nextSymbolId()" {
    const t = std.testing;

    // A lone comment with no trailing new line: the comment is skipped and
    // nothing is left afterwards.
    var text = try TextBuffer.init(std.heap.direct_allocator,
        \\# this symbol will be EndOfFile
    );
    defer text.deinit();

    var tokenizer = Tokenizer.init(&text);
    defer tokenizer.deinit();

    t.expectEqual(Symbol.Id.EndOfFile, tokenizer.nextSymbolId());
    t.expectEqual(Symbol.Id.EndOfFile, tokenizer.nextSymbolId());

    // A comment terminated by '\n': the comment is skipped but the new line
    // stays in the text, so NewLine is reported until the test reads it.
    try text.append("# this symbol will be new line\n");
    t.expectEqual(Symbol.Id.NewLine, tokenizer.nextSymbolId());
    t.expectEqual(Symbol.Id.NewLine, tokenizer.nextSymbolId());
    t.expectEqual(Symbol.Id.NewLine, tokenizer.nextSymbolId());
    _ = text.readChar();
    t.expectEqual(Symbol.Id.EndOfFile, tokenizer.nextSymbolId());
    t.expectEqual(Symbol.Id.EndOfFile, tokenizer.nextSymbolId());

    // A comment line followed by every operator back to back. Each operator
    // is checked twice and then stepped over by reading its length.
    try text.append(
        \\# Some operators!
        \\&&||;;<<>><&>&<>>|<<-
    );
    t.expectEqual(Symbol.Id.NewLine, tokenizer.nextSymbolId());
    t.expectEqual(Symbol.Id.NewLine, tokenizer.nextSymbolId());
    _ = text.readChar();
    t.expectEqual(Symbol.Id.AndIf, tokenizer.nextSymbolId());
    t.expectEqual(Symbol.Id.AndIf, tokenizer.nextSymbolId());
    _ = text.read(2);
    t.expectEqual(Symbol.Id.OrIf, tokenizer.nextSymbolId());
    t.expectEqual(Symbol.Id.OrIf, tokenizer.nextSymbolId());
    _ = text.read(2);
    t.expectEqual(Symbol.Id.DSemi, tokenizer.nextSymbolId());
    t.expectEqual(Symbol.Id.DSemi, tokenizer.nextSymbolId());
    _ = text.read(2);
    t.expectEqual(Symbol.Id.DLess, tokenizer.nextSymbolId());
    t.expectEqual(Symbol.Id.DLess, tokenizer.nextSymbolId());
    _ = text.read(2);
    t.expectEqual(Symbol.Id.DGreat, tokenizer.nextSymbolId());
    t.expectEqual(Symbol.Id.DGreat, tokenizer.nextSymbolId());
    _ = text.read(2);
    t.expectEqual(Symbol.Id.LessAnd, tokenizer.nextSymbolId());
    t.expectEqual(Symbol.Id.LessAnd, tokenizer.nextSymbolId());
    _ = text.read(2);
    t.expectEqual(Symbol.Id.GreatAnd, tokenizer.nextSymbolId());
    t.expectEqual(Symbol.Id.GreatAnd, tokenizer.nextSymbolId());
    _ = text.read(2);
    t.expectEqual(Symbol.Id.LessGreat, tokenizer.nextSymbolId());
    t.expectEqual(Symbol.Id.LessGreat, tokenizer.nextSymbolId());
    _ = text.read(2);
    t.expectEqual(Symbol.Id.Clobber, tokenizer.nextSymbolId());
    t.expectEqual(Symbol.Id.Clobber, tokenizer.nextSymbolId());
    _ = text.read(2);
    // "<<-" is the only three character operator, hence read(3).
    t.expectEqual(Symbol.Id.DLessDash, tokenizer.nextSymbolId());
    t.expectEqual(Symbol.Id.DLessDash, tokenizer.nextSymbolId());
    _ = text.read(3);
    t.expectEqual(Symbol.Id.EndOfFile, tokenizer.nextSymbolId());
    t.expectEqual(Symbol.Id.EndOfFile, tokenizer.nextSymbolId());

    // Words separated by runs of blanks. nextSymbolId() skips the blanks in
    // front of each word, so the read(n) calls below only cover the word
    // itself: "Some", "tokens,", "With", "Spaces". The trailing blanks are
    // skipped as well, leaving EndOfFile.
    try text.append("Some tokens,     With  Spaces   ");
    t.expectEqual(Symbol.Id.Token, tokenizer.nextSymbolId());
    t.expectEqual(Symbol.Id.Token, tokenizer.nextSymbolId());
    t.expectEqual(Symbol.Id.Token, tokenizer.nextSymbolId());
    _ = text.read(4);
    t.expectEqual(Symbol.Id.Token, tokenizer.nextSymbolId());
    t.expectEqual(Symbol.Id.Token, tokenizer.nextSymbolId());
    t.expectEqual(Symbol.Id.Token, tokenizer.nextSymbolId());
    _ = text.read(7);
    t.expectEqual(Symbol.Id.Token, tokenizer.nextSymbolId());
    t.expectEqual(Symbol.Id.Token, tokenizer.nextSymbolId());
    _ = text.read(4);
    t.expectEqual(Symbol.Id.Token, tokenizer.nextSymbolId());
    _ = text.read(6);
    t.expectEqual(Symbol.Id.EndOfFile, tokenizer.nextSymbolId());
}

/// Pairs the literal text of an operator with the symbol id that is reported
/// when that text is found.
const Operator = struct {
    /// Symbol id reported when `text` is matched.
    symbol_id: Symbol.Id,
    /// The exact characters that make up the operator.
    text: []const u8,
};

// Table of the multi-character operators the tokenizer recognizes.
//
// Keep it ordered by text length, long -> short: Tokenizer.nextSymbolId
// returns the first entry whose text matches, so "<<-" has to come before
// its prefix "<<" or it would never be reported.
const operators = [_]Operator{
    Operator{ .symbol_id = .DLessDash, .text = "<<-" },
    Operator{ .symbol_id = .AndIf, .text = "&&" },
    Operator{ .symbol_id = .OrIf, .text = "||" },
    Operator{ .symbol_id = .DSemi, .text = ";;" },
    Operator{ .symbol_id = .DLess, .text = "<<" },
    Operator{ .symbol_id = .DGreat, .text = ">>" },
    Operator{ .symbol_id = .LessAnd, .text = "<&" },
    Operator{ .symbol_id = .GreatAnd, .text = ">&" },
    Operator{ .symbol_id = .LessGreat, .text = "<>" },
    Operator{ .symbol_id = .Clobber, .text = ">|" },
};

/// Report whether `char` is one of the characters an operator can begin
/// with: '&', '|', ';', '<' or '>'.
pub fn isOperatorStart(char: u8) bool {
    // every character that can open an operator, as a lookup string.
    const operator_starts = "&|;<>";
    return std.mem.indexOfScalar(u8, operator_starts, char) != null;
}