~fsx/cbare

e4b02badcdcc14312abd9979d120010bf38d102e — Frank Smit a month ago 9f30039 dsl
Work on lexer.
4 files changed, 268 insertions(+), 224 deletions(-)

M .gitignore
M src/lex.c
M src/lex.h
M test/lextest.c
M .gitignore => .gitignore +1 -0
@@ 1,6 1,7 @@
/baretest
/utf8test
/buftest
/lextest
/libcbare.so
/libcbare.a
/cbare.pc

M src/lex.c => src/lex.c +16 -6
@@ 1,10 1,8 @@
#include <ctype.h>
#include <string.h>

#include "bs.h"
#include "die.h"
#include "lex.h"
#include "membuf.h"

static struct lex_token scanword(struct lexer *ctx);
static struct lex_token scanint(struct lexer *ctx);


@@ 15,10 13,20 @@ static bool isatend(struct lexer *ctx);
static char advance(struct lexer *ctx);
static char peek(struct lexer *ctx);

/*
 * Copy the first sz bytes of src into a new (sz + 1)-byte buffer obtained
 * from arena_alloc(a, ...) and NUL-terminate it.
 *
 * Returns the new buffer, or NULL if arena_alloc() returned NULL.
 * NOTE(review): whether arena_alloc() can actually return NULL is not visible
 * from here; the check is defensive and costs nothing if it cannot.
 */
char *bstrcpy(struct arena *a, const char *src, size_t sz)
{
	char *dst = (char *)arena_alloc(a, sz + 1);

	if (dst == NULL) {
		return NULL;
	}

	/* memcpy() with a null source is undefined even for a zero length. */
	if (sz > 0) {
		memcpy(dst, src, sz);
	}
	/* Only the terminator needs writing; the rest was just copied. */
	dst[sz] = '\0';

	return dst;
}

void
lex_init(struct lexer *ctx, const char *src)
lex_init(struct arena *alloc, struct lexer *ctx, const char *src)
{
	memset(ctx, 0, sizeof(*ctx));
	ctx->alloc = alloc;
	ctx->src = src;
	ctx->cur = src;
	ctx->line = 1;


@@ 134,7 142,9 @@ scanword(struct lexer *ctx)
/*
 * Keep calling advance() for as long as peek() reports a decimal digit, then
 * build a LEX_INTEGER token with mktoken().
 */
static struct lex_token
scanint(struct lexer *ctx)
{
	/*
	 * The <ctype.h> classifiers are only defined for values representable
	 * as unsigned char (or EOF); peek() returns plain char, which may be
	 * negative, so convert before classifying.
	 */
	while (isdigit((unsigned char)peek(ctx))) {
		advance(ctx);
	}
	return mktoken(ctx, LEX_INTEGER);
}



@@ 149,7 159,7 @@ scancomment(struct lexer *ctx)
static struct lex_token
mktoken(struct lexer *ctx, uint8_t type)
{
	const char *src = bscpy(ctx->src, (size_t)(ctx->cur - ctx->src));
	const char *src = bstrcpy(ctx->alloc, ctx->src, (size_t)(ctx->cur - ctx->src));

	return (struct lex_token){
		.type = type,

M src/lex.h => src/lex.h +4 -1
@@ 5,6 5,8 @@
#include <stddef.h>
#include <stdint.h>

#include "alloc.h"

enum lex_token_type {
	LEX_UNKNOWN,
	LEX_EOF,


@@ 51,6 53,7 @@ struct lex_token {
};

struct lexer {
	struct arena *alloc;
	const char *src;
	const char *cur;
	size_t sz;


@@ 60,7 63,7 @@ struct lexer {
	bool has_peeked;
};

void lex_init(struct lexer *ctx, const char *src);
void lex_init(struct arena *alloc, struct lexer *ctx, const char *src);
struct lex_token lex_next(struct lexer *ctx);
struct lex_token lex_peek(struct lexer *ctx);
const char *lex_token_name(enum lex_token_type type);

M test/lextest.c => test/lextest.c +247 -217
@@ 2,12 2,21 @@

#include "die.h"
#include "lex.h"
#include "lexdata.h"
// #include "lexdata.h"

// Number of elements in the array `v`, cast to int so it can be compared
// against an int loop counter. Only meaningful for real arrays, not pointers.
#define arrlen(v) (int) (sizeof(v) / sizeof(v[0]))
// #define arrlen(v) (int) (sizeof(v) / sizeof(v[0]))

void
check(bool result, char *msg, ...)
// Mark `x` as intentionally unused by evaluating it and discarding the result.
#define UNUSED(x) ((void)(x))

// Element count of a real array (never a pointer). Both the argument and the
// whole expansion are parenthesised so the macro acts as a single operand
// inside larger expressions such as `total / ARRLEN(a)`.
#define ARRLEN(a) (sizeof(a) / sizeof((a)[0]))

// Loop header that walks `elem` (declared with the pointer type `type`) over
// every element of `array`, from the first element up to, but not including,
// one past the last. `array` must be a real array so ARRLEN can measure it.
#define FOR_EACH_ARRAY(type, elem, array) \
	for (type elem = &(array)[0]; \
	     elem < &(array)[ARRLEN(array)]; \
	     ++elem)

static void
ok(bool result, char *msg, ...)
{
	va_list args;



@@ 18,10 27,22 @@ check(bool result, char *msg, ...)
	va_end(args);
}

// void
// check(bool result, char *msg, ...)
// {
// 	va_list args;

// 	va_start(args, msg);
// 	if (!result) {
// 		die_va(msg, &args);
// 	}
// 	va_end(args);
// }

void
toktype(enum lex_token_type expected, enum lex_token_type actual)
{
	check(
	ok(
		expected == actual,
		"expected \"%s\", but got \"%s\"",
		lex_token_name(expected),


@@ 29,25 50,28 @@ toktype(enum lex_token_type expected, enum lex_token_type actual)
	);
}

// Assert that a token's source text matches the expected text.
//
// Two NULL pointers count as a match; a NULL on only one side is a mismatch.
// Neither strcmp() nor the "%s" conversions are ever handed a NULL pointer,
// so a mismatch is reported through check() instead of being undefined.
void
toksrc(const char *expected, const char *actual)
{
	bool same;

	if (expected == NULL || actual == NULL) {
		// At least one side is NULL: equal only if both are.
		same = (expected == actual);
	} else {
		same = (strcmp(expected, actual) == 0);
	}

	check(
		same,
		"expected \"%s\", but got \"%s\"",
		expected != NULL ? expected : "(null)",
		actual != NULL ? actual : "(null)"
	);
}
// void
// toksrc(const char *expected, const char *actual)
// {
// 	check(
// 		(
// 			(expected == NULL && actual == NULL)
// 			||
// 			(expected != NULL && strcmp(expected, actual) == 0)
// 		),
// 		"expected \"%s\", but got \"%s\"", expected, actual
// 	);
// }

void
test_lex_words()
{
	struct lexer lex;
	struct test {
		enum lex_token_type type;
		char *input;
	};

	struct { enum lex_token_type type; char *input; } words[] = {
	struct test tests[] = {
		{LEX_TYPE, "type"},
		{LEX_UINT, "uint"},
		{LEX_U8, "u8"},


@@ 70,213 94,219 @@ test_lex_words()
		{LEX_NAME, "bleepbloop"},
	};

	for (int i = 0; i < arrlen(words); ++i) {
		lex_init(&lex, words[i].input);
		toktype(words[i].type, lex_next(&lex).type);
	}
}

void
test_lex_eof()
{
	struct lexer lex;
	struct lex_token token;

	lex_init(&lex, "");

	token = lex_next(&lex);
	toktype(LEX_EOF,token.type);
	check(token.src == NULL, "expected NULL");

}

void
test_lex_integer()
{
	struct lexer lex;
	struct lex_token token;

	lex_init(&lex, "12345");

	token = lex_next(&lex);
	toktype(LEX_INTEGER, token.type);
	toksrc("12345", token.src);

}

void
test_lex_symbols()
{
	struct lexer lex;

	struct { enum lex_token_type type; char *input; } symbols[] = {
		{LEX_LANGLE, "<"},
		{LEX_RANGLE, ">"},
		{LEX_LBRACE, "{"},
		{LEX_RBRACE, "}"},
		{LEX_LBRACKET, "["},
		{LEX_RBRACKET, "]"},
		{LEX_LPAREN, "("},
		{LEX_RPAREN, ")"},
		{LEX_PIPE, "|"},
		{LEX_EQUAL, "="},
		{LEX_COLON, ":"},
	};

	for (int i = 0; i < arrlen(symbols); ++i) {
		lex_init(&lex, symbols[i].input);
		toktype(symbols[i].type, lex_next(&lex).type);
	}
}

void
test_lex_unknown()
{
	struct lexer lex;

	struct { enum lex_token_type type; char *input; } symbols[] = {
		{LEX_UNKNOWN, "!"},
		{LEX_UNKNOWN, "@"},
		{LEX_UNKNOWN, "$"},
		{LEX_UNKNOWN, "%"},
		{LEX_UNKNOWN, "^"},
		{LEX_UNKNOWN, "&"},
		{LEX_UNKNOWN, "*"},
		{LEX_UNKNOWN, "-"},
		{LEX_UNKNOWN, "?"},
		{LEX_UNKNOWN, "."},
		{LEX_UNKNOWN, ","},
		{LEX_UNKNOWN, ";"},
		{LEX_UNKNOWN, "'"},
		{LEX_UNKNOWN, "\""},
		{LEX_UNKNOWN, "_"}, // _ is valid, but nothing starts with an underscore.
	};
	// for (int i = 0; i < arrlen(words); ++i) {
	FOR_EACH_ARRAY(struct test *, t, tests) {
		struct arena a = {0};
		struct lexer lex = {0};

	for (int i = 0; i < arrlen(symbols); ++i) {
		lex_init(&lex, symbols[i].input);
		toktype(symbols[i].type, lex_next(&lex).type);
		arena_create(&a);
		lex_init(&a, &lex, t->input);
		toktype(t->type, lex_next(&lex).type);
		arena_destroy(&a);
	}
}

void
test_lex_example()
{
	struct lexer lex;
	struct lex_token token;

	struct { enum lex_token_type type; char *src; } expected[] = {
		{LEX_TYPE, "type"},
		{LEX_NAME, "PublicKey"},
		{LEX_DATA, "data"},
		{LEX_LANGLE, "<"},
		{LEX_INTEGER, "128"},
		{LEX_RANGLE, ">"},
		{LEX_TYPE, "type"},
		{LEX_NAME, "Time"},
		{LEX_STRING, "string"},
		{LEX_TYPE, "type"},
		{LEX_NAME, "Department"},
		{LEX_LANGLE, "<"},
		{LEX_NAME, "ACCOUNTING"},
		{LEX_NAME, "ADMINISTRATION"},
		{LEX_NAME, "CUSTOMER_SERVICE"},
		{LEX_NAME, "DEVELOPMENT"},
		{LEX_NAME, "JSMITH"},
		{LEX_EQUAL, "="},
		{LEX_INTEGER, "99"},
		{LEX_RANGLE, ">"},
		{LEX_TYPE, "type"},
		{LEX_NAME, "Customer"},
		{LEX_LBRACE, "{"},
		{LEX_NAME, "name"},
		{LEX_COLON, ":"},
		{LEX_STRING, "string"},
		{LEX_NAME, "email"},
		{LEX_COLON, ":"},
		{LEX_STRING, "string"},
		{LEX_NAME, "address"},
		{LEX_COLON, ":"},
		{LEX_NAME, "Address"},
		{LEX_NAME, "orders"},
		{LEX_COLON, ":"},
		{LEX_LBRACKET, "["},
		{LEX_RBRACKET, "]"},
		{LEX_LBRACE, "{"},
		{LEX_NAME, "orderId"},
		{LEX_COLON, ":"},
		{LEX_I64, "i64"},
		{LEX_NAME, "quantity"},
		{LEX_COLON, ":"},
		{LEX_I32, "i32"},
		{LEX_RBRACE, "}"},
		{LEX_NAME, "metadata"},
		{LEX_COLON, ":"},
		{LEX_MAP, "map"},
		{LEX_LBRACKET, "["},
		{LEX_STRING, "string"},
		{LEX_RBRACKET, "]"},
		{LEX_DATA, "data"},
		{LEX_RBRACE, "}"},
		{LEX_TYPE, "type"},
		{LEX_NAME, "Person"},
		{LEX_LPAREN, "("},
		{LEX_NAME, "Customer"},
		{LEX_PIPE, "|"},
		{LEX_NAME, "Employee"},
		{LEX_RPAREN, ")"},
		{LEX_EOF, NULL},
	};

	lex_init(&lex, testdata_example);

	for (int i = 0; i < arrlen(expected); ++i) {
		token = lex_next(&lex);
		toktype(expected[i].type, token.type);
		toksrc(expected[i].src, token.src);
	}

}

void
test_lex_comment()
{
	struct lexer lex;
	struct lex_token token;

	struct { enum lex_token_type type; char *src; } expected[] = {
		{LEX_EOF, NULL},
	};

	lex_init(&lex, testdata_comment);

	for (int i = 0; i < arrlen(expected); ++i) {
		token = lex_next(&lex);
		toktype(expected[i].type, token.type);
		toksrc(expected[i].src, token.src);
	}

}

// lex_token_name() must return "unknown" for a value (9999) that lies outside
// enum lex_token_type.
void
test_lex_name_out_of_bounds()
{
	const char *name = lex_token_name(9999);
	const bool is_unknown = (strcmp(name, "unknown") == 0);

	check(is_unknown, "expected unknown");
}
// void
// test_lex_eof()
// {
// 	struct lexer lex;
// 	struct lex_token token;

// 	lex_init(&lex, "");

// 	token = lex_next(&lex);
// 	toktype(LEX_EOF,token.type);
// 	check(token.src == NULL, "expected NULL");

// }

// void
// test_lex_integer()
// {
// 	struct lexer lex;
// 	struct lex_token token;

// 	lex_init(&lex, "12345");

// 	token = lex_next(&lex);
// 	toktype(LEX_INTEGER, token.type);
// 	toksrc("12345", token.src);

// }

// void
// test_lex_symbols()
// {
// 	struct lexer lex;

// 	struct { enum lex_token_type type; char *input; } symbols[] = {
// 		{LEX_LANGLE, "<"},
// 		{LEX_RANGLE, ">"},
// 		{LEX_LBRACE, "{"},
// 		{LEX_RBRACE, "}"},
// 		{LEX_LBRACKET, "["},
// 		{LEX_RBRACKET, "]"},
// 		{LEX_LPAREN, "("},
// 		{LEX_RPAREN, ")"},
// 		{LEX_PIPE, "|"},
// 		{LEX_EQUAL, "="},
// 		{LEX_COLON, ":"},
// 	};

// 	for (int i = 0; i < arrlen(symbols); ++i) {
// 		lex_init(&lex, symbols[i].input);
// 		toktype(symbols[i].type, lex_next(&lex).type);
// 	}
// }

// void
// test_lex_unknown()
// {
// 	struct lexer lex;

// 	struct { enum lex_token_type type; char *input; } symbols[] = {
// 		{LEX_UNKNOWN, "!"},
// 		{LEX_UNKNOWN, "@"},
// 		{LEX_UNKNOWN, "$"},
// 		{LEX_UNKNOWN, "%"},
// 		{LEX_UNKNOWN, "^"},
// 		{LEX_UNKNOWN, "&"},
// 		{LEX_UNKNOWN, "*"},
// 		{LEX_UNKNOWN, "-"},
// 		{LEX_UNKNOWN, "?"},
// 		{LEX_UNKNOWN, "."},
// 		{LEX_UNKNOWN, ","},
// 		{LEX_UNKNOWN, ";"},
// 		{LEX_UNKNOWN, "'"},
// 		{LEX_UNKNOWN, "\""},
// 		{LEX_UNKNOWN, "_"}, // _ is valid, but nothing starts with an underscore.
// 	};

// 	for (int i = 0; i < arrlen(symbols); ++i) {
// 		lex_init(&lex, symbols[i].input);
// 		toktype(symbols[i].type, lex_next(&lex).type);
// 	}
// }

// void
// test_lex_example()
// {
// 	struct lexer lex;
// 	struct lex_token token;

// 	struct { enum lex_token_type type; char *src; } expected[] = {
// 		{LEX_TYPE, "type"},
// 		{LEX_NAME, "PublicKey"},
// 		{LEX_DATA, "data"},
// 		{LEX_LANGLE, "<"},
// 		{LEX_INTEGER, "128"},
// 		{LEX_RANGLE, ">"},
// 		{LEX_TYPE, "type"},
// 		{LEX_NAME, "Time"},
// 		{LEX_STRING, "string"},
// 		{LEX_TYPE, "type"},
// 		{LEX_NAME, "Department"},
// 		{LEX_LANGLE, "<"},
// 		{LEX_NAME, "ACCOUNTING"},
// 		{LEX_NAME, "ADMINISTRATION"},
// 		{LEX_NAME, "CUSTOMER_SERVICE"},
// 		{LEX_NAME, "DEVELOPMENT"},
// 		{LEX_NAME, "JSMITH"},
// 		{LEX_EQUAL, "="},
// 		{LEX_INTEGER, "99"},
// 		{LEX_RANGLE, ">"},
// 		{LEX_TYPE, "type"},
// 		{LEX_NAME, "Customer"},
// 		{LEX_LBRACE, "{"},
// 		{LEX_NAME, "name"},
// 		{LEX_COLON, ":"},
// 		{LEX_STRING, "string"},
// 		{LEX_NAME, "email"},
// 		{LEX_COLON, ":"},
// 		{LEX_STRING, "string"},
// 		{LEX_NAME, "address"},
// 		{LEX_COLON, ":"},
// 		{LEX_NAME, "Address"},
// 		{LEX_NAME, "orders"},
// 		{LEX_COLON, ":"},
// 		{LEX_LBRACKET, "["},
// 		{LEX_RBRACKET, "]"},
// 		{LEX_LBRACE, "{"},
// 		{LEX_NAME, "orderId"},
// 		{LEX_COLON, ":"},
// 		{LEX_I64, "i64"},
// 		{LEX_NAME, "quantity"},
// 		{LEX_COLON, ":"},
// 		{LEX_I32, "i32"},
// 		{LEX_RBRACE, "}"},
// 		{LEX_NAME, "metadata"},
// 		{LEX_COLON, ":"},
// 		{LEX_MAP, "map"},
// 		{LEX_LBRACKET, "["},
// 		{LEX_STRING, "string"},
// 		{LEX_RBRACKET, "]"},
// 		{LEX_DATA, "data"},
// 		{LEX_RBRACE, "}"},
// 		{LEX_TYPE, "type"},
// 		{LEX_NAME, "Person"},
// 		{LEX_LPAREN, "("},
// 		{LEX_NAME, "Customer"},
// 		{LEX_PIPE, "|"},
// 		{LEX_NAME, "Employee"},
// 		{LEX_RPAREN, ")"},
// 		{LEX_EOF, NULL},
// 	};

// 	lex_init(&lex, testdata_example);

// 	for (int i = 0; i < arrlen(expected); ++i) {
// 		token = lex_next(&lex);
// 		toktype(expected[i].type, token.type);
// 		toksrc(expected[i].src, token.src);
// 	}

// }

// void
// test_lex_comment()
// {
// 	struct lexer lex;
// 	struct lex_token token;

// 	struct { enum lex_token_type type; char *src; } expected[] = {
// 		{LEX_EOF, NULL},
// 	};

// 	lex_init(&lex, testdata_comment);

// 	for (int i = 0; i < arrlen(expected); ++i) {
// 		token = lex_next(&lex);
// 		toktype(expected[i].type, token.type);
// 		toksrc(expected[i].src, token.src);
// 	}

// }

// void
// test_lex_name_out_of_bounds()
// {
//     // Check if value outside of enum lex_token_type is handled.
//     check(strcmp(lex_token_name(9999), "unknown") == 0, "expected unknown");
// }

// Test driver: calls each enabled lexer test function in order and returns 0
// once they have all returned. Command-line arguments are ignored.
int main(int argc, char const *argv[])
{
	// Mark both parameters as intentionally unused (plain casts and the
	// UNUSED() macro are both present here).
	(void)argc;
	(void)argv;
	UNUSED(argc);
	UNUSED(argv);

	test_lex_words();
	test_lex_eof();
	test_lex_integer();
	test_lex_symbols();
	test_lex_unknown();
	test_lex_example();
	test_lex_comment();
	test_lex_name_out_of_bounds();
	// The same calls, commented out.
	// test_lex_eof();
	// test_lex_integer();
	// test_lex_symbols();
	// test_lex_unknown();
	// test_lex_example();
	// test_lex_comment();
	// test_lex_name_out_of_bounds();

	return 0;
}