From 932437b33b3640f0b9c94ec34eeec33a752ecac3 Mon Sep 17 00:00:00 2001 From: Tim Morgan Date: Thu, 3 Oct 2019 23:01:27 -0500 Subject: [PATCH] Add support for regex literal and matching (regex? /foo/) ; => true (regex-match /foo/ "foo") ; => 0 (regex-match /foo/ "bar") ; => nil (/foo/ "foo") ; => 0 (regexes are callable) --- Makefile | 1 + core.c | 38 ++++++++++++++++++++++++++++++ core.h | 2 ++ malcc.c | 12 ++++++++++ printer.c | 33 ++++++++++++++++++++++++++ printer.h | 1 + reader.c | 61 ++++++++++++++++++++++++++++++++++++++++++++++++- reader.h | 3 ++- tests/regex.mal | 49 +++++++++++++++++++++++++++++++++++++++ types.c | 10 ++++++++ types.h | 13 ++++++++++- 11 files changed, 220 insertions(+), 3 deletions(-) create mode 100644 tests/regex.mal diff --git a/Makefile b/Makefile index 83f2273..348e118 100644 --- a/Makefile +++ b/Makefile @@ -102,6 +102,7 @@ test-self-hosted: all test-supplemental: all $(RUN_TEST_CMD) --test-timeout 30 ../../tests/utf-8.mal ../../malcc + $(RUN_TEST_CMD) --test-timeout 30 ../../tests/regex.mal ../../malcc test-mal-in-mal: mal-in-mal $(RUN_TEST_CMD) --test-timeout 30 step2_eval.mal ../../mal-in-mal diff --git a/core.c b/core.c index 6d28064..2e64348 100644 --- a/core.c +++ b/core.c @@ -1,5 +1,6 @@ #include #include +#include #include #include #include @@ -76,6 +77,8 @@ struct hashmap* core_ns() { hashmap_put(ns, "number?", core_is_number); hashmap_put(ns, "fn?", core_is_fn); hashmap_put(ns, "macro?", core_is_macro); + hashmap_put(ns, "regex?", core_is_regex); + hashmap_put(ns, "regex-match", core_regex_match); return ns; } @@ -757,6 +760,41 @@ MalType* core_is_macro(MalEnv *env, size_t argc, MalType **args) { return is_macro(val) ? mal_true() : mal_false(); } +MalType* core_is_regex(MalEnv *env, size_t argc, MalType **args) { + UNUSED(env); + mal_assert(argc == 1, "Expected 1 argument to regex?"); + MalType *val = args[0]; + return is_regex(val) ? 
mal_true() : mal_false(); +} + +MalType* core_regex_match(MalEnv *env, size_t argc, MalType **args) { /* (regex-match re str): start offset of the first match, or nil when pcre_exec reports no match or an error */ + UNUSED(env); + mal_assert(argc == 2, "Expected 2 arguments to regex-match"); + MalType *regex = args[0]; + mal_assert(is_regex(regex), "Expected first argument to regex-match to be a regex"); + MalType *str = args[1]; + mal_assert(is_string(str), "Expected second argument to regex-match to be a string"); + + const char *pcreErrorStr; + int pcreErrorOffset; + pcre *reCompiled = pcre_compile(regex->regex, 0, &pcreErrorStr, &pcreErrorOffset, NULL); + if(reCompiled == NULL) { + return mal_error(mal_string("Could not compile regex.")); + } + + pcre_extra *pcreExtra = pcre_study(reCompiled, 0, &pcreErrorStr); /* 0 = no study options; PCRE_EXTENDED is a pcre_compile flag, not a study option */ + if(pcreErrorStr != NULL) { + pcre_free(reCompiled); /* release the compiled pattern on the error path too */ + return mal_error(mal_string("Could not study regex.")); + } + + int subStrVec[30]; + int pcreExecRet = pcre_exec(reCompiled, pcreExtra, str->str, str->str_len, 0, 0, subStrVec, 30); + if (pcreExtra != NULL) { pcre_free_study(pcreExtra); } /* study data must be released by the caller */ + pcre_free(reCompiled); + return pcreExecRet < 0 ? mal_nil() : mal_number(subStrVec[0]); +} + void add_core_ns_to_env(MalEnv *env) { struct hashmap *ns = core_ns(); struct hashmap_iter *core_iter; diff --git a/core.h b/core.h index 78a1d14..c8bf8eb 100644 --- a/core.h +++ b/core.h @@ -67,6 +67,8 @@ MalType* core_is_string(MalEnv *env, size_t argc, MalType **args); MalType* core_is_number(MalEnv *env, size_t argc, MalType **args); MalType* core_is_fn(MalEnv *env, size_t argc, MalType **args); MalType* core_is_macro(MalEnv *env, size_t argc, MalType **args); +MalType* core_is_regex(MalEnv *env, size_t argc, MalType **args); +MalType* core_regex_match(MalEnv *env, size_t argc, MalType **args); void add_core_ns_to_env(MalEnv *env); #endif diff --git a/malcc.c b/malcc.c index b4cd7df..b6e8ee8 100644 --- a/malcc.c +++ b/malcc.c @@ -65,6 +65,7 @@ int gen_list_code(MalType *node, MalEnv *env, struct codegen *code, int ret, int int gen_load_file(MalType *node, MalEnv *env, struct codegen *code, int ret, int *var_num); int gen_number_code(MalType 
*node, struct codegen *code, int ret); int gen_string_code(MalType *node, struct codegen *code, int ret); +int gen_regex_code(MalType *node, struct codegen *code, int ret); int gen_symbol_code(MalType *node, struct codegen *code, int ret); int gen_symbol_lookup_code(MalType *node, MalEnv *env, struct codegen *code, int ret); int gen_try_code(MalType *node, MalEnv *env, struct codegen *code, int ret, int *var_num); @@ -229,6 +230,8 @@ int gen_code(MalType *node, MalEnv *env, struct codegen *code, int ret, int *var return gen_number_code(node, code, ret); case MAL_STRING_TYPE: return gen_string_code(node, code, ret); + case MAL_REGEX_TYPE: + return gen_regex_code(node, code, ret); case MAL_SYMBOL_TYPE: if (quoting) { return gen_symbol_code(node, code, ret); @@ -312,6 +315,8 @@ int gen_call_code(MalType *node, MalEnv *env, struct codegen *code, int ret, int } else if (strcmp(sym->symbol, "load-file") == 0) { return gen_load_file(mal_car(mal_cdr(node)), env, code, ret, var_num); } + } else if (is_regex(sym)) { + return gen_call_code(mal_cons(mal_symbol("regex-match"), node), env, code, ret, var_num); } // look up the lambda in the env @@ -724,6 +729,13 @@ int gen_string_code(MalType *node, struct codegen *code, int ret) { return 1; } +int gen_regex_code(MalType *node, struct codegen *code, int ret) { + MalType *str = mal_string(node->regex); + MalType *temp_code = mal_sprintf("mal_regex(%s)", pr_str(str, 1)); + append_code(code->body, temp_code, ret); + return 1; +} + int gen_symbol_code(MalType *node, struct codegen *code, int ret) { MalType *temp_code = mal_sprintf("mal_symbol(%s)", pr_str(mal_string(node->symbol), 1)); append_code(code->body, temp_code, ret); diff --git a/printer.c b/printer.c index 9bd18eb..eaf16fd 100644 --- a/printer.c +++ b/printer.c @@ -43,6 +43,8 @@ char* pr_str(MalType *val, int print_readably) { return string("nil"); case MAL_NUMBER_TYPE: return long_long_to_string(val->number); + case MAL_REGEX_TYPE: + return pr_regex(val, print_readably); 
case MAL_STRING_TYPE: return pr_string(val, print_readably); case MAL_SYMBOL_TYPE: @@ -151,3 +153,34 @@ char* pr_string(MalType *val, int print_readably) { return str; } } + +char* pr_regex(MalType *val, int print_readably) { + assert(strlen(val->regex) == val->regex_len); + if (print_readably) { + size_t len = val->regex_len; + char *orig = val->regex; + MalType *repr = mal_string("/"); + for (size_t i=0; istr; + } else { + size_t len = val->regex_len + 1; + char *str = GC_MALLOC(len); + snprintf(str, len, "%s", val->regex); + return str; + } +} diff --git a/printer.h b/printer.h index da3fc2f..5b2674b 100644 --- a/printer.h +++ b/printer.h @@ -12,5 +12,6 @@ char* pr_list(MalType *val, int print_readably); char* pr_vector(MalType *val, int print_readably); char* pr_hashmap(MalType *val, int print_readably); char* pr_string(MalType *val, int print_readably); +char* pr_regex(MalType *val, int print_readably); #endif diff --git a/reader.c b/reader.c index 2f150f2..a36b071 100644 --- a/reader.c +++ b/reader.c @@ -115,7 +115,8 @@ MalType* read_str(char* code) { MalType* read_form(Reader* reader) { char *token = reader_peek(reader); - if (!token || strlen(token) == 0) { + size_t len = token ? strlen(token) : 0; /* keep the NULL guard ahead of strlen(); strlen(NULL) is undefined */ + if (!token || len == 0) { return mal_nil(); } else { switch (*token) { @@ -130,6 +131,12 @@ MalType* read_form(Reader* reader) { return read_hashmap(reader); case '"': return read_string(reader); + case '/': + if (len == 1) { + return read_atom(reader); + } else { + return read_regex(reader); + } case ':': return read_keyword(reader); case '\'': @@ -273,6 +280,58 @@ char unescape_char(char *token, size_t *i, size_t len) { } } +MalType* read_regex(Reader *reader) { + char *token = reader_next(reader); + size_t len = strlen(token); + char *str = GC_MALLOC(len + 1); + size_t index = 0; + int saw_slashes = 0; + int saw_lparens = 0; + int saw_rparens = 0; + char unescaped; + for (size_t i=0; istr = string("/"); + token->next = tokenize(token_string+1); // skip the first 
character + Token* last = token->next; + while (last->next) last = last->next; // find the last token in the newly tokenized list + last->next = reader->token; // append our existing list on the end of it + reader->token = token; // point the reader at our newly prepended list of tokens + return read_atom(reader); + } + return mal_regex(str); +} + MalType* read_list(Reader *reader) { MalType *atom, *last_cons = NULL, *first_cons = NULL, *cons; reader_next(reader); // consume '(' diff --git a/reader.h b/reader.h index 27fbe58..1e3cd52 100644 --- a/reader.h +++ b/reader.h @@ -17,7 +17,7 @@ char* reader_next(Reader* reader); char* reader_peek(Reader* reader); -static const char PATTERN[] = "[\\s,]*(~@|[\\[\\]{}()'`~^@]|\"(?:\\\\.|[^\\\\\"])*\"?|;.*|[^\\s\\[\\]{}('\"`,;)]*)"; +static const char PATTERN[] = "[\\s,]*(~@|[\\[\\]{}()'`~^@]|\"(?:\\\\.|[^\\\\\"])*\"?|/(?:\\\\.|[^\\\\/])*/|;.*|[^\\s\\[\\]{}('\"`,;)]*)"; Token* tokenize(char* code); @@ -30,6 +30,7 @@ MalType* read_list(Reader* reader); MalType* read_vector(Reader* reader); MalType* read_hashmap(Reader *reader); MalType* read_string(Reader *reader); +MalType* read_regex(Reader *reader); char unescape_char(char *token, size_t *i, size_t len); MalType* read_atom(Reader* reader); diff --git a/tests/regex.mal b/tests/regex.mal new file mode 100644 index 0000000..b3aa5c6 --- /dev/null +++ b/tests/regex.mal @@ -0,0 +1,49 @@ +;;; +;;; Supplemental malcc test for regex support +;;; + +;; Testing that the reader understands regex literals +/regex/ +;=>/regex/ +/escaped \/ slash/ +;=>/escaped \/ slash/ +(regex? /foo/) +;=>true +(regex? / foo /) +;=>true +(regex? "foo") +;=>false +(regex? 
/) +;=>false + +;; Not to be confused with the division function +/ +;=> +(/ 6 2) +;=>3 +(/ 6 (/ 4 2)) +;=>3 +(/ 6 (/ 4 (/ 2 1))) +;=>3 + +;; Testing that matching works +(regex-match /foo/ "foo") +;=>0 +(regex-match /foo/ "foofoo") +;=>0 +(regex-match /bar/ "foobar") +;=>3 +(regex-match /foo/ "foobar") +;=>0 +(regex-match /^foo/ "foobar") +;=>0 +(regex-match /^foo$/ "foobar") +;=>nil +(regex-match /foo/ "bar") +;=>nil + +;; Testing that regex is callable (calls regex-match under the hood) +(/foo/ "foo") +;=>0 +(/foo/ "bar") +;=>nil diff --git a/types.c b/types.c index 63b7484..72e7b5c 100644 --- a/types.c +++ b/types.c @@ -240,6 +240,16 @@ MalType* mal_string_to_list(MalType *orig) { return mal_vector_to_list(vec); } +MalType* mal_regex(char *str) { + MalType *val = mal_alloc(); + size_t len = strlen(str); + val->type = MAL_REGEX_TYPE; + val->regex_len = len; + val->regex = GC_MALLOC(len + 1); + snprintf(val->regex, len + 1, "%s", str); + return val; +} + #define VECTOR_INIT_SIZE 10 #define VECTOR_GROW_FACTOR 2 diff --git a/types.h b/types.h index 2c4b3ba..a46050c 100644 --- a/types.h +++ b/types.h @@ -27,6 +27,7 @@ enum MalTypeType { MAL_VECTOR_TYPE, MAL_HASHMAP_TYPE, MAL_STRING_TYPE, + MAL_REGEX_TYPE, MAL_LAMBDA_TYPE, MAL_CONTINUATION_TYPE, MAL_ATOM_TYPE, @@ -65,6 +66,12 @@ struct MalType { char *str; }; + // MAL_REGEX_TYPE + struct { + size_t regex_len; + char *regex; + }; + // MAL_LAMBDA_TYPE, MAL_CONTINUATION_TYPE struct { MalType* (*fn)(MalEnv *env, size_t argc, MalType **args); @@ -86,7 +93,8 @@ struct MalType { (val)->type == MAL_KEYWORD_TYPE || \ (val)->type == MAL_NUMBER_TYPE || \ (val)->type == MAL_SYMBOL_TYPE || \ - (val)->type == MAL_STRING_TYPE) + (val)->type == MAL_STRING_TYPE || \ + (val)->type == MAL_REGEX_TYPE) MalType* mal_alloc(); @@ -143,6 +151,9 @@ MalType* mal_string_replace(MalType *val, char *find, char *replace); MalType* mal_string_replace_all(MalType *orig, char *find, char *replace); MalType* mal_string_to_list(MalType *orig); 
+MalType* mal_regex(char *str); +#define is_regex(val) ((val)->type == MAL_REGEX_TYPE) + MalType* mal_keyword(char *name); #define is_keyword(val) ((val)->type == MAL_KEYWORD_TYPE) -- 2.45.2