M Makefile => Makefile +1 -0
@@ 102,6 102,7 @@ test-self-hosted: all
test-supplemental: all
$(RUN_TEST_CMD) --test-timeout 30 ../../tests/utf-8.mal ../../malcc
+ $(RUN_TEST_CMD) --test-timeout 30 ../../tests/regex.mal ../../malcc
test-mal-in-mal: mal-in-mal
$(RUN_TEST_CMD) --test-timeout 30 step2_eval.mal ../../mal-in-mal
M core.c => core.c +38 -0
@@ 1,5 1,6 @@
#include <assert.h>
#include <gc.h>
+#include <pcre.h>
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
@@ 76,6 77,8 @@ struct hashmap* core_ns() {
hashmap_put(ns, "number?", core_is_number);
hashmap_put(ns, "fn?", core_is_fn);
hashmap_put(ns, "macro?", core_is_macro);
+ hashmap_put(ns, "regex?", core_is_regex);
+ hashmap_put(ns, "regex-match", core_regex_match);
return ns;
}
@@ 757,6 760,41 @@ MalType* core_is_macro(MalEnv *env, size_t argc, MalType **args) {
return is_macro(val) ? mal_true() : mal_false();
}
+// Core `regex?` predicate: takes exactly one argument and answers whether
+// that value is a regex.
+MalType* core_is_regex(MalEnv *env, size_t argc, MalType **args) {
+  UNUSED(env);
+  mal_assert(argc == 1, "Expected 1 argument to regex?");
+  if (is_regex(args[0])) {
+    return mal_true();
+  }
+  return mal_false();
+}
+
+// Core `regex-match`: (regex-match regex string).
+//
+// Compiles the regex's pattern text with PCRE and runs it against the string.
+// Returns the offset at which the first match starts, or nil when pcre_exec
+// reports a negative status (no match or a matching error). Returns a mal
+// error when the pattern cannot be compiled or studied.
+MalType* core_regex_match(MalEnv *env, size_t argc, MalType **args) {
+  UNUSED(env);
+  mal_assert(argc == 2, "Expected 2 argument to regex-match");
+  MalType *regex = args[0];
+  mal_assert(is_regex(regex), "Expected first argument to regex-match to be a regex");
+  MalType *str = args[1];
+  mal_assert(is_string(str), "Expected second argument to regex-match to be a string");
+
+  const char *pcreErrorStr = NULL;
+  int pcreErrorOffset = 0;
+  pcre *reCompiled = pcre_compile(regex->regex, 0, &pcreErrorStr, &pcreErrorOffset, NULL);
+  if (reCompiled == NULL) {
+    return mal_error(mal_string("Could not compile regex."));
+  }
+
+  // pcre_study accepts PCRE_STUDY_* flags only; PCRE_EXTENDED is a
+  // pcre_compile option and does not belong here. With no flags the call may
+  // return NULL without an error, which pcre_exec accepts as "no study data".
+  pcre_extra *pcreExtra = pcre_study(reCompiled, 0, &pcreErrorStr);
+  if (pcreErrorStr != NULL) {
+    // Do not leak the compiled pattern on the failure path.
+    pcre_free(reCompiled);
+    return mal_error(mal_string("Could not study regex."));
+  }
+
+  int subStrVec[30];
+  int subStrVecLen = (int)(sizeof subStrVec / sizeof subStrVec[0]);
+  int pcreExecRet = pcre_exec(reCompiled, pcreExtra, str->str, (int)str->str_len, 0, 0, subStrVec, subStrVecLen);
+
+  // Both PCRE objects live outside the garbage collector and must be
+  // released explicitly.
+  if (pcreExtra != NULL) {
+    pcre_free_study(pcreExtra);
+  }
+  pcre_free(reCompiled);
+
+  // subStrVec[0] is the start offset of the overall match.
+  return pcreExecRet < 0 ? mal_nil() : mal_number(subStrVec[0]);
+}
+
void add_core_ns_to_env(MalEnv *env) {
struct hashmap *ns = core_ns();
struct hashmap_iter *core_iter;
M core.h => core.h +2 -0
@@ 67,6 67,8 @@ MalType* core_is_string(MalEnv *env, size_t argc, MalType **args);
MalType* core_is_number(MalEnv *env, size_t argc, MalType **args);
MalType* core_is_fn(MalEnv *env, size_t argc, MalType **args);
MalType* core_is_macro(MalEnv *env, size_t argc, MalType **args);
+MalType* core_is_regex(MalEnv *env, size_t argc, MalType **args);
+MalType* core_regex_match(MalEnv *env, size_t argc, MalType **args);
void add_core_ns_to_env(MalEnv *env);
#endif
M malcc.c => malcc.c +12 -0
@@ 65,6 65,7 @@ int gen_list_code(MalType *node, MalEnv *env, struct codegen *code, int ret, int
int gen_load_file(MalType *node, MalEnv *env, struct codegen *code, int ret, int *var_num);
int gen_number_code(MalType *node, struct codegen *code, int ret);
int gen_string_code(MalType *node, struct codegen *code, int ret);
+int gen_regex_code(MalType *node, struct codegen *code, int ret);
int gen_symbol_code(MalType *node, struct codegen *code, int ret);
int gen_symbol_lookup_code(MalType *node, MalEnv *env, struct codegen *code, int ret);
int gen_try_code(MalType *node, MalEnv *env, struct codegen *code, int ret, int *var_num);
@@ 229,6 230,8 @@ int gen_code(MalType *node, MalEnv *env, struct codegen *code, int ret, int *var
return gen_number_code(node, code, ret);
case MAL_STRING_TYPE:
return gen_string_code(node, code, ret);
+ case MAL_REGEX_TYPE:
+ return gen_regex_code(node, code, ret);
case MAL_SYMBOL_TYPE:
if (quoting) {
return gen_symbol_code(node, code, ret);
@@ 312,6 315,8 @@ int gen_call_code(MalType *node, MalEnv *env, struct codegen *code, int ret, int
} else if (strcmp(sym->symbol, "load-file") == 0) {
return gen_load_file(mal_car(mal_cdr(node)), env, code, ret, var_num);
}
+ } else if (is_regex(sym)) {
+ return gen_call_code(mal_cons(mal_symbol("regex-match"), node), env, code, ret, var_num);
}
// look up the lambda in the env
@@ 724,6 729,13 @@ int gen_string_code(MalType *node, struct codegen *code, int ret) {
return 1;
}
+// Generates the C expression for a regex literal: a `mal_regex(...)` call
+// whose argument is the pattern printed readably as a string. The expression
+// is appended to the body of the code being generated. Always returns 1.
+int gen_regex_code(MalType *node, struct codegen *code, int ret) {
+  char *quoted_pattern = pr_str(mal_string(node->regex), 1);
+  MalType *call_expr = mal_sprintf("mal_regex(%s)", quoted_pattern);
+  append_code(code->body, call_expr, ret);
+  return 1;
+}
+
int gen_symbol_code(MalType *node, struct codegen *code, int ret) {
MalType *temp_code = mal_sprintf("mal_symbol(%s)", pr_str(mal_string(node->symbol), 1));
append_code(code->body, temp_code, ret);
M printer.c => printer.c +33 -0
@@ 43,6 43,8 @@ char* pr_str(MalType *val, int print_readably) {
return string("nil");
case MAL_NUMBER_TYPE:
return long_long_to_string(val->number);
+ case MAL_REGEX_TYPE:
+ return pr_regex(val, print_readably);
case MAL_STRING_TYPE:
return pr_string(val, print_readably);
case MAL_SYMBOL_TYPE:
@@ 151,3 153,34 @@ char* pr_string(MalType *val, int print_readably) {
return str;
}
}
+
+// Renders a regex value as text.
+//
+// When print_readably is non-zero the pattern is wrapped in slashes and
+// newline, '/' and backslash characters are written as \n, \/ and \\.
+// Otherwise a GC-allocated copy of the raw pattern text is returned.
+char* pr_regex(MalType *val, int print_readably) {
+  assert(strlen(val->regex) == val->regex_len);
+
+  if (!print_readably) {
+    size_t size = val->regex_len + 1;
+    char *copy = GC_MALLOC(size);
+    snprintf(copy, size, "%s", val->regex);
+    return copy;
+  }
+
+  MalType *out = mal_string("/");
+  const char *cursor = val->regex;
+  const char *end = cursor + val->regex_len;
+  for (; cursor < end; cursor++) {
+    if (*cursor == '\n') {
+      mal_string_append(out, "\\n");
+    } else if (*cursor == '/') {
+      mal_string_append(out, "\\/");
+    } else if (*cursor == '\\') {
+      mal_string_append(out, "\\\\");
+    } else {
+      mal_string_append_char(out, *cursor);
+    }
+  }
+  mal_string_append_char(out, '/');
+  return out->str;
+}
M printer.h => printer.h +1 -0
@@ 12,5 12,6 @@ char* pr_list(MalType *val, int print_readably);
char* pr_vector(MalType *val, int print_readably);
char* pr_hashmap(MalType *val, int print_readably);
char* pr_string(MalType *val, int print_readably);
+char* pr_regex(MalType *val, int print_readably);
#endif
M reader.c => reader.c +60 -1
@@ 115,7 115,8 @@ MalType* read_str(char* code) {
MalType* read_form(Reader* reader) {
char *token = reader_peek(reader);
- if (!token || strlen(token) == 0) {
+ size_t len = strlen(token);
+ if (!token || len == 0) {
return mal_nil();
} else {
switch (*token) {
@@ 130,6 131,12 @@ MalType* read_form(Reader* reader) {
return read_hashmap(reader);
case '"':
return read_string(reader);
+ case '/':
+ if (len == 1) {
+ return read_atom(reader);
+ } else {
+ return read_regex(reader);
+ }
case ':':
return read_keyword(reader);
case '\'':
@@ 273,6 280,58 @@ char unescape_char(char *token, size_t *i, size_t len) {
}
}
+// Reads the next token as a regex literal (/pattern/) and returns a regex
+// value holding the pattern text.
+//
+// Unescaped slashes are treated as delimiters and dropped; backslash escapes
+// are translated with unescape_char, and an escape it rejects yields a mal
+// error. Parentheses are counted while scanning: if they do not balance, the
+// token is assumed not to be a regex after all (e.g. `/ 6 (/` taken from
+// nested division calls), so it is split into a lone "/" token followed by a
+// re-tokenization of the rest, and reading resumes with read_atom.
+MalType* read_regex(Reader *reader) {
+  char *token = reader_next(reader);
+  size_t len = strlen(token);
+  // Each input character produces at most one output character, so len + 1
+  // bytes always suffice.
+  char *str = GC_MALLOC(len + 1);
+  size_t index = 0;
+  int saw_slashes = 0;
+  int saw_lparens = 0;
+  int saw_rparens = 0;
+  char unescaped;
+  for (size_t i=0; i<len; i++) {
+    switch (token[i]) {
+      case '/':
+        saw_slashes++;
+        break;
+      case '(':
+        saw_lparens++;
+        // Grouping parens are part of the pattern and must be kept.
+        str[index++] = token[i];
+        break;
+      case ')':
+        saw_rparens++;
+        str[index++] = token[i];
+        break;
+      case '\\':
+        i++;
+        unescaped = unescape_char(token, &i, len - 1); // note: len-1 excludes the token's final character (the closing slash)
+        if (unescaped) {
+          str[index++] = unescaped;
+          break;
+        } else {
+          return mal_error(mal_string("Invalid escape sequence in regex"));
+        }
+      default:
+        str[index++] = token[i];
+    }
+  }
+  str[index] = 0;
+  if (saw_slashes != 2) {
+    printf("EOF\n");
+  }
+  if (saw_lparens != saw_rparens) {
+    // Oops, this doesn't look like a regex after all! Let's break it apart into more tokens...
+    Token *head = GC_MALLOC(sizeof(Token));
+    head->str = string("/");
+    head->next = tokenize(token + 1); // skip the first character
+    // Start the walk at head so an empty re-tokenization cannot be dereferenced.
+    Token *last = head;
+    while (last->next) last = last->next; // find the last token in the newly tokenized list
+    last->next = reader->token; // append our existing list on the end of it
+    reader->token = head; // point the reader at our newly prepended list of tokens
+    return read_atom(reader);
+  }
+  return mal_regex(str);
+}
+
MalType* read_list(Reader *reader) {
MalType *atom, *last_cons = NULL, *first_cons = NULL, *cons;
reader_next(reader); // consume '('
M reader.h => reader.h +2 -1
@@ 17,7 17,7 @@ char* reader_next(Reader* reader);
char* reader_peek(Reader* reader);
-static const char PATTERN[] = "[\\s,]*(~@|[\\[\\]{}()'`~^@]|\"(?:\\\\.|[^\\\\\"])*\"?|;.*|[^\\s\\[\\]{}('\"`,;)]*)";
+static const char PATTERN[] = "[\\s,]*(~@|[\\[\\]{}()'`~^@]|\"(?:\\\\.|[^\\\\\"])*\"?|/(?:\\\\.|[^\\\\/])*/|;.*|[^\\s\\[\\]{}('\"`,;)]*)";
Token* tokenize(char* code);
@@ 30,6 30,7 @@ MalType* read_list(Reader* reader);
MalType* read_vector(Reader* reader);
MalType* read_hashmap(Reader *reader);
MalType* read_string(Reader *reader);
+MalType* read_regex(Reader *reader);
char unescape_char(char *token, size_t *i, size_t len);
MalType* read_atom(Reader* reader);
A tests/regex.mal => tests/regex.mal +49 -0
@@ 0,0 1,49 @@
+;;;
+;;; Supplemental malcc test for regex support
+;;;
+
+;; Testing that the reader understands regex literals
+/regex/
+;=>/regex/
+/escaped \/ slash/
+;=>/escaped \/ slash/
+(regex? /foo/)
+;=>true
+(regex? / foo /)
+;=>true
+(regex? "foo")
+;=>false
+(regex? /)
+;=>false
+
+;; Not to be confused with the division function
+;; (a lone / and nested division calls must still read as symbols)
+/
+;=><lambda>
+(/ 6 2)
+;=>3
+(/ 6 (/ 4 2))
+;=>3
+(/ 6 (/ 4 (/ 2 1)))
+;=>3
+
+;; Testing that matching works
+;; (regex-match gives the offset where the first match starts, or nil)
+(regex-match /foo/ "foo")
+;=>0
+(regex-match /foo/ "foofoo")
+;=>0
+(regex-match /bar/ "foobar")
+;=>3
+(regex-match /foo/ "foobar")
+;=>0
+(regex-match /^foo/ "foobar")
+;=>0
+(regex-match /^foo$/ "foobar")
+;=>nil
+(regex-match /foo/ "bar")
+;=>nil
+
+;; Testing that regex is callable (calls regex-match under the hood)
+(/foo/ "foo")
+;=>0
+(/foo/ "bar")
+;=>nil
M types.c => types.c +10 -0
@@ 240,6 240,16 @@ MalType* mal_string_to_list(MalType *orig) {
return mal_vector_to_list(vec);
}
+// Builds a regex value that owns a GC-allocated copy of the pattern text
+// `str`, recording the text's length alongside it.
+MalType* mal_regex(char *str) {
+  MalType *regex_val = mal_alloc();
+  size_t pattern_len = strlen(str);
+  size_t buf_size = pattern_len + 1;
+
+  regex_val->type = MAL_REGEX_TYPE;
+  regex_val->regex_len = pattern_len;
+  regex_val->regex = GC_MALLOC(buf_size);
+  snprintf(regex_val->regex, buf_size, "%s", str);
+
+  return regex_val;
+}
+
#define VECTOR_INIT_SIZE 10
#define VECTOR_GROW_FACTOR 2
M types.h => types.h +12 -1
@@ 27,6 27,7 @@ enum MalTypeType {
MAL_VECTOR_TYPE,
MAL_HASHMAP_TYPE,
MAL_STRING_TYPE,
+ MAL_REGEX_TYPE,
MAL_LAMBDA_TYPE,
MAL_CONTINUATION_TYPE,
MAL_ATOM_TYPE,
@@ 65,6 66,12 @@ struct MalType {
char *str;
};
+ // MAL_REGEX_TYPE
+ struct {
+ size_t regex_len;
+ char *regex;
+ };
+
// MAL_LAMBDA_TYPE, MAL_CONTINUATION_TYPE
struct {
MalType* (*fn)(MalEnv *env, size_t argc, MalType **args);
@@ 86,7 93,8 @@ struct MalType {
(val)->type == MAL_KEYWORD_TYPE || \
(val)->type == MAL_NUMBER_TYPE || \
(val)->type == MAL_SYMBOL_TYPE || \
- (val)->type == MAL_STRING_TYPE)
+ (val)->type == MAL_STRING_TYPE || \
+ (val)->type == MAL_REGEX_TYPE)
MalType* mal_alloc();
@@ 143,6 151,9 @@ MalType* mal_string_replace(MalType *val, char *find, char *replace);
MalType* mal_string_replace_all(MalType *orig, char *find, char *replace);
MalType* mal_string_to_list(MalType *orig);
+MalType* mal_regex(char *str);
+#define is_regex(val) ((val)->type == MAL_REGEX_TYPE)
+
MalType* mal_keyword(char *name);
#define is_keyword(val) ((val)->type == MAL_KEYWORD_TYPE)