~tim/malcc

932437b33b3640f0b9c94ec34eeec33a752ecac3 — Tim Morgan 4 years ago f2adbf5
Add support for regex literal and matching

    (regex? /foo/) ; => true
    (regex-match /foo/ "foo") ; => 0
    (regex-match /foo/ "bar") ; => nil
    (/foo/ "foo") ; => 0 (regexes are callable)
11 files changed, 220 insertions(+), 3 deletions(-)

M Makefile
M core.c
M core.h
M malcc.c
M printer.c
M printer.h
M reader.c
M reader.h
A tests/regex.mal
M types.c
M types.h
M Makefile => Makefile +1 -0
@@ 102,6 102,7 @@ test-self-hosted: all

# Supplemental suites that are specific to malcc (UTF-8 handling and regex
# support). Each line hands one .mal test file plus ../../malcc to the test
# runner command, with a 30 second per-test timeout.
test-supplemental: all
	$(RUN_TEST_CMD) --test-timeout 30 ../../tests/utf-8.mal ../../malcc
	$(RUN_TEST_CMD) --test-timeout 30 ../../tests/regex.mal ../../malcc

test-mal-in-mal: mal-in-mal
	$(RUN_TEST_CMD) --test-timeout 30 step2_eval.mal ../../mal-in-mal

M core.c => core.c +38 -0
@@ 1,5 1,6 @@
#include <assert.h>
#include <gc.h>
#include <pcre.h>
#include <stdlib.h>
#include <stdio.h>
#include <string.h>


@@ 76,6 77,8 @@ struct hashmap* core_ns() {
  hashmap_put(ns, "number?", core_is_number);
  hashmap_put(ns, "fn?", core_is_fn);
  hashmap_put(ns, "macro?", core_is_macro);
  hashmap_put(ns, "regex?", core_is_regex);
  hashmap_put(ns, "regex-match", core_regex_match);
  return ns;
}



@@ 757,6 760,41 @@ MalType* core_is_macro(MalEnv *env, size_t argc, MalType **args) {
  return is_macro(val) ? mal_true() : mal_false();
}

/*
 * Core builtin `regex?`.
 *
 * Takes exactly one argument (enforced with mal_assert) and answers
 * mal_true() when that argument is a regex value, mal_false() otherwise.
 * The environment is not consulted.
 */
MalType* core_is_regex(MalEnv *env, size_t argc, MalType **args) {
  UNUSED(env);
  mal_assert(argc == 1, "Expected 1 argument to regex?");
  if (is_regex(args[0])) {
    return mal_true();
  }
  return mal_false();
}

/*
 * Core builtin `regex-match`.
 *
 * args[0] must be a regex value and args[1] a string; both are checked with
 * mal_assert. The pattern text is compiled with PCRE and run once against the
 * string, starting at offset 0.
 *
 * Returns:
 *   - a number holding the byte offset where the first match starts,
 *   - nil when pcre_exec reports no match (or any other negative status),
 *   - a mal error when the pattern cannot be compiled or studied.
 *
 * Every PCRE object created here is released before returning, including on
 * the error paths.
 */
MalType* core_regex_match(MalEnv *env, size_t argc, MalType **args) {
  UNUSED(env);
  mal_assert(argc == 2, "Expected 2 argument to regex-match");
  MalType *regex = args[0];
  mal_assert(is_regex(regex), "Expected first argument to regex-match to be a regex");
  MalType *str = args[1];
  mal_assert(is_string(str), "Expected second argument to regex-match to be a string");

  const char *pcreErrorStr = NULL;
  int pcreErrorOffset = 0;
  pcre *reCompiled = pcre_compile(regex->regex, 0, &pcreErrorStr, &pcreErrorOffset, NULL);
  if (reCompiled == NULL) {
    return mal_error(mal_string("Could not compile regex."));
  }

  // pcre_study() accepts PCRE_STUDY_* flags only; compile-time options such as
  // PCRE_EXTENDED do not belong here. 0 requests an ordinary study, which may
  // legitimately return NULL (nothing useful found) with no error set.
  pcre_extra *pcreExtra = pcre_study(reCompiled, 0, &pcreErrorStr);
  if (pcreErrorStr != NULL) {
    pcre_free(reCompiled); // do not leak the compiled pattern on failure
    return mal_error(mal_string("Could not study regex."));
  }

  // Room for the whole match plus capture groups; only slot 0 (match start)
  // is reported back to the caller.
  int subStrVec[30];
  int pcreExecRet = pcre_exec(reCompiled, pcreExtra, str->str, (int)str->str_len, 0, 0, subStrVec, 30);

  if (pcreExtra != NULL) {
    pcre_free_study(pcreExtra);
  }
  pcre_free(reCompiled);

  // A return of 0 still means "matched" (the offsets vector was merely too
  // small for every capture), so only negative statuses map to nil.
  return pcreExecRet < 0 ? mal_nil() : mal_number(subStrVec[0]);
}

void add_core_ns_to_env(MalEnv *env) {
  struct hashmap *ns = core_ns();
  struct hashmap_iter *core_iter;

M core.h => core.h +2 -0
@@ 67,6 67,8 @@ MalType* core_is_string(MalEnv *env, size_t argc, MalType **args);
MalType* core_is_number(MalEnv *env, size_t argc, MalType **args);
MalType* core_is_fn(MalEnv *env, size_t argc, MalType **args);
MalType* core_is_macro(MalEnv *env, size_t argc, MalType **args);
MalType* core_is_regex(MalEnv *env, size_t argc, MalType **args);
MalType* core_regex_match(MalEnv *env, size_t argc, MalType **args);
void add_core_ns_to_env(MalEnv *env);

#endif

M malcc.c => malcc.c +12 -0
@@ 65,6 65,7 @@ int gen_list_code(MalType *node, MalEnv *env, struct codegen *code, int ret, int
int gen_load_file(MalType *node, MalEnv *env, struct codegen *code, int ret, int *var_num);
int gen_number_code(MalType *node, struct codegen *code, int ret);
int gen_string_code(MalType *node, struct codegen *code, int ret);
int gen_regex_code(MalType *node, struct codegen *code, int ret);
int gen_symbol_code(MalType *node, struct codegen *code, int ret);
int gen_symbol_lookup_code(MalType *node, MalEnv *env, struct codegen *code, int ret);
int gen_try_code(MalType *node, MalEnv *env, struct codegen *code, int ret, int *var_num);


@@ 229,6 230,8 @@ int gen_code(MalType *node, MalEnv *env, struct codegen *code, int ret, int *var
      return gen_number_code(node, code, ret);
    case MAL_STRING_TYPE:
      return gen_string_code(node, code, ret);
    case MAL_REGEX_TYPE:
      return gen_regex_code(node, code, ret);
    case MAL_SYMBOL_TYPE:
      if (quoting) {
        return gen_symbol_code(node, code, ret);


@@ 312,6 315,8 @@ int gen_call_code(MalType *node, MalEnv *env, struct codegen *code, int ret, int
    } else if (strcmp(sym->symbol, "load-file") == 0) {
      return gen_load_file(mal_car(mal_cdr(node)), env, code, ret, var_num);
    }
  } else if (is_regex(sym)) {
    return gen_call_code(mal_cons(mal_symbol("regex-match"), node), env, code, ret, var_num);
  }

  // look up the lambda in the env


@@ 724,6 729,13 @@ int gen_string_code(MalType *node, struct codegen *code, int ret) {
  return 1;
}

/*
 * Emits the C expression that rebuilds a regex literal at runtime.
 *
 * The pattern text is wrapped in a mal string and rendered readably by
 * pr_str, and that rendering is spliced in as the argument of a
 * `mal_regex(...)` call. The expression is appended to code->body with the
 * caller's `ret` flag. Always returns 1.
 */
int gen_regex_code(MalType *node, struct codegen *code, int ret) {
  char *quoted_pattern = pr_str(mal_string(node->regex), 1);
  append_code(code->body, mal_sprintf("mal_regex(%s)", quoted_pattern), ret);
  return 1;
}

int gen_symbol_code(MalType *node, struct codegen *code, int ret) {
  MalType *temp_code = mal_sprintf("mal_symbol(%s)", pr_str(mal_string(node->symbol), 1));
  append_code(code->body, temp_code, ret);

M printer.c => printer.c +33 -0
@@ 43,6 43,8 @@ char* pr_str(MalType *val, int print_readably) {
      return string("nil");
    case MAL_NUMBER_TYPE:
      return long_long_to_string(val->number);
    case MAL_REGEX_TYPE:
      return pr_regex(val, print_readably);
    case MAL_STRING_TYPE:
      return pr_string(val, print_readably);
    case MAL_SYMBOL_TYPE:


@@ 151,3 153,34 @@ char* pr_string(MalType *val, int print_readably) {
    return str;
  }
}

/*
 * Renders a regex value as text.
 *
 * print_readably != 0: produces the literal form, i.e. the pattern wrapped in
 *   '/' delimiters with newline, '/' and '\' written as the two-character
 *   escapes \n, \/ and \\.
 * print_readably == 0: produces a GC-allocated copy of the raw pattern text
 *   with no delimiters and no escaping.
 *
 * Asserts that the stored regex_len agrees with strlen of the pattern.
 */
char* pr_regex(MalType *val, int print_readably) {
  assert(strlen(val->regex) == val->regex_len);

  if (!print_readably) {
    size_t size = val->regex_len + 1;
    char *copy = GC_MALLOC(size);
    snprintf(copy, size, "%s", val->regex);
    return copy;
  }

  const char *pattern = val->regex;
  size_t count = val->regex_len;
  MalType *out = mal_string("/");
  for (size_t pos = 0; pos < count; pos++) {
    char c = pattern[pos];
    if (c == '\n') {
      mal_string_append(out, "\\n");
    } else if (c == '/') {
      mal_string_append(out, "\\/");
    } else if (c == '\\') {
      mal_string_append(out, "\\\\");
    } else {
      mal_string_append_char(out, c);
    }
  }
  mal_string_append_char(out, '/');
  return out->str;
}

M printer.h => printer.h +1 -0
@@ 12,5 12,6 @@ char* pr_list(MalType *val, int print_readably);
char* pr_vector(MalType *val, int print_readably);
char* pr_hashmap(MalType *val, int print_readably);
char* pr_string(MalType *val, int print_readably);
char* pr_regex(MalType *val, int print_readably);

#endif

M reader.c => reader.c +60 -1
@@ 115,7 115,8 @@ MalType* read_str(char* code) {

MalType* read_form(Reader* reader) {
  char *token = reader_peek(reader);
  if (!token || strlen(token) == 0) {
  size_t len = strlen(token);
  if (!token || len == 0) {
    return mal_nil();
  } else {
    switch (*token) {


@@ 130,6 131,12 @@ MalType* read_form(Reader* reader) {
        return read_hashmap(reader);
      case '"':
        return read_string(reader);
      case '/':
        if (len == 1) {
          return read_atom(reader);
        } else {
          return read_regex(reader);
        }
      case ':':
        return read_keyword(reader);
      case '\'':


@@ 273,6 280,58 @@ char unescape_char(char *token, size_t *i, size_t len) {
  }
}

/*
 * Reads one token that starts with '/' as a regex literal.
 *
 * The token is consumed from the reader and scanned once:
 *   - unescaped '/' characters are the delimiters; they are counted and
 *     dropped from the pattern,
 *   - '(' and ')' are counted AND kept, so grouping and alternation survive
 *     in the resulting pattern,
 *   - a backslash hands the following character to unescape_char; a zero
 *     result is reported as a mal error,
 *   - everything else is copied verbatim.
 *
 * If the token does not contain exactly two delimiters, "EOF" is printed and
 * reading continues.
 *
 * If the parentheses are unbalanced the token is assumed to be a division
 * form that the tokenizer swallowed whole (for example `/ 4 (/`). In that
 * case the token is split into a lone "/" followed by whatever tokenize()
 * makes of the rest, those tokens are pushed back in front of the reader's
 * remaining tokens, and the "/" is read as an atom.
 *
 * Returns a regex value, the atom for "/", or a mal error.
 */
MalType* read_regex(Reader *reader) {
  char *token = reader_next(reader);
  size_t len = strlen(token);
  char *str = GC_MALLOC(len + 1); // the pattern is never longer than the token
  size_t index = 0;
  int saw_slashes = 0;
  int saw_lparens = 0;
  int saw_rparens = 0;
  char unescaped;
  for (size_t i=0; i<len; i++) {
    switch (token[i]) {
      case '/':
        saw_slashes++;
        break;
      case '(':
        saw_lparens++;
        str[index++] = token[i]; // keep grouping in the pattern
        break;
      case ')':
        saw_rparens++;
        str[index++] = token[i];
        break;
      case '\\':
        i++;
        unescaped = unescape_char(token, &i, len - 1); // note: len-1 because of the closing delimiter
        if (unescaped) {
          str[index++] = unescaped;
          break;
        } else {
          return mal_error(mal_string("Invalid escape sequence in regex"));
        }
      default:
        str[index++] = token[i];
    }
  }
  str[index] = 0;
  if (saw_slashes != 2) {
    printf("EOF\n");
  }
  if (saw_lparens != saw_rparens) {
    // Oops, this doesn't look like a regex after all! Break it apart into more tokens...
    Token *head = GC_MALLOC(sizeof(Token));
    head->str = string("/");
    head->next = tokenize(token + 1); // skip the first character
    // Walk from the new head so that an empty tokenize() result (NULL) is
    // handled instead of being dereferenced.
    Token *last = head;
    while (last->next) last = last->next; // find the last token in the newly tokenized list
    last->next = reader->token; // append our existing list on the end of it
    reader->token = head; // point the reader at our newly prepended list of tokens
    return read_atom(reader);
  }
  return mal_regex(str);
}

MalType* read_list(Reader *reader) {
  MalType *atom, *last_cons = NULL, *first_cons = NULL, *cons;
  reader_next(reader); // consume '('

M reader.h => reader.h +2 -1
@@ 17,7 17,7 @@ char* reader_next(Reader* reader);

char* reader_peek(Reader* reader);

static const char PATTERN[] = "[\\s,]*(~@|[\\[\\]{}()'`~^@]|\"(?:\\\\.|[^\\\\\"])*\"?|;.*|[^\\s\\[\\]{}('\"`,;)]*)";
static const char PATTERN[] = "[\\s,]*(~@|[\\[\\]{}()'`~^@]|\"(?:\\\\.|[^\\\\\"])*\"?|/(?:\\\\.|[^\\\\/])*/|;.*|[^\\s\\[\\]{}('\"`,;)]*)";

Token* tokenize(char* code);



@@ 30,6 30,7 @@ MalType* read_list(Reader* reader);
MalType* read_vector(Reader* reader);
MalType* read_hashmap(Reader *reader);
MalType* read_string(Reader *reader);
MalType* read_regex(Reader *reader);
char unescape_char(char *token, size_t *i, size_t len);
MalType* read_atom(Reader* reader);


A tests/regex.mal => tests/regex.mal +49 -0
@@ 0,0 1,49 @@
;;;
;;; Supplemental malcc test for regex support
;;;

;; Testing that the reader understands regex literals
/regex/
;=>/regex/
/escaped \/ slash/
;=>/escaped \/ slash/
(regex? /foo/)
;=>true
(regex? / foo /)
;=>true
(regex? "foo")
;=>false
(regex? /)
;=>false

;; Not to be confused with the division function
/
;=><lambda>
(/ 6 2)
;=>3
(/ 6 (/ 4 2))
;=>3
(/ 6 (/ 4 (/ 2 1)))
;=>3

;; Testing that matching works
(regex-match /foo/ "foo")
;=>0
(regex-match /foo/ "foofoo")
;=>0
(regex-match /bar/ "foobar")
;=>3
(regex-match /foo/ "foobar")
;=>0
(regex-match /^foo/ "foobar")
;=>0
(regex-match /^foo$/ "foobar")
;=>nil
(regex-match /foo/ "bar")
;=>nil

;; Testing that regex is callable (calls regex-match under the hood)
(/foo/ "foo")
;=>0
(/foo/ "bar")
;=>nil

M types.c => types.c +10 -0
@@ 240,6 240,16 @@ MalType* mal_string_to_list(MalType *orig) {
  return mal_vector_to_list(vec);
}

/*
 * Builds a MAL_REGEX_TYPE value from NUL-terminated pattern text.
 *
 * The text is copied into a fresh GC allocation, so the caller keeps
 * ownership of `str`. regex_len records the pattern length without the
 * terminator.
 */
MalType* mal_regex(char *str) {
  size_t size = strlen(str) + 1; // pattern bytes plus terminator
  MalType *regex = mal_alloc();
  regex->type = MAL_REGEX_TYPE;
  regex->regex_len = size - 1;
  regex->regex = GC_MALLOC(size);
  snprintf(regex->regex, size, "%s", str);
  return regex;
}

#define VECTOR_INIT_SIZE 10
#define VECTOR_GROW_FACTOR 2


M types.h => types.h +12 -1
@@ 27,6 27,7 @@ enum MalTypeType {
  MAL_VECTOR_TYPE,
  MAL_HASHMAP_TYPE,
  MAL_STRING_TYPE,
  MAL_REGEX_TYPE,
  MAL_LAMBDA_TYPE,
  MAL_CONTINUATION_TYPE,
  MAL_ATOM_TYPE,


@@ 65,6 66,12 @@ struct MalType {
      char *str;
    };

    // MAL_REGEX_TYPE
    struct {
      size_t regex_len;
      char *regex;
    };

    // MAL_LAMBDA_TYPE, MAL_CONTINUATION_TYPE
    struct {
      MalType* (*fn)(MalEnv *env, size_t argc, MalType **args);


@@ 86,7 93,8 @@ struct MalType {
                           (val)->type == MAL_KEYWORD_TYPE || \
                           (val)->type == MAL_NUMBER_TYPE || \
                           (val)->type == MAL_SYMBOL_TYPE || \
                           (val)->type == MAL_STRING_TYPE)
                           (val)->type == MAL_STRING_TYPE || \
                           (val)->type == MAL_REGEX_TYPE)

MalType* mal_alloc();



@@ 143,6 151,9 @@ MalType* mal_string_replace(MalType *val, char *find, char *replace);
MalType* mal_string_replace_all(MalType *orig, char *find, char *replace);
MalType* mal_string_to_list(MalType *orig);

// Creates a MAL_REGEX_TYPE value holding a copy of the pattern text `str`.
MalType* mal_regex(char *str);
// Non-zero when `val` is tagged MAL_REGEX_TYPE. `val` is dereferenced, so it
// must not be NULL.
#define is_regex(val) ((val)->type == MAL_REGEX_TYPE)

MalType* mal_keyword(char *name);
#define is_keyword(val) ((val)->type == MAL_KEYWORD_TYPE)