~tim/malcc

ae8bc2cb8fecc1cc57cf65cc03f5e921aec21933 — Tim Morgan 5 years ago bc76d0b
Add support for UTF-8 in strings
6 files changed, 90 insertions(+), 11 deletions(-)

M Makefile
M malcc.c
M reader.c
M reader.h
A tests/unicode.mal
M types.c
M Makefile => Makefile +5 -2
@@ 29,7 29,7 @@ clean:
	rm -f $(ALL_STEPS) *.o
	cd tinycc && make clean

test: test0 test1 test2 test3 test4 test5 test6 test7 test8 test9 testA test-self-hosted test_malcc
test: test0 test1 test2 test3 test4 test5 test6 test7 test8 test9 testA test-malcc test-self-hosted test-supplemental

RUN_TEST_CMD=mal/runtest.py --rundir mal/tests --hard --deferrable --optional --start-timeout 1 --test-timeout 1



@@ 75,7 75,7 @@ testA: all
	$(RUN_TEST_CMD) step9_try.mal ../../stepA_mal
	$(RUN_TEST_CMD) stepA_mal.mal ../../stepA_mal

test_malcc: all
test-malcc: all
	$(RUN_TEST_CMD) step2_eval.mal ../../malcc
	$(RUN_TEST_CMD) step3_env.mal ../../malcc
	$(RUN_TEST_CMD) step4_if_fn_do.mal ../../malcc


@@ 97,6 97,9 @@ test-self-hosted: all
	$(RUN_TEST_CMD) --test-timeout 30 step9_try.mal ../../self_hosted_run
	$(RUN_TEST_CMD) --test-timeout 30 stepA_mal.mal ../../self_hosted_run

# Run the malcc-specific supplemental tests (currently tests/unicode.mal)
# against the malcc binary, after building `all`.
# Both paths are written as ../../... because RUN_TEST_CMD passes
# `--rundir mal/tests` to the test runner.
# NOTE(review): the extra `--test-timeout 30` presumably overrides the
# `--test-timeout 1` already present in RUN_TEST_CMD -- confirm against runtest.py.
test-supplemental: all
	$(RUN_TEST_CMD) --test-timeout 30 ../../tests/unicode.mal ../../malcc

# Invoke malcc on perf1.mal, perf2.mal and perf3.mal from inside mal/tests,
# after building `all`. The commands are chained with `&&` on a single recipe
# line so the `cd` applies to every invocation and the chain stops at the
# first one that fails.
perf: all
	cd mal/tests && ../../malcc perf1.mal && ../../malcc perf2.mal && ../../malcc perf3.mal


M malcc.c => malcc.c +5 -1
@@ 922,7 922,11 @@ char* PRINT(MalType *ast) {
}

char* rep(char *str, MalEnv *repl_env) {
  MalType *result = EVAL(READ(str), repl_env);
  MalType *ast = READ(str);
  if (is_error(ast)) {
    return PRINT(mal_sprintf("ERROR: %s\n", pr_str(ast->error_val, 0)));
  }
  MalType *result = EVAL(ast, repl_env);
  if (is_error(result)) {
    return PRINT(mal_sprintf("ERROR: %s\n", pr_str(result->error_val, 0)));
  } else {

M reader.c => reader.c +23 -5
@@ 225,17 225,22 @@ MalType* read_string(Reader *reader) {
  size_t len = strlen(token);
  char *str = GC_MALLOC(len + 1);
  size_t index = 0;
  char escaped;
  int saw_quotes = 0;
  char unescaped;
  for (size_t i=0; i<len; i++) {
    switch (token[i]) {
      case '"':
        saw_quotes++;
        break;
      case '\\':
        escaped = token[++i];
        str[index++] = unescape_char(escaped);
        break;
        i++;
        unescaped = unescape_char(token, &i, len - 1); // note: len-1 because of closing quote
        if (unescaped) {
          str[index++] = unescaped;
          break;
        } else {
          return mal_error(mal_string("Invalid escape sequence in string"));
        }
      default:
        str[index++] = token[i];
    }


@@ 247,9 252,22 @@ MalType* read_string(Reader *reader) {
  return mal_string(str);
}

char unescape_char(char c) {
char unescape_char(char *token, size_t *i, size_t len) {
  char c = token[*i];
  if (c == 'n') {
    return 10; // newline
  } else if (c == 'x') {
    char seq[3];
    if ((*i)+2 < len) {
      seq[0] = token[++*i];
      seq[1] = token[++*i];
      seq[2] = 0;
      int val;
      sscanf(seq, "%x", &val);
      return (char)val;
    } else {
      return 0;
    }
  } else {
    return c;
  }

M reader.h => reader.h +1 -1
@@ 30,7 30,7 @@ MalType* read_list(Reader* reader);
MalType* read_vector(Reader* reader);
MalType* read_hashmap(Reader *reader);
MalType* read_string(Reader *reader);
char unescape_char(char c);
char unescape_char(char *token, size_t *i, size_t len);
MalType* read_atom(Reader* reader);

#endif

A tests/unicode.mal => tests/unicode.mal +37 -0
@@ 0,0 1,37 @@
;;;
;;; Supplemental malcc test for unicode support
;;;

;; Testing that (seq str) doesn't split 2-byte utf-8 characters in the middle
(first (seq "ă"))
;=>"ă"

;; Testing that (seq str) doesn't split 3-byte utf-8 characters in the middle
(first (seq "€"))
;=>"€"

;; Testing that (seq str) doesn't split 4-byte utf-8 characters in the middle
(first (seq "🙅"))
;=>"🙅"

;; Testing that splitting and re-joining multibyte characters does not change anything
(apply str (seq "\xf0\x9f\xa4\xb7\xf0\x9f\x99\x8e"))
;=>"🤷🙎"

;; Testing that hex escape sequences are interpreted
"\xf0\x9f\xa4\xb7\xf0\x9f\x99\x8e"
;=>"🤷🙎"

;; Testing that incomplete hex escape sequence produces an error
"\xf"
;/.*Invalid escape sequence in string.*

;; Testing that (seq str) splits emoji modifiers apart from emoji base
(first (seq "\xf0\x9f\xa4\xb7\xf0\x9f\x8f\xbf\xe2\x80\x8d\xe2\x99\x80\xef\xb8\x8f"))
;=>"🤷"
(first (rest (seq "\xf0\x9f\xa4\xb7\xf0\x9f\x8f\xbf\xe2\x80\x8d\xe2\x99\x80\xef\xb8\x8f")))
;=>"🏿"

;; Testing that splitting on incomplete utf-8 encodings produces an error
(seq "\xf0\x9f\xa4")
;/.*Invalid utf-8 encoding in string.*

M types.c => types.c +19 -2
@@ 198,10 198,27 @@ char* mal_string_substring(MalType *orig, size_t start, size_t len) {
MalType* mal_string_to_list(MalType *orig) {
  assert(is_string(orig));
  MalType *vec = mal_vector();
  char buffer[2];
  char buffer[5];
  for (size_t i=0; i<orig->str_len; i++) {
    buffer[0] = orig->str[i];
    buffer[1] = 0;
    if (((unsigned char)buffer[0] >> 3) == 30) { // 11110xxx, 4 bytes
      if (i + 3 >= orig->str_len) return mal_error(mal_string("Invalid utf-8 encoding in string"));
      buffer[1] = orig->str[++i];
      buffer[2] = orig->str[++i];
      buffer[3] = orig->str[++i];
      buffer[4] = 0;
    } else if (((unsigned char)buffer[0] >> 4) == 14) { // 1110xxxx, 3 bytes
      if (i + 2 >= orig->str_len) return mal_error(mal_string("Invalid utf-8 encoding in string"));
      buffer[1] = orig->str[++i];
      buffer[2] = orig->str[++i];
      buffer[3] = 0;
    } else if (((unsigned char)buffer[0] >> 5) == 6) { // 110xxxxx, 2 bytes
      if (i + 1 >= orig->str_len) return mal_error(mal_string("Invalid utf-8 encoding in string"));
      buffer[1] = orig->str[++i];
      buffer[2] = 0;
    } else {
      buffer[1] = 0;
    }
    mal_vector_push(vec, mal_string(buffer));
  }
  return mal_vector_to_list(vec);