~tim/malcc

e65ada248959bf4b53aba5d5bc7eef17974f9a02 — Tim Morgan 5 years ago bc76d0b
Add support for UTF-8 in strings

Squashed commit of the following:

commit 20537781a60474315c02b3f8aa9d397fedf954cd
Author: Tim Morgan <tim@timmorgan.org>
Date:   Wed Feb 27 08:16:10 2019 -0600

    Just set LC_ALL in Dockerfile

commit 5f94f8d012bc09ec0cb3fd4c35e9e52438c7a861
Author: Tim Morgan <tim@timmorgan.org>
Date:   Wed Feb 27 08:07:22 2019 -0600

    Set locale in sr.ht build

commit 2589e43e96ae3ae7c3d3b5c6bccd3f1d83f838ad
Author: Tim Morgan <tim@timmorgan.org>
Date:   Wed Feb 27 08:00:58 2019 -0600

    Fix editline locale for multibyte characters

commit 7f9ab03b3cf069248a2beed0b5aaf39a92e73de3
Author: Tim Morgan <tim@timmorgan.org>
Date:   Mon Feb 25 22:50:01 2019 -0600

    Allow building without editline

    I couldn't get multibyte characters to work on Ubuntu with editline, so
    I'm just disabling it for the tests.

commit ae8bc2cb8fecc1cc57cf65cc03f5e921aec21933
Author: Tim Morgan <tim@timmorgan.org>
Date:   Sun Feb 24 20:15:13 2019 -0600

    Add support for UTF-8 in strings
8 files changed, 101 insertions(+), 12 deletions(-)

M .build.yml
M Dockerfile
M Makefile
M malcc.c
M reader.c
M reader.h
A tests/utf-8.mal
M types.c
M .build.yml => .build.yml +2 -0
@@ 7,6 7,8 @@ packages:
  - python
sources:
  - https://git.sr.ht/~tim/malcc
environment:
  LC_ALL: C.UTF-8
tasks:
  - build: |
      cd malcc

M Dockerfile => Dockerfile +2 -0
@@ 18,6 18,8 @@ RUN cd /tmp && \
    ./configure && \
    make install

ENV LC_ALL C.UTF-8

WORKDIR /malcc

CMD ["bash"]

M Makefile => Makefile +9 -3
@@ 1,7 1,7 @@
OS:=$(shell uname)
CC=gcc
CFLAGS=-Itinycc -Wall -Wextra -Werror -g
LDLIBS=-ledit -lgc -lpcre -ldl
LDLIBS=-ledit -ltermcap -lgc -lpcre -ldl

ALL_STEPS=step0_repl step1_read_print step2_eval step3_env step4_if_fn_do step5_tco step6_file step7_quote step8_macros step9_try stepA_mal malcc



@@ 29,7 29,7 @@ clean:
	rm -f $(ALL_STEPS) *.o
	cd tinycc && make clean

test: test0 test1 test2 test3 test4 test5 test6 test7 test8 test9 testA test-self-hosted test_malcc
test: test0 test1 test2 test3 test4 test5 test6 test7 test8 test9 testA test-malcc test-self-hosted test-supplemental

RUN_TEST_CMD=mal/runtest.py --rundir mal/tests --hard --deferrable --optional --start-timeout 1 --test-timeout 1



@@ 75,7 75,7 @@ testA: all
	$(RUN_TEST_CMD) step9_try.mal ../../stepA_mal
	$(RUN_TEST_CMD) stepA_mal.mal ../../stepA_mal

test_malcc: all
test-malcc: all
	$(RUN_TEST_CMD) step2_eval.mal ../../malcc
	$(RUN_TEST_CMD) step3_env.mal ../../malcc
	$(RUN_TEST_CMD) step4_if_fn_do.mal ../../malcc


@@ 97,6 97,9 @@ test-self-hosted: all
	$(RUN_TEST_CMD) --test-timeout 30 step9_try.mal ../../self_hosted_run
	$(RUN_TEST_CMD) --test-timeout 30 stepA_mal.mal ../../self_hosted_run

test-supplemental: all
	$(RUN_TEST_CMD) --test-timeout 30 ../../tests/utf-8.mal ../../malcc

perf: all
	cd mal/tests && ../../malcc perf1.mal && ../../malcc perf2.mal && ../../malcc perf3.mal



@@ 114,5 117,8 @@ docker-bash: docker-build
docker-test: docker-build
	$(RUN_DOCKER_CMD) make test

docker-test-supplemental: docker-build
	$(RUN_DOCKER_CMD) make test-supplemental

docker-watch: docker-build
	$(RUN_DOCKER_CMD) bash -c "ls *.c *.h Makefile | entr -c -s 'make test'"

M malcc.c => malcc.c +8 -1
@@ 6,6 6,7 @@
#include <string.h>
#include <unistd.h>
#include <libtcc.h>
#include <locale.h>

#include "core.h"
#include "env.h"


@@ 922,7 923,11 @@ char* PRINT(MalType *ast) {
}

char* rep(char *str, MalEnv *repl_env) {
  MalType *result = EVAL(READ(str), repl_env);
  MalType *ast = READ(str);
  if (is_error(ast)) {
    return PRINT(mal_sprintf("ERROR: %s\n", pr_str(ast->error_val, 0)));
  }
  MalType *result = EVAL(ast, repl_env);
  if (is_error(result)) {
    return PRINT(mal_sprintf("ERROR: %s\n", pr_str(result->error_val, 0)));
  } else {


@@ 1019,6 1024,8 @@ int main(int argc, char *argv[]) {

  rep(builtin_defs, repl_env);

  setlocale(LC_ALL, ""); // use locale set from environment

  if (mal_vector_len(arg_vec) >= 1) {
    rep(mal_sprintf("(load-file %s)", pr_str(mal_vector_ref(arg_vec, 0), 1))->str, repl_env);
  } else {

M reader.c => reader.c +23 -5
@@ 225,17 225,22 @@ MalType* read_string(Reader *reader) {
  size_t len = strlen(token);
  char *str = GC_MALLOC(len + 1);
  size_t index = 0;
  char escaped;
  int saw_quotes = 0;
  char unescaped;
  for (size_t i=0; i<len; i++) {
    switch (token[i]) {
      case '"':
        saw_quotes++;
        break;
      case '\\':
        escaped = token[++i];
        str[index++] = unescape_char(escaped);
        break;
        i++;
        unescaped = unescape_char(token, &i, len - 1); // note: len-1 because of closing quote
        if (unescaped) {
          str[index++] = unescaped;
          break;
        } else {
          return mal_error(mal_string("Invalid escape sequence in string"));
        }
      default:
        str[index++] = token[i];
    }


@@ 247,9 252,22 @@ MalType* read_string(Reader *reader) {
  return mal_string(str);
}

char unescape_char(char c) {
char unescape_char(char *token, size_t *i, size_t len) {
  char c = token[*i];
  if (c == 'n') {
    return 10; // newline
  } else if (c == 'x') {
    char seq[3];
    if ((*i)+2 < len) {
      seq[0] = token[++*i];
      seq[1] = token[++*i];
      seq[2] = 0;
      int val;
      sscanf(seq, "%x", &val);
      return (char)val;
    } else {
      return 0;
    }
  } else {
    return c;
  }

M reader.h => reader.h +1 -1
@@ 30,7 30,7 @@ MalType* read_list(Reader* reader);
MalType* read_vector(Reader* reader);
MalType* read_hashmap(Reader *reader);
MalType* read_string(Reader *reader);
char unescape_char(char c);
char unescape_char(char *token, size_t *i, size_t len);
MalType* read_atom(Reader* reader);

#endif

A tests/utf-8.mal => tests/utf-8.mal +37 -0
@@ 0,0 1,37 @@
;;;
;;; Supplemental malcc test for unicode support
;;;

;; Testing that (seq str) doesn't split 2-byte utf-8 characters in the middle
(first (seq "ă"))
;=>"ă"

;; Testing that (seq str) doesn't split 3-byte utf-8 characters in the middle
(first (seq "€"))
;=>"€"

;; Testing that (seq str) doesn't split 4-byte utf-8 characters in the middle
(first (seq "🙅"))
;=>"🙅"

;; Testing that splitting and re-joining multibyte characters does not change anything
(apply str (seq "\xf0\x9f\xa4\xb7\xf0\x9f\x99\x8e"))
;=>"🤷🙎"

;; Testing that hex escape sequences are interpreted
"\xf0\x9f\xa4\xb7\xf0\x9f\x99\x8e"
;=>"🤷🙎"

;; Testing that incomplete hex escape sequence produces an error
"\xf"
;/.*Invalid escape sequence in string.*

;; Testing that (seq str) splits emoji modifiers apart from emoji base
(first (seq "\xf0\x9f\xa4\xb7\xf0\x9f\x8f\xbf\xe2\x80\x8d\xe2\x99\x80\xef\xb8\x8f"))
;=>"🤷"
(first (rest (seq "\xf0\x9f\xa4\xb7\xf0\x9f\x8f\xbf\xe2\x80\x8d\xe2\x99\x80\xef\xb8\x8f")))
;=>"🏿"

;; Testing that splitting on incomplete utf-8 encodings produces an error
(seq "\xf0\x9f\xa4")
;/.*Invalid utf-8 encoding in string.*

M types.c => types.c +19 -2
@@ 198,10 198,27 @@ char* mal_string_substring(MalType *orig, size_t start, size_t len) {
MalType* mal_string_to_list(MalType *orig) {
  assert(is_string(orig));
  MalType *vec = mal_vector();
  char buffer[2];
  char buffer[5];
  for (size_t i=0; i<orig->str_len; i++) {
    buffer[0] = orig->str[i];
    buffer[1] = 0;
    if (((unsigned char)buffer[0] >> 3) == 30) { // 11110xxx, 4 bytes
      if (i + 3 >= orig->str_len) return mal_error(mal_string("Invalid utf-8 encoding in string"));
      buffer[1] = orig->str[++i];
      buffer[2] = orig->str[++i];
      buffer[3] = orig->str[++i];
      buffer[4] = 0;
    } else if (((unsigned char)buffer[0] >> 4) == 14) { // 1110xxxx, 3 bytes
      if (i + 2 >= orig->str_len) return mal_error(mal_string("Invalid utf-8 encoding in string"));
      buffer[1] = orig->str[++i];
      buffer[2] = orig->str[++i];
      buffer[3] = 0;
    } else if (((unsigned char)buffer[0] >> 5) == 6) { // 110xxxxx, 2 bytes
      if (i + 1 >= orig->str_len) return mal_error(mal_string("Invalid utf-8 encoding in string"));
      buffer[1] = orig->str[++i];
      buffer[2] = 0;
    } else {
      buffer[1] = 0;
    }
    mal_vector_push(vec, mal_string(buffer));
  }
  return mal_vector_to_list(vec);