M .build.yml => .build.yml +2 -0
@@ 7,6 7,8 @@ packages:
- python
sources:
- https://git.sr.ht/~tim/malcc
+environment:
+ LC_ALL: C.UTF-8
tasks:
- build: |
cd malcc
M Dockerfile => Dockerfile +2 -0
@@ 18,6 18,8 @@ RUN cd /tmp && \
./configure && \
make install
+ENV LC_ALL C.UTF-8
+
WORKDIR /malcc
CMD ["bash"]
M Makefile => Makefile +9 -3
@@ 1,7 1,7 @@
OS:=$(shell uname)
CC=gcc
CFLAGS=-Itinycc -Wall -Wextra -Werror -g
-LDLIBS=-ledit -lgc -lpcre -ldl
+LDLIBS=-ledit -ltermcap -lgc -lpcre -ldl
ALL_STEPS=step0_repl step1_read_print step2_eval step3_env step4_if_fn_do step5_tco step6_file step7_quote step8_macros step9_try stepA_mal malcc
@@ 29,7 29,7 @@ clean:
rm -f $(ALL_STEPS) *.o
cd tinycc && make clean
-test: test0 test1 test2 test3 test4 test5 test6 test7 test8 test9 testA test-self-hosted test_malcc
+test: test0 test1 test2 test3 test4 test5 test6 test7 test8 test9 testA test-malcc test-self-hosted test-supplemental
RUN_TEST_CMD=mal/runtest.py --rundir mal/tests --hard --deferrable --optional --start-timeout 1 --test-timeout 1
@@ 75,7 75,7 @@ testA: all
$(RUN_TEST_CMD) step9_try.mal ../../stepA_mal
$(RUN_TEST_CMD) stepA_mal.mal ../../stepA_mal
-test_malcc: all
+test-malcc: all
$(RUN_TEST_CMD) step2_eval.mal ../../malcc
$(RUN_TEST_CMD) step3_env.mal ../../malcc
$(RUN_TEST_CMD) step4_if_fn_do.mal ../../malcc
@@ 97,6 97,9 @@ test-self-hosted: all
$(RUN_TEST_CMD) --test-timeout 30 step9_try.mal ../../self_hosted_run
$(RUN_TEST_CMD) --test-timeout 30 stepA_mal.mal ../../self_hosted_run
+test-supplemental: all
+ $(RUN_TEST_CMD) --test-timeout 30 ../../tests/utf-8.mal ../../malcc
+
perf: all
cd mal/tests && ../../malcc perf1.mal && ../../malcc perf2.mal && ../../malcc perf3.mal
@@ 114,5 117,8 @@ docker-bash: docker-build
docker-test: docker-build
$(RUN_DOCKER_CMD) make test
+docker-test-supplemental: docker-build
+ $(RUN_DOCKER_CMD) make test-supplemental
+
docker-watch: docker-build
$(RUN_DOCKER_CMD) bash -c "ls *.c *.h Makefile | entr -c -s 'make test'"
M malcc.c => malcc.c +8 -1
@@ 6,6 6,7 @@
#include <string.h>
#include <unistd.h>
#include <libtcc.h>
+#include <locale.h>
#include "core.h"
#include "env.h"
@@ 922,7 923,11 @@ char* PRINT(MalType *ast) {
}
char* rep(char *str, MalEnv *repl_env) {
- MalType *result = EVAL(READ(str), repl_env);
+ MalType *ast = READ(str);
+ if (is_error(ast)) {
+ return PRINT(mal_sprintf("ERROR: %s\n", pr_str(ast->error_val, 0)));
+ }
+ MalType *result = EVAL(ast, repl_env);
if (is_error(result)) {
return PRINT(mal_sprintf("ERROR: %s\n", pr_str(result->error_val, 0)));
} else {
@@ 1019,6 1024,8 @@ int main(int argc, char *argv[]) {
rep(builtin_defs, repl_env);
+ setlocale(LC_ALL, ""); // use locale set from environment
+
if (mal_vector_len(arg_vec) >= 1) {
rep(mal_sprintf("(load-file %s)", pr_str(mal_vector_ref(arg_vec, 0), 1))->str, repl_env);
} else {
M reader.c => reader.c +23 -5
@@ 225,17 225,22 @@ MalType* read_string(Reader *reader) {
size_t len = strlen(token);
char *str = GC_MALLOC(len + 1);
size_t index = 0;
- char escaped;
int saw_quotes = 0;
+ char unescaped;
for (size_t i=0; i<len; i++) {
switch (token[i]) {
case '"':
saw_quotes++;
break;
case '\\':
- escaped = token[++i];
- str[index++] = unescape_char(escaped);
- break;
+ i++;
+ unescaped = unescape_char(token, &i, len - 1); // note: len-1 because of closing quote
+ if (unescaped) {
+ str[index++] = unescaped;
+ break;
+ } else {
+ return mal_error(mal_string("Invalid escape sequence in string"));
+ }
default:
str[index++] = token[i];
}
@@ 247,9 252,22 @@ MalType* read_string(Reader *reader) {
return mal_string(str);
}
-char unescape_char(char c) {
+/*
+ * Decode one escape sequence inside a string token.
+ *
+ * token - the raw token text being read
+ * i     - in/out: on entry, index of the character that follows the
+ *         backslash; for a \xHH escape it is advanced to the last hex digit
+ *         so the caller's loop increment steps past the whole sequence
+ * len   - exclusive upper bound for readable indices in token (the caller
+ *         passes strlen(token) - 1 so the closing quote is never consumed)
+ *
+ * Returns the decoded character, or 0 when the sequence is invalid: a \x
+ * escape that is truncated or whose two digits are not both hexadecimal.
+ * Because 0 is the error value, \x00 is reported as invalid as well.
+ *
+ * Digits are decoded by hand rather than with sscanf("%x"): sscanf would
+ * leave the result uninitialised on a non-hex pair, and would accept
+ * whitespace, a sign, or a single digit as a valid "pair".
+ */
+char unescape_char(char *token, size_t *i, size_t len) {
+    char c = token[*i];
+    if (c == 'n') {
+        return 10; // newline
+    }
+    if (c != 'x') {
+        return c; // any other escaped character stands for itself
+    }
+    // \xHH: both hex digits must sit strictly before index len
+    if ((*i) + 2 >= len) {
+        return 0;
+    }
+    unsigned int val = 0;
+    for (size_t k = 1; k <= 2; k++) {
+        char d = token[*i + k];
+        unsigned int digit;
+        if (d >= '0' && d <= '9') {
+            digit = (unsigned int)(d - '0');
+        } else if (d >= 'a' && d <= 'f') {
+            digit = (unsigned int)(d - 'a') + 10;
+        } else if (d >= 'A' && d <= 'F') {
+            digit = (unsigned int)(d - 'A') + 10;
+        } else {
+            return 0; // not a hex digit: leave *i untouched and report the error
+        }
+        val = val * 16 + digit;
+    }
+    *i += 2; // consume both digits only once the whole escape is known to be valid
+    return (char)val;
+}
M reader.h => reader.h +1 -1
@@ 30,7 30,7 @@ MalType* read_list(Reader* reader);
MalType* read_vector(Reader* reader);
MalType* read_hashmap(Reader *reader);
MalType* read_string(Reader *reader);
-char unescape_char(char c);
+char unescape_char(char *token, size_t *i, size_t len);
MalType* read_atom(Reader* reader);
#endif
A tests/utf-8.mal => tests/utf-8.mal +37 -0
@@ 0,0 1,37 @@
+;;;
+;;; Supplemental malcc test for unicode support
+;;;
+
+;; Testing that (seq str) doesn't split 2-byte utf-8 characters in the middle
+(first (seq "ă"))
+;=>"ă"
+
+;; Testing that (seq str) doesn't split 3-byte utf-8 characters in the middle
+(first (seq "€"))
+;=>"€"
+
+;; Testing that (seq str) doesn't split 4-byte utf-8 characters in the middle
+(first (seq "🙅"))
+;=>"🙅"
+
+;; Testing that splitting and re-joining multibyte characters does not change anything
+(apply str (seq "\xf0\x9f\xa4\xb7\xf0\x9f\x99\x8e"))
+;=>"🤷🙎"
+
+;; Testing that hex escape sequences are interpreted
+"\xf0\x9f\xa4\xb7\xf0\x9f\x99\x8e"
+;=>"🤷🙎"
+
+;; Testing that incomplete hex escape sequence produces an error
+"\xf"
+;/.*Invalid escape sequence in string.*
+
+;; Testing that (seq str) splits emoji modifiers apart from emoji base
+(first (seq "\xf0\x9f\xa4\xb7\xf0\x9f\x8f\xbf\xe2\x80\x8d\xe2\x99\x80\xef\xb8\x8f"))
+;=>"🤷"
+(first (rest (seq "\xf0\x9f\xa4\xb7\xf0\x9f\x8f\xbf\xe2\x80\x8d\xe2\x99\x80\xef\xb8\x8f")))
+;=>"🏿"
+
+;; Testing that splitting on an incomplete utf-8 encoding produces an error
+(seq "\xf0\x9f\xa4")
+;/.*Invalid utf-8 encoding in string.*
M types.c => types.c +19 -2
@@ 198,10 198,27 @@ char* mal_string_substring(MalType *orig, size_t start, size_t len) {
MalType* mal_string_to_list(MalType *orig) {
assert(is_string(orig));
MalType *vec = mal_vector();
- char buffer[2];
+ char buffer[5];
for (size_t i=0; i<orig->str_len; i++) {
buffer[0] = orig->str[i];
- buffer[1] = 0;
+ if (((unsigned char)buffer[0] >> 3) == 30) { // 11110xxx, 4 bytes
+ if (i + 3 >= orig->str_len) return mal_error(mal_string("Invalid utf-8 encoding in string"));
+ buffer[1] = orig->str[++i];
+ buffer[2] = orig->str[++i];
+ buffer[3] = orig->str[++i];
+ buffer[4] = 0;
+ } else if (((unsigned char)buffer[0] >> 4) == 14) { // 1110xxxx, 3 bytes
+ if (i + 2 >= orig->str_len) return mal_error(mal_string("Invalid utf-8 encoding in string"));
+ buffer[1] = orig->str[++i];
+ buffer[2] = orig->str[++i];
+ buffer[3] = 0;
+ } else if (((unsigned char)buffer[0] >> 5) == 6) { // 110xxxxx, 2 bytes
+ if (i + 1 >= orig->str_len) return mal_error(mal_string("Invalid utf-8 encoding in string"));
+ buffer[1] = orig->str[++i];
+ buffer[2] = 0;
+ } else {
+ buffer[1] = 0;
+ }
mal_vector_push(vec, mal_string(buffer));
}
return mal_vector_to_list(vec);