~sircmpwn/hare

hare/strings/tokenize.ha -rw-r--r-- 3.6 KiB
5c7cd775Alexey Yerin all: add 0 value to enums used as flags 2 days ago
                                                                                
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
use bytes;
use types;

// The state for a tokenizer. This is an alias of [[bytes::tokenizer]]: the
// functions in this file hand it directly to the corresponding bytes::
// functions and convert the byte slices they return back into strings.
export type tokenizer = bytes::tokenizer;

// Creates a tokenizer over 's' which yields the sub-strings found between
// occurrences of 'delim'. Pass a pointer to the result to [[next_token]],
// [[peek_token]] or [[remaining_tokens]] to consume it.
//
// 	let tok = strings::tokenize("hello, my name is drew", " ");
// 	assert(strings::next_token(&tok) as str == "hello,");
// 	assert(strings::next_token(&tok) as str == "my");
// 	assert(strings::next_token(&tok) as str == "name");
// 	assert(strings::remaining_tokens(&tok) == "is drew");
export fn tokenize(s: str, delim: str) tokenizer = {
	// The byte-level tokenizer does the real work; strings are only viewed
	// as their UTF-8 bytes here.
	const src = toutf8(s);
	const sep = toutf8(delim);
	return bytes::tokenize(src, sep);
};

// Consumes and returns the next token from the tokenizer, moving the cursor
// past it. Returns void once no tokens are left.
export fn next_token(s: *tokenizer) (str | void) = {
	match (bytes::next_token(s)) {
	case raw: []u8 =>
		// Convert the byte-level token back into a string.
		return fromutf8(raw);
	case void =>
		return;
	};
};

// Returns the token that [[next_token]] would return next, without advancing
// the cursor. Returns void once no tokens are left.
export fn peek_token(s: *tokenizer) (str | void) = {
	match (bytes::peek_token(s)) {
	case raw: []u8 =>
		// Convert the byte-level token back into a string.
		return fromutf8(raw);
	case void =>
		return;
	};
};

// Returns everything the tokenizer has not yet consumed as a single string,
// without splitting it into further tokens.
export fn remaining_tokens(s: *tokenizer) str = {
	const rest = bytes::remaining_tokens(s);
	return fromutf8(rest);
};

@test fn tokenize() void = {
	// Each next_token call consumes one space-delimited token; peek_token
	// reports the upcoming token without consuming it.
	let words = tokenize("Hello, my name is drew", " ");
	assert(next_token(&words) as str == "Hello,");
	assert(next_token(&words) as str == "my");
	assert(peek_token(&words) as str == "name");
	assert(next_token(&words) as str == "name");

	// Peeking leaves the unconsumed remainder untouched.
	assert(remaining_tokens(&words) == "is drew");
	assert(peek_token(&words) as str == "is");
	assert(remaining_tokens(&words) == "is drew");

	// An input equal to the delimiter yields two empty tokens, then void.
	let only_delim = tokenize("foo", "foo");
	for (let round = 0z; round < 2z; round += 1) {
		assert(peek_token(&only_delim) as str == "");
		assert(next_token(&only_delim) as str == "");
	};
	assert(peek_token(&only_delim) is void);
	assert(next_token(&only_delim) is void);

	// An empty input yields no tokens at all.
	let empty = tokenize("", "foo");
	assert(peek_token(&empty) is void);
	assert(next_token(&empty) is void);
};

// Splits a string into tokens delimited by 'delim', returning a slice of up to
// N tokens. Once N - 1 tokens have been produced, whatever is left of 'in' is
// returned untokenized as the final element. If 'in' runs out of tokens
// first, only the tokens found are returned; no trailing empty string is
// added for an exhausted input.
//
// The caller must free this slice. The strings within the slice are borrowed
// from 'in', and needn't be freed - but should be [[strings::dup_all]]'d if
// they should outlive 'in'.
export fn splitN(in: str, delim: str, n: size) []str = {
	let toks: []str = alloc([]);
	let tok = tokenize(in, delim);
	// NOTE(review): for n == 0 the bound 'n - 1z' presumably wraps around,
	// making the call behave like an unbounded split - confirm whether
	// callers rely on this.
	for (let i = 0z; i < n - 1z; i += 1) {
		match (next_token(&tok)) {
		case s: str =>
			append(toks, s);
		case void =>
			// Ran out of tokens before reaching the limit.
			return toks;
		};
	};
	// Only append the remainder when the tokenizer still has a token to
	// offer. Otherwise an input that is exhausted exactly at the limit (or
	// an empty input with n == 1) would gain a spurious "" element that
	// split() would not produce.
	if (peek_token(&tok) is str) {
		append(toks, remaining_tokens(&tok));
	};
	return toks;
};

// Splits a string into all of its tokens delimited by 'delim', by calling
// [[splitN]] with the largest possible token limit. The caller must free the
// returned slice. The strings within the slice are borrowed from 'in', and
// needn't be freed - but must be [[strings::dup_all]]'d if they should outlive
// 'in'.
export fn split(in: str, delim: str) []str = {
	return splitN(in, delim, types::SIZE_MAX);
};

@test fn split() void = {
	// With a limit of 4, the fourth element holds the untokenized
	// remainder of the input.
	const expected = ["Hello,", "my", "name", "is Drew"];
	const actual = splitN("Hello, my name is Drew", " ", 4z);
	// The returned slice is owned by the caller and must be freed.
	defer free(actual);
	assert(len(expected) == len(actual));
	for (let i = 0z; i < len(expected); i += 1) {
		assert(expected[i] == actual[i]);
	};

	// Without a limit, every space-delimited word is its own element.
	const expected2 = ["Hello,", "my", "name", "is", "Drew"];
	const actual2 = split("Hello, my name is Drew", " ");
	defer free(actual2);
	assert(len(expected2) == len(actual2));
	for (let i = 0z; i < len(expected2); i += 1) {
		assert(expected2[i] == actual2[i]);
	};
};