~sircmpwn/hare

4e47d429b45c0431b75c34804eaded1029a35e74 — Sebastian 2 months ago daff63e
encoding::utf8: operate exclusively on byte slices

Previously, utf8::decode and utf8::valid accepted (str | []u8). This has
been changed, so they now only accept []u8.

utf8::decode with a string operand duplicates the functionality of
strings::iter.

utf8::valid only makes sense on byte slices, since str can't hold
invalid UTF-8. If code really wants to check if the contents of a string
are valid, it's simple enough to just call strings::toutf8 on the str
first.

Signed-off-by: Sebastian <sebastian@sebsite.pw>
5 files changed, 15 insertions(+), 32 deletions(-)

M encoding/utf8/decode.ha
M hare/parse/+test/loc.ha
M strings/iter.ha
M strings/utf8.ha
M types/c/strings.ha
M encoding/utf8/decode.ha => encoding/utf8/decode.ha +4 -16
@@ 1,23 1,14 @@
// SPDX-License-Identifier: MPL-2.0
// (c) Hare authors <https://harelang.org>

// Reinterprets a str as a []u8 by taking the address of the string, casting
// that pointer to *[]u8, and dereferencing the result. No validation or
// transformation of the contents is performed by this expression.
// NOTE(review): this relies on str and []u8 having a compatible in-memory
// layout - confirm against the language specification.
fn toutf8(in: str) []u8 = *(&in: *[]u8);

// Normalizes a (str | []u8) tagged union to a plain byte slice. A str is
// converted with toutf8; a []u8 is returned unchanged.
fn fromtagged(in: (str | []u8)) []u8 = match (in) {
case let s: str =>
	// Strings are viewed as their underlying bytes.
	return toutf8(s);
case let b: []u8 =>
	// Already a byte slice; pass it through as-is.
	return b;
};

// State for a UTF-8 decoder: the bytes being decoded and an offset into them.
// Use [[decode]] to initialize one.
export type decoder = struct {
	// Offset into src; [[decode]] initializes it to 0.
	// NOTE(review): presumably the current decoding position - confirm
	// against next/prev.
	offs: size,
	// The byte slice being decoded.
	src: []u8,
};

// Initializes a new UTF-8 decoder.
export fn decode(src: (str | []u8)) decoder = decoder {
	src = fromtagged(src),
export fn decode(src: []u8) decoder = decoder {
	src = src,
	offs = 0,
};



@@ 161,11 152,8 @@ export fn prev(d: *decoder) (rune | void | more | invalid) = {
	assert(prev(&decoder) is invalid);
};

// Returns true if a given string or byte slice contains only valid UTF-8
// sequences. Note that Hare strings (str) are always valid UTF-8 - if this
// returns false for a str type, something funny is going on.
export fn valid(src: (str | []u8)) bool = {
	let src = fromtagged(src);
// Returns true if a given byte slice contains only valid UTF-8 sequences.
export fn valid(src: []u8) bool = {
	let state = 0;
	for (let i = 0z; i < len(src) && state >= 0; i += 1) {
		state = table[state][src[i]];

M hare/parse/+test/loc.ha => hare/parse/+test/loc.ha +4 -11
@@ 2,7 2,6 @@
// (c) Hare authors <https://harelang.org>

use bufio;
use encoding::utf8;
use fmt;
use hare::ast;
use hare::lex;


@@ 26,11 25,8 @@ fn expr_testloc(srcs: str...) void = for (let i = 0z; i < len(srcs); i += 1) {
	};
	defer ast::expr_finish(&exp);
	let runes = 0z;
	let d = utf8::decode(srcs[i]);
	for (true) match (utf8::next(&d)!) {
	case void =>
		break;
	case rune =>
	let it = strings::iter(srcs[i]);
	for (strings::next(&it) is rune) {
		runes += 1;
	};
	assert(exp.start.line == 1 && exp.start.col == 1);


@@ 117,11 113,8 @@ fn type_testloc(srcs: str...) void = for (let i = 0z; i < len(srcs); i += 1) {
	};
	defer ast::type_finish(&typ);
	let runes = 0z;
	let d = utf8::decode(srcs[i]);
	for (true) match (utf8::next(&d)!) {
	case void =>
		break;
	case rune =>
	let it = strings::iter(srcs[i]);
	for (strings::next(&it) is rune) {
		runes += 1;
	};
	assert(typ.start.line == 1 && typ.start.col == 1);

M strings/iter.ha => strings/iter.ha +2 -2
@@ 22,7 22,7 @@ export type iterator = struct {
//	strings::next(&dup);	// '!'
//	strings::next(&dup);	// void
export fn iter(src: str) iterator = iterator {
	dec = utf8::decode(src),
	dec = utf8::decode(toutf8(src)),
	reverse = false,
};



@@ 30,7 30,7 @@ export fn iter(src: str) iterator = iterator {
// backwards with each call to [[next]].
export fn riter(src: str) iterator = {
	let ret = iterator {
		dec = utf8::decode(src),
		dec = utf8::decode(toutf8(src)),
		reverse = true,
	};
	ret.dec.offs = len(src);

M strings/utf8.ha => strings/utf8.ha +1 -1
@@ 21,7 21,7 @@ export fn fromutf8_unsafe(in: []u8) str = {
// [[encoding::utf8::invalid]] is returned instead.
export fn fromutf8(in: []u8) (str | utf8::invalid) = {
	let s = fromutf8_unsafe(in);
	if (!utf8::valid(s)) {
	if (!utf8::valid(in)) {
		return utf8::invalid;
	};
	return s;

M types/c/strings.ha => types/c/strings.ha +4 -2
@@ 44,8 44,10 @@ export fn tostr(cstr: *const char) (const str | utf8::invalid) = {
// Converts a C string with a given length to a Hare string. If the string is
// not valid UTF-8, return [[encoding::utf8::invalid]].
export fn tostrn(cstr: *const char, length: size) (const str | utf8::invalid) = {
	let s = tostrn_unsafe(cstr, length);
	return if (utf8::valid(s)) s else utf8::invalid;
	if (!utf8::valid((cstr: *[*]u8)[..length])) {
		return utf8::invalid;
	};
	return tostrn_unsafe(cstr, length);
};

// Converts a Hare string to a C string. The result is allocated; the caller