~sircmpwn/hare-message

57f86978034e1e50d82206cfa4799f07e7762ec9 — Max Schillinger 4 months ago 0601352 master
Decode and encode non-ascii header values (UTF-8)

Non-ascii strings are represented as "encoded-words" as defined in
RFC 2047:
https://datatracker.ietf.org/doc/html/rfc2047

This commit handles only the UTF-8 charset, with both the "B" and "Q" encodings.
3 files changed, 191 insertions(+), 3 deletions(-)

M message/canonical.ha
A message/encodedword.ha
M message/header.ha
M message/canonical.ha => message/canonical.ha +1 -1
@@ 46,7 46,7 @@ export fn canonical_mime_header_key(key: str) str = {
		const rn = match (strings::next(&iter)) {
		case let rn: rune =>
			yield rn;
		case void =>
		case done =>
			break;
		};
		if (!ascii::valid(rn) || !valid_header_field(rn: u32: u8)) {

A message/encodedword.ha => message/encodedword.ha +178 -0
@@ 0,0 1,178 @@
use ascii;
use encoding::base64;
use encoding::utf8;
use regex;
use strconv;
use strings;

// Compiled pattern matching a single RFC 2047 encoded-word of the form
// "=?charset?encoding?encoded-text?=". Capture group 1 is the charset, group 2
// the encoding letter (b, B, q or Q) and group 3 the encoded text. The value is
// a placeholder until init() has run.
let re_encoded_word: regex::regex = regex::regex { ... };

// Compiles the encoded-word pattern into re_encoded_word. The pattern is a
// constant, so a compilation failure is treated as a programming error and
// aborts.
@init fn init() void = {
	re_encoded_word = regex::compile(`=\?([^? ]+)\?([bBqQ])\?([^? ]+)\?=`)!;
};

// Releases the resources held by the compiled encoded-word pattern.
@fini fn fini() void = {
	regex::finish(&re_encoded_word);
};

// The encoding that [[get_recommended_encoding]] suggests for a header value.
//
// See RFC 2047, Section 4, for the definition of Q and B encodings:
// https://datatracker.ietf.org/doc/html/rfc2047#section-4
type recommended_encoding = enum {
	// Every character is printable ASCII; the value is used as-is.
	NONE,
	// At least half of the characters are printable ASCII; use the "Q"
	// (quoted-printable-like) encoding.
	Q,
	// Fewer than half of the characters are printable ASCII; use the "B"
	// (base64) encoding.
	B,
};

// Returns the numeric value of the ASCII hexadecimal digit b, or void if b is
// not a hexadecimal digit.
fn q_hexval(b: u8) (u8 | void) = {
	if (b >= '0' && b <= '9') {
		return b - ('0': u8);
	};
	if (b >= 'A' && b <= 'F') {
		return b - ('A': u8) + 10;
	};
	if (b >= 'a' && b <= 'f') {
		return b - ('a': u8) + 10;
	};
	return;
};

// Decodes the encoded-text part of a "Q"-encoded word (RFC 2047, Section 4.2)
// whose charset is UTF-8: "=XX" becomes the byte with hexadecimal value XX and
// "_" becomes a space; every other byte is copied unchanged.
//
// Malformed input never aborts:
// - "=" followed by two bytes that are not both hexadecimal digits decodes to
//   "?" (and consumes those two bytes);
// - "=" with fewer than two bytes after it is copied literally;
// - if the decoded bytes are not valid UTF-8, a copy of the undecoded input is
//   returned instead.
//
// The return value is always newly allocated and must be freed by the caller.
fn decode_utf8q(s: str) str = {
	// Work on bytes throughout: the hex digits are located by byte offset,
	// so rune-based indexing would go wrong on (malformed) non-ASCII input.
	const bytes = strings::toutf8(s);
	// The decoded form is never longer than the encoded form.
	let result: []u8 = alloc([0...], len(bytes));
	let j = 0z;
	for (let i = 0z; i < len(bytes); i += 1) {
		if (bytes[i] == '=' && i + 2 < len(bytes)) {
			const hi = q_hexval(bytes[i + 1]);
			const lo = q_hexval(bytes[i + 2]);
			let byte = '?': u8;
			if (hi is u8 && lo is u8) {
				byte = ((hi as u8) << 4) | (lo as u8);
			};
			result[j] = byte;
			// Skip the two hex digits; the loop header skips the "=".
			i += 2;
		} else if (bytes[i] == '_') {
			result[j] = ' ';
		} else {
			result[j] = bytes[i];
		};
		j += 1;
	};

	match (strings::fromutf8(result[..j])) {
	case let decoded: str =>
		// decoded shares its storage with result, so ownership of the
		// allocation passes to the caller.
		return decoded;
	case =>
		free(result);
		return strings::dup(s);
	};
};

// Checks Q-decoding of two-, three- and four-byte UTF-8 sequences and of "_"
// as a space.
@test fn decode_utf8q() void = {
	assert(decode_utf8q("M=C3=BCller") == "Müller");
	assert(decode_utf8q("B=C3=A9la_Bart=C3=B3k") == "Béla Bartók");
	assert(decode_utf8q("=F0=9F=98=8E") == "😎");
};

// Replaces the RFC 2047 encoded-words ("=?charset?encoding?text?=") found in
// line with their decoded text. Only the UTF-8 charset is handled, with both
// the "B" (base64) and "Q" encodings.
//
// Decoding stops at the first encoded-word that cannot be handled (a charset
// other than UTF-8, malformed base64, or base64 that does not decode to valid
// UTF-8); that word and everything after it is left untouched.
//
// If nothing was decoded, the borrowed input string is returned as-is.
// Otherwise the return value is newly allocated and must be freed by the
// caller.
//
// NOTE(review): the search restarts from the beginning of the string after
// each replacement, so decoded text that itself looks like an encoded-word is
// decoded again.
fn decode_encoded_words(line: str) str = {
	// Set once line refers to a string allocated by this function, so that
	// intermediate results can be freed without touching the caller's input.
	let owned = false;
	for (strings::contains(line, "=?")) {
		const matches = regex::find(&re_encoded_word, line);
		defer regex::result_free(matches);
		if (len(matches) == 0) {
			break;
		};

		// Charset and encoding names are case-insensitive.
		const charset = ascii::strlower(matches[1].content);
		defer free(charset);
		const scheme = ascii::strlower(matches[2].content);
		defer free(scheme);
		const encoded_text = matches[3].content;

		if (charset != "utf-8") {
			// TODO: Handle charsets other than UTF-8,
			// especially ISO-8859-1(5) and Windows-1252
			break;
		};

		const decoded: str = switch (scheme) {
		case "b" =>
			const raw = match (encoding::base64::decodestr(
				&encoding::base64::std_encoding, encoded_text)) {
			case let buf: []u8 =>
				yield buf;
			case =>
				// Malformed base64: leave the word as it is.
				break;
			};
			yield match (strings::fromutf8(raw)) {
			case let text: str =>
				yield text;
			case =>
				// Not valid UTF-8 despite the declared charset.
				free(raw);
				break;
			};
		case "q" =>
			yield decode_utf8q(encoded_text);
		case =>
			// Not reachable with the current pattern, which only
			// admits "b" and "q".
			break;
		};
		defer free(decoded);

		// matches[0].content borrows from line, so build the replacement
		// before releasing the previous intermediate string.
		const replaced = strings::replace(line, matches[0].content, decoded);
		if (owned) {
			free(line);
		};
		line = replaced;
		owned = true;
	};
	return line;
};

// Checks decoding of a "Q" encoded-word and of a "B" encoded-word followed by
// plain text.
@test fn decode_encoded_words() void = {
	assert(decode_encoded_words("=?UTF-8?Q?M=C3=B6ller?=") == "Möller");
	assert(decode_encoded_words("=?UTF-8?B?5byg5LiJ?= <zhang.san@example.com>")
		== "张三 <zhang.san@example.com>");
};

// Chooses how a header value should be represented, based on the share of
// printable ASCII characters among its runes:
//
// - NONE if every rune is printable ASCII (this includes the empty string);
// - Q if at least half of the runes are printable ASCII;
// - B otherwise.
fn get_recommended_encoding(s: str) recommended_encoding = {
	let total = 0z;
	let printable = 0z;
	let iter = strings::iter(s);
	for (let r => strings::next(&iter)) {
		total += 1;
		if (ascii::isprint(r)) {
			printable += 1;
		};
	};

	if (printable == total) {
		return recommended_encoding::NONE;
	};
	// RFC 2047, Section 4:
	// The "Q" encoding is recommended for use when most of the characters
	// to be encoded are in the ASCII character set; otherwise, the "B"
	// encoding should be used.
	//
	// "At least half" is tested with exact integer arithmetic rather than
	// by converting the counts to floating point.
	if (printable * 2 >= total) {
		return recommended_encoding::Q;
	};
	return recommended_encoding::B;
};

// Checks the three outcomes: pure ASCII (NONE), mostly ASCII (Q) and mostly
// non-ASCII (B).
@test fn get_recommended_encoding() void = {
	assert(get_recommended_encoding("John Doe <john@example.org>")
		== recommended_encoding::NONE);
	assert(get_recommended_encoding("Möller") == recommended_encoding::Q);
	assert(get_recommended_encoding("张三 <zhang.san@example.com>")
		== recommended_encoding::Q);
	assert(get_recommended_encoding("张三") == recommended_encoding::B);
	assert(get_recommended_encoding("😎") == recommended_encoding::B);
};

// Encodes value as the encoded-text part of a "Q"-encoded word (RFC 2047,
// Section 4.2) using the UTF-8 charset:
//
// - a space becomes "_";
// - printable ASCII is copied unchanged, except for "=", "?" and "_", which
//   have a special meaning inside an encoded-word and are therefore escaped;
// - every other character is written as one "=XX" escape per UTF-8 byte, with
//   XX being exactly two uppercase hexadecimal digits.
//
// The return value is newly allocated and must be freed by the caller.
fn encode_utf8q(value: str) str = {
	let bytes_encoded: []u8 = [];
	let iter = strings::iter(value);
	for (let r => strings::next(&iter)) {
		if (r == ' ') {
			append(bytes_encoded, '_');
			continue;
		};

		// "=" starts an escape, "?" delimits the encoded-word and "_"
		// stands for a space, so none of them may appear literally.
		const literal = ascii::isprint(r)
			&& r != '=' && r != '?' && r != '_';
		if (literal) {
			append(bytes_encoded, r: u8);
			continue;
		};

		const bytes = encoding::utf8::encoderune(r);
		for (let b .. bytes) {
			// u8tos returns a statically allocated string; append
			// copies its bytes, so no duplicate is needed.
			const hex = strconv::u8tos(b, 16);
			append(bytes_encoded, '=');
			if (len(hex) < 2) {
				// Values below 0x10 need a leading zero so that
				// the escape is always two digits long.
				append(bytes_encoded, '0');
			};
			append(bytes_encoded, strings::toutf8(hex)...);
		};
	};
	return strings::fromutf8_unsafe(bytes_encoded);
};

// Checks Q-encoding of a space and of two-, three- and four-byte UTF-8
// sequences.
@test fn encode_utf8q() void = {
	assert(encode_utf8q("Dr. Möller") == "Dr._M=C3=B6ller");
	assert(encode_utf8q("张三") == "=E5=BC=A0=E4=B8=89");
	assert(encode_utf8q("😎") == "=F0=9F=98=8E");
};

// Encodes a header value as a single RFC 2047 encoded-word with the UTF-8
// charset, using whichever of the "B" and "Q" encodings
// [[get_recommended_encoding]] suggests.
//
// If the value consists only of printable ASCII, the borrowed input string is
// returned as-is. Otherwise the return value is newly allocated and must be
// freed by the caller.
//
// NOTE(review): the whole value becomes one encoded-word; RFC 2047 limits an
// encoded-word to 75 characters, which is not enforced here.
fn encode(value: str) str = {
	switch (get_recommended_encoding(value)) {
	case recommended_encoding::B =>
		const payload = encoding::base64::encodestr(
			&encoding::base64::std_encoding, strings::toutf8(value));
		const word = strings::concat("=?UTF-8?B?", payload, "?=");
		// concat copies its arguments, so the payload can be released.
		free(payload);
		return word;
	case recommended_encoding::Q =>
		const payload = encode_utf8q(value);
		const word = strings::concat("=?UTF-8?Q?", payload, "?=");
		free(payload);
		return word;
	case recommended_encoding::NONE =>
		return value;
	};
};

// Checks that pure-ASCII values pass through unchanged and that the "Q" or "B"
// encoding is chosen according to the share of ASCII characters.
@test fn encode() void = {
	assert(encode("John Doe <john@example.org>")
		== "John Doe <john@example.org>");
	assert(encode("Möller") == "=?UTF-8?Q?M=C3=B6ller?=");
	assert(encode("张三") == "=?UTF-8?B?5byg5LiJ?=");
	assert(encode("张三 <zhang.san@example.com>")
		== "=?UTF-8?Q?=E5=BC=A0=E4=B8=89_<zhang.san@example.com>?=");
	assert(encode("😎") == "=?UTF-8?B?8J+Yjg==?=");
};

M message/header.ha => message/header.ha +12 -2
@@ 84,6 84,7 @@ export fn header_add(head: *header, key: str, val: str) void = {
	defer free(key);
	let map = header_get_mapkey(head, key);

	const val = encode(val);
	const field = alloc(new_header_field(key, val, []));
	append(head.fields, field);
	append(map.fields, field);


@@ 101,7 102,7 @@ export fn header_get(head: *header, key: str) str = {
		if (map.key != key) {
			continue;
		};
		return map.fields[len(map.fields) - 1].val;
		return decode_encoded_words(map.fields[len(map.fields) - 1].val);
	};

	return "";


@@ 123,6 124,9 @@ export fn header_get(head: *header, key: str) str = {
	header_add(&head, "User-Agent", "Harriet");
	assert(header_get(&head, "User-Agent") == "Harriet");

	header_add(&head, "To", "=?UTF-8?Q?A._D=C3=BCrer?= <duerer@example.org>");
	assert(header_get(&head, "To") == "A. Dürer <duerer@example.org>");

	assert(header_get(&head, "foobar") == "");
};



@@ 356,6 360,7 @@ export fn read_header(
		};

		const val = decode_header_value(kv[i+1..]);
		const val = decode_encoded_words(val);
		const field = alloc(header_field {
			raw = kv,
			key = key,


@@ 371,6 376,7 @@ export fn read_header(
	const input =
		"To: Drew DeVault <sir@cmpwn.com>\r\n"
		"From: Harriet <harriet@harelang.org>\r\n"
		"Cc: =?UTF-8?Q?=E5=BC=A0=E4=B8=89_<zhang.san@example.com>?=\r\n"
		"Content-Type: text/plain\r\n"
		"DKIM-Signature: a=rsa-sha256;\r\n"
		" bh=uI/rVH7mLBSWkJVvQYKz3TbpdI2BLZWTIMKcuo0KHOI=; c=simple/simple;\r\n"


@@ 383,6 389,7 @@ export fn read_header(

	assert(header_get(&head, "To") == "Drew DeVault <sir@cmpwn.com>");
	assert(header_get(&head, "From") == "Harriet <harriet@harelang.org>");
	assert(header_get(&head, "Cc") == "张三 <zhang.san@example.com>");
	assert(header_get(&head, "Content-Type") == "text/plain");
	assert(header_get(&head, "Dkim-Signature") == "a=rsa-sha256; bh=uI/rVH7mLBSWkJVvQYKz3TbpdI2BLZWTIMKcuo0KHOI=; c=simple/simple; d=example.org; h=Subject:To:From; s=default; t=1577562184; v=1; b=;");
};


@@ 410,6 417,7 @@ export fn write_header(sink: io::handle, head: *header) (size | io::error) = {
	header_add(&head, "Content-Type", "text/plain");
	header_add(&head, "FROM", "Harriet <harriet@harelang.org>");
	header_add(&head, "to", "Drew DeVault <sir@cmpwn.com>");
	header_add(&head, "cc", "张三 <zhang.san@example.com>");

	const sink = memio::dynamic();
	defer io::close(&sink)!;


@@ 417,6 425,7 @@ export fn write_header(sink: io::handle, head: *header) (size | io::error) = {
	const result = memio::string(&sink)!;

	const expect =
		"Cc: =?UTF-8?Q?=E5=BC=A0=E4=B8=89_<zhang.san@example.com>?=\r\n"
		"To: Drew DeVault <sir@cmpwn.com>\r\n"
		"From: Harriet <harriet@harelang.org>\r\n"
		"Content-Type: text/plain\r\n"


@@ 435,6 444,7 @@ export fn write_header(sink: io::handle, head: *header) (size | io::error) = {
	const input =
		"To: Drew DeVault <sir@cmpwn.com>\r\n"
		"From: Harriet <harriet@harelang.org>\r\n"
		"Cc: =?UTF-8?Q?=E5=BC=A0=E4=B8=89_<zhang.san@example.com>?=\r\n"
		"Content-Type: text/plain\r\n"
		"DKIM-Signature: a=rsa-sha256;\r\n"
		" bh=uI/rVH7mLBSWkJVvQYKz3TbpdI2BLZWTIMKcuo0KHOI=; c=simple/simple;\r\n"


@@ 481,7 491,7 @@ fn header_field_raw(hf: *header_field) ([]u8 | errors::invalid) = {
		const rn = match (strings::next(&iter)) {
		case let rn: rune =>
			yield rn;
		case void =>
		case done =>
			break;
		};