~sircmpwn/hare-xml

cb21826ea7dc887d69fb46458ea30482dba5d8e9 — Sebastian 5 months ago 07155c8
format::xml: return line number of syntax error

This also expands the tests to ensure that the line number is accurate.

Signed-off-by: Sebastian <sebastian@sebsite.pw>
3 files changed, 104 insertions(+), 59 deletions(-)

M +test.ha
M parser.ha
M types.ha
M +test.ha => +test.ha +17 -11
@@ 5,6 5,7 @@
// (c) 2021 Eyal Sawady <ecs@d2evs.net>
use io;
use bufio;
use fmt;
use strings;

@test fn basic() void = {


@@ 25,7 26,7 @@ use strings;
		"foobar": elementend,
		"\n": text,
		"root": elementend,
	], false);
	], void);
};

@test fn comments() void = {


@@ 42,7 43,7 @@ use strings;
		"hello": elementend,
		"\n": text,
		"root": elementend,
	], false);
	], void);
};

@test fn entities() void = {


@@ 58,7 59,7 @@ use strings;
		"hello": elementend,
		"\n": text,
		"root": elementend,
	], false);
	], void);
};

@test fn cdata() void = {


@@ 71,11 72,15 @@ use strings;
		"Hello world &foo <bar>": text,
		"\n": text,
		"root": elementend,
	], false);
	], void);
};

@test fn errors() void = {
	xmltest("<?xml version='1.0' encoding='utf-8' ?>
<!--
comment which spans
multiple lines
-->
<root>
	<hello name='foobar'></world>
</root>", [


@@ 83,10 88,10 @@ use strings;
		"\n\t": text,
		"hello": elementstart,
		("name", "foobar"): attribute,
	], true);
	], 7);
};

fn xmltest(input: str, expected: []token, err: bool) void = {
fn xmltest(input: str, expected: []token, err: (void | size)) void = {
	let in = bufio::fixed(strings::toutf8(input), io::mode::READ);
	let parser = parse(&in) as *parser;
	for (let i = 0z; i < len(expected); i += 1) {


@@ 95,8 100,8 @@ fn xmltest(input: str, expected: []token, err: bool) void = {
			yield tok;
		case void =>
			abort("Expected token, got void");
		case syntaxerr =>
			abort("Expected token, got syntax error");
		case let err: syntaxerr =>
			fmt::fatal("{}", strerror(err));
		};
		match (tok) {
		case let el: elementstart =>


@@ 117,9 122,10 @@ fn xmltest(input: str, expected: []token, err: bool) void = {
			assert(el == ex);
		};
	};
	if (err) {
		assert(scan(parser) is error);
	} else {
	match (err) {
	case void =>
		assert(scan(parser) is void);
	case let z: size =>
		assert(scan(parser) as syntaxerr: size == z);
	};
};

M parser.ha => parser.ha +74 -40
@@ 33,6 33,7 @@ export fn parse(in: io::handle) (*parser | error) = {
		namebuf = strio::dynamic(),
		entbuf = strio::dynamic(),
		textbuf = strio::dynamic(),
		line = 1,
		...
	});
	if (bufio::isbuffered(in)) {


@@ 77,7 78,7 @@ export fn scan(par: *parser) (token | void | error) = {
	let rn: rune = match (bufio::scanrune(par.in)?) {
	case io::EOF =>
		if (par.state == state::ROOT) {
			return syntaxerr;
			return par.line: syntaxerr;
		} else {
			return;
		};


@@ 90,7 91,7 @@ export fn scan(par: *parser) (token | void | error) = {
		case '<' =>
			const next = match (bufio::scanrune(par.in)?) {
			case io::EOF =>
				return syntaxerr;
				return par.line: syntaxerr;
			case let rn: rune =>
				bufio::unreadrune(par.in, rn);
				yield rn;


@@ 108,7 109,7 @@ export fn scan(par: *parser) (token | void | error) = {
			return el;
		case =>
			if (par.state == state::ROOT) {
				return syntaxerr;
				return par.line: syntaxerr;
			};
			bufio::unreadrune(par.in, rn);
			return scan_content(par)?;


@@ 122,7 123,7 @@ export fn scan(par: *parser) (token | void | error) = {
			par.state = state::ELEMENT;
			return scan(par)?;
		} else if (!isnamestart(rn)) {
			return syntaxerr;
			return par.line: syntaxerr;
		};
		bufio::unreadrune(par.in, rn);
		return scan_attr(par)?;


@@ 131,13 132,13 @@ export fn scan(par: *parser) (token | void | error) = {

fn poptag(par: *parser, expect: str) (str | error) = {
	if (len(par.tags) == 0) {
		return syntaxerr;
		return par.line: syntaxerr;
	};
	let pop = par.tags[len(par.tags) - 1];
	delete(par.tags[len(par.tags) - 1]);
	defer free(pop);
	if (expect != "" && expect != pop) {
		return syntaxerr;
		return par.line: syntaxerr;
	};
	strio::reset(&par.namebuf);
	strio::concat(&par.namebuf, pop)!;


@@ 151,14 152,17 @@ fn scan_attr(par: *parser) (token | error) = {
	strio::reset(&par.textbuf);
	for (true) match (bufio::scanrune(par.in)?) {
	case io::EOF =>
		return syntaxerr;
		return par.line: syntaxerr;
	case let rn: rune =>
		rn = switch (rn) {
		case '<' =>
			return syntaxerr;
			return par.line: syntaxerr;
		case '&' =>
			bufio::unreadrune(par.in, rn);
			yield scan_entity(par)?;
		case '\n' =>
			par.line += 1;
			yield rn;
		case =>
			yield rn;
		};


@@ 172,7 176,7 @@ fn scan_comment(par: *parser) (token | void | error) = {
	want(par, "<!")?;
	match (bufio::scanrune(par.in)?) {
	case io::EOF =>
		return syntaxerr;
		return par.line: syntaxerr;
	case let rn: rune =>
		switch (rn) {
		case '-' => // Comments


@@ 180,35 184,43 @@ fn scan_comment(par: *parser) (token | void | error) = {
		case '[' =>
			want(par, "CDATA[")?;
			if (par.state != state::ELEMENT) {
				return syntaxerr;
				return par.line: syntaxerr;
			};
			return scan_cdata(par)?;
		case =>
			return syntaxerr;
			return par.line: syntaxerr;
		};
	};
	for (true) {
		const rn = match (bufio::scanrune(par.in)?) {
		case io::EOF =>
			return syntaxerr;
			return par.line: syntaxerr;
		case let rn: rune =>
			if (rn == '\n') par.line += 1;
			yield rn;
		};
		if (rn != '-') continue;
		const rn = match (bufio::scanrune(par.in)?) {
		case io::EOF =>
			return syntaxerr;
			return par.line: syntaxerr;
		case let rn: rune =>
			if (rn == '\n') par.line += 1;
			yield rn;
		};
		if (rn != '-') continue;
		const rn = match (bufio::scanrune(par.in)?) {
		case io::EOF =>
			return syntaxerr;
			return par.line: syntaxerr;
		case let rn: rune =>
			yield rn;
		};
		if (rn == '>') break;
		switch (rn) {
		case '>' =>
			break;
		case '\n' =>
			par.line += 1;
		case => void;
		};
	};
	return scan(par);
};


@@ 218,31 230,39 @@ fn scan_cdata(par: *parser) (text | error) = {
	for (true) {
		const rn = match (bufio::scanrune(par.in)?) {
		case io::EOF =>
			return syntaxerr;
			return par.line: syntaxerr;
		case let rn: rune =>
			yield rn;
		};
		if (rn != ']') {
			if (rn == '\n') par.line += 1;
			strio::appendrune(&par.textbuf, rn)!;
			continue;
		};
		const rn = match (bufio::scanrune(par.in)?) {
		case io::EOF =>
			return syntaxerr;
			return par.line: syntaxerr;
		case let rn: rune =>
			yield rn;
		};
		if (rn != ']') {
			if (rn == '\n') par.line += 1;
			strio::appendrune(&par.textbuf, rn)!;
			continue;
		};
		const rn = match (bufio::scanrune(par.in)?) {
		case io::EOF =>
			return syntaxerr;
			return par.line: syntaxerr;
		case let rn: rune =>
			yield rn;
		};
		if (rn == '>') break;
		switch (rn) {
		case '>' =>
			break;
		case '\n' =>
			par.line += 1;
		case => void;
		};
		strio::appendrune(&par.textbuf, rn)!;
	};
	return strio::string(&par.textbuf): text;


@@ 261,6 281,9 @@ fn scan_content(par: *parser) (text | error) = {
		case '&', '%' =>
			bufio::unreadrune(par.in, rn);
			yield scan_entity(par)?;
		case '\n' =>
			par.line += 1;
			yield rn;
		case =>
			yield rn;
		};


@@ 274,11 297,14 @@ fn scan_element(par: *parser) (token | error) = {
	let close = false;
	match (bufio::scanrune(par.in)?) {
	case io::EOF =>
		return syntaxerr;
		return par.line: syntaxerr;
	case let rn: rune =>
		switch (rn) {
		case '/' =>
			close = true;
		case '\n' =>
			par.line += 1;
			bufio::unreadrune(par.in, rn);
		case =>
			bufio::unreadrune(par.in, rn);
		};


@@ 297,7 323,7 @@ fn scan_entity(par: *parser) (rune | error) = {
	want(par, '&')?;
	let rn = match (bufio::scanrune(par.in)?) {
	case io::EOF =>
		return syntaxerr;
		return par.line: syntaxerr;
	case let rn: rune =>
		yield rn;
	};


@@ 305,7 331,9 @@ fn scan_entity(par: *parser) (rune | error) = {
	case '#' =>
		return scan_charref(par);
	case '%' =>
		return syntaxerr; // XXX: Deliberate omission: PEReference
		return par.line: syntaxerr; // XXX: Deliberate omission: PEReference
	case '\n' =>
		return par.line: syntaxerr;
	case =>
		bufio::unreadrune(par.in, rn);
		return scan_namedent(par);


@@ 316,7 344,7 @@ fn scan_charref(par: *parser) (rune | error) = {
	let base = strconv::base::DEC;
	match (bufio::scanrune(par.in)?) {
	case io::EOF =>
		return syntaxerr;
		return par.line: syntaxerr;
	case let rn: rune =>
		if (rn == 'x') {
			base = strconv::base::HEX;


@@ 329,7 357,7 @@ fn scan_charref(par: *parser) (rune | error) = {
	for (true) {
		let rn = match (bufio::scanrune(par.in)?) {
		case io::EOF =>
			return syntaxerr;
			return par.line: syntaxerr;
		case let rn: rune =>
			yield rn;
		};


@@ 338,17 366,17 @@ fn scan_charref(par: *parser) (rune | error) = {
		} else if (rn == ';') {
			break;
		} else {
			return syntaxerr;
			return par.line: syntaxerr;
		};
	};
	if (len(strio::string(&par.entbuf)) == 0) {
		return syntaxerr;
		return par.line: syntaxerr;
	};
	match (strconv::stou32b(strio::string(&par.entbuf), base)) {
	case let u: u32 =>
		return u: rune;
	case (strconv::invalid | strconv::overflow) =>
		return syntaxerr;
		return par.line: syntaxerr;
	};
};



@@ 369,7 397,7 @@ fn scan_namedent(par: *parser) (rune | error) = {
	};
	// XXX: Deliberate omission: this only supports the pre-defined
	// entities as defined by XML 1.0 (Fifth Edition) section 4.6.
	return syntaxerr;
	return par.line: syntaxerr;
};

fn scan_name(par: *parser, buf: *strio::dynamic_stream) (str | error) = {


@@ 377,18 405,18 @@ fn scan_name(par: *parser, buf: *strio::dynamic_stream) (str | error) = {

	const rn = match (bufio::scanrune(par.in)?) {
	case io::EOF =>
		return syntaxerr;
		return par.line: syntaxerr;
	case let rn: rune =>
		yield rn;
	};
	if (!isnamestart(rn)) {
		return syntaxerr;
		return par.line: syntaxerr;
	};
	strio::appendrune(buf, rn)!;

	for (true) match (bufio::scanrune(par.in)?) {
	case io::EOF =>
		return syntaxerr;
		return par.line: syntaxerr;
	case let rn: rune =>
		if (isname(rn)) {
			strio::appendrune(buf, rn)!;


@@ 436,7 464,7 @@ fn prolog(par: *parser) (void | error) = {
	if (encoding) {
		let attr = scan_attr(par)? as attribute;
		if (attr.0 != "encoding") {
			return syntaxerr;
			return par.line: syntaxerr;
		};
		// XXX: Deliberate omission: all values other than utf-8
		match (ascii::strcasecmp(attr.1, "utf-8")) {


@@ 460,15 488,15 @@ fn prolog(par: *parser) (void | error) = {
	if (standalone) {
		let attr = scan_attr(par)? as attribute;
		if (attr.0 != "standalone") {
			return syntaxerr;
			return par.line: syntaxerr;
		};
		// XXX: Deliberate omission: non-standalone documents
		match (ascii::strcasecmp(attr.1, "yes")) {
		case void =>
			return syntaxerr;
			return par.line: syntaxerr;
		case let n: int =>
			if (n != 0) {
				return syntaxerr;
				return par.line: syntaxerr;
			};
		};
	};


@@ 490,10 518,10 @@ fn quote(par: *parser) (rune | error) = {
		case '"', '\'' =>
			return rn;
		case =>
			return syntaxerr;
			return par.line: syntaxerr;
		};
	case =>
		return syntaxerr;
		return par.line: syntaxerr;
	};
};



@@ 503,12 531,15 @@ fn want(par: *parser, tok: (rune | str | whitespace)...) (bool | error) = {
	case let x: rune =>
		let have = match (bufio::scanrune(par.in)?) {
		case io::EOF =>
			return syntaxerr;
			return par.line: syntaxerr;
		case let rn: rune =>
			yield rn;
		};
		if (have != x) {
			return syntaxerr;
			return par.line: syntaxerr;
		};
		if (x == '\n') {
			par.line += 1;
		};
	case let x: str =>
		let iter = strings::iter(x);


@@ 528,9 559,12 @@ fn want(par: *parser, tok: (rune | str | whitespace)...) (bool | error) = {
				bufio::unreadrune(par.in, rn);
				break;
			};
			if (rn == '\n') {
				par.line += 1;
			};
		};
		if (ws && n < 1) {
			return syntaxerr;
			return par.line: syntaxerr;
		};
		hadws = n >= 1;
	};

M types.ha => types.ha +13 -8
@@ 4,6 4,7 @@
// (c) 2021 Eyal Sawady <ecs@d2evs.net>
use bufio;
use encoding::utf8;
use fmt;
use io;
use os;
use strio;


@@ 14,6 15,7 @@ export type parser = struct {
	close: bool,
	state: state,
	tags: []str,
	line: size,

	// strio buffers:
	namebuf: strio::dynamic_stream,


@@ 43,17 45,20 @@ export type text = str;
export type token = (elementstart | elementend | attribute | text);

// A syntax error was encountered in the document.
export type syntaxerr = !void; // TODO: Add line number?
export type syntaxerr = !size;

// Any error which can occur during XML parsing.
export type error = !(syntaxerr | utf8::invalid | io::error);

// Converts an [[error]] to a user-friendly string representation.
export fn strerror(err: error) const str = match (err) {
case syntaxerr =>
	yield "Syntax error";
case utf8::invalid =>
	yield "Document is not valid UTF-8";
case let err: io::error =>
	yield io::strerror(err);
// Returns a human-readable description of an XML parsing [[error]].
//
// For a [[syntaxerr]] the message includes the line number carried by the
// error value. That message is formatted into a statically allocated buffer,
// so the returned string is only valid until the next call that formats a
// syntax error; duplicate it if it must outlive that.
export fn strerror(err: error) const str = {
	// Scratch space for the formatted syntax error message. Declared static
	// so the string returned below remains usable after this function
	// returns.
	static let buf: [2048]u8 = [0...];
	match (err) {
	case let err: syntaxerr =>
		// syntaxerr is a size holding the line number; cast it back to
		// size for formatting.
		return fmt::bsprintf(buf, "Syntax error on line {}", err: size);
	case utf8::invalid =>
		return "Document is not valid UTF-8";
	case let err: io::error =>
		// I/O errors are described by the io module itself.
		return io::strerror(err);
	};
};