~mna/fastpeg

3a53f16966e476d19265aa66feacfb229b58ee02 — Martin Angers 8 months ago d8fadd0
internal/bootstrap/scanner: more scanner tests
2 files changed, 318 insertions(+), 3 deletions(-)

M internal/bootstrap/scanner/scanner.go
M internal/bootstrap/scanner/scanner_test.go
M internal/bootstrap/scanner/scanner.go => internal/bootstrap/scanner/scanner.go +23 -3
@@ 8,6 8,7 @@ package scanner
import (
	"fmt"
	"go/scanner"
	"io"
	"unicode"
	"unicode/utf8"



@@ 24,6 25,13 @@ const (
// ErrorList is a list of errors.
type ErrorList = scanner.ErrorList

// PrintError is a utility function that prints a list of errors
// to w, one error per line, if the err parameter is an ErrorList.
// Otherwise it prints the err string.
//
// It forwards w and err unchanged to PrintError in go/scanner;
// ErrorList is an alias for that package's ErrorList type, so
// both functions agree on what counts as a list of errors.
func PrintError(w io.Writer, err error) {
	scanner.PrintError(w, err)
}

// A Scanner holds the scanner's internal state while processing a
// given input. It can be allocated as part of another data structure
// but must be initialized via Init before use.


@@ 164,7 172,11 @@ func (s *Scanner) Scan() (pos token.Pos, tok token.Token, lit string) {

		case '<':
			if s.rn != '-' {
				s.error(s.curpos-1, fmt.Sprintf("incomplete arrow symbol: illegal character %#U", s.rn))
				msg := fmt.Sprintf("incomplete arrow symbol: illegal character %#U", s.rn)
				if s.rn == eof {
					msg = "arrow symbol not terminated"
				}
				s.error(s.curpos-1, msg)
				break
			}
			s.advance()


@@ 190,7 202,11 @@ func (s *Scanner) Scan() (pos token.Pos, tok token.Token, lit string) {

		case '$':
			if s.rn != '{' {
				s.error(s.curpos-1, fmt.Sprintf("invalid state coderef: illegal character %#U", s.rn))
				msg := fmt.Sprintf("invalid state coderef: illegal character %#U", s.rn)
				if s.rn == eof {
					msg = "state coderef not terminated"
				}
				s.error(s.curpos-1, msg)
				break
			}
			tok = token.StateCoderef


@@ 198,7 214,11 @@ func (s *Scanner) Scan() (pos token.Pos, tok token.Token, lit string) {

		case '@':
			if s.rn != '{' {
				s.error(s.curpos-1, fmt.Sprintf("invalid state coderef: illegal character %#U", s.rn))
				msg := fmt.Sprintf("invalid action coderef: illegal character %#U", s.rn)
				if s.rn == eof {
					msg = "action coderef not terminated"
				}
				s.error(s.curpos-1, msg)
				break
			}
			tok = token.ActionCoderef

M internal/bootstrap/scanner/scanner_test.go => internal/bootstrap/scanner/scanner_test.go +295 -0
@@ 1,7 1,11 @@
package scanner

import (
	"fmt"
	"os"
	"strings"
	"testing"
	"unicode/utf8"

	"git.sr.ht/~mna/fastpeg/internal/bootstrap/token"
	"github.com/stretchr/testify/require"


@@ 90,6 94,297 @@ func TestScanner_Escapes(t *testing.T) {
	require.NoError(t, s.Err())
}

// TestScanner_Errors runs the scanner over a table of mostly malformed
// inputs (plus a few valid ones). For each input it checks the tokens
// returned by scanAll and, when wantErr is non-empty, that the scanner's
// error contains that substring; an empty wantErr means no error is
// expected at all.
func TestScanner_Errors(t *testing.T) {
	tests := []struct {
		src     string  // raw input handed to the scanner
		want    []tuple // expected result of scanAll
		wantErr string  // expected substring of the error, "" for none
	}{
		// Empty input and byte-order marks.
		{src: "", want: nil, wantErr: ""},
		{src: string(rune(bom)) + "a", want: []tuple{{token.Identifier, "a"}}, wantErr: ""},
		{src: "a" + string(rune(bom)), want: []tuple{{token.Identifier, "a"}, {token.Illegal, "\ufeff"}}, wantErr: "illegal byte-order mark"},

		// Illegal bytes.
		{src: "\x00", want: []tuple{{token.Illegal, "\x00"}}, wantErr: "illegal character NUL"},
		{src: "\xff", want: []tuple{{token.Illegal, string(utf8.RuneError)}}, wantErr: "illegal UTF-8 encoding"},

		// Arrow symbol.
		{src: "<Z", want: []tuple{{token.Illegal, ""}, {token.Identifier, "Z"}}, wantErr: "incomplete arrow symbol"},
		{src: "<", want: []tuple{{token.Illegal, ""}}, wantErr: "arrow symbol not terminated"},

		// State coderef.
		{src: "$Z", want: []tuple{{token.Illegal, ""}, {token.Identifier, "Z"}}, wantErr: "invalid state coderef"},
		{src: "$", want: []tuple{{token.Illegal, ""}}, wantErr: "state coderef not terminated"},
		{src: "${", want: []tuple{{token.StateCoderef, ""}}, wantErr: "coderef not terminated"},

		// Action coderef.
		{src: "@Z", want: []tuple{{token.Illegal, ""}, {token.Identifier, "Z"}}, wantErr: "invalid action coderef"},
		{src: "@", want: []tuple{{token.Illegal, ""}}, wantErr: "action coderef not terminated"},
		{src: "@{", want: []tuple{{token.ActionCoderef, ""}}, wantErr: "coderef not terminated"},

		// Ampersand and predicate coderef.
		{src: "&Z", want: []tuple{{token.Ampersand, ""}, {token.Identifier, "Z"}}, wantErr: ""},
		{src: "&", want: []tuple{{token.Ampersand, ""}}, wantErr: ""},
		{src: "&{", want: []tuple{{token.PredCoderef, ""}}, wantErr: "coderef not terminated"},

		// Literals.
		{src: `"`, want: []tuple{{token.Literal, `"`}}, wantErr: "literal not terminated"},
		{src: `'`, want: []tuple{{token.Literal, `'`}}, wantErr: "literal not terminated"},
		{src: `"a
      b`, want: []tuple{{token.Literal, `"a`}, {token.Identifier, "b"}}, wantErr: "literal not terminated"},
		{src: `'a
      b`, want: []tuple{{token.Literal, `'a`}, {token.Identifier, "b"}}, wantErr: "literal not terminated"},

		// Character classes.
		{src: `[`, want: []tuple{{token.CharClass, `[`}}, wantErr: "character class not terminated"},
		{src: `[a
      b`, want: []tuple{{token.CharClass, `[a`}, {token.Identifier, "b"}}, wantErr: "character class not terminated"},

		// Escape sequences inside character classes.
		{src: `[\z]`, want: []tuple{{token.CharClass, `[\z]`}}, wantErr: "unknown escape sequence"},
		{src: `[\`, want: []tuple{{token.CharClass, `[\`}}, wantErr: "escape sequence not terminated"},
		{src: `[\x`, want: []tuple{{token.CharClass, `[\x`}}, wantErr: "escape sequence not terminated"},
		{src: `[\x0`, want: []tuple{{token.CharClass, `[\x0`}}, wantErr: "escape sequence not terminated"},
		{src: `[\u000`, want: []tuple{{token.CharClass, `[\u000`}}, wantErr: "escape sequence not terminated"},
		{src: `[\U0000001`, want: []tuple{{token.CharClass, `[\U0000001`}}, wantErr: "escape sequence not terminated"},
		{src: `[\x0G]`, want: []tuple{{token.CharClass, `[\x0G]`}}, wantErr: "illegal character U+0047"},
		{src: `[\u000G]`, want: []tuple{{token.CharClass, `[\u000G]`}}, wantErr: "illegal character U+0047"},
		{src: `[\U0000000G]`, want: []tuple{{token.CharClass, `[\U0000000G]`}}, wantErr: "illegal character U+0047"},
		{src: `[\uD800]`, want: []tuple{{token.CharClass, `[\uD800]`}}, wantErr: "invalid Unicode code point"},
		{src: `[\U99999999]`, want: []tuple{{token.CharClass, `[\U99999999]`}}, wantErr: "invalid Unicode code point"},

		// Unicode class escapes.
		{src: `[\p`, want: []tuple{{token.CharClass, `[\p`}}, wantErr: "escape sequence not terminated"},
		{src: `[\p{`, want: []tuple{{token.CharClass, `[\p{`}}, wantErr: "escape sequence not terminated"},
		{src: `[\P`, want: []tuple{{token.CharClass, `[\P`}}, wantErr: "escape sequence not terminated"},
		{src: `[\P{`, want: []tuple{{token.CharClass, `[\P{`}}, wantErr: "escape sequence not terminated"},
		{src: `[\PA]`, want: []tuple{{token.CharClass, `[\PA]`}}, wantErr: "invalid Unicode class"},
		{src: `[\p{abc`, want: []tuple{{token.CharClass, `[\p{abc`}}, wantErr: "escape sequence not terminated"},
		{src: `[\p{abc
      d`, want: []tuple{{token.CharClass, `[\p{abc`}, {token.Identifier, "d"}}, wantErr: "escape sequence not terminated"},
	}

	for _, tc := range tests {
		t.Run(tc.src, func(t *testing.T) {
			var sc Scanner
			got := scanAll(&sc, tc.src)
			require.Equal(t, tc.want, got)

			err := sc.Err()
			if tc.wantErr != "" {
				require.Error(t, err)
				// Dump the full error to stderr to ease debugging of a
				// failing case; this also exercises PrintError.
				PrintError(os.Stderr, err)
				require.Contains(t, err.Error(), tc.wantErr)
				return
			}
			require.NoError(t, err)
		})
	}
}

// TestScanner_Pos scans several inputs registered as separate files in
// one FileSet, reusing a single Scanner, and checks the filename, line,
// column and offset reported for every token up to and including EOF.
func TestScanner_Pos(t *testing.T) {
	// One line per scanned token: file:line:column:offset: Token[literal].
	// Keep in mind that ⟵, ← and the BOM each take 3 bytes in UTF-8.
	const want = `
test-0:1:2:1: Identifier[start]
test-0:1:8:7: Arrow[⟵]
test-0:1:12:11: Identifier[expression]
test-0:1:22:21: *[]
test-0:2:1:23: Identifier[expression]
test-0:2:12:34: Arrow[<-]
test-0:2:15:37: Identifier[factor]
test-0:2:22:44: ([]
test-0:2:23:45: Identifier[_]
test-0:2:25:47: Identifier[factor]
test-0:2:31:53: )[]
test-0:2:32:54: ?[]
test-0:2:34:56: EOF[]
test-1:1:1:0: Identifier[factor]
test-1:1:8:7: Arrow[=]
test-1:1:10:9: Literal["lit"]
test-1:2:13:27: Separator[|]
test-1:2:15:29: Literal['lat']
test-1:2:20:34: ;[]
test-1:2:21:35: EOF[]
test-2:2:2:5: Identifier[chars]
test-2:2:7:10: Arrow[←]
test-2:2:10:13: CharClass[[\a-\r]]
test-2:2:17:20: +[]
test-2:2:18:21: EOF[]
`

	// Each entry becomes its own file ("test-<index>") in the FileSet.
	sources := []string{
		` start ⟵ expression*
expression <- factor (_ factor)?
`,
		`factor = "lit"
            | 'lat';`,
		"\ufeff\n\tchars←[\\a-\\r]+",
	}

	var (
		sc  Scanner
		got strings.Builder
	)

	fset := token.NewFileSet()
	for idx, src := range sources {
		file := fset.AddFile(fmt.Sprintf("test-%d", idx), -1, len(src))
		sc.Init(file, []byte(src))

		// Record every token of this file, the EOF token included.
		for done := false; !done; {
			pos, tok, lit := sc.Scan()
			where := fset.Position(pos)
			fmt.Fprintf(&got, "%s:%d:%d:%d: %s[%s]\n", where.Filename, where.Line, where.Column, where.Offset, tok, lit)
			done = tok == token.EOF
		}
	}

	require.Equal(t, strings.TrimSpace(want), strings.TrimSpace(got.String()))
}

func scanAll(s *Scanner, input string) []tuple {
	var result []tuple