3a53f16966e476d19265aa66feacfb229b58ee02 — Martin Angers 5 months ago d8fadd0
internal/bootstrap/scanner: more scanner tests
2 files changed, 318 insertions(+), 3 deletions(-)

M internal/bootstrap/scanner/scanner.go
M internal/bootstrap/scanner/scanner_test.go
M internal/bootstrap/scanner/scanner.go => internal/bootstrap/scanner/scanner.go +23 -3
@@ 8,6 8,7 @@ import (
  	"fmt"
  	"go/scanner"
+ 	"io"
  	"unicode"
  	"unicode/utf8"
  


@@ 24,6 25,13 @@ // ErrorList is a list of errors.
  type ErrorList = scanner.ErrorList
  
+ // PrintError writes err to w. If err is an ErrorList, the errors are
+ // printed one per line; otherwise the err string is printed.
+ // It delegates directly to PrintError from the go/scanner package.
+ func PrintError(w io.Writer, err error) {
+ 	scanner.PrintError(w, err)
+ }
+ 
  // A Scanner holds the scanner's internal state while processing a
  // given input. It can be allocated as part of another data structure
  // but must be initialized via Init before use.


@@ 164,7 172,11 @@   		case '<':
  			if s.rn != '-' {
- 				s.error(s.curpos-1, fmt.Sprintf("incomplete arrow symbol: illegal character %#U", s.rn))
+ 				msg := fmt.Sprintf("incomplete arrow symbol: illegal character %#U", s.rn)
+ 				if s.rn == eof {
+ 					msg = "arrow symbol not terminated"
+ 				}
+ 				s.error(s.curpos-1, msg)
  				break
  			}
  			s.advance()


@@ 190,7 202,11 @@   		case '$':
  			if s.rn != '{' {
- 				s.error(s.curpos-1, fmt.Sprintf("invalid state coderef: illegal character %#U", s.rn))
+ 				msg := fmt.Sprintf("invalid state coderef: illegal character %#U", s.rn)
+ 				if s.rn == eof {
+ 					msg = "state coderef not terminated"
+ 				}
+ 				s.error(s.curpos-1, msg)
  				break
  			}
  			tok = token.StateCoderef


@@ 198,7 214,11 @@   		case '@':
  			if s.rn != '{' {
- 				s.error(s.curpos-1, fmt.Sprintf("invalid state coderef: illegal character %#U", s.rn))
+ 				msg := fmt.Sprintf("invalid action coderef: illegal character %#U", s.rn)
+ 				if s.rn == eof {
+ 					msg = "action coderef not terminated"
+ 				}
+ 				s.error(s.curpos-1, msg)
  				break
  			}
  			tok = token.ActionCoderef

M internal/bootstrap/scanner/scanner_test.go => internal/bootstrap/scanner/scanner_test.go +295 -0
@@ 1,7 1,11 @@ package scanner
  
  import (
+ 	"fmt"
+ 	"os"
+ 	"strings"
  	"testing"
+ 	"unicode/utf8"
  
  	"git.sr.ht/~mna/fastpeg/internal/bootstrap/token"
  	"github.com/stretchr/testify/require"


@@ 90,6 94,297 @@ require.NoError(t, s.Err())
  }
  
+ // TestScanner_Errors is a table-driven test of the scanner's error
+ // reporting. Each case is scanned with scanAll, the resulting tuples are
+ // compared to the expected ones, and the scanner's accumulated error is
+ // checked against the expected message substring.
+ func TestScanner_Errors(t *testing.T) {
+ 	cases := []struct {
+ 		input  string
+ 		output []tuple
+ 		// errMsg is a substring that must appear in s.Err(); an empty
+ 		// value means the input must scan without error.
+ 		errMsg string
+ 	}{
+ 		{"", nil, ""},
+ 		// A byte-order mark is accepted (and yields no token) at the
+ 		// start of the input, but is reported as illegal elsewhere.
+ 		{
+ 			string(rune(bom)) + "a",
+ 			[]tuple{{token.Identifier, "a"}},
+ 			"",
+ 		},
+ 		{
+ 			"a" + string(rune(bom)),
+ 			[]tuple{
+ 				{token.Identifier, "a"},
+ 				{token.Illegal, "\ufeff"},
+ 			},
+ 			"illegal byte-order mark",
+ 		},
+ 		{
+ 			"\x00",
+ 			[]tuple{{token.Illegal, "\x00"}},
+ 			"illegal character NUL",
+ 		},
+ 		{
+ 			"\xff",
+ 			[]tuple{{token.Illegal, string(utf8.RuneError)}},
+ 			"illegal UTF-8 encoding",
+ 		},
+ 		// Incomplete arrow and coderef introducers, followed either by
+ 		// an unexpected character or by the end of the input.
+ 		{
+ 			"<Z",
+ 			[]tuple{{token.Illegal, ""}, {token.Identifier, "Z"}},
+ 			"incomplete arrow symbol",
+ 		},
+ 		{
+ 			"<",
+ 			[]tuple{{token.Illegal, ""}},
+ 			"arrow symbol not terminated",
+ 		},
+ 		{
+ 			"$Z",
+ 			[]tuple{{token.Illegal, ""}, {token.Identifier, "Z"}},
+ 			"invalid state coderef",
+ 		},
+ 		{
+ 			"$",
+ 			[]tuple{{token.Illegal, ""}},
+ 			"state coderef not terminated",
+ 		},
+ 		{
+ 			"${",
+ 			[]tuple{{token.StateCoderef, ""}},
+ 			"coderef not terminated",
+ 		},
+ 		{
+ 			"@Z",
+ 			[]tuple{{token.Illegal, ""}, {token.Identifier, "Z"}},
+ 			"invalid action coderef",
+ 		},
+ 		{
+ 			"@",
+ 			[]tuple{{token.Illegal, ""}},
+ 			"action coderef not terminated",
+ 		},
+ 		{
+ 			"@{",
+ 			[]tuple{{token.ActionCoderef, ""}},
+ 			"coderef not terminated",
+ 		},
+ 		// Unlike '$' and '@', a lone '&' is a valid token on its own;
+ 		// only the unterminated "&{" form is an error.
+ 		{
+ 			"&Z",
+ 			[]tuple{{token.Ampersand, ""}, {token.Identifier, "Z"}},
+ 			"",
+ 		},
+ 		{
+ 			"&",
+ 			[]tuple{{token.Ampersand, ""}},
+ 			"",
+ 		},
+ 		{
+ 			"&{",
+ 			[]tuple{{token.PredCoderef, ""}},
+ 			"coderef not terminated",
+ 		},
+ 		// Literals left open at end of input or at a newline.
+ 		{
+ 			`"`,
+ 			[]tuple{{token.Literal, `"`}},
+ 			"literal not terminated",
+ 		},
+ 		{
+ 			`'`,
+ 			[]tuple{{token.Literal, `'`}},
+ 			"literal not terminated",
+ 		},
+ 		{
+ 			`"a
+       b`,
+ 			[]tuple{{token.Literal, `"a`}, {token.Identifier, "b"}},
+ 			"literal not terminated",
+ 		},
+ 		{
+ 			`'a
+       b`,
+ 			[]tuple{{token.Literal, `'a`}, {token.Identifier, "b"}},
+ 			"literal not terminated",
+ 		},
+ 		// Character classes: unterminated classes and invalid or
+ 		// truncated escape sequences inside them.
+ 		{
+ 			`[`,
+ 			[]tuple{{token.CharClass, `[`}},
+ 			"character class not terminated",
+ 		},
+ 		{
+ 			`[a
+       b`,
+ 			[]tuple{{token.CharClass, `[a`}, {token.Identifier, "b"}},
+ 			"character class not terminated",
+ 		},
+ 		{
+ 			`[\z]`,
+ 			[]tuple{{token.CharClass, `[\z]`}},
+ 			"unknown escape sequence",
+ 		},
+ 		{
+ 			`[\`,
+ 			[]tuple{{token.CharClass, `[\`}},
+ 			"escape sequence not terminated",
+ 		},
+ 		{
+ 			`[\x`,
+ 			[]tuple{{token.CharClass, `[\x`}},
+ 			"escape sequence not terminated",
+ 		},
+ 		{
+ 			`[\x0`,
+ 			[]tuple{{token.CharClass, `[\x0`}},
+ 			"escape sequence not terminated",
+ 		},
+ 		{
+ 			`[\u000`,
+ 			[]tuple{{token.CharClass, `[\u000`}},
+ 			"escape sequence not terminated",
+ 		},
+ 		{
+ 			`[\U0000001`,
+ 			[]tuple{{token.CharClass, `[\U0000001`}},
+ 			"escape sequence not terminated",
+ 		},
+ 		{
+ 			`[\x0G]`,
+ 			[]tuple{{token.CharClass, `[\x0G]`}},
+ 			"illegal character U+0047",
+ 		},
+ 		{
+ 			`[\u000G]`,
+ 			[]tuple{{token.CharClass, `[\u000G]`}},
+ 			"illegal character U+0047",
+ 		},
+ 		{
+ 			`[\U0000000G]`,
+ 			[]tuple{{token.CharClass, `[\U0000000G]`}},
+ 			"illegal character U+0047",
+ 		},
+ 		{
+ 			`[\uD800]`,
+ 			[]tuple{{token.CharClass, `[\uD800]`}},
+ 			"invalid Unicode code point",
+ 		},
+ 		{
+ 			`[\U99999999]`,
+ 			[]tuple{{token.CharClass, `[\U99999999]`}},
+ 			"invalid Unicode code point",
+ 		},
+ 		// Unicode class escapes (\p and \P) inside character classes.
+ 		{
+ 			`[\p`,
+ 			[]tuple{{token.CharClass, `[\p`}},
+ 			"escape sequence not terminated",
+ 		},
+ 		{
+ 			`[\p{`,
+ 			[]tuple{{token.CharClass, `[\p{`}},
+ 			"escape sequence not terminated",
+ 		},
+ 		{
+ 			`[\P`,
+ 			[]tuple{{token.CharClass, `[\P`}},
+ 			"escape sequence not terminated",
+ 		},
+ 		{
+ 			`[\P{`,
+ 			[]tuple{{token.CharClass, `[\P{`}},
+ 			"escape sequence not terminated",
+ 		},
+ 		{
+ 			`[\PA]`,
+ 			[]tuple{{token.CharClass, `[\PA]`}},
+ 			"invalid Unicode class",
+ 		},
+ 		{
+ 			`[\p{abc`,
+ 			[]tuple{{token.CharClass, `[\p{abc`}},
+ 			"escape sequence not terminated",
+ 		},
+ 		{
+ 			`[\p{abc
+       d`,
+ 			[]tuple{{token.CharClass, `[\p{abc`}, {token.Identifier, "d"}},
+ 			"escape sequence not terminated",
+ 		},
+ 	}
+ 	for _, c := range cases {
+ 		// The raw input doubles as the subtest name.
+ 		t.Run(c.input, func(t *testing.T) {
+ 			var s Scanner
+ 			result := scanAll(&s, c.input)
+ 			require.Equal(t, c.output, result)
+ 
+ 			if c.errMsg == "" {
+ 				require.NoError(t, s.Err())
+ 				return
+ 			}
+ 
+ 			err := s.Err()
+ 			require.Error(t, err)
+ 			// Dump the error(s) to stderr so they can be inspected
+ 			// in the test output.
+ 			PrintError(os.Stderr, err)
+ 			require.Contains(t, err.Error(), c.errMsg)
+ 		})
+ 	}
+ }
+ 
+ // TestScanner_Pos checks the positions reported by Scan. Several inputs
+ // are registered as separate files in one FileSet and scanned in turn
+ // with the same Scanner; every token is rendered as
+ // "filename:line:column:offset: token[literal]" and the whole listing is
+ // compared with the expected text.
+ func TestScanner_Pos(t *testing.T) {
+ 	// NOTE: ⟵ is 3 bytes, ← too, BOM too (all in utf-8), so the expected
+ 	// columns and offsets below advance by 3 over each of them.
+ 
+ 	// list of files sent to the scanner in the same FileSet
+ 	inputs := []string{
+ 		` start ⟵ expression*
+ expression <- factor (_ factor)?
+ `,
+ 		`factor = "lit"
+             | 'lat';`,
+ 		"\ufeff\n\tchars←[\\a-\\r]+",
+ 	}
+ 
+ 	// One line per scanned token, EOF included; the last number on each
+ 	// line is the Offset field of the resolved position.
+ 	expect := `
+ test-0:1:2:1: Identifier[start]
+ test-0:1:8:7: Arrow[⟵]
+ test-0:1:12:11: Identifier[expression]
+ test-0:1:22:21: *[]
+ test-0:2:1:23: Identifier[expression]
+ test-0:2:12:34: Arrow[<-]
+ test-0:2:15:37: Identifier[factor]
+ test-0:2:22:44: ([]
+ test-0:2:23:45: Identifier[_]
+ test-0:2:25:47: Identifier[factor]
+ test-0:2:31:53: )[]
+ test-0:2:32:54: ?[]
+ test-0:2:34:56: EOF[]
+ test-1:1:1:0: Identifier[factor]
+ test-1:1:8:7: Arrow[=]
+ test-1:1:10:9: Literal["lit"]
+ test-1:2:13:27: Separator[|]
+ test-1:2:15:29: Literal['lat']
+ test-1:2:20:34: ;[]
+ test-1:2:21:35: EOF[]
+ test-2:2:2:5: Identifier[chars]
+ test-2:2:7:10: Arrow[←]
+ test-2:2:10:13: CharClass[[\a-\r]]
+ test-2:2:17:20: +[]
+ test-2:2:18:21: EOF[]
+ `
+ 
+ 	var s Scanner
+ 	var result strings.Builder
+ 
+ 	fs := token.NewFileSet()
+ 	for i, input := range inputs {
+ 		// NOTE(review): the -1 base presumably asks the FileSet to pick
+ 		// the next available base, as in go/token — confirm against the
+ 		// project's token package.
+ 		f := fs.AddFile(fmt.Sprintf("test-%d", i), -1, len(input))
+ 		// The same Scanner value is re-initialized for each file.
+ 		s.Init(f, []byte(input))
+ 
+ 		for {
+ 			pos, tok, lit := s.Scan()
+ 			lpos := fs.Position(pos)
+ 			fmt.Fprintf(&result, "%s:%d:%d:%d: %s[%s]\n", lpos.Filename, lpos.Line, lpos.Column, lpos.Offset, tok, lit)
+ 			// EOF is recorded above before leaving the loop.
+ 			if tok == token.EOF {
+ 				break
+ 			}
+ 		}
+ 	}
+ 
+ 	// Surrounding whitespace is trimmed so the leading/trailing newlines
+ 	// of the raw string literal do not matter.
+ 	require.Equal(t, strings.TrimSpace(expect), strings.TrimSpace(result.String()))
+ }
+ 
  func scanAll(s *Scanner, input string) []tuple {
  	var result []tuple