@@ 0,0 1,238 @@
+// Package scanner implements a scanner for fastpeg input. It takes a
+// []byte as source which can then be tokenized through repeated calls
+// to the Scan method.
+package scanner
+
+import (
+ "fmt"
+ "unicode"
+ "unicode/utf8"
+
+ "git.sr.ht/~mna/fastpeg/internal/bootstrap/token"
+)
+
+// Sentinel rune values recognized by the scanner.
+const (
+	// eof is the pseudo-rune stored as the current rune once the input
+	// is exhausted.
+	eof = -1
+	// bom is the Unicode byte order mark; it is only tolerated as the
+	// very first character of the input.
+	bom = 0xFEFF
+)
+
+// Scanner carries the tokenizer state for a single input. It may be
+// embedded in or allocated as part of another data structure, but Init
+// must be called before any other use.
+type Scanner struct {
+	// Set by Init; not modified while scanning.
+	file *token.File // receives line offsets and resolves positions
+	src  []byte      // input being scanned
+
+	// Updated as the scanner moves through src.
+	rn rune // rune currently under the scanner (eof at end of input)
+	// Byte offsets into src: curpos is where rn starts, nextpos is where
+	// the rune following rn starts, and linepos is where the current
+	// line starts.
+	curpos, nextpos, linepos int
+}
+
+// Init readies s for tokenizing src from its first byte. Line offsets
+// discovered while scanning are recorded on file, which is also used to
+// build token positions. A byte order mark at the very start of src is
+// skipped.
+//
+// Init panics when file.Size() differs from len(src).
+func (s *Scanner) Init(file *token.File, src []byte) {
+	if want, got := file.Size(), len(src); want != got {
+		panic(fmt.Sprintf("file size does not match input size: %d bytes vs %d bytes", want, got))
+	}
+
+	// Reset every field so the same Scanner can be reused for a new input.
+	s.file, s.src = file, src
+	s.rn = 0
+	s.curpos, s.nextpos, s.linepos = 0, 0, 0
+
+	// Load the first rune, stepping over a leading byte order mark.
+	s.advance()
+	if s.rn != bom {
+		return
+	}
+	s.advance()
+}
+
+// advance moves the scanner to the next rune of src, updating rn, curpos
+// and nextpos. When the rune being left behind is a newline, the offset
+// that follows it is recorded on the file as the start of a new line.
+//
+// Once the input is exhausted, rn is set to eof (-1) and curpos to
+// len(src); calling advance again leaves the scanner in that state.
+func (s *Scanner) advance() {
+	atEOF := s.nextpos >= len(s.src)
+
+	s.curpos = s.nextpos
+	if atEOF {
+		s.curpos = len(s.src)
+	}
+	if s.rn == '\n' {
+		// the rune just consumed ended a line; the next one starts at curpos
+		s.linepos = s.curpos
+		s.file.AddLine(s.linepos)
+	}
+	if atEOF {
+		// Stop here: there is no byte left to decode, and indexing
+		// s.src[s.nextpos] below would panic.
+		s.rn = eof
+		return
+	}
+
+	// fast path: assume a single-byte (ASCII) rune
+	rn, sz := rune(s.src[s.nextpos]), 1
+	switch {
+	case rn == 0:
+		// TODO: error, illegal NUL character
+	case rn >= utf8.RuneSelf:
+		// not ascii
+		rn, sz = utf8.DecodeRune(s.src[s.nextpos:])
+		if rn == utf8.RuneError && sz == 1 {
+			// TODO: illegal utf8 encoding
+		} else if rn == bom && s.curpos > 0 {
+			// TODO: illegal BOM mark
+		}
+	}
+
+	s.nextpos += sz
+	s.rn = rn
+}
+
+// peek returns the byte that starts the rune following the current one,
+// without advancing the scanner. It returns 0 when no input is left.
+func (s *Scanner) peek() byte {
+	if s.nextpos >= len(s.src) {
+		return 0
+	}
+	return s.src[s.nextpos]
+}
+
+// skipWhitespace advances past any run of spaces, tabs, newlines and
+// carriage returns, leaving the scanner on the first other rune.
+func (s *Scanner) skipWhitespace() {
+	for {
+		switch s.rn {
+		case ' ', '\t', '\n', '\r':
+			s.advance()
+		default:
+			return
+		}
+	}
+}
+
+// Scan scans the next token and returns the token position, the token,
+// and its literal string if applicable. The source end is indicated
+// by token.EOF.
+//
+// If the returned token is a literal, the literal string has the
+// corresponding value. If the returned token is token.Illegal, the
+// literal string is the offending character; this includes a '<' that
+// is not followed by '-'. Arrow and separator tokens carry the text
+// that was matched, and code reference tokens carry the referenced
+// identifier. In all other cases, Scan returns an empty literal string.
+//
+// Scan adds line information to the file added to the file set with
+// Init. Token positions are relative to that file and thus relative
+// to the file set.
+func (s *Scanner) Scan() (pos token.Pos, tok token.Token, lit string) {
+	s.skipWhitespace()
+
+	// the token starts at the first non-whitespace rune
+	pos = token.Pos{Pos: s.file.Pos(s.curpos)}
+	switch rn := s.rn; {
+	case isLetter(rn):
+		tok = token.Identifier
+		lit = s.scanIdentifier()
+
+	default:
+		// consume rn up front; s.rn is now the rune that follows it
+		s.advance()
+
+		switch rn {
+		case eof:
+			tok = token.EOF
+
+		case '"', '\'':
+			tok = token.Literal
+			lit = s.scanLiteral(rn)
+
+		case '[':
+			tok = token.CharClass
+			lit = s.scanCharClass()
+
+		case '<':
+			if s.rn != '-' {
+				// A lone '<' is not a token: return it as Illegal with
+				// the offending character, as documented.
+				// TODO: report error
+				tok = token.Illegal
+				lit = string(rn)
+				break
+			}
+			s.advance()
+			lit = "<-"
+			fallthrough
+		case '←', '⟵', '=':
+			tok = token.Arrow
+			if lit == "" {
+				// single-rune arrow; "<-" already set lit above
+				lit = string(rn)
+			}
+
+		case '|', '/':
+			tok = token.Separator
+			lit = string(rn)
+
+		case '&':
+			// '&' alone is the ampersand operator; "&{" starts a coderef
+			if s.rn != '{' {
+				tok = token.Ampersand
+				break
+			}
+			tok = token.PredCoderef
+			lit = s.scanCoderef()
+
+		case '$':
+			tok = token.StateCoderef
+			lit = s.scanCoderef()
+
+		case '@':
+			tok = token.ActionCoderef
+			lit = s.scanCoderef()
+
+		case '.', ':', ';', '!', '?', '*', '+', '(', ')':
+			tok = token.FromString(string(rn))
+
+		default:
+			// s.advance already reports bom, ignore it here
+			if rn != bom {
+				// TODO: report illegal char
+			}
+			tok = token.Illegal
+			lit = string(rn)
+		}
+	}
+
+	return pos, tok, lit
+}
+
+// scanIdentifier consumes runes for as long as they satisfy isLetter or
+// isDigit and returns the consumed source text. The result is empty
+// when the current rune satisfies neither.
+func (s *Scanner) scanIdentifier() string {
+	from := s.curpos
+	for {
+		if r := s.rn; !isLetter(r) && !isDigit(r) {
+			break
+		}
+		s.advance()
+	}
+	return string(s.src[from:s.curpos])
+}
+
+// scanLiteral is a placeholder for scanning a quoted literal; Scan
+// passes the opening quote rune as quote. It is not implemented yet:
+// it consumes no input and always returns the empty string.
+func (s *Scanner) scanLiteral(quote rune) string {
+ // TODO: implement.
+ return ""
+}
+
+// scanCharClass is a placeholder for scanning a character class; Scan
+// calls it after consuming the opening '['. It is not implemented yet:
+// it consumes no input and always returns the empty string.
+func (s *Scanner) scanCharClass() string {
+ // TODO: implement.
+ return ""
+}
+
+// scanCoderef scans a code reference of the form "{ identifier }",
+// allowing whitespace around the identifier, and returns the
+// identifier. It expects the current rune to be the opening '{'.
+//
+// Malformed references are not reported yet: a missing brace or an
+// empty identifier simply yields whatever identifier could be scanned,
+// possibly the empty string.
+func (s *Scanner) scanCoderef() string {
+	// Only consume the opening brace when it is really there. Scan checks
+	// for it after '&' but not after '$' or '@', and skipping a rune
+	// blindly would swallow the first character of whatever follows.
+	if s.rn == '{' {
+		s.advance()
+	} else {
+		// TODO: error, missing opening '{'
+	}
+
+	s.skipWhitespace()
+	ident := s.scanIdentifier()
+	s.skipWhitespace()
+
+	if s.rn == '}' {
+		s.advance()
+	} else {
+		// TODO: error, unclosed coderef
+	}
+	if ident == "" {
+		// TODO: error, empty coderef
+	}
+	return ident
+}
+
+// isLetter reports whether rn may start an identifier: an ASCII letter,
+// an underscore, or a non-ASCII rune that unicode.IsLetter accepts.
+func isLetter(rn rune) bool {
+	switch {
+	case rn == '_':
+		return true
+	case rn < utf8.RuneSelf:
+		// ASCII range (and negative sentinels such as eof)
+		return ('a' <= rn && rn <= 'z') || ('A' <= rn && rn <= 'Z')
+	default:
+		return unicode.IsLetter(rn)
+	}
+}
+
+// isDigit reports whether rn may continue an identifier as a non-letter:
+// an ASCII digit, a hyphen, or a non-ASCII rune that unicode.IsDigit
+// accepts. Note that '-' is accepted on purpose by this predicate.
+func isDigit(rn rune) bool {
+	switch {
+	case rn == '-':
+		return true
+	case rn < utf8.RuneSelf:
+		// ASCII range (and negative sentinels such as eof)
+		return '0' <= rn && rn <= '9'
+	default:
+		return unicode.IsDigit(rn)
+	}
+}