~mna/fastpeg

618791500f706c28f6b6ce7af8eb4ecbb1ad2420 — Martin Angers 5 years ago 48c5ac7
internal/bootstrap/scanner: start implementation of scanner
1 files changed, 238 insertions(+), 0 deletions(-)

A internal/bootstrap/scanner/scanner.go
A internal/bootstrap/scanner/scanner.go => internal/bootstrap/scanner/scanner.go +238 -0
@@ 0,0 1,238 @@
// Package scanner implements a scanner for fastpeg input. It takes a
// []byte as source which can then be tokenized through repeated calls
// to the Scan method.
package scanner

import (
	"fmt"
	"unicode"
	"unicode/utf8"

	"git.sr.ht/~mna/fastpeg/internal/bootstrap/token"
)

const (
	// bom is the Unicode byte order mark (U+FEFF). It is only permitted
	// as the very first character of the input.
	bom = 0xFEFF
	// eof is the rune value stored in Scanner.rn to represent the end
	// of the input.
	eof = -1
)

// A Scanner holds the scanner's internal state while processing a
// given input. It can be allocated as part of another data structure
// but must be initialized via Init before use.
type Scanner struct {
	// immutable fields, set by Init
	file *token.File
	src  []byte

	// mutable state, updated by advance
	rn      rune // current rune, eof at the end of the input
	curpos  int  // byte offset in src of the start of the current rune (rn)
	nextpos int  // byte offset in src of the start of the next rune
	linepos int  // byte offset in src of the start of the current line
}

// Init resets the scanner s so that it tokenizes src from its first
// byte. Position information is resolved through file, and the scanner
// registers the start offset of every line it encounters on it.
//
// Init panics if the size of file does not match the length of src.
// A byte order mark at the very start of src is skipped.
func (s *Scanner) Init(file *token.File, src []byte) {
	if n := len(src); file.Size() != n {
		panic(fmt.Sprintf("file size does not match input size: %d bytes vs %d bytes", file.Size(), n))
	}

	// Start from a clean state: every mutable field goes back to zero.
	*s = Scanner{file: file, src: src}

	// Load the first rune, stepping over a leading BOM if there is one.
	s.advance()
	if s.rn == bom {
		s.advance()
	}
}

// advance moves the scanner to the next rune of src and stores it in
// s.rn, updating s.curpos and s.nextpos accordingly. When the rune
// being left behind is a newline, the offset of the new line is
// recorded in s.linepos and registered with s.file.
//
// At the end of the input s.rn is set to eof (-1) and s.curpos to
// len(s.src); calling advance again once at EOF is a no-op.
func (s *Scanner) advance() {
	if s.nextpos >= len(s.src) {
		s.curpos = len(s.src)
		if s.rn == '\n' {
			s.linepos = s.curpos
			s.file.AddLine(s.linepos)
		}
		s.rn = eof
		// Nothing left to decode: returning here is required, indexing
		// s.src at s.nextpos below would be out of range.
		return
	}

	s.curpos = s.nextpos
	if s.rn == '\n' {
		// the rune being left is a newline, so a new line starts here
		s.linepos = s.curpos
		s.file.AddLine(s.linepos)
	}

	// Fast path: assume a single-byte (ASCII) rune and only run the
	// full UTF-8 decoder when the lead byte says otherwise.
	rn, sz := rune(s.src[s.nextpos]), 1
	switch {
	case rn == 0:
		// TODO: error, illegal NUL character
	case rn >= utf8.RuneSelf:
		// not ascii
		rn, sz = utf8.DecodeRune(s.src[s.nextpos:])
		if rn == utf8.RuneError && sz == 1 {
			// TODO: illegal utf8 encoding
		} else if rn == bom && s.curpos > 0 {
			// TODO: illegal BOM mark
		}
	}

	s.nextpos += sz
	s.rn = rn
}

// peek returns the first byte of the rune that follows the current
// one, without advancing the scanner. It returns 0 when there is no
// such byte because the end of the input has been reached.
func (s *Scanner) peek() byte {
	if s.nextpos >= len(s.src) {
		return 0
	}
	return s.src[s.nextpos]
}

// skipWhitespace advances the scanner until the current rune is not
// a space, tab, newline or carriage return.
func (s *Scanner) skipWhitespace() {
	for {
		switch s.rn {
		case ' ', '\t', '\n', '\r':
			s.advance()
		default:
			return
		}
	}
}

// Scan scans the next token and returns the token position, the token,
// and its literal string if applicable. The source end is indicated
// by token.EOF.
//
// Leading whitespace is skipped; the returned position is that of the
// first rune of the token. The literal string is set as follows:
//
//   - for identifiers, literals, character classes and code
//     references, it is the value returned by the matching scan helper;
//   - for arrows and separators, it is the text of the token ("<-",
//     "←", "⟵", "=", "|" or "/");
//   - for token.Illegal, it is the offending character, which includes
//     a '<' that is not followed by '-';
//   - in all other cases it is empty.
//
// Scan adds line information to the file given to Init. Token
// positions are relative to that file.
func (s *Scanner) Scan() (pos token.Pos, tok token.Token, lit string) {
	s.skipWhitespace()

	pos = token.Pos{Pos: s.file.Pos(s.curpos)}
	switch rn := s.rn; {
	case isLetter(rn):
		tok = token.Identifier
		lit = s.scanIdentifier()

	default:
		// Consume rn up front so that s.rn is the lookahead rune in
		// every case below.
		s.advance()

		switch rn {
		case eof:
			tok = token.EOF

		case '"', '\'':
			tok = token.Literal
			lit = s.scanLiteral(rn)

		case '[':
			tok = token.CharClass
			lit = s.scanCharClass()

		case '<':
			if s.rn == '-' {
				s.advance()
				tok = token.Arrow
				lit = "<-"
			} else {
				// A lone '<' is not a token: return it as the offending
				// character instead of an unset token with no literal.
				// TODO: report error
				tok = token.Illegal
				lit = string(rn)
			}

		case '←', '⟵', '=':
			tok = token.Arrow
			lit = string(rn)

		case '|', '/':
			tok = token.Separator
			lit = string(rn)

		case '&':
			// '&' is the and-predicate operator unless it introduces a
			// predicate code reference "&{...}".
			if s.rn != '{' {
				tok = token.Ampersand
				break
			}
			tok = token.PredCoderef
			lit = s.scanCoderef()

		case '$':
			tok = token.StateCoderef
			lit = s.scanCoderef()

		case '@':
			tok = token.ActionCoderef
			lit = s.scanCoderef()

		case '.', ':', ';', '!', '?', '*', '+', '(', ')':
			tok = token.FromString(string(rn))

		default:
			// s.advance already reports bom, ignore it here
			if rn != bom {
				// TODO: report illegal char
			}
			tok = token.Illegal
			lit = string(rn)
		}
	}

	return pos, tok, lit
}

// scanIdentifier consumes runes for as long as the current one is
// accepted by isLetter or isDigit, and returns the consumed text. The
// result is empty when the current rune cannot be part of an
// identifier.
func (s *Scanner) scanIdentifier() string {
	begin := s.curpos
	for {
		if !isLetter(s.rn) && !isDigit(s.rn) {
			break
		}
		s.advance()
	}
	return string(s.src[begin:s.curpos])
}

// scanLiteral is the placeholder for scanning a quoted literal. Scan
// calls it after consuming the opening quote, which it passes as quote
// (either '"' or '\''). It currently consumes nothing and returns an
// empty string.
func (s *Scanner) scanLiteral(quote rune) string {
	// TODO: implement.
	return ""
}

// scanCharClass is the placeholder for scanning a character class.
// Scan calls it after consuming the opening '['. It currently consumes
// nothing and returns an empty string.
func (s *Scanner) scanCharClass() string {
	// TODO: implement.
	return ""
}

// scanCoderef scans the "{ identifier }" part of a code reference and
// returns the identifier, which is empty if none was found. It expects
// the current rune to be the opening '{' and consumes it without
// checking. Whitespace is allowed on both sides of the identifier. The
// closing '}' is consumed when present.
func (s *Scanner) scanCoderef() string {
	// step over the opening '{'
	s.advance()

	s.skipWhitespace()
	name := s.scanIdentifier()
	s.skipWhitespace()

	switch s.rn {
	case '}':
		s.advance()
	default:
		// TODO: error, unclosed coderef
	}

	if len(name) == 0 {
		// TODO: error, empty coderef
	}
	return name
}

// isLetter reports whether rn may start or continue an identifier: an
// ASCII letter, an underscore, or any non-ASCII rune that Unicode
// classifies as a letter.
func isLetter(rn rune) bool {
	switch {
	case rn == '_':
		return true
	case rn < utf8.RuneSelf:
		// ASCII range (and negative values such as eof)
		return ('a' <= rn && rn <= 'z') || ('A' <= rn && rn <= 'Z')
	default:
		return unicode.IsLetter(rn)
	}
}

// isDigit reports whether rn may continue (but not start) an
// identifier: an ASCII decimal digit, a hyphen, or any non-ASCII rune
// that Unicode classifies as a decimal digit.
func isDigit(rn rune) bool {
	if rn >= utf8.RuneSelf {
		return unicode.IsDigit(rn)
	}
	return rn == '-' || ('0' <= rn && rn <= '9')
}