~mna/fastpeg

eed734f1920d6634f98bccbf471f1a8e3441fd72 — Martin Angers 5 years ago 548de80
internal/bootstrap/scanner: finalize scanning of literals and char class
2 files changed, 170 insertions(+), 5 deletions(-)

M doc/grammar.ebnf
M internal/bootstrap/scanner/scanner.go
M doc/grammar.ebnf => doc/grammar.ebnf +6 -1
@@ 71,7 71,7 @@ LITERAL        = DQUOTE_LITERAL | SQUOTE_LITERAL .
DQUOTE_LITERAL = DQUOTE { SAFE_RUNE | SQUOTE | RBRACK | DQUOTE_ESCAPE } DQUOTE .
SQUOTE_LITERAL = SQUOTE { SAFE_RUNE | DQUOTE | RBRACK | SQUOTE_ESCAPE } SQUOTE .
CLASS          = LBRACK RANGE { RANGE } RBRACK .
RANGE          = CHAR MINUS CHAR | CHAR .
RANGE          = CHAR MINUS CHAR | CHAR | UNICODE_CLASS_ESCAPE . # \p cannot be in a range
CHAR           = SAFE_RUNE | SQUOTE | DQUOTE | CLASS_ESCAPE .
DQUOTE_ESCAPE  = COMMON_ESCAPE | BACKSLASH DQUOTE .
SQUOTE_ESCAPE  = COMMON_ESCAPE | BACKSLASH SQUOTE .


@@ 85,3 85,8 @@ UNICODE_ESCAPE = BACKSLASH 'u' HEX_DIGIT HEX_DIGIT HEX_DIGIT HEX_DIGIT
               | BACKSLASH 'U' HEX_DIGIT HEX_DIGIT HEX_DIGIT HEX_DIGIT HEX_DIGIT HEX_DIGIT HEX_DIGIT HEX_DIGIT
               .
HEX_DIGIT      = '0' ... '9' | 'a' ... 'f' | 'A' ... 'F' .

UNICODE_CLASS_ESCAPE = BACKSLASH ( 'p' | 'P' ) LETTER # LETTER is a bit generous
                     | BACKSLASH ( 'p' | 'P' ) '{' IDENTIFIER '}'
                     .


M internal/bootstrap/scanner/scanner.go => internal/bootstrap/scanner/scanner.go +164 -4
@@ 83,6 83,9 @@ func (s *Scanner) advance() {
	rn, sz := rune(s.src[s.nextpos]), 1
	switch {
	case rn == 0:
		// Note that this wouldn't be an error case in the parser with byte
		// option, but this bootstrap parser only has to parse the fastpeg
		// language, which is rune-based (even ascii-based, but anyway).
		s.error(s.curpos, "illegal character NUL")
	case rn >= utf8.RuneSelf:
		// not ascii


@@ 211,6 214,7 @@ func (s *Scanner) Scan() (pos token.Pos, tok token.Token, lit string) {
}

func (s *Scanner) scanIdentifier() string {
	// when called, s.rn is on the start of the identifier
	start := s.curpos
	for isLetter(s.rn) || isDigit(s.rn) {
		s.advance()


@@ 219,13 223,157 @@ func (s *Scanner) scanIdentifier() string {
}

// scanLiteral scans a quoted literal delimited by quote and returns the raw
// source text from the opening quote up to the position where scanning
// stopped (the closing quote is included when one is found).
//
// The literal must end on the same line: reaching a newline or the end of
// the input before the closing quote reports "literal not terminated" at the
// position of the opening quote. Escape sequences introduced by a backslash
// are checked by scanLiteralEsc.
func (s *Scanner) scanLiteral(quote rune) string {
	// when called, the opening quote is already consumed
	start := s.curpos - 1
	for {
		rn := s.rn
		if rn == '\n' || rn == eof {
			// do not consume the newline/eof, just report and stop
			s.error(start, "literal not terminated")
			break
		}
		s.advance()
		if rn == quote {
			break
		}
		if rn == '\\' {
			// an escaped quote is consumed here, so it cannot end the literal
			s.scanLiteralEsc(quote)
		}
	}
	return string(s.src[start:s.curpos])
}

// scanCharClass scans a character class and returns the raw source text from
// the opening '[' up to the position where scanning stopped (the closing ']'
// is included when one is found).
//
// The class must end on the same line: reaching a newline or the end of the
// input before the closing ']' reports "character class not terminated" at
// the position of the opening bracket. Escape sequences introduced by a
// backslash are checked by scanCharClassEsc.
func (s *Scanner) scanCharClass() string {
	// when called, the '[' is already consumed
	start := s.curpos - 1
	for {
		rn := s.rn
		if rn == '\n' || rn == eof {
			// do not consume the newline/eof, just report and stop
			s.error(start, "character class not terminated")
			break
		}
		s.advance()
		if rn == ']' {
			break
		}

		// the character range is separated by '-', but there's no
		// special handling required to parse this, if there is no pair
		// of chars around the '-', then it matches the minus symbol
		// itself and is not the "range separator". Meaning:
		//
		// [-] matches '-'
		// [^-] is "not '-'"
		// [^-a] is "not '-' nor 'a'" because '^' at first position always negates
		// [--/] is "from '-' to '/'"
		// [---] is probably invalid, but in the parser, not scanner
		// [a-] is 'a' and '-'
		// [ab-] is 'a', 'b' and '-'
		// [a-b] is "from 'a' to 'b'"
		// [ab-c] is 'a' then from 'b' to 'c'

		if rn == '\\' {
			// an escaped ']' is consumed here, so it cannot end the class
			s.scanCharClassEsc()
		}
	}
	return string(s.src[start:s.curpos])
}

// scanCharClassEsc validates the escape sequence that follows a backslash
// inside a character class. On entry the backslash has been consumed and
// s.rn is the rune right after it.
//
// Accepted forms are the class-specific escapes \[ \- \^, the Unicode class
// escapes \pX, \PX, \p{Name} and \P{Name}, and anything scanLiteralEsc
// accepts when ']' is used as the quote rune. Invalid sequences are reported
// through s.error at the current position.
func (s *Scanner) scanCharClassEsc() {
	// everything that is not a Unicode class escape is handled up front
	if s.rn != 'p' && s.rn != 'P' {
		switch s.rn {
		case '[', '-', '^':
			s.advance()
		default:
			s.scanLiteralEsc(']')
		}
		return
	}

	// shared error reporting for a malformed class name; a negative rune
	// means the input ended in the middle of the escape
	reportBadClass := func() {
		text := "invalid Unicode class"
		if s.rn < 0 {
			text = "escape sequence not terminated"
		}
		s.error(s.curpos, text)
	}

	s.advance() // skip the 'p' or 'P'

	if s.rn == '{' {
		// long form: '{' identifier '}'
		s.advance()
		if !isLetter(s.rn) {
			reportBadClass()
			return
		}
		s.scanIdentifier()
		if s.rn == '}' {
			s.advance()
			return
		}
		s.error(s.curpos, "escape sequence not terminated")
		return
	}

	// short form: a single letter naming the class
	switch s.rn {
	case 'C', 'L', 'M', 'N', 'P', 'S', 'Z':
		s.advance()
	default:
		reportBadClass()
	}
}

// scanLiteralEsc validates the escape sequence that follows a backslash
// inside a literal delimited by quote. On entry the backslash has been
// consumed and s.rn is the rune right after it.
//
// Single-rune escapes (a b f n r t v, the backslash and the quote itself)
// are consumed directly. The \x, \u and \U escapes are followed by 2, 4 and
// 8 hexadecimal digits respectively, checked by scanHexValue. Anything else
// is reported through s.error at the current position.
func (s *Scanner) scanLiteralEsc(quote rune) {
	var (
		hexDigits int
		limit     uint32
	)

	switch esc := s.rn; {
	case esc == 'a', esc == 'b', esc == 'f', esc == 'n', esc == 'r',
		esc == 't', esc == 'v', esc == '\\', esc == quote:
		s.advance()
		return

	case esc == 'x':
		hexDigits, limit = 2, 255
	case esc == 'u':
		hexDigits, limit = 4, unicode.MaxRune
	case esc == 'U':
		hexDigits, limit = 8, unicode.MaxRune

	case esc < 0:
		// input ended right after the backslash
		s.error(s.curpos, "escape sequence not terminated")
		return
	default:
		s.error(s.curpos, "unknown escape sequence")
		return
	}

	// numeric escape: skip the marker, then check the hex digits
	s.advance()
	s.scanHexValue(hexDigits, limit)
}

// scanHexValue consumes exactly digits hexadecimal digits and checks that
// the value they encode is at most max and is not in the surrogate range
// [0xD800, 0xE000).
//
// A non-hex rune stops the scan without being consumed and is reported at
// its own position. An out-of-range value is reported at the position one
// before the first digit (the 'x', 'u' or 'U' marker).
func (s *Scanner) scanHexValue(digits int, max uint32) {
	const radix = 16

	markerPos := s.curpos - 1 // the 'x', 'u' or 'U' that follows the '\\'
	var code uint32
	for remaining := digits; remaining > 0; remaining-- {
		nibble := uint32(digitVal(s.rn))
		if nibble >= radix {
			// a negative rune means the input ended inside the escape
			if s.rn < 0 {
				s.error(s.curpos, "escape sequence not terminated")
			} else {
				s.error(s.curpos, fmt.Sprintf("illegal character %#U in escape sequence", s.rn))
			}
			return
		}

		code = code*radix + nibble
		s.advance()
	}

	isSurrogate := code >= 0xD800 && code < 0xE000
	if code > max || isSurrogate {
		s.error(markerPos, "escape sequence is invalid Unicode code point")
	}
}

func (s *Scanner) scanCoderef() string {


@@ 248,6 396,18 @@ func (s *Scanner) scanCoderef() string {
	return ident
}

// digitVal returns the numeric value (0 to 15) of the hexadecimal digit rn,
// accepting both lower- and upper-case letters. Any rune that is not a
// hexadecimal digit yields 16, i.e. a value outside every supported base.
func digitVal(rn rune) int {
	if rn >= '0' && rn <= '9' {
		return int(rn) - '0'
	}
	if rn >= 'a' && rn <= 'f' {
		return int(rn) - 'a' + 10
	}
	if rn >= 'A' && rn <= 'F' {
		return int(rn) - 'A' + 10
	}
	// not a hex digit
	return 16
}

// isLetter reports whether rn may appear in an identifier as a letter: an
// ASCII letter, the underscore, or a non-ASCII rune that unicode.IsLetter
// classifies as a letter.
func isLetter(rn rune) bool {
	switch {
	case rn >= 'a' && rn <= 'z', rn >= 'A' && rn <= 'Z', rn == '_':
		return true
	case rn < utf8.RuneSelf:
		// remaining ASCII (and negative) runes are never letters
		return false
	default:
		return unicode.IsLetter(rn)
	}
}