@@ 71,7 71,7 @@ LITERAL = DQUOTE_LITERAL | SQUOTE_LITERAL .
DQUOTE_LITERAL = DQUOTE { SAFE_RUNE | SQUOTE | RBRACK | DQUOTE_ESCAPE } DQUOTE .
SQUOTE_LITERAL = SQUOTE { SAFE_RUNE | DQUOTE | RBRACK | SQUOTE_ESCAPE } SQUOTE .
CLASS = LBRACK RANGE { RANGE } RBRACK .
-RANGE = CHAR MINUS CHAR | CHAR .
+RANGE = CHAR MINUS CHAR | CHAR | UNICODE_CLASS_ESCAPE . # \p cannot be in a range
CHAR = SAFE_RUNE | SQUOTE | DQUOTE | CLASS_ESCAPE .
DQUOTE_ESCAPE = COMMON_ESCAPE | BACKSLASH DQUOTE .
SQUOTE_ESCAPE = COMMON_ESCAPE | BACKSLASH SQUOTE .
@@ 85,3 85,8 @@ UNICODE_ESCAPE = BACKSLASH 'u' HEX_DIGIT HEX_DIGIT HEX_DIGIT HEX_DIGIT
| BACKSLASH 'U' HEX_DIGIT HEX_DIGIT HEX_DIGIT HEX_DIGIT HEX_DIGIT HEX_DIGIT HEX_DIGIT HEX_DIGIT
.
HEX_DIGIT = '0' ... '9' | 'a' ... 'f' | 'A' ... 'F' .
+
+UNICODE_CLASS_ESCAPE = BACKSLASH ( 'p' | 'P' ) LETTER # LETTER is a bit generous
+ | BACKSLASH ( 'p' | 'P' ) '{' IDENTIFIER '}'
+ .
+
@@ 83,6 83,9 @@ func (s *Scanner) advance() {
rn, sz := rune(s.src[s.nextpos]), 1
switch {
case rn == 0:
+ // Note that this wouldn't be an error case in the parser with byte
+ // option, but this bootstrap parser only has to parse the fastpeg
+ // language which is rune-based (even ascii-based, but anyway).
s.error(s.curpos, "illegal character NUL")
case rn >= utf8.RuneSelf:
// not ascii
@@ 211,6 214,7 @@ func (s *Scanner) Scan() (pos token.Pos, tok token.Token, lit string) {
}
func (s *Scanner) scanIdentifier() string {
+ // when called, s.rn is on the start of the identifier
start := s.curpos
for isLetter(s.rn) || isDigit(s.rn) {
s.advance()
@@ 219,13 223,157 @@ func (s *Scanner) scanIdentifier() string {
}
func (s *Scanner) scanLiteral(quote rune) string {
- // TODO: implement.
- return ""
+ // scanLiteral scans a literal closed by quote and returns its source text from the opening quote on; when called, the opening quote is already consumed
+ start := s.curpos - 1 // back up one position so the returned text includes the opening quote
+ for {
+ rn := s.rn
+ if rn == '\n' || rn == eof { // the literal must be closed before the end of the line or of the input
+ s.error(start, "literal not terminated")
+ break
+ }
+ s.advance()
+ if rn == quote { // rn is the rune just consumed, i.e. the closing quote
+ break
+ }
+ if rn == '\\' {
+ s.scanLiteralEsc(quote) // check and consume the rest of the escape sequence
+ }
+ }
+ return string(s.src[start:s.curpos])
}
func (s *Scanner) scanCharClass() string {
- // TODO: implement.
- return ""
+ // scanCharClass scans a character class up to its closing ']' and returns its source text from the '[' on; when called, the '[' is already consumed
+ start := s.curpos - 1 // back up one position so the returned text includes the '['
+ for {
+ rn := s.rn
+ if rn == '\n' || rn == eof { // the class must be closed before the end of the line or of the input
+ s.error(start, "character class not terminated")
+ break
+ }
+ s.advance()
+ if rn == ']' { // rn is the rune just consumed, i.e. the closing bracket
+ break
+ }
+
+ // the character range is separated by '-', but there's no
+ // special handling required to scan this: if there is no pair
+ // of chars around the '-', then it matches the minus symbol
+ // itself and is not the "range separator". Meaning:
+ //
+ // [-] matches '-'
+ // [^-] is "not '-'"
+ // [^-a] is "not '-' nor 'a'" because '^' at first position always negates
+ // [--/] is "from '-' to '/'"
+ // [---] is probably invalid, but in the parser, not scanner
+ // [a-] is 'a' and '-'
+ // [ab-] is 'a', 'b' and '-'
+ // [a-b] is "from 'a' to 'b'"
+ // [ab-c] is 'a' then from 'b' to 'c'
+
+ if rn == '\\' {
+ s.scanCharClassEsc() // check and consume the rest of the escape sequence
+ }
+ }
+ return string(s.src[start:s.curpos])
+}
+
+func (s *Scanner) scanCharClassEsc() {
+ // scanCharClassEsc consumes the rest of an escape sequence found inside a character class; when called, the '\\' is already consumed
+ switch s.rn {
+ case '[', '-', '^': // escapes that only exist inside character classes
+ s.advance()
+ return
+
+ case 'p', 'P': // Unicode class escape, either \pX or \p{Name}
+ s.advance()
+ switch s.rn {
+ case 'C', 'L', 'M', 'N', 'P', 'S', 'Z': // the accepted single-letter class names
+ s.advance()
+ return
+
+ case '{':
+ s.advance()
+ // must be followed by an identifier
+ if !isLetter(s.rn) {
+ msg := "invalid Unicode class"
+ if s.rn < 0 { // negative rune: presumably end of input (eof) - confirm
+ msg = "escape sequence not terminated"
+ }
+ s.error(s.curpos, msg)
+ return
+ }
+ s.scanIdentifier()
+ if s.rn != '}' {
+ s.error(s.curpos, "escape sequence not terminated")
+ return
+ }
+ s.advance()
+ return
+
+ default:
+ msg := "invalid Unicode class"
+ if s.rn < 0 { // negative rune: presumably end of input (eof) - confirm
+ msg = "escape sequence not terminated"
+ }
+ s.error(s.curpos, msg)
+ }
+
+ default:
+ s.scanLiteralEsc(']') // all other escapes are shared with literals, with ']' taking the place of the quote
+ }
+}
+
+func (s *Scanner) scanLiteralEsc(quote rune) {
+ // scanLiteralEsc consumes the rest of an escape sequence and reports an error if it is not a known one; when called, the '\\' is already consumed
+ switch s.rn {
+ case 'a', 'b', 'f', 'n', 'r', 't', 'v', '\\', quote: // single-character escapes, including the escaped quote itself
+ s.advance()
+ return
+
+ case 'x':
+ s.advance()
+ s.scanHexValue(2, 255) // \x takes 2 hex digits, value at most 255
+ case 'u':
+ s.advance()
+ s.scanHexValue(4, unicode.MaxRune) // \u takes 4 hex digits
+ case 'U':
+ s.advance()
+ s.scanHexValue(8, unicode.MaxRune) // \U takes 8 hex digits
+
+ default:
+ msg := "unknown escape sequence"
+ if s.rn < 0 { // negative rune: presumably end of input (eof) - confirm
+ msg = "escape sequence not terminated"
+ }
+ s.error(s.curpos, msg)
+ }
+}
+
+func (s *Scanner) scanHexValue(digits int, max uint32) { // consumes `digits` hex digits and reports an error on a non-hex rune or an invalid resulting value
+ const base = 16
+
+ escStart := s.curpos - 1 // position of the 'x', 'u' or 'U' that follows the '\\'; used to report an invalid value
+ var val uint32
+ for digits > 0 {
+ d := uint32(digitVal(s.rn))
+ if d >= base { // digitVal returns 16 for anything that is not a hex digit
+ msg := fmt.Sprintf("illegal character %#U in escape sequence", s.rn)
+ if s.rn < 0 { // negative rune: presumably end of input (eof) - confirm
+ msg = "escape sequence not terminated"
+ }
+ s.error(s.curpos, msg)
+ return
+ }
+
+ val = val*base + d
+ s.advance()
+ digits--
+ }
+
+ if val > max || 0xD800 <= val && val < 0xE000 { // above the allowed maximum, or inside the surrogate range D800-DFFF
+ s.error(escStart, "escape sequence is invalid Unicode code point")
+ }
}
func (s *Scanner) scanCoderef() string {
@@ 248,6 396,18 @@ func (s *Scanner) scanCoderef() string {
return ident
}
+func digitVal(rn rune) int { // returns the value (0-15) of the hex digit rn, or 16 if rn is not a hex digit
+ switch {
+ case '0' <= rn && rn <= '9':
+ return int(rn - '0')
+ case 'a' <= rn && rn <= 'f':
+ return int(rn - 'a' + 10)
+ case 'A' <= rn && rn <= 'F':
+ return int(rn - 'A' + 10)
+ }
+ return 16 // sentinel: larger than any hex digit value, so callers can test d >= 16
+}
+
func isLetter(rn rune) bool {
return 'a' <= rn && rn <= 'z' || 'A' <= rn && rn <= 'Z' || rn == '_' || rn >= utf8.RuneSelf && unicode.IsLetter(rn)
}