M doc/grammar.ebnf => doc/grammar.ebnf +70 -56
@@ 5,69 5,83 @@
# [0]: https://golang.org/ref/spec#Notation
grammar = definition { definition } .
-definition = identifier ARROW expression [ EOS ] .
+
+# TODO: decide how to end a definition when EOS is missing. A newline is
+# needed in that case, but newlines may appear elsewhere too.
+definition = IDENTIFIER ARROW expression [ EOS ] .
+
expression = sequence { SEPARATOR sequence } .
-sequence = label { label } .
-label = [ identifier COLON ] prefix .
-prefix = AND coderef # TODO: move coderefs before labels, no sense in having id:${code}
- | DOLLAR coderef
- | AT coderef
- | [ AND | NOT ] suffix
+
+sequence = code { code } .
+
+code = AND CODEREF
+ | DOLLAR CODEREF
+ | AT CODEREF
+ | label
.
+
+label = [ IDENTIFIER COLON ] prefix .
+
+prefix = [ AND | NOT ] suffix .
+
suffix = primary [ QUESTION | STAR | PLUS ] .
-primary = identifier
+
+# the parser makes sure that the identifier is not followed by ARROW
+# (which would make it the start of a definition instead).
+primary = IDENTIFIER
| LPAREN expression RPAREN
- | literal
- | class
+ | LITERAL
+ | CLASS
| DOT
.
-coderef = LBRACE identifier RBRACE .
-identifier = LETTER { LETTER | NUMBER } .
-literal = dquote_literal | squote_literal .
-dquote_literal = DQUOTE { SAFE_RUNE | SQUOTE | RBRACK | dquote_escape } DQUOTE .
-squote_literal = SQUOTE { SAFE_RUNE | DQUOTE | RBRACK | squote_escape } SQUOTE .
-class = LBRACK range { range } RBRACK .
-range = char MINUS char | char .
-char = SAFE_RUNE | SQUOTE | DQUOTE | class_escape .
-dquote_escape = common_escape | BACKSLASH DQUOTE .
-squote_escape = common_escape | BACKSLASH SQUOTE .
-class_escape = common_escape | BACKSLASH LBRACK | BACKSLASH RBRACK | BACKSLASH MINUS | BACKSLASH CHEVRON .
-common_escape = hex_escape
- | unicode_escape
- | BACKSLASH ( 'a' | 'b' | 'f' | 'n' | 'r' | 't' | 'v' | BACKSLASH )
- .
-hex_escape = BACKSLASH 'x' HEX_DIGIT HEX_DIGIT .
-unicode_escape = BACKSLASH 'u' HEX_DIGIT HEX_DIGIT HEX_DIGIT HEX_DIGIT
- | BACKSLASH 'U' HEX_DIGIT HEX_DIGIT HEX_DIGIT HEX_DIGIT HEX_DIGIT HEX_DIGIT HEX_DIGIT HEX_DIGIT
- .
# For reference, the terminal productions are loosely defined here.
# They are handled by the scanner.
-ARROW = '<-' | '←' | '⟵' | '=' .
-EOS = ';' . # EOS stands for "end of statement".
-SEPARATOR = '|' | '/' .
-COLON = ':' .
-AND = '&' .
-DOLLAR = '$' .
-AT = '@' .
-NOT = '!' .
-QUESTION = '?' .
-STAR = '*' .
-PLUS = '+' .
-LPAREN = '(' .
-RPAREN = ')' .
-DOT = '.' .
-LBRACE = '{' .
-RBRACE = '}' .
-LETTER = any unicode letter or '_' .
-NUMBER = any unicode number or '-' .
-LBRACK = '[' .
-RBRACK = ']' .
-DQUOTE = '"' .
-SQUOTE = '\'' .
-SAFE_RUNE = any unicode code point except '\n', '\\', '"', '\'' and ']' .
-BACKSLASH = '\\' .
-MINUS = '-' .
-CHEVRON = '^' .
-HEX_DIGIT = '0' ... '9' | 'a' ... 'f' | 'A' ... 'F' .
+IDENTIFIER = LETTER { LETTER | NUMBER } .
+ARROW = '<-' | '←' | '⟵' | '=' .
+EOS = ';' . # EOS stands for "end of statement".
+SEPARATOR = '|' | '/' .
+COLON = ':' .
+AND = '&' .
+DOLLAR = '$' .
+AT = '@' .
+NOT = '!' .
+QUESTION = '?' .
+STAR = '*' .
+PLUS = '+' .
+LPAREN = '(' .
+RPAREN = ')' .
+DOT = '.' .
+LBRACE = '{' .
+RBRACE = '}' .
+LETTER = any unicode letter or '_' .
+NUMBER = any unicode number or '-' .
+LBRACK = '[' .
+RBRACK = ']' .
+DQUOTE = '"' .
+SQUOTE = '\'' .
+SAFE_RUNE = any unicode code point except '\n', '\\', '"', '\'' and ']' .
+BACKSLASH = '\\' .
+MINUS = '-' .
+CHEVRON = '^' .
+
+CODEREF = LBRACE IDENTIFIER RBRACE .
+LITERAL = DQUOTE_LITERAL | SQUOTE_LITERAL .
+DQUOTE_LITERAL = DQUOTE { SAFE_RUNE | SQUOTE | RBRACK | DQUOTE_ESCAPE } DQUOTE .
+SQUOTE_LITERAL = SQUOTE { SAFE_RUNE | DQUOTE | RBRACK | SQUOTE_ESCAPE } SQUOTE .
+CLASS = LBRACK RANGE { RANGE } RBRACK .
+RANGE = CHAR MINUS CHAR | CHAR .
+CHAR = SAFE_RUNE | SQUOTE | DQUOTE | CLASS_ESCAPE .
+DQUOTE_ESCAPE = COMMON_ESCAPE | BACKSLASH DQUOTE .
+SQUOTE_ESCAPE = COMMON_ESCAPE | BACKSLASH SQUOTE .
+CLASS_ESCAPE = COMMON_ESCAPE | BACKSLASH LBRACK | BACKSLASH RBRACK | BACKSLASH MINUS | BACKSLASH CHEVRON .
+COMMON_ESCAPE = HEX_ESCAPE
+ | UNICODE_ESCAPE
+ | BACKSLASH ( 'a' | 'b' | 'f' | 'n' | 'r' | 't' | 'v' | BACKSLASH )
+ .
+HEX_ESCAPE = BACKSLASH 'x' HEX_DIGIT HEX_DIGIT .
+UNICODE_ESCAPE = BACKSLASH 'u' HEX_DIGIT HEX_DIGIT HEX_DIGIT HEX_DIGIT
+ | BACKSLASH 'U' HEX_DIGIT HEX_DIGIT HEX_DIGIT HEX_DIGIT HEX_DIGIT HEX_DIGIT HEX_DIGIT HEX_DIGIT
+ .
+HEX_DIGIT = '0' ... '9' | 'a' ... 'f' | 'A' ... 'F' .
A go.mod => go.mod +3 -0
@@ 0,0 1,3 @@
+module git.sr.ht/~mna/fastpeg
+
+go 1.12
A internal/bootstrap/token/token.go => internal/bootstrap/token/token.go +106 -0
@@ 0,0 1,106 @@
+// Package token defines constants representing the lexical tokens
+// of the fastpeg language and basic operations on tokens.
+package token
+
+import (
+ "fmt"
+ "go/token"
+)
+
+type (
+ // File wraps go/token's File: a handle for a file belonging to a FileSet.
+ // A File has a name, size, and line offset table.
+ File struct{ token.File }
+
+ // FileSet wraps go/token's FileSet, a set of source files. Methods of
+ // file sets are synchronized; multiple goroutines may invoke
+ // them concurrently.
+ FileSet struct{ token.FileSet }
+
+ // Pos wraps go/token's Pos, a compact encoding of a source position.
+ // It can be converted into a Position for a more convenient,
+ // but much larger, representation.
+ Pos struct{ token.Pos }
+
+ // Position wraps go/token's Position, a source position including the
+ // file, line, and column location. A Position is valid if the
+ // line number is > 0.
+ Position struct{ token.Position }
+
+ // Token is the set of lexical tokens of the fastpeg language.
+ Token int
+)
+
+// List of possible tokens. litStart and litEnd are the bounds used by IsLiteral.
+const (
+ Illegal Token = iota
+ EOF
+ Comment
+
+ // tokens with variable source text (IsLiteral reports true for these)
+ litStart
+ Identifier
+ Literal
+ CharClass
+ Arrow // one of '<-', '←', '⟵', '='
+ Separator // either '|' or '/'
+ PredCoderef // &{ Identifier }, literal is the identifier
+ StateCoderef // ${ Identifier }, literal is the identifier
+ ActionCoderef // @{ Identifier }, literal is the identifier
+ litEnd
+
+ // operators and punctuation
+ symStart
+ Dot // .
+ Colon // :
+ Semicolon // ;
+ Ampersand // &
+ Exclamation // !
+ Question // ?
+ Star // *
+ Plus // +
+ Lparen // (
+ Rparen // )
+ symEnd
+)
+
+var stringTokens = [...]string{ // display names indexed by Token; entries not listed are ""
+ Illegal: "Illegal",
+ EOF: "EOF",
+ Comment: "Comment",
+ Identifier: "Identifier",
+ Literal: "Literal",
+ CharClass: "CharClass",
+ Arrow: "Arrow",
+ Separator: "Separator",
+ PredCoderef: "PredCoderef",
+ StateCoderef: "StateCoderef",
+ ActionCoderef: "ActionCoderef",
+ Dot: ".",
+ Colon: ":",
+ Semicolon: ";",
+ Ampersand: "&",
+ Exclamation: "!",
+ Question: "?",
+ Star: "*",
+ Plus: "+",
+ Lparen: "(",
+ Rparen: ")",
+}
+
+// IsLiteral reports whether t is in the literal group, i.e. strictly between litStart and litEnd.
+func (t Token) IsLiteral() bool {
+	return litStart < t && t < litEnd
+}
+
+// String returns the string representation of token t.
+func (t Token) String() string {
+ var s string
+ if t > 0 && int(t) < len(stringTokens) {
+ s = stringTokens[t]
+ }
+ if s == "" {
+ s = fmt.Sprintf("token(%d)", t)
+ }
+ return s
+}
M matcher.go => matcher.go +27 -21
@@ 3,7 3,7 @@ package fastpeg
import (
"bytes"
"reflect"
- "sync"
+ "unicode"
"unicode/utf8"
)
@@ 35,7 35,7 @@ type literal struct {
func (l literal) match(src []byte) int {
if bytes.Equal(l.lit, src) {
- return len(b)
+ return len(src)
}
return noMatch
}
@@ 45,7 45,7 @@ type anyRune struct{}
func (r anyRune) match(src []byte) int {
// matches any valid rune, does not match invalid utf-8 encoding
- rn, sz, ok := decodeRune(src)
+ _, sz, ok := decodeRune(src)
if !ok {
return noMatch
}
@@ 84,17 84,20 @@ type choice struct {
}
func (c choice) match(src []byte) int {
- var wg sync.WaitGroup
-
- // TODO: run all in parallel, needs context to stop early, keep farthest match failure position
- wg.Add(len(c.ms))
- for _, m := range c.ms {
- go func() {
- m.match(src)
- wg.Done()
- }()
- }
- wg.Wait()
+ /*
+ var wg sync.WaitGroup
+
+ // TODO: run all in parallel, needs context to stop early, keep farthest match failure position
+ wg.Add(len(c.ms))
+ for _, m := range c.ms {
+ go func() {
+ m.match(src)
+ wg.Done()
+ }()
+ }
+ wg.Wait()
+ */
+ return noMatch
}
// R1 R2
@@ 140,7 143,7 @@ type optional struct {
}
func (o optional) match(src []byte) int {
- n := r.m.match(src[start:])
+ n := o.m.match(src)
if n == noMatch {
return emptyMatch
}
@@ 189,10 192,12 @@ type predicateFunc struct {
func (f predicateFunc) match(src []byte) int {
// TODO: verify in compile step that it has the proper return value
- vs := reflect.Call(f.fn)
- if vs[0].Bool() {
- return emptyMatch
- }
+ /*
+ vs := reflect.Call(f.fn)
+ if vs[0].Bool() {
+ return emptyMatch
+ }
+ */
return noMatch
}
@@ 203,11 208,12 @@ type actionFunc struct {
func (f actionFunc) match(src []byte) int {
// not called during matching
// TODO: should not be a matcher
+ return noMatch
}
func decodeRune(b []byte) (rn rune, sz int, ok bool) {
- rn, sz := utf8.DecodeRune(src)
- if rn == unicode.RuneError && sz < 2 {
+ rn, sz = utf8.DecodeRune(b)
+ if rn == utf8.RuneError && sz < 2 {
return rn, sz, false
}
return rn, sz, true