A => .gitignore +5 -0
@@ 1,5 @@
+# ignore direnv file
+.envrc
+
+# ignore coverage, profiling, etc. 'out' files
+*.out
A => doc/api.go +44 -0
@@ 1,44 @@
+package doc
+
+type Parser struct {
+ // Actions maps a grammar rule name to an action function. The function is called
+ // only on the final, resolved parse tree.
+ Actions FuncMap
+
+ // States maps a state call identifier to a state function. The function is called
+ // during parsing and as such might be called multiple times, so it should be as
+ // short and efficient as possible. It can read and write state that will be
+ // available to subsequent state and predicate calls.
+ States FuncMap
+
+ // Predicates maps a predicate call identifier to a predicate function. The function
+ // is called during parsing and as such might be called multiple times, so it should
+ // be as short and efficient as possible. It has read-only access to state data stored
+ // by state functions. It returns a boolean that controls the matching logic.
+ Predicates FuncMap
+}
+
+// Parse input according to the grammar represented by p.
+func (p *Parser) Parse(input []byte) error {
+ return nil
+}
+
+// Compile a Parser from the provided grammar.
+func Compile(grammar []byte) (*Parser, error) {
+ return nil, nil
+}
+
+// FuncMap maps a grammar rule name or call identifier to a function.
+//
+// Action functions must accept two arguments:
+// * []byte : the slice of input matched by the rule
+// * struct pointer : a struct whose fields match the names of the captures
+// in the rule. Captured values are set as field values, so the field
+// types must match the types of the values (be assignable from them).
+//
+// The function may return 0, 1 or 2 values. If it returns one, it is the value
+// of the match for this rule; if it returns two, the first value is the value
+// of the match and the second must be of type error and adds an error to the
+// parser, without impacting the parse tree (i.e. what matched still matches,
+// but the parser will exit with - at least - this error).
+type FuncMap map[string]interface{}
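+
+// A minimal, hypothetical usage sketch of the API above (grammarText, the
+// "number" rule and the captures struct are illustrative assumptions only):
+//
+//   p, err := Compile(grammarText) // a PEG grammar, see doc/syntax.peg
+//   if err != nil {
+//       // handle compile error
+//   }
+//   p.Actions = FuncMap{
+//       // called on the final parse tree with the matched input and captures
+//       "number": func(matched []byte, caps *struct{ Digits []byte }) (int, error) {
+//           return strconv.Atoi(string(caps.Digits))
+//       },
+//   }
+//   err = p.Parse(input)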
A => doc/goals.md +26 -0
@@ 1,26 @@
+# Goals
+
+* Zero allocation - parsing is done by slicing through the input byte slice
+* Massively parallel - choices are attempted in parallel and stopped as soon as possible; only the highest-priority successful parse is kept
+* Grammar-only - no code blocks in the grammar, code blocks are associated with rules via a map passed to the parser, making grammars reusable with multiple code actions (e.g. AST generation, debug printing, etc.)
+* No action code execution during parsing - matches only record the functions that will be called at the end, on the successful parse tree
+* Stateful code support - a distinct map of rules to functions allows execution of stateful "code blocks" that can alter the parse tree; since all match attempts run in parallel, that state is owned by a single goroutine and never has to backtrack - the goroutine simply dies if its branch fails
+* Byte and rune support - option to match bytes or runes (the grammar needs to be written accordingly: in byte mode, literals and character classes can still contain unicode runes, but they would never match)
+* Cancelable - the parser supports context.Context (and that's how branches should be stopped when a higher priority one has matched)
+* Must be fast - should be extremely fast and benchmark favorably against pigeon and other parser generators / combinators
+* Strongly-typed - associated functions are strongly-typed; reflect-based thunks are generated to call them (see the sketch after this list)
+* No code generation - works directly on an input grammar, compiles it, and can reuse it to parse multiple inputs
+* Error reporting - should have great error reporting; with the parallel work it should be ~easy to track the farthest match location
+ - in particular, record position of rule/match pattern in the original grammar so an error can say "failed to match at line L col C rule R, expected one of RULES|LITERALS, got ..."
+* Stats - branches launched, rules executed, etc.
+
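+A minimal sketch of the reflect-based thunk idea behind the "Strongly-typed" goal above (the `makeActionThunk` name and the chosen action signature are assumptions for illustration, not the final design):
+
+```go
+package fastpeg
+
+import "reflect"
+
+// makeActionThunk wraps a strongly-typed action function, e.g.
+// func(matched []byte, caps *numberCaps) (int, error), into a uniform
+// closure the parser can call without knowing the concrete types.
+// caps must be a non-nil pointer to the captures struct.
+func makeActionThunk(fn interface{}) func(matched []byte, caps interface{}) (interface{}, error) {
+ v := reflect.ValueOf(fn)
+ return func(matched []byte, caps interface{}) (interface{}, error) {
+ out := v.Call([]reflect.Value{reflect.ValueOf(matched), reflect.ValueOf(caps)})
+ var val interface{}
+ var err error
+ if len(out) > 0 {
+ val = out[0].Interface()
+ }
+ if len(out) == 2 && !out[1].IsNil() {
+ err = out[1].Interface().(error)
+ }
+ return val, err
+ }
+}
+```
+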
+## Stretch goals
+
+* Location-aware parse - ability to parse from a specific location, using state from a previous parse for what came before (useful for e.g. editors) - might not be required if whole-file parsing is fast enough
+
+## Dropped goals
+
+* Separated lexer support - while PEGs don't need a lexer (it is part of the PEG itself), it can be useful to simplify the grammar by keeping lexing separate; the output of one parse could then be used as the lexer input of another parse - a pipeline of parsers
+ - Not really useful in practice (PEG can describe tokens)
+ - How would a grammar that expects a specific lexer work? How would it specify those tokens? Literals not allowed anymore? Raises a lot of design issues.
+
A => doc/syntax.peg +23 -0
@@ 1,23 @@
+# Any of '=', '<-', or the unicode left arrows can be used to define a rule
+entry = 'literal' _ rule ; # semicolon optional, but allowed
+
+# comments start with a pound sign, much less confusing than slashes in a PEG
+rule = A | B | C # pipe or slash can be used for ordered choice
+
+# convention: use all uppercase for terminals, lowercase for rules
+A = 'A'
+B = 'B'
+C = 'C'
+
+rule = ( A &B ) / ( A !C ) # pattern predicates
+
+# state function reference is an identifier inside ${} braces,
+# predicate function reference is an identifier inside &{} braces.
+# There is no !{} predicate, just invert the return condition for the
+# same effect. Action functions are identified by @{} braces.
+rule = ${ state_func } A &{ predicate_func } @{ action_func }
+
+# capture values (aka variables) are an identifier followed by a colon and
+# the captured pattern. They are made available to the action function
+# and are strongly-typed.
+rule = id:ALPHA+
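+
+# a fuller, hypothetical example combining the pieces above: captures, a
+# predicate call and an action call in a single rule. The function names
+# are illustrative; they would be provided through the parser's Predicates
+# and Actions maps.
+pair = key:ALPHA+ '=' &{ key_allowed } value:ALPHA+ @{ make_pair }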
A => matcher.go +214 -0
@@ 1,214 @@
+package fastpeg
+
+import (
+ "bytes"
+ "reflect"
+ "sync"
+ "unicode/utf8"
+)
+
+const (
+ emptyMatch = 0
+ noMatch = -1
+)
+
+type matcher interface {
+ match([]byte) int
+}
+
+type rule struct {
+ // TODO: position, for rule and each expression
+ name string
+ m matcher
+}
+
+func (r rule) match(src []byte) int {
+ // a rule matches if its expression matches
+ return r.m.match(src)
+}
+
+// "literal" or 'literal'
+type literal struct {
+ // TODO: prevent 0-byte literals (empty match), should not be allowed
+ lit []byte
+}
+
+func (l literal) match(src []byte) int {
+ if bytes.HasPrefix(src, l.lit) {
+ return len(l.lit)
+ }
+ return noMatch
+}
+
+// . (with utf-8 rune option)
+type anyRune struct{}
+
+func (r anyRune) match(src []byte) int {
+ // matches any valid rune, does not match invalid utf-8 encoding
+ _, sz, ok := decodeRune(src)
+ if !ok {
+ return noMatch
+ }
+ return sz
+}
+
+// . (with byte option)
+type anyByte struct{}
+
+func (b anyByte) match(src []byte) int {
+ if len(src) > 0 {
+ return 1
+ }
+ return noMatch
+}
+
+type charClass struct {
+ // TODO: use https://godoc.org/golang.org/x/text/unicode/rangetable to build/optimize tables
+ rt *unicode.RangeTable
+}
+
+func (c charClass) match(src []byte) int {
+ rn, sz, ok := decodeRune(src)
+ if !ok {
+ return noMatch
+ }
+ if unicode.Is(c.rt, rn) {
+ return sz
+ }
+ return noMatch
+}
+
+// R1 / R2
+type choice struct {
+ ms []matcher
+}
+
+func (c choice) match(src []byte) int {
+ var wg sync.WaitGroup
+
+ // TODO: needs context to stop branches early, keep farthest match failure position
+ results := make([]int, len(c.ms))
+ wg.Add(len(c.ms))
+ for i, m := range c.ms {
+ i, m := i, m // shadow loop variables for the goroutine
+ go func() {
+ results[i] = m.match(src)
+ wg.Done()
+ }()
+ }
+ wg.Wait()
+
+ // ordered choice: the first (highest-priority) successful alternative wins
+ for _, n := range results {
+ if n != noMatch {
+ return n
+ }
+ }
+ return noMatch
+}
+
+// R1 R2
+type sequence struct {
+ ms []matcher
+}
+
+func (s sequence) match(src []byte) int {
+ var start int
+ for _, m := range s.ms {
+ n := m.match(src[start:])
+ if n == noMatch {
+ return noMatch
+ }
+ start += n
+ }
+ return start
+}
+
+// name:R
+type label struct {
+ name string
+ m matcher
+}
+
+func (l label) match(src []byte) int {
+ // TODO: record the slice captured under l.name for the action function
+ return l.m.match(src)
+}
+
+// &R or !R
+type predicate struct {
+ m matcher
+ negate bool
+}
+
+func (p predicate) match(src []byte) int {
+ n := p.m.match(src)
+ if (n == noMatch) == p.negate {
+ return emptyMatch
+ }
+ return noMatch
+}
+
+// R?
+type optional struct {
+ m matcher
+}
+
+func (o optional) match(src []byte) int {
+ n := o.m.match(src)
+ if n == noMatch {
+ return emptyMatch
+ }
+ return n
+}
+
+// R+ or R*
+type repeat struct {
+ m matcher // TODO: must not be a predicate (infinite loop)
+ optional bool
+}
+
+func (r repeat) match(src []byte) int {
+ // TODO: detect empty matches, will result in infinite loop (e.g. an optional matcher)
+ var start int
+ for {
+ n := r.m.match(src[start:])
+ if n == noMatch {
+ if start == 0 {
+ // because no predicate matcher is allowed, start == 0 means
+ // no match yet
+ if r.optional {
+ return emptyMatch
+ }
+ return noMatch
+ }
+ return start
+ }
+ start += n
+ }
+}
+
+type stateFunc struct {
+ // TODO: atoms for func names?
+ fn reflect.Value
+}
+
+func (f stateFunc) match(src []byte) int {
+ // state functions always match
+ return emptyMatch
+}
+
+type predicateFunc struct {
+ fn reflect.Value
+}
+
+func (f predicateFunc) match(src []byte) int {
+ // TODO: verify in compile step that it has the proper return value
+ vs := f.fn.Call(nil)
+ if vs[0].Bool() {
+ return emptyMatch
+ }
+ return noMatch
+}
+
+type actionFunc struct {
+ fn reflect.Value
+}
+
+func (f actionFunc) match(src []byte) int {
+ // not called during matching
+ // TODO: should not be a matcher
+ return emptyMatch
+}
+
+func decodeRune(b []byte) (rn rune, sz int, ok bool) {
+ rn, sz = utf8.DecodeRune(b)
+ if rn == unicode.RuneError && sz < 2 {
+ return rn, sz, false
+ }
+ return rn, sz, true
+}
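+
+// A hypothetical sketch of how these matchers would compose once a grammar is
+// compiled (the compiler is not implemented here). For a rule ab = 'a' 'b'+ :
+//
+//   m := rule{name: "ab", m: sequence{ms: []matcher{
+//       literal{lit: []byte("a")},
+//       repeat{m: literal{lit: []byte("b")}},
+//   }}}
+//   n := m.match([]byte("abbb")) // n == 4, the number of bytes matched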