~mna/fastpeg

e9adbde7a0c81be18fd2967e67ed4bcfa00ee9bf — Martin Angers 5 years ago
initial commit
5 files changed, 312 insertions(+), 0 deletions(-)

A .gitignore
A doc/api.go
A doc/goals.md
A doc/syntax.peg
A matcher.go
A  => .gitignore +5 -0
@@ 1,5 @@
# ignore direnv file
.envrc

# ignore coverage, profiling, etc. 'out' files
*.out

A  => doc/api.go +44 -0
@@ 1,44 @@
package doc

// Parser is a parser for a compiled grammar, along with the maps of
// user-provided functions that the grammar refers to by name.
type Parser struct {
	// Actions maps a grammar rule name to an action function. The function is called
	// only on the final, resolved parse tree.
	Actions FuncMap

	// States maps a state call identifier to a state function. The function is called
	// during parsing and as such might be called multiple times, so it should be as
	// short and efficient as possible. It can read and write state that will be
	// available to subsequent state and predicate calls.
	States FuncMap

	// Predicates maps a predicate call identifier to a predicate function. The function
	// is called during parsing and as such might be called multiple times, so it should
	// be as short and efficient as possible. It has read-only access to state data
	// stored by state functions. It returns a boolean that controls the matching logic.
	Predicates FuncMap
}

// Parse parses input according to the grammar represented by p.
//
// NOTE(review): this is currently a stub; it does not look at input and
// always returns a nil error.
func (p *Parser) Parse(input []byte) error {
	return nil
}

// Compile compiles a Parser from the provided grammar.
//
// NOTE(review): this is currently a stub; it ignores grammar and returns a
// nil *Parser together with a nil error, so a nil error does not yet mean
// that the returned Parser is usable.
func Compile(grammar []byte) (*Parser, error) {
	return nil, nil
}

// FuncMap maps an identifier (a grammar rule name, or a state or predicate
// call identifier) to a function.
//
// When used for actions, the function must accept two arguments:
//   - []byte: the slice of input matched by the rule
//   - struct pointer: a struct whose fields match the names of the captures
//     in the rule. Associated values will be set as field values. The field
//     types must match the types of the values (be assignable from them).
//
// The function may return 0, 1 or 2 values. If 1, it is the value of the
// match for this rule; if 2, the first value is the value of the match and
// the second must be of type error and adds an error to the parser, without
// impacting the parse tree (i.e. what matched still matches, but the parser
// will exit with - at least - this error).
type FuncMap map[string]interface{}

A  => doc/goals.md +26 -0
@@ 1,26 @@
# Goals

* Zero allocation - parsing is done by slicing through the input byte slice
* Massively parallel - choices are attempted in parallel and stopped as soon as possible; only the highest-priority successful parse is kept
* Grammar-only - no code blocks in the grammar, code blocks are associated with rules via a map passed to the parser, making grammars reusable with multiple code actions (e.g. AST generation, debug printing, etc.)
* No action code execution during parsing - matches only record the functions that will be called at the end, on the successful parse tree
* Stateful code support - a distinct map of rules-to-function allows execution of stateful "code blocks" that can alter the parse tree, but since all match attempts are in parallel, that state is used by a single goroutine and doesn't have to backtrack, it simply dies if that branch fails
* Byte and rune support - option to use bytes or runes (of course the grammar needs to be built for this, literals and character classes can still represent unicode runes, but would never match)
* Cancelable - the parser supports context.Context (and that's how branches should be stopped when a higher priority one has matched)
* Must be fast - should be extremely fast, needs to benchmark favorably against pigeon and other parser generators / combinators
* Strongly-typed - associated functions are strongly-typed, reflect-based thunks are generated
* No code generation - works directly on an input grammar, compiles it, and can reuse it to parse multiple inputs
* Error reporting - should have great error reporting; it should be fairly easy to track the farthest match location given the parallel work
  - in particular, record position of rule/match pattern in the original grammar so an error can say "failed to match at line L col C rule R, expected one of RULES|LITERALS, got ..."
* Stats - branches launched, rules executed, etc.

## Stretch goals

* Location-aware parse - ability to parse from a specific location, using state from a previous parse for what came before (useful for e.g. editors) - might not be required if whole-file parsing is fast enough

## Dropped goals

* Separated lexer support - while PEGs don't need a lexer (it is part of the PEG itself), it can be useful to simplify the grammar by keeping them separate; as such, the output of one parse could be used as the lexer (i.e. as input) for another parse - a pipeline of parsers
  - Not really useful in practice (PEG can describe tokens)
  - How would a grammar that expects a specific lexer work? How would it specify those tokens? Literals not allowed anymore? Raises a lot of design issues.


A  => doc/syntax.peg +23 -0
@@ 1,23 @@
# Any of '=', '<-', unicode left arrows can be used
entry = 'literal' _ rule ; # semicolon optional, but allowed

# comment is pound sign, much less confusing than slashes in a PEG
rule = A | B | C # pipe or slash can be used for ordered choice

# convention: use all uppercase for terminals, lowercase for rules
A = 'A'
B = 'B'
C = 'C'

rule = ( A &B ) / ( A !C ) # pattern predicates

# state function reference is an identifier inside ${} braces,
# predicate function reference is an identifier inside &{} braces.
# There is no !{} predicate, just invert the return condition for the
# same effect. Action functions are identified by @{} braces.
rule = ${ state_func } A &{ predicate_func } @{ action_func }

# capture value (aka variables) are an id followed by a colon and the
# captured pattern. Those are made available to the action function
# and are strongly-typed.
rule = id:ALPHA+

A  => matcher.go +214 -0
@@ 1,214 @@
package fastpeg

import (
	"bytes"
	"reflect"
	"sync"
	"unicode"
	"unicode/utf8"
)

// Sentinel values returned by matcher.match.
const (
	// emptyMatch is returned when a matcher succeeds without consuming any input.
	emptyMatch = 0
	// noMatch is returned when a matcher fails to match.
	noMatch    = -1
)

// matcher is implemented by the grammar expressions in this file. match is
// given the remaining input and returns noMatch on failure; on success the
// implementations here return the number of bytes they consumed from the
// start of that input, which is emptyMatch for a zero-width match.
type matcher interface {
	match([]byte) int
}

// rule is a named grammar rule: a name bound to the expression that
// defines it.
type rule struct {
	// TODO: position, for rule and each expression
	name string
	m    matcher
}

// match reports the result of matching the rule's expression against src;
// a rule matches exactly when its expression does.
func (r rule) match(src []byte) int {
	expr := r.m
	return expr.match(src)
}

// "literal" or 'literal'
type literal struct {
	// TODO: prevent 0-byte literals (empty match), should not be allowed
	lit []byte
}

func (l literal) match(src []byte) int {
	if bytes.Equal(l.lit, src) {
		return len(b)
	}
	return noMatch
}

// anyRune is the "." expression when the utf-8 rune option is used.
type anyRune struct{}

// match returns the size in bytes of the rune decoded at the start of src,
// or noMatch if decodeRune reports that no valid rune could be decoded.
func (r anyRune) match(src []byte) int {
	// matches any valid rune, does not match invalid utf-8 encoding; the
	// decoded rune itself is irrelevant, only its size is
	_, sz, ok := decodeRune(src)
	if !ok {
		return noMatch
	}
	return sz
}

// anyByte is the "." expression when the byte option is used.
type anyByte struct{}

// match consumes exactly one byte when src is non-empty and reports
// noMatch on empty input.
func (b anyByte) match(src []byte) int {
	if len(src) == 0 {
		return noMatch
	}
	return 1
}

type charClass struct {
	// TODO: use https://godoc.org/golang.org/x/text/unicode/rangetable to build/optimize tables
	rt *unicode.RangeTable
}

func (c charClass) match(src []byte) int {
	rn, sz, ok := decodeRune(src)
	if !ok {
		return noMatch
	}
	if unicode.Is(c.rt, rn) {
		return sz
	}
	return noMatch
}

// choice is an ordered choice between alternatives, written R1 / R2 in the
// grammar.
type choice struct {
	ms []matcher
}

// match tries every alternative against src concurrently and returns the
// result of the first alternative, in declaration order, that matched. It
// returns noMatch when no alternative matched (or when there are none).
func (c choice) match(src []byte) int {
	// TODO: needs context to stop early, keep farthest match failure position

	// Each goroutine writes only to its own slot, so no further
	// synchronization than the WaitGroup is required.
	results := make([]int, len(c.ms))

	var wg sync.WaitGroup
	wg.Add(len(c.ms))
	for i, m := range c.ms {
		// Pass the loop variables as arguments so that every goroutine
		// works on its own alternative regardless of the Go version.
		go func(i int, m matcher) {
			defer wg.Done()
			results[i] = m.match(src)
		}(i, m)
	}
	wg.Wait()

	// Ordered choice: the earliest successful alternative wins, even if a
	// later one matched as well.
	for _, n := range results {
		if n != noMatch {
			return n
		}
	}
	return noMatch
}

// sequence matches its expressions one after the other, written R1 R2 in
// the grammar.
type sequence struct {
	ms []matcher
}

// match applies each expression to the input left over by the previous
// ones. It returns the total number of bytes consumed, or noMatch as soon
// as one expression fails.
func (s sequence) match(src []byte) int {
	consumed := 0
	for i := range s.ms {
		n := s.ms[i].match(src[consumed:])
		if n == noMatch {
			return noMatch
		}
		consumed += n
	}
	return consumed
}

// label is a labeled expression, written name:R in the grammar: name is the
// label and m the labeled expression.
//
// NOTE(review): no match method is defined for label in the visible source,
// so it does not satisfy the matcher interface here.
type label struct {
	name string
	m    matcher
}

// &R or !R
type predicate struct {
	m      matcher
	negate bool
}

// match evaluates the wrapped expression against src without consuming any
// input. A positive predicate (negate == false) succeeds with emptyMatch
// when the expression matches; a negative one (negate == true) succeeds
// with emptyMatch when it does not. In every other case it returns noMatch.
func (p predicate) match(src []byte) int {
	failed := p.m.match(src) == noMatch
	if failed != p.negate {
		return noMatch
	}
	return emptyMatch
}

// optional makes its expression optional, written R? in the grammar.
type optional struct {
	m matcher
}

// match returns the result of matching the wrapped expression against src,
// except that a failure is turned into emptyMatch: an optional expression
// always succeeds, consuming nothing when its expression does not match.
func (o optional) match(src []byte) int {
	n := o.m.match(src)
	if n == noMatch {
		return emptyMatch
	}
	return n
}

// repeat matches its expression as many times as possible, written R+ (one
// or more) or R* (zero or more, optional set to true) in the grammar.
type repeat struct {
	m        matcher // TODO: reject predicates and other zero-width expressions at compile time
	optional bool
}

// match applies the expression repeatedly, each time to the input left
// over by the previous iteration, until it fails. It returns the total
// number of bytes consumed. When the expression never matched, it returns
// emptyMatch if the repetition is optional (R*) and noMatch otherwise (R+).
func (r repeat) match(src []byte) int {
	var (
		start   int
		matched bool
	)
	for {
		n := r.m.match(src[start:])
		if n == noMatch {
			break
		}
		matched = true
		if n == emptyMatch {
			// A zero-width match (e.g. an optional or a predicate) does
			// not advance, so trying again would yield the same result
			// forever. Count it as one successful iteration and stop.
			break
		}
		start += n
	}
	if !matched && !r.optional {
		return noMatch
	}
	return start
}

// stateFunc is the matcher for a state function; fn holds the function as
// a reflect.Value.
type stateFunc struct {
	// TODO: atoms for func names?
	fn reflect.Value
}

// match always succeeds with emptyMatch, consuming no input.
//
// NOTE(review): fn is not invoked here, src is unused.
func (f stateFunc) match(src []byte) int {
	// state functions always match
	return emptyMatch
}

type predicateFunc struct {
	fn reflect.Value
}

func (f predicateFunc) match(src []byte) int {
	// TODO: verify in compile step that it has the proper return value
	vs := reflect.Call(f.fn)
	if vs[0].Bool() {
		return emptyMatch
	}
	return noMatch
}

type actionFunc struct {
	fn reflect.Value
}

func (f actionFunc) match(src []byte) int {
	// not called during matching
	// TODO: should not be a matcher
}

// decodeRune decodes the first UTF-8 encoded rune in b. It returns the rune
// and its size in bytes, and ok reports whether a valid rune was decoded.
//
// utf8.DecodeRune signals a failure by returning utf8.RuneError with a size
// of 0 (b is empty) or 1 (invalid encoding). A properly encoded U+FFFD is
// also utf8.RuneError but has a larger size, which is why the size has to
// be checked in addition to the rune.
func decodeRune(b []byte) (rn rune, sz int, ok bool) {
	rn, sz = utf8.DecodeRune(b)
	if rn == utf8.RuneError && sz < 2 {
		return rn, sz, false
	}
	return rn, sz, true
}