~mna/fastpeg

8dd3e8c064fbca4f913a96e8289d13a265f86786 — Martin Angers 5 years ago e67c799
tune the ebnf grammar, implement token package
4 files changed, 206 insertions(+), 77 deletions(-)

M doc/grammar.ebnf
A go.mod
A internal/bootstrap/token/token.go
M matcher.go
M doc/grammar.ebnf => doc/grammar.ebnf +70 -56
@@ 5,69 5,83 @@
# [0]: https://golang.org/ref/spec#Notation

grammar        = definition { definition } .
definition     = identifier ARROW expression [ EOS ] .

# TODO: decide how to end a definition when EOS is missing. A newline would
# be needed as the terminator, but a newline may appear elsewhere too.
definition     = IDENTIFIER ARROW expression [ EOS ] .

expression     = sequence { SEPARATOR sequence } .
sequence       = label { label } .
label          = [ identifier COLON ] prefix .
prefix         = AND coderef # TODO: move coderefs before labels, no sense in having id:${code}
               | DOLLAR coderef
               | AT coderef
               | [ AND | NOT ] suffix

sequence       = code { code } .

code           = AND CODEREF
               | DOLLAR CODEREF
               | AT CODEREF
               | label
               .

label          = [ IDENTIFIER COLON ] prefix .

prefix         = [ AND | NOT ] suffix .

suffix         = primary [ QUESTION | STAR | PLUS ] .
primary        = identifier

# the parser makes sure that the identifier is not followed by ARROW
# (which would make it the start of a definition instead).
primary        = IDENTIFIER
               | LPAREN expression RPAREN
               | literal
               | class
               | LITERAL
               | CLASS
               | DOT
               .
coderef        = LBRACE identifier RBRACE .
identifier     = LETTER { LETTER | NUMBER } .
literal        = dquote_literal | squote_literal .
dquote_literal = DQUOTE { SAFE_RUNE | SQUOTE | RBRACK | dquote_escape } DQUOTE .
squote_literal = SQUOTE { SAFE_RUNE | DQUOTE | RBRACK | squote_escape } SQUOTE .
class          = LBRACK range { range } RBRACK .
range          = char MINUS char | char .
char           = SAFE_RUNE | SQUOTE | DQUOTE | class_escape .
dquote_escape  = common_escape | BACKSLASH DQUOTE .
squote_escape  = common_escape | BACKSLASH SQUOTE .
class_escape   = common_escape | BACKSLASH LBRACK | BACKSLASH RBRACK | BACKSLASH MINUS | BACKSLASH CHEVRON .
common_escape  = hex_escape
               | unicode_escape
               | BACKSLASH ( 'a' | 'b' | 'f' | 'n' | 'r' | 't' | 'v' | BACKSLASH )
               .
hex_escape     = BACKSLASH 'x' HEX_DIGIT HEX_DIGIT .
unicode_escape = BACKSLASH 'u' HEX_DIGIT HEX_DIGIT HEX_DIGIT HEX_DIGIT
               | BACKSLASH 'U' HEX_DIGIT HEX_DIGIT HEX_DIGIT HEX_DIGIT HEX_DIGIT HEX_DIGIT HEX_DIGIT HEX_DIGIT
               .

# For reference, the terminal productions are loosely defined here.
# They are handled by the scanner.

ARROW     = '<-' | '←' | '⟵' | '=' .
EOS       = ';' .                    # EOS stands for "end of statement".
SEPARATOR = '|' | '/' .
COLON     = ':' .
AND       = '&' .
DOLLAR    = '$' .
AT        = '@' .
NOT       = '!' .
QUESTION  = '?' .
STAR      = '*' .
PLUS      = '+' .
LPAREN    = '(' .
RPAREN    = ')' .
DOT       = '.' .
LBRACE    = '{' .
RBRACE    = '}' .
LETTER    = any unicode letter or '_' .
NUMBER    = any unicode number or '-' .
LBRACK    = '[' .
RBRACK    = ']' .
DQUOTE    = '"' .
SQUOTE    = '\'' .
SAFE_RUNE = any unicode code point except '\n', '\\', '"', '\'' and ']' .
BACKSLASH = '\\' .
MINUS     = '-' .
CHEVRON   = '^' .
HEX_DIGIT = '0' ... '9' | 'a' ... 'f' | 'A' ... 'F' .
IDENTIFIER = LETTER { LETTER | NUMBER } .
ARROW      = '<-' | '←' | '⟵' | '=' .
EOS        = ';' . # EOS stands for "end of statement".
SEPARATOR  = '|' | '/' .
COLON      = ':' .
AND        = '&' .
DOLLAR     = '$' .
AT         = '@' .
NOT        = '!' .
QUESTION   = '?' .
STAR       = '*' .
PLUS       = '+' .
LPAREN     = '(' .
RPAREN     = ')' .
DOT        = '.' .
LBRACE     = '{' .
RBRACE     = '}' .
LETTER     = any unicode letter or '_' .
NUMBER     = any unicode number or '-' .
LBRACK     = '[' .
RBRACK     = ']' .
DQUOTE     = '"' .
SQUOTE     = '\'' .
SAFE_RUNE  = any unicode code point except '\n', '\\', '"', '\'' and ']' .
BACKSLASH  = '\\' .
MINUS      = '-' .
CHEVRON    = '^' .

CODEREF        = LBRACE IDENTIFIER RBRACE .
LITERAL        = DQUOTE_LITERAL | SQUOTE_LITERAL .
DQUOTE_LITERAL = DQUOTE { SAFE_RUNE | SQUOTE | RBRACK | DQUOTE_ESCAPE } DQUOTE .
SQUOTE_LITERAL = SQUOTE { SAFE_RUNE | DQUOTE | RBRACK | SQUOTE_ESCAPE } SQUOTE .
CLASS          = LBRACK RANGE { RANGE } RBRACK .
RANGE          = CHAR MINUS CHAR | CHAR .
CHAR           = SAFE_RUNE | SQUOTE | DQUOTE | CLASS_ESCAPE .
DQUOTE_ESCAPE  = COMMON_ESCAPE | BACKSLASH DQUOTE .
SQUOTE_ESCAPE  = COMMON_ESCAPE | BACKSLASH SQUOTE .
CLASS_ESCAPE   = COMMON_ESCAPE | BACKSLASH LBRACK | BACKSLASH RBRACK | BACKSLASH MINUS | BACKSLASH CHEVRON .
COMMON_ESCAPE  = HEX_ESCAPE
               | UNICODE_ESCAPE
               | BACKSLASH ( 'a' | 'b' | 'f' | 'n' | 'r' | 't' | 'v' | BACKSLASH )
               .
HEX_ESCAPE     = BACKSLASH 'x' HEX_DIGIT HEX_DIGIT .
UNICODE_ESCAPE = BACKSLASH 'u' HEX_DIGIT HEX_DIGIT HEX_DIGIT HEX_DIGIT
               | BACKSLASH 'U' HEX_DIGIT HEX_DIGIT HEX_DIGIT HEX_DIGIT HEX_DIGIT HEX_DIGIT HEX_DIGIT HEX_DIGIT
               .
HEX_DIGIT      = '0' ... '9' | 'a' ... 'f' | 'A' ... 'F' .

A go.mod => go.mod +3 -0
@@ 0,0 1,3 @@
module git.sr.ht/~mna/fastpeg

go 1.12

A internal/bootstrap/token/token.go => internal/bootstrap/token/token.go +106 -0
@@ 0,0 1,106 @@
// Package token defines constants representing the lexical tokens
// of the fastpeg language and basic operations on tokens.
package token

import (
	"fmt"
	"go/token"
)

// File is a handle for a file belonging to a FileSet. It embeds
// token.File, which has a name, size, and line offset table.
type File struct {
	token.File
}

// FileSet represents a set of source files. It embeds token.FileSet,
// whose methods are synchronized; multiple goroutines may invoke them
// concurrently.
type FileSet struct {
	token.FileSet
}

// Pos is a compact encoding of a source position within a file set.
// It embeds token.Pos and can be converted into a Position for a more
// convenient, but much larger, representation.
type Pos struct {
	token.Pos
}

// Position describes an arbitrary source position including the file,
// line, and column location. It embeds token.Position; a Position is
// valid if the line number is > 0.
type Position struct {
	token.Position
}

// Token is the set of lexical tokens of the fastpeg language.
type Token int

// List of possible tokens.
//
// The values are assigned sequentially by iota starting at Illegal (0), so
// the declaration order is significant: IsLiteral relies on the position
// of litStart and litEnd, and String indexes stringTokens by these values.
const (
	Illegal Token = iota
	EOF
	Comment

	// literals
	//
	// litStart and litEnd are unexported markers: IsLiteral reports true
	// for every token declared strictly between them.
	litStart
	Identifier
	Literal
	CharClass
	Arrow         // either '<-', '←', '⟵', '='
	Separator     // either '|' or '/'
	PredCoderef   // &{ Identifier }, literal is the identifier
	StateCoderef  // ${ Identifier }, literal is the identifier
	ActionCoderef // @{ Identifier }, literal is the identifier
	litEnd

	// operators and punctuation
	//
	// symStart and symEnd are unexported markers bounding this group;
	// nothing else in this file references them.
	symStart
	Dot         // .
	Colon       // :
	Semicolon   // ;
	Ampersand   // &
	Exclamation // !
	Question    // ?
	Star        // *
	Plus        // +
	Lparen      // (
	Rparen      // )
	symEnd
)

// stringTokens holds the string representation of each token, indexed by
// the token's integer value. Tokens in the literals group are spelled by
// name, operators and punctuation by their symbol. The marker constants
// (litStart, litEnd, symStart) have no entry and are left as the empty
// string, and symEnd lies past the end of the array; String renders all
// of those in its "token(N)" fallback form.
var stringTokens = [...]string{
	Illegal:       "Illegal",
	EOF:           "EOF",
	Comment:       "Comment",
	Identifier:    "Identifier",
	Literal:       "Literal",
	CharClass:     "CharClass",
	Arrow:         "Arrow",
	Separator:     "Separator",
	PredCoderef:   "PredCoderef",
	StateCoderef:  "StateCoderef",
	ActionCoderef: "ActionCoderef",
	Dot:           ".",
	Colon:         ":",
	Semicolon:     ";",
	Ampersand:     "&",
	Exclamation:   "!",
	Question:      "?",
	Star:          "*",
	Plus:          "+",
	Lparen:        "(",
	Rparen:        ")",
}

// IsLiteral reports whether t is declared strictly between the litStart
// and litEnd markers of the token list, i.e. whether it is one of
// Identifier, Literal, CharClass, Arrow, Separator or the three coderef
// tokens.
func (t Token) IsLiteral() bool {
	// Anything at or below the opening marker is outside the group.
	if t <= litStart {
		return false
	}
	return t < litEnd
}

// String returns the string representation of token t.
//
// For a token with a non-empty entry in stringTokens, that entry is
// returned; this includes Illegal, whose value is 0. Any other value
// (negative, past the end of the table, or one of the marker constants
// that has no entry) is rendered as "token(N)", where N is the integer
// value of t.
func (t Token) String() string {
	// The lower bound must be inclusive: Illegal is 0 and has an entry.
	if t >= 0 && int(t) < len(stringTokens) {
		if s := stringTokens[t]; s != "" {
			return s
		}
	}
	return fmt.Sprintf("token(%d)", t)
}

M matcher.go => matcher.go +27 -21
@@ 3,7 3,7 @@ package fastpeg
import (
	"bytes"
	"reflect"
	"sync"
	"unicode"
	"unicode/utf8"
)



@@ 35,7 35,7 @@ type literal struct {

func (l literal) match(src []byte) int {
	if bytes.Equal(l.lit, src) {
		return len(b)
		return len(src)
	}
	return noMatch
}


@@ 45,7 45,7 @@ type anyRune struct{}

func (r anyRune) match(src []byte) int {
	// matches any valid rune, does not match invalid utf-8 encoding
	rn, sz, ok := decodeRune(src)
	_, sz, ok := decodeRune(src)
	if !ok {
		return noMatch
	}


@@ 84,17 84,20 @@ type choice struct {
}

func (c choice) match(src []byte) int {
	var wg sync.WaitGroup

	// TODO: run all in parallel, needs context to stop early, keep farthest match failure position
	wg.Add(len(c.ms))
	for _, m := range c.ms {
		go func() {
			m.match(src)
			wg.Done()
		}()
	}
	wg.Wait()
	/*
		var wg sync.WaitGroup

		// TODO: run all in parallel, needs context to stop early, keep farthest match failure position
		wg.Add(len(c.ms))
		for _, m := range c.ms {
			go func() {
				m.match(src)
				wg.Done()
			}()
		}
		wg.Wait()
	*/
	return noMatch
}

// R1 R2


@@ 140,7 143,7 @@ type optional struct {
}

func (o optional) match(src []byte) int {
	n := r.m.match(src[start:])
	n := o.m.match(src)
	if n == noMatch {
		return emptyMatch
	}


@@ 189,10 192,12 @@ type predicateFunc struct {

func (f predicateFunc) match(src []byte) int {
	// TODO: verify in compile step that it has the proper return value
	vs := reflect.Call(f.fn)
	if vs[0].Bool() {
		return emptyMatch
	}
	/*
		vs := reflect.Call(f.fn)
		if vs[0].Bool() {
			return emptyMatch
		}
	*/
	return noMatch
}



@@ 203,11 208,12 @@ type actionFunc struct {
func (f actionFunc) match(src []byte) int {
	// not called during matching
	// TODO: should not be a matcher
	return noMatch
}

func decodeRune(b []byte) (rn rune, sz int, ok bool) {
	rn, sz := utf8.DecodeRune(src)
	if rn == unicode.RuneError && sz < 2 {
	rn, sz = utf8.DecodeRune(b)
	if rn == utf8.RuneError && sz < 2 {
		return rn, sz, false
	}
	return rn, sz, true