~mna/siberian

33a158bab7662861381138d3cdfeb63c90019c82 — Martin Angers 4 years ago ca2d2cc
implement EBNF alt/seq/repeat, tests
7 files changed, 353 insertions(+), 64 deletions(-)

M .gitignore
M ascii_table.go
M ascii_table_test.go
M ebnf.go
A ebnf_test.go
M matcher.go
M matcher_test.go
M .gitignore => .gitignore +3 -0
@@ 4,5 4,8 @@
# output of various helper commands
*.out

# test binaries generated e.g. for profiling
*.test

# binaries
/bin/

M ascii_table.go => ascii_table.go +6 -6
@@ 31,19 31,19 @@ func (t *ASCIITable) Unset(chars ...byte) {
	}
}

// AllowedBytes returns -1, true if all bytes in b are allowed by this ascii
// Are returns -1, true if all bytes in b are allowed by this ascii
// table, or the index of the first disallowed byte and false.
func (t *ASCIITable) AllowedBytes(b []byte) (n int, ok bool) {
func (t *ASCIITable) Are(b []byte) (n int, ok bool) {
	for i, ch := range b {
		if !t.Allowed(ch) {
		if !t.Is(ch) {
			return i, false
		}
	}
	return -1, true
}

// Allowed returns true if ch is allowed, false otherwise.
func (t *ASCIITable) Allowed(ch byte) bool {
// Is returns true if ch is allowed, false otherwise.
func (t *ASCIITable) Is(ch byte) bool {
	if ch > unicode.MaxASCII {
		return false
	}


@@ 87,7 87,7 @@ func (t *ASCIITable) String() string {
	}

	for i := byte(0); i <= unicode.MaxASCII; i++ {
		if t.Allowed(i) {
		if t.Is(i) {
			if last == 0xff {
				last = i
			}

M ascii_table_test.go => ascii_table_test.go +12 -4
@@ 44,6 44,14 @@ func TestASCIITable(t *testing.T) {
	}
}

// TestASCIITable_IsNonASCII checks that Is reports false for a byte
// above the ASCII range, even when other bytes are set in the table.
func TestASCIITable_IsNonASCII(t *testing.T) {
	var at ASCIITable
	at.Set('a', 'b')

	// 0xff is outside of ASCII, it must never be reported as allowed
	if !at.Is('\xff') {
		return
	}
	t.Fatalf("want false, got true")
}

func mustAllowASCII(t *testing.T, at *ASCIITable, allow string, strict bool) {
	t.Helper()
	mustASCII(t, at, allow, "", strict)


@@ 57,21 65,21 @@ func mustDisallowASCII(t *testing.T, at *ASCIITable, disallow string, strict boo
func mustASCII(t *testing.T, at *ASCIITable, allow, disallow string, strict bool) {
	for i := byte(0); i <= unicode.MaxASCII; i++ {
		if strings.IndexByte(allow, i) >= 0 {
			if !at.Allowed(i) {
			if !at.Is(i) {
				t.Errorf("%x (%[1]d) should be allowed", i)
			}
		} else if strict && allow != "" {
			if at.Allowed(i) {
			if at.Is(i) {
				t.Errorf("%x (%[1]d) should not be allowed", i)
			}
		}

		if strings.IndexByte(disallow, i) >= 0 {
			if at.Allowed(i) {
			if at.Is(i) {
				t.Errorf("%x (%[1]d) should not be allowed", i)
			}
		} else if strict && disallow != "" {
			if !at.Allowed(i) {
			if !at.Is(i) {
				t.Errorf("%x (%[1]d) should be allowed", i)
			}
		}

M ebnf.go => ebnf.go +58 -9
@@ 1,26 1,75 @@
package siberian

type Grammar struct{}
import "fmt"

type Prod struct {
	Name string
// Alt is a Matcher that matches if any of the alternatives matches.
// The alternatives are tried in order and the first match is used, so
// the order of the matchers in Ms matters. If Ms is empty, Alt doesn't
// match.
type Alt struct {
	// Ms holds the alternatives, tried from first to last.
	Ms []Matcher
}

type Alt struct {
// Match implements Matcher for Alt. It returns the length reported by
// the first matcher in Ms that matches b (which may be 0), or -1 if
// none of them matches, including when Ms is empty.
func (a *Alt) Match(b []byte) int {
	for i := range a.Ms {
		n := a.Ms[i].Match(b)
		if n < 0 {
			// this alternative failed, try the next one
			continue
		}
		return n
	}
	return -1
}

// Repeat is a Matcher that matches M at least Min times and at most
// Max times. Min can be zero, in which case Repeat always matches,
// and Max can be -1 for no limit.
type Repeat struct {
	// M is the matcher that is applied repeatedly.
	M   Matcher
	// Min is the minimum number of times M must match. Match panics
	// if it is negative.
	Min int
	// Max is the maximum number of times M is applied. A negative value
	// means no limit. Match panics if Max is non-negative and smaller
	// than Min.
	Max int
}

type Seq struct {
// Match implements Matcher for Repeat. It applies M repeatedly, each
// time on the input that remains after the previous match, and returns
// the total length matched once Max repetitions are reached or M stops
// matching. It returns -1 if M matched fewer than Min times.
//
// If M returns an empty (zero-length) match, repetition stops and the
// length matched so far is returned: an empty match consumes no input,
// so repeating it can satisfy any Min without changing the total.
//
// Match panics if Min is negative or if Max is non-negative and smaller
// than Min.
func (r *Repeat) Match(b []byte) int {
	// handle impossible cases
	if r.Min < 0 || (r.Max >= 0 && r.Max < r.Min) {
		panic(fmt.Sprintf("invalid Repeat limits: min=%d; max=%d", r.Min, r.Max))
	}

	var n, count int
	// a negative Max never equals count, so the loop is then unbounded
	for count != r.Max {
		nn := r.M.Match(b)
		if nn < 0 {
			if count >= r.Min {
				return n
			}
			return -1
		}
		if nn == 0 {
			// M matched without consuming input. Assuming M returns the
			// same result for the same input, it would keep doing so on
			// every further iteration; without this check an unbounded
			// Max would loop forever.
			return n
		}
		n += nn
		count++
		b = b[nn:]
	}
	return n
}

type Group struct {
// Seq is a Matcher that matches if the Ms all match in sequence, each
// one starting where the previous match ended. If Ms is empty, Seq
// matches with a length of 0.
type Seq struct {
	// Ms holds the matchers to apply, in order.
	Ms []Matcher
}

type Tok struct {
	// TODO: a token may be defined by a grammar (where e.g. the tokens
	// are single runes).
// Match implements Matcher for Seq. Each matcher in Ms is applied to
// the input left over by the previous one, and the sum of the match
// lengths is returned. If a matcher fails, its negative result is
// returned as is. An empty Ms yields a match of length 0.
func (s *Seq) Match(b []byte) int {
	total := 0
	rest := b
	for i := range s.Ms {
		got := s.Ms[i].Match(rest)
		if got < 0 {
			// propagate the failing matcher's result unchanged
			return got
		}
		rest = rest[got:]
		total += got
	}
	return total
}

A ebnf_test.go => ebnf_test.go +164 -0
@@ 0,0 1,164 @@
package siberian

import "testing"

func TestAlt(t *testing.T) {
	cases := []matcherTest{
		{"", -1},
		{"a", 1},
		{"ab", 1},
		{"abc", 1},
		{"b", -1},
		{"bc", 2},
		{"bcd", 2},
		{"bcdef", 2},
		{"de", -1},
		{"def", 3},
		{"g", -1},
	}
	alt := Alt{
		Ms: []Matcher{
			Equal("a"),
			Equal("bc"),
			Equal("def"),
		},
	}
	for _, c := range cases {
		t.Run(c.in, func(t *testing.T) {
			n := alt.Match([]byte(c.in))
			if n != c.len {
				t.Errorf("want match of length %d, got %d", c.len, n)
			}
		})
	}
}

func TestRepeat_ZeroOrOne(t *testing.T) {
	cases := []matcherTest{
		{"", 0},  // zero match is ok
		{"a", 0}, // zero match is ok
		{"ab", 2},
		{"aba", 2},
		{"abab", 2},   // one is max
		{"ababab", 2}, // one is max
		{"cabab", 0},
	}
	rep := Repeat{
		M:   Equal("ab"),
		Min: 0,
		Max: 1,
	}
	for _, c := range cases {
		t.Run(c.in, func(t *testing.T) {
			n := rep.Match([]byte(c.in))
			if n != c.len {
				t.Errorf("want match of length %d, got %d", c.len, n)
			}
		})
	}
}

func TestRepeat_OneOrMore(t *testing.T) {
	cases := []matcherTest{
		{"", -1},
		{"a", -1},
		{"ab", 2},
		{"aba", 2},
		{"abab", 4},
		{"ababab", 6},
		{"cabab", -1},
	}
	rep := Repeat{
		M:   Equal("ab"),
		Min: 1,
		Max: -1,
	}
	for _, c := range cases {
		t.Run(c.in, func(t *testing.T) {
			n := rep.Match([]byte(c.in))
			if n != c.len {
				t.Errorf("want match of length %d, got %d", c.len, n)
			}
		})
	}
}

func TestRepeat_ZeroOrMore(t *testing.T) {
	cases := []matcherTest{
		{"", 0},
		{"a", 0},
		{"ab", 2},
		{"aba", 2},
		{"abab", 4},
		{"ababab", 6},
		{"cabab", 0},
	}
	rep := Repeat{
		M:   Equal("ab"),
		Min: 0,
		Max: -1,
	}
	for _, c := range cases {
		t.Run(c.in, func(t *testing.T) {
			n := rep.Match([]byte(c.in))
			if n != c.len {
				t.Errorf("want match of length %d, got %d", c.len, n)
			}
		})
	}
}

func TestRepeat_TwoOrThree(t *testing.T) {
	cases := []matcherTest{
		{"", -1},
		{"a", -1},
		{"ab", -1},
		{"aba", -1},
		{"abab", 4},
		{"ababab", 6},
		{"abababab", 6},
		{"cabab", -1},
	}
	rep := Repeat{
		M:   Equal("ab"),
		Min: 2,
		Max: 3,
	}
	for _, c := range cases {
		t.Run(c.in, func(t *testing.T) {
			n := rep.Match([]byte(c.in))
			if n != c.len {
				t.Errorf("want match of length %d, got %d", c.len, n)
			}
		})
	}
}

func TestSeq(t *testing.T) {
	cases := []matcherTest{
		{"", -1},
		{"a", -1},
		{"ab", -1},
		{"abc", -1},
		{"abcd", -1},
		{"abcde", -1},
		{"abcdef", 6},
		{"abcdefg", 6},
		{"zabcdef", -1},
	}
	seq := Seq{
		Ms: []Matcher{
			Equal("a"),
			Equal("bc"),
			Equal("def"),
		},
	}
	for _, c := range cases {
		t.Run(c.in, func(t *testing.T) {
			n := seq.Match([]byte(c.in))
			if n != c.len {
				t.Errorf("want match of length %d, got %d", c.len, n)
			}
		})
	}
}

M matcher.go => matcher.go +31 -23
@@ 9,31 9,36 @@ import (
	"unicode/utf8"
)

// NOTE: Matchers can implement optional interfaces to e.g. indicate a
// friendly name to be used in messages?

// Matcher defines the method to identify a match on the input bytes.
// Match must return the length of the match, which may be 0. A
// negative value must be returned to indicate no match.
type Matcher interface {
	Match(b []byte) (int, bool)
	Match(b []byte) int
}

// MatcherFunc is a function type that implements Matcher by calling
// itself.
type MatcherFunc func([]byte) (int, bool)
type MatcherFunc func([]byte) int

// Match implements Matcher for MatcherFunc by calling itself.
func (f MatcherFunc) Match(b []byte) (int, bool) {
func (f MatcherFunc) Match(b []byte) int {
	return f(b)
}

// ASCII returns a Matcher that matches if the next byte is in the provided
// ASCII table of allowed bytes.
func ASCII(tbl *ASCIITable) MatcherFunc {
	return func(b []byte) (int, bool) {
	return func(b []byte) int {
		if len(b) == 0 {
			return 0, false
			return -1
		}
		if tbl.Allowed(b[0]) {
			return 1, true
		if tbl.Is(b[0]) {
			return 1
		}
		return 0, false
		return -1
	}
}



@@ 41,15 46,15 @@ func ASCII(tbl *ASCIITable) MatcherFunc {
// range table. The package golang.org/x/text/unicode/rangetable can be
// used to construct a range table.
func Unicode(rt *unicode.RangeTable) MatcherFunc {
	return func(b []byte) (int, bool) {
	return func(b []byte) int {
		r, sz := utf8.DecodeRune(b)
		if r == utf8.RuneError && sz < 2 {
			return 0, false
			return -1
		}
		if unicode.Is(rt, r) {
			return sz, true
			return sz
		}
		return 0, false
		return -1
	}
}



@@ 60,22 65,22 @@ func Regexp(re *regexp.Regexp) MatcherFunc {
	if !strings.HasPrefix(re.String(), "^") {
		panic(fmt.Sprintf("regular expression %q must be anchored to start of input", re.String()))
	}
	return func(b []byte) (int, bool) {
	return func(b []byte) int {
		if ixs := re.FindIndex(b); ixs != nil {
			return ixs[1], true
			return ixs[1]
		}
		return 0, false
		return -1
	}
}

// Equal returns a Matcher that matches s exactly.
func Equal(s string) MatcherFunc {
	sb := []byte(s)
	return func(b []byte) (int, bool) {
		if bytes.Equal(b, sb) {
			return len(sb), true
	return func(b []byte) int {
		if bytes.HasPrefix(b, sb) {
			return len(sb)
		}
		return 0, false
		return -1
	}
}



@@ 83,10 88,13 @@ func Equal(s string) MatcherFunc {
// (case-insensitive).
func EqualFold(s string) MatcherFunc {
	sb := []byte(s)
	return func(b []byte) (int, bool) {
		if bytes.EqualFold(b, sb) {
			return len(sb), true
	return func(b []byte) int {
		if len(b) < len(sb) {
			return -1
		}
		if bytes.EqualFold(b[:len(sb)], sb) {
			return len(sb)
		}
		return 0, false
		return -1
	}
}

M matcher_test.go => matcher_test.go +79 -22
@@ 1,6 1,7 @@
package siberian

import (
	"regexp"
	"testing"
	"unicode"
)


@@ 8,57 9,113 @@ import (
type matcherTest struct {
	in  string
	len int
	ok  bool
}

func TestASCII(t *testing.T) {
	cases := []matcherTest{
		{"", 0, false},
		{"a", 1, true},
		{"z", 0, false},
		{"ab", 1, true},
		{"abc", 1, true},
		{"az", 1, true},
		{"za", 0, false},
		{"", -1},
		{"a", 1},
		{"z", -1},
		{"ab", 1},
		{"abc", 1},
		{"az", 1},
		{"za", -1},
	}
	var at ASCIITable
	at.Set([]byte("abc")...)
	for _, c := range cases {
		t.Run(c.in, func(t *testing.T) {
			m := ASCII(&at)
			n, ok := m.Match([]byte(c.in))
			n := m.Match([]byte(c.in))
			if n != c.len {
				t.Errorf("want match of length %d, got %d", c.len, n)
			}
			if ok != c.ok {
				t.Errorf("want match? %t, got %t", c.ok, ok)
			}
		})
	}
}

func TestUnicode(t *testing.T) {
	cases := []matcherTest{
		{"", 0, false},
		{"a", 1, true},
		{"A", 0, false},
		{"α", 2, true},
		{"α!", 2, true},
		{"!α", 0, false},
		{"", -1},
		{"a", 1},
		{"A", -1},
		{"α", 2},
		{"α!", 2},
		{"!α", -1},
	}
	for _, c := range cases {
		t.Run(c.in, func(t *testing.T) {
			m := Unicode(unicode.Ll) // letter, lowercase
			n, ok := m.Match([]byte(c.in))
			n := m.Match([]byte(c.in))
			if n != c.len {
				t.Errorf("want match of length %d, got %d", c.len, n)
			}
			if ok != c.ok {
				t.Errorf("want match? %t, got %t", c.ok, ok)
			}
		})
	}
}

// TestRegexp checks that the Regexp matcher returns the length of the
// anchored regular expression match, or -1 when the input does not
// match.
func TestRegexp(t *testing.T) {
	re := regexp.MustCompile(`^[a-c]+`)
	tests := []matcherTest{
		{"", -1},
		{"a", 1},
		{"ab", 2},
		{"abc", 3},
		{"abcabcd", 6},
		{"xabcabcd", -1},
	}
	for _, tc := range tests {
		t.Run(tc.in, func(t *testing.T) {
			got := Regexp(re).Match([]byte(tc.in))
			if got != tc.len {
				t.Errorf("want match of length %d, got %d", tc.len, got)
			}
		})
	}
}

// TestEqual checks that the Equal matcher matches "abc" as an exact,
// case-sensitive prefix of the input.
func TestEqual(t *testing.T) {
	tests := []matcherTest{
		{"", -1},
		{"a", -1},
		{"ab", -1},
		{"abc", 3},
		{"ABC", -1},
		{"abcd", 3},
		{"abcD", 3},
		{"AbC", -1},
	}
	for _, tc := range tests {
		t.Run(tc.in, func(t *testing.T) {
			got := Equal("abc").Match([]byte(tc.in))
			if got != tc.len {
				t.Errorf("want match of length %d, got %d", tc.len, got)
			}
		})
	}
}

// TestEqualFold checks that the EqualFold matcher matches "abc" as a
// case-insensitive prefix of the input.
func TestEqualFold(t *testing.T) {
	tests := []matcherTest{
		{"", -1},
		{"a", -1},
		{"ab", -1},
		{"abc", 3},
		{"ABC", 3},
		{"abcd", 3},
		{"abcD", 3},
		{"AbC", 3},
		{"AbDc", -1},
	}
	for _, tc := range tests {
		t.Run(tc.in, func(t *testing.T) {
			got := EqualFold("abc").Match([]byte(tc.in))
			if got != tc.len {
				t.Errorf("want match of length %d, got %d", tc.len, got)
			}
		})
	}
}