~mna/siberian

a7838bf519797cc7a83b9101787bb65e57badc1b — Martin Angers 4 years ago 914f2ad
add matchers
2 files changed, 116 insertions(+), 24 deletions(-)

M ebnf.go
M ebnf_test.go
M ebnf.go => ebnf.go +106 -14
@@ 1,15 1,20 @@
package siberian

import (
	"bytes"
	"fmt"
	"regexp"
	"strings"
	"unicode"
	"unicode/utf8"
)

// asciiTable encodes the allowed ASCII values on 128 bits.
type asciiTable [2]uint64
// ASCIITable encodes the allowed ASCII values on 128 bits.
type ASCIITable [2]uint64

func (t *asciiTable) set(chars ...byte) {
// Set sets the allowed chars in the ASCII table. It panics if one of the chars
// is outside the ASCII range.
func (t *ASCIITable) Set(chars ...byte) {
	for _, ch := range chars {
		if ch > unicode.MaxASCII {
			panic(fmt.Sprintf("%x is outside the ASCII range", ch))


@@ 18,7 23,9 @@ func (t *asciiTable) set(chars ...byte) {
	}
}

func (t *asciiTable) unset(chars ...byte) {
// Unset disallows the chars in the ASCII table. It panics if one of the chars
// is outside the ASCII range.
func (t *ASCIITable) Unset(chars ...byte) {
	for _, ch := range chars {
		if ch > unicode.MaxASCII {
			panic(fmt.Sprintf("%x is outside the ASCII range", ch))


@@ 27,25 34,27 @@ func (t *asciiTable) unset(chars ...byte) {
	}
}

// returns -1, true if all bytes in b are allowed by this ascii table,
// or the index of the first disallowed byte and false.
func (t *asciiTable) allowedBytes(b []byte) (n int, ok bool) {
// AllowedBytes returns -1, true if all bytes in b are allowed by this ascii
// table, or the index of the first disallowed byte and false.
func (t *ASCIITable) AllowedBytes(b []byte) (n int, ok bool) {
	for i, ch := range b {
		if !t.allowed(ch) {
		if !t.Allowed(ch) {
			return i, false
		}
	}
	return -1, true
}

func (t *asciiTable) allowed(ch byte) bool {
// Allowed returns true if ch is allowed, false otherwise.
func (t *ASCIITable) Allowed(ch byte) bool {
	if ch > unicode.MaxASCII {
		panic(fmt.Sprintf("%x is outside the ASCII range", ch))
		return false
	}
	return t[ch/64]&(1<<uint64(ch%64)) != 0
}

func (t asciiTable) GoString() string {
// GoString returns the debugging string representation of the ASCII table.
func (t ASCIITable) GoString() string {
	return strings.TrimSpace(fmt.Sprintf(`
...6.........5.........4.........3.........2.........1.........0
%064b


@@ 54,14 63,15 @@ func (t asciiTable) GoString() string {
`, t[0], t[1]))
}

func (t *asciiTable) printable(b byte) string {
func (t *ASCIITable) printable(b byte) string {
	if b > 0x20 && b < 0x7f {
		return string(b)
	}
	return fmt.Sprintf("x%02x", b)
}

func (t *asciiTable) String() string {
// String returns the string representation of the ASCII table.
func (t *ASCIITable) String() string {
	var buf strings.Builder
	buf.WriteByte('[')



@@ 80,7 90,7 @@ func (t *asciiTable) String() string {
	}

	for i := byte(0); i <= unicode.MaxASCII; i++ {
		if t.allowed(i) {
		if t.Allowed(i) {
			if last == 0xff {
				last = i
			}


@@ 102,6 112,10 @@ func (t *asciiTable) String() string {

// Grammar is currently an empty struct with no fields or visible methods.
// NOTE(review): presumably the root of a parsed EBNF grammar — confirm.
type Grammar struct{}

// Prod holds a single Name field.
// NOTE(review): presumably one production (rule) of a Grammar — confirm.
type Prod struct {
	// Name is the only field defined so far; presumably the production's
	// identifier — TODO confirm.
	Name string
}

// Alt is an empty struct; no fields are defined yet.
// NOTE(review): presumably an alternation between grammar expressions —
// confirm.
type Alt struct {
}



@@ 120,3 134,81 @@ type Tok struct {
	// TODO: a token may be defined by a grammar (where e.g. the tokens
	// are single runes).
}

// Matcher is implemented by values that can attempt a match against a byte
// slice. Match reports how many bytes were matched and whether the match
// succeeded.
type Matcher interface {
	Match(b []byte) (n int, ok bool)
}

// MatcherFunc adapts an ordinary function with the Match signature so that it
// can be used as a Matcher.
type MatcherFunc func(b []byte) (n int, ok bool)

// Compile-time check that MatcherFunc satisfies Matcher.
var _ Matcher = MatcherFunc(nil)

// Match implements Matcher by invoking fn with b and returning its results
// unchanged.
func (fn MatcherFunc) Match(b []byte) (n int, ok bool) {
	n, ok = fn(b)
	return n, ok
}

// ASCII builds a Matcher that consumes exactly one byte when the first byte of
// the input is allowed by table. It reports (0, false) on empty input or when
// that first byte is not allowed.
func ASCII(table *ASCIITable) MatcherFunc {
	return func(input []byte) (int, bool) {
		// The length test must come first so input[0] is never read on
		// empty input.
		if len(input) > 0 && table.Allowed(input[0]) {
			return 1, true
		}
		return 0, false
	}
}

// Unicode builds a Matcher that decodes the first UTF-8 rune of the input and
// succeeds, consuming that rune's encoded width, when the rune belongs to rt.
// Empty input and invalid encodings never match. The package
// golang.org/x/text/unicode/rangetable can be used to construct a range
// table.
func Unicode(rt *unicode.RangeTable) MatcherFunc {
	return func(input []byte) (int, bool) {
		switch r, size := utf8.DecodeRune(input); {
		case size <= 1 && r == utf8.RuneError:
			// DecodeRune signals empty input with width 0 and an invalid
			// encoding with width 1; a genuinely encoded U+FFFD is wider.
			return 0, false
		case unicode.Is(rt, r):
			return size, true
		default:
			return 0, false
		}
	}
}

// Regexp returns a Matcher that tries to match the regular expression at the
// beginning of the input. On success it reports the end offset of the match,
// i.e. the number of bytes consumed; otherwise it reports (0, false).
//
// It panics if the source text of re does not start with "^".
func Regexp(re *regexp.Regexp) MatcherFunc {
	if src := re.String(); !strings.HasPrefix(src, "^") {
		panic(fmt.Sprintf("regular expression %q must be anchored to start of input", src))
	}
	return func(b []byte) (int, bool) {
		// A leading "^" in the source does not guarantee the whole pattern is
		// anchored (e.g. `^a|b` can match "b" anywhere), so additionally
		// require the match to start at offset 0. Without this the end offset
		// of a later match would be reported as if it were a prefix length.
		if loc := re.FindIndex(b); loc != nil && loc[0] == 0 {
			return loc[1], true
		}
		return 0, false
	}
}

// Equal returns a Matcher that matches when the input starts with exactly the
// bytes of s. On success it reports len(s) as the number of bytes consumed;
// otherwise it reports (0, false). An empty s matches any input and consumes
// nothing.
func Equal(s string) MatcherFunc {
	want := []byte(s)
	return func(b []byte) (int, bool) {
		// Compare only the leading bytes: like the other matchers, this one
		// must succeed when more input follows the matched literal.
		if bytes.HasPrefix(b, want) {
			return len(want), true
		}
		return 0, false
	}
}

// EqualFold returns a Matcher that matches when the input starts with s under
// simple Unicode case-folding (case-insensitive). On success it reports the
// number of input bytes actually consumed, which can differ from len(s) when
// a folded rune has a different encoded width (e.g. U+212A KELVIN SIGN
// matches "k"); otherwise it reports (0, false). An empty s matches any input
// and consumes nothing.
func EqualFold(s string) MatcherFunc {
	return func(b []byte) (int, bool) {
		n := 0
		for _, want := range s {
			if n >= len(b) {
				// Input ended before all of s was matched.
				return 0, false
			}
			// Invalid bytes decode as utf8.RuneError with width 1 on both
			// sides, so they only match other invalid bytes or U+FFFD.
			got, size := utf8.DecodeRune(b[n:])
			if !equalFoldRune(got, want) {
				return 0, false
			}
			n += size
		}
		return n, true
	}
}

// equalFoldRune reports whether a and b are the same rune under simple
// Unicode case-folding, by walking a's SimpleFold orbit.
func equalFoldRune(a, b rune) bool {
	if a == b {
		return true
	}
	for r := unicode.SimpleFold(a); r != a; r = unicode.SimpleFold(r) {
		if r == b {
			return true
		}
	}
	return false
}

M ebnf_test.go => ebnf_test.go +10 -10
@@ 27,12 27,12 @@ func TestASCIITable(t *testing.T) {
	}
	for _, c := range cases {
		t.Run(fmt.Sprintf("set: %q, unset: %q", c.set, c.unset), func(t *testing.T) {
			var at asciiTable
			var at ASCIITable

			at.set([]byte(c.set)...)
			at.Set([]byte(c.set)...)
			mustAllowASCII(t, &at, c.set, true)

			at.unset([]byte(c.unset)...)
			at.Unset([]byte(c.unset)...)
			mustDisallowASCII(t, &at, c.unset, false)

			t.Logf("\n%#v\n", at)


@@ 43,34 43,34 @@ func TestASCIITable(t *testing.T) {
	}
}

func mustAllowASCII(t *testing.T, at *asciiTable, allow string, strict bool) {
func mustAllowASCII(t *testing.T, at *ASCIITable, allow string, strict bool) {
	t.Helper()
	mustASCII(t, at, allow, "", strict)
}

func mustDisallowASCII(t *testing.T, at *asciiTable, disallow string, strict bool) {
func mustDisallowASCII(t *testing.T, at *ASCIITable, disallow string, strict bool) {
	t.Helper()
	mustASCII(t, at, "", disallow, strict)
}

func mustASCII(t *testing.T, at *asciiTable, allow, disallow string, strict bool) {
func mustASCII(t *testing.T, at *ASCIITable, allow, disallow string, strict bool) {
	for i := byte(0); i <= unicode.MaxASCII; i++ {
		if strings.IndexByte(allow, i) >= 0 {
			if !at.allowed(i) {
			if !at.Allowed(i) {
				t.Errorf("%x (%[1]d) should be allowed", i)
			}
		} else if strict && allow != "" {
			if at.allowed(i) {
			if at.Allowed(i) {
				t.Errorf("%x (%[1]d) should not be allowed", i)
			}
		}

		if strings.IndexByte(disallow, i) >= 0 {
			if at.allowed(i) {
			if at.Allowed(i) {
				t.Errorf("%x (%[1]d) should not be allowed", i)
			}
		} else if strict && disallow != "" {
			if !at.allowed(i) {
			if !at.Allowed(i) {
				t.Errorf("%x (%[1]d) should be allowed", i)
			}
		}