~mna/siberian

8336a4cddcdb89ad7cac411bde6f839c1ac81cc8 — Martin Angers 4 years ago a7838bf
refactor, test
5 files changed, 298 insertions(+), 188 deletions(-)

A ascii_table.go
R ebnf_test.go => ascii_table_test.go
M ebnf.go
A matcher.go
A matcher_test.go
A ascii_table.go => ascii_table.go +108 -0
@@ 0,0 1,108 @@
package siberian

import (
	"fmt"
	"strings"
	"unicode"
)

// ASCIITable encodes the allowed ASCII values on 128 bits: the byte ch is
// allowed when bit ch%64 of word ch/64 is set. The zero value allows no
// byte.
type ASCIITable [2]uint64

// Set marks every byte in chars as allowed in the ASCII table. It panics
// if one of the chars is outside the ASCII range; the chars that precede
// the offending one have already been set when that happens.
func (t *ASCIITable) Set(chars ...byte) {
	for i := range chars {
		c := chars[i]
		if c > unicode.MaxASCII {
			panic(fmt.Sprintf("%x is outside the ASCII range", c))
		}
		// word selects the uint64, bit the position inside that word.
		word, bit := c/64, c%64
		t[word] |= uint64(1) << bit
	}
}

// Unset marks every byte in chars as disallowed in the ASCII table. It
// panics if one of the chars is outside the ASCII range; the chars that
// precede the offending one have already been cleared when that happens.
func (t *ASCIITable) Unset(chars ...byte) {
	for i := range chars {
		c := chars[i]
		if c > unicode.MaxASCII {
			panic(fmt.Sprintf("%x is outside the ASCII range", c))
		}
		// Clear the single bit that stands for c in its word.
		mask := uint64(1) << (c % 64)
		t[c/64] &^= mask
	}
}

// AllowedBytes checks every byte of b against the table. When all of them
// are allowed (including when b is empty) it returns -1, true; otherwise it
// returns the index of the first disallowed byte and false.
func (t *ASCIITable) AllowedBytes(b []byte) (n int, ok bool) {
	for n = 0; n < len(b); n++ {
		if !t.Allowed(b[n]) {
			return n, false
		}
	}
	return -1, true
}

// Allowed reports whether ch is allowed by the table. Bytes outside the
// ASCII range are never allowed.
func (t *ASCIITable) Allowed(ch byte) bool {
	if ch <= unicode.MaxASCII {
		mask := uint64(1) << (ch % 64)
		return t[ch/64]&mask != 0
	}
	return false
}

// GoString returns the debugging representation of the ASCII table: four
// lines, where each 64-bit word is printed in binary (most significant bit
// first) below a ruler line that marks the tens of the byte values.
func (t ASCIITable) GoString() string {
	const (
		lowRuler  = "...6.........5.........4.........3.........2.........1.........0"
		highRuler = ".......2.........1.........0.........9.........8.........7......"
	)
	rows := []string{
		lowRuler,
		fmt.Sprintf("%064b", t[0]),
		highRuler,
		fmt.Sprintf("%064b", t[1]),
	}
	return strings.Join(rows, "\n")
}

// printable returns the text used to display b: the character itself for
// bytes strictly between 0x20 (space) and 0x7f (DEL), and "x" followed by
// the two-digit lowercase hex value for any other byte.
func (t *ASCIITable) printable(b byte) string {
	if b <= 0x20 || b >= 0x7f {
		return fmt.Sprintf("x%02x", b)
	}
	return string(b)
}

// String returns the string representation of the ASCII table: the allowed
// bytes in increasing order, enclosed in square brackets, with each run of
// consecutive allowed bytes collapsed to "first-last" and runs separated by
// commas. An empty table yields "[]".
func (t *ASCIITable) String() string {
	// none marks that no run of allowed bytes is currently open; 0xff is
	// outside the ASCII range so it can never be a real run start.
	const none = 0xff

	var sb strings.Builder
	sb.WriteByte('[')

	// emit appends the run start..end, preceded by a comma unless it is the
	// first run written after the opening bracket.
	emit := func(start, end byte) {
		if sb.Len() > 1 {
			sb.WriteByte(',')
		}
		sb.WriteString(t.printable(start))
		if start != end {
			sb.WriteByte('-')
			sb.WriteString(t.printable(end))
		}
	}

	start := byte(none)
	for ch := byte(0); ch <= unicode.MaxASCII; ch++ {
		allowed := t.Allowed(ch)
		switch {
		case allowed && start == none:
			// First allowed byte of a new run.
			start = ch
		case !allowed && start != none:
			// The run ended on the previous byte.
			emit(start, ch-1)
			start = none
		}
	}
	// A run still open here extends to the last ASCII value.
	if start != none {
		emit(start, unicode.MaxASCII)
	}

	sb.WriteByte(']')
	return sb.String()
}

R ebnf_test.go => ascii_table_test.go +34 -0
@@ 2,6 2,7 @@ package siberian

import (
	"fmt"
	"math/rand"
	"strings"
	"testing"
	"unicode"


@@ 77,6 78,13 @@ func mustASCII(t *testing.T, at *ASCIITable, allow, disallow string, strict bool
	}
}

// NOTE: tests that a bit set based on uint64 is more space-efficient
// than a table of 128 booleans. At some point the compiler may be able
// to generate the array of booleans as efficiently as a bit set, and if
// it does, that will allow simpler code, but for now booleans take
// a byte each (it probably never will, as the size change would be a
// breaking change: e.g. how would you take the address of a given
// bool which is now a single bit?).
func TestASCIITableSize(t *testing.T) {
	var b bool
	szBool := unsafe.Sizeof(b)


@@ 94,3 102,29 @@ func TestASCIITableSize(t *testing.T) {
		t.Fatalf("size of [128]bool same or smaller than [2]uint64: %d vs %d", szArrayBool, szArrayUint)
	}
}

// BenchResult is the package-level table that both benchmarks below write
// into (NOTE(review): presumably a sink that keeps the compiler from
// discarding the benchmarked statement — confirm).
//
// The benchmarks test whether bit-shifting/and'ing is more efficient than
// the more readable division and modulo, and it is not (at least on
// amd64), presumably because the divisor is constant and the compiler
// already generates the bit-based approach.
var BenchResult ASCIITable

// BenchmarkASCIITable_DivMod measures setting a bit of BenchResult with the
// word and bit position computed by division and modulo. The byte is picked
// at random before the timer is reset, and each of the b.N iterations
// repeats the operation 1000 times.
func BenchmarkASCIITable_DivMod(b *testing.B) {
	const repeats = 1000

	c := byte(rand.Intn(128))
	b.ResetTimer()
	for n := 0; n < b.N; n++ {
		for k := repeats; k > 0; k-- {
			BenchResult[c/64] |= 1 << uint64(c%64)
		}
	}
}

// BenchmarkASCIITable_BitShift measures setting a bit of BenchResult with
// the word and bit position computed by shifting and masking. The byte is
// picked at random before the timer is reset, and each of the b.N
// iterations repeats the operation 1000 times.
func BenchmarkASCIITable_BitShift(b *testing.B) {
	const repeats = 1000

	c := byte(rand.Intn(128))
	b.ResetTimer()
	for n := 0; n < b.N; n++ {
		for k := repeats; k > 0; k-- {
			BenchResult[c>>6] |= 1 << uint64(c&63)
		}
	}
}

M ebnf.go => ebnf.go +0 -188
@@ 1,115 1,5 @@
package siberian

import (
	"bytes"
	"fmt"
	"regexp"
	"strings"
	"unicode"
	"unicode/utf8"
)

// ASCIITable encodes the allowed ASCII values on 128 bits: the byte ch is
// allowed when bit ch%64 of word ch/64 is set. The zero value allows no
// byte.
type ASCIITable [2]uint64

// Set marks every byte in chars as allowed in the ASCII table. It panics
// if one of the chars is outside the ASCII range; the chars that precede
// the offending one have already been set when that happens.
func (t *ASCIITable) Set(chars ...byte) {
	for i := range chars {
		c := chars[i]
		if c > unicode.MaxASCII {
			panic(fmt.Sprintf("%x is outside the ASCII range", c))
		}
		// word selects the uint64, bit the position inside that word.
		word, bit := c/64, c%64
		t[word] |= uint64(1) << bit
	}
}

// Unset marks every byte in chars as disallowed in the ASCII table. It
// panics if one of the chars is outside the ASCII range; the chars that
// precede the offending one have already been cleared when that happens.
func (t *ASCIITable) Unset(chars ...byte) {
	for i := range chars {
		c := chars[i]
		if c > unicode.MaxASCII {
			panic(fmt.Sprintf("%x is outside the ASCII range", c))
		}
		// Clear the single bit that stands for c in its word.
		mask := uint64(1) << (c % 64)
		t[c/64] &^= mask
	}
}

// AllowedBytes checks every byte of b against the table. When all of them
// are allowed (including when b is empty) it returns -1, true; otherwise it
// returns the index of the first disallowed byte and false.
func (t *ASCIITable) AllowedBytes(b []byte) (n int, ok bool) {
	for n = 0; n < len(b); n++ {
		if !t.Allowed(b[n]) {
			return n, false
		}
	}
	return -1, true
}

// Allowed reports whether ch is allowed by the table. Bytes outside the
// ASCII range are never allowed.
func (t *ASCIITable) Allowed(ch byte) bool {
	if ch <= unicode.MaxASCII {
		mask := uint64(1) << (ch % 64)
		return t[ch/64]&mask != 0
	}
	return false
}

// GoString returns the debugging representation of the ASCII table: four
// lines, where each 64-bit word is printed in binary (most significant bit
// first) below a ruler line that marks the tens of the byte values.
func (t ASCIITable) GoString() string {
	const (
		lowRuler  = "...6.........5.........4.........3.........2.........1.........0"
		highRuler = ".......2.........1.........0.........9.........8.........7......"
	)
	rows := []string{
		lowRuler,
		fmt.Sprintf("%064b", t[0]),
		highRuler,
		fmt.Sprintf("%064b", t[1]),
	}
	return strings.Join(rows, "\n")
}

// printable returns the text used to display b: the character itself for
// bytes strictly between 0x20 (space) and 0x7f (DEL), and "x" followed by
// the two-digit lowercase hex value for any other byte.
func (t *ASCIITable) printable(b byte) string {
	if b <= 0x20 || b >= 0x7f {
		return fmt.Sprintf("x%02x", b)
	}
	return string(b)
}

// String returns the string representation of the ASCII table: the allowed
// bytes in increasing order, enclosed in square brackets, with each run of
// consecutive allowed bytes collapsed to "first-last" and runs separated by
// commas. An empty table yields "[]".
func (t *ASCIITable) String() string {
	// none marks that no run of allowed bytes is currently open; 0xff is
	// outside the ASCII range so it can never be a real run start.
	const none = 0xff

	var sb strings.Builder
	sb.WriteByte('[')

	// emit appends the run start..end, preceded by a comma unless it is the
	// first run written after the opening bracket.
	emit := func(start, end byte) {
		if sb.Len() > 1 {
			sb.WriteByte(',')
		}
		sb.WriteString(t.printable(start))
		if start != end {
			sb.WriteByte('-')
			sb.WriteString(t.printable(end))
		}
	}

	start := byte(none)
	for ch := byte(0); ch <= unicode.MaxASCII; ch++ {
		allowed := t.Allowed(ch)
		switch {
		case allowed && start == none:
			// First allowed byte of a new run.
			start = ch
		case !allowed && start != none:
			// The run ended on the previous byte.
			emit(start, ch-1)
			start = none
		}
	}
	// A run still open here extends to the last ASCII value.
	if start != none {
		emit(start, unicode.MaxASCII)
	}

	sb.WriteByte(']')
	return sb.String()
}

// Grammar is currently an empty struct type with no fields.
// NOTE(review): presumably a placeholder for the parsed grammar
// representation — confirm.
type Grammar struct{}

type Prod struct {


@@ 134,81 24,3 @@ type Tok struct {
	// TODO: a token may be defined by a grammar (where e.g. the tokens
	// are single runes).
}

// Matcher defines the method to identify a match on the input bytes.
type Matcher interface {
	// Match returns the length in bytes of the match and whether b matched.
	// The matchers built in this file return 0, false when there is no match.
	Match(b []byte) (int, bool)
}

// MatcherFunc is a function type that implements Matcher by calling
// itself.
type MatcherFunc func([]byte) (int, bool)

// Match implements Matcher for MatcherFunc: it calls f with b and returns
// its results unchanged.
func (f MatcherFunc) Match(b []byte) (int, bool) {
	n, ok := f(b)
	return n, ok
}

// ASCII returns a Matcher that looks only at the first byte of its input:
// it returns 1, true when that byte is allowed by table, and 0, false when
// it is not or when the input is empty. The table is consulted at match
// time, through the pointer.
func ASCII(table *ASCIITable) MatcherFunc {
	return func(b []byte) (int, bool) {
		if len(b) > 0 && table.Allowed(b[0]) {
			return 1, true
		}
		return 0, false
	}
}

// Unicode returns a Matcher that decodes the first UTF-8 rune of its input
// and matches if that rune is in the provided range table, in which case it
// returns the encoded size of the rune and true. Empty input and invalid
// encodings never match. The package golang.org/x/text/unicode/rangetable
// can be used to construct a range table.
func Unicode(rt *unicode.RangeTable) MatcherFunc {
	return func(b []byte) (int, bool) {
		r, size := utf8.DecodeRune(b)
		// DecodeRune reports empty input as (RuneError, 0) and a malformed
		// encoding as (RuneError, 1); a properly encoded U+FFFD is 3 bytes.
		undecodable := r == utf8.RuneError && size < 2
		if undecodable || !unicode.Is(rt, r) {
			return 0, false
		}
		return size, true
	}
}

// Regexp returns a Matcher that tries to match the regular expression at
// the beginning of the input. On a match it returns the length of the
// match and true, otherwise 0, false.
//
// It panics if re is not anchored to the beginning of the input (does not
// start with "^").
func Regexp(re *regexp.Regexp) MatcherFunc {
	if src := re.String(); !strings.HasPrefix(src, "^") {
		panic(fmt.Sprintf("regular expression %q must be anchored to start of input", src))
	}
	return func(b []byte) (int, bool) {
		// A leading "^" does not guarantee that every match starts at offset
		// 0: in `^a|b` the anchor binds to the first alternative only. Accept
		// a match only when it begins at the start of b, so that its end
		// index is also its length.
		if loc := re.FindIndex(b); loc != nil && loc[0] == 0 {
			return loc[1], true
		}
		return 0, false
	}
}

// Equal returns a Matcher that matches s exactly: the whole input must be
// byte-for-byte equal to s, in which case it returns len(s), true. Any
// other input, including a longer one that merely starts with s, yields
// 0, false.
func Equal(s string) MatcherFunc {
	want := []byte(s)
	return func(b []byte) (int, bool) {
		if !bytes.Equal(b, want) {
			return 0, false
		}
		return len(want), true
	}
}

// EqualFold returns a Matcher that matches when the whole input equals s
// under Unicode case-folding (case-insensitive). On a match it returns the
// number of input bytes matched, i.e. the length of the input, and true;
// otherwise it returns 0, false.
//
// The reported length is that of the input, not of s, because case variants
// of a rune can have different UTF-8 lengths (for example "k" and the
// Kelvin sign U+212A).
func EqualFold(s string) MatcherFunc {
	sb := []byte(s)
	return func(b []byte) (int, bool) {
		if bytes.EqualFold(b, sb) {
			return len(b), true
		}
		return 0, false
	}
}

A matcher.go => matcher.go +92 -0
@@ 0,0 1,92 @@
package siberian

import (
	"bytes"
	"fmt"
	"regexp"
	"strings"
	"unicode"
	"unicode/utf8"
)

// Matcher defines the method to identify a match on the input bytes.
type Matcher interface {
	// Match returns the length in bytes of the match and whether b matched.
	// The matchers built in this file return 0, false when there is no match.
	Match(b []byte) (int, bool)
}

// MatcherFunc is a function type that implements Matcher by calling
// itself, so that an ordinary function with this signature can be used
// wherever a Matcher is expected.
type MatcherFunc func([]byte) (int, bool)

// Match implements Matcher for MatcherFunc: it calls f with b and returns
// its results unchanged.
func (f MatcherFunc) Match(b []byte) (int, bool) {
	n, ok := f(b)
	return n, ok
}

// ASCII returns a Matcher that looks only at the first byte of its input:
// it returns 1, true when that byte is allowed by tbl, and 0, false when it
// is not or when the input is empty. The table is consulted at match time,
// through the pointer.
func ASCII(tbl *ASCIITable) MatcherFunc {
	return func(b []byte) (int, bool) {
		if len(b) > 0 && tbl.Allowed(b[0]) {
			return 1, true
		}
		return 0, false
	}
}

// Unicode returns a Matcher that decodes the first UTF-8 rune of its input
// and matches if that rune is in the provided range table, in which case it
// returns the encoded size of the rune and true. Empty input and invalid
// encodings never match. The package golang.org/x/text/unicode/rangetable
// can be used to construct a range table.
func Unicode(rt *unicode.RangeTable) MatcherFunc {
	return func(b []byte) (int, bool) {
		r, size := utf8.DecodeRune(b)
		// DecodeRune reports empty input as (RuneError, 0) and a malformed
		// encoding as (RuneError, 1); a properly encoded U+FFFD is 3 bytes.
		undecodable := r == utf8.RuneError && size < 2
		if undecodable || !unicode.Is(rt, r) {
			return 0, false
		}
		return size, true
	}
}

// Regexp returns a Matcher that tries to match the regular expression at
// the beginning of the input. On a match it returns the length of the
// match and true, otherwise 0, false.
//
// It panics if re is not anchored to the beginning of the input (does not
// start with "^").
func Regexp(re *regexp.Regexp) MatcherFunc {
	if src := re.String(); !strings.HasPrefix(src, "^") {
		panic(fmt.Sprintf("regular expression %q must be anchored to start of input", src))
	}
	return func(b []byte) (int, bool) {
		// A leading "^" does not guarantee that every match starts at offset
		// 0: in `^a|b` the anchor binds to the first alternative only. Accept
		// a match only when it begins at the start of b, so that its end
		// index is also its length.
		if loc := re.FindIndex(b); loc != nil && loc[0] == 0 {
			return loc[1], true
		}
		return 0, false
	}
}

// Equal returns a Matcher that matches s exactly: the whole input must be
// byte-for-byte equal to s, in which case it returns len(s), true. Any
// other input, including a longer one that merely starts with s, yields
// 0, false.
func Equal(s string) MatcherFunc {
	want := []byte(s)
	return func(b []byte) (int, bool) {
		if !bytes.Equal(b, want) {
			return 0, false
		}
		return len(want), true
	}
}

// EqualFold returns a Matcher that matches when the whole input equals s
// under Unicode case-folding (case-insensitive). On a match it returns the
// number of input bytes matched, i.e. the length of the input, and true;
// otherwise it returns 0, false.
//
// The reported length is that of the input, not of s, because case variants
// of a rune can have different UTF-8 lengths (for example "k" and the
// Kelvin sign U+212A).
func EqualFold(s string) MatcherFunc {
	sb := []byte(s)
	return func(b []byte) (int, bool) {
		if bytes.EqualFold(b, sb) {
			return len(b), true
		}
		return 0, false
	}
}

A matcher_test.go => matcher_test.go +64 -0
@@ 0,0 1,64 @@
package siberian

import (
	"testing"
	"unicode"
)

// matcherTest is one table-driven case shared by the matcher tests.
type matcherTest struct {
	in  string // input passed to Match (as bytes); also used as the subtest name
	len int    // expected match length returned by Match
	ok  bool   // expected match flag returned by Match
}

func TestASCII(t *testing.T) {
	cases := []matcherTest{
		{"", 0, false},
		{"a", 1, true},
		{"z", 0, false},
		{"ab", 1, true},
		{"abc", 1, true},
		{"az", 1, true},
		{"za", 0, false},
	}
	var at ASCIITable
	at.Set([]byte("abc")...)
	for _, c := range cases {
		t.Run(c.in, func(t *testing.T) {
			m := ASCII(&at)
			n, ok := m.Match([]byte(c.in))
			if n != c.len {
				t.Errorf("want match of length %d, got %d", c.len, n)
			}
			if ok != c.ok {
				t.Errorf("want match? %t, got %t", c.ok, ok)
			}
		})
	}
}

// TestUnicode checks the Unicode matcher with the lowercase-letter range
// table (unicode.Ll): only the first rune of the input decides the outcome
// and the match length is that rune's encoded size.
func TestUnicode(t *testing.T) {
	tests := []matcherTest{
		{in: "", len: 0, ok: false},
		{in: "a", len: 1, ok: true},
		{in: "A", len: 0, ok: false},
		{in: "α", len: 2, ok: true},
		{in: "α!", len: 2, ok: true},
		{in: "!α", len: 0, ok: false},
	}
	for _, tc := range tests {
		t.Run(tc.in, func(t *testing.T) {
			lower := Unicode(unicode.Ll) // letter, lowercase
			gotLen, gotOK := lower.Match([]byte(tc.in))
			if gotLen != tc.len {
				t.Errorf("want match of length %d, got %d", tc.len, gotLen)
			}
			if gotOK != tc.ok {
				t.Errorf("want match? %t, got %t", tc.ok, gotOK)
			}
		})
	}
}

// TestRegexp is an empty placeholder: it makes no assertions yet.
// TODO: add cases that exercise the Regexp matcher.
func TestRegexp(t *testing.T) {
}