~shabbyrobe/go-porter2

878ac8636ff0ca3cdaf3c4b3b168e350bdbc5b00 — shabbyrobe 4 years ago bf9adef
Add AlreadyLower, crumb a few nanos
3 files changed, 60 insertions(+), 11 deletions(-)

M porter2.go
M suffix.go
M suffix_test.go
M porter2.go => porter2.go +13 -11
@@ 15,6 15,11 @@ type StemFlag int

const (
	// UTF8Lower (value 1) asks StemBytes to lower-case the input with
	// bytes.ToLower instead of the package's own toLower helper
	// (presumably ASCII-only; confirm against toLower's definition).
	UTF8Lower = 1 << iota

	// AlreadyLower (value 2) tells StemBytes to skip the toLower call.
	// Only pass it if you know for sure the input is already lower-cased;
	// otherwise you'll get dodgy results.
	//
	// NOTE(review): StemBytes appears to test UTF8Lower first, so
	// AlreadyLower looks to have no effect when both flags are set.
	AlreadyLower
)

// Stem takes the string 'word' and stems it according to the porter2 rules.


@@ 37,20 42,19 @@ func StemBytes(word []byte, flag StemFlag) []byte {
	// XXX: ASCII-only seems OK to me, but using bytes.ToLower will potentially
	// reduce the size of the term space for non-ASCII terms, which do occur in
	// the data I'm using from time to time.
	var s []byte
	var s = word
	if flag&UTF8Lower != 0 {
		s = bytes.ToLower(word)
	} else {
		s = toLower(word)
		s = bytes.ToLower(s)
	} else if flag&AlreadyLower == 0 {
		s = toLower(s)
	}

	// Is it an exception?
	if len(s) <= 2 {
		return s
	}
	if rep, ex := exceptions1.Find(s); ex {
		return rep
	}
	if len(s) <= 2 {
		return word
	}
	if s[0] == '\'' {
		s = s[1:]
	}


@@ 65,9 69,7 @@ func StemBytes(word []byte, flag StemFlag) []byte {
	r1, r2 := getR1R2(s)

	// Step 0
	s = removeSuffix_apos_s_apos(s)
	s = removeSuffix_apos_s(s)
	s = removeSuffix_apos(s)
	s = removeSuffix_apos_s_apos_all(s)

	// Step 1a
	if i := suffixPos_sses(s); i >= 0 {

M suffix.go => suffix.go +16 -0
@@ 128,6 128,22 @@ func suffixPos_eedly(s []byte) int {
	return -1
}

// removeSuffix_apos_s_apos_all strips a trailing apostrophe suffix from s in
// one pass. It first drops a single trailing "'" if present, then drops a
// trailing "'s" if what remains ends with one. So "foo'", "foo's" and
// "foo's'" all become "foo".
//
// The result is a prefix sub-slice of s (no copy is made). Empty or nil input
// is returned unchanged rather than indexed, so the function is safe to call
// on any slice.
func removeSuffix_apos_s_apos_all(s []byte) []byte {
	// Formerly these three calls:
	// s = removeSuffix_apos_s_apos(s)
	// s = removeSuffix_apos_s(s)
	// s = removeSuffix_apos(s)

	// Nothing to strip; also keeps s[end] below from indexing at -1.
	if len(s) == 0 {
		return s
	}

	// end is the index of the last byte still considered part of the word.
	end := len(s) - 1
	if s[end] == '\'' {
		end--
	}
	// end >= 1 guarantees both s[end-1] and s[end] are in range; it is also
	// false when the whole input was a lone apostrophe (end == -1).
	if end >= 1 && s[end-1] == '\'' && s[end] == 's' {
		return s[:end-1]
	}
	return s[:end+1]
}

func removeSuffix_apos_s_apos(s []byte) []byte {
	l := len(s)
	if l >= 3 && s[l-3] == '\'' && s[l-2] == 's' && s[l-1] == '\'' {

M suffix_test.go => suffix_test.go +31 -0
@@ 5,6 5,37 @@ import (
	"testing"
)

// TestRemoveAposEtc checks removeSuffix_apos_s_apos_all against a table of
// expected outputs, and also checks that it agrees with running the three
// individual removers (removeSuffix_apos_s_apos, removeSuffix_apos_s,
// removeSuffix_apos) one after another on the same input.
func TestRemoveAposEtc(t *testing.T) {
	type aposCase struct {
		in  string
		out string
	}

	cases := []aposCase{
		{in: "foo", out: "foo"},
		{in: "foos", out: "foos"},
		{in: "foo's", out: "foo"},
		{in: "foos's", out: "foos"},
		{in: "'s", out: ""},
		{in: "s's", out: "s"},
		{in: "'s'", out: ""},
		{in: "'", out: ""},
		{in: "s'", out: "s"},
		{in: "s's'", out: "s"},
	}

	for i, c := range cases {
		// Subtests are named by their index in the table.
		t.Run(fmt.Sprintf("%d", i), func(t *testing.T) {
			// Each path gets its own copy of the input bytes.
			combined := string(removeSuffix_apos_s_apos_all([]byte(c.in)))
			sequential := string(
				removeSuffix_apos(
					removeSuffix_apos_s(
						removeSuffix_apos_s_apos([]byte(c.in)),
					),
				),
			)

			if combined != c.out {
				t.Fatal(combined, "!=", c.out)
			}
			if combined != sequential {
				t.Fatal(combined, "!=", sequential)
			}
		})
	}
}

func BenchmarkSuffixPos_eedly(b *testing.B) {
	for idx, bc := range []struct {
		in []byte