~tsileo/blobstash

ref: b178995e346d blobstash/pkg/docstore/textsearch/textsearch.go -rw-r--r-- 3.0 KiB
b178995e Thomas Sileo: "vendor: update deps" (2 years ago)

/*
Package textsearch implements basic text search features (for matching text fields of JSON documents).
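
A minimal usage sketch (illustrative only; the "content" field name is just an example, not something this package requires):

	terms := textsearch.ParseTextQuery(`+fox -zebra`)
	doc, err := textsearch.NewIndexedDoc(map[string]interface{}{"content": "the quick brown fox"}, []string{"content"})
	if err == nil && terms.Match(doc) {
		// the document contains "fox" and does not contain "zebra"
	}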
*/
package textsearch // import "a4.io/blobstash/pkg/docstore/textsearch"

import (
	"bytes"
	"strings"
	"text/scanner"

	"github.com/blevesearch/segment"
	lru "github.com/hashicorp/golang-lru"
	porterstemmer "github.com/reiver/go-porterstemmer"
)

// cache for `string` -> `SearchTerms`
var searchTermsCache, _ = lru.New(128)

// searchTerm holds a single parsed search term
type searchTerm struct {
	prefix     string // "+" (the term is required) or "-" (docs matching the term are excluded)
	term       string
	exactMatch bool // true if the term was quoted for an exact (substring) match
}

// SearchTerms holds a parsed text search query
type SearchTerms []*searchTerm

// IndexedDoc holds a pre-processed document: the concatenated text fields
// and a count for each Porter stem they contain
type IndexedDoc struct {
	Content string         `msgpack:"c"`
	Stems   map[string]int `msgpack:"s"`
}

// NewIndexedDoc builds an IndexedDoc from the given text fields of a document
func NewIndexedDoc(doc map[string]interface{}, fields []string) (*IndexedDoc, error) {
	parts := []string{}
	stems := map[string]int{}
	for _, field := range fields {
		dat, ok := doc[field]
		if !ok {
			continue
		}
		text, ok := dat.(string)
		if !ok {
			// Skip fields that are not strings
			continue
		}
		parts = append(parts, text)
		// Segment the field into words and count the Porter stem of each one
		segmenter := segment.NewWordSegmenter(bytes.NewReader([]byte(text)))
		for segmenter.Segment() {
			if segmenter.Type() == segment.Letter {
				stems[porterstemmer.StemString(segmenter.Text())]++
			}
		}
		if err := segmenter.Err(); err != nil {
			return nil, err
		}
	}
	content := strings.Join(parts, " ")

	return &IndexedDoc{Content: content, Stems: stems}, nil
}
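
// Illustration (not part of the original file): indexing the document
// {"title": "running runs"} over the field "title" is expected to yield
// Content "running runs" and Stems along the lines of {"run": 2}, since
// both words reduce to the Porter stem "run".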

// ParseTextQuery parses a raw text query into SearchTerms
func ParseTextQuery(q string) SearchTerms {
	// Return the cached result if this query was already parsed
	if cached, ok := searchTermsCache.Get(q); ok {
		return cached.(SearchTerms)
	}
	var s scanner.Scanner
	s.Init(strings.NewReader(q))
	out := SearchTerms{}
	var prefix, term string
	var exactMatch bool
	for tok := s.Scan(); tok != scanner.EOF; tok = s.Scan() {
		term = s.TokenText()

		// A standalone "+"/"-" token acts as a prefix for the next term
		if term == "+" || term == "-" {
			prefix = term
			continue
		}

		// Quoted terms are kept verbatim (minus the quotes) for exact matching
		if len(term) > 1 && strings.HasPrefix(term, "\"") && strings.HasSuffix(term, "\"") {
			exactMatch = true
			term = term[1 : len(term)-1]
		}

		// Everything else is stemmed, like the indexed document fields
		if !exactMatch {
			term = porterstemmer.StemString(term)
		}

		out = append(out, &searchTerm{
			prefix:     prefix,
			term:       term,
			exactMatch: exactMatch,
		})

		prefix = ""
		exactMatch = false
	}
	searchTermsCache.Add(q, out)
	return out
}
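
// Illustration (not part of the original file): a query such as
//
//	+jogging -quiet "exact phrase"
//
// should parse into three terms: a required term with the stem "jog", an
// excluded term with the stem "quiet", and the quoted term kept verbatim as
// `exact phrase` with exactMatch set.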

// Match returns true if the query matches the given `IndexedDoc`
func (terms SearchTerms) Match(d *IndexedDoc) bool {
	match := false

	for _, st := range terms {
		// cond is true when the current term is present in the document
		cond := false
		switch {
		case st.exactMatch:
			// Quoted terms are matched as a verbatim substring of the content
			cond = strings.Contains(d.Content, st.term)
		default:
			// Other terms are looked up in the pre-computed stems
			_, cond = d.Stems[st.term]
		}

		if st.prefix == "+" {
			// Required term: reject the document as soon as one is missing
			if !cond {
				return false
			}
		} else if st.prefix == "-" {
			// Excluded term: reject the document as soon as one is present
			if cond {
				return false
			}
			// An absent excluded term is enough to consider the doc a match
			match = true
		}

		// Any present term (plain or required) makes the document a match
		if cond {
			match = true
		}
	}

	return match
}
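
// Illustration (not part of the original file): with a document indexed from
// "the quick brown fox", the query `+quick -zebra` matches, `+zebra` does not
// (the required term is missing), and the plain query `fox cat` matches
// because at least one of its terms is present.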