gopls/internal/fuzzy/symbol.go

1	// Copyright 2021 The Go Authors. All rights reserved.
2	// Use of this source code is governed by a BSD-style
3	// license that can be found in the LICENSE file.
4
5	package fuzzy
6
7	import (
8	"unicode"
9	)
10
11	// SymbolMatcher implements a fuzzy matching algorithm optimized for Go symbols
12	// of the form:
13	//
14	// example.com/path/to/package.object.field
15	//
16	// Knowing that we are matching symbols like this allows us to make the
17	// following optimizations:
18	// - We can incorporate right-to-left relevance directly into the score
19	// calculation.
20	// - We can match from right to left, discarding leading bytes if the input is
21	// too long.
22	// - We just take the right-most match without losing too much precision. This
23	// allows us to use an O(n) algorithm.
24	// - We can operate directly on chunked strings; in many cases we will
25	// be storing the package path and/or package name separately from the
26	// symbol or identifiers, so doing this avoids allocating strings.
27	// - We can return the index of the right-most match, allowing us to trim
28	// irrelevant qualification.
29	//
30	// This implementation is experimental, serving as a reference fast algorithm
31	// to compare to the fuzzy algorithm implemented by Matcher.
32	type SymbolMatcher struct {
33	// Using buffers of length 256 is both a reasonable size for most qualified
34	// symbols, and makes it easy to avoid bounds checks by using uint8 indexes.
35	pattern [256]rune
36	patternLen uint8
37	inputBuffer [256]rune // avoid allocating when considering chunks
38	roles [256]uint32 // which roles does a rune play (word start, etc.)
39	segments [256]uint8 // how many segments from the right is each rune
40	}
41
42	const (
43	segmentStart uint32 = 1 << iota
44	wordStart
45	separator
46	)
47
48	// NewSymbolMatcher creates a SymbolMatcher that may be used to match the given
49	// search pattern.
50	//
51	// Currently this matcher only accepts case-insensitive fuzzy patterns.
52	//
53	// An empty pattern matches no input.
54	func NewSymbolMatcher(pattern string) *SymbolMatcher {
55	m := &SymbolMatcher{}
56	for _, p := range pattern {
57	m.pattern[m.patternLen] = unicode.ToLower(p)
58	m.patternLen++
59	if m.patternLen == 255 \|\| int(m.patternLen) == len(pattern) {
60	// break at 255 so that we can represent patternLen with a uint8.
61	break
62	}
63	}
64	return m
65	}
66
67	// Match looks for the right-most match of the search pattern within the symbol
68	// represented by concatenating the given chunks, returning its offset and
69	// score.
70	//
71	// If a match is found, the first return value will hold the absolute byte
72	// offset within all chunks for the start of the symbol. In other words, the
73	// index of the match within strings.Join(chunks, ""). If no match is found,
74	// the first return value will be -1.
75	//
76	// The second return value will be the score of the match, which is always
77	// between 0 and 1, inclusive. A score of 0 indicates no match.
78	func (m *SymbolMatcher) Match(chunks []string) (int, float64) {
79	// Explicit behavior for an empty pattern.
80	//
81	// As a minor optimization, this also avoids nilness checks later on, since
82	// the compiler can prove that m != nil.
83	if m.patternLen == 0 {
84	return -1, 0
85	}
86
87	// First phase: populate the input buffer with lower-cased runes.
88	//
89	// We could also check for a forward match here, but since we'd have to write
90	// the entire input anyway this has negligible impact on performance.
91
92	var (
93	inputLen = uint8(0)
94	modifiers = wordStart \| segmentStart
95	)
96
97	input:
98	for _, chunk := range chunks {
99	for _, r := range chunk {
100	if r == '.' \|\| r == '/' {
101	modifiers \|= separator
102	}
103	// optimization: avoid calls to unicode.ToLower, which can't be inlined.
104	l := r
105	if r <= unicode.MaxASCII {
106	if 'A' <= r && r <= 'Z' {
107	l = r + 'a' - 'A'
108	}
109	} else {
110	l = unicode.ToLower(r)
111	}
112	if l != r {
113	modifiers \|= wordStart
114	}
115	m.inputBuffer[inputLen] = l
116	m.roles[inputLen] = modifiers
117	inputLen++
118	if m.roles[inputLen-1]&separator != 0 {
119	modifiers = wordStart \| segmentStart
120	} else {
121	modifiers = 0
122	}
123	// TODO: we should prefer the right-most input if it overflows, rather
124	// than the left-most as we're doing here.
125	if inputLen == 255 {
126	break input
127	}
128	}
129	}
130
131	// Second phase: find the right-most match, and count segments from the
132	// right.
133
134	var (
135	pi = uint8(m.patternLen - 1) // pattern index
136	p = m.pattern[pi] // pattern rune
137	start = -1 // start offset of match
138	rseg = uint8(0)
139	)
140	const maxSeg = 3 // maximum number of segments from the right to count, for scoring purposes.
141
142	for ii := inputLen - 1; ; ii-- {
143	r := m.inputBuffer[ii]
144	if rseg < maxSeg && m.roles[ii]&separator != 0 {
145	rseg++
146	}
147	m.segments[ii] = rseg
148	if p == r {
149	if pi == 0 {
150	start = int(ii)
151	break
152	}
153	pi--
154	p = m.pattern[pi]
155	}
156	// Don't check ii >= 0 in the loop condition: ii is a uint8.
157	if ii == 0 {
158	break
159	}
160	}
161
162	if start < 0 {
163	// no match: skip scoring
164	return -1, 0
165	}
166
167	// Third phase: find the shortest match, and compute the score.
168
169	// Score is the average score for each character.
170	//
171	// A character score is the multiple of:
172	// 1. 1.0 if the character starts a segment, .8 if the character start a
173	// mid-segment word, otherwise 0.6. This carries over to immediately
174	// following characters.
175	// 2. For the final character match, the multiplier from (1) is reduced to
176	// .8 if the next character in the input is a mid-segment word, or 0.6 if
177	// the next character in the input is not a word or segment start. This
178	// ensures that we favor whole-word or whole-segment matches over prefix
179	// matches.
180	// 3. 1.0 if the character is part of the last segment, otherwise
181	// 1.0-.2*<segments from the right>, with a max segment count of 3.
182	//
183	// This is a very naive algorithm, but it is fast. There's lots of prior art
184	// here, and we should leverage it. For example, we could explicitly consider
185	// character distance, and exact matches of words or segments.
186	//
187	// Also note that this might not actually find the highest scoring match, as
188	// doing so could require a non-linear algorithm, depending on how the score
189	// is calculated.
190
191	pi = 0
192	p = m.pattern[pi]
193
194	const (
195	segStreak = 1.0
196	wordStreak = 0.8
197	noStreak = 0.6
198	perSegment = 0.2 // we count at most 3 segments above
199	)
200
201	streakBonus := noStreak
202	totScore := 0.0
203	for ii := uint8(start); ii < inputLen; ii++ {
204	r := m.inputBuffer[ii]
205	if r == p {
206	pi++
207	p = m.pattern[pi]
208	// Note: this could be optimized with some bit operations.
209	switch {
210	case m.roles[ii]&segmentStart != 0 && segStreak > streakBonus:
211	streakBonus = segStreak
212	case m.roles[ii]&wordStart != 0 && wordStreak > streakBonus:
213	streakBonus = wordStreak
214	}
215	finalChar := pi >= m.patternLen
216	// finalCost := 1.0
217	if finalChar && streakBonus > noStreak {
218	switch {
219	case ii == inputLen-1 \|\| m.roles[ii+1]&segmentStart != 0:
220	// Full segment: no reduction
221	case m.roles[ii+1]&wordStart != 0:
222	streakBonus = wordStreak
223	default:
224	streakBonus = noStreak
225	}
226	}
227	totScore += streakBonus * (1.0 - float64(m.segments[ii])*perSegment)
228	if finalChar {
229	break
230	}
231	} else {
232	streakBonus = noStreak
233	}
234	}
235
236	return start, totScore / float64(m.patternLen)
237	}
238

Members

GoPLS Viewer