| 1 | // Copyright 2019 The Go Authors. All rights reserved. |
|---|---|
| 2 | // Use of this source code is governed by a BSD-style |
| 3 | // license that can be found in the LICENSE file. |
| 4 | |
| 5 | // Package fuzzy implements a fuzzy matching algorithm. |
| 6 | package fuzzy |
| 7 | |
| 8 | import ( |
| 9 | "bytes" |
| 10 | "fmt" |
| 11 | ) |
| 12 | |
| 13 | const ( |
| 14 | // MaxInputSize is the maximum size of the input scored against the fuzzy matcher. Longer inputs |
| 15 | // will be truncated to this size. |
| 16 | MaxInputSize = 127 |
| 17 | // MaxPatternSize is the maximum size of the pattern used to construct the fuzzy matcher. Longer |
| 18 | // inputs are truncated to this size. |
| 19 | MaxPatternSize = 63 |
| 20 | ) |
| 21 | |
| 22 | type scoreVal int |
| 23 | |
| 24 | func (s scoreVal) val() int { |
| 25 | return int(s) >> 1 |
| 26 | } |
| 27 | |
| 28 | func (s scoreVal) prevK() int { |
| 29 | return int(s) & 1 |
| 30 | } |
| 31 | |
| 32 | func score(val int, prevK int /*0 or 1*/) scoreVal { |
| 33 | return scoreVal(val<<1 + prevK) |
| 34 | } |
| 35 | |
| 36 | // Matcher implements a fuzzy matching algorithm for scoring candidates against a pattern. |
| 37 | // The matcher does not support parallel usage. |
| 38 | type Matcher struct { |
| 39 | pattern string |
| 40 | patternLower []byte // lower-case version of the pattern |
| 41 | patternShort []byte // first characters of the pattern |
| 42 | caseSensitive bool // set if the pattern is mix-cased |
| 43 | |
| 44 | patternRoles []RuneRole // the role of each character in the pattern |
| 45 | roles []RuneRole // the role of each character in the tested string |
| 46 | |
| 47 | scores [MaxInputSize + 1][MaxPatternSize + 1][2]scoreVal |
| 48 | |
| 49 | scoreScale float32 |
| 50 | |
| 51 | lastCandidateLen int // in bytes |
| 52 | lastCandidateMatched bool |
| 53 | |
| 54 | // Reusable buffers to avoid allocating for every candidate. |
| 55 | // - inputBuf stores the concatenated input chunks |
| 56 | // - lowerBuf stores the last candidate in lower-case |
| 57 | // - rolesBuf stores the calculated roles for each rune in the last |
| 58 | // candidate. |
| 59 | inputBuf [MaxInputSize]byte |
| 60 | lowerBuf [MaxInputSize]byte |
| 61 | rolesBuf [MaxInputSize]RuneRole |
| 62 | } |
| 63 | |
| 64 | func (m *Matcher) bestK(i, j int) int { |
| 65 | if m.scores[i][j][0].val() < m.scores[i][j][1].val() { |
| 66 | return 1 |
| 67 | } |
| 68 | return 0 |
| 69 | } |
| 70 | |
| 71 | // NewMatcher returns a new fuzzy matcher for scoring candidates against the provided pattern. |
| 72 | func NewMatcher(pattern string) *Matcher { |
| 73 | if len(pattern) > MaxPatternSize { |
| 74 | pattern = pattern[:MaxPatternSize] |
| 75 | } |
| 76 | |
| 77 | m := &Matcher{ |
| 78 | pattern: pattern, |
| 79 | patternLower: toLower([]byte(pattern), nil), |
| 80 | } |
| 81 | |
| 82 | for i, c := range m.patternLower { |
| 83 | if pattern[i] != c { |
| 84 | m.caseSensitive = true |
| 85 | break |
| 86 | } |
| 87 | } |
| 88 | |
| 89 | if len(pattern) > 3 { |
| 90 | m.patternShort = m.patternLower[:3] |
| 91 | } else { |
| 92 | m.patternShort = m.patternLower |
| 93 | } |
| 94 | |
| 95 | m.patternRoles = RuneRoles([]byte(pattern), nil) |
| 96 | |
| 97 | if len(pattern) > 0 { |
| 98 | maxCharScore := 4 |
| 99 | m.scoreScale = 1 / float32(maxCharScore*len(pattern)) |
| 100 | } |
| 101 | |
| 102 | return m |
| 103 | } |
| 104 | |
| 105 | // Score returns the score returned by matching the candidate to the pattern. |
| 106 | // This is not designed for parallel use. Multiple candidates must be scored sequentially. |
| 107 | // Returns a score between 0 and 1 (0 - no match, 1 - perfect match). |
| 108 | func (m *Matcher) Score(candidate string) float32 { |
| 109 | return m.ScoreChunks([]string{candidate}) |
| 110 | } |
| 111 | |
| 112 | func (m *Matcher) ScoreChunks(chunks []string) float32 { |
| 113 | candidate := fromChunks(chunks, m.inputBuf[:]) |
| 114 | if len(candidate) > MaxInputSize { |
| 115 | candidate = candidate[:MaxInputSize] |
| 116 | } |
| 117 | lower := toLower(candidate, m.lowerBuf[:]) |
| 118 | m.lastCandidateLen = len(candidate) |
| 119 | |
| 120 | if len(m.pattern) == 0 { |
| 121 | // Empty patterns perfectly match candidates. |
| 122 | return 1 |
| 123 | } |
| 124 | |
| 125 | if m.match(candidate, lower) { |
| 126 | sc := m.computeScore(candidate, lower) |
| 127 | if sc > minScore/2 && !m.poorMatch() { |
| 128 | m.lastCandidateMatched = true |
| 129 | if len(m.pattern) == len(candidate) { |
| 130 | // Perfect match. |
| 131 | return 1 |
| 132 | } |
| 133 | |
| 134 | if sc < 0 { |
| 135 | sc = 0 |
| 136 | } |
| 137 | normalizedScore := float32(sc) * m.scoreScale |
| 138 | if normalizedScore > 1 { |
| 139 | normalizedScore = 1 |
| 140 | } |
| 141 | |
| 142 | return normalizedScore |
| 143 | } |
| 144 | } |
| 145 | |
| 146 | m.lastCandidateMatched = false |
| 147 | return 0 |
| 148 | } |
| 149 | |
| 150 | const minScore = -10000 |
| 151 | |
| 152 | // MatchedRanges returns matches ranges for the last scored string as a flattened array of |
| 153 | // [begin, end) byte offset pairs. |
| 154 | func (m *Matcher) MatchedRanges() []int { |
| 155 | if len(m.pattern) == 0 || !m.lastCandidateMatched { |
| 156 | return nil |
| 157 | } |
| 158 | i, j := m.lastCandidateLen, len(m.pattern) |
| 159 | if m.scores[i][j][0].val() < minScore/2 && m.scores[i][j][1].val() < minScore/2 { |
| 160 | return nil |
| 161 | } |
| 162 | |
| 163 | var ret []int |
| 164 | k := m.bestK(i, j) |
| 165 | for i > 0 { |
| 166 | take := (k == 1) |
| 167 | k = m.scores[i][j][k].prevK() |
| 168 | if take { |
| 169 | if len(ret) == 0 || ret[len(ret)-1] != i { |
| 170 | ret = append(ret, i) |
| 171 | ret = append(ret, i-1) |
| 172 | } else { |
| 173 | ret[len(ret)-1] = i - 1 |
| 174 | } |
| 175 | j-- |
| 176 | } |
| 177 | i-- |
| 178 | } |
| 179 | // Reverse slice. |
| 180 | for i := 0; i < len(ret)/2; i++ { |
| 181 | ret[i], ret[len(ret)-1-i] = ret[len(ret)-1-i], ret[i] |
| 182 | } |
| 183 | return ret |
| 184 | } |
| 185 | |
| 186 | func (m *Matcher) match(candidate []byte, candidateLower []byte) bool { |
| 187 | i, j := 0, 0 |
| 188 | for ; i < len(candidateLower) && j < len(m.patternLower); i++ { |
| 189 | if candidateLower[i] == m.patternLower[j] { |
| 190 | j++ |
| 191 | } |
| 192 | } |
| 193 | if j != len(m.patternLower) { |
| 194 | return false |
| 195 | } |
| 196 | |
| 197 | // The input passes the simple test against pattern, so it is time to classify its characters. |
| 198 | // Character roles are used below to find the last segment. |
| 199 | m.roles = RuneRoles(candidate, m.rolesBuf[:]) |
| 200 | |
| 201 | return true |
| 202 | } |
| 203 | |
| 204 | func (m *Matcher) computeScore(candidate []byte, candidateLower []byte) int { |
| 205 | pattLen, candLen := len(m.pattern), len(candidate) |
| 206 | |
| 207 | for j := 0; j <= len(m.pattern); j++ { |
| 208 | m.scores[0][j][0] = minScore << 1 |
| 209 | m.scores[0][j][1] = minScore << 1 |
| 210 | } |
| 211 | m.scores[0][0][0] = score(0, 0) // Start with 0. |
| 212 | |
| 213 | segmentsLeft, lastSegStart := 1, 0 |
| 214 | for i := 0; i < candLen; i++ { |
| 215 | if m.roles[i] == RSep { |
| 216 | segmentsLeft++ |
| 217 | lastSegStart = i + 1 |
| 218 | } |
| 219 | } |
| 220 | |
| 221 | // A per-character bonus for a consecutive match. |
| 222 | consecutiveBonus := 2 |
| 223 | wordIdx := 0 // Word count within segment. |
| 224 | for i := 1; i <= candLen; i++ { |
| 225 | |
| 226 | role := m.roles[i-1] |
| 227 | isHead := role == RHead |
| 228 | |
| 229 | if isHead { |
| 230 | wordIdx++ |
| 231 | } else if role == RSep && segmentsLeft > 1 { |
| 232 | wordIdx = 0 |
| 233 | segmentsLeft-- |
| 234 | } |
| 235 | |
| 236 | var skipPenalty int |
| 237 | if i == 1 || (i-1) == lastSegStart { |
| 238 | // Skipping the start of first or last segment. |
| 239 | skipPenalty++ |
| 240 | } |
| 241 | |
| 242 | for j := 0; j <= pattLen; j++ { |
| 243 | // By default, we don't have a match. Fill in the skip data. |
| 244 | m.scores[i][j][1] = minScore << 1 |
| 245 | |
| 246 | // Compute the skip score. |
| 247 | k := 0 |
| 248 | if m.scores[i-1][j][0].val() < m.scores[i-1][j][1].val() { |
| 249 | k = 1 |
| 250 | } |
| 251 | |
| 252 | skipScore := m.scores[i-1][j][k].val() |
| 253 | // Do not penalize missing characters after the last matched segment. |
| 254 | if j != pattLen { |
| 255 | skipScore -= skipPenalty |
| 256 | } |
| 257 | m.scores[i][j][0] = score(skipScore, k) |
| 258 | |
| 259 | if j == 0 || candidateLower[i-1] != m.patternLower[j-1] { |
| 260 | // Not a match. |
| 261 | continue |
| 262 | } |
| 263 | pRole := m.patternRoles[j-1] |
| 264 | |
| 265 | if role == RTail && pRole == RHead { |
| 266 | if j > 1 { |
| 267 | // Not a match: a head in the pattern matches a tail character in the candidate. |
| 268 | continue |
| 269 | } |
| 270 | // Special treatment for the first character of the pattern. We allow |
| 271 | // matches in the middle of a word if they are long enough, at least |
| 272 | // min(3, pattern.length) characters. |
| 273 | if !bytes.HasPrefix(candidateLower[i-1:], m.patternShort) { |
| 274 | continue |
| 275 | } |
| 276 | } |
| 277 | |
| 278 | // Compute the char score. |
| 279 | var charScore int |
| 280 | // Bonus 1: the char is in the candidate's last segment. |
| 281 | if segmentsLeft <= 1 { |
| 282 | charScore++ |
| 283 | } |
| 284 | // Bonus 2: Case match or a Head in the pattern aligns with one in the word. |
| 285 | // Single-case patterns lack segmentation signals and we assume any character |
| 286 | // can be a head of a segment. |
| 287 | if candidate[i-1] == m.pattern[j-1] || role == RHead && (!m.caseSensitive || pRole == RHead) { |
| 288 | charScore++ |
| 289 | } |
| 290 | |
| 291 | // Penalty 1: pattern char is Head, candidate char is Tail. |
| 292 | if role == RTail && pRole == RHead { |
| 293 | charScore-- |
| 294 | } |
| 295 | // Penalty 2: first pattern character matched in the middle of a word. |
| 296 | if j == 1 && role == RTail { |
| 297 | charScore -= 4 |
| 298 | } |
| 299 | |
| 300 | // Third dimension encodes whether there is a gap between the previous match and the current |
| 301 | // one. |
| 302 | for k := 0; k < 2; k++ { |
| 303 | sc := m.scores[i-1][j-1][k].val() + charScore |
| 304 | |
| 305 | isConsecutive := k == 1 || i-1 == 0 || i-1 == lastSegStart |
| 306 | if isConsecutive { |
| 307 | // Bonus 3: a consecutive match. First character match also gets a bonus to |
| 308 | // ensure prefix final match score normalizes to 1.0. |
| 309 | // Logically, this is a part of charScore, but we have to compute it here because it |
| 310 | // only applies for consecutive matches (k == 1). |
| 311 | sc += consecutiveBonus |
| 312 | } |
| 313 | if k == 0 { |
| 314 | // Penalty 3: Matching inside a segment (and previous char wasn't matched). Penalize for the lack |
| 315 | // of alignment. |
| 316 | if role == RTail || role == RUCTail { |
| 317 | sc -= 3 |
| 318 | } |
| 319 | } |
| 320 | |
| 321 | if sc > m.scores[i][j][1].val() { |
| 322 | m.scores[i][j][1] = score(sc, k) |
| 323 | } |
| 324 | } |
| 325 | } |
| 326 | } |
| 327 | |
| 328 | result := m.scores[len(candidate)][len(m.pattern)][m.bestK(len(candidate), len(m.pattern))].val() |
| 329 | |
| 330 | return result |
| 331 | } |
| 332 | |
| 333 | // ScoreTable returns the score table computed for the provided candidate. Used only for debugging. |
| 334 | func (m *Matcher) ScoreTable(candidate string) string { |
| 335 | var buf bytes.Buffer |
| 336 | |
| 337 | var line1, line2, separator bytes.Buffer |
| 338 | line1.WriteString("\t") |
| 339 | line2.WriteString("\t") |
| 340 | for j := 0; j < len(m.pattern); j++ { |
| 341 | line1.WriteString(fmt.Sprintf("%c\t\t", m.pattern[j])) |
| 342 | separator.WriteString("----------------") |
| 343 | } |
| 344 | |
| 345 | buf.WriteString(line1.String()) |
| 346 | buf.WriteString("\n") |
| 347 | buf.WriteString(separator.String()) |
| 348 | buf.WriteString("\n") |
| 349 | |
| 350 | for i := 1; i <= len(candidate); i++ { |
| 351 | line1.Reset() |
| 352 | line2.Reset() |
| 353 | |
| 354 | line1.WriteString(fmt.Sprintf("%c\t", candidate[i-1])) |
| 355 | line2.WriteString("\t") |
| 356 | |
| 357 | for j := 1; j <= len(m.pattern); j++ { |
| 358 | line1.WriteString(fmt.Sprintf("M%6d(%c)\t", m.scores[i][j][0].val(), dir(m.scores[i][j][0].prevK()))) |
| 359 | line2.WriteString(fmt.Sprintf("H%6d(%c)\t", m.scores[i][j][1].val(), dir(m.scores[i][j][1].prevK()))) |
| 360 | } |
| 361 | buf.WriteString(line1.String()) |
| 362 | buf.WriteString("\n") |
| 363 | buf.WriteString(line2.String()) |
| 364 | buf.WriteString("\n") |
| 365 | buf.WriteString(separator.String()) |
| 366 | buf.WriteString("\n") |
| 367 | } |
| 368 | |
| 369 | return buf.String() |
| 370 | } |
| 371 | |
| 372 | func dir(prevK int) rune { |
| 373 | if prevK == 0 { |
| 374 | return 'M' |
| 375 | } |
| 376 | return 'H' |
| 377 | } |
| 378 | |
| 379 | func (m *Matcher) poorMatch() bool { |
| 380 | if len(m.pattern) < 2 { |
| 381 | return false |
| 382 | } |
| 383 | |
| 384 | i, j := m.lastCandidateLen, len(m.pattern) |
| 385 | k := m.bestK(i, j) |
| 386 | |
| 387 | var counter, len int |
| 388 | for i > 0 { |
| 389 | take := (k == 1) |
| 390 | k = m.scores[i][j][k].prevK() |
| 391 | if take { |
| 392 | len++ |
| 393 | if k == 0 && len < 3 && m.roles[i-1] == RTail { |
| 394 | // Short match in the middle of a word |
| 395 | counter++ |
| 396 | if counter > 1 { |
| 397 | return true |
| 398 | } |
| 399 | } |
| 400 | j-- |
| 401 | } else { |
| 402 | len = 0 |
| 403 | } |
| 404 | i-- |
| 405 | } |
| 406 | return false |
| 407 | } |
| 408 | |
| 409 | // BestMatch returns the name most similar to the |
| 410 | // pattern, using fuzzy matching, or the empty string. |
| 411 | func BestMatch(pattern string, names []string) string { |
| 412 | fuzz := NewMatcher(pattern) |
| 413 | best := "" |
| 414 | highScore := float32(0) // minimum score is 0 (no match) |
| 415 | for _, name := range names { |
| 416 | // TODO: Improve scoring algorithm. |
| 417 | score := fuzz.Score(name) |
| 418 | if score > highScore { |
| 419 | highScore = score |
| 420 | best = name |
| 421 | } else if score == 0 { |
| 422 | // Order matters in the fuzzy matching algorithm. If we find no match |
| 423 | // when matching the target to the identifier, try matching the identifier |
| 424 | // to the target. |
| 425 | revFuzz := NewMatcher(name) |
| 426 | revScore := revFuzz.Score(pattern) |
| 427 | if revScore > highScore { |
| 428 | highScore = revScore |
| 429 | best = name |
| 430 | } |
| 431 | } |
| 432 | } |
| 433 | return best |
| 434 | } |
| 435 |
Members