annotate clang-tools-extra/clangd/FuzzyMatch.cpp @ 221:79ff65ed7e25

LLVM12 Original
author Shinji KONO <kono@ie.u-ryukyu.ac.jp>
date Tue, 15 Jun 2021 19:15:29 +0900
parents 1d019706d866
children c4bab56944e8
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
150
anatofuz
parents:
diff changeset
1 //===--- FuzzyMatch.h - Approximate identifier matching ---------*- C++-*-===//
anatofuz
parents:
diff changeset
2 //
anatofuz
parents:
diff changeset
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
anatofuz
parents:
diff changeset
4 // See https://llvm.org/LICENSE.txt for license information.
anatofuz
parents:
diff changeset
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
anatofuz
parents:
diff changeset
6 //
anatofuz
parents:
diff changeset
7 //===----------------------------------------------------------------------===//
anatofuz
parents:
diff changeset
8 //
anatofuz
parents:
diff changeset
9 // To check for a match between a Pattern ('u_p') and a Word ('unique_ptr'),
anatofuz
parents:
diff changeset
10 // we consider the possible partial match states:
anatofuz
parents:
diff changeset
11 //
anatofuz
parents:
diff changeset
12 // u n i q u e _ p t r
anatofuz
parents:
diff changeset
13 // +---------------------
anatofuz
parents:
diff changeset
14 // |A . . . . . . . . . .
anatofuz
parents:
diff changeset
15 // u|
anatofuz
parents:
diff changeset
16 // |. . . . . . . . . . .
anatofuz
parents:
diff changeset
17 // _|
anatofuz
parents:
diff changeset
18 // |. . . . . . . O . . .
anatofuz
parents:
diff changeset
19 // p|
anatofuz
parents:
diff changeset
20 // |. . . . . . . . . . B
anatofuz
parents:
diff changeset
21 //
anatofuz
parents:
diff changeset
22 // Each dot represents some prefix of the pattern being matched against some
anatofuz
parents:
diff changeset
23 // prefix of the word.
anatofuz
parents:
diff changeset
24 // - A is the initial state: '' matched against ''
anatofuz
parents:
diff changeset
25 // - O is an intermediate state: 'u_' matched against 'unique_'
anatofuz
parents:
diff changeset
26 // - B is the target state: 'u_p' matched against 'unique_ptr'
anatofuz
parents:
diff changeset
27 //
anatofuz
parents:
diff changeset
28 // We aim to find the best path from A->B.
anatofuz
parents:
diff changeset
29 // - Moving right (consuming a word character)
anatofuz
parents:
diff changeset
30 // Always legal: not all word characters must match.
anatofuz
parents:
diff changeset
31 // - Moving diagonally (consuming both a word and pattern character)
anatofuz
parents:
diff changeset
32 // Legal if the characters match.
anatofuz
parents:
diff changeset
33 // - Moving down (consuming a pattern character) is never legal.
anatofuz
parents:
diff changeset
34 // Never legal: all pattern characters must match something.
anatofuz
parents:
diff changeset
35 // Characters are matched case-insensitively.
anatofuz
parents:
diff changeset
36 // The first pattern character may only match the start of a word segment.
anatofuz
parents:
diff changeset
37 //
anatofuz
parents:
diff changeset
38 // The scoring is based on heuristics:
anatofuz
parents:
diff changeset
39 // - when matching a character, apply a bonus or penalty depending on the
anatofuz
parents:
diff changeset
40 // match quality (does case match, do word segments align, etc)
anatofuz
parents:
diff changeset
41 // - when skipping a character, apply a penalty if it hurts the match
anatofuz
parents:
diff changeset
42 // (it starts a word segment, or splits the matched region, etc)
anatofuz
parents:
diff changeset
43 //
anatofuz
parents:
diff changeset
44 // These heuristics require the ability to "look backward" one character, to
anatofuz
parents:
diff changeset
45 // see whether it was matched or not. Therefore the dynamic-programming matrix
anatofuz
parents:
diff changeset
46 // has an extra dimension (last character matched).
anatofuz
parents:
diff changeset
47 // Each entry also has an additional flag indicating whether the last-but-one
anatofuz
parents:
diff changeset
48 // character matched, which is needed to trace back through the scoring table
anatofuz
parents:
diff changeset
49 // and reconstruct the match.
anatofuz
parents:
diff changeset
50 //
anatofuz
parents:
diff changeset
51 // We treat strings as byte-sequences, so only ASCII has first-class support.
anatofuz
parents:
diff changeset
52 //
anatofuz
parents:
diff changeset
53 // This algorithm was inspired by VS code's client-side filtering, and aims
anatofuz
parents:
diff changeset
54 // to be mostly-compatible.
anatofuz
parents:
diff changeset
55 //
anatofuz
parents:
diff changeset
56 //===----------------------------------------------------------------------===//
anatofuz
parents:
diff changeset
57
anatofuz
parents:
diff changeset
58 #include "FuzzyMatch.h"
anatofuz
parents:
diff changeset
59 #include "llvm/ADT/Optional.h"
anatofuz
parents:
diff changeset
60 #include "llvm/Support/Format.h"
anatofuz
parents:
diff changeset
61
anatofuz
parents:
diff changeset
62 namespace clang {
anatofuz
parents:
diff changeset
63 namespace clangd {
anatofuz
parents:
diff changeset
64
anatofuz
parents:
diff changeset
65 constexpr int FuzzyMatcher::MaxPat;
anatofuz
parents:
diff changeset
66 constexpr int FuzzyMatcher::MaxWord;
anatofuz
parents:
diff changeset
67
anatofuz
parents:
diff changeset
68 static char lower(char C) { return C >= 'A' && C <= 'Z' ? C + ('a' - 'A') : C; }
anatofuz
parents:
diff changeset
69 // A "negative infinity" score that won't overflow.
anatofuz
parents:
diff changeset
70 // We use this to mark unreachable states and forbidden solutions.
anatofuz
parents:
diff changeset
71 // Score field is 15 bits wide, min value is -2^14, we use half of that.
anatofuz
parents:
diff changeset
72 static constexpr int AwfulScore = -(1 << 13);
anatofuz
parents:
diff changeset
73 static bool isAwful(int S) { return S < AwfulScore / 2; }
anatofuz
parents:
diff changeset
74 static constexpr int PerfectBonus = 4; // Perfect per-pattern-char score.
anatofuz
parents:
diff changeset
75
anatofuz
parents:
diff changeset
76 FuzzyMatcher::FuzzyMatcher(llvm::StringRef Pattern)
anatofuz
parents:
diff changeset
77 : PatN(std::min<int>(MaxPat, Pattern.size())),
anatofuz
parents:
diff changeset
78 ScoreScale(PatN ? float{1} / (PerfectBonus * PatN) : 0), WordN(0) {
anatofuz
parents:
diff changeset
79 std::copy(Pattern.begin(), Pattern.begin() + PatN, Pat);
anatofuz
parents:
diff changeset
80 for (int I = 0; I < PatN; ++I)
anatofuz
parents:
diff changeset
81 LowPat[I] = lower(Pat[I]);
anatofuz
parents:
diff changeset
82 Scores[0][0][Miss] = {0, Miss};
anatofuz
parents:
diff changeset
83 Scores[0][0][Match] = {AwfulScore, Miss};
anatofuz
parents:
diff changeset
84 for (int P = 0; P <= PatN; ++P)
anatofuz
parents:
diff changeset
85 for (int W = 0; W < P; ++W)
anatofuz
parents:
diff changeset
86 for (Action A : {Miss, Match})
anatofuz
parents:
diff changeset
87 Scores[P][W][A] = {AwfulScore, Miss};
anatofuz
parents:
diff changeset
88 PatTypeSet = calculateRoles(llvm::StringRef(Pat, PatN),
anatofuz
parents:
diff changeset
89 llvm::makeMutableArrayRef(PatRole, PatN));
anatofuz
parents:
diff changeset
90 }
anatofuz
parents:
diff changeset
91
anatofuz
parents:
diff changeset
92 llvm::Optional<float> FuzzyMatcher::match(llvm::StringRef Word) {
anatofuz
parents:
diff changeset
93 if (!(WordContainsPattern = init(Word)))
anatofuz
parents:
diff changeset
94 return llvm::None;
anatofuz
parents:
diff changeset
95 if (!PatN)
anatofuz
parents:
diff changeset
96 return 1;
anatofuz
parents:
diff changeset
97 buildGraph();
anatofuz
parents:
diff changeset
98 auto Best = std::max(Scores[PatN][WordN][Miss].Score,
anatofuz
parents:
diff changeset
99 Scores[PatN][WordN][Match].Score);
anatofuz
parents:
diff changeset
100 if (isAwful(Best))
anatofuz
parents:
diff changeset
101 return llvm::None;
anatofuz
parents:
diff changeset
102 float Score =
anatofuz
parents:
diff changeset
103 ScoreScale * std::min(PerfectBonus * PatN, std::max<int>(0, Best));
anatofuz
parents:
diff changeset
104 // If the pattern is as long as the word, we have an exact string match,
anatofuz
parents:
diff changeset
105 // since every pattern character must match something.
anatofuz
parents:
diff changeset
106 if (WordN == PatN)
anatofuz
parents:
diff changeset
107 Score *= 2; // May not be perfect 2 if case differs in a significant way.
anatofuz
parents:
diff changeset
108 return Score;
anatofuz
parents:
diff changeset
109 }
anatofuz
parents:
diff changeset
110
anatofuz
parents:
diff changeset
111 // We get CharTypes from a lookup table. Each is 2 bits, 4 fit in each byte.
anatofuz
parents:
diff changeset
112 // The top 6 bits of the char select the byte, the bottom 2 select the offset.
anatofuz
parents:
diff changeset
113 // e.g. 'q' = 010100 01 = byte 28 (55), bits 3-2 (01) -> Lower.
anatofuz
parents:
diff changeset
114 constexpr static uint8_t CharTypes[] = {
anatofuz
parents:
diff changeset
115 0x00, 0x00, 0x00, 0x00, // Control characters
anatofuz
parents:
diff changeset
116 0x00, 0x00, 0x00, 0x00, // Control characters
anatofuz
parents:
diff changeset
117 0xff, 0xff, 0xff, 0xff, // Punctuation
anatofuz
parents:
diff changeset
118 0x55, 0x55, 0xf5, 0xff, // Numbers->Lower, more Punctuation.
anatofuz
parents:
diff changeset
119 0xab, 0xaa, 0xaa, 0xaa, // @ and A-O
anatofuz
parents:
diff changeset
120 0xaa, 0xaa, 0xea, 0xff, // P-Z, more Punctuation.
anatofuz
parents:
diff changeset
121 0x57, 0x55, 0x55, 0x55, // ` and a-o
anatofuz
parents:
diff changeset
122 0x55, 0x55, 0xd5, 0x3f, // p-z, Punctuation, DEL.
anatofuz
parents:
diff changeset
123 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, // Bytes over 127 -> Lower.
anatofuz
parents:
diff changeset
124 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, // (probably UTF-8).
anatofuz
parents:
diff changeset
125 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55,
anatofuz
parents:
diff changeset
126 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55,
anatofuz
parents:
diff changeset
127 };
anatofuz
parents:
diff changeset
128
anatofuz
parents:
diff changeset
129 // The Role can be determined from the Type of a character and its neighbors:
anatofuz
parents:
diff changeset
130 //
anatofuz
parents:
diff changeset
131 // Example | Chars | Type | Role
anatofuz
parents:
diff changeset
132 // ---------+--------------+-----
anatofuz
parents:
diff changeset
133 // F(o)oBar | Foo | Ull | Tail
anatofuz
parents:
diff changeset
134 // Foo(B)ar | oBa | lUl | Head
anatofuz
parents:
diff changeset
135 // (f)oo | ^fo | Ell | Head
anatofuz
parents:
diff changeset
136 // H(T)TP | HTT | UUU | Tail
anatofuz
parents:
diff changeset
137 //
anatofuz
parents:
diff changeset
138 // Our lookup table maps a 6 bit key (Prev, Curr, Next) to a 2-bit Role.
anatofuz
parents:
diff changeset
139 // A byte packs 4 Roles. (Prev, Curr) selects a byte, Next selects the offset.
anatofuz
parents:
diff changeset
140 // e.g. Lower, Upper, Lower -> 01 10 01 -> byte 6 (aa), bits 3-2 (10) -> Head.
anatofuz
parents:
diff changeset
141 constexpr static uint8_t CharRoles[] = {
anatofuz
parents:
diff changeset
142 // clang-format off
anatofuz
parents:
diff changeset
143 // Curr= Empty Lower Upper Separ
anatofuz
parents:
diff changeset
144 /* Prev=Empty */ 0x00, 0xaa, 0xaa, 0xff, // At start, Lower|Upper->Head
anatofuz
parents:
diff changeset
145 /* Prev=Lower */ 0x00, 0x55, 0xaa, 0xff, // In word, Upper->Head;Lower->Tail
anatofuz
parents:
diff changeset
146 /* Prev=Upper */ 0x00, 0x55, 0x59, 0xff, // Ditto, but U(U)U->Tail
anatofuz
parents:
diff changeset
147 /* Prev=Separ */ 0x00, 0xaa, 0xaa, 0xff, // After separator, like at start
anatofuz
parents:
diff changeset
148 // clang-format on
anatofuz
parents:
diff changeset
149 };
anatofuz
parents:
diff changeset
150
anatofuz
parents:
diff changeset
151 template <typename T> static T packedLookup(const uint8_t *Data, int I) {
anatofuz
parents:
diff changeset
152 return static_cast<T>((Data[I >> 2] >> ((I & 3) * 2)) & 3);
anatofuz
parents:
diff changeset
153 }
anatofuz
parents:
diff changeset
154 CharTypeSet calculateRoles(llvm::StringRef Text,
anatofuz
parents:
diff changeset
155 llvm::MutableArrayRef<CharRole> Roles) {
anatofuz
parents:
diff changeset
156 assert(Text.size() == Roles.size());
anatofuz
parents:
diff changeset
157 if (Text.size() == 0)
anatofuz
parents:
diff changeset
158 return 0;
anatofuz
parents:
diff changeset
159 CharType Type = packedLookup<CharType>(CharTypes, Text[0]);
anatofuz
parents:
diff changeset
160 CharTypeSet TypeSet = 1 << Type;
anatofuz
parents:
diff changeset
161 // Types holds a sliding window of (Prev, Curr, Next) types.
anatofuz
parents:
diff changeset
162 // Initial value is (Empty, Empty, type of Text[0]).
anatofuz
parents:
diff changeset
163 int Types = Type;
anatofuz
parents:
diff changeset
164 // Rotate slides in the type of the next character.
anatofuz
parents:
diff changeset
165 auto Rotate = [&](CharType T) { Types = ((Types << 2) | T) & 0x3f; };
anatofuz
parents:
diff changeset
166 for (unsigned I = 0; I < Text.size() - 1; ++I) {
anatofuz
parents:
diff changeset
167 // For each character, rotate in the next, and look up the role.
anatofuz
parents:
diff changeset
168 Type = packedLookup<CharType>(CharTypes, Text[I + 1]);
anatofuz
parents:
diff changeset
169 TypeSet |= 1 << Type;
anatofuz
parents:
diff changeset
170 Rotate(Type);
anatofuz
parents:
diff changeset
171 Roles[I] = packedLookup<CharRole>(CharRoles, Types);
anatofuz
parents:
diff changeset
172 }
anatofuz
parents:
diff changeset
173 // For the last character, the "next character" is Empty.
anatofuz
parents:
diff changeset
174 Rotate(Empty);
anatofuz
parents:
diff changeset
175 Roles[Text.size() - 1] = packedLookup<CharRole>(CharRoles, Types);
anatofuz
parents:
diff changeset
176 return TypeSet;
anatofuz
parents:
diff changeset
177 }
anatofuz
parents:
diff changeset
178
anatofuz
parents:
diff changeset
179 // Sets up the data structures matching Word.
anatofuz
parents:
diff changeset
180 // Returns false if we can cheaply determine that no match is possible.
anatofuz
parents:
diff changeset
181 bool FuzzyMatcher::init(llvm::StringRef NewWord) {
anatofuz
parents:
diff changeset
182 WordN = std::min<int>(MaxWord, NewWord.size());
anatofuz
parents:
diff changeset
183 if (PatN > WordN)
anatofuz
parents:
diff changeset
184 return false;
anatofuz
parents:
diff changeset
185 std::copy(NewWord.begin(), NewWord.begin() + WordN, Word);
anatofuz
parents:
diff changeset
186 if (PatN == 0)
anatofuz
parents:
diff changeset
187 return true;
anatofuz
parents:
diff changeset
188 for (int I = 0; I < WordN; ++I)
anatofuz
parents:
diff changeset
189 LowWord[I] = lower(Word[I]);
anatofuz
parents:
diff changeset
190
anatofuz
parents:
diff changeset
191 // Cheap subsequence check.
anatofuz
parents:
diff changeset
192 for (int W = 0, P = 0; P != PatN; ++W) {
anatofuz
parents:
diff changeset
193 if (W == WordN)
anatofuz
parents:
diff changeset
194 return false;
anatofuz
parents:
diff changeset
195 if (LowWord[W] == LowPat[P])
anatofuz
parents:
diff changeset
196 ++P;
anatofuz
parents:
diff changeset
197 }
anatofuz
parents:
diff changeset
198
anatofuz
parents:
diff changeset
199 // FIXME: some words are hard to tokenize algorithmically.
anatofuz
parents:
diff changeset
200 // e.g. vsprintf is V S Print F, and should match [pri] but not [int].
anatofuz
parents:
diff changeset
201 // We could add a tokenization dictionary for common stdlib names.
anatofuz
parents:
diff changeset
202 WordTypeSet = calculateRoles(llvm::StringRef(Word, WordN),
anatofuz
parents:
diff changeset
203 llvm::makeMutableArrayRef(WordRole, WordN));
anatofuz
parents:
diff changeset
204 return true;
anatofuz
parents:
diff changeset
205 }
anatofuz
parents:
diff changeset
206
anatofuz
parents:
diff changeset
207 // The forwards pass finds the mappings of Pattern onto Word.
anatofuz
parents:
diff changeset
208 // Score = best score achieved matching Word[..W] against Pat[..P].
anatofuz
parents:
diff changeset
209 // Unlike other tables, indices range from 0 to N *inclusive*
anatofuz
parents:
diff changeset
210 // Matched = whether we chose to match Word[W] with Pat[P] or not.
anatofuz
parents:
diff changeset
211 //
anatofuz
parents:
diff changeset
212 // Points are mostly assigned to matched characters, with 1 being a good score
anatofuz
parents:
diff changeset
213 // and 3 being a great one. So we treat the score range as [0, 3 * PatN].
anatofuz
parents:
diff changeset
214 // This range is not strict: we can apply larger bonuses/penalties, or penalize
anatofuz
parents:
diff changeset
215 // non-matched characters.
anatofuz
parents:
diff changeset
216 void FuzzyMatcher::buildGraph() {
anatofuz
parents:
diff changeset
217 for (int W = 0; W < WordN; ++W) {
anatofuz
parents:
diff changeset
218 Scores[0][W + 1][Miss] = {Scores[0][W][Miss].Score - skipPenalty(W, Miss),
anatofuz
parents:
diff changeset
219 Miss};
anatofuz
parents:
diff changeset
220 Scores[0][W + 1][Match] = {AwfulScore, Miss};
anatofuz
parents:
diff changeset
221 }
anatofuz
parents:
diff changeset
222 for (int P = 0; P < PatN; ++P) {
anatofuz
parents:
diff changeset
223 for (int W = P; W < WordN; ++W) {
anatofuz
parents:
diff changeset
224 auto &Score = Scores[P + 1][W + 1], &PreMiss = Scores[P + 1][W];
anatofuz
parents:
diff changeset
225
anatofuz
parents:
diff changeset
226 auto MatchMissScore = PreMiss[Match].Score;
anatofuz
parents:
diff changeset
227 auto MissMissScore = PreMiss[Miss].Score;
anatofuz
parents:
diff changeset
228 if (P < PatN - 1) { // Skipping trailing characters is always free.
anatofuz
parents:
diff changeset
229 MatchMissScore -= skipPenalty(W, Match);
anatofuz
parents:
diff changeset
230 MissMissScore -= skipPenalty(W, Miss);
anatofuz
parents:
diff changeset
231 }
anatofuz
parents:
diff changeset
232 Score[Miss] = (MatchMissScore > MissMissScore)
anatofuz
parents:
diff changeset
233 ? ScoreInfo{MatchMissScore, Match}
anatofuz
parents:
diff changeset
234 : ScoreInfo{MissMissScore, Miss};
anatofuz
parents:
diff changeset
235
anatofuz
parents:
diff changeset
236 auto &PreMatch = Scores[P][W];
anatofuz
parents:
diff changeset
237 auto MatchMatchScore =
anatofuz
parents:
diff changeset
238 allowMatch(P, W, Match)
anatofuz
parents:
diff changeset
239 ? PreMatch[Match].Score + matchBonus(P, W, Match)
anatofuz
parents:
diff changeset
240 : AwfulScore;
anatofuz
parents:
diff changeset
241 auto MissMatchScore = allowMatch(P, W, Miss)
anatofuz
parents:
diff changeset
242 ? PreMatch[Miss].Score + matchBonus(P, W, Miss)
anatofuz
parents:
diff changeset
243 : AwfulScore;
anatofuz
parents:
diff changeset
244 Score[Match] = (MatchMatchScore > MissMatchScore)
anatofuz
parents:
diff changeset
245 ? ScoreInfo{MatchMatchScore, Match}
anatofuz
parents:
diff changeset
246 : ScoreInfo{MissMatchScore, Miss};
anatofuz
parents:
diff changeset
247 }
anatofuz
parents:
diff changeset
248 }
anatofuz
parents:
diff changeset
249 }
anatofuz
parents:
diff changeset
250
anatofuz
parents:
diff changeset
251 bool FuzzyMatcher::allowMatch(int P, int W, Action Last) const {
anatofuz
parents:
diff changeset
252 if (LowPat[P] != LowWord[W])
anatofuz
parents:
diff changeset
253 return false;
anatofuz
parents:
diff changeset
254 // We require a "strong" match:
anatofuz
parents:
diff changeset
255 // - for the first pattern character. [foo] !~ "barefoot"
anatofuz
parents:
diff changeset
256 // - after a gap. [pat] !~ "patnther"
anatofuz
parents:
diff changeset
257 if (Last == Miss) {
anatofuz
parents:
diff changeset
258 // We're banning matches outright, so conservatively accept some other cases
anatofuz
parents:
diff changeset
259 // where our segmentation might be wrong:
anatofuz
parents:
diff changeset
260 // - allow matching B in ABCDef (but not in NDEBUG)
anatofuz
parents:
diff changeset
261 // - we'd like to accept print in sprintf, but too many false positives
anatofuz
parents:
diff changeset
262 if (WordRole[W] == Tail &&
anatofuz
parents:
diff changeset
263 (Word[W] == LowWord[W] || !(WordTypeSet & 1 << Lower)))
anatofuz
parents:
diff changeset
264 return false;
anatofuz
parents:
diff changeset
265 }
anatofuz
parents:
diff changeset
266 return true;
anatofuz
parents:
diff changeset
267 }
anatofuz
parents:
diff changeset
268
anatofuz
parents:
diff changeset
269 int FuzzyMatcher::skipPenalty(int W, Action Last) const {
anatofuz
parents:
diff changeset
270 if (W == 0) // Skipping the first character.
anatofuz
parents:
diff changeset
271 return 3;
anatofuz
parents:
diff changeset
272 if (WordRole[W] == Head) // Skipping a segment.
anatofuz
parents:
diff changeset
273 return 1; // We want to keep this lower than a consecutive match bonus.
anatofuz
parents:
diff changeset
274 // Instead of penalizing non-consecutive matches, we give a bonus to a
anatofuz
parents:
diff changeset
275 // consecutive match in matchBonus. This produces a better score distribution
anatofuz
parents:
diff changeset
276 // than penalties in case of small patterns, e.g. 'up' for 'unique_ptr'.
anatofuz
parents:
diff changeset
277 return 0;
anatofuz
parents:
diff changeset
278 }
anatofuz
parents:
diff changeset
279
anatofuz
parents:
diff changeset
280 int FuzzyMatcher::matchBonus(int P, int W, Action Last) const {
anatofuz
parents:
diff changeset
281 assert(LowPat[P] == LowWord[W]);
anatofuz
parents:
diff changeset
282 int S = 1;
anatofuz
parents:
diff changeset
283 bool IsPatSingleCase =
anatofuz
parents:
diff changeset
284 (PatTypeSet == 1 << Lower) || (PatTypeSet == 1 << Upper);
anatofuz
parents:
diff changeset
285 // Bonus: case matches, or a Head in the pattern aligns with one in the word.
anatofuz
parents:
diff changeset
286 // Single-case patterns lack segmentation signals and we assume any character
anatofuz
parents:
diff changeset
287 // can be a head of a segment.
anatofuz
parents:
diff changeset
288 if (Pat[P] == Word[W] ||
anatofuz
parents:
diff changeset
289 (WordRole[W] == Head && (IsPatSingleCase || PatRole[P] == Head)))
anatofuz
parents:
diff changeset
290 ++S;
anatofuz
parents:
diff changeset
291 // Bonus: a consecutive match. First character match also gets a bonus to
anatofuz
parents:
diff changeset
292 // ensure prefix final match score normalizes to 1.0.
anatofuz
parents:
diff changeset
293 if (W == 0 || Last == Match)
anatofuz
parents:
diff changeset
294 S += 2;
anatofuz
parents:
diff changeset
295 // Penalty: matching inside a segment (and previous char wasn't matched).
anatofuz
parents:
diff changeset
296 if (WordRole[W] == Tail && P && Last == Miss)
anatofuz
parents:
diff changeset
297 S -= 3;
anatofuz
parents:
diff changeset
298 // Penalty: a Head in the pattern matches in the middle of a word segment.
anatofuz
parents:
diff changeset
299 if (PatRole[P] == Head && WordRole[W] == Tail)
anatofuz
parents:
diff changeset
300 --S;
anatofuz
parents:
diff changeset
301 // Penalty: matching the first pattern character in the middle of a segment.
anatofuz
parents:
diff changeset
302 if (P == 0 && WordRole[W] == Tail)
anatofuz
parents:
diff changeset
303 S -= 4;
anatofuz
parents:
diff changeset
304 assert(S <= PerfectBonus);
anatofuz
parents:
diff changeset
305 return S;
anatofuz
parents:
diff changeset
306 }
anatofuz
parents:
diff changeset
307
anatofuz
parents:
diff changeset
308 llvm::SmallString<256> FuzzyMatcher::dumpLast(llvm::raw_ostream &OS) const {
anatofuz
parents:
diff changeset
309 llvm::SmallString<256> Result;
anatofuz
parents:
diff changeset
310 OS << "=== Match \"" << llvm::StringRef(Word, WordN) << "\" against ["
anatofuz
parents:
diff changeset
311 << llvm::StringRef(Pat, PatN) << "] ===\n";
anatofuz
parents:
diff changeset
312 if (PatN == 0) {
anatofuz
parents:
diff changeset
313 OS << "Pattern is empty: perfect match.\n";
anatofuz
parents:
diff changeset
314 return Result = llvm::StringRef(Word, WordN);
anatofuz
parents:
diff changeset
315 }
anatofuz
parents:
diff changeset
316 if (WordN == 0) {
anatofuz
parents:
diff changeset
317 OS << "Word is empty: no match.\n";
anatofuz
parents:
diff changeset
318 return Result;
anatofuz
parents:
diff changeset
319 }
anatofuz
parents:
diff changeset
320 if (!WordContainsPattern) {
anatofuz
parents:
diff changeset
321 OS << "Substring check failed.\n";
anatofuz
parents:
diff changeset
322 return Result;
anatofuz
parents:
diff changeset
323 } else if (isAwful(std::max(Scores[PatN][WordN][Match].Score,
anatofuz
parents:
diff changeset
324 Scores[PatN][WordN][Miss].Score))) {
anatofuz
parents:
diff changeset
325 OS << "Substring check passed, but all matches are forbidden\n";
anatofuz
parents:
diff changeset
326 }
anatofuz
parents:
diff changeset
327 if (!(PatTypeSet & 1 << Upper))
anatofuz
parents:
diff changeset
328 OS << "Lowercase query, so scoring ignores case\n";
anatofuz
parents:
diff changeset
329
anatofuz
parents:
diff changeset
330 // Traverse Matched table backwards to reconstruct the Pattern/Word mapping.
anatofuz
parents:
diff changeset
331 // The Score table has cumulative scores, subtracting along this path gives
anatofuz
parents:
diff changeset
332 // us the per-letter scores.
anatofuz
parents:
diff changeset
333 Action Last =
anatofuz
parents:
diff changeset
334 (Scores[PatN][WordN][Match].Score > Scores[PatN][WordN][Miss].Score)
anatofuz
parents:
diff changeset
335 ? Match
anatofuz
parents:
diff changeset
336 : Miss;
anatofuz
parents:
diff changeset
337 int S[MaxWord];
anatofuz
parents:
diff changeset
338 Action A[MaxWord];
anatofuz
parents:
diff changeset
339 for (int W = WordN - 1, P = PatN - 1; W >= 0; --W) {
anatofuz
parents:
diff changeset
340 A[W] = Last;
anatofuz
parents:
diff changeset
341 const auto &Cell = Scores[P + 1][W + 1][Last];
anatofuz
parents:
diff changeset
342 if (Last == Match)
anatofuz
parents:
diff changeset
343 --P;
anatofuz
parents:
diff changeset
344 const auto &Prev = Scores[P + 1][W][Cell.Prev];
anatofuz
parents:
diff changeset
345 S[W] = Cell.Score - Prev.Score;
anatofuz
parents:
diff changeset
346 Last = Cell.Prev;
anatofuz
parents:
diff changeset
347 }
anatofuz
parents:
diff changeset
348 for (int I = 0; I < WordN; ++I) {
anatofuz
parents:
diff changeset
349 if (A[I] == Match && (I == 0 || A[I - 1] == Miss))
anatofuz
parents:
diff changeset
350 Result.push_back('[');
anatofuz
parents:
diff changeset
351 if (A[I] == Miss && I > 0 && A[I - 1] == Match)
anatofuz
parents:
diff changeset
352 Result.push_back(']');
anatofuz
parents:
diff changeset
353 Result.push_back(Word[I]);
anatofuz
parents:
diff changeset
354 }
anatofuz
parents:
diff changeset
355 if (A[WordN - 1] == Match)
anatofuz
parents:
diff changeset
356 Result.push_back(']');
anatofuz
parents:
diff changeset
357
anatofuz
parents:
diff changeset
358 for (char C : llvm::StringRef(Word, WordN))
anatofuz
parents:
diff changeset
359 OS << " " << C << " ";
anatofuz
parents:
diff changeset
360 OS << "\n";
anatofuz
parents:
diff changeset
361 for (int I = 0, J = 0; I < WordN; I++)
anatofuz
parents:
diff changeset
362 OS << " " << (A[I] == Match ? Pat[J++] : ' ') << " ";
anatofuz
parents:
diff changeset
363 OS << "\n";
anatofuz
parents:
diff changeset
364 for (int I = 0; I < WordN; I++)
anatofuz
parents:
diff changeset
365 OS << llvm::format("%2d ", S[I]);
anatofuz
parents:
diff changeset
366 OS << "\n";
anatofuz
parents:
diff changeset
367
anatofuz
parents:
diff changeset
368 OS << "\nSegmentation:";
anatofuz
parents:
diff changeset
369 OS << "\n'" << llvm::StringRef(Word, WordN) << "'\n ";
anatofuz
parents:
diff changeset
370 for (int I = 0; I < WordN; ++I)
anatofuz
parents:
diff changeset
371 OS << "?-+ "[static_cast<int>(WordRole[I])];
anatofuz
parents:
diff changeset
372 OS << "\n[" << llvm::StringRef(Pat, PatN) << "]\n ";
anatofuz
parents:
diff changeset
373 for (int I = 0; I < PatN; ++I)
anatofuz
parents:
diff changeset
374 OS << "?-+ "[static_cast<int>(PatRole[I])];
anatofuz
parents:
diff changeset
375 OS << "\n";
anatofuz
parents:
diff changeset
376
anatofuz
parents:
diff changeset
377 OS << "\nScoring table (last-Miss, last-Match):\n";
anatofuz
parents:
diff changeset
378 OS << " | ";
anatofuz
parents:
diff changeset
379 for (char C : llvm::StringRef(Word, WordN))
anatofuz
parents:
diff changeset
380 OS << " " << C << " ";
anatofuz
parents:
diff changeset
381 OS << "\n";
anatofuz
parents:
diff changeset
382 OS << "-+----" << std::string(WordN * 4, '-') << "\n";
anatofuz
parents:
diff changeset
383 for (int I = 0; I <= PatN; ++I) {
anatofuz
parents:
diff changeset
384 for (Action A : {Miss, Match}) {
anatofuz
parents:
diff changeset
385 OS << ((I && A == Miss) ? Pat[I - 1] : ' ') << "|";
anatofuz
parents:
diff changeset
386 for (int J = 0; J <= WordN; ++J) {
anatofuz
parents:
diff changeset
387 if (!isAwful(Scores[I][J][A].Score))
anatofuz
parents:
diff changeset
388 OS << llvm::format("%3d%c", Scores[I][J][A].Score,
anatofuz
parents:
diff changeset
389 Scores[I][J][A].Prev == Match ? '*' : ' ');
anatofuz
parents:
diff changeset
390 else
anatofuz
parents:
diff changeset
391 OS << " ";
anatofuz
parents:
diff changeset
392 }
anatofuz
parents:
diff changeset
393 OS << "\n";
anatofuz
parents:
diff changeset
394 }
anatofuz
parents:
diff changeset
395 }
anatofuz
parents:
diff changeset
396
anatofuz
parents:
diff changeset
397 return Result;
anatofuz
parents:
diff changeset
398 }
anatofuz
parents:
diff changeset
399
anatofuz
parents:
diff changeset
400 } // namespace clangd
anatofuz
parents:
diff changeset
401 } // namespace clang