CbC/CbC_llvm: clang-tools-extra/clangd/FuzzyMatch.cpp annotate

annotate clang-tools-extra/clangd/FuzzyMatch.cpp @ 221:79ff65ed7e25

LLVM12 Original

author	Shinji KONO <kono@ie.u-ryukyu.ac.jp>
date	Tue, 15 Jun 2021 19:15:29 +0900
parents	1d019706d866
children	c4bab56944e8

rev	line source
150 1d019706d866 LLVM10 anatofuz parents: diff changeset	1 //===--- FuzzyMatch.cpp - Approximate identifier matching -----------------===//
1d019706d866 LLVM10 anatofuz parents: diff changeset	2 //
1d019706d866 LLVM10 anatofuz parents: diff changeset	3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
1d019706d866 LLVM10 anatofuz parents: diff changeset	4 // See https://llvm.org/LICENSE.txt for license information.
1d019706d866 LLVM10 anatofuz parents: diff changeset	5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
1d019706d866 LLVM10 anatofuz parents: diff changeset	6 //
1d019706d866 LLVM10 anatofuz parents: diff changeset	7 //===----------------------------------------------------------------------===//
1d019706d866 LLVM10 anatofuz parents: diff changeset	8 //
1d019706d866 LLVM10 anatofuz parents: diff changeset	9 // To check for a match between a Pattern ('u_p') and a Word ('unique_ptr'),
1d019706d866 LLVM10 anatofuz parents: diff changeset	10 // we consider the possible partial match states:
1d019706d866 LLVM10 anatofuz parents: diff changeset	11 //
1d019706d866 LLVM10 anatofuz parents: diff changeset	12 //         u n i q u e _ p t r
1d019706d866 LLVM10 anatofuz parents: diff changeset	13 //       +---------------------
1d019706d866 LLVM10 anatofuz parents: diff changeset	14 //       |A . . . . . . . . . .
1d019706d866 LLVM10 anatofuz parents: diff changeset	15 //      u|
1d019706d866 LLVM10 anatofuz parents: diff changeset	16 //       |. . . . . . . . . . .
1d019706d866 LLVM10 anatofuz parents: diff changeset	17 //      _|
1d019706d866 LLVM10 anatofuz parents: diff changeset	18 //       |. . . . . . . O . . .
1d019706d866 LLVM10 anatofuz parents: diff changeset	19 //      p|
1d019706d866 LLVM10 anatofuz parents: diff changeset	20 //       |. . . . . . . . . . B
1d019706d866 LLVM10 anatofuz parents: diff changeset	21 //
1d019706d866 LLVM10 anatofuz parents: diff changeset	22 // Each dot represents some prefix of the pattern being matched against some
1d019706d866 LLVM10 anatofuz parents: diff changeset	23 // prefix of the word.
1d019706d866 LLVM10 anatofuz parents: diff changeset	24 // - A is the initial state: '' matched against ''
1d019706d866 LLVM10 anatofuz parents: diff changeset	25 // - O is an intermediate state: 'u_' matched against 'unique_'
1d019706d866 LLVM10 anatofuz parents: diff changeset	26 // - B is the target state: 'u_p' matched against 'unique_ptr'
1d019706d866 LLVM10 anatofuz parents: diff changeset	27 //
1d019706d866 LLVM10 anatofuz parents: diff changeset	28 // We aim to find the best path from A->B.
1d019706d866 LLVM10 anatofuz parents: diff changeset	29 // - Moving right (consuming a word character)
1d019706d866 LLVM10 anatofuz parents: diff changeset	30 // Always legal: not all word characters must match.
1d019706d866 LLVM10 anatofuz parents: diff changeset	31 // - Moving diagonally (consuming both a word and pattern character)
1d019706d866 LLVM10 anatofuz parents: diff changeset	32 // Legal if the characters match.
1d019706d866 LLVM10 anatofuz parents: diff changeset	33 // - Moving down (consuming a pattern character)
1d019706d866 LLVM10 anatofuz parents: diff changeset	34 // Never legal: all pattern characters must match something.
1d019706d866 LLVM10 anatofuz parents: diff changeset	35 // Characters are matched case-insensitively.
1d019706d866 LLVM10 anatofuz parents: diff changeset	36 // The first pattern character may only match the start of a word segment.
1d019706d866 LLVM10 anatofuz parents: diff changeset	37 //
1d019706d866 LLVM10 anatofuz parents: diff changeset	38 // The scoring is based on heuristics:
1d019706d866 LLVM10 anatofuz parents: diff changeset	39 // - when matching a character, apply a bonus or penalty depending on the
1d019706d866 LLVM10 anatofuz parents: diff changeset	40 // match quality (does case match, do word segments align, etc)
1d019706d866 LLVM10 anatofuz parents: diff changeset	41 // - when skipping a character, apply a penalty if it hurts the match
1d019706d866 LLVM10 anatofuz parents: diff changeset	42 // (it starts a word segment, or splits the matched region, etc)
1d019706d866 LLVM10 anatofuz parents: diff changeset	43 //
1d019706d866 LLVM10 anatofuz parents: diff changeset	44 // These heuristics require the ability to "look backward" one character, to
1d019706d866 LLVM10 anatofuz parents: diff changeset	45 // see whether it was matched or not. Therefore the dynamic-programming matrix
1d019706d866 LLVM10 anatofuz parents: diff changeset	46 // has an extra dimension (last character matched).
1d019706d866 LLVM10 anatofuz parents: diff changeset	47 // Each entry also has an additional flag indicating whether the last-but-one
1d019706d866 LLVM10 anatofuz parents: diff changeset	48 // character matched, which is needed to trace back through the scoring table
1d019706d866 LLVM10 anatofuz parents: diff changeset	49 // and reconstruct the match.
1d019706d866 LLVM10 anatofuz parents: diff changeset	50 //
1d019706d866 LLVM10 anatofuz parents: diff changeset	51 // We treat strings as byte-sequences, so only ASCII has first-class support.
1d019706d866 LLVM10 anatofuz parents: diff changeset	52 //
1d019706d866 LLVM10 anatofuz parents: diff changeset	53 // This algorithm was inspired by VS code's client-side filtering, and aims
1d019706d866 LLVM10 anatofuz parents: diff changeset	54 // to be mostly-compatible.
1d019706d866 LLVM10 anatofuz parents: diff changeset	55 //
1d019706d866 LLVM10 anatofuz parents: diff changeset	56 //===----------------------------------------------------------------------===//
1d019706d866 LLVM10 anatofuz parents: diff changeset	57
1d019706d866 LLVM10 anatofuz parents: diff changeset	58 #include "FuzzyMatch.h"
1d019706d866 LLVM10 anatofuz parents: diff changeset	59 #include "llvm/ADT/Optional.h"
1d019706d866 LLVM10 anatofuz parents: diff changeset	60 #include "llvm/Support/Format.h"
1d019706d866 LLVM10 anatofuz parents: diff changeset	61
1d019706d866 LLVM10 anatofuz parents: diff changeset	62 namespace clang {
1d019706d866 LLVM10 anatofuz parents: diff changeset	63 namespace clangd {
1d019706d866 LLVM10 anatofuz parents: diff changeset	64
// Out-of-class definitions for the static constexpr size limits. They are
// bound to const references in this file (std::min<int>(MaxPat, ...)), which
// odr-uses them; before C++17 that requires a namespace-scope definition.
constexpr int FuzzyMatcher::MaxPat;
constexpr int FuzzyMatcher::MaxWord;
1d019706d866 LLVM10 anatofuz parents: diff changeset	67
// ASCII-only case folding: maps 'A'..'Z' onto 'a'..'z' and leaves every other
// byte (including non-ASCII bytes) untouched.
static char lower(char C) {
  const bool IsAsciiUpper = 'A' <= C && C <= 'Z';
  return IsAsciiUpper ? C + ('a' - 'A') : C;
}
// AwfulScore is a "negative infinity" sentinel used for unreachable states and
// forbidden solutions. The score field is 15 bits wide (minimum -2^14); we use
// half of that so adding bonuses or penalties to it cannot overflow.
static constexpr int AwfulScore = -(1 << 13);

// A score counts as awful once it drops below half of the sentinel, so a
// sentinel that has picked up a few bonuses is still recognised.
static bool isAwful(int S) {
  constexpr int Threshold = AwfulScore / 2;
  return S < Threshold;
}

// The best score a single pattern character can earn.
static constexpr int PerfectBonus = 4;
1d019706d866 LLVM10 anatofuz parents: diff changeset	75
1d019706d866 LLVM10 anatofuz parents: diff changeset	76 FuzzyMatcher::FuzzyMatcher(llvm::StringRef Pattern)
1d019706d866 LLVM10 anatofuz parents: diff changeset	77 : PatN(std::min<int>(MaxPat, Pattern.size())),
1d019706d866 LLVM10 anatofuz parents: diff changeset	78 ScoreScale(PatN ? float{1} / (PerfectBonus * PatN) : 0), WordN(0) {
1d019706d866 LLVM10 anatofuz parents: diff changeset	79 std::copy(Pattern.begin(), Pattern.begin() + PatN, Pat);
1d019706d866 LLVM10 anatofuz parents: diff changeset	80 for (int I = 0; I < PatN; ++I)
1d019706d866 LLVM10 anatofuz parents: diff changeset	81 LowPat[I] = lower(Pat[I]);
1d019706d866 LLVM10 anatofuz parents: diff changeset	82 Scores[0][0][Miss] = {0, Miss};
1d019706d866 LLVM10 anatofuz parents: diff changeset	83 Scores[0][0][Match] = {AwfulScore, Miss};
1d019706d866 LLVM10 anatofuz parents: diff changeset	84 for (int P = 0; P <= PatN; ++P)
1d019706d866 LLVM10 anatofuz parents: diff changeset	85 for (int W = 0; W < P; ++W)
1d019706d866 LLVM10 anatofuz parents: diff changeset	86 for (Action A : {Miss, Match})
1d019706d866 LLVM10 anatofuz parents: diff changeset	87 Scores[P][W][A] = {AwfulScore, Miss};
1d019706d866 LLVM10 anatofuz parents: diff changeset	88 PatTypeSet = calculateRoles(llvm::StringRef(Pat, PatN),
1d019706d866 LLVM10 anatofuz parents: diff changeset	89 llvm::makeMutableArrayRef(PatRole, PatN));
1d019706d866 LLVM10 anatofuz parents: diff changeset	90 }
1d019706d866 LLVM10 anatofuz parents: diff changeset	91
1d019706d866 LLVM10 anatofuz parents: diff changeset	92 llvm::Optional<float> FuzzyMatcher::match(llvm::StringRef Word) {
1d019706d866 LLVM10 anatofuz parents: diff changeset	93 if (!(WordContainsPattern = init(Word)))
1d019706d866 LLVM10 anatofuz parents: diff changeset	94 return llvm::None;
1d019706d866 LLVM10 anatofuz parents: diff changeset	95 if (!PatN)
1d019706d866 LLVM10 anatofuz parents: diff changeset	96 return 1;
1d019706d866 LLVM10 anatofuz parents: diff changeset	97 buildGraph();
1d019706d866 LLVM10 anatofuz parents: diff changeset	98 auto Best = std::max(Scores[PatN][WordN][Miss].Score,
1d019706d866 LLVM10 anatofuz parents: diff changeset	99 Scores[PatN][WordN][Match].Score);
1d019706d866 LLVM10 anatofuz parents: diff changeset	100 if (isAwful(Best))
1d019706d866 LLVM10 anatofuz parents: diff changeset	101 return llvm::None;
1d019706d866 LLVM10 anatofuz parents: diff changeset	102 float Score =
1d019706d866 LLVM10 anatofuz parents: diff changeset	103 ScoreScale * std::min(PerfectBonus * PatN, std::max<int>(0, Best));
1d019706d866 LLVM10 anatofuz parents: diff changeset	104 // If the pattern is as long as the word, we have an exact string match,
1d019706d866 LLVM10 anatofuz parents: diff changeset	105 // since every pattern character must match something.
1d019706d866 LLVM10 anatofuz parents: diff changeset	106 if (WordN == PatN)
1d019706d866 LLVM10 anatofuz parents: diff changeset	107 Score *= 2; // May not be perfect 2 if case differs in a significant way.
1d019706d866 LLVM10 anatofuz parents: diff changeset	108 return Score;
1d019706d866 LLVM10 anatofuz parents: diff changeset	109 }
1d019706d866 LLVM10 anatofuz parents: diff changeset	110
1d019706d866 LLVM10 anatofuz parents: diff changeset	111 // We get CharTypes from a lookup table. Each is 2 bits, 4 fit in each byte.
1d019706d866 LLVM10 anatofuz parents: diff changeset	112 // The top 6 bits of the char select the byte, the bottom 2 select the offset.
1d019706d866 LLVM10 anatofuz parents: diff changeset	113 // e.g. 'q' = 011100 01 = byte 28 (55), bits 3-2 (01) -> Lower.
// Packed character-type table: 256 two-bit entries (one per byte value) in 64
// bytes, read through packedLookup(). Within a byte the entry for the lowest
// character value sits in the lowest two bits.
// Encodings as used by the rows below: 00 for control characters, 01 for
// Lower, 10 for Upper, 11 for punctuation.
// NOTE(review): these presumably mirror the CharType enumerators declared in
// FuzzyMatch.h - confirm against the header.
constexpr static uint8_t CharTypes[] = {
    0x00, 0x00, 0x00, 0x00, // Control characters
    0x00, 0x00, 0x00, 0x00, // Control characters
    0xff, 0xff, 0xff, 0xff, // Punctuation
    0x55, 0x55, 0xf5, 0xff, // Numbers->Lower, more Punctuation.
    0xab, 0xaa, 0xaa, 0xaa, // @ and A-O
    0xaa, 0xaa, 0xea, 0xff, // P-Z, more Punctuation.
    0x57, 0x55, 0x55, 0x55, // ` and a-o
    0x55, 0x55, 0xd5, 0x3f, // p-z, Punctuation, DEL.
    0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, // Bytes over 127 -> Lower.
    0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, // (probably UTF-8).
    0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55,
    0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55,
};
1d019706d866 LLVM10 anatofuz parents: diff changeset	128
1d019706d866 LLVM10 anatofuz parents: diff changeset	129 // The Role can be determined from the Type of a character and its neighbors:
1d019706d866 LLVM10 anatofuz parents: diff changeset	130 //
1d019706d866 LLVM10 anatofuz parents: diff changeset	131 //   Example  | Chars | Type | Role
1d019706d866 LLVM10 anatofuz parents: diff changeset	132 //   ---------+-------+------+-----
1d019706d866 LLVM10 anatofuz parents: diff changeset	133 //   F(o)oBar | Foo   | Ull  | Tail
1d019706d866 LLVM10 anatofuz parents: diff changeset	134 //   Foo(B)ar | oBa   | lUl  | Head
1d019706d866 LLVM10 anatofuz parents: diff changeset	135 //   (f)oo    | ^fo   | Ell  | Head
1d019706d866 LLVM10 anatofuz parents: diff changeset	136 //   H(T)TP   | HTT   | UUU  | Tail
1d019706d866 LLVM10 anatofuz parents: diff changeset	137 //
1d019706d866 LLVM10 anatofuz parents: diff changeset	138 // Our lookup table maps a 6 bit key (Prev, Curr, Next) to a 2-bit Role.
1d019706d866 LLVM10 anatofuz parents: diff changeset	139 // A byte packs 4 Roles. (Prev, Curr) selects a byte, Next selects the offset.
1d019706d866 LLVM10 anatofuz parents: diff changeset	140 // e.g. Lower, Upper, Lower -> 01 10 01 -> byte 6 (aa), bits 3-2 (10) -> Head.
// Packed role table: one byte per (Prev, Curr) type pair, 16 bytes in total.
// Each byte holds the role for the four possible Next types, two bits each,
// with Next=Empty in the lowest two bits. It is read through packedLookup()
// with the 6-bit key (Prev << 4) | (Curr << 2) | Next.
// Encodings as used by the rows below: 01 for Tail, 10 for Head, 11 for
// separator characters.
// NOTE(review): these presumably mirror the CharRole enumerators declared in
// FuzzyMatch.h - confirm against the header.
constexpr static uint8_t CharRoles[] = {
    // clang-format off
    //         Curr= Empty Lower Upper Separ
    /* Prev=Empty */ 0x00, 0xaa, 0xaa, 0xff, // At start, Lower|Upper->Head
    /* Prev=Lower */ 0x00, 0x55, 0xaa, 0xff, // In word, Upper->Head;Lower->Tail
    /* Prev=Upper */ 0x00, 0x55, 0x59, 0xff, // Ditto, but U(U)U->Tail
    /* Prev=Separ */ 0x00, 0xaa, 0xaa, 0xff, // After separator, like at start
    // clang-format on
};
1d019706d866 LLVM10 anatofuz parents: diff changeset	150
// Reads the 2-bit entry number I from a packed table and returns it as a T.
// Four entries share a byte: I >> 2 selects the byte and I & 3 selects the
// 2-bit slot within it, lowest bits first.
//
// Only the low 8 bits of I are used. Callers pass plain `char` values taken
// from text; where `char` is signed, bytes >= 0x80 arrive as negative ints,
// which would otherwise index before the start of the table. Reducing the
// index to 0..255 makes those bytes select the "bytes over 127" entries.
// Indices already in 0..255 (including the 6-bit role keys) are unaffected.
template <typename T> static T packedLookup(const uint8_t *Data, int I) {
  const unsigned Index = static_cast<unsigned char>(I);
  return static_cast<T>((Data[Index >> 2] >> ((Index & 3) * 2)) & 3);
}
1d019706d866 LLVM10 anatofuz parents: diff changeset	154 CharTypeSet calculateRoles(llvm::StringRef Text,
1d019706d866 LLVM10 anatofuz parents: diff changeset	155 llvm::MutableArrayRef<CharRole> Roles) {
1d019706d866 LLVM10 anatofuz parents: diff changeset	156 assert(Text.size() == Roles.size());
1d019706d866 LLVM10 anatofuz parents: diff changeset	157 if (Text.size() == 0)
1d019706d866 LLVM10 anatofuz parents: diff changeset	158 return 0;
1d019706d866 LLVM10 anatofuz parents: diff changeset	159 CharType Type = packedLookup<CharType>(CharTypes, Text[0]);
1d019706d866 LLVM10 anatofuz parents: diff changeset	160 CharTypeSet TypeSet = 1 << Type;
1d019706d866 LLVM10 anatofuz parents: diff changeset	161 // Types holds a sliding window of (Prev, Curr, Next) types.
1d019706d866 LLVM10 anatofuz parents: diff changeset	162 // Initial value is (Empty, Empty, type of Text[0]).
1d019706d866 LLVM10 anatofuz parents: diff changeset	163 int Types = Type;
1d019706d866 LLVM10 anatofuz parents: diff changeset	164 // Rotate slides in the type of the next character.
1d019706d866 LLVM10 anatofuz parents: diff changeset	165 auto Rotate = [&](CharType T) { Types = ((Types << 2) \| T) & 0x3f; };
1d019706d866 LLVM10 anatofuz parents: diff changeset	166 for (unsigned I = 0; I < Text.size() - 1; ++I) {
1d019706d866 LLVM10 anatofuz parents: diff changeset	167 // For each character, rotate in the next, and look up the role.
1d019706d866 LLVM10 anatofuz parents: diff changeset	168 Type = packedLookup<CharType>(CharTypes, Text[I + 1]);
1d019706d866 LLVM10 anatofuz parents: diff changeset	169 TypeSet \|= 1 << Type;
1d019706d866 LLVM10 anatofuz parents: diff changeset	170 Rotate(Type);
1d019706d866 LLVM10 anatofuz parents: diff changeset	171 Roles[I] = packedLookup<CharRole>(CharRoles, Types);
1d019706d866 LLVM10 anatofuz parents: diff changeset	172 }
1d019706d866 LLVM10 anatofuz parents: diff changeset	173 // For the last character, the "next character" is Empty.
1d019706d866 LLVM10 anatofuz parents: diff changeset	174 Rotate(Empty);
1d019706d866 LLVM10 anatofuz parents: diff changeset	175 Roles[Text.size() - 1] = packedLookup<CharRole>(CharRoles, Types);
1d019706d866 LLVM10 anatofuz parents: diff changeset	176 return TypeSet;
1d019706d866 LLVM10 anatofuz parents: diff changeset	177 }
1d019706d866 LLVM10 anatofuz parents: diff changeset	178
1d019706d866 LLVM10 anatofuz parents: diff changeset	179 // Sets up the data structures matching Word.
1d019706d866 LLVM10 anatofuz parents: diff changeset	180 // Returns false if we can cheaply determine that no match is possible.
1d019706d866 LLVM10 anatofuz parents: diff changeset	181 bool FuzzyMatcher::init(llvm::StringRef NewWord) {
1d019706d866 LLVM10 anatofuz parents: diff changeset	182 WordN = std::min<int>(MaxWord, NewWord.size());
1d019706d866 LLVM10 anatofuz parents: diff changeset	183 if (PatN > WordN)
1d019706d866 LLVM10 anatofuz parents: diff changeset	184 return false;
1d019706d866 LLVM10 anatofuz parents: diff changeset	185 std::copy(NewWord.begin(), NewWord.begin() + WordN, Word);
1d019706d866 LLVM10 anatofuz parents: diff changeset	186 if (PatN == 0)
1d019706d866 LLVM10 anatofuz parents: diff changeset	187 return true;
1d019706d866 LLVM10 anatofuz parents: diff changeset	188 for (int I = 0; I < WordN; ++I)
1d019706d866 LLVM10 anatofuz parents: diff changeset	189 LowWord[I] = lower(Word[I]);
1d019706d866 LLVM10 anatofuz parents: diff changeset	190
1d019706d866 LLVM10 anatofuz parents: diff changeset	191 // Cheap subsequence check.
1d019706d866 LLVM10 anatofuz parents: diff changeset	192 for (int W = 0, P = 0; P != PatN; ++W) {
1d019706d866 LLVM10 anatofuz parents: diff changeset	193 if (W == WordN)
1d019706d866 LLVM10 anatofuz parents: diff changeset	194 return false;
1d019706d866 LLVM10 anatofuz parents: diff changeset	195 if (LowWord[W] == LowPat[P])
1d019706d866 LLVM10 anatofuz parents: diff changeset	196 ++P;
1d019706d866 LLVM10 anatofuz parents: diff changeset	197 }
1d019706d866 LLVM10 anatofuz parents: diff changeset	198
1d019706d866 LLVM10 anatofuz parents: diff changeset	199 // FIXME: some words are hard to tokenize algorithmically.
1d019706d866 LLVM10 anatofuz parents: diff changeset	200 // e.g. vsprintf is V S Print F, and should match [pri] but not [int].
1d019706d866 LLVM10 anatofuz parents: diff changeset	201 // We could add a tokenization dictionary for common stdlib names.
1d019706d866 LLVM10 anatofuz parents: diff changeset	202 WordTypeSet = calculateRoles(llvm::StringRef(Word, WordN),
1d019706d866 LLVM10 anatofuz parents: diff changeset	203 llvm::makeMutableArrayRef(WordRole, WordN));
1d019706d866 LLVM10 anatofuz parents: diff changeset	204 return true;
1d019706d866 LLVM10 anatofuz parents: diff changeset	205 }
1d019706d866 LLVM10 anatofuz parents: diff changeset	206
1d019706d866 LLVM10 anatofuz parents: diff changeset	207 // The forwards pass finds the mappings of Pattern onto Word.
1d019706d866 LLVM10 anatofuz parents: diff changeset	208 // Score = best score achieved matching Word[..W] against Pat[..P].
1d019706d866 LLVM10 anatofuz parents: diff changeset	209 // Unlike other tables, indices range from 0 to N inclusive
1d019706d866 LLVM10 anatofuz parents: diff changeset	210 // Matched = whether we chose to match Word[W] with Pat[P] or not.
1d019706d866 LLVM10 anatofuz parents: diff changeset	211 //
1d019706d866 LLVM10 anatofuz parents: diff changeset	212 // Points are mostly assigned to matched characters, with 1 being a good score
1d019706d866 LLVM10 anatofuz parents: diff changeset	213 // and 4 (PerfectBonus) being a perfect one. So we treat the score range as
1d019706d866 LLVM10 anatofuz parents: diff changeset	213 // [0, PerfectBonus * PatN].
1d019706d866 LLVM10 anatofuz parents: diff changeset	214 // This range is not strict: we can apply larger bonuses/penalties, or penalize
1d019706d866 LLVM10 anatofuz parents: diff changeset	215 // non-matched characters.
1d019706d866 LLVM10 anatofuz parents: diff changeset	216 void FuzzyMatcher::buildGraph() {
1d019706d866 LLVM10 anatofuz parents: diff changeset	217 for (int W = 0; W < WordN; ++W) {
1d019706d866 LLVM10 anatofuz parents: diff changeset	218 Scores[0][W + 1][Miss] = {Scores[0][W][Miss].Score - skipPenalty(W, Miss),
1d019706d866 LLVM10 anatofuz parents: diff changeset	219 Miss};
1d019706d866 LLVM10 anatofuz parents: diff changeset	220 Scores[0][W + 1][Match] = {AwfulScore, Miss};
1d019706d866 LLVM10 anatofuz parents: diff changeset	221 }
1d019706d866 LLVM10 anatofuz parents: diff changeset	222 for (int P = 0; P < PatN; ++P) {
1d019706d866 LLVM10 anatofuz parents: diff changeset	223 for (int W = P; W < WordN; ++W) {
1d019706d866 LLVM10 anatofuz parents: diff changeset	224 auto &Score = Scores[P + 1][W + 1], &PreMiss = Scores[P + 1][W];
1d019706d866 LLVM10 anatofuz parents: diff changeset	225
1d019706d866 LLVM10 anatofuz parents: diff changeset	226 auto MatchMissScore = PreMiss[Match].Score;
1d019706d866 LLVM10 anatofuz parents: diff changeset	227 auto MissMissScore = PreMiss[Miss].Score;
1d019706d866 LLVM10 anatofuz parents: diff changeset	228 if (P < PatN - 1) { // Skipping trailing characters is always free.
1d019706d866 LLVM10 anatofuz parents: diff changeset	229 MatchMissScore -= skipPenalty(W, Match);
1d019706d866 LLVM10 anatofuz parents: diff changeset	230 MissMissScore -= skipPenalty(W, Miss);
1d019706d866 LLVM10 anatofuz parents: diff changeset	231 }
1d019706d866 LLVM10 anatofuz parents: diff changeset	232 Score[Miss] = (MatchMissScore > MissMissScore)
1d019706d866 LLVM10 anatofuz parents: diff changeset	233 ? ScoreInfo{MatchMissScore, Match}
1d019706d866 LLVM10 anatofuz parents: diff changeset	234 : ScoreInfo{MissMissScore, Miss};
1d019706d866 LLVM10 anatofuz parents: diff changeset	235
1d019706d866 LLVM10 anatofuz parents: diff changeset	236 auto &PreMatch = Scores[P][W];
1d019706d866 LLVM10 anatofuz parents: diff changeset	237 auto MatchMatchScore =
1d019706d866 LLVM10 anatofuz parents: diff changeset	238 allowMatch(P, W, Match)
1d019706d866 LLVM10 anatofuz parents: diff changeset	239 ? PreMatch[Match].Score + matchBonus(P, W, Match)
1d019706d866 LLVM10 anatofuz parents: diff changeset	240 : AwfulScore;
1d019706d866 LLVM10 anatofuz parents: diff changeset	241 auto MissMatchScore = allowMatch(P, W, Miss)
1d019706d866 LLVM10 anatofuz parents: diff changeset	242 ? PreMatch[Miss].Score + matchBonus(P, W, Miss)
1d019706d866 LLVM10 anatofuz parents: diff changeset	243 : AwfulScore;
1d019706d866 LLVM10 anatofuz parents: diff changeset	244 Score[Match] = (MatchMatchScore > MissMatchScore)
1d019706d866 LLVM10 anatofuz parents: diff changeset	245 ? ScoreInfo{MatchMatchScore, Match}
1d019706d866 LLVM10 anatofuz parents: diff changeset	246 : ScoreInfo{MissMatchScore, Miss};
1d019706d866 LLVM10 anatofuz parents: diff changeset	247 }
1d019706d866 LLVM10 anatofuz parents: diff changeset	248 }
1d019706d866 LLVM10 anatofuz parents: diff changeset	249 }
1d019706d866 LLVM10 anatofuz parents: diff changeset	250
1d019706d866 LLVM10 anatofuz parents: diff changeset	251 bool FuzzyMatcher::allowMatch(int P, int W, Action Last) const {
1d019706d866 LLVM10 anatofuz parents: diff changeset	252 if (LowPat[P] != LowWord[W])
1d019706d866 LLVM10 anatofuz parents: diff changeset	253 return false;
1d019706d866 LLVM10 anatofuz parents: diff changeset	254 // We require a "strong" match:
1d019706d866 LLVM10 anatofuz parents: diff changeset	255 // - for the first pattern character. [foo] !~ "barefoot"
1d019706d866 LLVM10 anatofuz parents: diff changeset	256 // - after a gap. [pat] !~ "patnther"
1d019706d866 LLVM10 anatofuz parents: diff changeset	257 if (Last == Miss) {
1d019706d866 LLVM10 anatofuz parents: diff changeset	258 // We're banning matches outright, so conservatively accept some other cases
1d019706d866 LLVM10 anatofuz parents: diff changeset	259 // where our segmentation might be wrong:
1d019706d866 LLVM10 anatofuz parents: diff changeset	260 // - allow matching B in ABCDef (but not in NDEBUG)
1d019706d866 LLVM10 anatofuz parents: diff changeset	261 // - we'd like to accept print in sprintf, but too many false positives
1d019706d866 LLVM10 anatofuz parents: diff changeset	262 if (WordRole[W] == Tail &&
1d019706d866 LLVM10 anatofuz parents: diff changeset	263 (Word[W] == LowWord[W] \|\| !(WordTypeSet & 1 << Lower)))
1d019706d866 LLVM10 anatofuz parents: diff changeset	264 return false;
1d019706d866 LLVM10 anatofuz parents: diff changeset	265 }
1d019706d866 LLVM10 anatofuz parents: diff changeset	266 return true;
1d019706d866 LLVM10 anatofuz parents: diff changeset	267 }
1d019706d866 LLVM10 anatofuz parents: diff changeset	268
1d019706d866 LLVM10 anatofuz parents: diff changeset	269 int FuzzyMatcher::skipPenalty(int W, Action Last) const {
1d019706d866 LLVM10 anatofuz parents: diff changeset	270 if (W == 0) // Skipping the first character.
1d019706d866 LLVM10 anatofuz parents: diff changeset	271 return 3;
1d019706d866 LLVM10 anatofuz parents: diff changeset	272 if (WordRole[W] == Head) // Skipping a segment.
1d019706d866 LLVM10 anatofuz parents: diff changeset	273 return 1; // We want to keep this lower than a consecutive match bonus.
1d019706d866 LLVM10 anatofuz parents: diff changeset	274 // Instead of penalizing non-consecutive matches, we give a bonus to a
1d019706d866 LLVM10 anatofuz parents: diff changeset	275 // consecutive match in matchBonus. This produces a better score distribution
1d019706d866 LLVM10 anatofuz parents: diff changeset	276 // than penalties in case of small patterns, e.g. 'up' for 'unique_ptr'.
1d019706d866 LLVM10 anatofuz parents: diff changeset	277 return 0;
1d019706d866 LLVM10 anatofuz parents: diff changeset	278 }
1d019706d866 LLVM10 anatofuz parents: diff changeset	279
1d019706d866 LLVM10 anatofuz parents: diff changeset	280 int FuzzyMatcher::matchBonus(int P, int W, Action Last) const {
1d019706d866 LLVM10 anatofuz parents: diff changeset	281 assert(LowPat[P] == LowWord[W]);
1d019706d866 LLVM10 anatofuz parents: diff changeset	282 int S = 1;
1d019706d866 LLVM10 anatofuz parents: diff changeset	283 bool IsPatSingleCase =
1d019706d866 LLVM10 anatofuz parents: diff changeset	284 (PatTypeSet == 1 << Lower) \|\| (PatTypeSet == 1 << Upper);
1d019706d866 LLVM10 anatofuz parents: diff changeset	285 // Bonus: case matches, or a Head in the pattern aligns with one in the word.
1d019706d866 LLVM10 anatofuz parents: diff changeset	286 // Single-case patterns lack segmentation signals and we assume any character
1d019706d866 LLVM10 anatofuz parents: diff changeset	287 // can be a head of a segment.
1d019706d866 LLVM10 anatofuz parents: diff changeset	288 if (Pat[P] == Word[W] \|\|
1d019706d866 LLVM10 anatofuz parents: diff changeset	289 (WordRole[W] == Head && (IsPatSingleCase \|\| PatRole[P] == Head)))
1d019706d866 LLVM10 anatofuz parents: diff changeset	290 ++S;
1d019706d866 LLVM10 anatofuz parents: diff changeset	291 // Bonus: a consecutive match. First character match also gets a bonus to
1d019706d866 LLVM10 anatofuz parents: diff changeset	292 // ensure prefix final match score normalizes to 1.0.
1d019706d866 LLVM10 anatofuz parents: diff changeset	293 if (W == 0 \|\| Last == Match)
1d019706d866 LLVM10 anatofuz parents: diff changeset	294 S += 2;
1d019706d866 LLVM10 anatofuz parents: diff changeset	295 // Penalty: matching inside a segment (and previous char wasn't matched).
1d019706d866 LLVM10 anatofuz parents: diff changeset	296 if (WordRole[W] == Tail && P && Last == Miss)
1d019706d866 LLVM10 anatofuz parents: diff changeset	297 S -= 3;
1d019706d866 LLVM10 anatofuz parents: diff changeset	298 // Penalty: a Head in the pattern matches in the middle of a word segment.
1d019706d866 LLVM10 anatofuz parents: diff changeset	299 if (PatRole[P] == Head && WordRole[W] == Tail)
1d019706d866 LLVM10 anatofuz parents: diff changeset	300 --S;
1d019706d866 LLVM10 anatofuz parents: diff changeset	301 // Penalty: matching the first pattern character in the middle of a segment.
1d019706d866 LLVM10 anatofuz parents: diff changeset	302 if (P == 0 && WordRole[W] == Tail)
1d019706d866 LLVM10 anatofuz parents: diff changeset	303 S -= 4;
1d019706d866 LLVM10 anatofuz parents: diff changeset	304 assert(S <= PerfectBonus);
1d019706d866 LLVM10 anatofuz parents: diff changeset	305 return S;
1d019706d866 LLVM10 anatofuz parents: diff changeset	306 }
1d019706d866 LLVM10 anatofuz parents: diff changeset	307
1d019706d866 LLVM10 anatofuz parents: diff changeset	308 llvm::SmallString<256> FuzzyMatcher::dumpLast(llvm::raw_ostream &OS) const {
1d019706d866 LLVM10 anatofuz parents: diff changeset	309 llvm::SmallString<256> Result;
1d019706d866 LLVM10 anatofuz parents: diff changeset	310 OS << "=== Match \"" << llvm::StringRef(Word, WordN) << "\" against ["
1d019706d866 LLVM10 anatofuz parents: diff changeset	311 << llvm::StringRef(Pat, PatN) << "] ===\n";
1d019706d866 LLVM10 anatofuz parents: diff changeset	312 if (PatN == 0) {
1d019706d866 LLVM10 anatofuz parents: diff changeset	313 OS << "Pattern is empty: perfect match.\n";
1d019706d866 LLVM10 anatofuz parents: diff changeset	314 return Result = llvm::StringRef(Word, WordN);
1d019706d866 LLVM10 anatofuz parents: diff changeset	315 }
1d019706d866 LLVM10 anatofuz parents: diff changeset	316 if (WordN == 0) {
1d019706d866 LLVM10 anatofuz parents: diff changeset	317 OS << "Word is empty: no match.\n";
1d019706d866 LLVM10 anatofuz parents: diff changeset	318 return Result;
1d019706d866 LLVM10 anatofuz parents: diff changeset	319 }
1d019706d866 LLVM10 anatofuz parents: diff changeset	320 if (!WordContainsPattern) {
1d019706d866 LLVM10 anatofuz parents: diff changeset	321 OS << "Substring check failed.\n";
1d019706d866 LLVM10 anatofuz parents: diff changeset	322 return Result;
1d019706d866 LLVM10 anatofuz parents: diff changeset	323 } else if (isAwful(std::max(Scores[PatN][WordN][Match].Score,
1d019706d866 LLVM10 anatofuz parents: diff changeset	324 Scores[PatN][WordN][Miss].Score))) {
1d019706d866 LLVM10 anatofuz parents: diff changeset	325 OS << "Substring check passed, but all matches are forbidden\n";
1d019706d866 LLVM10 anatofuz parents: diff changeset	326 }
1d019706d866 LLVM10 anatofuz parents: diff changeset	327 if (!(PatTypeSet & 1 << Upper))
1d019706d866 LLVM10 anatofuz parents: diff changeset	328 OS << "Lowercase query, so scoring ignores case\n";
1d019706d866 LLVM10 anatofuz parents: diff changeset	329
1d019706d866 LLVM10 anatofuz parents: diff changeset	330 // Traverse Matched table backwards to reconstruct the Pattern/Word mapping.
1d019706d866 LLVM10 anatofuz parents: diff changeset	331 // The Score table has cumulative scores, subtracting along this path gives
1d019706d866 LLVM10 anatofuz parents: diff changeset	332 // us the per-letter scores.
1d019706d866 LLVM10 anatofuz parents: diff changeset	333 Action Last =
1d019706d866 LLVM10 anatofuz parents: diff changeset	334 (Scores[PatN][WordN][Match].Score > Scores[PatN][WordN][Miss].Score)
1d019706d866 LLVM10 anatofuz parents: diff changeset	335 ? Match
1d019706d866 LLVM10 anatofuz parents: diff changeset	336 : Miss;
1d019706d866 LLVM10 anatofuz parents: diff changeset	337 int S[MaxWord];
1d019706d866 LLVM10 anatofuz parents: diff changeset	338 Action A[MaxWord];
1d019706d866 LLVM10 anatofuz parents: diff changeset	339 for (int W = WordN - 1, P = PatN - 1; W >= 0; --W) {
1d019706d866 LLVM10 anatofuz parents: diff changeset	340 A[W] = Last;
1d019706d866 LLVM10 anatofuz parents: diff changeset	341 const auto &Cell = Scores[P + 1][W + 1][Last];
1d019706d866 LLVM10 anatofuz parents: diff changeset	342 if (Last == Match)
1d019706d866 LLVM10 anatofuz parents: diff changeset	343 --P;
1d019706d866 LLVM10 anatofuz parents: diff changeset	344 const auto &Prev = Scores[P + 1][W][Cell.Prev];
1d019706d866 LLVM10 anatofuz parents: diff changeset	345 S[W] = Cell.Score - Prev.Score;
1d019706d866 LLVM10 anatofuz parents: diff changeset	346 Last = Cell.Prev;
1d019706d866 LLVM10 anatofuz parents: diff changeset	347 }
1d019706d866 LLVM10 anatofuz parents: diff changeset	348 for (int I = 0; I < WordN; ++I) {
1d019706d866 LLVM10 anatofuz parents: diff changeset	349 if (A[I] == Match && (I == 0 \|\| A[I - 1] == Miss))
1d019706d866 LLVM10 anatofuz parents: diff changeset	350 Result.push_back('[');
1d019706d866 LLVM10 anatofuz parents: diff changeset	351 if (A[I] == Miss && I > 0 && A[I - 1] == Match)
1d019706d866 LLVM10 anatofuz parents: diff changeset	352 Result.push_back(']');
1d019706d866 LLVM10 anatofuz parents: diff changeset	353 Result.push_back(Word[I]);
1d019706d866 LLVM10 anatofuz parents: diff changeset	354 }
1d019706d866 LLVM10 anatofuz parents: diff changeset	355 if (A[WordN - 1] == Match)
1d019706d866 LLVM10 anatofuz parents: diff changeset	356 Result.push_back(']');
1d019706d866 LLVM10 anatofuz parents: diff changeset	357
1d019706d866 LLVM10 anatofuz parents: diff changeset	358 for (char C : llvm::StringRef(Word, WordN))
1d019706d866 LLVM10 anatofuz parents: diff changeset	359 OS << " " << C << " ";
1d019706d866 LLVM10 anatofuz parents: diff changeset	360 OS << "\n";
1d019706d866 LLVM10 anatofuz parents: diff changeset	361 for (int I = 0, J = 0; I < WordN; I++)
1d019706d866 LLVM10 anatofuz parents: diff changeset	362 OS << " " << (A[I] == Match ? Pat[J++] : ' ') << " ";
1d019706d866 LLVM10 anatofuz parents: diff changeset	363 OS << "\n";
1d019706d866 LLVM10 anatofuz parents: diff changeset	364 for (int I = 0; I < WordN; I++)
1d019706d866 LLVM10 anatofuz parents: diff changeset	365 OS << llvm::format("%2d ", S[I]);
1d019706d866 LLVM10 anatofuz parents: diff changeset	366 OS << "\n";
1d019706d866 LLVM10 anatofuz parents: diff changeset	367
1d019706d866 LLVM10 anatofuz parents: diff changeset	368 OS << "\nSegmentation:";
1d019706d866 LLVM10 anatofuz parents: diff changeset	369 OS << "\n'" << llvm::StringRef(Word, WordN) << "'\n ";
1d019706d866 LLVM10 anatofuz parents: diff changeset	370 for (int I = 0; I < WordN; ++I)
1d019706d866 LLVM10 anatofuz parents: diff changeset	371 OS << "?-+ "[static_cast<int>(WordRole[I])];
1d019706d866 LLVM10 anatofuz parents: diff changeset	372 OS << "\n[" << llvm::StringRef(Pat, PatN) << "]\n ";
1d019706d866 LLVM10 anatofuz parents: diff changeset	373 for (int I = 0; I < PatN; ++I)
1d019706d866 LLVM10 anatofuz parents: diff changeset	374 OS << "?-+ "[static_cast<int>(PatRole[I])];
1d019706d866 LLVM10 anatofuz parents: diff changeset	375 OS << "\n";
1d019706d866 LLVM10 anatofuz parents: diff changeset	376
1d019706d866 LLVM10 anatofuz parents: diff changeset	377 OS << "\nScoring table (last-Miss, last-Match):\n";
1d019706d866 LLVM10 anatofuz parents: diff changeset	378 OS << " \| ";
1d019706d866 LLVM10 anatofuz parents: diff changeset	379 for (char C : llvm::StringRef(Word, WordN))
1d019706d866 LLVM10 anatofuz parents: diff changeset	380 OS << " " << C << " ";
1d019706d866 LLVM10 anatofuz parents: diff changeset	381 OS << "\n";
1d019706d866 LLVM10 anatofuz parents: diff changeset	382 OS << "-+----" << std::string(WordN * 4, '-') << "\n";
1d019706d866 LLVM10 anatofuz parents: diff changeset	383 for (int I = 0; I <= PatN; ++I) {
1d019706d866 LLVM10 anatofuz parents: diff changeset	384 for (Action A : {Miss, Match}) {
1d019706d866 LLVM10 anatofuz parents: diff changeset	385 OS << ((I && A == Miss) ? Pat[I - 1] : ' ') << "\|";
1d019706d866 LLVM10 anatofuz parents: diff changeset	386 for (int J = 0; J <= WordN; ++J) {
1d019706d866 LLVM10 anatofuz parents: diff changeset	387 if (!isAwful(Scores[I][J][A].Score))
1d019706d866 LLVM10 anatofuz parents: diff changeset	388 OS << llvm::format("%3d%c", Scores[I][J][A].Score,
1d019706d866 LLVM10 anatofuz parents: diff changeset	389 Scores[I][J][A].Prev == Match ? '*' : ' ');
1d019706d866 LLVM10 anatofuz parents: diff changeset	390 else
1d019706d866 LLVM10 anatofuz parents: diff changeset	391 OS << " ";
1d019706d866 LLVM10 anatofuz parents: diff changeset	392 }
1d019706d866 LLVM10 anatofuz parents: diff changeset	393 OS << "\n";
1d019706d866 LLVM10 anatofuz parents: diff changeset	394 }
1d019706d866 LLVM10 anatofuz parents: diff changeset	395 }
1d019706d866 LLVM10 anatofuz parents: diff changeset	396
1d019706d866 LLVM10 anatofuz parents: diff changeset	397 return Result;
1d019706d866 LLVM10 anatofuz parents: diff changeset	398 }
1d019706d866 LLVM10 anatofuz parents: diff changeset	399
1d019706d866 LLVM10 anatofuz parents: diff changeset	400 } // namespace clangd
1d019706d866 LLVM10 anatofuz parents: diff changeset	401 } // namespace clang

Mercurial > hg > CbC > CbC_llvm

annotate clang-tools-extra/clangd/FuzzyMatch.cpp @ 221:79ff65ed7e25