annotate clang-tools-extra/clangd/FuzzyMatch.h @ 221:79ff65ed7e25

LLVM12 Original
author Shinji KONO <kono@ie.u-ryukyu.ac.jp>
date Tue, 15 Jun 2021 19:15:29 +0900
parents 1d019706d866
children 1f2b6ac9f198
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
150
anatofuz
parents:
diff changeset
1 //===--- FuzzyMatch.h - Approximate identifier matching ---------*- C++-*-===//
anatofuz
parents:
diff changeset
2 //
anatofuz
parents:
diff changeset
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
anatofuz
parents:
diff changeset
4 // See https://llvm.org/LICENSE.txt for license information.
anatofuz
parents:
diff changeset
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
anatofuz
parents:
diff changeset
6 //
anatofuz
parents:
diff changeset
7 //===----------------------------------------------------------------------===//
anatofuz
parents:
diff changeset
8 //
anatofuz
parents:
diff changeset
9 // This file implements fuzzy-matching of strings against identifiers.
anatofuz
parents:
diff changeset
10 // It indicates both the existence and quality of a match:
anatofuz
parents:
diff changeset
11 // 'eb' matches both 'emplace_back' and 'embed', the former has a better score.
anatofuz
parents:
diff changeset
12 //
anatofuz
parents:
diff changeset
13 //===----------------------------------------------------------------------===//
anatofuz
parents:
diff changeset
14
anatofuz
parents:
diff changeset
15 #ifndef LLVM_CLANG_TOOLS_EXTRA_CLANGD_FUZZYMATCH_H
anatofuz
parents:
diff changeset
16 #define LLVM_CLANG_TOOLS_EXTRA_CLANGD_FUZZYMATCH_H
anatofuz
parents:
diff changeset
17
anatofuz
parents:
diff changeset
18 #include "llvm/ADT/ArrayRef.h"
anatofuz
parents:
diff changeset
19 #include "llvm/ADT/Optional.h"
anatofuz
parents:
diff changeset
20 #include "llvm/ADT/SmallString.h"
anatofuz
parents:
diff changeset
21 #include "llvm/ADT/StringRef.h"
anatofuz
parents:
diff changeset
22 #include "llvm/Support/raw_ostream.h"
anatofuz
parents:
diff changeset
23
anatofuz
parents:
diff changeset
24 namespace clang {
anatofuz
parents:
diff changeset
25 namespace clangd {
anatofuz
parents:
diff changeset
26
anatofuz
parents:
diff changeset
27 // Utilities for word segmentation.
anatofuz
parents:
diff changeset
28 // FuzzyMatcher already incorporates this logic, so most users don't need this.
anatofuz
parents:
diff changeset
29 //
anatofuz
parents:
diff changeset
30 // A name like "fooBar_baz" consists of several parts foo, bar, baz.
anatofuz
parents:
diff changeset
31 // Aligning segmentation of word and pattern improves the fuzzy-match.
anatofuz
parents:
diff changeset
32 // For example: [lol] matches "LaughingOutLoud" better than "LionPopulation"
anatofuz
parents:
diff changeset
33 //
anatofuz
parents:
diff changeset
34 // First we classify each character into types (uppercase, lowercase, etc).
anatofuz
parents:
diff changeset
35 // Then we look at the sequence: e.g. [upper, lower] is the start of a segment.
anatofuz
parents:
diff changeset
36
anatofuz
parents:
diff changeset
37 // We distinguish the types of characters that affect segmentation.
anatofuz
parents:
diff changeset
38 // It's not obvious how to segment digits, we treat them as lowercase letters.
anatofuz
parents:
diff changeset
39 // As we don't decode UTF-8, we treat bytes over 127 as lowercase too.
anatofuz
parents:
diff changeset
40 // This means we require exact (case-sensitive) match for those characters.
anatofuz
parents:
diff changeset
41 enum CharType : unsigned char {
anatofuz
parents:
diff changeset
42 Empty = 0, // Before-the-start and after-the-end (and control chars).
anatofuz
parents:
diff changeset
43 Lower = 1, // Lowercase letters, digits, and non-ASCII bytes.
anatofuz
parents:
diff changeset
44 Upper = 2, // Uppercase letters.
anatofuz
parents:
diff changeset
45 Punctuation = 3, // ASCII punctuation (including Space)
anatofuz
parents:
diff changeset
46 };
anatofuz
parents:
diff changeset
47 // A CharTypeSet is a bitfield representing all the character types in a word.
anatofuz
parents:
diff changeset
48 // Its bits are 1<<Empty, 1<<Lower, etc.
anatofuz
parents:
diff changeset
49 using CharTypeSet = unsigned char;
anatofuz
parents:
diff changeset
50
anatofuz
parents:
diff changeset
51 // Each character's Role is the Head or Tail of a segment, or a Separator.
anatofuz
parents:
diff changeset
52 // e.g. XMLHttpRequest_Async
anatofuz
parents:
diff changeset
53 // +--+---+------ +----
anatofuz
parents:
diff changeset
54 // ^Head ^Tail ^Separator
anatofuz
parents:
diff changeset
55 enum CharRole : unsigned char {
anatofuz
parents:
diff changeset
56 Unknown = 0, // Stray control characters or impossible states.
anatofuz
parents:
diff changeset
57 Tail = 1, // Part of a word segment, but not the first character.
anatofuz
parents:
diff changeset
58 Head = 2, // The first character of a word segment.
anatofuz
parents:
diff changeset
59 Separator = 3, // Punctuation characters that separate word segments.
anatofuz
parents:
diff changeset
60 };
anatofuz
parents:
diff changeset
61
anatofuz
parents:
diff changeset
62 // Compute segmentation of Text.
anatofuz
parents:
diff changeset
63 // Character roles are stored in Roles (Roles.size() must equal Text.size()).
anatofuz
parents:
diff changeset
64 // The set of character types encountered is returned, this may inform
anatofuz
parents:
diff changeset
65 // heuristics for dealing with poorly-segmented identifiers like "strndup".
anatofuz
parents:
diff changeset
66 CharTypeSet calculateRoles(llvm::StringRef Text,
anatofuz
parents:
diff changeset
67 llvm::MutableArrayRef<CharRole> Roles);
anatofuz
parents:
diff changeset
68
anatofuz
parents:
diff changeset
69 // A matcher capable of matching and scoring strings against a single pattern.
anatofuz
parents:
diff changeset
70 // It's optimized for matching against many strings - match() does not allocate.
anatofuz
parents:
diff changeset
71 class FuzzyMatcher {
anatofuz
parents:
diff changeset
72 public:
anatofuz
parents:
diff changeset
73 // Characters beyond MaxPat are ignored.
anatofuz
parents:
diff changeset
74 FuzzyMatcher(llvm::StringRef Pattern);
anatofuz
parents:
diff changeset
75
anatofuz
parents:
diff changeset
76 // If Word matches the pattern, return a score indicating the quality match.
anatofuz
parents:
diff changeset
77 // Scores usually fall in a [0,1] range, with 1 being a very good score.
anatofuz
parents:
diff changeset
78 // "Super" scores in (1,2] are possible if the pattern is the full word.
anatofuz
parents:
diff changeset
79 // Characters beyond MaxWord are ignored.
anatofuz
parents:
diff changeset
80 llvm::Optional<float> match(llvm::StringRef Word);
anatofuz
parents:
diff changeset
81
anatofuz
parents:
diff changeset
82 llvm::StringRef pattern() const { return llvm::StringRef(Pat, PatN); }
anatofuz
parents:
diff changeset
83 bool empty() const { return PatN == 0; }
anatofuz
parents:
diff changeset
84
anatofuz
parents:
diff changeset
85 // Dump internal state from the last match() to the stream, for debugging.
anatofuz
parents:
diff changeset
86 // Returns the pattern with [] around matched characters, e.g.
anatofuz
parents:
diff changeset
87 // [u_p] + "unique_ptr" --> "[u]nique[_p]tr"
anatofuz
parents:
diff changeset
88 llvm::SmallString<256> dumpLast(llvm::raw_ostream &) const;
anatofuz
parents:
diff changeset
89
anatofuz
parents:
diff changeset
90 private:
anatofuz
parents:
diff changeset
91 // We truncate the pattern and the word to bound the cost of matching.
anatofuz
parents:
diff changeset
92 constexpr static int MaxPat = 63, MaxWord = 127;
anatofuz
parents:
diff changeset
93 // Action describes how a word character was matched to the pattern.
anatofuz
parents:
diff changeset
94 // It should be an enum, but this causes bitfield problems:
anatofuz
parents:
diff changeset
95 // - for MSVC the enum type must be explicitly unsigned for correctness
anatofuz
parents:
diff changeset
96 // - GCC 4.8 complains not all values fit if the type is unsigned
anatofuz
parents:
diff changeset
97 using Action = bool;
anatofuz
parents:
diff changeset
98 constexpr static Action Miss = false; // Word character was skipped.
anatofuz
parents:
diff changeset
99 constexpr static Action Match = true; // Matched against a pattern character.
anatofuz
parents:
diff changeset
100
anatofuz
parents:
diff changeset
101 bool init(llvm::StringRef Word);
anatofuz
parents:
diff changeset
102 void buildGraph();
anatofuz
parents:
diff changeset
103 bool allowMatch(int P, int W, Action Last) const;
anatofuz
parents:
diff changeset
104 int skipPenalty(int W, Action Last) const;
anatofuz
parents:
diff changeset
105 int matchBonus(int P, int W, Action Last) const;
anatofuz
parents:
diff changeset
106
anatofuz
parents:
diff changeset
107 // Pattern data is initialized by the constructor, then constant.
anatofuz
parents:
diff changeset
108 char Pat[MaxPat]; // Pattern data
anatofuz
parents:
diff changeset
109 int PatN; // Length
anatofuz
parents:
diff changeset
110 char LowPat[MaxPat]; // Pattern in lowercase
anatofuz
parents:
diff changeset
111 CharRole PatRole[MaxPat]; // Pattern segmentation info
anatofuz
parents:
diff changeset
112 CharTypeSet PatTypeSet; // Bitmask of 1<<CharType for all Pattern characters
anatofuz
parents:
diff changeset
113 float ScoreScale; // Normalizes scores for the pattern length.
anatofuz
parents:
diff changeset
114
anatofuz
parents:
diff changeset
115 // Word data is initialized on each call to match(), mostly by init().
anatofuz
parents:
diff changeset
116 char Word[MaxWord]; // Word data
anatofuz
parents:
diff changeset
117 int WordN; // Length
anatofuz
parents:
diff changeset
118 char LowWord[MaxWord]; // Word in lowercase
anatofuz
parents:
diff changeset
119 CharRole WordRole[MaxWord]; // Word segmentation info
anatofuz
parents:
diff changeset
120 CharTypeSet WordTypeSet; // Bitmask of 1<<CharType for all Word characters
anatofuz
parents:
diff changeset
121 bool WordContainsPattern; // Simple substring check
anatofuz
parents:
diff changeset
122
anatofuz
parents:
diff changeset
123 // Cumulative best-match score table.
anatofuz
parents:
diff changeset
124 // Boundary conditions are filled in by the constructor.
anatofuz
parents:
diff changeset
125 // The rest is repopulated for each match(), by buildGraph().
anatofuz
parents:
diff changeset
126 struct ScoreInfo {
anatofuz
parents:
diff changeset
127 signed int Score : 15;
anatofuz
parents:
diff changeset
128 Action Prev : 1;
anatofuz
parents:
diff changeset
129 };
anatofuz
parents:
diff changeset
130 ScoreInfo Scores[MaxPat + 1][MaxWord + 1][/* Last Action */ 2];
anatofuz
parents:
diff changeset
131 };
anatofuz
parents:
diff changeset
132
anatofuz
parents:
diff changeset
133 } // namespace clangd
anatofuz
parents:
diff changeset
134 } // namespace clang
anatofuz
parents:
diff changeset
135
anatofuz
parents:
diff changeset
136 #endif