comparison clang-tools-extra/clangd/SourceCode.cpp @ 221:79ff65ed7e25

LLVM12 Original
author Shinji KONO <kono@ie.u-ryukyu.ac.jp>
date Tue, 15 Jun 2021 19:15:29 +0900
parents 0572611fdcc8
children c4bab56944e8
comparison
equal deleted inserted replaced
220:42394fc6a535 221:79ff65ed7e25
6 // 6 //
7 //===----------------------------------------------------------------------===// 7 //===----------------------------------------------------------------------===//
8 #include "SourceCode.h" 8 #include "SourceCode.h"
9 9
10 #include "FuzzyMatch.h" 10 #include "FuzzyMatch.h"
11 #include "Preamble.h"
11 #include "Protocol.h" 12 #include "Protocol.h"
12 #include "refactor/Tweak.h" 13 #include "refactor/Tweak.h"
13 #include "support/Context.h" 14 #include "support/Context.h"
14 #include "support/Logger.h" 15 #include "support/Logger.h"
16 #include "support/Threading.h"
15 #include "clang/AST/ASTContext.h" 17 #include "clang/AST/ASTContext.h"
16 #include "clang/Basic/LangOptions.h" 18 #include "clang/Basic/LangOptions.h"
17 #include "clang/Basic/SourceLocation.h" 19 #include "clang/Basic/SourceLocation.h"
18 #include "clang/Basic/SourceManager.h" 20 #include "clang/Basic/SourceManager.h"
19 #include "clang/Basic/TokenKinds.h" 21 #include "clang/Basic/TokenKinds.h"
52 // Clangd uses UTF-8 and byte-offsets internally, so conversion is nontrivial. 54 // Clangd uses UTF-8 and byte-offsets internally, so conversion is nontrivial.
53 55
54 // Iterates over unicode codepoints in the (UTF-8) string. For each, 56 // Iterates over unicode codepoints in the (UTF-8) string. For each,
55 // invokes CB(UTF-8 length, UTF-16 length), and breaks if it returns true. 57 // invokes CB(UTF-8 length, UTF-16 length), and breaks if it returns true.
56 // Returns true if CB returned true, false if we hit the end of string. 58 // Returns true if CB returned true, false if we hit the end of string.
59 //
60 // If the string is not valid UTF-8, we log this error and "decode" the
61 // text in some arbitrary way. This is pretty sad, but this tends to happen deep
62 // within indexing of headers where clang misdetected the encoding, and
63 // propagating the error all the way back up is (probably?) not worth it.
57 template <typename Callback> 64 template <typename Callback>
58 static bool iterateCodepoints(llvm::StringRef U8, const Callback &CB) { 65 static bool iterateCodepoints(llvm::StringRef U8, const Callback &CB) {
66 bool LoggedInvalid = false;
59 // A codepoint takes two UTF-16 code units if it's astral (outside BMP). 67 // A codepoint takes two UTF-16 code units if it's astral (outside BMP).
60 // Astral codepoints are encoded as 4 bytes in UTF-8, starting with 11110xxx. 68 // Astral codepoints are encoded as 4 bytes in UTF-8, starting with 11110xxx.
61 for (size_t I = 0; I < U8.size();) { 69 for (size_t I = 0; I < U8.size();) {
62 unsigned char C = static_cast<unsigned char>(U8[I]); 70 unsigned char C = static_cast<unsigned char>(U8[I]);
63 if (LLVM_LIKELY(!(C & 0x80))) { // ASCII character. 71 if (LLVM_LIKELY(!(C & 0x80))) { // ASCII character.
67 continue; 75 continue;
68 } 76 }
69 // This convenient property of UTF-8 holds for all non-ASCII characters. 77 // This convenient property of UTF-8 holds for all non-ASCII characters.
70 size_t UTF8Length = llvm::countLeadingOnes(C); 78 size_t UTF8Length = llvm::countLeadingOnes(C);
71 // 0xxx is ASCII, handled above. 10xxx is a trailing byte, invalid here. 79 // 0xxx is ASCII, handled above. 10xxx is a trailing byte, invalid here.
72 // 11111xxx is not valid UTF-8 at all. Assert because it's probably our bug. 80 // 11111xxx is not valid UTF-8 at all, maybe some ISO-8859-*.
73 assert((UTF8Length >= 2 && UTF8Length <= 4) && 81 if (LLVM_UNLIKELY(UTF8Length < 2 || UTF8Length > 4)) {
74 "Invalid UTF-8, or transcoding bug?"); 82 if (!LoggedInvalid) {
83 elog("File has invalid UTF-8 near offset {0}: {1}", I, llvm::toHex(U8));
84 LoggedInvalid = true;
85 }
86 // We can't give a correct result, but avoid returning something wild.
87 // Pretend this is a valid ASCII byte, for lack of better options.
88 // (Too late to get ISO-8859-* right, we've skipped some bytes already).
89 if (CB(1, 1))
90 return true;
91 ++I;
92 continue;
93 }
75 I += UTF8Length; // Skip over all trailing bytes. 94 I += UTF8Length; // Skip over all trailing bytes.
76 // A codepoint takes two UTF-16 code units if it's astral (outside BMP). 95 // A codepoint takes two UTF-16 code units if it's astral (outside BMP).
77 // Astral codepoints are encoded as 4 bytes in UTF-8 (11110xxx ...) 96 // Astral codepoints are encoded as 4 bytes in UTF-8 (11110xxx ...)
78 if (CB(UTF8Length, UTF8Length == 4 ? 2 : 1)) 97 if (CB(UTF8Length, UTF8Length == 4 ? 2 : 1))
79 return true; 98 return true;
154 } 173 }
155 174
156 llvm::Expected<size_t> positionToOffset(llvm::StringRef Code, Position P, 175 llvm::Expected<size_t> positionToOffset(llvm::StringRef Code, Position P,
157 bool AllowColumnsBeyondLineLength) { 176 bool AllowColumnsBeyondLineLength) {
158 if (P.line < 0) 177 if (P.line < 0)
159 return llvm::make_error<llvm::StringError>( 178 return error(llvm::errc::invalid_argument,
160 llvm::formatv("Line value can't be negative ({0})", P.line), 179 "Line value can't be negative ({0})", P.line);
161 llvm::errc::invalid_argument);
162 if (P.character < 0) 180 if (P.character < 0)
163 return llvm::make_error<llvm::StringError>( 181 return error(llvm::errc::invalid_argument,
164 llvm::formatv("Character value can't be negative ({0})", P.character), 182 "Character value can't be negative ({0})", P.character);
165 llvm::errc::invalid_argument);
166 size_t StartOfLine = 0; 183 size_t StartOfLine = 0;
167 for (int I = 0; I != P.line; ++I) { 184 for (int I = 0; I != P.line; ++I) {
168 size_t NextNL = Code.find('\n', StartOfLine); 185 size_t NextNL = Code.find('\n', StartOfLine);
169 if (NextNL == llvm::StringRef::npos) 186 if (NextNL == llvm::StringRef::npos)
170 return llvm::make_error<llvm::StringError>( 187 return error(llvm::errc::invalid_argument,
171 llvm::formatv("Line value is out of range ({0})", P.line), 188 "Line value is out of range ({0})", P.line);
172 llvm::errc::invalid_argument);
173 StartOfLine = NextNL + 1; 189 StartOfLine = NextNL + 1;
174 } 190 }
175 StringRef Line = 191 StringRef Line =
176 Code.substr(StartOfLine).take_until([](char C) { return C == '\n'; }); 192 Code.substr(StartOfLine).take_until([](char C) { return C == '\n'; });
177 193
178 // P.character may be in UTF-16, transcode if necessary. 194 // P.character may be in UTF-16, transcode if necessary.
179 bool Valid; 195 bool Valid;
180 size_t ByteInLine = measureUnits(Line, P.character, lspEncoding(), Valid); 196 size_t ByteInLine = measureUnits(Line, P.character, lspEncoding(), Valid);
181 if (!Valid && !AllowColumnsBeyondLineLength) 197 if (!Valid && !AllowColumnsBeyondLineLength)
182 return llvm::make_error<llvm::StringError>( 198 return error(llvm::errc::invalid_argument,
183 llvm::formatv("{0} offset {1} is invalid for line {2}", lspEncoding(), 199 "{0} offset {1} is invalid for line {2}", lspEncoding(),
184 P.character, P.line), 200 P.character, P.line);
185 llvm::errc::invalid_argument);
186 return StartOfLine + ByteInLine; 201 return StartOfLine + ByteInLine;
187 } 202 }
188 203
189 Position offsetToPosition(llvm::StringRef Code, size_t Offset) { 204 Position offsetToPosition(llvm::StringRef Code, size_t Offset) {
190 Offset = std::min(Code.size(), Offset); 205 Offset = std::min(Code.size(), Offset);
428 return Result; 443 return Result;
429 } 444 }
430 445
431 llvm::StringRef toSourceCode(const SourceManager &SM, SourceRange R) { 446 llvm::StringRef toSourceCode(const SourceManager &SM, SourceRange R) {
432 assert(isValidFileRange(SM, R)); 447 assert(isValidFileRange(SM, R));
433 bool Invalid = false; 448 auto Buf = SM.getBufferOrNone(SM.getFileID(R.getBegin()));
434 auto *Buf = SM.getBuffer(SM.getFileID(R.getBegin()), &Invalid); 449 assert(Buf);
435 assert(!Invalid);
436 450
437 size_t BeginOffset = SM.getFileOffset(R.getBegin()); 451 size_t BeginOffset = SM.getFileOffset(R.getBegin());
438 size_t EndOffset = SM.getFileOffset(R.getEnd()); 452 size_t EndOffset = SM.getFileOffset(R.getEnd());
439 return Buf->getBuffer().substr(BeginOffset, EndOffset - BeginOffset); 453 return Buf->getBuffer().substr(BeginOffset, EndOffset - BeginOffset);
440 } 454 }
441 455
442 llvm::Expected<SourceLocation> sourceLocationInMainFile(const SourceManager &SM, 456 llvm::Expected<SourceLocation> sourceLocationInMainFile(const SourceManager &SM,
443 Position P) { 457 Position P) {
444 llvm::StringRef Code = SM.getBuffer(SM.getMainFileID())->getBuffer(); 458 llvm::StringRef Code = SM.getBufferOrFake(SM.getMainFileID()).getBuffer();
445 auto Offset = 459 auto Offset =
446 positionToOffset(Code, P, /*AllowColumnBeyondLineLength=*/false); 460 positionToOffset(Code, P, /*AllowColumnBeyondLineLength=*/false);
447 if (!Offset) 461 if (!Offset)
448 return Offset.takeError(); 462 return Offset.takeError();
449 return SM.getLocForStartOfFile(SM.getMainFileID()).getLocWithOffset(*Offset); 463 return SM.getLocForStartOfFile(SM.getMainFileID()).getLocWithOffset(*Offset);
556 return digest(Content); 570 return digest(Content);
557 } 571 }
558 572
559 format::FormatStyle getFormatStyleForFile(llvm::StringRef File, 573 format::FormatStyle getFormatStyleForFile(llvm::StringRef File,
560 llvm::StringRef Content, 574 llvm::StringRef Content,
561 llvm::vfs::FileSystem *FS) { 575 const ThreadsafeFS &TFS) {
562 auto Style = format::getStyle(format::DefaultFormatStyle, File, 576 auto Style = format::getStyle(format::DefaultFormatStyle, File,
563 format::DefaultFallbackStyle, Content, FS); 577 format::DefaultFallbackStyle, Content,
578 TFS.view(/*CWD=*/llvm::None).get());
564 if (!Style) { 579 if (!Style) {
565 log("getStyle() failed for file {0}: {1}. Fallback is LLVM style.", File, 580 log("getStyle() failed for file {0}: {1}. Fallback is LLVM style.", File,
566 Style.takeError()); 581 Style.takeError());
567 return format::getLLVMStyle(); 582 return format::getLLVMStyle();
568 } 583 }
582 lex(llvm::StringRef Code, const LangOptions &LangOpts, 597 lex(llvm::StringRef Code, const LangOptions &LangOpts,
583 llvm::function_ref<void(const syntax::Token &, const SourceManager &SM)> 598 llvm::function_ref<void(const syntax::Token &, const SourceManager &SM)>
584 Action) { 599 Action) {
585 // FIXME: InMemoryFileAdapter crashes unless the buffer is null terminated! 600 // FIXME: InMemoryFileAdapter crashes unless the buffer is null terminated!
586 std::string NullTerminatedCode = Code.str(); 601 std::string NullTerminatedCode = Code.str();
587 SourceManagerForFile FileSM("dummy.cpp", NullTerminatedCode); 602 SourceManagerForFile FileSM("mock_file_name.cpp", NullTerminatedCode);
588 auto &SM = FileSM.get(); 603 auto &SM = FileSM.get();
589 for (const auto &Tok : syntax::tokenize(SM.getMainFileID(), SM, LangOpts)) 604 for (const auto &Tok : syntax::tokenize(SM.getMainFileID(), SM, LangOpts))
590 Action(Tok, SM); 605 Action(Tok, SM);
591 } 606 }
592 607
613 if (Tok.kind() != tok::identifier || Tok.text(SM) != Identifier) 628 if (Tok.kind() != tok::identifier || Tok.text(SM) != Identifier)
614 return; 629 return;
615 Ranges.push_back(halfOpenToRange(SM, Tok.range(SM).toCharRange(SM))); 630 Ranges.push_back(halfOpenToRange(SM, Tok.range(SM).toCharRange(SM)));
616 }); 631 });
617 return Ranges; 632 return Ranges;
633 }
634
635 bool isKeyword(llvm::StringRef NewName, const LangOptions &LangOpts) {
636 // Keywords are initialized in constructor.
637 clang::IdentifierTable KeywordsTable(LangOpts);
638 return KeywordsTable.find(NewName) != KeywordsTable.end();
618 } 639 }
619 640
620 namespace { 641 namespace {
621 struct NamespaceEvent { 642 struct NamespaceEvent {
622 enum { 643 enum {
756 } 777 }
757 }); 778 });
758 } 779 }
759 780
760 // Returns the prefix namespaces of NS: {"" ... NS}. 781 // Returns the prefix namespaces of NS: {"" ... NS}.
761 llvm::SmallVector<llvm::StringRef, 8> ancestorNamespaces(llvm::StringRef NS) { 782 llvm::SmallVector<llvm::StringRef> ancestorNamespaces(llvm::StringRef NS) {
762 llvm::SmallVector<llvm::StringRef, 8> Results; 783 llvm::SmallVector<llvm::StringRef> Results;
763 Results.push_back(NS.take_front(0)); 784 Results.push_back(NS.take_front(0));
764 NS.split(Results, "::", /*MaxSplit=*/-1, /*KeepEmpty=*/false); 785 NS.split(Results, "::", /*MaxSplit=*/-1, /*KeepEmpty=*/false);
765 for (llvm::StringRef &R : Results) 786 for (llvm::StringRef &R : Results)
766 R = NS.take_front(R.end() - NS.begin()); 787 R = NS.take_front(R.end() - NS.begin());
767 return Results; 788 return Results;
952 const auto &SM = PP.getSourceManager(); 973 const auto &SM = PP.getSourceManager();
953 IdentifierInfo *IdentifierInfo = PP.getIdentifierInfo(SpelledTok.text(SM)); 974 IdentifierInfo *IdentifierInfo = PP.getIdentifierInfo(SpelledTok.text(SM));
954 if (!IdentifierInfo || !IdentifierInfo->hadMacroDefinition()) 975 if (!IdentifierInfo || !IdentifierInfo->hadMacroDefinition())
955 return None; 976 return None;
956 977
957 // Get the definition just before the searched location so that a macro 978 // We need to take special care to handle #define and #undef.
958 // referenced in a '#undef MACRO' can still be found. Note that we only do 979 // Preprocessor::getMacroDefinitionAtLoc() only considers a macro
959 // that if Loc is not pointing at start of file. 980 // definition to be in scope *after* the location of the macro name in a
960 if (SM.getLocForStartOfFile(SM.getFileID(Loc)) != Loc) 981 // #define that introduces it, and *before* the location of the macro name
961 Loc = Loc.getLocWithOffset(-1); 982 // in an #undef that undefines it. To handle these cases, we check for
962 MacroDefinition MacroDef = PP.getMacroDefinitionAtLoc(IdentifierInfo, Loc); 983 // the macro being in scope either just after or just before the location
963 if (auto *MI = MacroDef.getMacroInfo()) 984 // of the token. In getting the location before, we also take care to check
964 return DefinedMacro{IdentifierInfo->getName(), MI}; 985 // for start-of-file.
965 return None; 986 FileID FID = SM.getFileID(Loc);
987 assert(Loc != SM.getLocForEndOfFile(FID));
988 SourceLocation JustAfterToken = Loc.getLocWithOffset(1);
989 auto *MacroInfo =
990 PP.getMacroDefinitionAtLoc(IdentifierInfo, JustAfterToken).getMacroInfo();
991 if (!MacroInfo && SM.getLocForStartOfFile(FID) != Loc) {
992 SourceLocation JustBeforeToken = Loc.getLocWithOffset(-1);
993 MacroInfo = PP.getMacroDefinitionAtLoc(IdentifierInfo, JustBeforeToken)
994 .getMacroInfo();
995 }
996 if (!MacroInfo) {
997 return None;
998 }
999 return DefinedMacro{
1000 IdentifierInfo->getName(), MacroInfo,
1001 translatePreamblePatchLocation(MacroInfo->getDefinitionLoc(), SM)};
966 } 1002 }
967 1003
968 llvm::Expected<std::string> Edit::apply() const { 1004 llvm::Expected<std::string> Edit::apply() const {
969 return tooling::applyAllReplacements(InitialCode, Replacements); 1005 return tooling::applyAllReplacements(InitialCode, Replacements);
970 } 1006 }
1012 llvm::Error reformatEdit(Edit &E, const format::FormatStyle &Style) { 1048 llvm::Error reformatEdit(Edit &E, const format::FormatStyle &Style) {
1013 if (auto NewEdits = cleanupAndFormat(E.InitialCode, E.Replacements, Style)) 1049 if (auto NewEdits = cleanupAndFormat(E.InitialCode, E.Replacements, Style))
1014 E.Replacements = std::move(*NewEdits); 1050 E.Replacements = std::move(*NewEdits);
1015 else 1051 else
1016 return NewEdits.takeError(); 1052 return NewEdits.takeError();
1053 return llvm::Error::success();
1054 }
1055
1056 llvm::Error applyChange(std::string &Contents,
1057 const TextDocumentContentChangeEvent &Change) {
1058 if (!Change.range) {
1059 Contents = Change.text;
1060 return llvm::Error::success();
1061 }
1062
1063 const Position &Start = Change.range->start;
1064 llvm::Expected<size_t> StartIndex = positionToOffset(Contents, Start, false);
1065 if (!StartIndex)
1066 return StartIndex.takeError();
1067
1068 const Position &End = Change.range->end;
1069 llvm::Expected<size_t> EndIndex = positionToOffset(Contents, End, false);
1070 if (!EndIndex)
1071 return EndIndex.takeError();
1072
1073 if (*EndIndex < *StartIndex)
1074 return error(llvm::errc::invalid_argument,
1075 "Range's end position ({0}) is before start position ({1})",
1076 End, Start);
1077
1078 // Since the range length between two LSP positions is dependent on the
1079 // contents of the buffer we compute the range length between the start and
1080 // end position ourselves and compare it to the range length of the LSP
1081 // message to verify the buffers of the client and server are in sync.
1082
1083 // EndIndex and StartIndex are in bytes, but Change.rangeLength is in UTF-16
1084 // code units.
1085 ssize_t ComputedRangeLength =
1086 lspLength(Contents.substr(*StartIndex, *EndIndex - *StartIndex));
1087
1088 if (Change.rangeLength && ComputedRangeLength != *Change.rangeLength)
1089 return error(llvm::errc::invalid_argument,
1090 "Change's rangeLength ({0}) doesn't match the "
1091 "computed range length ({1}).",
1092 *Change.rangeLength, ComputedRangeLength);
1093
1094 Contents.replace(*StartIndex, *EndIndex - *StartIndex, Change.text);
1095
1017 return llvm::Error::success(); 1096 return llvm::Error::success();
1018 } 1097 }
1019 1098
1020 EligibleRegion getEligiblePoints(llvm::StringRef Code, 1099 EligibleRegion getEligiblePoints(llvm::StringRef Code,
1021 llvm::StringRef FullyQualifiedName, 1100 llvm::StringRef FullyQualifiedName,