Mercurial > hg > CbC > CbC_llvm
comparison clang-tools-extra/clangd/support/Markup.cpp @ 173:0572611fdcc8 llvm10 llvm12
reorgnization done
author | Shinji KONO <kono@ie.u-ryukyu.ac.jp> |
---|---|
date | Mon, 25 May 2020 11:55:54 +0900 |
parents | |
children | 2e18cbf3894f |
comparison
equal
deleted
inserted
replaced
172:9fbae9c8bf63 | 173:0572611fdcc8 |
---|---|
1 //===--- Markup.cpp -----------------------------------------*- C++-*------===// | |
2 // | |
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. | |
4 // See https://llvm.org/LICENSE.txt for license information. | |
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception | |
6 // | |
7 //===----------------------------------------------------------------------===// | |
8 #include "support/Markup.h" | |
9 #include "llvm/ADT/ArrayRef.h" | |
10 #include "llvm/ADT/STLExtras.h" | |
11 #include "llvm/ADT/SmallVector.h" | |
12 #include "llvm/ADT/StringExtras.h" | |
13 #include "llvm/ADT/StringRef.h" | |
14 #include "llvm/Support/Compiler.h" | |
15 #include "llvm/Support/ErrorHandling.h" | |
16 #include "llvm/Support/FormatVariadic.h" | |
17 #include "llvm/Support/raw_ostream.h" | |
18 #include <cstddef> | |
19 #include <iterator> | |
20 #include <memory> | |
21 #include <string> | |
22 #include <vector> | |
23 | |
24 namespace clang { | |
25 namespace clangd { | |
26 namespace markup { | |
27 namespace { | |
28 | |
29 // Is <contents a plausible start to an HTML tag? | |
30 // Contents may not be the rest of the line, but it's the rest of the plain | |
31 // text, so we expect to see at least the tag name. | |
32 bool looksLikeTag(llvm::StringRef Contents) { | |
33 if (Contents.empty()) | |
34 return false; | |
35 if (Contents.front() == '!' || Contents.front() == '?' || | |
36 Contents.front() == '/') | |
37 return true; | |
38 // Check the start of the tag name. | |
39 if (!llvm::isAlpha(Contents.front())) | |
40 return false; | |
41 // Drop rest of the tag name, and following whitespace. | |
42 Contents = Contents | |
43 .drop_while([](char C) { | |
44 return llvm::isAlnum(C) || C == '-' || C == '_' || C == ':'; | |
45 }) | |
46 .drop_while(llvm::isSpace); | |
47 // The rest of the tag consists of attributes, which have restrictive names. | |
48 // If we hit '=', all bets are off (attribute values can contain anything). | |
49 for (; !Contents.empty(); Contents = Contents.drop_front()) { | |
50 if (llvm::isAlnum(Contents.front()) || llvm::isSpace(Contents.front())) | |
51 continue; | |
52 if (Contents.front() == '>' || Contents.startswith("/>")) | |
53 return true; // May close the tag. | |
54 if (Contents.front() == '=') | |
55 return true; // Don't try to parse attribute values. | |
56 return false; // Random punctuation means this isn't a tag. | |
57 } | |
58 return true; // Potentially incomplete tag. | |
59 } | |
60 | |
61 // Tests whether C should be backslash-escaped in markdown. | |
62 // The string being escaped is Before + C + After. This is part of a paragraph. | |
63 // StartsLine indicates whether `Before` is the start of the line. | |
64 // After may not be everything until the end of the line. | |
65 // | |
66 // It's always safe to escape punctuation, but want minimal escaping. | |
67 // The strategy is to escape the first character of anything that might start | |
68 // a markdown grammar construct. | |
69 bool needsLeadingEscape(char C, llvm::StringRef Before, llvm::StringRef After, | |
70 bool StartsLine) { | |
71 assert(Before.take_while(llvm::isSpace).empty()); | |
72 auto RulerLength = [&]() -> /*Length*/ unsigned { | |
73 if (!StartsLine || !Before.empty()) | |
74 return false; | |
75 llvm::StringRef A = After.rtrim(); | |
76 return llvm::all_of(A, [C](char D) { return C == D; }) ? 1 + A.size() : 0; | |
77 }; | |
78 auto IsBullet = [&]() { | |
79 return StartsLine && Before.empty() && | |
80 (After.empty() || After.startswith(" ")); | |
81 }; | |
82 auto SpaceSurrounds = [&]() { | |
83 return (After.empty() || llvm::isSpace(After.front())) && | |
84 (Before.empty() || llvm::isSpace(Before.back())); | |
85 }; | |
86 auto WordSurrounds = [&]() { | |
87 return (!After.empty() && llvm::isAlnum(After.front())) && | |
88 (!Before.empty() && llvm::isAlnum(Before.back())); | |
89 }; | |
90 | |
91 switch (C) { | |
92 case '\\': // Escaped character. | |
93 return true; | |
94 case '`': // Code block or inline code | |
95 // Any number of backticks can delimit an inline code block that can end | |
96 // anywhere (including on another line). We must escape them all. | |
97 return true; | |
98 case '~': // Code block | |
99 return StartsLine && Before.empty() && After.startswith("~~"); | |
100 case '#': { // ATX heading. | |
101 if (!StartsLine || !Before.empty()) | |
102 return false; | |
103 llvm::StringRef Rest = After.ltrim(C); | |
104 return Rest.empty() || Rest.startswith(" "); | |
105 } | |
106 case ']': // Link or link reference. | |
107 // We escape ] rather than [ here, because it's more constrained: | |
108 // ](...) is an in-line link | |
109 // ]: is a link reference | |
110 // The following are only links if the link reference exists: | |
111 // ] by itself is a shortcut link | |
112 // ][...] is an out-of-line link | |
113 // Because we never emit link references, we don't need to handle these. | |
114 return After.startswith(":") || After.startswith("("); | |
115 case '=': // Setex heading. | |
116 return RulerLength() > 0; | |
117 case '_': // Horizontal ruler or matched delimiter. | |
118 if (RulerLength() >= 3) | |
119 return true; | |
120 // Not a delimiter if surrounded by space, or inside a word. | |
121 // (The rules at word boundaries are subtle). | |
122 return !(SpaceSurrounds() || WordSurrounds()); | |
123 case '-': // Setex heading, horizontal ruler, or bullet. | |
124 if (RulerLength() > 0) | |
125 return true; | |
126 return IsBullet(); | |
127 case '+': // Bullet list. | |
128 return IsBullet(); | |
129 case '*': // Bullet list, horizontal ruler, or delimiter. | |
130 return IsBullet() || RulerLength() >= 3 || !SpaceSurrounds(); | |
131 case '<': // HTML tag (or autolink, which we choose not to escape) | |
132 return looksLikeTag(After); | |
133 case '>': // Quote marker. Needs escaping at start of line. | |
134 return StartsLine && Before.empty(); | |
135 case '&': { // HTML entity reference | |
136 auto End = After.find(';'); | |
137 if (End == llvm::StringRef::npos) | |
138 return false; | |
139 llvm::StringRef Content = After.substr(0, End); | |
140 if (Content.consume_front("#")) { | |
141 if (Content.consume_front("x") || Content.consume_front("X")) | |
142 return llvm::all_of(Content, llvm::isHexDigit); | |
143 return llvm::all_of(Content, llvm::isDigit); | |
144 } | |
145 return llvm::all_of(Content, llvm::isAlpha); | |
146 } | |
147 case '.': // Numbered list indicator. Escape 12. -> 12\. at start of line. | |
148 case ')': | |
149 return StartsLine && !Before.empty() && | |
150 llvm::all_of(Before, llvm::isDigit) && After.startswith(" "); | |
151 default: | |
152 return false; | |
153 } | |
154 } | |
155 | |
156 /// Escape a markdown text block. Ensures the punctuation will not introduce | |
157 /// any of the markdown constructs. | |
158 std::string renderText(llvm::StringRef Input, bool StartsLine) { | |
159 std::string R; | |
160 for (unsigned I = 0; I < Input.size(); ++I) { | |
161 if (needsLeadingEscape(Input[I], Input.substr(0, I), Input.substr(I + 1), | |
162 StartsLine)) | |
163 R.push_back('\\'); | |
164 R.push_back(Input[I]); | |
165 } | |
166 return R; | |
167 } | |
168 | |
169 /// Renders \p Input as an inline block of code in markdown. The returned value | |
170 /// is surrounded by backticks and the inner contents are properly escaped. | |
171 std::string renderInlineBlock(llvm::StringRef Input) { | |
172 std::string R; | |
173 // Double all backticks to make sure we don't close the inline block early. | |
174 for (size_t From = 0; From < Input.size();) { | |
175 size_t Next = Input.find("`", From); | |
176 R += Input.substr(From, Next - From); | |
177 if (Next == llvm::StringRef::npos) | |
178 break; | |
179 R += "``"; // double the found backtick. | |
180 | |
181 From = Next + 1; | |
182 } | |
183 // If results starts with a backtick, add spaces on both sides. The spaces | |
184 // are ignored by markdown renderers. | |
185 if (llvm::StringRef(R).startswith("`") || llvm::StringRef(R).endswith("`")) | |
186 return "` " + std::move(R) + " `"; | |
187 // Markdown render should ignore first and last space if both are there. We | |
188 // add an extra pair of spaces in that case to make sure we render what the | |
189 // user intended. | |
190 if (llvm::StringRef(R).startswith(" ") && llvm::StringRef(R).endswith(" ")) | |
191 return "` " + std::move(R) + " `"; | |
192 return "`" + std::move(R) + "`"; | |
193 } | |
194 | |
195 /// Get marker required for \p Input to represent a markdown codeblock. It | |
196 /// consists of at least 3 backticks(`). Although markdown also allows to use | |
197 /// tilde(~) for code blocks, they are never used. | |
198 std::string getMarkerForCodeBlock(llvm::StringRef Input) { | |
199 // Count the maximum number of consecutive backticks in \p Input. We need to | |
200 // start and end the code block with more. | |
201 unsigned MaxBackticks = 0; | |
202 unsigned Backticks = 0; | |
203 for (char C : Input) { | |
204 if (C == '`') { | |
205 ++Backticks; | |
206 continue; | |
207 } | |
208 MaxBackticks = std::max(MaxBackticks, Backticks); | |
209 Backticks = 0; | |
210 } | |
211 MaxBackticks = std::max(Backticks, MaxBackticks); | |
212 // Use the corresponding number of backticks to start and end a code block. | |
213 return std::string(/*Repeat=*/std::max(3u, MaxBackticks + 1), '`'); | |
214 } | |
215 | |
216 // Trims the input and concatenates whitespace blocks into a single ` `. | |
217 std::string canonicalizeSpaces(llvm::StringRef Input) { | |
218 llvm::SmallVector<llvm::StringRef, 4> Words; | |
219 llvm::SplitString(Input, Words); | |
220 return llvm::join(Words, " "); | |
221 } | |
222 | |
223 std::string renderBlocks(llvm::ArrayRef<std::unique_ptr<Block>> Children, | |
224 void (Block::*RenderFunc)(llvm::raw_ostream &) const) { | |
225 std::string R; | |
226 llvm::raw_string_ostream OS(R); | |
227 | |
228 // Trim rulers. | |
229 Children = Children.drop_while( | |
230 [](const std::unique_ptr<Block> &C) { return C->isRuler(); }); | |
231 auto Last = llvm::find_if( | |
232 llvm::reverse(Children), | |
233 [](const std::unique_ptr<Block> &C) { return !C->isRuler(); }); | |
234 Children = Children.drop_back(Children.end() - Last.base()); | |
235 | |
236 bool LastBlockWasRuler = true; | |
237 for (const auto &C : Children) { | |
238 if (C->isRuler() && LastBlockWasRuler) | |
239 continue; | |
240 LastBlockWasRuler = C->isRuler(); | |
241 ((*C).*RenderFunc)(OS); | |
242 } | |
243 | |
244 // Get rid of redundant empty lines introduced in plaintext while imitating | |
245 // padding in markdown. | |
246 std::string AdjustedResult; | |
247 llvm::StringRef TrimmedText(OS.str()); | |
248 TrimmedText = TrimmedText.trim(); | |
249 | |
250 llvm::copy_if(TrimmedText, std::back_inserter(AdjustedResult), | |
251 [&TrimmedText](const char &C) { | |
252 return !llvm::StringRef(TrimmedText.data(), | |
253 &C - TrimmedText.data() + 1) | |
254 // We allow at most two newlines. | |
255 .endswith("\n\n\n"); | |
256 }); | |
257 | |
258 return AdjustedResult; | |
259 } | |
260 | |
261 // Separates two blocks with extra spacing. Note that it might render strangely | |
262 // in vscode if the trailing block is a codeblock, see | |
263 // https://github.com/microsoft/vscode/issues/88416 for details. | |
264 class Ruler : public Block { | |
265 public: | |
266 void renderMarkdown(llvm::raw_ostream &OS) const override { | |
267 // Note that we need an extra new line before the ruler, otherwise we might | |
268 // make previous block a title instead of introducing a ruler. | |
269 OS << "\n---\n"; | |
270 } | |
271 void renderPlainText(llvm::raw_ostream &OS) const override { OS << '\n'; } | |
272 std::unique_ptr<Block> clone() const override { | |
273 return std::make_unique<Ruler>(*this); | |
274 } | |
275 bool isRuler() const override { return true; } | |
276 }; | |
277 | |
278 class CodeBlock : public Block { | |
279 public: | |
280 void renderMarkdown(llvm::raw_ostream &OS) const override { | |
281 std::string Marker = getMarkerForCodeBlock(Contents); | |
282 // No need to pad from previous blocks, as they should end with a new line. | |
283 OS << Marker << Language << '\n' << Contents << '\n' << Marker << '\n'; | |
284 } | |
285 | |
286 void renderPlainText(llvm::raw_ostream &OS) const override { | |
287 // In plaintext we want one empty line before and after codeblocks. | |
288 OS << '\n' << Contents << "\n\n"; | |
289 } | |
290 | |
291 std::unique_ptr<Block> clone() const override { | |
292 return std::make_unique<CodeBlock>(*this); | |
293 } | |
294 | |
295 CodeBlock(std::string Contents, std::string Language) | |
296 : Contents(std::move(Contents)), Language(std::move(Language)) {} | |
297 | |
298 private: | |
299 std::string Contents; | |
300 std::string Language; | |
301 }; | |
302 | |
303 // Inserts two spaces after each `\n` to indent each line. First line is not | |
304 // indented. | |
305 std::string indentLines(llvm::StringRef Input) { | |
306 assert(!Input.endswith("\n") && "Input should've been trimmed."); | |
307 std::string IndentedR; | |
308 // We'll add 2 spaces after each new line. | |
309 IndentedR.reserve(Input.size() + Input.count('\n') * 2); | |
310 for (char C : Input) { | |
311 IndentedR += C; | |
312 if (C == '\n') | |
313 IndentedR.append(" "); | |
314 } | |
315 return IndentedR; | |
316 } | |
317 | |
318 class Heading : public Paragraph { | |
319 public: | |
320 Heading(size_t Level) : Level(Level) {} | |
321 void renderMarkdown(llvm::raw_ostream &OS) const override { | |
322 OS << std::string(Level, '#') << ' '; | |
323 Paragraph::renderMarkdown(OS); | |
324 } | |
325 | |
326 private: | |
327 size_t Level; | |
328 }; | |
329 | |
330 } // namespace | |
331 | |
332 std::string Block::asMarkdown() const { | |
333 std::string R; | |
334 llvm::raw_string_ostream OS(R); | |
335 renderMarkdown(OS); | |
336 return llvm::StringRef(OS.str()).trim().str(); | |
337 } | |
338 | |
339 std::string Block::asPlainText() const { | |
340 std::string R; | |
341 llvm::raw_string_ostream OS(R); | |
342 renderPlainText(OS); | |
343 return llvm::StringRef(OS.str()).trim().str(); | |
344 } | |
345 | |
346 void Paragraph::renderMarkdown(llvm::raw_ostream &OS) const { | |
347 bool NeedsSpace = false; | |
348 bool HasChunks = false; | |
349 for (auto &C : Chunks) { | |
350 if (C.SpaceBefore || NeedsSpace) | |
351 OS << " "; | |
352 switch (C.Kind) { | |
353 case Chunk::PlainText: | |
354 OS << renderText(C.Contents, !HasChunks); | |
355 break; | |
356 case Chunk::InlineCode: | |
357 OS << renderInlineBlock(C.Contents); | |
358 break; | |
359 } | |
360 HasChunks = true; | |
361 NeedsSpace = C.SpaceAfter; | |
362 } | |
363 // Paragraphs are translated into markdown lines, not markdown paragraphs. | |
364 // Therefore it only has a single linebreak afterwards. | |
365 // VSCode requires two spaces at the end of line to start a new one. | |
366 OS << " \n"; | |
367 } | |
368 | |
369 std::unique_ptr<Block> Paragraph::clone() const { | |
370 return std::make_unique<Paragraph>(*this); | |
371 } | |
372 | |
373 /// Choose a marker to delimit `Text` from a prioritized list of options. | |
374 /// This is more readable than escaping for plain-text. | |
375 llvm::StringRef chooseMarker(llvm::ArrayRef<llvm::StringRef> Options, | |
376 llvm::StringRef Text) { | |
377 // Prefer a delimiter whose characters don't appear in the text. | |
378 for (llvm::StringRef S : Options) | |
379 if (Text.find_first_of(S) == llvm::StringRef::npos) | |
380 return S; | |
381 return Options.front(); | |
382 } | |
383 | |
384 void Paragraph::renderPlainText(llvm::raw_ostream &OS) const { | |
385 bool NeedsSpace = false; | |
386 for (auto &C : Chunks) { | |
387 if (C.SpaceBefore || NeedsSpace) | |
388 OS << " "; | |
389 llvm::StringRef Marker = ""; | |
390 if (C.Preserve && C.Kind == Chunk::InlineCode) | |
391 Marker = chooseMarker({"`", "'", "\""}, C.Contents); | |
392 OS << Marker << C.Contents << Marker; | |
393 NeedsSpace = C.SpaceAfter; | |
394 } | |
395 OS << '\n'; | |
396 } | |
397 | |
398 void BulletList::renderMarkdown(llvm::raw_ostream &OS) const { | |
399 for (auto &D : Items) { | |
400 // Instead of doing this we might prefer passing Indent to children to get | |
401 // rid of the copies, if it turns out to be a bottleneck. | |
402 OS << "- " << indentLines(D.asMarkdown()) << '\n'; | |
403 } | |
404 // We need a new line after list to terminate it in markdown. | |
405 OS << '\n'; | |
406 } | |
407 | |
408 void BulletList::renderPlainText(llvm::raw_ostream &OS) const { | |
409 for (auto &D : Items) { | |
410 // Instead of doing this we might prefer passing Indent to children to get | |
411 // rid of the copies, if it turns out to be a bottleneck. | |
412 OS << "- " << indentLines(D.asPlainText()) << '\n'; | |
413 } | |
414 } | |
415 | |
416 Paragraph &Paragraph::appendSpace() { | |
417 if (!Chunks.empty()) | |
418 Chunks.back().SpaceAfter = true; | |
419 return *this; | |
420 } | |
421 | |
422 Paragraph &Paragraph::appendText(llvm::StringRef Text) { | |
423 std::string Norm = canonicalizeSpaces(Text); | |
424 if (Norm.empty()) | |
425 return *this; | |
426 Chunks.emplace_back(); | |
427 Chunk &C = Chunks.back(); | |
428 C.Contents = std::move(Norm); | |
429 C.Kind = Chunk::PlainText; | |
430 C.SpaceBefore = llvm::isSpace(Text.front()); | |
431 C.SpaceAfter = llvm::isSpace(Text.back()); | |
432 return *this; | |
433 } | |
434 | |
435 Paragraph &Paragraph::appendCode(llvm::StringRef Code, bool Preserve) { | |
436 bool AdjacentCode = | |
437 !Chunks.empty() && Chunks.back().Kind == Chunk::InlineCode; | |
438 std::string Norm = canonicalizeSpaces(std::move(Code)); | |
439 if (Norm.empty()) | |
440 return *this; | |
441 Chunks.emplace_back(); | |
442 Chunk &C = Chunks.back(); | |
443 C.Contents = std::move(Norm); | |
444 C.Kind = Chunk::InlineCode; | |
445 C.Preserve = Preserve; | |
446 // Disallow adjacent code spans without spaces, markdown can't render them. | |
447 C.SpaceBefore = AdjacentCode; | |
448 return *this; | |
449 } | |
450 | |
451 std::unique_ptr<Block> BulletList::clone() const { | |
452 return std::make_unique<BulletList>(*this); | |
453 } | |
454 | |
455 class Document &BulletList::addItem() { | |
456 Items.emplace_back(); | |
457 return Items.back(); | |
458 } | |
459 | |
460 Document &Document::operator=(const Document &Other) { | |
461 Children.clear(); | |
462 for (const auto &C : Other.Children) | |
463 Children.push_back(C->clone()); | |
464 return *this; | |
465 } | |
466 | |
467 void Document::append(Document Other) { | |
468 std::move(Other.Children.begin(), Other.Children.end(), | |
469 std::back_inserter(Children)); | |
470 } | |
471 | |
472 Paragraph &Document::addParagraph() { | |
473 Children.push_back(std::make_unique<Paragraph>()); | |
474 return *static_cast<Paragraph *>(Children.back().get()); | |
475 } | |
476 | |
477 void Document::addRuler() { Children.push_back(std::make_unique<Ruler>()); } | |
478 | |
479 void Document::addCodeBlock(std::string Code, std::string Language) { | |
480 Children.emplace_back( | |
481 std::make_unique<CodeBlock>(std::move(Code), std::move(Language))); | |
482 } | |
483 | |
484 std::string Document::asMarkdown() const { | |
485 return renderBlocks(Children, &Block::renderMarkdown); | |
486 } | |
487 | |
488 std::string Document::asPlainText() const { | |
489 return renderBlocks(Children, &Block::renderPlainText); | |
490 } | |
491 | |
492 BulletList &Document::addBulletList() { | |
493 Children.emplace_back(std::make_unique<BulletList>()); | |
494 return *static_cast<BulletList *>(Children.back().get()); | |
495 } | |
496 | |
497 Paragraph &Document::addHeading(size_t Level) { | |
498 assert(Level > 0); | |
499 Children.emplace_back(std::make_unique<Heading>(Level)); | |
500 return *static_cast<Paragraph *>(Children.back().get()); | |
501 } | |
502 } // namespace markup | |
503 } // namespace clangd | |
504 } // namespace clang |