173
|
1 //===--- Markup.cpp -----------------------------------------*- C++-*------===//
|
|
2 //
|
|
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
|
4 // See https://llvm.org/LICENSE.txt for license information.
|
|
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
|
6 //
|
|
7 //===----------------------------------------------------------------------===//
|
|
8 #include "support/Markup.h"
|
|
9 #include "llvm/ADT/ArrayRef.h"
|
|
10 #include "llvm/ADT/STLExtras.h"
|
|
11 #include "llvm/ADT/SmallVector.h"
|
|
12 #include "llvm/ADT/StringExtras.h"
|
|
13 #include "llvm/ADT/StringRef.h"
|
|
14 #include "llvm/Support/Compiler.h"
|
|
15 #include "llvm/Support/raw_ostream.h"
|
|
16 #include <cstddef>
|
|
17 #include <iterator>
|
|
18 #include <memory>
|
|
19 #include <string>
|
|
20 #include <vector>
|
|
21
|
|
22 namespace clang {
|
|
23 namespace clangd {
|
|
24 namespace markup {
|
|
25 namespace {
|
|
26
|
|
27 // Is <contents a plausible start to an HTML tag?
|
|
28 // Contents may not be the rest of the line, but it's the rest of the plain
|
|
29 // text, so we expect to see at least the tag name.
|
|
30 bool looksLikeTag(llvm::StringRef Contents) {
|
|
31 if (Contents.empty())
|
|
32 return false;
|
|
33 if (Contents.front() == '!' || Contents.front() == '?' ||
|
|
34 Contents.front() == '/')
|
|
35 return true;
|
|
36 // Check the start of the tag name.
|
|
37 if (!llvm::isAlpha(Contents.front()))
|
|
38 return false;
|
|
39 // Drop rest of the tag name, and following whitespace.
|
|
40 Contents = Contents
|
|
41 .drop_while([](char C) {
|
|
42 return llvm::isAlnum(C) || C == '-' || C == '_' || C == ':';
|
|
43 })
|
|
44 .drop_while(llvm::isSpace);
|
|
45 // The rest of the tag consists of attributes, which have restrictive names.
|
|
46 // If we hit '=', all bets are off (attribute values can contain anything).
|
|
47 for (; !Contents.empty(); Contents = Contents.drop_front()) {
|
|
48 if (llvm::isAlnum(Contents.front()) || llvm::isSpace(Contents.front()))
|
|
49 continue;
|
|
50 if (Contents.front() == '>' || Contents.startswith("/>"))
|
|
51 return true; // May close the tag.
|
|
52 if (Contents.front() == '=')
|
|
53 return true; // Don't try to parse attribute values.
|
|
54 return false; // Random punctuation means this isn't a tag.
|
|
55 }
|
|
56 return true; // Potentially incomplete tag.
|
|
57 }
|
|
58
|
|
59 // Tests whether C should be backslash-escaped in markdown.
|
|
60 // The string being escaped is Before + C + After. This is part of a paragraph.
|
|
61 // StartsLine indicates whether `Before` is the start of the line.
|
|
62 // After may not be everything until the end of the line.
|
|
63 //
|
|
64 // It's always safe to escape punctuation, but want minimal escaping.
|
|
65 // The strategy is to escape the first character of anything that might start
|
|
66 // a markdown grammar construct.
|
|
67 bool needsLeadingEscape(char C, llvm::StringRef Before, llvm::StringRef After,
|
|
68 bool StartsLine) {
|
|
69 assert(Before.take_while(llvm::isSpace).empty());
|
|
70 auto RulerLength = [&]() -> /*Length*/ unsigned {
|
|
71 if (!StartsLine || !Before.empty())
|
|
72 return false;
|
|
73 llvm::StringRef A = After.rtrim();
|
|
74 return llvm::all_of(A, [C](char D) { return C == D; }) ? 1 + A.size() : 0;
|
|
75 };
|
|
76 auto IsBullet = [&]() {
|
|
77 return StartsLine && Before.empty() &&
|
|
78 (After.empty() || After.startswith(" "));
|
|
79 };
|
|
80 auto SpaceSurrounds = [&]() {
|
|
81 return (After.empty() || llvm::isSpace(After.front())) &&
|
|
82 (Before.empty() || llvm::isSpace(Before.back()));
|
|
83 };
|
|
84 auto WordSurrounds = [&]() {
|
|
85 return (!After.empty() && llvm::isAlnum(After.front())) &&
|
|
86 (!Before.empty() && llvm::isAlnum(Before.back()));
|
|
87 };
|
|
88
|
|
89 switch (C) {
|
|
90 case '\\': // Escaped character.
|
|
91 return true;
|
|
92 case '`': // Code block or inline code
|
|
93 // Any number of backticks can delimit an inline code block that can end
|
|
94 // anywhere (including on another line). We must escape them all.
|
|
95 return true;
|
|
96 case '~': // Code block
|
|
97 return StartsLine && Before.empty() && After.startswith("~~");
|
|
98 case '#': { // ATX heading.
|
|
99 if (!StartsLine || !Before.empty())
|
|
100 return false;
|
|
101 llvm::StringRef Rest = After.ltrim(C);
|
|
102 return Rest.empty() || Rest.startswith(" ");
|
|
103 }
|
|
104 case ']': // Link or link reference.
|
|
105 // We escape ] rather than [ here, because it's more constrained:
|
|
106 // ](...) is an in-line link
|
|
107 // ]: is a link reference
|
|
108 // The following are only links if the link reference exists:
|
|
109 // ] by itself is a shortcut link
|
|
110 // ][...] is an out-of-line link
|
|
111 // Because we never emit link references, we don't need to handle these.
|
|
112 return After.startswith(":") || After.startswith("(");
|
|
113 case '=': // Setex heading.
|
|
114 return RulerLength() > 0;
|
|
115 case '_': // Horizontal ruler or matched delimiter.
|
|
116 if (RulerLength() >= 3)
|
|
117 return true;
|
|
118 // Not a delimiter if surrounded by space, or inside a word.
|
|
119 // (The rules at word boundaries are subtle).
|
|
120 return !(SpaceSurrounds() || WordSurrounds());
|
|
121 case '-': // Setex heading, horizontal ruler, or bullet.
|
|
122 if (RulerLength() > 0)
|
|
123 return true;
|
|
124 return IsBullet();
|
|
125 case '+': // Bullet list.
|
|
126 return IsBullet();
|
|
127 case '*': // Bullet list, horizontal ruler, or delimiter.
|
|
128 return IsBullet() || RulerLength() >= 3 || !SpaceSurrounds();
|
|
129 case '<': // HTML tag (or autolink, which we choose not to escape)
|
|
130 return looksLikeTag(After);
|
|
131 case '>': // Quote marker. Needs escaping at start of line.
|
|
132 return StartsLine && Before.empty();
|
|
133 case '&': { // HTML entity reference
|
|
134 auto End = After.find(';');
|
|
135 if (End == llvm::StringRef::npos)
|
|
136 return false;
|
|
137 llvm::StringRef Content = After.substr(0, End);
|
|
138 if (Content.consume_front("#")) {
|
|
139 if (Content.consume_front("x") || Content.consume_front("X"))
|
|
140 return llvm::all_of(Content, llvm::isHexDigit);
|
|
141 return llvm::all_of(Content, llvm::isDigit);
|
|
142 }
|
|
143 return llvm::all_of(Content, llvm::isAlpha);
|
|
144 }
|
|
145 case '.': // Numbered list indicator. Escape 12. -> 12\. at start of line.
|
|
146 case ')':
|
|
147 return StartsLine && !Before.empty() &&
|
|
148 llvm::all_of(Before, llvm::isDigit) && After.startswith(" ");
|
|
149 default:
|
|
150 return false;
|
|
151 }
|
|
152 }
|
|
153
|
|
154 /// Escape a markdown text block. Ensures the punctuation will not introduce
|
|
155 /// any of the markdown constructs.
|
|
156 std::string renderText(llvm::StringRef Input, bool StartsLine) {
|
|
157 std::string R;
|
|
158 for (unsigned I = 0; I < Input.size(); ++I) {
|
|
159 if (needsLeadingEscape(Input[I], Input.substr(0, I), Input.substr(I + 1),
|
|
160 StartsLine))
|
|
161 R.push_back('\\');
|
|
162 R.push_back(Input[I]);
|
|
163 }
|
|
164 return R;
|
|
165 }
|
|
166
|
|
167 /// Renders \p Input as an inline block of code in markdown. The returned value
|
|
168 /// is surrounded by backticks and the inner contents are properly escaped.
|
|
169 std::string renderInlineBlock(llvm::StringRef Input) {
|
|
170 std::string R;
|
|
171 // Double all backticks to make sure we don't close the inline block early.
|
|
172 for (size_t From = 0; From < Input.size();) {
|
|
173 size_t Next = Input.find("`", From);
|
|
174 R += Input.substr(From, Next - From);
|
|
175 if (Next == llvm::StringRef::npos)
|
|
176 break;
|
|
177 R += "``"; // double the found backtick.
|
|
178
|
|
179 From = Next + 1;
|
|
180 }
|
|
181 // If results starts with a backtick, add spaces on both sides. The spaces
|
|
182 // are ignored by markdown renderers.
|
|
183 if (llvm::StringRef(R).startswith("`") || llvm::StringRef(R).endswith("`"))
|
|
184 return "` " + std::move(R) + " `";
|
|
185 // Markdown render should ignore first and last space if both are there. We
|
|
186 // add an extra pair of spaces in that case to make sure we render what the
|
|
187 // user intended.
|
|
188 if (llvm::StringRef(R).startswith(" ") && llvm::StringRef(R).endswith(" "))
|
|
189 return "` " + std::move(R) + " `";
|
|
190 return "`" + std::move(R) + "`";
|
|
191 }
|
|
192
|
|
193 /// Get marker required for \p Input to represent a markdown codeblock. It
|
|
194 /// consists of at least 3 backticks(`). Although markdown also allows to use
|
|
195 /// tilde(~) for code blocks, they are never used.
|
|
196 std::string getMarkerForCodeBlock(llvm::StringRef Input) {
|
|
197 // Count the maximum number of consecutive backticks in \p Input. We need to
|
|
198 // start and end the code block with more.
|
|
199 unsigned MaxBackticks = 0;
|
|
200 unsigned Backticks = 0;
|
|
201 for (char C : Input) {
|
|
202 if (C == '`') {
|
|
203 ++Backticks;
|
|
204 continue;
|
|
205 }
|
|
206 MaxBackticks = std::max(MaxBackticks, Backticks);
|
|
207 Backticks = 0;
|
|
208 }
|
|
209 MaxBackticks = std::max(Backticks, MaxBackticks);
|
|
210 // Use the corresponding number of backticks to start and end a code block.
|
|
211 return std::string(/*Repeat=*/std::max(3u, MaxBackticks + 1), '`');
|
|
212 }
|
|
213
|
|
214 // Trims the input and concatenates whitespace blocks into a single ` `.
|
|
215 std::string canonicalizeSpaces(llvm::StringRef Input) {
|
221
|
216 llvm::SmallVector<llvm::StringRef> Words;
|
173
|
217 llvm::SplitString(Input, Words);
|
|
218 return llvm::join(Words, " ");
|
|
219 }
|
|
220
|
|
221 std::string renderBlocks(llvm::ArrayRef<std::unique_ptr<Block>> Children,
|
|
222 void (Block::*RenderFunc)(llvm::raw_ostream &) const) {
|
|
223 std::string R;
|
|
224 llvm::raw_string_ostream OS(R);
|
|
225
|
|
226 // Trim rulers.
|
|
227 Children = Children.drop_while(
|
|
228 [](const std::unique_ptr<Block> &C) { return C->isRuler(); });
|
|
229 auto Last = llvm::find_if(
|
|
230 llvm::reverse(Children),
|
|
231 [](const std::unique_ptr<Block> &C) { return !C->isRuler(); });
|
|
232 Children = Children.drop_back(Children.end() - Last.base());
|
|
233
|
|
234 bool LastBlockWasRuler = true;
|
|
235 for (const auto &C : Children) {
|
|
236 if (C->isRuler() && LastBlockWasRuler)
|
|
237 continue;
|
|
238 LastBlockWasRuler = C->isRuler();
|
|
239 ((*C).*RenderFunc)(OS);
|
|
240 }
|
|
241
|
|
242 // Get rid of redundant empty lines introduced in plaintext while imitating
|
|
243 // padding in markdown.
|
|
244 std::string AdjustedResult;
|
|
245 llvm::StringRef TrimmedText(OS.str());
|
|
246 TrimmedText = TrimmedText.trim();
|
|
247
|
|
248 llvm::copy_if(TrimmedText, std::back_inserter(AdjustedResult),
|
|
249 [&TrimmedText](const char &C) {
|
|
250 return !llvm::StringRef(TrimmedText.data(),
|
|
251 &C - TrimmedText.data() + 1)
|
|
252 // We allow at most two newlines.
|
|
253 .endswith("\n\n\n");
|
|
254 });
|
|
255
|
|
256 return AdjustedResult;
|
|
257 }
|
|
258
|
|
259 // Separates two blocks with extra spacing. Note that it might render strangely
|
|
260 // in vscode if the trailing block is a codeblock, see
|
|
261 // https://github.com/microsoft/vscode/issues/88416 for details.
|
|
262 class Ruler : public Block {
|
|
263 public:
|
|
264 void renderMarkdown(llvm::raw_ostream &OS) const override {
|
|
265 // Note that we need an extra new line before the ruler, otherwise we might
|
|
266 // make previous block a title instead of introducing a ruler.
|
|
267 OS << "\n---\n";
|
|
268 }
|
|
269 void renderPlainText(llvm::raw_ostream &OS) const override { OS << '\n'; }
|
|
270 std::unique_ptr<Block> clone() const override {
|
|
271 return std::make_unique<Ruler>(*this);
|
|
272 }
|
|
273 bool isRuler() const override { return true; }
|
|
274 };
|
|
275
|
|
276 class CodeBlock : public Block {
|
|
277 public:
|
|
278 void renderMarkdown(llvm::raw_ostream &OS) const override {
|
|
279 std::string Marker = getMarkerForCodeBlock(Contents);
|
|
280 // No need to pad from previous blocks, as they should end with a new line.
|
|
281 OS << Marker << Language << '\n' << Contents << '\n' << Marker << '\n';
|
|
282 }
|
|
283
|
|
284 void renderPlainText(llvm::raw_ostream &OS) const override {
|
|
285 // In plaintext we want one empty line before and after codeblocks.
|
|
286 OS << '\n' << Contents << "\n\n";
|
|
287 }
|
|
288
|
|
289 std::unique_ptr<Block> clone() const override {
|
|
290 return std::make_unique<CodeBlock>(*this);
|
|
291 }
|
|
292
|
|
293 CodeBlock(std::string Contents, std::string Language)
|
|
294 : Contents(std::move(Contents)), Language(std::move(Language)) {}
|
|
295
|
|
296 private:
|
|
297 std::string Contents;
|
|
298 std::string Language;
|
|
299 };
|
|
300
|
|
301 // Inserts two spaces after each `\n` to indent each line. First line is not
|
|
302 // indented.
|
|
303 std::string indentLines(llvm::StringRef Input) {
|
|
304 assert(!Input.endswith("\n") && "Input should've been trimmed.");
|
|
305 std::string IndentedR;
|
|
306 // We'll add 2 spaces after each new line.
|
|
307 IndentedR.reserve(Input.size() + Input.count('\n') * 2);
|
|
308 for (char C : Input) {
|
|
309 IndentedR += C;
|
|
310 if (C == '\n')
|
|
311 IndentedR.append(" ");
|
|
312 }
|
|
313 return IndentedR;
|
|
314 }
|
|
315
|
|
316 class Heading : public Paragraph {
|
|
317 public:
|
|
318 Heading(size_t Level) : Level(Level) {}
|
|
319 void renderMarkdown(llvm::raw_ostream &OS) const override {
|
|
320 OS << std::string(Level, '#') << ' ';
|
|
321 Paragraph::renderMarkdown(OS);
|
|
322 }
|
|
323
|
|
324 private:
|
|
325 size_t Level;
|
|
326 };
|
|
327
|
|
328 } // namespace
|
|
329
|
|
330 std::string Block::asMarkdown() const {
|
|
331 std::string R;
|
|
332 llvm::raw_string_ostream OS(R);
|
|
333 renderMarkdown(OS);
|
|
334 return llvm::StringRef(OS.str()).trim().str();
|
|
335 }
|
|
336
|
|
337 std::string Block::asPlainText() const {
|
|
338 std::string R;
|
|
339 llvm::raw_string_ostream OS(R);
|
|
340 renderPlainText(OS);
|
|
341 return llvm::StringRef(OS.str()).trim().str();
|
|
342 }
|
|
343
|
|
344 void Paragraph::renderMarkdown(llvm::raw_ostream &OS) const {
|
|
345 bool NeedsSpace = false;
|
|
346 bool HasChunks = false;
|
|
347 for (auto &C : Chunks) {
|
|
348 if (C.SpaceBefore || NeedsSpace)
|
|
349 OS << " ";
|
|
350 switch (C.Kind) {
|
|
351 case Chunk::PlainText:
|
|
352 OS << renderText(C.Contents, !HasChunks);
|
|
353 break;
|
|
354 case Chunk::InlineCode:
|
|
355 OS << renderInlineBlock(C.Contents);
|
|
356 break;
|
|
357 }
|
|
358 HasChunks = true;
|
|
359 NeedsSpace = C.SpaceAfter;
|
|
360 }
|
|
361 // Paragraphs are translated into markdown lines, not markdown paragraphs.
|
|
362 // Therefore it only has a single linebreak afterwards.
|
|
363 // VSCode requires two spaces at the end of line to start a new one.
|
|
364 OS << " \n";
|
|
365 }
|
|
366
|
|
367 std::unique_ptr<Block> Paragraph::clone() const {
|
|
368 return std::make_unique<Paragraph>(*this);
|
|
369 }
|
|
370
|
|
371 /// Choose a marker to delimit `Text` from a prioritized list of options.
|
|
372 /// This is more readable than escaping for plain-text.
|
|
373 llvm::StringRef chooseMarker(llvm::ArrayRef<llvm::StringRef> Options,
|
|
374 llvm::StringRef Text) {
|
|
375 // Prefer a delimiter whose characters don't appear in the text.
|
|
376 for (llvm::StringRef S : Options)
|
|
377 if (Text.find_first_of(S) == llvm::StringRef::npos)
|
|
378 return S;
|
|
379 return Options.front();
|
|
380 }
|
|
381
|
|
382 void Paragraph::renderPlainText(llvm::raw_ostream &OS) const {
|
|
383 bool NeedsSpace = false;
|
|
384 for (auto &C : Chunks) {
|
|
385 if (C.SpaceBefore || NeedsSpace)
|
|
386 OS << " ";
|
|
387 llvm::StringRef Marker = "";
|
|
388 if (C.Preserve && C.Kind == Chunk::InlineCode)
|
|
389 Marker = chooseMarker({"`", "'", "\""}, C.Contents);
|
|
390 OS << Marker << C.Contents << Marker;
|
|
391 NeedsSpace = C.SpaceAfter;
|
|
392 }
|
|
393 OS << '\n';
|
|
394 }
|
|
395
|
236
|
// Defaulted special members, defined out of line in this file.
// NOTE(review): presumably they live here rather than in the header because
// BulletList holds a type that is incomplete at that point — confirm against
// Markup.h.
BulletList::BulletList() = default;
BulletList::~BulletList() = default;
|
|
398
|
173
|
399 void BulletList::renderMarkdown(llvm::raw_ostream &OS) const {
|
|
400 for (auto &D : Items) {
|
|
401 // Instead of doing this we might prefer passing Indent to children to get
|
|
402 // rid of the copies, if it turns out to be a bottleneck.
|
|
403 OS << "- " << indentLines(D.asMarkdown()) << '\n';
|
|
404 }
|
|
405 // We need a new line after list to terminate it in markdown.
|
|
406 OS << '\n';
|
|
407 }
|
|
408
|
|
409 void BulletList::renderPlainText(llvm::raw_ostream &OS) const {
|
|
410 for (auto &D : Items) {
|
|
411 // Instead of doing this we might prefer passing Indent to children to get
|
|
412 // rid of the copies, if it turns out to be a bottleneck.
|
|
413 OS << "- " << indentLines(D.asPlainText()) << '\n';
|
|
414 }
|
|
415 }
|
|
416
|
|
417 Paragraph &Paragraph::appendSpace() {
|
|
418 if (!Chunks.empty())
|
|
419 Chunks.back().SpaceAfter = true;
|
|
420 return *this;
|
|
421 }
|
|
422
|
|
423 Paragraph &Paragraph::appendText(llvm::StringRef Text) {
|
|
424 std::string Norm = canonicalizeSpaces(Text);
|
|
425 if (Norm.empty())
|
|
426 return *this;
|
|
427 Chunks.emplace_back();
|
|
428 Chunk &C = Chunks.back();
|
|
429 C.Contents = std::move(Norm);
|
|
430 C.Kind = Chunk::PlainText;
|
|
431 C.SpaceBefore = llvm::isSpace(Text.front());
|
|
432 C.SpaceAfter = llvm::isSpace(Text.back());
|
|
433 return *this;
|
|
434 }
|
|
435
|
|
436 Paragraph &Paragraph::appendCode(llvm::StringRef Code, bool Preserve) {
|
|
437 bool AdjacentCode =
|
|
438 !Chunks.empty() && Chunks.back().Kind == Chunk::InlineCode;
|
|
439 std::string Norm = canonicalizeSpaces(std::move(Code));
|
|
440 if (Norm.empty())
|
|
441 return *this;
|
|
442 Chunks.emplace_back();
|
|
443 Chunk &C = Chunks.back();
|
|
444 C.Contents = std::move(Norm);
|
|
445 C.Kind = Chunk::InlineCode;
|
|
446 C.Preserve = Preserve;
|
|
447 // Disallow adjacent code spans without spaces, markdown can't render them.
|
|
448 C.SpaceBefore = AdjacentCode;
|
|
449 return *this;
|
|
450 }
|
|
451
|
|
452 std::unique_ptr<Block> BulletList::clone() const {
|
|
453 return std::make_unique<BulletList>(*this);
|
|
454 }
|
|
455
|
|
456 class Document &BulletList::addItem() {
|
|
457 Items.emplace_back();
|
|
458 return Items.back();
|
|
459 }
|
|
460
|
|
461 Document &Document::operator=(const Document &Other) {
|
|
462 Children.clear();
|
|
463 for (const auto &C : Other.Children)
|
|
464 Children.push_back(C->clone());
|
|
465 return *this;
|
|
466 }
|
|
467
|
|
468 void Document::append(Document Other) {
|
|
469 std::move(Other.Children.begin(), Other.Children.end(),
|
|
470 std::back_inserter(Children));
|
|
471 }
|
|
472
|
|
473 Paragraph &Document::addParagraph() {
|
|
474 Children.push_back(std::make_unique<Paragraph>());
|
|
475 return *static_cast<Paragraph *>(Children.back().get());
|
|
476 }
|
|
477
|
|
478 void Document::addRuler() { Children.push_back(std::make_unique<Ruler>()); }
|
|
479
|
|
480 void Document::addCodeBlock(std::string Code, std::string Language) {
|
|
481 Children.emplace_back(
|
|
482 std::make_unique<CodeBlock>(std::move(Code), std::move(Language)));
|
|
483 }
|
|
484
|
|
485 std::string Document::asMarkdown() const {
|
|
486 return renderBlocks(Children, &Block::renderMarkdown);
|
|
487 }
|
|
488
|
|
489 std::string Document::asPlainText() const {
|
|
490 return renderBlocks(Children, &Block::renderPlainText);
|
|
491 }
|
|
492
|
|
493 BulletList &Document::addBulletList() {
|
|
494 Children.emplace_back(std::make_unique<BulletList>());
|
|
495 return *static_cast<BulletList *>(Children.back().get());
|
|
496 }
|
|
497
|
|
498 Paragraph &Document::addHeading(size_t Level) {
|
|
499 assert(Level > 0);
|
|
500 Children.emplace_back(std::make_unique<Heading>(Level));
|
|
501 return *static_cast<Paragraph *>(Children.back().get());
|
|
502 }
|
|
503 } // namespace markup
|
|
504 } // namespace clangd
|
|
505 } // namespace clang
|