Mercurial > hg > CbC > CbC_llvm
diff clang-tools-extra/clangd/support/Markup.cpp @ 173:0572611fdcc8 llvm10 llvm12
reorganization done
author | Shinji KONO <kono@ie.u-ryukyu.ac.jp> |
---|---|
date | Mon, 25 May 2020 11:55:54 +0900 |
parents | |
children | 2e18cbf3894f |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/clang-tools-extra/clangd/support/Markup.cpp Mon May 25 11:55:54 2020 +0900 @@ -0,0 +1,504 @@ +//===--- Markup.cpp -----------------------------------------*- C++-*------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +#include "support/Markup.h" +#include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/StringExtras.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/Support/Compiler.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/FormatVariadic.h" +#include "llvm/Support/raw_ostream.h" +#include <cstddef> +#include <iterator> +#include <memory> +#include <string> +#include <vector> + +namespace clang { +namespace clangd { +namespace markup { +namespace { + +// Is <contents a plausible start to an HTML tag? +// Contents may not be the rest of the line, but it's the rest of the plain +// text, so we expect to see at least the tag name. +bool looksLikeTag(llvm::StringRef Contents) { + if (Contents.empty()) + return false; + if (Contents.front() == '!' || Contents.front() == '?' || + Contents.front() == '/') + return true; + // Check the start of the tag name. + if (!llvm::isAlpha(Contents.front())) + return false; + // Drop rest of the tag name, and following whitespace. + Contents = Contents + .drop_while([](char C) { + return llvm::isAlnum(C) || C == '-' || C == '_' || C == ':'; + }) + .drop_while(llvm::isSpace); + // The rest of the tag consists of attributes, which have restrictive names. + // If we hit '=', all bets are off (attribute values can contain anything). 
+ for (; !Contents.empty(); Contents = Contents.drop_front()) { + if (llvm::isAlnum(Contents.front()) || llvm::isSpace(Contents.front())) + continue; + if (Contents.front() == '>' || Contents.startswith("/>")) + return true; // May close the tag. + if (Contents.front() == '=') + return true; // Don't try to parse attribute values. + return false; // Random punctuation means this isn't a tag. + } + return true; // Potentially incomplete tag. +} + +// Tests whether C should be backslash-escaped in markdown. +// The string being escaped is Before + C + After. This is part of a paragraph. +// StartsLine indicates whether `Before` is the start of the line. +// After may not be everything until the end of the line. +// +// It's always safe to escape punctuation, but want minimal escaping. +// The strategy is to escape the first character of anything that might start +// a markdown grammar construct. +bool needsLeadingEscape(char C, llvm::StringRef Before, llvm::StringRef After, + bool StartsLine) { + assert(Before.take_while(llvm::isSpace).empty()); + auto RulerLength = [&]() -> /*Length*/ unsigned { + if (!StartsLine || !Before.empty()) + return false; + llvm::StringRef A = After.rtrim(); + return llvm::all_of(A, [C](char D) { return C == D; }) ? 1 + A.size() : 0; + }; + auto IsBullet = [&]() { + return StartsLine && Before.empty() && + (After.empty() || After.startswith(" ")); + }; + auto SpaceSurrounds = [&]() { + return (After.empty() || llvm::isSpace(After.front())) && + (Before.empty() || llvm::isSpace(Before.back())); + }; + auto WordSurrounds = [&]() { + return (!After.empty() && llvm::isAlnum(After.front())) && + (!Before.empty() && llvm::isAlnum(Before.back())); + }; + + switch (C) { + case '\\': // Escaped character. + return true; + case '`': // Code block or inline code + // Any number of backticks can delimit an inline code block that can end + // anywhere (including on another line). We must escape them all. 
+ return true; + case '~': // Code block + return StartsLine && Before.empty() && After.startswith("~~"); + case '#': { // ATX heading. + if (!StartsLine || !Before.empty()) + return false; + llvm::StringRef Rest = After.ltrim(C); + return Rest.empty() || Rest.startswith(" "); + } + case ']': // Link or link reference. + // We escape ] rather than [ here, because it's more constrained: + // ](...) is an in-line link + // ]: is a link reference + // The following are only links if the link reference exists: + // ] by itself is a shortcut link + // ][...] is an out-of-line link + // Because we never emit link references, we don't need to handle these. + return After.startswith(":") || After.startswith("("); + case '=': // Setex heading. + return RulerLength() > 0; + case '_': // Horizontal ruler or matched delimiter. + if (RulerLength() >= 3) + return true; + // Not a delimiter if surrounded by space, or inside a word. + // (The rules at word boundaries are subtle). + return !(SpaceSurrounds() || WordSurrounds()); + case '-': // Setex heading, horizontal ruler, or bullet. + if (RulerLength() > 0) + return true; + return IsBullet(); + case '+': // Bullet list. + return IsBullet(); + case '*': // Bullet list, horizontal ruler, or delimiter. + return IsBullet() || RulerLength() >= 3 || !SpaceSurrounds(); + case '<': // HTML tag (or autolink, which we choose not to escape) + return looksLikeTag(After); + case '>': // Quote marker. Needs escaping at start of line. + return StartsLine && Before.empty(); + case '&': { // HTML entity reference + auto End = After.find(';'); + if (End == llvm::StringRef::npos) + return false; + llvm::StringRef Content = After.substr(0, End); + if (Content.consume_front("#")) { + if (Content.consume_front("x") || Content.consume_front("X")) + return llvm::all_of(Content, llvm::isHexDigit); + return llvm::all_of(Content, llvm::isDigit); + } + return llvm::all_of(Content, llvm::isAlpha); + } + case '.': // Numbered list indicator. Escape 12. 
-> 12\. at start of line. + case ')': + return StartsLine && !Before.empty() && + llvm::all_of(Before, llvm::isDigit) && After.startswith(" "); + default: + return false; + } +} + +/// Escape a markdown text block. Ensures the punctuation will not introduce +/// any of the markdown constructs. +std::string renderText(llvm::StringRef Input, bool StartsLine) { + std::string R; + for (unsigned I = 0; I < Input.size(); ++I) { + if (needsLeadingEscape(Input[I], Input.substr(0, I), Input.substr(I + 1), + StartsLine)) + R.push_back('\\'); + R.push_back(Input[I]); + } + return R; +} + +/// Renders \p Input as an inline block of code in markdown. The returned value +/// is surrounded by backticks and the inner contents are properly escaped. +std::string renderInlineBlock(llvm::StringRef Input) { + std::string R; + // Double all backticks to make sure we don't close the inline block early. + for (size_t From = 0; From < Input.size();) { + size_t Next = Input.find("`", From); + R += Input.substr(From, Next - From); + if (Next == llvm::StringRef::npos) + break; + R += "``"; // double the found backtick. + + From = Next + 1; + } + // If results starts with a backtick, add spaces on both sides. The spaces + // are ignored by markdown renderers. + if (llvm::StringRef(R).startswith("`") || llvm::StringRef(R).endswith("`")) + return "` " + std::move(R) + " `"; + // Markdown render should ignore first and last space if both are there. We + // add an extra pair of spaces in that case to make sure we render what the + // user intended. + if (llvm::StringRef(R).startswith(" ") && llvm::StringRef(R).endswith(" ")) + return "` " + std::move(R) + " `"; + return "`" + std::move(R) + "`"; +} + +/// Get marker required for \p Input to represent a markdown codeblock. It +/// consists of at least 3 backticks(`). Although markdown also allows to use +/// tilde(~) for code blocks, they are never used. 
+std::string getMarkerForCodeBlock(llvm::StringRef Input) { + // Count the maximum number of consecutive backticks in \p Input. We need to + // start and end the code block with more. + unsigned MaxBackticks = 0; + unsigned Backticks = 0; + for (char C : Input) { + if (C == '`') { + ++Backticks; + continue; + } + MaxBackticks = std::max(MaxBackticks, Backticks); + Backticks = 0; + } + MaxBackticks = std::max(Backticks, MaxBackticks); + // Use the corresponding number of backticks to start and end a code block. + return std::string(/*Repeat=*/std::max(3u, MaxBackticks + 1), '`'); +} + +// Trims the input and concatenates whitespace blocks into a single ` `. +std::string canonicalizeSpaces(llvm::StringRef Input) { + llvm::SmallVector<llvm::StringRef, 4> Words; + llvm::SplitString(Input, Words); + return llvm::join(Words, " "); +} + +std::string renderBlocks(llvm::ArrayRef<std::unique_ptr<Block>> Children, + void (Block::*RenderFunc)(llvm::raw_ostream &) const) { + std::string R; + llvm::raw_string_ostream OS(R); + + // Trim rulers. + Children = Children.drop_while( + [](const std::unique_ptr<Block> &C) { return C->isRuler(); }); + auto Last = llvm::find_if( + llvm::reverse(Children), + [](const std::unique_ptr<Block> &C) { return !C->isRuler(); }); + Children = Children.drop_back(Children.end() - Last.base()); + + bool LastBlockWasRuler = true; + for (const auto &C : Children) { + if (C->isRuler() && LastBlockWasRuler) + continue; + LastBlockWasRuler = C->isRuler(); + ((*C).*RenderFunc)(OS); + } + + // Get rid of redundant empty lines introduced in plaintext while imitating + // padding in markdown. + std::string AdjustedResult; + llvm::StringRef TrimmedText(OS.str()); + TrimmedText = TrimmedText.trim(); + + llvm::copy_if(TrimmedText, std::back_inserter(AdjustedResult), + [&TrimmedText](const char &C) { + return !llvm::StringRef(TrimmedText.data(), + &C - TrimmedText.data() + 1) + // We allow at most two newlines. 
+ .endswith("\n\n\n"); + }); + + return AdjustedResult; +} + +// Separates two blocks with extra spacing. Note that it might render strangely +// in vscode if the trailing block is a codeblock, see +// https://github.com/microsoft/vscode/issues/88416 for details. +class Ruler : public Block { +public: + void renderMarkdown(llvm::raw_ostream &OS) const override { + // Note that we need an extra new line before the ruler, otherwise we might + // make previous block a title instead of introducing a ruler. + OS << "\n---\n"; + } + void renderPlainText(llvm::raw_ostream &OS) const override { OS << '\n'; } + std::unique_ptr<Block> clone() const override { + return std::make_unique<Ruler>(*this); + } + bool isRuler() const override { return true; } +}; + +class CodeBlock : public Block { +public: + void renderMarkdown(llvm::raw_ostream &OS) const override { + std::string Marker = getMarkerForCodeBlock(Contents); + // No need to pad from previous blocks, as they should end with a new line. + OS << Marker << Language << '\n' << Contents << '\n' << Marker << '\n'; + } + + void renderPlainText(llvm::raw_ostream &OS) const override { + // In plaintext we want one empty line before and after codeblocks. + OS << '\n' << Contents << "\n\n"; + } + + std::unique_ptr<Block> clone() const override { + return std::make_unique<CodeBlock>(*this); + } + + CodeBlock(std::string Contents, std::string Language) + : Contents(std::move(Contents)), Language(std::move(Language)) {} + +private: + std::string Contents; + std::string Language; +}; + +// Inserts two spaces after each `\n` to indent each line. First line is not +// indented. +std::string indentLines(llvm::StringRef Input) { + assert(!Input.endswith("\n") && "Input should've been trimmed."); + std::string IndentedR; + // We'll add 2 spaces after each new line. 
+ IndentedR.reserve(Input.size() + Input.count('\n') * 2); + for (char C : Input) { + IndentedR += C; + if (C == '\n') + IndentedR.append(" "); + } + return IndentedR; +} + +class Heading : public Paragraph { +public: + Heading(size_t Level) : Level(Level) {} + void renderMarkdown(llvm::raw_ostream &OS) const override { + OS << std::string(Level, '#') << ' '; + Paragraph::renderMarkdown(OS); + } + +private: + size_t Level; +}; + +} // namespace + +std::string Block::asMarkdown() const { + std::string R; + llvm::raw_string_ostream OS(R); + renderMarkdown(OS); + return llvm::StringRef(OS.str()).trim().str(); +} + +std::string Block::asPlainText() const { + std::string R; + llvm::raw_string_ostream OS(R); + renderPlainText(OS); + return llvm::StringRef(OS.str()).trim().str(); +} + +void Paragraph::renderMarkdown(llvm::raw_ostream &OS) const { + bool NeedsSpace = false; + bool HasChunks = false; + for (auto &C : Chunks) { + if (C.SpaceBefore || NeedsSpace) + OS << " "; + switch (C.Kind) { + case Chunk::PlainText: + OS << renderText(C.Contents, !HasChunks); + break; + case Chunk::InlineCode: + OS << renderInlineBlock(C.Contents); + break; + } + HasChunks = true; + NeedsSpace = C.SpaceAfter; + } + // Paragraphs are translated into markdown lines, not markdown paragraphs. + // Therefore it only has a single linebreak afterwards. + // VSCode requires two spaces at the end of line to start a new one. + OS << " \n"; +} + +std::unique_ptr<Block> Paragraph::clone() const { + return std::make_unique<Paragraph>(*this); +} + +/// Choose a marker to delimit `Text` from a prioritized list of options. +/// This is more readable than escaping for plain-text. +llvm::StringRef chooseMarker(llvm::ArrayRef<llvm::StringRef> Options, + llvm::StringRef Text) { + // Prefer a delimiter whose characters don't appear in the text. 
+ for (llvm::StringRef S : Options) + if (Text.find_first_of(S) == llvm::StringRef::npos) + return S; + return Options.front(); +} + +void Paragraph::renderPlainText(llvm::raw_ostream &OS) const { + bool NeedsSpace = false; + for (auto &C : Chunks) { + if (C.SpaceBefore || NeedsSpace) + OS << " "; + llvm::StringRef Marker = ""; + if (C.Preserve && C.Kind == Chunk::InlineCode) + Marker = chooseMarker({"`", "'", "\""}, C.Contents); + OS << Marker << C.Contents << Marker; + NeedsSpace = C.SpaceAfter; + } + OS << '\n'; +} + +void BulletList::renderMarkdown(llvm::raw_ostream &OS) const { + for (auto &D : Items) { + // Instead of doing this we might prefer passing Indent to children to get + // rid of the copies, if it turns out to be a bottleneck. + OS << "- " << indentLines(D.asMarkdown()) << '\n'; + } + // We need a new line after list to terminate it in markdown. + OS << '\n'; +} + +void BulletList::renderPlainText(llvm::raw_ostream &OS) const { + for (auto &D : Items) { + // Instead of doing this we might prefer passing Indent to children to get + // rid of the copies, if it turns out to be a bottleneck. 
+ OS << "- " << indentLines(D.asPlainText()) << '\n'; + } +} + +Paragraph &Paragraph::appendSpace() { + if (!Chunks.empty()) + Chunks.back().SpaceAfter = true; + return *this; +} + +Paragraph &Paragraph::appendText(llvm::StringRef Text) { + std::string Norm = canonicalizeSpaces(Text); + if (Norm.empty()) + return *this; + Chunks.emplace_back(); + Chunk &C = Chunks.back(); + C.Contents = std::move(Norm); + C.Kind = Chunk::PlainText; + C.SpaceBefore = llvm::isSpace(Text.front()); + C.SpaceAfter = llvm::isSpace(Text.back()); + return *this; +} + +Paragraph &Paragraph::appendCode(llvm::StringRef Code, bool Preserve) { + bool AdjacentCode = + !Chunks.empty() && Chunks.back().Kind == Chunk::InlineCode; + std::string Norm = canonicalizeSpaces(std::move(Code)); + if (Norm.empty()) + return *this; + Chunks.emplace_back(); + Chunk &C = Chunks.back(); + C.Contents = std::move(Norm); + C.Kind = Chunk::InlineCode; + C.Preserve = Preserve; + // Disallow adjacent code spans without spaces, markdown can't render them. 
+ C.SpaceBefore = AdjacentCode; + return *this; +} + +std::unique_ptr<Block> BulletList::clone() const { + return std::make_unique<BulletList>(*this); +} + +class Document &BulletList::addItem() { + Items.emplace_back(); + return Items.back(); +} + +Document &Document::operator=(const Document &Other) { + Children.clear(); + for (const auto &C : Other.Children) + Children.push_back(C->clone()); + return *this; +} + +void Document::append(Document Other) { + std::move(Other.Children.begin(), Other.Children.end(), + std::back_inserter(Children)); +} + +Paragraph &Document::addParagraph() { + Children.push_back(std::make_unique<Paragraph>()); + return *static_cast<Paragraph *>(Children.back().get()); +} + +void Document::addRuler() { Children.push_back(std::make_unique<Ruler>()); } + +void Document::addCodeBlock(std::string Code, std::string Language) { + Children.emplace_back( + std::make_unique<CodeBlock>(std::move(Code), std::move(Language))); +} + +std::string Document::asMarkdown() const { + return renderBlocks(Children, &Block::renderMarkdown); +} + +std::string Document::asPlainText() const { + return renderBlocks(Children, &Block::renderPlainText); +} + +BulletList &Document::addBulletList() { + Children.emplace_back(std::make_unique<BulletList>()); + return *static_cast<BulletList *>(Children.back().get()); +} + +Paragraph &Document::addHeading(size_t Level) { + assert(Level > 0); + Children.emplace_back(std::make_unique<Heading>(Level)); + return *static_cast<Paragraph *>(Children.back().get()); +} +} // namespace markup +} // namespace clangd +} // namespace clang