CbC/CbC_llvm: clang/lib/Format/Encoding.h annotate

annotate clang/lib/Format/Encoding.h @ 176:de4ac79aef9d

...

author	Shinji KONO <kono@ie.u-ryukyu.ac.jp>
date	Mon, 25 May 2020 17:13:11 +0900
parents	1d019706d866
children

rev	line source
150 1d019706d866 LLVM10 anatofuz parents: diff changeset	1 //===--- Encoding.h - Format C++ code ---------------------------*- C++ -*-===//
1d019706d866 LLVM10 anatofuz parents: diff changeset	2 //
1d019706d866 LLVM10 anatofuz parents: diff changeset	3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
1d019706d866 LLVM10 anatofuz parents: diff changeset	4 // See https://llvm.org/LICENSE.txt for license information.
1d019706d866 LLVM10 anatofuz parents: diff changeset	5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
1d019706d866 LLVM10 anatofuz parents: diff changeset	6 //
1d019706d866 LLVM10 anatofuz parents: diff changeset	7 //===----------------------------------------------------------------------===//
1d019706d866 LLVM10 anatofuz parents: diff changeset	8 ///
1d019706d866 LLVM10 anatofuz parents: diff changeset	9 /// \file
1d019706d866 LLVM10 anatofuz parents: diff changeset	10 /// Contains functions for text encoding manipulation. Supports UTF-8,
1d019706d866 LLVM10 anatofuz parents: diff changeset	11 /// 8-bit encodings and escape sequences in C++ string literals.
1d019706d866 LLVM10 anatofuz parents: diff changeset	12 ///
1d019706d866 LLVM10 anatofuz parents: diff changeset	13 //===----------------------------------------------------------------------===//
1d019706d866 LLVM10 anatofuz parents: diff changeset	14
1d019706d866 LLVM10 anatofuz parents: diff changeset	15 #ifndef LLVM_CLANG_LIB_FORMAT_ENCODING_H
1d019706d866 LLVM10 anatofuz parents: diff changeset	16 #define LLVM_CLANG_LIB_FORMAT_ENCODING_H
1d019706d866 LLVM10 anatofuz parents: diff changeset	17
1d019706d866 LLVM10 anatofuz parents: diff changeset	18 #include "clang/Basic/LLVM.h"
1d019706d866 LLVM10 anatofuz parents: diff changeset	19 #include "llvm/ADT/StringRef.h"
1d019706d866 LLVM10 anatofuz parents: diff changeset	20 #include "llvm/Support/ConvertUTF.h"
1d019706d866 LLVM10 anatofuz parents: diff changeset	21 #include "llvm/Support/Unicode.h"
1d019706d866 LLVM10 anatofuz parents: diff changeset	22
1d019706d866 LLVM10 anatofuz parents: diff changeset	23 namespace clang {
1d019706d866 LLVM10 anatofuz parents: diff changeset	24 namespace format {
1d019706d866 LLVM10 anatofuz parents: diff changeset	25 namespace encoding {
1d019706d866 LLVM10 anatofuz parents: diff changeset	26
/// Text encodings distinguished by the formatter's encoding helpers.
enum Encoding {
  /// The text is handled as UTF-8.
  Encoding_UTF8,
  /// Anything else; we treat all other encodings as 8-bit encodings.
  Encoding_Unknown
};
1d019706d866 LLVM10 anatofuz parents: diff changeset	31
1d019706d866 LLVM10 anatofuz parents: diff changeset	32 /// Detects encoding of the Text. If the Text can be decoded using UTF-8,
1d019706d866 LLVM10 anatofuz parents: diff changeset	33 /// it is considered UTF8, otherwise we treat it as some 8-bit encoding.
1d019706d866 LLVM10 anatofuz parents: diff changeset	34 inline Encoding detectEncoding(StringRef Text) {
1d019706d866 LLVM10 anatofuz parents: diff changeset	35 const llvm::UTF8 Ptr = reinterpret_cast<const llvm::UTF8 >(Text.begin());
1d019706d866 LLVM10 anatofuz parents: diff changeset	36 const llvm::UTF8 BufEnd = reinterpret_cast<const llvm::UTF8 >(Text.end());
1d019706d866 LLVM10 anatofuz parents: diff changeset	37 if (llvm::isLegalUTF8String(&Ptr, BufEnd))
1d019706d866 LLVM10 anatofuz parents: diff changeset	38 return Encoding_UTF8;
1d019706d866 LLVM10 anatofuz parents: diff changeset	39 return Encoding_Unknown;
1d019706d866 LLVM10 anatofuz parents: diff changeset	40 }
1d019706d866 LLVM10 anatofuz parents: diff changeset	41
1d019706d866 LLVM10 anatofuz parents: diff changeset	42 /// Returns the number of columns required to display the \p Text on a
1d019706d866 LLVM10 anatofuz parents: diff changeset	43 /// generic Unicode-capable terminal. Text is assumed to use the specified
1d019706d866 LLVM10 anatofuz parents: diff changeset	44 /// \p Encoding.
1d019706d866 LLVM10 anatofuz parents: diff changeset	45 inline unsigned columnWidth(StringRef Text, Encoding Encoding) {
1d019706d866 LLVM10 anatofuz parents: diff changeset	46 if (Encoding == Encoding_UTF8) {
1d019706d866 LLVM10 anatofuz parents: diff changeset	47 int ContentWidth = llvm::sys::unicode::columnWidthUTF8(Text);
1d019706d866 LLVM10 anatofuz parents: diff changeset	48 // FIXME: Figure out the correct way to handle this in the presence of both
1d019706d866 LLVM10 anatofuz parents: diff changeset	49 // printable and unprintable multi-byte UTF-8 characters. Falling back to
1d019706d866 LLVM10 anatofuz parents: diff changeset	50 // returning the number of bytes may cause problems, as columnWidth suddenly
1d019706d866 LLVM10 anatofuz parents: diff changeset	51 // becomes non-additive.
1d019706d866 LLVM10 anatofuz parents: diff changeset	52 if (ContentWidth >= 0)
1d019706d866 LLVM10 anatofuz parents: diff changeset	53 return ContentWidth;
1d019706d866 LLVM10 anatofuz parents: diff changeset	54 }
1d019706d866 LLVM10 anatofuz parents: diff changeset	55 return Text.size();
1d019706d866 LLVM10 anatofuz parents: diff changeset	56 }
1d019706d866 LLVM10 anatofuz parents: diff changeset	57
1d019706d866 LLVM10 anatofuz parents: diff changeset	58 /// Returns the number of columns required to display the \p Text,
1d019706d866 LLVM10 anatofuz parents: diff changeset	59 /// starting from the \p StartColumn on a terminal with the \p TabWidth. The
1d019706d866 LLVM10 anatofuz parents: diff changeset	60 /// text is assumed to use the specified \p Encoding.
1d019706d866 LLVM10 anatofuz parents: diff changeset	61 inline unsigned columnWidthWithTabs(StringRef Text, unsigned StartColumn,
1d019706d866 LLVM10 anatofuz parents: diff changeset	62 unsigned TabWidth, Encoding Encoding) {
1d019706d866 LLVM10 anatofuz parents: diff changeset	63 unsigned TotalWidth = 0;
1d019706d866 LLVM10 anatofuz parents: diff changeset	64 StringRef Tail = Text;
1d019706d866 LLVM10 anatofuz parents: diff changeset	65 for (;;) {
1d019706d866 LLVM10 anatofuz parents: diff changeset	66 StringRef::size_type TabPos = Tail.find('\t');
1d019706d866 LLVM10 anatofuz parents: diff changeset	67 if (TabPos == StringRef::npos)
1d019706d866 LLVM10 anatofuz parents: diff changeset	68 return TotalWidth + columnWidth(Tail, Encoding);
1d019706d866 LLVM10 anatofuz parents: diff changeset	69 TotalWidth += columnWidth(Tail.substr(0, TabPos), Encoding);
1d019706d866 LLVM10 anatofuz parents: diff changeset	70 if (TabWidth)
1d019706d866 LLVM10 anatofuz parents: diff changeset	71 TotalWidth += TabWidth - (TotalWidth + StartColumn) % TabWidth;
1d019706d866 LLVM10 anatofuz parents: diff changeset	72 Tail = Tail.substr(TabPos + 1);
1d019706d866 LLVM10 anatofuz parents: diff changeset	73 }
1d019706d866 LLVM10 anatofuz parents: diff changeset	74 }
1d019706d866 LLVM10 anatofuz parents: diff changeset	75
1d019706d866 LLVM10 anatofuz parents: diff changeset	76 /// Gets the number of bytes in a sequence representing a single
1d019706d866 LLVM10 anatofuz parents: diff changeset	77 /// codepoint and starting with FirstChar in the specified Encoding.
1d019706d866 LLVM10 anatofuz parents: diff changeset	78 inline unsigned getCodePointNumBytes(char FirstChar, Encoding Encoding) {
1d019706d866 LLVM10 anatofuz parents: diff changeset	79 switch (Encoding) {
1d019706d866 LLVM10 anatofuz parents: diff changeset	80 case Encoding_UTF8:
1d019706d866 LLVM10 anatofuz parents: diff changeset	81 return llvm::getNumBytesForUTF8(FirstChar);
1d019706d866 LLVM10 anatofuz parents: diff changeset	82 default:
1d019706d866 LLVM10 anatofuz parents: diff changeset	83 return 1;
1d019706d866 LLVM10 anatofuz parents: diff changeset	84 }
1d019706d866 LLVM10 anatofuz parents: diff changeset	85 }
1d019706d866 LLVM10 anatofuz parents: diff changeset	86
/// Returns true if \p c is an ASCII octal digit ('0' through '7').
inline bool isOctDigit(char c) { return '0' <= c && c <= '7'; }
1d019706d866 LLVM10 anatofuz parents: diff changeset	88
/// Returns true if \p c is an ASCII hexadecimal digit: '0'-'9', 'a'-'f' or
/// 'A'-'F'.
inline bool isHexDigit(char c) {
  return ('0' <= c && c <= '9') || ('a' <= c && c <= 'f') ||
         ('A' <= c && c <= 'F');
}
1d019706d866 LLVM10 anatofuz parents: diff changeset	93
1d019706d866 LLVM10 anatofuz parents: diff changeset	94 /// Gets the length of an escape sequence inside a C++ string literal.
1d019706d866 LLVM10 anatofuz parents: diff changeset	95 /// Text should span from the beginning of the escape sequence (starting with a
1d019706d866 LLVM10 anatofuz parents: diff changeset	96 /// backslash) to the end of the string literal.
1d019706d866 LLVM10 anatofuz parents: diff changeset	97 inline unsigned getEscapeSequenceLength(StringRef Text) {
1d019706d866 LLVM10 anatofuz parents: diff changeset	98 assert(Text[0] == '\\');
1d019706d866 LLVM10 anatofuz parents: diff changeset	99 if (Text.size() < 2)
1d019706d866 LLVM10 anatofuz parents: diff changeset	100 return 1;
1d019706d866 LLVM10 anatofuz parents: diff changeset	101
1d019706d866 LLVM10 anatofuz parents: diff changeset	102 switch (Text[1]) {
1d019706d866 LLVM10 anatofuz parents: diff changeset	103 case 'u':
1d019706d866 LLVM10 anatofuz parents: diff changeset	104 return 6;
1d019706d866 LLVM10 anatofuz parents: diff changeset	105 case 'U':
1d019706d866 LLVM10 anatofuz parents: diff changeset	106 return 10;
1d019706d866 LLVM10 anatofuz parents: diff changeset	107 case 'x': {
1d019706d866 LLVM10 anatofuz parents: diff changeset	108 unsigned I = 2; // Point after '\x'.
1d019706d866 LLVM10 anatofuz parents: diff changeset	109 while (I < Text.size() && isHexDigit(Text[I]))
1d019706d866 LLVM10 anatofuz parents: diff changeset	110 ++I;
1d019706d866 LLVM10 anatofuz parents: diff changeset	111 return I;
1d019706d866 LLVM10 anatofuz parents: diff changeset	112 }
1d019706d866 LLVM10 anatofuz parents: diff changeset	113 default:
1d019706d866 LLVM10 anatofuz parents: diff changeset	114 if (isOctDigit(Text[1])) {
1d019706d866 LLVM10 anatofuz parents: diff changeset	115 unsigned I = 1;
1d019706d866 LLVM10 anatofuz parents: diff changeset	116 while (I < Text.size() && I < 4 && isOctDigit(Text[I]))
1d019706d866 LLVM10 anatofuz parents: diff changeset	117 ++I;
1d019706d866 LLVM10 anatofuz parents: diff changeset	118 return I;
1d019706d866 LLVM10 anatofuz parents: diff changeset	119 }
1d019706d866 LLVM10 anatofuz parents: diff changeset	120 return 1 + llvm::getNumBytesForUTF8(Text[1]);
1d019706d866 LLVM10 anatofuz parents: diff changeset	121 }
1d019706d866 LLVM10 anatofuz parents: diff changeset	122 }
1d019706d866 LLVM10 anatofuz parents: diff changeset	123
1d019706d866 LLVM10 anatofuz parents: diff changeset	124 } // namespace encoding
1d019706d866 LLVM10 anatofuz parents: diff changeset	125 } // namespace format
1d019706d866 LLVM10 anatofuz parents: diff changeset	126 } // namespace clang
1d019706d866 LLVM10 anatofuz parents: diff changeset	127
1d019706d866 LLVM10 anatofuz parents: diff changeset	128 #endif

Mercurial > hg > CbC > CbC_llvm

annotate clang/lib/Format/Encoding.h @ 176:de4ac79aef9d