Mercurial > hg > CbC > CbC_llvm
comparison lib/Support/YAMLParser.cpp @ 0:95c75e76d11b LLVM3.4
LLVM 3.4
author | Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp> |
---|---|
date | Thu, 12 Dec 2013 13:56:28 +0900 |
parents | |
children | 54457678186b |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:95c75e76d11b |
---|---|
1 //===--- YAMLParser.cpp - Simple YAML parser ------------------------------===// | |
2 // | |
3 // The LLVM Compiler Infrastructure | |
4 // | |
5 // This file is distributed under the University of Illinois Open Source | |
6 // License. See LICENSE.TXT for details. | |
7 // | |
8 //===----------------------------------------------------------------------===// | |
9 // | |
10 // This file implements a YAML parser. | |
11 // | |
12 //===----------------------------------------------------------------------===// | |
13 | |
14 #include "llvm/Support/YAMLParser.h" | |
15 #include "llvm/ADT/SmallVector.h" | |
16 #include "llvm/ADT/StringExtras.h" | |
17 #include "llvm/ADT/Twine.h" | |
18 #include "llvm/ADT/ilist.h" | |
19 #include "llvm/ADT/ilist_node.h" | |
20 #include "llvm/Support/ErrorHandling.h" | |
21 #include "llvm/Support/MemoryBuffer.h" | |
22 #include "llvm/Support/SourceMgr.h" | |
23 #include "llvm/Support/raw_ostream.h" | |
24 | |
25 using namespace llvm; | |
26 using namespace yaml; | |
27 | |
/// The Unicode encoding forms that getUnicodeEncoding can report for an
/// input buffer.
enum UnicodeEncodingForm {
  UEF_UTF32_LE, ///< UTF-32, little-endian byte order.
  UEF_UTF32_BE, ///< UTF-32, big-endian byte order.
  UEF_UTF16_LE, ///< UTF-16, little-endian byte order.
  UEF_UTF16_BE, ///< UTF-16, big-endian byte order.
  UEF_UTF8,     ///< UTF-8 (plain ASCII included).
  UEF_Unknown   ///< No valid Unicode encoding could be identified.
};
36 | |
37 /// EncodingInfo - Holds the encoding type and length of the byte order mark if | |
38 /// it exists. Length is in {0, 2, 3, 4}. | |
39 typedef std::pair<UnicodeEncodingForm, unsigned> EncodingInfo; | |
40 | |
41 /// getUnicodeEncoding - Reads up to the first 4 bytes to determine the Unicode | |
42 /// encoding form of \a Input. | |
43 /// | |
44 /// @param Input A string of length 0 or more. | |
45 /// @returns An EncodingInfo indicating the Unicode encoding form of the input | |
46 /// and how long the byte order mark is if one exists. | |
47 static EncodingInfo getUnicodeEncoding(StringRef Input) { | |
48 if (Input.size() == 0) | |
49 return std::make_pair(UEF_Unknown, 0); | |
50 | |
51 switch (uint8_t(Input[0])) { | |
52 case 0x00: | |
53 if (Input.size() >= 4) { | |
54 if ( Input[1] == 0 | |
55 && uint8_t(Input[2]) == 0xFE | |
56 && uint8_t(Input[3]) == 0xFF) | |
57 return std::make_pair(UEF_UTF32_BE, 4); | |
58 if (Input[1] == 0 && Input[2] == 0 && Input[3] != 0) | |
59 return std::make_pair(UEF_UTF32_BE, 0); | |
60 } | |
61 | |
62 if (Input.size() >= 2 && Input[1] != 0) | |
63 return std::make_pair(UEF_UTF16_BE, 0); | |
64 return std::make_pair(UEF_Unknown, 0); | |
65 case 0xFF: | |
66 if ( Input.size() >= 4 | |
67 && uint8_t(Input[1]) == 0xFE | |
68 && Input[2] == 0 | |
69 && Input[3] == 0) | |
70 return std::make_pair(UEF_UTF32_LE, 4); | |
71 | |
72 if (Input.size() >= 2 && uint8_t(Input[1]) == 0xFE) | |
73 return std::make_pair(UEF_UTF16_LE, 2); | |
74 return std::make_pair(UEF_Unknown, 0); | |
75 case 0xFE: | |
76 if (Input.size() >= 2 && uint8_t(Input[1]) == 0xFF) | |
77 return std::make_pair(UEF_UTF16_BE, 2); | |
78 return std::make_pair(UEF_Unknown, 0); | |
79 case 0xEF: | |
80 if ( Input.size() >= 3 | |
81 && uint8_t(Input[1]) == 0xBB | |
82 && uint8_t(Input[2]) == 0xBF) | |
83 return std::make_pair(UEF_UTF8, 3); | |
84 return std::make_pair(UEF_Unknown, 0); | |
85 } | |
86 | |
87 // It could still be utf-32 or utf-16. | |
88 if (Input.size() >= 4 && Input[1] == 0 && Input[2] == 0 && Input[3] == 0) | |
89 return std::make_pair(UEF_UTF32_LE, 0); | |
90 | |
91 if (Input.size() >= 2 && Input[1] == 0) | |
92 return std::make_pair(UEF_UTF16_LE, 0); | |
93 | |
94 return std::make_pair(UEF_UTF8, 0); | |
95 } | |
96 | |
97 namespace llvm { | |
98 namespace yaml { | |
99 /// Pin the vtables to this file. | |
100 void Node::anchor() {} | |
101 void NullNode::anchor() {} | |
102 void ScalarNode::anchor() {} | |
103 void KeyValueNode::anchor() {} | |
104 void MappingNode::anchor() {} | |
105 void SequenceNode::anchor() {} | |
106 void AliasNode::anchor() {} | |
107 | |
108 /// Token - A single YAML token. | |
109 struct Token : ilist_node<Token> { | |
110 enum TokenKind { | |
111 TK_Error, // Uninitialized token. | |
112 TK_StreamStart, | |
113 TK_StreamEnd, | |
114 TK_VersionDirective, | |
115 TK_TagDirective, | |
116 TK_DocumentStart, | |
117 TK_DocumentEnd, | |
118 TK_BlockEntry, | |
119 TK_BlockEnd, | |
120 TK_BlockSequenceStart, | |
121 TK_BlockMappingStart, | |
122 TK_FlowEntry, | |
123 TK_FlowSequenceStart, | |
124 TK_FlowSequenceEnd, | |
125 TK_FlowMappingStart, | |
126 TK_FlowMappingEnd, | |
127 TK_Key, | |
128 TK_Value, | |
129 TK_Scalar, | |
130 TK_Alias, | |
131 TK_Anchor, | |
132 TK_Tag | |
133 } Kind; | |
134 | |
135 /// A string of length 0 or more whose begin() points to the logical location | |
136 /// of the token in the input. | |
137 StringRef Range; | |
138 | |
139 Token() : Kind(TK_Error) {} | |
140 }; | |
141 } | |
142 } | |
143 | |
144 namespace llvm { | |
145 template<> | |
146 struct ilist_sentinel_traits<Token> { | |
147 Token *createSentinel() const { | |
148 return &Sentinel; | |
149 } | |
150 static void destroySentinel(Token*) {} | |
151 | |
152 Token *provideInitialHead() const { return createSentinel(); } | |
153 Token *ensureHead(Token*) const { return createSentinel(); } | |
154 static void noteHead(Token*, Token*) {} | |
155 | |
156 private: | |
157 mutable Token Sentinel; | |
158 }; | |
159 | |
160 template<> | |
161 struct ilist_node_traits<Token> { | |
162 Token *createNode(const Token &V) { | |
163 return new (Alloc.Allocate<Token>()) Token(V); | |
164 } | |
165 static void deleteNode(Token *V) {} | |
166 | |
167 void addNodeToList(Token *) {} | |
168 void removeNodeFromList(Token *) {} | |
169 void transferNodesFromList(ilist_node_traits & /*SrcTraits*/, | |
170 ilist_iterator<Token> /*first*/, | |
171 ilist_iterator<Token> /*last*/) {} | |
172 | |
173 BumpPtrAllocator Alloc; | |
174 }; | |
175 } | |
176 | |
177 typedef ilist<Token> TokenQueueT; | |
178 | |
179 namespace { | |
180 /// @brief This struct is used to track simple keys. | |
181 /// | |
182 /// Simple keys are handled by creating an entry in SimpleKeys for each Token | |
183 /// which could legally be the start of a simple key. When peekNext is called, | |
184 /// if the Token To be returned is referenced by a SimpleKey, we continue | |
185 /// tokenizing until that potential simple key has either been found to not be | |
186 /// a simple key (we moved on to the next line or went further than 1024 chars). | |
187 /// Or when we run into a Value, and then insert a Key token (and possibly | |
188 /// others) before the SimpleKey's Tok. | |
189 struct SimpleKey { | |
190 TokenQueueT::iterator Tok; | |
191 unsigned Column; | |
192 unsigned Line; | |
193 unsigned FlowLevel; | |
194 bool IsRequired; | |
195 | |
196 bool operator ==(const SimpleKey &Other) { | |
197 return Tok == Other.Tok; | |
198 } | |
199 }; | |
200 } | |
201 | |
/// @brief Result of decoding one UTF-8 minimal well-formed code unit
///        subsequence: the Unicode scalar value (first) and the number of
///        code units (uint8_t) it occupied (second).
///        A length of 0 represents an error.
typedef std::pair<uint32_t, unsigned> UTF8Decoded;
206 | |
207 static UTF8Decoded decodeUTF8(StringRef Range) { | |
208 StringRef::iterator Position= Range.begin(); | |
209 StringRef::iterator End = Range.end(); | |
210 // 1 byte: [0x00, 0x7f] | |
211 // Bit pattern: 0xxxxxxx | |
212 if ((*Position & 0x80) == 0) { | |
213 return std::make_pair(*Position, 1); | |
214 } | |
215 // 2 bytes: [0x80, 0x7ff] | |
216 // Bit pattern: 110xxxxx 10xxxxxx | |
217 if (Position + 1 != End && | |
218 ((*Position & 0xE0) == 0xC0) && | |
219 ((*(Position + 1) & 0xC0) == 0x80)) { | |
220 uint32_t codepoint = ((*Position & 0x1F) << 6) | | |
221 (*(Position + 1) & 0x3F); | |
222 if (codepoint >= 0x80) | |
223 return std::make_pair(codepoint, 2); | |
224 } | |
225 // 3 bytes: [0x8000, 0xffff] | |
226 // Bit pattern: 1110xxxx 10xxxxxx 10xxxxxx | |
227 if (Position + 2 != End && | |
228 ((*Position & 0xF0) == 0xE0) && | |
229 ((*(Position + 1) & 0xC0) == 0x80) && | |
230 ((*(Position + 2) & 0xC0) == 0x80)) { | |
231 uint32_t codepoint = ((*Position & 0x0F) << 12) | | |
232 ((*(Position + 1) & 0x3F) << 6) | | |
233 (*(Position + 2) & 0x3F); | |
234 // Codepoints between 0xD800 and 0xDFFF are invalid, as | |
235 // they are high / low surrogate halves used by UTF-16. | |
236 if (codepoint >= 0x800 && | |
237 (codepoint < 0xD800 || codepoint > 0xDFFF)) | |
238 return std::make_pair(codepoint, 3); | |
239 } | |
240 // 4 bytes: [0x10000, 0x10FFFF] | |
241 // Bit pattern: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx | |
242 if (Position + 3 != End && | |
243 ((*Position & 0xF8) == 0xF0) && | |
244 ((*(Position + 1) & 0xC0) == 0x80) && | |
245 ((*(Position + 2) & 0xC0) == 0x80) && | |
246 ((*(Position + 3) & 0xC0) == 0x80)) { | |
247 uint32_t codepoint = ((*Position & 0x07) << 18) | | |
248 ((*(Position + 1) & 0x3F) << 12) | | |
249 ((*(Position + 2) & 0x3F) << 6) | | |
250 (*(Position + 3) & 0x3F); | |
251 if (codepoint >= 0x10000 && codepoint <= 0x10FFFF) | |
252 return std::make_pair(codepoint, 4); | |
253 } | |
254 return std::make_pair(0, 0); | |
255 } | |
256 | |
257 namespace llvm { | |
258 namespace yaml { | |
259 /// @brief Scans YAML tokens from a MemoryBuffer. | |
260 class Scanner { | |
261 public: | |
262 Scanner(const StringRef Input, SourceMgr &SM); | |
263 Scanner(MemoryBuffer *Buffer, SourceMgr &SM_); | |
264 | |
265 /// @brief Parse the next token and return it without popping it. | |
266 Token &peekNext(); | |
267 | |
268 /// @brief Parse the next token and pop it from the queue. | |
269 Token getNext(); | |
270 | |
271 void printError(SMLoc Loc, SourceMgr::DiagKind Kind, const Twine &Message, | |
272 ArrayRef<SMRange> Ranges = None) { | |
273 SM.PrintMessage(Loc, Kind, Message, Ranges); | |
274 } | |
275 | |
276 void setError(const Twine &Message, StringRef::iterator Position) { | |
277 if (Current >= End) | |
278 Current = End - 1; | |
279 | |
280 // Don't print out more errors after the first one we encounter. The rest | |
281 // are just the result of the first, and have no meaning. | |
282 if (!Failed) | |
283 printError(SMLoc::getFromPointer(Current), SourceMgr::DK_Error, Message); | |
284 Failed = true; | |
285 } | |
286 | |
287 void setError(const Twine &Message) { | |
288 setError(Message, Current); | |
289 } | |
290 | |
291 /// @brief Returns true if an error occurred while parsing. | |
292 bool failed() { | |
293 return Failed; | |
294 } | |
295 | |
296 private: | |
297 StringRef currentInput() { | |
298 return StringRef(Current, End - Current); | |
299 } | |
300 | |
301 /// @brief Decode a UTF-8 minimal well-formed code unit subsequence starting | |
302 /// at \a Position. | |
303 /// | |
304 /// If the UTF-8 code units starting at Position do not form a well-formed | |
305 /// code unit subsequence, then the Unicode scalar value is 0, and the length | |
306 /// is 0. | |
307 UTF8Decoded decodeUTF8(StringRef::iterator Position) { | |
308 return ::decodeUTF8(StringRef(Position, End - Position)); | |
309 } | |
310 | |
311 // The following functions are based on the gramar rules in the YAML spec. The | |
312 // style of the function names it meant to closely match how they are written | |
313 // in the spec. The number within the [] is the number of the grammar rule in | |
314 // the spec. | |
315 // | |
316 // See 4.2 [Production Naming Conventions] for the meaning of the prefixes. | |
317 // | |
318 // c- | |
319 // A production starting and ending with a special character. | |
320 // b- | |
321 // A production matching a single line break. | |
322 // nb- | |
323 // A production starting and ending with a non-break character. | |
324 // s- | |
325 // A production starting and ending with a white space character. | |
326 // ns- | |
327 // A production starting and ending with a non-space character. | |
328 // l- | |
329 // A production matching complete line(s). | |
330 | |
331 /// @brief Skip a single nb-char[27] starting at Position. | |
332 /// | |
333 /// A nb-char is 0x9 | [0x20-0x7E] | 0x85 | [0xA0-0xD7FF] | [0xE000-0xFEFE] | |
334 /// | [0xFF00-0xFFFD] | [0x10000-0x10FFFF] | |
335 /// | |
336 /// @returns The code unit after the nb-char, or Position if it's not an | |
337 /// nb-char. | |
338 StringRef::iterator skip_nb_char(StringRef::iterator Position); | |
339 | |
340 /// @brief Skip a single b-break[28] starting at Position. | |
341 /// | |
342 /// A b-break is 0xD 0xA | 0xD | 0xA | |
343 /// | |
344 /// @returns The code unit after the b-break, or Position if it's not a | |
345 /// b-break. | |
346 StringRef::iterator skip_b_break(StringRef::iterator Position); | |
347 | |
348 /// @brief Skip a single s-white[33] starting at Position. | |
349 /// | |
350 /// A s-white is 0x20 | 0x9 | |
351 /// | |
352 /// @returns The code unit after the s-white, or Position if it's not a | |
353 /// s-white. | |
354 StringRef::iterator skip_s_white(StringRef::iterator Position); | |
355 | |
356 /// @brief Skip a single ns-char[34] starting at Position. | |
357 /// | |
358 /// A ns-char is nb-char - s-white | |
359 /// | |
360 /// @returns The code unit after the ns-char, or Position if it's not a | |
361 /// ns-char. | |
362 StringRef::iterator skip_ns_char(StringRef::iterator Position); | |
363 | |
364 typedef StringRef::iterator (Scanner::*SkipWhileFunc)(StringRef::iterator); | |
365 /// @brief Skip minimal well-formed code unit subsequences until Func | |
366 /// returns its input. | |
367 /// | |
368 /// @returns The code unit after the last minimal well-formed code unit | |
369 /// subsequence that Func accepted. | |
370 StringRef::iterator skip_while( SkipWhileFunc Func | |
371 , StringRef::iterator Position); | |
372 | |
373 /// @brief Scan ns-uri-char[39]s starting at Cur. | |
374 /// | |
375 /// This updates Cur and Column while scanning. | |
376 /// | |
377 /// @returns A StringRef starting at Cur which covers the longest contiguous | |
378 /// sequence of ns-uri-char. | |
379 StringRef scan_ns_uri_char(); | |
380 | |
381 /// @brief Scan ns-plain-one-line[133] starting at \a Cur. | |
382 StringRef scan_ns_plain_one_line(); | |
383 | |
384 /// @brief Consume a minimal well-formed code unit subsequence starting at | |
385 /// \a Cur. Return false if it is not the same Unicode scalar value as | |
386 /// \a Expected. This updates \a Column. | |
387 bool consume(uint32_t Expected); | |
388 | |
389 /// @brief Skip \a Distance UTF-8 code units. Updates \a Cur and \a Column. | |
390 void skip(uint32_t Distance); | |
391 | |
392 /// @brief Return true if the minimal well-formed code unit subsequence at | |
393 /// Pos is whitespace or a new line | |
394 bool isBlankOrBreak(StringRef::iterator Position); | |
395 | |
396 /// @brief If IsSimpleKeyAllowed, create and push_back a new SimpleKey. | |
397 void saveSimpleKeyCandidate( TokenQueueT::iterator Tok | |
398 , unsigned AtColumn | |
399 , bool IsRequired); | |
400 | |
401 /// @brief Remove simple keys that can no longer be valid simple keys. | |
402 /// | |
403 /// Invalid simple keys are not on the current line or are further than 1024 | |
404 /// columns back. | |
405 void removeStaleSimpleKeyCandidates(); | |
406 | |
407 /// @brief Remove all simple keys on FlowLevel \a Level. | |
408 void removeSimpleKeyCandidatesOnFlowLevel(unsigned Level); | |
409 | |
410 /// @brief Unroll indentation in \a Indents back to \a Col. Creates BlockEnd | |
411 /// tokens if needed. | |
412 bool unrollIndent(int ToColumn); | |
413 | |
414 /// @brief Increase indent to \a Col. Creates \a Kind token at \a InsertPoint | |
415 /// if needed. | |
416 bool rollIndent( int ToColumn | |
417 , Token::TokenKind Kind | |
418 , TokenQueueT::iterator InsertPoint); | |
419 | |
420 /// @brief Skip whitespace and comments until the start of the next token. | |
421 void scanToNextToken(); | |
422 | |
423 /// @brief Must be the first token generated. | |
424 bool scanStreamStart(); | |
425 | |
426 /// @brief Generate tokens needed to close out the stream. | |
427 bool scanStreamEnd(); | |
428 | |
429 /// @brief Scan a %BLAH directive. | |
430 bool scanDirective(); | |
431 | |
432 /// @brief Scan a ... or ---. | |
433 bool scanDocumentIndicator(bool IsStart); | |
434 | |
435 /// @brief Scan a [ or { and generate the proper flow collection start token. | |
436 bool scanFlowCollectionStart(bool IsSequence); | |
437 | |
438 /// @brief Scan a ] or } and generate the proper flow collection end token. | |
439 bool scanFlowCollectionEnd(bool IsSequence); | |
440 | |
441 /// @brief Scan the , that separates entries in a flow collection. | |
442 bool scanFlowEntry(); | |
443 | |
444 /// @brief Scan the - that starts block sequence entries. | |
445 bool scanBlockEntry(); | |
446 | |
447 /// @brief Scan an explicit ? indicating a key. | |
448 bool scanKey(); | |
449 | |
450 /// @brief Scan an explicit : indicating a value. | |
451 bool scanValue(); | |
452 | |
453 /// @brief Scan a quoted scalar. | |
454 bool scanFlowScalar(bool IsDoubleQuoted); | |
455 | |
456 /// @brief Scan an unquoted scalar. | |
457 bool scanPlainScalar(); | |
458 | |
459 /// @brief Scan an Alias or Anchor starting with * or &. | |
460 bool scanAliasOrAnchor(bool IsAlias); | |
461 | |
462 /// @brief Scan a block scalar starting with | or >. | |
463 bool scanBlockScalar(bool IsLiteral); | |
464 | |
465 /// @brief Scan a tag of the form !stuff. | |
466 bool scanTag(); | |
467 | |
468 /// @brief Dispatch to the next scanning function based on \a *Cur. | |
469 bool fetchMoreTokens(); | |
470 | |
471 /// @brief The SourceMgr used for diagnostics and buffer management. | |
472 SourceMgr &SM; | |
473 | |
474 /// @brief The original input. | |
475 MemoryBuffer *InputBuffer; | |
476 | |
477 /// @brief The current position of the scanner. | |
478 StringRef::iterator Current; | |
479 | |
480 /// @brief The end of the input (one past the last character). | |
481 StringRef::iterator End; | |
482 | |
483 /// @brief Current YAML indentation level in spaces. | |
484 int Indent; | |
485 | |
486 /// @brief Current column number in Unicode code points. | |
487 unsigned Column; | |
488 | |
489 /// @brief Current line number. | |
490 unsigned Line; | |
491 | |
492 /// @brief How deep we are in flow style containers. 0 Means at block level. | |
493 unsigned FlowLevel; | |
494 | |
495 /// @brief Are we at the start of the stream? | |
496 bool IsStartOfStream; | |
497 | |
498 /// @brief Can the next token be the start of a simple key? | |
499 bool IsSimpleKeyAllowed; | |
500 | |
501 /// @brief True if an error has occurred. | |
502 bool Failed; | |
503 | |
504 /// @brief Queue of tokens. This is required to queue up tokens while looking | |
505 /// for the end of a simple key. And for cases where a single character | |
506 /// can produce multiple tokens (e.g. BlockEnd). | |
507 TokenQueueT TokenQueue; | |
508 | |
509 /// @brief Indentation levels. | |
510 SmallVector<int, 4> Indents; | |
511 | |
512 /// @brief Potential simple keys. | |
513 SmallVector<SimpleKey, 4> SimpleKeys; | |
514 }; | |
515 | |
516 } // end namespace yaml | |
517 } // end namespace llvm | |
518 | |
519 /// encodeUTF8 - Encode \a UnicodeScalarValue in UTF-8 and append it to result. | |
520 static void encodeUTF8( uint32_t UnicodeScalarValue | |
521 , SmallVectorImpl<char> &Result) { | |
522 if (UnicodeScalarValue <= 0x7F) { | |
523 Result.push_back(UnicodeScalarValue & 0x7F); | |
524 } else if (UnicodeScalarValue <= 0x7FF) { | |
525 uint8_t FirstByte = 0xC0 | ((UnicodeScalarValue & 0x7C0) >> 6); | |
526 uint8_t SecondByte = 0x80 | (UnicodeScalarValue & 0x3F); | |
527 Result.push_back(FirstByte); | |
528 Result.push_back(SecondByte); | |
529 } else if (UnicodeScalarValue <= 0xFFFF) { | |
530 uint8_t FirstByte = 0xE0 | ((UnicodeScalarValue & 0xF000) >> 12); | |
531 uint8_t SecondByte = 0x80 | ((UnicodeScalarValue & 0xFC0) >> 6); | |
532 uint8_t ThirdByte = 0x80 | (UnicodeScalarValue & 0x3F); | |
533 Result.push_back(FirstByte); | |
534 Result.push_back(SecondByte); | |
535 Result.push_back(ThirdByte); | |
536 } else if (UnicodeScalarValue <= 0x10FFFF) { | |
537 uint8_t FirstByte = 0xF0 | ((UnicodeScalarValue & 0x1F0000) >> 18); | |
538 uint8_t SecondByte = 0x80 | ((UnicodeScalarValue & 0x3F000) >> 12); | |
539 uint8_t ThirdByte = 0x80 | ((UnicodeScalarValue & 0xFC0) >> 6); | |
540 uint8_t FourthByte = 0x80 | (UnicodeScalarValue & 0x3F); | |
541 Result.push_back(FirstByte); | |
542 Result.push_back(SecondByte); | |
543 Result.push_back(ThirdByte); | |
544 Result.push_back(FourthByte); | |
545 } | |
546 } | |
547 | |
548 bool yaml::dumpTokens(StringRef Input, raw_ostream &OS) { | |
549 SourceMgr SM; | |
550 Scanner scanner(Input, SM); | |
551 while (true) { | |
552 Token T = scanner.getNext(); | |
553 switch (T.Kind) { | |
554 case Token::TK_StreamStart: | |
555 OS << "Stream-Start: "; | |
556 break; | |
557 case Token::TK_StreamEnd: | |
558 OS << "Stream-End: "; | |
559 break; | |
560 case Token::TK_VersionDirective: | |
561 OS << "Version-Directive: "; | |
562 break; | |
563 case Token::TK_TagDirective: | |
564 OS << "Tag-Directive: "; | |
565 break; | |
566 case Token::TK_DocumentStart: | |
567 OS << "Document-Start: "; | |
568 break; | |
569 case Token::TK_DocumentEnd: | |
570 OS << "Document-End: "; | |
571 break; | |
572 case Token::TK_BlockEntry: | |
573 OS << "Block-Entry: "; | |
574 break; | |
575 case Token::TK_BlockEnd: | |
576 OS << "Block-End: "; | |
577 break; | |
578 case Token::TK_BlockSequenceStart: | |
579 OS << "Block-Sequence-Start: "; | |
580 break; | |
581 case Token::TK_BlockMappingStart: | |
582 OS << "Block-Mapping-Start: "; | |
583 break; | |
584 case Token::TK_FlowEntry: | |
585 OS << "Flow-Entry: "; | |
586 break; | |
587 case Token::TK_FlowSequenceStart: | |
588 OS << "Flow-Sequence-Start: "; | |
589 break; | |
590 case Token::TK_FlowSequenceEnd: | |
591 OS << "Flow-Sequence-End: "; | |
592 break; | |
593 case Token::TK_FlowMappingStart: | |
594 OS << "Flow-Mapping-Start: "; | |
595 break; | |
596 case Token::TK_FlowMappingEnd: | |
597 OS << "Flow-Mapping-End: "; | |
598 break; | |
599 case Token::TK_Key: | |
600 OS << "Key: "; | |
601 break; | |
602 case Token::TK_Value: | |
603 OS << "Value: "; | |
604 break; | |
605 case Token::TK_Scalar: | |
606 OS << "Scalar: "; | |
607 break; | |
608 case Token::TK_Alias: | |
609 OS << "Alias: "; | |
610 break; | |
611 case Token::TK_Anchor: | |
612 OS << "Anchor: "; | |
613 break; | |
614 case Token::TK_Tag: | |
615 OS << "Tag: "; | |
616 break; | |
617 case Token::TK_Error: | |
618 break; | |
619 } | |
620 OS << T.Range << "\n"; | |
621 if (T.Kind == Token::TK_StreamEnd) | |
622 break; | |
623 else if (T.Kind == Token::TK_Error) | |
624 return false; | |
625 } | |
626 return true; | |
627 } | |
628 | |
629 bool yaml::scanTokens(StringRef Input) { | |
630 llvm::SourceMgr SM; | |
631 llvm::yaml::Scanner scanner(Input, SM); | |
632 for (;;) { | |
633 llvm::yaml::Token T = scanner.getNext(); | |
634 if (T.Kind == Token::TK_StreamEnd) | |
635 break; | |
636 else if (T.Kind == Token::TK_Error) | |
637 return false; | |
638 } | |
639 return true; | |
640 } | |
641 | |
642 std::string yaml::escape(StringRef Input) { | |
643 std::string EscapedInput; | |
644 for (StringRef::iterator i = Input.begin(), e = Input.end(); i != e; ++i) { | |
645 if (*i == '\\') | |
646 EscapedInput += "\\\\"; | |
647 else if (*i == '"') | |
648 EscapedInput += "\\\""; | |
649 else if (*i == 0) | |
650 EscapedInput += "\\0"; | |
651 else if (*i == 0x07) | |
652 EscapedInput += "\\a"; | |
653 else if (*i == 0x08) | |
654 EscapedInput += "\\b"; | |
655 else if (*i == 0x09) | |
656 EscapedInput += "\\t"; | |
657 else if (*i == 0x0A) | |
658 EscapedInput += "\\n"; | |
659 else if (*i == 0x0B) | |
660 EscapedInput += "\\v"; | |
661 else if (*i == 0x0C) | |
662 EscapedInput += "\\f"; | |
663 else if (*i == 0x0D) | |
664 EscapedInput += "\\r"; | |
665 else if (*i == 0x1B) | |
666 EscapedInput += "\\e"; | |
667 else if ((unsigned char)*i < 0x20) { // Control characters not handled above. | |
668 std::string HexStr = utohexstr(*i); | |
669 EscapedInput += "\\x" + std::string(2 - HexStr.size(), '0') + HexStr; | |
670 } else if (*i & 0x80) { // UTF-8 multiple code unit subsequence. | |
671 UTF8Decoded UnicodeScalarValue | |
672 = decodeUTF8(StringRef(i, Input.end() - i)); | |
673 if (UnicodeScalarValue.second == 0) { | |
674 // Found invalid char. | |
675 SmallString<4> Val; | |
676 encodeUTF8(0xFFFD, Val); | |
677 EscapedInput.insert(EscapedInput.end(), Val.begin(), Val.end()); | |
678 // FIXME: Error reporting. | |
679 return EscapedInput; | |
680 } | |
681 if (UnicodeScalarValue.first == 0x85) | |
682 EscapedInput += "\\N"; | |
683 else if (UnicodeScalarValue.first == 0xA0) | |
684 EscapedInput += "\\_"; | |
685 else if (UnicodeScalarValue.first == 0x2028) | |
686 EscapedInput += "\\L"; | |
687 else if (UnicodeScalarValue.first == 0x2029) | |
688 EscapedInput += "\\P"; | |
689 else { | |
690 std::string HexStr = utohexstr(UnicodeScalarValue.first); | |
691 if (HexStr.size() <= 2) | |
692 EscapedInput += "\\x" + std::string(2 - HexStr.size(), '0') + HexStr; | |
693 else if (HexStr.size() <= 4) | |
694 EscapedInput += "\\u" + std::string(4 - HexStr.size(), '0') + HexStr; | |
695 else if (HexStr.size() <= 8) | |
696 EscapedInput += "\\U" + std::string(8 - HexStr.size(), '0') + HexStr; | |
697 } | |
698 i += UnicodeScalarValue.second - 1; | |
699 } else | |
700 EscapedInput.push_back(*i); | |
701 } | |
702 return EscapedInput; | |
703 } | |
704 | |
705 Scanner::Scanner(StringRef Input, SourceMgr &sm) | |
706 : SM(sm) | |
707 , Indent(-1) | |
708 , Column(0) | |
709 , Line(0) | |
710 , FlowLevel(0) | |
711 , IsStartOfStream(true) | |
712 , IsSimpleKeyAllowed(true) | |
713 , Failed(false) { | |
714 InputBuffer = MemoryBuffer::getMemBuffer(Input, "YAML"); | |
715 SM.AddNewSourceBuffer(InputBuffer, SMLoc()); | |
716 Current = InputBuffer->getBufferStart(); | |
717 End = InputBuffer->getBufferEnd(); | |
718 } | |
719 | |
720 Scanner::Scanner(MemoryBuffer *Buffer, SourceMgr &SM_) | |
721 : SM(SM_) | |
722 , InputBuffer(Buffer) | |
723 , Current(InputBuffer->getBufferStart()) | |
724 , End(InputBuffer->getBufferEnd()) | |
725 , Indent(-1) | |
726 , Column(0) | |
727 , Line(0) | |
728 , FlowLevel(0) | |
729 , IsStartOfStream(true) | |
730 , IsSimpleKeyAllowed(true) | |
731 , Failed(false) { | |
732 SM.AddNewSourceBuffer(InputBuffer, SMLoc()); | |
733 } | |
734 | |
735 Token &Scanner::peekNext() { | |
736 // If the current token is a possible simple key, keep parsing until we | |
737 // can confirm. | |
738 bool NeedMore = false; | |
739 while (true) { | |
740 if (TokenQueue.empty() || NeedMore) { | |
741 if (!fetchMoreTokens()) { | |
742 TokenQueue.clear(); | |
743 TokenQueue.push_back(Token()); | |
744 return TokenQueue.front(); | |
745 } | |
746 } | |
747 assert(!TokenQueue.empty() && | |
748 "fetchMoreTokens lied about getting tokens!"); | |
749 | |
750 removeStaleSimpleKeyCandidates(); | |
751 SimpleKey SK; | |
752 SK.Tok = TokenQueue.front(); | |
753 if (std::find(SimpleKeys.begin(), SimpleKeys.end(), SK) | |
754 == SimpleKeys.end()) | |
755 break; | |
756 else | |
757 NeedMore = true; | |
758 } | |
759 return TokenQueue.front(); | |
760 } | |
761 | |
762 Token Scanner::getNext() { | |
763 Token Ret = peekNext(); | |
764 // TokenQueue can be empty if there was an error getting the next token. | |
765 if (!TokenQueue.empty()) | |
766 TokenQueue.pop_front(); | |
767 | |
768 // There cannot be any referenced Token's if the TokenQueue is empty. So do a | |
769 // quick deallocation of them all. | |
770 if (TokenQueue.empty()) { | |
771 TokenQueue.Alloc.Reset(); | |
772 } | |
773 | |
774 return Ret; | |
775 } | |
776 | |
777 StringRef::iterator Scanner::skip_nb_char(StringRef::iterator Position) { | |
778 if (Position == End) | |
779 return Position; | |
780 // Check 7 bit c-printable - b-char. | |
781 if ( *Position == 0x09 | |
782 || (*Position >= 0x20 && *Position <= 0x7E)) | |
783 return Position + 1; | |
784 | |
785 // Check for valid UTF-8. | |
786 if (uint8_t(*Position) & 0x80) { | |
787 UTF8Decoded u8d = decodeUTF8(Position); | |
788 if ( u8d.second != 0 | |
789 && u8d.first != 0xFEFF | |
790 && ( u8d.first == 0x85 | |
791 || ( u8d.first >= 0xA0 | |
792 && u8d.first <= 0xD7FF) | |
793 || ( u8d.first >= 0xE000 | |
794 && u8d.first <= 0xFFFD) | |
795 || ( u8d.first >= 0x10000 | |
796 && u8d.first <= 0x10FFFF))) | |
797 return Position + u8d.second; | |
798 } | |
799 return Position; | |
800 } | |
801 | |
802 StringRef::iterator Scanner::skip_b_break(StringRef::iterator Position) { | |
803 if (Position == End) | |
804 return Position; | |
805 if (*Position == 0x0D) { | |
806 if (Position + 1 != End && *(Position + 1) == 0x0A) | |
807 return Position + 2; | |
808 return Position + 1; | |
809 } | |
810 | |
811 if (*Position == 0x0A) | |
812 return Position + 1; | |
813 return Position; | |
814 } | |
815 | |
816 | |
817 StringRef::iterator Scanner::skip_s_white(StringRef::iterator Position) { | |
818 if (Position == End) | |
819 return Position; | |
820 if (*Position == ' ' || *Position == '\t') | |
821 return Position + 1; | |
822 return Position; | |
823 } | |
824 | |
825 StringRef::iterator Scanner::skip_ns_char(StringRef::iterator Position) { | |
826 if (Position == End) | |
827 return Position; | |
828 if (*Position == ' ' || *Position == '\t') | |
829 return Position; | |
830 return skip_nb_char(Position); | |
831 } | |
832 | |
833 StringRef::iterator Scanner::skip_while( SkipWhileFunc Func | |
834 , StringRef::iterator Position) { | |
835 while (true) { | |
836 StringRef::iterator i = (this->*Func)(Position); | |
837 if (i == Position) | |
838 break; | |
839 Position = i; | |
840 } | |
841 return Position; | |
842 } | |
843 | |
/// Return true if \a C is an ns-hex-digit: a decimal digit or one of the
/// letters A-F / a-f. Letters beyond F are not hexadecimal digits.
static bool is_ns_hex_digit(const char C) {
  return    (C >= '0' && C <= '9')
         || (C >= 'a' && C <= 'f')
         || (C >= 'A' && C <= 'F');
}
849 | |
/// Return true if \a C is an ns-word-char: a decimal digit, an ASCII letter
/// or '-'.
static bool is_ns_word_char(const char C) {
  return    C == '-'
         || (C >= '0' && C <= '9')
         || (C >= 'a' && C <= 'z')
         || (C >= 'A' && C <= 'Z');
}
855 | |
856 StringRef Scanner::scan_ns_uri_char() { | |
857 StringRef::iterator Start = Current; | |
858 while (true) { | |
859 if (Current == End) | |
860 break; | |
861 if (( *Current == '%' | |
862 && Current + 2 < End | |
863 && is_ns_hex_digit(*(Current + 1)) | |
864 && is_ns_hex_digit(*(Current + 2))) | |
865 || is_ns_word_char(*Current) | |
866 || StringRef(Current, 1).find_first_of("#;/?:@&=+$,_.!~*'()[]") | |
867 != StringRef::npos) { | |
868 ++Current; | |
869 ++Column; | |
870 } else | |
871 break; | |
872 } | |
873 return StringRef(Start, Current - Start); | |
874 } | |
875 | |
876 StringRef Scanner::scan_ns_plain_one_line() { | |
877 StringRef::iterator start = Current; | |
878 // The first character must already be verified. | |
879 ++Current; | |
880 while (true) { | |
881 if (Current == End) { | |
882 break; | |
883 } else if (*Current == ':') { | |
884 // Check if the next character is a ns-char. | |
885 if (Current + 1 == End) | |
886 break; | |
887 StringRef::iterator i = skip_ns_char(Current + 1); | |
888 if (Current + 1 != i) { | |
889 Current = i; | |
890 Column += 2; // Consume both the ':' and ns-char. | |
891 } else | |
892 break; | |
893 } else if (*Current == '#') { | |
894 // Check if the previous character was a ns-char. | |
895 // The & 0x80 check is to check for the trailing byte of a utf-8 | |
896 if (*(Current - 1) & 0x80 || skip_ns_char(Current - 1) == Current) { | |
897 ++Current; | |
898 ++Column; | |
899 } else | |
900 break; | |
901 } else { | |
902 StringRef::iterator i = skip_nb_char(Current); | |
903 if (i == Current) | |
904 break; | |
905 Current = i; | |
906 ++Column; | |
907 } | |
908 } | |
909 return StringRef(start, Current - start); | |
910 } | |
911 | |
912 bool Scanner::consume(uint32_t Expected) { | |
913 if (Expected >= 0x80) | |
914 report_fatal_error("Not dealing with this yet"); | |
915 if (Current == End) | |
916 return false; | |
917 if (uint8_t(*Current) >= 0x80) | |
918 report_fatal_error("Not dealing with this yet"); | |
919 if (uint8_t(*Current) == Expected) { | |
920 ++Current; | |
921 ++Column; | |
922 return true; | |
923 } | |
924 return false; | |
925 } | |
926 | |
927 void Scanner::skip(uint32_t Distance) { | |
928 Current += Distance; | |
929 Column += Distance; | |
930 assert(Current <= End && "Skipped past the end"); | |
931 } | |
932 | |
933 bool Scanner::isBlankOrBreak(StringRef::iterator Position) { | |
934 if (Position == End) | |
935 return false; | |
936 if ( *Position == ' ' || *Position == '\t' | |
937 || *Position == '\r' || *Position == '\n') | |
938 return true; | |
939 return false; | |
940 } | |
941 | |
942 void Scanner::saveSimpleKeyCandidate( TokenQueueT::iterator Tok | |
943 , unsigned AtColumn | |
944 , bool IsRequired) { | |
945 if (IsSimpleKeyAllowed) { | |
946 SimpleKey SK; | |
947 SK.Tok = Tok; | |
948 SK.Line = Line; | |
949 SK.Column = AtColumn; | |
950 SK.IsRequired = IsRequired; | |
951 SK.FlowLevel = FlowLevel; | |
952 SimpleKeys.push_back(SK); | |
953 } | |
954 } | |
955 | |
956 void Scanner::removeStaleSimpleKeyCandidates() { | |
957 for (SmallVectorImpl<SimpleKey>::iterator i = SimpleKeys.begin(); | |
958 i != SimpleKeys.end();) { | |
959 if (i->Line != Line || i->Column + 1024 < Column) { | |
960 if (i->IsRequired) | |
961 setError( "Could not find expected : for simple key" | |
962 , i->Tok->Range.begin()); | |
963 i = SimpleKeys.erase(i); | |
964 } else | |
965 ++i; | |
966 } | |
967 } | |
968 | |
969 void Scanner::removeSimpleKeyCandidatesOnFlowLevel(unsigned Level) { | |
970 if (!SimpleKeys.empty() && (SimpleKeys.end() - 1)->FlowLevel == Level) | |
971 SimpleKeys.pop_back(); | |
972 } | |
973 | |
974 bool Scanner::unrollIndent(int ToColumn) { | |
975 Token T; | |
976 // Indentation is ignored in flow. | |
977 if (FlowLevel != 0) | |
978 return true; | |
979 | |
980 while (Indent > ToColumn) { | |
981 T.Kind = Token::TK_BlockEnd; | |
982 T.Range = StringRef(Current, 1); | |
983 TokenQueue.push_back(T); | |
984 Indent = Indents.pop_back_val(); | |
985 } | |
986 | |
987 return true; | |
988 } | |
989 | |
990 bool Scanner::rollIndent( int ToColumn | |
991 , Token::TokenKind Kind | |
992 , TokenQueueT::iterator InsertPoint) { | |
993 if (FlowLevel) | |
994 return true; | |
995 if (Indent < ToColumn) { | |
996 Indents.push_back(Indent); | |
997 Indent = ToColumn; | |
998 | |
999 Token T; | |
1000 T.Kind = Kind; | |
1001 T.Range = StringRef(Current, 0); | |
1002 TokenQueue.insert(InsertPoint, T); | |
1003 } | |
1004 return true; | |
1005 } | |
1006 | |
1007 void Scanner::scanToNextToken() { | |
1008 while (true) { | |
1009 while (*Current == ' ' || *Current == '\t') { | |
1010 skip(1); | |
1011 } | |
1012 | |
1013 // Skip comment. | |
1014 if (*Current == '#') { | |
1015 while (true) { | |
1016 // This may skip more than one byte, thus Column is only incremented | |
1017 // for code points. | |
1018 StringRef::iterator i = skip_nb_char(Current); | |
1019 if (i == Current) | |
1020 break; | |
1021 Current = i; | |
1022 ++Column; | |
1023 } | |
1024 } | |
1025 | |
1026 // Skip EOL. | |
1027 StringRef::iterator i = skip_b_break(Current); | |
1028 if (i == Current) | |
1029 break; | |
1030 Current = i; | |
1031 ++Line; | |
1032 Column = 0; | |
1033 // New lines may start a simple key. | |
1034 if (!FlowLevel) | |
1035 IsSimpleKeyAllowed = true; | |
1036 } | |
1037 } | |
1038 | |
1039 bool Scanner::scanStreamStart() { | |
1040 IsStartOfStream = false; | |
1041 | |
1042 EncodingInfo EI = getUnicodeEncoding(currentInput()); | |
1043 | |
1044 Token T; | |
1045 T.Kind = Token::TK_StreamStart; | |
1046 T.Range = StringRef(Current, EI.second); | |
1047 TokenQueue.push_back(T); | |
1048 Current += EI.second; | |
1049 return true; | |
1050 } | |
1051 | |
1052 bool Scanner::scanStreamEnd() { | |
1053 // Force an ending new line if one isn't present. | |
1054 if (Column != 0) { | |
1055 Column = 0; | |
1056 ++Line; | |
1057 } | |
1058 | |
1059 unrollIndent(-1); | |
1060 SimpleKeys.clear(); | |
1061 IsSimpleKeyAllowed = false; | |
1062 | |
1063 Token T; | |
1064 T.Kind = Token::TK_StreamEnd; | |
1065 T.Range = StringRef(Current, 0); | |
1066 TokenQueue.push_back(T); | |
1067 return true; | |
1068 } | |
1069 | |
1070 bool Scanner::scanDirective() { | |
1071 // Reset the indentation level. | |
1072 unrollIndent(-1); | |
1073 SimpleKeys.clear(); | |
1074 IsSimpleKeyAllowed = false; | |
1075 | |
1076 StringRef::iterator Start = Current; | |
1077 consume('%'); | |
1078 StringRef::iterator NameStart = Current; | |
1079 Current = skip_while(&Scanner::skip_ns_char, Current); | |
1080 StringRef Name(NameStart, Current - NameStart); | |
1081 Current = skip_while(&Scanner::skip_s_white, Current); | |
1082 | |
1083 Token T; | |
1084 if (Name == "YAML") { | |
1085 Current = skip_while(&Scanner::skip_ns_char, Current); | |
1086 T.Kind = Token::TK_VersionDirective; | |
1087 T.Range = StringRef(Start, Current - Start); | |
1088 TokenQueue.push_back(T); | |
1089 return true; | |
1090 } else if(Name == "TAG") { | |
1091 Current = skip_while(&Scanner::skip_ns_char, Current); | |
1092 Current = skip_while(&Scanner::skip_s_white, Current); | |
1093 Current = skip_while(&Scanner::skip_ns_char, Current); | |
1094 T.Kind = Token::TK_TagDirective; | |
1095 T.Range = StringRef(Start, Current - Start); | |
1096 TokenQueue.push_back(T); | |
1097 return true; | |
1098 } | |
1099 return false; | |
1100 } | |
1101 | |
1102 bool Scanner::scanDocumentIndicator(bool IsStart) { | |
1103 unrollIndent(-1); | |
1104 SimpleKeys.clear(); | |
1105 IsSimpleKeyAllowed = false; | |
1106 | |
1107 Token T; | |
1108 T.Kind = IsStart ? Token::TK_DocumentStart : Token::TK_DocumentEnd; | |
1109 T.Range = StringRef(Current, 3); | |
1110 skip(3); | |
1111 TokenQueue.push_back(T); | |
1112 return true; | |
1113 } | |
1114 | |
1115 bool Scanner::scanFlowCollectionStart(bool IsSequence) { | |
1116 Token T; | |
1117 T.Kind = IsSequence ? Token::TK_FlowSequenceStart | |
1118 : Token::TK_FlowMappingStart; | |
1119 T.Range = StringRef(Current, 1); | |
1120 skip(1); | |
1121 TokenQueue.push_back(T); | |
1122 | |
1123 // [ and { may begin a simple key. | |
1124 saveSimpleKeyCandidate(TokenQueue.back(), Column - 1, false); | |
1125 | |
1126 // And may also be followed by a simple key. | |
1127 IsSimpleKeyAllowed = true; | |
1128 ++FlowLevel; | |
1129 return true; | |
1130 } | |
1131 | |
1132 bool Scanner::scanFlowCollectionEnd(bool IsSequence) { | |
1133 removeSimpleKeyCandidatesOnFlowLevel(FlowLevel); | |
1134 IsSimpleKeyAllowed = false; | |
1135 Token T; | |
1136 T.Kind = IsSequence ? Token::TK_FlowSequenceEnd | |
1137 : Token::TK_FlowMappingEnd; | |
1138 T.Range = StringRef(Current, 1); | |
1139 skip(1); | |
1140 TokenQueue.push_back(T); | |
1141 if (FlowLevel) | |
1142 --FlowLevel; | |
1143 return true; | |
1144 } | |
1145 | |
1146 bool Scanner::scanFlowEntry() { | |
1147 removeSimpleKeyCandidatesOnFlowLevel(FlowLevel); | |
1148 IsSimpleKeyAllowed = true; | |
1149 Token T; | |
1150 T.Kind = Token::TK_FlowEntry; | |
1151 T.Range = StringRef(Current, 1); | |
1152 skip(1); | |
1153 TokenQueue.push_back(T); | |
1154 return true; | |
1155 } | |
1156 | |
1157 bool Scanner::scanBlockEntry() { | |
1158 rollIndent(Column, Token::TK_BlockSequenceStart, TokenQueue.end()); | |
1159 removeSimpleKeyCandidatesOnFlowLevel(FlowLevel); | |
1160 IsSimpleKeyAllowed = true; | |
1161 Token T; | |
1162 T.Kind = Token::TK_BlockEntry; | |
1163 T.Range = StringRef(Current, 1); | |
1164 skip(1); | |
1165 TokenQueue.push_back(T); | |
1166 return true; | |
1167 } | |
1168 | |
1169 bool Scanner::scanKey() { | |
1170 if (!FlowLevel) | |
1171 rollIndent(Column, Token::TK_BlockMappingStart, TokenQueue.end()); | |
1172 | |
1173 removeSimpleKeyCandidatesOnFlowLevel(FlowLevel); | |
1174 IsSimpleKeyAllowed = !FlowLevel; | |
1175 | |
1176 Token T; | |
1177 T.Kind = Token::TK_Key; | |
1178 T.Range = StringRef(Current, 1); | |
1179 skip(1); | |
1180 TokenQueue.push_back(T); | |
1181 return true; | |
1182 } | |
1183 | |
1184 bool Scanner::scanValue() { | |
1185 // If the previous token could have been a simple key, insert the key token | |
1186 // into the token queue. | |
1187 if (!SimpleKeys.empty()) { | |
1188 SimpleKey SK = SimpleKeys.pop_back_val(); | |
1189 Token T; | |
1190 T.Kind = Token::TK_Key; | |
1191 T.Range = SK.Tok->Range; | |
1192 TokenQueueT::iterator i, e; | |
1193 for (i = TokenQueue.begin(), e = TokenQueue.end(); i != e; ++i) { | |
1194 if (i == SK.Tok) | |
1195 break; | |
1196 } | |
1197 assert(i != e && "SimpleKey not in token queue!"); | |
1198 i = TokenQueue.insert(i, T); | |
1199 | |
1200 // We may also need to add a Block-Mapping-Start token. | |
1201 rollIndent(SK.Column, Token::TK_BlockMappingStart, i); | |
1202 | |
1203 IsSimpleKeyAllowed = false; | |
1204 } else { | |
1205 if (!FlowLevel) | |
1206 rollIndent(Column, Token::TK_BlockMappingStart, TokenQueue.end()); | |
1207 IsSimpleKeyAllowed = !FlowLevel; | |
1208 } | |
1209 | |
1210 Token T; | |
1211 T.Kind = Token::TK_Value; | |
1212 T.Range = StringRef(Current, 1); | |
1213 skip(1); | |
1214 TokenQueue.push_back(T); | |
1215 return true; | |
1216 } | |
1217 | |
1218 // Forbidding inlining improves performance by roughly 20%. | |
1219 // FIXME: Remove once llvm optimizes this to the faster version without hints. | |
1220 LLVM_ATTRIBUTE_NOINLINE static bool | |
1221 wasEscaped(StringRef::iterator First, StringRef::iterator Position); | |
1222 | |
1223 // Returns whether a character at 'Position' was escaped with a leading '\'. | |
1224 // 'First' specifies the position of the first character in the string. | |
1225 static bool wasEscaped(StringRef::iterator First, | |
1226 StringRef::iterator Position) { | |
1227 assert(Position - 1 >= First); | |
1228 StringRef::iterator I = Position - 1; | |
1229 // We calculate the number of consecutive '\'s before the current position | |
1230 // by iterating backwards through our string. | |
1231 while (I >= First && *I == '\\') --I; | |
1232 // (Position - 1 - I) now contains the number of '\'s before the current | |
1233 // position. If it is odd, the character at 'Position' was escaped. | |
1234 return (Position - 1 - I) % 2 == 1; | |
1235 } | |
1236 | |
1237 bool Scanner::scanFlowScalar(bool IsDoubleQuoted) { | |
1238 StringRef::iterator Start = Current; | |
1239 unsigned ColStart = Column; | |
1240 if (IsDoubleQuoted) { | |
1241 do { | |
1242 ++Current; | |
1243 while (Current != End && *Current != '"') | |
1244 ++Current; | |
1245 // Repeat until the previous character was not a '\' or was an escaped | |
1246 // backslash. | |
1247 } while ( Current != End | |
1248 && *(Current - 1) == '\\' | |
1249 && wasEscaped(Start + 1, Current)); | |
1250 } else { | |
1251 skip(1); | |
1252 while (true) { | |
1253 // Skip a ' followed by another '. | |
1254 if (Current + 1 < End && *Current == '\'' && *(Current + 1) == '\'') { | |
1255 skip(2); | |
1256 continue; | |
1257 } else if (*Current == '\'') | |
1258 break; | |
1259 StringRef::iterator i = skip_nb_char(Current); | |
1260 if (i == Current) { | |
1261 i = skip_b_break(Current); | |
1262 if (i == Current) | |
1263 break; | |
1264 Current = i; | |
1265 Column = 0; | |
1266 ++Line; | |
1267 } else { | |
1268 if (i == End) | |
1269 break; | |
1270 Current = i; | |
1271 ++Column; | |
1272 } | |
1273 } | |
1274 } | |
1275 | |
1276 if (Current == End) { | |
1277 setError("Expected quote at end of scalar", Current); | |
1278 return false; | |
1279 } | |
1280 | |
1281 skip(1); // Skip ending quote. | |
1282 Token T; | |
1283 T.Kind = Token::TK_Scalar; | |
1284 T.Range = StringRef(Start, Current - Start); | |
1285 TokenQueue.push_back(T); | |
1286 | |
1287 saveSimpleKeyCandidate(TokenQueue.back(), ColStart, false); | |
1288 | |
1289 IsSimpleKeyAllowed = false; | |
1290 | |
1291 return true; | |
1292 } | |
1293 | |
1294 bool Scanner::scanPlainScalar() { | |
1295 StringRef::iterator Start = Current; | |
1296 unsigned ColStart = Column; | |
1297 unsigned LeadingBlanks = 0; | |
1298 assert(Indent >= -1 && "Indent must be >= -1 !"); | |
1299 unsigned indent = static_cast<unsigned>(Indent + 1); | |
1300 while (true) { | |
1301 if (*Current == '#') | |
1302 break; | |
1303 | |
1304 while (!isBlankOrBreak(Current)) { | |
1305 if ( FlowLevel && *Current == ':' | |
1306 && !(isBlankOrBreak(Current + 1) || *(Current + 1) == ',')) { | |
1307 setError("Found unexpected ':' while scanning a plain scalar", Current); | |
1308 return false; | |
1309 } | |
1310 | |
1311 // Check for the end of the plain scalar. | |
1312 if ( (*Current == ':' && isBlankOrBreak(Current + 1)) | |
1313 || ( FlowLevel | |
1314 && (StringRef(Current, 1).find_first_of(",:?[]{}") | |
1315 != StringRef::npos))) | |
1316 break; | |
1317 | |
1318 StringRef::iterator i = skip_nb_char(Current); | |
1319 if (i == Current) | |
1320 break; | |
1321 Current = i; | |
1322 ++Column; | |
1323 } | |
1324 | |
1325 // Are we at the end? | |
1326 if (!isBlankOrBreak(Current)) | |
1327 break; | |
1328 | |
1329 // Eat blanks. | |
1330 StringRef::iterator Tmp = Current; | |
1331 while (isBlankOrBreak(Tmp)) { | |
1332 StringRef::iterator i = skip_s_white(Tmp); | |
1333 if (i != Tmp) { | |
1334 if (LeadingBlanks && (Column < indent) && *Tmp == '\t') { | |
1335 setError("Found invalid tab character in indentation", Tmp); | |
1336 return false; | |
1337 } | |
1338 Tmp = i; | |
1339 ++Column; | |
1340 } else { | |
1341 i = skip_b_break(Tmp); | |
1342 if (!LeadingBlanks) | |
1343 LeadingBlanks = 1; | |
1344 Tmp = i; | |
1345 Column = 0; | |
1346 ++Line; | |
1347 } | |
1348 } | |
1349 | |
1350 if (!FlowLevel && Column < indent) | |
1351 break; | |
1352 | |
1353 Current = Tmp; | |
1354 } | |
1355 if (Start == Current) { | |
1356 setError("Got empty plain scalar", Start); | |
1357 return false; | |
1358 } | |
1359 Token T; | |
1360 T.Kind = Token::TK_Scalar; | |
1361 T.Range = StringRef(Start, Current - Start); | |
1362 TokenQueue.push_back(T); | |
1363 | |
1364 // Plain scalars can be simple keys. | |
1365 saveSimpleKeyCandidate(TokenQueue.back(), ColStart, false); | |
1366 | |
1367 IsSimpleKeyAllowed = false; | |
1368 | |
1369 return true; | |
1370 } | |
1371 | |
1372 bool Scanner::scanAliasOrAnchor(bool IsAlias) { | |
1373 StringRef::iterator Start = Current; | |
1374 unsigned ColStart = Column; | |
1375 skip(1); | |
1376 while(true) { | |
1377 if ( *Current == '[' || *Current == ']' | |
1378 || *Current == '{' || *Current == '}' | |
1379 || *Current == ',' | |
1380 || *Current == ':') | |
1381 break; | |
1382 StringRef::iterator i = skip_ns_char(Current); | |
1383 if (i == Current) | |
1384 break; | |
1385 Current = i; | |
1386 ++Column; | |
1387 } | |
1388 | |
1389 if (Start == Current) { | |
1390 setError("Got empty alias or anchor", Start); | |
1391 return false; | |
1392 } | |
1393 | |
1394 Token T; | |
1395 T.Kind = IsAlias ? Token::TK_Alias : Token::TK_Anchor; | |
1396 T.Range = StringRef(Start, Current - Start); | |
1397 TokenQueue.push_back(T); | |
1398 | |
1399 // Alias and anchors can be simple keys. | |
1400 saveSimpleKeyCandidate(TokenQueue.back(), ColStart, false); | |
1401 | |
1402 IsSimpleKeyAllowed = false; | |
1403 | |
1404 return true; | |
1405 } | |
1406 | |
1407 bool Scanner::scanBlockScalar(bool IsLiteral) { | |
1408 StringRef::iterator Start = Current; | |
1409 skip(1); // Eat | or > | |
1410 while(true) { | |
1411 StringRef::iterator i = skip_nb_char(Current); | |
1412 if (i == Current) { | |
1413 if (Column == 0) | |
1414 break; | |
1415 i = skip_b_break(Current); | |
1416 if (i != Current) { | |
1417 // We got a line break. | |
1418 Column = 0; | |
1419 ++Line; | |
1420 Current = i; | |
1421 continue; | |
1422 } else { | |
1423 // There was an error, which should already have been printed out. | |
1424 return false; | |
1425 } | |
1426 } | |
1427 Current = i; | |
1428 ++Column; | |
1429 } | |
1430 | |
1431 if (Start == Current) { | |
1432 setError("Got empty block scalar", Start); | |
1433 return false; | |
1434 } | |
1435 | |
1436 Token T; | |
1437 T.Kind = Token::TK_Scalar; | |
1438 T.Range = StringRef(Start, Current - Start); | |
1439 TokenQueue.push_back(T); | |
1440 return true; | |
1441 } | |
1442 | |
1443 bool Scanner::scanTag() { | |
1444 StringRef::iterator Start = Current; | |
1445 unsigned ColStart = Column; | |
1446 skip(1); // Eat !. | |
1447 if (Current == End || isBlankOrBreak(Current)); // An empty tag. | |
1448 else if (*Current == '<') { | |
1449 skip(1); | |
1450 scan_ns_uri_char(); | |
1451 if (!consume('>')) | |
1452 return false; | |
1453 } else { | |
1454 // FIXME: Actually parse the c-ns-shorthand-tag rule. | |
1455 Current = skip_while(&Scanner::skip_ns_char, Current); | |
1456 } | |
1457 | |
1458 Token T; | |
1459 T.Kind = Token::TK_Tag; | |
1460 T.Range = StringRef(Start, Current - Start); | |
1461 TokenQueue.push_back(T); | |
1462 | |
1463 // Tags can be simple keys. | |
1464 saveSimpleKeyCandidate(TokenQueue.back(), ColStart, false); | |
1465 | |
1466 IsSimpleKeyAllowed = false; | |
1467 | |
1468 return true; | |
1469 } | |
1470 | |
1471 bool Scanner::fetchMoreTokens() { | |
1472 if (IsStartOfStream) | |
1473 return scanStreamStart(); | |
1474 | |
1475 scanToNextToken(); | |
1476 | |
1477 if (Current == End) | |
1478 return scanStreamEnd(); | |
1479 | |
1480 removeStaleSimpleKeyCandidates(); | |
1481 | |
1482 unrollIndent(Column); | |
1483 | |
1484 if (Column == 0 && *Current == '%') | |
1485 return scanDirective(); | |
1486 | |
1487 if (Column == 0 && Current + 4 <= End | |
1488 && *Current == '-' | |
1489 && *(Current + 1) == '-' | |
1490 && *(Current + 2) == '-' | |
1491 && (Current + 3 == End || isBlankOrBreak(Current + 3))) | |
1492 return scanDocumentIndicator(true); | |
1493 | |
1494 if (Column == 0 && Current + 4 <= End | |
1495 && *Current == '.' | |
1496 && *(Current + 1) == '.' | |
1497 && *(Current + 2) == '.' | |
1498 && (Current + 3 == End || isBlankOrBreak(Current + 3))) | |
1499 return scanDocumentIndicator(false); | |
1500 | |
1501 if (*Current == '[') | |
1502 return scanFlowCollectionStart(true); | |
1503 | |
1504 if (*Current == '{') | |
1505 return scanFlowCollectionStart(false); | |
1506 | |
1507 if (*Current == ']') | |
1508 return scanFlowCollectionEnd(true); | |
1509 | |
1510 if (*Current == '}') | |
1511 return scanFlowCollectionEnd(false); | |
1512 | |
1513 if (*Current == ',') | |
1514 return scanFlowEntry(); | |
1515 | |
1516 if (*Current == '-' && isBlankOrBreak(Current + 1)) | |
1517 return scanBlockEntry(); | |
1518 | |
1519 if (*Current == '?' && (FlowLevel || isBlankOrBreak(Current + 1))) | |
1520 return scanKey(); | |
1521 | |
1522 if (*Current == ':' && (FlowLevel || isBlankOrBreak(Current + 1))) | |
1523 return scanValue(); | |
1524 | |
1525 if (*Current == '*') | |
1526 return scanAliasOrAnchor(true); | |
1527 | |
1528 if (*Current == '&') | |
1529 return scanAliasOrAnchor(false); | |
1530 | |
1531 if (*Current == '!') | |
1532 return scanTag(); | |
1533 | |
1534 if (*Current == '|' && !FlowLevel) | |
1535 return scanBlockScalar(true); | |
1536 | |
1537 if (*Current == '>' && !FlowLevel) | |
1538 return scanBlockScalar(false); | |
1539 | |
1540 if (*Current == '\'') | |
1541 return scanFlowScalar(false); | |
1542 | |
1543 if (*Current == '"') | |
1544 return scanFlowScalar(true); | |
1545 | |
1546 // Get a plain scalar. | |
1547 StringRef FirstChar(Current, 1); | |
1548 if (!(isBlankOrBreak(Current) | |
1549 || FirstChar.find_first_of("-?:,[]{}#&*!|>'\"%@`") != StringRef::npos) | |
1550 || (*Current == '-' && !isBlankOrBreak(Current + 1)) | |
1551 || (!FlowLevel && (*Current == '?' || *Current == ':') | |
1552 && isBlankOrBreak(Current + 1)) | |
1553 || (!FlowLevel && *Current == ':' | |
1554 && Current + 2 < End | |
1555 && *(Current + 1) == ':' | |
1556 && !isBlankOrBreak(Current + 2))) | |
1557 return scanPlainScalar(); | |
1558 | |
1559 setError("Unrecognized character while tokenizing."); | |
1560 return false; | |
1561 } | |
1562 | |
1563 Stream::Stream(StringRef Input, SourceMgr &SM) | |
1564 : scanner(new Scanner(Input, SM)) | |
1565 , CurrentDoc(0) {} | |
1566 | |
1567 Stream::Stream(MemoryBuffer *InputBuffer, SourceMgr &SM) | |
1568 : scanner(new Scanner(InputBuffer, SM)) | |
1569 , CurrentDoc(0) {} | |
1570 | |
1571 Stream::~Stream() {} | |
1572 | |
1573 bool Stream::failed() { return scanner->failed(); } | |
1574 | |
1575 void Stream::printError(Node *N, const Twine &Msg) { | |
1576 SmallVector<SMRange, 1> Ranges; | |
1577 Ranges.push_back(N->getSourceRange()); | |
1578 scanner->printError( N->getSourceRange().Start | |
1579 , SourceMgr::DK_Error | |
1580 , Msg | |
1581 , Ranges); | |
1582 } | |
1583 | |
1584 document_iterator Stream::begin() { | |
1585 if (CurrentDoc) | |
1586 report_fatal_error("Can only iterate over the stream once"); | |
1587 | |
1588 // Skip Stream-Start. | |
1589 scanner->getNext(); | |
1590 | |
1591 CurrentDoc.reset(new Document(*this)); | |
1592 return document_iterator(CurrentDoc); | |
1593 } | |
1594 | |
1595 document_iterator Stream::end() { | |
1596 return document_iterator(); | |
1597 } | |
1598 | |
1599 void Stream::skip() { | |
1600 for (document_iterator i = begin(), e = end(); i != e; ++i) | |
1601 i->skip(); | |
1602 } | |
1603 | |
1604 Node::Node(unsigned int Type, OwningPtr<Document> &D, StringRef A, StringRef T) | |
1605 : Doc(D) | |
1606 , TypeID(Type) | |
1607 , Anchor(A) | |
1608 , Tag(T) { | |
1609 SMLoc Start = SMLoc::getFromPointer(peekNext().Range.begin()); | |
1610 SourceRange = SMRange(Start, Start); | |
1611 } | |
1612 | |
1613 std::string Node::getVerbatimTag() const { | |
1614 StringRef Raw = getRawTag(); | |
1615 if (!Raw.empty() && Raw != "!") { | |
1616 std::string Ret; | |
1617 if (Raw.find_last_of('!') == 0) { | |
1618 Ret = Doc->getTagMap().find("!")->second; | |
1619 Ret += Raw.substr(1); | |
1620 return llvm_move(Ret); | |
1621 } else if (Raw.startswith("!!")) { | |
1622 Ret = Doc->getTagMap().find("!!")->second; | |
1623 Ret += Raw.substr(2); | |
1624 return llvm_move(Ret); | |
1625 } else { | |
1626 StringRef TagHandle = Raw.substr(0, Raw.find_last_of('!') + 1); | |
1627 std::map<StringRef, StringRef>::const_iterator It = | |
1628 Doc->getTagMap().find(TagHandle); | |
1629 if (It != Doc->getTagMap().end()) | |
1630 Ret = It->second; | |
1631 else { | |
1632 Token T; | |
1633 T.Kind = Token::TK_Tag; | |
1634 T.Range = TagHandle; | |
1635 setError(Twine("Unknown tag handle ") + TagHandle, T); | |
1636 } | |
1637 Ret += Raw.substr(Raw.find_last_of('!') + 1); | |
1638 return llvm_move(Ret); | |
1639 } | |
1640 } | |
1641 | |
1642 switch (getType()) { | |
1643 case NK_Null: | |
1644 return "tag:yaml.org,2002:null"; | |
1645 case NK_Scalar: | |
1646 // TODO: Tag resolution. | |
1647 return "tag:yaml.org,2002:str"; | |
1648 case NK_Mapping: | |
1649 return "tag:yaml.org,2002:map"; | |
1650 case NK_Sequence: | |
1651 return "tag:yaml.org,2002:seq"; | |
1652 } | |
1653 | |
1654 return ""; | |
1655 } | |
1656 | |
1657 Token &Node::peekNext() { | |
1658 return Doc->peekNext(); | |
1659 } | |
1660 | |
1661 Token Node::getNext() { | |
1662 return Doc->getNext(); | |
1663 } | |
1664 | |
1665 Node *Node::parseBlockNode() { | |
1666 return Doc->parseBlockNode(); | |
1667 } | |
1668 | |
1669 BumpPtrAllocator &Node::getAllocator() { | |
1670 return Doc->NodeAllocator; | |
1671 } | |
1672 | |
1673 void Node::setError(const Twine &Msg, Token &Tok) const { | |
1674 Doc->setError(Msg, Tok); | |
1675 } | |
1676 | |
1677 bool Node::failed() const { | |
1678 return Doc->failed(); | |
1679 } | |
1680 | |
1681 | |
1682 | |
1683 StringRef ScalarNode::getValue(SmallVectorImpl<char> &Storage) const { | |
1684 // TODO: Handle newlines properly. We need to remove leading whitespace. | |
1685 if (Value[0] == '"') { // Double quoted. | |
1686 // Pull off the leading and trailing "s. | |
1687 StringRef UnquotedValue = Value.substr(1, Value.size() - 2); | |
1688 // Search for characters that would require unescaping the value. | |
1689 StringRef::size_type i = UnquotedValue.find_first_of("\\\r\n"); | |
1690 if (i != StringRef::npos) | |
1691 return unescapeDoubleQuoted(UnquotedValue, i, Storage); | |
1692 return UnquotedValue; | |
1693 } else if (Value[0] == '\'') { // Single quoted. | |
1694 // Pull off the leading and trailing 's. | |
1695 StringRef UnquotedValue = Value.substr(1, Value.size() - 2); | |
1696 StringRef::size_type i = UnquotedValue.find('\''); | |
1697 if (i != StringRef::npos) { | |
1698 // We're going to need Storage. | |
1699 Storage.clear(); | |
1700 Storage.reserve(UnquotedValue.size()); | |
1701 for (; i != StringRef::npos; i = UnquotedValue.find('\'')) { | |
1702 StringRef Valid(UnquotedValue.begin(), i); | |
1703 Storage.insert(Storage.end(), Valid.begin(), Valid.end()); | |
1704 Storage.push_back('\''); | |
1705 UnquotedValue = UnquotedValue.substr(i + 2); | |
1706 } | |
1707 Storage.insert(Storage.end(), UnquotedValue.begin(), UnquotedValue.end()); | |
1708 return StringRef(Storage.begin(), Storage.size()); | |
1709 } | |
1710 return UnquotedValue; | |
1711 } | |
1712 // Plain or block. | |
1713 return Value.rtrim(" "); | |
1714 } | |
1715 | |
1716 StringRef ScalarNode::unescapeDoubleQuoted( StringRef UnquotedValue | |
1717 , StringRef::size_type i | |
1718 , SmallVectorImpl<char> &Storage) | |
1719 const { | |
1720 // Use Storage to build proper value. | |
1721 Storage.clear(); | |
1722 Storage.reserve(UnquotedValue.size()); | |
1723 for (; i != StringRef::npos; i = UnquotedValue.find_first_of("\\\r\n")) { | |
1724 // Insert all previous chars into Storage. | |
1725 StringRef Valid(UnquotedValue.begin(), i); | |
1726 Storage.insert(Storage.end(), Valid.begin(), Valid.end()); | |
1727 // Chop off inserted chars. | |
1728 UnquotedValue = UnquotedValue.substr(i); | |
1729 | |
1730 assert(!UnquotedValue.empty() && "Can't be empty!"); | |
1731 | |
1732 // Parse escape or line break. | |
1733 switch (UnquotedValue[0]) { | |
1734 case '\r': | |
1735 case '\n': | |
1736 Storage.push_back('\n'); | |
1737 if ( UnquotedValue.size() > 1 | |
1738 && (UnquotedValue[1] == '\r' || UnquotedValue[1] == '\n')) | |
1739 UnquotedValue = UnquotedValue.substr(1); | |
1740 UnquotedValue = UnquotedValue.substr(1); | |
1741 break; | |
1742 default: | |
1743 if (UnquotedValue.size() == 1) | |
1744 // TODO: Report error. | |
1745 break; | |
1746 UnquotedValue = UnquotedValue.substr(1); | |
1747 switch (UnquotedValue[0]) { | |
1748 default: { | |
1749 Token T; | |
1750 T.Range = StringRef(UnquotedValue.begin(), 1); | |
1751 setError("Unrecognized escape code!", T); | |
1752 return ""; | |
1753 } | |
1754 case '\r': | |
1755 case '\n': | |
1756 // Remove the new line. | |
1757 if ( UnquotedValue.size() > 1 | |
1758 && (UnquotedValue[1] == '\r' || UnquotedValue[1] == '\n')) | |
1759 UnquotedValue = UnquotedValue.substr(1); | |
1760 // If this was just a single byte newline, it will get skipped | |
1761 // below. | |
1762 break; | |
1763 case '0': | |
1764 Storage.push_back(0x00); | |
1765 break; | |
1766 case 'a': | |
1767 Storage.push_back(0x07); | |
1768 break; | |
1769 case 'b': | |
1770 Storage.push_back(0x08); | |
1771 break; | |
1772 case 't': | |
1773 case 0x09: | |
1774 Storage.push_back(0x09); | |
1775 break; | |
1776 case 'n': | |
1777 Storage.push_back(0x0A); | |
1778 break; | |
1779 case 'v': | |
1780 Storage.push_back(0x0B); | |
1781 break; | |
1782 case 'f': | |
1783 Storage.push_back(0x0C); | |
1784 break; | |
1785 case 'r': | |
1786 Storage.push_back(0x0D); | |
1787 break; | |
1788 case 'e': | |
1789 Storage.push_back(0x1B); | |
1790 break; | |
1791 case ' ': | |
1792 Storage.push_back(0x20); | |
1793 break; | |
1794 case '"': | |
1795 Storage.push_back(0x22); | |
1796 break; | |
1797 case '/': | |
1798 Storage.push_back(0x2F); | |
1799 break; | |
1800 case '\\': | |
1801 Storage.push_back(0x5C); | |
1802 break; | |
1803 case 'N': | |
1804 encodeUTF8(0x85, Storage); | |
1805 break; | |
1806 case '_': | |
1807 encodeUTF8(0xA0, Storage); | |
1808 break; | |
1809 case 'L': | |
1810 encodeUTF8(0x2028, Storage); | |
1811 break; | |
1812 case 'P': | |
1813 encodeUTF8(0x2029, Storage); | |
1814 break; | |
1815 case 'x': { | |
1816 if (UnquotedValue.size() < 3) | |
1817 // TODO: Report error. | |
1818 break; | |
1819 unsigned int UnicodeScalarValue; | |
1820 if (UnquotedValue.substr(1, 2).getAsInteger(16, UnicodeScalarValue)) | |
1821 // TODO: Report error. | |
1822 UnicodeScalarValue = 0xFFFD; | |
1823 encodeUTF8(UnicodeScalarValue, Storage); | |
1824 UnquotedValue = UnquotedValue.substr(2); | |
1825 break; | |
1826 } | |
1827 case 'u': { | |
1828 if (UnquotedValue.size() < 5) | |
1829 // TODO: Report error. | |
1830 break; | |
1831 unsigned int UnicodeScalarValue; | |
1832 if (UnquotedValue.substr(1, 4).getAsInteger(16, UnicodeScalarValue)) | |
1833 // TODO: Report error. | |
1834 UnicodeScalarValue = 0xFFFD; | |
1835 encodeUTF8(UnicodeScalarValue, Storage); | |
1836 UnquotedValue = UnquotedValue.substr(4); | |
1837 break; | |
1838 } | |
1839 case 'U': { | |
1840 if (UnquotedValue.size() < 9) | |
1841 // TODO: Report error. | |
1842 break; | |
1843 unsigned int UnicodeScalarValue; | |
1844 if (UnquotedValue.substr(1, 8).getAsInteger(16, UnicodeScalarValue)) | |
1845 // TODO: Report error. | |
1846 UnicodeScalarValue = 0xFFFD; | |
1847 encodeUTF8(UnicodeScalarValue, Storage); | |
1848 UnquotedValue = UnquotedValue.substr(8); | |
1849 break; | |
1850 } | |
1851 } | |
1852 UnquotedValue = UnquotedValue.substr(1); | |
1853 } | |
1854 } | |
1855 Storage.insert(Storage.end(), UnquotedValue.begin(), UnquotedValue.end()); | |
1856 return StringRef(Storage.begin(), Storage.size()); | |
1857 } | |
1858 | |
1859 Node *KeyValueNode::getKey() { | |
1860 if (Key) | |
1861 return Key; | |
1862 // Handle implicit null keys. | |
1863 { | |
1864 Token &t = peekNext(); | |
1865 if ( t.Kind == Token::TK_BlockEnd | |
1866 || t.Kind == Token::TK_Value | |
1867 || t.Kind == Token::TK_Error) { | |
1868 return Key = new (getAllocator()) NullNode(Doc); | |
1869 } | |
1870 if (t.Kind == Token::TK_Key) | |
1871 getNext(); // skip TK_Key. | |
1872 } | |
1873 | |
1874 // Handle explicit null keys. | |
1875 Token &t = peekNext(); | |
1876 if (t.Kind == Token::TK_BlockEnd || t.Kind == Token::TK_Value) { | |
1877 return Key = new (getAllocator()) NullNode(Doc); | |
1878 } | |
1879 | |
1880 // We've got a normal key. | |
1881 return Key = parseBlockNode(); | |
1882 } | |
1883 | |
1884 Node *KeyValueNode::getValue() { | |
1885 if (Value) | |
1886 return Value; | |
1887 getKey()->skip(); | |
1888 if (failed()) | |
1889 return Value = new (getAllocator()) NullNode(Doc); | |
1890 | |
1891 // Handle implicit null values. | |
1892 { | |
1893 Token &t = peekNext(); | |
1894 if ( t.Kind == Token::TK_BlockEnd | |
1895 || t.Kind == Token::TK_FlowMappingEnd | |
1896 || t.Kind == Token::TK_Key | |
1897 || t.Kind == Token::TK_FlowEntry | |
1898 || t.Kind == Token::TK_Error) { | |
1899 return Value = new (getAllocator()) NullNode(Doc); | |
1900 } | |
1901 | |
1902 if (t.Kind != Token::TK_Value) { | |
1903 setError("Unexpected token in Key Value.", t); | |
1904 return Value = new (getAllocator()) NullNode(Doc); | |
1905 } | |
1906 getNext(); // skip TK_Value. | |
1907 } | |
1908 | |
1909 // Handle explicit null values. | |
1910 Token &t = peekNext(); | |
1911 if (t.Kind == Token::TK_BlockEnd || t.Kind == Token::TK_Key) { | |
1912 return Value = new (getAllocator()) NullNode(Doc); | |
1913 } | |
1914 | |
1915 // We got a normal value. | |
1916 return Value = parseBlockNode(); | |
1917 } | |
1918 | |
1919 void MappingNode::increment() { | |
1920 if (failed()) { | |
1921 IsAtEnd = true; | |
1922 CurrentEntry = 0; | |
1923 return; | |
1924 } | |
1925 if (CurrentEntry) { | |
1926 CurrentEntry->skip(); | |
1927 if (Type == MT_Inline) { | |
1928 IsAtEnd = true; | |
1929 CurrentEntry = 0; | |
1930 return; | |
1931 } | |
1932 } | |
1933 Token T = peekNext(); | |
1934 if (T.Kind == Token::TK_Key || T.Kind == Token::TK_Scalar) { | |
1935 // KeyValueNode eats the TK_Key. That way it can detect null keys. | |
1936 CurrentEntry = new (getAllocator()) KeyValueNode(Doc); | |
1937 } else if (Type == MT_Block) { | |
1938 switch (T.Kind) { | |
1939 case Token::TK_BlockEnd: | |
1940 getNext(); | |
1941 IsAtEnd = true; | |
1942 CurrentEntry = 0; | |
1943 break; | |
1944 default: | |
1945 setError("Unexpected token. Expected Key or Block End", T); | |
1946 case Token::TK_Error: | |
1947 IsAtEnd = true; | |
1948 CurrentEntry = 0; | |
1949 } | |
1950 } else { | |
1951 switch (T.Kind) { | |
1952 case Token::TK_FlowEntry: | |
1953 // Eat the flow entry and recurse. | |
1954 getNext(); | |
1955 return increment(); | |
1956 case Token::TK_FlowMappingEnd: | |
1957 getNext(); | |
1958 case Token::TK_Error: | |
1959 // Set this to end iterator. | |
1960 IsAtEnd = true; | |
1961 CurrentEntry = 0; | |
1962 break; | |
1963 default: | |
1964 setError( "Unexpected token. Expected Key, Flow Entry, or Flow " | |
1965 "Mapping End." | |
1966 , T); | |
1967 IsAtEnd = true; | |
1968 CurrentEntry = 0; | |
1969 } | |
1970 } | |
1971 } | |
1972 | |
1973 void SequenceNode::increment() { | |
1974 if (failed()) { | |
1975 IsAtEnd = true; | |
1976 CurrentEntry = 0; | |
1977 return; | |
1978 } | |
1979 if (CurrentEntry) | |
1980 CurrentEntry->skip(); | |
1981 Token T = peekNext(); | |
1982 if (SeqType == ST_Block) { | |
1983 switch (T.Kind) { | |
1984 case Token::TK_BlockEntry: | |
1985 getNext(); | |
1986 CurrentEntry = parseBlockNode(); | |
1987 if (CurrentEntry == 0) { // An error occurred. | |
1988 IsAtEnd = true; | |
1989 CurrentEntry = 0; | |
1990 } | |
1991 break; | |
1992 case Token::TK_BlockEnd: | |
1993 getNext(); | |
1994 IsAtEnd = true; | |
1995 CurrentEntry = 0; | |
1996 break; | |
1997 default: | |
1998 setError( "Unexpected token. Expected Block Entry or Block End." | |
1999 , T); | |
2000 case Token::TK_Error: | |
2001 IsAtEnd = true; | |
2002 CurrentEntry = 0; | |
2003 } | |
2004 } else if (SeqType == ST_Indentless) { | |
2005 switch (T.Kind) { | |
2006 case Token::TK_BlockEntry: | |
2007 getNext(); | |
2008 CurrentEntry = parseBlockNode(); | |
2009 if (CurrentEntry == 0) { // An error occurred. | |
2010 IsAtEnd = true; | |
2011 CurrentEntry = 0; | |
2012 } | |
2013 break; | |
2014 default: | |
2015 case Token::TK_Error: | |
2016 IsAtEnd = true; | |
2017 CurrentEntry = 0; | |
2018 } | |
2019 } else if (SeqType == ST_Flow) { | |
2020 switch (T.Kind) { | |
2021 case Token::TK_FlowEntry: | |
2022 // Eat the flow entry and recurse. | |
2023 getNext(); | |
2024 WasPreviousTokenFlowEntry = true; | |
2025 return increment(); | |
2026 case Token::TK_FlowSequenceEnd: | |
2027 getNext(); | |
2028 case Token::TK_Error: | |
2029 // Set this to end iterator. | |
2030 IsAtEnd = true; | |
2031 CurrentEntry = 0; | |
2032 break; | |
2033 case Token::TK_StreamEnd: | |
2034 case Token::TK_DocumentEnd: | |
2035 case Token::TK_DocumentStart: | |
2036 setError("Could not find closing ]!", T); | |
2037 // Set this to end iterator. | |
2038 IsAtEnd = true; | |
2039 CurrentEntry = 0; | |
2040 break; | |
2041 default: | |
2042 if (!WasPreviousTokenFlowEntry) { | |
2043 setError("Expected , between entries!", T); | |
2044 IsAtEnd = true; | |
2045 CurrentEntry = 0; | |
2046 break; | |
2047 } | |
2048 // Otherwise it must be a flow entry. | |
2049 CurrentEntry = parseBlockNode(); | |
2050 if (!CurrentEntry) { | |
2051 IsAtEnd = true; | |
2052 } | |
2053 WasPreviousTokenFlowEntry = false; | |
2054 break; | |
2055 } | |
2056 } | |
2057 } | |
2058 | |
2059 Document::Document(Stream &S) : stream(S), Root(0) { | |
2060 // Tag maps starts with two default mappings. | |
2061 TagMap["!"] = "!"; | |
2062 TagMap["!!"] = "tag:yaml.org,2002:"; | |
2063 | |
2064 if (parseDirectives()) | |
2065 expectToken(Token::TK_DocumentStart); | |
2066 Token &T = peekNext(); | |
2067 if (T.Kind == Token::TK_DocumentStart) | |
2068 getNext(); | |
2069 } | |
2070 | |
2071 bool Document::skip() { | |
2072 if (stream.scanner->failed()) | |
2073 return false; | |
2074 if (!Root) | |
2075 getRoot(); | |
2076 Root->skip(); | |
2077 Token &T = peekNext(); | |
2078 if (T.Kind == Token::TK_StreamEnd) | |
2079 return false; | |
2080 if (T.Kind == Token::TK_DocumentEnd) { | |
2081 getNext(); | |
2082 return skip(); | |
2083 } | |
2084 return true; | |
2085 } | |
2086 | |
2087 Token &Document::peekNext() { | |
2088 return stream.scanner->peekNext(); | |
2089 } | |
2090 | |
2091 Token Document::getNext() { | |
2092 return stream.scanner->getNext(); | |
2093 } | |
2094 | |
2095 void Document::setError(const Twine &Message, Token &Location) const { | |
2096 stream.scanner->setError(Message, Location.Range.begin()); | |
2097 } | |
2098 | |
2099 bool Document::failed() const { | |
2100 return stream.scanner->failed(); | |
2101 } | |
2102 | |
2103 Node *Document::parseBlockNode() { | |
2104 Token T = peekNext(); | |
2105 // Handle properties. | |
2106 Token AnchorInfo; | |
2107 Token TagInfo; | |
2108 parse_property: | |
2109 switch (T.Kind) { | |
2110 case Token::TK_Alias: | |
2111 getNext(); | |
2112 return new (NodeAllocator) AliasNode(stream.CurrentDoc, T.Range.substr(1)); | |
2113 case Token::TK_Anchor: | |
2114 if (AnchorInfo.Kind == Token::TK_Anchor) { | |
2115 setError("Already encountered an anchor for this node!", T); | |
2116 return 0; | |
2117 } | |
2118 AnchorInfo = getNext(); // Consume TK_Anchor. | |
2119 T = peekNext(); | |
2120 goto parse_property; | |
2121 case Token::TK_Tag: | |
2122 if (TagInfo.Kind == Token::TK_Tag) { | |
2123 setError("Already encountered a tag for this node!", T); | |
2124 return 0; | |
2125 } | |
2126 TagInfo = getNext(); // Consume TK_Tag. | |
2127 T = peekNext(); | |
2128 goto parse_property; | |
2129 default: | |
2130 break; | |
2131 } | |
2132 | |
2133 switch (T.Kind) { | |
2134 case Token::TK_BlockEntry: | |
2135 // We got an unindented BlockEntry sequence. This is not terminated with | |
2136 // a BlockEnd. | |
2137 // Don't eat the TK_BlockEntry, SequenceNode needs it. | |
2138 return new (NodeAllocator) SequenceNode( stream.CurrentDoc | |
2139 , AnchorInfo.Range.substr(1) | |
2140 , TagInfo.Range | |
2141 , SequenceNode::ST_Indentless); | |
2142 case Token::TK_BlockSequenceStart: | |
2143 getNext(); | |
2144 return new (NodeAllocator) | |
2145 SequenceNode( stream.CurrentDoc | |
2146 , AnchorInfo.Range.substr(1) | |
2147 , TagInfo.Range | |
2148 , SequenceNode::ST_Block); | |
2149 case Token::TK_BlockMappingStart: | |
2150 getNext(); | |
2151 return new (NodeAllocator) | |
2152 MappingNode( stream.CurrentDoc | |
2153 , AnchorInfo.Range.substr(1) | |
2154 , TagInfo.Range | |
2155 , MappingNode::MT_Block); | |
2156 case Token::TK_FlowSequenceStart: | |
2157 getNext(); | |
2158 return new (NodeAllocator) | |
2159 SequenceNode( stream.CurrentDoc | |
2160 , AnchorInfo.Range.substr(1) | |
2161 , TagInfo.Range | |
2162 , SequenceNode::ST_Flow); | |
2163 case Token::TK_FlowMappingStart: | |
2164 getNext(); | |
2165 return new (NodeAllocator) | |
2166 MappingNode( stream.CurrentDoc | |
2167 , AnchorInfo.Range.substr(1) | |
2168 , TagInfo.Range | |
2169 , MappingNode::MT_Flow); | |
2170 case Token::TK_Scalar: | |
2171 getNext(); | |
2172 return new (NodeAllocator) | |
2173 ScalarNode( stream.CurrentDoc | |
2174 , AnchorInfo.Range.substr(1) | |
2175 , TagInfo.Range | |
2176 , T.Range); | |
2177 case Token::TK_Key: | |
2178 // Don't eat the TK_Key, KeyValueNode expects it. | |
2179 return new (NodeAllocator) | |
2180 MappingNode( stream.CurrentDoc | |
2181 , AnchorInfo.Range.substr(1) | |
2182 , TagInfo.Range | |
2183 , MappingNode::MT_Inline); | |
2184 case Token::TK_DocumentStart: | |
2185 case Token::TK_DocumentEnd: | |
2186 case Token::TK_StreamEnd: | |
2187 default: | |
2188 // TODO: Properly handle tags. "[!!str ]" should resolve to !!str "", not | |
2189 // !!null null. | |
2190 return new (NodeAllocator) NullNode(stream.CurrentDoc); | |
2191 case Token::TK_Error: | |
2192 return 0; | |
2193 } | |
2194 llvm_unreachable("Control flow shouldn't reach here."); | |
2195 return 0; | |
2196 } | |
2197 | |
2198 bool Document::parseDirectives() { | |
2199 bool isDirective = false; | |
2200 while (true) { | |
2201 Token T = peekNext(); | |
2202 if (T.Kind == Token::TK_TagDirective) { | |
2203 parseTAGDirective(); | |
2204 isDirective = true; | |
2205 } else if (T.Kind == Token::TK_VersionDirective) { | |
2206 parseYAMLDirective(); | |
2207 isDirective = true; | |
2208 } else | |
2209 break; | |
2210 } | |
2211 return isDirective; | |
2212 } | |
2213 | |
2214 void Document::parseYAMLDirective() { | |
2215 getNext(); // Eat %YAML <version> | |
2216 } | |
2217 | |
2218 void Document::parseTAGDirective() { | |
2219 Token Tag = getNext(); // %TAG <handle> <prefix> | |
2220 StringRef T = Tag.Range; | |
2221 // Strip %TAG | |
2222 T = T.substr(T.find_first_of(" \t")).ltrim(" \t"); | |
2223 std::size_t HandleEnd = T.find_first_of(" \t"); | |
2224 StringRef TagHandle = T.substr(0, HandleEnd); | |
2225 StringRef TagPrefix = T.substr(HandleEnd).ltrim(" \t"); | |
2226 TagMap[TagHandle] = TagPrefix; | |
2227 } | |
2228 | |
2229 bool Document::expectToken(int TK) { | |
2230 Token T = getNext(); | |
2231 if (T.Kind != TK) { | |
2232 setError("Unexpected token", T); | |
2233 return false; | |
2234 } | |
2235 return true; | |
2236 } |