Mercurial > hg > CbC > CbC_llvm
comparison tools/llvm-rc/ResourceScriptToken.cpp @ 122:36195a0db682
merging ( incomplete )
author | Shinji KONO <kono@ie.u-ryukyu.ac.jp> |
---|---|
date | Fri, 17 Nov 2017 20:32:31 +0900 |
parents | 803732b1fca8 |
children | 3a76565eade5 |
comparison
equal
deleted
inserted
replaced
119:d9df2cbd60cd | 122:36195a0db682 |
---|---|
1 //===-- ResourceScriptToken.cpp ---------------------------------*- C++-*-===// | |
2 // | |
3 // The LLVM Compiler Infrastructure | |
4 // | |
5 // This file is distributed under the University of Illinois Open Source | |
6 // License. See LICENSE.TXT for details. | |
7 // | |
8 //===---------------------------------------------------------------------===// | |
9 // | |
10 // This file implements an interface defined in ResourceScriptToken.h. | |
11 // In particular, it defines an .rc script tokenizer. | |
12 // | |
13 //===---------------------------------------------------------------------===// | |
14 | |
15 #include "ResourceScriptToken.h" | |
16 #include "llvm/Support/raw_ostream.h" | |
17 | |
18 #include <algorithm> | |
19 #include <cassert> | |
20 #include <cctype> | |
21 #include <cstdlib> | |
22 #include <utility> | |
23 | |
24 using namespace llvm; | |
25 | |
26 using Kind = RCToken::Kind; | |
27 | |
28 // Checks if Representation is a correct description of an RC integer. | |
29 // It should be a 32-bit unsigned integer, either decimal, octal (0[0-7]+), | |
30 // or hexadecimal (0x[0-9a-f]+). It might be followed by a single 'L' | |
31 // character (that is the difference between our representation and | |
32 // StringRef's one). If Representation is correct, 'true' is returned and | |
33 // the return value is put back in Num. | |
34 static bool rcGetAsInteger(StringRef Representation, uint32_t &Num) { | |
35 size_t Length = Representation.size(); | |
36 if (Length == 0) | |
37 return false; | |
38 // Strip the last 'L' if unnecessary. | |
39 if (std::toupper(Representation.back()) == 'L') | |
40 Representation = Representation.drop_back(1); | |
41 | |
42 return !Representation.getAsInteger<uint32_t>(0, Num); | |
43 } | |
44 | |
// Constructs a token of the given kind backed by (not owning) the given text.
RCToken::RCToken(RCToken::Kind RCTokenKind, StringRef Value)
    : TokenKind(RCTokenKind), TokenValue(Value) {}
47 | |
48 uint32_t RCToken::intValue() const { | |
49 assert(TokenKind == Kind::Int); | |
50 // We assume that the token already is a correct integer (checked by | |
51 // rcGetAsInteger). | |
52 uint32_t Result; | |
53 bool IsSuccess = rcGetAsInteger(TokenValue, Result); | |
54 assert(IsSuccess); | |
55 (void)IsSuccess; // Silence the compiler warning when -DNDEBUG flag is on. | |
56 return Result; | |
57 } | |
58 | |
59 bool RCToken::isLongInt() const { | |
60 return TokenKind == Kind::Int && std::toupper(TokenValue.back()) == 'L'; | |
61 } | |
62 | |
// Returns the token's underlying text (a view into the tokenized input).
StringRef RCToken::value() const { return TokenValue; }

// Returns the token's classification.
Kind RCToken::kind() const { return TokenKind; }
66 | |
67 bool RCToken::isBinaryOp() const { | |
68 switch (TokenKind) { | |
69 case Kind::Plus: | |
70 case Kind::Minus: | |
71 case Kind::Pipe: | |
72 case Kind::Amp: | |
73 return true; | |
74 default: | |
75 return false; | |
76 } | |
77 } | |
78 | |
// Wraps a tokenizer diagnostic into an llvm::Error with a common
// "Error parsing file: " prefix; the error carries no recoverable code.
static Error getStringError(const Twine &message) {
  return make_error<StringError>("Error parsing file: " + message,
                                 inconvertibleErrorCode());
}
83 | |
84 namespace { | |
85 | |
// Splits an .rc script into RCTokens. Single-use: construct with the input
// buffer, then call run() once.
class Tokenizer {
public:
  Tokenizer(StringRef Input) : Data(Input), DataLength(Input.size()) {}

  // Tokenizes the whole input; returns the token sequence or the first
  // lexing error encountered.
  Expected<std::vector<RCToken>> run();

private:
  // All 'advancing' methods return boolean values; if they're equal to false,
  // the stream has ended or failed.
  bool advance(size_t Amount = 1);
  bool skipWhitespaces();

  // Consumes a token. If any problem occurred, a non-empty Error is returned.
  Error consumeToken(const Kind TokenKind);

  // Check if tokenizer is about to read FollowingChars.
  bool willNowRead(StringRef FollowingChars) const;

  // Check if tokenizer can start reading an identifier at current position.
  // The original tool did not specify the rules to determine what is a correct
  // identifier. We assume they should follow the C convention:
  // [a-zA-Z_][a-zA-Z0-9_]*.
  bool canStartIdentifier() const;
  // Check if tokenizer can continue reading an identifier.
  bool canContinueIdentifier() const;

  // Check if tokenizer can start reading an integer.
  // A correct integer always starts with a 0-9 digit,
  // can contain characters 0-9A-Fa-f (digits),
  // Ll (marking the integer is 32-bit), Xx (marking the representation
  // is hexadecimal). As some kind of separator should come after the
  // integer, we can consume the integer until a non-alphanumeric
  // character.
  bool canStartInt() const;
  bool canContinueInt() const;

  bool canStartString() const;

  // Check if tokenizer can start reading a single line comment (e.g. a comment
  // that begins with '//')
  bool canStartLineComment() const;

  // Check if tokenizer can start or finish reading a block comment (e.g. a
  // comment that begins with '/*' and ends with '*/')
  bool canStartBlockComment() const;

  // Throw away all remaining characters on the current line.
  void skipCurrentLine();

  // True when the cursor has reached the end of the input.
  bool streamEof() const;

  // Classify the token that is about to be read from the current position.
  Kind classifyCurrentToken() const;

  // Process the Kind::Identifier token - check if it is
  // an identifier describing a block start or end.
  void processIdentifier(RCToken &token) const;

  StringRef Data;
  size_t DataLength, Pos; // Pos is initialized at the start of run().
};
147 | |
148 void Tokenizer::skipCurrentLine() { | |
149 Pos = Data.find_first_of("\r\n", Pos); | |
150 Pos = Data.find_first_not_of("\r\n", Pos); | |
151 | |
152 if (Pos == StringRef::npos) | |
153 Pos = DataLength; | |
154 } | |
155 | |
// Main tokenization loop: classify the token at the cursor, consume it,
// validate/re-classify where needed, and collect the results. Returns the
// first error encountered (invalid character, unterminated comment/string,
// or malformed integer).
Expected<std::vector<RCToken>> Tokenizer::run() {
  Pos = 0;
  std::vector<RCToken> Result;

  // Consume an optional UTF-8 Byte Order Mark.
  if (willNowRead("\xef\xbb\xbf"))
    advance(3);

  while (!streamEof()) {
    if (!skipWhitespaces())
      break;

    Kind TokenKind = classifyCurrentToken();
    if (TokenKind == Kind::Invalid)
      return getStringError("Invalid token found at position " + Twine(Pos));

    const size_t TokenStart = Pos;
    if (Error TokenError = consumeToken(TokenKind))
      return std::move(TokenError);

    // Comments are just deleted, don't bother saving them.
    if (TokenKind == Kind::LineComment || TokenKind == Kind::StartComment)
      continue;

    // The token's text is the half-open input range [TokenStart, Pos).
    RCToken Token(TokenKind, Data.take_front(Pos).drop_front(TokenStart));
    if (TokenKind == Kind::Identifier) {
      // BEGIN/END identifiers get re-classified as block delimiters.
      processIdentifier(Token);
    } else if (TokenKind == Kind::Int) {
      // Validate the integer eagerly so intValue() can assume success later.
      uint32_t TokenInt;
      if (!rcGetAsInteger(Token.value(), TokenInt)) {
        // The integer has incorrect format or cannot be represented in
        // a 32-bit integer.
        return getStringError("Integer invalid or too large: " +
                              Token.value().str());
      }
    }

    Result.push_back(Token);
  }

  return Result;
}
198 | |
199 bool Tokenizer::advance(size_t Amount) { | |
200 Pos += Amount; | |
201 return !streamEof(); | |
202 } | |
203 | |
204 bool Tokenizer::skipWhitespaces() { | |
205 while (!streamEof() && std::isspace(Data[Pos])) | |
206 advance(); | |
207 return !streamEof(); | |
208 } | |
209 | |
// Consumes the token of the given kind starting at the cursor, advancing
// Pos past its last character. Returns a non-empty Error on malformed
// input (unterminated block comment or string literal).
Error Tokenizer::consumeToken(const Kind TokenKind) {
  switch (TokenKind) {
  // One-character token consumption.
#define TOKEN(Name)
#define SHORT_TOKEN(Name, Ch) case Kind::Name:
#include "ResourceScriptTokenList.h"
#undef TOKEN
#undef SHORT_TOKEN
    advance();
    return Error::success();

  case Kind::LineComment:
    // Skip the "//" marker, then everything up to the end of the line.
    advance(2);
    skipCurrentLine();
    return Error::success();

  case Kind::StartComment: {
    // Skip the "/*" marker, then find the matching "*/".
    advance(2);
    auto EndPos = Data.find("*/", Pos);
    if (EndPos == StringRef::npos)
      return getStringError(
          "Unclosed multi-line comment beginning at position " + Twine(Pos));
    advance(EndPos - Pos);
    advance(2);
    return Error::success();
  }
  case Kind::Identifier:
    while (!streamEof() && canContinueIdentifier())
      advance();
    return Error::success();

  case Kind::Int:
    while (!streamEof() && canContinueInt())
      advance();
    return Error::success();

  case Kind::String:
    // Consume the preceding 'L', if there is any.
    // NOTE(review): Data[Pos] here is '"', 'L' or 'l' (guaranteed by
    // canStartString), so passing it to std::toupper is safe.
    if (std::toupper(Data[Pos]) == 'L')
      advance();
    // Consume the double-quote.
    advance();

    // Consume the characters until the end of the file, line or string.
    while (true) {
      if (streamEof()) {
        return getStringError("Unterminated string literal.");
      } else if (Data[Pos] == '"') {
        // Consume the ending double-quote.
        advance();
        // However, if another '"' follows this double-quote, the string didn't
        // end and we just included '"' into the string.
        if (!willNowRead("\""))
          return Error::success();
      } else if (Data[Pos] == '\n') {
        return getStringError("String literal not terminated in the line.");
      }

      advance();
    }

  case Kind::Invalid:
    assert(false && "Cannot consume an invalid token.");
  }

  llvm_unreachable("Unknown RCToken::Kind");
}
277 | |
278 bool Tokenizer::willNowRead(StringRef FollowingChars) const { | |
279 return Data.drop_front(Pos).startswith(FollowingChars); | |
280 } | |
281 | |
282 bool Tokenizer::canStartIdentifier() const { | |
283 assert(!streamEof()); | |
284 | |
285 const char CurChar = Data[Pos]; | |
286 return std::isalpha(CurChar) || CurChar == '_'; | |
287 } | |
288 | |
289 bool Tokenizer::canContinueIdentifier() const { | |
290 assert(!streamEof()); | |
291 const char CurChar = Data[Pos]; | |
292 return std::isalnum(CurChar) || CurChar == '_'; | |
293 } | |
294 | |
295 bool Tokenizer::canStartInt() const { | |
296 assert(!streamEof()); | |
297 return std::isdigit(Data[Pos]); | |
298 } | |
299 | |
300 bool Tokenizer::canStartBlockComment() const { | |
301 assert(!streamEof()); | |
302 return Data.drop_front(Pos).startswith("/*"); | |
303 } | |
304 | |
305 bool Tokenizer::canStartLineComment() const { | |
306 assert(!streamEof()); | |
307 return Data.drop_front(Pos).startswith("//"); | |
308 } | |
309 | |
310 bool Tokenizer::canContinueInt() const { | |
311 assert(!streamEof()); | |
312 return std::isalnum(Data[Pos]); | |
313 } | |
314 | |
315 bool Tokenizer::canStartString() const { | |
316 return willNowRead("\"") || willNowRead("L\"") || willNowRead("l\""); | |
317 } | |
318 | |
319 bool Tokenizer::streamEof() const { return Pos == DataLength; } | |
320 | |
// Decides which token kind begins at the current position without consuming
// anything. Returns Kind::Invalid for a character that cannot start a token.
// Comments are tested before the single-character token table so that '/'
// followed by '/' or '*' is never misread as an operator.
Kind Tokenizer::classifyCurrentToken() const {
  if (canStartBlockComment())
    return Kind::StartComment;
  if (canStartLineComment())
    return Kind::LineComment;

  if (canStartInt())
    return Kind::Int;
  if (canStartString())
    return Kind::String;
  // BEGIN and END are at this point of lexing recognized as identifiers.
  if (canStartIdentifier())
    return Kind::Identifier;

  const char CurChar = Data[Pos];

  switch (CurChar) {
  // One-character token classification.
#define TOKEN(Name)
#define SHORT_TOKEN(Name, Ch)                                                  \
  case Ch:                                                                     \
    return Kind::Name;
#include "ResourceScriptTokenList.h"
#undef TOKEN
#undef SHORT_TOKEN

  default:
    return Kind::Invalid;
  }
}
351 | |
352 void Tokenizer::processIdentifier(RCToken &Token) const { | |
353 assert(Token.kind() == Kind::Identifier); | |
354 StringRef Name = Token.value(); | |
355 | |
356 if (Name.equals_lower("begin")) | |
357 Token = RCToken(Kind::BlockBegin, Name); | |
358 else if (Name.equals_lower("end")) | |
359 Token = RCToken(Kind::BlockEnd, Name); | |
360 } | |
361 | |
362 } // anonymous namespace | |
363 | |
364 namespace llvm { | |
365 | |
366 Expected<std::vector<RCToken>> tokenizeRC(StringRef Input) { | |
367 return Tokenizer(Input).run(); | |
368 } | |
369 | |
370 } // namespace llvm |