Mercurial > hg > Members > tobaru > cbc > CbC_llvm
comparison lib/Support/StringRef.cpp @ 0:95c75e76d11b
LLVM 3.4
author | Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp> |
---|---|
date | Thu, 12 Dec 2013 13:56:28 +0900 |
parents | |
children | 54457678186b |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:95c75e76d11b |
---|---|
1 //===-- StringRef.cpp - Lightweight String References ---------------------===// | |
2 // | |
3 // The LLVM Compiler Infrastructure | |
4 // | |
5 // This file is distributed under the University of Illinois Open Source | |
6 // License. See LICENSE.TXT for details. | |
7 // | |
8 //===----------------------------------------------------------------------===// | |
9 | |
10 #include "llvm/ADT/StringRef.h" | |
11 #include "llvm/ADT/APInt.h" | |
12 #include "llvm/ADT/Hashing.h" | |
13 #include "llvm/ADT/OwningPtr.h" | |
14 #include "llvm/ADT/edit_distance.h" | |
15 #include <bitset> | |
16 | |
17 using namespace llvm; | |
18 | |
19 // MSVC emits references to this into the translation units which reference it. | |
20 #ifndef _MSC_VER | |
21 const size_t StringRef::npos; | |
22 #endif | |
23 | |
24 static char ascii_tolower(char x) { | |
25 if (x >= 'A' && x <= 'Z') | |
26 return x - 'A' + 'a'; | |
27 return x; | |
28 } | |
29 | |
30 static char ascii_toupper(char x) { | |
31 if (x >= 'a' && x <= 'z') | |
32 return x - 'a' + 'A'; | |
33 return x; | |
34 } | |
35 | |
36 static bool ascii_isdigit(char x) { | |
37 return x >= '0' && x <= '9'; | |
38 } | |
39 | |
40 // strncasecmp() is not available on non-POSIX systems, so define an | |
41 // alternative function here. | |
42 static int ascii_strncasecmp(const char *LHS, const char *RHS, size_t Length) { | |
43 for (size_t I = 0; I < Length; ++I) { | |
44 unsigned char LHC = ascii_tolower(LHS[I]); | |
45 unsigned char RHC = ascii_tolower(RHS[I]); | |
46 if (LHC != RHC) | |
47 return LHC < RHC ? -1 : 1; | |
48 } | |
49 return 0; | |
50 } | |
51 | |
52 /// compare_lower - Compare strings, ignoring case. | |
53 int StringRef::compare_lower(StringRef RHS) const { | |
54 if (int Res = ascii_strncasecmp(Data, RHS.Data, min(Length, RHS.Length))) | |
55 return Res; | |
56 if (Length == RHS.Length) | |
57 return 0; | |
58 return Length < RHS.Length ? -1 : 1; | |
59 } | |
60 | |
61 /// Check if this string starts with the given \p Prefix, ignoring case. | |
62 bool StringRef::startswith_lower(StringRef Prefix) const { | |
63 return Length >= Prefix.Length && | |
64 ascii_strncasecmp(Data, Prefix.Data, Prefix.Length) == 0; | |
65 } | |
66 | |
67 /// Check if this string ends with the given \p Suffix, ignoring case. | |
68 bool StringRef::endswith_lower(StringRef Suffix) const { | |
69 return Length >= Suffix.Length && | |
70 ascii_strncasecmp(end() - Suffix.Length, Suffix.Data, Suffix.Length) == 0; | |
71 } | |
72 | |
73 /// compare_numeric - Compare strings, handle embedded numbers. | |
74 int StringRef::compare_numeric(StringRef RHS) const { | |
75 for (size_t I = 0, E = min(Length, RHS.Length); I != E; ++I) { | |
76 // Check for sequences of digits. | |
77 if (ascii_isdigit(Data[I]) && ascii_isdigit(RHS.Data[I])) { | |
78 // The longer sequence of numbers is considered larger. | |
79 // This doesn't really handle prefixed zeros well. | |
80 size_t J; | |
81 for (J = I + 1; J != E + 1; ++J) { | |
82 bool ld = J < Length && ascii_isdigit(Data[J]); | |
83 bool rd = J < RHS.Length && ascii_isdigit(RHS.Data[J]); | |
84 if (ld != rd) | |
85 return rd ? -1 : 1; | |
86 if (!rd) | |
87 break; | |
88 } | |
89 // The two number sequences have the same length (J-I), just memcmp them. | |
90 if (int Res = compareMemory(Data + I, RHS.Data + I, J - I)) | |
91 return Res < 0 ? -1 : 1; | |
92 // Identical number sequences, continue search after the numbers. | |
93 I = J - 1; | |
94 continue; | |
95 } | |
96 if (Data[I] != RHS.Data[I]) | |
97 return (unsigned char)Data[I] < (unsigned char)RHS.Data[I] ? -1 : 1; | |
98 } | |
99 if (Length == RHS.Length) | |
100 return 0; | |
101 return Length < RHS.Length ? -1 : 1; | |
102 } | |
103 | |
104 // Compute the edit distance between the two given strings. | |
105 unsigned StringRef::edit_distance(llvm::StringRef Other, | |
106 bool AllowReplacements, | |
107 unsigned MaxEditDistance) const { | |
108 return llvm::ComputeEditDistance( | |
109 llvm::ArrayRef<char>(data(), size()), | |
110 llvm::ArrayRef<char>(Other.data(), Other.size()), | |
111 AllowReplacements, MaxEditDistance); | |
112 } | |
113 | |
114 //===----------------------------------------------------------------------===// | |
115 // String Operations | |
116 //===----------------------------------------------------------------------===// | |
117 | |
118 std::string StringRef::lower() const { | |
119 std::string Result(size(), char()); | |
120 for (size_type i = 0, e = size(); i != e; ++i) { | |
121 Result[i] = ascii_tolower(Data[i]); | |
122 } | |
123 return Result; | |
124 } | |
125 | |
126 std::string StringRef::upper() const { | |
127 std::string Result(size(), char()); | |
128 for (size_type i = 0, e = size(); i != e; ++i) { | |
129 Result[i] = ascii_toupper(Data[i]); | |
130 } | |
131 return Result; | |
132 } | |
133 | |
134 //===----------------------------------------------------------------------===// | |
135 // String Searching | |
136 //===----------------------------------------------------------------------===// | |
137 | |
138 | |
139 /// find - Search for the first string \arg Str in the string. | |
140 /// | |
141 /// \return - The index of the first occurrence of \arg Str, or npos if not | |
142 /// found. | |
143 size_t StringRef::find(StringRef Str, size_t From) const { | |
144 size_t N = Str.size(); | |
145 if (N > Length) | |
146 return npos; | |
147 | |
148 // For short haystacks or unsupported needles fall back to the naive algorithm | |
149 if (Length < 16 || N > 255 || N == 0) { | |
150 for (size_t e = Length - N + 1, i = min(From, e); i != e; ++i) | |
151 if (substr(i, N).equals(Str)) | |
152 return i; | |
153 return npos; | |
154 } | |
155 | |
156 if (From >= Length) | |
157 return npos; | |
158 | |
159 // Build the bad char heuristic table, with uint8_t to reduce cache thrashing. | |
160 uint8_t BadCharSkip[256]; | |
161 std::memset(BadCharSkip, N, 256); | |
162 for (unsigned i = 0; i != N-1; ++i) | |
163 BadCharSkip[(uint8_t)Str[i]] = N-1-i; | |
164 | |
165 unsigned Len = Length-From, Pos = From; | |
166 while (Len >= N) { | |
167 if (substr(Pos, N).equals(Str)) // See if this is the correct substring. | |
168 return Pos; | |
169 | |
170 // Otherwise skip the appropriate number of bytes. | |
171 uint8_t Skip = BadCharSkip[(uint8_t)(*this)[Pos+N-1]]; | |
172 Len -= Skip; | |
173 Pos += Skip; | |
174 } | |
175 | |
176 return npos; | |
177 } | |
178 | |
179 /// rfind - Search for the last string \arg Str in the string. | |
180 /// | |
181 /// \return - The index of the last occurrence of \arg Str, or npos if not | |
182 /// found. | |
183 size_t StringRef::rfind(StringRef Str) const { | |
184 size_t N = Str.size(); | |
185 if (N > Length) | |
186 return npos; | |
187 for (size_t i = Length - N + 1, e = 0; i != e;) { | |
188 --i; | |
189 if (substr(i, N).equals(Str)) | |
190 return i; | |
191 } | |
192 return npos; | |
193 } | |
194 | |
195 /// find_first_of - Find the first character in the string that is in \arg | |
196 /// Chars, or npos if not found. | |
197 /// | |
198 /// Note: O(size() + Chars.size()) | |
199 StringRef::size_type StringRef::find_first_of(StringRef Chars, | |
200 size_t From) const { | |
201 std::bitset<1 << CHAR_BIT> CharBits; | |
202 for (size_type i = 0; i != Chars.size(); ++i) | |
203 CharBits.set((unsigned char)Chars[i]); | |
204 | |
205 for (size_type i = min(From, Length), e = Length; i != e; ++i) | |
206 if (CharBits.test((unsigned char)Data[i])) | |
207 return i; | |
208 return npos; | |
209 } | |
210 | |
211 /// find_first_not_of - Find the first character in the string that is not | |
212 /// \arg C or npos if not found. | |
213 StringRef::size_type StringRef::find_first_not_of(char C, size_t From) const { | |
214 for (size_type i = min(From, Length), e = Length; i != e; ++i) | |
215 if (Data[i] != C) | |
216 return i; | |
217 return npos; | |
218 } | |
219 | |
220 /// find_first_not_of - Find the first character in the string that is not | |
221 /// in the string \arg Chars, or npos if not found. | |
222 /// | |
223 /// Note: O(size() + Chars.size()) | |
224 StringRef::size_type StringRef::find_first_not_of(StringRef Chars, | |
225 size_t From) const { | |
226 std::bitset<1 << CHAR_BIT> CharBits; | |
227 for (size_type i = 0; i != Chars.size(); ++i) | |
228 CharBits.set((unsigned char)Chars[i]); | |
229 | |
230 for (size_type i = min(From, Length), e = Length; i != e; ++i) | |
231 if (!CharBits.test((unsigned char)Data[i])) | |
232 return i; | |
233 return npos; | |
234 } | |
235 | |
236 /// find_last_of - Find the last character in the string that is in \arg C, | |
237 /// or npos if not found. | |
238 /// | |
239 /// Note: O(size() + Chars.size()) | |
240 StringRef::size_type StringRef::find_last_of(StringRef Chars, | |
241 size_t From) const { | |
242 std::bitset<1 << CHAR_BIT> CharBits; | |
243 for (size_type i = 0; i != Chars.size(); ++i) | |
244 CharBits.set((unsigned char)Chars[i]); | |
245 | |
246 for (size_type i = min(From, Length) - 1, e = -1; i != e; --i) | |
247 if (CharBits.test((unsigned char)Data[i])) | |
248 return i; | |
249 return npos; | |
250 } | |
251 | |
252 /// find_last_not_of - Find the last character in the string that is not | |
253 /// \arg C, or npos if not found. | |
254 StringRef::size_type StringRef::find_last_not_of(char C, size_t From) const { | |
255 for (size_type i = min(From, Length) - 1, e = -1; i != e; --i) | |
256 if (Data[i] != C) | |
257 return i; | |
258 return npos; | |
259 } | |
260 | |
261 /// find_last_not_of - Find the last character in the string that is not in | |
262 /// \arg Chars, or npos if not found. | |
263 /// | |
264 /// Note: O(size() + Chars.size()) | |
265 StringRef::size_type StringRef::find_last_not_of(StringRef Chars, | |
266 size_t From) const { | |
267 std::bitset<1 << CHAR_BIT> CharBits; | |
268 for (size_type i = 0, e = Chars.size(); i != e; ++i) | |
269 CharBits.set((unsigned char)Chars[i]); | |
270 | |
271 for (size_type i = min(From, Length) - 1, e = -1; i != e; --i) | |
272 if (!CharBits.test((unsigned char)Data[i])) | |
273 return i; | |
274 return npos; | |
275 } | |
276 | |
277 void StringRef::split(SmallVectorImpl<StringRef> &A, | |
278 StringRef Separators, int MaxSplit, | |
279 bool KeepEmpty) const { | |
280 StringRef rest = *this; | |
281 | |
282 // rest.data() is used to distinguish cases like "a," that splits into | |
283 // "a" + "" and "a" that splits into "a" + 0. | |
284 for (int splits = 0; | |
285 rest.data() != NULL && (MaxSplit < 0 || splits < MaxSplit); | |
286 ++splits) { | |
287 std::pair<StringRef, StringRef> p = rest.split(Separators); | |
288 | |
289 if (KeepEmpty || p.first.size() != 0) | |
290 A.push_back(p.first); | |
291 rest = p.second; | |
292 } | |
293 // If we have a tail left, add it. | |
294 if (rest.data() != NULL && (rest.size() != 0 || KeepEmpty)) | |
295 A.push_back(rest); | |
296 } | |
297 | |
298 //===----------------------------------------------------------------------===// | |
299 // Helpful Algorithms | |
300 //===----------------------------------------------------------------------===// | |
301 | |
302 /// count - Return the number of non-overlapped occurrences of \arg Str in | |
303 /// the string. | |
304 size_t StringRef::count(StringRef Str) const { | |
305 size_t Count = 0; | |
306 size_t N = Str.size(); | |
307 if (N > Length) | |
308 return 0; | |
309 for (size_t i = 0, e = Length - N + 1; i != e; ++i) | |
310 if (substr(i, N).equals(Str)) | |
311 ++Count; | |
312 return Count; | |
313 } | |
314 | |
315 static unsigned GetAutoSenseRadix(StringRef &Str) { | |
316 if (Str.startswith("0x")) { | |
317 Str = Str.substr(2); | |
318 return 16; | |
319 } | |
320 | |
321 if (Str.startswith("0b")) { | |
322 Str = Str.substr(2); | |
323 return 2; | |
324 } | |
325 | |
326 if (Str.startswith("0o")) { | |
327 Str = Str.substr(2); | |
328 return 8; | |
329 } | |
330 | |
331 if (Str.startswith("0")) | |
332 return 8; | |
333 | |
334 return 10; | |
335 } | |
336 | |
337 | |
338 /// GetAsUnsignedInteger - Workhorse method that converts a integer character | |
339 /// sequence of radix up to 36 to an unsigned long long value. | |
340 bool llvm::getAsUnsignedInteger(StringRef Str, unsigned Radix, | |
341 unsigned long long &Result) { | |
342 // Autosense radix if not specified. | |
343 if (Radix == 0) | |
344 Radix = GetAutoSenseRadix(Str); | |
345 | |
346 // Empty strings (after the radix autosense) are invalid. | |
347 if (Str.empty()) return true; | |
348 | |
349 // Parse all the bytes of the string given this radix. Watch for overflow. | |
350 Result = 0; | |
351 while (!Str.empty()) { | |
352 unsigned CharVal; | |
353 if (Str[0] >= '0' && Str[0] <= '9') | |
354 CharVal = Str[0]-'0'; | |
355 else if (Str[0] >= 'a' && Str[0] <= 'z') | |
356 CharVal = Str[0]-'a'+10; | |
357 else if (Str[0] >= 'A' && Str[0] <= 'Z') | |
358 CharVal = Str[0]-'A'+10; | |
359 else | |
360 return true; | |
361 | |
362 // If the parsed value is larger than the integer radix, the string is | |
363 // invalid. | |
364 if (CharVal >= Radix) | |
365 return true; | |
366 | |
367 // Add in this character. | |
368 unsigned long long PrevResult = Result; | |
369 Result = Result*Radix+CharVal; | |
370 | |
371 // Check for overflow by shifting back and seeing if bits were lost. | |
372 if (Result/Radix < PrevResult) | |
373 return true; | |
374 | |
375 Str = Str.substr(1); | |
376 } | |
377 | |
378 return false; | |
379 } | |
380 | |
381 bool llvm::getAsSignedInteger(StringRef Str, unsigned Radix, | |
382 long long &Result) { | |
383 unsigned long long ULLVal; | |
384 | |
385 // Handle positive strings first. | |
386 if (Str.empty() || Str.front() != '-') { | |
387 if (getAsUnsignedInteger(Str, Radix, ULLVal) || | |
388 // Check for value so large it overflows a signed value. | |
389 (long long)ULLVal < 0) | |
390 return true; | |
391 Result = ULLVal; | |
392 return false; | |
393 } | |
394 | |
395 // Get the positive part of the value. | |
396 if (getAsUnsignedInteger(Str.substr(1), Radix, ULLVal) || | |
397 // Reject values so large they'd overflow as negative signed, but allow | |
398 // "-0". This negates the unsigned so that the negative isn't undefined | |
399 // on signed overflow. | |
400 (long long)-ULLVal > 0) | |
401 return true; | |
402 | |
403 Result = -ULLVal; | |
404 return false; | |
405 } | |
406 | |
407 bool StringRef::getAsInteger(unsigned Radix, APInt &Result) const { | |
408 StringRef Str = *this; | |
409 | |
410 // Autosense radix if not specified. | |
411 if (Radix == 0) | |
412 Radix = GetAutoSenseRadix(Str); | |
413 | |
414 assert(Radix > 1 && Radix <= 36); | |
415 | |
416 // Empty strings (after the radix autosense) are invalid. | |
417 if (Str.empty()) return true; | |
418 | |
419 // Skip leading zeroes. This can be a significant improvement if | |
420 // it means we don't need > 64 bits. | |
421 while (!Str.empty() && Str.front() == '0') | |
422 Str = Str.substr(1); | |
423 | |
424 // If it was nothing but zeroes.... | |
425 if (Str.empty()) { | |
426 Result = APInt(64, 0); | |
427 return false; | |
428 } | |
429 | |
430 // (Over-)estimate the required number of bits. | |
431 unsigned Log2Radix = 0; | |
432 while ((1U << Log2Radix) < Radix) Log2Radix++; | |
433 bool IsPowerOf2Radix = ((1U << Log2Radix) == Radix); | |
434 | |
435 unsigned BitWidth = Log2Radix * Str.size(); | |
436 if (BitWidth < Result.getBitWidth()) | |
437 BitWidth = Result.getBitWidth(); // don't shrink the result | |
438 else if (BitWidth > Result.getBitWidth()) | |
439 Result = Result.zext(BitWidth); | |
440 | |
441 APInt RadixAP, CharAP; // unused unless !IsPowerOf2Radix | |
442 if (!IsPowerOf2Radix) { | |
443 // These must have the same bit-width as Result. | |
444 RadixAP = APInt(BitWidth, Radix); | |
445 CharAP = APInt(BitWidth, 0); | |
446 } | |
447 | |
448 // Parse all the bytes of the string given this radix. | |
449 Result = 0; | |
450 while (!Str.empty()) { | |
451 unsigned CharVal; | |
452 if (Str[0] >= '0' && Str[0] <= '9') | |
453 CharVal = Str[0]-'0'; | |
454 else if (Str[0] >= 'a' && Str[0] <= 'z') | |
455 CharVal = Str[0]-'a'+10; | |
456 else if (Str[0] >= 'A' && Str[0] <= 'Z') | |
457 CharVal = Str[0]-'A'+10; | |
458 else | |
459 return true; | |
460 | |
461 // If the parsed value is larger than the integer radix, the string is | |
462 // invalid. | |
463 if (CharVal >= Radix) | |
464 return true; | |
465 | |
466 // Add in this character. | |
467 if (IsPowerOf2Radix) { | |
468 Result <<= Log2Radix; | |
469 Result |= CharVal; | |
470 } else { | |
471 Result *= RadixAP; | |
472 CharAP = CharVal; | |
473 Result += CharAP; | |
474 } | |
475 | |
476 Str = Str.substr(1); | |
477 } | |
478 | |
479 return false; | |
480 } | |
481 | |
482 | |
483 // Implementation of StringRef hashing. | |
484 hash_code llvm::hash_value(StringRef S) { | |
485 return hash_combine_range(S.begin(), S.end()); | |
486 } |