Line data Source code
1 : //===-- StringRef.cpp - Lightweight String References ---------------------===//
2 : //
3 : // The LLVM Compiler Infrastructure
4 : //
5 : // This file is distributed under the University of Illinois Open Source
6 : // License. See LICENSE.TXT for details.
7 : //
8 : //===----------------------------------------------------------------------===//
9 :
10 : #include "llvm/ADT/StringRef.h"
11 : #include "llvm/ADT/APFloat.h"
12 : #include "llvm/ADT/APInt.h"
13 : #include "llvm/ADT/Hashing.h"
14 : #include "llvm/ADT/StringExtras.h"
15 : #include "llvm/ADT/edit_distance.h"
16 : #include <bitset>
17 :
18 : using namespace llvm;
19 :
20 : // MSVC emits references to this into the translation units which reference it.
21 : #ifndef _MSC_VER
22 : const size_t StringRef::npos;
23 : #endif
24 :
25 : // strncasecmp() is not available on non-POSIX systems, so define an
26 : // alternative function here.
27 : static int ascii_strncasecmp(const char *LHS, const char *RHS, size_t Length) {
28 45141280 : for (size_t I = 0; I < Length; ++I) {
29 44542211 : unsigned char LHC = toLower(LHS[I]);
30 44542211 : unsigned char RHC = toLower(RHS[I]);
31 44542211 : if (LHC != RHC)
32 18830188 : return LHC < RHC ? -1 : 1;
33 : }
34 : return 0;
35 : }
36 :
37 : /// compare_lower - Compare strings, ignoring case.
38 19412540 : int StringRef::compare_lower(StringRef RHS) const {
39 19896746 : if (int Res = ascii_strncasecmp(Data, RHS.Data, std::min(Length, RHS.Length)))
40 18830188 : return Res;
41 582352 : if (Length == RHS.Length)
42 : return 0;
43 4864 : return Length < RHS.Length ? -1 : 1;
44 : }
45 :
46 : /// Check if this string starts with the given \p Prefix, ignoring case.
47 238922 : bool StringRef::startswith_lower(StringRef Prefix) const {
48 238922 : return Length >= Prefix.Length &&
49 225087 : ascii_strncasecmp(Data, Prefix.Data, Prefix.Length) == 0;
50 : }
51 :
52 : /// Check if this string ends with the given \p Suffix, ignoring case.
53 25917 : bool StringRef::endswith_lower(StringRef Suffix) const {
54 25917 : return Length >= Suffix.Length &&
55 23944 : ascii_strncasecmp(end() - Suffix.Length, Suffix.Data, Suffix.Length) == 0;
56 : }
57 :
58 14 : size_t StringRef::find_lower(char C, size_t From) const {
59 : char L = toLower(C);
60 14 : return find_if([L](char D) { return toLower(D) == L; }, From);
61 : }
62 :
63 : /// compare_numeric - Compare strings, handle embedded numbers.
64 671092 : int StringRef::compare_numeric(StringRef RHS) const {
65 5881072 : for (size_t I = 0, E = std::min(Length, RHS.Length); I != E; ++I) {
66 : // Check for sequences of digits.
67 11558922 : if (isDigit(Data[I]) && isDigit(RHS.Data[I])) {
68 : // The longer sequence of numbers is considered larger.
69 : // This doesn't really handle prefixed zeros well.
70 : size_t J;
71 1725995 : for (J = I + 1; J != E + 1; ++J) {
72 1725995 : bool ld = J < Length && isDigit(Data[J]);
73 1725995 : bool rd = J < RHS.Length && isDigit(RHS.Data[J]);
74 1725995 : if (ld != rd)
75 16066 : return rd ? -1 : 1;
76 1713942 : if (!rd)
77 : break;
78 : }
79 : // The two number sequences have the same length (J-I), just memcmp them.
80 452760 : if (int Res = compareMemory(Data + I, RHS.Data + I, J - I))
81 494567 : return Res < 0 ? -1 : 1;
82 : // Identical number sequences, continue search after the numbers.
83 47898 : I = J - 1;
84 47898 : continue;
85 : }
86 5314648 : if (Data[I] != RHS.Data[I])
87 419145 : return (unsigned char)Data[I] < (unsigned char)RHS.Data[I] ? -1 : 1;
88 : }
89 5460 : if (Length == RHS.Length)
90 : return 0;
91 4156 : return Length < RHS.Length ? -1 : 1;
92 : }
93 :
94 : // Compute the edit distance between the two given strings.
95 2028347 : unsigned StringRef::edit_distance(llvm::StringRef Other,
96 : bool AllowReplacements,
97 : unsigned MaxEditDistance) const {
98 4056694 : return llvm::ComputeEditDistance(
99 : makeArrayRef(data(), size()),
100 : makeArrayRef(Other.data(), Other.size()),
101 2028347 : AllowReplacements, MaxEditDistance);
102 : }
103 :
104 : //===----------------------------------------------------------------------===//
105 : // String Operations
106 : //===----------------------------------------------------------------------===//
107 :
108 7244300 : std::string StringRef::lower() const {
109 : std::string Result(size(), char());
110 110685589 : for (size_type i = 0, e = size(); i != e; ++i) {
111 150940313 : Result[i] = toLower(Data[i]);
112 : }
113 7244300 : return Result;
114 : }
115 :
116 497723 : std::string StringRef::upper() const {
117 : std::string Result(size(), char());
118 2327001 : for (size_type i = 0, e = size(); i != e; ++i) {
119 3103463 : Result[i] = toUpper(Data[i]);
120 : }
121 497723 : return Result;
122 : }
123 :
124 : //===----------------------------------------------------------------------===//
125 : // String Searching
126 : //===----------------------------------------------------------------------===//
127 :
128 :
129 : /// find - Search for the first string \arg Str in the string.
130 : ///
131 : /// \return - The index of the first occurrence of \arg Str, or npos if not
132 : /// found.
133 44633923 : size_t StringRef::find(StringRef Str, size_t From) const {
134 44633923 : if (From > Length)
135 : return npos;
136 :
137 44633923 : const char *Start = Data + From;
138 89267846 : size_t Size = Length - From;
139 :
140 : const char *Needle = Str.data();
141 : size_t N = Str.size();
142 44633923 : if (N == 0)
143 : return From;
144 44633283 : if (Size < N)
145 : return npos;
146 43806808 : if (N == 1) {
147 31048838 : const char *Ptr = (const char *)::memchr(Start, Needle[0], Size);
148 31048838 : return Ptr == nullptr ? npos : Ptr - Data;
149 : }
150 :
151 12757970 : const char *Stop = Start + (Size - N + 1);
152 :
153 : // For short haystacks or unsupported needles fall back to the naive algorithm
154 12757970 : if (Size < 16 || N > 255) {
155 : do {
156 22675630 : if (std::memcmp(Start, Needle, N) == 0)
157 274047 : return Start - Data;
158 22401583 : ++Start;
159 22401583 : } while (Start < Stop);
160 : return npos;
161 : }
162 :
163 : // Build the bad char heuristic table, with uint8_t to reduce cache thrashing.
164 : uint8_t BadCharSkip[256];
165 9571555 : std::memset(BadCharSkip, N, 256);
166 90217251 : for (unsigned i = 0; i != N-1; ++i)
167 161291392 : BadCharSkip[(uint8_t)Str[i]] = N-1-i;
168 :
169 : do {
170 172248703 : uint8_t Last = Start[N - 1];
171 172248703 : if (LLVM_UNLIKELY(Last == (uint8_t)Needle[N - 1]))
172 9209248 : if (std::memcmp(Start, Needle, N - 1) == 0)
173 4711461 : return Start - Data;
174 :
175 : // Otherwise skip the appropriate number of bytes.
176 167537242 : Start += BadCharSkip[Last];
177 167537242 : } while (Start < Stop);
178 :
179 : return npos;
180 : }
181 :
182 6286 : size_t StringRef::find_lower(StringRef Str, size_t From) const {
183 6286 : StringRef This = substr(From);
184 23594 : while (This.size() >= Str.size()) {
185 17768 : if (This.startswith_lower(Str))
186 460 : return From;
187 17308 : This = This.drop_front();
188 17308 : ++From;
189 : }
190 : return npos;
191 : }
192 :
193 3 : size_t StringRef::rfind_lower(char C, size_t From) const {
194 6 : From = std::min(From, Length);
195 : size_t i = From;
196 15 : while (i != 0) {
197 14 : --i;
198 28 : if (toLower(Data[i]) == toLower(C))
199 2 : return i;
200 : }
201 : return npos;
202 : }
203 :
204 : /// rfind - Search for the last string \arg Str in the string.
205 : ///
206 : /// \return - The index of the last occurrence of \arg Str, or npos if not
207 : /// found.
208 349065 : size_t StringRef::rfind(StringRef Str) const {
209 : size_t N = Str.size();
210 349065 : if (N > Length)
211 : return npos;
212 3019160 : for (size_t i = Length - N + 1, e = 0; i != e;) {
213 2909193 : --i;
214 2909193 : if (substr(i, N).equals(Str))
215 239061 : return i;
216 : }
217 : return npos;
218 : }
219 :
220 1 : size_t StringRef::rfind_lower(StringRef Str) const {
221 : size_t N = Str.size();
222 1 : if (N > Length)
223 : return npos;
224 2 : for (size_t i = Length - N + 1, e = 0; i != e;) {
225 1 : --i;
226 2 : if (substr(i, N).equals_lower(Str))
227 0 : return i;
228 : }
229 : return npos;
230 : }
231 :
232 : /// find_first_of - Find the first character in the string that is in \arg
233 : /// Chars, or npos if not found.
234 : ///
235 : /// Note: O(size() + Chars.size())
236 50372323 : StringRef::size_type StringRef::find_first_of(StringRef Chars,
237 : size_t From) const {
238 151117849 : std::bitset<1 << CHAR_BIT> CharBits;
239 126925585 : for (size_type i = 0; i != Chars.size(); ++i)
240 229644207 : CharBits.set((unsigned char)Chars[i]);
241 :
242 550299831 : for (size_type i = std::min(From, Length), e = Length; i != e; ++i)
243 524742349 : if (CharBits.test((unsigned char)Data[i]))
244 25411075 : return i;
245 : return npos;
246 : }
247 :
248 : /// find_first_not_of - Find the first character in the string that is not
249 : /// \arg C or npos if not found.
250 17066 : StringRef::size_type StringRef::find_first_not_of(char C, size_t From) const {
251 18449 : for (size_type i = std::min(From, Length), e = Length; i != e; ++i)
252 16843 : if (Data[i] != C)
253 15460 : return i;
254 : return npos;
255 : }
256 :
257 : /// find_first_not_of - Find the first character in the string that is not
258 : /// in the string \arg Chars, or npos if not found.
259 : ///
260 : /// Note: O(size() + Chars.size())
261 39845562 : StringRef::size_type StringRef::find_first_not_of(StringRef Chars,
262 : size_t From) const {
263 119536704 : std::bitset<1 << CHAR_BIT> CharBits;
264 245694297 : for (size_type i = 0; i != Chars.size(); ++i)
265 411697352 : CharBits.set((unsigned char)Chars[i]);
266 :
267 50221048 : for (size_type i = std::min(From, Length), e = Length; i != e; ++i)
268 49177994 : if (!CharBits.test((unsigned char)Data[i]))
269 38802541 : return i;
270 : return npos;
271 : }
272 :
273 : /// find_last_of - Find the last character in the string that is in \arg C,
274 : /// or npos if not found.
275 : ///
276 : /// Note: O(size() + Chars.size())
277 9789384 : StringRef::size_type StringRef::find_last_of(StringRef Chars,
278 : size_t From) const {
279 29368152 : std::bitset<1 << CHAR_BIT> CharBits;
280 19646349 : for (size_type i = 0; i != Chars.size(); ++i)
281 19713932 : CharBits.set((unsigned char)Chars[i]);
282 :
283 87447844 : for (size_type i = std::min(From, Length) - 1, e = -1; i != e; --i)
284 86048870 : if (CharBits.test((unsigned char)Data[i]))
285 9028328 : return i;
286 : return npos;
287 : }
288 :
289 : /// find_last_not_of - Find the last character in the string that is not
290 : /// \arg C, or npos if not found.
291 327147 : StringRef::size_type StringRef::find_last_not_of(char C, size_t From) const {
292 673381 : for (size_type i = std::min(From, Length) - 1, e = -1; i != e; --i)
293 344641 : if (Data[i] != C)
294 325554 : return i;
295 : return npos;
296 : }
297 :
298 : /// find_last_not_of - Find the last character in the string that is not in
299 : /// \arg Chars, or npos if not found.
300 : ///
301 : /// Note: O(size() + Chars.size())
302 29017897 : StringRef::size_type StringRef::find_last_not_of(StringRef Chars,
303 : size_t From) const {
304 87053691 : std::bitset<1 << CHAR_BIT> CharBits;
305 202359330 : for (size_type i = 0, e = Chars.size(); i != e; ++i)
306 346682866 : CharBits.set((unsigned char)Chars[i]);
307 :
308 60811080 : for (size_type i = std::min(From, Length) - 1, e = -1; i != e; --i)
309 31431767 : if (!CharBits.test((unsigned char)Data[i]))
310 28653765 : return i;
311 : return npos;
312 : }
313 :
314 770223 : void StringRef::split(SmallVectorImpl<StringRef> &A,
315 : StringRef Separator, int MaxSplit,
316 : bool KeepEmpty) const {
317 770223 : StringRef S = *this;
318 :
319 : // Count down from MaxSplit. When MaxSplit is -1, this will just split
320 : // "forever". This doesn't support splitting more than 2^31 times
321 : // intentionally; if we ever want that we can make MaxSplit a 64-bit integer
322 : // but that seems unlikely to be useful.
323 25490803 : while (MaxSplit-- != 0) {
324 25490790 : size_t Idx = S.find(Separator);
325 25490790 : if (Idx == npos)
326 : break;
327 :
328 : // Push this split.
329 24720580 : if (KeepEmpty || Idx > 0)
330 47615910 : A.push_back(S.slice(0, Idx));
331 :
332 : // Jump forward.
333 49441160 : S = S.slice(Idx + Separator.size(), npos);
334 : }
335 :
336 : // Push the tail.
337 770223 : if (KeepEmpty || !S.empty())
338 649863 : A.push_back(S);
339 770223 : }
340 :
341 1931777 : void StringRef::split(SmallVectorImpl<StringRef> &A, char Separator,
342 : int MaxSplit, bool KeepEmpty) const {
343 1931777 : StringRef S = *this;
344 :
345 : // Count down from MaxSplit. When MaxSplit is -1, this will just split
346 : // "forever". This doesn't support splitting more than 2^31 times
347 : // intentionally; if we ever want that we can make MaxSplit a 64-bit integer
348 : // but that seems unlikely to be useful.
349 6379997 : while (MaxSplit-- != 0) {
350 6474046 : size_t Idx = S.find(Separator);
351 4448220 : if (Idx == npos)
352 : break;
353 :
354 : // Push this split.
355 4448220 : if (KeepEmpty || Idx > 0)
356 8896430 : A.push_back(S.slice(0, Idx));
357 :
358 : // Jump forward.
359 8896440 : S = S.slice(Idx + 1, npos);
360 : }
361 :
362 : // Push the tail.
363 1931777 : if (KeepEmpty || !S.empty())
364 1856863 : A.push_back(S);
365 1931777 : }
366 :
367 : //===----------------------------------------------------------------------===//
368 : // Helpful Algorithms
369 : //===----------------------------------------------------------------------===//
370 :
371 : /// count - Return the number of non-overlapped occurrences of \arg Str in
372 : /// the string.
373 6617 : size_t StringRef::count(StringRef Str) const {
374 : size_t Count = 0;
375 : size_t N = Str.size();
376 6617 : if (N > Length)
377 : return 0;
378 116651 : for (size_t i = 0, e = Length - N + 1; i != e; ++i)
379 110053 : if (substr(i, N).equals(Str))
380 50380 : ++Count;
381 : return Count;
382 : }
383 :
384 1099476 : static unsigned GetAutoSenseRadix(StringRef &Str) {
385 1099476 : if (Str.empty())
386 : return 10;
387 :
388 : if (Str.startswith("0x") || Str.startswith("0X")) {
389 834871 : Str = Str.substr(2);
390 834871 : return 16;
391 : }
392 :
393 : if (Str.startswith("0b") || Str.startswith("0B")) {
394 38 : Str = Str.substr(2);
395 38 : return 2;
396 : }
397 :
398 : if (Str.startswith("0o")) {
399 1 : Str = Str.substr(2);
400 1 : return 8;
401 : }
402 :
403 527840 : if (Str[0] == '0' && Str.size() > 1 && isDigit(Str[1])) {
404 43 : Str = Str.substr(1);
405 43 : return 8;
406 : }
407 :
408 : return 10;
409 : }
410 :
411 5395788 : bool llvm::consumeUnsignedInteger(StringRef &Str, unsigned Radix,
412 : unsigned long long &Result) {
413 : // Autosense radix if not specified.
414 5395788 : if (Radix == 0)
415 1051176 : Radix = GetAutoSenseRadix(Str);
416 :
417 : // Empty strings (after the radix autosense) are invalid.
418 5395788 : if (Str.empty()) return true;
419 :
420 : // Parse all the bytes of the string given this radix. Watch for overflow.
421 5381406 : StringRef Str2 = Str;
422 5381406 : Result = 0;
423 13469153 : while (!Str2.empty()) {
424 : unsigned CharVal;
425 16452910 : if (Str2[0] >= '0' && Str2[0] <= '9')
426 7680017 : CharVal = Str2[0] - '0';
427 546438 : else if (Str2[0] >= 'a' && Str2[0] <= 'z')
428 512852 : CharVal = Str2[0] - 'a' + 10;
429 33586 : else if (Str2[0] >= 'A' && Str2[0] <= 'Z')
430 19663 : CharVal = Str2[0] - 'A' + 10;
431 : else
432 : break;
433 :
434 : // If the parsed value is larger than the integer radix, we cannot
435 : // consume any more characters.
436 8212532 : if (CharVal >= Radix)
437 : break;
438 :
439 : // Add in this character.
440 8087752 : unsigned long long PrevResult = Result;
441 8087752 : Result = Result * Radix + CharVal;
442 :
443 : // Check for overflow by shifting back and seeing if bits were lost.
444 8087752 : if (Result / Radix < PrevResult)
445 : return true;
446 :
447 8087747 : Str2 = Str2.substr(1);
448 : }
449 :
450 : // We consider the operation a failure if no characters were consumed
451 : // successfully.
452 5381401 : if (Str.size() == Str2.size())
453 : return true;
454 :
455 5255087 : Str = Str2;
456 5255087 : return false;
457 : }
458 :
459 2069244 : bool llvm::consumeSignedInteger(StringRef &Str, unsigned Radix,
460 : long long &Result) {
461 : unsigned long long ULLVal;
462 :
463 : // Handle positive strings first.
464 2069244 : if (Str.empty() || Str.front() != '-') {
465 2053206 : if (consumeUnsignedInteger(Str, Radix, ULLVal) ||
466 : // Check for value so large it overflows a signed value.
467 1999223 : (long long)ULLVal < 0)
468 : return true;
469 1999222 : Result = ULLVal;
470 1999222 : return false;
471 : }
472 :
473 : // Get the positive part of the value.
474 16038 : StringRef Str2 = Str.drop_front(1);
475 16038 : if (consumeUnsignedInteger(Str2, Radix, ULLVal) ||
476 : // Reject values so large they'd overflow as negative signed, but allow
477 : // "-0". This negates the unsigned so that the negative isn't undefined
478 : // on signed overflow.
479 16038 : (long long)-ULLVal > 0)
480 : return true;
481 :
482 16037 : Str = Str2;
483 16037 : Result = -ULLVal;
484 16037 : return false;
485 : }
486 :
487 : /// GetAsUnsignedInteger - Workhorse method that converts a integer character
488 : /// sequence of radix up to 36 to an unsigned long long value.
489 3186904 : bool llvm::getAsUnsignedInteger(StringRef Str, unsigned Radix,
490 : unsigned long long &Result) {
491 3186904 : if (consumeUnsignedInteger(Str, Radix, Result))
492 : return true;
493 :
494 : // For getAsUnsignedInteger, we require the whole string to be consumed or
495 : // else we consider it a failure.
496 3113723 : return !Str.empty();
497 : }
498 :
499 2069074 : bool llvm::getAsSignedInteger(StringRef Str, unsigned Radix,
500 : long long &Result) {
501 2069074 : if (consumeSignedInteger(Str, Radix, Result))
502 : return true;
503 :
504 : // For getAsSignedInteger, we require the whole string to be consumed or else
505 : // we consider it a failure.
506 2015089 : return !Str.empty();
507 : }
508 :
509 641189 : bool StringRef::getAsInteger(unsigned Radix, APInt &Result) const {
510 641189 : StringRef Str = *this;
511 :
512 : // Autosense radix if not specified.
513 641189 : if (Radix == 0)
514 48300 : Radix = GetAutoSenseRadix(Str);
515 :
516 : assert(Radix > 1 && Radix <= 36);
517 :
518 : // Empty strings (after the radix autosense) are invalid.
519 641189 : if (Str.empty()) return true;
520 :
521 : // Skip leading zeroes. This can be a significant improvement if
522 : // it means we don't need > 64 bits.
523 722815 : while (!Str.empty() && Str.front() == '0')
524 81626 : Str = Str.substr(1);
525 :
526 : // If it was nothing but zeroes....
527 641189 : if (Str.empty()) {
528 73229 : Result = APInt(64, 0);
529 73229 : return false;
530 : }
531 :
532 : // (Over-)estimate the required number of bits.
533 : unsigned Log2Radix = 0;
534 2839636 : while ((1U << Log2Radix) < Radix) Log2Radix++;
535 : bool IsPowerOf2Radix = ((1U << Log2Radix) == Radix);
536 :
537 567960 : unsigned BitWidth = Log2Radix * Str.size();
538 567960 : if (BitWidth < Result.getBitWidth())
539 : BitWidth = Result.getBitWidth(); // don't shrink the result
540 50 : else if (BitWidth > Result.getBitWidth())
541 82 : Result = Result.zext(BitWidth);
542 :
543 : APInt RadixAP, CharAP; // unused unless !IsPowerOf2Radix
544 567960 : if (!IsPowerOf2Radix) {
545 : // These must have the same bit-width as Result.
546 1073462 : RadixAP = APInt(BitWidth, Radix);
547 536731 : CharAP = APInt(BitWidth, 0);
548 : }
549 :
550 : // Parse all the bytes of the string given this radix.
551 567960 : Result = 0;
552 1671344 : while (!Str.empty()) {
553 : unsigned CharVal;
554 2206818 : if (Str[0] >= '0' && Str[0] <= '9')
555 1059248 : CharVal = Str[0]-'0';
556 44161 : else if (Str[0] >= 'a' && Str[0] <= 'z')
557 41395 : CharVal = Str[0]-'a'+10;
558 2766 : else if (Str[0] >= 'A' && Str[0] <= 'Z')
559 2766 : CharVal = Str[0]-'A'+10;
560 : else
561 : return true;
562 :
563 : // If the parsed value is larger than the integer radix, the string is
564 : // invalid.
565 1103409 : if (CharVal >= Radix)
566 : return true;
567 :
568 : // Add in this character.
569 1103384 : if (IsPowerOf2Radix) {
570 127532 : Result <<= Log2Radix;
571 127532 : Result |= CharVal;
572 : } else {
573 975852 : Result *= RadixAP;
574 975852 : CharAP = CharVal;
575 975852 : Result += CharAP;
576 : }
577 :
578 1103384 : Str = Str.substr(1);
579 : }
580 :
581 : return false;
582 : }
583 :
584 16 : bool StringRef::getAsDouble(double &Result, bool AllowInexact) const {
585 16 : APFloat F(0.0);
586 : APFloat::opStatus Status =
587 16 : F.convertFromString(*this, APFloat::rmNearestTiesToEven);
588 16 : if (Status != APFloat::opOK) {
589 6 : if (!AllowInexact || !(Status & APFloat::opInexact))
590 : return true;
591 : }
592 :
593 13 : Result = F.convertToDouble();
594 13 : return false;
595 : }
596 :
597 : // Implementation of StringRef hashing.
598 7666735 : hash_code llvm::hash_value(StringRef S) {
599 7666735 : return hash_combine_range(S.begin(), S.end());
600 : }
|