LLVM  mainline
StringRef.cpp
Go to the documentation of this file.
00001 //===-- StringRef.cpp - Lightweight String References ---------------------===//
00002 //
00003 //                     The LLVM Compiler Infrastructure
00004 //
00005 // This file is distributed under the University of Illinois Open Source
00006 // License. See LICENSE.TXT for details.
00007 //
00008 //===----------------------------------------------------------------------===//
00009 
00010 #include "llvm/ADT/StringRef.h"
00011 #include "llvm/ADT/APInt.h"
00012 #include "llvm/ADT/Hashing.h"
00013 #include "llvm/ADT/edit_distance.h"
00014 #include <bitset>
00015 
00016 using namespace llvm;
00017 
00018 // MSVC emits references to this into the translation units which reference it.
00019 #ifndef _MSC_VER
00020 const size_t StringRef::npos;
00021 #endif
00022 
00023 static char ascii_tolower(char x) {
00024   if (x >= 'A' && x <= 'Z')
00025     return x - 'A' + 'a';
00026   return x;
00027 }
00028 
00029 static char ascii_toupper(char x) {
00030   if (x >= 'a' && x <= 'z')
00031     return x - 'a' + 'A';
00032   return x;
00033 }
00034 
00035 static bool ascii_isdigit(char x) {
00036   return x >= '0' && x <= '9';
00037 }
00038 
00039 // strncasecmp() is not available on non-POSIX systems, so define an
00040 // alternative function here.
00041 static int ascii_strncasecmp(const char *LHS, const char *RHS, size_t Length) {
00042   for (size_t I = 0; I < Length; ++I) {
00043     unsigned char LHC = ascii_tolower(LHS[I]);
00044     unsigned char RHC = ascii_tolower(RHS[I]);
00045     if (LHC != RHC)
00046       return LHC < RHC ? -1 : 1;
00047   }
00048   return 0;
00049 }
00050 
00051 /// compare_lower - Compare strings, ignoring case.
00052 int StringRef::compare_lower(StringRef RHS) const {
00053   if (int Res = ascii_strncasecmp(Data, RHS.Data, std::min(Length, RHS.Length)))
00054     return Res;
00055   if (Length == RHS.Length)
00056     return 0;
00057   return Length < RHS.Length ? -1 : 1;
00058 }
00059 
00060 /// Check if this string starts with the given \p Prefix, ignoring case.
00061 bool StringRef::startswith_lower(StringRef Prefix) const {
00062   return Length >= Prefix.Length &&
00063       ascii_strncasecmp(Data, Prefix.Data, Prefix.Length) == 0;
00064 }
00065 
00066 /// Check if this string ends with the given \p Suffix, ignoring case.
00067 bool StringRef::endswith_lower(StringRef Suffix) const {
00068   return Length >= Suffix.Length &&
00069       ascii_strncasecmp(end() - Suffix.Length, Suffix.Data, Suffix.Length) == 0;
00070 }
00071 
00072 /// compare_numeric - Compare strings, handle embedded numbers.
00073 int StringRef::compare_numeric(StringRef RHS) const {
00074   for (size_t I = 0, E = std::min(Length, RHS.Length); I != E; ++I) {
00075     // Check for sequences of digits.
00076     if (ascii_isdigit(Data[I]) && ascii_isdigit(RHS.Data[I])) {
00077       // The longer sequence of numbers is considered larger.
00078       // This doesn't really handle prefixed zeros well.
00079       size_t J;
00080       for (J = I + 1; J != E + 1; ++J) {
00081         bool ld = J < Length && ascii_isdigit(Data[J]);
00082         bool rd = J < RHS.Length && ascii_isdigit(RHS.Data[J]);
00083         if (ld != rd)
00084           return rd ? -1 : 1;
00085         if (!rd)
00086           break;
00087       }
00088       // The two number sequences have the same length (J-I), just memcmp them.
00089       if (int Res = compareMemory(Data + I, RHS.Data + I, J - I))
00090         return Res < 0 ? -1 : 1;
00091       // Identical number sequences, continue search after the numbers.
00092       I = J - 1;
00093       continue;
00094     }
00095     if (Data[I] != RHS.Data[I])
00096       return (unsigned char)Data[I] < (unsigned char)RHS.Data[I] ? -1 : 1;
00097   }
00098   if (Length == RHS.Length)
00099     return 0;
00100   return Length < RHS.Length ? -1 : 1;
00101 }
00102 
00103 // Compute the edit distance between the two given strings.
00104 unsigned StringRef::edit_distance(llvm::StringRef Other,
00105                                   bool AllowReplacements,
00106                                   unsigned MaxEditDistance) const {
00107   return llvm::ComputeEditDistance(
00108       makeArrayRef(data(), size()),
00109       makeArrayRef(Other.data(), Other.size()),
00110       AllowReplacements, MaxEditDistance);
00111 }
00112 
00113 //===----------------------------------------------------------------------===//
00114 // String Operations
00115 //===----------------------------------------------------------------------===//
00116 
00117 std::string StringRef::lower() const {
00118   std::string Result(size(), char());
00119   for (size_type i = 0, e = size(); i != e; ++i) {
00120     Result[i] = ascii_tolower(Data[i]);
00121   }
00122   return Result;
00123 }
00124 
00125 std::string StringRef::upper() const {
00126   std::string Result(size(), char());
00127   for (size_type i = 0, e = size(); i != e; ++i) {
00128     Result[i] = ascii_toupper(Data[i]);
00129   }
00130   return Result;
00131 }
00132 
00133 //===----------------------------------------------------------------------===//
00134 // String Searching
00135 //===----------------------------------------------------------------------===//
00136 
00137 
00138 /// find - Search for the first string \arg Str in the string.
00139 ///
00140 /// \return - The index of the first occurrence of \arg Str, or npos if not
00141 /// found.
00142 size_t StringRef::find(StringRef Str, size_t From) const {
00143   if (From > Length)
00144     return npos;
00145 
00146   const char *Needle = Str.data();
00147   size_t N = Str.size();
00148   if (N == 0)
00149     return From;
00150 
00151   size_t Size = Length - From;
00152   if (Size < N)
00153     return npos;
00154 
00155   const char *Start = Data + From;
00156   const char *Stop = Start + (Size - N + 1);
00157 
00158   // For short haystacks or unsupported needles fall back to the naive algorithm
00159   if (Size < 16 || N > 255) {
00160     do {
00161       if (std::memcmp(Start, Needle, N) == 0)
00162         return Start - Data;
00163       ++Start;
00164     } while (Start < Stop);
00165     return npos;
00166   }
00167 
00168   // Build the bad char heuristic table, with uint8_t to reduce cache thrashing.
00169   uint8_t BadCharSkip[256];
00170   std::memset(BadCharSkip, N, 256);
00171   for (unsigned i = 0; i != N-1; ++i)
00172     BadCharSkip[(uint8_t)Str[i]] = N-1-i;
00173 
00174   do {
00175     if (std::memcmp(Start, Needle, N) == 0)
00176       return Start - Data;
00177 
00178     // Otherwise skip the appropriate number of bytes.
00179     Start += BadCharSkip[(uint8_t)Start[N-1]];
00180   } while (Start < Stop);
00181 
00182   return npos;
00183 }
00184 
00185 /// rfind - Search for the last string \arg Str in the string.
00186 ///
00187 /// \return - The index of the last occurrence of \arg Str, or npos if not
00188 /// found.
00189 size_t StringRef::rfind(StringRef Str) const {
00190   size_t N = Str.size();
00191   if (N > Length)
00192     return npos;
00193   for (size_t i = Length - N + 1, e = 0; i != e;) {
00194     --i;
00195     if (substr(i, N).equals(Str))
00196       return i;
00197   }
00198   return npos;
00199 }
00200 
00201 /// find_first_of - Find the first character in the string that is in \arg
00202 /// Chars, or npos if not found.
00203 ///
00204 /// Note: O(size() + Chars.size())
00205 StringRef::size_type StringRef::find_first_of(StringRef Chars,
00206                                               size_t From) const {
00207   std::bitset<1 << CHAR_BIT> CharBits;
00208   for (size_type i = 0; i != Chars.size(); ++i)
00209     CharBits.set((unsigned char)Chars[i]);
00210 
00211   for (size_type i = std::min(From, Length), e = Length; i != e; ++i)
00212     if (CharBits.test((unsigned char)Data[i]))
00213       return i;
00214   return npos;
00215 }
00216 
00217 /// find_first_not_of - Find the first character in the string that is not
00218 /// \arg C or npos if not found.
00219 StringRef::size_type StringRef::find_first_not_of(char C, size_t From) const {
00220   for (size_type i = std::min(From, Length), e = Length; i != e; ++i)
00221     if (Data[i] != C)
00222       return i;
00223   return npos;
00224 }
00225 
00226 /// find_first_not_of - Find the first character in the string that is not
00227 /// in the string \arg Chars, or npos if not found.
00228 ///
00229 /// Note: O(size() + Chars.size())
00230 StringRef::size_type StringRef::find_first_not_of(StringRef Chars,
00231                                                   size_t From) const {
00232   std::bitset<1 << CHAR_BIT> CharBits;
00233   for (size_type i = 0; i != Chars.size(); ++i)
00234     CharBits.set((unsigned char)Chars[i]);
00235 
00236   for (size_type i = std::min(From, Length), e = Length; i != e; ++i)
00237     if (!CharBits.test((unsigned char)Data[i]))
00238       return i;
00239   return npos;
00240 }
00241 
00242 /// find_last_of - Find the last character in the string that is in \arg C,
00243 /// or npos if not found.
00244 ///
00245 /// Note: O(size() + Chars.size())
00246 StringRef::size_type StringRef::find_last_of(StringRef Chars,
00247                                              size_t From) const {
00248   std::bitset<1 << CHAR_BIT> CharBits;
00249   for (size_type i = 0; i != Chars.size(); ++i)
00250     CharBits.set((unsigned char)Chars[i]);
00251 
00252   for (size_type i = std::min(From, Length) - 1, e = -1; i != e; --i)
00253     if (CharBits.test((unsigned char)Data[i]))
00254       return i;
00255   return npos;
00256 }
00257 
00258 /// find_last_not_of - Find the last character in the string that is not
00259 /// \arg C, or npos if not found.
00260 StringRef::size_type StringRef::find_last_not_of(char C, size_t From) const {
00261   for (size_type i = std::min(From, Length) - 1, e = -1; i != e; --i)
00262     if (Data[i] != C)
00263       return i;
00264   return npos;
00265 }
00266 
00267 /// find_last_not_of - Find the last character in the string that is not in
00268 /// \arg Chars, or npos if not found.
00269 ///
00270 /// Note: O(size() + Chars.size())
00271 StringRef::size_type StringRef::find_last_not_of(StringRef Chars,
00272                                                  size_t From) const {
00273   std::bitset<1 << CHAR_BIT> CharBits;
00274   for (size_type i = 0, e = Chars.size(); i != e; ++i)
00275     CharBits.set((unsigned char)Chars[i]);
00276 
00277   for (size_type i = std::min(From, Length) - 1, e = -1; i != e; --i)
00278     if (!CharBits.test((unsigned char)Data[i]))
00279       return i;
00280   return npos;
00281 }
00282 
00283 void StringRef::split(SmallVectorImpl<StringRef> &A,
00284                       StringRef Separator, int MaxSplit,
00285                       bool KeepEmpty) const {
00286   StringRef S = *this;
00287 
00288   // Count down from MaxSplit. When MaxSplit is -1, this will just split
00289   // "forever". This doesn't support splitting more than 2^31 times
00290   // intentionally; if we ever want that we can make MaxSplit a 64-bit integer
00291   // but that seems unlikely to be useful.
00292   while (MaxSplit-- != 0) {
00293     size_t Idx = S.find(Separator);
00294     if (Idx == npos)
00295       break;
00296 
00297     // Push this split.
00298     if (KeepEmpty || Idx > 0)
00299       A.push_back(S.slice(0, Idx));
00300 
00301     // Jump forward.
00302     S = S.slice(Idx + Separator.size(), npos);
00303   }
00304 
00305   // Push the tail.
00306   if (KeepEmpty || !S.empty())
00307     A.push_back(S);
00308 }
00309 
00310 void StringRef::split(SmallVectorImpl<StringRef> &A, char Separator,
00311                       int MaxSplit, bool KeepEmpty) const {
00312   StringRef S = *this;
00313 
00314   // Count down from MaxSplit. When MaxSplit is -1, this will just split
00315   // "forever". This doesn't support splitting more than 2^31 times
00316   // intentionally; if we ever want that we can make MaxSplit a 64-bit integer
00317   // but that seems unlikely to be useful.
00318   while (MaxSplit-- != 0) {
00319     size_t Idx = S.find(Separator);
00320     if (Idx == npos)
00321       break;
00322 
00323     // Push this split.
00324     if (KeepEmpty || Idx > 0)
00325       A.push_back(S.slice(0, Idx));
00326 
00327     // Jump forward.
00328     S = S.slice(Idx + 1, npos);
00329   }
00330 
00331   // Push the tail.
00332   if (KeepEmpty || !S.empty())
00333     A.push_back(S);
00334 }
00335 
00336 //===----------------------------------------------------------------------===//
00337 // Helpful Algorithms
00338 //===----------------------------------------------------------------------===//
00339 
00340 /// count - Return the number of non-overlapped occurrences of \arg Str in
00341 /// the string.
00342 size_t StringRef::count(StringRef Str) const {
00343   size_t Count = 0;
00344   size_t N = Str.size();
00345   if (N > Length)
00346     return 0;
00347   for (size_t i = 0, e = Length - N + 1; i != e; ++i)
00348     if (substr(i, N).equals(Str))
00349       ++Count;
00350   return Count;
00351 }
00352 
00353 static unsigned GetAutoSenseRadix(StringRef &Str) {
00354   if (Str.startswith("0x")) {
00355     Str = Str.substr(2);
00356     return 16;
00357   }
00358   
00359   if (Str.startswith("0b")) {
00360     Str = Str.substr(2);
00361     return 2;
00362   }
00363 
00364   if (Str.startswith("0o")) {
00365     Str = Str.substr(2);
00366     return 8;
00367   }
00368 
00369   if (Str.startswith("0"))
00370     return 8;
00371   
00372   return 10;
00373 }
00374 
00375 
00376 /// GetAsUnsignedInteger - Workhorse method that converts a integer character
00377 /// sequence of radix up to 36 to an unsigned long long value.
00378 bool llvm::getAsUnsignedInteger(StringRef Str, unsigned Radix,
00379                                 unsigned long long &Result) {
00380   // Autosense radix if not specified.
00381   if (Radix == 0)
00382     Radix = GetAutoSenseRadix(Str);
00383 
00384   // Empty strings (after the radix autosense) are invalid.
00385   if (Str.empty()) return true;
00386 
00387   // Parse all the bytes of the string given this radix.  Watch for overflow.
00388   Result = 0;
00389   while (!Str.empty()) {
00390     unsigned CharVal;
00391     if (Str[0] >= '0' && Str[0] <= '9')
00392       CharVal = Str[0]-'0';
00393     else if (Str[0] >= 'a' && Str[0] <= 'z')
00394       CharVal = Str[0]-'a'+10;
00395     else if (Str[0] >= 'A' && Str[0] <= 'Z')
00396       CharVal = Str[0]-'A'+10;
00397     else
00398       return true;
00399 
00400     // If the parsed value is larger than the integer radix, the string is
00401     // invalid.
00402     if (CharVal >= Radix)
00403       return true;
00404 
00405     // Add in this character.
00406     unsigned long long PrevResult = Result;
00407     Result = Result*Radix+CharVal;
00408 
00409     // Check for overflow by shifting back and seeing if bits were lost.
00410     if (Result/Radix < PrevResult)
00411       return true;
00412 
00413     Str = Str.substr(1);
00414   }
00415 
00416   return false;
00417 }
00418 
00419 bool llvm::getAsSignedInteger(StringRef Str, unsigned Radix,
00420                               long long &Result) {
00421   unsigned long long ULLVal;
00422 
00423   // Handle positive strings first.
00424   if (Str.empty() || Str.front() != '-') {
00425     if (getAsUnsignedInteger(Str, Radix, ULLVal) ||
00426         // Check for value so large it overflows a signed value.
00427         (long long)ULLVal < 0)
00428       return true;
00429     Result = ULLVal;
00430     return false;
00431   }
00432 
00433   // Get the positive part of the value.
00434   if (getAsUnsignedInteger(Str.substr(1), Radix, ULLVal) ||
00435       // Reject values so large they'd overflow as negative signed, but allow
00436       // "-0".  This negates the unsigned so that the negative isn't undefined
00437       // on signed overflow.
00438       (long long)-ULLVal > 0)
00439     return true;
00440 
00441   Result = -ULLVal;
00442   return false;
00443 }
00444 
00445 bool StringRef::getAsInteger(unsigned Radix, APInt &Result) const {
00446   StringRef Str = *this;
00447 
00448   // Autosense radix if not specified.
00449   if (Radix == 0)
00450     Radix = GetAutoSenseRadix(Str);
00451 
00452   assert(Radix > 1 && Radix <= 36);
00453 
00454   // Empty strings (after the radix autosense) are invalid.
00455   if (Str.empty()) return true;
00456 
00457   // Skip leading zeroes.  This can be a significant improvement if
00458   // it means we don't need > 64 bits.
00459   while (!Str.empty() && Str.front() == '0')
00460     Str = Str.substr(1);
00461 
00462   // If it was nothing but zeroes....
00463   if (Str.empty()) {
00464     Result = APInt(64, 0);
00465     return false;
00466   }
00467 
00468   // (Over-)estimate the required number of bits.
00469   unsigned Log2Radix = 0;
00470   while ((1U << Log2Radix) < Radix) Log2Radix++;
00471   bool IsPowerOf2Radix = ((1U << Log2Radix) == Radix);
00472 
00473   unsigned BitWidth = Log2Radix * Str.size();
00474   if (BitWidth < Result.getBitWidth())
00475     BitWidth = Result.getBitWidth(); // don't shrink the result
00476   else if (BitWidth > Result.getBitWidth())
00477     Result = Result.zext(BitWidth);
00478 
00479   APInt RadixAP, CharAP; // unused unless !IsPowerOf2Radix
00480   if (!IsPowerOf2Radix) {
00481     // These must have the same bit-width as Result.
00482     RadixAP = APInt(BitWidth, Radix);
00483     CharAP = APInt(BitWidth, 0);
00484   }
00485 
00486   // Parse all the bytes of the string given this radix.
00487   Result = 0;
00488   while (!Str.empty()) {
00489     unsigned CharVal;
00490     if (Str[0] >= '0' && Str[0] <= '9')
00491       CharVal = Str[0]-'0';
00492     else if (Str[0] >= 'a' && Str[0] <= 'z')
00493       CharVal = Str[0]-'a'+10;
00494     else if (Str[0] >= 'A' && Str[0] <= 'Z')
00495       CharVal = Str[0]-'A'+10;
00496     else
00497       return true;
00498 
00499     // If the parsed value is larger than the integer radix, the string is
00500     // invalid.
00501     if (CharVal >= Radix)
00502       return true;
00503 
00504     // Add in this character.
00505     if (IsPowerOf2Radix) {
00506       Result <<= Log2Radix;
00507       Result |= CharVal;
00508     } else {
00509       Result *= RadixAP;
00510       CharAP = CharVal;
00511       Result += CharAP;
00512     }
00513 
00514     Str = Str.substr(1);
00515   }
00516 
00517   return false;
00518 }
00519 
00520 
00521 // Implementation of StringRef hashing.
00522 hash_code llvm::hash_value(StringRef S) {
00523   return hash_combine_range(S.begin(), S.end());
00524 }