Line data Source code
1 : //===-- Regex.cpp - Regular Expression matcher implementation -------------===//
2 : //
3 : // The LLVM Compiler Infrastructure
4 : //
5 : // This file is distributed under the University of Illinois Open Source
6 : // License. See LICENSE.TXT for details.
7 : //
8 : //===----------------------------------------------------------------------===//
9 : //
10 : // This file implements a POSIX regular expression matcher.
11 : //
12 : //===----------------------------------------------------------------------===//
13 :
14 : #include "llvm/Support/Regex.h"
15 : #include "llvm/ADT/SmallVector.h"
16 : #include "llvm/ADT/StringRef.h"
17 : #include "llvm/ADT/Twine.h"
18 : #include <string>
19 :
20 : // Important this comes last because it defines "_REGEX_H_". At least on
21 : // Darwin, if included before any header that (transitively) includes
22 : // xlocale.h, this will cause trouble, because of missing regex-related types.
23 : #include "regex_impl.h"
24 :
25 : using namespace llvm;
26 :
27 2 : Regex::Regex() : preg(nullptr), error(REG_BADPAT) {}
28 :
29 1931245 : Regex::Regex(StringRef regex, unsigned Flags) {
30 : unsigned flags = 0;
31 1931245 : preg = new llvm_regex();
32 1931245 : preg->re_endp = regex.end();
33 1931245 : if (Flags & IgnoreCase)
34 : flags |= REG_ICASE;
35 1931245 : if (Flags & Newline)
36 730610 : flags |= REG_NEWLINE;
37 1931245 : if (!(Flags & BasicRegex))
38 1931245 : flags |= REG_EXTENDED;
39 1931245 : error = llvm_regcomp(preg, regex.data(), flags|REG_PEND);
40 1931245 : }
41 :
42 15498 : Regex::Regex(Regex &®ex) {
43 15498 : preg = regex.preg;
44 15498 : error = regex.error;
45 15498 : regex.preg = nullptr;
46 15498 : regex.error = REG_BADPAT;
47 15498 : }
48 :
49 3892572 : Regex::~Regex() {
50 1946286 : if (preg) {
51 1930786 : llvm_regfree(preg);
52 1930786 : delete preg;
53 : }
54 1946286 : }
55 :
56 877359 : bool Regex::isValid(std::string &Error) const {
57 877359 : if (!error)
58 : return true;
59 :
60 11 : size_t len = llvm_regerror(error, preg, nullptr, 0);
61 :
62 11 : Error.resize(len - 1);
63 11 : llvm_regerror(error, preg, &Error[0], len);
64 11 : return false;
65 : }
66 :
67 : /// getNumMatches - In a valid regex, return the number of parenthesized
68 : /// matches it contains.
69 813489 : unsigned Regex::getNumMatches() const {
70 813489 : return preg->re_nsub;
71 : }
72 :
73 36759205 : bool Regex::match(StringRef String, SmallVectorImpl<StringRef> *Matches){
74 36759205 : if (error)
75 : return false;
76 :
77 35441384 : unsigned nmatch = Matches ? preg->re_nsub+1 : 0;
78 :
79 : // pmatch needs to have at least one element.
80 : SmallVector<llvm_regmatch_t, 8> pm;
81 35441384 : pm.resize(nmatch > 0 ? nmatch : 1);
82 35441384 : pm[0].rm_so = 0;
83 35441384 : pm[0].rm_eo = String.size();
84 :
85 35441384 : int rc = llvm_regexec(preg, String.data(), nmatch, pm.data(), REG_STARTEND);
86 :
87 35441384 : if (rc == REG_NOMATCH)
88 : return false;
89 5095464 : if (rc != 0) {
90 : // regexec can fail due to invalid pattern or running out of memory.
91 0 : error = rc;
92 0 : return false;
93 : }
94 :
95 : // There was a match.
96 :
97 5095464 : if (Matches) { // match position requested
98 : Matches->clear();
99 :
100 9571940 : for (unsigned i = 0; i != nmatch; ++i) {
101 10454768 : if (pm[i].rm_so == -1) {
102 : // this group didn't match
103 16770 : Matches->push_back(StringRef());
104 16770 : continue;
105 : }
106 : assert(pm[i].rm_eo >= pm[i].rm_so);
107 5210614 : Matches->push_back(StringRef(String.data()+pm[i].rm_so,
108 5210614 : pm[i].rm_eo-pm[i].rm_so));
109 : }
110 : }
111 :
112 : return true;
113 : }
114 :
115 77 : std::string Regex::sub(StringRef Repl, StringRef String,
116 : std::string *Error) {
117 : SmallVector<StringRef, 8> Matches;
118 :
119 : // Reset error, if given.
120 77 : if (Error && !Error->empty()) *Error = "";
121 :
122 : // Return the input if there was no match.
123 77 : if (!match(String, &Matches))
124 : return String;
125 :
126 : // Otherwise splice in the replacement string, starting with the prefix before
127 : // the match.
128 23 : std::string Res(String.begin(), Matches[0].begin());
129 :
130 : // Then the replacement string, honoring possible substitutions.
131 42 : while (!Repl.empty()) {
132 : // Skip to the next escape.
133 26 : std::pair<StringRef, StringRef> Split = Repl.split('\\');
134 :
135 : // Add the skipped substring.
136 : Res += Split.first;
137 :
138 : // Check for terminimation and trailing backslash.
139 26 : if (Split.second.empty()) {
140 1 : if (Repl.size() != Split.first.size() &&
141 8 : Error && Error->empty())
142 : *Error = "replacement string contained trailing backslash";
143 7 : break;
144 : }
145 :
146 : // Otherwise update the replacement string and interpret escapes.
147 19 : Repl = Split.second;
148 :
149 : // FIXME: We should have a StringExtras function for mapping C99 escapes.
150 38 : switch (Repl[0]) {
151 : // Treat all unrecognized characters as self-quoting.
152 2 : default:
153 : Res += Repl[0];
154 2 : Repl = Repl.substr(1);
155 2 : break;
156 :
157 : // Single character escapes.
158 : case 't':
159 : Res += '\t';
160 1 : Repl = Repl.substr(1);
161 1 : break;
162 : case 'n':
163 : Res += '\n';
164 1 : Repl = Repl.substr(1);
165 1 : break;
166 :
167 : // Decimal escapes are backreferences.
168 : case '0': case '1': case '2': case '3': case '4':
169 : case '5': case '6': case '7': case '8': case '9': {
170 : // Extract the backreference number.
171 30 : StringRef Ref = Repl.slice(0, Repl.find_first_not_of("0123456789"));
172 15 : Repl = Repl.substr(Ref.size());
173 :
174 : unsigned RefValue;
175 15 : if (!Ref.getAsInteger(10, RefValue) &&
176 15 : RefValue < Matches.size())
177 : Res += Matches[RefValue];
178 1 : else if (Error && Error->empty())
179 2 : *Error = ("invalid backreference string '" + Twine(Ref) + "'").str();
180 : break;
181 : }
182 : }
183 : }
184 :
185 : // And finally the suffix.
186 46 : Res += StringRef(Matches[0].end(), String.end() - Matches[0].end());
187 :
188 : return Res;
189 : }
190 :
// Characters that carry special meaning for the regex parser (see functions
// such as "p_ere_exp"). Shared by isLiteralERE() and escape().
static const char RegexMetachars[] = "()^$|*+?.[]\\{}";
193 :
194 11904 : bool Regex::isLiteralERE(StringRef Str) {
195 : // Check for regex metacharacters. This list was derived from our regex
196 : // implementation in regcomp.c and double checked against the POSIX extended
197 : // regular expression specification.
198 11904 : return Str.find_first_of(RegexMetachars) == StringRef::npos;
199 : }
200 :
201 1991661 : std::string Regex::escape(StringRef String) {
202 : std::string RegexStr;
203 27505334 : for (unsigned i = 0, e = String.size(); i != e; ++i) {
204 76541019 : if (strchr(RegexMetachars, String[i]))
205 : RegexStr += '\\';
206 25513673 : RegexStr += String[i];
207 : }
208 :
209 1991661 : return RegexStr;
210 : }
|