Line data Source code
1 : //===- TGLexer.cpp - Lexer for TableGen -----------------------------------===//
2 : //
3 : // The LLVM Compiler Infrastructure
4 : //
5 : // This file is distributed under the University of Illinois Open Source
6 : // License. See LICENSE.TXT for details.
7 : //
8 : //===----------------------------------------------------------------------===//
9 : //
10 : // Implement the Lexer for TableGen.
11 : //
12 : //===----------------------------------------------------------------------===//
13 :
14 : #include "TGLexer.h"
15 : #include "llvm/ADT/StringSwitch.h"
16 : #include "llvm/ADT/Twine.h"
17 : #include "llvm/Config/config.h" // for strtoull()/strtoll() define
18 : #include "llvm/Support/Compiler.h"
19 : #include "llvm/Support/MemoryBuffer.h"
20 : #include "llvm/Support/SourceMgr.h"
21 : #include "llvm/TableGen/Error.h"
22 : #include <cctype>
23 : #include <cerrno>
24 : #include <cstdint>
25 : #include <cstdio>
26 : #include <cstdlib>
27 : #include <cstring>
28 :
29 : using namespace llvm;
30 :
31 352 : TGLexer::TGLexer(SourceMgr &SM) : SrcMgr(SM) {
32 352 : CurBuffer = SrcMgr.getMainFileID();
33 352 : CurBuf = SrcMgr.getMemoryBuffer(CurBuffer)->getBuffer();
34 352 : CurPtr = CurBuf.begin();
35 352 : TokStart = nullptr;
36 352 : }
37 :
38 21844850 : SMLoc TGLexer::getLoc() const {
39 21844850 : return SMLoc::getFromPointer(TokStart);
40 : }
41 :
42 : /// ReturnError - Set the error to the specified string at the specified
43 : /// location. This is defined to always return tgtok::Error.
44 0 : tgtok::TokKind TGLexer::ReturnError(const char *Loc, const Twine &Msg) {
45 0 : PrintError(Loc, Msg);
46 0 : return tgtok::Error;
47 : }
48 :
49 137885815 : int TGLexer::getNextChar() {
50 137894267 : char CurChar = *CurPtr++;
51 137894267 : switch (CurChar) {
52 127573537 : default:
53 127573537 : return (unsigned char)CurChar;
54 8800 : case 0: {
55 : // A nul character in the stream is either the end of the current buffer or
56 : // a random nul in the file. Disambiguate that here.
57 17600 : if (CurPtr-1 != CurBuf.end())
58 : return 0; // Just whitespace.
59 :
60 : // If this is the end of an included file, pop the parent file off the
61 : // include stack.
62 8800 : SMLoc ParentIncludeLoc = SrcMgr.getParentIncludeLoc(CurBuffer);
63 8800 : if (ParentIncludeLoc != SMLoc()) {
64 8452 : CurBuffer = SrcMgr.FindBufferContainingLoc(ParentIncludeLoc);
65 8452 : CurBuf = SrcMgr.getMemoryBuffer(CurBuffer)->getBuffer();
66 8452 : CurPtr = ParentIncludeLoc.getPointer();
67 8452 : return getNextChar();
68 : }
69 :
70 : // Otherwise, return end of file.
71 348 : --CurPtr; // Another call to lex will return EOF again.
72 348 : return EOF;
73 : }
74 10311930 : case '\n':
75 : case '\r':
76 : // Handle the newline character by ignoring it and incrementing the line
77 : // count. However, be careful about 'dos style' files with \n\r in them.
78 : // Only treat a \n\r or \r\n as a single line.
79 10311930 : if ((*CurPtr == '\n' || (*CurPtr == '\r')) &&
80 : *CurPtr != CurChar)
81 0 : ++CurPtr; // Eat the two char newline sequence.
82 : return '\n';
83 : }
84 : }
85 :
86 2646843 : int TGLexer::peekNextChar(int Index) {
87 2646843 : return *(CurPtr + Index);
88 : }
89 :
90 133946215 : tgtok::TokKind TGLexer::LexToken() {
91 133946215 : TokStart = CurPtr;
92 : // This always consumes at least one character.
93 133946215 : int CurChar = getNextChar();
94 :
95 133946215 : switch (CurChar) {
96 19523396 : default:
97 : // Handle letters: [a-zA-Z_]
98 19523396 : if (isalpha(CurChar) || CurChar == '_')
99 19523396 : return LexIdentifier();
100 :
101 : // Unknown character, emit an error.
102 0 : return ReturnError(TokStart, "Unexpected character");
103 : case EOF: return tgtok::Eof;
104 3481361 : case ':': return tgtok::colon;
105 3127853 : case ';': return tgtok::semi;
106 123623 : case '.': return tgtok::period;
107 8214572 : case ',': return tgtok::comma;
108 3609746 : case '<': return tgtok::less;
109 3609746 : case '>': return tgtok::greater;
110 3005986 : case ']': return tgtok::r_square;
111 593268 : case '{': return tgtok::l_brace;
112 593268 : case '}': return tgtok::r_brace;
113 1194006 : case '(': return tgtok::l_paren;
114 1194006 : case ')': return tgtok::r_paren;
115 976359 : case '=': return tgtok::equal;
116 24561 : case '?': return tgtok::question;
117 112116 : case '#': return tgtok::paste;
118 :
119 73963432 : case 0:
120 : case ' ':
121 : case '\t':
122 : case '\n':
123 : case '\r':
124 : // Ignore whitespace.
125 73963432 : return LexToken();
126 2250944 : case '/':
127 : // If this is the start of a // comment, skip until the end of the line or
128 : // the end of the buffer.
129 2250944 : if (*CurPtr == '/')
130 2238193 : SkipBCPLComment();
131 12751 : else if (*CurPtr == '*') {
132 12751 : if (SkipCComment())
133 : return tgtok::Error;
134 : } else // Otherwise, this is an error.
135 0 : return ReturnError(TokStart, "Unexpected character");
136 2250943 : return LexToken();
137 2124691 : case '-': case '+':
138 : case '0': case '1': case '2': case '3': case '4': case '5': case '6':
139 : case '7': case '8': case '9': {
140 : int NextChar = 0;
141 2124691 : if (isdigit(CurChar)) {
142 : // Allow identifiers to start with a number if it is followed by
143 : // an identifier. This can happen with paste operations like
144 : // foo#8i.
145 : int i = 0;
146 : do {
147 2276596 : NextChar = peekNextChar(i++);
148 2276596 : } while (isdigit(NextChar));
149 :
150 1911391 : if (NextChar == 'x' || NextChar == 'b') {
151 : // If this is [0-9]b[01] or [0-9]x[0-9A-fa-f] this is most
152 : // likely a number.
153 370247 : int NextNextChar = peekNextChar(i);
154 : switch (NextNextChar) {
155 : default:
156 : break;
157 288315 : case '0': case '1':
158 288315 : if (NextChar == 'b')
159 261719 : return LexNumber();
160 : LLVM_FALLTHROUGH;
161 : case '2': case '3': case '4': case '5':
162 : case '6': case '7': case '8': case '9':
163 : case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
164 : case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
165 108528 : if (NextChar == 'x')
166 108528 : return LexNumber();
167 : break;
168 : }
169 : }
170 : }
171 :
172 1754444 : if (isalpha(NextChar) || NextChar == '_')
173 2041 : return LexIdentifier();
174 :
175 1752403 : return LexNumber();
176 : }
177 2024271 : case '"': return LexString();
178 1085439 : case '$': return LexVarName();
179 3026716 : case '[': return LexBracket();
180 86508 : case '!': return LexExclaim();
181 : }
182 : }
183 :
184 : /// LexString - Lex "[^"]*"
185 2024271 : tgtok::TokKind TGLexer::LexString() {
186 2024271 : const char *StrStart = CurPtr;
187 :
188 2024271 : CurStrVal = "";
189 :
190 41228815 : while (*CurPtr != '"') {
191 : // If we hit the end of the buffer, report an error.
192 39204544 : if (*CurPtr == 0 && CurPtr == CurBuf.end())
193 0 : return ReturnError(StrStart, "End of file in string literal");
194 :
195 39204544 : if (*CurPtr == '\n' || *CurPtr == '\r')
196 0 : return ReturnError(StrStart, "End of line in string literal");
197 :
198 39204544 : if (*CurPtr != '\\') {
199 39124954 : CurStrVal += *CurPtr++;
200 39124954 : continue;
201 : }
202 :
203 79590 : ++CurPtr;
204 :
205 79590 : switch (*CurPtr) {
206 13646 : case '\\': case '\'': case '"':
207 : // These turn into their literal character.
208 13646 : CurStrVal += *CurPtr++;
209 : break;
210 65142 : case 't':
211 : CurStrVal += '\t';
212 65142 : ++CurPtr;
213 65142 : break;
214 802 : case 'n':
215 : CurStrVal += '\n';
216 802 : ++CurPtr;
217 802 : break;
218 :
219 : case '\n':
220 : case '\r':
221 0 : return ReturnError(CurPtr, "escaped newlines not supported in tblgen");
222 :
223 : // If we hit the end of the buffer, report an error.
224 0 : case '\0':
225 0 : if (CurPtr == CurBuf.end())
226 0 : return ReturnError(StrStart, "End of file in string literal");
227 : LLVM_FALLTHROUGH;
228 : default:
229 0 : return ReturnError(CurPtr, "invalid escape in string literal");
230 : }
231 : }
232 :
233 2024271 : ++CurPtr;
234 2024271 : return tgtok::StrVal;
235 : }
236 :
237 1085439 : tgtok::TokKind TGLexer::LexVarName() {
238 1085439 : if (!isalpha(CurPtr[0]) && CurPtr[0] != '_')
239 0 : return ReturnError(TokStart, "Invalid variable name");
240 :
241 : // Otherwise, we're ok, consume the rest of the characters.
242 1085439 : const char *VarNameStart = CurPtr++;
243 :
244 3360273 : while (isalpha(*CurPtr) || isdigit(*CurPtr) || *CurPtr == '_')
245 2274834 : ++CurPtr;
246 :
247 1085439 : CurStrVal.assign(VarNameStart, CurPtr);
248 1085439 : return tgtok::VarName;
249 : }
250 :
251 19525437 : tgtok::TokKind TGLexer::LexIdentifier() {
252 : // The first letter is [a-zA-Z_#].
253 19525437 : const char *IdentStart = TokStart;
254 :
255 : // Match the rest of the identifier regex: [0-9a-zA-Z_#]*
256 185477364 : while (isalpha(*CurPtr) || isdigit(*CurPtr) || *CurPtr == '_')
257 165951927 : ++CurPtr;
258 :
259 : // Check to see if this identifier is a keyword.
260 19525437 : StringRef Str(IdentStart, CurPtr-IdentStart);
261 :
262 : if (Str == "include") {
263 8452 : if (LexInclude()) return tgtok::Error;
264 8452 : return Lex();
265 : }
266 :
267 15635308 : tgtok::TokKind Kind = StringSwitch<tgtok::TokKind>(Str)
268 : .Case("int", tgtok::Int)
269 : .Case("bit", tgtok::Bit)
270 : .Case("bits", tgtok::Bits)
271 : .Case("string", tgtok::String)
272 : .Case("list", tgtok::List)
273 : .Case("code", tgtok::Code)
274 : .Case("dag", tgtok::Dag)
275 : .Case("class", tgtok::Class)
276 : .Case("def", tgtok::Def)
277 : .Case("foreach", tgtok::Foreach)
278 : .Case("defm", tgtok::Defm)
279 : .Case("defset", tgtok::Defset)
280 : .Case("multiclass", tgtok::MultiClass)
281 : .Case("field", tgtok::Field)
282 : .Case("let", tgtok::Let)
283 : .Case("in", tgtok::In)
284 : .Default(tgtok::Id);
285 :
286 3881677 : if (Kind == tgtok::Id)
287 15635308 : CurStrVal.assign(Str.begin(), Str.end());
288 : return Kind;
289 : }
290 :
291 : /// LexInclude - We just read the "include" token. Get the string token that
292 : /// comes next and enter the include.
293 8452 : bool TGLexer::LexInclude() {
294 : // The token after the include must be a string.
295 8452 : tgtok::TokKind Tok = LexToken();
296 8452 : if (Tok == tgtok::Error) return true;
297 8452 : if (Tok != tgtok::StrVal) {
298 0 : PrintError(getLoc(), "Expected filename after include");
299 0 : return true;
300 : }
301 :
302 : // Get the string.
303 : std::string Filename = CurStrVal;
304 : std::string IncludedFile;
305 :
306 8452 : CurBuffer = SrcMgr.AddIncludeFile(Filename, SMLoc::getFromPointer(CurPtr),
307 : IncludedFile);
308 8452 : if (!CurBuffer) {
309 0 : PrintError(getLoc(), "Could not find include file '" + Filename + "'");
310 0 : return true;
311 : }
312 :
313 : DependenciesMapTy::const_iterator Found = Dependencies.find(IncludedFile);
314 8452 : if (Found != Dependencies.end()) {
315 0 : PrintError(getLoc(),
316 0 : "File '" + IncludedFile + "' has already been included.");
317 0 : SrcMgr.PrintMessage(Found->second, SourceMgr::DK_Note,
318 : "previously included here");
319 0 : return true;
320 : }
321 8452 : Dependencies.insert(std::make_pair(IncludedFile, getLoc()));
322 : // Save the line number and lex buffer of the includer.
323 8452 : CurBuf = SrcMgr.getMemoryBuffer(CurBuffer)->getBuffer();
324 8452 : CurPtr = CurBuf.begin();
325 8452 : return false;
326 : }
327 :
328 2238193 : void TGLexer::SkipBCPLComment() {
329 2238193 : ++CurPtr; // skip the second slash.
330 : while (true) {
331 74788217 : switch (*CurPtr) {
332 : case '\n':
333 : case '\r':
334 : return; // Newline is end of comment.
335 1 : case 0:
336 : // If this is the end of the buffer, end the comment.
337 2 : if (CurPtr == CurBuf.end())
338 : return;
339 : break;
340 : }
341 : // Otherwise, skip the character.
342 72550024 : ++CurPtr;
343 : }
344 : }
345 :
346 : /// SkipCComment - This skips C-style /**/ comments. The only difference from C
347 : /// is that we allow nesting.
348 12751 : bool TGLexer::SkipCComment() {
349 12751 : ++CurPtr; // skip the star.
350 : unsigned CommentDepth = 1;
351 :
352 : while (true) {
353 254846 : int CurChar = getNextChar();
354 254846 : switch (CurChar) {
355 : case EOF:
356 1 : PrintError(TokStart, "Unterminated comment!");
357 1 : return true;
358 18749 : case '*':
359 : // End of the comment?
360 18749 : if (CurPtr[0] != '/') break;
361 :
362 12753 : ++CurPtr; // End the */.
363 12753 : if (--CommentDepth == 0)
364 : return false;
365 : break;
366 102 : case '/':
367 : // Start of a nested comment?
368 102 : if (CurPtr[0] != '*') break;
369 4 : ++CurPtr;
370 4 : ++CommentDepth;
371 4 : break;
372 : }
373 : }
374 : }
375 :
376 : /// LexNumber - Lex:
377 : /// [-+]?[0-9]+
378 : /// 0x[0-9a-fA-F]+
379 : /// 0b[01]+
380 2122650 : tgtok::TokKind TGLexer::LexNumber() {
381 2122650 : if (CurPtr[-1] == '0') {
382 725752 : if (CurPtr[0] == 'x') {
383 108528 : ++CurPtr;
384 : const char *NumStart = CurPtr;
385 373425 : while (isxdigit(CurPtr[0]))
386 264897 : ++CurPtr;
387 :
388 : // Requires at least one hex digit.
389 108528 : if (CurPtr == NumStart)
390 0 : return ReturnError(TokStart, "Invalid hexadecimal number");
391 :
392 108528 : errno = 0;
393 108528 : CurIntVal = strtoll(NumStart, nullptr, 16);
394 108528 : if (errno == EINVAL)
395 0 : return ReturnError(TokStart, "Invalid hexadecimal number");
396 108528 : if (errno == ERANGE) {
397 94 : errno = 0;
398 94 : CurIntVal = (int64_t)strtoull(NumStart, nullptr, 16);
399 94 : if (errno == EINVAL)
400 0 : return ReturnError(TokStart, "Invalid hexadecimal number");
401 94 : if (errno == ERANGE)
402 0 : return ReturnError(TokStart, "Hexadecimal number out of range");
403 : }
404 108528 : return tgtok::IntVal;
405 617224 : } else if (CurPtr[0] == 'b') {
406 261719 : ++CurPtr;
407 : const char *NumStart = CurPtr;
408 1337245 : while (CurPtr[0] == '0' || CurPtr[0] == '1')
409 1075526 : ++CurPtr;
410 :
411 : // Requires at least one binary digit.
412 261719 : if (CurPtr == NumStart)
413 0 : return ReturnError(CurPtr-2, "Invalid binary number");
414 261719 : CurIntVal = strtoll(NumStart, nullptr, 2);
415 261719 : return tgtok::BinaryIntVal;
416 : }
417 : }
418 :
419 : // Check for a sign without a digit.
420 1752403 : if (!isdigit(CurPtr[0])) {
421 1217566 : if (CurPtr[-1] == '-')
422 : return tgtok::minus;
423 1217098 : else if (CurPtr[-1] == '+')
424 : return tgtok::plus;
425 : }
426 :
427 2438479 : while (isdigit(CurPtr[0]))
428 686544 : ++CurPtr;
429 1751935 : CurIntVal = strtoll(TokStart, nullptr, 10);
430 1751935 : return tgtok::IntVal;
431 : }
432 :
433 : /// LexBracket - We just read '['. If this is a code block, return it,
434 : /// otherwise return the bracket. Match: '[' and '[{ ( [^}]+ | }[^]] )* }]'
435 3026716 : tgtok::TokKind TGLexer::LexBracket() {
436 3026716 : if (CurPtr[0] != '{')
437 : return tgtok::l_square;
438 20730 : ++CurPtr;
439 : const char *CodeStart = CurPtr;
440 : while (true) {
441 3660623 : int Char = getNextChar();
442 3660623 : if (Char == EOF) break;
443 :
444 3660623 : if (Char != '}') continue;
445 :
446 24131 : Char = getNextChar();
447 24131 : if (Char == EOF) break;
448 24131 : if (Char == ']') {
449 20730 : CurStrVal.assign(CodeStart, CurPtr-2);
450 20730 : return tgtok::CodeFragment;
451 : }
452 : }
453 :
454 0 : return ReturnError(CodeStart-2, "Unterminated Code Block");
455 : }
456 :
457 : /// LexExclaim - Lex '!' and '![a-zA-Z]+'.
458 86508 : tgtok::TokKind TGLexer::LexExclaim() {
459 86508 : if (!isalpha(*CurPtr))
460 0 : return ReturnError(CurPtr - 1, "Invalid \"!operator\"");
461 :
462 86508 : const char *Start = CurPtr++;
463 437966 : while (isalpha(*CurPtr))
464 351458 : ++CurPtr;
465 :
466 : // Check to see which operator this is.
467 : tgtok::TokKind Kind =
468 86508 : StringSwitch<tgtok::TokKind>(StringRef(Start, CurPtr - Start))
469 86508 : .Case("eq", tgtok::XEq)
470 86508 : .Case("ne", tgtok::XNe)
471 86508 : .Case("le", tgtok::XLe)
472 86508 : .Case("lt", tgtok::XLt)
473 86508 : .Case("ge", tgtok::XGe)
474 86508 : .Case("gt", tgtok::XGt)
475 86508 : .Case("if", tgtok::XIf)
476 86508 : .Case("isa", tgtok::XIsA)
477 86508 : .Case("head", tgtok::XHead)
478 86508 : .Case("tail", tgtok::XTail)
479 86508 : .Case("size", tgtok::XSize)
480 86508 : .Case("con", tgtok::XConcat)
481 86508 : .Case("dag", tgtok::XDag)
482 86508 : .Case("add", tgtok::XADD)
483 86508 : .Case("and", tgtok::XAND)
484 86508 : .Case("or", tgtok::XOR)
485 86508 : .Case("shl", tgtok::XSHL)
486 86508 : .Case("sra", tgtok::XSRA)
487 86508 : .Case("srl", tgtok::XSRL)
488 86508 : .Case("cast", tgtok::XCast)
489 86508 : .Case("empty", tgtok::XEmpty)
490 86508 : .Case("subst", tgtok::XSubst)
491 86508 : .Case("foldl", tgtok::XFoldl)
492 86508 : .Case("foreach", tgtok::XForEach)
493 86508 : .Case("listconcat", tgtok::XListConcat)
494 86508 : .Case("strconcat", tgtok::XStrConcat)
495 : .Default(tgtok::Error);
496 :
497 86508 : return Kind != tgtok::Error ? Kind : ReturnError(Start-1, "Unknown operator");
498 : }
|