LLVM 22.0.0git
AsmLexer.cpp
Go to the documentation of this file.
1//===- AsmLexer.cpp - Lexer for Assembly Files ----------------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This class implements the lexer for assembly files.
10//
11//===----------------------------------------------------------------------===//
12
14#include "llvm/ADT/APInt.h"
15#include "llvm/ADT/ArrayRef.h"
17#include "llvm/ADT/StringRef.h"
18#include "llvm/MC/MCAsmInfo.h"
20#include "llvm/Support/SMLoc.h"
23#include <cassert>
24#include <cctype>
25#include <cstdio>
26#include <cstring>
27#include <string>
28
29using namespace llvm;
30
31SMLoc AsmToken::getLoc() const { return SMLoc::getFromPointer(Str.data()); }
32
34 return SMLoc::getFromPointer(Str.data() + Str.size());
35}
36
38
39void AsmToken::dump(raw_ostream &OS) const {
40 switch (Kind) {
41 case AsmToken::Error:
42 OS << "error";
43 break;
45 OS << "identifier: " << getString();
46 break;
48 OS << "int: " << getString();
49 break;
50 case AsmToken::Real:
51 OS << "real: " << getString();
52 break;
54 OS << "string: " << getString();
55 break;
56
57 // clang-format off
58 case AsmToken::Amp: OS << "Amp"; break;
59 case AsmToken::AmpAmp: OS << "AmpAmp"; break;
60 case AsmToken::At: OS << "At"; break;
61 case AsmToken::BackSlash: OS << "BackSlash"; break;
62 case AsmToken::BigNum: OS << "BigNum"; break;
63 case AsmToken::Caret: OS << "Caret"; break;
64 case AsmToken::Colon: OS << "Colon"; break;
65 case AsmToken::Comma: OS << "Comma"; break;
66 case AsmToken::Comment: OS << "Comment"; break;
67 case AsmToken::Dollar: OS << "Dollar"; break;
68 case AsmToken::Dot: OS << "Dot"; break;
69 case AsmToken::EndOfStatement: OS << "EndOfStatement"; break;
70 case AsmToken::Eof: OS << "Eof"; break;
71 case AsmToken::Equal: OS << "Equal"; break;
72 case AsmToken::EqualEqual: OS << "EqualEqual"; break;
73 case AsmToken::Exclaim: OS << "Exclaim"; break;
74 case AsmToken::ExclaimEqual: OS << "ExclaimEqual"; break;
75 case AsmToken::Greater: OS << "Greater"; break;
76 case AsmToken::GreaterEqual: OS << "GreaterEqual"; break;
77 case AsmToken::GreaterGreater: OS << "GreaterGreater"; break;
78 case AsmToken::Hash: OS << "Hash"; break;
79 case AsmToken::HashDirective: OS << "HashDirective"; break;
80 case AsmToken::LBrac: OS << "LBrac"; break;
81 case AsmToken::LCurly: OS << "LCurly"; break;
82 case AsmToken::LParen: OS << "LParen"; break;
83 case AsmToken::Less: OS << "Less"; break;
84 case AsmToken::LessEqual: OS << "LessEqual"; break;
85 case AsmToken::LessGreater: OS << "LessGreater"; break;
86 case AsmToken::LessLess: OS << "LessLess"; break;
87 case AsmToken::Minus: OS << "Minus"; break;
88 case AsmToken::MinusGreater: OS << "MinusGreater"; break;
89 case AsmToken::Percent: OS << "Percent"; break;
90 case AsmToken::Pipe: OS << "Pipe"; break;
91 case AsmToken::PipePipe: OS << "PipePipe"; break;
92 case AsmToken::Plus: OS << "Plus"; break;
93 case AsmToken::Question: OS << "Question"; break;
94 case AsmToken::RBrac: OS << "RBrac"; break;
95 case AsmToken::RCurly: OS << "RCurly"; break;
96 case AsmToken::RParen: OS << "RParen"; break;
97 case AsmToken::Slash: OS << "Slash"; break;
98 case AsmToken::Space: OS << "Space"; break;
99 case AsmToken::Star: OS << "Star"; break;
100 case AsmToken::Tilde: OS << "Tilde"; break;
101 // clang-format on
102 }
103
104 // Print the token string.
105 OS << " (\"";
107 OS << "\")";
108}
109
110AsmLexer::AsmLexer(const MCAsmInfo &MAI) : MAI(MAI) {
111 // For COFF targets, this is true, while for ELF targets, it should be false.
112 // Currently, @specifier parsing depends on '@' being included in the token.
113 AllowAtInIdentifier = !StringRef(MAI.getCommentString()).starts_with("@") &&
114 MAI.useAtForSpecifier();
115 LexMotorolaIntegers = MAI.shouldUseMotorolaIntegers();
116
117 CurTok.emplace_back(AsmToken::Space, StringRef());
118}
119
120void AsmLexer::setBuffer(StringRef Buf, const char *ptr,
121 bool EndStatementAtEOF) {
122 // Buffer must be NULL-terminated. NULL terminator must reside at `Buf.end()`.
123 // It must be safe to dereference `Buf.end()`.
124 assert(*Buf.end() == '\0' &&
125 "Buffer provided to AsmLexer lacks null terminator.");
126
127 CurBuf = Buf;
128
129 if (ptr)
130 CurPtr = ptr;
131 else
132 CurPtr = CurBuf.begin();
133
134 TokStart = nullptr;
135 this->EndStatementAtEOF = EndStatementAtEOF;
136}
137
138/// ReturnError - Set the error to the specified string at the specified
139/// location. This is defined to always return AsmToken::Error.
140AsmToken AsmLexer::ReturnError(const char *Loc, const std::string &Msg) {
141 SetError(SMLoc::getFromPointer(Loc), Msg);
142
143 return AsmToken(AsmToken::Error, StringRef(Loc, CurPtr - Loc));
144}
145
146int AsmLexer::getNextChar() {
147 if (CurPtr == CurBuf.end())
148 return EOF;
149 return (unsigned char)*CurPtr++;
150}
151
152int AsmLexer::peekNextChar() {
153 if (CurPtr == CurBuf.end())
154 return EOF;
155 return (unsigned char)*CurPtr;
156}
157
158/// The leading integral digit sequence and dot should have already been
159/// consumed, some or all of the fractional digit sequence *can* have been
160/// consumed.
161AsmToken AsmLexer::LexFloatLiteral() {
162 // Skip the fractional digit sequence.
163 while (isDigit(*CurPtr))
164 ++CurPtr;
165
166 if (*CurPtr == '-' || *CurPtr == '+')
167 return ReturnError(CurPtr, "invalid sign in float literal");
168
169 // Check for exponent
170 if ((*CurPtr == 'e' || *CurPtr == 'E')) {
171 ++CurPtr;
172
173 if (*CurPtr == '-' || *CurPtr == '+')
174 ++CurPtr;
175
176 while (isDigit(*CurPtr))
177 ++CurPtr;
178 }
179
180 return AsmToken(AsmToken::Real,
181 StringRef(TokStart, CurPtr - TokStart));
182}
183
184/// LexHexFloatLiteral matches essentially (.[0-9a-fA-F]*)?[pP][+-]?[0-9a-fA-F]+
185/// while making sure there are enough actual digits around for the constant to
186/// be valid.
187///
188/// The leading "0x[0-9a-fA-F]*" (i.e. integer part) has already been consumed
189/// before we get here.
190AsmToken AsmLexer::LexHexFloatLiteral(bool NoIntDigits) {
191 assert((*CurPtr == 'p' || *CurPtr == 'P' || *CurPtr == '.') &&
192 "unexpected parse state in floating hex");
193 bool NoFracDigits = true;
194
195 // Skip the fractional part if there is one
196 if (*CurPtr == '.') {
197 ++CurPtr;
198
199 const char *FracStart = CurPtr;
200 while (isHexDigit(*CurPtr))
201 ++CurPtr;
202
203 NoFracDigits = CurPtr == FracStart;
204 }
205
206 if (NoIntDigits && NoFracDigits)
207 return ReturnError(TokStart, "invalid hexadecimal floating-point constant: "
208 "expected at least one significand digit");
209
210 // Make sure we do have some kind of proper exponent part
211 if (*CurPtr != 'p' && *CurPtr != 'P')
212 return ReturnError(TokStart, "invalid hexadecimal floating-point constant: "
213 "expected exponent part 'p'");
214 ++CurPtr;
215
216 if (*CurPtr == '+' || *CurPtr == '-')
217 ++CurPtr;
218
219 // N.b. exponent digits are *not* hex
220 const char *ExpStart = CurPtr;
221 while (isDigit(*CurPtr))
222 ++CurPtr;
223
224 if (CurPtr == ExpStart)
225 return ReturnError(TokStart, "invalid hexadecimal floating-point constant: "
226 "expected at least one exponent digit");
227
228 return AsmToken(AsmToken::Real, StringRef(TokStart, CurPtr - TokStart));
229}
230
231/// LexIdentifier: [a-zA-Z_$.@?][a-zA-Z0-9_$.@#?]*
232static bool isIdentifierChar(char C, bool AllowAt, bool AllowHash) {
233 return isAlnum(C) || C == '_' || C == '$' || C == '.' || C == '?' ||
234 (AllowAt && C == '@') || (AllowHash && C == '#');
235}
236
237AsmToken AsmLexer::LexIdentifier() {
238 // Check for floating point literals.
239 if (CurPtr[-1] == '.' && isDigit(*CurPtr)) {
240 // Disambiguate a .1243foo identifier from a floating literal.
241 while (isDigit(*CurPtr))
242 ++CurPtr;
243
244 if (!isIdentifierChar(*CurPtr, AllowAtInIdentifier,
245 AllowHashInIdentifier) ||
246 *CurPtr == 'e' || *CurPtr == 'E')
247 return LexFloatLiteral();
248 }
249
250 while (isIdentifierChar(*CurPtr, AllowAtInIdentifier, AllowHashInIdentifier))
251 ++CurPtr;
252
253 // Handle . as a special case.
254 if (CurPtr == TokStart+1 && TokStart[0] == '.')
255 return AsmToken(AsmToken::Dot, StringRef(TokStart, 1));
256
257 return AsmToken(AsmToken::Identifier, StringRef(TokStart, CurPtr - TokStart));
258}
259
260/// LexSlash: Slash: /
261/// C-Style Comment: /* ... */
262/// C-style Comment: // ...
263AsmToken AsmLexer::LexSlash() {
264 if (!MAI.shouldAllowAdditionalComments()) {
265 IsAtStartOfStatement = false;
266 return AsmToken(AsmToken::Slash, StringRef(TokStart, 1));
267 }
268
269 switch (*CurPtr) {
270 case '*':
271 IsAtStartOfStatement = false;
272 break; // C style comment.
273 case '/':
274 ++CurPtr;
275 return LexLineComment();
276 default:
277 IsAtStartOfStatement = false;
278 return AsmToken(AsmToken::Slash, StringRef(TokStart, 1));
279 }
280
281 // C Style comment.
282 ++CurPtr; // skip the star.
283 const char *CommentTextStart = CurPtr;
284 while (CurPtr != CurBuf.end()) {
285 switch (*CurPtr++) {
286 case '*':
287 // End of the comment?
288 if (*CurPtr != '/')
289 break;
290 // If we have a CommentConsumer, notify it about the comment.
291 if (CommentConsumer) {
292 CommentConsumer->HandleComment(
293 SMLoc::getFromPointer(CommentTextStart),
294 StringRef(CommentTextStart, CurPtr - 1 - CommentTextStart));
295 }
296 ++CurPtr; // End the */.
297 return AsmToken(AsmToken::Comment,
298 StringRef(TokStart, CurPtr - TokStart));
299 }
300 }
301 return ReturnError(TokStart, "unterminated comment");
302}
303
304/// LexLineComment: Comment: #[^\n]*
305/// : //[^\n]*
306AsmToken AsmLexer::LexLineComment() {
307 // Mark This as an end of statement with a body of the
308 // comment. While it would be nicer to leave this two tokens,
309 // backwards compatability with TargetParsers makes keeping this in this form
310 // better.
311 const char *CommentTextStart = CurPtr;
312 int CurChar = getNextChar();
313 while (CurChar != '\n' && CurChar != '\r' && CurChar != EOF)
314 CurChar = getNextChar();
315 const char *NewlinePtr = CurPtr;
316 if (CurChar == '\r' && CurPtr != CurBuf.end() && *CurPtr == '\n')
317 ++CurPtr;
318
319 // If we have a CommentConsumer, notify it about the comment.
320 if (CommentConsumer) {
321 CommentConsumer->HandleComment(
322 SMLoc::getFromPointer(CommentTextStart),
323 StringRef(CommentTextStart, NewlinePtr - 1 - CommentTextStart));
324 }
325
326 IsAtStartOfLine = true;
327 // This is a whole line comment. leave newline
328 if (IsAtStartOfStatement)
329 return AsmToken(AsmToken::EndOfStatement,
330 StringRef(TokStart, CurPtr - TokStart));
331 IsAtStartOfStatement = true;
332
333 return AsmToken(AsmToken::EndOfStatement,
334 StringRef(TokStart, CurPtr - 1 - TokStart));
335}
336
/// Advance \p CurPtr past an optional integer type suffix.
///
/// Accepts at most one [uU] followed by at most two [lL], in that order.
static void SkipIgnoredIntegerSuffix(const char *&CurPtr) {
  // Skip case-insensitive ULL, UL, U, L and LL suffixes.
  if (CurPtr[0] == 'U' || CurPtr[0] == 'u')
    ++CurPtr;
  if (CurPtr[0] == 'L' || CurPtr[0] == 'l')
    ++CurPtr;
  if (CurPtr[0] == 'L' || CurPtr[0] == 'l')
    ++CurPtr;
}
346
347// Look ahead to search for first non-hex digit, if it's [hH], then we treat the
348// integer as a hexadecimal, possibly with leading zeroes.
349static unsigned doHexLookAhead(const char *&CurPtr, unsigned DefaultRadix,
350 bool LexHex) {
351 const char *FirstNonDec = nullptr;
352 const char *LookAhead = CurPtr;
353 while (true) {
354 if (isDigit(*LookAhead)) {
355 ++LookAhead;
356 } else {
357 if (!FirstNonDec)
358 FirstNonDec = LookAhead;
359
360 // Keep going if we are looking for a 'h' suffix.
361 if (LexHex && isHexDigit(*LookAhead))
362 ++LookAhead;
363 else
364 break;
365 }
366 }
367 bool isHex = LexHex && (*LookAhead == 'h' || *LookAhead == 'H');
368 CurPtr = isHex || !FirstNonDec ? LookAhead : FirstNonDec;
369 if (isHex)
370 return 16;
371 return DefaultRadix;
372}
373
374static const char *findLastDigit(const char *CurPtr, unsigned DefaultRadix) {
375 while (hexDigitValue(*CurPtr) < DefaultRadix) {
376 ++CurPtr;
377 }
378 return CurPtr;
379}
380
382 if (Value.isIntN(64))
385}
386
/// Return the name used in diagnostics for \p Radix: "binary", "octal",
/// "decimal" or "hexadecimal" for 2, 8, 10 and 16, and "base-N" otherwise.
static std::string radixName(unsigned Radix) {
  switch (Radix) {
  case 2:
    return "binary";
  case 8:
    return "octal";
  case 10:
    return "decimal";
  case 16:
    return "hexadecimal";
  default:
    return "base-" + std::to_string(Radix);
  }
}
401
402/// LexDigit: First character is [0-9].
403/// Local Label: [0-9][:]
404/// Forward/Backward Label: [0-9][fb]
405/// Binary integer: 0b[01]+
406/// Octal integer: 0[0-7]+
407/// Hex integer: 0x[0-9a-fA-F]+ or [0x]?[0-9][0-9a-fA-F]*[hH]
408/// Decimal integer: [1-9][0-9]*
409AsmToken AsmLexer::LexDigit() {
410 // MASM-flavor binary integer: [01]+[yY] (if DefaultRadix < 16, [bByY])
411 // MASM-flavor octal integer: [0-7]+[oOqQ]
412 // MASM-flavor decimal integer: [0-9]+[tT] (if DefaultRadix < 16, [dDtT])
413 // MASM-flavor hexadecimal integer: [0-9][0-9a-fA-F]*[hH]
414 if (LexMasmIntegers && isdigit(CurPtr[-1])) {
415 const char *FirstNonBinary =
416 (CurPtr[-1] != '0' && CurPtr[-1] != '1') ? CurPtr - 1 : nullptr;
417 const char *FirstNonDecimal =
418 (CurPtr[-1] < '0' || CurPtr[-1] > '9') ? CurPtr - 1 : nullptr;
419 const char *OldCurPtr = CurPtr;
420 while (isHexDigit(*CurPtr)) {
421 switch (*CurPtr) {
422 default:
423 if (!FirstNonDecimal) {
424 FirstNonDecimal = CurPtr;
425 }
426 [[fallthrough]];
427 case '9':
428 case '8':
429 case '7':
430 case '6':
431 case '5':
432 case '4':
433 case '3':
434 case '2':
435 if (!FirstNonBinary) {
436 FirstNonBinary = CurPtr;
437 }
438 break;
439 case '1':
440 case '0':
441 break;
442 }
443 ++CurPtr;
444 }
445 if (*CurPtr == '.') {
446 // MASM float literals (other than hex floats) always contain a ".", and
447 // are always written in decimal.
448 ++CurPtr;
449 return LexFloatLiteral();
450 }
451
452 if (LexMasmHexFloats && (*CurPtr == 'r' || *CurPtr == 'R')) {
453 ++CurPtr;
454 return AsmToken(AsmToken::Real, StringRef(TokStart, CurPtr - TokStart));
455 }
456
457 unsigned Radix = 0;
458 if (*CurPtr == 'h' || *CurPtr == 'H') {
459 // hexadecimal number
460 ++CurPtr;
461 Radix = 16;
462 } else if (*CurPtr == 't' || *CurPtr == 'T') {
463 // decimal number
464 ++CurPtr;
465 Radix = 10;
466 } else if (*CurPtr == 'o' || *CurPtr == 'O' || *CurPtr == 'q' ||
467 *CurPtr == 'Q') {
468 // octal number
469 ++CurPtr;
470 Radix = 8;
471 } else if (*CurPtr == 'y' || *CurPtr == 'Y') {
472 // binary number
473 ++CurPtr;
474 Radix = 2;
475 } else if (FirstNonDecimal && FirstNonDecimal + 1 == CurPtr &&
476 DefaultRadix < 14 &&
477 (*FirstNonDecimal == 'd' || *FirstNonDecimal == 'D')) {
478 Radix = 10;
479 } else if (FirstNonBinary && FirstNonBinary + 1 == CurPtr &&
480 DefaultRadix < 12 &&
481 (*FirstNonBinary == 'b' || *FirstNonBinary == 'B')) {
482 Radix = 2;
483 }
484
485 if (Radix) {
486 StringRef Result(TokStart, CurPtr - TokStart);
487 APInt Value(128, 0, true);
488
489 if (Result.drop_back().getAsInteger(Radix, Value))
490 return ReturnError(TokStart, "invalid " + radixName(Radix) + " number");
491
492 // MSVC accepts and ignores type suffices on integer literals.
494
495 return intToken(Result, Value);
496 }
497
498 // default-radix integers, or floating point numbers, fall through
499 CurPtr = OldCurPtr;
500 }
501
502 // MASM default-radix integers: [0-9a-fA-F]+
503 // (All other integer literals have a radix specifier.)
504 if (LexMasmIntegers && UseMasmDefaultRadix) {
505 CurPtr = findLastDigit(CurPtr, 16);
506 StringRef Result(TokStart, CurPtr - TokStart);
507
508 APInt Value(128, 0, true);
509 if (Result.getAsInteger(DefaultRadix, Value)) {
510 return ReturnError(TokStart,
511 "invalid " + radixName(DefaultRadix) + " number");
512 }
513
514 return intToken(Result, Value);
515 }
516
517 // Motorola hex integers: $[0-9a-fA-F]+
518 if (LexMotorolaIntegers && CurPtr[-1] == '$') {
519 const char *NumStart = CurPtr;
520 while (isHexDigit(CurPtr[0]))
521 ++CurPtr;
522
523 APInt Result(128, 0);
524 if (StringRef(NumStart, CurPtr - NumStart).getAsInteger(16, Result))
525 return ReturnError(TokStart, "invalid hexadecimal number");
526
527 return intToken(StringRef(TokStart, CurPtr - TokStart), Result);
528 }
529
530 // Motorola binary integers: %[01]+
531 if (LexMotorolaIntegers && CurPtr[-1] == '%') {
532 const char *NumStart = CurPtr;
533 while (*CurPtr == '0' || *CurPtr == '1')
534 ++CurPtr;
535
536 APInt Result(128, 0);
537 if (StringRef(NumStart, CurPtr - NumStart).getAsInteger(2, Result))
538 return ReturnError(TokStart, "invalid binary number");
539
540 return intToken(StringRef(TokStart, CurPtr - TokStart), Result);
541 }
542
543 // Decimal integer: [1-9][0-9]*
544 // HLASM-flavour decimal integer: [0-9][0-9]*
545 // FIXME: Later on, support for fb for HLASM has to be added in
546 // as they probably would be needed for asm goto
547 if (LexHLASMIntegers || CurPtr[-1] != '0' || CurPtr[0] == '.') {
548 unsigned Radix = doHexLookAhead(CurPtr, 10, LexMasmIntegers);
549
550 if (!LexHLASMIntegers) {
551 bool IsHex = Radix == 16;
552 // Check for floating point literals.
553 if (!IsHex && (*CurPtr == '.' || *CurPtr == 'e' || *CurPtr == 'E')) {
554 if (*CurPtr == '.')
555 ++CurPtr;
556 return LexFloatLiteral();
557 }
558 }
559
560 StringRef Result(TokStart, CurPtr - TokStart);
561
562 APInt Value(128, 0, true);
563 if (Result.getAsInteger(Radix, Value))
564 return ReturnError(TokStart, "invalid " + radixName(Radix) + " number");
565
566 if (!LexHLASMIntegers)
567 // The darwin/x86 (and x86-64) assembler accepts and ignores type
568 // suffices on integer literals.
570
571 return intToken(Result, Value);
572 }
573
574 if (!LexMasmIntegers && ((*CurPtr == 'b') || (*CurPtr == 'B'))) {
575 ++CurPtr;
576 // See if we actually have "0b" as part of something like "jmp 0b\n"
577 if (!isDigit(CurPtr[0])) {
578 --CurPtr;
579 StringRef Result(TokStart, CurPtr - TokStart);
580 return AsmToken(AsmToken::Integer, Result, 0);
581 }
582 const char *NumStart = CurPtr;
583 while (CurPtr[0] == '0' || CurPtr[0] == '1')
584 ++CurPtr;
585
586 // Requires at least one binary digit.
587 if (CurPtr == NumStart)
588 return ReturnError(TokStart, "invalid binary number");
589
590 StringRef Result(TokStart, CurPtr - TokStart);
591
592 APInt Value(128, 0, true);
593 if (Result.substr(2).getAsInteger(2, Value))
594 return ReturnError(TokStart, "invalid binary number");
595
596 // The darwin/x86 (and x86-64) assembler accepts and ignores ULL and LL
597 // suffixes on integer literals.
599
600 return intToken(Result, Value);
601 }
602
603 if ((*CurPtr == 'x') || (*CurPtr == 'X')) {
604 ++CurPtr;
605 const char *NumStart = CurPtr;
606 while (isHexDigit(CurPtr[0]))
607 ++CurPtr;
608
609 // "0x.0p0" is valid, and "0x0p0" (but not "0xp0" for example, which will be
610 // diagnosed by LexHexFloatLiteral).
611 if (CurPtr[0] == '.' || CurPtr[0] == 'p' || CurPtr[0] == 'P')
612 return LexHexFloatLiteral(NumStart == CurPtr);
613
614 // Otherwise requires at least one hex digit.
615 if (CurPtr == NumStart)
616 return ReturnError(CurPtr-2, "invalid hexadecimal number");
617
618 APInt Result(128, 0);
619 if (StringRef(TokStart, CurPtr - TokStart).getAsInteger(0, Result))
620 return ReturnError(TokStart, "invalid hexadecimal number");
621
622 // Consume the optional [hH].
623 if (LexMasmIntegers && (*CurPtr == 'h' || *CurPtr == 'H'))
624 ++CurPtr;
625
626 // The darwin/x86 (and x86-64) assembler accepts and ignores ULL and LL
627 // suffixes on integer literals.
629
630 return intToken(StringRef(TokStart, CurPtr - TokStart), Result);
631 }
632
633 // Either octal or hexadecimal.
634 APInt Value(128, 0, true);
635 unsigned Radix = doHexLookAhead(CurPtr, 8, LexMasmIntegers);
636 StringRef Result(TokStart, CurPtr - TokStart);
637 if (Result.getAsInteger(Radix, Value))
638 return ReturnError(TokStart, "invalid " + radixName(Radix) + " number");
639
640 // Consume the [hH].
641 if (Radix == 16)
642 ++CurPtr;
643
644 // The darwin/x86 (and x86-64) assembler accepts and ignores ULL and LL
645 // suffixes on integer literals.
647
648 return intToken(Result, Value);
649}
650
651/// LexSingleQuote: Integer: 'b'
652AsmToken AsmLexer::LexSingleQuote() {
653 int CurChar = getNextChar();
654
655 if (LexHLASMStrings)
656 return ReturnError(TokStart, "invalid usage of character literals");
657
658 if (LexMasmStrings) {
659 while (CurChar != EOF) {
660 if (CurChar != '\'') {
661 CurChar = getNextChar();
662 } else if (peekNextChar() == '\'') {
663 // In MASM single-quote strings, doubled single-quotes mean an escaped
664 // single quote, so should be lexed in.
665 (void)getNextChar();
666 CurChar = getNextChar();
667 } else {
668 break;
669 }
670 }
671 if (CurChar == EOF)
672 return ReturnError(TokStart, "unterminated string constant");
673 return AsmToken(AsmToken::String, StringRef(TokStart, CurPtr - TokStart));
674 }
675
676 if (CurChar == '\\')
677 CurChar = getNextChar();
678
679 if (CurChar == EOF)
680 return ReturnError(TokStart, "unterminated single quote");
681
682 CurChar = getNextChar();
683
684 if (CurChar != '\'')
685 return ReturnError(TokStart, "single quote way too long");
686
687 // The idea here being that 'c' is basically just an integral
688 // constant.
689 StringRef Res = StringRef(TokStart,CurPtr - TokStart);
690 long long Value;
691
692 if (Res.starts_with("\'\\")) {
693 char theChar = Res[2];
694 switch (theChar) {
695 default: Value = theChar; break;
696 case '\'': Value = '\''; break;
697 case 't': Value = '\t'; break;
698 case 'n': Value = '\n'; break;
699 case 'b': Value = '\b'; break;
700 case 'f': Value = '\f'; break;
701 case 'r': Value = '\r'; break;
702 }
703 } else
704 Value = TokStart[1];
705
706 return AsmToken(AsmToken::Integer, Res, Value);
707}
708
709/// LexQuote: String: "..."
710AsmToken AsmLexer::LexQuote() {
711 int CurChar = getNextChar();
712 if (LexHLASMStrings)
713 return ReturnError(TokStart, "invalid usage of string literals");
714
715 if (LexMasmStrings) {
716 while (CurChar != EOF) {
717 if (CurChar != '"') {
718 CurChar = getNextChar();
719 } else if (peekNextChar() == '"') {
720 // In MASM double-quoted strings, doubled double-quotes mean an escaped
721 // double quote, so should be lexed in.
722 (void)getNextChar();
723 CurChar = getNextChar();
724 } else {
725 break;
726 }
727 }
728 if (CurChar == EOF)
729 return ReturnError(TokStart, "unterminated string constant");
730 return AsmToken(AsmToken::String, StringRef(TokStart, CurPtr - TokStart));
731 }
732
733 while (CurChar != '"') {
734 if (CurChar == '\\') {
735 // Allow \", etc.
736 CurChar = getNextChar();
737 }
738
739 if (CurChar == EOF)
740 return ReturnError(TokStart, "unterminated string constant");
741
742 CurChar = getNextChar();
743 }
744
745 return AsmToken(AsmToken::String, StringRef(TokStart, CurPtr - TokStart));
746}
747
749 TokStart = CurPtr;
750
751 while (!isAtStartOfComment(CurPtr) && // Start of line comment.
752 !isAtStatementSeparator(CurPtr) && // End of statement marker.
753 *CurPtr != '\n' && *CurPtr != '\r' && CurPtr != CurBuf.end()) {
754 ++CurPtr;
755 }
756 return StringRef(TokStart, CurPtr-TokStart);
757}
758
759StringRef AsmLexer::LexUntilEndOfLine() {
760 TokStart = CurPtr;
761
762 while (*CurPtr != '\n' && *CurPtr != '\r' && CurPtr != CurBuf.end()) {
763 ++CurPtr;
764 }
765 return StringRef(TokStart, CurPtr-TokStart);
766}
767
769 bool ShouldSkipSpace) {
770 SaveAndRestore SavedTokenStart(TokStart);
771 SaveAndRestore SavedCurPtr(CurPtr);
772 SaveAndRestore SavedAtStartOfLine(IsAtStartOfLine);
773 SaveAndRestore SavedAtStartOfStatement(IsAtStartOfStatement);
774 SaveAndRestore SavedSkipSpace(SkipSpace, ShouldSkipSpace);
775 SaveAndRestore SavedIsPeeking(IsPeeking, true);
776 std::string SavedErr = getErr();
777 SMLoc SavedErrLoc = getErrLoc();
778
779 size_t ReadCount;
780 for (ReadCount = 0; ReadCount < Buf.size(); ++ReadCount) {
781 AsmToken Token = LexToken();
782
783 Buf[ReadCount] = Token;
784
785 if (Token.is(AsmToken::Eof)) {
786 ReadCount++;
787 break;
788 }
789 }
790
791 SetError(SavedErrLoc, SavedErr);
792 return ReadCount;
793}
794
795bool AsmLexer::isAtStartOfComment(const char *Ptr) {
796 if (MAI.isHLASM() && !IsAtStartOfStatement)
797 return false;
798
799 StringRef CommentString = MAI.getCommentString();
800
801 if (CommentString.size() == 1)
802 return CommentString[0] == Ptr[0];
803
804 // Allow # preprocessor comments also be counted as comments for "##" cases
805 if (CommentString[1] == '#')
806 return CommentString[0] == Ptr[0];
807
808 return strncmp(Ptr, CommentString.data(), CommentString.size()) == 0;
809}
810
811bool AsmLexer::isAtStatementSeparator(const char *Ptr) {
812 return strncmp(Ptr, MAI.getSeparatorString(),
813 strlen(MAI.getSeparatorString())) == 0;
814}
815
816AsmToken AsmLexer::LexToken() {
817 TokStart = CurPtr;
818 // This always consumes at least one character.
819 int CurChar = getNextChar();
820
821 if (!IsPeeking && CurChar == '#' && IsAtStartOfStatement) {
822 // If this starts with a '#', this may be a cpp
823 // hash directive and otherwise a line comment.
824 AsmToken TokenBuf[2];
825 MutableArrayRef<AsmToken> Buf(TokenBuf, 2);
826 size_t num = peekTokens(Buf, true);
827 // There cannot be a space preceding this
828 if (IsAtStartOfLine && num == 2 && TokenBuf[0].is(AsmToken::Integer) &&
829 TokenBuf[1].is(AsmToken::String)) {
830 CurPtr = TokStart; // reset curPtr;
831 StringRef s = LexUntilEndOfLine();
832 UnLex(TokenBuf[1]);
833 UnLex(TokenBuf[0]);
834 return AsmToken(AsmToken::HashDirective, s);
835 }
836
837 if (MAI.shouldAllowAdditionalComments())
838 return LexLineComment();
839 }
840
841 if (isAtStartOfComment(TokStart)) {
842 StringRef CommentString = MAI.getCommentString();
843 // For multi-char comment strings, advance CurPtr only if we matched the
844 // full string. This stops us from accidentally eating the newline if the
845 // current line ends in a single comment char.
846 if (CommentString.size() > 1 &&
847 StringRef(TokStart, CommentString.size()) == CommentString) {
848 CurPtr += CommentString.size() - 1;
849 }
850 return LexLineComment();
851 }
852
853 if (isAtStatementSeparator(TokStart)) {
854 CurPtr += strlen(MAI.getSeparatorString()) - 1;
855 IsAtStartOfLine = true;
856 IsAtStartOfStatement = true;
857 return AsmToken(AsmToken::EndOfStatement,
858 StringRef(TokStart, strlen(MAI.getSeparatorString())));
859 }
860
861 // If we're missing a newline at EOF, make sure we still get an
862 // EndOfStatement token before the Eof token.
863 if (CurChar == EOF && !IsAtStartOfStatement && EndStatementAtEOF) {
864 IsAtStartOfLine = true;
865 IsAtStartOfStatement = true;
866 return AsmToken(AsmToken::EndOfStatement, StringRef(TokStart, 0));
867 }
868 IsAtStartOfLine = false;
869 bool OldIsAtStartOfStatement = IsAtStartOfStatement;
870 IsAtStartOfStatement = false;
871 switch (CurChar) {
872 default:
873 // Handle identifier: [a-zA-Z_.$@#?][a-zA-Z0-9_.$@#?]*
874 // Whether or not the lexer accepts '$', '@', '#' and '?' at the start of
875 // an identifier is target-dependent. These characters are handled in the
876 // respective switch cases.
877 if (isalpha(CurChar) || CurChar == '_' || CurChar == '.')
878 return LexIdentifier();
879
880 // Unknown character, emit an error.
881 return ReturnError(TokStart, "invalid character in input");
882 case EOF:
883 if (EndStatementAtEOF) {
884 IsAtStartOfLine = true;
885 IsAtStartOfStatement = true;
886 }
887 return AsmToken(AsmToken::Eof, StringRef(TokStart, 0));
888 case 0:
889 case ' ':
890 case '\t':
891 IsAtStartOfStatement = OldIsAtStartOfStatement;
892 while (*CurPtr == ' ' || *CurPtr == '\t')
893 CurPtr++;
894 if (SkipSpace)
895 return LexToken(); // Ignore whitespace.
896 else
897 return AsmToken(AsmToken::Space, StringRef(TokStart, CurPtr - TokStart));
898 case '\r': {
899 IsAtStartOfLine = true;
900 IsAtStartOfStatement = true;
901 // If this is a CR followed by LF, treat that as one token.
902 if (CurPtr != CurBuf.end() && *CurPtr == '\n')
903 ++CurPtr;
904 return AsmToken(AsmToken::EndOfStatement,
905 StringRef(TokStart, CurPtr - TokStart));
906 }
907 case '\n':
908 IsAtStartOfLine = true;
909 IsAtStartOfStatement = true;
910 return AsmToken(AsmToken::EndOfStatement, StringRef(TokStart, 1));
911 case ':': return AsmToken(AsmToken::Colon, StringRef(TokStart, 1));
912 case '+': return AsmToken(AsmToken::Plus, StringRef(TokStart, 1));
913 case '~': return AsmToken(AsmToken::Tilde, StringRef(TokStart, 1));
914 case '(': return AsmToken(AsmToken::LParen, StringRef(TokStart, 1));
915 case ')': return AsmToken(AsmToken::RParen, StringRef(TokStart, 1));
916 case '[': return AsmToken(AsmToken::LBrac, StringRef(TokStart, 1));
917 case ']': return AsmToken(AsmToken::RBrac, StringRef(TokStart, 1));
918 case '{': return AsmToken(AsmToken::LCurly, StringRef(TokStart, 1));
919 case '}': return AsmToken(AsmToken::RCurly, StringRef(TokStart, 1));
920 case '*': return AsmToken(AsmToken::Star, StringRef(TokStart, 1));
921 case ',': return AsmToken(AsmToken::Comma, StringRef(TokStart, 1));
922 case '$': {
923 if (LexMotorolaIntegers && isHexDigit(*CurPtr))
924 return LexDigit();
925 if (MAI.doesAllowDollarAtStartOfIdentifier())
926 return LexIdentifier();
927 return AsmToken(AsmToken::Dollar, StringRef(TokStart, 1));
928 }
929 case '@':
930 if (MAI.doesAllowAtAtStartOfIdentifier())
931 return LexIdentifier();
932 return AsmToken(AsmToken::At, StringRef(TokStart, 1));
933 case '#':
934 if (MAI.isHLASM())
935 return LexIdentifier();
936 return AsmToken(AsmToken::Hash, StringRef(TokStart, 1));
937 case '?':
938 if (MAI.doesAllowQuestionAtStartOfIdentifier())
939 return LexIdentifier();
940 return AsmToken(AsmToken::Question, StringRef(TokStart, 1));
941 case '\\': return AsmToken(AsmToken::BackSlash, StringRef(TokStart, 1));
942 case '=':
943 if (*CurPtr == '=') {
944 ++CurPtr;
945 return AsmToken(AsmToken::EqualEqual, StringRef(TokStart, 2));
946 }
947 return AsmToken(AsmToken::Equal, StringRef(TokStart, 1));
948 case '-':
949 if (*CurPtr == '>') {
950 ++CurPtr;
951 return AsmToken(AsmToken::MinusGreater, StringRef(TokStart, 2));
952 }
953 return AsmToken(AsmToken::Minus, StringRef(TokStart, 1));
954 case '|':
955 if (*CurPtr == '|') {
956 ++CurPtr;
957 return AsmToken(AsmToken::PipePipe, StringRef(TokStart, 2));
958 }
959 return AsmToken(AsmToken::Pipe, StringRef(TokStart, 1));
960 case '^': return AsmToken(AsmToken::Caret, StringRef(TokStart, 1));
961 case '&':
962 if (*CurPtr == '&') {
963 ++CurPtr;
964 return AsmToken(AsmToken::AmpAmp, StringRef(TokStart, 2));
965 }
966 return AsmToken(AsmToken::Amp, StringRef(TokStart, 1));
967 case '!':
968 if (*CurPtr == '=') {
969 ++CurPtr;
970 return AsmToken(AsmToken::ExclaimEqual, StringRef(TokStart, 2));
971 }
972 return AsmToken(AsmToken::Exclaim, StringRef(TokStart, 1));
973 case '%':
974 if (LexMotorolaIntegers && (*CurPtr == '0' || *CurPtr == '1')) {
975 return LexDigit();
976 }
977 return AsmToken(AsmToken::Percent, StringRef(TokStart, 1));
978 case '/':
979 IsAtStartOfStatement = OldIsAtStartOfStatement;
980 return LexSlash();
981 case '\'': return LexSingleQuote();
982 case '"': return LexQuote();
983 case '0': case '1': case '2': case '3': case '4':
984 case '5': case '6': case '7': case '8': case '9':
985 return LexDigit();
986 case '<':
987 switch (*CurPtr) {
988 case '<':
989 ++CurPtr;
990 return AsmToken(AsmToken::LessLess, StringRef(TokStart, 2));
991 case '=':
992 ++CurPtr;
993 return AsmToken(AsmToken::LessEqual, StringRef(TokStart, 2));
994 case '>':
995 ++CurPtr;
996 return AsmToken(AsmToken::LessGreater, StringRef(TokStart, 2));
997 default:
998 return AsmToken(AsmToken::Less, StringRef(TokStart, 1));
999 }
1000 case '>':
1001 switch (*CurPtr) {
1002 case '>':
1003 ++CurPtr;
1004 return AsmToken(AsmToken::GreaterGreater, StringRef(TokStart, 2));
1005 case '=':
1006 ++CurPtr;
1007 return AsmToken(AsmToken::GreaterEqual, StringRef(TokStart, 2));
1008 default:
1009 return AsmToken(AsmToken::Greater, StringRef(TokStart, 1));
1010 }
1011
1012 // TODO: Quoted identifiers (objc methods etc)
1013 // local labels: [0-9][:]
1014 // Forward/backward labels: [0-9][fb]
1015 // Integers, fp constants, character constants.
1016 }
1017}
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
This file implements a class to represent arbitrary precision integral constant values and operations...
static std::string radixName(unsigned Radix)
Definition AsmLexer.cpp:387
static void SkipIgnoredIntegerSuffix(const char *&CurPtr)
Definition AsmLexer.cpp:337
static unsigned doHexLookAhead(const char *&CurPtr, unsigned DefaultRadix, bool LexHex)
Definition AsmLexer.cpp:349
static AsmToken intToken(StringRef Ref, APInt &Value)
Definition AsmLexer.cpp:381
static const char * findLastDigit(const char *CurPtr, unsigned DefaultRadix)
Definition AsmLexer.cpp:374
static bool isIdentifierChar(char C)
Return true if the given character satisfies the following regular expression: [-a-zA-Z$._0-9]
Definition MILexer.cpp:118
This file provides utility classes that use RAII to save and restore values.
This file contains some functions that are useful when dealing with strings.
Class for arbitrary precision integers.
Definition APInt.h:78
size_t size() const
size - Get the array size.
Definition ArrayRef.h:142
LLVM_ABI AsmLexer(const MCAsmInfo &MAI)
Definition AsmLexer.cpp:110
void UnLex(AsmToken const &Token)
Definition AsmLexer.h:106
bool is(AsmToken::TokenKind K) const
Check if the current token has kind K.
Definition AsmLexer.h:147
SMLoc getErrLoc()
Get the current error location.
Definition AsmLexer.h:138
const std::string & getErr()
Get the current error string.
Definition AsmLexer.h:141
LLVM_ABI StringRef LexUntilEndOfStatement()
Definition AsmLexer.cpp:748
LLVM_ABI void setBuffer(StringRef Buf, const char *ptr=nullptr, bool EndStatementAtEOF=true)
Set buffer to be lexed.
Definition AsmLexer.cpp:120
LLVM_ABI size_t peekTokens(MutableArrayRef< AsmToken > Buf, bool ShouldSkipSpace=true)
Look ahead an arbitrary number of tokens.
Definition AsmLexer.cpp:768
Target independent representation for an assembler token.
Definition MCAsmMacro.h:22
LLVM_ABI SMLoc getLoc() const
Definition AsmLexer.cpp:31
StringRef getString() const
Get the string for the current token, this includes all characters (for example, the quotes on strings) in the token.
Definition MCAsmMacro.h:103
bool is(TokenKind K) const
Definition MCAsmMacro.h:75
LLVM_ABI SMLoc getEndLoc() const
Definition AsmLexer.cpp:33
LLVM_ABI void dump(raw_ostream &OS) const
Definition AsmLexer.cpp:39
LLVM_ABI SMRange getLocRange() const
Definition AsmLexer.cpp:37
This class is intended to be used as a base class for asm properties and features specific to the target.
Definition MCAsmInfo.h:64
bool isHLASM() const
Definition MCAsmInfo.h:520
StringRef getCommentString() const
Definition MCAsmInfo.h:538
const char * getSeparatorString() const
Definition MCAsmInfo.h:533
MutableArrayRef - Represent a mutable reference to an array (0 or more elements consecutively in memory), i.e. a start pointer and a length.
Definition ArrayRef.h:298
Represents a location in source code.
Definition SMLoc.h:22
static SMLoc getFromPointer(const char *Ptr)
Definition SMLoc.h:35
Represents a range in source code.
Definition SMLoc.h:47
StringRef - Represent a constant reference to a string, i.e.
Definition StringRef.h:55
bool starts_with(StringRef Prefix) const
Check if this string starts with the given Prefix.
Definition StringRef.h:261
constexpr size_t size() const
size - Get the string size.
Definition StringRef.h:146
constexpr const char * data() const
data - Get a pointer to the start of the string (which may not be null terminated).
Definition StringRef.h:140
iterator end() const
Definition StringRef.h:114
LLVM Value Representation.
Definition Value.h:75
This class implements an extremely fast bulk output stream that can only output to a stream.
Definition raw_ostream.h:53
raw_ostream & write_escaped(StringRef Str, bool UseHexEscapes=false)
Output Str, turning '\', '\t', '\n', '"', and anything that doesn't satisfy llvm::isPrint into an escape sequence.
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
This is an optimization pass for GlobalISel generic memory operations.
FunctionAddr VTableAddr Value
Definition InstrProf.h:137
unsigned hexDigitValue(char C)
Interpret the given character C as a hexadecimal digit and return its value.
bool isDigit(char C)
Checks if character C is one of the 10 decimal digits.
bool isAlnum(char C)
Checks whether character C is either a decimal digit or an uppercase or lowercase letter as classified by the "C" locale.
MutableArrayRef(T &OneElt) -> MutableArrayRef< T >
@ Ref
The access may reference the value stored in memory.
Definition ModRef.h:32
bool isHexDigit(char C)
Checks if character C is a hexadecimal numeric character.
A utility class that uses RAII to save and restore the value of a variable.