LLVM 20.0.0git
AsmLexer.cpp
Go to the documentation of this file.
1//===- AsmLexer.cpp - Lexer for Assembly Files ----------------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This class implements the lexer for assembly files.
10//
11//===----------------------------------------------------------------------===//
12
14#include "llvm/ADT/APInt.h"
15#include "llvm/ADT/ArrayRef.h"
17#include "llvm/ADT/StringRef.h"
19#include "llvm/MC/MCAsmInfo.h"
22#include "llvm/Support/SMLoc.h"
24#include <cassert>
25#include <cctype>
26#include <cstdio>
27#include <cstring>
28#include <string>
29#include <tuple>
30#include <utility>
31
32using namespace llvm;
33
34AsmLexer::AsmLexer(const MCAsmInfo &MAI) : MAI(MAI) {
37}
38
39AsmLexer::~AsmLexer() = default;
40
41void AsmLexer::setBuffer(StringRef Buf, const char *ptr,
42 bool EndStatementAtEOF) {
43 CurBuf = Buf;
44
45 if (ptr)
46 CurPtr = ptr;
47 else
48 CurPtr = CurBuf.begin();
49
50 TokStart = nullptr;
51 this->EndStatementAtEOF = EndStatementAtEOF;
52}
53
54/// ReturnError - Set the error to the specified string at the specified
55/// location. This is defined to always return AsmToken::Error.
// Builds an AsmToken::Error token whose text spans from Loc up to (but not
// including) the current cursor position CurPtr.
56AsmToken AsmLexer::ReturnError(const char *Loc, const std::string &Msg) {
// NOTE(review): source line 57 is missing from this listing. No visible line
// uses Msg; presumably the dropped line records the error (the header comment
// says the error is "set") -- confirm against the upstream file.
58
59 return AsmToken(AsmToken::Error, StringRef(Loc, CurPtr - Loc));
60}
61
62int AsmLexer::getNextChar() {
63 if (CurPtr == CurBuf.end())
64 return EOF;
65 return (unsigned char)*CurPtr++;
66}
67
68int AsmLexer::peekNextChar() {
69 if (CurPtr == CurBuf.end())
70 return EOF;
71 return (unsigned char)*CurPtr;
72}
73
74/// The leading integral digit sequence and dot should have already been
75/// consumed, some or all of the fractional digit sequence *can* have been
76/// consumed.
// Scans the remainder of a decimal floating-point literal starting at CurPtr:
// any remaining fractional digits, then an optional exponent made of 'e'/'E',
// an optional sign and a digit run. The token text runs from TokStart to the
// final cursor position.
77AsmToken AsmLexer::LexFloatLiteral() {
78 // Skip the fractional digit sequence.
79 while (isDigit(*CurPtr))
80 ++CurPtr;
81
// A sign directly after the digits (i.e. with no 'e'/'E' before it) is
// rejected; the error token starts at the offending sign character.
82 if (*CurPtr == '-' || *CurPtr == '+')
83 return ReturnError(CurPtr, "invalid sign in float literal");
84
85 // Check for exponent
86 if ((*CurPtr == 'e' || *CurPtr == 'E')) {
87 ++CurPtr;
88
// The exponent may carry one optional sign.
89 if (*CurPtr == '-' || *CurPtr == '+')
90 ++CurPtr;
91
// The exponent digit run may be empty; nothing here requires at least one.
92 while (isDigit(*CurPtr))
93 ++CurPtr;
94 }
95
// NOTE(review): source line 96 is missing from this listing; the line below
// is the tail of the return expression. Presumably the dropped line opens
// `return AsmToken(...` with the token kind -- confirm against upstream.
97 StringRef(TokStart, CurPtr - TokStart));
98}
99
100/// LexHexFloatLiteral matches essentially (.[0-9a-fA-F]*)?[pP][+-]?[0-9a-fA-F]+
101/// while making sure there are enough actual digits around for the constant to
102/// be valid.
103///
104/// The leading "0x[0-9a-fA-F]*" (i.e. integer part) has already been consumed
105/// before we get here.
106AsmToken AsmLexer::LexHexFloatLiteral(bool NoIntDigits) {
107 assert((*CurPtr == 'p' || *CurPtr == 'P' || *CurPtr == '.') &&
108 "unexpected parse state in floating hex");
109 bool NoFracDigits = true;
110
111 // Skip the fractional part if there is one
112 if (*CurPtr == '.') {
113 ++CurPtr;
114
115 const char *FracStart = CurPtr;
116 while (isHexDigit(*CurPtr))
117 ++CurPtr;
118
119 NoFracDigits = CurPtr == FracStart;
120 }
121
122 if (NoIntDigits && NoFracDigits)
123 return ReturnError(TokStart, "invalid hexadecimal floating-point constant: "
124 "expected at least one significand digit");
125
126 // Make sure we do have some kind of proper exponent part
127 if (*CurPtr != 'p' && *CurPtr != 'P')
128 return ReturnError(TokStart, "invalid hexadecimal floating-point constant: "
129 "expected exponent part 'p'");
130 ++CurPtr;
131
132 if (*CurPtr == '+' || *CurPtr == '-')
133 ++CurPtr;
134
135 // N.b. exponent digits are *not* hex
136 const char *ExpStart = CurPtr;
137 while (isDigit(*CurPtr))
138 ++CurPtr;
139
140 if (CurPtr == ExpStart)
141 return ReturnError(TokStart, "invalid hexadecimal floating-point constant: "
142 "expected at least one exponent digit");
143
145}
146
147/// LexIdentifier: [a-zA-Z_$.@?][a-zA-Z0-9_$.@#?]*
148static bool isIdentifierChar(char C, bool AllowAt, bool AllowHash) {
149 return isAlnum(C) || C == '_' || C == '$' || C == '.' || C == '?' ||
150 (AllowAt && C == '@') || (AllowHash && C == '#');
151}
152
153AsmToken AsmLexer::LexIdentifier() {
154 // Check for floating point literals.
155 if (CurPtr[-1] == '.' && isDigit(*CurPtr)) {
156 // Disambiguate a .1243foo identifier from a floating literal.
157 while (isDigit(*CurPtr))
158 ++CurPtr;
159
162 *CurPtr == 'e' || *CurPtr == 'E')
163 return LexFloatLiteral();
164 }
165
167 ++CurPtr;
168
169 // Handle . as a special case.
170 if (CurPtr == TokStart+1 && TokStart[0] == '.')
172
174}
175
176/// LexSlash: Slash: /
177/// C-Style Comment: /* ... */
178/// C-style Comment: // ...
179AsmToken AsmLexer::LexSlash() {
181 IsAtStartOfStatement = false;
183 }
184
185 switch (*CurPtr) {
186 case '*':
187 IsAtStartOfStatement = false;
188 break; // C style comment.
189 case '/':
190 ++CurPtr;
191 return LexLineComment();
192 default:
193 IsAtStartOfStatement = false;
195 }
196
197 // C Style comment.
198 ++CurPtr; // skip the star.
199 const char *CommentTextStart = CurPtr;
200 while (CurPtr != CurBuf.end()) {
201 switch (*CurPtr++) {
202 case '*':
203 // End of the comment?
204 if (*CurPtr != '/')
205 break;
206 // If we have a CommentConsumer, notify it about the comment.
207 if (CommentConsumer) {
209 SMLoc::getFromPointer(CommentTextStart),
210 StringRef(CommentTextStart, CurPtr - 1 - CommentTextStart));
211 }
212 ++CurPtr; // End the */.
214 StringRef(TokStart, CurPtr - TokStart));
215 }
216 }
217 return ReturnError(TokStart, "unterminated comment");
218}
219
220/// LexLineComment: Comment: #[^\n]*
221/// : //[^\n]*
// Lexes a line comment whose introducer has already been consumed: the cursor
// is advanced through the end of the line (a "\r\n" pair is consumed as a
// unit) or to the end of the buffer, and the line-start / statement-start
// flags are updated.
222AsmToken AsmLexer::LexLineComment() {
223 // Mark this as an end of statement with a body of the
224 // comment. While it would be nicer to leave this as two tokens,
225 // backwards compatibility with TargetParsers makes keeping this in this form
226 // better.
227 const char *CommentTextStart = CurPtr;
228 int CurChar = getNextChar();
229 while (CurChar != '\n' && CurChar != '\r' && CurChar != EOF)
230 CurChar = getNextChar();
// CurPtr now sits just past the terminating '\n' or '\r'; when the loop ended
// on EOF instead, it sits at the end of the buffer because getNextChar() does
// not advance at EOF.
231 const char *NewlinePtr = CurPtr;
232 if (CurChar == '\r' && CurPtr != CurBuf.end() && *CurPtr == '\n')
233 ++CurPtr;
234
235 // If we have a CommentConsumer, notify it about the comment.
236 if (CommentConsumer) {
// NOTE(review): source line 237 is missing from this listing; presumably it
// is the call on CommentConsumer that takes the two arguments below -- confirm.
// NOTE(review): `NewlinePtr - 1` assumes a newline byte was consumed. When the
// comment ends at EOF it looks like the last comment byte is dropped, and for
// an empty comment at EOF the length becomes -1 -- confirm and guard.
238 SMLoc::getFromPointer(CommentTextStart),
239 StringRef(CommentTextStart, NewlinePtr - 1 - CommentTextStart));
240 }
241
242 IsAtStartOfLine = true;
243 // This is a whole line comment. leave newline
244 if (IsAtStartOfStatement)
// NOTE(review): source line 245 is missing; the line below is the tail of the
// return taken for a comment that starts a statement. Its token text includes
// the newline.
246 StringRef(TokStart, CurPtr - TokStart));
247 IsAtStartOfStatement = true;
248
// NOTE(review): source line 249 is missing; the line below is the tail of the
// final return. Its token text stops one byte before CurPtr.
250 StringRef(TokStart, CurPtr - 1 - TokStart));
251}
252
/// Advance \p CurPtr past an integer type suffix, if one is present.
///
/// Accepts, case-insensitively, an optional 'U' followed by up to two 'L's
/// (so U, L, LL, UL and ULL). Anything else is left untouched. The pointed-to
/// text must be terminated by a character that is not part of a suffix (for
/// example a NUL), because the function reads until the pattern stops
/// matching.
static void SkipIgnoredIntegerSuffix(const char *&CurPtr) {
  const auto IsLongMarker = [](char C) { return C == 'L' || C == 'l'; };

  if (*CurPtr == 'U' || *CurPtr == 'u')
    ++CurPtr;

  // At most two 'L's; stop at the first character that is not one.
  for (int Remaining = 2; Remaining > 0 && IsLongMarker(*CurPtr); --Remaining)
    ++CurPtr;
}
262
263// Look ahead to search for first non-hex digit, if it's [hH], then we treat the
264// integer as a hexadecimal, possibly with leading zeroes.
265static unsigned doHexLookAhead(const char *&CurPtr, unsigned DefaultRadix,
266 bool LexHex) {
267 const char *FirstNonDec = nullptr;
268 const char *LookAhead = CurPtr;
269 while (true) {
270 if (isDigit(*LookAhead)) {
271 ++LookAhead;
272 } else {
273 if (!FirstNonDec)
274 FirstNonDec = LookAhead;
275
276 // Keep going if we are looking for a 'h' suffix.
277 if (LexHex && isHexDigit(*LookAhead))
278 ++LookAhead;
279 else
280 break;
281 }
282 }
283 bool isHex = LexHex && (*LookAhead == 'h' || *LookAhead == 'H');
284 CurPtr = isHex || !FirstNonDec ? LookAhead : FirstNonDec;
285 if (isHex)
286 return 16;
287 return DefaultRadix;
288}
289
290static const char *findLastDigit(const char *CurPtr, unsigned DefaultRadix) {
291 while (hexDigitValue(*CurPtr) < DefaultRadix) {
292 ++CurPtr;
293 }
294 return CurPtr;
295}
296
298 if (Value.isIntN(64))
301}
302
/// Return a human-readable name for \p Radix, used when building
/// "invalid <name> number" diagnostics.
///
/// Radices 2, 8, 10 and 16 map to "binary", "octal", "decimal" and
/// "hexadecimal"; any other value yields "base-<Radix>".
static std::string radixName(unsigned Radix) {
  switch (Radix) {
  case 2:
    return "binary";
  case 8:
    return "octal";
  case 10:
    return "decimal";
  case 16:
    return "hexadecimal";
  default:
    return "base-" + std::to_string(Radix);
  }
}
317
318/// LexDigit: First character is [0-9].
319/// Local Label: [0-9][:]
320/// Forward/Backward Label: [0-9][fb]
321/// Binary integer: 0b[01]+
322/// Octal integer: 0[0-7]+
323/// Hex integer: 0x[0-9a-fA-F]+ or [0x]?[0-9][0-9a-fA-F]*[hH]
324/// Decimal integer: [1-9][0-9]*
325AsmToken AsmLexer::LexDigit() {
326 // MASM-flavor binary integer: [01]+[yY] (if DefaultRadix < 16, [bByY])
327 // MASM-flavor octal integer: [0-7]+[oOqQ]
328 // MASM-flavor decimal integer: [0-9]+[tT] (if DefaultRadix < 16, [dDtT])
329 // MASM-flavor hexadecimal integer: [0-9][0-9a-fA-F]*[hH]
330 if (LexMasmIntegers && isdigit(CurPtr[-1])) {
331 const char *FirstNonBinary =
332 (CurPtr[-1] != '0' && CurPtr[-1] != '1') ? CurPtr - 1 : nullptr;
333 const char *FirstNonDecimal =
334 (CurPtr[-1] < '0' || CurPtr[-1] > '9') ? CurPtr - 1 : nullptr;
335 const char *OldCurPtr = CurPtr;
336 while (isHexDigit(*CurPtr)) {
337 switch (*CurPtr) {
338 default:
339 if (!FirstNonDecimal) {
340 FirstNonDecimal = CurPtr;
341 }
342 [[fallthrough]];
343 case '9':
344 case '8':
345 case '7':
346 case '6':
347 case '5':
348 case '4':
349 case '3':
350 case '2':
351 if (!FirstNonBinary) {
352 FirstNonBinary = CurPtr;
353 }
354 break;
355 case '1':
356 case '0':
357 break;
358 }
359 ++CurPtr;
360 }
361 if (*CurPtr == '.') {
362 // MASM float literals (other than hex floats) always contain a ".", and
363 // are always written in decimal.
364 ++CurPtr;
365 return LexFloatLiteral();
366 }
367
368 if (LexMasmHexFloats && (*CurPtr == 'r' || *CurPtr == 'R')) {
369 ++CurPtr;
371 }
372
373 unsigned Radix = 0;
374 if (*CurPtr == 'h' || *CurPtr == 'H') {
375 // hexadecimal number
376 ++CurPtr;
377 Radix = 16;
378 } else if (*CurPtr == 't' || *CurPtr == 'T') {
379 // decimal number
380 ++CurPtr;
381 Radix = 10;
382 } else if (*CurPtr == 'o' || *CurPtr == 'O' || *CurPtr == 'q' ||
383 *CurPtr == 'Q') {
384 // octal number
385 ++CurPtr;
386 Radix = 8;
387 } else if (*CurPtr == 'y' || *CurPtr == 'Y') {
388 // binary number
389 ++CurPtr;
390 Radix = 2;
391 } else if (FirstNonDecimal && FirstNonDecimal + 1 == CurPtr &&
392 DefaultRadix < 14 &&
393 (*FirstNonDecimal == 'd' || *FirstNonDecimal == 'D')) {
394 Radix = 10;
395 } else if (FirstNonBinary && FirstNonBinary + 1 == CurPtr &&
396 DefaultRadix < 12 &&
397 (*FirstNonBinary == 'b' || *FirstNonBinary == 'B')) {
398 Radix = 2;
399 }
400
401 if (Radix) {
403 APInt Value(128, 0, true);
404
405 if (Result.drop_back().getAsInteger(Radix, Value))
406 return ReturnError(TokStart, "invalid " + radixName(Radix) + " number");
407
408 // MSVC accepts and ignores type suffices on integer literals.
410
411 return intToken(Result, Value);
412 }
413
414 // default-radix integers, or floating point numbers, fall through
415 CurPtr = OldCurPtr;
416 }
417
418 // MASM default-radix integers: [0-9a-fA-F]+
419 // (All other integer literals have a radix specifier.)
421 CurPtr = findLastDigit(CurPtr, 16);
423
424 APInt Value(128, 0, true);
425 if (Result.getAsInteger(DefaultRadix, Value)) {
426 return ReturnError(TokStart,
427 "invalid " + radixName(DefaultRadix) + " number");
428 }
429
430 return intToken(Result, Value);
431 }
432
433 // Motorola hex integers: $[0-9a-fA-F]+
434 if (LexMotorolaIntegers && CurPtr[-1] == '$') {
435 const char *NumStart = CurPtr;
436 while (isHexDigit(CurPtr[0]))
437 ++CurPtr;
438
439 APInt Result(128, 0);
440 if (StringRef(NumStart, CurPtr - NumStart).getAsInteger(16, Result))
441 return ReturnError(TokStart, "invalid hexadecimal number");
442
443 return intToken(StringRef(TokStart, CurPtr - TokStart), Result);
444 }
445
446 // Motorola binary integers: %[01]+
447 if (LexMotorolaIntegers && CurPtr[-1] == '%') {
448 const char *NumStart = CurPtr;
449 while (*CurPtr == '0' || *CurPtr == '1')
450 ++CurPtr;
451
452 APInt Result(128, 0);
453 if (StringRef(NumStart, CurPtr - NumStart).getAsInteger(2, Result))
454 return ReturnError(TokStart, "invalid binary number");
455
456 return intToken(StringRef(TokStart, CurPtr - TokStart), Result);
457 }
458
459 // Decimal integer: [1-9][0-9]*
460 // HLASM-flavour decimal integer: [0-9][0-9]*
461 // FIXME: Later on, support for fb for HLASM has to be added in
462 // as they probably would be needed for asm goto
463 if (LexHLASMIntegers || CurPtr[-1] != '0' || CurPtr[0] == '.') {
464 unsigned Radix = doHexLookAhead(CurPtr, 10, LexMasmIntegers);
465
466 if (!LexHLASMIntegers) {
467 bool IsHex = Radix == 16;
468 // Check for floating point literals.
469 if (!IsHex && (*CurPtr == '.' || *CurPtr == 'e' || *CurPtr == 'E')) {
470 if (*CurPtr == '.')
471 ++CurPtr;
472 return LexFloatLiteral();
473 }
474 }
475
477
478 APInt Value(128, 0, true);
479 if (Result.getAsInteger(Radix, Value))
480 return ReturnError(TokStart, "invalid " + radixName(Radix) + " number");
481
482 if (!LexHLASMIntegers)
483 // The darwin/x86 (and x86-64) assembler accepts and ignores type
484 // suffices on integer literals.
486
487 return intToken(Result, Value);
488 }
489
490 if (!LexMasmIntegers && ((*CurPtr == 'b') || (*CurPtr == 'B'))) {
491 ++CurPtr;
492 // See if we actually have "0b" as part of something like "jmp 0b\n"
493 if (!isDigit(CurPtr[0])) {
494 --CurPtr;
496 return AsmToken(AsmToken::Integer, Result, 0);
497 }
498 const char *NumStart = CurPtr;
499 while (CurPtr[0] == '0' || CurPtr[0] == '1')
500 ++CurPtr;
501
502 // Requires at least one binary digit.
503 if (CurPtr == NumStart)
504 return ReturnError(TokStart, "invalid binary number");
505
507
508 APInt Value(128, 0, true);
509 if (Result.substr(2).getAsInteger(2, Value))
510 return ReturnError(TokStart, "invalid binary number");
511
512 // The darwin/x86 (and x86-64) assembler accepts and ignores ULL and LL
513 // suffixes on integer literals.
515
516 return intToken(Result, Value);
517 }
518
519 if ((*CurPtr == 'x') || (*CurPtr == 'X')) {
520 ++CurPtr;
521 const char *NumStart = CurPtr;
522 while (isHexDigit(CurPtr[0]))
523 ++CurPtr;
524
525 // "0x.0p0" is valid, and "0x0p0" (but not "0xp0" for example, which will be
526 // diagnosed by LexHexFloatLiteral).
527 if (CurPtr[0] == '.' || CurPtr[0] == 'p' || CurPtr[0] == 'P')
528 return LexHexFloatLiteral(NumStart == CurPtr);
529
530 // Otherwise requires at least one hex digit.
531 if (CurPtr == NumStart)
532 return ReturnError(CurPtr-2, "invalid hexadecimal number");
533
534 APInt Result(128, 0);
535 if (StringRef(TokStart, CurPtr - TokStart).getAsInteger(0, Result))
536 return ReturnError(TokStart, "invalid hexadecimal number");
537
538 // Consume the optional [hH].
539 if (LexMasmIntegers && (*CurPtr == 'h' || *CurPtr == 'H'))
540 ++CurPtr;
541
542 // The darwin/x86 (and x86-64) assembler accepts and ignores ULL and LL
543 // suffixes on integer literals.
545
546 return intToken(StringRef(TokStart, CurPtr - TokStart), Result);
547 }
548
549 // Either octal or hexadecimal.
550 APInt Value(128, 0, true);
551 unsigned Radix = doHexLookAhead(CurPtr, 8, LexMasmIntegers);
553 if (Result.getAsInteger(Radix, Value))
554 return ReturnError(TokStart, "invalid " + radixName(Radix) + " number");
555
556 // Consume the [hH].
557 if (Radix == 16)
558 ++CurPtr;
559
560 // The darwin/x86 (and x86-64) assembler accepts and ignores ULL and LL
561 // suffixes on integer literals.
563
564 return intToken(Result, Value);
565}
566
567/// LexSingleQuote: Integer: 'b'
568AsmToken AsmLexer::LexSingleQuote() {
569 int CurChar = getNextChar();
570
571 if (LexHLASMStrings)
572 return ReturnError(TokStart, "invalid usage of character literals");
573
574 if (LexMasmStrings) {
575 while (CurChar != EOF) {
576 if (CurChar != '\'') {
577 CurChar = getNextChar();
578 } else if (peekNextChar() == '\'') {
579 // In MASM single-quote strings, doubled single-quotes mean an escaped
580 // single quote, so should be lexed in.
581 (void)getNextChar();
582 CurChar = getNextChar();
583 } else {
584 break;
585 }
586 }
587 if (CurChar == EOF)
588 return ReturnError(TokStart, "unterminated string constant");
590 }
591
592 if (CurChar == '\\')
593 CurChar = getNextChar();
594
595 if (CurChar == EOF)
596 return ReturnError(TokStart, "unterminated single quote");
597
598 CurChar = getNextChar();
599
600 if (CurChar != '\'')
601 return ReturnError(TokStart, "single quote way too long");
602
603 // The idea here being that 'c' is basically just an integral
604 // constant.
605 StringRef Res = StringRef(TokStart,CurPtr - TokStart);
606 long long Value;
607
608 if (Res.starts_with("\'\\")) {
609 char theChar = Res[2];
610 switch (theChar) {
611 default: Value = theChar; break;
612 case '\'': Value = '\''; break;
613 case 't': Value = '\t'; break;
614 case 'n': Value = '\n'; break;
615 case 'b': Value = '\b'; break;
616 case 'f': Value = '\f'; break;
617 case 'r': Value = '\r'; break;
618 }
619 } else
620 Value = TokStart[1];
621
622 return AsmToken(AsmToken::Integer, Res, Value);
623}
624
625/// LexQuote: String: "..."
626AsmToken AsmLexer::LexQuote() {
627 int CurChar = getNextChar();
628 if (LexHLASMStrings)
629 return ReturnError(TokStart, "invalid usage of string literals");
630
631 if (LexMasmStrings) {
632 while (CurChar != EOF) {
633 if (CurChar != '"') {
634 CurChar = getNextChar();
635 } else if (peekNextChar() == '"') {
636 // In MASM double-quoted strings, doubled double-quotes mean an escaped
637 // double quote, so should be lexed in.
638 (void)getNextChar();
639 CurChar = getNextChar();
640 } else {
641 break;
642 }
643 }
644 if (CurChar == EOF)
645 return ReturnError(TokStart, "unterminated string constant");
647 }
648
649 while (CurChar != '"') {
650 if (CurChar == '\\') {
651 // Allow \", etc.
652 CurChar = getNextChar();
653 }
654
655 if (CurChar == EOF)
656 return ReturnError(TokStart, "unterminated string constant");
657
658 CurChar = getNextChar();
659 }
660
662}
663
665 TokStart = CurPtr;
666
667 while (!isAtStartOfComment(CurPtr) && // Start of line comment.
668 !isAtStatementSeparator(CurPtr) && // End of statement marker.
669 *CurPtr != '\n' && *CurPtr != '\r' && CurPtr != CurBuf.end()) {
670 ++CurPtr;
671 }
672 return StringRef(TokStart, CurPtr-TokStart);
673}
674
675StringRef AsmLexer::LexUntilEndOfLine() {
676 TokStart = CurPtr;
677
678 while (*CurPtr != '\n' && *CurPtr != '\r' && CurPtr != CurBuf.end()) {
679 ++CurPtr;
680 }
681 return StringRef(TokStart, CurPtr-TokStart);
682}
683
685 bool ShouldSkipSpace) {
686 SaveAndRestore SavedTokenStart(TokStart);
687 SaveAndRestore SavedCurPtr(CurPtr);
688 SaveAndRestore SavedAtStartOfLine(IsAtStartOfLine);
689 SaveAndRestore SavedAtStartOfStatement(IsAtStartOfStatement);
690 SaveAndRestore SavedSkipSpace(SkipSpace, ShouldSkipSpace);
691 SaveAndRestore SavedIsPeeking(IsPeeking, true);
692 std::string SavedErr = getErr();
693 SMLoc SavedErrLoc = getErrLoc();
694
695 size_t ReadCount;
696 for (ReadCount = 0; ReadCount < Buf.size(); ++ReadCount) {
697 AsmToken Token = LexToken();
698
699 Buf[ReadCount] = Token;
700
701 if (Token.is(AsmToken::Eof))
702 break;
703 }
704
705 SetError(SavedErrLoc, SavedErr);
706 return ReadCount;
707}
708
709bool AsmLexer::isAtStartOfComment(const char *Ptr) {
710 if (MAI.isHLASM() && !IsAtStartOfStatement)
711 return false;
712
713 StringRef CommentString = MAI.getCommentString();
714
715 if (CommentString.size() == 1)
716 return CommentString[0] == Ptr[0];
717
718 // Allow # preprocessor comments also be counted as comments for "##" cases
719 if (CommentString[1] == '#')
720 return CommentString[0] == Ptr[0];
721
722 return strncmp(Ptr, CommentString.data(), CommentString.size()) == 0;
723}
724
725bool AsmLexer::isAtStatementSeparator(const char *Ptr) {
726 return strncmp(Ptr, MAI.getSeparatorString(),
727 strlen(MAI.getSeparatorString())) == 0;
728}
729
731 TokStart = CurPtr;
732 // This always consumes at least one character.
733 int CurChar = getNextChar();
734
735 if (!IsPeeking && CurChar == '#' && IsAtStartOfStatement) {
736 // If this starts with a '#', this may be a cpp
737 // hash directive and otherwise a line comment.
738 AsmToken TokenBuf[2];
739 MutableArrayRef<AsmToken> Buf(TokenBuf, 2);
740 size_t num = peekTokens(Buf, true);
741 // There cannot be a space preceding this
742 if (IsAtStartOfLine && num == 2 && TokenBuf[0].is(AsmToken::Integer) &&
743 TokenBuf[1].is(AsmToken::String)) {
744 CurPtr = TokStart; // reset curPtr;
745 StringRef s = LexUntilEndOfLine();
746 UnLex(TokenBuf[1]);
747 UnLex(TokenBuf[0]);
749 }
750
752 return LexLineComment();
753 }
754
755 if (isAtStartOfComment(TokStart))
756 return LexLineComment();
757
758 if (isAtStatementSeparator(TokStart)) {
759 CurPtr += strlen(MAI.getSeparatorString()) - 1;
760 IsAtStartOfLine = true;
761 IsAtStartOfStatement = true;
763 StringRef(TokStart, strlen(MAI.getSeparatorString())));
764 }
765
766 // If we're missing a newline at EOF, make sure we still get an
767 // EndOfStatement token before the Eof token.
768 if (CurChar == EOF && !IsAtStartOfStatement && EndStatementAtEOF) {
769 IsAtStartOfLine = true;
770 IsAtStartOfStatement = true;
772 }
773 IsAtStartOfLine = false;
774 bool OldIsAtStartOfStatement = IsAtStartOfStatement;
775 IsAtStartOfStatement = false;
776 switch (CurChar) {
777 default:
778 // Handle identifier: [a-zA-Z_.$@#?][a-zA-Z0-9_.$@#?]*
779 // Whether or not the lexer accepts '$', '@', '#' and '?' at the start of
780 // an identifier is target-dependent. These characters are handled in the
781 // respective switch cases.
782 if (isalpha(CurChar) || CurChar == '_' || CurChar == '.')
783 return LexIdentifier();
784
785 // Unknown character, emit an error.
786 return ReturnError(TokStart, "invalid character in input");
787 case EOF:
788 if (EndStatementAtEOF) {
789 IsAtStartOfLine = true;
790 IsAtStartOfStatement = true;
791 }
793 case 0:
794 case ' ':
795 case '\t':
796 IsAtStartOfStatement = OldIsAtStartOfStatement;
797 while (*CurPtr == ' ' || *CurPtr == '\t')
798 CurPtr++;
799 if (SkipSpace)
800 return LexToken(); // Ignore whitespace.
801 else
803 case '\r': {
804 IsAtStartOfLine = true;
805 IsAtStartOfStatement = true;
806 // If this is a CR followed by LF, treat that as one token.
807 if (CurPtr != CurBuf.end() && *CurPtr == '\n')
808 ++CurPtr;
810 StringRef(TokStart, CurPtr - TokStart));
811 }
812 case '\n':
813 IsAtStartOfLine = true;
814 IsAtStartOfStatement = true;
816 case ':': return AsmToken(AsmToken::Colon, StringRef(TokStart, 1));
817 case '+': return AsmToken(AsmToken::Plus, StringRef(TokStart, 1));
818 case '~': return AsmToken(AsmToken::Tilde, StringRef(TokStart, 1));
819 case '(': return AsmToken(AsmToken::LParen, StringRef(TokStart, 1));
820 case ')': return AsmToken(AsmToken::RParen, StringRef(TokStart, 1));
821 case '[': return AsmToken(AsmToken::LBrac, StringRef(TokStart, 1));
822 case ']': return AsmToken(AsmToken::RBrac, StringRef(TokStart, 1));
823 case '{': return AsmToken(AsmToken::LCurly, StringRef(TokStart, 1));
824 case '}': return AsmToken(AsmToken::RCurly, StringRef(TokStart, 1));
825 case '*': return AsmToken(AsmToken::Star, StringRef(TokStart, 1));
826 case ',': return AsmToken(AsmToken::Comma, StringRef(TokStart, 1));
827 case '$': {
828 if (LexMotorolaIntegers && isHexDigit(*CurPtr))
829 return LexDigit();
831 return LexIdentifier();
833 }
834 case '@':
836 return LexIdentifier();
838 case '#':
839 if (MAI.isHLASM())
840 return LexIdentifier();
842 case '?':
844 return LexIdentifier();
846 case '\\': return AsmToken(AsmToken::BackSlash, StringRef(TokStart, 1));
847 case '=':
848 if (*CurPtr == '=') {
849 ++CurPtr;
851 }
853 case '-':
854 if (*CurPtr == '>') {
855 ++CurPtr;
857 }
859 case '|':
860 if (*CurPtr == '|') {
861 ++CurPtr;
863 }
865 case '^': return AsmToken(AsmToken::Caret, StringRef(TokStart, 1));
866 case '&':
867 if (*CurPtr == '&') {
868 ++CurPtr;
870 }
872 case '!':
873 if (*CurPtr == '=') {
874 ++CurPtr;
876 }
878 case '%':
879 if (LexMotorolaIntegers && (*CurPtr == '0' || *CurPtr == '1')) {
880 return LexDigit();
881 }
882
883 if (MAI.hasMipsExpressions()) {
885 unsigned OperatorLength;
886
887 std::tie(Operator, OperatorLength) =
889 StringRef(CurPtr))
890 .StartsWith("call16", {AsmToken::PercentCall16, 7})
891 .StartsWith("call_hi", {AsmToken::PercentCall_Hi, 8})
892 .StartsWith("call_lo", {AsmToken::PercentCall_Lo, 8})
893 .StartsWith("dtprel_hi", {AsmToken::PercentDtprel_Hi, 10})
894 .StartsWith("dtprel_lo", {AsmToken::PercentDtprel_Lo, 10})
895 .StartsWith("got_disp", {AsmToken::PercentGot_Disp, 9})
896 .StartsWith("got_hi", {AsmToken::PercentGot_Hi, 7})
897 .StartsWith("got_lo", {AsmToken::PercentGot_Lo, 7})
898 .StartsWith("got_ofst", {AsmToken::PercentGot_Ofst, 9})
899 .StartsWith("got_page", {AsmToken::PercentGot_Page, 9})
900 .StartsWith("gottprel", {AsmToken::PercentGottprel, 9})
901 .StartsWith("got", {AsmToken::PercentGot, 4})
902 .StartsWith("gp_rel", {AsmToken::PercentGp_Rel, 7})
903 .StartsWith("higher", {AsmToken::PercentHigher, 7})
904 .StartsWith("highest", {AsmToken::PercentHighest, 8})
905 .StartsWith("hi", {AsmToken::PercentHi, 3})
906 .StartsWith("lo", {AsmToken::PercentLo, 3})
907 .StartsWith("neg", {AsmToken::PercentNeg, 4})
908 .StartsWith("pcrel_hi", {AsmToken::PercentPcrel_Hi, 9})
909 .StartsWith("pcrel_lo", {AsmToken::PercentPcrel_Lo, 9})
910 .StartsWith("tlsgd", {AsmToken::PercentTlsgd, 6})
911 .StartsWith("tlsldm", {AsmToken::PercentTlsldm, 7})
912 .StartsWith("tprel_hi", {AsmToken::PercentTprel_Hi, 9})
913 .StartsWith("tprel_lo", {AsmToken::PercentTprel_Lo, 9})
915
917 CurPtr += OperatorLength - 1;
918 return AsmToken(Operator, StringRef(TokStart, OperatorLength));
919 }
920 }
922 case '/':
923 IsAtStartOfStatement = OldIsAtStartOfStatement;
924 return LexSlash();
925 case '\'': return LexSingleQuote();
926 case '"': return LexQuote();
927 case '0': case '1': case '2': case '3': case '4':
928 case '5': case '6': case '7': case '8': case '9':
929 return LexDigit();
930 case '<':
931 switch (*CurPtr) {
932 case '<':
933 ++CurPtr;
935 case '=':
936 ++CurPtr;
938 case '>':
939 ++CurPtr;
941 default:
943 }
944 case '>':
945 switch (*CurPtr) {
946 case '>':
947 ++CurPtr;
949 case '=':
950 ++CurPtr;
952 default:
954 }
955
956 // TODO: Quoted identifiers (objc methods etc)
957 // local labels: [0-9][:]
958 // Forward/backward labels: [0-9][fb]
959 // Integers, fp constants, character constants.
960 }
961}
This file implements a class to represent arbitrary precision integral constant values and operations...
static std::string radixName(unsigned Radix)
Definition: AsmLexer.cpp:303
static void SkipIgnoredIntegerSuffix(const char *&CurPtr)
Definition: AsmLexer.cpp:253
static unsigned doHexLookAhead(const char *&CurPtr, unsigned DefaultRadix, bool LexHex)
Definition: AsmLexer.cpp:265
static AsmToken intToken(StringRef Ref, APInt &Value)
Definition: AsmLexer.cpp:297
static const char * findLastDigit(const char *CurPtr, unsigned DefaultRadix)
Definition: AsmLexer.cpp:290
static bool isIdentifierChar(char C)
Return true if the given character satisfies the following regular expression: [-a-zA-Z$....
Definition: MILexer.cpp:118
static bool isDigit(const char C)
static bool isHexDigit(const char C)
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
This file provides utility classes that use RAII to save and restore values.
This file contains some functions that are useful when dealing with strings.
This file implements the StringSwitch template, which mimics a switch() statement whose cases are str...
Class for arbitrary precision integers.
Definition: APInt.h:78
size_t size() const
size - Get the array size.
Definition: ArrayRef.h:168
virtual void HandleComment(SMLoc Loc, StringRef CommentText)=0
Callback function for when a comment is lexed.
~AsmLexer() override
size_t peekTokens(MutableArrayRef< AsmToken > Buf, bool ShouldSkipSpace=true) override
Look ahead an arbitrary number of tokens.
Definition: AsmLexer.cpp:684
AsmLexer(const MCAsmInfo &MAI)
Definition: AsmLexer.cpp:34
StringRef LexUntilEndOfStatement() override
Definition: AsmLexer.cpp:664
void setBuffer(StringRef Buf, const char *ptr=nullptr, bool EndStatementAtEOF=true)
Definition: AsmLexer.cpp:41
AsmToken LexToken() override
LexToken - Read the next token and return its code.
Definition: AsmLexer.cpp:730
Target independent representation for an assembler token.
Definition: MCAsmMacro.h:21
bool is(TokenKind K) const
Definition: MCAsmMacro.h:82
This class is intended to be used as a base class for asm properties and features specific to the tar...
Definition: MCAsmInfo.h:56
bool isHLASM() const
Definition: MCAsmInfo.h:525
bool doesAllowDollarAtStartOfIdentifier() const
Definition: MCAsmInfo.h:576
bool hasMipsExpressions() const
Definition: MCAsmInfo.h:730
bool shouldUseMotorolaIntegers() const
Definition: MCAsmInfo.h:731
StringRef getCommentString() const
Definition: MCAsmInfo.h:543
const char * getSeparatorString() const
Definition: MCAsmInfo.h:538
bool doesAllowAtAtStartOfIdentifier() const
Definition: MCAsmInfo.h:573
bool shouldAllowAdditionalComments() const
Definition: MCAsmInfo.h:544
bool doesAllowQuestionAtStartOfIdentifier() const
Definition: MCAsmInfo.h:570
void UnLex(AsmToken const &Token)
Definition: MCAsmLexer.h:93
bool LexMasmHexFloats
Definition: MCAsmLexer.h:51
bool LexHLASMStrings
Definition: MCAsmLexer.h:58
bool UseMasmDefaultRadix
Definition: MCAsmLexer.h:55
bool LexHLASMIntegers
Definition: MCAsmLexer.h:57
AsmCommentConsumer * CommentConsumer
Definition: MCAsmLexer.h:59
bool LexMasmIntegers
Definition: MCAsmLexer.h:52
bool LexMotorolaIntegers
Definition: MCAsmLexer.h:54
SMLoc getErrLoc()
Get the current error location.
Definition: MCAsmLexer.h:128
const char * TokStart
Definition: MCAsmLexer.h:46
unsigned DefaultRadix
Definition: MCAsmLexer.h:56
bool AllowAtInIdentifier
Definition: MCAsmLexer.h:48
bool is(AsmToken::TokenKind K) const
Check if the current token has kind K.
Definition: MCAsmLexer.h:141
void SetError(SMLoc errLoc, const std::string &err)
Definition: MCAsmLexer.h:65
bool AllowHashInIdentifier
Definition: MCAsmLexer.h:49
const std::string & getErr()
Get the current error string.
Definition: MCAsmLexer.h:133
MutableArrayRef - Represent a mutable reference to an array (0 or more elements consecutively in memo...
Definition: ArrayRef.h:310
This is a utility class that provides an abstraction for the common functionality between Instruction...
Definition: Operator.h:32
Represents a location in source code.
Definition: SMLoc.h:23
static SMLoc getFromPointer(const char *Ptr)
Definition: SMLoc.h:36
StringRef - Represent a constant reference to a string, i.e.
Definition: StringRef.h:51
bool starts_with(StringRef Prefix) const
Check if this string starts with the given Prefix.
Definition: StringRef.h:265
iterator begin() const
Definition: StringRef.h:116
constexpr size_t size() const
size - Get the string size.
Definition: StringRef.h:150
constexpr const char * data() const
data - Get a pointer to the start of the string (which may not be null terminated).
Definition: StringRef.h:144
iterator end() const
Definition: StringRef.h:118
A switch()-like statement whose cases are string literals.
Definition: StringSwitch.h:44
LLVM Value Representation.
Definition: Value.h:74
@ C
The default llvm calling convention, compatible with C.
Definition: CallingConv.h:34
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:18
@ Ref
The access may reference the value stored in memory.
@ Default
The result values are uniform if and only if all operands are uniform.
A utility class that uses RAII to save and restore the value of a variable.