LLVM API Documentation
00001 //===- AsmLexer.cpp - Lexer for Assembly Files ----------------------------===// 00002 // 00003 // The LLVM Compiler Infrastructure 00004 // 00005 // This file is distributed under the University of Illinois Open Source 00006 // License. See LICENSE.TXT for details. 00007 // 00008 //===----------------------------------------------------------------------===// 00009 // 00010 // This class implements the lexer for assembly files. 00011 // 00012 //===----------------------------------------------------------------------===// 00013 00014 #include "llvm/MC/MCParser/AsmLexer.h" 00015 #include "llvm/MC/MCAsmInfo.h" 00016 #include "llvm/Support/MemoryBuffer.h" 00017 #include "llvm/Support/SMLoc.h" 00018 #include <cctype> 00019 #include <cerrno> 00020 #include <cstdio> 00021 #include <cstdlib> 00022 using namespace llvm; 00023 00024 AsmLexer::AsmLexer(const MCAsmInfo &_MAI) : MAI(_MAI) { 00025 CurBuf = NULL; 00026 CurPtr = NULL; 00027 isAtStartOfLine = true; 00028 } 00029 00030 AsmLexer::~AsmLexer() { 00031 } 00032 00033 void AsmLexer::setBuffer(const MemoryBuffer *buf, const char *ptr) { 00034 CurBuf = buf; 00035 00036 if (ptr) 00037 CurPtr = ptr; 00038 else 00039 CurPtr = CurBuf->getBufferStart(); 00040 00041 TokStart = 0; 00042 } 00043 00044 /// ReturnError - Set the error to the specified string at the specified 00045 /// location. This is defined to always return AsmToken::Error. 00046 AsmToken AsmLexer::ReturnError(const char *Loc, const std::string &Msg) { 00047 SetError(SMLoc::getFromPointer(Loc), Msg); 00048 00049 return AsmToken(AsmToken::Error, StringRef(Loc, 0)); 00050 } 00051 00052 int AsmLexer::getNextChar() { 00053 char CurChar = *CurPtr++; 00054 switch (CurChar) { 00055 default: 00056 return (unsigned char)CurChar; 00057 case 0: 00058 // A nul character in the stream is either the end of the current buffer or 00059 // a random nul in the file. Disambiguate that here. 00060 if (CurPtr-1 != CurBuf->getBufferEnd()) 00061 return 0; // Just whitespace. 00062 00063 // Otherwise, return end of file. 00064 --CurPtr; // Another call to lex will return EOF again. 00065 return EOF; 00066 } 00067 } 00068 00069 /// LexFloatLiteral: [0-9]*[.][0-9]*([eE][+-]?[0-9]*)? 00070 /// 00071 /// The leading integral digit sequence and dot should have already been 00072 /// consumed, some or all of the fractional digit sequence *can* have been 00073 /// consumed. 00074 AsmToken AsmLexer::LexFloatLiteral() { 00075 // Skip the fractional digit sequence. 00076 while (isdigit(*CurPtr)) 00077 ++CurPtr; 00078 00079 // Check for exponent; we intentionally accept a slighlty wider set of 00080 // literals here and rely on the upstream client to reject invalid ones (e.g., 00081 // "1e+"). 00082 if (*CurPtr == 'e' || *CurPtr == 'E') { 00083 ++CurPtr; 00084 if (*CurPtr == '-' || *CurPtr == '+') 00085 ++CurPtr; 00086 while (isdigit(*CurPtr)) 00087 ++CurPtr; 00088 } 00089 00090 return AsmToken(AsmToken::Real, 00091 StringRef(TokStart, CurPtr - TokStart)); 00092 } 00093 00094 /// LexIdentifier: [a-zA-Z_.][a-zA-Z0-9_$.@]* 00095 static bool IsIdentifierChar(char c) { 00096 return isalnum(c) || c == '_' || c == '$' || c == '.' || c == '@'; 00097 } 00098 AsmToken AsmLexer::LexIdentifier() { 00099 // Check for floating point literals. 00100 if (CurPtr[-1] == '.' && isdigit(*CurPtr)) { 00101 // Disambiguate a .1243foo identifier from a floating literal. 00102 while (isdigit(*CurPtr)) 00103 ++CurPtr; 00104 if (*CurPtr == 'e' || *CurPtr == 'E' || !IsIdentifierChar(*CurPtr)) 00105 return LexFloatLiteral(); 00106 } 00107 00108 while (IsIdentifierChar(*CurPtr)) 00109 ++CurPtr; 00110 00111 // Handle . as a special case. 00112 if (CurPtr == TokStart+1 && TokStart[0] == '.') 00113 return AsmToken(AsmToken::Dot, StringRef(TokStart, 1)); 00114 00115 return AsmToken(AsmToken::Identifier, StringRef(TokStart, CurPtr - TokStart)); 00116 } 00117 00118 /// LexSlash: Slash: / 00119 /// C-Style Comment: /* ... */ 00120 AsmToken AsmLexer::LexSlash() { 00121 switch (*CurPtr) { 00122 case '*': break; // C style comment. 00123 case '/': return ++CurPtr, LexLineComment(); 00124 default: return AsmToken(AsmToken::Slash, StringRef(CurPtr-1, 1)); 00125 } 00126 00127 // C Style comment. 00128 ++CurPtr; // skip the star. 00129 while (1) { 00130 int CurChar = getNextChar(); 00131 switch (CurChar) { 00132 case EOF: 00133 return ReturnError(TokStart, "unterminated comment"); 00134 case '*': 00135 // End of the comment? 00136 if (CurPtr[0] != '/') break; 00137 00138 ++CurPtr; // End the */. 00139 return LexToken(); 00140 } 00141 } 00142 } 00143 00144 /// LexLineComment: Comment: #[^\n]* 00145 /// : //[^\n]* 00146 AsmToken AsmLexer::LexLineComment() { 00147 // FIXME: This is broken if we happen to a comment at the end of a file, which 00148 // was .included, and which doesn't end with a newline. 00149 int CurChar = getNextChar(); 00150 while (CurChar != '\n' && CurChar != '\r' && CurChar != EOF) 00151 CurChar = getNextChar(); 00152 00153 if (CurChar == EOF) 00154 return AsmToken(AsmToken::Eof, StringRef(CurPtr, 0)); 00155 return AsmToken(AsmToken::EndOfStatement, StringRef(CurPtr, 0)); 00156 } 00157 00158 static void SkipIgnoredIntegerSuffix(const char *&CurPtr) { 00159 // Skip ULL, UL, U, L and LL suffices. 00160 if (CurPtr[0] == 'U') 00161 ++CurPtr; 00162 if (CurPtr[0] == 'L') 00163 ++CurPtr; 00164 if (CurPtr[0] == 'L') 00165 ++CurPtr; 00166 } 00167 00168 // Look ahead to search for first non-hex digit, if it's [hH], then we treat the 00169 // integer as a hexadecimal, possibly with leading zeroes. 00170 static unsigned doLookAhead(const char *&CurPtr, unsigned DefaultRadix) { 00171 const char *FirstHex = 0; 00172 const char *LookAhead = CurPtr; 00173 while (1) { 00174 if (isdigit(*LookAhead)) { 00175 ++LookAhead; 00176 } else if (isxdigit(*LookAhead)) { 00177 if (!FirstHex) 00178 FirstHex = LookAhead; 00179 ++LookAhead; 00180 } else { 00181 break; 00182 } 00183 } 00184 bool isHex = *LookAhead == 'h' || *LookAhead == 'H'; 00185 CurPtr = isHex || !FirstHex ? LookAhead : FirstHex; 00186 if (isHex) 00187 return 16; 00188 return DefaultRadix; 00189 } 00190 00191 /// LexDigit: First character is [0-9]. 00192 /// Local Label: [0-9][:] 00193 /// Forward/Backward Label: [0-9][fb] 00194 /// Binary integer: 0b[01]+ 00195 /// Octal integer: 0[0-7]+ 00196 /// Hex integer: 0x[0-9a-fA-F]+ or [0x]?[0-9][0-9a-fA-F]*[hH] 00197 /// Decimal integer: [1-9][0-9]* 00198 AsmToken AsmLexer::LexDigit() { 00199 // Decimal integer: [1-9][0-9]* 00200 if (CurPtr[-1] != '0' || CurPtr[0] == '.') { 00201 unsigned Radix = doLookAhead(CurPtr, 10); 00202 bool isHex = Radix == 16; 00203 // Check for floating point literals. 00204 if (!isHex && (*CurPtr == '.' || *CurPtr == 'e')) { 00205 ++CurPtr; 00206 return LexFloatLiteral(); 00207 } 00208 00209 StringRef Result(TokStart, CurPtr - TokStart); 00210 00211 long long Value; 00212 if (Result.getAsInteger(Radix, Value)) { 00213 // Allow positive values that are too large to fit into a signed 64-bit 00214 // integer, but that do fit in an unsigned one, we just convert them over. 00215 unsigned long long UValue; 00216 if (Result.getAsInteger(Radix, UValue)) 00217 return ReturnError(TokStart, !isHex ? "invalid decimal number" : 00218 "invalid hexdecimal number"); 00219 Value = (long long)UValue; 00220 } 00221 00222 // Consume the [bB][hH]. 00223 if (Radix == 2 || Radix == 16) 00224 ++CurPtr; 00225 00226 // The darwin/x86 (and x86-64) assembler accepts and ignores type 00227 // suffices on integer literals. 00228 SkipIgnoredIntegerSuffix(CurPtr); 00229 00230 return AsmToken(AsmToken::Integer, Result, Value); 00231 } 00232 00233 if (*CurPtr == 'b') { 00234 ++CurPtr; 00235 // See if we actually have "0b" as part of something like "jmp 0b\n" 00236 if (!isdigit(CurPtr[0])) { 00237 --CurPtr; 00238 StringRef Result(TokStart, CurPtr - TokStart); 00239 return AsmToken(AsmToken::Integer, Result, 0); 00240 } 00241 const char *NumStart = CurPtr; 00242 while (CurPtr[0] == '0' || CurPtr[0] == '1') 00243 ++CurPtr; 00244 00245 // Requires at least one binary digit. 00246 if (CurPtr == NumStart) 00247 return ReturnError(TokStart, "invalid binary number"); 00248 00249 StringRef Result(TokStart, CurPtr - TokStart); 00250 00251 long long Value; 00252 if (Result.substr(2).getAsInteger(2, Value)) 00253 return ReturnError(TokStart, "invalid binary number"); 00254 00255 // The darwin/x86 (and x86-64) assembler accepts and ignores ULL and LL 00256 // suffixes on integer literals. 00257 SkipIgnoredIntegerSuffix(CurPtr); 00258 00259 return AsmToken(AsmToken::Integer, Result, Value); 00260 } 00261 00262 if (*CurPtr == 'x') { 00263 ++CurPtr; 00264 const char *NumStart = CurPtr; 00265 while (isxdigit(CurPtr[0])) 00266 ++CurPtr; 00267 00268 // Requires at least one hex digit. 00269 if (CurPtr == NumStart) 00270 return ReturnError(CurPtr-2, "invalid hexadecimal number"); 00271 00272 unsigned long long Result; 00273 if (StringRef(TokStart, CurPtr - TokStart).getAsInteger(0, Result)) 00274 return ReturnError(TokStart, "invalid hexadecimal number"); 00275 00276 // Consume the optional [hH]. 00277 if (*CurPtr == 'h' || *CurPtr == 'H') 00278 ++CurPtr; 00279 00280 // The darwin/x86 (and x86-64) assembler accepts and ignores ULL and LL 00281 // suffixes on integer literals. 00282 SkipIgnoredIntegerSuffix(CurPtr); 00283 00284 return AsmToken(AsmToken::Integer, StringRef(TokStart, CurPtr - TokStart), 00285 (int64_t)Result); 00286 } 00287 00288 // Either octal or hexadecimal. 00289 long long Value; 00290 unsigned Radix = doLookAhead(CurPtr, 8); 00291 bool isHex = Radix == 16; 00292 StringRef Result(TokStart, CurPtr - TokStart); 00293 if (Result.getAsInteger(Radix, Value)) 00294 return ReturnError(TokStart, !isHex ? "invalid octal number" : 00295 "invalid hexdecimal number"); 00296 00297 // Consume the [hH]. 00298 if (Radix == 16) 00299 ++CurPtr; 00300 00301 // The darwin/x86 (and x86-64) assembler accepts and ignores ULL and LL 00302 // suffixes on integer literals. 00303 SkipIgnoredIntegerSuffix(CurPtr); 00304 00305 return AsmToken(AsmToken::Integer, Result, Value); 00306 } 00307 00308 /// LexSingleQuote: Integer: 'b' 00309 AsmToken AsmLexer::LexSingleQuote() { 00310 int CurChar = getNextChar(); 00311 00312 if (CurChar == '\\') 00313 CurChar = getNextChar(); 00314 00315 if (CurChar == EOF) 00316 return ReturnError(TokStart, "unterminated single quote"); 00317 00318 CurChar = getNextChar(); 00319 00320 if (CurChar != '\'') 00321 return ReturnError(TokStart, "single quote way too long"); 00322 00323 // The idea here being that 'c' is basically just an integral 00324 // constant. 00325 StringRef Res = StringRef(TokStart,CurPtr - TokStart); 00326 long long Value; 00327 00328 if (Res.startswith("\'\\")) { 00329 char theChar = Res[2]; 00330 switch (theChar) { 00331 default: Value = theChar; break; 00332 case '\'': Value = '\''; break; 00333 case 't': Value = '\t'; break; 00334 case 'n': Value = '\n'; break; 00335 case 'b': Value = '\b'; break; 00336 } 00337 } else 00338 Value = TokStart[1]; 00339 00340 return AsmToken(AsmToken::Integer, Res, Value); 00341 } 00342 00343 00344 /// LexQuote: String: "..." 00345 AsmToken AsmLexer::LexQuote() { 00346 int CurChar = getNextChar(); 00347 // TODO: does gas allow multiline string constants? 00348 while (CurChar != '"') { 00349 if (CurChar == '\\') { 00350 // Allow \", etc. 00351 CurChar = getNextChar(); 00352 } 00353 00354 if (CurChar == EOF) 00355 return ReturnError(TokStart, "unterminated string constant"); 00356 00357 CurChar = getNextChar(); 00358 } 00359 00360 return AsmToken(AsmToken::String, StringRef(TokStart, CurPtr - TokStart)); 00361 } 00362 00363 StringRef AsmLexer::LexUntilEndOfStatement() { 00364 TokStart = CurPtr; 00365 00366 while (!isAtStartOfComment(*CurPtr) && // Start of line comment. 00367 !isAtStatementSeparator(CurPtr) && // End of statement marker. 00368 *CurPtr != '\n' && 00369 *CurPtr != '\r' && 00370 (*CurPtr != 0 || CurPtr != CurBuf->getBufferEnd())) { 00371 ++CurPtr; 00372 } 00373 return StringRef(TokStart, CurPtr-TokStart); 00374 } 00375 00376 StringRef AsmLexer::LexUntilEndOfLine() { 00377 TokStart = CurPtr; 00378 00379 while (*CurPtr != '\n' && 00380 *CurPtr != '\r' && 00381 (*CurPtr != 0 || CurPtr != CurBuf->getBufferEnd())) { 00382 ++CurPtr; 00383 } 00384 return StringRef(TokStart, CurPtr-TokStart); 00385 } 00386 00387 bool AsmLexer::isAtStartOfComment(char Char) { 00388 // FIXME: This won't work for multi-character comment indicators like "//". 00389 return Char == *MAI.getCommentString(); 00390 } 00391 00392 bool AsmLexer::isAtStatementSeparator(const char *Ptr) { 00393 return strncmp(Ptr, MAI.getSeparatorString(), 00394 strlen(MAI.getSeparatorString())) == 0; 00395 } 00396 00397 AsmToken AsmLexer::LexToken() { 00398 TokStart = CurPtr; 00399 // This always consumes at least one character. 00400 int CurChar = getNextChar(); 00401 00402 if (isAtStartOfComment(CurChar)) { 00403 // If this comment starts with a '#', then return the Hash token and let 00404 // the assembler parser see if it can be parsed as a cpp line filename 00405 // comment. We do this only if we are at the start of a line. 00406 if (CurChar == '#' && isAtStartOfLine) 00407 return AsmToken(AsmToken::Hash, StringRef(TokStart, 1)); 00408 isAtStartOfLine = true; 00409 return LexLineComment(); 00410 } 00411 if (isAtStatementSeparator(TokStart)) { 00412 CurPtr += strlen(MAI.getSeparatorString()) - 1; 00413 return AsmToken(AsmToken::EndOfStatement, 00414 StringRef(TokStart, strlen(MAI.getSeparatorString()))); 00415 } 00416 00417 // If we're missing a newline at EOF, make sure we still get an 00418 // EndOfStatement token before the Eof token. 00419 if (CurChar == EOF && !isAtStartOfLine) { 00420 isAtStartOfLine = true; 00421 return AsmToken(AsmToken::EndOfStatement, StringRef(TokStart, 1)); 00422 } 00423 00424 isAtStartOfLine = false; 00425 switch (CurChar) { 00426 default: 00427 // Handle identifier: [a-zA-Z_.][a-zA-Z0-9_$.@]* 00428 if (isalpha(CurChar) || CurChar == '_' || CurChar == '.') 00429 return LexIdentifier(); 00430 00431 // Unknown character, emit an error. 00432 return ReturnError(TokStart, "invalid character in input"); 00433 case EOF: return AsmToken(AsmToken::Eof, StringRef(TokStart, 0)); 00434 case 0: 00435 case ' ': 00436 case '\t': 00437 if (SkipSpace) { 00438 // Ignore whitespace. 00439 return LexToken(); 00440 } else { 00441 int len = 1; 00442 while (*CurPtr==' ' || *CurPtr=='\t') { 00443 CurPtr++; 00444 len++; 00445 } 00446 return AsmToken(AsmToken::Space, StringRef(TokStart, len)); 00447 } 00448 case '\n': // FALL THROUGH. 00449 case '\r': 00450 isAtStartOfLine = true; 00451 return AsmToken(AsmToken::EndOfStatement, StringRef(TokStart, 1)); 00452 case ':': return AsmToken(AsmToken::Colon, StringRef(TokStart, 1)); 00453 case '+': return AsmToken(AsmToken::Plus, StringRef(TokStart, 1)); 00454 case '-': return AsmToken(AsmToken::Minus, StringRef(TokStart, 1)); 00455 case '~': return AsmToken(AsmToken::Tilde, StringRef(TokStart, 1)); 00456 case '(': return AsmToken(AsmToken::LParen, StringRef(TokStart, 1)); 00457 case ')': return AsmToken(AsmToken::RParen, StringRef(TokStart, 1)); 00458 case '[': return AsmToken(AsmToken::LBrac, StringRef(TokStart, 1)); 00459 case ']': return AsmToken(AsmToken::RBrac, StringRef(TokStart, 1)); 00460 case '{': return AsmToken(AsmToken::LCurly, StringRef(TokStart, 1)); 00461 case '}': return AsmToken(AsmToken::RCurly, StringRef(TokStart, 1)); 00462 case '*': return AsmToken(AsmToken::Star, StringRef(TokStart, 1)); 00463 case ',': return AsmToken(AsmToken::Comma, StringRef(TokStart, 1)); 00464 case '$': return AsmToken(AsmToken::Dollar, StringRef(TokStart, 1)); 00465 case '@': return AsmToken(AsmToken::At, StringRef(TokStart, 1)); 00466 case '\\': return AsmToken(AsmToken::BackSlash, StringRef(TokStart, 1)); 00467 case '=': 00468 if (*CurPtr == '=') 00469 return ++CurPtr, AsmToken(AsmToken::EqualEqual, StringRef(TokStart, 2)); 00470 return AsmToken(AsmToken::Equal, StringRef(TokStart, 1)); 00471 case '|': 00472 if (*CurPtr == '|') 00473 return ++CurPtr, AsmToken(AsmToken::PipePipe, StringRef(TokStart, 2)); 00474 return AsmToken(AsmToken::Pipe, StringRef(TokStart, 1)); 00475 case '^': return AsmToken(AsmToken::Caret, StringRef(TokStart, 1)); 00476 case '&': 00477 if (*CurPtr == '&') 00478 return ++CurPtr, AsmToken(AsmToken::AmpAmp, StringRef(TokStart, 2)); 00479 return AsmToken(AsmToken::Amp, StringRef(TokStart, 1)); 00480 case '!': 00481 if (*CurPtr == '=') 00482 return ++CurPtr, AsmToken(AsmToken::ExclaimEqual, StringRef(TokStart, 2)); 00483 return AsmToken(AsmToken::Exclaim, StringRef(TokStart, 1)); 00484 case '%': return AsmToken(AsmToken::Percent, StringRef(TokStart, 1)); 00485 case '/': return LexSlash(); 00486 case '#': return AsmToken(AsmToken::Hash, StringRef(TokStart, 1)); 00487 case '\'': return LexSingleQuote(); 00488 case '"': return LexQuote(); 00489 case '0': case '1': case '2': case '3': case '4': 00490 case '5': case '6': case '7': case '8': case '9': 00491 return LexDigit(); 00492 case '<': 00493 switch (*CurPtr) { 00494 case '<': return ++CurPtr, AsmToken(AsmToken::LessLess, 00495 StringRef(TokStart, 2)); 00496 case '=': return ++CurPtr, AsmToken(AsmToken::LessEqual, 00497 StringRef(TokStart, 2)); 00498 case '>': return ++CurPtr, AsmToken(AsmToken::LessGreater, 00499 StringRef(TokStart, 2)); 00500 default: return AsmToken(AsmToken::Less, StringRef(TokStart, 1)); 00501 } 00502 case '>': 00503 switch (*CurPtr) { 00504 case '>': return ++CurPtr, AsmToken(AsmToken::GreaterGreater, 00505 StringRef(TokStart, 2)); 00506 case '=': return ++CurPtr, AsmToken(AsmToken::GreaterEqual, 00507 StringRef(TokStart, 2)); 00508 default: return AsmToken(AsmToken::Greater, StringRef(TokStart, 1)); 00509 } 00510 00511 // TODO: Quoted identifiers (objc methods etc) 00512 // local labels: [0-9][:] 00513 // Forward/backward labels: [0-9][fb] 00514 // Integers, fp constants, character constants. 00515 } 00516 }