Bug Summary

File:tools/clang/lib/Lex/Lexer.cpp
Warning:line 235, column 11
The left expression of the compound assignment is an uninitialized value. The computed value will also be garbage

Annotated Source Code

Press '?' to see keyboard shortcuts

clang -cc1 -triple x86_64-pc-linux-gnu -analyze -disable-free -disable-llvm-verifier -discard-value-names -main-file-name Lexer.cpp -analyzer-store=region -analyzer-opt-analyze-nested-blocks -analyzer-eagerly-assume -analyzer-checker=core -analyzer-checker=apiModeling -analyzer-checker=unix -analyzer-checker=deadcode -analyzer-checker=cplusplus -analyzer-checker=security.insecureAPI.UncheckedReturn -analyzer-checker=security.insecureAPI.getpw -analyzer-checker=security.insecureAPI.gets -analyzer-checker=security.insecureAPI.mktemp -analyzer-checker=security.insecureAPI.mkstemp -analyzer-checker=security.insecureAPI.vfork -analyzer-checker=nullability.NullPassedToNonnull -analyzer-checker=nullability.NullReturnedFromNonnull -analyzer-output plist -w -mrelocation-model pic -pic-level 2 -mthread-model posix -relaxed-aliasing -fmath-errno -masm-verbose -mconstructor-aliases -munwind-tables -fuse-init-array -target-cpu x86-64 -dwarf-column-info -debugger-tuning=gdb -momit-leaf-frame-pointer -ffunction-sections -fdata-sections -resource-dir /usr/lib/llvm-7/lib/clang/7.0.0 -D _DEBUG -D _GNU_SOURCE -D __STDC_CONSTANT_MACROS -D __STDC_FORMAT_MACROS -D __STDC_LIMIT_MACROS -I /build/llvm-toolchain-snapshot-7~svn329677/build-llvm/tools/clang/lib/Lex -I /build/llvm-toolchain-snapshot-7~svn329677/tools/clang/lib/Lex -I /build/llvm-toolchain-snapshot-7~svn329677/tools/clang/include -I /build/llvm-toolchain-snapshot-7~svn329677/build-llvm/tools/clang/include -I /build/llvm-toolchain-snapshot-7~svn329677/build-llvm/include -I /build/llvm-toolchain-snapshot-7~svn329677/include -U NDEBUG -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/7.3.0/../../../../include/c++/7.3.0 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/7.3.0/../../../../include/x86_64-linux-gnu/c++/7.3.0 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/7.3.0/../../../../include/x86_64-linux-gnu/c++/7.3.0 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/7.3.0/../../../../include/c++/7.3.0/backward -internal-isystem 
/usr/include/clang/7.0.0/include/ -internal-isystem /usr/local/include -internal-isystem /usr/lib/llvm-7/lib/clang/7.0.0/include -internal-externc-isystem /usr/include/x86_64-linux-gnu -internal-externc-isystem /include -internal-externc-isystem /usr/include -O2 -Wno-unused-parameter -Wwrite-strings -Wno-missing-field-initializers -Wno-long-long -Wno-maybe-uninitialized -Wno-comment -std=c++11 -fdeprecated-macro -fdebug-compilation-dir /build/llvm-toolchain-snapshot-7~svn329677/build-llvm/tools/clang/lib/Lex -ferror-limit 19 -fmessage-length 0 -fvisibility-inlines-hidden -fobjc-runtime=gcc -fno-common -fdiagnostics-show-option -vectorize-loops -vectorize-slp -analyzer-checker optin.performance.Padding -analyzer-output=html -analyzer-config stable-report-filename=true -o /tmp/scan-build-2018-04-11-031539-24776-1 -x c++ /build/llvm-toolchain-snapshot-7~svn329677/tools/clang/lib/Lex/Lexer.cpp

/build/llvm-toolchain-snapshot-7~svn329677/tools/clang/lib/Lex/Lexer.cpp

1//===- Lexer.cpp - C Language Family Lexer --------------------------------===//
2//
3// The LLVM Compiler Infrastructure
4//
5// This file is distributed under the University of Illinois Open Source
6// License. See LICENSE.TXT for details.
7//
8//===----------------------------------------------------------------------===//
9//
10// This file implements the Lexer and Token interfaces.
11//
12//===----------------------------------------------------------------------===//
13
14#include "clang/Lex/Lexer.h"
15#include "UnicodeCharSets.h"
16#include "clang/Basic/CharInfo.h"
17#include "clang/Basic/IdentifierTable.h"
18#include "clang/Basic/LangOptions.h"
19#include "clang/Basic/SourceLocation.h"
20#include "clang/Basic/SourceManager.h"
21#include "clang/Basic/TokenKinds.h"
22#include "clang/Lex/LexDiagnostic.h"
23#include "clang/Lex/LiteralSupport.h"
24#include "clang/Lex/MultipleIncludeOpt.h"
25#include "clang/Lex/Preprocessor.h"
26#include "clang/Lex/PreprocessorOptions.h"
27#include "clang/Lex/Token.h"
28#include "clang/Basic/Diagnostic.h"
29#include "clang/Basic/LLVM.h"
30#include "clang/Basic/TokenKinds.h"
31#include "llvm/ADT/None.h"
32#include "llvm/ADT/Optional.h"
33#include "llvm/ADT/StringExtras.h"
34#include "llvm/ADT/StringSwitch.h"
35#include "llvm/ADT/StringRef.h"
36#include "llvm/Support/Compiler.h"
37#include "llvm/Support/ConvertUTF.h"
38#include "llvm/Support/MathExtras.h"
39#include "llvm/Support/MemoryBuffer.h"
40#include "llvm/Support/NativeFormatting.h"
41#include "llvm/Support/UnicodeCharRanges.h"
42#include <algorithm>
43#include <cassert>
44#include <cstddef>
45#include <cstdint>
46#include <cstring>
47#include <string>
48#include <tuple>
49#include <utility>
50
51using namespace clang;
52
53//===----------------------------------------------------------------------===//
54// Token Class Implementation
55//===----------------------------------------------------------------------===//
56
57/// isObjCAtKeyword - Return true if we have an ObjC keyword identifier.
58bool Token::isObjCAtKeyword(tok::ObjCKeywordKind objcKey) const {
59 if (isAnnotation())
60 return false;
61 if (IdentifierInfo *II = getIdentifierInfo())
62 return II->getObjCKeywordID() == objcKey;
63 return false;
64}
65
66/// getObjCKeywordID - Return the ObjC keyword kind.
67tok::ObjCKeywordKind Token::getObjCKeywordID() const {
68 if (isAnnotation())
69 return tok::objc_not_keyword;
70 IdentifierInfo *specId = getIdentifierInfo();
71 return specId ? specId->getObjCKeywordID() : tok::objc_not_keyword;
72}
73
74//===----------------------------------------------------------------------===//
75// Lexer Class Implementation
76//===----------------------------------------------------------------------===//
77
// Out-of-line definition with an intentionally empty body.
// NOTE(review): presumably the usual "anchor" idiom that pins the class's
// vtable to this translation unit; confirm against the declaration in Lexer.h.
78void Lexer::anchor() {}
79
80void Lexer::InitLexer(const char *BufStart, const char *BufPtr,
81 const char *BufEnd) {
82 BufferStart = BufStart;
83 BufferPtr = BufPtr;
84 BufferEnd = BufEnd;
85
86 assert(BufEnd[0] == 0 &&(static_cast <bool> (BufEnd[0] == 0 && "We assume that the input buffer has a null character at the end"
" to simplify lexing!") ? void (0) : __assert_fail ("BufEnd[0] == 0 && \"We assume that the input buffer has a null character at the end\" \" to simplify lexing!\""
, "/build/llvm-toolchain-snapshot-7~svn329677/tools/clang/lib/Lex/Lexer.cpp"
, 88, __extension__ __PRETTY_FUNCTION__))
87 "We assume that the input buffer has a null character at the end"(static_cast <bool> (BufEnd[0] == 0 && "We assume that the input buffer has a null character at the end"
" to simplify lexing!") ? void (0) : __assert_fail ("BufEnd[0] == 0 && \"We assume that the input buffer has a null character at the end\" \" to simplify lexing!\""
, "/build/llvm-toolchain-snapshot-7~svn329677/tools/clang/lib/Lex/Lexer.cpp"
, 88, __extension__ __PRETTY_FUNCTION__))
88 " to simplify lexing!")(static_cast <bool> (BufEnd[0] == 0 && "We assume that the input buffer has a null character at the end"
" to simplify lexing!") ? void (0) : __assert_fail ("BufEnd[0] == 0 && \"We assume that the input buffer has a null character at the end\" \" to simplify lexing!\""
, "/build/llvm-toolchain-snapshot-7~svn329677/tools/clang/lib/Lex/Lexer.cpp"
, 88, __extension__ __PRETTY_FUNCTION__))
;
89
90 // Check whether we have a BOM in the beginning of the buffer. If yes - act
91 // accordingly. Right now we support only UTF-8 with and without BOM, so, just
92 // skip the UTF-8 BOM if it's present.
93 if (BufferStart == BufferPtr) {
94 // Determine the size of the BOM.
95 StringRef Buf(BufferStart, BufferEnd - BufferStart);
96 size_t BOMLength = llvm::StringSwitch<size_t>(Buf)
97 .StartsWith("\xEF\xBB\xBF", 3) // UTF-8 BOM
98 .Default(0);
99
100 // Skip the BOM.
101 BufferPtr += BOMLength;
102 }
103
104 Is_PragmaLexer = false;
105 CurrentConflictMarkerState = CMK_None;
106
107 // Start of the file is a start of line.
108 IsAtStartOfLine = true;
109 IsAtPhysicalStartOfLine = true;
110
111 HasLeadingSpace = false;
112 HasLeadingEmptyMacro = false;
113
114 // We are not after parsing a #.
115 ParsingPreprocessorDirective = false;
116
117 // We are not after parsing #include.
118 ParsingFilename = false;
119
120 // We are not in raw mode. Raw mode disables diagnostics and interpretation
121 // of tokens (e.g. identifiers, thus disabling macro expansion). It is used
122 // to quickly lex the tokens of the buffer, e.g. when handling a "#if 0" block
123 // or otherwise skipping over tokens.
124 LexingRawMode = false;
125
126 // Default to not keeping comments.
127 ExtendedTokenMode = 0;
128}
129
130/// Lexer constructor - Create a new lexer object for the specified buffer
131/// with the specified preprocessor managing the lexing process. This lexer
132/// assumes that the associated file buffer and Preprocessor objects will
133/// outlive it, so it doesn't take ownership of either of them.
134Lexer::Lexer(FileID FID, const llvm::MemoryBuffer *InputFile, Preprocessor &PP)
135 : PreprocessorLexer(&PP, FID),
136 FileLoc(PP.getSourceManager().getLocForStartOfFile(FID)),
137 LangOpts(PP.getLangOpts()) {
138 InitLexer(InputFile->getBufferStart(), InputFile->getBufferStart(),
139 InputFile->getBufferEnd());
140
141 resetExtendedTokenMode();
142}
143
144/// Lexer constructor - Create a new raw lexer object. This object is only
145/// suitable for calls to 'LexFromRawLexer'. This lexer assumes that the text
146/// range will outlive it, so it doesn't take ownership of it.
147Lexer::Lexer(SourceLocation fileloc, const LangOptions &langOpts,
148 const char *BufStart, const char *BufPtr, const char *BufEnd)
149 : FileLoc(fileloc), LangOpts(langOpts) {
150 InitLexer(BufStart, BufPtr, BufEnd);
151
152 // We *are* in raw mode.
153 LexingRawMode = true;
154}
155
156/// Lexer constructor - Create a new raw lexer object. This object is only
157/// suitable for calls to 'LexFromRawLexer'. This lexer assumes that the text
158/// range will outlive it, so it doesn't take ownership of it.
159Lexer::Lexer(FileID FID, const llvm::MemoryBuffer *FromFile,
160 const SourceManager &SM, const LangOptions &langOpts)
161 : Lexer(SM.getLocForStartOfFile(FID), langOpts, FromFile->getBufferStart(),
162 FromFile->getBufferStart(), FromFile->getBufferEnd()) {}
163
164void Lexer::resetExtendedTokenMode() {
165 assert(PP && "Cannot reset token mode without a preprocessor")(static_cast <bool> (PP && "Cannot reset token mode without a preprocessor"
) ? void (0) : __assert_fail ("PP && \"Cannot reset token mode without a preprocessor\""
, "/build/llvm-toolchain-snapshot-7~svn329677/tools/clang/lib/Lex/Lexer.cpp"
, 165, __extension__ __PRETTY_FUNCTION__))
;
166 if (LangOpts.TraditionalCPP)
167 SetKeepWhitespaceMode(true);
168 else
169 SetCommentRetentionState(PP->getCommentRetentionState());
170}
171
172/// Create_PragmaLexer: Lexer constructor - Create a new lexer object for
173/// _Pragma expansion. This has a variety of magic semantics that this method
174/// sets up. It returns a new'd Lexer that must be delete'd when done.
175///
176/// On entrance to this routine, TokStartLoc is a macro location which has a
177/// spelling loc that indicates the bytes to be lexed for the token and an
178/// expansion location that indicates where all lexed tokens should be
179/// "expanded from".
180///
181/// TODO: It would really be nice to make _Pragma just be a wrapper around a
182/// normal lexer that remaps tokens as they fly by. This would require making
183/// Preprocessor::Lex virtual. Given that, we could just dump in a magic lexer
184/// interface that could handle this stuff. This would pull GetMappedTokenLoc
185/// out of the critical path of the lexer!
186///
187Lexer *Lexer::Create_PragmaLexer(SourceLocation SpellingLoc,
188 SourceLocation ExpansionLocStart,
189 SourceLocation ExpansionLocEnd,
190 unsigned TokLen, Preprocessor &PP) {
191 SourceManager &SM = PP.getSourceManager();
192
193 // Create the lexer as if we were going to lex the file normally.
194 FileID SpellingFID = SM.getFileID(SpellingLoc);
195 const llvm::MemoryBuffer *InputFile = SM.getBuffer(SpellingFID);
196 Lexer *L = new Lexer(SpellingFID, InputFile, PP);
197
198 // Now that the lexer is created, change the start/end locations so that we
199 // just lex the subsection of the file that we want. This is lexing from a
200 // scratch buffer.
201 const char *StrData = SM.getCharacterData(SpellingLoc);
202
203 L->BufferPtr = StrData;
204 L->BufferEnd = StrData+TokLen;
205 assert(L->BufferEnd[0] == 0 && "Buffer is not nul terminated!")(static_cast <bool> (L->BufferEnd[0] == 0 &&
"Buffer is not nul terminated!") ? void (0) : __assert_fail (
"L->BufferEnd[0] == 0 && \"Buffer is not nul terminated!\""
, "/build/llvm-toolchain-snapshot-7~svn329677/tools/clang/lib/Lex/Lexer.cpp"
, 205, __extension__ __PRETTY_FUNCTION__))
;
206
207 // Set the SourceLocation with the remapping information. This ensures that
208 // GetMappedTokenLoc will remap the tokens as they are lexed.
209 L->FileLoc = SM.createExpansionLoc(SM.getLocForStartOfFile(SpellingFID),
210 ExpansionLocStart,
211 ExpansionLocEnd, TokLen);
212
213 // Ensure that the lexer thinks it is inside a directive, so that end \n will
214 // return an EOD token.
215 L->ParsingPreprocessorDirective = true;
216
217 // This lexer really is for _Pragma.
218 L->Is_PragmaLexer = true;
219 return L;
220}
221
/// Escape \p Str in place: every backslash and every \p Quote character gets
/// a backslash inserted in front of it, and every line ending ('\n', '\r',
/// "\r\n" or "\n\r") is rewritten as the two characters '\\' 'n'.
template <typename T> static void StringifyImpl(T &Str, char Quote) {
  using SizeTy = typename T::size_type;

  SizeTy Pos = 0;
  SizeTy End = Str.size();
  while (Pos < End) {
    const char Cur = Str[Pos];

    if (Cur == '\\' || Cur == Quote) {
      // Put the escaping backslash in front, then step over both characters.
      Str.insert(Str.begin() + Pos, '\\');
      ++End;
      Pos += 2;
      continue;
    }

    if (Cur != '\n' && Cur != '\r') {
      ++Pos;
      continue;
    }

    // A line ending. "\r\n" and "\n\r" are two *different* newline characters
    // in a row and collapse into a single escape.
    const bool IsTwoCharLineEnding =
        Pos + 1 < End && (Str[Pos + 1] == '\n' || Str[Pos + 1] == '\r') &&
        Str[Pos + 1] != Cur;

    Str[Pos] = '\\';
    if (IsTwoCharLineEnding) {
      // Reuse the second newline character's slot for the 'n'.
      Str[Pos + 1] = 'n';
    } else {
      // A lone newline character needs one more slot for the 'n'.
      Str.insert(Str.begin() + Pos + 1, 'n');
      ++End;
    }
    Pos += 2;
  }
}
246
247std::string Lexer::Stringify(StringRef Str, bool Charify) {
248 std::string Result = Str;
249 char Quote = Charify ? '\'' : '"';
250 StringifyImpl(Result, Quote);
251 return Result;
252}
253
254void Lexer::Stringify(SmallVectorImpl<char> &Str) { StringifyImpl(Str, '"'); }
255
256//===----------------------------------------------------------------------===//
257// Token Spelling
258//===----------------------------------------------------------------------===//
259
260/// \brief Slow case of getSpelling. Extract the characters comprising the
261/// spelling of this token from the provided input buffer.
262static size_t getSpellingSlow(const Token &Tok, const char *BufPtr,
263 const LangOptions &LangOpts, char *Spelling) {
264 assert(Tok.needsCleaning() && "getSpellingSlow called on simple token")(static_cast <bool> (Tok.needsCleaning() && "getSpellingSlow called on simple token"
) ? void (0) : __assert_fail ("Tok.needsCleaning() && \"getSpellingSlow called on simple token\""
, "/build/llvm-toolchain-snapshot-7~svn329677/tools/clang/lib/Lex/Lexer.cpp"
, 264, __extension__ __PRETTY_FUNCTION__))
;
265
266 size_t Length = 0;
267 const char *BufEnd = BufPtr + Tok.getLength();
268
269 if (tok::isStringLiteral(Tok.getKind())) {
270 // Munch the encoding-prefix and opening double-quote.
271 while (BufPtr < BufEnd) {
272 unsigned Size;
273 Spelling[Length++] = Lexer::getCharAndSizeNoWarn(BufPtr, Size, LangOpts);
274 BufPtr += Size;
275
276 if (Spelling[Length - 1] == '"')
277 break;
278 }
279
280 // Raw string literals need special handling; trigraph expansion and line
281 // splicing do not occur within their d-char-sequence nor within their
282 // r-char-sequence.
283 if (Length >= 2 &&
284 Spelling[Length - 2] == 'R' && Spelling[Length - 1] == '"') {
285 // Search backwards from the end of the token to find the matching closing
286 // quote.
287 const char *RawEnd = BufEnd;
288 do --RawEnd; while (*RawEnd != '"');
289 size_t RawLength = RawEnd - BufPtr + 1;
290
291 // Everything between the quotes is included verbatim in the spelling.
292 memcpy(Spelling + Length, BufPtr, RawLength);
293 Length += RawLength;
294 BufPtr += RawLength;
295
296 // The rest of the token is lexed normally.
297 }
298 }
299
300 while (BufPtr < BufEnd) {
301 unsigned Size;
302 Spelling[Length++] = Lexer::getCharAndSizeNoWarn(BufPtr, Size, LangOpts);
303 BufPtr += Size;
304 }
305
306 assert(Length < Tok.getLength() &&(static_cast <bool> (Length < Tok.getLength() &&
"NeedsCleaning flag set on token that didn't need cleaning!"
) ? void (0) : __assert_fail ("Length < Tok.getLength() && \"NeedsCleaning flag set on token that didn't need cleaning!\""
, "/build/llvm-toolchain-snapshot-7~svn329677/tools/clang/lib/Lex/Lexer.cpp"
, 307, __extension__ __PRETTY_FUNCTION__))
307 "NeedsCleaning flag set on token that didn't need cleaning!")(static_cast <bool> (Length < Tok.getLength() &&
"NeedsCleaning flag set on token that didn't need cleaning!"
) ? void (0) : __assert_fail ("Length < Tok.getLength() && \"NeedsCleaning flag set on token that didn't need cleaning!\""
, "/build/llvm-toolchain-snapshot-7~svn329677/tools/clang/lib/Lex/Lexer.cpp"
, 307, __extension__ __PRETTY_FUNCTION__))
;
308 return Length;
309}
310
311/// getSpelling() - Return the 'spelling' of this token. The spelling of a
312/// token are the characters used to represent the token in the source file
313/// after trigraph expansion and escaped-newline folding. In particular, this
314/// wants to get the true, uncanonicalized, spelling of things like digraphs
315/// UCNs, etc.
316StringRef Lexer::getSpelling(SourceLocation loc,
317 SmallVectorImpl<char> &buffer,
318 const SourceManager &SM,
319 const LangOptions &options,
320 bool *invalid) {
321 // Break down the source location.
322 std::pair<FileID, unsigned> locInfo = SM.getDecomposedLoc(loc);
323
324 // Try to the load the file buffer.
325 bool invalidTemp = false;
326 StringRef file = SM.getBufferData(locInfo.first, &invalidTemp);
327 if (invalidTemp) {
328 if (invalid) *invalid = true;
329 return {};
330 }
331
332 const char *tokenBegin = file.data() + locInfo.second;
333
334 // Lex from the start of the given location.
335 Lexer lexer(SM.getLocForStartOfFile(locInfo.first), options,
336 file.begin(), tokenBegin, file.end());
337 Token token;
338 lexer.LexFromRawLexer(token);
339
340 unsigned length = token.getLength();
341
342 // Common case: no need for cleaning.
343 if (!token.needsCleaning())
344 return StringRef(tokenBegin, length);
345
346 // Hard case, we need to relex the characters into the string.
347 buffer.resize(length);
348 buffer.resize(getSpellingSlow(token, tokenBegin, options, buffer.data()));
349 return StringRef(buffer.data(), buffer.size());
350}
351
352/// getSpelling() - Return the 'spelling' of this token. The spelling of a
353/// token are the characters used to represent the token in the source file
354/// after trigraph expansion and escaped-newline folding. In particular, this
355/// wants to get the true, uncanonicalized, spelling of things like digraphs
356/// UCNs, etc.
357std::string Lexer::getSpelling(const Token &Tok, const SourceManager &SourceMgr,
358 const LangOptions &LangOpts, bool *Invalid) {
359 assert((int)Tok.getLength() >= 0 && "Token character range is bogus!")(static_cast <bool> ((int)Tok.getLength() >= 0 &&
"Token character range is bogus!") ? void (0) : __assert_fail
("(int)Tok.getLength() >= 0 && \"Token character range is bogus!\""
, "/build/llvm-toolchain-snapshot-7~svn329677/tools/clang/lib/Lex/Lexer.cpp"
, 359, __extension__ __PRETTY_FUNCTION__))
;
360
361 bool CharDataInvalid = false;
362 const char *TokStart = SourceMgr.getCharacterData(Tok.getLocation(),
363 &CharDataInvalid);
364 if (Invalid)
365 *Invalid = CharDataInvalid;
366 if (CharDataInvalid)
367 return {};
368
369 // If this token contains nothing interesting, return it directly.
370 if (!Tok.needsCleaning())
371 return std::string(TokStart, TokStart + Tok.getLength());
372
373 std::string Result;
374 Result.resize(Tok.getLength());
375 Result.resize(getSpellingSlow(Tok, TokStart, LangOpts, &*Result.begin()));
376 return Result;
377}
378
379/// getSpelling - This method is used to get the spelling of a token into a
380/// preallocated buffer, instead of as an std::string. The caller is required
381/// to allocate enough space for the token, which is guaranteed to be at least
382/// Tok.getLength() bytes long. The actual length of the token is returned.
383///
384/// Note that this method may do two possible things: it may either fill in
385/// the buffer specified with characters, or it may *change the input pointer*
386/// to point to a constant buffer with the data already in it (avoiding a
387/// copy). The caller is not allowed to modify the returned buffer pointer
388/// if an internal buffer is returned.
389unsigned Lexer::getSpelling(const Token &Tok, const char *&Buffer,
390 const SourceManager &SourceMgr,
391 const LangOptions &LangOpts, bool *Invalid) {
392 assert((int)Tok.getLength() >= 0 && "Token character range is bogus!")(static_cast <bool> ((int)Tok.getLength() >= 0 &&
"Token character range is bogus!") ? void (0) : __assert_fail
("(int)Tok.getLength() >= 0 && \"Token character range is bogus!\""
, "/build/llvm-toolchain-snapshot-7~svn329677/tools/clang/lib/Lex/Lexer.cpp"
, 392, __extension__ __PRETTY_FUNCTION__))
;
393
394 const char *TokStart = nullptr;
395 // NOTE: this has to be checked *before* testing for an IdentifierInfo.
396 if (Tok.is(tok::raw_identifier))
397 TokStart = Tok.getRawIdentifier().data();
398 else if (!Tok.hasUCN()) {
399 if (const IdentifierInfo *II = Tok.getIdentifierInfo()) {
400 // Just return the string from the identifier table, which is very quick.
401 Buffer = II->getNameStart();
402 return II->getLength();
403 }
404 }
405
406 // NOTE: this can be checked even after testing for an IdentifierInfo.
407 if (Tok.isLiteral())
408 TokStart = Tok.getLiteralData();
409
410 if (!TokStart) {
411 // Compute the start of the token in the input lexer buffer.
412 bool CharDataInvalid = false;
413 TokStart = SourceMgr.getCharacterData(Tok.getLocation(), &CharDataInvalid);
414 if (Invalid)
415 *Invalid = CharDataInvalid;
416 if (CharDataInvalid) {
417 Buffer = "";
418 return 0;
419 }
420 }
421
422 // If this token contains nothing interesting, return it directly.
423 if (!Tok.needsCleaning()) {
424 Buffer = TokStart;
425 return Tok.getLength();
426 }
427
428 // Otherwise, hard case, relex the characters into the string.
429 return getSpellingSlow(Tok, TokStart, LangOpts, const_cast<char*>(Buffer));
430}
431
432/// MeasureTokenLength - Relex the token at the specified location and return
433/// its length in bytes in the input file. If the token needs cleaning (e.g.
434/// includes a trigraph or an escaped newline) then this count includes bytes
435/// that are part of that.
436unsigned Lexer::MeasureTokenLength(SourceLocation Loc,
437 const SourceManager &SM,
438 const LangOptions &LangOpts) {
439 Token TheTok;
440 if (getRawToken(Loc, TheTok, SM, LangOpts))
441 return 0;
442 return TheTok.getLength();
443}
444
445/// \brief Relex the token at the specified location.
446/// \returns true if there was a failure, false on success.
447bool Lexer::getRawToken(SourceLocation Loc, Token &Result,
448 const SourceManager &SM,
449 const LangOptions &LangOpts,
450 bool IgnoreWhiteSpace) {
451 // TODO: this could be special cased for common tokens like identifiers, ')',
452 // etc to make this faster, if it mattered. Just look at StrData[0] to handle
453 // all obviously single-char tokens. This could use
454 // Lexer::isObviouslySimpleCharacter for example to handle identifiers or
455 // something.
456
457 // If this comes from a macro expansion, we really do want the macro name, not
458 // the token this macro expanded to.
459 Loc = SM.getExpansionLoc(Loc);
460 std::pair<FileID, unsigned> LocInfo = SM.getDecomposedLoc(Loc);
461 bool Invalid = false;
462 StringRef Buffer = SM.getBufferData(LocInfo.first, &Invalid);
463 if (Invalid)
464 return true;
465
466 const char *StrData = Buffer.data()+LocInfo.second;
467
468 if (!IgnoreWhiteSpace && isWhitespace(StrData[0]))
469 return true;
470
471 // Create a lexer starting at the beginning of this token.
472 Lexer TheLexer(SM.getLocForStartOfFile(LocInfo.first), LangOpts,
473 Buffer.begin(), StrData, Buffer.end());
474 TheLexer.SetCommentRetentionState(true);
475 TheLexer.LexFromRawLexer(Result);
476 return false;
477}
478
479/// Returns the pointer that points to the beginning of line that contains
480/// the given offset, or null if the offset if invalid.
481static const char *findBeginningOfLine(StringRef Buffer, unsigned Offset) {
482 const char *BufStart = Buffer.data();
483 if (Offset >= Buffer.size())
484 return nullptr;
485
486 const char *LexStart = BufStart + Offset;
487 for (; LexStart != BufStart; --LexStart) {
488 if (isVerticalWhitespace(LexStart[0]) &&
489 !Lexer::isNewLineEscaped(BufStart, LexStart)) {
490 // LexStart should point at first character of logical line.
491 ++LexStart;
492 break;
493 }
494 }
495 return LexStart;
496}
497
498static SourceLocation getBeginningOfFileToken(SourceLocation Loc,
499 const SourceManager &SM,
500 const LangOptions &LangOpts) {
501 assert(Loc.isFileID())(static_cast <bool> (Loc.isFileID()) ? void (0) : __assert_fail
("Loc.isFileID()", "/build/llvm-toolchain-snapshot-7~svn329677/tools/clang/lib/Lex/Lexer.cpp"
, 501, __extension__ __PRETTY_FUNCTION__))
;
502 std::pair<FileID, unsigned> LocInfo = SM.getDecomposedLoc(Loc);
503 if (LocInfo.first.isInvalid())
504 return Loc;
505
506 bool Invalid = false;
507 StringRef Buffer = SM.getBufferData(LocInfo.first, &Invalid);
508 if (Invalid)
509 return Loc;
510
511 // Back up from the current location until we hit the beginning of a line
512 // (or the buffer). We'll relex from that point.
513 const char *StrData = Buffer.data() + LocInfo.second;
514 const char *LexStart = findBeginningOfLine(Buffer, LocInfo.second);
515 if (!LexStart || LexStart == StrData)
516 return Loc;
517
518 // Create a lexer starting at the beginning of this token.
519 SourceLocation LexerStartLoc = Loc.getLocWithOffset(-LocInfo.second);
520 Lexer TheLexer(LexerStartLoc, LangOpts, Buffer.data(), LexStart,
521 Buffer.end());
522 TheLexer.SetCommentRetentionState(true);
523
524 // Lex tokens until we find the token that contains the source location.
525 Token TheTok;
526 do {
527 TheLexer.LexFromRawLexer(TheTok);
528
529 if (TheLexer.getBufferLocation() > StrData) {
530 // Lexing this token has taken the lexer past the source location we're
531 // looking for. If the current token encompasses our source location,
532 // return the beginning of that token.
533 if (TheLexer.getBufferLocation() - TheTok.getLength() <= StrData)
534 return TheTok.getLocation();
535
536 // We ended up skipping over the source location entirely, which means
537 // that it points into whitespace. We're done here.
538 break;
539 }
540 } while (TheTok.getKind() != tok::eof);
541
542 // We've passed our source location; just return the original source location.
543 return Loc;
544}
545
546SourceLocation Lexer::GetBeginningOfToken(SourceLocation Loc,
547 const SourceManager &SM,
548 const LangOptions &LangOpts) {
549 if (Loc.isFileID())
550 return getBeginningOfFileToken(Loc, SM, LangOpts);
551
552 if (!SM.isMacroArgExpansion(Loc))
553 return Loc;
554
555 SourceLocation FileLoc = SM.getSpellingLoc(Loc);
556 SourceLocation BeginFileLoc = getBeginningOfFileToken(FileLoc, SM, LangOpts);
557 std::pair<FileID, unsigned> FileLocInfo = SM.getDecomposedLoc(FileLoc);
558 std::pair<FileID, unsigned> BeginFileLocInfo =
559 SM.getDecomposedLoc(BeginFileLoc);
560 assert(FileLocInfo.first == BeginFileLocInfo.first &&(static_cast <bool> (FileLocInfo.first == BeginFileLocInfo
.first && FileLocInfo.second >= BeginFileLocInfo.second
) ? void (0) : __assert_fail ("FileLocInfo.first == BeginFileLocInfo.first && FileLocInfo.second >= BeginFileLocInfo.second"
, "/build/llvm-toolchain-snapshot-7~svn329677/tools/clang/lib/Lex/Lexer.cpp"
, 561, __extension__ __PRETTY_FUNCTION__))
561 FileLocInfo.second >= BeginFileLocInfo.second)(static_cast <bool> (FileLocInfo.first == BeginFileLocInfo
.first && FileLocInfo.second >= BeginFileLocInfo.second
) ? void (0) : __assert_fail ("FileLocInfo.first == BeginFileLocInfo.first && FileLocInfo.second >= BeginFileLocInfo.second"
, "/build/llvm-toolchain-snapshot-7~svn329677/tools/clang/lib/Lex/Lexer.cpp"
, 561, __extension__ __PRETTY_FUNCTION__))
;
562 return Loc.getLocWithOffset(BeginFileLocInfo.second - FileLocInfo.second);
563}
564
565namespace {
566
// How Lexer::ComputePreamble() treats a preprocessor directive, based on the
// directive name that follows the '#'.
567enum PreambleDirectiveKind {
568 PDK_Skipped, // A recognized directive name; the preamble scan skips it.
569 PDK_Unknown // Any other name; the preamble scan stops at the '#'.
570};
571
572} // namespace
573
574PreambleBounds Lexer::ComputePreamble(StringRef Buffer,
575 const LangOptions &LangOpts,
576 unsigned MaxLines) {
577 // Create a lexer starting at the beginning of the file. Note that we use a
578 // "fake" file source location at offset 1 so that the lexer will track our
579 // position within the file.
580 const unsigned StartOffset = 1;
581 SourceLocation FileLoc = SourceLocation::getFromRawEncoding(StartOffset);
582 Lexer TheLexer(FileLoc, LangOpts, Buffer.begin(), Buffer.begin(),
583 Buffer.end());
584 TheLexer.SetCommentRetentionState(true);
585
586 bool InPreprocessorDirective = false;
587 Token TheTok;
588 SourceLocation ActiveCommentLoc;
589
590 unsigned MaxLineOffset = 0;
591 if (MaxLines) {
592 const char *CurPtr = Buffer.begin();
593 unsigned CurLine = 0;
594 while (CurPtr != Buffer.end()) {
595 char ch = *CurPtr++;
596 if (ch == '\n') {
597 ++CurLine;
598 if (CurLine == MaxLines)
599 break;
600 }
601 }
602 if (CurPtr != Buffer.end())
603 MaxLineOffset = CurPtr - Buffer.begin();
604 }
605
606 do {
607 TheLexer.LexFromRawLexer(TheTok);
608
609 if (InPreprocessorDirective) {
610 // If we've hit the end of the file, we're done.
611 if (TheTok.getKind() == tok::eof) {
612 break;
613 }
614
615 // If we haven't hit the end of the preprocessor directive, skip this
616 // token.
617 if (!TheTok.isAtStartOfLine())
618 continue;
619
620 // We've passed the end of the preprocessor directive, and will look
621 // at this token again below.
622 InPreprocessorDirective = false;
623 }
624
625 // Keep track of the # of lines in the preamble.
626 if (TheTok.isAtStartOfLine()) {
627 unsigned TokOffset = TheTok.getLocation().getRawEncoding() - StartOffset;
628
629 // If we were asked to limit the number of lines in the preamble,
630 // and we're about to exceed that limit, we're done.
631 if (MaxLineOffset && TokOffset >= MaxLineOffset)
632 break;
633 }
634
635 // Comments are okay; skip over them.
636 if (TheTok.getKind() == tok::comment) {
637 if (ActiveCommentLoc.isInvalid())
638 ActiveCommentLoc = TheTok.getLocation();
639 continue;
640 }
641
642 if (TheTok.isAtStartOfLine() && TheTok.getKind() == tok::hash) {
643 // This is the start of a preprocessor directive.
644 Token HashTok = TheTok;
645 InPreprocessorDirective = true;
646 ActiveCommentLoc = SourceLocation();
647
648 // Figure out which directive this is. Since we're lexing raw tokens,
649 // we don't have an identifier table available. Instead, just look at
650 // the raw identifier to recognize and categorize preprocessor directives.
651 TheLexer.LexFromRawLexer(TheTok);
652 if (TheTok.getKind() == tok::raw_identifier && !TheTok.needsCleaning()) {
653 StringRef Keyword = TheTok.getRawIdentifier();
654 PreambleDirectiveKind PDK
655 = llvm::StringSwitch<PreambleDirectiveKind>(Keyword)
656 .Case("include", PDK_Skipped)
657 .Case("__include_macros", PDK_Skipped)
658 .Case("define", PDK_Skipped)
659 .Case("undef", PDK_Skipped)
660 .Case("line", PDK_Skipped)
661 .Case("error", PDK_Skipped)
662 .Case("pragma", PDK_Skipped)
663 .Case("import", PDK_Skipped)
664 .Case("include_next", PDK_Skipped)
665 .Case("warning", PDK_Skipped)
666 .Case("ident", PDK_Skipped)
667 .Case("sccs", PDK_Skipped)
668 .Case("assert", PDK_Skipped)
669 .Case("unassert", PDK_Skipped)
670 .Case("if", PDK_Skipped)
671 .Case("ifdef", PDK_Skipped)
672 .Case("ifndef", PDK_Skipped)
673 .Case("elif", PDK_Skipped)
674 .Case("else", PDK_Skipped)
675 .Case("endif", PDK_Skipped)
676 .Default(PDK_Unknown);
677
678 switch (PDK) {
679 case PDK_Skipped:
680 continue;
681
682 case PDK_Unknown:
683 // We don't know what this directive is; stop at the '#'.
684 break;
685 }
686 }
687
688 // We only end up here if we didn't recognize the preprocessor
689 // directive or it was one that can't occur in the preamble at this
690 // point. Roll back the current token to the location of the '#'.
691 InPreprocessorDirective = false;
692 TheTok = HashTok;
693 }
694
695 // We hit a token that we don't recognize as being in the
696 // "preprocessing only" part of the file, so we're no longer in
697 // the preamble.
698 break;
699 } while (true);
700
701 SourceLocation End;
702 if (ActiveCommentLoc.isValid())
703 End = ActiveCommentLoc; // don't truncate a decl comment.
704 else
705 End = TheTok.getLocation();
706
707 return PreambleBounds(End.getRawEncoding() - FileLoc.getRawEncoding(),
708 TheTok.isAtStartOfLine());
709}
710
711/// AdvanceToTokenCharacter - Given a location that specifies the start of a
712/// token, return a new location that specifies a character within the token.
713SourceLocation Lexer::AdvanceToTokenCharacter(SourceLocation TokStart,
714 unsigned CharNo,
715 const SourceManager &SM,
716 const LangOptions &LangOpts) {
717 // Figure out how many physical characters away the specified expansion
718 // character is. This needs to take into consideration newlines and
719 // trigraphs.
720 bool Invalid = false;
721 const char *TokPtr = SM.getCharacterData(TokStart, &Invalid);
722
723 // If they request the first char of the token, we're trivially done.
724 if (Invalid || (CharNo == 0 && Lexer::isObviouslySimpleCharacter(*TokPtr)))
725 return TokStart;
726
727 unsigned PhysOffset = 0;
728
729 // The usual case is that tokens don't contain anything interesting. Skip
730 // over the uninteresting characters. If a token only consists of simple
731 // chars, this method is extremely fast.
732 while (Lexer::isObviouslySimpleCharacter(*TokPtr)) {
733 if (CharNo == 0)
734 return TokStart.getLocWithOffset(PhysOffset);
735 ++TokPtr;
736 --CharNo;
737 ++PhysOffset;
738 }
739
740 // If we have a character that may be a trigraph or escaped newline, use a
741 // lexer to parse it correctly.
742 for (; CharNo; --CharNo) {
743 unsigned Size;
744 Lexer::getCharAndSizeNoWarn(TokPtr, Size, LangOpts);
745 TokPtr += Size;
746 PhysOffset += Size;
747 }
748
749 // Final detail: if we end up on an escaped newline, we want to return the
750 // location of the actual byte of the token. For example foo\<newline>bar
751 // advanced by 3 should return the location of b, not of \\. One compounding
752 // detail of this is that the escape may be made by a trigraph.
753 if (!Lexer::isObviouslySimpleCharacter(*TokPtr))
754 PhysOffset += Lexer::SkipEscapedNewLines(TokPtr)-TokPtr;
755
756 return TokStart.getLocWithOffset(PhysOffset);
757}
758
759/// \brief Computes the source location just past the end of the
760/// token at this source location.
761///
762/// This routine can be used to produce a source location that
763/// points just past the end of the token referenced by \p Loc, and
764/// is generally used when a diagnostic needs to point just after a
765/// token where it expected something different that it received. If
766/// the returned source location would not be meaningful (e.g., if
767/// it points into a macro), this routine returns an invalid
768/// source location.
769///
770/// \param Offset an offset from the end of the token, where the source
771/// location should refer to. The default offset (0) produces a source
772/// location pointing just past the end of the token; an offset of 1 produces
773/// a source location pointing to the last character in the token, etc.
774SourceLocation Lexer::getLocForEndOfToken(SourceLocation Loc, unsigned Offset,
775 const SourceManager &SM,
776 const LangOptions &LangOpts) {
777 if (Loc.isInvalid())
778 return {};
779
780 if (Loc.isMacroID()) {
781 if (Offset > 0 || !isAtEndOfMacroExpansion(Loc, SM, LangOpts, &Loc))
782 return {}; // Points inside the macro expansion.
783 }
784
785 unsigned Len = Lexer::MeasureTokenLength(Loc, SM, LangOpts);
786 if (Len > Offset)
787 Len = Len - Offset;
788 else
789 return Loc;
790
791 return Loc.getLocWithOffset(Len);
792}
793
794/// \brief Returns true if the given MacroID location points at the first
795/// token of the macro expansion.
796bool Lexer::isAtStartOfMacroExpansion(SourceLocation loc,
797 const SourceManager &SM,
798 const LangOptions &LangOpts,
799 SourceLocation *MacroBegin) {
800 assert(loc.isValid() && loc.isMacroID() && "Expected a valid macro loc")(static_cast <bool> (loc.isValid() && loc.isMacroID
() && "Expected a valid macro loc") ? void (0) : __assert_fail
("loc.isValid() && loc.isMacroID() && \"Expected a valid macro loc\""
, "/build/llvm-toolchain-snapshot-7~svn329677/tools/clang/lib/Lex/Lexer.cpp"
, 800, __extension__ __PRETTY_FUNCTION__))
;
801
802 SourceLocation expansionLoc;
803 if (!SM.isAtStartOfImmediateMacroExpansion(loc, &expansionLoc))
804 return false;
805
806 if (expansionLoc.isFileID()) {
807 // No other macro expansions, this is the first.
808 if (MacroBegin)
809 *MacroBegin = expansionLoc;
810 return true;
811 }
812
813 return isAtStartOfMacroExpansion(expansionLoc, SM, LangOpts, MacroBegin);
814}
815
816/// \brief Returns true if the given MacroID location points at the last
817/// token of the macro expansion.
818bool Lexer::isAtEndOfMacroExpansion(SourceLocation loc,
819 const SourceManager &SM,
820 const LangOptions &LangOpts,
821 SourceLocation *MacroEnd) {
822 assert(loc.isValid() && loc.isMacroID() && "Expected a valid macro loc")(static_cast <bool> (loc.isValid() && loc.isMacroID
() && "Expected a valid macro loc") ? void (0) : __assert_fail
("loc.isValid() && loc.isMacroID() && \"Expected a valid macro loc\""
, "/build/llvm-toolchain-snapshot-7~svn329677/tools/clang/lib/Lex/Lexer.cpp"
, 822, __extension__ __PRETTY_FUNCTION__))
;
823
824 SourceLocation spellLoc = SM.getSpellingLoc(loc);
825 unsigned tokLen = MeasureTokenLength(spellLoc, SM, LangOpts);
826 if (tokLen == 0)
827 return false;
828
829 SourceLocation afterLoc = loc.getLocWithOffset(tokLen);
830 SourceLocation expansionLoc;
831 if (!SM.isAtEndOfImmediateMacroExpansion(afterLoc, &expansionLoc))
832 return false;
833
834 if (expansionLoc.isFileID()) {
835 // No other macro expansions.
836 if (MacroEnd)
837 *MacroEnd = expansionLoc;
838 return true;
839 }
840
841 return isAtEndOfMacroExpansion(expansionLoc, SM, LangOpts, MacroEnd);
842}
843
844static CharSourceRange makeRangeFromFileLocs(CharSourceRange Range,
845 const SourceManager &SM,
846 const LangOptions &LangOpts) {
847 SourceLocation Begin = Range.getBegin();
848 SourceLocation End = Range.getEnd();
849 assert(Begin.isFileID() && End.isFileID())(static_cast <bool> (Begin.isFileID() && End.isFileID
()) ? void (0) : __assert_fail ("Begin.isFileID() && End.isFileID()"
, "/build/llvm-toolchain-snapshot-7~svn329677/tools/clang/lib/Lex/Lexer.cpp"
, 849, __extension__ __PRETTY_FUNCTION__))
;
850 if (Range.isTokenRange()) {
851 End = Lexer::getLocForEndOfToken(End, 0, SM,LangOpts);
852 if (End.isInvalid())
853 return {};
854 }
855
856 // Break down the source locations.
857 FileID FID;
858 unsigned BeginOffs;
859 std::tie(FID, BeginOffs) = SM.getDecomposedLoc(Begin);
860 if (FID.isInvalid())
861 return {};
862
863 unsigned EndOffs;
864 if (!SM.isInFileID(End, FID, &EndOffs) ||
865 BeginOffs > EndOffs)
866 return {};
867
868 return CharSourceRange::getCharRange(Begin, End);
869}
870
871CharSourceRange Lexer::makeFileCharRange(CharSourceRange Range,
872 const SourceManager &SM,
873 const LangOptions &LangOpts) {
874 SourceLocation Begin = Range.getBegin();
875 SourceLocation End = Range.getEnd();
876 if (Begin.isInvalid() || End.isInvalid())
877 return {};
878
879 if (Begin.isFileID() && End.isFileID())
880 return makeRangeFromFileLocs(Range, SM, LangOpts);
881
882 if (Begin.isMacroID() && End.isFileID()) {
883 if (!isAtStartOfMacroExpansion(Begin, SM, LangOpts, &Begin))
884 return {};
885 Range.setBegin(Begin);
886 return makeRangeFromFileLocs(Range, SM, LangOpts);
887 }
888
889 if (Begin.isFileID() && End.isMacroID()) {
890 if ((Range.isTokenRange() && !isAtEndOfMacroExpansion(End, SM, LangOpts,
891 &End)) ||
892 (Range.isCharRange() && !isAtStartOfMacroExpansion(End, SM, LangOpts,
893 &End)))
894 return {};
895 Range.setEnd(End);
896 return makeRangeFromFileLocs(Range, SM, LangOpts);
897 }
898
899 assert(Begin.isMacroID() && End.isMacroID())(static_cast <bool> (Begin.isMacroID() && End.isMacroID
()) ? void (0) : __assert_fail ("Begin.isMacroID() && End.isMacroID()"
, "/build/llvm-toolchain-snapshot-7~svn329677/tools/clang/lib/Lex/Lexer.cpp"
, 899, __extension__ __PRETTY_FUNCTION__))
;
900 SourceLocation MacroBegin, MacroEnd;
901 if (isAtStartOfMacroExpansion(Begin, SM, LangOpts, &MacroBegin) &&
902 ((Range.isTokenRange() && isAtEndOfMacroExpansion(End, SM, LangOpts,
903 &MacroEnd)) ||
904 (Range.isCharRange() && isAtStartOfMacroExpansion(End, SM, LangOpts,
905 &MacroEnd)))) {
906 Range.setBegin(MacroBegin);
907 Range.setEnd(MacroEnd);
908 return makeRangeFromFileLocs(Range, SM, LangOpts);
909 }
910
911 bool Invalid = false;
912 const SrcMgr::SLocEntry &BeginEntry = SM.getSLocEntry(SM.getFileID(Begin),
913 &Invalid);
914 if (Invalid)
915 return {};
916
917 if (BeginEntry.getExpansion().isMacroArgExpansion()) {
918 const SrcMgr::SLocEntry &EndEntry = SM.getSLocEntry(SM.getFileID(End),
919 &Invalid);
920 if (Invalid)
921 return {};
922
923 if (EndEntry.getExpansion().isMacroArgExpansion() &&
924 BeginEntry.getExpansion().getExpansionLocStart() ==
925 EndEntry.getExpansion().getExpansionLocStart()) {
926 Range.setBegin(SM.getImmediateSpellingLoc(Begin));
927 Range.setEnd(SM.getImmediateSpellingLoc(End));
928 return makeFileCharRange(Range, SM, LangOpts);
929 }
930 }
931
932 return {};
933}
934
935StringRef Lexer::getSourceText(CharSourceRange Range,
936 const SourceManager &SM,
937 const LangOptions &LangOpts,
938 bool *Invalid) {
939 Range = makeFileCharRange(Range, SM, LangOpts);
940 if (Range.isInvalid()) {
941 if (Invalid) *Invalid = true;
942 return {};
943 }
944
945 // Break down the source location.
946 std::pair<FileID, unsigned> beginInfo = SM.getDecomposedLoc(Range.getBegin());
947 if (beginInfo.first.isInvalid()) {
948 if (Invalid) *Invalid = true;
949 return {};
950 }
951
952 unsigned EndOffs;
953 if (!SM.isInFileID(Range.getEnd(), beginInfo.first, &EndOffs) ||
954 beginInfo.second > EndOffs) {
955 if (Invalid) *Invalid = true;
956 return {};
957 }
958
959 // Try to the load the file buffer.
960 bool invalidTemp = false;
961 StringRef file = SM.getBufferData(beginInfo.first, &invalidTemp);
962 if (invalidTemp) {
963 if (Invalid) *Invalid = true;
964 return {};
965 }
966
967 if (Invalid) *Invalid = false;
968 return file.substr(beginInfo.second, EndOffs - beginInfo.second);
969}
970
971StringRef Lexer::getImmediateMacroName(SourceLocation Loc,
972 const SourceManager &SM,
973 const LangOptions &LangOpts) {
974 assert(Loc.isMacroID() && "Only reasonable to call this on macros")(static_cast <bool> (Loc.isMacroID() && "Only reasonable to call this on macros"
) ? void (0) : __assert_fail ("Loc.isMacroID() && \"Only reasonable to call this on macros\""
, "/build/llvm-toolchain-snapshot-7~svn329677/tools/clang/lib/Lex/Lexer.cpp"
, 974, __extension__ __PRETTY_FUNCTION__))
;
975
976 // Find the location of the immediate macro expansion.
977 while (true) {
978 FileID FID = SM.getFileID(Loc);
979 const SrcMgr::SLocEntry *E = &SM.getSLocEntry(FID);
980 const SrcMgr::ExpansionInfo &Expansion = E->getExpansion();
981 Loc = Expansion.getExpansionLocStart();
982 if (!Expansion.isMacroArgExpansion())
983 break;
984
985 // For macro arguments we need to check that the argument did not come
986 // from an inner macro, e.g: "MAC1( MAC2(foo) )"
987
988 // Loc points to the argument id of the macro definition, move to the
989 // macro expansion.
990 Loc = SM.getImmediateExpansionRange(Loc).first;
991 SourceLocation SpellLoc = Expansion.getSpellingLoc();
992 if (SpellLoc.isFileID())
993 break; // No inner macro.
994
995 // If spelling location resides in the same FileID as macro expansion
996 // location, it means there is no inner macro.
997 FileID MacroFID = SM.getFileID(Loc);
998 if (SM.isInFileID(SpellLoc, MacroFID))
999 break;
1000
1001 // Argument came from inner macro.
1002 Loc = SpellLoc;
1003 }
1004
1005 // Find the spelling location of the start of the non-argument expansion
1006 // range. This is where the macro name was spelled in order to begin
1007 // expanding this macro.
1008 Loc = SM.getSpellingLoc(Loc);
1009
1010 // Dig out the buffer where the macro name was spelled and the extents of the
1011 // name so that we can render it into the expansion note.
1012 std::pair<FileID, unsigned> ExpansionInfo = SM.getDecomposedLoc(Loc);
1013 unsigned MacroTokenLength = Lexer::MeasureTokenLength(Loc, SM, LangOpts);
1014 StringRef ExpansionBuffer = SM.getBufferData(ExpansionInfo.first);
1015 return ExpansionBuffer.substr(ExpansionInfo.second, MacroTokenLength);
1016}
1017
1018StringRef Lexer::getImmediateMacroNameForDiagnostics(
1019 SourceLocation Loc, const SourceManager &SM, const LangOptions &LangOpts) {
1020 assert(Loc.isMacroID() && "Only reasonable to call this on macros")(static_cast <bool> (Loc.isMacroID() && "Only reasonable to call this on macros"
) ? void (0) : __assert_fail ("Loc.isMacroID() && \"Only reasonable to call this on macros\""
, "/build/llvm-toolchain-snapshot-7~svn329677/tools/clang/lib/Lex/Lexer.cpp"
, 1020, __extension__ __PRETTY_FUNCTION__))
;
1021 // Walk past macro argument expanions.
1022 while (SM.isMacroArgExpansion(Loc))
1023 Loc = SM.getImmediateExpansionRange(Loc).first;
1024
1025 // If the macro's spelling has no FileID, then it's actually a token paste
1026 // or stringization (or similar) and not a macro at all.
1027 if (!SM.getFileEntryForID(SM.getFileID(SM.getSpellingLoc(Loc))))
1028 return {};
1029
1030 // Find the spelling location of the start of the non-argument expansion
1031 // range. This is where the macro name was spelled in order to begin
1032 // expanding this macro.
1033 Loc = SM.getSpellingLoc(SM.getImmediateExpansionRange(Loc).first);
1034
1035 // Dig out the buffer where the macro name was spelled and the extents of the
1036 // name so that we can render it into the expansion note.
1037 std::pair<FileID, unsigned> ExpansionInfo = SM.getDecomposedLoc(Loc);
1038 unsigned MacroTokenLength = Lexer::MeasureTokenLength(Loc, SM, LangOpts);
1039 StringRef ExpansionBuffer = SM.getBufferData(ExpansionInfo.first);
1040 return ExpansionBuffer.substr(ExpansionInfo.second, MacroTokenLength);
1041}
1042
1043bool Lexer::isIdentifierBodyChar(char c, const LangOptions &LangOpts) {
1044 return isIdentifierBody(c, LangOpts.DollarIdents);
1045}
1046
1047bool Lexer::isNewLineEscaped(const char *BufferStart, const char *Str) {
1048 assert(isVerticalWhitespace(Str[0]))(static_cast <bool> (isVerticalWhitespace(Str[0])) ? void
(0) : __assert_fail ("isVerticalWhitespace(Str[0])", "/build/llvm-toolchain-snapshot-7~svn329677/tools/clang/lib/Lex/Lexer.cpp"
, 1048, __extension__ __PRETTY_FUNCTION__))
;
1049 if (Str - 1 < BufferStart)
1050 return false;
1051
1052 if ((Str[0] == '\n' && Str[-1] == '\r') ||
1053 (Str[0] == '\r' && Str[-1] == '\n')) {
1054 if (Str - 2 < BufferStart)
1055 return false;
1056 --Str;
1057 }
1058 --Str;
1059
1060 // Rewind to first non-space character:
1061 while (Str > BufferStart && isHorizontalWhitespace(*Str))
1062 --Str;
1063
1064 return *Str == '\\';
1065}
1066
1067StringRef Lexer::getIndentationForLine(SourceLocation Loc,
1068 const SourceManager &SM) {
1069 if (Loc.isInvalid() || Loc.isMacroID())
1070 return {};
1071 std::pair<FileID, unsigned> LocInfo = SM.getDecomposedLoc(Loc);
1072 if (LocInfo.first.isInvalid())
1073 return {};
1074 bool Invalid = false;
1075 StringRef Buffer = SM.getBufferData(LocInfo.first, &Invalid);
1076 if (Invalid)
1077 return {};
1078 const char *Line = findBeginningOfLine(Buffer, LocInfo.second);
1079 if (!Line)
1080 return {};
1081 StringRef Rest = Buffer.substr(Line - Buffer.data());
1082 size_t NumWhitespaceChars = Rest.find_first_not_of(" \t");
1083 return NumWhitespaceChars == StringRef::npos
1084 ? ""
1085 : Rest.take_front(NumWhitespaceChars);
1086}
1087
1088//===----------------------------------------------------------------------===//
1089// Diagnostics forwarding code.
1090//===----------------------------------------------------------------------===//
1091
1092/// GetMappedTokenLoc - If lexing out of a 'mapped buffer', where we pretend the
1093/// lexer buffer was all expanded at a single point, perform the mapping.
1094/// This is currently only used for _Pragma implementation, so it is the slow
1095/// path of the hot getSourceLocation method. Do not allow it to be inlined.
1096static LLVM_ATTRIBUTE_NOINLINE__attribute__((noinline)) SourceLocation GetMappedTokenLoc(
1097 Preprocessor &PP, SourceLocation FileLoc, unsigned CharNo, unsigned TokLen);
1098static SourceLocation GetMappedTokenLoc(Preprocessor &PP,
1099 SourceLocation FileLoc,
1100 unsigned CharNo, unsigned TokLen) {
1101 assert(FileLoc.isMacroID() && "Must be a macro expansion")(static_cast <bool> (FileLoc.isMacroID() && "Must be a macro expansion"
) ? void (0) : __assert_fail ("FileLoc.isMacroID() && \"Must be a macro expansion\""
, "/build/llvm-toolchain-snapshot-7~svn329677/tools/clang/lib/Lex/Lexer.cpp"
, 1101, __extension__ __PRETTY_FUNCTION__))
;
1102
1103 // Otherwise, we're lexing "mapped tokens". This is used for things like
1104 // _Pragma handling. Combine the expansion location of FileLoc with the
1105 // spelling location.
1106 SourceManager &SM = PP.getSourceManager();
1107
1108 // Create a new SLoc which is expanded from Expansion(FileLoc) but whose
1109 // characters come from spelling(FileLoc)+Offset.
1110 SourceLocation SpellingLoc = SM.getSpellingLoc(FileLoc);
1111 SpellingLoc = SpellingLoc.getLocWithOffset(CharNo);
1112
1113 // Figure out the expansion loc range, which is the range covered by the
1114 // original _Pragma(...) sequence.
1115 std::pair<SourceLocation,SourceLocation> II =
1116 SM.getImmediateExpansionRange(FileLoc);
1117
1118 return SM.createExpansionLoc(SpellingLoc, II.first, II.second, TokLen);
1119}
1120
1121/// getSourceLocation - Return a source location identifier for the specified
1122/// offset in the current file.
1123SourceLocation Lexer::getSourceLocation(const char *Loc,
1124 unsigned TokLen) const {
1125 assert(Loc >= BufferStart && Loc <= BufferEnd &&(static_cast <bool> (Loc >= BufferStart && Loc
<= BufferEnd && "Location out of range for this buffer!"
) ? void (0) : __assert_fail ("Loc >= BufferStart && Loc <= BufferEnd && \"Location out of range for this buffer!\""
, "/build/llvm-toolchain-snapshot-7~svn329677/tools/clang/lib/Lex/Lexer.cpp"
, 1126, __extension__ __PRETTY_FUNCTION__))
1126 "Location out of range for this buffer!")(static_cast <bool> (Loc >= BufferStart && Loc
<= BufferEnd && "Location out of range for this buffer!"
) ? void (0) : __assert_fail ("Loc >= BufferStart && Loc <= BufferEnd && \"Location out of range for this buffer!\""
, "/build/llvm-toolchain-snapshot-7~svn329677/tools/clang/lib/Lex/Lexer.cpp"
, 1126, __extension__ __PRETTY_FUNCTION__))
;
1127
1128 // In the normal case, we're just lexing from a simple file buffer, return
1129 // the file id from FileLoc with the offset specified.
1130 unsigned CharNo = Loc-BufferStart;
1131 if (FileLoc.isFileID())
1132 return FileLoc.getLocWithOffset(CharNo);
1133
1134 // Otherwise, this is the _Pragma lexer case, which pretends that all of the
1135 // tokens are lexed from where the _Pragma was defined.
1136 assert(PP && "This doesn't work on raw lexers")(static_cast <bool> (PP && "This doesn't work on raw lexers"
) ? void (0) : __assert_fail ("PP && \"This doesn't work on raw lexers\""
, "/build/llvm-toolchain-snapshot-7~svn329677/tools/clang/lib/Lex/Lexer.cpp"
, 1136, __extension__ __PRETTY_FUNCTION__))
;
1137 return GetMappedTokenLoc(*PP, FileLoc, CharNo, TokLen);
1138}
1139
1140/// Diag - Forwarding function for diagnostics. This translate a source
1141/// position in the current buffer into a SourceLocation object for rendering.
1142DiagnosticBuilder Lexer::Diag(const char *Loc, unsigned DiagID) const {
1143 return PP->Diag(getSourceLocation(Loc), DiagID);
1144}
1145
1146//===----------------------------------------------------------------------===//
1147// Trigraph and Escaped Newline Handling Code.
1148//===----------------------------------------------------------------------===//
1149
/// GetTrigraphCharForLetter - Map the third character of a "??x" sequence to
/// the character the trigraph stands for.
///
/// \returns the decoded character, or '\0' if \p Letter does not complete a
///          trigraph.
static char GetTrigraphCharForLetter(char Letter) {
  switch (Letter) {
  // Brackets and braces.
  case '(':
    return '[';
  case ')':
    return ']';
  case '<':
    return '{';
  case '>':
    return '}';
  // Punctuation.
  case '=':
    return '#';
  case '/':
    return '\\';
  case '\'':
    return '^';
  case '!':
    return '|';
  case '-':
    return '~';
  default:
    return '\0';
  }
}
1166
1167/// DecodeTrigraphChar - If the specified character is a legal trigraph when
1168/// prefixed with ??, emit a trigraph warning. If trigraphs are enabled,
1169/// return the result character. Finally, emit a warning about trigraph use
1170/// whether trigraphs are enabled or not.
1171static char DecodeTrigraphChar(const char *CP, Lexer *L) {
1172 char Res = GetTrigraphCharForLetter(*CP);
1173 if (!Res || !L) return Res;
1174
1175 if (!L->getLangOpts().Trigraphs) {
1176 if (!L->isLexingRawMode())
1177 L->Diag(CP-2, diag::trigraph_ignored);
1178 return 0;
1179 }
1180
1181 if (!L->isLexingRawMode())
1182 L->Diag(CP-2, diag::trigraph_converted) << StringRef(&Res, 1);
1183 return Res;
1184}
1185
1186/// getEscapedNewLineSize - Return the size of the specified escaped newline,
1187/// or 0 if it is not an escaped newline. P[-1] is known to be a "\" or a
1188/// trigraph equivalent on entry to this function.
1189unsigned Lexer::getEscapedNewLineSize(const char *Ptr) {
1190 unsigned Size = 0;
1191 while (isWhitespace(Ptr[Size])) {
1192 ++Size;
1193
1194 if (Ptr[Size-1] != '\n' && Ptr[Size-1] != '\r')
1195 continue;
1196
1197 // If this is a \r\n or \n\r, skip the other half.
1198 if ((Ptr[Size] == '\r' || Ptr[Size] == '\n') &&
1199 Ptr[Size-1] != Ptr[Size])
1200 ++Size;
1201
1202 return Size;
1203 }
1204
1205 // Not an escaped newline, must be a \t or something else.
1206 return 0;
1207}
1208
1209/// SkipEscapedNewLines - If P points to an escaped newline (or a series of
1210/// them), skip over them and return the first non-escaped-newline found,
1211/// otherwise return P.
1212const char *Lexer::SkipEscapedNewLines(const char *P) {
1213 while (true) {
1214 const char *AfterEscape;
1215 if (*P == '\\') {
1216 AfterEscape = P+1;
1217 } else if (*P == '?') {
1218 // If not a trigraph for escape, bail out.
1219 if (P[1] != '?' || P[2] != '/')
1220 return P;
1221 // FIXME: Take LangOpts into account; the language might not
1222 // support trigraphs.
1223 AfterEscape = P+3;
1224 } else {
1225 return P;
1226 }
1227
1228 unsigned NewLineSize = Lexer::getEscapedNewLineSize(AfterEscape);
1229 if (NewLineSize == 0) return P;
1230 P = AfterEscape+NewLineSize;
1231 }
1232}
1233
1234Optional<Token> Lexer::findNextToken(SourceLocation Loc,
1235 const SourceManager &SM,
1236 const LangOptions &LangOpts) {
1237 if (Loc.isMacroID()) {
1238 if (!Lexer::isAtEndOfMacroExpansion(Loc, SM, LangOpts, &Loc))
1239 return None;
1240 }
1241 Loc = Lexer::getLocForEndOfToken(Loc, 0, SM, LangOpts);
1242
1243 // Break down the source location.
1244 std::pair<FileID, unsigned> LocInfo = SM.getDecomposedLoc(Loc);
1245
1246 // Try to load the file buffer.
1247 bool InvalidTemp = false;
1248 StringRef File = SM.getBufferData(LocInfo.first, &InvalidTemp);
1249 if (InvalidTemp)
1250 return None;
1251
1252 const char *TokenBegin = File.data() + LocInfo.second;
1253
1254 // Lex from the start of the given location.
1255 Lexer lexer(SM.getLocForStartOfFile(LocInfo.first), LangOpts, File.begin(),
1256 TokenBegin, File.end());
1257 // Find the token.
1258 Token Tok;
1259 lexer.LexFromRawLexer(Tok);
1260 return Tok;
1261}
1262
1263/// \brief Checks that the given token is the first token that occurs after the
1264/// given location (this excludes comments and whitespace). Returns the location
1265/// immediately after the specified token. If the token is not found or the
1266/// location is inside a macro, the returned source location will be invalid.
1267SourceLocation Lexer::findLocationAfterToken(
1268 SourceLocation Loc, tok::TokenKind TKind, const SourceManager &SM,
1269 const LangOptions &LangOpts, bool SkipTrailingWhitespaceAndNewLine) {
1270 Optional<Token> Tok = findNextToken(Loc, SM, LangOpts);
1271 if (!Tok || Tok->isNot(TKind))
1272 return {};
1273 SourceLocation TokenLoc = Tok->getLocation();
1274
1275 // Calculate how much whitespace needs to be skipped if any.
1276 unsigned NumWhitespaceChars = 0;
1277 if (SkipTrailingWhitespaceAndNewLine) {
1278 const char *TokenEnd = SM.getCharacterData(TokenLoc) + Tok->getLength();
1279 unsigned char C = *TokenEnd;
1280 while (isHorizontalWhitespace(C)) {
1281 C = *(++TokenEnd);
1282 NumWhitespaceChars++;
1283 }
1284
1285 // Skip \r, \n, \r\n, or \n\r
1286 if (C == '\n' || C == '\r') {
1287 char PrevC = C;
1288 C = *(++TokenEnd);
1289 NumWhitespaceChars++;
1290 if ((C == '\n' || C == '\r') && C != PrevC)
1291 NumWhitespaceChars++;
1292 }
1293 }
1294
1295 return TokenLoc.getLocWithOffset(Tok->getLength() + NumWhitespaceChars);
1296}
1297
1298/// getCharAndSizeSlow - Peek a single 'character' from the specified buffer,
1299/// get its size, and return it. This is tricky in several cases:
1300/// 1. If currently at the start of a trigraph, we warn about the trigraph,
1301/// then either return the trigraph (skipping 3 chars) or the '?',
1302/// depending on whether trigraphs are enabled or not.
1303/// 2. If this is an escaped newline (potentially with whitespace between
1304/// the backslash and newline), implicitly skip the newline and return
1305/// the char after it.
1306///
1307/// This handles the slow/uncommon case of the getCharAndSize method. Here we
1308/// know that we can accumulate into Size, and that we have already incremented
1309/// Ptr by Size bytes.
1310///
1311/// NOTE: When this method is updated, getCharAndSizeSlowNoWarn (below) should
1312/// be updated to match.
///
/// \param Ptr  points at the first byte of the character to decode.
/// \param Size running byte count; every byte consumed here is added to it.
/// \param Tok  optional token. When non-null it is flagged NeedsCleaning if an
///             escaped newline or a trigraph is consumed, and diagnostics may
///             be emitted. When null, nothing is flagged, the whitespace
///             warning is skipped, and DecodeTrigraphChar is handed a null
///             lexer.
/// \returns the decoded character.
///
/// NOTE(review): with a null \p Tok the trigraph is decoded by
/// DecodeTrigraphChar without looking at LangOpts.Trigraphs, whereas
/// getCharAndSizeSlowNoWarn checks that option first - confirm this
/// difference is intended.
1313char Lexer::getCharAndSizeSlow(const char *Ptr, unsigned &Size,
1314 Token *Tok) {
1315 // If we have a slash, look for an escaped newline.
1316 if (Ptr[0] == '\\') {
5
Taking true branch
1317 ++Size;
1318 ++Ptr;
// Also entered via 'goto Slash' from the trigraph branch below when the
// trigraph decodes to a backslash; Ptr and Size are already past it then.
1319Slash:
1320 // Common case, backslash-char where the char is not whitespace.
1321 if (!isWhitespace(Ptr[0])) return '\\';
6
Taking false branch
1322
1323 // See if we have optional whitespace characters between the slash and
1324 // newline.
1325 if (unsigned EscapedNewLineSize = getEscapedNewLineSize(Ptr)) {
7
Assuming 'EscapedNewLineSize' is not equal to 0
8
Taking true branch
1326 // Remember that this token needs to be cleaned.
1327 if (Tok) Tok->setFlag(Token::NeedsCleaning);
9
Taking true branch
10
Calling 'Token::setFlag'
1328
1329 // Warn if there was whitespace between the backslash and newline.
// (Ptr[0] is a newline character exactly when nothing separates them.)
1330 if (Ptr[0] != '\n' && Ptr[0] != '\r' && Tok && !isLexingRawMode())
1331 Diag(Ptr, diag::backslash_newline_space);
1332
1333 // Found backslash<whitespace><newline>. Parse the char after it.
1334 Size += EscapedNewLineSize;
1335 Ptr += EscapedNewLineSize;
1336
1337 // Use slow version to accumulate a correct size field.
// The recursion also handles several escaped newlines in a row.
1338 return getCharAndSizeSlow(Ptr, Size, Tok);
1339 }
1340
1341 // Otherwise, this is not an escaped newline, just return the slash.
1342 return '\\';
1343 }
1344
1345 // If this is a trigraph, process it.
1346 if (Ptr[0] == '?' && Ptr[1] == '?') {
1347 // If this is actually a legal trigraph (not something like "??x"), emit
1348 // a trigraph warning. If so, and if trigraphs are enabled, return it.
1349 if (char C = DecodeTrigraphChar(Ptr+2, Tok ? this : nullptr)) {
1350 // Remember that this token needs to be cleaned.
1351 if (Tok) Tok->setFlag(Token::NeedsCleaning);
1352
1353 Ptr += 3;
1354 Size += 3;
// A "??/" trigraph is a backslash and may start an escaped newline.
1355 if (C == '\\') goto Slash;
1356 return C;
1357 }
1358 }
1359
1360 // If this is neither, return a single character.
1361 ++Size;
1362 return *Ptr;
1363}
1364
1365/// getCharAndSizeSlowNoWarn - Handle the slow/uncommon case of the
1366/// getCharAndSizeNoWarn method. Here we know that we can accumulate into Size,
1367/// and that we have already incremented Ptr by Size bytes.
1368///
1369/// NOTE: When this method is updated, getCharAndSizeSlow (above) should
1370/// be updated to match.
///
/// \param Ptr      points at the first byte of the character to decode.
/// \param Size     running byte count; every byte consumed here is added.
/// \param LangOpts only its Trigraphs option is read here; trigraphs are
///                 decoded only when it is set.
/// \returns the decoded character. No token is flagged and no diagnostic is
///          emitted by this function.
1371char Lexer::getCharAndSizeSlowNoWarn(const char *Ptr, unsigned &Size,
1372 const LangOptions &LangOpts) {
1373 // If we have a slash, look for an escaped newline.
1374 if (Ptr[0] == '\\') {
1375 ++Size;
1376 ++Ptr;
// Also entered via 'goto Slash' from the trigraph branch below when the
// trigraph decodes to a backslash; Ptr and Size are already past it then.
1377Slash:
1378 // Common case, backslash-char where the char is not whitespace.
1379 if (!isWhitespace(Ptr[0])) return '\\';
1380
1381 // See if we have optional whitespace characters followed by a newline.
1382 if (unsigned EscapedNewLineSize = getEscapedNewLineSize(Ptr)) {
1383 // Found backslash<whitespace><newline>. Parse the char after it.
1384 Size += EscapedNewLineSize;
1385 Ptr += EscapedNewLineSize;
1386
1387 // Use slow version to accumulate a correct size field.
// The recursion also handles several escaped newlines in a row.
1388 return getCharAndSizeSlowNoWarn(Ptr, Size, LangOpts);
1389 }
1390
1391 // Otherwise, this is not an escaped newline, just return the slash.
1392 return '\\';
1393 }
1394
1395 // If this is a trigraph, process it.
1396 if (LangOpts.Trigraphs && Ptr[0] == '?' && Ptr[1] == '?') {
1397 // If this is actually a legal trigraph (not something like "??x"), return
1398 // it.
1399 if (char C = GetTrigraphCharForLetter(Ptr[2])) {
1400 Ptr += 3;
1401 Size += 3;
// A "??/" trigraph is a backslash and may start an escaped newline.
1402 if (C == '\\') goto Slash;
1403 return C;
1404 }
1405 }
1406
1407 // If this is neither, return a single character.
1408 ++Size;
1409 return *Ptr;
1410}
1411
1412//===----------------------------------------------------------------------===//
1413// Helper methods for lexing.
1414//===----------------------------------------------------------------------===//
1415
1416/// \brief Routine that indiscriminately sets the offset into the source file.
1417void Lexer::SetByteOffset(unsigned Offset, bool StartOfLine) {
1418 BufferPtr = BufferStart + Offset;
1419 if (BufferPtr > BufferEnd)
1420 BufferPtr = BufferEnd;
1421 // FIXME: What exactly does the StartOfLine bit mean? There are two
1422 // possible meanings for the "start" of the line: the first token on the
1423 // unexpanded line, or the first token on the expanded line.
1424 IsAtStartOfLine = StartOfLine;
1425 IsAtPhysicalStartOfLine = StartOfLine;
1426}
1427
/// Return true if code point C is allowed inside an identifier under LangOpts.
///
/// Always returns false when LangOpts.AsmPreprocessor is set. Otherwise C is
/// looked up in exactly one range table, selected in this order: the C11 table
/// when CPlusPlus11 or C11 is set, the C++03 table when CPlusPlus is set, and
/// the C99 table in every other case.
1428static bool isAllowedIDChar(uint32_t C, const LangOptions &LangOpts) {
1429 if (LangOpts.AsmPreprocessor) {
1430 return false;
1431 } else if (LangOpts.CPlusPlus11 || LangOpts.C11) {
     // Function-local static: the character set is built from the range table
     // the first time this branch is taken.
1432 static const llvm::sys::UnicodeCharSet C11AllowedIDChars(
1433 C11AllowedIDCharRanges);
1434 return C11AllowedIDChars.contains(C);
1435 } else if (LangOpts.CPlusPlus) {
1436 static const llvm::sys::UnicodeCharSet CXX03AllowedIDChars(
1437 CXX03AllowedIDCharRanges);
1438 return CXX03AllowedIDChars.contains(C);
1439 } else {
1440 static const llvm::sys::UnicodeCharSet C99AllowedIDChars(
1441 C99AllowedIDCharRanges);
1442 return C99AllowedIDChars.contains(C);
1443 }
1444}
1445
/// Return true if code point C, which must already satisfy isAllowedIDChar,
/// may also appear as the first character of an identifier under LangOpts.
///
/// For CPlusPlus11/C11 and for plain C (neither CPlusPlus11, C11 nor
/// CPlusPlus), the answer is "not in the corresponding disallowed-initial
/// table". For CPlusPlus without CPlusPlus11 every allowed character may start
/// an identifier.
1446static bool isAllowedInitiallyIDChar(uint32_t C, const LangOptions &LangOpts) {
     // Precondition check (shown here with the assert macro expanded).
     // NOTE(review): isAllowedIDChar returns false whenever AsmPreprocessor is
     // set, so with assertions enabled this check fails before the
     // AsmPreprocessor branch below can be reached.
1447 assert(isAllowedIDChar(C, LangOpts))(static_cast <bool> (isAllowedIDChar(C, LangOpts)) ? void
(0) : __assert_fail ("isAllowedIDChar(C, LangOpts)", "/build/llvm-toolchain-snapshot-7~svn329677/tools/clang/lib/Lex/Lexer.cpp"
, 1447, __extension__ __PRETTY_FUNCTION__))
;
1448 if (LangOpts.AsmPreprocessor) {
1449 return false;
1450 } else if (LangOpts.CPlusPlus11 || LangOpts.C11) {
1451 static const llvm::sys::UnicodeCharSet C11DisallowedInitialIDChars(
1452 C11DisallowedInitialIDCharRanges);
1453 return !C11DisallowedInitialIDChars.contains(C);
1454 } else if (LangOpts.CPlusPlus) {
1455 return true;
1456 } else {
1457 static const llvm::sys::UnicodeCharSet C99DisallowedInitialIDChars(
1458 C99DisallowedInitialIDCharRanges);
1459 return !C99DisallowedInitialIDChars.contains(C);
1460 }
1461}
1462
/// Build a character-based source range from two buffer pointers by mapping
/// Begin and End through L.getSourceLocation and passing both locations to
/// CharSourceRange::getCharRange.
1463static inline CharSourceRange makeCharRange(Lexer &L, const char *Begin,
1464 const char *End) {
1465 return CharSourceRange::getCharRange(L.getSourceLocation(Begin),
1466 L.getSourceLocation(End));
1467}
1468
/// Emit C99 / C++98 compatibility warnings for a Unicode identifier character.
///
/// \param Diags engine used both to test whether each warning is ignored at
/// Range.getBegin() and to report it.
/// \param C the code point being checked.
/// \param Range source range attached to any diagnostic that is reported.
/// \param IsFirst when true, C is additionally checked against the C99
/// disallowed-initial table.
1469static void maybeDiagnoseIDCharCompat(DiagnosticsEngine &Diags, uint32_t C,
1470 CharSourceRange Range, bool IsFirst) {
1471 // Check C99 compatibility.
1472 if (!Diags.isIgnored(diag::warn_c99_compat_unicode_id, Range.getBegin())) {
     // Values streamed into warn_c99_compat_unicode_id after the range;
     // presumably they select the wording of the message.
1473 enum {
1474 CannotAppearInIdentifier = 0,
1475 CannotStartIdentifier
1476 };
1477
1478 static const llvm::sys::UnicodeCharSet C99AllowedIDChars(
1479 C99AllowedIDCharRanges);
1480 static const llvm::sys::UnicodeCharSet C99DisallowedInitialIDChars(
1481 C99DisallowedInitialIDCharRanges);
1482 if (!C99AllowedIDChars.contains(C)) {
1483 Diags.Report(Range.getBegin(), diag::warn_c99_compat_unicode_id)
1484 << Range
1485 << CannotAppearInIdentifier;
1486 } else if (IsFirst && C99DisallowedInitialIDChars.contains(C)) {
1487 Diags.Report(Range.getBegin(), diag::warn_c99_compat_unicode_id)
1488 << Range
1489 << CannotStartIdentifier;
1490 }
1491 }
1492
1493 // Check C++98 compatibility.
1494 if (!Diags.isIgnored(diag::warn_cxx98_compat_unicode_id, Range.getBegin())) {
1495 static const llvm::sys::UnicodeCharSet CXX03AllowedIDChars(
1496 CXX03AllowedIDCharRanges);
1497 if (!CXX03AllowedIDChars.contains(C)) {
1498 Diags.Report(Range.getBegin(), diag::warn_cxx98_compat_unicode_id)
1499 << Range;
1500 }
1501 }
1502}
1503
1504/// After encountering UTF-8 character C and interpreting it as an identifier
1505/// character, check whether it's a homoglyph for a common non-identifier
1506/// source character that is unlikely to be an intentional identifier
1507/// character and warn if so.
///
/// \param Diags engine that receives diag::warn_utf8_symbol_homoglyph.
/// \param C the decoded code point.
/// \param Range source range attached to the diagnostic.
1508static void maybeDiagnoseUTF8Homoglyph(DiagnosticsEngine &Diags, uint32_t C,
1509 CharSourceRange Range) {
1510 // FIXME: Handle Unicode quotation marks (smart quotes, fullwidth quotes).
1511 struct HomoglyphPair {
1512 uint32_t Character;
1513 char LooksLike;
1514 bool operator<(HomoglyphPair R) const { return Character < R.Character; }
1515 };
     // Entries are in ascending order of Character, as the binary search below
     // requires. The final {0, 0} entry is a sentinel that is excluded from
     // the searched range.
1516 static constexpr HomoglyphPair SortedHomoglyphs[] = {
1517 {U'\u01c3', '!'}, // LATIN LETTER RETROFLEX CLICK
1518 {U'\u037e', ';'}, // GREEK QUESTION MARK
1519 {U'\u2212', '-'}, // MINUS SIGN
1520 {U'\u2215', '/'}, // DIVISION SLASH
1521 {U'\u2216', '\\'}, // SET MINUS
1522 {U'\u2217', '*'}, // ASTERISK OPERATOR
1523 {U'\u2223', '|'}, // DIVIDES
1524 {U'\u2227', '^'}, // LOGICAL AND
1525 {U'\u2236', ':'}, // RATIO
1526 {U'\u223c', '~'}, // TILDE OPERATOR
1527 {U'\ua789', ':'}, // MODIFIER LETTER COLON
1528 {U'\uff01', '!'}, // FULLWIDTH EXCLAMATION MARK
1529 {U'\uff03', '#'}, // FULLWIDTH NUMBER SIGN
1530 {U'\uff04', '$'}, // FULLWIDTH DOLLAR SIGN
1531 {U'\uff05', '%'}, // FULLWIDTH PERCENT SIGN
1532 {U'\uff06', '&'}, // FULLWIDTH AMPERSAND
1533 {U'\uff08', '('}, // FULLWIDTH LEFT PARENTHESIS
1534 {U'\uff09', ')'}, // FULLWIDTH RIGHT PARENTHESIS
1535 {U'\uff0a', '*'}, // FULLWIDTH ASTERISK
1536 {U'\uff0b', '+'}, // FULLWIDTH PLUS SIGN
1537 {U'\uff0c', ','}, // FULLWIDTH COMMA
1538 {U'\uff0d', '-'}, // FULLWIDTH HYPHEN-MINUS
1539 {U'\uff0e', '.'}, // FULLWIDTH FULL STOP
1540 {U'\uff0f', '/'}, // FULLWIDTH SOLIDUS
1541 {U'\uff1a', ':'}, // FULLWIDTH COLON
1542 {U'\uff1b', ';'}, // FULLWIDTH SEMICOLON
1543 {U'\uff1c', '<'}, // FULLWIDTH LESS-THAN SIGN
1544 {U'\uff1d', '='}, // FULLWIDTH EQUALS SIGN
1545 {U'\uff1e', '>'}, // FULLWIDTH GREATER-THAN SIGN
1546 {U'\uff1f', '?'}, // FULLWIDTH QUESTION MARK
1547 {U'\uff20', '@'}, // FULLWIDTH COMMERCIAL AT
1548 {U'\uff3b', '['}, // FULLWIDTH LEFT SQUARE BRACKET
1549 {U'\uff3c', '\\'}, // FULLWIDTH REVERSE SOLIDUS
1550 {U'\uff3d', ']'}, // FULLWIDTH RIGHT SQUARE BRACKET
1551 {U'\uff3e', '^'}, // FULLWIDTH CIRCUMFLEX ACCENT
1552 {U'\uff5b', '{'}, // FULLWIDTH LEFT CURLY BRACKET
1553 {U'\uff5c', '|'}, // FULLWIDTH VERTICAL LINE
1554 {U'\uff5d', '}'}, // FULLWIDTH RIGHT CURLY BRACKET
1555 {U'\uff5e', '~'}, // FULLWIDTH TILDE
1556 {0, 0}
1557 };
     // The search range stops one element short of the array end, so the
     // "not found" result points at the sentinel and the dereference below
     // always reads a real array element.
1558 auto Homoglyph =
1559 std::lower_bound(std::begin(SortedHomoglyphs),
1560 std::end(SortedHomoglyphs) - 1, HomoglyphPair{C, '\0'});
1561 if (Homoglyph->Character == C) {
     // Render C as upper-case hex with a width argument of 4 for the message.
1562 llvm::SmallString<5> CharBuf;
1563 {
1564 llvm::raw_svector_ostream CharOS(CharBuf);
1565 llvm::write_hex(CharOS, C, llvm::HexPrintStyle::Upper, 4);
1566 }
     // One-character, NUL-terminated string holding the ASCII look-alike.
1567 const char LooksLikeStr[] = {Homoglyph->LooksLike, 0};
1568 Diags.Report(Range.getBegin(), diag::warn_utf8_symbol_homoglyph)
1569 << Range << CharBuf << LooksLikeStr;
1570 }
1571}
1572
/// Try to consume a universal-character-name that continues an identifier.
///
/// \param CurPtr current position; advanced to the end of the UCN on success.
/// This function does not assign to it on the failure path.
/// \param Size added to CurPtr to form the position handed to tryReadUCN.
/// \param Result token that receives Token::HasUCN on success.
/// \returns false if tryReadUCN yields 0 or the code point is rejected by
/// isAllowedIDChar; true otherwise.
1573bool Lexer::tryConsumeIdentifierUCN(const char *&CurPtr, unsigned Size,
1574 Token &Result) {
1575 const char *UCNPtr = CurPtr + Size;
1576 uint32_t CodePoint = tryReadUCN(UCNPtr, CurPtr, /*Token=*/nullptr);
1577 if (CodePoint == 0 || !isAllowedIDChar(CodePoint, LangOpts))
1578 return false;
1579
1580 if (!isLexingRawMode())
1581 maybeDiagnoseIDCharCompat(PP->getDiagnostics(), CodePoint,
1582 makeCharRange(*this, CurPtr, UCNPtr),
1583 /*IsFirst=*/false);
1584
1585 Result.setFlag(Token::HasUCN);
     // Fast path: the spelling is exactly 6 bytes with 'u' or 10 bytes with
     // 'U' as its second byte, so CurPtr can jump straight to the end.
     // Otherwise step through with getAndAdvanceChar until UCNPtr is reached
     // (presumably the spelling then contains trigraphs or escaped newlines).
1586 if ((UCNPtr - CurPtr == 6 && CurPtr[1] == 'u') ||
1587 (UCNPtr - CurPtr == 10 && CurPtr[1] == 'U'))
1588 CurPtr = UCNPtr;
1589 else
1590 while (CurPtr != UCNPtr)
1591 (void)getAndAdvanceChar(CurPtr, Result);
1592 return true;
1593}
1594
/// Try to consume one UTF-8 encoded character that continues an identifier.
///
/// \param CurPtr current position; set to the end of the decoded sequence on
/// success and not assigned on failure (decoding works on a local copy).
/// \returns false if the bytes at CurPtr do not decode (strict conversion,
/// bounded by BufferEnd) or the code point is rejected by isAllowedIDChar;
/// true otherwise.
1595bool Lexer::tryConsumeIdentifierUTF8Char(const char *&CurPtr) {
1596 const char *UnicodePtr = CurPtr;
1597 llvm::UTF32 CodePoint;
1598 llvm::ConversionResult Result =
1599 llvm::convertUTF8Sequence((const llvm::UTF8 **)&UnicodePtr,
1600 (const llvm::UTF8 *)BufferEnd,
1601 &CodePoint,
1602 llvm::strictConversion);
1603 if (Result != llvm::conversionOK ||
1604 !isAllowedIDChar(static_cast<uint32_t>(CodePoint), LangOpts))
1605 return false;
1606
     // Outside raw mode, run both the language-compatibility check and the
     // homoglyph check on the accepted character.
1607 if (!isLexingRawMode()) {
1608 maybeDiagnoseIDCharCompat(PP->getDiagnostics(), CodePoint,
1609 makeCharRange(*this, CurPtr, UnicodePtr),
1610 /*IsFirst=*/false);
1611 maybeDiagnoseUTF8Homoglyph(PP->getDiagnostics(), CodePoint,
1612 makeCharRange(*this, CurPtr, UnicodePtr));
1613 }
1614
1615 CurPtr = UnicodePtr;
1616 return true;
1617}
1618
/// Lex the remainder of an identifier and form a tok::raw_identifier token.
///
/// A fast loop first skips plain identifier-body characters. If the character
/// that stops it is ASCII and is not '\\', '?' or (with DollarIdents) '$', the
/// token is finished immediately; otherwise a slower loop handles '$', UCNs
/// and UTF-8 characters.
///
/// \param Result token being formed.
/// \param CurPtr position just after the part of the identifier already
/// matched by the caller.
/// \returns true on the paths that return directly from this function; when
/// the identifier has isHandleIdentifierCase() set, the result of
/// PP->HandleIdentifier(Result) is returned instead.
1619bool Lexer::LexIdentifier(Token &Result, const char *CurPtr) {
1620 // Match [_A-Za-z0-9]*, we have already matched [_A-Za-z$]
     // NOTE(review): Size has no initializer; it is first handed to
     // getCharAndSize on the slow path below, which is presumed to set it.
1621 unsigned Size;
1622 unsigned char C = *CurPtr++;
1623 while (isIdentifierBody(C))
1624 C = *CurPtr++;
1625
1626 --CurPtr; // Back up over the skipped character.
1627
1628 // Fast path, no $,\,? in identifier found. '\' might be an escaped newline
1629 // or UCN, and ? might be a trigraph for '\', an escaped newline or UCN.
1630 //
1631 // TODO: Could merge these checks into an InfoTable flag to make the
1632 // comparison cheaper
1633 if (isASCII(C) && C != '\\' && C != '?' &&
1634 (C != '$' || !LangOpts.DollarIdents)) {
     // The slow path below also jumps to this label once it reaches the end
     // of the identifier.
1635FinishIdentifier:
1636 const char *IdStart = BufferPtr;
1637 FormTokenWithChars(Result, CurPtr, tok::raw_identifier);
1638 Result.setRawIdentifierData(IdStart);
1639
1640 // If we are in raw mode, return this identifier raw. There is no need to
1641 // look up identifier information or attempt to macro expand it.
1642 if (LexingRawMode)
1643 return true;
1644
1645 // Fill in Result.IdentifierInfo and update the token kind,
1646 // looking up the identifier in the identifier table.
1647 IdentifierInfo *II = PP->LookUpIdentifierInfo(Result);
1648
1649 // Finally, now that we know we have an identifier, pass this off to the
1650 // preprocessor, which may macro expand it or something.
1651 if (II->isHandleIdentifierCase())
1652 return PP->HandleIdentifier(Result);
1653
1654 if (II->getTokenID() == tok::identifier && isCodeCompletionPoint(CurPtr)
1655 && II->getPPKeywordID() == tok::pp_not_keyword
1656 && II->getObjCKeywordID() == tok::objc_not_keyword) {
1657 // Return the code-completion token.
1658 Result.setKind(tok::code_completion);
1659 cutOffLexing();
1660 return true;
1661 }
1662 return true;
1663 }
1664
1665 // Otherwise, $,\,? in identifier found. Enter slower path.
1666
1667 C = getCharAndSize(CurPtr, Size);
1668 while (true) {
1669 if (C == '$') {
1670 // If we hit a $ and they are not supported in identifiers, we are done.
1671 if (!LangOpts.DollarIdents) goto FinishIdentifier;
1672
1673 // Otherwise, emit a diagnostic and continue.
1674 if (!isLexingRawMode())
1675 Diag(CurPtr, diag::ext_dollar_in_identifier);
1676 CurPtr = ConsumeChar(CurPtr, Size, Result);
1677 C = getCharAndSize(CurPtr, Size);
1678 continue;
1679 } else if (C == '\\' && tryConsumeIdentifierUCN(CurPtr, Size, Result)) {
1680 C = getCharAndSize(CurPtr, Size);
1681 continue;
1682 } else if (!isASCII(C) && tryConsumeIdentifierUTF8Char(CurPtr)) {
1683 C = getCharAndSize(CurPtr, Size);
1684 continue;
1685 } else if (!isIdentifierBody(C)) {
1686 goto FinishIdentifier;
1687 }
1688
1689 // Otherwise, this character is good, consume it.
1690 CurPtr = ConsumeChar(CurPtr, Size, Result);
1691
1692 C = getCharAndSize(CurPtr, Size);
1693 while (isIdentifierBody(C)) {
1694 CurPtr = ConsumeChar(CurPtr, Size, Result);
1695 C = getCharAndSize(CurPtr, Size);
1696 }
1697 }
1698}
1699
1700/// isHexaLiteral - Return true if Start points to a hex constant, i.e. the
1701/// first character read is '0' and the next one is 'x' or 'X'. Relevant in
/// Microsoft mode (where a hex constant followed by e+/e- is supposed to be
/// several different tokens).
///
/// Both characters are read with getCharAndSizeNoWarn; the second read starts
/// Size bytes past Start, where Size was produced by the first read.
1702bool Lexer::isHexaLiteral(const char *Start, const LangOptions &LangOpts) {
1703 unsigned Size;
1704 char C1 = Lexer::getCharAndSizeNoWarn(Start, Size, LangOpts);
1705 if (C1 != '0')
1706 return false;
1707 char C2 = Lexer::getCharAndSizeNoWarn(Start + Size, Size, LangOpts);
1708 return (C2 == 'x' || C2 == 'X');
1709}
1710
1711/// LexNumericConstant - Lex the remainder of an integer or floating point
1712/// constant starting at CurPtr and form a tok::numeric_constant token in
1713/// Result. Every non-recursive exit visible here is `return true`.
///
/// NOTE(review): earlier wording said "From[-1] is the first character lexed"
/// and "Return the end of the constant"; there is no parameter named From and
/// the function returns bool. Presumably CurPtr[-1] is meant on the initial
/// call - confirm against the caller.
///
/// The function calls itself to continue after an exponent sign, a digit
/// separator, a UCN or a UTF-8 character.
1714bool Lexer::LexNumericConstant(Token &Result, const char *CurPtr) {
1715 unsigned Size;
1716 char C = getCharAndSize(CurPtr, Size);
     // PrevCh tracks the last consumed character so the sign checks below can
     // tell whether it directly follows an exponent letter.
1717 char PrevCh = 0;
1718 while (isPreprocessingNumberBody(C)) {
1719 CurPtr = ConsumeChar(CurPtr, Size, Result);
1720 PrevCh = C;
1721 C = getCharAndSize(CurPtr, Size);
1722 }
1723
1724 // If we fell out, check for a sign, due to 1e+12. If we have one, continue.
1725 if ((C == '-' || C == '+') && (PrevCh == 'E' || PrevCh == 'e')) {
1726 // If we are in Microsoft mode, don't continue if the constant is hex.
1727 // For example, MSVC will accept the following as 3 tokens: 0x1234567e+1
1728 if (!LangOpts.MicrosoftExt || !isHexaLiteral(BufferPtr, LangOpts))
1729 return LexNumericConstant(Result, ConsumeChar(CurPtr, Size, Result));
1730 }
1731
1732 // If we have a hex FP constant, continue.
1733 if ((C == '-' || C == '+') && (PrevCh == 'P' || PrevCh == 'p')) {
1734 // Outside C99 and C++17, we accept hexadecimal floating point numbers as a
1735 // not-quite-conforming extension. Only do so if this looks like it's
1736 // actually meant to be a hexfloat, and not if it has a ud-suffix.
1737 bool IsHexFloat = true;
1738 if (!LangOpts.C99) {
1739 if (!isHexaLiteral(BufferPtr, LangOpts))
1740 IsHexFloat = false;
     // An underscore anywhere in the text lexed so far is taken as the sign
     // of a ud-suffix.
1741 else if (!getLangOpts().CPlusPlus17 &&
1742 std::find(BufferPtr, CurPtr, '_') != CurPtr)
1743 IsHexFloat = false;
1744 }
1745 if (IsHexFloat)
1746 return LexNumericConstant(Result, ConsumeChar(CurPtr, Size, Result));
1747 }
1748
1749 // If we have a digit separator, continue.
1750 if (C == '\'' && getLangOpts().CPlusPlus14) {
1751 unsigned NextSize;
1752 char Next = getCharAndSizeNoWarn(CurPtr + Size, NextSize, getLangOpts());
     // The quote only counts as a separator when an identifier-body character
     // follows it; both characters are then consumed.
1753 if (isIdentifierBody(Next)) {
1754 if (!isLexingRawMode())
1755 Diag(CurPtr, diag::warn_cxx11_compat_digit_separator);
1756 CurPtr = ConsumeChar(CurPtr, Size, Result);
1757 CurPtr = ConsumeChar(CurPtr, NextSize, Result);
1758 return LexNumericConstant(Result, CurPtr);
1759 }
1760 }
1761
1762 // If we have a UCN or UTF-8 character (perhaps in a ud-suffix), continue.
1763 if (C == '\\' && tryConsumeIdentifierUCN(CurPtr, Size, Result))
1764 return LexNumericConstant(Result, CurPtr);
1765 if (!isASCII(C) && tryConsumeIdentifierUTF8Char(CurPtr))
1766 return LexNumericConstant(Result, CurPtr);
1767
1768 // Update the location of token as well as BufferPtr.
1769 const char *TokStart = BufferPtr;
1770 FormTokenWithChars(Result, CurPtr, tok::numeric_constant);
1771 Result.setLiteralData(TokStart);
1772 return true;
1773}
1774
1775/// LexUDSuffix - Lex the ud-suffix production for user-defined literal suffixes
1776/// in C++11, or warn on a ud-suffix in C++98.
///
/// \param Result token that receives Token::HasUDSuffix when a suffix is
/// accepted.
/// \param CurPtr position just past the literal's closing quote.
/// \param IsStringLiteral enables the C++14 look-ahead for standard suffixes
/// that do not start with an underscore.
/// \returns the position after the consumed suffix. When no suffix is
/// accepted the returned pointer is the incoming CurPtr, except that a leading
/// UCN/UTF-8 character already consumed before the pre-C++11 early return
/// stays consumed.
1777const char *Lexer::LexUDSuffix(Token &Result, const char *CurPtr,
1778 bool IsStringLiteral) {
1779 assert(getLangOpts().CPlusPlus)(static_cast <bool> (getLangOpts().CPlusPlus) ? void (0
) : __assert_fail ("getLangOpts().CPlusPlus", "/build/llvm-toolchain-snapshot-7~svn329677/tools/clang/lib/Lex/Lexer.cpp"
, 1779, __extension__ __PRETTY_FUNCTION__))
;
1780
1781 // Maximally munch an identifier.
1782 unsigned Size;
1783 char C = getCharAndSize(CurPtr, Size);
     // Set when the first suffix character was a UCN or UTF-8 character and
     // has therefore already been consumed.
1784 bool Consumed = false;
1785
1786 if (!isIdentifierHead(C)) {
1787 if (C == '\\' && tryConsumeIdentifierUCN(CurPtr, Size, Result))
1788 Consumed = true;
1789 else if (!isASCII(C) && tryConsumeIdentifierUTF8Char(CurPtr))
1790 Consumed = true;
1791 else
1792 return CurPtr;
1793 }
1794
     // Before C++11 a suffix is only diagnosed (with a fix-it inserting a
     // space); the HasUDSuffix flag is not set.
1795 if (!getLangOpts().CPlusPlus11) {
1796 if (!isLexingRawMode())
1797 Diag(CurPtr,
1798 C == '_' ? diag::warn_cxx11_compat_user_defined_literal
1799 : diag::warn_cxx11_compat_reserved_user_defined_literal)
1800 << FixItHint::CreateInsertion(getSourceLocation(CurPtr), " ");
1801 return CurPtr;
1802 }
1803
1804 // C++11 [lex.ext]p10, [usrlit.suffix]p1: A program containing a ud-suffix
1805 // that does not start with an underscore is ill-formed. As a conforming
1806 // extension, we treat all such suffixes as if they had whitespace before
1807 // them. We assume a suffix beginning with a UCN or UTF-8 character is more
1808 // likely to be a ud-suffix than a macro, however, and accept that.
1809 if (!Consumed) {
1810 bool IsUDSuffix = false;
1811 if (C == '_')
1812 IsUDSuffix = true;
1813 else if (IsStringLiteral && getLangOpts().CPlusPlus14) {
1814 // In C++1y, we need to look ahead a few characters to see if this is a
1815 // valid suffix for a string literal or a numeric literal (this could be
1816 // the 'operator""if' defining a numeric literal operator).
1817 const unsigned MaxStandardSuffixLength = 3;
1818 char Buffer[MaxStandardSuffixLength] = { C };
     // Note: this unsigned Consumed is a byte offset for the look-ahead and
     // shadows the outer bool Consumed within this branch.
1819 unsigned Consumed = Size;
1820 unsigned Chars = 1;
1821 while (true) {
1822 unsigned NextSize;
1823 char Next = getCharAndSizeNoWarn(CurPtr + Consumed, NextSize,
1824 getLangOpts());
1825 if (!isIdentifierBody(Next)) {
1826 // End of suffix. Check whether this is on the whitelist.
1827 const StringRef CompleteSuffix(Buffer, Chars);
1828 IsUDSuffix = StringLiteralParser::isValidUDSuffix(getLangOpts(),
1829 CompleteSuffix);
1830 break;
1831 }
1832
1833 if (Chars == MaxStandardSuffixLength)
1834 // Too long: can't be a standard suffix.
1835 break;
1836
1837 Buffer[Chars++] = Next;
1838 Consumed += NextSize;
1839 }
1840 }
1841
1842 if (!IsUDSuffix) {
1843 if (!isLexingRawMode())
1844 Diag(CurPtr, getLangOpts().MSVCCompat
1845 ? diag::ext_ms_reserved_user_defined_literal
1846 : diag::ext_reserved_user_defined_literal)
1847 << FixItHint::CreateInsertion(getSourceLocation(CurPtr), " ");
1848 return CurPtr;
1849 }
1850
1851 CurPtr = ConsumeChar(CurPtr, Size, Result);
1852 }
1853
     // The first suffix character is consumed at this point; mark the token
     // and take the rest of the identifier.
1854 Result.setFlag(Token::HasUDSuffix);
1855 while (true) {
1856 C = getCharAndSize(CurPtr, Size);
1857 if (isIdentifierBody(C)) { CurPtr = ConsumeChar(CurPtr, Size, Result); }
1858 else if (C == '\\' && tryConsumeIdentifierUCN(CurPtr, Size, Result)) {}
1859 else if (!isASCII(C) && tryConsumeIdentifierUTF8Char(CurPtr)) {}
1860 else break;
1861 }
1862
1863 return CurPtr;
1864}
1865
1866/// LexStringLiteral - Lex the remainder of a string literal, after having lexed
1867/// either " or L" or u8" or u" or U".
///
/// \param Result token being formed.
/// \param CurPtr position just after the opening quote.
/// \param Kind token kind to give a well-formed literal.
/// \returns true on every path. An unterminated literal, or one cut short at
/// a code-completion point, is formed as tok::unknown.
1868bool Lexer::LexStringLiteral(Token &Result, const char *CurPtr,
1869 tok::TokenKind Kind) {
1870 // Does this string contain the \0 character?
1871 const char *NulCharacter = nullptr;
1872
1873 if (!isLexingRawMode() &&
1874 (Kind == tok::utf8_string_literal ||
1875 Kind == tok::utf16_string_literal ||
1876 Kind == tok::utf32_string_literal))
1877 Diag(BufferPtr, getLangOpts().CPlusPlus
1878 ? diag::warn_cxx98_compat_unicode_literal
1879 : diag::warn_c99_compat_unicode_literal);
1880
1881 char C = getAndAdvanceChar(CurPtr, Result);
1882 while (C != '"') {
1883 // Skip escaped characters. Escaped newlines will already be processed by
1884 // getAndAdvanceChar.
1885 if (C == '\\')
1886 C = getAndAdvanceChar(CurPtr, Result);
1887
1888 if (C == '\n' || C == '\r' || // Newline.
1889 (C == 0 && CurPtr-1 == BufferEnd)) { // End of file.
1890 if (!isLexingRawMode() && !LangOpts.AsmPreprocessor)
1891 Diag(BufferPtr, diag::ext_unterminated_char_or_string) << 1;
1892 FormTokenWithChars(Result, CurPtr-1, tok::unknown);
1893 return true;
1894 }
1895
     // A NUL that is not at BufferEnd: either the code-completion point or an
     // embedded NUL, which is remembered and diagnosed after the loop.
1896 if (C == 0) {
1897 if (isCodeCompletionPoint(CurPtr-1)) {
1898 PP->CodeCompleteNaturalLanguage();
1899 FormTokenWithChars(Result, CurPtr-1, tok::unknown);
1900 cutOffLexing();
1901 return true;
1902 }
1903
1904 NulCharacter = CurPtr-1;
1905 }
1906 C = getAndAdvanceChar(CurPtr, Result);
1907 }
1908
1909 // In any C++ mode, lex the optional ud-suffix (LexUDSuffix itself only
     // warns, rather than accepting the suffix, before C++11).
1910 if (getLangOpts().CPlusPlus)
1911 CurPtr = LexUDSuffix(Result, CurPtr, true);
1912
1913 // If a nul character existed in the string, warn about it.
1914 if (NulCharacter && !isLexingRawMode())
1915 Diag(NulCharacter, diag::null_in_char_or_string) << 1;
1916
1917 // Update the location of the token as well as the BufferPtr instance var.
1918 const char *TokStart = BufferPtr;
1919 FormTokenWithChars(Result, CurPtr, Kind);
1920 Result.setLiteralData(TokStart);
1921 return true;
1922}
1923
1924/// LexRawStringLiteral - Lex the remainder of a raw string literal, after
1925/// having lexed R", LR", u8R", uR", or UR".
///
/// \param Result token being formed.
/// \param CurPtr position just after the opening quote, i.e. at the start of
/// the delimiter.
/// \param Kind token kind to give a well-formed literal.
/// \returns true on every path. An invalid delimiter or an unterminated
/// literal is formed as tok::unknown.
1926bool Lexer::LexRawStringLiteral(Token &Result, const char *CurPtr,
1927 tok::TokenKind Kind) {
1928 // This function doesn't use getAndAdvanceChar because C++0x [lex.pptoken]p3:
1929 // Between the initial and final double quote characters of the raw string,
1930 // any transformations performed in phases 1 and 2 (trigraphs,
1931 // universal-character-names, and line splicing) are reverted.
1932
1933 if (!isLexingRawMode())
1934 Diag(BufferPtr, diag::warn_cxx98_compat_raw_string_literal);
1935
1936 unsigned PrefixLen = 0;
1937
     // Scan at most 16 delimiter characters; reaching 16 without then seeing
     // '(' is reported below as err_raw_delim_too_long.
1938 while (PrefixLen != 16 && isRawStringDelimBody(CurPtr[PrefixLen]))
1939 ++PrefixLen;
1940
1941 // If the last character was not a '(', then we didn't lex a valid delimiter.
1942 if (CurPtr[PrefixLen] != '(') {
1943 if (!isLexingRawMode()) {
1944 const char *PrefixEnd = &CurPtr[PrefixLen];
1945 if (PrefixLen == 16) {
1946 Diag(PrefixEnd, diag::err_raw_delim_too_long);
1947 } else {
1948 Diag(PrefixEnd, diag::err_invalid_char_raw_delim)
1949 << StringRef(PrefixEnd, 1);
1950 }
1951 }
1952
1953 // Search for the next '"' in hopes of salvaging the lexer. Unfortunately,
1954 // it's possible the '"' was intended to be part of the raw string, but
1955 // there's not much we can do about that.
1956 while (true) {
1957 char C = *CurPtr++;
1958
1959 if (C == '"')
1960 break;
1961 if (C == 0 && CurPtr-1 == BufferEnd) {
1962 --CurPtr;
1963 break;
1964 }
1965 }
1966
1967 FormTokenWithChars(Result, CurPtr, tok::unknown);
1968 return true;
1969 }
1970
1971 // Save prefix and move CurPtr past it
1972 const char *Prefix = CurPtr;
1973 CurPtr += PrefixLen + 1; // skip over prefix and '('
1974
1975 while (true) {
1976 char C = *CurPtr++;
1977
1978 if (C == ')') {
1979 // Check for prefix match and closing quote.
1980 if (strncmp(CurPtr, Prefix, PrefixLen) == 0 && CurPtr[PrefixLen] == '"') {
1981 CurPtr += PrefixLen + 1; // skip over prefix and '"'
1982 break;
1983 }
1984 } else if (C == 0 && CurPtr-1 == BufferEnd) { // End of file.
1985 if (!isLexingRawMode())
1986 Diag(BufferPtr, diag::err_unterminated_raw_string)
1987 << StringRef(Prefix, PrefixLen);
1988 FormTokenWithChars(Result, CurPtr-1, tok::unknown);
1989 return true;
1990 }
1991 }
1992
1993 // In any C++ mode, lex the optional ud-suffix (LexUDSuffix itself only
     // warns, rather than accepting the suffix, before C++11).
1994 if (getLangOpts().CPlusPlus)
1995 CurPtr = LexUDSuffix(Result, CurPtr, true);
1996
1997 // Update the location of token as well as BufferPtr.
1998 const char *TokStart = BufferPtr;
1999 FormTokenWithChars(Result, CurPtr, Kind);
2000 Result.setLiteralData(TokStart);
2001 return true;
2002}
2003
2004/// LexAngledStringLiteral - Lex the remainder of an angled string literal,
2005/// after having lexed the '<' character. This is used for #include filenames.
///
/// \param Result token being formed.
/// \param CurPtr position just after the '<'.
/// \returns true on every path. If no closing '>' is found before a newline,
/// end of file or code-completion point, the token is formed as tok::less
/// ending at the position just after the '<'.
2006bool Lexer::LexAngledStringLiteral(Token &Result, const char *CurPtr) {
2007 // Does this string contain the \0 character?
2008 const char *NulCharacter = nullptr;
     // Remember where the filename started so an unterminated one can be
     // re-formed as a lone '<'.
2009 const char *AfterLessPos = CurPtr;
2010 char C = getAndAdvanceChar(CurPtr, Result);
2011 while (C != '>') {
2012 // Skip escaped characters. Escaped newlines will already be processed by
2013 // getAndAdvanceChar.
2014 if (C == '\\')
2015 C = getAndAdvanceChar(CurPtr, Result);
2016
2017 if (C == '\n' || C == '\r' || // Newline.
2018 (C == 0 && (CurPtr-1 == BufferEnd || // End of file.
2019 isCodeCompletionPoint(CurPtr-1)))) {
2020 // If the filename is unterminated, then it must just be a lone <
2021 // character. Return this as such.
2022 FormTokenWithChars(Result, AfterLessPos, tok::less);
2023 return true;
2024 }
2025
2026 if (C == 0) {
2027 NulCharacter = CurPtr-1;
2028 }
2029 C = getAndAdvanceChar(CurPtr, Result);
2030 }
2031
2032 // If a nul character existed in the string, warn about it.
2033 if (NulCharacter && !isLexingRawMode())
2034 Diag(NulCharacter, diag::null_in_char_or_string) << 1;
2035
2036 // Update the location of token as well as BufferPtr.
2037 const char *TokStart = BufferPtr;
2038 FormTokenWithChars(Result, CurPtr, tok::angle_string_literal);
2039 Result.setLiteralData(TokStart);
2040 return true;
2041}
2042
2043/// LexCharConstant - Lex the remainder of a character constant, after having
2044/// lexed either ' or L' or u8' or u' or U'.
///
/// \param Result token being formed.
/// \param CurPtr position just after the opening quote.
/// \param Kind token kind to give a well-formed constant.
/// \returns true on every path. An empty or unterminated constant, or one cut
/// short at a code-completion point, is formed as tok::unknown.
2045bool Lexer::LexCharConstant(Token &Result, const char *CurPtr,
2046 tok::TokenKind Kind) {
2047 // Does this character contain the \0 character?
2048 const char *NulCharacter = nullptr;
2049
2050 if (!isLexingRawMode()) {
2051 if (Kind == tok::utf16_char_constant || Kind == tok::utf32_char_constant)
2052 Diag(BufferPtr, getLangOpts().CPlusPlus
2053 ? diag::warn_cxx98_compat_unicode_literal
2054 : diag::warn_c99_compat_unicode_literal);
2055 else if (Kind == tok::utf8_char_constant)
2056 Diag(BufferPtr, diag::warn_cxx14_compat_u8_character_literal);
2057 }
2058
2059 char C = getAndAdvanceChar(CurPtr, Result);
     // A closing quote right after the opening one is an empty constant.
2060 if (C == '\'') {
2061 if (!isLexingRawMode() && !LangOpts.AsmPreprocessor)
2062 Diag(BufferPtr, diag::ext_empty_character);
2063 FormTokenWithChars(Result, CurPtr, tok::unknown);
2064 return true;
2065 }
2066
2067 while (C != '\'') {
2068 // Skip escaped characters.
2069 if (C == '\\')
2070 C = getAndAdvanceChar(CurPtr, Result);
2071
2072 if (C == '\n' || C == '\r' || // Newline.
2073 (C == 0 && CurPtr-1 == BufferEnd)) { // End of file.
2074 if (!isLexingRawMode() && !LangOpts.AsmPreprocessor)
2075 Diag(BufferPtr, diag::ext_unterminated_char_or_string) << 0;
2076 FormTokenWithChars(Result, CurPtr-1, tok::unknown);
2077 return true;
2078 }
2079
2080 if (C == 0) {
2081 if (isCodeCompletionPoint(CurPtr-1)) {
2082 PP->CodeCompleteNaturalLanguage();
2083 FormTokenWithChars(Result, CurPtr-1, tok::unknown);
2084 cutOffLexing();
2085 return true;
2086 }
2087
2088 NulCharacter = CurPtr-1;
2089 }
2090 C = getAndAdvanceChar(CurPtr, Result);
2091 }
2092
2093 // In any C++ mode, lex the optional ud-suffix (LexUDSuffix itself only
     // warns, rather than accepting the suffix, before C++11).
2094 if (getLangOpts().CPlusPlus)
2095 CurPtr = LexUDSuffix(Result, CurPtr, false);
2096
2097 // If a nul character existed in the character, warn about it.
2098 if (NulCharacter && !isLexingRawMode())
2099 Diag(NulCharacter, diag::null_in_char_or_string) << 0;
2100
2101 // Update the location of token as well as BufferPtr.
2102 const char *TokStart = BufferPtr;
2103 FormTokenWithChars(Result, CurPtr, Kind);
2104 Result.setLiteralData(TokStart);
2105 return true;
2106}
2107
2108/// SkipWhitespace - Efficiently skip over a series of whitespace characters.
2109/// Update BufferPtr to point to the next non-whitespace character and return.
2110///
2111/// This method forms a token and returns true if KeepWhitespaceMode is enabled.
///
/// \param Result token whose LeadingSpace/StartOfLine flags are updated, or
/// which is formed as tok::unknown in keep-whitespace mode.
/// \param CurPtr position just after the first whitespace character;
/// CurPtr[-1] is read to seed the "saw a newline" state.
/// \param TokAtPhysicalStartOfLine set to true when a newline was skipped and
/// no token is being returned.
/// \returns true only in keep-whitespace mode; false otherwise, including the
/// early exit at a newline inside a preprocessor directive.
2112bool Lexer::SkipWhitespace(Token &Result, const char *CurPtr,
2113 bool &TokAtPhysicalStartOfLine) {
2114 // Whitespace - Skip it, then return the token after the whitespace.
2115 bool SawNewline = isVerticalWhitespace(CurPtr[-1]);
2116
2117 unsigned char Char = *CurPtr;
2118
2119 // Skip consecutive spaces efficiently.
2120 while (true) {
2121 // Skip horizontal whitespace very aggressively.
2122 while (isHorizontalWhitespace(Char))
2123 Char = *++CurPtr;
2124
2125 // Otherwise if we have something other than whitespace, we're done.
2126 if (!isVerticalWhitespace(Char))
2127 break;
2128
2129 if (ParsingPreprocessorDirective) {
2130 // End of preprocessor directive line, let LexTokenInternal handle this.
2131 BufferPtr = CurPtr;
2132 return false;
2133 }
2134
2135 // OK, but handle newline.
2136 SawNewline = true;
2137 Char = *++CurPtr;
2138 }
2139
2140 // If the client wants us to return whitespace, return it now.
2141 if (isKeepWhitespaceMode()) {
2142 FormTokenWithChars(Result, CurPtr, tok::unknown);
2143 if (SawNewline) {
2144 IsAtStartOfLine = true;
2145 IsAtPhysicalStartOfLine = true;
2146 }
2147 // FIXME: The next token will not have LeadingSpace set.
2148 return true;
2149 }
2150
2151 // If this isn't immediately after a newline, there is leading space.
2152 char PrevChar = CurPtr[-1];
2153 bool HasLeadingSpace = !isVerticalWhitespace(PrevChar);
2154
2155 Result.setFlagValue(Token::LeadingSpace, HasLeadingSpace);
2156 if (SawNewline) {
2157 Result.setFlag(Token::StartOfLine);
2158 TokAtPhysicalStartOfLine = true;
2159 }
2160
2161 BufferPtr = CurPtr;
2162 return false;
2163}
2164
2165/// We have just read the // characters from input. Skip until we find the
2166/// newline character that terminates the comment. Then update BufferPtr and
2167/// return.
2168///
2169/// If we're in KeepCommentMode or any CommentHandler has inserted
2170/// some tokens, this will store the first token and return true.
///
/// \param Result token whose flags are updated, or which is filled in when a
/// token has to be returned.
/// \param CurPtr position inside the comment, after the leading "//".
/// \param TokAtPhysicalStartOfLine set to true when the terminating newline
/// is consumed.
/// \returns true if a token was stored in Result (comment handler or
/// keep-comment mode); false otherwise, including when lexing is cut off at a
/// code-completion point.
2171bool Lexer::SkipLineComment(Token &Result, const char *CurPtr,
2172 bool &TokAtPhysicalStartOfLine) {
2173 // If Line comments aren't explicitly enabled for this language, emit an
2174 // extension warning.
2175 if (!LangOpts.LineComment && !isLexingRawMode()) {
2176 Diag(BufferPtr, diag::ext_line_comment);
2177
2178 // Mark them enabled so we only emit one warning for this translation
2179 // unit.
     // Note that this writes to the lexer's LangOpts.
2180 LangOpts.LineComment = true;
2181 }
2182
2183 // Scan over the body of the comment. The common case, when scanning, is that
2184 // the comment contains normal ascii characters with nothing interesting in
2185 // them. As such, optimize for this case with the inner loop.
2186 //
2187 // This loop terminates with CurPtr pointing at the newline (or end of buffer)
2188 // character that ends the line comment.
2189 char C;
2190 while (true) {
2191 C = *CurPtr;
2192 // Skip over characters in the fast loop.
2193 while (C != 0 && // Potentially EOF.
2194 C != '\n' && C != '\r') // Newline or DOS-style newline.
2195 C = *++CurPtr;
2196
     // Remember where the fast loop stopped; the "nothing special" exit below
     // restores CurPtr to this position.
2197 const char *NextLine = CurPtr;
2198 if (C != 0) {
2199 // We found a newline, see if it's escaped.
2200 const char *EscapePtr = CurPtr-1;
2201 bool HasSpace = false;
2202 while (isHorizontalWhitespace(*EscapePtr)) { // Skip whitespace.
2203 --EscapePtr;
2204 HasSpace = true;
2205 }
2206
2207 if (*EscapePtr == '\\')
2208 // Escaped newline.
2209 CurPtr = EscapePtr;
2210 else if (EscapePtr[0] == '/' && EscapePtr[-1] == '?' &&
2211 EscapePtr[-2] == '?' && LangOpts.Trigraphs)
2212 // Trigraph-escaped newline.
2213 CurPtr = EscapePtr-2;
2214 else
2215 break; // This is a newline, we're done.
2216
2217 // If there was space between the backslash and newline, warn about it.
2218 if (HasSpace && !isLexingRawMode())
2219 Diag(EscapePtr, diag::backslash_newline_space);
2220 }
2221
2222 // Otherwise, this is a hard case. Fall back on getAndAdvanceChar to
2223 // properly decode the character. Read it in raw mode to avoid emitting
2224 // diagnostics about things like trigraphs. If we see an escaped newline,
2225 // we'll handle it below.
2226 const char *OldPtr = CurPtr;
2227 bool OldRawMode = isLexingRawMode();
2228 LexingRawMode = true;
2229 C = getAndAdvanceChar(CurPtr, Result);
2230 LexingRawMode = OldRawMode;
2231
2232 // If we only read only one character, then no special handling is needed.
2233 // We're done and can skip forward to the newline.
2234 if (C != 0 && CurPtr == OldPtr+1) {
2235 CurPtr = NextLine;
2236 break;
2237 }
2238
2239 // If we read multiple characters, and one of those characters was a \r or
2240 // \n, then we had an escaped newline within the comment. Emit diagnostic
2241 // unless the next line is also a // comment.
2242 if (CurPtr != OldPtr + 1 && C != '/' &&
2243 (CurPtr == BufferEnd + 1 || CurPtr[0] != '/')) {
2244 for (; OldPtr != CurPtr; ++OldPtr)
2245 if (OldPtr[0] == '\n' || OldPtr[0] == '\r') {
2246 // Okay, we found a // comment that ends in a newline, if the next
2247 // line is also a // comment, but has spaces, don't emit a diagnostic.
2248 if (isWhitespace(C)) {
2249 const char *ForwardPtr = CurPtr;
2250 while (isWhitespace(*ForwardPtr)) // Skip whitespace.
2251 ++ForwardPtr;
2252 if (ForwardPtr[0] == '/' && ForwardPtr[1] == '/')
2253 break;
2254 }
2255
2256 if (!isLexingRawMode())
2257 Diag(OldPtr-1, diag::ext_multi_line_line_comment);
2258 break;
2259 }
2260 }
2261
     // getAndAdvanceChar stepped over the terminator (or past the end of the
     // buffer); back up so CurPtr points at it.
2262 if (C == '\r' || C == '\n' || CurPtr == BufferEnd + 1) {
2263 --CurPtr;
2264 break;
2265 }
2266
2267 if (C == '\0' && isCodeCompletionPoint(CurPtr-1)) {
2268 PP->CodeCompleteNaturalLanguage();
2269 cutOffLexing();
2270 return false;
2271 }
2272 }
2273
2274 // Found but did not consume the newline. Notify comment handlers about the
2275 // comment unless we're in a #if 0 block.
2276 if (PP && !isLexingRawMode() &&
2277 PP->HandleComment(Result, SourceRange(getSourceLocation(BufferPtr),
2278 getSourceLocation(CurPtr)))) {
2279 BufferPtr = CurPtr;
2280 return true; // A token has to be returned.
2281 }
2282
2283 // If we are returning comments as tokens, return this comment as a token.
2284 if (inKeepCommentMode())
2285 return SaveLineComment(Result, CurPtr);
2286
2287 // If we are inside a preprocessor directive and we see the end of line,
2288 // return immediately, so that the lexer can return this as an EOD token.
2289 if (ParsingPreprocessorDirective || CurPtr == BufferEnd) {
2290 BufferPtr = CurPtr;
2291 return false;
2292 }
2293
2294 // Otherwise, eat the \n character. We don't care if this is a \n\r or
2295 // \r\n sequence. This is an efficiency hack (because we know the \n can't
2296 // contribute to another token), it isn't needed for correctness. Note that
2297 // this is ok even in KeepWhitespaceMode, because we would have returned the
2298 // comment above in that mode.
2299 ++CurPtr;
2300
2301 // The next returned token is at the start of the line.
2302 Result.setFlag(Token::StartOfLine);
2303 TokAtPhysicalStartOfLine = true;
2304 // No leading whitespace seen so far.
2305 Result.clearFlag(Token::LeadingSpace);
2306 BufferPtr = CurPtr;
2307 return false;
2308}
2309
2310/// If in save-comment mode, package up this Line comment in an appropriate
2311/// way and return it.
2312bool Lexer::SaveLineComment(Token &Result, const char *CurPtr) {
2313 // If we're not in a preprocessor directive, just return the // comment
2314 // directly.
2315 FormTokenWithChars(Result, CurPtr, tok::comment);
2316
2317 if (!ParsingPreprocessorDirective || LexingRawMode)
2318 return true;
2319
2320 // If this Line-style comment is in a macro definition, transmogrify it into
2321 // a C-style block comment.
2322 bool Invalid = false;
2323 std::string Spelling = PP->getSpelling(Result, &Invalid);
2324 if (Invalid)
2325 return true;
2326
2327 assert(Spelling[0] == '/' && Spelling[1] == '/' && "Not line comment?")(static_cast <bool> (Spelling[0] == '/' && Spelling
[1] == '/' && "Not line comment?") ? void (0) : __assert_fail
("Spelling[0] == '/' && Spelling[1] == '/' && \"Not line comment?\""
, "/build/llvm-toolchain-snapshot-7~svn329677/tools/clang/lib/Lex/Lexer.cpp"
, 2327, __extension__ __PRETTY_FUNCTION__))
;
2328 Spelling[1] = '*'; // Change prefix to "/*".
2329 Spelling += "*/"; // add suffix.
2330
2331 Result.setKind(tok::comment);
2332 PP->CreateString(Spelling, Result,
2333 Result.getLocation(), Result.getLocation());
2334 return true;
2335}
2336
2337/// isBlockCommentEndOfEscapedNewLine - Return true if the specified newline
2338/// character (either \\n or \\r) is part of an escaped newline sequence. Issue
2339/// a diagnostic if so. We know that the newline is inside of a block comment.
2340static bool isEndOfBlockCommentWithEscapedNewLine(const char *CurPtr,
2341 Lexer *L) {
2342 assert(CurPtr[0] == '\n' || CurPtr[0] == '\r')(static_cast <bool> (CurPtr[0] == '\n' || CurPtr[0] == '\r'
) ? void (0) : __assert_fail ("CurPtr[0] == '\\n' || CurPtr[0] == '\\r'"
, "/build/llvm-toolchain-snapshot-7~svn329677/tools/clang/lib/Lex/Lexer.cpp"
, 2342, __extension__ __PRETTY_FUNCTION__))
;
2343
2344 // Back up off the newline.
2345 --CurPtr;
2346
2347 // If this is a two-character newline sequence, skip the other character.
2348 if (CurPtr[0] == '\n' || CurPtr[0] == '\r') {
2349 // \n\n or \r\r -> not escaped newline.
2350 if (CurPtr[0] == CurPtr[1])
2351 return false;
2352 // \n\r or \r\n -> skip the newline.
2353 --CurPtr;
2354 }
2355
2356 // If we have horizontal whitespace, skip over it. We allow whitespace
2357 // between the slash and newline.
2358 bool HasSpace = false;
2359 while (isHorizontalWhitespace(*CurPtr) || *CurPtr == 0) {
2360 --CurPtr;
2361 HasSpace = true;
2362 }
2363
2364 // If we have a slash, we know this is an escaped newline.
2365 if (*CurPtr == '\\') {
2366 if (CurPtr[-1] != '*') return false;
2367 } else {
2368 // It isn't a slash, is it the ?? / trigraph?
2369 if (CurPtr[0] != '/' || CurPtr[-1] != '?' || CurPtr[-2] != '?' ||
2370 CurPtr[-3] != '*')
2371 return false;
2372
2373 // This is the trigraph ending the comment. Emit a stern warning!
2374 CurPtr -= 2;
2375
2376 // If no trigraphs are enabled, warn that we ignored this trigraph and
2377 // ignore this * character.
2378 if (!L->getLangOpts().Trigraphs) {
2379 if (!L->isLexingRawMode())
2380 L->Diag(CurPtr, diag::trigraph_ignored_block_comment);
2381 return false;
2382 }
2383 if (!L->isLexingRawMode())
2384 L->Diag(CurPtr, diag::trigraph_ends_block_comment);
2385 }
2386
2387 // Warn about having an escaped newline between the */ characters.
2388 if (!L->isLexingRawMode())
2389 L->Diag(CurPtr, diag::escaped_newline_block_comment_end);
2390
2391 // If there was space between the backslash and newline, warn about it.
2392 if (HasSpace && !L->isLexingRawMode())
2393 L->Diag(CurPtr, diag::backslash_newline_space);
2394
2395 return true;
2396}
2397
// Vector intrinsics for the fast '/' scan in Lexer::SkipBlockComment.
// The guard must test the compiler-provided __SSE2__ macro; any other
// spelling is never defined and silently disables the SSE2 header.
#ifdef __SSE2__
#include <emmintrin.h>
#elif __ALTIVEC__
#include <altivec.h>
// NOTE(review): presumably altivec.h can leave `bool` defined as a macro;
// drop it so the C++ keyword is usable below - confirm against the toolchain.
#undef bool
#endif
2404
2405/// We have just read from input the / and * characters that started a comment.
2406/// Read until we find the * and / characters that terminate the comment.
2407/// Note that we don't bother decoding trigraphs or escaped newlines in block
2408/// comments, because they cannot cause the comment to end. The only thing
2409/// that can happen is the comment could end with an escaped newline between
2410/// the terminating * and /.
2411///
2412/// If we're in KeepCommentMode or any CommentHandler has inserted
2413/// some tokens, this will store the first token and return true.
2414bool Lexer::SkipBlockComment(Token &Result, const char *CurPtr,
2415 bool &TokAtPhysicalStartOfLine) {
2416 // Scan one character past where we should, looking for a '/' character. Once
2417 // we find it, check to see if it was preceded by a *. This common
2418 // optimization helps people who like to put a lot of * characters in their
2419 // comments.
2420
2421 // The first character we get with newlines and trigraphs skipped to handle
2422 // the degenerate /*/ case below correctly if the * has an escaped newline
2423 // after it.
2424 unsigned CharSize;
2425 unsigned char C = getCharAndSize(CurPtr, CharSize);
2426 CurPtr += CharSize;
2427 if (C == 0 && CurPtr == BufferEnd+1) {
2428 if (!isLexingRawMode())
2429 Diag(BufferPtr, diag::err_unterminated_block_comment);
2430 --CurPtr;
2431
2432 // KeepWhitespaceMode should return this broken comment as a token. Since
2433 // it isn't a well formed comment, just return it as an 'unknown' token.
2434 if (isKeepWhitespaceMode()) {
2435 FormTokenWithChars(Result, CurPtr, tok::unknown);
2436 return true;
2437 }
2438
2439 BufferPtr = CurPtr;
2440 return false;
2441 }
2442
2443 // Check to see if the first character after the '/*' is another /. If so,
2444 // then this slash does not end the block comment, it is part of it.
2445 if (C == '/')
2446 C = *CurPtr++;
2447
2448 while (true) {
2449 // Skip over all non-interesting characters until we find end of buffer or a
2450 // (probably ending) '/' character.
2451 if (CurPtr + 24 < BufferEnd &&
2452 // If there is a code-completion point avoid the fast scan because it
2453 // doesn't check for '\0'.
2454 !(PP && PP->getCodeCompletionFileLoc() == FileLoc)) {
2455 // While not aligned to a 16-byte boundary.
2456 while (C != '/' && ((intptr_t)CurPtr & 0x0F) != 0)
2457 C = *CurPtr++;
2458
2459 if (C == '/') goto FoundSlash;
2460
2461#ifdef __SSE2__1
2462 __m128i Slashes = _mm_set1_epi8('/');
2463 while (CurPtr+16 <= BufferEnd) {
2464 int cmp = _mm_movemask_epi8(_mm_cmpeq_epi8(*(const __m128i*)CurPtr,
2465 Slashes));
2466 if (cmp != 0) {
2467 // Adjust the pointer to point directly after the first slash. It's
2468 // not necessary to set C here, it will be overwritten at the end of
2469 // the outer loop.
2470 CurPtr += llvm::countTrailingZeros<unsigned>(cmp) + 1;
2471 goto FoundSlash;
2472 }
2473 CurPtr += 16;
2474 }
2475#elif __ALTIVEC__
2476 __vector unsigned char Slashes = {
2477 '/', '/', '/', '/', '/', '/', '/', '/',
2478 '/', '/', '/', '/', '/', '/', '/', '/'
2479 };
2480 while (CurPtr+16 <= BufferEnd &&
2481 !vec_any_eq(*(const vector unsigned char*)CurPtr, Slashes))
2482 CurPtr += 16;
2483#else
2484 // Scan for '/' quickly. Many block comments are very large.
2485 while (CurPtr[0] != '/' &&
2486 CurPtr[1] != '/' &&
2487 CurPtr[2] != '/' &&
2488 CurPtr[3] != '/' &&
2489 CurPtr+4 < BufferEnd) {
2490 CurPtr += 4;
2491 }
2492#endif
2493
2494 // It has to be one of the bytes scanned, increment to it and read one.
2495 C = *CurPtr++;
2496 }
2497
2498 // Loop to scan the remainder.
2499 while (C != '/' && C != '\0')
2500 C = *CurPtr++;
2501
2502 if (C == '/') {
2503 FoundSlash:
2504 if (CurPtr[-2] == '*') // We found the final */. We're done!
2505 break;
2506
2507 if ((CurPtr[-2] == '\n' || CurPtr[-2] == '\r')) {
2508 if (isEndOfBlockCommentWithEscapedNewLine(CurPtr-2, this)) {
2509 // We found the final */, though it had an escaped newline between the
2510 // * and /. We're done!
2511 break;
2512 }
2513 }
2514 if (CurPtr[0] == '*' && CurPtr[1] != '/') {
2515 // If this is a /* inside of the comment, emit a warning. Don't do this
2516 // if this is a /*/, which will end the comment. This misses cases with
2517 // embedded escaped newlines, but oh well.
2518 if (!isLexingRawMode())
2519 Diag(CurPtr-1, diag::warn_nested_block_comment);
2520 }
2521 } else if (C == 0 && CurPtr == BufferEnd+1) {
2522 if (!isLexingRawMode())
2523 Diag(BufferPtr, diag::err_unterminated_block_comment);
2524 // Note: the user probably forgot a */. We could continue immediately
2525 // after the /*, but this would involve lexing a lot of what really is the
2526 // comment, which surely would confuse the parser.
2527 --CurPtr;
2528
2529 // KeepWhitespaceMode should return this broken comment as a token. Since
2530 // it isn't a well formed comment, just return it as an 'unknown' token.
2531 if (isKeepWhitespaceMode()) {
2532 FormTokenWithChars(Result, CurPtr, tok::unknown);
2533 return true;
2534 }
2535
2536 BufferPtr = CurPtr;
2537 return false;
2538 } else if (C == '\0' && isCodeCompletionPoint(CurPtr-1)) {
2539 PP->CodeCompleteNaturalLanguage();
2540 cutOffLexing();
2541 return false;
2542 }
2543
2544 C = *CurPtr++;
2545 }
2546
2547 // Notify comment handlers about the comment unless we're in a #if 0 block.
2548 if (PP && !isLexingRawMode() &&
2549 PP->HandleComment(Result, SourceRange(getSourceLocation(BufferPtr),
2550 getSourceLocation(CurPtr)))) {
2551 BufferPtr = CurPtr;
2552 return true; // A token has to be returned.
2553 }
2554
2555 // If we are returning comments as tokens, return this comment as a token.
2556 if (inKeepCommentMode()) {
2557 FormTokenWithChars(Result, CurPtr, tok::comment);
2558 return true;
2559 }
2560
2561 // It is common for the tokens immediately after a /**/ comment to be
2562 // whitespace. Instead of going through the big switch, handle it
2563 // efficiently now. This is safe even in KeepWhitespaceMode because we would
2564 // have already returned above with the comment as a token.
2565 if (isHorizontalWhitespace(*CurPtr)) {
2566 SkipWhitespace(Result, CurPtr+1, TokAtPhysicalStartOfLine);
2567 return false;
2568 }
2569
2570 // Otherwise, just return so that the next character will be lexed as a token.
2571 BufferPtr = CurPtr;
2572 Result.setFlag(Token::LeadingSpace);
2573 return false;
2574}
2575
2576//===----------------------------------------------------------------------===//
2577// Primary Lexing Entry Points
2578//===----------------------------------------------------------------------===//
2579
2580/// ReadToEndOfLine - Read the rest of the current preprocessor line as an
2581/// uninterpreted string. This switches the lexer out of directive mode.
2582void Lexer::ReadToEndOfLine(SmallVectorImpl<char> *Result) {
2583 assert(ParsingPreprocessorDirective && ParsingFilename == false &&(static_cast <bool> (ParsingPreprocessorDirective &&
ParsingFilename == false && "Must be in a preprocessing directive!"
) ? void (0) : __assert_fail ("ParsingPreprocessorDirective && ParsingFilename == false && \"Must be in a preprocessing directive!\""
, "/build/llvm-toolchain-snapshot-7~svn329677/tools/clang/lib/Lex/Lexer.cpp"
, 2584, __extension__ __PRETTY_FUNCTION__))
2584 "Must be in a preprocessing directive!")(static_cast <bool> (ParsingPreprocessorDirective &&
ParsingFilename == false && "Must be in a preprocessing directive!"
) ? void (0) : __assert_fail ("ParsingPreprocessorDirective && ParsingFilename == false && \"Must be in a preprocessing directive!\""
, "/build/llvm-toolchain-snapshot-7~svn329677/tools/clang/lib/Lex/Lexer.cpp"
, 2584, __extension__ __PRETTY_FUNCTION__))
;
2585 Token Tmp;
2586
2587 // CurPtr - Cache BufferPtr in an automatic variable.
2588 const char *CurPtr = BufferPtr;
2589 while (true) {
1
Loop condition is true. Entering loop body
2590 char Char = getAndAdvanceChar(CurPtr, Tmp);
2
Calling 'Lexer::getAndAdvanceChar'
2591 switch (Char) {
2592 default:
2593 if (Result)
2594 Result->push_back(Char);
2595 break;
2596 case 0: // Null.
2597 // Found end of file?
2598 if (CurPtr-1 != BufferEnd) {
2599 if (isCodeCompletionPoint(CurPtr-1)) {
2600 PP->CodeCompleteNaturalLanguage();
2601 cutOffLexing();
2602 return;
2603 }
2604
2605 // Nope, normal character, continue.
2606 if (Result)
2607 Result->push_back(Char);
2608 break;
2609 }
2610 // FALL THROUGH.
2611 LLVM_FALLTHROUGH[[clang::fallthrough]];
2612 case '\r':
2613 case '\n':
2614 // Okay, we found the end of the line. First, back up past the \0, \r, \n.
2615 assert(CurPtr[-1] == Char && "Trigraphs for newline?")(static_cast <bool> (CurPtr[-1] == Char && "Trigraphs for newline?"
) ? void (0) : __assert_fail ("CurPtr[-1] == Char && \"Trigraphs for newline?\""
, "/build/llvm-toolchain-snapshot-7~svn329677/tools/clang/lib/Lex/Lexer.cpp"
, 2615, __extension__ __PRETTY_FUNCTION__))
;
2616 BufferPtr = CurPtr-1;
2617
2618 // Next, lex the character, which should handle the EOD transition.
2619 Lex(Tmp);
2620 if (Tmp.is(tok::code_completion)) {
2621 if (PP)
2622 PP->CodeCompleteNaturalLanguage();
2623 Lex(Tmp);
2624 }
2625 assert(Tmp.is(tok::eod) && "Unexpected token!")(static_cast <bool> (Tmp.is(tok::eod) && "Unexpected token!"
) ? void (0) : __assert_fail ("Tmp.is(tok::eod) && \"Unexpected token!\""
, "/build/llvm-toolchain-snapshot-7~svn329677/tools/clang/lib/Lex/Lexer.cpp"
, 2625, __extension__ __PRETTY_FUNCTION__))
;
2626
2627 // Finally, we're done;
2628 return;
2629 }
2630 }
2631}
2632
2633/// LexEndOfFile - CurPtr points to the end of this file. Handle this
2634/// condition, reporting diagnostics and handling other edge cases as required.
2635/// This returns true if Result contains a token, false if PP.Lex should be
2636/// called again.
2637bool Lexer::LexEndOfFile(Token &Result, const char *CurPtr) {
2638 // If we hit the end of the file while parsing a preprocessor directive,
2639 // end the preprocessor directive first. The next token returned will
2640 // then be the end of file.
2641 if (ParsingPreprocessorDirective) {
2642 // Done parsing the "line".
2643 ParsingPreprocessorDirective = false;
2644 // Update the location of token as well as BufferPtr.
2645 FormTokenWithChars(Result, CurPtr, tok::eod);
2646
2647 // Restore comment saving mode, in case it was disabled for directive.
2648 if (PP)
2649 resetExtendedTokenMode();
2650 return true; // Have a token.
2651 }
2652
2653 // If we are in raw mode, return this event as an EOF token. Let the caller
2654 // that put us in raw mode handle the event.
2655 if (isLexingRawMode()) {
2656 Result.startToken();
2657 BufferPtr = BufferEnd;
2658 FormTokenWithChars(Result, BufferEnd, tok::eof);
2659 return true;
2660 }
2661
2662 if (PP->isRecordingPreamble() && PP->isInPrimaryFile()) {
2663 PP->setRecordedPreambleConditionalStack(ConditionalStack);
2664 ConditionalStack.clear();
2665 }
2666
2667 // Issue diagnostics for unterminated #if and missing newline.
2668
2669 // If we are in a #if directive, emit an error.
2670 while (!ConditionalStack.empty()) {
2671 if (PP->getCodeCompletionFileLoc() != FileLoc)
2672 PP->Diag(ConditionalStack.back().IfLoc,
2673 diag::err_pp_unterminated_conditional);
2674 ConditionalStack.pop_back();
2675 }
2676
2677 // C99 5.1.1.2p2: If the file is non-empty and didn't end in a newline, issue
2678 // a pedwarn.
2679 if (CurPtr != BufferStart && (CurPtr[-1] != '\n' && CurPtr[-1] != '\r')) {
2680 DiagnosticsEngine &Diags = PP->getDiagnostics();
2681 SourceLocation EndLoc = getSourceLocation(BufferEnd);
2682 unsigned DiagID;
2683
2684 if (LangOpts.CPlusPlus11) {
2685 // C++11 [lex.phases] 2.2 p2
2686 // Prefer the C++98 pedantic compatibility warning over the generic,
2687 // non-extension, user-requested "missing newline at EOF" warning.
2688 if (!Diags.isIgnored(diag::warn_cxx98_compat_no_newline_eof, EndLoc)) {
2689 DiagID = diag::warn_cxx98_compat_no_newline_eof;
2690 } else {
2691 DiagID = diag::warn_no_newline_eof;
2692 }
2693 } else {
2694 DiagID = diag::ext_no_newline_eof;
2695 }
2696
2697 Diag(BufferEnd, DiagID)
2698 << FixItHint::CreateInsertion(EndLoc, "\n");
2699 }
2700
2701 BufferPtr = CurPtr;
2702
2703 // Finally, let the preprocessor handle this.
2704 return PP->HandleEndOfFile(Result, isPragmaLexer());
2705}
2706
2707/// isNextPPTokenLParen - Return 1 if the next unexpanded token lexed from
2708/// the specified lexer will return a tok::l_paren token, 0 if it is something
2709/// else and 2 if there are no more tokens in the buffer controlled by the
2710/// lexer.
2711unsigned Lexer::isNextPPTokenLParen() {
2712 assert(!LexingRawMode && "How can we expand a macro from a skipping buffer?")(static_cast <bool> (!LexingRawMode && "How can we expand a macro from a skipping buffer?"
) ? void (0) : __assert_fail ("!LexingRawMode && \"How can we expand a macro from a skipping buffer?\""
, "/build/llvm-toolchain-snapshot-7~svn329677/tools/clang/lib/Lex/Lexer.cpp"
, 2712, __extension__ __PRETTY_FUNCTION__))
;
2713
2714 // Switch to 'skipping' mode. This will ensure that we can lex a token
2715 // without emitting diagnostics, disables macro expansion, and will cause EOF
2716 // to return an EOF token instead of popping the include stack.
2717 LexingRawMode = true;
2718
2719 // Save state that can be changed while lexing so that we can restore it.
2720 const char *TmpBufferPtr = BufferPtr;
2721 bool inPPDirectiveMode = ParsingPreprocessorDirective;
2722 bool atStartOfLine = IsAtStartOfLine;
2723 bool atPhysicalStartOfLine = IsAtPhysicalStartOfLine;
2724 bool leadingSpace = HasLeadingSpace;
2725
2726 Token Tok;
2727 Lex(Tok);
2728
2729 // Restore state that may have changed.
2730 BufferPtr = TmpBufferPtr;
2731 ParsingPreprocessorDirective = inPPDirectiveMode;
2732 HasLeadingSpace = leadingSpace;
2733 IsAtStartOfLine = atStartOfLine;
2734 IsAtPhysicalStartOfLine = atPhysicalStartOfLine;
2735
2736 // Restore the lexer back to non-skipping mode.
2737 LexingRawMode = false;
2738
2739 if (Tok.is(tok::eof))
2740 return 2;
2741 return Tok.is(tok::l_paren);
2742}
2743
2744/// \brief Find the end of a version control conflict marker.
2745static const char *FindConflictEnd(const char *CurPtr, const char *BufferEnd,
2746 ConflictMarkerKind CMK) {
2747 const char *Terminator = CMK == CMK_Perforce ? "<<<<\n" : ">>>>>>>";
2748 size_t TermLen = CMK == CMK_Perforce ? 5 : 7;
2749 auto RestOfBuffer = StringRef(CurPtr, BufferEnd - CurPtr).substr(TermLen);
2750 size_t Pos = RestOfBuffer.find(Terminator);
2751 while (Pos != StringRef::npos) {
2752 // Must occur at start of line.
2753 if (Pos == 0 ||
2754 (RestOfBuffer[Pos - 1] != '\r' && RestOfBuffer[Pos - 1] != '\n')) {
2755 RestOfBuffer = RestOfBuffer.substr(Pos+TermLen);
2756 Pos = RestOfBuffer.find(Terminator);
2757 continue;
2758 }
2759 return RestOfBuffer.data()+Pos;
2760 }
2761 return nullptr;
2762}
2763
2764/// IsStartOfConflictMarker - If the specified pointer is the start of a version
2765/// control conflict marker like '<<<<<<<', recognize it as such, emit an error
2766/// and recover nicely. This returns true if it is a conflict marker and false
2767/// if not.
2768bool Lexer::IsStartOfConflictMarker(const char *CurPtr) {
2769 // Only a conflict marker if it starts at the beginning of a line.
2770 if (CurPtr != BufferStart &&
2771 CurPtr[-1] != '\n' && CurPtr[-1] != '\r')
2772 return false;
2773
2774 // Check to see if we have <<<<<<< or >>>>.
2775 if (!StringRef(CurPtr, BufferEnd - CurPtr).startswith("<<<<<<<") &&
2776 !StringRef(CurPtr, BufferEnd - CurPtr).startswith(">>>> "))
2777 return false;
2778
2779 // If we have a situation where we don't care about conflict markers, ignore
2780 // it.
2781 if (CurrentConflictMarkerState || isLexingRawMode())
2782 return false;
2783
2784 ConflictMarkerKind Kind = *CurPtr == '<' ? CMK_Normal : CMK_Perforce;
2785
2786 // Check to see if there is an ending marker somewhere in the buffer at the
2787 // start of a line to terminate this conflict marker.
2788 if (FindConflictEnd(CurPtr, BufferEnd, Kind)) {
2789 // We found a match. We are really in a conflict marker.
2790 // Diagnose this, and ignore to the end of line.
2791 Diag(CurPtr, diag::err_conflict_marker);
2792 CurrentConflictMarkerState = Kind;
2793
2794 // Skip ahead to the end of line. We know this exists because the
2795 // end-of-conflict marker starts with \r or \n.
2796 while (*CurPtr != '\r' && *CurPtr != '\n') {
2797 assert(CurPtr != BufferEnd && "Didn't find end of line")(static_cast <bool> (CurPtr != BufferEnd && "Didn't find end of line"
) ? void (0) : __assert_fail ("CurPtr != BufferEnd && \"Didn't find end of line\""
, "/build/llvm-toolchain-snapshot-7~svn329677/tools/clang/lib/Lex/Lexer.cpp"
, 2797, __extension__ __PRETTY_FUNCTION__))
;
2798 ++CurPtr;
2799 }
2800 BufferPtr = CurPtr;
2801 return true;
2802 }
2803
2804 // No end of conflict marker found.
2805 return false;
2806}
2807
2808/// HandleEndOfConflictMarker - If this is a '====' or '||||' or '>>>>', or if
2809/// it is '<<<<' and the conflict marker started with a '>>>>' marker, then it
2810/// is the end of a conflict marker. Handle it by ignoring up until the end of
2811/// the line. This returns true if it is a conflict marker and false if not.
2812bool Lexer::HandleEndOfConflictMarker(const char *CurPtr) {
2813 // Only a conflict marker if it starts at the beginning of a line.
2814 if (CurPtr != BufferStart &&
2815 CurPtr[-1] != '\n' && CurPtr[-1] != '\r')
2816 return false;
2817
2818 // If we have a situation where we don't care about conflict markers, ignore
2819 // it.
2820 if (!CurrentConflictMarkerState || isLexingRawMode())
2821 return false;
2822
2823 // Check to see if we have the marker (4 characters in a row).
2824 for (unsigned i = 1; i != 4; ++i)
2825 if (CurPtr[i] != CurPtr[0])
2826 return false;
2827
2828 // If we do have it, search for the end of the conflict marker. This could
2829 // fail if it got skipped with a '#if 0' or something. Note that CurPtr might
2830 // be the end of conflict marker.
2831 if (const char *End = FindConflictEnd(CurPtr, BufferEnd,
2832 CurrentConflictMarkerState)) {
2833 CurPtr = End;
2834
2835 // Skip ahead to the end of line.
2836 while (CurPtr != BufferEnd && *CurPtr != '\r' && *CurPtr != '\n')
2837 ++CurPtr;
2838
2839 BufferPtr = CurPtr;
2840
2841 // No longer in the conflict marker.
2842 CurrentConflictMarkerState = CMK_None;
2843 return true;
2844 }
2845
2846 return false;
2847}
2848
/// Search [CurPtr, BufferEnd) for the two-character sequence "#>" that closes
/// an editor placeholder.
///
/// \returns Pointer just past the "#>", or nullptr if the range is empty or
///          contains no such sequence.
static const char *findPlaceholderEnd(const char *CurPtr,
                                      const char *BufferEnd) {
  if (CurPtr == BufferEnd)
    return nullptr;

  // A match needs two characters, so the last position at which it can start
  // is the second-to-last character of the range.
  const char *const Stop = BufferEnd - 1;
  while (CurPtr != Stop) {
    if (CurPtr[0] == '#' && CurPtr[1] == '>')
      return CurPtr + 2;
    ++CurPtr;
  }
  return nullptr;
}
2860
2861bool Lexer::lexEditorPlaceholder(Token &Result, const char *CurPtr) {
2862 assert(CurPtr[-1] == '<' && CurPtr[0] == '#' && "Not a placeholder!")(static_cast <bool> (CurPtr[-1] == '<' && CurPtr
[0] == '#' && "Not a placeholder!") ? void (0) : __assert_fail
("CurPtr[-1] == '<' && CurPtr[0] == '#' && \"Not a placeholder!\""
, "/build/llvm-toolchain-snapshot-7~svn329677/tools/clang/lib/Lex/Lexer.cpp"
, 2862, __extension__ __PRETTY_FUNCTION__))
;
2863 if (!PP || !PP->getPreprocessorOpts().LexEditorPlaceholders || LexingRawMode)
2864 return false;
2865 const char *End = findPlaceholderEnd(CurPtr + 1, BufferEnd);
2866 if (!End)
2867 return false;
2868 const char *Start = CurPtr - 1;
2869 if (!LangOpts.AllowEditorPlaceholders)
2870 Diag(Start, diag::err_placeholder_in_source);
2871 Result.startToken();
2872 FormTokenWithChars(Result, End, tok::raw_identifier);
2873 Result.setRawIdentifierData(Start);
2874 PP->LookUpIdentifierInfo(Result);
2875 Result.setFlag(Token::IsEditorPlaceholder);
2876 BufferPtr = End;
2877 return true;
2878}
2879
2880bool Lexer::isCodeCompletionPoint(const char *CurPtr) const {
2881 if (PP && PP->isCodeCompletionEnabled()) {
2882 SourceLocation Loc = FileLoc.getLocWithOffset(CurPtr-BufferStart);
2883 return Loc == PP->getCodeCompletionLoc();
2884 }
2885
2886 return false;
2887}
2888
2889uint32_t Lexer::tryReadUCN(const char *&StartPtr, const char *SlashLoc,
2890 Token *Result) {
2891 unsigned CharSize;
2892 char Kind = getCharAndSize(StartPtr, CharSize);
2893
2894 unsigned NumHexDigits;
2895 if (Kind == 'u')
2896 NumHexDigits = 4;
2897 else if (Kind == 'U')
2898 NumHexDigits = 8;
2899 else
2900 return 0;
2901
2902 if (!LangOpts.CPlusPlus && !LangOpts.C99) {
2903 if (Result && !isLexingRawMode())
2904 Diag(SlashLoc, diag::warn_ucn_not_valid_in_c89);
2905 return 0;
2906 }
2907
2908 const char *CurPtr = StartPtr + CharSize;
2909 const char *KindLoc = &CurPtr[-1];
2910
2911 uint32_t CodePoint = 0;
2912 for (unsigned i = 0; i < NumHexDigits; ++i) {
2913 char C = getCharAndSize(CurPtr, CharSize);
2914
2915 unsigned Value = llvm::hexDigitValue(C);
2916 if (Value == -1U) {
2917 if (Result && !isLexingRawMode()) {
2918 if (i == 0) {
2919 Diag(BufferPtr, diag::warn_ucn_escape_no_digits)
2920 << StringRef(KindLoc, 1);
2921 } else {
2922 Diag(BufferPtr, diag::warn_ucn_escape_incomplete);
2923
2924 // If the user wrote \U1234, suggest a fixit to \u.
2925 if (i == 4 && NumHexDigits == 8) {
2926 CharSourceRange URange = makeCharRange(*this, KindLoc, KindLoc + 1);
2927 Diag(KindLoc, diag::note_ucn_four_not_eight)
2928 << FixItHint::CreateReplacement(URange, "u");
2929 }
2930 }
2931 }
2932
2933 return 0;
2934 }
2935
2936 CodePoint <<= 4;
2937 CodePoint += Value;
2938
2939 CurPtr += CharSize;
2940 }
2941
2942 if (Result) {
2943 Result->setFlag(Token::HasUCN);
2944 if (CurPtr - StartPtr == (ptrdiff_t)NumHexDigits + 2)
2945 StartPtr = CurPtr;
2946 else
2947 while (StartPtr != CurPtr)
2948 (void)getAndAdvanceChar(StartPtr, *Result);
2949 } else {
2950 StartPtr = CurPtr;
2951 }
2952
2953 // Don't apply C family restrictions to UCNs in assembly mode
2954 if (LangOpts.AsmPreprocessor)
2955 return CodePoint;
2956
2957 // C99 6.4.3p2: A universal character name shall not specify a character whose
2958 // short identifier is less than 00A0 other than 0024 ($), 0040 (@), or
2959 // 0060 (`), nor one in the range D800 through DFFF inclusive.)
2960 // C++11 [lex.charset]p2: If the hexadecimal value for a
2961 // universal-character-name corresponds to a surrogate code point (in the
2962 // range 0xD800-0xDFFF, inclusive), the program is ill-formed. Additionally,
2963 // if the hexadecimal value for a universal-character-name outside the
2964 // c-char-sequence, s-char-sequence, or r-char-sequence of a character or
2965 // string literal corresponds to a control character (in either of the
2966 // ranges 0x00-0x1F or 0x7F-0x9F, both inclusive) or to a character in the
2967 // basic source character set, the program is ill-formed.
2968 if (CodePoint < 0xA0) {
2969 if (CodePoint == 0x24 || CodePoint == 0x40 || CodePoint == 0x60)
2970 return CodePoint;
2971
2972 // We don't use isLexingRawMode() here because we need to warn about bad
2973 // UCNs even when skipping preprocessing tokens in a #if block.
2974 if (Result && PP) {
2975 if (CodePoint < 0x20 || CodePoint >= 0x7F)
2976 Diag(BufferPtr, diag::err_ucn_control_character);
2977 else {
2978 char C = static_cast<char>(CodePoint);
2979 Diag(BufferPtr, diag::err_ucn_escape_basic_scs) << StringRef(&C, 1);
2980 }
2981 }
2982
2983 return 0;
2984 } else if (CodePoint >= 0xD800 && CodePoint <= 0xDFFF) {
2985 // C++03 allows UCNs representing surrogate characters. C99 and C++11 don't.
2986 // We don't use isLexingRawMode() here because we need to diagnose bad
2987 // UCNs even when skipping preprocessing tokens in a #if block.
2988 if (Result && PP) {
2989 if (LangOpts.CPlusPlus && !LangOpts.CPlusPlus11)
2990 Diag(BufferPtr, diag::warn_ucn_escape_surrogate);
2991 else
2992 Diag(BufferPtr, diag::err_ucn_escape_invalid);
2993 }
2994 return 0;
2995 }
2996
2997 return CodePoint;
2998}
2999
3000bool Lexer::CheckUnicodeWhitespace(Token &Result, uint32_t C,
3001 const char *CurPtr) {
3002 static const llvm::sys::UnicodeCharSet UnicodeWhitespaceChars(
3003 UnicodeWhitespaceCharRanges);
3004 if (!isLexingRawMode() && !PP->isPreprocessedOutput() &&
3005 UnicodeWhitespaceChars.contains(C)) {
3006 Diag(BufferPtr, diag::ext_unicode_whitespace)
3007 << makeCharRange(*this, BufferPtr, CurPtr);
3008
3009 Result.setFlag(Token::LeadingSpace);
3010 return true;
3011 }
3012 return false;
3013}
3014
3015bool Lexer::LexUnicode(Token &Result, uint32_t C, const char *CurPtr) {
3016 if (isAllowedIDChar(C, LangOpts) && isAllowedInitiallyIDChar(C, LangOpts)) {
3017 if (!isLexingRawMode() && !ParsingPreprocessorDirective &&
3018 !PP->isPreprocessedOutput()) {
3019 maybeDiagnoseIDCharCompat(PP->getDiagnostics(), C,
3020 makeCharRange(*this, BufferPtr, CurPtr),
3021 /*IsFirst=*/true);
3022 }
3023
3024 MIOpt.ReadToken();
3025 return LexIdentifier(Result, CurPtr);
3026 }
3027
3028 if (!isLexingRawMode() && !ParsingPreprocessorDirective &&
3029 !PP->isPreprocessedOutput() &&
3030 !isASCII(*BufferPtr) && !isAllowedIDChar(C, LangOpts)) {
3031 // Non-ASCII characters tend to creep into source code unintentionally.
3032 // Instead of letting the parser complain about the unknown token,
3033 // just drop the character.
3034 // Note that we can /only/ do this when the non-ASCII character is actually
3035 // spelled as Unicode, not written as a UCN. The standard requires that
3036 // we not throw away any possible preprocessor tokens, but there's a
3037 // loophole in the mapping of Unicode characters to basic character set
3038 // characters that allows us to map these particular characters to, say,
3039 // whitespace.
3040 Diag(BufferPtr, diag::err_non_ascii)
3041 << FixItHint::CreateRemoval(makeCharRange(*this, BufferPtr, CurPtr));
3042
3043 BufferPtr = CurPtr;
3044 return false;
3045 }
3046
3047 // Otherwise, we have an explicit UCN or a character that's unlikely to show
3048 // up by accident.
3049 MIOpt.ReadToken();
3050 FormTokenWithChars(Result, CurPtr, tok::unknown);
3051 return true;
3052}
3053
3054void Lexer::PropagateLineStartLeadingSpaceInfo(Token &Result) {
3055 IsAtStartOfLine = Result.isAtStartOfLine();
3056 HasLeadingSpace = Result.hasLeadingSpace();
3057 HasLeadingEmptyMacro = Result.hasLeadingEmptyMacro();
3058 // Note that this doesn't affect IsAtPhysicalStartOfLine.
3059}
3060
3061bool Lexer::Lex(Token &Result) {
3062 // Start a new token.
3063 Result.startToken();
3064
3065 // Set up misc whitespace flags for LexTokenInternal.
3066 if (IsAtStartOfLine) {
3067 Result.setFlag(Token::StartOfLine);
3068 IsAtStartOfLine = false;
3069 }
3070
3071 if (HasLeadingSpace) {
3072 Result.setFlag(Token::LeadingSpace);
3073 HasLeadingSpace = false;
3074 }
3075
3076 if (HasLeadingEmptyMacro) {
3077 Result.setFlag(Token::LeadingEmptyMacro);
3078 HasLeadingEmptyMacro = false;
3079 }
3080
3081 bool atPhysicalStartOfLine = IsAtPhysicalStartOfLine;
3082 IsAtPhysicalStartOfLine = false;
3083 bool isRawLex = isLexingRawMode();
3084 (void) isRawLex;
3085 bool returnedToken = LexTokenInternal(Result, atPhysicalStartOfLine);
3086 // (After the LexTokenInternal call, the lexer might be destroyed.)
3087 assert((returnedToken || !isRawLex) && "Raw lex must succeed")(static_cast <bool> ((returnedToken || !isRawLex) &&
"Raw lex must succeed") ? void (0) : __assert_fail ("(returnedToken || !isRawLex) && \"Raw lex must succeed\""
, "/build/llvm-toolchain-snapshot-7~svn329677/tools/clang/lib/Lex/Lexer.cpp"
, 3087, __extension__ __PRETTY_FUNCTION__))
;
3088 return returnedToken;
3089}
3090
3091/// LexTokenInternal - This implements a simple C family lexer. It is an
3092/// extremely performance critical piece of code. This assumes that the buffer
3093/// has a null character at the end of the file. This returns a preprocessing
3094/// token, not a normal token, as such, it is an internal interface. It assumes
3095/// that the Flags of result have been cleared before calling this.
     ///
     /// Structure: one character is read from BufferPtr and dispatched through a
     /// large switch. Cases that only skip whitespace/comments jump back to the
     /// LexNextToken or SkipIgnoredUnits labels instead of recursing. Cases that
     /// recognise a punctuator set Kind and `break` to the common tail, which
     /// calls FormTokenWithChars(). Other cases return directly through a
     /// dedicated Lex*/Skip* helper.
     ///
     /// \param Result the token being built; flags are set/cleared on it here.
     /// \param TokAtPhysicalStartOfLine forwarded to SkipWhitespace(),
     ///        SkipLineComment() and SkipBlockComment(), and consulted to decide
     ///        whether a '#' (or the '%:' digraph) starts a preprocessing
     ///        directive.
     /// \returns the result of the helper that was delegated to, or true once
     ///          FormTokenWithChars() has been called for a token. The only
     ///          literal `false` is at the very end, after a directive has been
     ///          passed to PP->HandleDirective() without a fatal module-loader
     ///          failure.
3096bool Lexer::LexTokenInternal(Token &Result, bool TokAtPhysicalStartOfLine) {
3097LexNextToken:
3098 // New token, can't need cleaning yet.
3099 Result.clearFlag(Token::NeedsCleaning);
3100 Result.setIdentifierInfo(nullptr);
3101
3102 // CurPtr - Cache BufferPtr in an automatic variable.
3103 const char *CurPtr = BufferPtr;
3104
3105 // Small amounts of horizontal whitespace is very common between tokens.
3106 if ((*CurPtr == ' ') || (*CurPtr == '\t')) {
3107 ++CurPtr;
3108 while ((*CurPtr == ' ') || (*CurPtr == '\t'))
3109 ++CurPtr;
3110
3111 // If we are keeping whitespace and other tokens, just return what we just
3112 // skipped. The next lexer invocation will return the token after the
3113 // whitespace.
3114 if (isKeepWhitespaceMode()) {
3115 FormTokenWithChars(Result, CurPtr, tok::unknown);
3116 // FIXME: The next token will not have LeadingSpace set.
3117 return true;
3118 }
3119
3120 BufferPtr = CurPtr;
3121 Result.setFlag(Token::LeadingSpace);
3122 }
3123
     // Both temporaries are left uninitialized here. Each case below passes
     // them to getCharAndSize() before reading them (presumably as
     // out-parameters; the declaration is not visible in this chunk).
3124 unsigned SizeTmp, SizeTmp2; // Temporaries for use in cases below.
3125
3126 // Read a character, advancing over it.
3127 char Char = getAndAdvanceChar(CurPtr, Result);
     // Kind is assigned by every case that leaves the switch through `break`.
3128 tok::TokenKind Kind;
3129
3130 switch (Char) {
3131 case 0: // Null.
3132 // Found end of file?
3133 if (CurPtr-1 == BufferEnd)
3134 return LexEndOfFile(Result, CurPtr-1);
3135
3136 // Check if we are performing code completion.
3137 if (isCodeCompletionPoint(CurPtr-1)) {
3138 // Return the code-completion token.
3139 Result.startToken();
3140 FormTokenWithChars(Result, CurPtr, tok::code_completion);
3141 return true;
3142 }
3143
     // Any other embedded NUL is diagnosed (outside raw mode) and then
     // handled like whitespace.
3144 if (!isLexingRawMode())
3145 Diag(CurPtr-1, diag::null_in_file);
3146 Result.setFlag(Token::LeadingSpace);
3147 if (SkipWhitespace(Result, CurPtr, TokAtPhysicalStartOfLine))
3148 return true; // KeepWhitespaceMode
3149
3150 // We know the lexer hasn't changed, so just try again with this lexer.
3151 // (We manually eliminate the tail call to avoid recursion.)
3152 goto LexNextToken;
3153
3154 case 26: // DOS & CP/M EOF: "^Z".
3155 // If we're in Microsoft extensions mode, treat this as end of file.
3156 if (LangOpts.MicrosoftExt) {
3157 if (!isLexingRawMode())
3158 Diag(CurPtr-1, diag::ext_ctrl_z_eof_microsoft);
3159 return LexEndOfFile(Result, CurPtr-1);
3160 }
3161
3162 // If Microsoft extensions are disabled, this is just random garbage.
3163 Kind = tok::unknown;
3164 break;
3165
3166 case '\r':
     // Consume the '\n' of a "\r\n" pair so the pair is handled as one newline.
3167 if (CurPtr[0] == '\n')
3168 Char = getAndAdvanceChar(CurPtr, Result);
     // NOTE: "[[clang::fallthrough]]" directly after LLVM_FALLTHROUGH appears to
     // be the report's inlined macro expansion, not separate source text.
3169 LLVM_FALLTHROUGH[[clang::fallthrough]];
3170 case '\n':
3171 // If we are inside a preprocessor directive and we see the end of line,
3172 // we know we are done with the directive, so return an EOD token.
3173 if (ParsingPreprocessorDirective) {
3174 // Done parsing the "line".
3175 ParsingPreprocessorDirective = false;
3176
3177 // Restore comment saving mode, in case it was disabled for directive.
3178 if (PP)
3179 resetExtendedTokenMode();
3180
3181 // Since we consumed a newline, we are back at the start of a line.
3182 IsAtStartOfLine = true;
3183 IsAtPhysicalStartOfLine = true;
3184
3185 Kind = tok::eod;
3186 break;
3187 }
3188
3189 // No leading whitespace seen so far.
3190 Result.clearFlag(Token::LeadingSpace);
3191
3192 if (SkipWhitespace(Result, CurPtr, TokAtPhysicalStartOfLine))
3193 return true; // KeepWhitespaceMode
3194
3195 // We only saw whitespace, so just try again with this lexer.
3196 // (We manually eliminate the tail call to avoid recursion.)
3197 goto LexNextToken;
3198 case ' ':
3199 case '\t':
3200 case '\f':
3201 case '\v':
     // Also entered by goto from SkipIgnoredUnits below when more horizontal
     // whitespace follows a skipped comment.
3202 SkipHorizontalWhitespace:
3203 Result.setFlag(Token::LeadingSpace);
3204 if (SkipWhitespace(Result, CurPtr, TokAtPhysicalStartOfLine))
3205 return true; // KeepWhitespaceMode
3206
     // Also entered by goto from the '/' case after a line comment was skipped.
     // CurPtr is reloaded from BufferPtr, which the skip helpers are expected to
     // have advanced (their bodies are not visible in this chunk).
3207 SkipIgnoredUnits:
3208 CurPtr = BufferPtr;
3209
3210 // If the next token is obviously a // or /* */ comment, skip it efficiently
3211 // too (without going through the big switch stmt).
3212 if (CurPtr[0] == '/' && CurPtr[1] == '/' && !inKeepCommentMode() &&
3213 LangOpts.LineComment &&
3214 (LangOpts.CPlusPlus || !LangOpts.TraditionalCPP)) {
3215 if (SkipLineComment(Result, CurPtr+2, TokAtPhysicalStartOfLine))
3216 return true; // There is a token to return.
3217 goto SkipIgnoredUnits;
3218 } else if (CurPtr[0] == '/' && CurPtr[1] == '*' && !inKeepCommentMode()) {
3219 if (SkipBlockComment(Result, CurPtr+2, TokAtPhysicalStartOfLine))
3220 return true; // There is a token to return.
3221 goto SkipIgnoredUnits;
3222 } else if (isHorizontalWhitespace(*CurPtr)) {
3223 goto SkipHorizontalWhitespace;
3224 }
3225 // We only saw whitespace, so just try again with this lexer.
3226 // (We manually eliminate the tail call to avoid recursion.)
3227 goto LexNextToken;
3228
3229 // C99 6.4.4.1: Integer Constants.
3230 // C99 6.4.4.2: Floating Constants.
3231 case '0': case '1': case '2': case '3': case '4':
3232 case '5': case '6': case '7': case '8': case '9':
3233 // Notify MIOpt that we read a non-whitespace/non-comment token.
3234 MIOpt.ReadToken();
3235 return LexNumericConstant(Result, CurPtr);
3236
3237 case 'u': // Identifier (uber) or C11/C++11 UTF-8 or UTF-16 string literal
3238 // Notify MIOpt that we read a non-whitespace/non-comment token.
3239 MIOpt.ReadToken();
3240
3241 if (LangOpts.CPlusPlus11 || LangOpts.C11) {
3242 Char = getCharAndSize(CurPtr, SizeTmp);
3243
3244 // UTF-16 string literal
3245 if (Char == '"')
3246 return LexStringLiteral(Result, ConsumeChar(CurPtr, SizeTmp, Result),
3247 tok::utf16_string_literal);
3248
3249 // UTF-16 character constant
3250 if (Char == '\'')
3251 return LexCharConstant(Result, ConsumeChar(CurPtr, SizeTmp, Result),
3252 tok::utf16_char_constant);
3253
3254 // UTF-16 raw string literal
3255 if (Char == 'R' && LangOpts.CPlusPlus11 &&
3256 getCharAndSize(CurPtr + SizeTmp, SizeTmp2) == '"')
3257 return LexRawStringLiteral(Result,
3258 ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
3259 SizeTmp2, Result),
3260 tok::utf16_string_literal);
3261
3262 if (Char == '8') {
3263 char Char2 = getCharAndSize(CurPtr + SizeTmp, SizeTmp2);
3264
3265 // UTF-8 string literal
3266 if (Char2 == '"')
3267 return LexStringLiteral(Result,
3268 ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
3269 SizeTmp2, Result),
3270 tok::utf8_string_literal);
     // UTF-8 character constant; only recognised when LangOpts.CPlusPlus17.
3271 if (Char2 == '\'' && LangOpts.CPlusPlus17)
3272 return LexCharConstant(
3273 Result, ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
3274 SizeTmp2, Result),
3275 tok::utf8_char_constant);
3276
3277 if (Char2 == 'R' && LangOpts.CPlusPlus11) {
3278 unsigned SizeTmp3;
3279 char Char3 = getCharAndSize(CurPtr + SizeTmp + SizeTmp2, SizeTmp3);
3280 // UTF-8 raw string literal
3281 if (Char3 == '"') {
3282 return LexRawStringLiteral(Result,
3283 ConsumeChar(ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
3284 SizeTmp2, Result),
3285 SizeTmp3, Result),
3286 tok::utf8_string_literal);
3287 }
3288 }
3289 }
3290 }
3291
3292 // treat u like the start of an identifier.
3293 return LexIdentifier(Result, CurPtr);
3294
3295 case 'U': // Identifier (Uber) or C11/C++11 UTF-32 string literal
3296 // Notify MIOpt that we read a non-whitespace/non-comment token.
3297 MIOpt.ReadToken();
3298
3299 if (LangOpts.CPlusPlus11 || LangOpts.C11) {
3300 Char = getCharAndSize(CurPtr, SizeTmp);
3301
3302 // UTF-32 string literal
3303 if (Char == '"')
3304 return LexStringLiteral(Result, ConsumeChar(CurPtr, SizeTmp, Result),
3305 tok::utf32_string_literal);
3306
3307 // UTF-32 character constant
3308 if (Char == '\'')
3309 return LexCharConstant(Result, ConsumeChar(CurPtr, SizeTmp, Result),
3310 tok::utf32_char_constant);
3311
3312 // UTF-32 raw string literal
3313 if (Char == 'R' && LangOpts.CPlusPlus11 &&
3314 getCharAndSize(CurPtr + SizeTmp, SizeTmp2) == '"')
3315 return LexRawStringLiteral(Result,
3316 ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
3317 SizeTmp2, Result),
3318 tok::utf32_string_literal);
3319 }
3320
3321 // treat U like the start of an identifier.
3322 return LexIdentifier(Result, CurPtr);
3323
3324 case 'R': // Identifier or C++0x raw string literal
3325 // Notify MIOpt that we read a non-whitespace/non-comment token.
3326 MIOpt.ReadToken();
3327
3328 if (LangOpts.CPlusPlus11) {
3329 Char = getCharAndSize(CurPtr, SizeTmp);
3330
3331 if (Char == '"')
3332 return LexRawStringLiteral(Result,
3333 ConsumeChar(CurPtr, SizeTmp, Result),
3334 tok::string_literal);
3335 }
3336
3337 // treat R like the start of an identifier.
3338 return LexIdentifier(Result, CurPtr);
3339
3340 case 'L': // Identifier (Loony) or wide literal (L'x' or L"xyz").
3341 // Notify MIOpt that we read a non-whitespace/non-comment token.
3342 MIOpt.ReadToken();
     // Unlike the 'u'/'U' cases, the wide string and character forms are not
     // gated on a language option; only the raw form checks CPlusPlus11.
3343 Char = getCharAndSize(CurPtr, SizeTmp);
3344
3345 // Wide string literal.
3346 if (Char == '"')
3347 return LexStringLiteral(Result, ConsumeChar(CurPtr, SizeTmp, Result),
3348 tok::wide_string_literal);
3349
3350 // Wide raw string literal.
3351 if (LangOpts.CPlusPlus11 && Char == 'R' &&
3352 getCharAndSize(CurPtr + SizeTmp, SizeTmp2) == '"')
3353 return LexRawStringLiteral(Result,
3354 ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
3355 SizeTmp2, Result),
3356 tok::wide_string_literal);
3357
3358 // Wide character constant.
3359 if (Char == '\'')
3360 return LexCharConstant(Result, ConsumeChar(CurPtr, SizeTmp, Result),
3361 tok::wide_char_constant);
3362 // FALL THROUGH, treating L like the start of an identifier.
3363 LLVM_FALLTHROUGH[[clang::fallthrough]];
3364
3365 // C99 6.4.2: Identifiers.
3366 case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': case 'G':
3367 case 'H': case 'I': case 'J': case 'K': /*'L'*/case 'M': case 'N':
3368 case 'O': case 'P': case 'Q': /*'R'*/case 'S': case 'T': /*'U'*/
3369 case 'V': case 'W': case 'X': case 'Y': case 'Z':
3370 case 'a': case 'b': case 'c': case 'd': case 'e': case 'f': case 'g':
3371 case 'h': case 'i': case 'j': case 'k': case 'l': case 'm': case 'n':
3372 case 'o': case 'p': case 'q': case 'r': case 's': case 't': /*'u'*/
3373 case 'v': case 'w': case 'x': case 'y': case 'z':
3374 case '_':
3375 // Notify MIOpt that we read a non-whitespace/non-comment token.
3376 MIOpt.ReadToken();
3377 return LexIdentifier(Result, CurPtr);
3378
3379 case '$': // $ in identifiers.
3380 if (LangOpts.DollarIdents) {
3381 if (!isLexingRawMode())
3382 Diag(CurPtr-1, diag::ext_dollar_in_identifier);
3383 // Notify MIOpt that we read a non-whitespace/non-comment token.
3384 MIOpt.ReadToken();
3385 return LexIdentifier(Result, CurPtr);
3386 }
3387
3388 Kind = tok::unknown;
3389 break;
3390
3391 // C99 6.4.4: Character Constants.
3392 case '\'':
3393 // Notify MIOpt that we read a non-whitespace/non-comment token.
3394 MIOpt.ReadToken();
3395 return LexCharConstant(Result, CurPtr, tok::char_constant);
3396
3397 // C99 6.4.5: String Literals.
3398 case '"':
3399 // Notify MIOpt that we read a non-whitespace/non-comment token.
3400 MIOpt.ReadToken();
3401 return LexStringLiteral(Result, CurPtr, tok::string_literal);
3402
3403 // C99 6.4.6: Punctuators.
3404 case '?':
3405 Kind = tok::question;
3406 break;
3407 case '[':
3408 Kind = tok::l_square;
3409 break;
3410 case ']':
3411 Kind = tok::r_square;
3412 break;
3413 case '(':
3414 Kind = tok::l_paren;
3415 break;
3416 case ')':
3417 Kind = tok::r_paren;
3418 break;
3419 case '{':
3420 Kind = tok::l_brace;
3421 break;
3422 case '}':
3423 Kind = tok::r_brace;
3424 break;
3425 case '.':
3426 Char = getCharAndSize(CurPtr, SizeTmp);
3427 if (Char >= '0' && Char <= '9') {
3428 // Notify MIOpt that we read a non-whitespace/non-comment token.
3429 MIOpt.ReadToken();
3430
3431 return LexNumericConstant(Result, ConsumeChar(CurPtr, SizeTmp, Result));
3432 } else if (LangOpts.CPlusPlus && Char == '*') {
3433 Kind = tok::periodstar;
     // NOTE(review): this is the only multi-character punctuator in this switch
     // that advances with a plain `CurPtr += SizeTmp` instead of
     // ConsumeChar(CurPtr, SizeTmp, Result). Whether that difference matters
     // when SizeTmp != 1 depends on ConsumeChar's body, which is not visible in
     // this chunk; confirm it is intentional.
3434 CurPtr += SizeTmp;
3435 } else if (Char == '.' &&
3436 getCharAndSize(CurPtr+SizeTmp, SizeTmp2) == '.') {
3437 Kind = tok::ellipsis;
3438 CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
3439 SizeTmp2, Result);
3440 } else {
3441 Kind = tok::period;
3442 }
3443 break;
3444 case '&':
3445 Char = getCharAndSize(CurPtr, SizeTmp);
3446 if (Char == '&') {
3447 Kind = tok::ampamp;
3448 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
3449 } else if (Char == '=') {
3450 Kind = tok::ampequal;
3451 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
3452 } else {
3453 Kind = tok::amp;
3454 }
3455 break;
3456 case '*':
3457 if (getCharAndSize(CurPtr, SizeTmp) == '=') {
3458 Kind = tok::starequal;
3459 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
3460 } else {
3461 Kind = tok::star;
3462 }
3463 break;
3464 case '+':
3465 Char = getCharAndSize(CurPtr, SizeTmp);
3466 if (Char == '+') {
3467 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
3468 Kind = tok::plusplus;
3469 } else if (Char == '=') {
3470 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
3471 Kind = tok::plusequal;
3472 } else {
3473 Kind = tok::plus;
3474 }
3475 break;
3476 case '-':
3477 Char = getCharAndSize(CurPtr, SizeTmp);
3478 if (Char == '-') { // --
3479 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
3480 Kind = tok::minusminus;
     // The three-character "->*" test must come before the plain "->" test
     // below, otherwise "->" would always match first.
3481 } else if (Char == '>' && LangOpts.CPlusPlus &&
3482 getCharAndSize(CurPtr+SizeTmp, SizeTmp2) == '*') { // C++ ->*
3483 CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
3484 SizeTmp2, Result);
3485 Kind = tok::arrowstar;
3486 } else if (Char == '>') { // ->
3487 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
3488 Kind = tok::arrow;
3489 } else if (Char == '=') { // -=
3490 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
3491 Kind = tok::minusequal;
3492 } else {
3493 Kind = tok::minus;
3494 }
3495 break;
3496 case '~':
3497 Kind = tok::tilde;
3498 break;
3499 case '!':
3500 if (getCharAndSize(CurPtr, SizeTmp) == '=') {
3501 Kind = tok::exclaimequal;
3502 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
3503 } else {
3504 Kind = tok::exclaim;
3505 }
3506 break;
3507 case '/':
3508 // 6.4.9: Comments
3509 Char = getCharAndSize(CurPtr, SizeTmp);
3510 if (Char == '/') { // Line comment.
3511 // Even if Line comments are disabled (e.g. in C89 mode), we generally
3512 // want to lex this as a comment. There is one problem with this though,
3513 // that in one particular corner case, this can change the behavior of the
3514 // resultant program. For example, In "foo //**/ bar", C89 would lex
3515 // this as "foo / bar" and languages with Line comments would lex it as
3516 // "foo". Check to see if the character after the second slash is a '*'.
3517 // If so, we will lex that as a "/" instead of the start of a comment.
3518 // However, we never do this if we are just preprocessing.
3519 bool TreatAsComment = LangOpts.LineComment &&
3520 (LangOpts.CPlusPlus || !LangOpts.TraditionalCPP);
3521 if (!TreatAsComment)
3522 if (!(PP && PP->isPreprocessedOutput()))
3523 TreatAsComment = getCharAndSize(CurPtr+SizeTmp, SizeTmp2) != '*';
3524
3525 if (TreatAsComment) {
3526 if (SkipLineComment(Result, ConsumeChar(CurPtr, SizeTmp, Result),
3527 TokAtPhysicalStartOfLine))
3528 return true; // There is a token to return.
3529
3530 // It is common for the tokens immediately after a // comment to be
3531 // whitespace (indentation for the next line). Instead of going through
3532 // the big switch, handle it efficiently now.
3533 goto SkipIgnoredUnits;
3534 }
     // If TreatAsComment is false we fall out of this `if`; Char is still '/',
     // so neither test below matches and the token becomes tok::slash.
3535 }
3536
3537 if (Char == '*') { // /**/ comment.
3538 if (SkipBlockComment(Result, ConsumeChar(CurPtr, SizeTmp, Result),
3539 TokAtPhysicalStartOfLine))
3540 return true; // There is a token to return.
3541
3542 // We only saw whitespace, so just try again with this lexer.
3543 // (We manually eliminate the tail call to avoid recursion.)
3544 goto LexNextToken;
3545 }
3546
3547 if (Char == '=') {
3548 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
3549 Kind = tok::slashequal;
3550 } else {
3551 Kind = tok::slash;
3552 }
3553 break;
3554 case '%':
3555 Char = getCharAndSize(CurPtr, SizeTmp);
3556 if (Char == '=') {
3557 Kind = tok::percentequal;
3558 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
3559 } else if (LangOpts.Digraphs && Char == '>') {
3560 Kind = tok::r_brace; // '%>' -> '}'
3561 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
3562 } else if (LangOpts.Digraphs && Char == ':') {
     // "%:" has been recognised; consume the ':' and look further for "%:%:"
     // or "%:@" before settling on a plain '#'.
3563 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
3564 Char = getCharAndSize(CurPtr, SizeTmp);
3565 if (Char == '%' && getCharAndSize(CurPtr+SizeTmp, SizeTmp2) == ':') {
3566 Kind = tok::hashhash; // '%:%:' -> '##'
3567 CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
3568 SizeTmp2, Result);
3569 } else if (Char == '@' && LangOpts.MicrosoftExt) {// %:@ -> #@ -> Charize
3570 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
3571 if (!isLexingRawMode())
3572 Diag(BufferPtr, diag::ext_charize_microsoft);
3573 Kind = tok::hashat;
3574 } else { // '%:' -> '#'
3575 // We parsed a # character. If this occurs at the start of the line,
3576 // it's actually the start of a preprocessing directive. Callback to
3577 // the preprocessor to handle it.
3578 // TODO: -fpreprocessed mode??
3579 if (TokAtPhysicalStartOfLine && !LexingRawMode && !Is_PragmaLexer)
3580 goto HandleDirective;
3581
3582 Kind = tok::hash;
3583 }
3584 } else {
3585 Kind = tok::percent;
3586 }
3587 break;
3588 case '<':
     // Char/SizeTmp are computed up front; the ParsingFilename branch does not
     // use them and hands the unadvanced CurPtr to LexAngledStringLiteral().
3589 Char = getCharAndSize(CurPtr, SizeTmp);
3590 if (ParsingFilename) {
3591 return LexAngledStringLiteral(Result, CurPtr);
3592 } else if (Char == '<') {
3593 char After = getCharAndSize(CurPtr+SizeTmp, SizeTmp2);
3594 if (After == '=') {
3595 Kind = tok::lesslessequal;
3596 CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
3597 SizeTmp2, Result);
3598 } else if (After == '<' && IsStartOfConflictMarker(CurPtr-1)) {
3599 // If this is actually a '<<<<<<<' version control conflict marker,
3600 // recognize it as such and recover nicely.
3601 goto LexNextToken;
3602 } else if (After == '<' && HandleEndOfConflictMarker(CurPtr-1)) {
3603 // If this is '<<<<' and we're in a Perforce-style conflict marker,
3604 // ignore it.
3605 goto LexNextToken;
3606 } else if (LangOpts.CUDA && After == '<') {
3607 Kind = tok::lesslessless;
3608 CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
3609 SizeTmp2, Result);
3610 } else {
3611 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
3612 Kind = tok::lessless;
3613 }
3614 } else if (Char == '=') {
3615 char After = getCharAndSize(CurPtr+SizeTmp, SizeTmp2);
3616 if (After == '>') {
3617 if (getLangOpts().CPlusPlus2a) {
3618 if (!isLexingRawMode())
3619 Diag(BufferPtr, diag::warn_cxx17_compat_spaceship);
3620 CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
3621 SizeTmp2, Result);
3622 Kind = tok::spaceship;
3623 break;
3624 }
3625 // Suggest adding a space between the '<=' and the '>' to avoid a
3626 // change in semantics if this turns up in C++ <=17 mode.
3627 if (getLangOpts().CPlusPlus && !isLexingRawMode()) {
3628 Diag(BufferPtr, diag::warn_cxx2a_compat_spaceship)
3629 << FixItHint::CreateInsertion(
3630 getSourceLocation(CurPtr + SizeTmp, SizeTmp2), " ");
3631 }
3632 }
     // Not lexed as a spaceship: consume only the '=' and form "<=".
3633 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
3634 Kind = tok::lessequal;
3635 } else if (LangOpts.Digraphs && Char == ':') { // '<:' -> '['
3636 if (LangOpts.CPlusPlus11 &&
3637 getCharAndSize(CurPtr + SizeTmp, SizeTmp2) == ':') {
3638 // C++0x [lex.pptoken]p3:
3639 // Otherwise, if the next three characters are <:: and the subsequent
3640 // character is neither : nor >, the < is treated as a preprocessor
3641 // token by itself and not as the first character of the alternative
3642 // token <:.
3643 unsigned SizeTmp3;
3644 char After = getCharAndSize(CurPtr + SizeTmp + SizeTmp2, SizeTmp3);
3645 if (After != ':' && After != '>') {
3646 Kind = tok::less;
3647 if (!isLexingRawMode())
3648 Diag(BufferPtr, diag::warn_cxx98_compat_less_colon_colon);
3649 break;
3650 }
3651 }
3652
3653 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
3654 Kind = tok::l_square;
3655 } else if (LangOpts.Digraphs && Char == '%') { // '<%' -> '{'
3656 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
3657 Kind = tok::l_brace;
3658 } else if (Char == '#' && /*Not a trigraph*/ SizeTmp == 1 &&
3659 lexEditorPlaceholder(Result, CurPtr)) {
3660 return true;
3661 } else {
3662 Kind = tok::less;
3663 }
3664 break;
3665 case '>':
3666 Char = getCharAndSize(CurPtr, SizeTmp);
3667 if (Char == '=') {
3668 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
3669 Kind = tok::greaterequal;
3670 } else if (Char == '>') {
3671 char After = getCharAndSize(CurPtr+SizeTmp, SizeTmp2);
3672 if (After == '=') {
3673 CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
3674 SizeTmp2, Result);
3675 Kind = tok::greatergreaterequal;
3676 } else if (After == '>' && IsStartOfConflictMarker(CurPtr-1)) {
3677 // If this is actually a '>>>>' conflict marker, recognize it as such
3678 // and recover nicely.
3679 goto LexNextToken;
3680 } else if (After == '>' && HandleEndOfConflictMarker(CurPtr-1)) {
3681 // If this is '>>>>>>>' and we're in a conflict marker, ignore it.
3682 goto LexNextToken;
3683 } else if (LangOpts.CUDA && After == '>') {
3684 Kind = tok::greatergreatergreater;
3685 CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
3686 SizeTmp2, Result);
3687 } else {
3688 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
3689 Kind = tok::greatergreater;
3690 }
3691 } else {
3692 Kind = tok::greater;
3693 }
3694 break;
3695 case '^':
3696 Char = getCharAndSize(CurPtr, SizeTmp);
3697 if (Char == '=') {
3698 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
3699 Kind = tok::caretequal;
3700 } else if (LangOpts.OpenCL && Char == '^') {
3701 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
3702 Kind = tok::caretcaret;
3703 } else {
3704 Kind = tok::caret;
3705 }
3706 break;
3707 case '|':
3708 Char = getCharAndSize(CurPtr, SizeTmp);
3709 if (Char == '=') {
3710 Kind = tok::pipeequal;
3711 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
3712 } else if (Char == '|') {
3713 // If this is '|||||||' and we're in a conflict marker, ignore it.
     // CurPtr[1] peeks at the raw buffer byte directly, not via getCharAndSize().
3714 if (CurPtr[1] == '|' && HandleEndOfConflictMarker(CurPtr-1))
3715 goto LexNextToken;
3716 Kind = tok::pipepipe;
3717 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
3718 } else {
3719 Kind = tok::pipe;
3720 }
3721 break;
3722 case ':':
3723 Char = getCharAndSize(CurPtr, SizeTmp);
3724 if (LangOpts.Digraphs && Char == '>') {
3725 Kind = tok::r_square; // ':>' -> ']'
3726 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
3727 } else if ((LangOpts.CPlusPlus ||
3728 LangOpts.DoubleSquareBracketAttributes) &&
3729 Char == ':') {
3730 Kind = tok::coloncolon;
3731 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
3732 } else {
3733 Kind = tok::colon;
3734 }
3735 break;
3736 case ';':
3737 Kind = tok::semi;
3738 break;
3739 case '=':
3740 Char = getCharAndSize(CurPtr, SizeTmp);
3741 if (Char == '=') {
3742 // If this is '====' and we're in a conflict marker, ignore it.
     // As in the '|' case, CurPtr[1] is a raw peek at the buffer.
3743 if (CurPtr[1] == '=' && HandleEndOfConflictMarker(CurPtr-1))
3744 goto LexNextToken;
3745
3746 Kind = tok::equalequal;
3747 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
3748 } else {
3749 Kind = tok::equal;
3750 }
3751 break;
3752 case ',':
3753 Kind = tok::comma;
3754 break;
3755 case '#':
3756 Char = getCharAndSize(CurPtr, SizeTmp);
3757 if (Char == '#') {
3758 Kind = tok::hashhash;
3759 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
3760 } else if (Char == '@' && LangOpts.MicrosoftExt) { // #@ -> Charize
3761 Kind = tok::hashat;
3762 if (!isLexingRawMode())
3763 Diag(BufferPtr, diag::ext_charize_microsoft);
3764 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
3765 } else {
3766 // We parsed a # character. If this occurs at the start of the line,
3767 // it's actually the start of a preprocessing directive. Callback to
3768 // the preprocessor to handle it.
3769 // TODO: -fpreprocessed mode??
3770 if (TokAtPhysicalStartOfLine && !LexingRawMode && !Is_PragmaLexer)
3771 goto HandleDirective;
3772
3773 Kind = tok::hash;
3774 }
3775 break;
3776
3777 case '@':
3778 // Objective C support.
     // NOTE(review): Char is already '@' here; the extra CurPtr[-1] == '@' test
     // looks at the raw byte just before CurPtr. Presumably it rejects an '@'
     // that getAndAdvanceChar() produced from a longer spelling (confirm).
3779 if (CurPtr[-1] == '@' && LangOpts.ObjC1)
3780 Kind = tok::at;
3781 else
3782 Kind = tok::unknown;
3783 break;
3784
3785 // UCNs (C99 6.4.3, C++11 [lex.charset]p2)
3786 case '\\':
3787 if (!LangOpts.AsmPreprocessor) {
     // A zero return from tryReadUCN() falls through to tok::unknown below.
3788 if (uint32_t CodePoint = tryReadUCN(CurPtr, BufferPtr, &Result)) {
3789 if (CheckUnicodeWhitespace(Result, CodePoint, CurPtr)) {
3790 if (SkipWhitespace(Result, CurPtr, TokAtPhysicalStartOfLine))
3791 return true; // KeepWhitespaceMode
3792
3793 // We only saw whitespace, so just try again with this lexer.
3794 // (We manually eliminate the tail call to avoid recursion.)
3795 goto LexNextToken;
3796 }
3797
3798 return LexUnicode(Result, CodePoint, CurPtr);
3799 }
3800 }
3801
3802 Kind = tok::unknown;
3803 break;
3804
3805 default: {
     // Any remaining ASCII character becomes tok::unknown. A non-ASCII byte is
     // decoded below as the start of a UTF-8 sequence.
3806 if (isASCII(Char)) {
3807 Kind = tok::unknown;
3808 break;
3809 }
3810
3811 llvm::UTF32 CodePoint;
3812
3813 // We can't just reset CurPtr to BufferPtr because BufferPtr may point to
3814 // an escaped newline.
3815 --CurPtr;
3816 const char *UTF8StartPtr = CurPtr;
3817 llvm::ConversionResult Status =
3818 llvm::convertUTF8Sequence((const llvm::UTF8 **)&CurPtr,
3819 (const llvm::UTF8 *)BufferEnd,
3820 &CodePoint,
3821 llvm::strictConversion);
3822 if (Status == llvm::conversionOK) {
3823 if (CheckUnicodeWhitespace(Result, CodePoint, CurPtr)) {
3824 if (SkipWhitespace(Result, CurPtr, TokAtPhysicalStartOfLine))
3825 return true; // KeepWhitespaceMode
3826
3827 // We only saw whitespace, so just try again with this lexer.
3828 // (We manually eliminate the tail call to avoid recursion.)
3829 goto LexNextToken;
3830 }
     // NOTE(review): PP is dereferenced here and in the condition at 3837-3838
     // only when isLexingRawMode() is false. This assumes a lexer that is not
     // in raw mode always has a non-null PP (confirm).
3831 if (!isLexingRawMode())
3832 maybeDiagnoseUTF8Homoglyph(PP->getDiagnostics(), CodePoint,
3833 makeCharRange(*this, UTF8StartPtr, CurPtr));
3834 return LexUnicode(Result, CodePoint, CurPtr);
3835 }
3836
     // Conversion failed. In these modes, re-advance over the single byte that
     // was backed up above and emit it as tok::unknown without a diagnostic.
3837 if (isLexingRawMode() || ParsingPreprocessorDirective ||
3838 PP->isPreprocessedOutput()) {
3839 ++CurPtr;
3840 Kind = tok::unknown;
3841 break;
3842 }
3843
3844 // Non-ASCII characters tend to creep into source code unintentionally.
3845 // Instead of letting the parser complain about the unknown token,
3846 // just diagnose the invalid UTF-8, then drop the character.
3847 Diag(CurPtr, diag::err_invalid_utf8);
3848
3849 BufferPtr = CurPtr+1;
3850 // We're pretending the character didn't exist, so just try again with
3851 // this lexer.
3852 // (We manually eliminate the tail call to avoid recursion.)
3853 goto LexNextToken;
3854 }
3855 }
3856
     // Common tail: reached by `break` from the cases above that assigned Kind.
3857 // Notify MIOpt that we read a non-whitespace/non-comment token.
3858 MIOpt.ReadToken();
3859
3860 // Update the location of token as well as BufferPtr.
3861 FormTokenWithChars(Result, CurPtr, Kind);
3862 return true;
3863
     // Reached only by goto from the '#' case and the '%:' digraph branch when
     // TokAtPhysicalStartOfLine && !LexingRawMode && !Is_PragmaLexer.
3864HandleDirective:
3865 // We parsed a # character and it's the start of a preprocessing directive.
3866
3867 FormTokenWithChars(Result, CurPtr, tok::hash);
3868 PP->HandleDirective(Result);
3869
3870 if (PP->hadModuleLoaderFatalFailure()) {
3871 // With a fatal failure in the module loader, we abort parsing.
     // NOTE: the text after the closing parenthesis of assert(...) below, up to
     // the lone ';', appears to be the analyzer report's inlined expansion of
     // the assert macro rather than additional source text.
3872 assert(Result.is(tok::eof) && "Preprocessor did not set tok:eof")(static_cast <bool> (Result.is(tok::eof) && "Preprocessor did not set tok:eof"
) ? void (0) : __assert_fail ("Result.is(tok::eof) && \"Preprocessor did not set tok:eof\""
, "/build/llvm-toolchain-snapshot-7~svn329677/tools/clang/lib/Lex/Lexer.cpp"
, 3872, __extension__ __PRETTY_FUNCTION__))
;
3873 return true;
3874 }
3875
3876 // We parsed the directive; lex a token with the new state.
3877 return false;
3878}

/build/llvm-toolchain-snapshot-7~svn329677/tools/clang/include/clang/Lex/Lexer.h

1//===- Lexer.h - C Language Family Lexer ------------------------*- C++ -*-===//
2//
3// The LLVM Compiler Infrastructure
4//
5// This file is distributed under the University of Illinois Open Source
6// License. See LICENSE.TXT for details.
7//
8//===----------------------------------------------------------------------===//
9//
10// This file defines the Lexer interface.
11//
12//===----------------------------------------------------------------------===//
13
14#ifndef LLVM_CLANG_LEX_LEXER_H
15#define LLVM_CLANG_LEX_LEXER_H
16
17#include "clang/Basic/LangOptions.h"
18#include "clang/Basic/SourceLocation.h"
19#include "clang/Basic/TokenKinds.h"
20#include "clang/Lex/PreprocessorLexer.h"
21#include "clang/Lex/Token.h"
22#include "llvm/ADT/Optional.h"
23#include "llvm/ADT/SmallVector.h"
24#include "llvm/ADT/StringRef.h"
25#include <cassert>
26#include <cstdint>
27#include <string>
28
29namespace llvm {
30
31class MemoryBuffer;
32
33} // namespace llvm
34
35namespace clang {
36
37class DiagnosticBuilder;
38class Preprocessor;
39class SourceManager;
40
41/// ConflictMarkerKind - Kinds of conflict marker which the lexer might be
42/// recovering from.
  ///
  /// The Lexer class keeps the kind it is currently handling in its
  /// CurrentConflictMarkerState member.
43enum ConflictMarkerKind {
44 /// Not within a conflict marker.
45 CMK_None,
46
47 /// A normal or diff3 conflict marker, initiated by at least 7 "<"s,
48 /// separated by at least 7 "="s or "|"s, and terminated by at least 7 ">"s.
49 CMK_Normal,
50
51 /// A Perforce-style conflict marker, initiated by 4 ">"s,
52 /// separated by 4 "="s, and terminated by 4 "<"s.
   /// Note that the opening and closing characters are the reverse of
   /// CMK_Normal: ">" opens and "<" closes.
53 CMK_Perforce
54};
55
56/// Describes the bounds (start, size) of the preamble and a flag required by
57/// PreprocessorOptions::PrecompiledPreambleBytes.
58/// The preamble includes the BOM, if any.
  ///
  /// NOTE(review): the summary mentions a start, but this struct stores only
  /// Size and the end-of-line flag; there is no start field.
59struct PreambleBounds {
60 /// \brief Size of the preamble in bytes.
61 unsigned Size;
62
63 /// \brief Whether the preamble ends at the start of a new line.
64 ///
65 /// Used to inform the lexer as to whether it's starting at the beginning of
66 /// a line after skipping the preamble.
67 bool PreambleEndsAtStartOfLine;
68
   /// Initializes both members from the like-named parameters.
69 PreambleBounds(unsigned Size, bool PreambleEndsAtStartOfLine)
70 : Size(Size), PreambleEndsAtStartOfLine(PreambleEndsAtStartOfLine) {}
71};
72
73/// Lexer - This provides a simple interface that turns a text buffer into a
74/// stream of tokens. This provides no support for file reading or buffering,
75/// or buffering/seeking of tokens, only forward lexing is supported. It relies
76/// on the specified Preprocessor object to handle preprocessor directives, etc.
77class Lexer : public PreprocessorLexer {
78 friend class Preprocessor;
79
80 void anchor() override;
81
82 //===--------------------------------------------------------------------===//
83 // Constant configuration values for this lexer.
84
85 // Start of the buffer.
86 const char *BufferStart;
87
88 // End of the buffer.
89 const char *BufferEnd;
90
91 // Location for start of file.
92 SourceLocation FileLoc;
93
94 // LangOpts enabled by this language (cache).
95 LangOptions LangOpts;
96
97 // True if lexer for _Pragma handling.
98 bool Is_PragmaLexer;
99
100 //===--------------------------------------------------------------------===//
101 // Context-specific lexing flags set by the preprocessor.
102 //
103
104 /// ExtendedTokenMode - The lexer can optionally keep comments and whitespace
105 /// and return them as tokens. This is used for -C and -CC modes, and
106 /// whitespace preservation can be useful for some clients that want to lex
107 /// the file in raw mode and get every character from the file.
108 ///
109 /// When this is set to 2 it returns comments and whitespace. When set to 1
110 /// it returns comments, when it is set to 0 it returns normal tokens only.
111 unsigned char ExtendedTokenMode;
112
113 //===--------------------------------------------------------------------===//
114 // Context that changes as the file is lexed.
115 // NOTE: any state that mutates when in raw mode must have save/restore code
116 // in Lexer::isNextPPTokenLParen.
117
118 // BufferPtr - Current pointer into the buffer. This is the next character
119 // to be lexed.
120 const char *BufferPtr;
121
122 // IsAtStartOfLine - True if the next lexed token should get the "start of
123 // line" flag set on it.
124 bool IsAtStartOfLine;
125
126 bool IsAtPhysicalStartOfLine;
127
128 bool HasLeadingSpace;
129
130 bool HasLeadingEmptyMacro;
131
132 // CurrentConflictMarkerState - The kind of conflict marker we are handling.
133 ConflictMarkerKind CurrentConflictMarkerState;
134
135 void InitLexer(const char *BufStart, const char *BufPtr, const char *BufEnd);
136
137public:
138 /// Lexer constructor - Create a new lexer object for the specified buffer
139 /// with the specified preprocessor managing the lexing process. This lexer
140 /// assumes that the associated file buffer and Preprocessor objects will
141 /// outlive it, so it doesn't take ownership of either of them.
142 Lexer(FileID FID, const llvm::MemoryBuffer *InputBuffer, Preprocessor &PP);
143
144 /// Lexer constructor - Create a new raw lexer object. This object is only
145 /// suitable for calls to 'LexFromRawLexer'. This lexer assumes that the
146 /// text range will outlive it, so it doesn't take ownership of it.
147 Lexer(SourceLocation FileLoc, const LangOptions &LangOpts,
148 const char *BufStart, const char *BufPtr, const char *BufEnd);
149
150 /// Lexer constructor - Create a new raw lexer object. This object is only
151 /// suitable for calls to 'LexFromRawLexer'. This lexer assumes that the
152 /// text range will outlive it, so it doesn't take ownership of it.
153 Lexer(FileID FID, const llvm::MemoryBuffer *InputBuffer,
154 const SourceManager &SM, const LangOptions &LangOpts);
155
156 Lexer(const Lexer &) = delete;
157 Lexer &operator=(const Lexer &) = delete;
158
159 /// Create_PragmaLexer: Lexer constructor - Create a new lexer object for
160 /// _Pragma expansion. This has a variety of magic semantics that this method
161 /// sets up. It returns a new'd Lexer that must be delete'd when done.
162 static Lexer *Create_PragmaLexer(SourceLocation SpellingLoc,
163 SourceLocation ExpansionLocStart,
164 SourceLocation ExpansionLocEnd,
165 unsigned TokLen, Preprocessor &PP);
166
167 /// getLangOpts - Return the language features currently enabled.
168 /// NOTE: this lexer modifies features as a file is parsed!
169 const LangOptions &getLangOpts() const { return LangOpts; }
170
171 /// getFileLoc - Return the File Location for the file we are lexing out of.
172 /// The physical location encodes the location where the characters come from,
173 /// the virtual location encodes where we should *claim* the characters came
174 /// from. Currently this is only used by _Pragma handling.
175 SourceLocation getFileLoc() const { return FileLoc; }
176
177private:
178 /// Lex - Return the next token in the file. If this is the end of file, it
179 /// returns the tok::eof token. This implicitly involves the preprocessor.
180 bool Lex(Token &Result);
181
182public:
183 /// isPragmaLexer - Returns true if this Lexer is being used to lex a pragma.
184 bool isPragmaLexer() const { return Is_PragmaLexer; }
185
186private:
187 /// IndirectLex - An indirect call to 'Lex' that can be invoked via
188 /// the PreprocessorLexer interface.
189 void IndirectLex(Token &Result) override { Lex(Result); }
190
191public:
192 /// LexFromRawLexer - Lex a token from a designated raw lexer (one with no
193 /// associated preprocessor object. Return true if the 'next character to
194 /// read' pointer points at the end of the lexer buffer, false otherwise.
195 bool LexFromRawLexer(Token &Result) {
196 assert(LexingRawMode && "Not already in raw mode!")(static_cast <bool> (LexingRawMode && "Not already in raw mode!"
) ? void (0) : __assert_fail ("LexingRawMode && \"Not already in raw mode!\""
, "/build/llvm-toolchain-snapshot-7~svn329677/tools/clang/include/clang/Lex/Lexer.h"
, 196, __extension__ __PRETTY_FUNCTION__))
;
197 Lex(Result);
198 // Note that lexing to the end of the buffer doesn't implicitly delete the
199 // lexer when in raw mode.
200 return BufferPtr == BufferEnd;
201 }
202
203 /// isKeepWhitespaceMode - Return true if the lexer should return tokens for
204 /// every character in the file, including whitespace and comments. This
205 /// should only be used in raw mode, as the preprocessor is not prepared to
206 /// deal with the excess tokens.
207 bool isKeepWhitespaceMode() const {
208 return ExtendedTokenMode > 1;
209 }
210
211 /// SetKeepWhitespaceMode - This method lets clients enable or disable
212 /// whitespace retention mode.
213 void SetKeepWhitespaceMode(bool Val) {
214 assert((!Val || LexingRawMode || LangOpts.TraditionalCPP) &&(static_cast <bool> ((!Val || LexingRawMode || LangOpts
.TraditionalCPP) && "Can only retain whitespace in raw mode or -traditional-cpp"
) ? void (0) : __assert_fail ("(!Val || LexingRawMode || LangOpts.TraditionalCPP) && \"Can only retain whitespace in raw mode or -traditional-cpp\""
, "/build/llvm-toolchain-snapshot-7~svn329677/tools/clang/include/clang/Lex/Lexer.h"
, 215, __extension__ __PRETTY_FUNCTION__))
215 "Can only retain whitespace in raw mode or -traditional-cpp")(static_cast <bool> ((!Val || LexingRawMode || LangOpts
.TraditionalCPP) && "Can only retain whitespace in raw mode or -traditional-cpp"
) ? void (0) : __assert_fail ("(!Val || LexingRawMode || LangOpts.TraditionalCPP) && \"Can only retain whitespace in raw mode or -traditional-cpp\""
, "/build/llvm-toolchain-snapshot-7~svn329677/tools/clang/include/clang/Lex/Lexer.h"
, 215, __extension__ __PRETTY_FUNCTION__))
;
216 ExtendedTokenMode = Val ? 2 : 0;
217 }
218
219 /// inKeepCommentMode - Return true if the lexer should return comments as
220 /// tokens.
221 bool inKeepCommentMode() const {
222 return ExtendedTokenMode > 0;
223 }
224
225 /// SetCommentRetentionMode - Change the comment retention mode of the lexer
226 /// to the specified mode. This is really only useful when lexing in raw
227 /// mode, because otherwise the lexer needs to manage this.
228 void SetCommentRetentionState(bool Mode) {
229 assert(!isKeepWhitespaceMode() &&(static_cast <bool> (!isKeepWhitespaceMode() &&
"Can't play with comment retention state when retaining whitespace"
) ? void (0) : __assert_fail ("!isKeepWhitespaceMode() && \"Can't play with comment retention state when retaining whitespace\""
, "/build/llvm-toolchain-snapshot-7~svn329677/tools/clang/include/clang/Lex/Lexer.h"
, 230, __extension__ __PRETTY_FUNCTION__))
230 "Can't play with comment retention state when retaining whitespace")(static_cast <bool> (!isKeepWhitespaceMode() &&
"Can't play with comment retention state when retaining whitespace"
) ? void (0) : __assert_fail ("!isKeepWhitespaceMode() && \"Can't play with comment retention state when retaining whitespace\""
, "/build/llvm-toolchain-snapshot-7~svn329677/tools/clang/include/clang/Lex/Lexer.h"
, 230, __extension__ __PRETTY_FUNCTION__))
;
231 ExtendedTokenMode = Mode ? 1 : 0;
232 }
233
234 /// Sets the extended token mode back to its initial value, according to the
235 /// language options and preprocessor. This controls whether the lexer
236 /// produces comment and whitespace tokens.
237 ///
238 /// This requires the lexer to have an associated preprocessor. A standalone
239 /// lexer has nothing to reset to.
240 void resetExtendedTokenMode();
241
242 /// Gets source code buffer.
243 StringRef getBuffer() const {
244 return StringRef(BufferStart, BufferEnd - BufferStart);
245 }
246
247 /// ReadToEndOfLine - Read the rest of the current preprocessor line as an
248 /// uninterpreted string. This switches the lexer out of directive mode.
249 void ReadToEndOfLine(SmallVectorImpl<char> *Result = nullptr);
250
251
252 /// Diag - Forwarding function for diagnostics. This translate a source
253 /// position in the current buffer into a SourceLocation object for rendering.
254 DiagnosticBuilder Diag(const char *Loc, unsigned DiagID) const;
255
256 /// getSourceLocation - Return a source location identifier for the specified
257 /// offset in the current file.
258 SourceLocation getSourceLocation(const char *Loc, unsigned TokLen = 1) const;
259
260 /// getSourceLocation - Return a source location for the next character in
261 /// the current file.
262 SourceLocation getSourceLocation() override {
263 return getSourceLocation(BufferPtr);
264 }
265
266 /// \brief Return the current location in the buffer.
267 const char *getBufferLocation() const { return BufferPtr; }
268
269 /// Stringify - Convert the specified string into a C string by i) escaping
270 /// '\\' and " characters and ii) replacing newline character(s) with "\\n".
271 /// If Charify is true, this escapes the ' character instead of ".
272 static std::string Stringify(StringRef Str, bool Charify = false);
273
274 /// Stringify - Convert the specified string into a C string by i) escaping
275 /// '\\' and " characters and ii) replacing newline character(s) with "\\n".
276 static void Stringify(SmallVectorImpl<char> &Str);
277
278 /// getSpelling - This method is used to get the spelling of a token into a
279 /// preallocated buffer, instead of as an std::string. The caller is required
280 /// to allocate enough space for the token, which is guaranteed to be at least
281 /// Tok.getLength() bytes long. The length of the actual result is returned.
282 ///
283 /// Note that this method may do two possible things: it may either fill in
284 /// the buffer specified with characters, or it may *change the input pointer*
285 /// to point to a constant buffer with the data already in it (avoiding a
286 /// copy). The caller is not allowed to modify the returned buffer pointer
287 /// if an internal buffer is returned.
288 static unsigned getSpelling(const Token &Tok, const char *&Buffer,
289 const SourceManager &SourceMgr,
290 const LangOptions &LangOpts,
291 bool *Invalid = nullptr);
292
293 /// getSpelling() - Return the 'spelling' of the Tok token. The spelling of a
294 /// token is the characters used to represent the token in the source file
295 /// after trigraph expansion and escaped-newline folding. In particular, this
296 /// wants to get the true, uncanonicalized, spelling of things like digraphs,
297 /// UCNs, etc.
298 static std::string getSpelling(const Token &Tok,
299 const SourceManager &SourceMgr,
300 const LangOptions &LangOpts,
301 bool *Invalid = nullptr);
302
303 /// getSpelling - This method is used to get the spelling of the
304 /// token at the given source location. If, as is usually true, it
305 /// is not necessary to copy any data, then the returned string may
306 /// not point into the provided buffer.
307 ///
308 /// This method lexes at the expansion depth of the given
309 /// location and does not jump to the expansion or spelling
310 /// location.
311 static StringRef getSpelling(SourceLocation loc,
312 SmallVectorImpl<char> &buffer,
313 const SourceManager &SourceMgr,
314 const LangOptions &LangOpts,
315 bool *invalid = nullptr);
316
317 /// MeasureTokenLength - Relex the token at the specified location and return
318 /// its length in bytes in the input file. If the token needs cleaning (e.g.
319 /// includes a trigraph or an escaped newline) then this count includes bytes
320 /// that are part of that.
321 static unsigned MeasureTokenLength(SourceLocation Loc,
322 const SourceManager &SM,
323 const LangOptions &LangOpts);
324
325 /// \brief Relex the token at the specified location.
326 /// \returns true if there was a failure, false on success.
327 static bool getRawToken(SourceLocation Loc, Token &Result,
328 const SourceManager &SM,
329 const LangOptions &LangOpts,
330 bool IgnoreWhiteSpace = false);
331
332 /// \brief Given a location anywhere in a source buffer, find the location
333 /// that corresponds to the beginning of the token in which the original
334 /// source location lands.
335 static SourceLocation GetBeginningOfToken(SourceLocation Loc,
336 const SourceManager &SM,
337 const LangOptions &LangOpts);
338
339 /// AdvanceToTokenCharacter - If the current SourceLocation specifies a
340 /// location at the start of a token, return a new location that specifies a
341 /// character within the token. This handles trigraphs and escaped newlines.
342 static SourceLocation AdvanceToTokenCharacter(SourceLocation TokStart,
343 unsigned Character,
344 const SourceManager &SM,
345 const LangOptions &LangOpts);
346
347 /// \brief Computes the source location just past the end of the
348 /// token at this source location.
349 ///
350 /// This routine can be used to produce a source location that
351 /// points just past the end of the token referenced by \p Loc, and
352 /// is generally used when a diagnostic needs to point just after a
353 /// token where it expected something different than it received. If
354 /// the returned source location would not be meaningful (e.g., if
355 /// it points into a macro), this routine returns an invalid
356 /// source location.
357 ///
358 /// \param Offset an offset from the end of the token, where the source
359 /// location should refer to. The default offset (0) produces a source
360 /// location pointing just past the end of the token; an offset of 1 produces
361 /// a source location pointing to the last character in the token, etc.
362 static SourceLocation getLocForEndOfToken(SourceLocation Loc, unsigned Offset,
363 const SourceManager &SM,
364 const LangOptions &LangOpts);
365
366 /// \brief Given a token range, produce a corresponding CharSourceRange that
367 /// is not a token range. This allows the source range to be used by
368 /// components that don't have access to the lexer and thus can't find the
369 /// end of the range for themselves.
370 static CharSourceRange getAsCharRange(SourceRange Range,
371 const SourceManager &SM,
372 const LangOptions &LangOpts) {
373 SourceLocation End = getLocForEndOfToken(Range.getEnd(), 0, SM, LangOpts);
374 return End.isInvalid() ? CharSourceRange()
375 : CharSourceRange::getCharRange(
376 Range.getBegin(), End.getLocWithOffset(-1));
377 }
378 static CharSourceRange getAsCharRange(CharSourceRange Range,
379 const SourceManager &SM,
380 const LangOptions &LangOpts) {
381 return Range.isTokenRange()
382 ? getAsCharRange(Range.getAsRange(), SM, LangOpts)
383 : Range;
384 }
385
386 /// \brief Returns true if the given MacroID location points at the first
387 /// token of the macro expansion.
388 ///
389 /// \param MacroBegin If non-null and function returns true, it is set to
390 /// begin location of the macro.
391 static bool isAtStartOfMacroExpansion(SourceLocation loc,
392 const SourceManager &SM,
393 const LangOptions &LangOpts,
394 SourceLocation *MacroBegin = nullptr);
395
396 /// \brief Returns true if the given MacroID location points at the last
397 /// token of the macro expansion.
398 ///
399 /// \param MacroEnd If non-null and function returns true, it is set to
400 /// end location of the macro.
401 static bool isAtEndOfMacroExpansion(SourceLocation loc,
402 const SourceManager &SM,
403 const LangOptions &LangOpts,
404 SourceLocation *MacroEnd = nullptr);
405
406 /// \brief Accepts a range and returns a character range with file locations.
407 ///
408 /// Returns a null range if a part of the range resides inside a macro
409 /// expansion or the range does not reside on the same FileID.
410 ///
411 /// This function is trying to deal with macros and return a range based on
412 /// file locations. The cases where it can successfully handle macros are:
413 ///
414 /// -begin or end range lies at the start or end of a macro expansion, in
415 /// which case the location will be set to the expansion point, e.g:
416 /// \#define M 1 2
417 /// a M
418 /// If you have a range [a, 2] (where 2 came from the macro), the function
419 /// will return a range for "a M"
420 /// if you have range [a, 1], the function will fail because the range
421 /// overlaps with only a part of the macro
422 ///
423 /// -The macro is a function macro and the range can be mapped to the macro
424 /// arguments, e.g:
425 /// \#define M 1 2
426 /// \#define FM(x) x
427 /// FM(a b M)
428 /// if you have range [b, 2], the function will return the file range "b M"
429 /// inside the macro arguments.
430 /// if you have range [a, 2], the function will return the file range
431 /// "FM(a b M)" since the range includes all of the macro expansion.
432 static CharSourceRange makeFileCharRange(CharSourceRange Range,
433 const SourceManager &SM,
434 const LangOptions &LangOpts);
435
436 /// \brief Returns a string for the source that the range encompasses.
437 static StringRef getSourceText(CharSourceRange Range,
438 const SourceManager &SM,
439 const LangOptions &LangOpts,
440 bool *Invalid = nullptr);
441
442 /// \brief Retrieve the name of the immediate macro expansion.
443 ///
444 /// This routine starts from a source location, and finds the name of the macro
445 /// responsible for its immediate expansion. It looks through any intervening
446 /// macro argument expansions to compute this. It returns a StringRef which
447 /// refers to the SourceManager-owned buffer of the source where that macro
448 /// name is spelled. Thus, the result shouldn't out-live that SourceManager.
449 static StringRef getImmediateMacroName(SourceLocation Loc,
450 const SourceManager &SM,
451 const LangOptions &LangOpts);
452
453 /// \brief Retrieve the name of the immediate macro expansion.
454 ///
455 /// This routine starts from a source location, and finds the name of the
456 /// macro responsible for its immediate expansion. It looks through any
457 /// intervening macro argument expansions to compute this. It returns a
458 /// StringRef which refers to the SourceManager-owned buffer of the source
459 /// where that macro name is spelled. Thus, the result shouldn't out-live
460 /// that SourceManager.
461 ///
462 /// This differs from Lexer::getImmediateMacroName in that any macro argument
463 /// location will result in the topmost function macro that accepted it.
464 /// e.g.
465 /// \code
466 /// MAC1( MAC2(foo) )
467 /// \endcode
468 /// for location of 'foo' token, this function will return "MAC1" while
469 /// Lexer::getImmediateMacroName will return "MAC2".
470 static StringRef getImmediateMacroNameForDiagnostics(
471 SourceLocation Loc, const SourceManager &SM, const LangOptions &LangOpts);
472
473 /// \brief Compute the preamble of the given file.
474 ///
475 /// The preamble of a file contains the initial comments, include directives,
476 /// and other preprocessor directives that occur before the code in this
477 /// particular file actually begins. The preamble of the main source file is
478 /// a potential prefix header.
479 ///
480 /// \param Buffer The memory buffer containing the file's contents.
481 ///
482 /// \param MaxLines If non-zero, restrict the length of the preamble
483 /// to fewer than this number of lines.
484 ///
485 /// \returns The offset into the file where the preamble ends and the rest
486 /// of the file begins along with a boolean value indicating whether
487 /// the preamble ends at the beginning of a new line.
488 static PreambleBounds ComputePreamble(StringRef Buffer,
489 const LangOptions &LangOpts,
490 unsigned MaxLines = 0);
491
492 /// Finds the token that comes right after the given location.
493 ///
494 /// Returns the next token, or none if the location is inside a macro.
495 static Optional<Token> findNextToken(SourceLocation Loc,
496 const SourceManager &SM,
497 const LangOptions &LangOpts);
498
499 /// \brief Checks that the given token is the first token that occurs after
500 /// the given location (this excludes comments and whitespace). Returns the
501 /// location immediately after the specified token. If the token is not found
502 /// or the location is inside a macro, the returned source location will be
503 /// invalid.
504 static SourceLocation findLocationAfterToken(SourceLocation loc,
505 tok::TokenKind TKind,
506 const SourceManager &SM,
507 const LangOptions &LangOpts,
508 bool SkipTrailingWhitespaceAndNewLine);
509
510 /// \brief Returns true if the given character could appear in an identifier.
511 static bool isIdentifierBodyChar(char c, const LangOptions &LangOpts);
512
513 /// \brief Checks whether new line pointed by Str is preceded by escape
514 /// sequence.
515 static bool isNewLineEscaped(const char *BufferStart, const char *Str);
516
517 /// getCharAndSizeNoWarn - Like the getCharAndSize method, but does not ever
518 /// emit a warning.
519 static inline char getCharAndSizeNoWarn(const char *Ptr, unsigned &Size,
520 const LangOptions &LangOpts) {
521 // If this is not a trigraph and not a UCN or escaped newline, return
522 // quickly.
523 if (isObviouslySimpleCharacter(Ptr[0])) {
524 Size = 1;
525 return *Ptr;
526 }
527
528 Size = 0;
529 return getCharAndSizeSlowNoWarn(Ptr, Size, LangOpts);
530 }
531
532 /// Returns the leading whitespace for line that corresponds to the given
533 /// location \p Loc.
534 static StringRef getIndentationForLine(SourceLocation Loc,
535 const SourceManager &SM);
536
537private:
538 //===--------------------------------------------------------------------===//
539 // Internal implementation interfaces.
540
541 /// LexTokenInternal - Internal interface to lex a preprocessing token. Called
542 /// by Lex.
543 ///
544 bool LexTokenInternal(Token &Result, bool TokAtPhysicalStartOfLine);
545
546 bool CheckUnicodeWhitespace(Token &Result, uint32_t C, const char *CurPtr);
547
548 /// Given that a token begins with the Unicode character \p C, figure out
549 /// what kind of token it is and dispatch to the appropriate lexing helper
550 /// function.
551 bool LexUnicode(Token &Result, uint32_t C, const char *CurPtr);
552
553 /// FormTokenWithChars - When we lex a token, we have identified a span
554 /// starting at BufferPtr, going to TokEnd that forms the token. This method
555 /// takes that range and assigns it to the token as its location and size. In
556 /// addition, since tokens cannot overlap, this also updates BufferPtr to be
557 /// TokEnd.
558 void FormTokenWithChars(Token &Result, const char *TokEnd,
559 tok::TokenKind Kind) {
560 unsigned TokLen = TokEnd-BufferPtr;
561 Result.setLength(TokLen);
562 Result.setLocation(getSourceLocation(BufferPtr, TokLen));
563 Result.setKind(Kind);
564 BufferPtr = TokEnd;
565 }
566
567 /// isNextPPTokenLParen - Return 1 if the next unexpanded token will return a
568 /// tok::l_paren token, 0 if it is something else and 2 if there are no more
569 /// tokens in the buffer controlled by this lexer.
570 unsigned isNextPPTokenLParen();
571
572 //===--------------------------------------------------------------------===//
573 // Lexer character reading interfaces.
574
575 // This lexer is built on two interfaces for reading characters, both of which
576 // automatically provide phase 1/2 translation. getAndAdvanceChar is used
577 // when we know that we will be reading a character from the input buffer and
578 // that this character will be part of the result token. This occurs in (f.e.)
579 // string processing, because we know we need to read until we find the
580 // closing '"' character.
581 //
582 // The second interface is the combination of getCharAndSize with
583 // ConsumeChar. getCharAndSize reads a phase 1/2 translated character,
584 // returning it and its size. If the lexer decides that this character is
585 // part of the current token, it calls ConsumeChar on it. This two stage
586 // approach allows us to emit diagnostics for characters (e.g. warnings about
587 // trigraphs), knowing that they only are emitted if the character is
588 // consumed.
589
/// isObviouslySimpleCharacter - Return true if the specified character is
/// obviously the same in translation phase 1 and translation phase 3. This
/// may answer false for characters that end up being unchanged, but it never
/// answers true for something that needs to be mapped.
static bool isObviouslySimpleCharacter(char C) {
  // Only '?' and '\\' are treated as possibly needing mapping.
  const bool MayNeedMapping = (C == '?' || C == '\\');
  return !MayNeedMapping;
}
597
598 /// getAndAdvanceChar - Read a single 'character' from the specified buffer,
599 /// advance over it, and return it. This is tricky in several cases. Here we
600 /// just handle the trivial case and fall-back to the non-inlined
601 /// getCharAndSizeSlow method to handle the hard case.
602 inline char getAndAdvanceChar(const char *&Ptr, Token &Tok) {
603 // If this is not a trigraph and not a UCN or escaped newline, return
604 // quickly.
605 if (isObviouslySimpleCharacter(Ptr[0])) return *Ptr++;
3
Taking false branch
606
607 unsigned Size = 0;
608 char C = getCharAndSizeSlow(Ptr, Size, &Tok);
4
Calling 'Lexer::getCharAndSizeSlow'
609 Ptr += Size;
610 return C;
611 }
612
613 /// ConsumeChar - When a character (identified by getCharAndSize) is consumed
614 /// and added to a given token, check to see if there are diagnostics that
615 /// need to be emitted or flags that need to be set on the token. If so, do
616 /// it.
617 const char *ConsumeChar(const char *Ptr, unsigned Size, Token &Tok) {
618 // Normal case, we consumed exactly one token. Just return it.
619 if (Size == 1)
620 return Ptr+Size;
621
622 // Otherwise, re-lex the character with a current token, allowing
623 // diagnostics to be emitted and flags to be set.
624 Size = 0;
625 getCharAndSizeSlow(Ptr, Size, &Tok);
626 return Ptr+Size;
627 }
628
629 /// getCharAndSize - Peek a single 'character' from the specified buffer,
630 /// get its size, and return it. This is tricky in several cases. Here we
631 /// just handle the trivial case and fall-back to the non-inlined
632 /// getCharAndSizeSlow method to handle the hard case.
633 inline char getCharAndSize(const char *Ptr, unsigned &Size) {
634 // If this is not a trigraph and not a UCN or escaped newline, return
635 // quickly.
636 if (isObviouslySimpleCharacter(Ptr[0])) {
637 Size = 1;
638 return *Ptr;
639 }
640
641 Size = 0;
642 return getCharAndSizeSlow(Ptr, Size);
643 }
644
645 /// getCharAndSizeSlow - Handle the slow/uncommon case of the getCharAndSize
646 /// method.
647 char getCharAndSizeSlow(const char *Ptr, unsigned &Size,
648 Token *Tok = nullptr);
649
650 /// getEscapedNewLineSize - Return the size of the specified escaped newline,
651 /// or 0 if it is not an escaped newline. P[-1] is known to be a "\" on entry
652 /// to this function.
653 static unsigned getEscapedNewLineSize(const char *P);
654
655 /// SkipEscapedNewLines - If P points to an escaped newline (or a series of
656 /// them), skip over them and return the first non-escaped-newline found,
657 /// otherwise return P.
658 static const char *SkipEscapedNewLines(const char *P);
659
660 /// getCharAndSizeSlowNoWarn - Same as getCharAndSizeSlow, but never emits a
661 /// diagnostic.
662 static char getCharAndSizeSlowNoWarn(const char *Ptr, unsigned &Size,
663 const LangOptions &LangOpts);
664
665 //===--------------------------------------------------------------------===//
666 // Other lexer functions.
667
668 void SetByteOffset(unsigned Offset, bool StartOfLine);
669
670 void PropagateLineStartLeadingSpaceInfo(Token &Result);
671
672 const char *LexUDSuffix(Token &Result, const char *CurPtr,
673 bool IsStringLiteral);
674
675 // Helper functions to lex the remainder of a token of the specific type.
676 bool LexIdentifier (Token &Result, const char *CurPtr);
677 bool LexNumericConstant (Token &Result, const char *CurPtr);
678 bool LexStringLiteral (Token &Result, const char *CurPtr,
679 tok::TokenKind Kind);
680 bool LexRawStringLiteral (Token &Result, const char *CurPtr,
681 tok::TokenKind Kind);
682 bool LexAngledStringLiteral(Token &Result, const char *CurPtr);
683 bool LexCharConstant (Token &Result, const char *CurPtr,
684 tok::TokenKind Kind);
685 bool LexEndOfFile (Token &Result, const char *CurPtr);
686 bool SkipWhitespace (Token &Result, const char *CurPtr,
687 bool &TokAtPhysicalStartOfLine);
688 bool SkipLineComment (Token &Result, const char *CurPtr,
689 bool &TokAtPhysicalStartOfLine);
690 bool SkipBlockComment (Token &Result, const char *CurPtr,
691 bool &TokAtPhysicalStartOfLine);
692 bool SaveLineComment (Token &Result, const char *CurPtr);
693
694 bool IsStartOfConflictMarker(const char *CurPtr);
695 bool HandleEndOfConflictMarker(const char *CurPtr);
696
697 bool lexEditorPlaceholder(Token &Result, const char *CurPtr);
698
699 bool isCodeCompletionPoint(const char *CurPtr) const;
700 void cutOffLexing() { BufferPtr = BufferEnd; }
701
702 bool isHexaLiteral(const char *Start, const LangOptions &LangOpts);
703
704
705 /// Read a universal character name.
706 ///
707 /// \param CurPtr The position in the source buffer after the initial '\'.
708 /// If the UCN is syntactically well-formed (but not necessarily
709 /// valid), this parameter will be updated to point to the
710 /// character after the UCN.
711 /// \param SlashLoc The position in the source buffer of the '\'.
712 /// \param Tok The token being formed. Pass \c nullptr to suppress diagnostics
713 /// and handle token formation in the caller.
714 ///
715 /// \return The Unicode codepoint specified by the UCN, or 0 if the UCN is
716 /// invalid.
717 uint32_t tryReadUCN(const char *&CurPtr, const char *SlashLoc, Token *Tok);
718
719 /// \brief Try to consume a UCN as part of an identifier at the current
720 /// location.
721 /// \param CurPtr Initially points to the range of characters in the source
722 /// buffer containing the '\'. Updated to point past the end of
723 /// the UCN on success.
724 /// \param Size The number of characters occupied by the '\' (including
725 /// trigraphs and escaped newlines).
726 /// \param Result The token being produced. Marked as containing a UCN on
727 /// success.
728 /// \return \c true if a UCN was lexed and it produced an acceptable
729 /// identifier character, \c false otherwise.
730 bool tryConsumeIdentifierUCN(const char *&CurPtr, unsigned Size,
731 Token &Result);
732
733 /// \brief Try to consume an identifier character encoded in UTF-8.
734 /// \param CurPtr Points to the start of the (potential) UTF-8 code unit
735 /// sequence. On success, updated to point past the end of it.
736 /// \return \c true if a UTF-8 sequence mapping to an acceptable identifier
737 /// character was lexed, \c false otherwise.
738 bool tryConsumeIdentifierUTF8Char(const char *&CurPtr);
739};
740
741} // namespace clang
742
743#endif // LLVM_CLANG_LEX_LEXER_H

/build/llvm-toolchain-snapshot-7~svn329677/tools/clang/include/clang/Lex/Token.h

1//===--- Token.h - Token interface ------------------------------*- C++ -*-===//
2//
3// The LLVM Compiler Infrastructure
4//
5// This file is distributed under the University of Illinois Open Source
6// License. See LICENSE.TXT for details.
7//
8//===----------------------------------------------------------------------===//
9//
10// This file defines the Token interface.
11//
12//===----------------------------------------------------------------------===//
13
14#ifndef LLVM_CLANG_LEX_TOKEN_H
15#define LLVM_CLANG_LEX_TOKEN_H
16
17#include "clang/Basic/SourceLocation.h"
18#include "clang/Basic/TokenKinds.h"
19#include "llvm/ADT/StringRef.h"
20#include <cassert>
21
22namespace clang {
23
24class IdentifierInfo;
25
26/// Token - This structure provides full information about a lexed token.
27/// It is not intended to be space efficient, it is intended to return as much
28/// information as possible about each returned token. This is expected to be
29/// compressed into a smaller form if memory footprint is important.
30///
31/// The parser can create a special "annotation token" representing a stream of
32/// tokens that were parsed and semantically resolved, e.g.: "foo::MyClass<int>"
33/// can be represented by a single typename annotation token that carries
34/// information about the SourceRange of the tokens and the type object.
35class Token {
 // NOTE: none of the data members below has an in-class initializer and no
 // constructor is declared in this class, so a default-initialized Token holds
 // indeterminate values until startToken() (or a whole-object copy) runs.

36 /// The location of the token. This is actually a SourceLocation.
37 unsigned Loc;
38
39 // Conceptually these next two fields could be in a union. However, this
40 // causes gcc 4.2 to pessimize LexTokenInternal, a very performance critical
41 // routine. Keeping as separate members with casts until a more beautiful fix
42 // presents itself.
43
44 /// UintData - This holds either the length of the token text, when
45 /// a normal token, or the end of the SourceRange when an annotation
46 /// token.
47 unsigned UintData;
48
49 /// PtrData - This is a union of four different pointer types, which depends
50 /// on what type of token this is:
51 /// Identifiers, keywords, etc:
52 /// This is an IdentifierInfo*, which contains the uniqued identifier
53 /// spelling.
54 /// Literals: isLiteral() returns true.
55 /// This is a pointer to the start of the token in a text buffer, which
56 /// may be dirty (have trigraphs / escaped newlines).
57 /// Annotations (resolved type names, C++ scopes, etc): isAnnotation().
58 /// This is a pointer to sema-specific data for the annotation token.
59 /// Eof:
60 /// This is a pointer to a Decl.
61 /// Other:
62 /// This is null.
63 void *PtrData;
64
65 /// Kind - The actual flavor of token this is.
66 tok::TokenKind Kind;
67
68 /// Flags - Bits we track about this token, members of the TokenFlags enum.
69 unsigned short Flags;
70
71public:
72 // Various flags set per token:
 // Each enumerator is a distinct bit, combined in the Flags member via
 // setFlag()/clearFlag() and tested via getFlag().
73 enum TokenFlags {
74 StartOfLine = 0x01, // At start of line or only after whitespace
75 // (considering the line after macro expansion).
76 LeadingSpace = 0x02, // Whitespace exists before this token (considering
77 // whitespace after macro expansion).
78 DisableExpand = 0x04, // This identifier may never be macro expanded.
79 NeedsCleaning = 0x08, // Contained an escaped newline or trigraph.
80 LeadingEmptyMacro = 0x10, // Empty macro exists before this token.
81 HasUDSuffix = 0x20, // This string or character literal has a ud-suffix.
82 HasUCN = 0x40, // This identifier contains a UCN.
83 IgnoredComma = 0x80, // This comma is not a macro argument separator (MS).
84 StringifiedInMacro = 0x100, // This string or character literal is formed by
85 // macro stringizing or charizing operator.
86 CommaAfterElided = 0x200, // The comma following this token was elided (MS).
87 IsEditorPlaceholder = 0x400, // This identifier is a placeholder.
88 };
89
 /// Return the kind of this token.
90 tok::TokenKind getKind() const { return Kind; }
 /// Overwrite the kind of this token; no other field is touched.
91 void setKind(tok::TokenKind K) { Kind = K; }
92
93 /// is/isNot - Predicates to check if this token is a specific kind, as in
94 /// "if (Tok.is(tok::l_brace)) {...}".
95 bool is(tok::TokenKind K) const { return Kind == K; }
96 bool isNot(tok::TokenKind K) const { return Kind != K; }
 /// Return true if this token's kind equals any of the listed kinds. The
 /// variadic overload peels off one kind per step and ends in the two-kind
 /// overload.
97 bool isOneOf(tok::TokenKind K1, tok::TokenKind K2) const {
98 return is(K1) || is(K2);
99 }
100 template <typename... Ts>
101 bool isOneOf(tok::TokenKind K1, tok::TokenKind K2, Ts... Ks) const {
102 return is(K1) || isOneOf(K2, Ks...);
103 }
104
105 /// \brief Return true if this is a raw identifier (when lexing
106 /// in raw mode) or a non-keyword identifier (when lexing in non-raw mode).
107 bool isAnyIdentifier() const {
108 return tok::isAnyIdentifier(getKind());
109 }
110
111 /// \brief Return true if this is a "literal", like a numeric
112 /// constant, string, etc.
113 bool isLiteral() const {
114 return tok::isLiteral(getKind());
115 }
116
117 /// \brief Return true if this is any of tok::annot_* kind tokens.
118 bool isAnnotation() const {
119 return tok::isAnnotation(getKind());
120 }
121
122 /// \brief Return the location of this token, rebuilt from the raw
123 /// encoding stored in Loc.
124 SourceLocation getLocation() const {
125 return SourceLocation::getFromRawEncoding(Loc);
126 }
 /// Return the length of the token text (stored in UintData). Asserts that
 /// this is not an annotation token, since annotations reuse UintData for
 /// their end location.
127 unsigned getLength() const {
128 assert(!isAnnotation() && "Annotation tokens have no length field")(static_cast <bool> (!isAnnotation() && "Annotation tokens have no length field"
) ? void (0) : __assert_fail ("!isAnnotation() && \"Annotation tokens have no length field\""
, "/build/llvm-toolchain-snapshot-7~svn329677/tools/clang/include/clang/Lex/Token.h"
, 128, __extension__ __PRETTY_FUNCTION__))
;
129 return UintData;
130 }
131
 /// Store the raw encoding of \p L as this token's location.
132 void setLocation(SourceLocation L) { Loc = L.getRawEncoding(); }
 /// Store \p Len as the token length. Asserts that this is not an annotation
 /// token; the check reads Kind, so the kind must be set first.
133 void setLength(unsigned Len) {
134 assert(!isAnnotation() && "Annotation tokens have no length field")(static_cast <bool> (!isAnnotation() && "Annotation tokens have no length field"
) ? void (0) : __assert_fail ("!isAnnotation() && \"Annotation tokens have no length field\""
, "/build/llvm-toolchain-snapshot-7~svn329677/tools/clang/include/clang/Lex/Token.h"
, 134, __extension__ __PRETTY_FUNCTION__))
;
135 UintData = Len;
136 }
137
 /// Return the end location of an annotation token. When the stored end
 /// encoding (UintData) is zero, the token's start location (Loc) is used
 /// instead. Asserts that this is an annotation token.
138 SourceLocation getAnnotationEndLoc() const {
139 assert(isAnnotation() && "Used AnnotEndLocID on non-annotation token")(static_cast <bool> (isAnnotation() && "Used AnnotEndLocID on non-annotation token"
) ? void (0) : __assert_fail ("isAnnotation() && \"Used AnnotEndLocID on non-annotation token\""
, "/build/llvm-toolchain-snapshot-7~svn329677/tools/clang/include/clang/Lex/Token.h"
, 139, __extension__ __PRETTY_FUNCTION__))
;
140 return SourceLocation::getFromRawEncoding(UintData ? UintData : Loc);
141 }
 /// Store the raw encoding of \p L as the annotation's end location (in
 /// UintData). Asserts that this is an annotation token.
142 void setAnnotationEndLoc(SourceLocation L) {
143 assert(isAnnotation() && "Used AnnotEndLocID on non-annotation token")(static_cast <bool> (isAnnotation() && "Used AnnotEndLocID on non-annotation token"
) ? void (0) : __assert_fail ("isAnnotation() && \"Used AnnotEndLocID on non-annotation token\""
, "/build/llvm-toolchain-snapshot-7~svn329677/tools/clang/include/clang/Lex/Token.h"
, 143, __extension__ __PRETTY_FUNCTION__))
;
144 UintData = L.getRawEncoding();
145 }
146
 /// Return the annotation end location for annotation tokens, and the
 /// token's own (start) location for every other kind.
147 SourceLocation getLastLoc() const {
148 return isAnnotation() ? getAnnotationEndLoc() : getLocation();
149 }
150
 /// Return the annotation end location for annotation tokens; for every
 /// other kind, the start location advanced by getLength().
151 SourceLocation getEndLoc() const {
152 return isAnnotation() ? getAnnotationEndLoc()
153 : getLocation().getLocWithOffset(getLength());
154 }
155
156 /// \brief SourceRange of the group of tokens that this annotation token
157 /// represents.
158 SourceRange getAnnotationRange() const {
159 return SourceRange(getLocation(), getAnnotationEndLoc());
160 }
 /// Set both the start location and the annotation end location from \p R.
161 void setAnnotationRange(SourceRange R) {
162 setLocation(R.getBegin());
163 setAnnotationEndLoc(R.getEnd());
164 }
165
 /// Return the name of this token's kind, as given by tok::getTokenName.
166 const char *getName() const { return tok::getTokenName(Kind); }
167
168 /// \brief Reset every field of the token: the kind becomes tok::unknown,
 /// Flags and UintData become 0, PtrData becomes null, and Loc is set to the
 /// raw encoding of a default-constructed SourceLocation. This is the only
 /// member in this class that assigns Flags as a whole.
169 void startToken() {
170 Kind = tok::unknown;
171 Flags = 0;
172 PtrData = nullptr;
173 UintData = 0;
174 Loc = SourceLocation().getRawEncoding();
175 }
176
 /// Return PtrData viewed as an IdentifierInfo*. Returns null for literal
 /// tokens and for tok::eof, whose PtrData holds other pointer types.
 /// Asserts that the token is neither a raw identifier nor an annotation.
177 IdentifierInfo *getIdentifierInfo() const {
178 assert(isNot(tok::raw_identifier) &&(static_cast <bool> (isNot(tok::raw_identifier) &&
"getIdentifierInfo() on a tok::raw_identifier token!") ? void
(0) : __assert_fail ("isNot(tok::raw_identifier) && \"getIdentifierInfo() on a tok::raw_identifier token!\""
, "/build/llvm-toolchain-snapshot-7~svn329677/tools/clang/include/clang/Lex/Token.h"
, 179, __extension__ __PRETTY_FUNCTION__))
179 "getIdentifierInfo() on a tok::raw_identifier token!")(static_cast <bool> (isNot(tok::raw_identifier) &&
"getIdentifierInfo() on a tok::raw_identifier token!") ? void
(0) : __assert_fail ("isNot(tok::raw_identifier) && \"getIdentifierInfo() on a tok::raw_identifier token!\""
, "/build/llvm-toolchain-snapshot-7~svn329677/tools/clang/include/clang/Lex/Token.h"
, 179, __extension__ __PRETTY_FUNCTION__))
;
180 assert(!isAnnotation() &&(static_cast <bool> (!isAnnotation() && "getIdentifierInfo() on an annotation token!"
) ? void (0) : __assert_fail ("!isAnnotation() && \"getIdentifierInfo() on an annotation token!\""
, "/build/llvm-toolchain-snapshot-7~svn329677/tools/clang/include/clang/Lex/Token.h"
, 181, __extension__ __PRETTY_FUNCTION__))
181 "getIdentifierInfo() on an annotation token!")(static_cast <bool> (!isAnnotation() && "getIdentifierInfo() on an annotation token!"
) ? void (0) : __assert_fail ("!isAnnotation() && \"getIdentifierInfo() on an annotation token!\""
, "/build/llvm-toolchain-snapshot-7~svn329677/tools/clang/include/clang/Lex/Token.h"
, 181, __extension__ __PRETTY_FUNCTION__))
;
182 if (isLiteral()) return nullptr;
183 if (is(tok::eof)) return nullptr;
184 return (IdentifierInfo*) PtrData;
185 }
 /// Store \p II in PtrData. No check on the token kind is performed here.
186 void setIdentifierInfo(IdentifierInfo *II) {
187 PtrData = (void*) II;
188 }
189
 /// Return the pointer stored in PtrData of a tok::eof token. Asserts that
 /// the token is tok::eof.
190 const void *getEofData() const {
191 assert(is(tok::eof))(static_cast <bool> (is(tok::eof)) ? void (0) : __assert_fail
("is(tok::eof)", "/build/llvm-toolchain-snapshot-7~svn329677/tools/clang/include/clang/Lex/Token.h"
, 191, __extension__ __PRETTY_FUNCTION__))
;
192 return reinterpret_cast<const void *>(PtrData);
193 }
 /// Attach \p D to a tok::eof token. Asserts that the token is tok::eof and
 /// that PtrData is still null, i.e. the data may be set only once after a
 /// reset.
194 void setEofData(const void *D) {
195 assert(is(tok::eof))(static_cast <bool> (is(tok::eof)) ? void (0) : __assert_fail
("is(tok::eof)", "/build/llvm-toolchain-snapshot-7~svn329677/tools/clang/include/clang/Lex/Token.h"
, 195, __extension__ __PRETTY_FUNCTION__))
;
196 assert(!PtrData)(static_cast <bool> (!PtrData) ? void (0) : __assert_fail
("!PtrData", "/build/llvm-toolchain-snapshot-7~svn329677/tools/clang/include/clang/Lex/Token.h"
, 196, __extension__ __PRETTY_FUNCTION__))
;
197 PtrData = const_cast<void *>(D);
198 }
199
200 /// getRawIdentifier - For a raw identifier token (i.e., an identifier
201 /// lexed in raw mode), returns a reference to the text substring in the
202 /// buffer if known.
203 StringRef getRawIdentifier() const {
204 assert(is(tok::raw_identifier))(static_cast <bool> (is(tok::raw_identifier)) ? void (0
) : __assert_fail ("is(tok::raw_identifier)", "/build/llvm-toolchain-snapshot-7~svn329677/tools/clang/include/clang/Lex/Token.h"
, 204, __extension__ __PRETTY_FUNCTION__))
;
205 return StringRef(reinterpret_cast<const char *>(PtrData), getLength());
206 }
 /// Store the start of the raw identifier text in PtrData. Asserts that the
 /// token is tok::raw_identifier.
207 void setRawIdentifierData(const char *Ptr) {
208 assert(is(tok::raw_identifier))(static_cast <bool> (is(tok::raw_identifier)) ? void (0
) : __assert_fail ("is(tok::raw_identifier)", "/build/llvm-toolchain-snapshot-7~svn329677/tools/clang/include/clang/Lex/Token.h"
, 208, __extension__ __PRETTY_FUNCTION__))
;
209 PtrData = const_cast<char*>(Ptr);
210 }
211
212 /// getLiteralData - For a literal token (numeric constant, string, etc), this
213 /// returns a pointer to the start of it in the text buffer if known, null
214 /// otherwise.
215 const char *getLiteralData() const {
216 assert(isLiteral() && "Cannot get literal data of non-literal")(static_cast <bool> (isLiteral() && "Cannot get literal data of non-literal"
) ? void (0) : __assert_fail ("isLiteral() && \"Cannot get literal data of non-literal\""
, "/build/llvm-toolchain-snapshot-7~svn329677/tools/clang/include/clang/Lex/Token.h"
, 216, __extension__ __PRETTY_FUNCTION__))
;
217 return reinterpret_cast<const char*>(PtrData);
218 }
 /// Store the start of the literal text in PtrData. Asserts that the token
 /// is a literal.
219 void setLiteralData(const char *Ptr) {
220 assert(isLiteral() && "Cannot set literal data of non-literal")(static_cast <bool> (isLiteral() && "Cannot set literal data of non-literal"
) ? void (0) : __assert_fail ("isLiteral() && \"Cannot set literal data of non-literal\""
, "/build/llvm-toolchain-snapshot-7~svn329677/tools/clang/include/clang/Lex/Token.h"
, 220, __extension__ __PRETTY_FUNCTION__))
;
221 PtrData = const_cast<char*>(Ptr);
222 }
223
 /// Return the opaque annotation payload stored in PtrData. Asserts that
 /// this is an annotation token.
224 void *getAnnotationValue() const {
225 assert(isAnnotation() && "Used AnnotVal on non-annotation token")(static_cast <bool> (isAnnotation() && "Used AnnotVal on non-annotation token"
) ? void (0) : __assert_fail ("isAnnotation() && \"Used AnnotVal on non-annotation token\""
, "/build/llvm-toolchain-snapshot-7~svn329677/tools/clang/include/clang/Lex/Token.h"
, 225, __extension__ __PRETTY_FUNCTION__))
;
226 return PtrData;
227 }
 /// Store \p val as the annotation payload. Asserts that this is an
 /// annotation token.
228 void setAnnotationValue(void *val) {
229 assert(isAnnotation() && "Used AnnotVal on non-annotation token")(static_cast <bool> (isAnnotation() && "Used AnnotVal on non-annotation token"
) ? void (0) : __assert_fail ("isAnnotation() && \"Used AnnotVal on non-annotation token\""
, "/build/llvm-toolchain-snapshot-7~svn329677/tools/clang/include/clang/Lex/Token.h"
, 229, __extension__ __PRETTY_FUNCTION__))
;
230 PtrData = val;
231 }
232
233 /// \brief Set the specified flag.
 ///
 /// This is a read-modify-write of Flags. Within this class only
 /// startToken() gives Flags an initial value, so the token must have been
 /// reset (or copied from a reset token) before this is called. The analyzer
 /// note below reports a caller path on which Flags is still uninitialized.
234 void setFlag(TokenFlags Flag) {
235 Flags |= Flag;
11
The left expression of the compound assignment is an uninitialized value. The computed value will also be garbage
236 }
237
238 /// \brief Get the specified flag.
239 bool getFlag(TokenFlags Flag) const {
240 return (Flags & Flag) != 0;
241 }
242
243 /// \brief Unset the specified flag.
 /// Like setFlag(), this reads Flags before writing it.
244 void clearFlag(TokenFlags Flag) {
245 Flags &= ~Flag;
246 }
247
248 /// \brief Return the internal representation of the flags.
249 ///
250 /// This is only intended for low-level operations such as writing tokens to
251 /// disk.
252 unsigned getFlags() const {
253 return Flags;
254 }
255
256 /// \brief Set a flag to either true or false.
257 void setFlagValue(TokenFlags Flag, bool Val) {
258 if (Val)
259 setFlag(Flag);
260 else
261 clearFlag(Flag);
262 }
263
264 /// isAtStartOfLine - Return true if this token is at the start of a line.
265 ///
266 bool isAtStartOfLine() const { return getFlag(StartOfLine); }
267
268 /// \brief Return true if this token has whitespace before it.
269 ///
270 bool hasLeadingSpace() const { return getFlag(LeadingSpace); }
271
272 /// \brief Return true if this identifier token should never
273 /// be expanded in the future, due to C99 6.10.3.4p2.
274 bool isExpandDisabled() const { return getFlag(DisableExpand); }
275
276 /// \brief Return true if we have an ObjC keyword identifier.
277 bool isObjCAtKeyword(tok::ObjCKeywordKind objcKey) const;
278
279 /// \brief Return the ObjC keyword kind.
280 tok::ObjCKeywordKind getObjCKeywordID() const;
281
282 /// \brief Return true if this token has trigraphs or escaped newlines in it.
283 bool needsCleaning() const { return getFlag(NeedsCleaning); }
284
285 /// \brief Return true if this token has an empty macro before it.
286 ///
287 bool hasLeadingEmptyMacro() const { return getFlag(LeadingEmptyMacro); }
288
289 /// \brief Return true if this token is a string or character literal which
290 /// has a ud-suffix.
291 bool hasUDSuffix() const { return getFlag(HasUDSuffix); }
292
293 /// Returns true if this token contains a universal character name.
294 bool hasUCN() const { return getFlag(HasUCN); }
295
296 /// Returns true if this token is formed by macro by stringizing or charizing
297 /// operator.
298 bool stringifiedInMacro() const { return getFlag(StringifiedInMacro); }
299
300 /// Returns true if the comma after this token was elided.
301 bool commaAfterElided() const { return getFlag(CommaAfterElided); }
302
303 /// Returns true if this token is an editor placeholder.
304 ///
305 /// Editor placeholders are produced by the code-completion engine and are
306 /// represented as characters between '<#' and '#>' in the source code. The
307 /// lexer uses identifier tokens to represent placeholders.
308 bool isEditorPlaceholder() const { return getFlag(IsEditorPlaceholder); }
309};
310
311/// \brief Information about the conditional stack (\#if directives)
312/// currently active.
313struct PPConditionalInfo {
 // Plain aggregate: none of the members below has an in-class initializer,
 // so whoever creates an instance is responsible for assigning the three
 // bool flags before reading them.

314 /// \brief Location where the conditional started.
315 SourceLocation IfLoc;
316
317 /// \brief True if this was contained in a skipping directive, e.g.,
318 /// in a "\#if 0" block.
319 bool WasSkipping;
320
321 /// \brief True if we have emitted tokens already, and now we're in
322 /// an \#else block or something. Only useful in Skipping blocks.
323 bool FoundNonSkip;
324
325 /// \brief True if we've seen a \#else in this block. If so,
326 /// \#elif/\#else directives are not allowed.
327 bool FoundElse;
328};
329
330} // end namespace clang
331
332namespace llvm {
 /// Explicit specialization of the isPodLike trait that reports
 /// clang::Token as POD-like (value == true).
 // NOTE(review): presumably this lets LLVM containers copy Tokens bytewise;
 // the trait's consumers are not visible here - confirm against the LLVM
 // type-traits header.
333 template <>
334 struct isPodLike<clang::Token> { static const bool value = true; };
335} // end namespace llvm
336
337#endif // LLVM_CLANG_LEX_TOKEN_H