Bug Summary

File: tools/clang/include/clang/Lex/Token.h
Warning: line 235, column 11
The left expression of the compound assignment is an uninitialized value. The computed value will also be garbage.

Annotated Source Code

Press '?' to see keyboard shortcuts

clang -cc1 -triple x86_64-pc-linux-gnu -analyze -disable-free -disable-llvm-verifier -discard-value-names -main-file-name Lexer.cpp -analyzer-store=region -analyzer-opt-analyze-nested-blocks -analyzer-eagerly-assume -analyzer-checker=core -analyzer-checker=apiModeling -analyzer-checker=unix -analyzer-checker=deadcode -analyzer-checker=cplusplus -analyzer-checker=security.insecureAPI.UncheckedReturn -analyzer-checker=security.insecureAPI.getpw -analyzer-checker=security.insecureAPI.gets -analyzer-checker=security.insecureAPI.mktemp -analyzer-checker=security.insecureAPI.mkstemp -analyzer-checker=security.insecureAPI.vfork -analyzer-checker=nullability.NullPassedToNonnull -analyzer-checker=nullability.NullReturnedFromNonnull -analyzer-output plist -w -mrelocation-model pic -pic-level 2 -mthread-model posix -relaxed-aliasing -fmath-errno -masm-verbose -mconstructor-aliases -munwind-tables -fuse-init-array -target-cpu x86-64 -dwarf-column-info -debugger-tuning=gdb -momit-leaf-frame-pointer -ffunction-sections -fdata-sections -resource-dir /usr/lib/llvm-7/lib/clang/7.0.0 -D _DEBUG -D _GNU_SOURCE -D __STDC_CONSTANT_MACROS -D __STDC_FORMAT_MACROS -D __STDC_LIMIT_MACROS -I /build/llvm-toolchain-snapshot-7~svn338205/build-llvm/tools/clang/lib/Lex -I /build/llvm-toolchain-snapshot-7~svn338205/tools/clang/lib/Lex -I /build/llvm-toolchain-snapshot-7~svn338205/tools/clang/include -I /build/llvm-toolchain-snapshot-7~svn338205/build-llvm/tools/clang/include -I /build/llvm-toolchain-snapshot-7~svn338205/build-llvm/include -I /build/llvm-toolchain-snapshot-7~svn338205/include -U NDEBUG -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/8/../../../../include/c++/8 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/8/../../../../include/x86_64-linux-gnu/c++/8 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/8/../../../../include/x86_64-linux-gnu/c++/8 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/8/../../../../include/c++/8/backward -internal-isystem /usr/include/clang/7.0.0/include/ 
-internal-isystem /usr/local/include -internal-isystem /usr/lib/llvm-7/lib/clang/7.0.0/include -internal-externc-isystem /usr/lib/gcc/x86_64-linux-gnu/8/include -internal-externc-isystem /usr/include/x86_64-linux-gnu -internal-externc-isystem /include -internal-externc-isystem /usr/include -O2 -Wno-unused-parameter -Wwrite-strings -Wno-missing-field-initializers -Wno-long-long -Wno-maybe-uninitialized -Wno-class-memaccess -Wno-comment -std=c++11 -fdeprecated-macro -fdebug-compilation-dir /build/llvm-toolchain-snapshot-7~svn338205/build-llvm/tools/clang/lib/Lex -ferror-limit 19 -fmessage-length 0 -fvisibility-inlines-hidden -fobjc-runtime=gcc -fno-common -fdiagnostics-show-option -vectorize-loops -vectorize-slp -analyzer-output=html -analyzer-config stable-report-filename=true -o /tmp/scan-build-2018-07-29-043837-17923-1 -x c++ /build/llvm-toolchain-snapshot-7~svn338205/tools/clang/lib/Lex/Lexer.cpp -faddrsig

/build/llvm-toolchain-snapshot-7~svn338205/tools/clang/lib/Lex/Lexer.cpp

1//===- Lexer.cpp - C Language Family Lexer --------------------------------===//
2//
3// The LLVM Compiler Infrastructure
4//
5// This file is distributed under the University of Illinois Open Source
6// License. See LICENSE.TXT for details.
7//
8//===----------------------------------------------------------------------===//
9//
10// This file implements the Lexer and Token interfaces.
11//
12//===----------------------------------------------------------------------===//
13
14#include "clang/Lex/Lexer.h"
15#include "UnicodeCharSets.h"
16#include "clang/Basic/CharInfo.h"
17#include "clang/Basic/IdentifierTable.h"
18#include "clang/Basic/LangOptions.h"
19#include "clang/Basic/SourceLocation.h"
20#include "clang/Basic/SourceManager.h"
21#include "clang/Basic/TokenKinds.h"
22#include "clang/Lex/LexDiagnostic.h"
23#include "clang/Lex/LiteralSupport.h"
24#include "clang/Lex/MultipleIncludeOpt.h"
25#include "clang/Lex/Preprocessor.h"
26#include "clang/Lex/PreprocessorOptions.h"
27#include "clang/Lex/Token.h"
28#include "clang/Basic/Diagnostic.h"
29#include "clang/Basic/LLVM.h"
30#include "clang/Basic/TokenKinds.h"
31#include "llvm/ADT/None.h"
32#include "llvm/ADT/Optional.h"
33#include "llvm/ADT/StringExtras.h"
34#include "llvm/ADT/StringSwitch.h"
35#include "llvm/ADT/StringRef.h"
36#include "llvm/Support/Compiler.h"
37#include "llvm/Support/ConvertUTF.h"
38#include "llvm/Support/MathExtras.h"
39#include "llvm/Support/MemoryBuffer.h"
40#include "llvm/Support/NativeFormatting.h"
41#include "llvm/Support/UnicodeCharRanges.h"
42#include <algorithm>
43#include <cassert>
44#include <cstddef>
45#include <cstdint>
46#include <cstring>
47#include <string>
48#include <tuple>
49#include <utility>
50
51using namespace clang;
52
53//===----------------------------------------------------------------------===//
54// Token Class Implementation
55//===----------------------------------------------------------------------===//
56
57/// isObjCAtKeyword - Return true if we have an ObjC keyword identifier.
58bool Token::isObjCAtKeyword(tok::ObjCKeywordKind objcKey) const {
59 if (isAnnotation())
60 return false;
61 if (IdentifierInfo *II = getIdentifierInfo())
62 return II->getObjCKeywordID() == objcKey;
63 return false;
64}
65
66/// getObjCKeywordID - Return the ObjC keyword kind.
67tok::ObjCKeywordKind Token::getObjCKeywordID() const {
68 if (isAnnotation())
69 return tok::objc_not_keyword;
70 IdentifierInfo *specId = getIdentifierInfo();
71 return specId ? specId->getObjCKeywordID() : tok::objc_not_keyword;
72}
73
74//===----------------------------------------------------------------------===//
75// Lexer Class Implementation
76//===----------------------------------------------------------------------===//
77
78void Lexer::anchor() {}
79
80void Lexer::InitLexer(const char *BufStart, const char *BufPtr,
81 const char *BufEnd) {
82 BufferStart = BufStart;
83 BufferPtr = BufPtr;
84 BufferEnd = BufEnd;
85
86 assert(BufEnd[0] == 0 &&(static_cast <bool> (BufEnd[0] == 0 && "We assume that the input buffer has a null character at the end"
" to simplify lexing!") ? void (0) : __assert_fail ("BufEnd[0] == 0 && \"We assume that the input buffer has a null character at the end\" \" to simplify lexing!\""
, "/build/llvm-toolchain-snapshot-7~svn338205/tools/clang/lib/Lex/Lexer.cpp"
, 88, __extension__ __PRETTY_FUNCTION__))
87 "We assume that the input buffer has a null character at the end"(static_cast <bool> (BufEnd[0] == 0 && "We assume that the input buffer has a null character at the end"
" to simplify lexing!") ? void (0) : __assert_fail ("BufEnd[0] == 0 && \"We assume that the input buffer has a null character at the end\" \" to simplify lexing!\""
, "/build/llvm-toolchain-snapshot-7~svn338205/tools/clang/lib/Lex/Lexer.cpp"
, 88, __extension__ __PRETTY_FUNCTION__))
88 " to simplify lexing!")(static_cast <bool> (BufEnd[0] == 0 && "We assume that the input buffer has a null character at the end"
" to simplify lexing!") ? void (0) : __assert_fail ("BufEnd[0] == 0 && \"We assume that the input buffer has a null character at the end\" \" to simplify lexing!\""
, "/build/llvm-toolchain-snapshot-7~svn338205/tools/clang/lib/Lex/Lexer.cpp"
, 88, __extension__ __PRETTY_FUNCTION__))
;
89
90 // Check whether we have a BOM in the beginning of the buffer. If yes - act
91 // accordingly. Right now we support only UTF-8 with and without BOM, so, just
92 // skip the UTF-8 BOM if it's present.
93 if (BufferStart == BufferPtr) {
94 // Determine the size of the BOM.
95 StringRef Buf(BufferStart, BufferEnd - BufferStart);
96 size_t BOMLength = llvm::StringSwitch<size_t>(Buf)
97 .StartsWith("\xEF\xBB\xBF", 3) // UTF-8 BOM
98 .Default(0);
99
100 // Skip the BOM.
101 BufferPtr += BOMLength;
102 }
103
104 Is_PragmaLexer = false;
105 CurrentConflictMarkerState = CMK_None;
106
107 // Start of the file is a start of line.
108 IsAtStartOfLine = true;
109 IsAtPhysicalStartOfLine = true;
110
111 HasLeadingSpace = false;
112 HasLeadingEmptyMacro = false;
113
114 // We are not after parsing a #.
115 ParsingPreprocessorDirective = false;
116
117 // We are not after parsing #include.
118 ParsingFilename = false;
119
120 // We are not in raw mode. Raw mode disables diagnostics and interpretation
121 // of tokens (e.g. identifiers, thus disabling macro expansion). It is used
122 // to quickly lex the tokens of the buffer, e.g. when handling a "#if 0" block
123 // or otherwise skipping over tokens.
124 LexingRawMode = false;
125
126 // Default to not keeping comments.
127 ExtendedTokenMode = 0;
128}
129
130/// Lexer constructor - Create a new lexer object for the specified buffer
131/// with the specified preprocessor managing the lexing process. This lexer
132/// assumes that the associated file buffer and Preprocessor objects will
133/// outlive it, so it doesn't take ownership of either of them.
134Lexer::Lexer(FileID FID, const llvm::MemoryBuffer *InputFile, Preprocessor &PP)
135 : PreprocessorLexer(&PP, FID),
136 FileLoc(PP.getSourceManager().getLocForStartOfFile(FID)),
137 LangOpts(PP.getLangOpts()) {
138 InitLexer(InputFile->getBufferStart(), InputFile->getBufferStart(),
139 InputFile->getBufferEnd());
140
141 resetExtendedTokenMode();
142}
143
144/// Lexer constructor - Create a new raw lexer object. This object is only
145/// suitable for calls to 'LexFromRawLexer'. This lexer assumes that the text
146/// range will outlive it, so it doesn't take ownership of it.
147Lexer::Lexer(SourceLocation fileloc, const LangOptions &langOpts,
148 const char *BufStart, const char *BufPtr, const char *BufEnd)
149 : FileLoc(fileloc), LangOpts(langOpts) {
150 InitLexer(BufStart, BufPtr, BufEnd);
151
152 // We *are* in raw mode.
153 LexingRawMode = true;
154}
155
156/// Lexer constructor - Create a new raw lexer object. This object is only
157/// suitable for calls to 'LexFromRawLexer'. This lexer assumes that the text
158/// range will outlive it, so it doesn't take ownership of it.
159Lexer::Lexer(FileID FID, const llvm::MemoryBuffer *FromFile,
160 const SourceManager &SM, const LangOptions &langOpts)
161 : Lexer(SM.getLocForStartOfFile(FID), langOpts, FromFile->getBufferStart(),
162 FromFile->getBufferStart(), FromFile->getBufferEnd()) {}
163
164void Lexer::resetExtendedTokenMode() {
165 assert(PP && "Cannot reset token mode without a preprocessor")(static_cast <bool> (PP && "Cannot reset token mode without a preprocessor"
) ? void (0) : __assert_fail ("PP && \"Cannot reset token mode without a preprocessor\""
, "/build/llvm-toolchain-snapshot-7~svn338205/tools/clang/lib/Lex/Lexer.cpp"
, 165, __extension__ __PRETTY_FUNCTION__))
;
166 if (LangOpts.TraditionalCPP)
167 SetKeepWhitespaceMode(true);
168 else
169 SetCommentRetentionState(PP->getCommentRetentionState());
170}
171
172/// Create_PragmaLexer: Lexer constructor - Create a new lexer object for
173/// _Pragma expansion. This has a variety of magic semantics that this method
174/// sets up. It returns a new'd Lexer that must be delete'd when done.
175///
176/// On entrance to this routine, TokStartLoc is a macro location which has a
177/// spelling loc that indicates the bytes to be lexed for the token and an
178/// expansion location that indicates where all lexed tokens should be
179/// "expanded from".
180///
181/// TODO: It would really be nice to make _Pragma just be a wrapper around a
182/// normal lexer that remaps tokens as they fly by. This would require making
183/// Preprocessor::Lex virtual. Given that, we could just dump in a magic lexer
184/// interface that could handle this stuff. This would pull GetMappedTokenLoc
185/// out of the critical path of the lexer!
186///
187Lexer *Lexer::Create_PragmaLexer(SourceLocation SpellingLoc,
188 SourceLocation ExpansionLocStart,
189 SourceLocation ExpansionLocEnd,
190 unsigned TokLen, Preprocessor &PP) {
191 SourceManager &SM = PP.getSourceManager();
192
193 // Create the lexer as if we were going to lex the file normally.
194 FileID SpellingFID = SM.getFileID(SpellingLoc);
195 const llvm::MemoryBuffer *InputFile = SM.getBuffer(SpellingFID);
196 Lexer *L = new Lexer(SpellingFID, InputFile, PP);
197
198 // Now that the lexer is created, change the start/end locations so that we
199 // just lex the subsection of the file that we want. This is lexing from a
200 // scratch buffer.
201 const char *StrData = SM.getCharacterData(SpellingLoc);
202
203 L->BufferPtr = StrData;
204 L->BufferEnd = StrData+TokLen;
205 assert(L->BufferEnd[0] == 0 && "Buffer is not nul terminated!")(static_cast <bool> (L->BufferEnd[0] == 0 &&
"Buffer is not nul terminated!") ? void (0) : __assert_fail (
"L->BufferEnd[0] == 0 && \"Buffer is not nul terminated!\""
, "/build/llvm-toolchain-snapshot-7~svn338205/tools/clang/lib/Lex/Lexer.cpp"
, 205, __extension__ __PRETTY_FUNCTION__))
;
206
207 // Set the SourceLocation with the remapping information. This ensures that
208 // GetMappedTokenLoc will remap the tokens as they are lexed.
209 L->FileLoc = SM.createExpansionLoc(SM.getLocForStartOfFile(SpellingFID),
210 ExpansionLocStart,
211 ExpansionLocEnd, TokLen);
212
213 // Ensure that the lexer thinks it is inside a directive, so that end \n will
214 // return an EOD token.
215 L->ParsingPreprocessorDirective = true;
216
217 // This lexer really is for _Pragma.
218 L->Is_PragmaLexer = true;
219 return L;
220}
221
/// StringifyImpl - Escape a string in place so it can be embedded inside a
/// string or character literal: backslashes and the given quote character
/// get a preceding '\', and '\n' / '\r' (including the mixed "\r\n" / "\n\r"
/// pairs) become the two characters '\' 'n'. T must be a string-like
/// container of char (std::string or SmallVectorImpl<char>).
template <typename T> static void StringifyImpl(T &Str, char Quote) {
  typename T::size_type i = 0, e = Str.size();
  while (i < e) {
    if (Str[i] == '\\' || Str[i] == Quote) {
      Str.insert(Str.begin() + i, '\\');
      i += 2;
      ++e;
    } else if (Str[i] == '\n' || Str[i] == '\r') {
      // Replace '\r\n' and '\n\r' to '\\' followed by 'n'.
      if ((i < e - 1) && (Str[i + 1] == '\n' || Str[i + 1] == '\r') &&
          Str[i] != Str[i + 1]) {
        Str[i] = '\\';
        Str[i + 1] = 'n';
      } else {
        // Replace '\n' and '\r' to '\\' followed by 'n'.
        Str[i] = '\\';
        Str.insert(Str.begin() + i + 1, 'n');
        ++e;
      }
      i += 2;
    } else
      ++i;
  }
}
246
247std::string Lexer::Stringify(StringRef Str, bool Charify) {
248 std::string Result = Str;
249 char Quote = Charify ? '\'' : '"';
250 StringifyImpl(Result, Quote);
251 return Result;
252}
253
254void Lexer::Stringify(SmallVectorImpl<char> &Str) { StringifyImpl(Str, '"'); }
255
256//===----------------------------------------------------------------------===//
257// Token Spelling
258//===----------------------------------------------------------------------===//
259
260/// Slow case of getSpelling. Extract the characters comprising the
261/// spelling of this token from the provided input buffer.
262static size_t getSpellingSlow(const Token &Tok, const char *BufPtr,
263 const LangOptions &LangOpts, char *Spelling) {
264 assert(Tok.needsCleaning() && "getSpellingSlow called on simple token")(static_cast <bool> (Tok.needsCleaning() && "getSpellingSlow called on simple token"
) ? void (0) : __assert_fail ("Tok.needsCleaning() && \"getSpellingSlow called on simple token\""
, "/build/llvm-toolchain-snapshot-7~svn338205/tools/clang/lib/Lex/Lexer.cpp"
, 264, __extension__ __PRETTY_FUNCTION__))
;
265
266 size_t Length = 0;
267 const char *BufEnd = BufPtr + Tok.getLength();
268
269 if (tok::isStringLiteral(Tok.getKind())) {
270 // Munch the encoding-prefix and opening double-quote.
271 while (BufPtr < BufEnd) {
272 unsigned Size;
273 Spelling[Length++] = Lexer::getCharAndSizeNoWarn(BufPtr, Size, LangOpts);
274 BufPtr += Size;
275
276 if (Spelling[Length - 1] == '"')
277 break;
278 }
279
280 // Raw string literals need special handling; trigraph expansion and line
281 // splicing do not occur within their d-char-sequence nor within their
282 // r-char-sequence.
283 if (Length >= 2 &&
284 Spelling[Length - 2] == 'R' && Spelling[Length - 1] == '"') {
285 // Search backwards from the end of the token to find the matching closing
286 // quote.
287 const char *RawEnd = BufEnd;
288 do --RawEnd; while (*RawEnd != '"');
289 size_t RawLength = RawEnd - BufPtr + 1;
290
291 // Everything between the quotes is included verbatim in the spelling.
292 memcpy(Spelling + Length, BufPtr, RawLength);
293 Length += RawLength;
294 BufPtr += RawLength;
295
296 // The rest of the token is lexed normally.
297 }
298 }
299
300 while (BufPtr < BufEnd) {
301 unsigned Size;
302 Spelling[Length++] = Lexer::getCharAndSizeNoWarn(BufPtr, Size, LangOpts);
303 BufPtr += Size;
304 }
305
306 assert(Length < Tok.getLength() &&(static_cast <bool> (Length < Tok.getLength() &&
"NeedsCleaning flag set on token that didn't need cleaning!"
) ? void (0) : __assert_fail ("Length < Tok.getLength() && \"NeedsCleaning flag set on token that didn't need cleaning!\""
, "/build/llvm-toolchain-snapshot-7~svn338205/tools/clang/lib/Lex/Lexer.cpp"
, 307, __extension__ __PRETTY_FUNCTION__))
307 "NeedsCleaning flag set on token that didn't need cleaning!")(static_cast <bool> (Length < Tok.getLength() &&
"NeedsCleaning flag set on token that didn't need cleaning!"
) ? void (0) : __assert_fail ("Length < Tok.getLength() && \"NeedsCleaning flag set on token that didn't need cleaning!\""
, "/build/llvm-toolchain-snapshot-7~svn338205/tools/clang/lib/Lex/Lexer.cpp"
, 307, __extension__ __PRETTY_FUNCTION__))
;
308 return Length;
309}
310
311/// getSpelling() - Return the 'spelling' of this token. The spelling of a
312/// token are the characters used to represent the token in the source file
313/// after trigraph expansion and escaped-newline folding. In particular, this
314/// wants to get the true, uncanonicalized, spelling of things like digraphs
315/// UCNs, etc.
316StringRef Lexer::getSpelling(SourceLocation loc,
317 SmallVectorImpl<char> &buffer,
318 const SourceManager &SM,
319 const LangOptions &options,
320 bool *invalid) {
321 // Break down the source location.
322 std::pair<FileID, unsigned> locInfo = SM.getDecomposedLoc(loc);
323
324 // Try to the load the file buffer.
325 bool invalidTemp = false;
326 StringRef file = SM.getBufferData(locInfo.first, &invalidTemp);
327 if (invalidTemp) {
328 if (invalid) *invalid = true;
329 return {};
330 }
331
332 const char *tokenBegin = file.data() + locInfo.second;
333
334 // Lex from the start of the given location.
335 Lexer lexer(SM.getLocForStartOfFile(locInfo.first), options,
336 file.begin(), tokenBegin, file.end());
337 Token token;
338 lexer.LexFromRawLexer(token);
339
340 unsigned length = token.getLength();
341
342 // Common case: no need for cleaning.
343 if (!token.needsCleaning())
344 return StringRef(tokenBegin, length);
345
346 // Hard case, we need to relex the characters into the string.
347 buffer.resize(length);
348 buffer.resize(getSpellingSlow(token, tokenBegin, options, buffer.data()));
349 return StringRef(buffer.data(), buffer.size());
350}
351
352/// getSpelling() - Return the 'spelling' of this token. The spelling of a
353/// token are the characters used to represent the token in the source file
354/// after trigraph expansion and escaped-newline folding. In particular, this
355/// wants to get the true, uncanonicalized, spelling of things like digraphs
356/// UCNs, etc.
357std::string Lexer::getSpelling(const Token &Tok, const SourceManager &SourceMgr,
358 const LangOptions &LangOpts, bool *Invalid) {
359 assert((int)Tok.getLength() >= 0 && "Token character range is bogus!")(static_cast <bool> ((int)Tok.getLength() >= 0 &&
"Token character range is bogus!") ? void (0) : __assert_fail
("(int)Tok.getLength() >= 0 && \"Token character range is bogus!\""
, "/build/llvm-toolchain-snapshot-7~svn338205/tools/clang/lib/Lex/Lexer.cpp"
, 359, __extension__ __PRETTY_FUNCTION__))
;
360
361 bool CharDataInvalid = false;
362 const char *TokStart = SourceMgr.getCharacterData(Tok.getLocation(),
363 &CharDataInvalid);
364 if (Invalid)
365 *Invalid = CharDataInvalid;
366 if (CharDataInvalid)
367 return {};
368
369 // If this token contains nothing interesting, return it directly.
370 if (!Tok.needsCleaning())
371 return std::string(TokStart, TokStart + Tok.getLength());
372
373 std::string Result;
374 Result.resize(Tok.getLength());
375 Result.resize(getSpellingSlow(Tok, TokStart, LangOpts, &*Result.begin()));
376 return Result;
377}
378
379/// getSpelling - This method is used to get the spelling of a token into a
380/// preallocated buffer, instead of as an std::string. The caller is required
381/// to allocate enough space for the token, which is guaranteed to be at least
382/// Tok.getLength() bytes long. The actual length of the token is returned.
383///
384/// Note that this method may do two possible things: it may either fill in
385/// the buffer specified with characters, or it may *change the input pointer*
386/// to point to a constant buffer with the data already in it (avoiding a
387/// copy). The caller is not allowed to modify the returned buffer pointer
388/// if an internal buffer is returned.
389unsigned Lexer::getSpelling(const Token &Tok, const char *&Buffer,
390 const SourceManager &SourceMgr,
391 const LangOptions &LangOpts, bool *Invalid) {
392 assert((int)Tok.getLength() >= 0 && "Token character range is bogus!")(static_cast <bool> ((int)Tok.getLength() >= 0 &&
"Token character range is bogus!") ? void (0) : __assert_fail
("(int)Tok.getLength() >= 0 && \"Token character range is bogus!\""
, "/build/llvm-toolchain-snapshot-7~svn338205/tools/clang/lib/Lex/Lexer.cpp"
, 392, __extension__ __PRETTY_FUNCTION__))
;
393
394 const char *TokStart = nullptr;
395 // NOTE: this has to be checked *before* testing for an IdentifierInfo.
396 if (Tok.is(tok::raw_identifier))
397 TokStart = Tok.getRawIdentifier().data();
398 else if (!Tok.hasUCN()) {
399 if (const IdentifierInfo *II = Tok.getIdentifierInfo()) {
400 // Just return the string from the identifier table, which is very quick.
401 Buffer = II->getNameStart();
402 return II->getLength();
403 }
404 }
405
406 // NOTE: this can be checked even after testing for an IdentifierInfo.
407 if (Tok.isLiteral())
408 TokStart = Tok.getLiteralData();
409
410 if (!TokStart) {
411 // Compute the start of the token in the input lexer buffer.
412 bool CharDataInvalid = false;
413 TokStart = SourceMgr.getCharacterData(Tok.getLocation(), &CharDataInvalid);
414 if (Invalid)
415 *Invalid = CharDataInvalid;
416 if (CharDataInvalid) {
417 Buffer = "";
418 return 0;
419 }
420 }
421
422 // If this token contains nothing interesting, return it directly.
423 if (!Tok.needsCleaning()) {
424 Buffer = TokStart;
425 return Tok.getLength();
426 }
427
428 // Otherwise, hard case, relex the characters into the string.
429 return getSpellingSlow(Tok, TokStart, LangOpts, const_cast<char*>(Buffer));
430}
431
432/// MeasureTokenLength - Relex the token at the specified location and return
433/// its length in bytes in the input file. If the token needs cleaning (e.g.
434/// includes a trigraph or an escaped newline) then this count includes bytes
435/// that are part of that.
436unsigned Lexer::MeasureTokenLength(SourceLocation Loc,
437 const SourceManager &SM,
438 const LangOptions &LangOpts) {
439 Token TheTok;
440 if (getRawToken(Loc, TheTok, SM, LangOpts))
441 return 0;
442 return TheTok.getLength();
443}
444
445/// Relex the token at the specified location.
446/// \returns true if there was a failure, false on success.
447bool Lexer::getRawToken(SourceLocation Loc, Token &Result,
448 const SourceManager &SM,
449 const LangOptions &LangOpts,
450 bool IgnoreWhiteSpace) {
451 // TODO: this could be special cased for common tokens like identifiers, ')',
452 // etc to make this faster, if it mattered. Just look at StrData[0] to handle
453 // all obviously single-char tokens. This could use
454 // Lexer::isObviouslySimpleCharacter for example to handle identifiers or
455 // something.
456
457 // If this comes from a macro expansion, we really do want the macro name, not
458 // the token this macro expanded to.
459 Loc = SM.getExpansionLoc(Loc);
460 std::pair<FileID, unsigned> LocInfo = SM.getDecomposedLoc(Loc);
461 bool Invalid = false;
462 StringRef Buffer = SM.getBufferData(LocInfo.first, &Invalid);
463 if (Invalid)
464 return true;
465
466 const char *StrData = Buffer.data()+LocInfo.second;
467
468 if (!IgnoreWhiteSpace && isWhitespace(StrData[0]))
469 return true;
470
471 // Create a lexer starting at the beginning of this token.
472 Lexer TheLexer(SM.getLocForStartOfFile(LocInfo.first), LangOpts,
473 Buffer.begin(), StrData, Buffer.end());
474 TheLexer.SetCommentRetentionState(true);
475 TheLexer.LexFromRawLexer(Result);
476 return false;
477}
478
479/// Returns the pointer that points to the beginning of line that contains
480/// the given offset, or null if the offset if invalid.
481static const char *findBeginningOfLine(StringRef Buffer, unsigned Offset) {
482 const char *BufStart = Buffer.data();
483 if (Offset >= Buffer.size())
484 return nullptr;
485
486 const char *LexStart = BufStart + Offset;
487 for (; LexStart != BufStart; --LexStart) {
488 if (isVerticalWhitespace(LexStart[0]) &&
489 !Lexer::isNewLineEscaped(BufStart, LexStart)) {
490 // LexStart should point at first character of logical line.
491 ++LexStart;
492 break;
493 }
494 }
495 return LexStart;
496}
497
498static SourceLocation getBeginningOfFileToken(SourceLocation Loc,
499 const SourceManager &SM,
500 const LangOptions &LangOpts) {
501 assert(Loc.isFileID())(static_cast <bool> (Loc.isFileID()) ? void (0) : __assert_fail
("Loc.isFileID()", "/build/llvm-toolchain-snapshot-7~svn338205/tools/clang/lib/Lex/Lexer.cpp"
, 501, __extension__ __PRETTY_FUNCTION__))
;
502 std::pair<FileID, unsigned> LocInfo = SM.getDecomposedLoc(Loc);
503 if (LocInfo.first.isInvalid())
504 return Loc;
505
506 bool Invalid = false;
507 StringRef Buffer = SM.getBufferData(LocInfo.first, &Invalid);
508 if (Invalid)
509 return Loc;
510
511 // Back up from the current location until we hit the beginning of a line
512 // (or the buffer). We'll relex from that point.
513 const char *StrData = Buffer.data() + LocInfo.second;
514 const char *LexStart = findBeginningOfLine(Buffer, LocInfo.second);
515 if (!LexStart || LexStart == StrData)
516 return Loc;
517
518 // Create a lexer starting at the beginning of this token.
519 SourceLocation LexerStartLoc = Loc.getLocWithOffset(-LocInfo.second);
520 Lexer TheLexer(LexerStartLoc, LangOpts, Buffer.data(), LexStart,
521 Buffer.end());
522 TheLexer.SetCommentRetentionState(true);
523
524 // Lex tokens until we find the token that contains the source location.
525 Token TheTok;
526 do {
527 TheLexer.LexFromRawLexer(TheTok);
528
529 if (TheLexer.getBufferLocation() > StrData) {
530 // Lexing this token has taken the lexer past the source location we're
531 // looking for. If the current token encompasses our source location,
532 // return the beginning of that token.
533 if (TheLexer.getBufferLocation() - TheTok.getLength() <= StrData)
534 return TheTok.getLocation();
535
536 // We ended up skipping over the source location entirely, which means
537 // that it points into whitespace. We're done here.
538 break;
539 }
540 } while (TheTok.getKind() != tok::eof);
541
542 // We've passed our source location; just return the original source location.
543 return Loc;
544}
545
546SourceLocation Lexer::GetBeginningOfToken(SourceLocation Loc,
547 const SourceManager &SM,
548 const LangOptions &LangOpts) {
549 if (Loc.isFileID())
550 return getBeginningOfFileToken(Loc, SM, LangOpts);
551
552 if (!SM.isMacroArgExpansion(Loc))
553 return Loc;
554
555 SourceLocation FileLoc = SM.getSpellingLoc(Loc);
556 SourceLocation BeginFileLoc = getBeginningOfFileToken(FileLoc, SM, LangOpts);
557 std::pair<FileID, unsigned> FileLocInfo = SM.getDecomposedLoc(FileLoc);
558 std::pair<FileID, unsigned> BeginFileLocInfo =
559 SM.getDecomposedLoc(BeginFileLoc);
560 assert(FileLocInfo.first == BeginFileLocInfo.first &&(static_cast <bool> (FileLocInfo.first == BeginFileLocInfo
.first && FileLocInfo.second >= BeginFileLocInfo.second
) ? void (0) : __assert_fail ("FileLocInfo.first == BeginFileLocInfo.first && FileLocInfo.second >= BeginFileLocInfo.second"
, "/build/llvm-toolchain-snapshot-7~svn338205/tools/clang/lib/Lex/Lexer.cpp"
, 561, __extension__ __PRETTY_FUNCTION__))
561 FileLocInfo.second >= BeginFileLocInfo.second)(static_cast <bool> (FileLocInfo.first == BeginFileLocInfo
.first && FileLocInfo.second >= BeginFileLocInfo.second
) ? void (0) : __assert_fail ("FileLocInfo.first == BeginFileLocInfo.first && FileLocInfo.second >= BeginFileLocInfo.second"
, "/build/llvm-toolchain-snapshot-7~svn338205/tools/clang/lib/Lex/Lexer.cpp"
, 561, __extension__ __PRETTY_FUNCTION__))
;
562 return Loc.getLocWithOffset(BeginFileLocInfo.second - FileLocInfo.second);
563}
564
namespace {

/// Classification of a preprocessor directive keyword encountered while
/// computing the preamble: either one we can safely skip over, or unknown.
enum PreambleDirectiveKind {
  PDK_Skipped,
  PDK_Unknown
};

} // namespace
573
574PreambleBounds Lexer::ComputePreamble(StringRef Buffer,
575 const LangOptions &LangOpts,
576 unsigned MaxLines) {
577 // Create a lexer starting at the beginning of the file. Note that we use a
578 // "fake" file source location at offset 1 so that the lexer will track our
579 // position within the file.
580 const unsigned StartOffset = 1;
581 SourceLocation FileLoc = SourceLocation::getFromRawEncoding(StartOffset);
582 Lexer TheLexer(FileLoc, LangOpts, Buffer.begin(), Buffer.begin(),
583 Buffer.end());
584 TheLexer.SetCommentRetentionState(true);
585
586 bool InPreprocessorDirective = false;
587 Token TheTok;
588 SourceLocation ActiveCommentLoc;
589
590 unsigned MaxLineOffset = 0;
591 if (MaxLines) {
592 const char *CurPtr = Buffer.begin();
593 unsigned CurLine = 0;
594 while (CurPtr != Buffer.end()) {
595 char ch = *CurPtr++;
596 if (ch == '\n') {
597 ++CurLine;
598 if (CurLine == MaxLines)
599 break;
600 }
601 }
602 if (CurPtr != Buffer.end())
603 MaxLineOffset = CurPtr - Buffer.begin();
604 }
605
606 do {
607 TheLexer.LexFromRawLexer(TheTok);
608
609 if (InPreprocessorDirective) {
610 // If we've hit the end of the file, we're done.
611 if (TheTok.getKind() == tok::eof) {
612 break;
613 }
614
615 // If we haven't hit the end of the preprocessor directive, skip this
616 // token.
617 if (!TheTok.isAtStartOfLine())
618 continue;
619
620 // We've passed the end of the preprocessor directive, and will look
621 // at this token again below.
622 InPreprocessorDirective = false;
623 }
624
625 // Keep track of the # of lines in the preamble.
626 if (TheTok.isAtStartOfLine()) {
627 unsigned TokOffset = TheTok.getLocation().getRawEncoding() - StartOffset;
628
629 // If we were asked to limit the number of lines in the preamble,
630 // and we're about to exceed that limit, we're done.
631 if (MaxLineOffset && TokOffset >= MaxLineOffset)
632 break;
633 }
634
635 // Comments are okay; skip over them.
636 if (TheTok.getKind() == tok::comment) {
637 if (ActiveCommentLoc.isInvalid())
638 ActiveCommentLoc = TheTok.getLocation();
639 continue;
640 }
641
642 if (TheTok.isAtStartOfLine() && TheTok.getKind() == tok::hash) {
643 // This is the start of a preprocessor directive.
644 Token HashTok = TheTok;
645 InPreprocessorDirective = true;
646 ActiveCommentLoc = SourceLocation();
647
648 // Figure out which directive this is. Since we're lexing raw tokens,
649 // we don't have an identifier table available. Instead, just look at
650 // the raw identifier to recognize and categorize preprocessor directives.
651 TheLexer.LexFromRawLexer(TheTok);
652 if (TheTok.getKind() == tok::raw_identifier && !TheTok.needsCleaning()) {
653 StringRef Keyword = TheTok.getRawIdentifier();
654 PreambleDirectiveKind PDK
655 = llvm::StringSwitch<PreambleDirectiveKind>(Keyword)
656 .Case("include", PDK_Skipped)
657 .Case("__include_macros", PDK_Skipped)
658 .Case("define", PDK_Skipped)
659 .Case("undef", PDK_Skipped)
660 .Case("line", PDK_Skipped)
661 .Case("error", PDK_Skipped)
662 .Case("pragma", PDK_Skipped)
663 .Case("import", PDK_Skipped)
664 .Case("include_next", PDK_Skipped)
665 .Case("warning", PDK_Skipped)
666 .Case("ident", PDK_Skipped)
667 .Case("sccs", PDK_Skipped)
668 .Case("assert", PDK_Skipped)
669 .Case("unassert", PDK_Skipped)
670 .Case("if", PDK_Skipped)
671 .Case("ifdef", PDK_Skipped)
672 .Case("ifndef", PDK_Skipped)
673 .Case("elif", PDK_Skipped)
674 .Case("else", PDK_Skipped)
675 .Case("endif", PDK_Skipped)
676 .Default(PDK_Unknown);
677
678 switch (PDK) {
679 case PDK_Skipped:
680 continue;
681
682 case PDK_Unknown:
683 // We don't know what this directive is; stop at the '#'.
684 break;
685 }
686 }
687
688 // We only end up here if we didn't recognize the preprocessor
689 // directive or it was one that can't occur in the preamble at this
690 // point. Roll back the current token to the location of the '#'.
691 InPreprocessorDirective = false;
692 TheTok = HashTok;
693 }
694
695 // We hit a token that we don't recognize as being in the
696 // "preprocessing only" part of the file, so we're no longer in
697 // the preamble.
698 break;
699 } while (true);
700
701 SourceLocation End;
702 if (ActiveCommentLoc.isValid())
703 End = ActiveCommentLoc; // don't truncate a decl comment.
704 else
705 End = TheTok.getLocation();
706
707 return PreambleBounds(End.getRawEncoding() - FileLoc.getRawEncoding(),
708 TheTok.isAtStartOfLine());
709}
710
711unsigned Lexer::getTokenPrefixLength(SourceLocation TokStart, unsigned CharNo,
712 const SourceManager &SM,
713 const LangOptions &LangOpts) {
714 // Figure out how many physical characters away the specified expansion
715 // character is. This needs to take into consideration newlines and
716 // trigraphs.
717 bool Invalid = false;
718 const char *TokPtr = SM.getCharacterData(TokStart, &Invalid);
719
720 // If they request the first char of the token, we're trivially done.
721 if (Invalid || (CharNo == 0 && Lexer::isObviouslySimpleCharacter(*TokPtr)))
722 return 0;
723
724 unsigned PhysOffset = 0;
725
726 // The usual case is that tokens don't contain anything interesting. Skip
727 // over the uninteresting characters. If a token only consists of simple
728 // chars, this method is extremely fast.
729 while (Lexer::isObviouslySimpleCharacter(*TokPtr)) {
730 if (CharNo == 0)
731 return PhysOffset;
732 ++TokPtr;
733 --CharNo;
734 ++PhysOffset;
735 }
736
737 // If we have a character that may be a trigraph or escaped newline, use a
738 // lexer to parse it correctly.
739 for (; CharNo; --CharNo) {
740 unsigned Size;
741 Lexer::getCharAndSizeNoWarn(TokPtr, Size, LangOpts);
742 TokPtr += Size;
743 PhysOffset += Size;
744 }
745
746 // Final detail: if we end up on an escaped newline, we want to return the
747 // location of the actual byte of the token. For example foo\<newline>bar
748 // advanced by 3 should return the location of b, not of \\. One compounding
749 // detail of this is that the escape may be made by a trigraph.
750 if (!Lexer::isObviouslySimpleCharacter(*TokPtr))
751 PhysOffset += Lexer::SkipEscapedNewLines(TokPtr)-TokPtr;
752
753 return PhysOffset;
754}
755
756/// Computes the source location just past the end of the
757/// token at this source location.
758///
759/// This routine can be used to produce a source location that
760/// points just past the end of the token referenced by \p Loc, and
761/// is generally used when a diagnostic needs to point just after a
762/// token where it expected something different that it received. If
763/// the returned source location would not be meaningful (e.g., if
764/// it points into a macro), this routine returns an invalid
765/// source location.
766///
767/// \param Offset an offset from the end of the token, where the source
768/// location should refer to. The default offset (0) produces a source
769/// location pointing just past the end of the token; an offset of 1 produces
770/// a source location pointing to the last character in the token, etc.
771SourceLocation Lexer::getLocForEndOfToken(SourceLocation Loc, unsigned Offset,
772 const SourceManager &SM,
773 const LangOptions &LangOpts) {
774 if (Loc.isInvalid())
775 return {};
776
777 if (Loc.isMacroID()) {
778 if (Offset > 0 || !isAtEndOfMacroExpansion(Loc, SM, LangOpts, &Loc))
779 return {}; // Points inside the macro expansion.
780 }
781
782 unsigned Len = Lexer::MeasureTokenLength(Loc, SM, LangOpts);
783 if (Len > Offset)
784 Len = Len - Offset;
785 else
786 return Loc;
787
788 return Loc.getLocWithOffset(Len);
789}
790
791/// Returns true if the given MacroID location points at the first
792/// token of the macro expansion.
793bool Lexer::isAtStartOfMacroExpansion(SourceLocation loc,
794 const SourceManager &SM,
795 const LangOptions &LangOpts,
796 SourceLocation *MacroBegin) {
797 assert(loc.isValid() && loc.isMacroID() && "Expected a valid macro loc")(static_cast <bool> (loc.isValid() && loc.isMacroID
() && "Expected a valid macro loc") ? void (0) : __assert_fail
("loc.isValid() && loc.isMacroID() && \"Expected a valid macro loc\""
, "/build/llvm-toolchain-snapshot-7~svn338205/tools/clang/lib/Lex/Lexer.cpp"
, 797, __extension__ __PRETTY_FUNCTION__))
;
798
799 SourceLocation expansionLoc;
800 if (!SM.isAtStartOfImmediateMacroExpansion(loc, &expansionLoc))
801 return false;
802
803 if (expansionLoc.isFileID()) {
804 // No other macro expansions, this is the first.
805 if (MacroBegin)
806 *MacroBegin = expansionLoc;
807 return true;
808 }
809
810 return isAtStartOfMacroExpansion(expansionLoc, SM, LangOpts, MacroBegin);
811}
812
813/// Returns true if the given MacroID location points at the last
814/// token of the macro expansion.
815bool Lexer::isAtEndOfMacroExpansion(SourceLocation loc,
816 const SourceManager &SM,
817 const LangOptions &LangOpts,
818 SourceLocation *MacroEnd) {
819 assert(loc.isValid() && loc.isMacroID() && "Expected a valid macro loc")(static_cast <bool> (loc.isValid() && loc.isMacroID
() && "Expected a valid macro loc") ? void (0) : __assert_fail
("loc.isValid() && loc.isMacroID() && \"Expected a valid macro loc\""
, "/build/llvm-toolchain-snapshot-7~svn338205/tools/clang/lib/Lex/Lexer.cpp"
, 819, __extension__ __PRETTY_FUNCTION__))
;
820
821 SourceLocation spellLoc = SM.getSpellingLoc(loc);
822 unsigned tokLen = MeasureTokenLength(spellLoc, SM, LangOpts);
823 if (tokLen == 0)
824 return false;
825
826 SourceLocation afterLoc = loc.getLocWithOffset(tokLen);
827 SourceLocation expansionLoc;
828 if (!SM.isAtEndOfImmediateMacroExpansion(afterLoc, &expansionLoc))
829 return false;
830
831 if (expansionLoc.isFileID()) {
832 // No other macro expansions.
833 if (MacroEnd)
834 *MacroEnd = expansionLoc;
835 return true;
836 }
837
838 return isAtEndOfMacroExpansion(expansionLoc, SM, LangOpts, MacroEnd);
839}
840
841static CharSourceRange makeRangeFromFileLocs(CharSourceRange Range,
842 const SourceManager &SM,
843 const LangOptions &LangOpts) {
844 SourceLocation Begin = Range.getBegin();
845 SourceLocation End = Range.getEnd();
846 assert(Begin.isFileID() && End.isFileID())(static_cast <bool> (Begin.isFileID() && End.isFileID
()) ? void (0) : __assert_fail ("Begin.isFileID() && End.isFileID()"
, "/build/llvm-toolchain-snapshot-7~svn338205/tools/clang/lib/Lex/Lexer.cpp"
, 846, __extension__ __PRETTY_FUNCTION__))
;
847 if (Range.isTokenRange()) {
848 End = Lexer::getLocForEndOfToken(End, 0, SM,LangOpts);
849 if (End.isInvalid())
850 return {};
851 }
852
853 // Break down the source locations.
854 FileID FID;
855 unsigned BeginOffs;
856 std::tie(FID, BeginOffs) = SM.getDecomposedLoc(Begin);
857 if (FID.isInvalid())
858 return {};
859
860 unsigned EndOffs;
861 if (!SM.isInFileID(End, FID, &EndOffs) ||
862 BeginOffs > EndOffs)
863 return {};
864
865 return CharSourceRange::getCharRange(Begin, End);
866}
867
868CharSourceRange Lexer::makeFileCharRange(CharSourceRange Range,
869 const SourceManager &SM,
870 const LangOptions &LangOpts) {
871 SourceLocation Begin = Range.getBegin();
872 SourceLocation End = Range.getEnd();
873 if (Begin.isInvalid() || End.isInvalid())
874 return {};
875
876 if (Begin.isFileID() && End.isFileID())
877 return makeRangeFromFileLocs(Range, SM, LangOpts);
878
879 if (Begin.isMacroID() && End.isFileID()) {
880 if (!isAtStartOfMacroExpansion(Begin, SM, LangOpts, &Begin))
881 return {};
882 Range.setBegin(Begin);
883 return makeRangeFromFileLocs(Range, SM, LangOpts);
884 }
885
886 if (Begin.isFileID() && End.isMacroID()) {
887 if ((Range.isTokenRange() && !isAtEndOfMacroExpansion(End, SM, LangOpts,
888 &End)) ||
889 (Range.isCharRange() && !isAtStartOfMacroExpansion(End, SM, LangOpts,
890 &End)))
891 return {};
892 Range.setEnd(End);
893 return makeRangeFromFileLocs(Range, SM, LangOpts);
894 }
895
896 assert(Begin.isMacroID() && End.isMacroID())(static_cast <bool> (Begin.isMacroID() && End.isMacroID
()) ? void (0) : __assert_fail ("Begin.isMacroID() && End.isMacroID()"
, "/build/llvm-toolchain-snapshot-7~svn338205/tools/clang/lib/Lex/Lexer.cpp"
, 896, __extension__ __PRETTY_FUNCTION__))
;
897 SourceLocation MacroBegin, MacroEnd;
898 if (isAtStartOfMacroExpansion(Begin, SM, LangOpts, &MacroBegin) &&
899 ((Range.isTokenRange() && isAtEndOfMacroExpansion(End, SM, LangOpts,
900 &MacroEnd)) ||
901 (Range.isCharRange() && isAtStartOfMacroExpansion(End, SM, LangOpts,
902 &MacroEnd)))) {
903 Range.setBegin(MacroBegin);
904 Range.setEnd(MacroEnd);
905 return makeRangeFromFileLocs(Range, SM, LangOpts);
906 }
907
908 bool Invalid = false;
909 const SrcMgr::SLocEntry &BeginEntry = SM.getSLocEntry(SM.getFileID(Begin),
910 &Invalid);
911 if (Invalid)
912 return {};
913
914 if (BeginEntry.getExpansion().isMacroArgExpansion()) {
915 const SrcMgr::SLocEntry &EndEntry = SM.getSLocEntry(SM.getFileID(End),
916 &Invalid);
917 if (Invalid)
918 return {};
919
920 if (EndEntry.getExpansion().isMacroArgExpansion() &&
921 BeginEntry.getExpansion().getExpansionLocStart() ==
922 EndEntry.getExpansion().getExpansionLocStart()) {
923 Range.setBegin(SM.getImmediateSpellingLoc(Begin));
924 Range.setEnd(SM.getImmediateSpellingLoc(End));
925 return makeFileCharRange(Range, SM, LangOpts);
926 }
927 }
928
929 return {};
930}
931
932StringRef Lexer::getSourceText(CharSourceRange Range,
933 const SourceManager &SM,
934 const LangOptions &LangOpts,
935 bool *Invalid) {
936 Range = makeFileCharRange(Range, SM, LangOpts);
937 if (Range.isInvalid()) {
938 if (Invalid) *Invalid = true;
939 return {};
940 }
941
942 // Break down the source location.
943 std::pair<FileID, unsigned> beginInfo = SM.getDecomposedLoc(Range.getBegin());
944 if (beginInfo.first.isInvalid()) {
945 if (Invalid) *Invalid = true;
946 return {};
947 }
948
949 unsigned EndOffs;
950 if (!SM.isInFileID(Range.getEnd(), beginInfo.first, &EndOffs) ||
951 beginInfo.second > EndOffs) {
952 if (Invalid) *Invalid = true;
953 return {};
954 }
955
956 // Try to the load the file buffer.
957 bool invalidTemp = false;
958 StringRef file = SM.getBufferData(beginInfo.first, &invalidTemp);
959 if (invalidTemp) {
960 if (Invalid) *Invalid = true;
961 return {};
962 }
963
964 if (Invalid) *Invalid = false;
965 return file.substr(beginInfo.second, EndOffs - beginInfo.second);
966}
967
968StringRef Lexer::getImmediateMacroName(SourceLocation Loc,
969 const SourceManager &SM,
970 const LangOptions &LangOpts) {
971 assert(Loc.isMacroID() && "Only reasonable to call this on macros")(static_cast <bool> (Loc.isMacroID() && "Only reasonable to call this on macros"
) ? void (0) : __assert_fail ("Loc.isMacroID() && \"Only reasonable to call this on macros\""
, "/build/llvm-toolchain-snapshot-7~svn338205/tools/clang/lib/Lex/Lexer.cpp"
, 971, __extension__ __PRETTY_FUNCTION__))
;
972
973 // Find the location of the immediate macro expansion.
974 while (true) {
975 FileID FID = SM.getFileID(Loc);
976 const SrcMgr::SLocEntry *E = &SM.getSLocEntry(FID);
977 const SrcMgr::ExpansionInfo &Expansion = E->getExpansion();
978 Loc = Expansion.getExpansionLocStart();
979 if (!Expansion.isMacroArgExpansion())
980 break;
981
982 // For macro arguments we need to check that the argument did not come
983 // from an inner macro, e.g: "MAC1( MAC2(foo) )"
984
985 // Loc points to the argument id of the macro definition, move to the
986 // macro expansion.
987 Loc = SM.getImmediateExpansionRange(Loc).getBegin();
988 SourceLocation SpellLoc = Expansion.getSpellingLoc();
989 if (SpellLoc.isFileID())
990 break; // No inner macro.
991
992 // If spelling location resides in the same FileID as macro expansion
993 // location, it means there is no inner macro.
994 FileID MacroFID = SM.getFileID(Loc);
995 if (SM.isInFileID(SpellLoc, MacroFID))
996 break;
997
998 // Argument came from inner macro.
999 Loc = SpellLoc;
1000 }
1001
1002 // Find the spelling location of the start of the non-argument expansion
1003 // range. This is where the macro name was spelled in order to begin
1004 // expanding this macro.
1005 Loc = SM.getSpellingLoc(Loc);
1006
1007 // Dig out the buffer where the macro name was spelled and the extents of the
1008 // name so that we can render it into the expansion note.
1009 std::pair<FileID, unsigned> ExpansionInfo = SM.getDecomposedLoc(Loc);
1010 unsigned MacroTokenLength = Lexer::MeasureTokenLength(Loc, SM, LangOpts);
1011 StringRef ExpansionBuffer = SM.getBufferData(ExpansionInfo.first);
1012 return ExpansionBuffer.substr(ExpansionInfo.second, MacroTokenLength);
1013}
1014
1015StringRef Lexer::getImmediateMacroNameForDiagnostics(
1016 SourceLocation Loc, const SourceManager &SM, const LangOptions &LangOpts) {
1017 assert(Loc.isMacroID() && "Only reasonable to call this on macros")(static_cast <bool> (Loc.isMacroID() && "Only reasonable to call this on macros"
) ? void (0) : __assert_fail ("Loc.isMacroID() && \"Only reasonable to call this on macros\""
, "/build/llvm-toolchain-snapshot-7~svn338205/tools/clang/lib/Lex/Lexer.cpp"
, 1017, __extension__ __PRETTY_FUNCTION__))
;
1018 // Walk past macro argument expanions.
1019 while (SM.isMacroArgExpansion(Loc))
1020 Loc = SM.getImmediateExpansionRange(Loc).getBegin();
1021
1022 // If the macro's spelling has no FileID, then it's actually a token paste
1023 // or stringization (or similar) and not a macro at all.
1024 if (!SM.getFileEntryForID(SM.getFileID(SM.getSpellingLoc(Loc))))
1025 return {};
1026
1027 // Find the spelling location of the start of the non-argument expansion
1028 // range. This is where the macro name was spelled in order to begin
1029 // expanding this macro.
1030 Loc = SM.getSpellingLoc(SM.getImmediateExpansionRange(Loc).getBegin());
1031
1032 // Dig out the buffer where the macro name was spelled and the extents of the
1033 // name so that we can render it into the expansion note.
1034 std::pair<FileID, unsigned> ExpansionInfo = SM.getDecomposedLoc(Loc);
1035 unsigned MacroTokenLength = Lexer::MeasureTokenLength(Loc, SM, LangOpts);
1036 StringRef ExpansionBuffer = SM.getBufferData(ExpansionInfo.first);
1037 return ExpansionBuffer.substr(ExpansionInfo.second, MacroTokenLength);
1038}
1039
/// Returns true if the given character may appear in the body of an
/// identifier, honoring the '$'-in-identifiers language option.
bool Lexer::isIdentifierBodyChar(char c, const LangOptions &LangOpts) {
  return isIdentifierBody(c, LangOpts.DollarIdents);
}
1043
1044bool Lexer::isNewLineEscaped(const char *BufferStart, const char *Str) {
1045 assert(isVerticalWhitespace(Str[0]))(static_cast <bool> (isVerticalWhitespace(Str[0])) ? void
(0) : __assert_fail ("isVerticalWhitespace(Str[0])", "/build/llvm-toolchain-snapshot-7~svn338205/tools/clang/lib/Lex/Lexer.cpp"
, 1045, __extension__ __PRETTY_FUNCTION__))
;
1046 if (Str - 1 < BufferStart)
1047 return false;
1048
1049 if ((Str[0] == '\n' && Str[-1] == '\r') ||
1050 (Str[0] == '\r' && Str[-1] == '\n')) {
1051 if (Str - 2 < BufferStart)
1052 return false;
1053 --Str;
1054 }
1055 --Str;
1056
1057 // Rewind to first non-space character:
1058 while (Str > BufferStart && isHorizontalWhitespace(*Str))
1059 --Str;
1060
1061 return *Str == '\\';
1062}
1063
1064StringRef Lexer::getIndentationForLine(SourceLocation Loc,
1065 const SourceManager &SM) {
1066 if (Loc.isInvalid() || Loc.isMacroID())
1067 return {};
1068 std::pair<FileID, unsigned> LocInfo = SM.getDecomposedLoc(Loc);
1069 if (LocInfo.first.isInvalid())
1070 return {};
1071 bool Invalid = false;
1072 StringRef Buffer = SM.getBufferData(LocInfo.first, &Invalid);
1073 if (Invalid)
1074 return {};
1075 const char *Line = findBeginningOfLine(Buffer, LocInfo.second);
1076 if (!Line)
1077 return {};
1078 StringRef Rest = Buffer.substr(Line - Buffer.data());
1079 size_t NumWhitespaceChars = Rest.find_first_not_of(" \t");
1080 return NumWhitespaceChars == StringRef::npos
1081 ? ""
1082 : Rest.take_front(NumWhitespaceChars);
1083}
1084
1085//===----------------------------------------------------------------------===//
1086// Diagnostics forwarding code.
1087//===----------------------------------------------------------------------===//
1088
1089/// GetMappedTokenLoc - If lexing out of a 'mapped buffer', where we pretend the
1090/// lexer buffer was all expanded at a single point, perform the mapping.
1091/// This is currently only used for _Pragma implementation, so it is the slow
1092/// path of the hot getSourceLocation method. Do not allow it to be inlined.
1093static LLVM_ATTRIBUTE_NOINLINE__attribute__((noinline)) SourceLocation GetMappedTokenLoc(
1094 Preprocessor &PP, SourceLocation FileLoc, unsigned CharNo, unsigned TokLen);
1095static SourceLocation GetMappedTokenLoc(Preprocessor &PP,
1096 SourceLocation FileLoc,
1097 unsigned CharNo, unsigned TokLen) {
1098 assert(FileLoc.isMacroID() && "Must be a macro expansion")(static_cast <bool> (FileLoc.isMacroID() && "Must be a macro expansion"
) ? void (0) : __assert_fail ("FileLoc.isMacroID() && \"Must be a macro expansion\""
, "/build/llvm-toolchain-snapshot-7~svn338205/tools/clang/lib/Lex/Lexer.cpp"
, 1098, __extension__ __PRETTY_FUNCTION__))
;
1099
1100 // Otherwise, we're lexing "mapped tokens". This is used for things like
1101 // _Pragma handling. Combine the expansion location of FileLoc with the
1102 // spelling location.
1103 SourceManager &SM = PP.getSourceManager();
1104
1105 // Create a new SLoc which is expanded from Expansion(FileLoc) but whose
1106 // characters come from spelling(FileLoc)+Offset.
1107 SourceLocation SpellingLoc = SM.getSpellingLoc(FileLoc);
1108 SpellingLoc = SpellingLoc.getLocWithOffset(CharNo);
1109
1110 // Figure out the expansion loc range, which is the range covered by the
1111 // original _Pragma(...) sequence.
1112 CharSourceRange II = SM.getImmediateExpansionRange(FileLoc);
1113
1114 return SM.createExpansionLoc(SpellingLoc, II.getBegin(), II.getEnd(), TokLen);
1115}
1116
1117/// getSourceLocation - Return a source location identifier for the specified
1118/// offset in the current file.
1119SourceLocation Lexer::getSourceLocation(const char *Loc,
1120 unsigned TokLen) const {
1121 assert(Loc >= BufferStart && Loc <= BufferEnd &&(static_cast <bool> (Loc >= BufferStart && Loc
<= BufferEnd && "Location out of range for this buffer!"
) ? void (0) : __assert_fail ("Loc >= BufferStart && Loc <= BufferEnd && \"Location out of range for this buffer!\""
, "/build/llvm-toolchain-snapshot-7~svn338205/tools/clang/lib/Lex/Lexer.cpp"
, 1122, __extension__ __PRETTY_FUNCTION__))
1122 "Location out of range for this buffer!")(static_cast <bool> (Loc >= BufferStart && Loc
<= BufferEnd && "Location out of range for this buffer!"
) ? void (0) : __assert_fail ("Loc >= BufferStart && Loc <= BufferEnd && \"Location out of range for this buffer!\""
, "/build/llvm-toolchain-snapshot-7~svn338205/tools/clang/lib/Lex/Lexer.cpp"
, 1122, __extension__ __PRETTY_FUNCTION__))
;
1123
1124 // In the normal case, we're just lexing from a simple file buffer, return
1125 // the file id from FileLoc with the offset specified.
1126 unsigned CharNo = Loc-BufferStart;
1127 if (FileLoc.isFileID())
1128 return FileLoc.getLocWithOffset(CharNo);
1129
1130 // Otherwise, this is the _Pragma lexer case, which pretends that all of the
1131 // tokens are lexed from where the _Pragma was defined.
1132 assert(PP && "This doesn't work on raw lexers")(static_cast <bool> (PP && "This doesn't work on raw lexers"
) ? void (0) : __assert_fail ("PP && \"This doesn't work on raw lexers\""
, "/build/llvm-toolchain-snapshot-7~svn338205/tools/clang/lib/Lex/Lexer.cpp"
, 1132, __extension__ __PRETTY_FUNCTION__))
;
1133 return GetMappedTokenLoc(*PP, FileLoc, CharNo, TokLen);
1134}
1135
/// Diag - Forwarding function for diagnostics. This translates a source
/// position in the current buffer into a SourceLocation object for rendering.
/// Requires a preprocessor, so it is not usable from a raw lexer.
DiagnosticBuilder Lexer::Diag(const char *Loc, unsigned DiagID) const {
  return PP->Diag(getSourceLocation(Loc), DiagID);
}
1141
1142//===----------------------------------------------------------------------===//
1143// Trigraph and Escaped Newline Handling Code.
1144//===----------------------------------------------------------------------===//
1145
/// GetTrigraphCharForLetter - Given a character that occurs after a ?? pair,
/// return the decoded trigraph letter it corresponds to, or '\0' if nothing.
static char GetTrigraphCharForLetter(char Letter) {
  // Parallel strings mapping each trigraph's final letter to the character
  // it denotes (C11 5.2.1.1).
  static const char TrigraphLetters[] = "=)(!'>/<-";
  static const char TrigraphValues[]  = "#][|^}\\{~";
  for (unsigned I = 0; TrigraphLetters[I]; ++I)
    if (Letter == TrigraphLetters[I])
      return TrigraphValues[I];
  return 0;
}
1162
1163/// DecodeTrigraphChar - If the specified character is a legal trigraph when
1164/// prefixed with ??, emit a trigraph warning. If trigraphs are enabled,
1165/// return the result character. Finally, emit a warning about trigraph use
1166/// whether trigraphs are enabled or not.
1167static char DecodeTrigraphChar(const char *CP, Lexer *L) {
1168 char Res = GetTrigraphCharForLetter(*CP);
1169 if (!Res || !L) return Res;
1170
1171 if (!L->getLangOpts().Trigraphs) {
1172 if (!L->isLexingRawMode())
1173 L->Diag(CP-2, diag::trigraph_ignored);
1174 return 0;
1175 }
1176
1177 if (!L->isLexingRawMode())
1178 L->Diag(CP-2, diag::trigraph_converted) << StringRef(&Res, 1);
1179 return Res;
1180}
1181
1182/// getEscapedNewLineSize - Return the size of the specified escaped newline,
1183/// or 0 if it is not an escaped newline. P[-1] is known to be a "\" or a
1184/// trigraph equivalent on entry to this function.
1185unsigned Lexer::getEscapedNewLineSize(const char *Ptr) {
1186 unsigned Size = 0;
1187 while (isWhitespace(Ptr[Size])) {
1188 ++Size;
1189
1190 if (Ptr[Size-1] != '\n' && Ptr[Size-1] != '\r')
1191 continue;
1192
1193 // If this is a \r\n or \n\r, skip the other half.
1194 if ((Ptr[Size] == '\r' || Ptr[Size] == '\n') &&
1195 Ptr[Size-1] != Ptr[Size])
1196 ++Size;
1197
1198 return Size;
1199 }
1200
1201 // Not an escaped newline, must be a \t or something else.
1202 return 0;
1203}
1204
1205/// SkipEscapedNewLines - If P points to an escaped newline (or a series of
1206/// them), skip over them and return the first non-escaped-newline found,
1207/// otherwise return P.
1208const char *Lexer::SkipEscapedNewLines(const char *P) {
1209 while (true) {
1210 const char *AfterEscape;
1211 if (*P == '\\') {
1212 AfterEscape = P+1;
1213 } else if (*P == '?') {
1214 // If not a trigraph for escape, bail out.
1215 if (P[1] != '?' || P[2] != '/')
1216 return P;
1217 // FIXME: Take LangOpts into account; the language might not
1218 // support trigraphs.
1219 AfterEscape = P+3;
1220 } else {
1221 return P;
1222 }
1223
1224 unsigned NewLineSize = Lexer::getEscapedNewLineSize(AfterEscape);
1225 if (NewLineSize == 0) return P;
1226 P = AfterEscape+NewLineSize;
1227 }
1228}
1229
1230Optional<Token> Lexer::findNextToken(SourceLocation Loc,
1231 const SourceManager &SM,
1232 const LangOptions &LangOpts) {
1233 if (Loc.isMacroID()) {
1234 if (!Lexer::isAtEndOfMacroExpansion(Loc, SM, LangOpts, &Loc))
1235 return None;
1236 }
1237 Loc = Lexer::getLocForEndOfToken(Loc, 0, SM, LangOpts);
1238
1239 // Break down the source location.
1240 std::pair<FileID, unsigned> LocInfo = SM.getDecomposedLoc(Loc);
1241
1242 // Try to load the file buffer.
1243 bool InvalidTemp = false;
1244 StringRef File = SM.getBufferData(LocInfo.first, &InvalidTemp);
1245 if (InvalidTemp)
1246 return None;
1247
1248 const char *TokenBegin = File.data() + LocInfo.second;
1249
1250 // Lex from the start of the given location.
1251 Lexer lexer(SM.getLocForStartOfFile(LocInfo.first), LangOpts, File.begin(),
1252 TokenBegin, File.end());
1253 // Find the token.
1254 Token Tok;
1255 lexer.LexFromRawLexer(Tok);
1256 return Tok;
1257}
1258
1259/// Checks that the given token is the first token that occurs after the
1260/// given location (this excludes comments and whitespace). Returns the location
1261/// immediately after the specified token. If the token is not found or the
1262/// location is inside a macro, the returned source location will be invalid.
1263SourceLocation Lexer::findLocationAfterToken(
1264 SourceLocation Loc, tok::TokenKind TKind, const SourceManager &SM,
1265 const LangOptions &LangOpts, bool SkipTrailingWhitespaceAndNewLine) {
1266 Optional<Token> Tok = findNextToken(Loc, SM, LangOpts);
1267 if (!Tok || Tok->isNot(TKind))
1268 return {};
1269 SourceLocation TokenLoc = Tok->getLocation();
1270
1271 // Calculate how much whitespace needs to be skipped if any.
1272 unsigned NumWhitespaceChars = 0;
1273 if (SkipTrailingWhitespaceAndNewLine) {
1274 const char *TokenEnd = SM.getCharacterData(TokenLoc) + Tok->getLength();
1275 unsigned char C = *TokenEnd;
1276 while (isHorizontalWhitespace(C)) {
1277 C = *(++TokenEnd);
1278 NumWhitespaceChars++;
1279 }
1280
1281 // Skip \r, \n, \r\n, or \n\r
1282 if (C == '\n' || C == '\r') {
1283 char PrevC = C;
1284 C = *(++TokenEnd);
1285 NumWhitespaceChars++;
1286 if ((C == '\n' || C == '\r') && C != PrevC)
1287 NumWhitespaceChars++;
1288 }
1289 }
1290
1291 return TokenLoc.getLocWithOffset(Tok->getLength() + NumWhitespaceChars);
1292}
1293
1294/// getCharAndSizeSlow - Peek a single 'character' from the specified buffer,
1295/// get its size, and return it. This is tricky in several cases:
1296/// 1. If currently at the start of a trigraph, we warn about the trigraph,
1297/// then either return the trigraph (skipping 3 chars) or the '?',
1298/// depending on whether trigraphs are enabled or not.
1299/// 2. If this is an escaped newline (potentially with whitespace between
1300/// the backslash and newline), implicitly skip the newline and return
1301/// the char after it.
1302///
1303/// This handles the slow/uncommon case of the getCharAndSize method. Here we
1304/// know that we can accumulate into Size, and that we have already incremented
1305/// Ptr by Size bytes.
1306///
1307/// NOTE: When this method is updated, getCharAndSizeSlowNoWarn (below) should
1308/// be updated to match.
1309char Lexer::getCharAndSizeSlow(const char *Ptr, unsigned &Size,
1310 Token *Tok) {
1311 // If we have a slash, look for an escaped newline.
1312 if (Ptr[0] == '\\') {
5
Taking true branch
1313 ++Size;
1314 ++Ptr;
1315Slash:
1316 // Common case, backslash-char where the char is not whitespace.
1317 if (!isWhitespace(Ptr[0])) return '\\';
6
Taking false branch
1318
1319 // See if we have optional whitespace characters between the slash and
1320 // newline.
1321 if (unsigned EscapedNewLineSize = getEscapedNewLineSize(Ptr)) {
7
Assuming 'EscapedNewLineSize' is not equal to 0
8
Taking true branch
1322 // Remember that this token needs to be cleaned.
1323 if (Tok) Tok->setFlag(Token::NeedsCleaning);
9
Taking true branch
10
Calling 'Token::setFlag'
1324
1325 // Warn if there was whitespace between the backslash and newline.
1326 if (Ptr[0] != '\n' && Ptr[0] != '\r' && Tok && !isLexingRawMode())
1327 Diag(Ptr, diag::backslash_newline_space);
1328
1329 // Found backslash<whitespace><newline>. Parse the char after it.
1330 Size += EscapedNewLineSize;
1331 Ptr += EscapedNewLineSize;
1332
1333 // Use slow version to accumulate a correct size field.
1334 return getCharAndSizeSlow(Ptr, Size, Tok);
1335 }
1336
1337 // Otherwise, this is not an escaped newline, just return the slash.
1338 return '\\';
1339 }
1340
1341 // If this is a trigraph, process it.
1342 if (Ptr[0] == '?' && Ptr[1] == '?') {
1343 // If this is actually a legal trigraph (not something like "??x"), emit
1344 // a trigraph warning. If so, and if trigraphs are enabled, return it.
1345 if (char C = DecodeTrigraphChar(Ptr+2, Tok ? this : nullptr)) {
1346 // Remember that this token needs to be cleaned.
1347 if (Tok) Tok->setFlag(Token::NeedsCleaning);
1348
1349 Ptr += 3;
1350 Size += 3;
1351 if (C == '\\') goto Slash;
1352 return C;
1353 }
1354 }
1355
1356 // If this is neither, return a single character.
1357 ++Size;
1358 return *Ptr;
1359}
1360
1361/// getCharAndSizeSlowNoWarn - Handle the slow/uncommon case of the
1362/// getCharAndSizeNoWarn method. Here we know that we can accumulate into Size,
1363/// and that we have already incremented Ptr by Size bytes.
1364///
1365/// NOTE: When this method is updated, getCharAndSizeSlow (above) should
1366/// be updated to match.
1367char Lexer::getCharAndSizeSlowNoWarn(const char *Ptr, unsigned &Size,
1368 const LangOptions &LangOpts) {
1369 // If we have a slash, look for an escaped newline.
1370 if (Ptr[0] == '\\') {
1371 ++Size;
1372 ++Ptr;
1373Slash:
1374 // Common case, backslash-char where the char is not whitespace.
1375 if (!isWhitespace(Ptr[0])) return '\\';
1376
1377 // See if we have optional whitespace characters followed by a newline.
1378 if (unsigned EscapedNewLineSize = getEscapedNewLineSize(Ptr)) {
1379 // Found backslash<whitespace><newline>. Parse the char after it.
1380 Size += EscapedNewLineSize;
1381 Ptr += EscapedNewLineSize;
1382
1383 // Use slow version to accumulate a correct size field.
1384 return getCharAndSizeSlowNoWarn(Ptr, Size, LangOpts);
1385 }
1386
1387 // Otherwise, this is not an escaped newline, just return the slash.
1388 return '\\';
1389 }
1390
1391 // If this is a trigraph, process it.
1392 if (LangOpts.Trigraphs && Ptr[0] == '?' && Ptr[1] == '?') {
1393 // If this is actually a legal trigraph (not something like "??x"), return
1394 // it.
1395 if (char C = GetTrigraphCharForLetter(Ptr[2])) {
1396 Ptr += 3;
1397 Size += 3;
1398 if (C == '\\') goto Slash;
1399 return C;
1400 }
1401 }
1402
1403 // If this is neither, return a single character.
1404 ++Size;
1405 return *Ptr;
1406}
1407
1408//===----------------------------------------------------------------------===//
1409// Helper methods for lexing.
1410//===----------------------------------------------------------------------===//
1411
1412/// Routine that indiscriminately sets the offset into the source file.
1413void Lexer::SetByteOffset(unsigned Offset, bool StartOfLine) {
1414 BufferPtr = BufferStart + Offset;
1415 if (BufferPtr > BufferEnd)
1416 BufferPtr = BufferEnd;
1417 // FIXME: What exactly does the StartOfLine bit mean? There are two
1418 // possible meanings for the "start" of the line: the first token on the
1419 // unexpanded line, or the first token on the expanded line.
1420 IsAtStartOfLine = StartOfLine;
1421 IsAtPhysicalStartOfLine = StartOfLine;
1422}
1423
1424static bool isAllowedIDChar(uint32_t C, const LangOptions &LangOpts) {
1425 if (LangOpts.AsmPreprocessor) {
1426 return false;
1427 } else if (LangOpts.CPlusPlus11 || LangOpts.C11) {
1428 static const llvm::sys::UnicodeCharSet C11AllowedIDChars(
1429 C11AllowedIDCharRanges);
1430 return C11AllowedIDChars.contains(C);
1431 } else if (LangOpts.CPlusPlus) {
1432 static const llvm::sys::UnicodeCharSet CXX03AllowedIDChars(
1433 CXX03AllowedIDCharRanges);
1434 return CXX03AllowedIDChars.contains(C);
1435 } else {
1436 static const llvm::sys::UnicodeCharSet C99AllowedIDChars(
1437 C99AllowedIDCharRanges);
1438 return C99AllowedIDChars.contains(C);
1439 }
1440}
1441
1442static bool isAllowedInitiallyIDChar(uint32_t C, const LangOptions &LangOpts) {
1443 assert(isAllowedIDChar(C, LangOpts))(static_cast <bool> (isAllowedIDChar(C, LangOpts)) ? void
(0) : __assert_fail ("isAllowedIDChar(C, LangOpts)", "/build/llvm-toolchain-snapshot-7~svn338205/tools/clang/lib/Lex/Lexer.cpp"
, 1443, __extension__ __PRETTY_FUNCTION__))
;
1444 if (LangOpts.AsmPreprocessor) {
1445 return false;
1446 } else if (LangOpts.CPlusPlus11 || LangOpts.C11) {
1447 static const llvm::sys::UnicodeCharSet C11DisallowedInitialIDChars(
1448 C11DisallowedInitialIDCharRanges);
1449 return !C11DisallowedInitialIDChars.contains(C);
1450 } else if (LangOpts.CPlusPlus) {
1451 return true;
1452 } else {
1453 static const llvm::sys::UnicodeCharSet C99DisallowedInitialIDChars(
1454 C99DisallowedInitialIDCharRanges);
1455 return !C99DisallowedInitialIDChars.contains(C);
1456 }
1457}
1458
1459static inline CharSourceRange makeCharRange(Lexer &L, const char *Begin,
1460 const char *End) {
1461 return CharSourceRange::getCharRange(L.getSourceLocation(Begin),
1462 L.getSourceLocation(End));
1463}
1464
/// Emit compatibility warnings for an extended identifier character.
///
/// \p C is a code point accepted as an identifier character under the
/// current standard; warn if it would not be accepted under C99
/// (-Wc99-compat) or C++98 (-Wc++98-compat).  \p IsFirst is true when \p C
/// begins the identifier, which enables the "cannot start" variant.
static void maybeDiagnoseIDCharCompat(DiagnosticsEngine &Diags, uint32_t C,
                                      CharSourceRange Range, bool IsFirst) {
  // Check C99 compatibility.
  if (!Diags.isIgnored(diag::warn_c99_compat_unicode_id, Range.getBegin())) {
    // Values feed the %select in the diagnostic text.
    enum {
      CannotAppearInIdentifier = 0,
      CannotStartIdentifier
    };

    static const llvm::sys::UnicodeCharSet C99AllowedIDChars(
        C99AllowedIDCharRanges);
    static const llvm::sys::UnicodeCharSet C99DisallowedInitialIDChars(
        C99DisallowedInitialIDCharRanges);
    if (!C99AllowedIDChars.contains(C)) {
      Diags.Report(Range.getBegin(), diag::warn_c99_compat_unicode_id)
        << Range
        << CannotAppearInIdentifier;
    } else if (IsFirst && C99DisallowedInitialIDChars.contains(C)) {
      // Allowed in C99, but not in initial position.
      Diags.Report(Range.getBegin(), diag::warn_c99_compat_unicode_id)
        << Range
        << CannotStartIdentifier;
    }
  }

  // Check C++98 compatibility.
  if (!Diags.isIgnored(diag::warn_cxx98_compat_unicode_id, Range.getBegin())) {
    static const llvm::sys::UnicodeCharSet CXX03AllowedIDChars(
        CXX03AllowedIDCharRanges);
    if (!CXX03AllowedIDChars.contains(C)) {
      Diags.Report(Range.getBegin(), diag::warn_cxx98_compat_unicode_id)
        << Range;
    }
  }
}
1499
/// After encountering UTF-8 character C and interpreting it as an identifier
/// character, check whether it's a homoglyph for a common non-identifier
/// source character that is unlikely to be an intentional identifier
/// character and warn if so.
static void maybeDiagnoseUTF8Homoglyph(DiagnosticsEngine &Diags, uint32_t C,
                                       CharSourceRange Range) {
  // FIXME: Handle Unicode quotation marks (smart quotes, fullwidth quotes).
  struct HomoglyphPair {
    uint32_t Character;
    char LooksLike;
    // Ordered by code point so the table below can be binary-searched.
    bool operator<(HomoglyphPair R) const { return Character < R.Character; }
  };
  // NOTE: must remain sorted by Character for std::lower_bound below.
  static constexpr HomoglyphPair SortedHomoglyphs[] = {
    {U'\u01c3', '!'}, // LATIN LETTER RETROFLEX CLICK
    {U'\u037e', ';'}, // GREEK QUESTION MARK
    {U'\u2212', '-'}, // MINUS SIGN
    {U'\u2215', '/'}, // DIVISION SLASH
    {U'\u2216', '\\'}, // SET MINUS
    {U'\u2217', '*'}, // ASTERISK OPERATOR
    {U'\u2223', '|'}, // DIVIDES
    {U'\u2227', '^'}, // LOGICAL AND
    {U'\u2236', ':'}, // RATIO
    {U'\u223c', '~'}, // TILDE OPERATOR
    {U'\ua789', ':'}, // MODIFIER LETTER COLON
    {U'\uff01', '!'}, // FULLWIDTH EXCLAMATION MARK
    {U'\uff03', '#'}, // FULLWIDTH NUMBER SIGN
    {U'\uff04', '$'}, // FULLWIDTH DOLLAR SIGN
    {U'\uff05', '%'}, // FULLWIDTH PERCENT SIGN
    {U'\uff06', '&'}, // FULLWIDTH AMPERSAND
    {U'\uff08', '('}, // FULLWIDTH LEFT PARENTHESIS
    {U'\uff09', ')'}, // FULLWIDTH RIGHT PARENTHESIS
    {U'\uff0a', '*'}, // FULLWIDTH ASTERISK
    {U'\uff0b', '+'}, // FULLWIDTH PLUS SIGN
    {U'\uff0c', ','}, // FULLWIDTH COMMA
    {U'\uff0d', '-'}, // FULLWIDTH HYPHEN-MINUS
    {U'\uff0e', '.'}, // FULLWIDTH FULL STOP
    {U'\uff0f', '/'}, // FULLWIDTH SOLIDUS
    {U'\uff1a', ':'}, // FULLWIDTH COLON
    {U'\uff1b', ';'}, // FULLWIDTH SEMICOLON
    {U'\uff1c', '<'}, // FULLWIDTH LESS-THAN SIGN
    {U'\uff1d', '='}, // FULLWIDTH EQUALS SIGN
    {U'\uff1e', '>'}, // FULLWIDTH GREATER-THAN SIGN
    {U'\uff1f', '?'}, // FULLWIDTH QUESTION MARK
    {U'\uff20', '@'}, // FULLWIDTH COMMERCIAL AT
    {U'\uff3b', '['}, // FULLWIDTH LEFT SQUARE BRACKET
    {U'\uff3c', '\\'}, // FULLWIDTH REVERSE SOLIDUS
    {U'\uff3d', ']'}, // FULLWIDTH RIGHT SQUARE BRACKET
    {U'\uff3e', '^'}, // FULLWIDTH CIRCUMFLEX ACCENT
    {U'\uff5b', '{'}, // FULLWIDTH LEFT CURLY BRACKET
    {U'\uff5c', '|'}, // FULLWIDTH VERTICAL LINE
    {U'\uff5d', '}'}, // FULLWIDTH RIGHT CURLY BRACKET
    {U'\uff5e', '~'}, // FULLWIDTH TILDE
    {0, 0} // Sentinel terminator; excluded from the search range below.
  };
  // Binary search over the table (excluding the {0,0} sentinel).  If C is
  // larger than every entry, lower_bound lands on the sentinel, whose
  // Character (0) can never equal C, so the lookup safely misses.
  auto Homoglyph =
      std::lower_bound(std::begin(SortedHomoglyphs),
                       std::end(SortedHomoglyphs) - 1, HomoglyphPair{C, '\0'});
  if (Homoglyph->Character == C) {
    llvm::SmallString<5> CharBuf;
    {
      // Render the code point as 4+ uppercase hex digits for the diagnostic.
      llvm::raw_svector_ostream CharOS(CharBuf);
      llvm::write_hex(CharOS, C, llvm::HexPrintStyle::Upper, 4);
    }
    const char LooksLikeStr[] = {Homoglyph->LooksLike, 0};
    Diags.Report(Range.getBegin(), diag::warn_utf8_symbol_homoglyph)
        << Range << CharBuf << LooksLikeStr;
  }
}
1568
/// Try to consume a universal-character-name (\\uXXXX or \\UXXXXXXXX) as an
/// identifier character.
///
/// \p CurPtr points at the backslash introducing the UCN; \p Size is the
/// lexed size of that backslash (which may exceed 1 if it was spelled via a
/// trigraph or split by an escaped newline).  On success CurPtr is advanced
/// past the UCN, Token::HasUCN is set on \p Result, and true is returned.
/// Returns false without moving CurPtr if this is not a UCN or the code
/// point is not a valid identifier character.
bool Lexer::tryConsumeIdentifierUCN(const char *&CurPtr, unsigned Size,
                                    Token &Result) {
  const char *UCNPtr = CurPtr + Size;
  uint32_t CodePoint = tryReadUCN(UCNPtr, CurPtr, /*Token=*/nullptr);
  if (CodePoint == 0 || !isAllowedIDChar(CodePoint, LangOpts))
    return false;

  if (!isLexingRawMode())
    maybeDiagnoseIDCharCompat(PP->getDiagnostics(), CodePoint,
                              makeCharRange(*this, CurPtr, UCNPtr),
                              /*IsFirst=*/false);

  Result.setFlag(Token::HasUCN);
  // Fast path: a "clean" UCN spelled with no trigraphs or escaped newlines
  // occupies exactly 6 ("\uXXXX") or 10 ("\UXXXXXXXX") bytes, so we may skip
  // straight over it.  Otherwise re-read it character by character so any
  // needed cleaning is recorded on the token.
  if ((UCNPtr - CurPtr == 6 && CurPtr[1] == 'u') ||
      (UCNPtr - CurPtr == 10 && CurPtr[1] == 'U'))
    CurPtr = UCNPtr;
  else
    while (CurPtr != UCNPtr)
      (void)getAndAdvanceChar(CurPtr, Result);
  return true;
}
1590
/// Try to consume a (multi-byte) UTF-8-encoded code point as an identifier
/// character.
///
/// On success CurPtr is advanced past the full UTF-8 sequence and true is
/// returned; compatibility and homoglyph warnings are emitted unless lexing
/// in raw mode.  Returns false without moving CurPtr if the bytes are not a
/// valid UTF-8 sequence or the code point is not a valid identifier
/// character.
bool Lexer::tryConsumeIdentifierUTF8Char(const char *&CurPtr) {
  const char *UnicodePtr = CurPtr;
  llvm::UTF32 CodePoint;
  // Strict conversion: reject overlong encodings and surrogates.  On success
  // UnicodePtr is left pointing just past the sequence.
  llvm::ConversionResult Result =
      llvm::convertUTF8Sequence((const llvm::UTF8 **)&UnicodePtr,
                                (const llvm::UTF8 *)BufferEnd,
                                &CodePoint,
                                llvm::strictConversion);
  if (Result != llvm::conversionOK ||
      !isAllowedIDChar(static_cast<uint32_t>(CodePoint), LangOpts))
    return false;

  if (!isLexingRawMode()) {
    maybeDiagnoseIDCharCompat(PP->getDiagnostics(), CodePoint,
                              makeCharRange(*this, CurPtr, UnicodePtr),
                              /*IsFirst=*/false);
    maybeDiagnoseUTF8Homoglyph(PP->getDiagnostics(), CodePoint,
                               makeCharRange(*this, CurPtr, UnicodePtr));
  }

  CurPtr = UnicodePtr;
  return true;
}
1614
/// Lex the remainder of an identifier, after the first character has already
/// been consumed.  Forms a raw_identifier token (and, outside raw mode,
/// resolves it via the identifier table, possibly macro-expanding it).
bool Lexer::LexIdentifier(Token &Result, const char *CurPtr) {
  // Match [_A-Za-z0-9]*, we have already matched [_A-Za-z$]
  unsigned Size;
  unsigned char C = *CurPtr++;
  while (isIdentifierBody(C))
    C = *CurPtr++;

  --CurPtr;   // Back up over the skipped character.

  // Fast path, no $,\,? in identifier found. '\' might be an escaped newline
  // or UCN, and ? might be a trigraph for '\', an escaped newline or UCN.
  //
  // TODO: Could merge these checks into an InfoTable flag to make the
  // comparison cheaper
  if (isASCII(C) && C != '\\' && C != '?' &&
      (C != '$' || !LangOpts.DollarIdents)) {
FinishIdentifier:
    const char *IdStart = BufferPtr;
    FormTokenWithChars(Result, CurPtr, tok::raw_identifier);
    Result.setRawIdentifierData(IdStart);

    // If we are in raw mode, return this identifier raw. There is no need to
    // look up identifier information or attempt to macro expand it.
    if (LexingRawMode)
      return true;

    // Fill in Result.IdentifierInfo and update the token kind,
    // looking up the identifier in the identifier table.
    IdentifierInfo *II = PP->LookUpIdentifierInfo(Result);
    // Note that we have to call PP->LookUpIdentifierInfo() even for code
    // completion, it writes IdentifierInfo into Result, and callers rely on it.

    // If the completion point is at the end of an identifier, we want to treat
    // the identifier as incomplete even if it resolves to a macro or a keyword.
    // This allows e.g. 'class^' to complete to 'classifier'.
    if (isCodeCompletionPoint(CurPtr)) {
      // Return the code-completion token.
      Result.setKind(tok::code_completion);
      // Skip the code-completion char and all immediate identifier characters.
      // This ensures we get consistent behavior when completing at any point in
      // an identifier (i.e. at the start, in the middle, at the end). Note that
      // only simple cases (i.e. [a-zA-Z0-9_]) are supported to keep the code
      // simpler.
      assert(*CurPtr == 0 && "Completion character must be 0");
      ++CurPtr;
      // Note that code completion token is not added as a separate character
      // when the completion point is at the end of the buffer. Therefore, we need
      // to check if the buffer has ended.
      if (CurPtr < BufferEnd) {
        while (isIdentifierBody(*CurPtr))
          ++CurPtr;
      }
      BufferPtr = CurPtr;
      return true;
    }

    // Finally, now that we know we have an identifier, pass this off to the
    // preprocessor, which may macro expand it or something.
    if (II->isHandleIdentifierCase())
      return PP->HandleIdentifier(Result);

    return true;
  }

  // Otherwise, $,\,? in identifier found.  Enter slower path.
  // Note: Size is first written by getCharAndSize here; it was deliberately
  // left uninitialized above because the fast path never reads it.

  C = getCharAndSize(CurPtr, Size);
  while (true) {
    if (C == '$') {
      // If we hit a $ and they are not supported in identifiers, we are done.
      if (!LangOpts.DollarIdents) goto FinishIdentifier;

      // Otherwise, emit a diagnostic and continue.
      if (!isLexingRawMode())
        Diag(CurPtr, diag::ext_dollar_in_identifier);
      CurPtr = ConsumeChar(CurPtr, Size, Result);
      C = getCharAndSize(CurPtr, Size);
      continue;
    } else if (C == '\\' && tryConsumeIdentifierUCN(CurPtr, Size, Result)) {
      C = getCharAndSize(CurPtr, Size);
      continue;
    } else if (!isASCII(C) && tryConsumeIdentifierUTF8Char(CurPtr)) {
      C = getCharAndSize(CurPtr, Size);
      continue;
    } else if (!isIdentifierBody(C)) {
      // Not $, UCN, UTF-8 char, or plain identifier char: the identifier ends.
      goto FinishIdentifier;
    }

    // Otherwise, this character is good, consume it.
    CurPtr = ConsumeChar(CurPtr, Size, Result);

    // Consume a run of ordinary identifier characters before re-checking for
    // the special cases above.
    C = getCharAndSize(CurPtr, Size);
    while (isIdentifierBody(C)) {
      CurPtr = ConsumeChar(CurPtr, Size, Result);
      C = getCharAndSize(CurPtr, Size);
    }
  }
}
1713
1714/// isHexaLiteral - Return true if Start points to a hex constant.
1715/// in microsoft mode (where this is supposed to be several different tokens).
1716bool Lexer::isHexaLiteral(const char *Start, const LangOptions &LangOpts) {
1717 unsigned Size;
1718 char C1 = Lexer::getCharAndSizeNoWarn(Start, Size, LangOpts);
1719 if (C1 != '0')
1720 return false;
1721 char C2 = Lexer::getCharAndSizeNoWarn(Start + Size, Size, LangOpts);
1722 return (C2 == 'x' || C2 == 'X');
1723}
1724
/// LexNumericConstant - Lex the remainder of a integer or floating point
/// constant. From[-1] is the first character lexed.  Return the end of the
/// constant.  Recursion below acts as a "continue lexing from here"
/// continuation after consuming exponent signs, digit separators, or
/// UCN/UTF-8 suffix characters.
bool Lexer::LexNumericConstant(Token &Result, const char *CurPtr) {
  unsigned Size;
  char C = getCharAndSize(CurPtr, Size);
  char PrevCh = 0;
  // Consume the pp-number body: [0-9a-zA-Z_.] and similar.
  while (isPreprocessingNumberBody(C)) {
    CurPtr = ConsumeChar(CurPtr, Size, Result);
    PrevCh = C;
    C = getCharAndSize(CurPtr, Size);
  }

  // If we fell out, check for a sign, due to 1e+12.  If we have one, continue.
  if ((C == '-' || C == '+') && (PrevCh == 'E' || PrevCh == 'e')) {
    // If we are in Microsoft mode, don't continue if the constant is hex.
    // For example, MSVC will accept the following as 3 tokens: 0x1234567e+1
    if (!LangOpts.MicrosoftExt || !isHexaLiteral(BufferPtr, LangOpts))
      return LexNumericConstant(Result, ConsumeChar(CurPtr, Size, Result));
  }

  // If we have a hex FP constant, continue.
  if ((C == '-' || C == '+') && (PrevCh == 'P' || PrevCh == 'p')) {
    // Outside C99 and C++17, we accept hexadecimal floating point numbers as a
    // not-quite-conforming extension. Only do so if this looks like it's
    // actually meant to be a hexfloat, and not if it has a ud-suffix.
    bool IsHexFloat = true;
    if (!LangOpts.C99) {
      if (!isHexaLiteral(BufferPtr, LangOpts))
        IsHexFloat = false;
      else if (!getLangOpts().CPlusPlus17 &&
               std::find(BufferPtr, CurPtr, '_') != CurPtr)
        // An '_' suggests a ud-suffix like 0x1p1_inches, not a hexfloat.
        IsHexFloat = false;
    }
    if (IsHexFloat)
      return LexNumericConstant(Result, ConsumeChar(CurPtr, Size, Result));
  }

  // If we have a digit separator, continue.
  if (C == '\'' && getLangOpts().CPlusPlus14) {
    unsigned NextSize;
    char Next = getCharAndSizeNoWarn(CurPtr + Size, NextSize, getLangOpts());
    // Only a separator if followed by another identifier/digit character;
    // otherwise the quote starts a character literal or is stray.
    if (isIdentifierBody(Next)) {
      if (!isLexingRawMode())
        Diag(CurPtr, diag::warn_cxx11_compat_digit_separator);
      CurPtr = ConsumeChar(CurPtr, Size, Result);
      CurPtr = ConsumeChar(CurPtr, NextSize, Result);
      return LexNumericConstant(Result, CurPtr);
    }
  }

  // If we have a UCN or UTF-8 character (perhaps in a ud-suffix), continue.
  if (C == '\\' && tryConsumeIdentifierUCN(CurPtr, Size, Result))
    return LexNumericConstant(Result, CurPtr);
  if (!isASCII(C) && tryConsumeIdentifierUTF8Char(CurPtr))
    return LexNumericConstant(Result, CurPtr);

  // Update the location of token as well as BufferPtr.
  const char *TokStart = BufferPtr;
  FormTokenWithChars(Result, CurPtr, tok::numeric_constant);
  Result.setLiteralData(TokStart);
  return true;
}
1788
/// LexUDSuffix - Lex the ud-suffix production for user-defined literal suffixes
/// in C++11, or warn on a ud-suffix in C++98.  Returns the pointer past the
/// suffix (== \p CurPtr if no suffix was consumed).
const char *Lexer::LexUDSuffix(Token &Result, const char *CurPtr,
                               bool IsStringLiteral) {
  assert(getLangOpts().CPlusPlus);

  // Maximally munch an identifier.
  unsigned Size;
  char C = getCharAndSize(CurPtr, Size);
  bool Consumed = false;

  if (!isIdentifierHead(C)) {
    // Not a plain identifier start: the suffix may still begin with a UCN or
    // a UTF-8 identifier character.
    if (C == '\\' && tryConsumeIdentifierUCN(CurPtr, Size, Result))
      Consumed = true;
    else if (!isASCII(C) && tryConsumeIdentifierUTF8Char(CurPtr))
      Consumed = true;
    else
      return CurPtr;  // No ud-suffix here.
  }

  if (!getLangOpts().CPlusPlus11) {
    // Pre-C++11: warn that this will become a ud-suffix, and do not consume.
    if (!isLexingRawMode())
      Diag(CurPtr,
           C == '_' ? diag::warn_cxx11_compat_user_defined_literal
                    : diag::warn_cxx11_compat_reserved_user_defined_literal)
        << FixItHint::CreateInsertion(getSourceLocation(CurPtr), " ");
    return CurPtr;
  }

  // C++11 [lex.ext]p10, [usrlit.suffix]p1: A program containing a ud-suffix
  // that does not start with an underscore is ill-formed. As a conforming
  // extension, we treat all such suffixes as if they had whitespace before
  // them. We assume a suffix beginning with a UCN or UTF-8 character is more
  // likely to be a ud-suffix than a macro, however, and accept that.
  if (!Consumed) {
    bool IsUDSuffix = false;
    if (C == '_')
      IsUDSuffix = true;
    else if (IsStringLiteral && getLangOpts().CPlusPlus14) {
      // In C++1y, we need to look ahead a few characters to see if this is a
      // valid suffix for a string literal or a numeric literal (this could be
      // the 'operator""if' defining a numeric literal operator).
      const unsigned MaxStandardSuffixLength = 3;
      char Buffer[MaxStandardSuffixLength] = { C };
      // NOTE: this inner 'Consumed' (a byte count) intentionally shadows the
      // outer bool 'Consumed'.
      unsigned Consumed = Size;
      unsigned Chars = 1;
      while (true) {
        unsigned NextSize;
        char Next = getCharAndSizeNoWarn(CurPtr + Consumed, NextSize,
                                         getLangOpts());
        if (!isIdentifierBody(Next)) {
          // End of suffix. Check whether this is on the whitelist.
          const StringRef CompleteSuffix(Buffer, Chars);
          IsUDSuffix = StringLiteralParser::isValidUDSuffix(getLangOpts(),
                                                            CompleteSuffix);
          break;
        }

        if (Chars == MaxStandardSuffixLength)
          // Too long: can't be a standard suffix.
          break;

        Buffer[Chars++] = Next;
        Consumed += NextSize;
      }
    }

    if (!IsUDSuffix) {
      // Reserved (non-underscore) suffix: warn and treat as separate token.
      if (!isLexingRawMode())
        Diag(CurPtr, getLangOpts().MSVCCompat
                         ? diag::ext_ms_reserved_user_defined_literal
                         : diag::ext_reserved_user_defined_literal)
          << FixItHint::CreateInsertion(getSourceLocation(CurPtr), " ");
      return CurPtr;
    }

    CurPtr = ConsumeChar(CurPtr, Size, Result);
  }

  // Consume the rest of the suffix: identifier chars, UCNs, UTF-8 chars.
  Result.setFlag(Token::HasUDSuffix);
  while (true) {
    C = getCharAndSize(CurPtr, Size);
    if (isIdentifierBody(C)) { CurPtr = ConsumeChar(CurPtr, Size, Result); }
    else if (C == '\\' && tryConsumeIdentifierUCN(CurPtr, Size, Result)) {}
    else if (!isASCII(C) && tryConsumeIdentifierUTF8Char(CurPtr)) {}
    else break;
  }

  return CurPtr;
}
1879
/// LexStringLiteral - Lex the remainder of a string literal, after having lexed
/// either " or L" or u8" or u" or U".  Always forms a token and returns true.
bool Lexer::LexStringLiteral(Token &Result, const char *CurPtr,
                             tok::TokenKind Kind) {
  // Does this string contain the \0 character?
  const char *NulCharacter = nullptr;

  // Unicode string literals are new in C++11/C11; warn for older standards.
  if (!isLexingRawMode() &&
      (Kind == tok::utf8_string_literal ||
       Kind == tok::utf16_string_literal ||
       Kind == tok::utf32_string_literal))
    Diag(BufferPtr, getLangOpts().CPlusPlus
                        ? diag::warn_cxx98_compat_unicode_literal
                        : diag::warn_c99_compat_unicode_literal);

  char C = getAndAdvanceChar(CurPtr, Result);
  while (C != '"') {
    // Skip escaped characters.  Escaped newlines will already be processed by
    // getAndAdvanceChar.
    if (C == '\\')
      C = getAndAdvanceChar(CurPtr, Result);

    if (C == '\n' || C == '\r' ||             // Newline.
        (C == 0 && CurPtr-1 == BufferEnd)) {  // End of file.
      // Unterminated: diagnose (1 selects "string" in the message) and return
      // the partial text as an unknown token.
      if (!isLexingRawMode() && !LangOpts.AsmPreprocessor)
        Diag(BufferPtr, diag::ext_unterminated_char_or_string) << 1;
      FormTokenWithChars(Result, CurPtr-1, tok::unknown);
      return true;
    }

    if (C == 0) {
      // A NUL may be the code-completion marker; otherwise remember it so we
      // can warn after the string is fully lexed.
      if (isCodeCompletionPoint(CurPtr-1)) {
        PP->CodeCompleteNaturalLanguage();
        FormTokenWithChars(Result, CurPtr-1, tok::unknown);
        cutOffLexing();
        return true;
      }

      NulCharacter = CurPtr-1;
    }
    C = getAndAdvanceChar(CurPtr, Result);
  }

  // If we are in C++11, lex the optional ud-suffix.
  if (getLangOpts().CPlusPlus)
    CurPtr = LexUDSuffix(Result, CurPtr, true);

  // If a nul character existed in the string, warn about it.
  if (NulCharacter && !isLexingRawMode())
    Diag(NulCharacter, diag::null_in_char_or_string) << 1;

  // Update the location of the token as well as the BufferPtr instance var.
  const char *TokStart = BufferPtr;
  FormTokenWithChars(Result, CurPtr, Kind);
  Result.setLiteralData(TokStart);
  return true;
}
1937
/// LexRawStringLiteral - Lex the remainder of a raw string literal, after
/// having lexed R", LR", u8R", uR", or UR".
bool Lexer::LexRawStringLiteral(Token &Result, const char *CurPtr,
                                tok::TokenKind Kind) {
  // This function doesn't use getAndAdvanceChar because C++0x [lex.pptoken]p3:
  //  Between the initial and final double quote characters of the raw string,
  //  any transformations performed in phases 1 and 2 (trigraphs,
  //  universal-character-names, and line splicing) are reverted.

  if (!isLexingRawMode())
    Diag(BufferPtr, diag::warn_cxx98_compat_raw_string_literal);

  // Lex the delimiter: up to 16 d-chars before the '('.
  unsigned PrefixLen = 0;

  while (PrefixLen != 16 && isRawStringDelimBody(CurPtr[PrefixLen]))
    ++PrefixLen;

  // If the last character was not a '(', then we didn't lex a valid delimiter.
  if (CurPtr[PrefixLen] != '(') {
    if (!isLexingRawMode()) {
      const char *PrefixEnd = &CurPtr[PrefixLen];
      if (PrefixLen == 16) {
        Diag(PrefixEnd, diag::err_raw_delim_too_long);
      } else {
        Diag(PrefixEnd, diag::err_invalid_char_raw_delim)
          << StringRef(PrefixEnd, 1);
      }
    }

    // Search for the next '"' in hopes of salvaging the lexer. Unfortunately,
    // it's possible the '"' was intended to be part of the raw string, but
    // there's not much we can do about that.
    while (true) {
      char C = *CurPtr++;

      if (C == '"')
        break;
      if (C == 0 && CurPtr-1 == BufferEnd) {
        --CurPtr;
        break;
      }
    }

    FormTokenWithChars(Result, CurPtr, tok::unknown);
    return true;
  }

  // Save prefix and move CurPtr past it
  const char *Prefix = CurPtr;
  CurPtr += PrefixLen + 1; // skip over prefix and '('

  // Scan the body; only ')' followed by the exact delimiter and '"' ends it.
  while (true) {
    char C = *CurPtr++;

    if (C == ')') {
      // Check for prefix match and closing quote.
      if (strncmp(CurPtr, Prefix, PrefixLen) == 0 && CurPtr[PrefixLen] == '"') {
        CurPtr += PrefixLen + 1; // skip over prefix and '"'
        break;
      }
    } else if (C == 0 && CurPtr-1 == BufferEnd) { // End of file.
      if (!isLexingRawMode())
        Diag(BufferPtr, diag::err_unterminated_raw_string)
          << StringRef(Prefix, PrefixLen);
      FormTokenWithChars(Result, CurPtr-1, tok::unknown);
      return true;
    }
  }

  // If we are in C++11, lex the optional ud-suffix.
  if (getLangOpts().CPlusPlus)
    CurPtr = LexUDSuffix(Result, CurPtr, true);

  // Update the location of token as well as BufferPtr.
  const char *TokStart = BufferPtr;
  FormTokenWithChars(Result, CurPtr, Kind);
  Result.setLiteralData(TokStart);
  return true;
}
2017
/// LexAngledStringLiteral - Lex the remainder of an angled string literal,
/// after having lexed the '<' character.  This is used for #include filenames.
/// If the literal is unterminated, the '<' is returned as a plain tok::less.
bool Lexer::LexAngledStringLiteral(Token &Result, const char *CurPtr) {
  // Does this string contain the \0 character?
  const char *NulCharacter = nullptr;
  const char *AfterLessPos = CurPtr;
  char C = getAndAdvanceChar(CurPtr, Result);
  while (C != '>') {
    // Skip escaped characters.  Escaped newlines will already be processed by
    // getAndAdvanceChar.
    if (C == '\\')
      C = getAndAdvanceChar(CurPtr, Result);

    if (C == '\n' || C == '\r' ||                // Newline.
        (C == 0 && (CurPtr-1 == BufferEnd ||     // End of file.
                    isCodeCompletionPoint(CurPtr-1)))) {
      // If the filename is unterminated, then it must just be a lone <
      // character.  Return this as such.
      FormTokenWithChars(Result, AfterLessPos, tok::less);
      return true;
    }

    if (C == 0) {
      // Remember embedded NULs so we can warn once the literal is complete.
      NulCharacter = CurPtr-1;
    }
    C = getAndAdvanceChar(CurPtr, Result);
  }

  // If a nul character existed in the string, warn about it.
  if (NulCharacter && !isLexingRawMode())
    Diag(NulCharacter, diag::null_in_char_or_string) << 1;

  // Update the location of token as well as BufferPtr.
  const char *TokStart = BufferPtr;
  FormTokenWithChars(Result, CurPtr, tok::angle_string_literal);
  Result.setLiteralData(TokStart);
  return true;
}
2056
/// LexCharConstant - Lex the remainder of a character constant, after having
/// lexed either ' or L' or u8' or u' or U'.
bool Lexer::LexCharConstant(Token &Result, const char *CurPtr,
                            tok::TokenKind Kind) {
  // Does this character contain the \0 character?
  const char *NulCharacter = nullptr;

  // Unicode character constants are new in C++11/C11/C++17; warn for older
  // standards.
  if (!isLexingRawMode()) {
    if (Kind == tok::utf16_char_constant || Kind == tok::utf32_char_constant)
      Diag(BufferPtr, getLangOpts().CPlusPlus
                          ? diag::warn_cxx98_compat_unicode_literal
                          : diag::warn_c99_compat_unicode_literal);
    else if (Kind == tok::utf8_char_constant)
      Diag(BufferPtr, diag::warn_cxx14_compat_u8_character_literal);
  }

  char C = getAndAdvanceChar(CurPtr, Result);
  if (C == '\'') {
    // Empty character constant ('') is invalid.
    if (!isLexingRawMode() && !LangOpts.AsmPreprocessor)
      Diag(BufferPtr, diag::ext_empty_character);
    FormTokenWithChars(Result, CurPtr, tok::unknown);
    return true;
  }

  while (C != '\'') {
    // Skip escaped characters.
    if (C == '\\')
      C = getAndAdvanceChar(CurPtr, Result);

    if (C == '\n' || C == '\r' ||             // Newline.
        (C == 0 && CurPtr-1 == BufferEnd)) {  // End of file.
      // Unterminated: diagnose (0 selects "character" in the message) and
      // return what we have as an unknown token.
      if (!isLexingRawMode() && !LangOpts.AsmPreprocessor)
        Diag(BufferPtr, diag::ext_unterminated_char_or_string) << 0;
      FormTokenWithChars(Result, CurPtr-1, tok::unknown);
      return true;
    }

    if (C == 0) {
      // A NUL may be the code-completion marker; otherwise remember it so we
      // can warn after the constant is fully lexed.
      if (isCodeCompletionPoint(CurPtr-1)) {
        PP->CodeCompleteNaturalLanguage();
        FormTokenWithChars(Result, CurPtr-1, tok::unknown);
        cutOffLexing();
        return true;
      }

      NulCharacter = CurPtr-1;
    }
    C = getAndAdvanceChar(CurPtr, Result);
  }

  // If we are in C++11, lex the optional ud-suffix.
  if (getLangOpts().CPlusPlus)
    CurPtr = LexUDSuffix(Result, CurPtr, false);

  // If a nul character existed in the character, warn about it.
  if (NulCharacter && !isLexingRawMode())
    Diag(NulCharacter, diag::null_in_char_or_string) << 0;

  // Update the location of token as well as BufferPtr.
  const char *TokStart = BufferPtr;
  FormTokenWithChars(Result, CurPtr, Kind);
  Result.setLiteralData(TokStart);
  return true;
}
2121
/// SkipWhitespace - Efficiently skip over a series of whitespace characters.
/// Update BufferPtr to point to the next non-whitespace character and return.
///
/// This method forms a token and returns true if KeepWhitespaceMode is enabled.
bool Lexer::SkipWhitespace(Token &Result, const char *CurPtr,
                           bool &TokAtPhysicalStartOfLine) {
  // Whitespace - Skip it, then return the token after the whitespace.
  // CurPtr[-1] is the whitespace character that got us here.
  bool SawNewline = isVerticalWhitespace(CurPtr[-1]);

  unsigned char Char = *CurPtr;

  // Skip consecutive spaces efficiently.
  while (true) {
    // Skip horizontal whitespace very aggressively.
    while (isHorizontalWhitespace(Char))
      Char = *++CurPtr;

    // Otherwise if we have something other than whitespace, we're done.
    if (!isVerticalWhitespace(Char))
      break;

    if (ParsingPreprocessorDirective) {
      // End of preprocessor directive line, let LexTokenInternal handle this.
      BufferPtr = CurPtr;
      return false;
    }

    // OK, but handle newline.
    SawNewline = true;
    Char = *++CurPtr;
  }

  // If the client wants us to return whitespace, return it now.
  if (isKeepWhitespaceMode()) {
    FormTokenWithChars(Result, CurPtr, tok::unknown);
    if (SawNewline) {
      IsAtStartOfLine = true;
      IsAtPhysicalStartOfLine = true;
    }
    // FIXME: The next token will not have LeadingSpace set.
    return true;
  }

  // If this isn't immediately after a newline, there is leading space.
  char PrevChar = CurPtr[-1];
  bool HasLeadingSpace = !isVerticalWhitespace(PrevChar);

  Result.setFlagValue(Token::LeadingSpace, HasLeadingSpace);
  if (SawNewline) {
    Result.setFlag(Token::StartOfLine);
    TokAtPhysicalStartOfLine = true;
  }

  BufferPtr = CurPtr;
  return false;
}
2178
/// We have just read the // characters from input.  Skip until we find the
/// newline character that terminates the comment.  Then update BufferPtr and
/// return.
///
/// If we're in KeepCommentMode or any CommentHandler has inserted
/// some tokens, this will store the first token and return true.
bool Lexer::SkipLineComment(Token &Result, const char *CurPtr,
                            bool &TokAtPhysicalStartOfLine) {
  // If Line comments aren't explicitly enabled for this language, emit an
  // extension warning.
  if (!LangOpts.LineComment && !isLexingRawMode()) {
    Diag(BufferPtr, diag::ext_line_comment);

    // Mark them enabled so we only emit one warning for this translation
    // unit.
    LangOpts.LineComment = true;
  }

  // Scan over the body of the comment.  The common case, when scanning, is that
  // the comment contains normal ascii characters with nothing interesting in
  // them.  As such, optimize for this case with the inner loop.
  //
  // This loop terminates with CurPtr pointing at the newline (or end of buffer)
  // character that ends the line comment.
  char C;
  while (true) {
    C = *CurPtr;
    // Skip over characters in the fast loop.
    while (C != 0 &&                // Potentially EOF.
           C != '\n' && C != '\r')  // Newline or DOS-style newline.
      C = *++CurPtr;

    const char *NextLine = CurPtr;
    if (C != 0) {
      // We found a newline, see if it's escaped.
      const char *EscapePtr = CurPtr-1;
      bool HasSpace = false;
      while (isHorizontalWhitespace(*EscapePtr)) { // Skip whitespace.
        --EscapePtr;
        HasSpace = true;
      }

      if (*EscapePtr == '\\')
        // Escaped newline.
        CurPtr = EscapePtr;
      else if (EscapePtr[0] == '/' && EscapePtr[-1] == '?' &&
               EscapePtr[-2] == '?' && LangOpts.Trigraphs)
        // Trigraph-escaped newline ("??/" is a trigraph for '\').
        CurPtr = EscapePtr-2;
      else
        break; // This is a newline, we're done.

      // If there was space between the backslash and newline, warn about it.
      if (HasSpace && !isLexingRawMode())
        Diag(EscapePtr, diag::backslash_newline_space);
    }

    // Otherwise, this is a hard case.  Fall back on getAndAdvanceChar to
    // properly decode the character.  Read it in raw mode to avoid emitting
    // diagnostics about things like trigraphs.  If we see an escaped newline,
    // we'll handle it below.
    const char *OldPtr = CurPtr;
    bool OldRawMode = isLexingRawMode();
    LexingRawMode = true;
    C = getAndAdvanceChar(CurPtr, Result);
    LexingRawMode = OldRawMode;

    // If we only read only one character, then no special handling is needed.
    // We're done and can skip forward to the newline.
    if (C != 0 && CurPtr == OldPtr+1) {
      CurPtr = NextLine;
      break;
    }

    // If we read multiple characters, and one of those characters was a \r or
    // \n, then we had an escaped newline within the comment.  Emit diagnostic
    // unless the next line is also a // comment.
    if (CurPtr != OldPtr + 1 && C != '/' &&
        (CurPtr == BufferEnd + 1 || CurPtr[0] != '/')) {
      for (; OldPtr != CurPtr; ++OldPtr)
        if (OldPtr[0] == '\n' || OldPtr[0] == '\r') {
          // Okay, we found a // comment that ends in a newline, if the next
          // line is also a // comment, but has spaces, don't emit a diagnostic.
          if (isWhitespace(C)) {
            const char *ForwardPtr = CurPtr;
            while (isWhitespace(*ForwardPtr))  // Skip whitespace.
              ++ForwardPtr;
            if (ForwardPtr[0] == '/' && ForwardPtr[1] == '/')
              break;
          }

          if (!isLexingRawMode())
            Diag(OldPtr-1, diag::ext_multi_line_line_comment);
          break;
        }
    }

    if (C == '\r' || C == '\n' || CurPtr == BufferEnd + 1) {
      // Leave CurPtr pointing at the newline / EOF sentinel.
      --CurPtr;
      break;
    }

    if (C == '\0' && isCodeCompletionPoint(CurPtr-1)) {
      PP->CodeCompleteNaturalLanguage();
      cutOffLexing();
      return false;
    }
  }

  // Found but did not consume the newline.  Notify comment handlers about the
  // comment unless we're in a #if 0 block.
  if (PP && !isLexingRawMode() &&
      PP->HandleComment(Result, SourceRange(getSourceLocation(BufferPtr),
                                            getSourceLocation(CurPtr)))) {
    BufferPtr = CurPtr;
    return true; // A token has to be returned.
  }

  // If we are returning comments as tokens, return this comment as a token.
  if (inKeepCommentMode())
    return SaveLineComment(Result, CurPtr);

  // If we are inside a preprocessor directive and we see the end of line,
  // return immediately, so that the lexer can return this as an EOD token.
  if (ParsingPreprocessorDirective || CurPtr == BufferEnd) {
    BufferPtr = CurPtr;
    return false;
  }

  // Otherwise, eat the \n character.  We don't care if this is a \n\r or
  // \r\n sequence.  This is an efficiency hack (because we know the \n can't
  // contribute to another token), it isn't needed for correctness.  Note that
  // this is ok even in KeepWhitespaceMode, because we would have returned the
  // comment above in that mode.
  ++CurPtr;

  // The next returned token is at the start of the line.
  Result.setFlag(Token::StartOfLine);
  TokAtPhysicalStartOfLine = true;
  // No leading whitespace seen so far.
  Result.clearFlag(Token::LeadingSpace);
  BufferPtr = CurPtr;
  return false;
}
2323
2324/// If in save-comment mode, package up this Line comment in an appropriate
2325/// way and return it.
2326bool Lexer::SaveLineComment(Token &Result, const char *CurPtr) {
2327 // If we're not in a preprocessor directive, just return the // comment
2328 // directly.
2329 FormTokenWithChars(Result, CurPtr, tok::comment);
2330
2331 if (!ParsingPreprocessorDirective || LexingRawMode)
2332 return true;
2333
2334 // If this Line-style comment is in a macro definition, transmogrify it into
2335 // a C-style block comment.
2336 bool Invalid = false;
2337 std::string Spelling = PP->getSpelling(Result, &Invalid);
2338 if (Invalid)
2339 return true;
2340
2341 assert(Spelling[0] == '/' && Spelling[1] == '/' && "Not line comment?")(static_cast <bool> (Spelling[0] == '/' && Spelling
[1] == '/' && "Not line comment?") ? void (0) : __assert_fail
("Spelling[0] == '/' && Spelling[1] == '/' && \"Not line comment?\""
, "/build/llvm-toolchain-snapshot-7~svn338205/tools/clang/lib/Lex/Lexer.cpp"
, 2341, __extension__ __PRETTY_FUNCTION__))
;
2342 Spelling[1] = '*'; // Change prefix to "/*".
2343 Spelling += "*/"; // add suffix.
2344
2345 Result.setKind(tok::comment);
2346 PP->CreateString(Spelling, Result,
2347 Result.getLocation(), Result.getLocation());
2348 return true;
2349}
2350
2351/// isBlockCommentEndOfEscapedNewLine - Return true if the specified newline
2352/// character (either \\n or \\r) is part of an escaped newline sequence. Issue
2353/// a diagnostic if so. We know that the newline is inside of a block comment.
2354static bool isEndOfBlockCommentWithEscapedNewLine(const char *CurPtr,
2355 Lexer *L) {
2356 assert(CurPtr[0] == '\n' || CurPtr[0] == '\r')(static_cast <bool> (CurPtr[0] == '\n' || CurPtr[0] == '\r'
) ? void (0) : __assert_fail ("CurPtr[0] == '\\n' || CurPtr[0] == '\\r'"
, "/build/llvm-toolchain-snapshot-7~svn338205/tools/clang/lib/Lex/Lexer.cpp"
, 2356, __extension__ __PRETTY_FUNCTION__))
;
2357
2358 // Back up off the newline.
2359 --CurPtr;
2360
2361 // If this is a two-character newline sequence, skip the other character.
2362 if (CurPtr[0] == '\n' || CurPtr[0] == '\r') {
2363 // \n\n or \r\r -> not escaped newline.
2364 if (CurPtr[0] == CurPtr[1])
2365 return false;
2366 // \n\r or \r\n -> skip the newline.
2367 --CurPtr;
2368 }
2369
2370 // If we have horizontal whitespace, skip over it. We allow whitespace
2371 // between the slash and newline.
2372 bool HasSpace = false;
2373 while (isHorizontalWhitespace(*CurPtr) || *CurPtr == 0) {
2374 --CurPtr;
2375 HasSpace = true;
2376 }
2377
2378 // If we have a slash, we know this is an escaped newline.
2379 if (*CurPtr == '\\') {
2380 if (CurPtr[-1] != '*') return false;
2381 } else {
2382 // It isn't a slash, is it the ?? / trigraph?
2383 if (CurPtr[0] != '/' || CurPtr[-1] != '?' || CurPtr[-2] != '?' ||
2384 CurPtr[-3] != '*')
2385 return false;
2386
2387 // This is the trigraph ending the comment. Emit a stern warning!
2388 CurPtr -= 2;
2389
2390 // If no trigraphs are enabled, warn that we ignored this trigraph and
2391 // ignore this * character.
2392 if (!L->getLangOpts().Trigraphs) {
2393 if (!L->isLexingRawMode())
2394 L->Diag(CurPtr, diag::trigraph_ignored_block_comment);
2395 return false;
2396 }
2397 if (!L->isLexingRawMode())
2398 L->Diag(CurPtr, diag::trigraph_ends_block_comment);
2399 }
2400
2401 // Warn about having an escaped newline between the */ characters.
2402 if (!L->isLexingRawMode())
2403 L->Diag(CurPtr, diag::escaped_newline_block_comment_end);
2404
2405 // If there was space between the backslash and newline, warn about it.
2406 if (HasSpace && !L->isLexingRawMode())
2407 L->Diag(CurPtr, diag::backslash_newline_space);
2408
2409 return true;
2410}
2411
2412#ifdef __SSE2__1
2413#include <emmintrin.h>
2414#elif __ALTIVEC__
2415#include <altivec.h>
2416#undef bool
2417#endif
2418
2419/// We have just read from input the / and * characters that started a comment.
2420/// Read until we find the * and / characters that terminate the comment.
2421/// Note that we don't bother decoding trigraphs or escaped newlines in block
2422/// comments, because they cannot cause the comment to end. The only thing
2423/// that can happen is the comment could end with an escaped newline between
2424/// the terminating * and /.
2425///
2426/// If we're in KeepCommentMode or any CommentHandler has inserted
2427/// some tokens, this will store the first token and return true.
2428bool Lexer::SkipBlockComment(Token &Result, const char *CurPtr,
2429 bool &TokAtPhysicalStartOfLine) {
2430 // Scan one character past where we should, looking for a '/' character. Once
2431 // we find it, check to see if it was preceded by a *. This common
2432 // optimization helps people who like to put a lot of * characters in their
2433 // comments.
2434
2435 // The first character we get with newlines and trigraphs skipped to handle
2436 // the degenerate /*/ case below correctly if the * has an escaped newline
2437 // after it.
2438 unsigned CharSize;
2439 unsigned char C = getCharAndSize(CurPtr, CharSize);
2440 CurPtr += CharSize;
2441 if (C == 0 && CurPtr == BufferEnd+1) {
2442 if (!isLexingRawMode())
2443 Diag(BufferPtr, diag::err_unterminated_block_comment);
2444 --CurPtr;
2445
2446 // KeepWhitespaceMode should return this broken comment as a token. Since
2447 // it isn't a well formed comment, just return it as an 'unknown' token.
2448 if (isKeepWhitespaceMode()) {
2449 FormTokenWithChars(Result, CurPtr, tok::unknown);
2450 return true;
2451 }
2452
2453 BufferPtr = CurPtr;
2454 return false;
2455 }
2456
2457 // Check to see if the first character after the '/*' is another /. If so,
2458 // then this slash does not end the block comment, it is part of it.
2459 if (C == '/')
2460 C = *CurPtr++;
2461
2462 while (true) {
2463 // Skip over all non-interesting characters until we find end of buffer or a
2464 // (probably ending) '/' character.
2465 if (CurPtr + 24 < BufferEnd &&
2466 // If there is a code-completion point avoid the fast scan because it
2467 // doesn't check for '\0'.
2468 !(PP && PP->getCodeCompletionFileLoc() == FileLoc)) {
2469 // While not aligned to a 16-byte boundary.
2470 while (C != '/' && ((intptr_t)CurPtr & 0x0F) != 0)
2471 C = *CurPtr++;
2472
2473 if (C == '/') goto FoundSlash;
2474
2475#ifdef __SSE2__1
2476 __m128i Slashes = _mm_set1_epi8('/');
2477 while (CurPtr+16 <= BufferEnd) {
2478 int cmp = _mm_movemask_epi8(_mm_cmpeq_epi8(*(const __m128i*)CurPtr,
2479 Slashes));
2480 if (cmp != 0) {
2481 // Adjust the pointer to point directly after the first slash. It's
2482 // not necessary to set C here, it will be overwritten at the end of
2483 // the outer loop.
2484 CurPtr += llvm::countTrailingZeros<unsigned>(cmp) + 1;
2485 goto FoundSlash;
2486 }
2487 CurPtr += 16;
2488 }
2489#elif __ALTIVEC__
2490 __vector unsigned char Slashes = {
2491 '/', '/', '/', '/', '/', '/', '/', '/',
2492 '/', '/', '/', '/', '/', '/', '/', '/'
2493 };
2494 while (CurPtr+16 <= BufferEnd &&
2495 !vec_any_eq(*(const vector unsigned char*)CurPtr, Slashes))
2496 CurPtr += 16;
2497#else
2498 // Scan for '/' quickly. Many block comments are very large.
2499 while (CurPtr[0] != '/' &&
2500 CurPtr[1] != '/' &&
2501 CurPtr[2] != '/' &&
2502 CurPtr[3] != '/' &&
2503 CurPtr+4 < BufferEnd) {
2504 CurPtr += 4;
2505 }
2506#endif
2507
2508 // It has to be one of the bytes scanned, increment to it and read one.
2509 C = *CurPtr++;
2510 }
2511
2512 // Loop to scan the remainder.
2513 while (C != '/' && C != '\0')
2514 C = *CurPtr++;
2515
2516 if (C == '/') {
2517 FoundSlash:
2518 if (CurPtr[-2] == '*') // We found the final */. We're done!
2519 break;
2520
2521 if ((CurPtr[-2] == '\n' || CurPtr[-2] == '\r')) {
2522 if (isEndOfBlockCommentWithEscapedNewLine(CurPtr-2, this)) {
2523 // We found the final */, though it had an escaped newline between the
2524 // * and /. We're done!
2525 break;
2526 }
2527 }
2528 if (CurPtr[0] == '*' && CurPtr[1] != '/') {
2529 // If this is a /* inside of the comment, emit a warning. Don't do this
2530 // if this is a /*/, which will end the comment. This misses cases with
2531 // embedded escaped newlines, but oh well.
2532 if (!isLexingRawMode())
2533 Diag(CurPtr-1, diag::warn_nested_block_comment);
2534 }
2535 } else if (C == 0 && CurPtr == BufferEnd+1) {
2536 if (!isLexingRawMode())
2537 Diag(BufferPtr, diag::err_unterminated_block_comment);
2538 // Note: the user probably forgot a */. We could continue immediately
2539 // after the /*, but this would involve lexing a lot of what really is the
2540 // comment, which surely would confuse the parser.
2541 --CurPtr;
2542
2543 // KeepWhitespaceMode should return this broken comment as a token. Since
2544 // it isn't a well formed comment, just return it as an 'unknown' token.
2545 if (isKeepWhitespaceMode()) {
2546 FormTokenWithChars(Result, CurPtr, tok::unknown);
2547 return true;
2548 }
2549
2550 BufferPtr = CurPtr;
2551 return false;
2552 } else if (C == '\0' && isCodeCompletionPoint(CurPtr-1)) {
2553 PP->CodeCompleteNaturalLanguage();
2554 cutOffLexing();
2555 return false;
2556 }
2557
2558 C = *CurPtr++;
2559 }
2560
2561 // Notify comment handlers about the comment unless we're in a #if 0 block.
2562 if (PP && !isLexingRawMode() &&
2563 PP->HandleComment(Result, SourceRange(getSourceLocation(BufferPtr),
2564 getSourceLocation(CurPtr)))) {
2565 BufferPtr = CurPtr;
2566 return true; // A token has to be returned.
2567 }
2568
2569 // If we are returning comments as tokens, return this comment as a token.
2570 if (inKeepCommentMode()) {
2571 FormTokenWithChars(Result, CurPtr, tok::comment);
2572 return true;
2573 }
2574
2575 // It is common for the tokens immediately after a /**/ comment to be
2576 // whitespace. Instead of going through the big switch, handle it
2577 // efficiently now. This is safe even in KeepWhitespaceMode because we would
2578 // have already returned above with the comment as a token.
2579 if (isHorizontalWhitespace(*CurPtr)) {
2580 SkipWhitespace(Result, CurPtr+1, TokAtPhysicalStartOfLine);
2581 return false;
2582 }
2583
2584 // Otherwise, just return so that the next character will be lexed as a token.
2585 BufferPtr = CurPtr;
2586 Result.setFlag(Token::LeadingSpace);
2587 return false;
2588}
2589
2590//===----------------------------------------------------------------------===//
2591// Primary Lexing Entry Points
2592//===----------------------------------------------------------------------===//
2593
2594/// ReadToEndOfLine - Read the rest of the current preprocessor line as an
2595/// uninterpreted string. This switches the lexer out of directive mode.
2596void Lexer::ReadToEndOfLine(SmallVectorImpl<char> *Result) {
2597 assert(ParsingPreprocessorDirective && ParsingFilename == false &&(static_cast <bool> (ParsingPreprocessorDirective &&
ParsingFilename == false && "Must be in a preprocessing directive!"
) ? void (0) : __assert_fail ("ParsingPreprocessorDirective && ParsingFilename == false && \"Must be in a preprocessing directive!\""
, "/build/llvm-toolchain-snapshot-7~svn338205/tools/clang/lib/Lex/Lexer.cpp"
, 2598, __extension__ __PRETTY_FUNCTION__))
2598 "Must be in a preprocessing directive!")(static_cast <bool> (ParsingPreprocessorDirective &&
ParsingFilename == false && "Must be in a preprocessing directive!"
) ? void (0) : __assert_fail ("ParsingPreprocessorDirective && ParsingFilename == false && \"Must be in a preprocessing directive!\""
, "/build/llvm-toolchain-snapshot-7~svn338205/tools/clang/lib/Lex/Lexer.cpp"
, 2598, __extension__ __PRETTY_FUNCTION__))
;
2599 Token Tmp;
2600
2601 // CurPtr - Cache BufferPtr in an automatic variable.
2602 const char *CurPtr = BufferPtr;
2603 while (true) {
1
Loop condition is true. Entering loop body
2604 char Char = getAndAdvanceChar(CurPtr, Tmp);
2
Calling 'Lexer::getAndAdvanceChar'
2605 switch (Char) {
2606 default:
2607 if (Result)
2608 Result->push_back(Char);
2609 break;
2610 case 0: // Null.
2611 // Found end of file?
2612 if (CurPtr-1 != BufferEnd) {
2613 if (isCodeCompletionPoint(CurPtr-1)) {
2614 PP->CodeCompleteNaturalLanguage();
2615 cutOffLexing();
2616 return;
2617 }
2618
2619 // Nope, normal character, continue.
2620 if (Result)
2621 Result->push_back(Char);
2622 break;
2623 }
2624 // FALL THROUGH.
2625 LLVM_FALLTHROUGH[[clang::fallthrough]];
2626 case '\r':
2627 case '\n':
2628 // Okay, we found the end of the line. First, back up past the \0, \r, \n.
2629 assert(CurPtr[-1] == Char && "Trigraphs for newline?")(static_cast <bool> (CurPtr[-1] == Char && "Trigraphs for newline?"
) ? void (0) : __assert_fail ("CurPtr[-1] == Char && \"Trigraphs for newline?\""
, "/build/llvm-toolchain-snapshot-7~svn338205/tools/clang/lib/Lex/Lexer.cpp"
, 2629, __extension__ __PRETTY_FUNCTION__))
;
2630 BufferPtr = CurPtr-1;
2631
2632 // Next, lex the character, which should handle the EOD transition.
2633 Lex(Tmp);
2634 if (Tmp.is(tok::code_completion)) {
2635 if (PP)
2636 PP->CodeCompleteNaturalLanguage();
2637 Lex(Tmp);
2638 }
2639 assert(Tmp.is(tok::eod) && "Unexpected token!")(static_cast <bool> (Tmp.is(tok::eod) && "Unexpected token!"
) ? void (0) : __assert_fail ("Tmp.is(tok::eod) && \"Unexpected token!\""
, "/build/llvm-toolchain-snapshot-7~svn338205/tools/clang/lib/Lex/Lexer.cpp"
, 2639, __extension__ __PRETTY_FUNCTION__))
;
2640
2641 // Finally, we're done;
2642 return;
2643 }
2644 }
2645}
2646
2647/// LexEndOfFile - CurPtr points to the end of this file. Handle this
2648/// condition, reporting diagnostics and handling other edge cases as required.
2649/// This returns true if Result contains a token, false if PP.Lex should be
2650/// called again.
2651bool Lexer::LexEndOfFile(Token &Result, const char *CurPtr) {
2652 // If we hit the end of the file while parsing a preprocessor directive,
2653 // end the preprocessor directive first. The next token returned will
2654 // then be the end of file.
2655 if (ParsingPreprocessorDirective) {
2656 // Done parsing the "line".
2657 ParsingPreprocessorDirective = false;
2658 // Update the location of token as well as BufferPtr.
2659 FormTokenWithChars(Result, CurPtr, tok::eod);
2660
2661 // Restore comment saving mode, in case it was disabled for directive.
2662 if (PP)
2663 resetExtendedTokenMode();
2664 return true; // Have a token.
2665 }
2666
2667 // If we are in raw mode, return this event as an EOF token. Let the caller
2668 // that put us in raw mode handle the event.
2669 if (isLexingRawMode()) {
2670 Result.startToken();
2671 BufferPtr = BufferEnd;
2672 FormTokenWithChars(Result, BufferEnd, tok::eof);
2673 return true;
2674 }
2675
2676 if (PP->isRecordingPreamble() && PP->isInPrimaryFile()) {
2677 PP->setRecordedPreambleConditionalStack(ConditionalStack);
2678 ConditionalStack.clear();
2679 }
2680
2681 // Issue diagnostics for unterminated #if and missing newline.
2682
2683 // If we are in a #if directive, emit an error.
2684 while (!ConditionalStack.empty()) {
2685 if (PP->getCodeCompletionFileLoc() != FileLoc)
2686 PP->Diag(ConditionalStack.back().IfLoc,
2687 diag::err_pp_unterminated_conditional);
2688 ConditionalStack.pop_back();
2689 }
2690
2691 // C99 5.1.1.2p2: If the file is non-empty and didn't end in a newline, issue
2692 // a pedwarn.
2693 if (CurPtr != BufferStart && (CurPtr[-1] != '\n' && CurPtr[-1] != '\r')) {
2694 DiagnosticsEngine &Diags = PP->getDiagnostics();
2695 SourceLocation EndLoc = getSourceLocation(BufferEnd);
2696 unsigned DiagID;
2697
2698 if (LangOpts.CPlusPlus11) {
2699 // C++11 [lex.phases] 2.2 p2
2700 // Prefer the C++98 pedantic compatibility warning over the generic,
2701 // non-extension, user-requested "missing newline at EOF" warning.
2702 if (!Diags.isIgnored(diag::warn_cxx98_compat_no_newline_eof, EndLoc)) {
2703 DiagID = diag::warn_cxx98_compat_no_newline_eof;
2704 } else {
2705 DiagID = diag::warn_no_newline_eof;
2706 }
2707 } else {
2708 DiagID = diag::ext_no_newline_eof;
2709 }
2710
2711 Diag(BufferEnd, DiagID)
2712 << FixItHint::CreateInsertion(EndLoc, "\n");
2713 }
2714
2715 BufferPtr = CurPtr;
2716
2717 // Finally, let the preprocessor handle this.
2718 return PP->HandleEndOfFile(Result, isPragmaLexer());
2719}
2720
2721/// isNextPPTokenLParen - Return 1 if the next unexpanded token lexed from
2722/// the specified lexer will return a tok::l_paren token, 0 if it is something
2723/// else and 2 if there are no more tokens in the buffer controlled by the
2724/// lexer.
2725unsigned Lexer::isNextPPTokenLParen() {
2726 assert(!LexingRawMode && "How can we expand a macro from a skipping buffer?")(static_cast <bool> (!LexingRawMode && "How can we expand a macro from a skipping buffer?"
) ? void (0) : __assert_fail ("!LexingRawMode && \"How can we expand a macro from a skipping buffer?\""
, "/build/llvm-toolchain-snapshot-7~svn338205/tools/clang/lib/Lex/Lexer.cpp"
, 2726, __extension__ __PRETTY_FUNCTION__))
;
2727
2728 // Switch to 'skipping' mode. This will ensure that we can lex a token
2729 // without emitting diagnostics, disables macro expansion, and will cause EOF
2730 // to return an EOF token instead of popping the include stack.
2731 LexingRawMode = true;
2732
2733 // Save state that can be changed while lexing so that we can restore it.
2734 const char *TmpBufferPtr = BufferPtr;
2735 bool inPPDirectiveMode = ParsingPreprocessorDirective;
2736 bool atStartOfLine = IsAtStartOfLine;
2737 bool atPhysicalStartOfLine = IsAtPhysicalStartOfLine;
2738 bool leadingSpace = HasLeadingSpace;
2739
2740 Token Tok;
2741 Lex(Tok);
2742
2743 // Restore state that may have changed.
2744 BufferPtr = TmpBufferPtr;
2745 ParsingPreprocessorDirective = inPPDirectiveMode;
2746 HasLeadingSpace = leadingSpace;
2747 IsAtStartOfLine = atStartOfLine;
2748 IsAtPhysicalStartOfLine = atPhysicalStartOfLine;
2749
2750 // Restore the lexer back to non-skipping mode.
2751 LexingRawMode = false;
2752
2753 if (Tok.is(tok::eof))
2754 return 2;
2755 return Tok.is(tok::l_paren);
2756}
2757
2758/// Find the end of a version control conflict marker.
2759static const char *FindConflictEnd(const char *CurPtr, const char *BufferEnd,
2760 ConflictMarkerKind CMK) {
2761 const char *Terminator = CMK == CMK_Perforce ? "<<<<\n" : ">>>>>>>";
2762 size_t TermLen = CMK == CMK_Perforce ? 5 : 7;
2763 auto RestOfBuffer = StringRef(CurPtr, BufferEnd - CurPtr).substr(TermLen);
2764 size_t Pos = RestOfBuffer.find(Terminator);
2765 while (Pos != StringRef::npos) {
2766 // Must occur at start of line.
2767 if (Pos == 0 ||
2768 (RestOfBuffer[Pos - 1] != '\r' && RestOfBuffer[Pos - 1] != '\n')) {
2769 RestOfBuffer = RestOfBuffer.substr(Pos+TermLen);
2770 Pos = RestOfBuffer.find(Terminator);
2771 continue;
2772 }
2773 return RestOfBuffer.data()+Pos;
2774 }
2775 return nullptr;
2776}
2777
2778/// IsStartOfConflictMarker - If the specified pointer is the start of a version
2779/// control conflict marker like '<<<<<<<', recognize it as such, emit an error
2780/// and recover nicely. This returns true if it is a conflict marker and false
2781/// if not.
2782bool Lexer::IsStartOfConflictMarker(const char *CurPtr) {
2783 // Only a conflict marker if it starts at the beginning of a line.
2784 if (CurPtr != BufferStart &&
2785 CurPtr[-1] != '\n' && CurPtr[-1] != '\r')
2786 return false;
2787
2788 // Check to see if we have <<<<<<< or >>>>.
2789 if (!StringRef(CurPtr, BufferEnd - CurPtr).startswith("<<<<<<<") &&
2790 !StringRef(CurPtr, BufferEnd - CurPtr).startswith(">>>> "))
2791 return false;
2792
2793 // If we have a situation where we don't care about conflict markers, ignore
2794 // it.
2795 if (CurrentConflictMarkerState || isLexingRawMode())
2796 return false;
2797
2798 ConflictMarkerKind Kind = *CurPtr == '<' ? CMK_Normal : CMK_Perforce;
2799
2800 // Check to see if there is an ending marker somewhere in the buffer at the
2801 // start of a line to terminate this conflict marker.
2802 if (FindConflictEnd(CurPtr, BufferEnd, Kind)) {
2803 // We found a match. We are really in a conflict marker.
2804 // Diagnose this, and ignore to the end of line.
2805 Diag(CurPtr, diag::err_conflict_marker);
2806 CurrentConflictMarkerState = Kind;
2807
2808 // Skip ahead to the end of line. We know this exists because the
2809 // end-of-conflict marker starts with \r or \n.
2810 while (*CurPtr != '\r' && *CurPtr != '\n') {
2811 assert(CurPtr != BufferEnd && "Didn't find end of line")(static_cast <bool> (CurPtr != BufferEnd && "Didn't find end of line"
) ? void (0) : __assert_fail ("CurPtr != BufferEnd && \"Didn't find end of line\""
, "/build/llvm-toolchain-snapshot-7~svn338205/tools/clang/lib/Lex/Lexer.cpp"
, 2811, __extension__ __PRETTY_FUNCTION__))
;
2812 ++CurPtr;
2813 }
2814 BufferPtr = CurPtr;
2815 return true;
2816 }
2817
2818 // No end of conflict marker found.
2819 return false;
2820}
2821
2822/// HandleEndOfConflictMarker - If this is a '====' or '||||' or '>>>>', or if
2823/// it is '<<<<' and the conflict marker started with a '>>>>' marker, then it
2824/// is the end of a conflict marker. Handle it by ignoring up until the end of
2825/// the line. This returns true if it is a conflict marker and false if not.
2826bool Lexer::HandleEndOfConflictMarker(const char *CurPtr) {
2827 // Only a conflict marker if it starts at the beginning of a line.
2828 if (CurPtr != BufferStart &&
2829 CurPtr[-1] != '\n' && CurPtr[-1] != '\r')
2830 return false;
2831
2832 // If we have a situation where we don't care about conflict markers, ignore
2833 // it.
2834 if (!CurrentConflictMarkerState || isLexingRawMode())
2835 return false;
2836
2837 // Check to see if we have the marker (4 characters in a row).
2838 for (unsigned i = 1; i != 4; ++i)
2839 if (CurPtr[i] != CurPtr[0])
2840 return false;
2841
2842 // If we do have it, search for the end of the conflict marker. This could
2843 // fail if it got skipped with a '#if 0' or something. Note that CurPtr might
2844 // be the end of conflict marker.
2845 if (const char *End = FindConflictEnd(CurPtr, BufferEnd,
2846 CurrentConflictMarkerState)) {
2847 CurPtr = End;
2848
2849 // Skip ahead to the end of line.
2850 while (CurPtr != BufferEnd && *CurPtr != '\r' && *CurPtr != '\n')
2851 ++CurPtr;
2852
2853 BufferPtr = CurPtr;
2854
2855 // No longer in the conflict marker.
2856 CurrentConflictMarkerState = CMK_None;
2857 return true;
2858 }
2859
2860 return false;
2861}
2862
/// Scan [CurPtr, BufferEnd) for the "#>" sequence that closes an editor
/// placeholder.  Returns a pointer one past the '>' on success, or nullptr if
/// no terminator exists before the end of the buffer.
static const char *findPlaceholderEnd(const char *CurPtr,
                                      const char *BufferEnd) {
  if (CurPtr == BufferEnd)
    return nullptr;
  BufferEnd -= 1; // Scan until the second last character.
  for (; CurPtr != BufferEnd; ++CurPtr) {
    if (CurPtr[0] == '#' && CurPtr[1] == '>')
      return CurPtr + 2;
  }
  return nullptr;
}
2874
2875bool Lexer::lexEditorPlaceholder(Token &Result, const char *CurPtr) {
2876 assert(CurPtr[-1] == '<' && CurPtr[0] == '#' && "Not a placeholder!")(static_cast <bool> (CurPtr[-1] == '<' && CurPtr
[0] == '#' && "Not a placeholder!") ? void (0) : __assert_fail
("CurPtr[-1] == '<' && CurPtr[0] == '#' && \"Not a placeholder!\""
, "/build/llvm-toolchain-snapshot-7~svn338205/tools/clang/lib/Lex/Lexer.cpp"
, 2876, __extension__ __PRETTY_FUNCTION__))
;
2877 if (!PP || !PP->getPreprocessorOpts().LexEditorPlaceholders || LexingRawMode)
2878 return false;
2879 const char *End = findPlaceholderEnd(CurPtr + 1, BufferEnd);
2880 if (!End)
2881 return false;
2882 const char *Start = CurPtr - 1;
2883 if (!LangOpts.AllowEditorPlaceholders)
2884 Diag(Start, diag::err_placeholder_in_source);
2885 Result.startToken();
2886 FormTokenWithChars(Result, End, tok::raw_identifier);
2887 Result.setRawIdentifierData(Start);
2888 PP->LookUpIdentifierInfo(Result);
2889 Result.setFlag(Token::IsEditorPlaceholder);
2890 BufferPtr = End;
2891 return true;
2892}
2893
2894bool Lexer::isCodeCompletionPoint(const char *CurPtr) const {
2895 if (PP && PP->isCodeCompletionEnabled()) {
2896 SourceLocation Loc = FileLoc.getLocWithOffset(CurPtr-BufferStart);
2897 return Loc == PP->getCodeCompletionLoc();
2898 }
2899
2900 return false;
2901}
2902
2903uint32_t Lexer::tryReadUCN(const char *&StartPtr, const char *SlashLoc,
2904 Token *Result) {
2905 unsigned CharSize;
2906 char Kind = getCharAndSize(StartPtr, CharSize);
2907
2908 unsigned NumHexDigits;
2909 if (Kind == 'u')
2910 NumHexDigits = 4;
2911 else if (Kind == 'U')
2912 NumHexDigits = 8;
2913 else
2914 return 0;
2915
2916 if (!LangOpts.CPlusPlus && !LangOpts.C99) {
2917 if (Result && !isLexingRawMode())
2918 Diag(SlashLoc, diag::warn_ucn_not_valid_in_c89);
2919 return 0;
2920 }
2921
2922 const char *CurPtr = StartPtr + CharSize;
2923 const char *KindLoc = &CurPtr[-1];
2924
2925 uint32_t CodePoint = 0;
2926 for (unsigned i = 0; i < NumHexDigits; ++i) {
2927 char C = getCharAndSize(CurPtr, CharSize);
2928
2929 unsigned Value = llvm::hexDigitValue(C);
2930 if (Value == -1U) {
2931 if (Result && !isLexingRawMode()) {
2932 if (i == 0) {
2933 Diag(BufferPtr, diag::warn_ucn_escape_no_digits)
2934 << StringRef(KindLoc, 1);
2935 } else {
2936 Diag(BufferPtr, diag::warn_ucn_escape_incomplete);
2937
2938 // If the user wrote \U1234, suggest a fixit to \u.
2939 if (i == 4 && NumHexDigits == 8) {
2940 CharSourceRange URange = makeCharRange(*this, KindLoc, KindLoc + 1);
2941 Diag(KindLoc, diag::note_ucn_four_not_eight)
2942 << FixItHint::CreateReplacement(URange, "u");
2943 }
2944 }
2945 }
2946
2947 return 0;
2948 }
2949
2950 CodePoint <<= 4;
2951 CodePoint += Value;
2952
2953 CurPtr += CharSize;
2954 }
2955
2956 if (Result) {
2957 Result->setFlag(Token::HasUCN);
2958 if (CurPtr - StartPtr == (ptrdiff_t)NumHexDigits + 2)
2959 StartPtr = CurPtr;
2960 else
2961 while (StartPtr != CurPtr)
2962 (void)getAndAdvanceChar(StartPtr, *Result);
2963 } else {
2964 StartPtr = CurPtr;
2965 }
2966
2967 // Don't apply C family restrictions to UCNs in assembly mode
2968 if (LangOpts.AsmPreprocessor)
2969 return CodePoint;
2970
2971 // C99 6.4.3p2: A universal character name shall not specify a character whose
2972 // short identifier is less than 00A0 other than 0024 ($), 0040 (@), or
2973 // 0060 (`), nor one in the range D800 through DFFF inclusive.)
2974 // C++11 [lex.charset]p2: If the hexadecimal value for a
2975 // universal-character-name corresponds to a surrogate code point (in the
2976 // range 0xD800-0xDFFF, inclusive), the program is ill-formed. Additionally,
2977 // if the hexadecimal value for a universal-character-name outside the
2978 // c-char-sequence, s-char-sequence, or r-char-sequence of a character or
2979 // string literal corresponds to a control character (in either of the
2980 // ranges 0x00-0x1F or 0x7F-0x9F, both inclusive) or to a character in the
2981 // basic source character set, the program is ill-formed.
2982 if (CodePoint < 0xA0) {
2983 if (CodePoint == 0x24 || CodePoint == 0x40 || CodePoint == 0x60)
2984 return CodePoint;
2985
2986 // We don't use isLexingRawMode() here because we need to warn about bad
2987 // UCNs even when skipping preprocessing tokens in a #if block.
2988 if (Result && PP) {
2989 if (CodePoint < 0x20 || CodePoint >= 0x7F)
2990 Diag(BufferPtr, diag::err_ucn_control_character);
2991 else {
2992 char C = static_cast<char>(CodePoint);
2993 Diag(BufferPtr, diag::err_ucn_escape_basic_scs) << StringRef(&C, 1);
2994 }
2995 }
2996
2997 return 0;
2998 } else if (CodePoint >= 0xD800 && CodePoint <= 0xDFFF) {
2999 // C++03 allows UCNs representing surrogate characters. C99 and C++11 don't.
3000 // We don't use isLexingRawMode() here because we need to diagnose bad
3001 // UCNs even when skipping preprocessing tokens in a #if block.
3002 if (Result && PP) {
3003 if (LangOpts.CPlusPlus && !LangOpts.CPlusPlus11)
3004 Diag(BufferPtr, diag::warn_ucn_escape_surrogate);
3005 else
3006 Diag(BufferPtr, diag::err_ucn_escape_invalid);
3007 }
3008 return 0;
3009 }
3010
3011 return CodePoint;
3012}
3013
3014bool Lexer::CheckUnicodeWhitespace(Token &Result, uint32_t C,
3015 const char *CurPtr) {
3016 static const llvm::sys::UnicodeCharSet UnicodeWhitespaceChars(
3017 UnicodeWhitespaceCharRanges);
3018 if (!isLexingRawMode() && !PP->isPreprocessedOutput() &&
3019 UnicodeWhitespaceChars.contains(C)) {
3020 Diag(BufferPtr, diag::ext_unicode_whitespace)
3021 << makeCharRange(*this, BufferPtr, CurPtr);
3022
3023 Result.setFlag(Token::LeadingSpace);
3024 return true;
3025 }
3026 return false;
3027}
3028
3029bool Lexer::LexUnicode(Token &Result, uint32_t C, const char *CurPtr) {
3030 if (isAllowedIDChar(C, LangOpts) && isAllowedInitiallyIDChar(C, LangOpts)) {
3031 if (!isLexingRawMode() && !ParsingPreprocessorDirective &&
3032 !PP->isPreprocessedOutput()) {
3033 maybeDiagnoseIDCharCompat(PP->getDiagnostics(), C,
3034 makeCharRange(*this, BufferPtr, CurPtr),
3035 /*IsFirst=*/true);
3036 }
3037
3038 MIOpt.ReadToken();
3039 return LexIdentifier(Result, CurPtr);
3040 }
3041
3042 if (!isLexingRawMode() && !ParsingPreprocessorDirective &&
3043 !PP->isPreprocessedOutput() &&
3044 !isASCII(*BufferPtr) && !isAllowedIDChar(C, LangOpts)) {
3045 // Non-ASCII characters tend to creep into source code unintentionally.
3046 // Instead of letting the parser complain about the unknown token,
3047 // just drop the character.
3048 // Note that we can /only/ do this when the non-ASCII character is actually
3049 // spelled as Unicode, not written as a UCN. The standard requires that
3050 // we not throw away any possible preprocessor tokens, but there's a
3051 // loophole in the mapping of Unicode characters to basic character set
3052 // characters that allows us to map these particular characters to, say,
3053 // whitespace.
3054 Diag(BufferPtr, diag::err_non_ascii)
3055 << FixItHint::CreateRemoval(makeCharRange(*this, BufferPtr, CurPtr));
3056
3057 BufferPtr = CurPtr;
3058 return false;
3059 }
3060
3061 // Otherwise, we have an explicit UCN or a character that's unlikely to show
3062 // up by accident.
3063 MIOpt.ReadToken();
3064 FormTokenWithChars(Result, CurPtr, tok::unknown);
3065 return true;
3066}
3067
3068void Lexer::PropagateLineStartLeadingSpaceInfo(Token &Result) {
3069 IsAtStartOfLine = Result.isAtStartOfLine();
3070 HasLeadingSpace = Result.hasLeadingSpace();
3071 HasLeadingEmptyMacro = Result.hasLeadingEmptyMacro();
3072 // Note that this doesn't affect IsAtPhysicalStartOfLine.
3073}
3074
/// Lex - Produce the next token from this lexer's buffer into \p Result.
///
/// \returns true if a token was produced into \p Result; false when a
/// preprocessor directive was handled instead (see the HandleDirective path
/// in LexTokenInternal), in which case the caller should lex again.
bool Lexer::Lex(Token &Result) {
  // Start a new token.  This initializes the token's flag word; the
  // setFlag/clearFlag calls below and inside LexTokenInternal rely on it
  // (the analyzer's "uninitialized compound assignment" report at
  // Token.h:235 can only occur if startToken() were skipped).
  Result.startToken();

  // Set up misc whitespace flags for LexTokenInternal, consuming the
  // one-shot state accumulated since the last token.
  if (IsAtStartOfLine) {
    Result.setFlag(Token::StartOfLine);
    IsAtStartOfLine = false;
  }

  if (HasLeadingSpace) {
    Result.setFlag(Token::LeadingSpace);
    HasLeadingSpace = false;
  }

  if (HasLeadingEmptyMacro) {
    Result.setFlag(Token::LeadingEmptyMacro);
    HasLeadingEmptyMacro = false;
  }

  // Capture and reset the physical start-of-line state before lexing.
  bool atPhysicalStartOfLine = IsAtPhysicalStartOfLine;
  IsAtPhysicalStartOfLine = false;
  bool isRawLex = isLexingRawMode();
  (void) isRawLex;  // Only consumed by the assertion below in +Asserts builds.
  bool returnedToken = LexTokenInternal(Result, atPhysicalStartOfLine);
  // (After the LexTokenInternal call, the lexer might be destroyed.)
  assert((returnedToken || !isRawLex) && "Raw lex must succeed");
  return returnedToken;
}
3104
3105/// LexTokenInternal - This implements a simple C family lexer. It is an
3106/// extremely performance critical piece of code. This assumes that the buffer
3107/// has a null character at the end of the file. This returns a preprocessing
3108/// token, not a normal token, as such, it is an internal interface. It assumes
3109/// that the Flags of result have been cleared before calling this.
/// LexTokenInternal - This implements a simple C family lexer.  It is an
/// extremely performance critical piece of code.  This assumes that the buffer
/// has a null character at the end of the file.  This returns a preprocessing
/// token, not a normal token, as such, it is an internal interface.  It assumes
/// that the Flags of result have been cleared before calling this.
///
/// NOTE(review): \p Result must already have been initialized via
/// Token::startToken() (as Lexer::Lex does) — the clearFlag/setFlag calls
/// below compound-assign into the token's flag word.
///
/// \returns true when a token was formed in \p Result; false only via the
/// HandleDirective tail, after the preprocessor consumed a '#' directive.
bool Lexer::LexTokenInternal(Token &Result, bool TokAtPhysicalStartOfLine) {
LexNextToken:
  // New token, can't need cleaning yet.
  Result.clearFlag(Token::NeedsCleaning);
  Result.setIdentifierInfo(nullptr);

  // CurPtr - Cache BufferPtr in an automatic variable.
  const char *CurPtr = BufferPtr;

  // Small amounts of horizontal whitespace is very common between tokens.
  if ((*CurPtr == ' ') || (*CurPtr == '\t')) {
    ++CurPtr;
    while ((*CurPtr == ' ') || (*CurPtr == '\t'))
      ++CurPtr;

    // If we are keeping whitespace and other tokens, just return what we just
    // skipped.  The next lexer invocation will return the token after the
    // whitespace.
    if (isKeepWhitespaceMode()) {
      FormTokenWithChars(Result, CurPtr, tok::unknown);
      // FIXME: The next token will not have LeadingSpace set.
      return true;
    }

    BufferPtr = CurPtr;
    Result.setFlag(Token::LeadingSpace);
  }

  unsigned SizeTmp, SizeTmp2;   // Temporaries for use in cases below.

  // Read a character, advancing over it.
  char Char = getAndAdvanceChar(CurPtr, Result);
  tok::TokenKind Kind;

  switch (Char) {
  case 0:  // Null.
    // Found end of file?
    if (CurPtr-1 == BufferEnd)
      return LexEndOfFile(Result, CurPtr-1);

    // Check if we are performing code completion.
    if (isCodeCompletionPoint(CurPtr-1)) {
      // Return the code-completion token.
      Result.startToken();
      FormTokenWithChars(Result, CurPtr, tok::code_completion);
      return true;
    }

    // A stray embedded NUL: diagnose (unless in raw mode) and treat it as
    // whitespace.
    if (!isLexingRawMode())
      Diag(CurPtr-1, diag::null_in_file);
    Result.setFlag(Token::LeadingSpace);
    if (SkipWhitespace(Result, CurPtr, TokAtPhysicalStartOfLine))
      return true; // KeepWhitespaceMode

    // We know the lexer hasn't changed, so just try again with this lexer.
    // (We manually eliminate the tail call to avoid recursion.)
    goto LexNextToken;

  case 26:  // DOS & CP/M EOF: "^Z".
    // If we're in Microsoft extensions mode, treat this as end of file.
    if (LangOpts.MicrosoftExt) {
      if (!isLexingRawMode())
        Diag(CurPtr-1, diag::ext_ctrl_z_eof_microsoft);
      return LexEndOfFile(Result, CurPtr-1);
    }

    // If Microsoft extensions are disabled, this is just random garbage.
    Kind = tok::unknown;
    break;

  case '\r':
    // Treat "\r\n" as a single newline: consume the '\n' then share the
    // '\n' handling below.
    if (CurPtr[0] == '\n')
      Char = getAndAdvanceChar(CurPtr, Result);
    LLVM_FALLTHROUGH;
  case '\n':
    // If we are inside a preprocessor directive and we see the end of line,
    // we know we are done with the directive, so return an EOD token.
    if (ParsingPreprocessorDirective) {
      // Done parsing the "line".
      ParsingPreprocessorDirective = false;

      // Restore comment saving mode, in case it was disabled for directive.
      if (PP)
        resetExtendedTokenMode();

      // Since we consumed a newline, we are back at the start of a line.
      IsAtStartOfLine = true;
      IsAtPhysicalStartOfLine = true;

      Kind = tok::eod;
      break;
    }

    // No leading whitespace seen so far.
    Result.clearFlag(Token::LeadingSpace);

    if (SkipWhitespace(Result, CurPtr, TokAtPhysicalStartOfLine))
      return true; // KeepWhitespaceMode

    // We only saw whitespace, so just try again with this lexer.
    // (We manually eliminate the tail call to avoid recursion.)
    goto LexNextToken;
  case ' ':
  case '\t':
  case '\f':
  case '\v':
  SkipHorizontalWhitespace:
    Result.setFlag(Token::LeadingSpace);
    if (SkipWhitespace(Result, CurPtr, TokAtPhysicalStartOfLine))
      return true; // KeepWhitespaceMode

  SkipIgnoredUnits:
    CurPtr = BufferPtr;

    // If the next token is obviously a // or /* */ comment, skip it efficiently
    // too (without going through the big switch stmt).
    if (CurPtr[0] == '/' && CurPtr[1] == '/' && !inKeepCommentMode() &&
        LangOpts.LineComment &&
        (LangOpts.CPlusPlus || !LangOpts.TraditionalCPP)) {
      if (SkipLineComment(Result, CurPtr+2, TokAtPhysicalStartOfLine))
        return true; // There is a token to return.
      goto SkipIgnoredUnits;
    } else if (CurPtr[0] == '/' && CurPtr[1] == '*' && !inKeepCommentMode()) {
      if (SkipBlockComment(Result, CurPtr+2, TokAtPhysicalStartOfLine))
        return true; // There is a token to return.
      goto SkipIgnoredUnits;
    } else if (isHorizontalWhitespace(*CurPtr)) {
      goto SkipHorizontalWhitespace;
    }
    // We only saw whitespace, so just try again with this lexer.
    // (We manually eliminate the tail call to avoid recursion.)
    goto LexNextToken;

  // C99 6.4.4.1: Integer Constants.
  // C99 6.4.4.2: Floating Constants.
  case '0': case '1': case '2': case '3': case '4':
  case '5': case '6': case '7': case '8': case '9':
    // Notify MIOpt that we read a non-whitespace/non-comment token.
    MIOpt.ReadToken();
    return LexNumericConstant(Result, CurPtr);

  case 'u':   // Identifier (uber) or C11/C++11 UTF-8 or UTF-16 string literal
    // Notify MIOpt that we read a non-whitespace/non-comment token.
    MIOpt.ReadToken();

    if (LangOpts.CPlusPlus11 || LangOpts.C11) {
      Char = getCharAndSize(CurPtr, SizeTmp);

      // UTF-16 string literal
      if (Char == '"')
        return LexStringLiteral(Result, ConsumeChar(CurPtr, SizeTmp, Result),
                                tok::utf16_string_literal);

      // UTF-16 character constant
      if (Char == '\'')
        return LexCharConstant(Result, ConsumeChar(CurPtr, SizeTmp, Result),
                               tok::utf16_char_constant);

      // UTF-16 raw string literal
      if (Char == 'R' && LangOpts.CPlusPlus11 &&
          getCharAndSize(CurPtr + SizeTmp, SizeTmp2) == '"')
        return LexRawStringLiteral(Result,
                               ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
                                           SizeTmp2, Result),
                               tok::utf16_string_literal);

      if (Char == '8') {
        char Char2 = getCharAndSize(CurPtr + SizeTmp, SizeTmp2);

        // UTF-8 string literal
        if (Char2 == '"')
          return LexStringLiteral(Result,
                               ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
                                           SizeTmp2, Result),
                               tok::utf8_string_literal);
        if (Char2 == '\'' && LangOpts.CPlusPlus17)
          return LexCharConstant(
              Result, ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
                                  SizeTmp2, Result),
              tok::utf8_char_constant);

        if (Char2 == 'R' && LangOpts.CPlusPlus11) {
          unsigned SizeTmp3;
          char Char3 = getCharAndSize(CurPtr + SizeTmp + SizeTmp2, SizeTmp3);
          // UTF-8 raw string literal
          if (Char3 == '"') {
            return LexRawStringLiteral(Result,
                   ConsumeChar(ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
                                           SizeTmp2, Result),
                               SizeTmp3, Result),
                   tok::utf8_string_literal);
          }
        }
      }
    }

    // treat u like the start of an identifier.
    return LexIdentifier(Result, CurPtr);

  case 'U':   // Identifier (Uber) or C11/C++11 UTF-32 string literal
    // Notify MIOpt that we read a non-whitespace/non-comment token.
    MIOpt.ReadToken();

    if (LangOpts.CPlusPlus11 || LangOpts.C11) {
      Char = getCharAndSize(CurPtr, SizeTmp);

      // UTF-32 string literal
      if (Char == '"')
        return LexStringLiteral(Result, ConsumeChar(CurPtr, SizeTmp, Result),
                                tok::utf32_string_literal);

      // UTF-32 character constant
      if (Char == '\'')
        return LexCharConstant(Result, ConsumeChar(CurPtr, SizeTmp, Result),
                               tok::utf32_char_constant);

      // UTF-32 raw string literal
      if (Char == 'R' && LangOpts.CPlusPlus11 &&
          getCharAndSize(CurPtr + SizeTmp, SizeTmp2) == '"')
        return LexRawStringLiteral(Result,
                               ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
                                           SizeTmp2, Result),
                               tok::utf32_string_literal);
    }

    // treat U like the start of an identifier.
    return LexIdentifier(Result, CurPtr);

  case 'R': // Identifier or C++0x raw string literal
    // Notify MIOpt that we read a non-whitespace/non-comment token.
    MIOpt.ReadToken();

    if (LangOpts.CPlusPlus11) {
      Char = getCharAndSize(CurPtr, SizeTmp);

      if (Char == '"')
        return LexRawStringLiteral(Result,
                                   ConsumeChar(CurPtr, SizeTmp, Result),
                                   tok::string_literal);
    }

    // treat R like the start of an identifier.
    return LexIdentifier(Result, CurPtr);

  case 'L':   // Identifier (Loony) or wide literal (L'x' or L"xyz").
    // Notify MIOpt that we read a non-whitespace/non-comment token.
    MIOpt.ReadToken();
    Char = getCharAndSize(CurPtr, SizeTmp);

    // Wide string literal.
    if (Char == '"')
      return LexStringLiteral(Result, ConsumeChar(CurPtr, SizeTmp, Result),
                              tok::wide_string_literal);

    // Wide raw string literal.
    if (LangOpts.CPlusPlus11 && Char == 'R' &&
        getCharAndSize(CurPtr + SizeTmp, SizeTmp2) == '"')
      return LexRawStringLiteral(Result,
                               ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
                                           SizeTmp2, Result),
                               tok::wide_string_literal);

    // Wide character constant.
    if (Char == '\'')
      return LexCharConstant(Result, ConsumeChar(CurPtr, SizeTmp, Result),
                             tok::wide_char_constant);
    // FALL THROUGH, treating L like the start of an identifier.
    LLVM_FALLTHROUGH;

  // C99 6.4.2: Identifiers.
  case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': case 'G':
  case 'H': case 'I': case 'J': case 'K':    /*'L'*/case 'M': case 'N':
  case 'O': case 'P': case 'Q':    /*'R'*/case 'S': case 'T':    /*'U'*/
  case 'V': case 'W': case 'X': case 'Y': case 'Z':
  case 'a': case 'b': case 'c': case 'd': case 'e': case 'f': case 'g':
  case 'h': case 'i': case 'j': case 'k': case 'l': case 'm': case 'n':
  case 'o': case 'p': case 'q': case 'r': case 's': case 't':    /*'u'*/
  case 'v': case 'w': case 'x': case 'y': case 'z':
  case '_':
    // Notify MIOpt that we read a non-whitespace/non-comment token.
    MIOpt.ReadToken();
    return LexIdentifier(Result, CurPtr);

  case '$':   // $ in identifiers.
    if (LangOpts.DollarIdents) {
      if (!isLexingRawMode())
        Diag(CurPtr-1, diag::ext_dollar_in_identifier);
      // Notify MIOpt that we read a non-whitespace/non-comment token.
      MIOpt.ReadToken();
      return LexIdentifier(Result, CurPtr);
    }

    Kind = tok::unknown;
    break;

  // C99 6.4.4: Character Constants.
  case '\'':
    // Notify MIOpt that we read a non-whitespace/non-comment token.
    MIOpt.ReadToken();
    return LexCharConstant(Result, CurPtr, tok::char_constant);

  // C99 6.4.5: String Literals.
  case '"':
    // Notify MIOpt that we read a non-whitespace/non-comment token.
    MIOpt.ReadToken();
    return LexStringLiteral(Result, CurPtr, tok::string_literal);

  // C99 6.4.6: Punctuators.
  case '?':
    Kind = tok::question;
    break;
  case '[':
    Kind = tok::l_square;
    break;
  case ']':
    Kind = tok::r_square;
    break;
  case '(':
    Kind = tok::l_paren;
    break;
  case ')':
    Kind = tok::r_paren;
    break;
  case '{':
    Kind = tok::l_brace;
    break;
  case '}':
    Kind = tok::r_brace;
    break;
  case '.':
    Char = getCharAndSize(CurPtr, SizeTmp);
    if (Char >= '0' && Char <= '9') {
      // Notify MIOpt that we read a non-whitespace/non-comment token.
      MIOpt.ReadToken();

      // A '.' followed by a digit starts a floating constant, e.g. ".5".
      return LexNumericConstant(Result, ConsumeChar(CurPtr, SizeTmp, Result));
    } else if (LangOpts.CPlusPlus && Char == '*') {
      Kind = tok::periodstar;
      CurPtr += SizeTmp;
    } else if (Char == '.' &&
               getCharAndSize(CurPtr+SizeTmp, SizeTmp2) == '.') {
      Kind = tok::ellipsis;
      CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
                           SizeTmp2, Result);
    } else {
      Kind = tok::period;
    }
    break;
  case '&':
    Char = getCharAndSize(CurPtr, SizeTmp);
    if (Char == '&') {
      Kind = tok::ampamp;
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
    } else if (Char == '=') {
      Kind = tok::ampequal;
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
    } else {
      Kind = tok::amp;
    }
    break;
  case '*':
    if (getCharAndSize(CurPtr, SizeTmp) == '=') {
      Kind = tok::starequal;
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
    } else {
      Kind = tok::star;
    }
    break;
  case '+':
    Char = getCharAndSize(CurPtr, SizeTmp);
    if (Char == '+') {
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
      Kind = tok::plusplus;
    } else if (Char == '=') {
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
      Kind = tok::plusequal;
    } else {
      Kind = tok::plus;
    }
    break;
  case '-':
    Char = getCharAndSize(CurPtr, SizeTmp);
    if (Char == '-') {      // --
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
      Kind = tok::minusminus;
    } else if (Char == '>' && LangOpts.CPlusPlus &&
               getCharAndSize(CurPtr+SizeTmp, SizeTmp2) == '*') {  // C++ ->*
      CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
                           SizeTmp2, Result);
      Kind = tok::arrowstar;
    } else if (Char == '>') {   // ->
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
      Kind = tok::arrow;
    } else if (Char == '=') {   // -=
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
      Kind = tok::minusequal;
    } else {
      Kind = tok::minus;
    }
    break;
  case '~':
    Kind = tok::tilde;
    break;
  case '!':
    if (getCharAndSize(CurPtr, SizeTmp) == '=') {
      Kind = tok::exclaimequal;
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
    } else {
      Kind = tok::exclaim;
    }
    break;
  case '/':
    // 6.4.9: Comments
    Char = getCharAndSize(CurPtr, SizeTmp);
    if (Char == '/') {         // Line comment.
      // Even if Line comments are disabled (e.g. in C89 mode), we generally
      // want to lex this as a comment.  There is one problem with this though,
      // that in one particular corner case, this can change the behavior of the
      // resultant program.  For example, In  "foo //**/ bar", C89 would lex
      // this as "foo / bar" and languages with Line comments would lex it as
      // "foo".  Check to see if the character after the second slash is a '*'.
      // If so, we will lex that as a "/" instead of the start of a comment.
      // However, we never do this if we are just preprocessing.
      bool TreatAsComment = LangOpts.LineComment &&
                            (LangOpts.CPlusPlus || !LangOpts.TraditionalCPP);
      if (!TreatAsComment)
        if (!(PP && PP->isPreprocessedOutput()))
          TreatAsComment = getCharAndSize(CurPtr+SizeTmp, SizeTmp2) != '*';

      if (TreatAsComment) {
        if (SkipLineComment(Result, ConsumeChar(CurPtr, SizeTmp, Result),
                            TokAtPhysicalStartOfLine))
          return true; // There is a token to return.

        // It is common for the tokens immediately after a // comment to be
        // whitespace (indentation for the next line).  Instead of going through
        // the big switch, handle it efficiently now.
        goto SkipIgnoredUnits;
      }
    }

    if (Char == '*') {  // /**/ comment.
      if (SkipBlockComment(Result, ConsumeChar(CurPtr, SizeTmp, Result),
                           TokAtPhysicalStartOfLine))
        return true; // There is a token to return.

      // We only saw whitespace, so just try again with this lexer.
      // (We manually eliminate the tail call to avoid recursion.)
      goto LexNextToken;
    }

    if (Char == '=') {
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
      Kind = tok::slashequal;
    } else {
      Kind = tok::slash;
    }
    break;
  case '%':
    Char = getCharAndSize(CurPtr, SizeTmp);
    if (Char == '=') {
      Kind = tok::percentequal;
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
    } else if (LangOpts.Digraphs && Char == '>') {
      Kind = tok::r_brace;                             // '%>' -> '}'
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
    } else if (LangOpts.Digraphs && Char == ':') {
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
      Char = getCharAndSize(CurPtr, SizeTmp);
      if (Char == '%' && getCharAndSize(CurPtr+SizeTmp, SizeTmp2) == ':') {
        Kind = tok::hashhash;                          // '%:%:' -> '##'
        CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
                             SizeTmp2, Result);
      } else if (Char == '@' && LangOpts.MicrosoftExt) {// %:@ -> #@ -> Charize
        CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
        if (!isLexingRawMode())
          Diag(BufferPtr, diag::ext_charize_microsoft);
        Kind = tok::hashat;
      } else {                                         // '%:' -> '#'
        // We parsed a # character.  If this occurs at the start of the line,
        // it's actually the start of a preprocessing directive.  Callback to
        // the preprocessor to handle it.
        // TODO: -fpreprocessed mode??
        if (TokAtPhysicalStartOfLine && !LexingRawMode && !Is_PragmaLexer)
          goto HandleDirective;

        Kind = tok::hash;
      }
    } else {
      Kind = tok::percent;
    }
    break;
  case '<':
    Char = getCharAndSize(CurPtr, SizeTmp);
    if (ParsingFilename) {
      // Inside a #include-style directive, '<' starts an angled filename.
      return LexAngledStringLiteral(Result, CurPtr);
    } else if (Char == '<') {
      char After = getCharAndSize(CurPtr+SizeTmp, SizeTmp2);
      if (After == '=') {
        Kind = tok::lesslessequal;
        CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
                             SizeTmp2, Result);
      } else if (After == '<' && IsStartOfConflictMarker(CurPtr-1)) {
        // If this is actually a '<<<<<<<' version control conflict marker,
        // recognize it as such and recover nicely.
        goto LexNextToken;
      } else if (After == '<' && HandleEndOfConflictMarker(CurPtr-1)) {
        // If this is '<<<<' and we're in a Perforce-style conflict marker,
        // ignore it.
        goto LexNextToken;
      } else if (LangOpts.CUDA && After == '<') {
        Kind = tok::lesslessless;
        CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
                             SizeTmp2, Result);
      } else {
        CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
        Kind = tok::lessless;
      }
    } else if (Char == '=') {
      char After = getCharAndSize(CurPtr+SizeTmp, SizeTmp2);
      if (After == '>') {
        if (getLangOpts().CPlusPlus2a) {
          if (!isLexingRawMode())
            Diag(BufferPtr, diag::warn_cxx17_compat_spaceship);
          CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
                               SizeTmp2, Result);
          Kind = tok::spaceship;
          break;
        }
        // Suggest adding a space between the '<=' and the '>' to avoid a
        // change in semantics if this turns up in C++ <=17 mode.
        if (getLangOpts().CPlusPlus && !isLexingRawMode()) {
          Diag(BufferPtr, diag::warn_cxx2a_compat_spaceship)
            << FixItHint::CreateInsertion(
                   getSourceLocation(CurPtr + SizeTmp, SizeTmp2), " ");
        }
      }
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
      Kind = tok::lessequal;
    } else if (LangOpts.Digraphs && Char == ':') {     // '<:' -> '['
      if (LangOpts.CPlusPlus11 &&
          getCharAndSize(CurPtr + SizeTmp, SizeTmp2) == ':') {
        // C++0x [lex.pptoken]p3:
        //  Otherwise, if the next three characters are <:: and the subsequent
        //  character is neither : nor >, the < is treated as a preprocessor
        //  token by itself and not as the first character of the alternative
        //  token <:.
        unsigned SizeTmp3;
        char After = getCharAndSize(CurPtr + SizeTmp + SizeTmp2, SizeTmp3);
        if (After != ':' && After != '>') {
          Kind = tok::less;
          if (!isLexingRawMode())
            Diag(BufferPtr, diag::warn_cxx98_compat_less_colon_colon);
          break;
        }
      }

      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
      Kind = tok::l_square;
    } else if (LangOpts.Digraphs && Char == '%') {     // '<%' -> '{'
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
      Kind = tok::l_brace;
    } else if (Char == '#' && /*Not a trigraph*/ SizeTmp == 1 &&
               lexEditorPlaceholder(Result, CurPtr)) {
      return true;
    } else {
      Kind = tok::less;
    }
    break;
  case '>':
    Char = getCharAndSize(CurPtr, SizeTmp);
    if (Char == '=') {
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
      Kind = tok::greaterequal;
    } else if (Char == '>') {
      char After = getCharAndSize(CurPtr+SizeTmp, SizeTmp2);
      if (After == '=') {
        CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
                             SizeTmp2, Result);
        Kind = tok::greatergreaterequal;
      } else if (After == '>' && IsStartOfConflictMarker(CurPtr-1)) {
        // If this is actually a '>>>>' conflict marker, recognize it as such
        // and recover nicely.
        goto LexNextToken;
      } else if (After == '>' && HandleEndOfConflictMarker(CurPtr-1)) {
        // If this is '>>>>>>>' and we're in a conflict marker, ignore it.
        goto LexNextToken;
      } else if (LangOpts.CUDA && After == '>') {
        Kind = tok::greatergreatergreater;
        CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
                             SizeTmp2, Result);
      } else {
        CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
        Kind = tok::greatergreater;
      }
    } else {
      Kind = tok::greater;
    }
    break;
  case '^':
    Char = getCharAndSize(CurPtr, SizeTmp);
    if (Char == '=') {
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
      Kind = tok::caretequal;
    } else if (LangOpts.OpenCL && Char == '^') {
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
      Kind = tok::caretcaret;
    } else {
      Kind = tok::caret;
    }
    break;
  case '|':
    Char = getCharAndSize(CurPtr, SizeTmp);
    if (Char == '=') {
      Kind = tok::pipeequal;
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
    } else if (Char == '|') {
      // If this is '|||||||' and we're in a conflict marker, ignore it.
      if (CurPtr[1] == '|' && HandleEndOfConflictMarker(CurPtr-1))
        goto LexNextToken;
      Kind = tok::pipepipe;
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
    } else {
      Kind = tok::pipe;
    }
    break;
  case ':':
    Char = getCharAndSize(CurPtr, SizeTmp);
    if (LangOpts.Digraphs && Char == '>') {
      Kind = tok::r_square; // ':>' -> ']'
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
    } else if ((LangOpts.CPlusPlus ||
                LangOpts.DoubleSquareBracketAttributes) &&
               Char == ':') {
      Kind = tok::coloncolon;
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
    } else {
      Kind = tok::colon;
    }
    break;
  case ';':
    Kind = tok::semi;
    break;
  case '=':
    Char = getCharAndSize(CurPtr, SizeTmp);
    if (Char == '=') {
      // If this is '====' and we're in a conflict marker, ignore it.
      if (CurPtr[1] == '=' && HandleEndOfConflictMarker(CurPtr-1))
        goto LexNextToken;

      Kind = tok::equalequal;
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
    } else {
      Kind = tok::equal;
    }
    break;
  case ',':
    Kind = tok::comma;
    break;
  case '#':
    Char = getCharAndSize(CurPtr, SizeTmp);
    if (Char == '#') {
      Kind = tok::hashhash;
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
    } else if (Char == '@' && LangOpts.MicrosoftExt) {  // #@ -> Charize
      Kind = tok::hashat;
      if (!isLexingRawMode())
        Diag(BufferPtr, diag::ext_charize_microsoft);
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
    } else {
      // We parsed a # character.  If this occurs at the start of the line,
      // it's actually the start of a preprocessing directive.  Callback to
      // the preprocessor to handle it.
      // TODO: -fpreprocessed mode??
      if (TokAtPhysicalStartOfLine && !LexingRawMode && !Is_PragmaLexer)
        goto HandleDirective;

      Kind = tok::hash;
    }
    break;

  case '@':
    // Objective C support.
    if (CurPtr[-1] == '@' && LangOpts.ObjC1)
      Kind = tok::at;
    else
      Kind = tok::unknown;
    break;

  // UCNs (C99 6.4.3, C++11 [lex.charset]p2)
  case '\\':
    if (!LangOpts.AsmPreprocessor) {
      if (uint32_t CodePoint = tryReadUCN(CurPtr, BufferPtr, &Result)) {
        if (CheckUnicodeWhitespace(Result, CodePoint, CurPtr)) {
          if (SkipWhitespace(Result, CurPtr, TokAtPhysicalStartOfLine))
            return true; // KeepWhitespaceMode

          // We only saw whitespace, so just try again with this lexer.
          // (We manually eliminate the tail call to avoid recursion.)
          goto LexNextToken;
        }

        return LexUnicode(Result, CodePoint, CurPtr);
      }
    }

    Kind = tok::unknown;
    break;

  default: {
    if (isASCII(Char)) {
      Kind = tok::unknown;
      break;
    }

    llvm::UTF32 CodePoint;

    // We can't just reset CurPtr to BufferPtr because BufferPtr may point to
    // an escaped newline.
    --CurPtr;
    const char *UTF8StartPtr = CurPtr;
    llvm::ConversionResult Status =
        llvm::convertUTF8Sequence((const llvm::UTF8 **)&CurPtr,
                                  (const llvm::UTF8 *)BufferEnd,
                                  &CodePoint,
                                  llvm::strictConversion);
    if (Status == llvm::conversionOK) {
      if (CheckUnicodeWhitespace(Result, CodePoint, CurPtr)) {
        if (SkipWhitespace(Result, CurPtr, TokAtPhysicalStartOfLine))
          return true; // KeepWhitespaceMode

        // We only saw whitespace, so just try again with this lexer.
        // (We manually eliminate the tail call to avoid recursion.)
        goto LexNextToken;
      }
      if (!isLexingRawMode())
        maybeDiagnoseUTF8Homoglyph(PP->getDiagnostics(), CodePoint,
                                   makeCharRange(*this, UTF8StartPtr, CurPtr));
      return LexUnicode(Result, CodePoint, CurPtr);
    }

    if (isLexingRawMode() || ParsingPreprocessorDirective ||
        PP->isPreprocessedOutput()) {
      ++CurPtr;
      Kind = tok::unknown;
      break;
    }

    // Non-ASCII characters tend to creep into source code unintentionally.
    // Instead of letting the parser complain about the unknown token,
    // just diagnose the invalid UTF-8, then drop the character.
    Diag(CurPtr, diag::err_invalid_utf8);

    BufferPtr = CurPtr+1;
    // We're pretending the character didn't exist, so just try again with
    // this lexer.
    // (We manually eliminate the tail call to avoid recursion.)
    goto LexNextToken;
  }
  }

  // Notify MIOpt that we read a non-whitespace/non-comment token.
  MIOpt.ReadToken();

  // Update the location of token as well as BufferPtr.
  FormTokenWithChars(Result, CurPtr, Kind);
  return true;

HandleDirective:
  // We parsed a # character and it's the start of a preprocessing directive.

  FormTokenWithChars(Result, CurPtr, tok::hash);
  PP->HandleDirective(Result);

  if (PP->hadModuleLoaderFatalFailure()) {
    // With a fatal failure in the module loader, we abort parsing.
    assert(Result.is(tok::eof) && "Preprocessor did not set tok:eof");
    return true;
  }

  // We parsed the directive; lex a token with the new state.
  return false;
}

/build/llvm-toolchain-snapshot-7~svn338205/tools/clang/include/clang/Lex/Lexer.h

1//===- Lexer.h - C Language Family Lexer ------------------------*- C++ -*-===//
2//
3// The LLVM Compiler Infrastructure
4//
5// This file is distributed under the University of Illinois Open Source
6// License. See LICENSE.TXT for details.
7//
8//===----------------------------------------------------------------------===//
9//
10// This file defines the Lexer interface.
11//
12//===----------------------------------------------------------------------===//
13
14#ifndef LLVM_CLANG_LEX_LEXER_H
15#define LLVM_CLANG_LEX_LEXER_H
16
17#include "clang/Basic/LangOptions.h"
18#include "clang/Basic/SourceLocation.h"
19#include "clang/Basic/TokenKinds.h"
20#include "clang/Lex/PreprocessorLexer.h"
21#include "clang/Lex/Token.h"
22#include "llvm/ADT/Optional.h"
23#include "llvm/ADT/SmallVector.h"
24#include "llvm/ADT/StringRef.h"
25#include <cassert>
26#include <cstdint>
27#include <string>
28
29namespace llvm {
30
31class MemoryBuffer;
32
33} // namespace llvm
34
35namespace clang {
36
37class DiagnosticBuilder;
38class Preprocessor;
39class SourceManager;
40
41/// ConflictMarkerKind - Kinds of conflict marker which the lexer might be
42/// recovering from.
43enum ConflictMarkerKind {
44 /// Not within a conflict marker.
45 CMK_None,
46
47 /// A normal or diff3 conflict marker, initiated by at least 7 "<"s,
48 /// separated by at least 7 "="s or "|"s, and terminated by at least 7 ">"s.
49 CMK_Normal,
50
51 /// A Perforce-style conflict marker, initiated by 4 ">"s,
52 /// separated by 4 "="s, and terminated by 4 "<"s.
53 CMK_Perforce
54};
55
56/// Describes the bounds (start, size) of the preamble and a flag required by
57/// PreprocessorOptions::PrecompiledPreambleBytes.
58/// The preamble includes the BOM, if any.
59struct PreambleBounds {
60 /// Size of the preamble in bytes.
61 unsigned Size;
62
63 /// Whether the preamble ends at the start of a new line.
64 ///
65 /// Used to inform the lexer as to whether it's starting at the beginning of
66 /// a line after skipping the preamble.
67 bool PreambleEndsAtStartOfLine;
68
69 PreambleBounds(unsigned Size, bool PreambleEndsAtStartOfLine)
70 : Size(Size), PreambleEndsAtStartOfLine(PreambleEndsAtStartOfLine) {}
71};
72
73/// Lexer - This provides a simple interface that turns a text buffer into a
74/// stream of tokens. This provides no support for file reading or buffering,
75/// or buffering/seeking of tokens, only forward lexing is supported. It relies
76/// on the specified Preprocessor object to handle preprocessor directives, etc.
77class Lexer : public PreprocessorLexer {
78 friend class Preprocessor;
79
80 void anchor() override;
81
82 //===--------------------------------------------------------------------===//
83 // Constant configuration values for this lexer.
84
85 // Start of the buffer.
86 const char *BufferStart;
87
88 // End of the buffer.
89 const char *BufferEnd;
90
91 // Location for start of file.
92 SourceLocation FileLoc;
93
94 // LangOpts enabled by this language (cache).
95 LangOptions LangOpts;
96
97 // True if lexer for _Pragma handling.
98 bool Is_PragmaLexer;
99
100 //===--------------------------------------------------------------------===//
101 // Context-specific lexing flags set by the preprocessor.
102 //
103
104 /// ExtendedTokenMode - The lexer can optionally keep comments and whitespace
105 /// and return them as tokens. This is used for -C and -CC modes, and
106 /// whitespace preservation can be useful for some clients that want to lex
107 /// the file in raw mode and get every character from the file.
108 ///
109 /// When this is set to 2 it returns comments and whitespace. When set to 1
110 /// it returns comments, when it is set to 0 it returns normal tokens only.
111 unsigned char ExtendedTokenMode;
112
113 //===--------------------------------------------------------------------===//
114 // Context that changes as the file is lexed.
115 // NOTE: any state that mutates when in raw mode must have save/restore code
116 // in Lexer::isNextPPTokenLParen.
117
118 // BufferPtr - Current pointer into the buffer. This is the next character
119 // to be lexed.
120 const char *BufferPtr;
121
122 // IsAtStartOfLine - True if the next lexed token should get the "start of
123 // line" flag set on it.
124 bool IsAtStartOfLine;
125
126 bool IsAtPhysicalStartOfLine;
127
128 bool HasLeadingSpace;
129
130 bool HasLeadingEmptyMacro;
131
132 // CurrentConflictMarkerState - The kind of conflict marker we are handling.
133 ConflictMarkerKind CurrentConflictMarkerState;
134
135 void InitLexer(const char *BufStart, const char *BufPtr, const char *BufEnd);
136
137public:
138 /// Lexer constructor - Create a new lexer object for the specified buffer
139 /// with the specified preprocessor managing the lexing process. This lexer
140 /// assumes that the associated file buffer and Preprocessor objects will
141 /// outlive it, so it doesn't take ownership of either of them.
142 Lexer(FileID FID, const llvm::MemoryBuffer *InputBuffer, Preprocessor &PP);
143
144 /// Lexer constructor - Create a new raw lexer object. This object is only
145 /// suitable for calls to 'LexFromRawLexer'. This lexer assumes that the
146 /// text range will outlive it, so it doesn't take ownership of it.
147 Lexer(SourceLocation FileLoc, const LangOptions &LangOpts,
148 const char *BufStart, const char *BufPtr, const char *BufEnd);
149
150 /// Lexer constructor - Create a new raw lexer object. This object is only
151 /// suitable for calls to 'LexFromRawLexer'. This lexer assumes that the
152 /// text range will outlive it, so it doesn't take ownership of it.
153 Lexer(FileID FID, const llvm::MemoryBuffer *InputBuffer,
154 const SourceManager &SM, const LangOptions &LangOpts);
155
156 Lexer(const Lexer &) = delete;
157 Lexer &operator=(const Lexer &) = delete;
158
159 /// Create_PragmaLexer: Lexer constructor - Create a new lexer object for
160 /// _Pragma expansion. This has a variety of magic semantics that this method
161 /// sets up. It returns a new'd Lexer that must be delete'd when done.
162 static Lexer *Create_PragmaLexer(SourceLocation SpellingLoc,
163 SourceLocation ExpansionLocStart,
164 SourceLocation ExpansionLocEnd,
165 unsigned TokLen, Preprocessor &PP);
166
167 /// getLangOpts - Return the language features currently enabled.
168 /// NOTE: this lexer modifies features as a file is parsed!
169 const LangOptions &getLangOpts() const { return LangOpts; }
170
171 /// getFileLoc - Return the File Location for the file we are lexing out of.
172 /// The physical location encodes the location where the characters come from,
173 /// the virtual location encodes where we should *claim* the characters came
174 /// from. Currently this is only used by _Pragma handling.
175 SourceLocation getFileLoc() const { return FileLoc; }
176
177private:
178 /// Lex - Return the next token in the file. If this is the end of file, it
179 /// return the tok::eof token. This implicitly involves the preprocessor.
180 bool Lex(Token &Result);
181
182public:
183 /// isPragmaLexer - Returns true if this Lexer is being used to lex a pragma.
184 bool isPragmaLexer() const { return Is_PragmaLexer; }
185
186private:
187 /// IndirectLex - An indirect call to 'Lex' that can be invoked via
188 /// the PreprocessorLexer interface.
189 void IndirectLex(Token &Result) override { Lex(Result); }
190
191public:
192 /// LexFromRawLexer - Lex a token from a designated raw lexer (one with no
193 /// associated preprocessor object. Return true if the 'next character to
194 /// read' pointer points at the end of the lexer buffer, false otherwise.
195 bool LexFromRawLexer(Token &Result) {
196 assert(LexingRawMode && "Not already in raw mode!")(static_cast <bool> (LexingRawMode && "Not already in raw mode!"
) ? void (0) : __assert_fail ("LexingRawMode && \"Not already in raw mode!\""
, "/build/llvm-toolchain-snapshot-7~svn338205/tools/clang/include/clang/Lex/Lexer.h"
, 196, __extension__ __PRETTY_FUNCTION__))
;
197 Lex(Result);
198 // Note that lexing to the end of the buffer doesn't implicitly delete the
199 // lexer when in raw mode.
200 return BufferPtr == BufferEnd;
201 }
202
203 /// isKeepWhitespaceMode - Return true if the lexer should return tokens for
204 /// every character in the file, including whitespace and comments. This
205 /// should only be used in raw mode, as the preprocessor is not prepared to
206 /// deal with the excess tokens.
207 bool isKeepWhitespaceMode() const {
208 return ExtendedTokenMode > 1;
209 }
210
211 /// SetKeepWhitespaceMode - This method lets clients enable or disable
212 /// whitespace retention mode.
213 void SetKeepWhitespaceMode(bool Val) {
214 assert((!Val || LexingRawMode || LangOpts.TraditionalCPP) &&(static_cast <bool> ((!Val || LexingRawMode || LangOpts
.TraditionalCPP) && "Can only retain whitespace in raw mode or -traditional-cpp"
) ? void (0) : __assert_fail ("(!Val || LexingRawMode || LangOpts.TraditionalCPP) && \"Can only retain whitespace in raw mode or -traditional-cpp\""
, "/build/llvm-toolchain-snapshot-7~svn338205/tools/clang/include/clang/Lex/Lexer.h"
, 215, __extension__ __PRETTY_FUNCTION__))
215 "Can only retain whitespace in raw mode or -traditional-cpp")(static_cast <bool> ((!Val || LexingRawMode || LangOpts
.TraditionalCPP) && "Can only retain whitespace in raw mode or -traditional-cpp"
) ? void (0) : __assert_fail ("(!Val || LexingRawMode || LangOpts.TraditionalCPP) && \"Can only retain whitespace in raw mode or -traditional-cpp\""
, "/build/llvm-toolchain-snapshot-7~svn338205/tools/clang/include/clang/Lex/Lexer.h"
, 215, __extension__ __PRETTY_FUNCTION__))
;
216 ExtendedTokenMode = Val ? 2 : 0;
217 }
218
219 /// inKeepCommentMode - Return true if the lexer should return comments as
220 /// tokens.
221 bool inKeepCommentMode() const {
222 return ExtendedTokenMode > 0;
223 }
224
225 /// SetCommentRetentionMode - Change the comment retention mode of the lexer
226 /// to the specified mode. This is really only useful when lexing in raw
227 /// mode, because otherwise the lexer needs to manage this.
228 void SetCommentRetentionState(bool Mode) {
229 assert(!isKeepWhitespaceMode() &&(static_cast <bool> (!isKeepWhitespaceMode() &&
"Can't play with comment retention state when retaining whitespace"
) ? void (0) : __assert_fail ("!isKeepWhitespaceMode() && \"Can't play with comment retention state when retaining whitespace\""
, "/build/llvm-toolchain-snapshot-7~svn338205/tools/clang/include/clang/Lex/Lexer.h"
, 230, __extension__ __PRETTY_FUNCTION__))
230 "Can't play with comment retention state when retaining whitespace")(static_cast <bool> (!isKeepWhitespaceMode() &&
"Can't play with comment retention state when retaining whitespace"
) ? void (0) : __assert_fail ("!isKeepWhitespaceMode() && \"Can't play with comment retention state when retaining whitespace\""
, "/build/llvm-toolchain-snapshot-7~svn338205/tools/clang/include/clang/Lex/Lexer.h"
, 230, __extension__ __PRETTY_FUNCTION__))
;
231 ExtendedTokenMode = Mode ? 1 : 0;
232 }
233
234 /// Sets the extended token mode back to its initial value, according to the
235 /// language options and preprocessor. This controls whether the lexer
236 /// produces comment and whitespace tokens.
237 ///
238 /// This requires the lexer to have an associated preprocessor. A standalone
239 /// lexer has nothing to reset to.
240 void resetExtendedTokenMode();
241
242 /// Gets source code buffer.
243 StringRef getBuffer() const {
244 return StringRef(BufferStart, BufferEnd - BufferStart);
245 }
246
247 /// ReadToEndOfLine - Read the rest of the current preprocessor line as an
248 /// uninterpreted string. This switches the lexer out of directive mode.
249 void ReadToEndOfLine(SmallVectorImpl<char> *Result = nullptr);
250
251
252 /// Diag - Forwarding function for diagnostics. This translate a source
253 /// position in the current buffer into a SourceLocation object for rendering.
254 DiagnosticBuilder Diag(const char *Loc, unsigned DiagID) const;
255
256 /// getSourceLocation - Return a source location identifier for the specified
257 /// offset in the current file.
258 SourceLocation getSourceLocation(const char *Loc, unsigned TokLen = 1) const;
259
260 /// getSourceLocation - Return a source location for the next character in
261 /// the current file.
262 SourceLocation getSourceLocation() override {
263 return getSourceLocation(BufferPtr);
264 }
265
266 /// Return the current location in the buffer.
267 const char *getBufferLocation() const { return BufferPtr; }
268
269 /// Stringify - Convert the specified string into a C string by i) escaping
270 /// '\\' and " characters and ii) replacing newline character(s) with "\\n".
271 /// If Charify is true, this escapes the ' character instead of ".
272 static std::string Stringify(StringRef Str, bool Charify = false);
273
274 /// Stringify - Convert the specified string into a C string by i) escaping
275 /// '\\' and " characters and ii) replacing newline character(s) with "\\n".
276 static void Stringify(SmallVectorImpl<char> &Str);
277
278 /// getSpelling - This method is used to get the spelling of a token into a
279 /// preallocated buffer, instead of as an std::string. The caller is required
280 /// to allocate enough space for the token, which is guaranteed to be at least
281 /// Tok.getLength() bytes long. The length of the actual result is returned.
282 ///
283 /// Note that this method may do two possible things: it may either fill in
284 /// the buffer specified with characters, or it may *change the input pointer*
285 /// to point to a constant buffer with the data already in it (avoiding a
286 /// copy). The caller is not allowed to modify the returned buffer pointer
287 /// if an internal buffer is returned.
288 static unsigned getSpelling(const Token &Tok, const char *&Buffer,
289 const SourceManager &SourceMgr,
290 const LangOptions &LangOpts,
291 bool *Invalid = nullptr);
292
293 /// getSpelling() - Return the 'spelling' of the Tok token. The spelling of a
294 /// token is the characters used to represent the token in the source file
295 /// after trigraph expansion and escaped-newline folding. In particular, this
296 /// wants to get the true, uncanonicalized, spelling of things like digraphs
297 /// UCNs, etc.
298 static std::string getSpelling(const Token &Tok,
299 const SourceManager &SourceMgr,
300 const LangOptions &LangOpts,
301 bool *Invalid = nullptr);
302
303 /// getSpelling - This method is used to get the spelling of the
304 /// token at the given source location. If, as is usually true, it
305 /// is not necessary to copy any data, then the returned string may
306 /// not point into the provided buffer.
307 ///
308 /// This method lexes at the expansion depth of the given
309 /// location and does not jump to the expansion or spelling
310 /// location.
311 static StringRef getSpelling(SourceLocation loc,
312 SmallVectorImpl<char> &buffer,
313 const SourceManager &SourceMgr,
314 const LangOptions &LangOpts,
315 bool *invalid = nullptr);
316
317 /// MeasureTokenLength - Relex the token at the specified location and return
318 /// its length in bytes in the input file. If the token needs cleaning (e.g.
319 /// includes a trigraph or an escaped newline) then this count includes bytes
320 /// that are part of that.
321 static unsigned MeasureTokenLength(SourceLocation Loc,
322 const SourceManager &SM,
323 const LangOptions &LangOpts);
324
325 /// Relex the token at the specified location.
326 /// \returns true if there was a failure, false on success.
327 static bool getRawToken(SourceLocation Loc, Token &Result,
328 const SourceManager &SM,
329 const LangOptions &LangOpts,
330 bool IgnoreWhiteSpace = false);
331
332 /// Given a location any where in a source buffer, find the location
333 /// that corresponds to the beginning of the token in which the original
334 /// source location lands.
335 static SourceLocation GetBeginningOfToken(SourceLocation Loc,
336 const SourceManager &SM,
337 const LangOptions &LangOpts);
338
339 /// Get the physical length (including trigraphs and escaped newlines) of the
340 /// first \p Characters characters of the token starting at TokStart.
341 static unsigned getTokenPrefixLength(SourceLocation TokStart,
342 unsigned Characters,
343 const SourceManager &SM,
344 const LangOptions &LangOpts);
345
346 /// AdvanceToTokenCharacter - If the current SourceLocation specifies a
347 /// location at the start of a token, return a new location that specifies a
348 /// character within the token. This handles trigraphs and escaped newlines.
349 static SourceLocation AdvanceToTokenCharacter(SourceLocation TokStart,
350 unsigned Characters,
351 const SourceManager &SM,
352 const LangOptions &LangOpts) {
353 return TokStart.getLocWithOffset(
354 getTokenPrefixLength(TokStart, Characters, SM, LangOpts));
355 }
356
357 /// Computes the source location just past the end of the
358 /// token at this source location.
359 ///
360 /// This routine can be used to produce a source location that
361 /// points just past the end of the token referenced by \p Loc, and
362 /// is generally used when a diagnostic needs to point just after a
363 /// token where it expected something different that it received. If
364 /// the returned source location would not be meaningful (e.g., if
365 /// it points into a macro), this routine returns an invalid
366 /// source location.
367 ///
368 /// \param Offset an offset from the end of the token, where the source
369 /// location should refer to. The default offset (0) produces a source
370 /// location pointing just past the end of the token; an offset of 1 produces
371 /// a source location pointing to the last character in the token, etc.
372 static SourceLocation getLocForEndOfToken(SourceLocation Loc, unsigned Offset,
373 const SourceManager &SM,
374 const LangOptions &LangOpts);
375
376 /// Given a token range, produce a corresponding CharSourceRange that
377 /// is not a token range. This allows the source range to be used by
378 /// components that don't have access to the lexer and thus can't find the
379 /// end of the range for themselves.
380 static CharSourceRange getAsCharRange(SourceRange Range,
381 const SourceManager &SM,
382 const LangOptions &LangOpts) {
383 SourceLocation End = getLocForEndOfToken(Range.getEnd(), 0, SM, LangOpts);
384 return End.isInvalid() ? CharSourceRange()
385 : CharSourceRange::getCharRange(
386 Range.getBegin(), End.getLocWithOffset(-1));
387 }
388 static CharSourceRange getAsCharRange(CharSourceRange Range,
389 const SourceManager &SM,
390 const LangOptions &LangOpts) {
391 return Range.isTokenRange()
392 ? getAsCharRange(Range.getAsRange(), SM, LangOpts)
393 : Range;
394 }
395
396 /// Returns true if the given MacroID location points at the first
397 /// token of the macro expansion.
398 ///
399 /// \param MacroBegin If non-null and function returns true, it is set to
400 /// begin location of the macro.
401 static bool isAtStartOfMacroExpansion(SourceLocation loc,
402 const SourceManager &SM,
403 const LangOptions &LangOpts,
404 SourceLocation *MacroBegin = nullptr);
405
406 /// Returns true if the given MacroID location points at the last
407 /// token of the macro expansion.
408 ///
409 /// \param MacroEnd If non-null and function returns true, it is set to
410 /// end location of the macro.
411 static bool isAtEndOfMacroExpansion(SourceLocation loc,
412 const SourceManager &SM,
413 const LangOptions &LangOpts,
414 SourceLocation *MacroEnd = nullptr);
415
416 /// Accepts a range and returns a character range with file locations.
417 ///
418 /// Returns a null range if a part of the range resides inside a macro
419 /// expansion or the range does not reside on the same FileID.
420 ///
421 /// This function is trying to deal with macros and return a range based on
422 /// file locations. The cases where it can successfully handle macros are:
423 ///
424 /// -begin or end range lies at the start or end of a macro expansion, in
425 /// which case the location will be set to the expansion point, e.g:
426 /// \#define M 1 2
427 /// a M
428 /// If you have a range [a, 2] (where 2 came from the macro), the function
429 /// will return a range for "a M"
430 /// if you have range [a, 1], the function will fail because the range
431 /// overlaps with only a part of the macro
432 ///
433 /// -The macro is a function macro and the range can be mapped to the macro
434 /// arguments, e.g:
435 /// \#define M 1 2
436 /// \#define FM(x) x
437 /// FM(a b M)
438 /// if you have range [b, 2], the function will return the file range "b M"
439 /// inside the macro arguments.
440 /// if you have range [a, 2], the function will return the file range
441 /// "FM(a b M)" since the range includes all of the macro expansion.
442 static CharSourceRange makeFileCharRange(CharSourceRange Range,
443 const SourceManager &SM,
444 const LangOptions &LangOpts);
445
446 /// Returns a string for the source that the range encompasses.
447 static StringRef getSourceText(CharSourceRange Range,
448 const SourceManager &SM,
449 const LangOptions &LangOpts,
450 bool *Invalid = nullptr);
451
452 /// Retrieve the name of the immediate macro expansion.
453 ///
454 /// This routine starts from a source location, and finds the name of the macro
455 /// responsible for its immediate expansion. It looks through any intervening
456 /// macro argument expansions to compute this. It returns a StringRef which
457 /// refers to the SourceManager-owned buffer of the source where that macro
458 /// name is spelled. Thus, the result shouldn't out-live that SourceManager.
459 static StringRef getImmediateMacroName(SourceLocation Loc,
460 const SourceManager &SM,
461 const LangOptions &LangOpts);
462
463 /// Retrieve the name of the immediate macro expansion.
464 ///
465 /// This routine starts from a source location, and finds the name of the
466 /// macro responsible for its immediate expansion. It looks through any
467 /// intervening macro argument expansions to compute this. It returns a
468 /// StringRef which refers to the SourceManager-owned buffer of the source
469 /// where that macro name is spelled. Thus, the result shouldn't out-live
470 /// that SourceManager.
471 ///
472 /// This differs from Lexer::getImmediateMacroName in that any macro argument
473 /// location will result in the topmost function macro that accepted it.
474 /// e.g.
475 /// \code
476 /// MAC1( MAC2(foo) )
477 /// \endcode
478 /// for location of 'foo' token, this function will return "MAC1" while
479 /// Lexer::getImmediateMacroName will return "MAC2".
480 static StringRef getImmediateMacroNameForDiagnostics(
481 SourceLocation Loc, const SourceManager &SM, const LangOptions &LangOpts);
482
483 /// Compute the preamble of the given file.
484 ///
485 /// The preamble of a file contains the initial comments, include directives,
486 /// and other preprocessor directives that occur before the code in this
487 /// particular file actually begins. The preamble of the main source file is
488 /// a potential prefix header.
489 ///
490 /// \param Buffer The memory buffer containing the file's contents.
491 ///
492 /// \param MaxLines If non-zero, restrict the length of the preamble
493 /// to fewer than this number of lines.
494 ///
495 /// \returns The offset into the file where the preamble ends and the rest
496 /// of the file begins along with a boolean value indicating whether
497 /// the preamble ends at the beginning of a new line.
498 static PreambleBounds ComputePreamble(StringRef Buffer,
499 const LangOptions &LangOpts,
500 unsigned MaxLines = 0);
501
502 /// Finds the token that comes right after the given location.
503 ///
504 /// Returns the next token, or none if the location is inside a macro.
505 static Optional<Token> findNextToken(SourceLocation Loc,
506 const SourceManager &SM,
507 const LangOptions &LangOpts);
508
509 /// Checks that the given token is the first token that occurs after
510 /// the given location (this excludes comments and whitespace). Returns the
511 /// location immediately after the specified token. If the token is not found
512 /// or the location is inside a macro, the returned source location will be
513 /// invalid.
514 static SourceLocation findLocationAfterToken(SourceLocation loc,
515 tok::TokenKind TKind,
516 const SourceManager &SM,
517 const LangOptions &LangOpts,
518 bool SkipTrailingWhitespaceAndNewLine);
519
520 /// Returns true if the given character could appear in an identifier.
521 static bool isIdentifierBodyChar(char c, const LangOptions &LangOpts);
522
523 /// Checks whether new line pointed by Str is preceded by escape
524 /// sequence.
525 static bool isNewLineEscaped(const char *BufferStart, const char *Str);
526
527 /// getCharAndSizeNoWarn - Like the getCharAndSize method, but does not ever
528 /// emit a warning.
529 static inline char getCharAndSizeNoWarn(const char *Ptr, unsigned &Size,
530 const LangOptions &LangOpts) {
531 // If this is not a trigraph and not a UCN or escaped newline, return
532 // quickly.
533 if (isObviouslySimpleCharacter(Ptr[0])) {
534 Size = 1;
535 return *Ptr;
536 }
537
538 Size = 0;
539 return getCharAndSizeSlowNoWarn(Ptr, Size, LangOpts);
540 }
541
542 /// Returns the leading whitespace for line that corresponds to the given
543 /// location \p Loc.
544 static StringRef getIndentationForLine(SourceLocation Loc,
545 const SourceManager &SM);
546
547private:
548 //===--------------------------------------------------------------------===//
549 // Internal implementation interfaces.
550
551 /// LexTokenInternal - Internal interface to lex a preprocessing token. Called
552 /// by Lex.
553 ///
554 bool LexTokenInternal(Token &Result, bool TokAtPhysicalStartOfLine);
555
556 bool CheckUnicodeWhitespace(Token &Result, uint32_t C, const char *CurPtr);
557
558 /// Given that a token begins with the Unicode character \p C, figure out
559 /// what kind of token it is and dispatch to the appropriate lexing helper
560 /// function.
561 bool LexUnicode(Token &Result, uint32_t C, const char *CurPtr);
562
563 /// FormTokenWithChars - When we lex a token, we have identified a span
564 /// starting at BufferPtr, going to TokEnd that forms the token. This method
565 /// takes that range and assigns it to the token as its location and size. In
566 /// addition, since tokens cannot overlap, this also updates BufferPtr to be
567 /// TokEnd.
568 void FormTokenWithChars(Token &Result, const char *TokEnd,
569 tok::TokenKind Kind) {
570 unsigned TokLen = TokEnd-BufferPtr;
571 Result.setLength(TokLen);
572 Result.setLocation(getSourceLocation(BufferPtr, TokLen));
573 Result.setKind(Kind);
574 BufferPtr = TokEnd;
575 }
576
577 /// isNextPPTokenLParen - Return 1 if the next unexpanded token will return a
578 /// tok::l_paren token, 0 if it is something else and 2 if there are no more
579 /// tokens in the buffer controlled by this lexer.
580 unsigned isNextPPTokenLParen();
581
582 //===--------------------------------------------------------------------===//
583 // Lexer character reading interfaces.
584
585 // This lexer is built on two interfaces for reading characters, both of which
586 // automatically provide phase 1/2 translation. getAndAdvanceChar is used
587 // when we know that we will be reading a character from the input buffer and
588 // that this character will be part of the result token. This occurs in (f.e.)
589 // string processing, because we know we need to read until we find the
590 // closing '"' character.
591 //
592 // The second interface is the combination of getCharAndSize with
593 // ConsumeChar. getCharAndSize reads a phase 1/2 translated character,
594 // returning it and its size. If the lexer decides that this character is
595 // part of the current token, it calls ConsumeChar on it. This two stage
596 // approach allows us to emit diagnostics for characters (e.g. warnings about
597 // trigraphs), knowing that they only are emitted if the character is
598 // consumed.
599
600 /// isObviouslySimpleCharacter - Return true if the specified character is
601 /// obviously the same in translation phase 1 and translation phase 3. This
602 /// can return false for characters that end up being the same, but it will
603 /// never return true for something that needs to be mapped.
604 static bool isObviouslySimpleCharacter(char C) {
605 return C != '?' && C != '\\';
606 }
607
608 /// getAndAdvanceChar - Read a single 'character' from the specified buffer,
609 /// advance over it, and return it. This is tricky in several cases. Here we
610 /// just handle the trivial case and fall-back to the non-inlined
611 /// getCharAndSizeSlow method to handle the hard case.
612 inline char getAndAdvanceChar(const char *&Ptr, Token &Tok) {
613 // If this is not a trigraph and not a UCN or escaped newline, return
614 // quickly.
615 if (isObviouslySimpleCharacter(Ptr[0])) return *Ptr++;
3
Taking false branch
616
617 unsigned Size = 0;
618 char C = getCharAndSizeSlow(Ptr, Size, &Tok);
4
Calling 'Lexer::getCharAndSizeSlow'
619 Ptr += Size;
620 return C;
621 }
622
623 /// ConsumeChar - When a character (identified by getCharAndSize) is consumed
624 /// and added to a given token, check to see if there are diagnostics that
625 /// need to be emitted or flags that need to be set on the token. If so, do
626 /// it.
627 const char *ConsumeChar(const char *Ptr, unsigned Size, Token &Tok) {
628 // Normal case, we consumed exactly one token. Just return it.
629 if (Size == 1)
630 return Ptr+Size;
631
632 // Otherwise, re-lex the character with a current token, allowing
633 // diagnostics to be emitted and flags to be set.
634 Size = 0;
635 getCharAndSizeSlow(Ptr, Size, &Tok);
636 return Ptr+Size;
637 }
638
639 /// getCharAndSize - Peek a single 'character' from the specified buffer,
640 /// get its size, and return it. This is tricky in several cases. Here we
641 /// just handle the trivial case and fall-back to the non-inlined
642 /// getCharAndSizeSlow method to handle the hard case.
643 inline char getCharAndSize(const char *Ptr, unsigned &Size) {
644 // If this is not a trigraph and not a UCN or escaped newline, return
645 // quickly.
646 if (isObviouslySimpleCharacter(Ptr[0])) {
647 Size = 1;
648 return *Ptr;
649 }
650
651 Size = 0;
652 return getCharAndSizeSlow(Ptr, Size);
653 }
654
655 /// getCharAndSizeSlow - Handle the slow/uncommon case of the getCharAndSize
656 /// method.
657 char getCharAndSizeSlow(const char *Ptr, unsigned &Size,
658 Token *Tok = nullptr);
659
660 /// getEscapedNewLineSize - Return the size of the specified escaped newline,
661 /// or 0 if it is not an escaped newline. P[-1] is known to be a "\" on entry
662 /// to this function.
663 static unsigned getEscapedNewLineSize(const char *P);
664
665 /// SkipEscapedNewLines - If P points to an escaped newline (or a series of
666 /// them), skip over them and return the first non-escaped-newline found,
667 /// otherwise return P.
668 static const char *SkipEscapedNewLines(const char *P);
669
670 /// getCharAndSizeSlowNoWarn - Same as getCharAndSizeSlow, but never emits a
671 /// diagnostic.
672 static char getCharAndSizeSlowNoWarn(const char *Ptr, unsigned &Size,
673 const LangOptions &LangOpts);
674
675 //===--------------------------------------------------------------------===//
676 // Other lexer functions.
677
678 void SetByteOffset(unsigned Offset, bool StartOfLine);
679
680 void PropagateLineStartLeadingSpaceInfo(Token &Result);
681
682 const char *LexUDSuffix(Token &Result, const char *CurPtr,
683 bool IsStringLiteral);
684
685 // Helper functions to lex the remainder of a token of the specific type.
686 bool LexIdentifier (Token &Result, const char *CurPtr);
687 bool LexNumericConstant (Token &Result, const char *CurPtr);
688 bool LexStringLiteral (Token &Result, const char *CurPtr,
689 tok::TokenKind Kind);
690 bool LexRawStringLiteral (Token &Result, const char *CurPtr,
691 tok::TokenKind Kind);
692 bool LexAngledStringLiteral(Token &Result, const char *CurPtr);
693 bool LexCharConstant (Token &Result, const char *CurPtr,
694 tok::TokenKind Kind);
695 bool LexEndOfFile (Token &Result, const char *CurPtr);
696 bool SkipWhitespace (Token &Result, const char *CurPtr,
697 bool &TokAtPhysicalStartOfLine);
698 bool SkipLineComment (Token &Result, const char *CurPtr,
699 bool &TokAtPhysicalStartOfLine);
700 bool SkipBlockComment (Token &Result, const char *CurPtr,
701 bool &TokAtPhysicalStartOfLine);
702 bool SaveLineComment (Token &Result, const char *CurPtr);
703
704 bool IsStartOfConflictMarker(const char *CurPtr);
705 bool HandleEndOfConflictMarker(const char *CurPtr);
706
707 bool lexEditorPlaceholder(Token &Result, const char *CurPtr);
708
709 bool isCodeCompletionPoint(const char *CurPtr) const;
710 void cutOffLexing() { BufferPtr = BufferEnd; }
711
712 bool isHexaLiteral(const char *Start, const LangOptions &LangOpts);
713
714
715 /// Read a universal character name.
716 ///
717 /// \param CurPtr The position in the source buffer after the initial '\'.
718 /// If the UCN is syntactically well-formed (but not necessarily
719 /// valid), this parameter will be updated to point to the
720 /// character after the UCN.
721 /// \param SlashLoc The position in the source buffer of the '\'.
722 /// \param Tok The token being formed. Pass \c nullptr to suppress diagnostics
723 /// and handle token formation in the caller.
724 ///
725 /// \return The Unicode codepoint specified by the UCN, or 0 if the UCN is
726 /// invalid.
727 uint32_t tryReadUCN(const char *&CurPtr, const char *SlashLoc, Token *Tok);
728
729 /// Try to consume a UCN as part of an identifier at the current
730 /// location.
731 /// \param CurPtr Initially points to the range of characters in the source
732 /// buffer containing the '\'. Updated to point past the end of
733 /// the UCN on success.
734 /// \param Size The number of characters occupied by the '\' (including
735 /// trigraphs and escaped newlines).
736 /// \param Result The token being produced. Marked as containing a UCN on
737 /// success.
738 /// \return \c true if a UCN was lexed and it produced an acceptable
739 /// identifier character, \c false otherwise.
740 bool tryConsumeIdentifierUCN(const char *&CurPtr, unsigned Size,
741 Token &Result);
742
743 /// Try to consume an identifier character encoded in UTF-8.
744 /// \param CurPtr Points to the start of the (potential) UTF-8 code unit
745 /// sequence. On success, updated to point past the end of it.
746 /// \return \c true if a UTF-8 sequence mapping to an acceptable identifier
747 /// character was lexed, \c false otherwise.
748 bool tryConsumeIdentifierUTF8Char(const char *&CurPtr);
749};
750
751} // namespace clang
752
753#endif // LLVM_CLANG_LEX_LEXER_H

/build/llvm-toolchain-snapshot-7~svn338205/tools/clang/include/clang/Lex/Token.h

1//===--- Token.h - Token interface ------------------------------*- C++ -*-===//
2//
3// The LLVM Compiler Infrastructure
4//
5// This file is distributed under the University of Illinois Open Source
6// License. See LICENSE.TXT for details.
7//
8//===----------------------------------------------------------------------===//
9//
10// This file defines the Token interface.
11//
12//===----------------------------------------------------------------------===//
13
14#ifndef LLVM_CLANG_LEX_TOKEN_H
15#define LLVM_CLANG_LEX_TOKEN_H
16
17#include "clang/Basic/SourceLocation.h"
18#include "clang/Basic/TokenKinds.h"
19#include "llvm/ADT/StringRef.h"
20#include <cassert>
21
22namespace clang {
23
24class IdentifierInfo;
25
26/// Token - This structure provides full information about a lexed token.
27/// It is not intended to be space efficient, it is intended to return as much
28/// information as possible about each returned token. This is expected to be
29/// compressed into a smaller form if memory footprint is important.
30///
31/// The parser can create a special "annotation token" representing a stream of
32/// tokens that were parsed and semantically resolved, e.g.: "foo::MyClass<int>"
33/// can be represented by a single typename annotation token that carries
34/// information about the SourceRange of the tokens and the type object.
35class Token {
36 /// The location of the token. This is actually a SourceLocation.
37 unsigned Loc;
38
39 // Conceptually these next two fields could be in a union. However, this
40 // causes gcc 4.2 to pessimize LexTokenInternal, a very performance critical
41 // routine. Keeping as separate members with casts until a more beautiful fix
42 // presents itself.
43
44 /// UintData - This holds either the length of the token text, when
45 /// a normal token, or the end of the SourceRange when an annotation
46 /// token.
47 unsigned UintData;
48
49 /// PtrData - This is a union of four different pointer types, which depends
50 /// on what type of token this is:
51 /// Identifiers, keywords, etc:
52 /// This is an IdentifierInfo*, which contains the uniqued identifier
53 /// spelling.
54 /// Literals: isLiteral() returns true.
55 /// This is a pointer to the start of the token in a text buffer, which
56 /// may be dirty (have trigraphs / escaped newlines).
57 /// Annotations (resolved type names, C++ scopes, etc): isAnnotation().
58 /// This is a pointer to sema-specific data for the annotation token.
59 /// Eof:
60 // This is a pointer to a Decl.
61 /// Other:
62 /// This is null.
63 void *PtrData;
64
65 /// Kind - The actual flavor of token this is.
66 tok::TokenKind Kind;
67
68 /// Flags - Bits we track about this token, members of the TokenFlags enum.
69 unsigned short Flags;
70
71public:
72 // Various flags set per token:
73 enum TokenFlags {
74 StartOfLine = 0x01, // At start of line or only after whitespace
75 // (considering the line after macro expansion).
76 LeadingSpace = 0x02, // Whitespace exists before this token (considering
77 // whitespace after macro expansion).
78 DisableExpand = 0x04, // This identifier may never be macro expanded.
79 NeedsCleaning = 0x08, // Contained an escaped newline or trigraph.
80 LeadingEmptyMacro = 0x10, // Empty macro exists before this token.
81 HasUDSuffix = 0x20, // This string or character literal has a ud-suffix.
82 HasUCN = 0x40, // This identifier contains a UCN.
83 IgnoredComma = 0x80, // This comma is not a macro argument separator (MS).
84 StringifiedInMacro = 0x100, // This string or character literal is formed by
85 // macro stringizing or charizing operator.
86 CommaAfterElided = 0x200, // The comma following this token was elided (MS).
87 IsEditorPlaceholder = 0x400, // This identifier is a placeholder.
88 };
89
90 tok::TokenKind getKind() const { return Kind; }
91 void setKind(tok::TokenKind K) { Kind = K; }
92
93 /// is/isNot - Predicates to check if this token is a specific kind, as in
94 /// "if (Tok.is(tok::l_brace)) {...}".
95 bool is(tok::TokenKind K) const { return Kind == K; }
96 bool isNot(tok::TokenKind K) const { return Kind != K; }
97 bool isOneOf(tok::TokenKind K1, tok::TokenKind K2) const {
98 return is(K1) || is(K2);
99 }
100 template <typename... Ts>
101 bool isOneOf(tok::TokenKind K1, tok::TokenKind K2, Ts... Ks) const {
102 return is(K1) || isOneOf(K2, Ks...);
103 }
104
105 /// Return true if this is a raw identifier (when lexing
106 /// in raw mode) or a non-keyword identifier (when lexing in non-raw mode).
107 bool isAnyIdentifier() const {
108 return tok::isAnyIdentifier(getKind());
109 }
110
111 /// Return true if this is a "literal", like a numeric
112 /// constant, string, etc.
113 bool isLiteral() const {
114 return tok::isLiteral(getKind());
115 }
116
117 /// Return true if this is any of tok::annot_* kind tokens.
118 bool isAnnotation() const {
119 return tok::isAnnotation(getKind());
120 }
121
122 /// Return a source location identifier for the specified
123 /// offset in the current file.
124 SourceLocation getLocation() const {
125 return SourceLocation::getFromRawEncoding(Loc);
126 }
127 unsigned getLength() const {
128 assert(!isAnnotation() && "Annotation tokens have no length field")(static_cast <bool> (!isAnnotation() && "Annotation tokens have no length field"
) ? void (0) : __assert_fail ("!isAnnotation() && \"Annotation tokens have no length field\""
, "/build/llvm-toolchain-snapshot-7~svn338205/tools/clang/include/clang/Lex/Token.h"
, 128, __extension__ __PRETTY_FUNCTION__))
;
129 return UintData;
130 }
131
132 void setLocation(SourceLocation L) { Loc = L.getRawEncoding(); }
133 void setLength(unsigned Len) {
134 assert(!isAnnotation() && "Annotation tokens have no length field")(static_cast <bool> (!isAnnotation() && "Annotation tokens have no length field"
) ? void (0) : __assert_fail ("!isAnnotation() && \"Annotation tokens have no length field\""
, "/build/llvm-toolchain-snapshot-7~svn338205/tools/clang/include/clang/Lex/Token.h"
, 134, __extension__ __PRETTY_FUNCTION__))
;
135 UintData = Len;
136 }
137
138 SourceLocation getAnnotationEndLoc() const {
139 assert(isAnnotation() && "Used AnnotEndLocID on non-annotation token")(static_cast <bool> (isAnnotation() && "Used AnnotEndLocID on non-annotation token"
) ? void (0) : __assert_fail ("isAnnotation() && \"Used AnnotEndLocID on non-annotation token\""
, "/build/llvm-toolchain-snapshot-7~svn338205/tools/clang/include/clang/Lex/Token.h"
, 139, __extension__ __PRETTY_FUNCTION__))
;
140 return SourceLocation::getFromRawEncoding(UintData ? UintData : Loc);
141 }
142 void setAnnotationEndLoc(SourceLocation L) {
143 assert(isAnnotation() && "Used AnnotEndLocID on non-annotation token")(static_cast <bool> (isAnnotation() && "Used AnnotEndLocID on non-annotation token"
) ? void (0) : __assert_fail ("isAnnotation() && \"Used AnnotEndLocID on non-annotation token\""
, "/build/llvm-toolchain-snapshot-7~svn338205/tools/clang/include/clang/Lex/Token.h"
, 143, __extension__ __PRETTY_FUNCTION__))
;
144 UintData = L.getRawEncoding();
145 }
146
147 SourceLocation getLastLoc() const {
148 return isAnnotation() ? getAnnotationEndLoc() : getLocation();
149 }
150
151 SourceLocation getEndLoc() const {
152 return isAnnotation() ? getAnnotationEndLoc()
153 : getLocation().getLocWithOffset(getLength());
154 }
155
156 /// SourceRange of the group of tokens that this annotation token
157 /// represents.
158 SourceRange getAnnotationRange() const {
159 return SourceRange(getLocation(), getAnnotationEndLoc());
160 }
161 void setAnnotationRange(SourceRange R) {
162 setLocation(R.getBegin());
163 setAnnotationEndLoc(R.getEnd());
164 }
165
166 const char *getName() const { return tok::getTokenName(Kind); }
167
168 /// Reset all flags to cleared.
169 void startToken() {
170 Kind = tok::unknown;
171 Flags = 0;
172 PtrData = nullptr;
173 UintData = 0;
174 Loc = SourceLocation().getRawEncoding();
175 }
176
177 IdentifierInfo *getIdentifierInfo() const {
178 assert(isNot(tok::raw_identifier) &&(static_cast <bool> (isNot(tok::raw_identifier) &&
"getIdentifierInfo() on a tok::raw_identifier token!") ? void
(0) : __assert_fail ("isNot(tok::raw_identifier) && \"getIdentifierInfo() on a tok::raw_identifier token!\""
, "/build/llvm-toolchain-snapshot-7~svn338205/tools/clang/include/clang/Lex/Token.h"
, 179, __extension__ __PRETTY_FUNCTION__))
179 "getIdentifierInfo() on a tok::raw_identifier token!")(static_cast <bool> (isNot(tok::raw_identifier) &&
"getIdentifierInfo() on a tok::raw_identifier token!") ? void
(0) : __assert_fail ("isNot(tok::raw_identifier) && \"getIdentifierInfo() on a tok::raw_identifier token!\""
, "/build/llvm-toolchain-snapshot-7~svn338205/tools/clang/include/clang/Lex/Token.h"
, 179, __extension__ __PRETTY_FUNCTION__))
;
180 assert(!isAnnotation() &&(static_cast <bool> (!isAnnotation() && "getIdentifierInfo() on an annotation token!"
) ? void (0) : __assert_fail ("!isAnnotation() && \"getIdentifierInfo() on an annotation token!\""
, "/build/llvm-toolchain-snapshot-7~svn338205/tools/clang/include/clang/Lex/Token.h"
, 181, __extension__ __PRETTY_FUNCTION__))
181 "getIdentifierInfo() on an annotation token!")(static_cast <bool> (!isAnnotation() && "getIdentifierInfo() on an annotation token!"
) ? void (0) : __assert_fail ("!isAnnotation() && \"getIdentifierInfo() on an annotation token!\""
, "/build/llvm-toolchain-snapshot-7~svn338205/tools/clang/include/clang/Lex/Token.h"
, 181, __extension__ __PRETTY_FUNCTION__))
;
182 if (isLiteral()) return nullptr;
183 if (is(tok::eof)) return nullptr;
184 return (IdentifierInfo*) PtrData;
185 }
186 void setIdentifierInfo(IdentifierInfo *II) {
187 PtrData = (void*) II;
188 }
189
190 const void *getEofData() const {
191 assert(is(tok::eof))(static_cast <bool> (is(tok::eof)) ? void (0) : __assert_fail
("is(tok::eof)", "/build/llvm-toolchain-snapshot-7~svn338205/tools/clang/include/clang/Lex/Token.h"
, 191, __extension__ __PRETTY_FUNCTION__))
;
192 return reinterpret_cast<const void *>(PtrData);
193 }
194 void setEofData(const void *D) {
195 assert(is(tok::eof))(static_cast <bool> (is(tok::eof)) ? void (0) : __assert_fail
("is(tok::eof)", "/build/llvm-toolchain-snapshot-7~svn338205/tools/clang/include/clang/Lex/Token.h"
, 195, __extension__ __PRETTY_FUNCTION__))
;
196 assert(!PtrData)(static_cast <bool> (!PtrData) ? void (0) : __assert_fail
("!PtrData", "/build/llvm-toolchain-snapshot-7~svn338205/tools/clang/include/clang/Lex/Token.h"
, 196, __extension__ __PRETTY_FUNCTION__))
;
197 PtrData = const_cast<void *>(D);
198 }
199
200 /// getRawIdentifier - For a raw identifier token (i.e., an identifier
201 /// lexed in raw mode), returns a reference to the text substring in the
202 /// buffer if known.
203 StringRef getRawIdentifier() const {
204 assert(is(tok::raw_identifier))(static_cast <bool> (is(tok::raw_identifier)) ? void (0
) : __assert_fail ("is(tok::raw_identifier)", "/build/llvm-toolchain-snapshot-7~svn338205/tools/clang/include/clang/Lex/Token.h"
, 204, __extension__ __PRETTY_FUNCTION__))
;
205 return StringRef(reinterpret_cast<const char *>(PtrData), getLength());
206 }
207 void setRawIdentifierData(const char *Ptr) {
208 assert(is(tok::raw_identifier))(static_cast <bool> (is(tok::raw_identifier)) ? void (0
) : __assert_fail ("is(tok::raw_identifier)", "/build/llvm-toolchain-snapshot-7~svn338205/tools/clang/include/clang/Lex/Token.h"
, 208, __extension__ __PRETTY_FUNCTION__))
;
209 PtrData = const_cast<char*>(Ptr);
210 }
211
212 /// getLiteralData - For a literal token (numeric constant, string, etc), this
213 /// returns a pointer to the start of it in the text buffer if known, null
214 /// otherwise.
215 const char *getLiteralData() const {
216 assert(isLiteral() && "Cannot get literal data of non-literal")(static_cast <bool> (isLiteral() && "Cannot get literal data of non-literal"
) ? void (0) : __assert_fail ("isLiteral() && \"Cannot get literal data of non-literal\""
, "/build/llvm-toolchain-snapshot-7~svn338205/tools/clang/include/clang/Lex/Token.h"
, 216, __extension__ __PRETTY_FUNCTION__))
;
217 return reinterpret_cast<const char*>(PtrData);
218 }
219 void setLiteralData(const char *Ptr) {
220 assert(isLiteral() && "Cannot set literal data of non-literal")(static_cast <bool> (isLiteral() && "Cannot set literal data of non-literal"
) ? void (0) : __assert_fail ("isLiteral() && \"Cannot set literal data of non-literal\""
, "/build/llvm-toolchain-snapshot-7~svn338205/tools/clang/include/clang/Lex/Token.h"
, 220, __extension__ __PRETTY_FUNCTION__))
;
221 PtrData = const_cast<char*>(Ptr);
222 }
223
224 void *getAnnotationValue() const {
225 assert(isAnnotation() && "Used AnnotVal on non-annotation token")(static_cast <bool> (isAnnotation() && "Used AnnotVal on non-annotation token"
) ? void (0) : __assert_fail ("isAnnotation() && \"Used AnnotVal on non-annotation token\""
, "/build/llvm-toolchain-snapshot-7~svn338205/tools/clang/include/clang/Lex/Token.h"
, 225, __extension__ __PRETTY_FUNCTION__))
;
226 return PtrData;
227 }
228 void setAnnotationValue(void *val) {
229 assert(isAnnotation() && "Used AnnotVal on non-annotation token")(static_cast <bool> (isAnnotation() && "Used AnnotVal on non-annotation token"
) ? void (0) : __assert_fail ("isAnnotation() && \"Used AnnotVal on non-annotation token\""
, "/build/llvm-toolchain-snapshot-7~svn338205/tools/clang/include/clang/Lex/Token.h"
, 229, __extension__ __PRETTY_FUNCTION__))
;
230 PtrData = val;
231 }
232
233 /// Set the specified flag.
234 void setFlag(TokenFlags Flag) {
235 Flags |= Flag;
11
The left expression of the compound assignment is an uninitialized value. The computed value will also be garbage
236 }
237
238 /// Get the specified flag.
239 bool getFlag(TokenFlags Flag) const {
240 return (Flags & Flag) != 0;
241 }
242
243 /// Unset the specified flag.
244 void clearFlag(TokenFlags Flag) {
245 Flags &= ~Flag;
246 }
247
248 /// Return the internal represtation of the flags.
249 ///
250 /// This is only intended for low-level operations such as writing tokens to
251 /// disk.
252 unsigned getFlags() const {
253 return Flags;
254 }
255
256 /// Set a flag to either true or false.
257 void setFlagValue(TokenFlags Flag, bool Val) {
258 if (Val)
259 setFlag(Flag);
260 else
261 clearFlag(Flag);
262 }
263
264 /// isAtStartOfLine - Return true if this token is at the start of a line.
265 ///
266 bool isAtStartOfLine() const { return getFlag(StartOfLine); }
267
268 /// Return true if this token has whitespace before it.
269 ///
270 bool hasLeadingSpace() const { return getFlag(LeadingSpace); }
271
272 /// Return true if this identifier token should never
273 /// be expanded in the future, due to C99 6.10.3.4p2.
274 bool isExpandDisabled() const { return getFlag(DisableExpand); }
275
276 /// Return true if we have an ObjC keyword identifier.
277 bool isObjCAtKeyword(tok::ObjCKeywordKind objcKey) const;
278
279 /// Return the ObjC keyword kind.
280 tok::ObjCKeywordKind getObjCKeywordID() const;
281
282 /// Return true if this token has trigraphs or escaped newlines in it.
283 bool needsCleaning() const { return getFlag(NeedsCleaning); }
284
285 /// Return true if this token has an empty macro before it.
286 ///
287 bool hasLeadingEmptyMacro() const { return getFlag(LeadingEmptyMacro); }
288
289 /// Return true if this token is a string or character literal which
290 /// has a ud-suffix.
291 bool hasUDSuffix() const { return getFlag(HasUDSuffix); }
292
293 /// Returns true if this token contains a universal character name.
294 bool hasUCN() const { return getFlag(HasUCN); }
295
296 /// Returns true if this token is formed by macro by stringizing or charizing
297 /// operator.
298 bool stringifiedInMacro() const { return getFlag(StringifiedInMacro); }
299
300 /// Returns true if the comma after this token was elided.
301 bool commaAfterElided() const { return getFlag(CommaAfterElided); }
302
303 /// Returns true if this token is an editor placeholder.
304 ///
305 /// Editor placeholders are produced by the code-completion engine and are
306 /// represented as characters between '<#' and '#>' in the source code. The
307 /// lexer uses identifier tokens to represent placeholders.
308 bool isEditorPlaceholder() const { return getFlag(IsEditorPlaceholder); }
309};
310
311/// Information about the conditional stack (\#if directives)
312/// currently active.
313struct PPConditionalInfo {
314 /// Location where the conditional started.
315 SourceLocation IfLoc;
316
317 /// True if this was contained in a skipping directive, e.g.,
318 /// in a "\#if 0" block.
319 bool WasSkipping;
320
321 /// True if we have emitted tokens already, and now we're in
322 /// an \#else block or something. Only useful in Skipping blocks.
323 bool FoundNonSkip;
324
325 /// True if we've seen a \#else in this block. If so,
326 /// \#elif/\#else directives are not allowed.
327 bool FoundElse;
328};
329
330} // end namespace clang
331
332namespace llvm {
333 template <>
334 struct isPodLike<clang::Token> { static const bool value = true; };
335} // end namespace llvm
336
337#endif // LLVM_CLANG_LEX_TOKEN_H