Bug Summary

File: tools/clang/include/clang/Lex/Token.h
Warning: line 235, column 11
The left expression of the compound assignment is an uninitialized value. The computed value will also be garbage

Annotated Source Code

Press '?' to see keyboard shortcuts

clang -cc1 -triple x86_64-pc-linux-gnu -analyze -disable-free -disable-llvm-verifier -discard-value-names -main-file-name Lexer.cpp -analyzer-store=region -analyzer-opt-analyze-nested-blocks -analyzer-checker=core -analyzer-checker=apiModeling -analyzer-checker=unix -analyzer-checker=deadcode -analyzer-checker=cplusplus -analyzer-checker=security.insecureAPI.UncheckedReturn -analyzer-checker=security.insecureAPI.getpw -analyzer-checker=security.insecureAPI.gets -analyzer-checker=security.insecureAPI.mktemp -analyzer-checker=security.insecureAPI.mkstemp -analyzer-checker=security.insecureAPI.vfork -analyzer-checker=nullability.NullPassedToNonnull -analyzer-checker=nullability.NullReturnedFromNonnull -analyzer-output plist -w -analyzer-config-compatibility-mode=true -mrelocation-model pic -pic-level 2 -mthread-model posix -relaxed-aliasing -fmath-errno -masm-verbose -mconstructor-aliases -munwind-tables -fuse-init-array -target-cpu x86-64 -dwarf-column-info -debugger-tuning=gdb -momit-leaf-frame-pointer -ffunction-sections -fdata-sections -resource-dir /usr/lib/llvm-8/lib/clang/8.0.0 -D CLANG_VENDOR="Debian " -D _DEBUG -D _GNU_SOURCE -D __STDC_CONSTANT_MACROS -D __STDC_FORMAT_MACROS -D __STDC_LIMIT_MACROS -I /build/llvm-toolchain-snapshot-8~svn350071/build-llvm/tools/clang/lib/Lex -I /build/llvm-toolchain-snapshot-8~svn350071/tools/clang/lib/Lex -I /build/llvm-toolchain-snapshot-8~svn350071/tools/clang/include -I /build/llvm-toolchain-snapshot-8~svn350071/build-llvm/tools/clang/include -I /build/llvm-toolchain-snapshot-8~svn350071/build-llvm/include -I /build/llvm-toolchain-snapshot-8~svn350071/include -U NDEBUG -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/6.3.0/../../../../include/c++/6.3.0 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/6.3.0/../../../../include/x86_64-linux-gnu/c++/6.3.0 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/6.3.0/../../../../include/x86_64-linux-gnu/c++/6.3.0 -internal-isystem 
/usr/lib/gcc/x86_64-linux-gnu/6.3.0/../../../../include/c++/6.3.0/backward -internal-isystem /usr/include/clang/8.0.0/include/ -internal-isystem /usr/local/include -internal-isystem /usr/lib/llvm-8/lib/clang/8.0.0/include -internal-externc-isystem /usr/include/x86_64-linux-gnu -internal-externc-isystem /include -internal-externc-isystem /usr/include -O2 -Wno-unused-parameter -Wwrite-strings -Wno-missing-field-initializers -Wno-long-long -Wno-maybe-uninitialized -Wno-comment -std=c++11 -fdeprecated-macro -fdebug-compilation-dir /build/llvm-toolchain-snapshot-8~svn350071/build-llvm/tools/clang/lib/Lex -fdebug-prefix-map=/build/llvm-toolchain-snapshot-8~svn350071=. -ferror-limit 19 -fmessage-length 0 -fvisibility-inlines-hidden -stack-protector 2 -fobjc-runtime=gcc -fno-common -fdiagnostics-show-option -vectorize-loops -vectorize-slp -analyzer-output=html -analyzer-config stable-report-filename=true -o /tmp/scan-build-2018-12-27-042839-1215-1 -x c++ /build/llvm-toolchain-snapshot-8~svn350071/tools/clang/lib/Lex/Lexer.cpp -faddrsig

/build/llvm-toolchain-snapshot-8~svn350071/tools/clang/lib/Lex/Lexer.cpp

1//===- Lexer.cpp - C Language Family Lexer --------------------------------===//
2//
3// The LLVM Compiler Infrastructure
4//
5// This file is distributed under the University of Illinois Open Source
6// License. See LICENSE.TXT for details.
7//
8//===----------------------------------------------------------------------===//
9//
10// This file implements the Lexer and Token interfaces.
11//
12//===----------------------------------------------------------------------===//
13
14#include "clang/Lex/Lexer.h"
15#include "UnicodeCharSets.h"
16#include "clang/Basic/CharInfo.h"
17#include "clang/Basic/IdentifierTable.h"
18#include "clang/Basic/LangOptions.h"
19#include "clang/Basic/SourceLocation.h"
20#include "clang/Basic/SourceManager.h"
21#include "clang/Basic/TokenKinds.h"
22#include "clang/Lex/LexDiagnostic.h"
23#include "clang/Lex/LiteralSupport.h"
24#include "clang/Lex/MultipleIncludeOpt.h"
25#include "clang/Lex/Preprocessor.h"
26#include "clang/Lex/PreprocessorOptions.h"
27#include "clang/Lex/Token.h"
28#include "clang/Basic/Diagnostic.h"
29#include "clang/Basic/LLVM.h"
30#include "clang/Basic/TokenKinds.h"
31#include "llvm/ADT/None.h"
32#include "llvm/ADT/Optional.h"
33#include "llvm/ADT/StringExtras.h"
34#include "llvm/ADT/StringSwitch.h"
35#include "llvm/ADT/StringRef.h"
36#include "llvm/Support/Compiler.h"
37#include "llvm/Support/ConvertUTF.h"
38#include "llvm/Support/MathExtras.h"
39#include "llvm/Support/MemoryBuffer.h"
40#include "llvm/Support/NativeFormatting.h"
41#include "llvm/Support/UnicodeCharRanges.h"
42#include <algorithm>
43#include <cassert>
44#include <cstddef>
45#include <cstdint>
46#include <cstring>
47#include <string>
48#include <tuple>
49#include <utility>
50
51using namespace clang;
52
53//===----------------------------------------------------------------------===//
54// Token Class Implementation
55//===----------------------------------------------------------------------===//
56
57/// isObjCAtKeyword - Return true if we have an ObjC keyword identifier.
58bool Token::isObjCAtKeyword(tok::ObjCKeywordKind objcKey) const {
59 if (isAnnotation())
60 return false;
61 if (IdentifierInfo *II = getIdentifierInfo())
62 return II->getObjCKeywordID() == objcKey;
63 return false;
64}
65
66/// getObjCKeywordID - Return the ObjC keyword kind.
67tok::ObjCKeywordKind Token::getObjCKeywordID() const {
68 if (isAnnotation())
69 return tok::objc_not_keyword;
70 IdentifierInfo *specId = getIdentifierInfo();
71 return specId ? specId->getObjCKeywordID() : tok::objc_not_keyword;
72}
73
74//===----------------------------------------------------------------------===//
75// Lexer Class Implementation
76//===----------------------------------------------------------------------===//
77
// Out-of-line definition of the anchor method. Defining it in this .cpp file
// follows the LLVM "vtable anchor" idiom, keying the class's vtable and type
// info to this translation unit.
78void Lexer::anchor() {}
79
80void Lexer::InitLexer(const char *BufStart, const char *BufPtr,
81 const char *BufEnd) {
82 BufferStart = BufStart;
83 BufferPtr = BufPtr;
84 BufferEnd = BufEnd;
85
86 assert(BufEnd[0] == 0 &&((BufEnd[0] == 0 && "We assume that the input buffer has a null character at the end"
" to simplify lexing!") ? static_cast<void> (0) : __assert_fail
("BufEnd[0] == 0 && \"We assume that the input buffer has a null character at the end\" \" to simplify lexing!\""
, "/build/llvm-toolchain-snapshot-8~svn350071/tools/clang/lib/Lex/Lexer.cpp"
, 88, __PRETTY_FUNCTION__))
87 "We assume that the input buffer has a null character at the end"((BufEnd[0] == 0 && "We assume that the input buffer has a null character at the end"
" to simplify lexing!") ? static_cast<void> (0) : __assert_fail
("BufEnd[0] == 0 && \"We assume that the input buffer has a null character at the end\" \" to simplify lexing!\""
, "/build/llvm-toolchain-snapshot-8~svn350071/tools/clang/lib/Lex/Lexer.cpp"
, 88, __PRETTY_FUNCTION__))
88 " to simplify lexing!")((BufEnd[0] == 0 && "We assume that the input buffer has a null character at the end"
" to simplify lexing!") ? static_cast<void> (0) : __assert_fail
("BufEnd[0] == 0 && \"We assume that the input buffer has a null character at the end\" \" to simplify lexing!\""
, "/build/llvm-toolchain-snapshot-8~svn350071/tools/clang/lib/Lex/Lexer.cpp"
, 88, __PRETTY_FUNCTION__))
;
89
90 // Check whether we have a BOM in the beginning of the buffer. If yes - act
91 // accordingly. Right now we support only UTF-8 with and without BOM, so, just
92 // skip the UTF-8 BOM if it's present.
93 if (BufferStart == BufferPtr) {
94 // Determine the size of the BOM.
95 StringRef Buf(BufferStart, BufferEnd - BufferStart);
96 size_t BOMLength = llvm::StringSwitch<size_t>(Buf)
97 .StartsWith("\xEF\xBB\xBF", 3) // UTF-8 BOM
98 .Default(0);
99
100 // Skip the BOM.
101 BufferPtr += BOMLength;
102 }
103
104 Is_PragmaLexer = false;
105 CurrentConflictMarkerState = CMK_None;
106
107 // Start of the file is a start of line.
108 IsAtStartOfLine = true;
109 IsAtPhysicalStartOfLine = true;
110
111 HasLeadingSpace = false;
112 HasLeadingEmptyMacro = false;
113
114 // We are not after parsing a #.
115 ParsingPreprocessorDirective = false;
116
117 // We are not after parsing #include.
118 ParsingFilename = false;
119
120 // We are not in raw mode. Raw mode disables diagnostics and interpretation
121 // of tokens (e.g. identifiers, thus disabling macro expansion). It is used
122 // to quickly lex the tokens of the buffer, e.g. when handling a "#if 0" block
123 // or otherwise skipping over tokens.
124 LexingRawMode = false;
125
126 // Default to not keeping comments.
127 ExtendedTokenMode = 0;
128}
129
130/// Lexer constructor - Create a new lexer object for the specified buffer
131/// with the specified preprocessor managing the lexing process. This lexer
132/// assumes that the associated file buffer and Preprocessor objects will
133/// outlive it, so it doesn't take ownership of either of them.
134Lexer::Lexer(FileID FID, const llvm::MemoryBuffer *InputFile, Preprocessor &PP)
135 : PreprocessorLexer(&PP, FID),
136 FileLoc(PP.getSourceManager().getLocForStartOfFile(FID)),
137 LangOpts(PP.getLangOpts()) {
  // Lex from the very beginning of the file (BufPtr == BufStart), so any
  // UTF-8 BOM is skipped by InitLexer.
138 InitLexer(InputFile->getBufferStart(), InputFile->getBufferStart(),
139 InputFile->getBufferEnd());
140
  // Pick up comment/whitespace retention from the preprocessor's settings.
141 resetExtendedTokenMode();
142}
143
144/// Lexer constructor - Create a new raw lexer object. This object is only
145/// suitable for calls to 'LexFromRawLexer'. This lexer assumes that the text
146/// range will outlive it, so it doesn't take ownership of it.
147Lexer::Lexer(SourceLocation fileloc, const LangOptions &langOpts,
148 const char *BufStart, const char *BufPtr, const char *BufEnd)
149 : FileLoc(fileloc), LangOpts(langOpts) {
150 InitLexer(BufStart, BufPtr, BufEnd);
151
152 // We *are* in raw mode.
  // (No Preprocessor is attached, so diagnostics and token interpretation
  // such as macro expansion are disabled.)
153 LexingRawMode = true;
154}
155
156/// Lexer constructor - Create a new raw lexer object. This object is only
157/// suitable for calls to 'LexFromRawLexer'. This lexer assumes that the text
158/// range will outlive it, so it doesn't take ownership of it.
  // Delegates to the raw-buffer constructor above, lexing the whole file
  // starting at its first byte.
159Lexer::Lexer(FileID FID, const llvm::MemoryBuffer *FromFile,
160 const SourceManager &SM, const LangOptions &langOpts)
161 : Lexer(SM.getLocForStartOfFile(FID), langOpts, FromFile->getBufferStart(),
162 FromFile->getBufferStart(), FromFile->getBufferEnd()) {}
163
164void Lexer::resetExtendedTokenMode() {
165 assert(PP && "Cannot reset token mode without a preprocessor")((PP && "Cannot reset token mode without a preprocessor"
) ? static_cast<void> (0) : __assert_fail ("PP && \"Cannot reset token mode without a preprocessor\""
, "/build/llvm-toolchain-snapshot-8~svn350071/tools/clang/lib/Lex/Lexer.cpp"
, 165, __PRETTY_FUNCTION__))
;
166 if (LangOpts.TraditionalCPP)
167 SetKeepWhitespaceMode(true);
168 else
169 SetCommentRetentionState(PP->getCommentRetentionState());
170}
171
172/// Create_PragmaLexer: Lexer constructor - Create a new lexer object for
173/// _Pragma expansion. This has a variety of magic semantics that this method
174/// sets up. It returns a new'd Lexer that must be delete'd when done.
175///
176/// On entrance to this routine, TokStartLoc is a macro location which has a
177/// spelling loc that indicates the bytes to be lexed for the token and an
178/// expansion location that indicates where all lexed tokens should be
179/// "expanded from".
180///
181/// TODO: It would really be nice to make _Pragma just be a wrapper around a
182/// normal lexer that remaps tokens as they fly by. This would require making
183/// Preprocessor::Lex virtual. Given that, we could just dump in a magic lexer
184/// interface that could handle this stuff. This would pull GetMappedTokenLoc
185/// out of the critical path of the lexer!
186///
187Lexer *Lexer::Create_PragmaLexer(SourceLocation SpellingLoc,
188 SourceLocation ExpansionLocStart,
189 SourceLocation ExpansionLocEnd,
190 unsigned TokLen, Preprocessor &PP) {
191 SourceManager &SM = PP.getSourceManager();
192
193 // Create the lexer as if we were going to lex the file normally.
194 FileID SpellingFID = SM.getFileID(SpellingLoc);
195 const llvm::MemoryBuffer *InputFile = SM.getBuffer(SpellingFID);
196 Lexer *L = new Lexer(SpellingFID, InputFile, PP);
197
198 // Now that the lexer is created, change the start/end locations so that we
199 // just lex the subsection of the file that we want. This is lexing from a
200 // scratch buffer.
201 const char *StrData = SM.getCharacterData(SpellingLoc);
202
203 L->BufferPtr = StrData;
204 L->BufferEnd = StrData+TokLen;
205 assert(L->BufferEnd[0] == 0 && "Buffer is not nul terminated!")((L->BufferEnd[0] == 0 && "Buffer is not nul terminated!"
) ? static_cast<void> (0) : __assert_fail ("L->BufferEnd[0] == 0 && \"Buffer is not nul terminated!\""
, "/build/llvm-toolchain-snapshot-8~svn350071/tools/clang/lib/Lex/Lexer.cpp"
, 205, __PRETTY_FUNCTION__))
;
206
207 // Set the SourceLocation with the remapping information. This ensures that
208 // GetMappedTokenLoc will remap the tokens as they are lexed.
209 L->FileLoc = SM.createExpansionLoc(SM.getLocForStartOfFile(SpellingFID),
210 ExpansionLocStart,
211 ExpansionLocEnd, TokLen);
212
213 // Ensure that the lexer thinks it is inside a directive, so that end \n will
214 // return an EOD token.
215 L->ParsingPreprocessorDirective = true;
216
217 // This lexer really is for _Pragma.
218 L->Is_PragmaLexer = true;
219 return L;
220}
221
// StringifyImpl - Escape Str in place for embedding in a quoted literal:
// prefix backslashes and the given Quote character with '\', and rewrite
// every newline/carriage-return as the two characters '\' 'n' (a mixed
// "\r\n" or "\n\r" pair collapses to a single '\' 'n').
template <typename T> static void StringifyImpl(T &Str, char Quote) {
  typename T::size_type Idx = 0, Len = Str.size();
  while (Idx < Len) {
    const char C = Str[Idx];
    if (C == '\\' || C == Quote) {
      // Escape the backslash / quote by inserting a '\' before it.
      Str.insert(Str.begin() + Idx, '\\');
      ++Len;
      Idx += 2;
    } else if (C == '\n' || C == '\r') {
      const bool MixedPair = Idx + 1 < Len &&
                             (Str[Idx + 1] == '\n' || Str[Idx + 1] == '\r') &&
                             Str[Idx + 1] != C;
      if (MixedPair) {
        // "\r\n" or "\n\r": overwrite both characters in place with '\','n'.
        Str[Idx] = '\\';
        Str[Idx + 1] = 'n';
      } else {
        // Lone newline: overwrite with '\' and insert the 'n' after it.
        Str[Idx] = '\\';
        Str.insert(Str.begin() + Idx + 1, 'n');
        ++Len;
      }
      Idx += 2;
    } else {
      ++Idx;
    }
  }
}
246
247std::string Lexer::Stringify(StringRef Str, bool Charify) {
248 std::string Result = Str;
249 char Quote = Charify ? '\'' : '"';
250 StringifyImpl(Result, Quote);
251 return Result;
252}
253
// In-place variant of Stringify: escape backslashes, double quotes, and
// newlines in Str (always uses '"' as the quote character).
254void Lexer::Stringify(SmallVectorImpl<char> &Str) { StringifyImpl(Str, '"'); }
255
256//===----------------------------------------------------------------------===//
257// Token Spelling
258//===----------------------------------------------------------------------===//
259
260/// Slow case of getSpelling. Extract the characters comprising the
261/// spelling of this token from the provided input buffer.
262static size_t getSpellingSlow(const Token &Tok, const char *BufPtr,
263 const LangOptions &LangOpts, char *Spelling) {
264 assert(Tok.needsCleaning() && "getSpellingSlow called on simple token")((Tok.needsCleaning() && "getSpellingSlow called on simple token"
) ? static_cast<void> (0) : __assert_fail ("Tok.needsCleaning() && \"getSpellingSlow called on simple token\""
, "/build/llvm-toolchain-snapshot-8~svn350071/tools/clang/lib/Lex/Lexer.cpp"
, 264, __PRETTY_FUNCTION__))
;
265
266 size_t Length = 0;
267 const char *BufEnd = BufPtr + Tok.getLength();
268
269 if (tok::isStringLiteral(Tok.getKind())) {
270 // Munch the encoding-prefix and opening double-quote.
271 while (BufPtr < BufEnd) {
272 unsigned Size;
273 Spelling[Length++] = Lexer::getCharAndSizeNoWarn(BufPtr, Size, LangOpts);
274 BufPtr += Size;
275
276 if (Spelling[Length - 1] == '"')
277 break;
278 }
279
280 // Raw string literals need special handling; trigraph expansion and line
281 // splicing do not occur within their d-char-sequence nor within their
282 // r-char-sequence.
283 if (Length >= 2 &&
284 Spelling[Length - 2] == 'R' && Spelling[Length - 1] == '"') {
285 // Search backwards from the end of the token to find the matching closing
286 // quote.
287 const char *RawEnd = BufEnd;
288 do --RawEnd; while (*RawEnd != '"');
289 size_t RawLength = RawEnd - BufPtr + 1;
290
291 // Everything between the quotes is included verbatim in the spelling.
292 memcpy(Spelling + Length, BufPtr, RawLength);
293 Length += RawLength;
294 BufPtr += RawLength;
295
296 // The rest of the token is lexed normally.
297 }
298 }
299
300 while (BufPtr < BufEnd) {
301 unsigned Size;
302 Spelling[Length++] = Lexer::getCharAndSizeNoWarn(BufPtr, Size, LangOpts);
303 BufPtr += Size;
304 }
305
306 assert(Length < Tok.getLength() &&((Length < Tok.getLength() && "NeedsCleaning flag set on token that didn't need cleaning!"
) ? static_cast<void> (0) : __assert_fail ("Length < Tok.getLength() && \"NeedsCleaning flag set on token that didn't need cleaning!\""
, "/build/llvm-toolchain-snapshot-8~svn350071/tools/clang/lib/Lex/Lexer.cpp"
, 307, __PRETTY_FUNCTION__))
307 "NeedsCleaning flag set on token that didn't need cleaning!")((Length < Tok.getLength() && "NeedsCleaning flag set on token that didn't need cleaning!"
) ? static_cast<void> (0) : __assert_fail ("Length < Tok.getLength() && \"NeedsCleaning flag set on token that didn't need cleaning!\""
, "/build/llvm-toolchain-snapshot-8~svn350071/tools/clang/lib/Lex/Lexer.cpp"
, 307, __PRETTY_FUNCTION__))
;
308 return Length;
309}
310
311/// getSpelling() - Return the 'spelling' of this token. The spelling of a
312/// token are the characters used to represent the token in the source file
313/// after trigraph expansion and escaped-newline folding. In particular, this
314/// wants to get the true, uncanonicalized, spelling of things like digraphs
315/// UCNs, etc.
316StringRef Lexer::getSpelling(SourceLocation loc,
317 SmallVectorImpl<char> &buffer,
318 const SourceManager &SM,
319 const LangOptions &options,
320 bool *invalid) {
321 // Break down the source location.
322 std::pair<FileID, unsigned> locInfo = SM.getDecomposedLoc(loc);
323
324 // Try to the load the file buffer.
325 bool invalidTemp = false;
326 StringRef file = SM.getBufferData(locInfo.first, &invalidTemp);
327 if (invalidTemp) {
328 if (invalid) *invalid = true;
329 return {};
330 }
331
332 const char *tokenBegin = file.data() + locInfo.second;
333
334 // Lex from the start of the given location.
335 Lexer lexer(SM.getLocForStartOfFile(locInfo.first), options,
336 file.begin(), tokenBegin, file.end());
337 Token token;
338 lexer.LexFromRawLexer(token);
339
340 unsigned length = token.getLength();
341
342 // Common case: no need for cleaning.
343 if (!token.needsCleaning())
344 return StringRef(tokenBegin, length);
345
346 // Hard case, we need to relex the characters into the string.
347 buffer.resize(length);
348 buffer.resize(getSpellingSlow(token, tokenBegin, options, buffer.data()));
349 return StringRef(buffer.data(), buffer.size());
350}
351
352/// getSpelling() - Return the 'spelling' of this token. The spelling of a
353/// token are the characters used to represent the token in the source file
354/// after trigraph expansion and escaped-newline folding. In particular, this
355/// wants to get the true, uncanonicalized, spelling of things like digraphs
356/// UCNs, etc.
357std::string Lexer::getSpelling(const Token &Tok, const SourceManager &SourceMgr,
358 const LangOptions &LangOpts, bool *Invalid) {
359 assert((int)Tok.getLength() >= 0 && "Token character range is bogus!")(((int)Tok.getLength() >= 0 && "Token character range is bogus!"
) ? static_cast<void> (0) : __assert_fail ("(int)Tok.getLength() >= 0 && \"Token character range is bogus!\""
, "/build/llvm-toolchain-snapshot-8~svn350071/tools/clang/lib/Lex/Lexer.cpp"
, 359, __PRETTY_FUNCTION__))
;
360
361 bool CharDataInvalid = false;
362 const char *TokStart = SourceMgr.getCharacterData(Tok.getLocation(),
363 &CharDataInvalid);
364 if (Invalid)
365 *Invalid = CharDataInvalid;
366 if (CharDataInvalid)
367 return {};
368
369 // If this token contains nothing interesting, return it directly.
370 if (!Tok.needsCleaning())
371 return std::string(TokStart, TokStart + Tok.getLength());
372
373 std::string Result;
374 Result.resize(Tok.getLength());
375 Result.resize(getSpellingSlow(Tok, TokStart, LangOpts, &*Result.begin()));
376 return Result;
377}
378
379/// getSpelling - This method is used to get the spelling of a token into a
380/// preallocated buffer, instead of as an std::string. The caller is required
381/// to allocate enough space for the token, which is guaranteed to be at least
382/// Tok.getLength() bytes long. The actual length of the token is returned.
383///
384/// Note that this method may do two possible things: it may either fill in
385/// the buffer specified with characters, or it may *change the input pointer*
386/// to point to a constant buffer with the data already in it (avoiding a
387/// copy). The caller is not allowed to modify the returned buffer pointer
388/// if an internal buffer is returned.
389unsigned Lexer::getSpelling(const Token &Tok, const char *&Buffer,
390 const SourceManager &SourceMgr,
391 const LangOptions &LangOpts, bool *Invalid) {
392 assert((int)Tok.getLength() >= 0 && "Token character range is bogus!")(((int)Tok.getLength() >= 0 && "Token character range is bogus!"
) ? static_cast<void> (0) : __assert_fail ("(int)Tok.getLength() >= 0 && \"Token character range is bogus!\""
, "/build/llvm-toolchain-snapshot-8~svn350071/tools/clang/lib/Lex/Lexer.cpp"
, 392, __PRETTY_FUNCTION__))
;
393
394 const char *TokStart = nullptr;
395 // NOTE: this has to be checked *before* testing for an IdentifierInfo.
396 if (Tok.is(tok::raw_identifier))
397 TokStart = Tok.getRawIdentifier().data();
398 else if (!Tok.hasUCN()) {
399 if (const IdentifierInfo *II = Tok.getIdentifierInfo()) {
400 // Just return the string from the identifier table, which is very quick.
401 Buffer = II->getNameStart();
402 return II->getLength();
403 }
404 }
405
406 // NOTE: this can be checked even after testing for an IdentifierInfo.
407 if (Tok.isLiteral())
408 TokStart = Tok.getLiteralData();
409
410 if (!TokStart) {
411 // Compute the start of the token in the input lexer buffer.
412 bool CharDataInvalid = false;
413 TokStart = SourceMgr.getCharacterData(Tok.getLocation(), &CharDataInvalid);
414 if (Invalid)
415 *Invalid = CharDataInvalid;
416 if (CharDataInvalid) {
417 Buffer = "";
418 return 0;
419 }
420 }
421
422 // If this token contains nothing interesting, return it directly.
423 if (!Tok.needsCleaning()) {
424 Buffer = TokStart;
425 return Tok.getLength();
426 }
427
428 // Otherwise, hard case, relex the characters into the string.
429 return getSpellingSlow(Tok, TokStart, LangOpts, const_cast<char*>(Buffer));
430}
431
432/// MeasureTokenLength - Relex the token at the specified location and return
433/// its length in bytes in the input file. If the token needs cleaning (e.g.
434/// includes a trigraph or an escaped newline) then this count includes bytes
435/// that are part of that.
436unsigned Lexer::MeasureTokenLength(SourceLocation Loc,
437 const SourceManager &SM,
438 const LangOptions &LangOpts) {
439 Token TheTok;
440 if (getRawToken(Loc, TheTok, SM, LangOpts))
441 return 0;
442 return TheTok.getLength();
443}
444
445/// Relex the token at the specified location.
446/// \returns true if there was a failure, false on success.
447bool Lexer::getRawToken(SourceLocation Loc, Token &Result,
448 const SourceManager &SM,
449 const LangOptions &LangOpts,
450 bool IgnoreWhiteSpace) {
451 // TODO: this could be special cased for common tokens like identifiers, ')',
452 // etc to make this faster, if it mattered. Just look at StrData[0] to handle
453 // all obviously single-char tokens. This could use
454 // Lexer::isObviouslySimpleCharacter for example to handle identifiers or
455 // something.
456
457 // If this comes from a macro expansion, we really do want the macro name, not
458 // the token this macro expanded to.
459 Loc = SM.getExpansionLoc(Loc);
460 std::pair<FileID, unsigned> LocInfo = SM.getDecomposedLoc(Loc);
461 bool Invalid = false;
462 StringRef Buffer = SM.getBufferData(LocInfo.first, &Invalid);
463 if (Invalid)
464 return true;
465
466 const char *StrData = Buffer.data()+LocInfo.second;
467
468 if (!IgnoreWhiteSpace && isWhitespace(StrData[0]))
469 return true;
470
471 // Create a lexer starting at the beginning of this token.
472 Lexer TheLexer(SM.getLocForStartOfFile(LocInfo.first), LangOpts,
473 Buffer.begin(), StrData, Buffer.end());
474 TheLexer.SetCommentRetentionState(true);
475 TheLexer.LexFromRawLexer(Result);
476 return false;
477}
478
479/// Returns the pointer that points to the beginning of line that contains
480/// the given offset, or null if the offset if invalid.
481static const char *findBeginningOfLine(StringRef Buffer, unsigned Offset) {
482 const char *BufStart = Buffer.data();
483 if (Offset >= Buffer.size())
484 return nullptr;
485
486 const char *LexStart = BufStart + Offset;
487 for (; LexStart != BufStart; --LexStart) {
488 if (isVerticalWhitespace(LexStart[0]) &&
489 !Lexer::isNewLineEscaped(BufStart, LexStart)) {
490 // LexStart should point at first character of logical line.
491 ++LexStart;
492 break;
493 }
494 }
495 return LexStart;
496}
497
498static SourceLocation getBeginningOfFileToken(SourceLocation Loc,
499 const SourceManager &SM,
500 const LangOptions &LangOpts) {
501 assert(Loc.isFileID())((Loc.isFileID()) ? static_cast<void> (0) : __assert_fail
("Loc.isFileID()", "/build/llvm-toolchain-snapshot-8~svn350071/tools/clang/lib/Lex/Lexer.cpp"
, 501, __PRETTY_FUNCTION__))
;
502 std::pair<FileID, unsigned> LocInfo = SM.getDecomposedLoc(Loc);
503 if (LocInfo.first.isInvalid())
504 return Loc;
505
506 bool Invalid = false;
507 StringRef Buffer = SM.getBufferData(LocInfo.first, &Invalid);
508 if (Invalid)
509 return Loc;
510
511 // Back up from the current location until we hit the beginning of a line
512 // (or the buffer). We'll relex from that point.
513 const char *StrData = Buffer.data() + LocInfo.second;
514 const char *LexStart = findBeginningOfLine(Buffer, LocInfo.second);
515 if (!LexStart || LexStart == StrData)
516 return Loc;
517
518 // Create a lexer starting at the beginning of this token.
519 SourceLocation LexerStartLoc = Loc.getLocWithOffset(-LocInfo.second);
520 Lexer TheLexer(LexerStartLoc, LangOpts, Buffer.data(), LexStart,
521 Buffer.end());
522 TheLexer.SetCommentRetentionState(true);
523
524 // Lex tokens until we find the token that contains the source location.
525 Token TheTok;
526 do {
527 TheLexer.LexFromRawLexer(TheTok);
528
529 if (TheLexer.getBufferLocation() > StrData) {
530 // Lexing this token has taken the lexer past the source location we're
531 // looking for. If the current token encompasses our source location,
532 // return the beginning of that token.
533 if (TheLexer.getBufferLocation() - TheTok.getLength() <= StrData)
534 return TheTok.getLocation();
535
536 // We ended up skipping over the source location entirely, which means
537 // that it points into whitespace. We're done here.
538 break;
539 }
540 } while (TheTok.getKind() != tok::eof);
541
542 // We've passed our source location; just return the original source location.
543 return Loc;
544}
545
546SourceLocation Lexer::GetBeginningOfToken(SourceLocation Loc,
547 const SourceManager &SM,
548 const LangOptions &LangOpts) {
549 if (Loc.isFileID())
550 return getBeginningOfFileToken(Loc, SM, LangOpts);
551
552 if (!SM.isMacroArgExpansion(Loc))
553 return Loc;
554
555 SourceLocation FileLoc = SM.getSpellingLoc(Loc);
556 SourceLocation BeginFileLoc = getBeginningOfFileToken(FileLoc, SM, LangOpts);
557 std::pair<FileID, unsigned> FileLocInfo = SM.getDecomposedLoc(FileLoc);
558 std::pair<FileID, unsigned> BeginFileLocInfo =
559 SM.getDecomposedLoc(BeginFileLoc);
560 assert(FileLocInfo.first == BeginFileLocInfo.first &&((FileLocInfo.first == BeginFileLocInfo.first && FileLocInfo
.second >= BeginFileLocInfo.second) ? static_cast<void>
(0) : __assert_fail ("FileLocInfo.first == BeginFileLocInfo.first && FileLocInfo.second >= BeginFileLocInfo.second"
, "/build/llvm-toolchain-snapshot-8~svn350071/tools/clang/lib/Lex/Lexer.cpp"
, 561, __PRETTY_FUNCTION__))
561 FileLocInfo.second >= BeginFileLocInfo.second)((FileLocInfo.first == BeginFileLocInfo.first && FileLocInfo
.second >= BeginFileLocInfo.second) ? static_cast<void>
(0) : __assert_fail ("FileLocInfo.first == BeginFileLocInfo.first && FileLocInfo.second >= BeginFileLocInfo.second"
, "/build/llvm-toolchain-snapshot-8~svn350071/tools/clang/lib/Lex/Lexer.cpp"
, 561, __PRETTY_FUNCTION__))
;
562 return Loc.getLocWithOffset(BeginFileLocInfo.second - FileLocInfo.second);
563}
564
565namespace {

// Classification of a preprocessor directive encountered while scanning for
// the preamble in ComputePreamble() below.
567enum PreambleDirectiveKind {
568 PDK_Skipped, // Directive may appear in a preamble; keep scanning.
569 PDK_Unknown // Unrecognized directive; the preamble ends at its '#'.
570};

572} // namespace
573
// ComputePreamble - Scan Buffer with a raw lexer and return the bounds of
// its "preamble": the leading run of comments and recognized preprocessor
// directives. MaxLines, when nonzero, caps how many lines are considered.
574PreambleBounds Lexer::ComputePreamble(StringRef Buffer,
575 const LangOptions &LangOpts,
576 unsigned MaxLines) {
577 // Create a lexer starting at the beginning of the file. Note that we use a
578 // "fake" file source location at offset 1 so that the lexer will track our
579 // position within the file.
580 const unsigned StartOffset = 1;
581 SourceLocation FileLoc = SourceLocation::getFromRawEncoding(StartOffset);
582 Lexer TheLexer(FileLoc, LangOpts, Buffer.begin(), Buffer.begin(),
583 Buffer.end());
584 TheLexer.SetCommentRetentionState(true);
585
586 bool InPreprocessorDirective = false;
587 Token TheTok;
588 SourceLocation ActiveCommentLoc;
589
  // Translate the MaxLines line cap into a byte-offset cap (MaxLineOffset);
  // 0 means "no limit" (also when the whole buffer fits within MaxLines).
590 unsigned MaxLineOffset = 0;
591 if (MaxLines) {
592 const char *CurPtr = Buffer.begin();
593 unsigned CurLine = 0;
594 while (CurPtr != Buffer.end()) {
595 char ch = *CurPtr++;
596 if (ch == '\n') {
597 ++CurLine;
598 if (CurLine == MaxLines)
599 break;
600 }
601 }
602 if (CurPtr != Buffer.end())
603 MaxLineOffset = CurPtr - Buffer.begin();
604 }
605
  // Main scan loop: one raw token per iteration until something ends the
  // preamble (a non-preprocessor token, the line cap, or end of file).
606 do {
607 TheLexer.LexFromRawLexer(TheTok);
608
609 if (InPreprocessorDirective) {
610 // If we've hit the end of the file, we're done.
611 if (TheTok.getKind() == tok::eof) {
612 break;
613 }
614
615 // If we haven't hit the end of the preprocessor directive, skip this
616 // token.
617 if (!TheTok.isAtStartOfLine())
618 continue;
619
620 // We've passed the end of the preprocessor directive, and will look
621 // at this token again below.
622 InPreprocessorDirective = false;
623 }
624
625 // Keep track of the # of lines in the preamble.
626 if (TheTok.isAtStartOfLine()) {
627 unsigned TokOffset = TheTok.getLocation().getRawEncoding() - StartOffset;
628
629 // If we were asked to limit the number of lines in the preamble,
630 // and we're about to exceed that limit, we're done.
631 if (MaxLineOffset && TokOffset >= MaxLineOffset)
632 break;
633 }
634
635 // Comments are okay; skip over them.
636 if (TheTok.getKind() == tok::comment) {
637 if (ActiveCommentLoc.isInvalid())
638 ActiveCommentLoc = TheTok.getLocation();
639 continue;
640 }
641
642 if (TheTok.isAtStartOfLine() && TheTok.getKind() == tok::hash) {
643 // This is the start of a preprocessor directive.
644 Token HashTok = TheTok;
645 InPreprocessorDirective = true;
646 ActiveCommentLoc = SourceLocation();
647
648 // Figure out which directive this is. Since we're lexing raw tokens,
649 // we don't have an identifier table available. Instead, just look at
650 // the raw identifier to recognize and categorize preprocessor directives.
651 TheLexer.LexFromRawLexer(TheTok);
652 if (TheTok.getKind() == tok::raw_identifier && !TheTok.needsCleaning()) {
653 StringRef Keyword = TheTok.getRawIdentifier();
654 PreambleDirectiveKind PDK
655 = llvm::StringSwitch<PreambleDirectiveKind>(Keyword)
656 .Case("include", PDK_Skipped)
657 .Case("__include_macros", PDK_Skipped)
658 .Case("define", PDK_Skipped)
659 .Case("undef", PDK_Skipped)
660 .Case("line", PDK_Skipped)
661 .Case("error", PDK_Skipped)
662 .Case("pragma", PDK_Skipped)
663 .Case("import", PDK_Skipped)
664 .Case("include_next", PDK_Skipped)
665 .Case("warning", PDK_Skipped)
666 .Case("ident", PDK_Skipped)
667 .Case("sccs", PDK_Skipped)
668 .Case("assert", PDK_Skipped)
669 .Case("unassert", PDK_Skipped)
670 .Case("if", PDK_Skipped)
671 .Case("ifdef", PDK_Skipped)
672 .Case("ifndef", PDK_Skipped)
673 .Case("elif", PDK_Skipped)
674 .Case("else", PDK_Skipped)
675 .Case("endif", PDK_Skipped)
676 .Default(PDK_Unknown);
677
678 switch (PDK) {
679 case PDK_Skipped:
680 continue;
681
682 case PDK_Unknown:
683 // We don't know what this directive is; stop at the '#'.
684 break;
685 }
686 }
687
688 // We only end up here if we didn't recognize the preprocessor
689 // directive or it was one that can't occur in the preamble at this
690 // point. Roll back the current token to the location of the '#'.
691 InPreprocessorDirective = false;
692 TheTok = HashTok;
693 }
694
695 // We hit a token that we don't recognize as being in the
696 // "preprocessing only" part of the file, so we're no longer in
697 // the preamble.
698 break;
699 } while (true);
700
701 SourceLocation End;
702 if (ActiveCommentLoc.isValid())
703 End = ActiveCommentLoc; // don't truncate a decl comment.
704 else
705 End = TheTok.getLocation();
706
707 return PreambleBounds(End.getRawEncoding() - FileLoc.getRawEncoding(),
708 TheTok.isAtStartOfLine());
709}
710
711unsigned Lexer::getTokenPrefixLength(SourceLocation TokStart, unsigned CharNo,
712 const SourceManager &SM,
713 const LangOptions &LangOpts) {
714 // Figure out how many physical characters away the specified expansion
715 // character is. This needs to take into consideration newlines and
716 // trigraphs.
717 bool Invalid = false;
718 const char *TokPtr = SM.getCharacterData(TokStart, &Invalid);
719
720 // If they request the first char of the token, we're trivially done.
721 if (Invalid || (CharNo == 0 && Lexer::isObviouslySimpleCharacter(*TokPtr)))
722 return 0;
723
724 unsigned PhysOffset = 0;
725
726 // The usual case is that tokens don't contain anything interesting. Skip
727 // over the uninteresting characters. If a token only consists of simple
728 // chars, this method is extremely fast.
729 while (Lexer::isObviouslySimpleCharacter(*TokPtr)) {
730 if (CharNo == 0)
731 return PhysOffset;
732 ++TokPtr;
733 --CharNo;
734 ++PhysOffset;
735 }
736
737 // If we have a character that may be a trigraph or escaped newline, use a
738 // lexer to parse it correctly.
739 for (; CharNo; --CharNo) {
740 unsigned Size;
741 Lexer::getCharAndSizeNoWarn(TokPtr, Size, LangOpts);
742 TokPtr += Size;
743 PhysOffset += Size;
744 }
745
746 // Final detail: if we end up on an escaped newline, we want to return the
747 // location of the actual byte of the token. For example foo\<newline>bar
748 // advanced by 3 should return the location of b, not of \\. One compounding
749 // detail of this is that the escape may be made by a trigraph.
750 if (!Lexer::isObviouslySimpleCharacter(*TokPtr))
751 PhysOffset += Lexer::SkipEscapedNewLines(TokPtr)-TokPtr;
752
753 return PhysOffset;
754}
755
756/// Computes the source location just past the end of the
757/// token at this source location.
758///
759/// This routine can be used to produce a source location that
760/// points just past the end of the token referenced by \p Loc, and
761/// is generally used when a diagnostic needs to point just after a
762/// token where it expected something different that it received. If
763/// the returned source location would not be meaningful (e.g., if
764/// it points into a macro), this routine returns an invalid
765/// source location.
766///
767/// \param Offset an offset from the end of the token, where the source
768/// location should refer to. The default offset (0) produces a source
769/// location pointing just past the end of the token; an offset of 1 produces
770/// a source location pointing to the last character in the token, etc.
771SourceLocation Lexer::getLocForEndOfToken(SourceLocation Loc, unsigned Offset,
772 const SourceManager &SM,
773 const LangOptions &LangOpts) {
774 if (Loc.isInvalid())
775 return {};
776
777 if (Loc.isMacroID()) {
778 if (Offset > 0 || !isAtEndOfMacroExpansion(Loc, SM, LangOpts, &Loc))
779 return {}; // Points inside the macro expansion.
780 }
781
782 unsigned Len = Lexer::MeasureTokenLength(Loc, SM, LangOpts);
783 if (Len > Offset)
784 Len = Len - Offset;
785 else
786 return Loc;
787
788 return Loc.getLocWithOffset(Len);
789}
790
791/// Returns true if the given MacroID location points at the first
792/// token of the macro expansion.
793bool Lexer::isAtStartOfMacroExpansion(SourceLocation loc,
794 const SourceManager &SM,
795 const LangOptions &LangOpts,
796 SourceLocation *MacroBegin) {
797 assert(loc.isValid() && loc.isMacroID() && "Expected a valid macro loc")((loc.isValid() && loc.isMacroID() && "Expected a valid macro loc"
) ? static_cast<void> (0) : __assert_fail ("loc.isValid() && loc.isMacroID() && \"Expected a valid macro loc\""
, "/build/llvm-toolchain-snapshot-8~svn350071/tools/clang/lib/Lex/Lexer.cpp"
, 797, __PRETTY_FUNCTION__))
;
798
799 SourceLocation expansionLoc;
800 if (!SM.isAtStartOfImmediateMacroExpansion(loc, &expansionLoc))
801 return false;
802
803 if (expansionLoc.isFileID()) {
804 // No other macro expansions, this is the first.
805 if (MacroBegin)
806 *MacroBegin = expansionLoc;
807 return true;
808 }
809
810 return isAtStartOfMacroExpansion(expansionLoc, SM, LangOpts, MacroBegin);
811}
812
813/// Returns true if the given MacroID location points at the last
814/// token of the macro expansion.
815bool Lexer::isAtEndOfMacroExpansion(SourceLocation loc,
816 const SourceManager &SM,
817 const LangOptions &LangOpts,
818 SourceLocation *MacroEnd) {
819 assert(loc.isValid() && loc.isMacroID() && "Expected a valid macro loc")((loc.isValid() && loc.isMacroID() && "Expected a valid macro loc"
) ? static_cast<void> (0) : __assert_fail ("loc.isValid() && loc.isMacroID() && \"Expected a valid macro loc\""
, "/build/llvm-toolchain-snapshot-8~svn350071/tools/clang/lib/Lex/Lexer.cpp"
, 819, __PRETTY_FUNCTION__))
;
820
821 SourceLocation spellLoc = SM.getSpellingLoc(loc);
822 unsigned tokLen = MeasureTokenLength(spellLoc, SM, LangOpts);
823 if (tokLen == 0)
824 return false;
825
826 SourceLocation afterLoc = loc.getLocWithOffset(tokLen);
827 SourceLocation expansionLoc;
828 if (!SM.isAtEndOfImmediateMacroExpansion(afterLoc, &expansionLoc))
829 return false;
830
831 if (expansionLoc.isFileID()) {
832 // No other macro expansions.
833 if (MacroEnd)
834 *MacroEnd = expansionLoc;
835 return true;
836 }
837
838 return isAtEndOfMacroExpansion(expansionLoc, SM, LangOpts, MacroEnd);
839}
840
841static CharSourceRange makeRangeFromFileLocs(CharSourceRange Range,
842 const SourceManager &SM,
843 const LangOptions &LangOpts) {
844 SourceLocation Begin = Range.getBegin();
845 SourceLocation End = Range.getEnd();
846 assert(Begin.isFileID() && End.isFileID())((Begin.isFileID() && End.isFileID()) ? static_cast<
void> (0) : __assert_fail ("Begin.isFileID() && End.isFileID()"
, "/build/llvm-toolchain-snapshot-8~svn350071/tools/clang/lib/Lex/Lexer.cpp"
, 846, __PRETTY_FUNCTION__))
;
847 if (Range.isTokenRange()) {
848 End = Lexer::getLocForEndOfToken(End, 0, SM,LangOpts);
849 if (End.isInvalid())
850 return {};
851 }
852
853 // Break down the source locations.
854 FileID FID;
855 unsigned BeginOffs;
856 std::tie(FID, BeginOffs) = SM.getDecomposedLoc(Begin);
857 if (FID.isInvalid())
858 return {};
859
860 unsigned EndOffs;
861 if (!SM.isInFileID(End, FID, &EndOffs) ||
862 BeginOffs > EndOffs)
863 return {};
864
865 return CharSourceRange::getCharRange(Begin, End);
866}
867
868CharSourceRange Lexer::makeFileCharRange(CharSourceRange Range,
869 const SourceManager &SM,
870 const LangOptions &LangOpts) {
871 SourceLocation Begin = Range.getBegin();
872 SourceLocation End = Range.getEnd();
873 if (Begin.isInvalid() || End.isInvalid())
874 return {};
875
876 if (Begin.isFileID() && End.isFileID())
877 return makeRangeFromFileLocs(Range, SM, LangOpts);
878
879 if (Begin.isMacroID() && End.isFileID()) {
880 if (!isAtStartOfMacroExpansion(Begin, SM, LangOpts, &Begin))
881 return {};
882 Range.setBegin(Begin);
883 return makeRangeFromFileLocs(Range, SM, LangOpts);
884 }
885
886 if (Begin.isFileID() && End.isMacroID()) {
887 if ((Range.isTokenRange() && !isAtEndOfMacroExpansion(End, SM, LangOpts,
888 &End)) ||
889 (Range.isCharRange() && !isAtStartOfMacroExpansion(End, SM, LangOpts,
890 &End)))
891 return {};
892 Range.setEnd(End);
893 return makeRangeFromFileLocs(Range, SM, LangOpts);
894 }
895
896 assert(Begin.isMacroID() && End.isMacroID())((Begin.isMacroID() && End.isMacroID()) ? static_cast
<void> (0) : __assert_fail ("Begin.isMacroID() && End.isMacroID()"
, "/build/llvm-toolchain-snapshot-8~svn350071/tools/clang/lib/Lex/Lexer.cpp"
, 896, __PRETTY_FUNCTION__))
;
897 SourceLocation MacroBegin, MacroEnd;
898 if (isAtStartOfMacroExpansion(Begin, SM, LangOpts, &MacroBegin) &&
899 ((Range.isTokenRange() && isAtEndOfMacroExpansion(End, SM, LangOpts,
900 &MacroEnd)) ||
901 (Range.isCharRange() && isAtStartOfMacroExpansion(End, SM, LangOpts,
902 &MacroEnd)))) {
903 Range.setBegin(MacroBegin);
904 Range.setEnd(MacroEnd);
905 return makeRangeFromFileLocs(Range, SM, LangOpts);
906 }
907
908 bool Invalid = false;
909 const SrcMgr::SLocEntry &BeginEntry = SM.getSLocEntry(SM.getFileID(Begin),
910 &Invalid);
911 if (Invalid)
912 return {};
913
914 if (BeginEntry.getExpansion().isMacroArgExpansion()) {
915 const SrcMgr::SLocEntry &EndEntry = SM.getSLocEntry(SM.getFileID(End),
916 &Invalid);
917 if (Invalid)
918 return {};
919
920 if (EndEntry.getExpansion().isMacroArgExpansion() &&
921 BeginEntry.getExpansion().getExpansionLocStart() ==
922 EndEntry.getExpansion().getExpansionLocStart()) {
923 Range.setBegin(SM.getImmediateSpellingLoc(Begin));
924 Range.setEnd(SM.getImmediateSpellingLoc(End));
925 return makeFileCharRange(Range, SM, LangOpts);
926 }
927 }
928
929 return {};
930}
931
932StringRef Lexer::getSourceText(CharSourceRange Range,
933 const SourceManager &SM,
934 const LangOptions &LangOpts,
935 bool *Invalid) {
936 Range = makeFileCharRange(Range, SM, LangOpts);
937 if (Range.isInvalid()) {
938 if (Invalid) *Invalid = true;
939 return {};
940 }
941
942 // Break down the source location.
943 std::pair<FileID, unsigned> beginInfo = SM.getDecomposedLoc(Range.getBegin());
944 if (beginInfo.first.isInvalid()) {
945 if (Invalid) *Invalid = true;
946 return {};
947 }
948
949 unsigned EndOffs;
950 if (!SM.isInFileID(Range.getEnd(), beginInfo.first, &EndOffs) ||
951 beginInfo.second > EndOffs) {
952 if (Invalid) *Invalid = true;
953 return {};
954 }
955
956 // Try to the load the file buffer.
957 bool invalidTemp = false;
958 StringRef file = SM.getBufferData(beginInfo.first, &invalidTemp);
959 if (invalidTemp) {
960 if (Invalid) *Invalid = true;
961 return {};
962 }
963
964 if (Invalid) *Invalid = false;
965 return file.substr(beginInfo.second, EndOffs - beginInfo.second);
966}
967
968StringRef Lexer::getImmediateMacroName(SourceLocation Loc,
969 const SourceManager &SM,
970 const LangOptions &LangOpts) {
971 assert(Loc.isMacroID() && "Only reasonable to call this on macros")((Loc.isMacroID() && "Only reasonable to call this on macros"
) ? static_cast<void> (0) : __assert_fail ("Loc.isMacroID() && \"Only reasonable to call this on macros\""
, "/build/llvm-toolchain-snapshot-8~svn350071/tools/clang/lib/Lex/Lexer.cpp"
, 971, __PRETTY_FUNCTION__))
;
972
973 // Find the location of the immediate macro expansion.
974 while (true) {
975 FileID FID = SM.getFileID(Loc);
976 const SrcMgr::SLocEntry *E = &SM.getSLocEntry(FID);
977 const SrcMgr::ExpansionInfo &Expansion = E->getExpansion();
978 Loc = Expansion.getExpansionLocStart();
979 if (!Expansion.isMacroArgExpansion())
980 break;
981
982 // For macro arguments we need to check that the argument did not come
983 // from an inner macro, e.g: "MAC1( MAC2(foo) )"
984
985 // Loc points to the argument id of the macro definition, move to the
986 // macro expansion.
987 Loc = SM.getImmediateExpansionRange(Loc).getBegin();
988 SourceLocation SpellLoc = Expansion.getSpellingLoc();
989 if (SpellLoc.isFileID())
990 break; // No inner macro.
991
992 // If spelling location resides in the same FileID as macro expansion
993 // location, it means there is no inner macro.
994 FileID MacroFID = SM.getFileID(Loc);
995 if (SM.isInFileID(SpellLoc, MacroFID))
996 break;
997
998 // Argument came from inner macro.
999 Loc = SpellLoc;
1000 }
1001
1002 // Find the spelling location of the start of the non-argument expansion
1003 // range. This is where the macro name was spelled in order to begin
1004 // expanding this macro.
1005 Loc = SM.getSpellingLoc(Loc);
1006
1007 // Dig out the buffer where the macro name was spelled and the extents of the
1008 // name so that we can render it into the expansion note.
1009 std::pair<FileID, unsigned> ExpansionInfo = SM.getDecomposedLoc(Loc);
1010 unsigned MacroTokenLength = Lexer::MeasureTokenLength(Loc, SM, LangOpts);
1011 StringRef ExpansionBuffer = SM.getBufferData(ExpansionInfo.first);
1012 return ExpansionBuffer.substr(ExpansionInfo.second, MacroTokenLength);
1013}
1014
1015StringRef Lexer::getImmediateMacroNameForDiagnostics(
1016 SourceLocation Loc, const SourceManager &SM, const LangOptions &LangOpts) {
1017 assert(Loc.isMacroID() && "Only reasonable to call this on macros")((Loc.isMacroID() && "Only reasonable to call this on macros"
) ? static_cast<void> (0) : __assert_fail ("Loc.isMacroID() && \"Only reasonable to call this on macros\""
, "/build/llvm-toolchain-snapshot-8~svn350071/tools/clang/lib/Lex/Lexer.cpp"
, 1017, __PRETTY_FUNCTION__))
;
1018 // Walk past macro argument expansions.
1019 while (SM.isMacroArgExpansion(Loc))
1020 Loc = SM.getImmediateExpansionRange(Loc).getBegin();
1021
1022 // If the macro's spelling has no FileID, then it's actually a token paste
1023 // or stringization (or similar) and not a macro at all.
1024 if (!SM.getFileEntryForID(SM.getFileID(SM.getSpellingLoc(Loc))))
1025 return {};
1026
1027 // Find the spelling location of the start of the non-argument expansion
1028 // range. This is where the macro name was spelled in order to begin
1029 // expanding this macro.
1030 Loc = SM.getSpellingLoc(SM.getImmediateExpansionRange(Loc).getBegin());
1031
1032 // Dig out the buffer where the macro name was spelled and the extents of the
1033 // name so that we can render it into the expansion note.
1034 std::pair<FileID, unsigned> ExpansionInfo = SM.getDecomposedLoc(Loc);
1035 unsigned MacroTokenLength = Lexer::MeasureTokenLength(Loc, SM, LangOpts);
1036 StringRef ExpansionBuffer = SM.getBufferData(ExpansionInfo.first);
1037 return ExpansionBuffer.substr(ExpansionInfo.second, MacroTokenLength);
1038}
1039
1040bool Lexer::isIdentifierBodyChar(char c, const LangOptions &LangOpts) {
1041 return isIdentifierBody(c, LangOpts.DollarIdents);
1042}
1043
1044bool Lexer::isNewLineEscaped(const char *BufferStart, const char *Str) {
1045 assert(isVerticalWhitespace(Str[0]))((isVerticalWhitespace(Str[0])) ? static_cast<void> (0)
: __assert_fail ("isVerticalWhitespace(Str[0])", "/build/llvm-toolchain-snapshot-8~svn350071/tools/clang/lib/Lex/Lexer.cpp"
, 1045, __PRETTY_FUNCTION__))
;
1046 if (Str - 1 < BufferStart)
1047 return false;
1048
1049 if ((Str[0] == '\n' && Str[-1] == '\r') ||
1050 (Str[0] == '\r' && Str[-1] == '\n')) {
1051 if (Str - 2 < BufferStart)
1052 return false;
1053 --Str;
1054 }
1055 --Str;
1056
1057 // Rewind to first non-space character:
1058 while (Str > BufferStart && isHorizontalWhitespace(*Str))
1059 --Str;
1060
1061 return *Str == '\\';
1062}
1063
1064StringRef Lexer::getIndentationForLine(SourceLocation Loc,
1065 const SourceManager &SM) {
1066 if (Loc.isInvalid() || Loc.isMacroID())
1067 return {};
1068 std::pair<FileID, unsigned> LocInfo = SM.getDecomposedLoc(Loc);
1069 if (LocInfo.first.isInvalid())
1070 return {};
1071 bool Invalid = false;
1072 StringRef Buffer = SM.getBufferData(LocInfo.first, &Invalid);
1073 if (Invalid)
1074 return {};
1075 const char *Line = findBeginningOfLine(Buffer, LocInfo.second);
1076 if (!Line)
1077 return {};
1078 StringRef Rest = Buffer.substr(Line - Buffer.data());
1079 size_t NumWhitespaceChars = Rest.find_first_not_of(" \t");
1080 return NumWhitespaceChars == StringRef::npos
1081 ? ""
1082 : Rest.take_front(NumWhitespaceChars);
1083}
1084
1085//===----------------------------------------------------------------------===//
1086// Diagnostics forwarding code.
1087//===----------------------------------------------------------------------===//
1088
1089/// GetMappedTokenLoc - If lexing out of a 'mapped buffer', where we pretend the
1090/// lexer buffer was all expanded at a single point, perform the mapping.
1091/// This is currently only used for _Pragma implementation, so it is the slow
1092/// path of the hot getSourceLocation method. Do not allow it to be inlined.
1093static LLVM_ATTRIBUTE_NOINLINE__attribute__((noinline)) SourceLocation GetMappedTokenLoc(
1094 Preprocessor &PP, SourceLocation FileLoc, unsigned CharNo, unsigned TokLen);
1095static SourceLocation GetMappedTokenLoc(Preprocessor &PP,
1096 SourceLocation FileLoc,
1097 unsigned CharNo, unsigned TokLen) {
1098 assert(FileLoc.isMacroID() && "Must be a macro expansion")((FileLoc.isMacroID() && "Must be a macro expansion")
? static_cast<void> (0) : __assert_fail ("FileLoc.isMacroID() && \"Must be a macro expansion\""
, "/build/llvm-toolchain-snapshot-8~svn350071/tools/clang/lib/Lex/Lexer.cpp"
, 1098, __PRETTY_FUNCTION__))
;
1099
1100 // Otherwise, we're lexing "mapped tokens". This is used for things like
1101 // _Pragma handling. Combine the expansion location of FileLoc with the
1102 // spelling location.
1103 SourceManager &SM = PP.getSourceManager();
1104
1105 // Create a new SLoc which is expanded from Expansion(FileLoc) but whose
1106 // characters come from spelling(FileLoc)+Offset.
1107 SourceLocation SpellingLoc = SM.getSpellingLoc(FileLoc);
1108 SpellingLoc = SpellingLoc.getLocWithOffset(CharNo);
1109
1110 // Figure out the expansion loc range, which is the range covered by the
1111 // original _Pragma(...) sequence.
1112 CharSourceRange II = SM.getImmediateExpansionRange(FileLoc);
1113
1114 return SM.createExpansionLoc(SpellingLoc, II.getBegin(), II.getEnd(), TokLen);
1115}
1116
1117/// getSourceLocation - Return a source location identifier for the specified
1118/// offset in the current file.
1119SourceLocation Lexer::getSourceLocation(const char *Loc,
1120 unsigned TokLen) const {
1121 assert(Loc >= BufferStart && Loc <= BufferEnd &&((Loc >= BufferStart && Loc <= BufferEnd &&
"Location out of range for this buffer!") ? static_cast<void
> (0) : __assert_fail ("Loc >= BufferStart && Loc <= BufferEnd && \"Location out of range for this buffer!\""
, "/build/llvm-toolchain-snapshot-8~svn350071/tools/clang/lib/Lex/Lexer.cpp"
, 1122, __PRETTY_FUNCTION__))
1122 "Location out of range for this buffer!")((Loc >= BufferStart && Loc <= BufferEnd &&
"Location out of range for this buffer!") ? static_cast<void
> (0) : __assert_fail ("Loc >= BufferStart && Loc <= BufferEnd && \"Location out of range for this buffer!\""
, "/build/llvm-toolchain-snapshot-8~svn350071/tools/clang/lib/Lex/Lexer.cpp"
, 1122, __PRETTY_FUNCTION__))
;
1123
1124 // In the normal case, we're just lexing from a simple file buffer, return
1125 // the file id from FileLoc with the offset specified.
1126 unsigned CharNo = Loc-BufferStart;
1127 if (FileLoc.isFileID())
1128 return FileLoc.getLocWithOffset(CharNo);
1129
1130 // Otherwise, this is the _Pragma lexer case, which pretends that all of the
1131 // tokens are lexed from where the _Pragma was defined.
1132 assert(PP && "This doesn't work on raw lexers")((PP && "This doesn't work on raw lexers") ? static_cast
<void> (0) : __assert_fail ("PP && \"This doesn't work on raw lexers\""
, "/build/llvm-toolchain-snapshot-8~svn350071/tools/clang/lib/Lex/Lexer.cpp"
, 1132, __PRETTY_FUNCTION__))
;
1133 return GetMappedTokenLoc(*PP, FileLoc, CharNo, TokLen);
1134}
1135
1136/// Diag - Forwarding function for diagnostics. This translate a source
1137/// position in the current buffer into a SourceLocation object for rendering.
1138DiagnosticBuilder Lexer::Diag(const char *Loc, unsigned DiagID) const {
1139 return PP->Diag(getSourceLocation(Loc), DiagID);
1140}
1141
1142//===----------------------------------------------------------------------===//
1143// Trigraph and Escaped Newline Handling Code.
1144//===----------------------------------------------------------------------===//
1145
/// GetTrigraphCharForLetter - Given a character that occurs after a ?? pair,
/// return the decoded trigraph letter it corresponds to, or '\0' if nothing.
static char GetTrigraphCharForLetter(char Letter) {
  // Third-character -> replacement table for the nine standard trigraphs.
  static const char TrigraphLetters[] = "=)(!'>/<-";
  static const char TrigraphResults[] = "#][|^}\\{~";
  for (const char *P = TrigraphLetters; *P; ++P)
    if (*P == Letter)
      return TrigraphResults[P - TrigraphLetters];
  return 0;
}
1162
1163/// DecodeTrigraphChar - If the specified character is a legal trigraph when
1164/// prefixed with ??, emit a trigraph warning. If trigraphs are enabled,
1165/// return the result character. Finally, emit a warning about trigraph use
1166/// whether trigraphs are enabled or not.
1167static char DecodeTrigraphChar(const char *CP, Lexer *L) {
1168 char Res = GetTrigraphCharForLetter(*CP);
1169 if (!Res || !L) return Res;
1170
1171 if (!L->getLangOpts().Trigraphs) {
1172 if (!L->isLexingRawMode())
1173 L->Diag(CP-2, diag::trigraph_ignored);
1174 return 0;
1175 }
1176
1177 if (!L->isLexingRawMode())
1178 L->Diag(CP-2, diag::trigraph_converted) << StringRef(&Res, 1);
1179 return Res;
1180}
1181
1182/// getEscapedNewLineSize - Return the size of the specified escaped newline,
1183/// or 0 if it is not an escaped newline. P[-1] is known to be a "\" or a
1184/// trigraph equivalent on entry to this function.
1185unsigned Lexer::getEscapedNewLineSize(const char *Ptr) {
1186 unsigned Size = 0;
1187 while (isWhitespace(Ptr[Size])) {
1188 ++Size;
1189
1190 if (Ptr[Size-1] != '\n' && Ptr[Size-1] != '\r')
1191 continue;
1192
1193 // If this is a \r\n or \n\r, skip the other half.
1194 if ((Ptr[Size] == '\r' || Ptr[Size] == '\n') &&
1195 Ptr[Size-1] != Ptr[Size])
1196 ++Size;
1197
1198 return Size;
1199 }
1200
1201 // Not an escaped newline, must be a \t or something else.
1202 return 0;
1203}
1204
1205/// SkipEscapedNewLines - If P points to an escaped newline (or a series of
1206/// them), skip over them and return the first non-escaped-newline found,
1207/// otherwise return P.
1208const char *Lexer::SkipEscapedNewLines(const char *P) {
1209 while (true) {
1210 const char *AfterEscape;
1211 if (*P == '\\') {
1212 AfterEscape = P+1;
1213 } else if (*P == '?') {
1214 // If not a trigraph for escape, bail out.
1215 if (P[1] != '?' || P[2] != '/')
1216 return P;
1217 // FIXME: Take LangOpts into account; the language might not
1218 // support trigraphs.
1219 AfterEscape = P+3;
1220 } else {
1221 return P;
1222 }
1223
1224 unsigned NewLineSize = Lexer::getEscapedNewLineSize(AfterEscape);
1225 if (NewLineSize == 0) return P;
1226 P = AfterEscape+NewLineSize;
1227 }
1228}
1229
1230Optional<Token> Lexer::findNextToken(SourceLocation Loc,
1231 const SourceManager &SM,
1232 const LangOptions &LangOpts) {
1233 if (Loc.isMacroID()) {
1234 if (!Lexer::isAtEndOfMacroExpansion(Loc, SM, LangOpts, &Loc))
1235 return None;
1236 }
1237 Loc = Lexer::getLocForEndOfToken(Loc, 0, SM, LangOpts);
1238
1239 // Break down the source location.
1240 std::pair<FileID, unsigned> LocInfo = SM.getDecomposedLoc(Loc);
1241
1242 // Try to load the file buffer.
1243 bool InvalidTemp = false;
1244 StringRef File = SM.getBufferData(LocInfo.first, &InvalidTemp);
1245 if (InvalidTemp)
1246 return None;
1247
1248 const char *TokenBegin = File.data() + LocInfo.second;
1249
1250 // Lex from the start of the given location.
1251 Lexer lexer(SM.getLocForStartOfFile(LocInfo.first), LangOpts, File.begin(),
1252 TokenBegin, File.end());
1253 // Find the token.
1254 Token Tok;
1255 lexer.LexFromRawLexer(Tok);
1256 return Tok;
1257}
1258
1259/// Checks that the given token is the first token that occurs after the
1260/// given location (this excludes comments and whitespace). Returns the location
1261/// immediately after the specified token. If the token is not found or the
1262/// location is inside a macro, the returned source location will be invalid.
1263SourceLocation Lexer::findLocationAfterToken(
1264 SourceLocation Loc, tok::TokenKind TKind, const SourceManager &SM,
1265 const LangOptions &LangOpts, bool SkipTrailingWhitespaceAndNewLine) {
1266 Optional<Token> Tok = findNextToken(Loc, SM, LangOpts);
1267 if (!Tok || Tok->isNot(TKind))
1268 return {};
1269 SourceLocation TokenLoc = Tok->getLocation();
1270
1271 // Calculate how much whitespace needs to be skipped if any.
1272 unsigned NumWhitespaceChars = 0;
1273 if (SkipTrailingWhitespaceAndNewLine) {
1274 const char *TokenEnd = SM.getCharacterData(TokenLoc) + Tok->getLength();
1275 unsigned char C = *TokenEnd;
1276 while (isHorizontalWhitespace(C)) {
1277 C = *(++TokenEnd);
1278 NumWhitespaceChars++;
1279 }
1280
1281 // Skip \r, \n, \r\n, or \n\r
1282 if (C == '\n' || C == '\r') {
1283 char PrevC = C;
1284 C = *(++TokenEnd);
1285 NumWhitespaceChars++;
1286 if ((C == '\n' || C == '\r') && C != PrevC)
1287 NumWhitespaceChars++;
1288 }
1289 }
1290
1291 return TokenLoc.getLocWithOffset(Tok->getLength() + NumWhitespaceChars);
1292}
1293
1294/// getCharAndSizeSlow - Peek a single 'character' from the specified buffer,
1295/// get its size, and return it. This is tricky in several cases:
1296/// 1. If currently at the start of a trigraph, we warn about the trigraph,
1297/// then either return the trigraph (skipping 3 chars) or the '?',
1298/// depending on whether trigraphs are enabled or not.
1299/// 2. If this is an escaped newline (potentially with whitespace between
1300/// the backslash and newline), implicitly skip the newline and return
1301/// the char after it.
1302///
1303/// This handles the slow/uncommon case of the getCharAndSize method. Here we
1304/// know that we can accumulate into Size, and that we have already incremented
1305/// Ptr by Size bytes.
1306///
1307/// NOTE: When this method is updated, getCharAndSizeSlowNoWarn (below) should
1308/// be updated to match.
1309char Lexer::getCharAndSizeSlow(const char *Ptr, unsigned &Size,
1310 Token *Tok) {
1311 // If we have a slash, look for an escaped newline.
1312 if (Ptr[0] == '\\') {
8
Taking true branch
1313 ++Size;
1314 ++Ptr;
1315Slash:
1316 // Common case, backslash-char where the char is not whitespace.
1317 if (!isWhitespace(Ptr[0])) return '\\';
9
Taking false branch
1318
1319 // See if we have optional whitespace characters between the slash and
1320 // newline.
1321 if (unsigned EscapedNewLineSize = getEscapedNewLineSize(Ptr)) {
10
Assuming 'EscapedNewLineSize' is not equal to 0
11
Taking true branch
1322 // Remember that this token needs to be cleaned.
1323 if (Tok) Tok->setFlag(Token::NeedsCleaning);
12
Taking true branch
13
Calling 'Token::setFlag'
1324
1325 // Warn if there was whitespace between the backslash and newline.
1326 if (Ptr[0] != '\n' && Ptr[0] != '\r' && Tok && !isLexingRawMode())
1327 Diag(Ptr, diag::backslash_newline_space);
1328
1329 // Found backslash<whitespace><newline>. Parse the char after it.
1330 Size += EscapedNewLineSize;
1331 Ptr += EscapedNewLineSize;
1332
1333 // Use slow version to accumulate a correct size field.
1334 return getCharAndSizeSlow(Ptr, Size, Tok);
1335 }
1336
1337 // Otherwise, this is not an escaped newline, just return the slash.
1338 return '\\';
1339 }
1340
1341 // If this is a trigraph, process it.
1342 if (Ptr[0] == '?' && Ptr[1] == '?') {
1343 // If this is actually a legal trigraph (not something like "??x"), emit
1344 // a trigraph warning. If so, and if trigraphs are enabled, return it.
1345 if (char C = DecodeTrigraphChar(Ptr+2, Tok ? this : nullptr)) {
1346 // Remember that this token needs to be cleaned.
1347 if (Tok) Tok->setFlag(Token::NeedsCleaning);
1348
1349 Ptr += 3;
1350 Size += 3;
1351 if (C == '\\') goto Slash;
1352 return C;
1353 }
1354 }
1355
1356 // If this is neither, return a single character.
1357 ++Size;
1358 return *Ptr;
1359}
1360
1361/// getCharAndSizeSlowNoWarn - Handle the slow/uncommon case of the
1362/// getCharAndSizeNoWarn method. Here we know that we can accumulate into Size,
1363/// and that we have already incremented Ptr by Size bytes.
1364///
1365/// NOTE: When this method is updated, getCharAndSizeSlow (above) should
1366/// be updated to match.
1367char Lexer::getCharAndSizeSlowNoWarn(const char *Ptr, unsigned &Size,
1368 const LangOptions &LangOpts) {
1369 // If we have a slash, look for an escaped newline.
1370 if (Ptr[0] == '\\') {
1371 ++Size;
1372 ++Ptr;
1373Slash:
1374 // Common case, backslash-char where the char is not whitespace.
1375 if (!isWhitespace(Ptr[0])) return '\\';
1376
1377 // See if we have optional whitespace characters followed by a newline.
1378 if (unsigned EscapedNewLineSize = getEscapedNewLineSize(Ptr)) {
1379 // Found backslash<whitespace><newline>. Parse the char after it.
1380 Size += EscapedNewLineSize;
1381 Ptr += EscapedNewLineSize;
1382
1383 // Use slow version to accumulate a correct size field.
1384 return getCharAndSizeSlowNoWarn(Ptr, Size, LangOpts);
1385 }
1386
1387 // Otherwise, this is not an escaped newline, just return the slash.
1388 return '\\';
1389 }
1390
1391 // If this is a trigraph, process it.
1392 if (LangOpts.Trigraphs && Ptr[0] == '?' && Ptr[1] == '?') {
1393 // If this is actually a legal trigraph (not something like "??x"), return
1394 // it.
1395 if (char C = GetTrigraphCharForLetter(Ptr[2])) {
1396 Ptr += 3;
1397 Size += 3;
1398 if (C == '\\') goto Slash;
1399 return C;
1400 }
1401 }
1402
1403 // If this is neither, return a single character.
1404 ++Size;
1405 return *Ptr;
1406}
1407
1408//===----------------------------------------------------------------------===//
1409// Helper methods for lexing.
1410//===----------------------------------------------------------------------===//
1411
1412/// Routine that indiscriminately sets the offset into the source file.
1413void Lexer::SetByteOffset(unsigned Offset, bool StartOfLine) {
1414 BufferPtr = BufferStart + Offset;
1415 if (BufferPtr > BufferEnd)
1416 BufferPtr = BufferEnd;
1417 // FIXME: What exactly does the StartOfLine bit mean? There are two
1418 // possible meanings for the "start" of the line: the first token on the
1419 // unexpanded line, or the first token on the expanded line.
1420 IsAtStartOfLine = StartOfLine;
1421 IsAtPhysicalStartOfLine = StartOfLine;
1422}
1423
1424static bool isAllowedIDChar(uint32_t C, const LangOptions &LangOpts) {
1425 if (LangOpts.AsmPreprocessor) {
1426 return false;
1427 } else if (LangOpts.CPlusPlus11 || LangOpts.C11) {
1428 static const llvm::sys::UnicodeCharSet C11AllowedIDChars(
1429 C11AllowedIDCharRanges);
1430 return C11AllowedIDChars.contains(C);
1431 } else if (LangOpts.CPlusPlus) {
1432 static const llvm::sys::UnicodeCharSet CXX03AllowedIDChars(
1433 CXX03AllowedIDCharRanges);
1434 return CXX03AllowedIDChars.contains(C);
1435 } else {
1436 static const llvm::sys::UnicodeCharSet C99AllowedIDChars(
1437 C99AllowedIDCharRanges);
1438 return C99AllowedIDChars.contains(C);
1439 }
1440}
1441
1442static bool isAllowedInitiallyIDChar(uint32_t C, const LangOptions &LangOpts) {
1443 assert(isAllowedIDChar(C, LangOpts))((isAllowedIDChar(C, LangOpts)) ? static_cast<void> (0)
: __assert_fail ("isAllowedIDChar(C, LangOpts)", "/build/llvm-toolchain-snapshot-8~svn350071/tools/clang/lib/Lex/Lexer.cpp"
, 1443, __PRETTY_FUNCTION__))
;
1444 if (LangOpts.AsmPreprocessor) {
1445 return false;
1446 } else if (LangOpts.CPlusPlus11 || LangOpts.C11) {
1447 static const llvm::sys::UnicodeCharSet C11DisallowedInitialIDChars(
1448 C11DisallowedInitialIDCharRanges);
1449 return !C11DisallowedInitialIDChars.contains(C);
1450 } else if (LangOpts.CPlusPlus) {
1451 return true;
1452 } else {
1453 static const llvm::sys::UnicodeCharSet C99DisallowedInitialIDChars(
1454 C99DisallowedInitialIDCharRanges);
1455 return !C99DisallowedInitialIDChars.contains(C);
1456 }
1457}
1458
1459static inline CharSourceRange makeCharRange(Lexer &L, const char *Begin,
1460 const char *End) {
1461 return CharSourceRange::getCharRange(L.getSourceLocation(Begin),
1462 L.getSourceLocation(End));
1463}
1464
/// Warn when an identifier character accepted in the current language mode
/// would have been rejected by C99 or C++98, so code stays portable to older
/// standards. \p IsFirst indicates the character begins the identifier.
static void maybeDiagnoseIDCharCompat(DiagnosticsEngine &Diags, uint32_t C,
                                      CharSourceRange Range, bool IsFirst) {
  // Check C99 compatibility.
  if (!Diags.isIgnored(diag::warn_c99_compat_unicode_id, Range.getBegin())) {
    enum {
      CannotAppearInIdentifier = 0,
      CannotStartIdentifier
    };

    static const llvm::sys::UnicodeCharSet C99AllowedIDChars(
        C99AllowedIDCharRanges);
    static const llvm::sys::UnicodeCharSet C99DisallowedInitialIDChars(
        C99DisallowedInitialIDCharRanges);
    if (!C99AllowedIDChars.contains(C)) {
      Diags.Report(Range.getBegin(), diag::warn_c99_compat_unicode_id)
        << Range
        << CannotAppearInIdentifier;
    } else if (IsFirst && C99DisallowedInitialIDChars.contains(C)) {
      // Allowed somewhere in a C99 identifier, just not as its first char.
      Diags.Report(Range.getBegin(), diag::warn_c99_compat_unicode_id)
        << Range
        << CannotStartIdentifier;
    }
  }

  // Check C++98 compatibility.
  if (!Diags.isIgnored(diag::warn_cxx98_compat_unicode_id, Range.getBegin())) {
    static const llvm::sys::UnicodeCharSet CXX03AllowedIDChars(
        CXX03AllowedIDCharRanges);
    if (!CXX03AllowedIDChars.contains(C)) {
      Diags.Report(Range.getBegin(), diag::warn_cxx98_compat_unicode_id)
        << Range;
    }
  }
}
1499
/// After encountering UTF-8 character C and interpreting it as an identifier
/// character, check whether it's a homoglyph for a common non-identifier
/// source character that is unlikely to be an intentional identifier
/// character and warn if so.
static void maybeDiagnoseUTF8Homoglyph(DiagnosticsEngine &Diags, uint32_t C,
                                       CharSourceRange Range) {
  // FIXME: Handle Unicode quotation marks (smart quotes, fullwidth quotes).
  struct HomoglyphPair {
    uint32_t Character;
    char LooksLike; // ASCII look-alike, or 0 for zero-width/invisible chars.
    bool operator<(HomoglyphPair R) const { return Character < R.Character; }
  };
  // Must stay sorted by Character: std::lower_bound below depends on it.
  static constexpr HomoglyphPair SortedHomoglyphs[] = {
    {U'\u00ad', 0},   // SOFT HYPHEN
    {U'\u01c3', '!'}, // LATIN LETTER RETROFLEX CLICK
    {U'\u037e', ';'}, // GREEK QUESTION MARK
    {U'\u200b', 0},   // ZERO WIDTH SPACE
    {U'\u200c', 0},   // ZERO WIDTH NON-JOINER
    {U'\u200d', 0},   // ZERO WIDTH JOINER
    {U'\u2060', 0},   // WORD JOINER
    {U'\u2061', 0},   // FUNCTION APPLICATION
    {U'\u2062', 0},   // INVISIBLE TIMES
    {U'\u2063', 0},   // INVISIBLE SEPARATOR
    {U'\u2064', 0},   // INVISIBLE PLUS
    {U'\u2212', '-'}, // MINUS SIGN
    {U'\u2215', '/'}, // DIVISION SLASH
    {U'\u2216', '\\'}, // SET MINUS
    {U'\u2217', '*'}, // ASTERISK OPERATOR
    {U'\u2223', '|'}, // DIVIDES
    {U'\u2227', '^'}, // LOGICAL AND
    {U'\u2236', ':'}, // RATIO
    {U'\u223c', '~'}, // TILDE OPERATOR
    {U'\ua789', ':'}, // MODIFIER LETTER COLON
    {U'\ufeff', 0},   // ZERO WIDTH NO-BREAK SPACE
    {U'\uff01', '!'}, // FULLWIDTH EXCLAMATION MARK
    {U'\uff03', '#'}, // FULLWIDTH NUMBER SIGN
    {U'\uff04', '$'}, // FULLWIDTH DOLLAR SIGN
    {U'\uff05', '%'}, // FULLWIDTH PERCENT SIGN
    {U'\uff06', '&'}, // FULLWIDTH AMPERSAND
    {U'\uff08', '('}, // FULLWIDTH LEFT PARENTHESIS
    {U'\uff09', ')'}, // FULLWIDTH RIGHT PARENTHESIS
    {U'\uff0a', '*'}, // FULLWIDTH ASTERISK
    {U'\uff0b', '+'}, // FULLWIDTH PLUS SIGN
    {U'\uff0c', ','}, // FULLWIDTH COMMA
    {U'\uff0d', '-'}, // FULLWIDTH HYPHEN-MINUS
    {U'\uff0e', '.'}, // FULLWIDTH FULL STOP
    {U'\uff0f', '/'}, // FULLWIDTH SOLIDUS
    {U'\uff1a', ':'}, // FULLWIDTH COLON
    {U'\uff1b', ';'}, // FULLWIDTH SEMICOLON
    {U'\uff1c', '<'}, // FULLWIDTH LESS-THAN SIGN
    {U'\uff1d', '='}, // FULLWIDTH EQUALS SIGN
    {U'\uff1e', '>'}, // FULLWIDTH GREATER-THAN SIGN
    {U'\uff1f', '?'}, // FULLWIDTH QUESTION MARK
    {U'\uff20', '@'}, // FULLWIDTH COMMERCIAL AT
    {U'\uff3b', '['}, // FULLWIDTH LEFT SQUARE BRACKET
    {U'\uff3c', '\\'}, // FULLWIDTH REVERSE SOLIDUS
    {U'\uff3d', ']'}, // FULLWIDTH RIGHT SQUARE BRACKET
    {U'\uff3e', '^'}, // FULLWIDTH CIRCUMFLEX ACCENT
    {U'\uff5b', '{'}, // FULLWIDTH LEFT CURLY BRACKET
    {U'\uff5c', '|'}, // FULLWIDTH VERTICAL LINE
    {U'\uff5d', '}'}, // FULLWIDTH RIGHT CURLY BRACKET
    {U'\uff5e', '~'}, // FULLWIDTH TILDE
    {0, 0}            // Sentinel; excluded from the search range below.
  };
  // Search everything except the trailing {0,0} sentinel (end - 1), so the
  // returned iterator is always dereferenceable.
  auto Homoglyph =
      std::lower_bound(std::begin(SortedHomoglyphs),
                       std::end(SortedHomoglyphs) - 1, HomoglyphPair{C, '\0'});
  if (Homoglyph->Character == C) {
    llvm::SmallString<5> CharBuf;
    {
      // Format the code point as 4+ uppercase hex digits for the diagnostic.
      llvm::raw_svector_ostream CharOS(CharBuf);
      llvm::write_hex(CharOS, C, llvm::HexPrintStyle::Upper, 4);
    }
    if (Homoglyph->LooksLike) {
      const char LooksLikeStr[] = {Homoglyph->LooksLike, 0};
      Diags.Report(Range.getBegin(), diag::warn_utf8_symbol_homoglyph)
          << Range << CharBuf << LooksLikeStr;
    } else {
      Diags.Report(Range.getBegin(), diag::warn_utf8_symbol_zero_width)
          << Range << CharBuf;
    }
  }
}
1583
/// Try to consume a universal-character-name (\uXXXX / \UXXXXXXXX) starting
/// at CurPtr that is a valid identifier character. On success CurPtr is
/// advanced past the UCN, Result is flagged as containing a UCN, and true is
/// returned; on failure CurPtr is untouched and false is returned.
bool Lexer::tryConsumeIdentifierUCN(const char *&CurPtr, unsigned Size,
                                    Token &Result) {
  const char *UCNPtr = CurPtr + Size;
  // tryReadUCN returns 0 when no valid UCN is present.
  uint32_t CodePoint = tryReadUCN(UCNPtr, CurPtr, /*Token=*/nullptr);
  if (CodePoint == 0 || !isAllowedIDChar(CodePoint, LangOpts))
    return false;

  if (!isLexingRawMode())
    maybeDiagnoseIDCharCompat(PP->getDiagnostics(), CodePoint,
                              makeCharRange(*this, CurPtr, UCNPtr),
                              /*IsFirst=*/false);

  // NOTE(review): the static analyzer reports that setFlag below can OR into
  // an uninitialized Token::Flags (Token.h:235) on some path — presumably a
  // caller reaching here with a Token never initialized via startToken().
  // Verify all call sites start the token before lexing into it.
  Result.setFlag(Token::HasUCN);
  // If the UCN occupies exactly its spelled length (6 chars for \uXXXX, 10
  // for \UXXXXXXXX), there were no trigraphs/escaped newlines inside it and
  // we can jump straight past it. Otherwise re-read it character by
  // character via getAndAdvanceChar so Result's bookkeeping stays correct.
  if ((UCNPtr - CurPtr == 6 && CurPtr[1] == 'u') ||
      (UCNPtr - CurPtr == 10 && CurPtr[1] == 'U'))
    CurPtr = UCNPtr;
  else
    while (CurPtr != UCNPtr)
      (void)getAndAdvanceChar(CurPtr, Result);
  return true;
}
1605
/// Try to consume a raw (non-UCN) UTF-8 encoded character at CurPtr that is a
/// valid identifier character. On success CurPtr is advanced past the
/// multi-byte sequence and true is returned; otherwise CurPtr is untouched.
bool Lexer::tryConsumeIdentifierUTF8Char(const char *&CurPtr) {
  const char *UnicodePtr = CurPtr;
  llvm::UTF32 CodePoint;
  // Strict conversion: rejects ill-formed/overlong sequences and sequences
  // that run off the end of the buffer.
  llvm::ConversionResult Result =
      llvm::convertUTF8Sequence((const llvm::UTF8 **)&UnicodePtr,
                                (const llvm::UTF8 *)BufferEnd,
                                &CodePoint,
                                llvm::strictConversion);
  if (Result != llvm::conversionOK ||
      !isAllowedIDChar(static_cast<uint32_t>(CodePoint), LangOpts))
    return false;

  if (!isLexingRawMode()) {
    maybeDiagnoseIDCharCompat(PP->getDiagnostics(), CodePoint,
                              makeCharRange(*this, CurPtr, UnicodePtr),
                              /*IsFirst=*/false);
    // Raw UTF-8 (unlike a UCN spelling) may be a confusable look-alike the
    // user typed by accident; warn on known homoglyphs.
    maybeDiagnoseUTF8Homoglyph(PP->getDiagnostics(), CodePoint,
                               makeCharRange(*this, CurPtr, UnicodePtr));
  }

  CurPtr = UnicodePtr;
  return true;
}
1629
/// Lex the rest of a raw identifier. The first character ([_A-Za-z], or '$'
/// when dollars are enabled) has already been consumed; CurPtr points at the
/// second. Forms a raw_identifier token and, outside raw mode, resolves it
/// through the preprocessor (keyword/macro handling, code completion).
bool Lexer::LexIdentifier(Token &Result, const char *CurPtr) {
  // Match [_A-Za-z0-9]*, we have already matched [_A-Za-z$]
  unsigned Size;
  unsigned char C = *CurPtr++;
  while (isIdentifierBody(C))
    C = *CurPtr++;

  --CurPtr;   // Back up over the skipped character.

  // Fast path, no $,\,? in identifier found.  '\' might be an escaped newline
  // or UCN, and ? might be a trigraph for '\', an escaped newline or UCN.
  //
  // TODO: Could merge these checks into an InfoTable flag to make the
  // comparison cheaper
  if (isASCII(C) && C != '\\' && C != '?' &&
      (C != '$' || !LangOpts.DollarIdents)) {
FinishIdentifier:   // Also jumped to from the slow path when the id ends.
    const char *IdStart = BufferPtr;
    FormTokenWithChars(Result, CurPtr, tok::raw_identifier);
    Result.setRawIdentifierData(IdStart);

    // If we are in raw mode, return this identifier raw.  There is no need to
    // look up identifier information or attempt to macro expand it.
    if (LexingRawMode)
      return true;

    // Fill in Result.IdentifierInfo and update the token kind,
    // looking up the identifier in the identifier table.
    IdentifierInfo *II = PP->LookUpIdentifierInfo(Result);
    // Note that we have to call PP->LookUpIdentifierInfo() even for code
    // completion, it writes IdentifierInfo into Result, and callers rely on
    // it.

    // If the completion point is at the end of an identifier, we want to
    // treat the identifier as incomplete even if it resolves to a macro or a
    // keyword. This allows e.g. 'class^' to complete to 'classifier'.
    if (isCodeCompletionPoint(CurPtr)) {
      // Return the code-completion token.
      Result.setKind(tok::code_completion);
      // Skip the code-completion char and all immediate identifier
      // characters. This ensures we get consistent behavior when completing
      // at any point in an identifier (i.e. at the start, in the middle, at
      // the end). Note that only simple cases (i.e. [a-zA-Z0-9_]) are
      // supported to keep the code simpler.
      assert(*CurPtr == 0 && "Completion character must be 0");
      ++CurPtr;
      // Note that the code completion token is not added as a separate
      // character when the completion point is at the end of the buffer.
      // Therefore, we need to check if the buffer has ended.
      if (CurPtr < BufferEnd) {
        while (isIdentifierBody(*CurPtr))
          ++CurPtr;
      }
      BufferPtr = CurPtr;
      return true;
    }

    // Finally, now that we know we have an identifier, pass this off to the
    // preprocessor, which may macro expand it or something.
    if (II->isHandleIdentifierCase())
      return PP->HandleIdentifier(Result);

    return true;
  }

  // Otherwise, $,\,? in identifier found.  Enter slower path.

  C = getCharAndSize(CurPtr, Size);
  while (true) {
    if (C == '$') {
      // If we hit a $ and they are not supported in identifiers, we are done.
      if (!LangOpts.DollarIdents) goto FinishIdentifier;

      // Otherwise, emit a diagnostic and continue.
      if (!isLexingRawMode())
        Diag(CurPtr, diag::ext_dollar_in_identifier);
      CurPtr = ConsumeChar(CurPtr, Size, Result);
      C = getCharAndSize(CurPtr, Size);
      continue;
    } else if (C == '\\' && tryConsumeIdentifierUCN(CurPtr, Size, Result)) {
      // Consumed a \uXXXX / \UXXXXXXXX identifier character.
      C = getCharAndSize(CurPtr, Size);
      continue;
    } else if (!isASCII(C) && tryConsumeIdentifierUTF8Char(CurPtr)) {
      // Consumed a raw UTF-8 identifier character.
      C = getCharAndSize(CurPtr, Size);
      continue;
    } else if (!isIdentifierBody(C)) {
      // Found the end of the identifier; finish it via the shared exit.
      goto FinishIdentifier;
    }

    // Otherwise, this character is good, consume it.
    CurPtr = ConsumeChar(CurPtr, Size, Result);

    // Consume a run of ordinary identifier characters before re-checking the
    // special cases above.
    C = getCharAndSize(CurPtr, Size);
    while (isIdentifierBody(C)) {
      CurPtr = ConsumeChar(CurPtr, Size, Result);
      C = getCharAndSize(CurPtr, Size);
    }
  }
}
1728
1729/// isHexaLiteral - Return true if Start points to a hex constant.
1730/// in microsoft mode (where this is supposed to be several different tokens).
1731bool Lexer::isHexaLiteral(const char *Start, const LangOptions &LangOpts) {
1732 unsigned Size;
1733 char C1 = Lexer::getCharAndSizeNoWarn(Start, Size, LangOpts);
1734 if (C1 != '0')
1735 return false;
1736 char C2 = Lexer::getCharAndSizeNoWarn(Start + Size, Size, LangOpts);
1737 return (C2 == 'x' || C2 == 'X');
1738}
1739
/// LexNumericConstant - Lex the remainder of a integer or floating point
/// constant. From[-1] is the first character lexed.  Return the end of the
/// constant. Recurses (tail-style) to continue the pp-number after signs,
/// digit separators, and UCN/UTF-8 suffix characters.
bool Lexer::LexNumericConstant(Token &Result, const char *CurPtr) {
  unsigned Size;
  char C = getCharAndSize(CurPtr, Size);
  char PrevCh = 0;
  // Greedily consume the pp-number body ([0-9a-zA-Z._] per
  // isPreprocessingNumberBody), tracking the previous char for e+/p+ checks.
  while (isPreprocessingNumberBody(C)) {
    CurPtr = ConsumeChar(CurPtr, Size, Result);
    PrevCh = C;
    C = getCharAndSize(CurPtr, Size);
  }

  // If we fell out, check for a sign, due to 1e+12.  If we have one, continue.
  if ((C == '-' || C == '+') && (PrevCh == 'E' || PrevCh == 'e')) {
    // If we are in Microsoft mode, don't continue if the constant is hex.
    // For example, MSVC will accept the following as 3 tokens: 0x1234567e+1
    if (!LangOpts.MicrosoftExt || !isHexaLiteral(BufferPtr, LangOpts))
      return LexNumericConstant(Result, ConsumeChar(CurPtr, Size, Result));
  }

  // If we have a hex FP constant, continue.
  if ((C == '-' || C == '+') && (PrevCh == 'P' || PrevCh == 'p')) {
    // Outside C99 and C++17, we accept hexadecimal floating point numbers as
    // a not-quite-conforming extension. Only do so if this looks like it's
    // actually meant to be a hexfloat, and not if it has a ud-suffix.
    bool IsHexFloat = true;
    if (!LangOpts.C99) {
      if (!isHexaLiteral(BufferPtr, LangOpts))
        IsHexFloat = false;
      else if (!getLangOpts().CPlusPlus17 &&
               std::find(BufferPtr, CurPtr, '_') != CurPtr)
        // An '_' before the 'p' suggests a ud-suffix, not a hexfloat.
        IsHexFloat = false;
    }
    if (IsHexFloat)
      return LexNumericConstant(Result, ConsumeChar(CurPtr, Size, Result));
  }

  // If we have a digit separator, continue.
  if (C == '\'' && getLangOpts().CPlusPlus14) {
    unsigned NextSize;
    char Next = getCharAndSizeNoWarn(CurPtr + Size, NextSize, getLangOpts());
    // Only treat ' as a separator when it is followed by another identifier
    // character; otherwise it starts a character literal.
    if (isIdentifierBody(Next)) {
      if (!isLexingRawMode())
        Diag(CurPtr, diag::warn_cxx11_compat_digit_separator);
      CurPtr = ConsumeChar(CurPtr, Size, Result);
      CurPtr = ConsumeChar(CurPtr, NextSize, Result);
      return LexNumericConstant(Result, CurPtr);
    }
  }

  // If we have a UCN or UTF-8 character (perhaps in a ud-suffix), continue.
  if (C == '\\' && tryConsumeIdentifierUCN(CurPtr, Size, Result))
    return LexNumericConstant(Result, CurPtr);
  if (!isASCII(C) && tryConsumeIdentifierUTF8Char(CurPtr))
    return LexNumericConstant(Result, CurPtr);

  // Update the location of token as well as BufferPtr.
  const char *TokStart = BufferPtr;
  FormTokenWithChars(Result, CurPtr, tok::numeric_constant);
  Result.setLiteralData(TokStart);
  return true;
}
1803
/// LexUDSuffix - Lex the ud-suffix production for user-defined literal
/// suffixes in C++11, or warn on a ud-suffix in C++98. Returns the pointer
/// past the suffix (== CurPtr when there is none).
const char *Lexer::LexUDSuffix(Token &Result, const char *CurPtr,
                               bool IsStringLiteral) {
  assert(getLangOpts().CPlusPlus);

  // Maximally munch an identifier.
  unsigned Size;
  char C = getCharAndSize(CurPtr, Size);
  bool Consumed = false;

  if (!isIdentifierHead(C)) {
    // The suffix may still start with a UCN or raw UTF-8 character.
    if (C == '\\' && tryConsumeIdentifierUCN(CurPtr, Size, Result))
      Consumed = true;
    else if (!isASCII(C) && tryConsumeIdentifierUTF8Char(CurPtr))
      Consumed = true;
    else
      return CurPtr; // No ud-suffix here.
  }

  if (!getLangOpts().CPlusPlus11) {
    // Pre-C++11: a suffix here would change meaning in C++11, warn only.
    if (!isLexingRawMode())
      Diag(CurPtr,
           C == '_' ? diag::warn_cxx11_compat_user_defined_literal
                    : diag::warn_cxx11_compat_reserved_user_defined_literal)
        << FixItHint::CreateInsertion(getSourceLocation(CurPtr), " ");
    return CurPtr;
  }

  // C++11 [lex.ext]p10, [usrlit.suffix]p1: A program containing a ud-suffix
  // that does not start with an underscore is ill-formed. As a conforming
  // extension, we treat all such suffixes as if they had whitespace before
  // them. We assume a suffix beginning with a UCN or UTF-8 character is more
  // likely to be a ud-suffix than a macro, however, and accept that.
  if (!Consumed) {
    bool IsUDSuffix = false;
    if (C == '_')
      IsUDSuffix = true;
    else if (IsStringLiteral && getLangOpts().CPlusPlus14) {
      // In C++1y, we need to look ahead a few characters to see if this is a
      // valid suffix for a string literal or a numeric literal (this could be
      // the 'operator""if' defining a numeric literal operator).
      const unsigned MaxStandardSuffixLength = 3;
      char Buffer[MaxStandardSuffixLength] = { C };
      // NOTE(review): this inner 'Consumed' (byte count) shadows the outer
      // bool 'Consumed' — intentional-looking but confusing; consider
      // renaming.
      unsigned Consumed = Size;
      unsigned Chars = 1;
      while (true) {
        unsigned NextSize;
        char Next = getCharAndSizeNoWarn(CurPtr + Consumed, NextSize,
                                         getLangOpts());
        if (!isIdentifierBody(Next)) {
          // End of suffix. Check whether this is on the whitelist.
          const StringRef CompleteSuffix(Buffer, Chars);
          IsUDSuffix = StringLiteralParser::isValidUDSuffix(getLangOpts(),
                                                            CompleteSuffix);
          break;
        }

        if (Chars == MaxStandardSuffixLength)
          // Too long: can't be a standard suffix.
          break;

        Buffer[Chars++] = Next;
        Consumed += NextSize;
      }
    }

    if (!IsUDSuffix) {
      // Reserved (no leading underscore, not a standard suffix): diagnose and
      // treat as if whitespace preceded it.
      if (!isLexingRawMode())
        Diag(CurPtr, getLangOpts().MSVCCompat
                         ? diag::ext_ms_reserved_user_defined_literal
                         : diag::ext_reserved_user_defined_literal)
          << FixItHint::CreateInsertion(getSourceLocation(CurPtr), " ");
      return CurPtr;
    }

    CurPtr = ConsumeChar(CurPtr, Size, Result);
  }

  Result.setFlag(Token::HasUDSuffix);
  // Consume the rest of the suffix, allowing UCNs and UTF-8 characters.
  while (true) {
    C = getCharAndSize(CurPtr, Size);
    if (isIdentifierBody(C)) { CurPtr = ConsumeChar(CurPtr, Size, Result); }
    else if (C == '\\' && tryConsumeIdentifierUCN(CurPtr, Size, Result)) {}
    else if (!isASCII(C) && tryConsumeIdentifierUTF8Char(CurPtr)) {}
    else break;
  }

  return CurPtr;
}
1894
/// LexStringLiteral - Lex the remainder of a string literal, after having
/// lexed either " or L" or u8" or u" or U". Handles escapes, unterminated
/// strings, embedded NULs / code-completion points, and the C++11 ud-suffix.
bool Lexer::LexStringLiteral(Token &Result, const char *CurPtr,
                             tok::TokenKind Kind) {
  const char *AfterQuote = CurPtr;
  // Does this string contain the \0 character?
  const char *NulCharacter = nullptr;

  if (!isLexingRawMode() &&
      (Kind == tok::utf8_string_literal ||
       Kind == tok::utf16_string_literal ||
       Kind == tok::utf32_string_literal))
    Diag(BufferPtr, getLangOpts().CPlusPlus
                        ? diag::warn_cxx98_compat_unicode_literal
                        : diag::warn_c99_compat_unicode_literal);

  char C = getAndAdvanceChar(CurPtr, Result);
  while (C != '"') {
    // Skip escaped characters.  Escaped newlines will already be processed by
    // getAndAdvanceChar.
    if (C == '\\')
      C = getAndAdvanceChar(CurPtr, Result);

    if (C == '\n' || C == '\r' ||             // Newline.
        (C == 0 && CurPtr-1 == BufferEnd)) {  // End of file.
      if (!isLexingRawMode() && !LangOpts.AsmPreprocessor)
        Diag(BufferPtr, diag::ext_unterminated_char_or_string) << 1;
      // Produce an unknown token covering everything up to the break point.
      FormTokenWithChars(Result, CurPtr-1, tok::unknown);
      return true;
    }

    if (C == 0) {
      if (isCodeCompletionPoint(CurPtr-1)) {
        // Completing inside a string: either an #include filename or natural
        // language text.
        if (ParsingFilename)
          codeCompleteIncludedFile(AfterQuote, CurPtr - 1, /*IsAngled=*/false);
        else
          PP->CodeCompleteNaturalLanguage();
        FormTokenWithChars(Result, CurPtr - 1, tok::unknown);
        cutOffLexing();
        return true;
      }

      // Remember the NUL so we can warn about it after the literal closes.
      NulCharacter = CurPtr-1;
    }
    C = getAndAdvanceChar(CurPtr, Result);
  }

  // If we are in C++11, lex the optional ud-suffix.
  if (getLangOpts().CPlusPlus)
    CurPtr = LexUDSuffix(Result, CurPtr, true);

  // If a nul character existed in the string, warn about it.
  if (NulCharacter && !isLexingRawMode())
    Diag(NulCharacter, diag::null_in_char_or_string) << 1;

  // Update the location of the token as well as the BufferPtr instance var.
  const char *TokStart = BufferPtr;
  FormTokenWithChars(Result, CurPtr, Kind);
  Result.setLiteralData(TokStart);
  return true;
}
1956
/// LexRawStringLiteral - Lex the remainder of a raw string literal, after
/// having lexed R", LR", u8R", uR", or UR". Validates the (<=16 char)
/// delimiter, then scans for the matching )delim" terminator.
bool Lexer::LexRawStringLiteral(Token &Result, const char *CurPtr,
                                tok::TokenKind Kind) {
  // This function doesn't use getAndAdvanceChar because C++0x [lex.pptoken]p3:
  //  Between the initial and final double quote characters of the raw string,
  //  any transformations performed in phases 1 and 2 (trigraphs,
  //  universal-character-names, and line splicing) are reverted.

  if (!isLexingRawMode())
    Diag(BufferPtr, diag::warn_cxx98_compat_raw_string_literal);

  unsigned PrefixLen = 0;

  // The delimiter is limited to 16 characters by the standard.
  while (PrefixLen != 16 && isRawStringDelimBody(CurPtr[PrefixLen]))
    ++PrefixLen;

  // If the last character was not a '(', then we didn't lex a valid delimiter.
  if (CurPtr[PrefixLen] != '(') {
    if (!isLexingRawMode()) {
      const char *PrefixEnd = &CurPtr[PrefixLen];
      if (PrefixLen == 16) {
        Diag(PrefixEnd, diag::err_raw_delim_too_long);
      } else {
        Diag(PrefixEnd, diag::err_invalid_char_raw_delim)
          << StringRef(PrefixEnd, 1);
      }
    }

    // Search for the next '"' in hopes of salvaging the lexer. Unfortunately,
    // it's possible the '"' was intended to be part of the raw string, but
    // there's not much we can do about that.
    while (true) {
      char C = *CurPtr++;

      if (C == '"')
        break;
      if (C == 0 && CurPtr-1 == BufferEnd) {
        --CurPtr;
        break;
      }
    }

    FormTokenWithChars(Result, CurPtr, tok::unknown);
    return true;
  }

  // Save prefix and move CurPtr past it
  const char *Prefix = CurPtr;
  CurPtr += PrefixLen + 1; // skip over prefix and '('

  // Scan for ')prefix"'. A NUL at BufferEnd terminates an unclosed literal.
  while (true) {
    char C = *CurPtr++;

    if (C == ')') {
      // Check for prefix match and closing quote.
      if (strncmp(CurPtr, Prefix, PrefixLen) == 0 && CurPtr[PrefixLen] == '"') {
        CurPtr += PrefixLen + 1; // skip over prefix and '"'
        break;
      }
    } else if (C == 0 && CurPtr-1 == BufferEnd) { // End of file.
      if (!isLexingRawMode())
        Diag(BufferPtr, diag::err_unterminated_raw_string)
          << StringRef(Prefix, PrefixLen);
      FormTokenWithChars(Result, CurPtr-1, tok::unknown);
      return true;
    }
  }

  // If we are in C++11, lex the optional ud-suffix.
  if (getLangOpts().CPlusPlus)
    CurPtr = LexUDSuffix(Result, CurPtr, true);

  // Update the location of token as well as BufferPtr.
  const char *TokStart = BufferPtr;
  FormTokenWithChars(Result, CurPtr, Kind);
  Result.setLiteralData(TokStart);
  return true;
}
2036
/// LexAngledStringLiteral - Lex the remainder of an angled string literal,
/// after having lexed the '<' character.  This is used for #include filenames.
/// On newline/EOF the '<' is returned as a plain tok::less instead.
bool Lexer::LexAngledStringLiteral(Token &Result, const char *CurPtr) {
  // Does this string contain the \0 character?
  const char *NulCharacter = nullptr;
  const char *AfterLessPos = CurPtr;
  char C = getAndAdvanceChar(CurPtr, Result);
  while (C != '>') {
    // Skip escaped characters.  Escaped newlines will already be processed by
    // getAndAdvanceChar.
    if (C == '\\')
      C = getAndAdvanceChar(CurPtr, Result);

    if (C == '\n' || C == '\r' ||                // Newline.
        (C == 0 && (CurPtr - 1 == BufferEnd))) { // End of file.
      // If the filename is unterminated, then it must just be a lone <
      // character.  Return this as such.
      FormTokenWithChars(Result, AfterLessPos, tok::less);
      return true;
    }

    if (C == 0) {
      if (isCodeCompletionPoint(CurPtr - 1)) {
        // Completing an angled #include filename.
        codeCompleteIncludedFile(AfterLessPos, CurPtr - 1, /*IsAngled=*/true);
        cutOffLexing();
        FormTokenWithChars(Result, CurPtr - 1, tok::unknown);
        return true;
      }
      NulCharacter = CurPtr-1;
    }
    C = getAndAdvanceChar(CurPtr, Result);
  }

  // If a nul character existed in the string, warn about it.
  if (NulCharacter && !isLexingRawMode())
    Diag(NulCharacter, diag::null_in_char_or_string) << 1;

  // Update the location of token as well as BufferPtr.
  const char *TokStart = BufferPtr;
  FormTokenWithChars(Result, CurPtr, tok::angle_string_literal);
  Result.setLiteralData(TokStart);
  return true;
}
2080
/// Set up preprocessor code completion for an #include filename, given the
/// start of the path text and the completion point within it.
void Lexer::codeCompleteIncludedFile(const char *PathStart,
                                     const char *CompletionPoint,
                                     bool IsAngled) {
  // Completion only applies to the filename, after the last slash.
  StringRef PartialPath(PathStart, CompletionPoint - PathStart);
  // MSVC compatibility also treats '\' as a path separator.
  auto Slash = PartialPath.find_last_of(LangOpts.MSVCCompat ? "/\\" : "/");
  StringRef Dir =
      (Slash == StringRef::npos) ? "" : PartialPath.take_front(Slash);
  const char *StartOfFilename =
      (Slash == StringRef::npos) ? PathStart : PathStart + Slash + 1;
  // Code completion filter range is the filename only, up to completion point.
  PP->setCodeCompletionIdentifierInfo(&PP->getIdentifierTable().get(
      StringRef(StartOfFilename, CompletionPoint - StartOfFilename)));
  // We should replace the characters up to the closing quote, if any.
  // Extend past the completion point until end of line/buffer or the closing
  // delimiter ('>' for angled includes, '"' otherwise), inclusive.
  while (CompletionPoint < BufferEnd) {
    char Next = *(CompletionPoint + 1);
    if (Next == 0 || Next == '\r' || Next == '\n')
      break;
    ++CompletionPoint;
    if (Next == (IsAngled ? '>' : '"'))
      break;
  }
  PP->setCodeCompletionTokenRange(
      FileLoc.getLocWithOffset(StartOfFilename - BufferStart),
      FileLoc.getLocWithOffset(CompletionPoint - BufferStart));
  PP->CodeCompleteIncludedFile(Dir, IsAngled);
}
2108
/// LexCharConstant - Lex the remainder of a character constant, after having
/// lexed either ' or L' or u8' or u' or U'. Handles escapes, empty and
/// unterminated constants, embedded NULs, and the C++11 ud-suffix.
bool Lexer::LexCharConstant(Token &Result, const char *CurPtr,
                            tok::TokenKind Kind) {
  // Does this character contain the \0 character?
  const char *NulCharacter = nullptr;

  if (!isLexingRawMode()) {
    if (Kind == tok::utf16_char_constant || Kind == tok::utf32_char_constant)
      Diag(BufferPtr, getLangOpts().CPlusPlus
                          ? diag::warn_cxx98_compat_unicode_literal
                          : diag::warn_c99_compat_unicode_literal);
    else if (Kind == tok::utf8_char_constant)
      Diag(BufferPtr, diag::warn_cxx14_compat_u8_character_literal);
  }

  char C = getAndAdvanceChar(CurPtr, Result);
  if (C == '\'') {
    // '' is not a valid character constant.
    if (!isLexingRawMode() && !LangOpts.AsmPreprocessor)
      Diag(BufferPtr, diag::ext_empty_character);
    FormTokenWithChars(Result, CurPtr, tok::unknown);
    return true;
  }

  while (C != '\'') {
    // Skip escaped characters.
    if (C == '\\')
      C = getAndAdvanceChar(CurPtr, Result);

    if (C == '\n' || C == '\r' ||             // Newline.
        (C == 0 && CurPtr-1 == BufferEnd)) {  // End of file.
      if (!isLexingRawMode() && !LangOpts.AsmPreprocessor)
        Diag(BufferPtr, diag::ext_unterminated_char_or_string) << 0;
      FormTokenWithChars(Result, CurPtr-1, tok::unknown);
      return true;
    }

    if (C == 0) {
      if (isCodeCompletionPoint(CurPtr-1)) {
        // Completing inside a character constant: treat as natural language.
        PP->CodeCompleteNaturalLanguage();
        FormTokenWithChars(Result, CurPtr-1, tok::unknown);
        cutOffLexing();
        return true;
      }

      // Remember the NUL so we can warn about it after the constant closes.
      NulCharacter = CurPtr-1;
    }
    C = getAndAdvanceChar(CurPtr, Result);
  }

  // If we are in C++11, lex the optional ud-suffix.
  if (getLangOpts().CPlusPlus)
    CurPtr = LexUDSuffix(Result, CurPtr, false);

  // If a nul character existed in the character, warn about it.
  if (NulCharacter && !isLexingRawMode())
    Diag(NulCharacter, diag::null_in_char_or_string) << 0;

  // Update the location of token as well as BufferPtr.
  const char *TokStart = BufferPtr;
  FormTokenWithChars(Result, CurPtr, Kind);
  Result.setLiteralData(TokStart);
  return true;
}
2173
/// SkipWhitespace - Efficiently skip over a series of whitespace characters.
/// Update BufferPtr to point to the next non-whitespace character and return.
///
/// This method forms a token and returns true if KeepWhitespaceMode is
/// enabled. Otherwise it records leading-space / start-of-line state on
/// Result (or via TokAtPhysicalStartOfLine) and returns false.
bool Lexer::SkipWhitespace(Token &Result, const char *CurPtr,
                           bool &TokAtPhysicalStartOfLine) {
  // Whitespace - Skip it, then return the token after the whitespace.
  // CurPtr[-1] is the whitespace character that got us here.
  bool SawNewline = isVerticalWhitespace(CurPtr[-1]);

  unsigned char Char = *CurPtr;

  // Skip consecutive spaces efficiently.
  while (true) {
    // Skip horizontal whitespace very aggressively.
    while (isHorizontalWhitespace(Char))
      Char = *++CurPtr;

    // Otherwise if we have something other than whitespace, we're done.
    if (!isVerticalWhitespace(Char))
      break;

    if (ParsingPreprocessorDirective) {
      // End of preprocessor directive line, let LexTokenInternal handle this.
      BufferPtr = CurPtr;
      return false;
    }

    // OK, but handle newline.
    SawNewline = true;
    Char = *++CurPtr;
  }

  // If the client wants us to return whitespace, return it now.
  if (isKeepWhitespaceMode()) {
    FormTokenWithChars(Result, CurPtr, tok::unknown);
    if (SawNewline) {
      IsAtStartOfLine = true;
      IsAtPhysicalStartOfLine = true;
    }
    // FIXME: The next token will not have LeadingSpace set.
    return true;
  }

  // If this isn't immediately after a newline, there is leading space.
  char PrevChar = CurPtr[-1];
  bool HasLeadingSpace = !isVerticalWhitespace(PrevChar);

  Result.setFlagValue(Token::LeadingSpace, HasLeadingSpace);
  if (SawNewline) {
    Result.setFlag(Token::StartOfLine);
    TokAtPhysicalStartOfLine = true;
  }

  BufferPtr = CurPtr;
  return false;
}
2230
2231/// We have just read the // characters from input. Skip until we find the
2232/// newline character that terminates the comment. Then update BufferPtr and
2233/// return.
2234///
2235/// If we're in KeepCommentMode or any CommentHandler has inserted
2236/// some tokens, this will store the first token and return true.
2237bool Lexer::SkipLineComment(Token &Result, const char *CurPtr,
2238 bool &TokAtPhysicalStartOfLine) {
2239 // If Line comments aren't explicitly enabled for this language, emit an
2240 // extension warning.
2241 if (!LangOpts.LineComment && !isLexingRawMode()) {
2242 Diag(BufferPtr, diag::ext_line_comment);
2243
2244 // Mark them enabled so we only emit one warning for this translation
2245 // unit.
2246 LangOpts.LineComment = true;
2247 }
2248
2249 // Scan over the body of the comment. The common case, when scanning, is that
2250 // the comment contains normal ascii characters with nothing interesting in
2251 // them. As such, optimize for this case with the inner loop.
2252 //
2253 // This loop terminates with CurPtr pointing at the newline (or end of buffer)
2254 // character that ends the line comment.
2255 char C;
2256 while (true) {
2257 C = *CurPtr;
2258 // Skip over characters in the fast loop.
2259 while (C != 0 && // Potentially EOF.
2260 C != '\n' && C != '\r') // Newline or DOS-style newline.
2261 C = *++CurPtr;
2262
2263 const char *NextLine = CurPtr;
2264 if (C != 0) {
2265 // We found a newline, see if it's escaped.
2266 const char *EscapePtr = CurPtr-1;
2267 bool HasSpace = false;
2268 while (isHorizontalWhitespace(*EscapePtr)) { // Skip whitespace.
2269 --EscapePtr;
2270 HasSpace = true;
2271 }
2272
2273 if (*EscapePtr == '\\')
2274 // Escaped newline.
2275 CurPtr = EscapePtr;
2276 else if (EscapePtr[0] == '/' && EscapePtr[-1] == '?' &&
2277 EscapePtr[-2] == '?' && LangOpts.Trigraphs)
2278 // Trigraph-escaped newline.
2279 CurPtr = EscapePtr-2;
2280 else
2281 break; // This is a newline, we're done.
2282
2283 // If there was space between the backslash and newline, warn about it.
2284 if (HasSpace && !isLexingRawMode())
2285 Diag(EscapePtr, diag::backslash_newline_space);
2286 }
2287
2288 // Otherwise, this is a hard case. Fall back on getAndAdvanceChar to
2289 // properly decode the character. Read it in raw mode to avoid emitting
2290 // diagnostics about things like trigraphs. If we see an escaped newline,
2291 // we'll handle it below.
2292 const char *OldPtr = CurPtr;
2293 bool OldRawMode = isLexingRawMode();
2294 LexingRawMode = true;
2295 C = getAndAdvanceChar(CurPtr, Result);
2296 LexingRawMode = OldRawMode;
2297
2298 // If we only read only one character, then no special handling is needed.
2299 // We're done and can skip forward to the newline.
2300 if (C != 0 && CurPtr == OldPtr+1) {
2301 CurPtr = NextLine;
2302 break;
2303 }
2304
2305 // If we read multiple characters, and one of those characters was a \r or
2306 // \n, then we had an escaped newline within the comment. Emit diagnostic
2307 // unless the next line is also a // comment.
2308 if (CurPtr != OldPtr + 1 && C != '/' &&
2309 (CurPtr == BufferEnd + 1 || CurPtr[0] != '/')) {
2310 for (; OldPtr != CurPtr; ++OldPtr)
2311 if (OldPtr[0] == '\n' || OldPtr[0] == '\r') {
2312 // Okay, we found a // comment that ends in a newline, if the next
2313 // line is also a // comment, but has spaces, don't emit a diagnostic.
2314 if (isWhitespace(C)) {
2315 const char *ForwardPtr = CurPtr;
2316 while (isWhitespace(*ForwardPtr)) // Skip whitespace.
2317 ++ForwardPtr;
2318 if (ForwardPtr[0] == '/' && ForwardPtr[1] == '/')
2319 break;
2320 }
2321
2322 if (!isLexingRawMode())
2323 Diag(OldPtr-1, diag::ext_multi_line_line_comment);
2324 break;
2325 }
2326 }
2327
2328 if (C == '\r' || C == '\n' || CurPtr == BufferEnd + 1) {
2329 --CurPtr;
2330 break;
2331 }
2332
2333 if (C == '\0' && isCodeCompletionPoint(CurPtr-1)) {
2334 PP->CodeCompleteNaturalLanguage();
2335 cutOffLexing();
2336 return false;
2337 }
2338 }
2339
2340 // Found but did not consume the newline. Notify comment handlers about the
2341 // comment unless we're in a #if 0 block.
2342 if (PP && !isLexingRawMode() &&
2343 PP->HandleComment(Result, SourceRange(getSourceLocation(BufferPtr),
2344 getSourceLocation(CurPtr)))) {
2345 BufferPtr = CurPtr;
2346 return true; // A token has to be returned.
2347 }
2348
2349 // If we are returning comments as tokens, return this comment as a token.
2350 if (inKeepCommentMode())
2351 return SaveLineComment(Result, CurPtr);
2352
2353 // If we are inside a preprocessor directive and we see the end of line,
2354 // return immediately, so that the lexer can return this as an EOD token.
2355 if (ParsingPreprocessorDirective || CurPtr == BufferEnd) {
2356 BufferPtr = CurPtr;
2357 return false;
2358 }
2359
2360 // Otherwise, eat the \n character. We don't care if this is a \n\r or
2361 // \r\n sequence. This is an efficiency hack (because we know the \n can't
2362 // contribute to another token), it isn't needed for correctness. Note that
2363 // this is ok even in KeepWhitespaceMode, because we would have returned the
2364 /// comment above in that mode.
2365 ++CurPtr;
2366
2367 // The next returned token is at the start of the line.
2368 Result.setFlag(Token::StartOfLine);
2369 TokAtPhysicalStartOfLine = true;
2370 // No leading whitespace seen so far.
2371 Result.clearFlag(Token::LeadingSpace);
2372 BufferPtr = CurPtr;
2373 return false;
2374}
2375
2376/// If in save-comment mode, package up this Line comment in an appropriate
2377/// way and return it.
///
/// \param Result receives the comment token (kind tok::comment).
/// \param CurPtr points one past the end of the comment text.
/// \returns true — this path always produces a token for the caller.
2378bool Lexer::SaveLineComment(Token &Result, const char *CurPtr) {
2379 // If we're not in a preprocessor directive, just return the // comment
2380 // directly.
2381 FormTokenWithChars(Result, CurPtr, tok::comment);
2382
2383 if (!ParsingPreprocessorDirective || LexingRawMode)
2384 return true;
2385
2386 // If this Line-style comment is in a macro definition, transmogrify it into
2387 // a C-style block comment so the directive's single-line structure survives.
2388 bool Invalid = false;
2389 std::string Spelling = PP->getSpelling(Result, &Invalid);
2390 if (Invalid)
// Could not retrieve the spelling; fall back to the plain comment token.
2391 return true;
2392
2393 assert(Spelling[0] == '/' && Spelling[1] == '/' && "Not line comment?")((Spelling[0] == '/' && Spelling[1] == '/' &&
"Not line comment?") ? static_cast<void> (0) : __assert_fail
("Spelling[0] == '/' && Spelling[1] == '/' && \"Not line comment?\""
, "/build/llvm-toolchain-snapshot-8~svn350071/tools/clang/lib/Lex/Lexer.cpp"
, 2393, __PRETTY_FUNCTION__))
;
2394 Spelling[1] = '*'; // Change prefix to "/*".
2395 Spelling += "*/"; // add suffix.
2396
// Re-issue the token over the rewritten "/* ... */" spelling held in the
// preprocessor's scratch buffer, keeping the original source location.
2397 Result.setKind(tok::comment);
2398 PP->CreateString(Spelling, Result,
2399 Result.getLocation(), Result.getLocation());
2400 return true;
2401}
2402
2403/// isEndOfBlockCommentWithEscapedNewLine - Return true if the specified newline
2404/// character (either \\n or \\r) is part of an escaped newline sequence. Issue
2405/// a diagnostic if so. We know that the newline is inside of a block comment.
///
/// \param CurPtr points at the newline character inside the comment.
/// \param L the lexer, used only for language options and diagnostics.
2406static bool isEndOfBlockCommentWithEscapedNewLine(const char *CurPtr,
2407 Lexer *L) {
2408 assert(CurPtr[0] == '\n' || CurPtr[0] == '\r')((CurPtr[0] == '\n' || CurPtr[0] == '\r') ? static_cast<void
> (0) : __assert_fail ("CurPtr[0] == '\\n' || CurPtr[0] == '\\r'"
, "/build/llvm-toolchain-snapshot-8~svn350071/tools/clang/lib/Lex/Lexer.cpp"
, 2408, __PRETTY_FUNCTION__))
;
2409
2410 // Back up off the newline.
2411 --CurPtr;
2412
2413 // If this is a two-character newline sequence, skip the other character.
2414 if (CurPtr[0] == '\n' || CurPtr[0] == '\r') {
2415 // \n\n or \r\r -> not escaped newline.
2416 if (CurPtr[0] == CurPtr[1])
2417 return false;
2418 // \n\r or \r\n -> skip the newline.
2419 --CurPtr;
2420 }
2421
2422 // If we have horizontal whitespace, skip over it. We allow whitespace
2423 // between the slash and newline.
// NOTE(review): NUL bytes are skipped like whitespace here — presumably so
// embedded '\0's cannot hide an escaped newline; confirm against callers.
2424 bool HasSpace = false;
2425 while (isHorizontalWhitespace(*CurPtr) || *CurPtr == 0) {
2426 --CurPtr;
2427 HasSpace = true;
2428 }
2429
2430 // If we have a slash, we know this is an escaped newline.
2431 if (*CurPtr == '\\') {
2432 if (CurPtr[-1] != '*') return false;
2433 } else {
2434 // It isn't a slash, is it the ?? / trigraph?
2435 if (CurPtr[0] != '/' || CurPtr[-1] != '?' || CurPtr[-2] != '?' ||
2436 CurPtr[-3] != '*')
2437 return false;
2438
2439 // This is the trigraph ending the comment. Emit a stern warning!
2440 CurPtr -= 2;
2441
2442 // If no trigraphs are enabled, warn that we ignored this trigraph and
2443 // ignore this * character.
2444 if (!L->getLangOpts().Trigraphs) {
2445 if (!L->isLexingRawMode())
2446 L->Diag(CurPtr, diag::trigraph_ignored_block_comment);
2447 return false;
2448 }
2449 if (!L->isLexingRawMode())
2450 L->Diag(CurPtr, diag::trigraph_ends_block_comment);
2451 }
2452
2453 // Warn about having an escaped newline between the */ characters.
2454 if (!L->isLexingRawMode())
2455 L->Diag(CurPtr, diag::escaped_newline_block_comment_end);
2456
2457 // If there was space between the backslash and newline, warn about it.
2458 if (HasSpace && !L->isLexingRawMode())
2459 L->Diag(CurPtr, diag::backslash_newline_space);
2460
2461 return true;
2462}
2463
2464#ifdef __SSE2__
2465#include <emmintrin.h>
2466#elif __ALTIVEC__
2467#include <altivec.h>
2468#undef bool
2469#endif
2470
2471/// We have just read from input the / and * characters that started a comment.
2472/// Read until we find the * and / characters that terminate the comment.
2473/// Note that we don't bother decoding trigraphs or escaped newlines in block
2474/// comments, because they cannot cause the comment to end. The only thing
2475/// that can happen is the comment could end with an escaped newline between
2476/// the terminating * and /.
2477///
2478/// If we're in KeepCommentMode or any CommentHandler has inserted
2479/// some tokens, this will store the first token and return true.
///
/// \param Result receives the comment (or 'unknown') token when true is
/// returned.
/// \param CurPtr points just past the opening "/*".
/// \returns true if a token was produced, false if lexing should continue.
2480bool Lexer::SkipBlockComment(Token &Result, const char *CurPtr,
2481 bool &TokAtPhysicalStartOfLine) {
2482 // Scan one character past where we should, looking for a '/' character. Once
2483 // we find it, check to see if it was preceded by a *. This common
2484 // optimization helps people who like to put a lot of * characters in their
2485 // comments.
2486
2487 // The first character we get with newlines and trigraphs skipped to handle
2488 // the degenerate /*/ case below correctly if the * has an escaped newline
2489 // after it.
2490 unsigned CharSize;
2491 unsigned char C = getCharAndSize(CurPtr, CharSize);
2492 CurPtr += CharSize;
2493 if (C == 0 && CurPtr == BufferEnd+1) {
2494 if (!isLexingRawMode())
2495 Diag(BufferPtr, diag::err_unterminated_block_comment);
2496 --CurPtr;
2497
2498 // KeepWhitespaceMode should return this broken comment as a token. Since
2499 // it isn't a well formed comment, just return it as an 'unknown' token.
2500 if (isKeepWhitespaceMode()) {
2501 FormTokenWithChars(Result, CurPtr, tok::unknown);
2502 return true;
2503 }
2504
2505 BufferPtr = CurPtr;
2506 return false;
2507 }
2508
2509 // Check to see if the first character after the '/*' is another /. If so,
2510 // then this slash does not end the block comment, it is part of it.
2511 if (C == '/')
2512 C = *CurPtr++;
2513
2514 while (true) {
2515 // Skip over all non-interesting characters until we find end of buffer or a
2516 // (probably ending) '/' character.
2517 if (CurPtr + 24 < BufferEnd &&
2518 // If there is a code-completion point avoid the fast scan because it
2519 // doesn't check for '\0'.
2520 !(PP && PP->getCodeCompletionFileLoc() == FileLoc)) {
2521 // While not aligned to a 16-byte boundary.
2522 while (C != '/' && ((intptr_t)CurPtr & 0x0F) != 0)
2523 C = *CurPtr++;
2524
2525 if (C == '/') goto FoundSlash;
2526
// NOTE(review): restored "#ifdef __SSE2__" — the scraped report had the
// analyzer's macro-popup index fused onto the name ("__SSE2__1"), which
// would make this SSE2 fast path unconditionally dead.
2527#ifdef __SSE2__
2528 __m128i Slashes = _mm_set1_epi8('/');
2529 while (CurPtr+16 <= BufferEnd) {
2530 int cmp = _mm_movemask_epi8(_mm_cmpeq_epi8(*(const __m128i*)CurPtr,
2531 Slashes));
2532 if (cmp != 0) {
2533 // Adjust the pointer to point directly after the first slash. It's
2534 // not necessary to set C here, it will be overwritten at the end of
2535 // the outer loop.
2536 CurPtr += llvm::countTrailingZeros<unsigned>(cmp) + 1;
2537 goto FoundSlash;
2538 }
2539 CurPtr += 16;
2540 }
2541#elif __ALTIVEC__
2542 __vector unsigned char Slashes = {
2543 '/', '/', '/', '/', '/', '/', '/', '/',
2544 '/', '/', '/', '/', '/', '/', '/', '/'
2545 };
2546 while (CurPtr+16 <= BufferEnd &&
2547 !vec_any_eq(*(const vector unsigned char*)CurPtr, Slashes))
2548 CurPtr += 16;
2549#else
2550 // Scan for '/' quickly. Many block comments are very large.
2551 while (CurPtr[0] != '/' &&
2552 CurPtr[1] != '/' &&
2553 CurPtr[2] != '/' &&
2554 CurPtr[3] != '/' &&
2555 CurPtr+4 < BufferEnd) {
2556 CurPtr += 4;
2557 }
2558#endif
2559
2560 // It has to be one of the bytes scanned, increment to it and read one.
2561 C = *CurPtr++;
2562 }
2563
2564 // Loop to scan the remainder.
2565 while (C != '/' && C != '\0')
2566 C = *CurPtr++;
2567
2568 if (C == '/') {
2569 FoundSlash:
2570 if (CurPtr[-2] == '*') // We found the final */. We're done!
2571 break;
2572
2573 if ((CurPtr[-2] == '\n' || CurPtr[-2] == '\r')) {
2574 if (isEndOfBlockCommentWithEscapedNewLine(CurPtr-2, this)) {
2575 // We found the final */, though it had an escaped newline between the
2576 // * and /. We're done!
2577 break;
2578 }
2579 }
2580 if (CurPtr[0] == '*' && CurPtr[1] != '/') {
2581 // If this is a /* inside of the comment, emit a warning. Don't do this
2582 // if this is a /*/, which will end the comment. This misses cases with
2583 // embedded escaped newlines, but oh well.
2584 if (!isLexingRawMode())
2585 Diag(CurPtr-1, diag::warn_nested_block_comment);
2586 }
2587 } else if (C == 0 && CurPtr == BufferEnd+1) {
2588 if (!isLexingRawMode())
2589 Diag(BufferPtr, diag::err_unterminated_block_comment);
2590 // Note: the user probably forgot a */. We could continue immediately
2591 // after the /*, but this would involve lexing a lot of what really is the
2592 // comment, which surely would confuse the parser.
2593 --CurPtr;
2594
2595 // KeepWhitespaceMode should return this broken comment as a token. Since
2596 // it isn't a well formed comment, just return it as an 'unknown' token.
2597 if (isKeepWhitespaceMode()) {
2598 FormTokenWithChars(Result, CurPtr, tok::unknown);
2599 return true;
2600 }
2601
2602 BufferPtr = CurPtr;
2603 return false;
2604 } else if (C == '\0' && isCodeCompletionPoint(CurPtr-1)) {
2605 PP->CodeCompleteNaturalLanguage();
2606 cutOffLexing();
2607 return false;
2608 }
2609
2610 C = *CurPtr++;
2611 }
2612
2613 // Notify comment handlers about the comment unless we're in a #if 0 block.
2614 if (PP && !isLexingRawMode() &&
2615 PP->HandleComment(Result, SourceRange(getSourceLocation(BufferPtr),
2616 getSourceLocation(CurPtr)))) {
2617 BufferPtr = CurPtr;
2618 return true; // A token has to be returned.
2619 }
2620
2621 // If we are returning comments as tokens, return this comment as a token.
2622 if (inKeepCommentMode()) {
2623 FormTokenWithChars(Result, CurPtr, tok::comment);
2624 return true;
2625 }
2626
2627 // It is common for the tokens immediately after a /**/ comment to be
2628 // whitespace. Instead of going through the big switch, handle it
2629 // efficiently now. This is safe even in KeepWhitespaceMode because we would
2630 // have already returned above with the comment as a token.
2631 if (isHorizontalWhitespace(*CurPtr)) {
2632 SkipWhitespace(Result, CurPtr+1, TokAtPhysicalStartOfLine);
2633 return false;
2634 }
2635
2636 // Otherwise, just return so that the next character will be lexed as a token.
2637 BufferPtr = CurPtr;
2638 Result.setFlag(Token::LeadingSpace);
2639 return false;
2640}
2641
2642//===----------------------------------------------------------------------===//
2643// Primary Lexing Entry Points
2644//===----------------------------------------------------------------------===//
2645
2646/// ReadToEndOfLine - Read the rest of the current preprocessor line as an
2647/// uninterpreted string. This switches the lexer out of directive mode.
///
/// \param Result if non-null, receives the raw characters of the rest of the
/// line (the terminating newline is not appended).
2648void Lexer::ReadToEndOfLine(SmallVectorImpl<char> *Result) {
2649 assert(ParsingPreprocessorDirective && ParsingFilename == false &&((ParsingPreprocessorDirective && ParsingFilename == false
&& "Must be in a preprocessing directive!") ? static_cast
<void> (0) : __assert_fail ("ParsingPreprocessorDirective && ParsingFilename == false && \"Must be in a preprocessing directive!\""
, "/build/llvm-toolchain-snapshot-8~svn350071/tools/clang/lib/Lex/Lexer.cpp"
, 2650, __PRETTY_FUNCTION__))
1
Assuming the condition is true
2
Assuming the condition is true
3
'?' condition is true
2650 "Must be in a preprocessing directive!")((ParsingPreprocessorDirective && ParsingFilename == false
&& "Must be in a preprocessing directive!") ? static_cast
<void> (0) : __assert_fail ("ParsingPreprocessorDirective && ParsingFilename == false && \"Must be in a preprocessing directive!\""
, "/build/llvm-toolchain-snapshot-8~svn350071/tools/clang/lib/Lex/Lexer.cpp"
, 2650, __PRETTY_FUNCTION__))
;
2651 Token Tmp;
2652
2653 // CurPtr - Cache BufferPtr in an automatic variable.
2654 const char *CurPtr = BufferPtr;
2655 while (true) {
4
Loop condition is true. Entering loop body
2656 char Char = getAndAdvanceChar(CurPtr, Tmp);
5
Calling 'Lexer::getAndAdvanceChar'
2657 switch (Char) {
2658 default:
2659 if (Result)
2660 Result->push_back(Char);
2661 break;
2662 case 0: // Null.
2663 // Found end of file?
2664 if (CurPtr-1 != BufferEnd) {
2665 if (isCodeCompletionPoint(CurPtr-1)) {
2666 PP->CodeCompleteNaturalLanguage();
2667 cutOffLexing();
2668 return;
2669 }
2670
2671 // Nope, normal character, continue.
2672 if (Result)
2673 Result->push_back(Char);
2674 break;
2675 }
2676 // FALL THROUGH.
2677 LLVM_FALLTHROUGH[[clang::fallthrough]];
2678 case '\r':
2679 case '\n':
2680 // Okay, we found the end of the line. First, back up past the \0, \r, \n.
2681 assert(CurPtr[-1] == Char && "Trigraphs for newline?")((CurPtr[-1] == Char && "Trigraphs for newline?") ? static_cast
<void> (0) : __assert_fail ("CurPtr[-1] == Char && \"Trigraphs for newline?\""
, "/build/llvm-toolchain-snapshot-8~svn350071/tools/clang/lib/Lex/Lexer.cpp"
, 2681, __PRETTY_FUNCTION__))
;
2682 BufferPtr = CurPtr-1;
2683
2684 // Next, lex the character, which should handle the EOD transition.
2685 Lex(Tmp);
2686 if (Tmp.is(tok::code_completion)) {
2687 if (PP)
2688 PP->CodeCompleteNaturalLanguage();
2689 Lex(Tmp);
2690 }
2691 assert(Tmp.is(tok::eod) && "Unexpected token!")((Tmp.is(tok::eod) && "Unexpected token!") ? static_cast
<void> (0) : __assert_fail ("Tmp.is(tok::eod) && \"Unexpected token!\""
, "/build/llvm-toolchain-snapshot-8~svn350071/tools/clang/lib/Lex/Lexer.cpp"
, 2691, __PRETTY_FUNCTION__))
;
2692
2693 // Finally, we're done;
2694 return;
2695 }
2696 }
2697}
2698
2699/// LexEndOfFile - CurPtr points to the end of this file. Handle this
2700/// condition, reporting diagnostics and handling other edge cases as required.
2701/// This returns true if Result contains a token, false if PP.Lex should be
2702/// called again.
2703bool Lexer::LexEndOfFile(Token &Result, const char *CurPtr) {
2704 // If we hit the end of the file while parsing a preprocessor directive,
2705 // end the preprocessor directive first. The next token returned will
2706 // then be the end of file.
2707 if (ParsingPreprocessorDirective) {
2708 // Done parsing the "line".
2709 ParsingPreprocessorDirective = false;
2710 // Update the location of token as well as BufferPtr.
2711 FormTokenWithChars(Result, CurPtr, tok::eod);
2712
2713 // Restore comment saving mode, in case it was disabled for directive.
2714 if (PP)
2715 resetExtendedTokenMode();
2716 return true; // Have a token.
2717 }
2718
2719 // If we are in raw mode, return this event as an EOF token. Let the caller
2720 // that put us in raw mode handle the event.
2721 if (isLexingRawMode()) {
2722 Result.startToken();
2723 BufferPtr = BufferEnd;
2724 FormTokenWithChars(Result, BufferEnd, tok::eof);
2725 return true;
2726 }
2727
// Hand the recorded #if/#ifdef nesting to the preamble machinery so a later
// resume can rebuild the conditional stack.
2728 if (PP->isRecordingPreamble() && PP->isInPrimaryFile()) {
2729 PP->setRecordedPreambleConditionalStack(ConditionalStack);
2730 ConditionalStack.clear();
2731 }
2732
2733 // Issue diagnostics for unterminated #if and missing newline.
2734
2735 // If we are in a #if directive, emit an error.
2736 while (!ConditionalStack.empty()) {
2737 if (PP->getCodeCompletionFileLoc() != FileLoc)
2738 PP->Diag(ConditionalStack.back().IfLoc,
2739 diag::err_pp_unterminated_conditional);
2740 ConditionalStack.pop_back();
2741 }
2742
2743 // C99 5.1.1.2p2: If the file is non-empty and didn't end in a newline, issue
2744 // a pedwarn.
2745 if (CurPtr != BufferStart && (CurPtr[-1] != '\n' && CurPtr[-1] != '\r')) {
2746 DiagnosticsEngine &Diags = PP->getDiagnostics();
2747 SourceLocation EndLoc = getSourceLocation(BufferEnd);
2748 unsigned DiagID;
2749
2750 if (LangOpts.CPlusPlus11) {
2751 // C++11 [lex.phases] 2.2 p2
2752 // Prefer the C++98 pedantic compatibility warning over the generic,
2753 // non-extension, user-requested "missing newline at EOF" warning.
2754 if (!Diags.isIgnored(diag::warn_cxx98_compat_no_newline_eof, EndLoc)) {
2755 DiagID = diag::warn_cxx98_compat_no_newline_eof;
2756 } else {
2757 DiagID = diag::warn_no_newline_eof;
2758 }
2759 } else {
2760 DiagID = diag::ext_no_newline_eof;
2761 }
2762
2763 Diag(BufferEnd, DiagID)
2764 << FixItHint::CreateInsertion(EndLoc, "\n");
2765 }
2766
2767 BufferPtr = CurPtr;
2768
2769 // Finally, let the preprocessor handle this.
2770 return PP->HandleEndOfFile(Result, isPragmaLexer());
2771}
2772
2773/// isNextPPTokenLParen - Return 1 if the next unexpanded token lexed from
2774/// the specified lexer will return a tok::l_paren token, 0 if it is something
2775/// else and 2 if there are no more tokens in the buffer controlled by the
2776/// lexer.
2777unsigned Lexer::isNextPPTokenLParen() {
2778 assert(!LexingRawMode && "How can we expand a macro from a skipping buffer?")((!LexingRawMode && "How can we expand a macro from a skipping buffer?"
) ? static_cast<void> (0) : __assert_fail ("!LexingRawMode && \"How can we expand a macro from a skipping buffer?\""
, "/build/llvm-toolchain-snapshot-8~svn350071/tools/clang/lib/Lex/Lexer.cpp"
, 2778, __PRETTY_FUNCTION__))
;
2779
2780 // Switch to 'skipping' mode. This will ensure that we can lex a token
2781 // without emitting diagnostics, disables macro expansion, and will cause EOF
2782 // to return an EOF token instead of popping the include stack.
2783 LexingRawMode = true;
2784
2785 // Save state that can be changed while lexing so that we can restore it.
2786 const char *TmpBufferPtr = BufferPtr;
2787 bool inPPDirectiveMode = ParsingPreprocessorDirective;
2788 bool atStartOfLine = IsAtStartOfLine;
2789 bool atPhysicalStartOfLine = IsAtPhysicalStartOfLine;
2790 bool leadingSpace = HasLeadingSpace;
2791
// Peek one token ahead; Lex() mutates the saved members, hence the
// save/restore dance around it.
2792 Token Tok;
2793 Lex(Tok);
2794
2795 // Restore state that may have changed.
2796 BufferPtr = TmpBufferPtr;
2797 ParsingPreprocessorDirective = inPPDirectiveMode;
2798 HasLeadingSpace = leadingSpace;
2799 IsAtStartOfLine = atStartOfLine;
2800 IsAtPhysicalStartOfLine = atPhysicalStartOfLine;
2801
2802 // Restore the lexer back to non-skipping mode.
2803 LexingRawMode = false;
2804
2805 if (Tok.is(tok::eof))
2806 return 2;
2807 return Tok.is(tok::l_paren);
2808}
2809
2810/// Find the end of a version control conflict marker.
///
/// \returns a pointer to the terminator ("<<<<\n" for Perforce markers,
/// ">>>>>>>" otherwise) found at the start of a line after CurPtr, or
/// nullptr if no such terminator exists before BufferEnd.
2811static const char *FindConflictEnd(const char *CurPtr, const char *BufferEnd,
2812 ConflictMarkerKind CMK) {
2813 const char *Terminator = CMK == CMK_Perforce ? "<<<<\n" : ">>>>>>>";
2814 size_t TermLen = CMK == CMK_Perforce ? 5 : 7;
// Skip the first TermLen bytes so the opening marker itself cannot match.
2815 auto RestOfBuffer = StringRef(CurPtr, BufferEnd - CurPtr).substr(TermLen);
2816 size_t Pos = RestOfBuffer.find(Terminator);
2817 while (Pos != StringRef::npos) {
2818 // Must occur at start of line.
2819 if (Pos == 0 ||
2820 (RestOfBuffer[Pos - 1] != '\r' && RestOfBuffer[Pos - 1] != '\n')) {
2821 RestOfBuffer = RestOfBuffer.substr(Pos+TermLen);
2822 Pos = RestOfBuffer.find(Terminator);
2823 continue;
2824 }
2825 return RestOfBuffer.data()+Pos;
2826 }
2827 return nullptr;
2828}
2829
2830/// IsStartOfConflictMarker - If the specified pointer is the start of a version
2831/// control conflict marker like '<<<<<<<', recognize it as such, emit an error
2832/// and recover nicely. This returns true if it is a conflict marker and false
2833/// if not.
2834bool Lexer::IsStartOfConflictMarker(const char *CurPtr) {
2835 // Only a conflict marker if it starts at the beginning of a line.
2836 if (CurPtr != BufferStart &&
2837 CurPtr[-1] != '\n' && CurPtr[-1] != '\r')
2838 return false;
2839
2840 // Check to see if we have <<<<<<< or >>>>.
2841 if (!StringRef(CurPtr, BufferEnd - CurPtr).startswith("<<<<<<<") &&
2842 !StringRef(CurPtr, BufferEnd - CurPtr).startswith(">>>> "))
2843 return false;
2844
2845 // If we have a situation where we don't care about conflict markers, ignore
2846 // it.
2847 if (CurrentConflictMarkerState || isLexingRawMode())
2848 return false;
2849
// '<' opens a normal diff3-style marker; ">>>> " opens a Perforce-style one.
2850 ConflictMarkerKind Kind = *CurPtr == '<' ? CMK_Normal : CMK_Perforce;
2851
2852 // Check to see if there is an ending marker somewhere in the buffer at the
2853 // start of a line to terminate this conflict marker.
2854 if (FindConflictEnd(CurPtr, BufferEnd, Kind)) {
2855 // We found a match. We are really in a conflict marker.
2856 // Diagnose this, and ignore to the end of line.
2857 Diag(CurPtr, diag::err_conflict_marker);
2858 CurrentConflictMarkerState = Kind;
2859
2860 // Skip ahead to the end of line. We know this exists because the
2861 // end-of-conflict marker starts with \r or \n.
2862 while (*CurPtr != '\r' && *CurPtr != '\n') {
2863 assert(CurPtr != BufferEnd && "Didn't find end of line")((CurPtr != BufferEnd && "Didn't find end of line") ?
static_cast<void> (0) : __assert_fail ("CurPtr != BufferEnd && \"Didn't find end of line\""
, "/build/llvm-toolchain-snapshot-8~svn350071/tools/clang/lib/Lex/Lexer.cpp"
, 2863, __PRETTY_FUNCTION__))
;
2864 ++CurPtr;
2865 }
2866 BufferPtr = CurPtr;
2867 return true;
2868 }
2869
2870 // No end of conflict marker found.
2871 return false;
2872}
2873
2874/// HandleEndOfConflictMarker - If this is a '====' or '||||' or '>>>>', or if
2875/// it is '<<<<' and the conflict marker started with a '>>>>' marker, then it
2876/// is the end of a conflict marker. Handle it by ignoring up until the end of
2877/// the line. This returns true if it is a conflict marker and false if not.
2878bool Lexer::HandleEndOfConflictMarker(const char *CurPtr) {
2879 // Only a conflict marker if it starts at the beginning of a line.
2880 if (CurPtr != BufferStart &&
2881 CurPtr[-1] != '\n' && CurPtr[-1] != '\r')
2882 return false;
2883
2884 // If we have a situation where we don't care about conflict markers, ignore
2885 // it.
2886 if (!CurrentConflictMarkerState || isLexingRawMode())
2887 return false;
2888
2889 // Check to see if we have the marker (4 characters in a row).
2890 for (unsigned i = 1; i != 4; ++i)
2891 if (CurPtr[i] != CurPtr[0])
2892 return false;
2893
2894 // If we do have it, search for the end of the conflict marker. This could
2895 // fail if it got skipped with a '#if 0' or something. Note that CurPtr might
2896 // be the end of conflict marker.
2897 if (const char *End = FindConflictEnd(CurPtr, BufferEnd,
2898 CurrentConflictMarkerState)) {
2899 CurPtr = End;
2900
2901 // Skip ahead to the end of line.
2902 while (CurPtr != BufferEnd && *CurPtr != '\r' && *CurPtr != '\n')
2903 ++CurPtr;
2904
2905 BufferPtr = CurPtr;
2906
2907 // No longer in the conflict marker.
2908 CurrentConflictMarkerState = CMK_None;
2909 return true;
2910 }
2911
2912 return false;
2913}
2914
/// Scan for the "#>" that closes an editor placeholder ("<#...#>").
/// \returns a pointer one past the closing '>' or nullptr if none is found
/// before BufferEnd.
2915static const char *findPlaceholderEnd(const char *CurPtr,
2916 const char *BufferEnd) {
2917 if (CurPtr == BufferEnd)
2918 return nullptr;
2919 BufferEnd -= 1; // Scan until the second last character.
2920 for (; CurPtr != BufferEnd; ++CurPtr) {
2921 if (CurPtr[0] == '#' && CurPtr[1] == '>')
2922 return CurPtr + 2;
2923 }
2924 return nullptr;
2925}
2926
/// Lex an editor placeholder token ("<#...#>"). CurPtr points at the '#'
/// following the '<'. Returns true if a raw_identifier token covering the
/// whole placeholder was formed, false if this is not a lexable placeholder.
2927bool Lexer::lexEditorPlaceholder(Token &Result, const char *CurPtr) {
2928 assert(CurPtr[-1] == '<' && CurPtr[0] == '#' && "Not a placeholder!")((CurPtr[-1] == '<' && CurPtr[0] == '#' &&
"Not a placeholder!") ? static_cast<void> (0) : __assert_fail
("CurPtr[-1] == '<' && CurPtr[0] == '#' && \"Not a placeholder!\""
, "/build/llvm-toolchain-snapshot-8~svn350071/tools/clang/lib/Lex/Lexer.cpp"
, 2928, __PRETTY_FUNCTION__))
;
2929 if (!PP || !PP->getPreprocessorOpts().LexEditorPlaceholders || LexingRawMode)
2930 return false;
2931 const char *End = findPlaceholderEnd(CurPtr + 1, BufferEnd);
2932 if (!End)
2933 return false;
2934 const char *Start = CurPtr - 1;
// Placeholders are an error unless the frontend explicitly allows them.
2935 if (!LangOpts.AllowEditorPlaceholders)
2936 Diag(Start, diag::err_placeholder_in_source);
2937 Result.startToken();
2938 FormTokenWithChars(Result, End, tok::raw_identifier);
2939 Result.setRawIdentifierData(Start);
2940 PP->LookUpIdentifierInfo(Result);
2941 Result.setFlag(Token::IsEditorPlaceholder);
2942 BufferPtr = End;
2943 return true;
2944}
2945
/// Return true if CurPtr is exactly the buffer position registered with the
/// preprocessor as the code-completion point. Always false when there is no
/// preprocessor or completion is disabled.
2946bool Lexer::isCodeCompletionPoint(const char *CurPtr) const {
2947 if (PP && PP->isCodeCompletionEnabled()) {
2948 SourceLocation Loc = FileLoc.getLocWithOffset(CurPtr-BufferStart);
2949 return Loc == PP->getCodeCompletionLoc();
2950 }
2951
2952 return false;
2953}
2954
/// Try to read a universal character name (\uXXXX or \UXXXXXXXX) whose 'u'/'U'
/// starts at StartPtr (SlashLoc is the preceding backslash, used for
/// diagnostics). On success, advances StartPtr past the UCN and returns the
/// decoded code point; returns 0 when no valid UCN is present or the code
/// point is disallowed by the language mode.
2955uint32_t Lexer::tryReadUCN(const char *&StartPtr, const char *SlashLoc,
2956 Token *Result) {
2957 unsigned CharSize;
2958 char Kind = getCharAndSize(StartPtr, CharSize);
2959
2960 unsigned NumHexDigits;
2961 if (Kind == 'u')
2962 NumHexDigits = 4;
2963 else if (Kind == 'U')
2964 NumHexDigits = 8;
2965 else
2966 return 0;
2967
// UCNs only exist in C99 and C++; warn (once per use) and bail in C89 mode.
2968 if (!LangOpts.CPlusPlus && !LangOpts.C99) {
2969 if (Result && !isLexingRawMode())
2970 Diag(SlashLoc, diag::warn_ucn_not_valid_in_c89);
2971 return 0;
2972 }
2973
2974 const char *CurPtr = StartPtr + CharSize;
2975 const char *KindLoc = &CurPtr[-1];
2976
2977 uint32_t CodePoint = 0;
2978 for (unsigned i = 0; i < NumHexDigits; ++i) {
2979 char C = getCharAndSize(CurPtr, CharSize);
2980
2981 unsigned Value = llvm::hexDigitValue(C);
2982 if (Value == -1U) {
2983 if (Result && !isLexingRawMode()) {
2984 if (i == 0) {
2985 Diag(BufferPtr, diag::warn_ucn_escape_no_digits)
2986 << StringRef(KindLoc, 1);
2987 } else {
2988 Diag(BufferPtr, diag::warn_ucn_escape_incomplete);
2989
2990 // If the user wrote \U1234, suggest a fixit to \u.
2991 if (i == 4 && NumHexDigits == 8) {
2992 CharSourceRange URange = makeCharRange(*this, KindLoc, KindLoc + 1);
2993 Diag(KindLoc, diag::note_ucn_four_not_eight)
2994 << FixItHint::CreateReplacement(URange, "u");
2995 }
2996 }
2997 }
2998
2999 return 0;
3000 }
3001
3002 CodePoint <<= 4;
3003 CodePoint += Value;
3004
3005 CurPtr += CharSize;
3006 }
3007
3008 if (Result) {
3009 Result->setFlag(Token::HasUCN);
// Fast path: the UCN had no escaped newlines/trigraphs (one byte per
// character), so we can jump straight past it; otherwise re-walk it
// char-by-char so the token records the cleaning.
3010 if (CurPtr - StartPtr == (ptrdiff_t)NumHexDigits + 2)
3011 StartPtr = CurPtr;
3012 else
3013 while (StartPtr != CurPtr)
3014 (void)getAndAdvanceChar(StartPtr, *Result);
3015 } else {
3016 StartPtr = CurPtr;
3017 }
3018
3019 // Don't apply C family restrictions to UCNs in assembly mode
3020 if (LangOpts.AsmPreprocessor)
3021 return CodePoint;
3022
3023 // C99 6.4.3p2: A universal character name shall not specify a character whose
3024 // short identifier is less than 00A0 other than 0024 ($), 0040 (@), or
3025 // 0060 (`), nor one in the range D800 through DFFF inclusive.)
3026 // C++11 [lex.charset]p2: If the hexadecimal value for a
3027 // universal-character-name corresponds to a surrogate code point (in the
3028 // range 0xD800-0xDFFF, inclusive), the program is ill-formed. Additionally,
3029 // if the hexadecimal value for a universal-character-name outside the
3030 // c-char-sequence, s-char-sequence, or r-char-sequence of a character or
3031 // string literal corresponds to a control character (in either of the
3032 // ranges 0x00-0x1F or 0x7F-0x9F, both inclusive) or to a character in the
3033 // basic source character set, the program is ill-formed.
3034 if (CodePoint < 0xA0) {
3035 if (CodePoint == 0x24 || CodePoint == 0x40 || CodePoint == 0x60)
3036 return CodePoint;
3037
3038 // We don't use isLexingRawMode() here because we need to warn about bad
3039 // UCNs even when skipping preprocessing tokens in a #if block.
3040 if (Result && PP) {
3041 if (CodePoint < 0x20 || CodePoint >= 0x7F)
3042 Diag(BufferPtr, diag::err_ucn_control_character);
3043 else {
3044 char C = static_cast<char>(CodePoint);
3045 Diag(BufferPtr, diag::err_ucn_escape_basic_scs) << StringRef(&C, 1);
3046 }
3047 }
3048
3049 return 0;
3050 } else if (CodePoint >= 0xD800 && CodePoint <= 0xDFFF) {
3051 // C++03 allows UCNs representing surrogate characters. C99 and C++11 don't.
3052 // We don't use isLexingRawMode() here because we need to diagnose bad
3053 // UCNs even when skipping preprocessing tokens in a #if block.
3054 if (Result && PP) {
3055 if (LangOpts.CPlusPlus && !LangOpts.CPlusPlus11)
3056 Diag(BufferPtr, diag::warn_ucn_escape_surrogate);
3057 else
3058 Diag(BufferPtr, diag::err_ucn_escape_invalid);
3059 }
3060 return 0;
3061 }
3062
3063 return CodePoint;
3064}
3065
/// Return true (and mark Result as having leading space) if code point C is a
/// non-ASCII Unicode whitespace character; emits the ext_unicode_whitespace
/// extension diagnostic. Always false in raw mode or -E output.
3066bool Lexer::CheckUnicodeWhitespace(Token &Result, uint32_t C,
3067 const char *CurPtr) {
3068 static const llvm::sys::UnicodeCharSet UnicodeWhitespaceChars(
3069 UnicodeWhitespaceCharRanges);
3070 if (!isLexingRawMode() && !PP->isPreprocessedOutput() &&
3071 UnicodeWhitespaceChars.contains(C)) {
3072 Diag(BufferPtr, diag::ext_unicode_whitespace)
3073 << makeCharRange(*this, BufferPtr, CurPtr);
3074
3075 Result.setFlag(Token::LeadingSpace);
3076 return true;
3077 }
3078 return false;
3079}
3080
/// Lex a token that begins with the non-ASCII code point C (spanning
/// [BufferPtr, CurPtr)). Produces an identifier, silently drops a stray
/// non-identifier character (with a removal fixit), or forms a tok::unknown.
/// Returns true if a token was formed.
3081bool Lexer::LexUnicode(Token &Result, uint32_t C, const char *CurPtr) {
3082 if (isAllowedIDChar(C, LangOpts) && isAllowedInitiallyIDChar(C, LangOpts)) {
3083 if (!isLexingRawMode() && !ParsingPreprocessorDirective &&
3084 !PP->isPreprocessedOutput()) {
3085 maybeDiagnoseIDCharCompat(PP->getDiagnostics(), C,
3086 makeCharRange(*this, BufferPtr, CurPtr),
3087 /*IsFirst=*/true);
3088 maybeDiagnoseUTF8Homoglyph(PP->getDiagnostics(), C,
3089 makeCharRange(*this, BufferPtr, CurPtr));
3090 }
3091
3092 MIOpt.ReadToken();
3093 return LexIdentifier(Result, CurPtr);
3094 }
3095
3096 if (!isLexingRawMode() && !ParsingPreprocessorDirective &&
3097 !PP->isPreprocessedOutput() &&
3098 !isASCII(*BufferPtr) && !isAllowedIDChar(C, LangOpts)) {
3099 // Non-ASCII characters tend to creep into source code unintentionally.
3100 // Instead of letting the parser complain about the unknown token,
3101 // just drop the character.
3102 // Note that we can /only/ do this when the non-ASCII character is actually
3103 // spelled as Unicode, not written as a UCN. The standard requires that
3104 // we not throw away any possible preprocessor tokens, but there's a
3105 // loophole in the mapping of Unicode characters to basic character set
3106 // characters that allows us to map these particular characters to, say,
3107 // whitespace.
3108 Diag(BufferPtr, diag::err_non_ascii)
3109 << FixItHint::CreateRemoval(makeCharRange(*this, BufferPtr, CurPtr));
3110
3111 BufferPtr = CurPtr;
3112 return false;
3113 }
3114
3115 // Otherwise, we have an explicit UCN or a character that's unlikely to show
3116 // up by accident.
3117 MIOpt.ReadToken();
3118 FormTokenWithChars(Result, CurPtr, tok::unknown);
3119 return true;
3120}
3121
3122void Lexer::PropagateLineStartLeadingSpaceInfo(Token &Result) {
3123 IsAtStartOfLine = Result.isAtStartOfLine();
3124 HasLeadingSpace = Result.hasLeadingSpace();
3125 HasLeadingEmptyMacro = Result.hasLeadingEmptyMacro();
3126 // Note that this doesn't affect IsAtPhysicalStartOfLine.
3127}
3128
3129bool Lexer::Lex(Token &Result) {
3130 // Start a new token.
3131 Result.startToken();
3132
3133 // Set up misc whitespace flags for LexTokenInternal.
3134 if (IsAtStartOfLine) {
3135 Result.setFlag(Token::StartOfLine);
3136 IsAtStartOfLine = false;
3137 }
3138
3139 if (HasLeadingSpace) {
3140 Result.setFlag(Token::LeadingSpace);
3141 HasLeadingSpace = false;
3142 }
3143
3144 if (HasLeadingEmptyMacro) {
3145 Result.setFlag(Token::LeadingEmptyMacro);
3146 HasLeadingEmptyMacro = false;
3147 }
3148
3149 bool atPhysicalStartOfLine = IsAtPhysicalStartOfLine;
3150 IsAtPhysicalStartOfLine = false;
3151 bool isRawLex = isLexingRawMode();
3152 (void) isRawLex;
3153 bool returnedToken = LexTokenInternal(Result, atPhysicalStartOfLine);
3154 // (After the LexTokenInternal call, the lexer might be destroyed.)
3155 assert((returnedToken || !isRawLex) && "Raw lex must succeed")(((returnedToken || !isRawLex) && "Raw lex must succeed"
) ? static_cast<void> (0) : __assert_fail ("(returnedToken || !isRawLex) && \"Raw lex must succeed\""
, "/build/llvm-toolchain-snapshot-8~svn350071/tools/clang/lib/Lex/Lexer.cpp"
, 3155, __PRETTY_FUNCTION__))
;
3156 return returnedToken;
3157}
3158
3159/// LexTokenInternal - This implements a simple C family lexer. It is an
3160/// extremely performance critical piece of code. This assumes that the buffer
3161/// has a null character at the end of the file. This returns a preprocessing
3162/// token, not a normal token, as such, it is an internal interface. It assumes
3163/// that the Flags of result have been cleared before calling this.
bool Lexer::LexTokenInternal(Token &Result, bool TokAtPhysicalStartOfLine) {
LexNextToken:
  // New token, can't need cleaning yet.
  // NOTE(review): this assumes the caller already ran Result.startToken()
  // (Lexer::Lex does) so that Token::Flags is initialized; the compound
  // assignment inside clearFlag reads Flags, which the static analyzer
  // reports as potentially uninitialized (Token.h:235) on paths that reach
  // here without startToken() -- confirm no such caller exists.
  Result.clearFlag(Token::NeedsCleaning);
  Result.setIdentifierInfo(nullptr);

  // CurPtr - Cache BufferPtr in an automatic variable.
  const char *CurPtr = BufferPtr;

  // Small amounts of horizontal whitespace is very common between tokens.
  if ((*CurPtr == ' ') || (*CurPtr == '\t')) {
    ++CurPtr;
    while ((*CurPtr == ' ') || (*CurPtr == '\t'))
      ++CurPtr;

    // If we are keeping whitespace and other tokens, just return what we just
    // skipped. The next lexer invocation will return the token after the
    // whitespace.
    if (isKeepWhitespaceMode()) {
      FormTokenWithChars(Result, CurPtr, tok::unknown);
      // FIXME: The next token will not have LeadingSpace set.
      return true;
    }

    BufferPtr = CurPtr;
    Result.setFlag(Token::LeadingSpace);
  }

  unsigned SizeTmp, SizeTmp2; // Temporaries for use in cases below.

  // Read a character, advancing over it (this handles trigraphs and escaped
  // newlines, which is why CurPtr can advance by more than one).
  char Char = getAndAdvanceChar(CurPtr, Result);
  tok::TokenKind Kind;

  switch (Char) {
  case 0: // Null.
    // Found end of file?
    if (CurPtr-1 == BufferEnd)
      return LexEndOfFile(Result, CurPtr-1);

    // Check if we are performing code completion.
    if (isCodeCompletionPoint(CurPtr-1)) {
      // Return the code-completion token.
      Result.startToken();
      FormTokenWithChars(Result, CurPtr, tok::code_completion);
      return true;
    }

    if (!isLexingRawMode())
      Diag(CurPtr-1, diag::null_in_file);
    Result.setFlag(Token::LeadingSpace);
    if (SkipWhitespace(Result, CurPtr, TokAtPhysicalStartOfLine))
      return true; // KeepWhitespaceMode

    // We know the lexer hasn't changed, so just try again with this lexer.
    // (We manually eliminate the tail call to avoid recursion.)
    goto LexNextToken;

  case 26: // DOS & CP/M EOF: "^Z".
    // If we're in Microsoft extensions mode, treat this as end of file.
    if (LangOpts.MicrosoftExt) {
      if (!isLexingRawMode())
        Diag(CurPtr-1, diag::ext_ctrl_z_eof_microsoft);
      return LexEndOfFile(Result, CurPtr-1);
    }

    // If Microsoft extensions are disabled, this is just random garbage.
    Kind = tok::unknown;
    break;

  case '\r':
    // Fold a CRLF pair into a single newline before the '\n' handling below.
    if (CurPtr[0] == '\n')
      Char = getAndAdvanceChar(CurPtr, Result);
    LLVM_FALLTHROUGH;
  case '\n':
    // If we are inside a preprocessor directive and we see the end of line,
    // we know we are done with the directive, so return an EOD token.
    if (ParsingPreprocessorDirective) {
      // Done parsing the "line".
      ParsingPreprocessorDirective = false;

      // Restore comment saving mode, in case it was disabled for directive.
      if (PP)
        resetExtendedTokenMode();

      // Since we consumed a newline, we are back at the start of a line.
      IsAtStartOfLine = true;
      IsAtPhysicalStartOfLine = true;

      Kind = tok::eod;
      break;
    }

    // No leading whitespace seen so far.
    Result.clearFlag(Token::LeadingSpace);

    if (SkipWhitespace(Result, CurPtr, TokAtPhysicalStartOfLine))
      return true; // KeepWhitespaceMode

    // We only saw whitespace, so just try again with this lexer.
    // (We manually eliminate the tail call to avoid recursion.)
    goto LexNextToken;
  case ' ':
  case '\t':
  case '\f':
  case '\v':
  SkipHorizontalWhitespace:
    Result.setFlag(Token::LeadingSpace);
    if (SkipWhitespace(Result, CurPtr, TokAtPhysicalStartOfLine))
      return true; // KeepWhitespaceMode

  SkipIgnoredUnits:
    CurPtr = BufferPtr;

    // If the next token is obviously a // or /* */ comment, skip it efficiently
    // too (without going through the big switch stmt).
    if (CurPtr[0] == '/' && CurPtr[1] == '/' && !inKeepCommentMode() &&
        LangOpts.LineComment &&
        (LangOpts.CPlusPlus || !LangOpts.TraditionalCPP)) {
      if (SkipLineComment(Result, CurPtr+2, TokAtPhysicalStartOfLine))
        return true; // There is a token to return.
      goto SkipIgnoredUnits;
    } else if (CurPtr[0] == '/' && CurPtr[1] == '*' && !inKeepCommentMode()) {
      if (SkipBlockComment(Result, CurPtr+2, TokAtPhysicalStartOfLine))
        return true; // There is a token to return.
      goto SkipIgnoredUnits;
    } else if (isHorizontalWhitespace(*CurPtr)) {
      goto SkipHorizontalWhitespace;
    }
    // We only saw whitespace, so just try again with this lexer.
    // (We manually eliminate the tail call to avoid recursion.)
    goto LexNextToken;

  // C99 6.4.4.1: Integer Constants.
  // C99 6.4.4.2: Floating Constants.
  case '0': case '1': case '2': case '3': case '4':
  case '5': case '6': case '7': case '8': case '9':
    // Notify MIOpt that we read a non-whitespace/non-comment token.
    MIOpt.ReadToken();
    return LexNumericConstant(Result, CurPtr);

  case 'u': // Identifier (uber) or C11/C++11 UTF-8 or UTF-16 string literal
    // Notify MIOpt that we read a non-whitespace/non-comment token.
    MIOpt.ReadToken();

    if (LangOpts.CPlusPlus11 || LangOpts.C11) {
      Char = getCharAndSize(CurPtr, SizeTmp);

      // UTF-16 string literal
      if (Char == '"')
        return LexStringLiteral(Result, ConsumeChar(CurPtr, SizeTmp, Result),
                                tok::utf16_string_literal);

      // UTF-16 character constant
      if (Char == '\'')
        return LexCharConstant(Result, ConsumeChar(CurPtr, SizeTmp, Result),
                               tok::utf16_char_constant);

      // UTF-16 raw string literal
      if (Char == 'R' && LangOpts.CPlusPlus11 &&
          getCharAndSize(CurPtr + SizeTmp, SizeTmp2) == '"')
        return LexRawStringLiteral(Result,
                               ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
                                           SizeTmp2, Result),
                               tok::utf16_string_literal);

      if (Char == '8') {
        char Char2 = getCharAndSize(CurPtr + SizeTmp, SizeTmp2);

        // UTF-8 string literal
        if (Char2 == '"')
          return LexStringLiteral(Result,
                               ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
                                           SizeTmp2, Result),
                               tok::utf8_string_literal);
        // u8'x' character constants are a C++17 feature.
        if (Char2 == '\'' && LangOpts.CPlusPlus17)
          return LexCharConstant(
              Result, ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
                                  SizeTmp2, Result),
              tok::utf8_char_constant);

        if (Char2 == 'R' && LangOpts.CPlusPlus11) {
          unsigned SizeTmp3;
          char Char3 = getCharAndSize(CurPtr + SizeTmp + SizeTmp2, SizeTmp3);
          // UTF-8 raw string literal
          if (Char3 == '"') {
            return LexRawStringLiteral(Result,
                   ConsumeChar(ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
                                           SizeTmp2, Result),
                               SizeTmp3, Result),
                   tok::utf8_string_literal);
          }
        }
      }
    }

    // treat u like the start of an identifier.
    return LexIdentifier(Result, CurPtr);

  case 'U': // Identifier (Uber) or C11/C++11 UTF-32 string literal
    // Notify MIOpt that we read a non-whitespace/non-comment token.
    MIOpt.ReadToken();

    if (LangOpts.CPlusPlus11 || LangOpts.C11) {
      Char = getCharAndSize(CurPtr, SizeTmp);

      // UTF-32 string literal
      if (Char == '"')
        return LexStringLiteral(Result, ConsumeChar(CurPtr, SizeTmp, Result),
                                tok::utf32_string_literal);

      // UTF-32 character constant
      if (Char == '\'')
        return LexCharConstant(Result, ConsumeChar(CurPtr, SizeTmp, Result),
                               tok::utf32_char_constant);

      // UTF-32 raw string literal
      if (Char == 'R' && LangOpts.CPlusPlus11 &&
          getCharAndSize(CurPtr + SizeTmp, SizeTmp2) == '"')
        return LexRawStringLiteral(Result,
                               ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
                                           SizeTmp2, Result),
                               tok::utf32_string_literal);
    }

    // treat U like the start of an identifier.
    return LexIdentifier(Result, CurPtr);

  case 'R': // Identifier or C++0x raw string literal
    // Notify MIOpt that we read a non-whitespace/non-comment token.
    MIOpt.ReadToken();

    if (LangOpts.CPlusPlus11) {
      Char = getCharAndSize(CurPtr, SizeTmp);

      if (Char == '"')
        return LexRawStringLiteral(Result,
                                   ConsumeChar(CurPtr, SizeTmp, Result),
                                   tok::string_literal);
    }

    // treat R like the start of an identifier.
    return LexIdentifier(Result, CurPtr);

  case 'L': // Identifier (Loony) or wide literal (L'x' or L"xyz").
    // Notify MIOpt that we read a non-whitespace/non-comment token.
    MIOpt.ReadToken();
    Char = getCharAndSize(CurPtr, SizeTmp);

    // Wide string literal.
    if (Char == '"')
      return LexStringLiteral(Result, ConsumeChar(CurPtr, SizeTmp, Result),
                              tok::wide_string_literal);

    // Wide raw string literal.
    if (LangOpts.CPlusPlus11 && Char == 'R' &&
        getCharAndSize(CurPtr + SizeTmp, SizeTmp2) == '"')
      return LexRawStringLiteral(Result,
                               ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
                                           SizeTmp2, Result),
                               tok::wide_string_literal);

    // Wide character constant.
    if (Char == '\'')
      return LexCharConstant(Result, ConsumeChar(CurPtr, SizeTmp, Result),
                             tok::wide_char_constant);
    // FALL THROUGH, treating L like the start of an identifier.
    LLVM_FALLTHROUGH;

  // C99 6.4.2: Identifiers.
  case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': case 'G':
  case 'H': case 'I': case 'J': case 'K': /*'L'*/case 'M': case 'N':
  case 'O': case 'P': case 'Q': /*'R'*/case 'S': case 'T': /*'U'*/
  case 'V': case 'W': case 'X': case 'Y': case 'Z':
  case 'a': case 'b': case 'c': case 'd': case 'e': case 'f': case 'g':
  case 'h': case 'i': case 'j': case 'k': case 'l': case 'm': case 'n':
  case 'o': case 'p': case 'q': case 'r': case 's': case 't': /*'u'*/
  case 'v': case 'w': case 'x': case 'y': case 'z':
  case '_':
    // Notify MIOpt that we read a non-whitespace/non-comment token.
    MIOpt.ReadToken();
    return LexIdentifier(Result, CurPtr);

  case '$': // $ in identifiers.
    if (LangOpts.DollarIdents) {
      if (!isLexingRawMode())
        Diag(CurPtr-1, diag::ext_dollar_in_identifier);
      // Notify MIOpt that we read a non-whitespace/non-comment token.
      MIOpt.ReadToken();
      return LexIdentifier(Result, CurPtr);
    }

    Kind = tok::unknown;
    break;

  // C99 6.4.4: Character Constants.
  case '\'':
    // Notify MIOpt that we read a non-whitespace/non-comment token.
    MIOpt.ReadToken();
    return LexCharConstant(Result, CurPtr, tok::char_constant);

  // C99 6.4.5: String Literals.
  case '"':
    // Notify MIOpt that we read a non-whitespace/non-comment token.
    MIOpt.ReadToken();
    return LexStringLiteral(Result, CurPtr, tok::string_literal);

  // C99 6.4.6: Punctuators.
  case '?':
    Kind = tok::question;
    break;
  case '[':
    Kind = tok::l_square;
    break;
  case ']':
    Kind = tok::r_square;
    break;
  case '(':
    Kind = tok::l_paren;
    break;
  case ')':
    Kind = tok::r_paren;
    break;
  case '{':
    Kind = tok::l_brace;
    break;
  case '}':
    Kind = tok::r_brace;
    break;
  case '.':
    Char = getCharAndSize(CurPtr, SizeTmp);
    if (Char >= '0' && Char <= '9') {
      // Notify MIOpt that we read a non-whitespace/non-comment token.
      MIOpt.ReadToken();

      // A '.' followed by a digit starts a floating constant (C99 6.4.4.2).
      return LexNumericConstant(Result, ConsumeChar(CurPtr, SizeTmp, Result));
    } else if (LangOpts.CPlusPlus && Char == '*') {
      Kind = tok::periodstar;
      CurPtr += SizeTmp;
    } else if (Char == '.' &&
               getCharAndSize(CurPtr+SizeTmp, SizeTmp2) == '.') {
      Kind = tok::ellipsis;
      CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
                           SizeTmp2, Result);
    } else {
      Kind = tok::period;
    }
    break;
  case '&':
    Char = getCharAndSize(CurPtr, SizeTmp);
    if (Char == '&') {
      Kind = tok::ampamp;
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
    } else if (Char == '=') {
      Kind = tok::ampequal;
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
    } else {
      Kind = tok::amp;
    }
    break;
  case '*':
    if (getCharAndSize(CurPtr, SizeTmp) == '=') {
      Kind = tok::starequal;
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
    } else {
      Kind = tok::star;
    }
    break;
  case '+':
    Char = getCharAndSize(CurPtr, SizeTmp);
    if (Char == '+') {
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
      Kind = tok::plusplus;
    } else if (Char == '=') {
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
      Kind = tok::plusequal;
    } else {
      Kind = tok::plus;
    }
    break;
  case '-':
    Char = getCharAndSize(CurPtr, SizeTmp);
    if (Char == '-') { // --
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
      Kind = tok::minusminus;
    } else if (Char == '>' && LangOpts.CPlusPlus &&
               getCharAndSize(CurPtr+SizeTmp, SizeTmp2) == '*') { // C++ ->*
      CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
                           SizeTmp2, Result);
      Kind = tok::arrowstar;
    } else if (Char == '>') { // ->
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
      Kind = tok::arrow;
    } else if (Char == '=') { // -=
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
      Kind = tok::minusequal;
    } else {
      Kind = tok::minus;
    }
    break;
  case '~':
    Kind = tok::tilde;
    break;
  case '!':
    if (getCharAndSize(CurPtr, SizeTmp) == '=') {
      Kind = tok::exclaimequal;
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
    } else {
      Kind = tok::exclaim;
    }
    break;
  case '/':
    // 6.4.9: Comments
    Char = getCharAndSize(CurPtr, SizeTmp);
    if (Char == '/') { // Line comment.
      // Even if Line comments are disabled (e.g. in C89 mode), we generally
      // want to lex this as a comment. There is one problem with this though,
      // that in one particular corner case, this can change the behavior of the
      // resultant program. For example, In "foo //**/ bar", C89 would lex
      // this as "foo / bar" and languages with Line comments would lex it as
      // "foo". Check to see if the character after the second slash is a '*'.
      // If so, we will lex that as a "/" instead of the start of a comment.
      // However, we never do this if we are just preprocessing.
      bool TreatAsComment = LangOpts.LineComment &&
                            (LangOpts.CPlusPlus || !LangOpts.TraditionalCPP);
      if (!TreatAsComment)
        if (!(PP && PP->isPreprocessedOutput()))
          TreatAsComment = getCharAndSize(CurPtr+SizeTmp, SizeTmp2) != '*';

      if (TreatAsComment) {
        if (SkipLineComment(Result, ConsumeChar(CurPtr, SizeTmp, Result),
                            TokAtPhysicalStartOfLine))
          return true; // There is a token to return.

        // It is common for the tokens immediately after a // comment to be
        // whitespace (indentation for the next line). Instead of going through
        // the big switch, handle it efficiently now.
        goto SkipIgnoredUnits;
      }
    }

    if (Char == '*') { // /**/ comment.
      if (SkipBlockComment(Result, ConsumeChar(CurPtr, SizeTmp, Result),
                           TokAtPhysicalStartOfLine))
        return true; // There is a token to return.

      // We only saw whitespace, so just try again with this lexer.
      // (We manually eliminate the tail call to avoid recursion.)
      goto LexNextToken;
    }

    if (Char == '=') {
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
      Kind = tok::slashequal;
    } else {
      Kind = tok::slash;
    }
    break;
  case '%':
    Char = getCharAndSize(CurPtr, SizeTmp);
    if (Char == '=') {
      Kind = tok::percentequal;
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
    } else if (LangOpts.Digraphs && Char == '>') {
      Kind = tok::r_brace; // '%>' -> '}'
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
    } else if (LangOpts.Digraphs && Char == ':') {
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
      Char = getCharAndSize(CurPtr, SizeTmp);
      if (Char == '%' && getCharAndSize(CurPtr+SizeTmp, SizeTmp2) == ':') {
        Kind = tok::hashhash; // '%:%:' -> '##'
        CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
                             SizeTmp2, Result);
      } else if (Char == '@' && LangOpts.MicrosoftExt) {// %:@ -> #@ -> Charize
        CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
        if (!isLexingRawMode())
          Diag(BufferPtr, diag::ext_charize_microsoft);
        Kind = tok::hashat;
      } else { // '%:' -> '#'
        // We parsed a # character. If this occurs at the start of the line,
        // it's actually the start of a preprocessing directive. Callback to
        // the preprocessor to handle it.
        // TODO: -fpreprocessed mode??
        if (TokAtPhysicalStartOfLine && !LexingRawMode && !Is_PragmaLexer)
          goto HandleDirective;

        Kind = tok::hash;
      }
    } else {
      Kind = tok::percent;
    }
    break;
  case '<':
    Char = getCharAndSize(CurPtr, SizeTmp);
    if (ParsingFilename) {
      // In a #include, '<' starts a header-name, not an operator.
      return LexAngledStringLiteral(Result, CurPtr);
    } else if (Char == '<') {
      char After = getCharAndSize(CurPtr+SizeTmp, SizeTmp2);
      if (After == '=') {
        Kind = tok::lesslessequal;
        CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
                             SizeTmp2, Result);
      } else if (After == '<' && IsStartOfConflictMarker(CurPtr-1)) {
        // If this is actually a '<<<<<<<' version control conflict marker,
        // recognize it as such and recover nicely.
        goto LexNextToken;
      } else if (After == '<' && HandleEndOfConflictMarker(CurPtr-1)) {
        // If this is '<<<<' and we're in a Perforce-style conflict marker,
        // ignore it.
        goto LexNextToken;
      } else if (LangOpts.CUDA && After == '<') {
        Kind = tok::lesslessless;
        CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
                             SizeTmp2, Result);
      } else {
        CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
        Kind = tok::lessless;
      }
    } else if (Char == '=') {
      char After = getCharAndSize(CurPtr+SizeTmp, SizeTmp2);
      if (After == '>') {
        if (getLangOpts().CPlusPlus2a) {
          if (!isLexingRawMode())
            Diag(BufferPtr, diag::warn_cxx17_compat_spaceship);
          CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
                               SizeTmp2, Result);
          Kind = tok::spaceship;
          break;
        }
        // Suggest adding a space between the '<=' and the '>' to avoid a
        // change in semantics if this turns up in C++ <=17 mode.
        if (getLangOpts().CPlusPlus && !isLexingRawMode()) {
          Diag(BufferPtr, diag::warn_cxx2a_compat_spaceship)
            << FixItHint::CreateInsertion(
                   getSourceLocation(CurPtr + SizeTmp, SizeTmp2), " ");
        }
      }
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
      Kind = tok::lessequal;
    } else if (LangOpts.Digraphs && Char == ':') { // '<:' -> '['
      if (LangOpts.CPlusPlus11 &&
          getCharAndSize(CurPtr + SizeTmp, SizeTmp2) == ':') {
        // C++0x [lex.pptoken]p3:
        //  Otherwise, if the next three characters are <:: and the subsequent
        //  character is neither : nor >, the < is treated as a preprocessor
        //  token by itself and not as the first character of the alternative
        //  token <:.
        unsigned SizeTmp3;
        char After = getCharAndSize(CurPtr + SizeTmp + SizeTmp2, SizeTmp3);
        if (After != ':' && After != '>') {
          Kind = tok::less;
          if (!isLexingRawMode())
            Diag(BufferPtr, diag::warn_cxx98_compat_less_colon_colon);
          break;
        }
      }

      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
      Kind = tok::l_square;
    } else if (LangOpts.Digraphs && Char == '%') { // '<%' -> '{'
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
      Kind = tok::l_brace;
    } else if (Char == '#' && /*Not a trigraph*/ SizeTmp == 1 &&
               lexEditorPlaceholder(Result, CurPtr)) {
      return true;
    } else {
      Kind = tok::less;
    }
    break;
  case '>':
    Char = getCharAndSize(CurPtr, SizeTmp);
    if (Char == '=') {
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
      Kind = tok::greaterequal;
    } else if (Char == '>') {
      char After = getCharAndSize(CurPtr+SizeTmp, SizeTmp2);
      if (After == '=') {
        CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
                             SizeTmp2, Result);
        Kind = tok::greatergreaterequal;
      } else if (After == '>' && IsStartOfConflictMarker(CurPtr-1)) {
        // If this is actually a '>>>>' conflict marker, recognize it as such
        // and recover nicely.
        goto LexNextToken;
      } else if (After == '>' && HandleEndOfConflictMarker(CurPtr-1)) {
        // If this is '>>>>>>>' and we're in a conflict marker, ignore it.
        goto LexNextToken;
      } else if (LangOpts.CUDA && After == '>') {
        Kind = tok::greatergreatergreater;
        CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
                             SizeTmp2, Result);
      } else {
        CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
        Kind = tok::greatergreater;
      }
    } else {
      Kind = tok::greater;
    }
    break;
  case '^':
    Char = getCharAndSize(CurPtr, SizeTmp);
    if (Char == '=') {
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
      Kind = tok::caretequal;
    } else if (LangOpts.OpenCL && Char == '^') {
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
      Kind = tok::caretcaret;
    } else {
      Kind = tok::caret;
    }
    break;
  case '|':
    Char = getCharAndSize(CurPtr, SizeTmp);
    if (Char == '=') {
      Kind = tok::pipeequal;
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
    } else if (Char == '|') {
      // If this is '|||||||' and we're in a conflict marker, ignore it.
      if (CurPtr[1] == '|' && HandleEndOfConflictMarker(CurPtr-1))
        goto LexNextToken;
      Kind = tok::pipepipe;
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
    } else {
      Kind = tok::pipe;
    }
    break;
  case ':':
    Char = getCharAndSize(CurPtr, SizeTmp);
    if (LangOpts.Digraphs && Char == '>') {
      Kind = tok::r_square; // ':>' -> ']'
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
    } else if ((LangOpts.CPlusPlus ||
                LangOpts.DoubleSquareBracketAttributes) &&
               Char == ':') {
      Kind = tok::coloncolon;
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
    } else {
      Kind = tok::colon;
    }
    break;
  case ';':
    Kind = tok::semi;
    break;
  case '=':
    Char = getCharAndSize(CurPtr, SizeTmp);
    if (Char == '=') {
      // If this is '====' and we're in a conflict marker, ignore it.
      if (CurPtr[1] == '=' && HandleEndOfConflictMarker(CurPtr-1))
        goto LexNextToken;

      Kind = tok::equalequal;
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
    } else {
      Kind = tok::equal;
    }
    break;
  case ',':
    Kind = tok::comma;
    break;
  case '#':
    Char = getCharAndSize(CurPtr, SizeTmp);
    if (Char == '#') {
      Kind = tok::hashhash;
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
    } else if (Char == '@' && LangOpts.MicrosoftExt) { // #@ -> Charize
      Kind = tok::hashat;
      if (!isLexingRawMode())
        Diag(BufferPtr, diag::ext_charize_microsoft);
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
    } else {
      // We parsed a # character. If this occurs at the start of the line,
      // it's actually the start of a preprocessing directive. Callback to
      // the preprocessor to handle it.
      // TODO: -fpreprocessed mode??
      if (TokAtPhysicalStartOfLine && !LexingRawMode && !Is_PragmaLexer)
        goto HandleDirective;

      Kind = tok::hash;
    }
    break;

  case '@':
    // Objective C support.
    if (CurPtr[-1] == '@' && LangOpts.ObjC)
      Kind = tok::at;
    else
      Kind = tok::unknown;
    break;

  // UCNs (C99 6.4.3, C++11 [lex.charset]p2)
  case '\\':
    if (!LangOpts.AsmPreprocessor) {
      if (uint32_t CodePoint = tryReadUCN(CurPtr, BufferPtr, &Result)) {
        if (CheckUnicodeWhitespace(Result, CodePoint, CurPtr)) {
          if (SkipWhitespace(Result, CurPtr, TokAtPhysicalStartOfLine))
            return true; // KeepWhitespaceMode

          // We only saw whitespace, so just try again with this lexer.
          // (We manually eliminate the tail call to avoid recursion.)
          goto LexNextToken;
        }

        return LexUnicode(Result, CodePoint, CurPtr);
      }
    }

    Kind = tok::unknown;
    break;

  default: {
    if (isASCII(Char)) {
      Kind = tok::unknown;
      break;
    }

    llvm::UTF32 CodePoint;

    // We can't just reset CurPtr to BufferPtr because BufferPtr may point to
    // an escaped newline.
    --CurPtr;
    llvm::ConversionResult Status =
        llvm::convertUTF8Sequence((const llvm::UTF8 **)&CurPtr,
                                  (const llvm::UTF8 *)BufferEnd,
                                  &CodePoint,
                                  llvm::strictConversion);
    if (Status == llvm::conversionOK) {
      if (CheckUnicodeWhitespace(Result, CodePoint, CurPtr)) {
        if (SkipWhitespace(Result, CurPtr, TokAtPhysicalStartOfLine))
          return true; // KeepWhitespaceMode

        // We only saw whitespace, so just try again with this lexer.
        // (We manually eliminate the tail call to avoid recursion.)
        goto LexNextToken;
      }
      return LexUnicode(Result, CodePoint, CurPtr);
    }

    if (isLexingRawMode() || ParsingPreprocessorDirective ||
        PP->isPreprocessedOutput()) {
      ++CurPtr;
      Kind = tok::unknown;
      break;
    }

    // Non-ASCII characters tend to creep into source code unintentionally.
    // Instead of letting the parser complain about the unknown token,
    // just diagnose the invalid UTF-8, then drop the character.
    Diag(CurPtr, diag::err_invalid_utf8);

    BufferPtr = CurPtr+1;
    // We're pretending the character didn't exist, so just try again with
    // this lexer.
    // (We manually eliminate the tail call to avoid recursion.)
    goto LexNextToken;
  }
  }

  // Notify MIOpt that we read a non-whitespace/non-comment token.
  MIOpt.ReadToken();

  // Update the location of token as well as BufferPtr.
  FormTokenWithChars(Result, CurPtr, Kind);
  return true;

HandleDirective:
  // We parsed a # character and it's the start of a preprocessing directive.

  FormTokenWithChars(Result, CurPtr, tok::hash);
  PP->HandleDirective(Result);

  if (PP->hadModuleLoaderFatalFailure()) {
    // With a fatal failure in the module loader, we abort parsing.
    assert(Result.is(tok::eof) && "Preprocessor did not set tok:eof");
    return true;
  }

  // We parsed the directive; lex a token with the new state.
  return false;
}

/build/llvm-toolchain-snapshot-8~svn350071/tools/clang/include/clang/Lex/Lexer.h

1//===- Lexer.h - C Language Family Lexer ------------------------*- C++ -*-===//
2//
3// The LLVM Compiler Infrastructure
4//
5// This file is distributed under the University of Illinois Open Source
6// License. See LICENSE.TXT for details.
7//
8//===----------------------------------------------------------------------===//
9//
10// This file defines the Lexer interface.
11//
12//===----------------------------------------------------------------------===//
13
14#ifndef LLVM_CLANG_LEX_LEXER_H
15#define LLVM_CLANG_LEX_LEXER_H
16
17#include "clang/Basic/LangOptions.h"
18#include "clang/Basic/SourceLocation.h"
19#include "clang/Basic/TokenKinds.h"
20#include "clang/Lex/PreprocessorLexer.h"
21#include "clang/Lex/Token.h"
22#include "llvm/ADT/Optional.h"
23#include "llvm/ADT/SmallVector.h"
24#include "llvm/ADT/StringRef.h"
25#include <cassert>
26#include <cstdint>
27#include <string>
28
29namespace llvm {
30
31class MemoryBuffer;
32
33} // namespace llvm
34
35namespace clang {
36
37class DiagnosticBuilder;
38class Preprocessor;
39class SourceManager;
40
/// ConflictMarkerKind - Kinds of conflict marker which the lexer might be
/// recovering from.
enum ConflictMarkerKind {
  /// The lexer is not currently inside a conflict marker.
  CMK_None,

  /// A normal or diff3 conflict marker: opened by seven or more '<'
  /// characters, divided by seven or more '=' or '|' characters, and
  /// closed by seven or more '>' characters.
  CMK_Normal,

  /// A Perforce-style conflict marker: opened by four '>' characters,
  /// divided by four '=' characters, and closed by four '<' characters.
  CMK_Perforce
};
55
/// Describes the bounds (start, size) of the preamble and a flag required by
/// PreprocessorOptions::PrecompiledPreambleBytes.
/// The preamble includes the BOM, if any.
struct PreambleBounds {
  /// Size of the preamble in bytes.
  unsigned Size;

  /// Whether the preamble ends at the start of a new line.
  ///
  /// Used to inform the lexer as to whether it's starting at the beginning of
  /// a line after skipping the preamble.
  bool PreambleEndsAtStartOfLine;

  PreambleBounds(unsigned S, bool EndsAtLineStart)
      : Size(S), PreambleEndsAtStartOfLine(EndsAtLineStart) {}
};
72
73/// Lexer - This provides a simple interface that turns a text buffer into a
74/// stream of tokens. This provides no support for file reading or buffering,
75/// or buffering/seeking of tokens, only forward lexing is supported. It relies
76/// on the specified Preprocessor object to handle preprocessor directives, etc.
77class Lexer : public PreprocessorLexer {
78 friend class Preprocessor;
79
80 void anchor() override;
81
82 //===--------------------------------------------------------------------===//
83 // Constant configuration values for this lexer.
84
85 // Start of the buffer.
86 const char *BufferStart;
87
88 // End of the buffer.
89 const char *BufferEnd;
90
91 // Location for start of file.
92 SourceLocation FileLoc;
93
94 // LangOpts enabled by this language (cache).
95 LangOptions LangOpts;
96
97 // True if lexer for _Pragma handling.
98 bool Is_PragmaLexer;
99
100 //===--------------------------------------------------------------------===//
101 // Context-specific lexing flags set by the preprocessor.
102 //
103
104 /// ExtendedTokenMode - The lexer can optionally keep comments and whitespace
105 /// and return them as tokens. This is used for -C and -CC modes, and
106 /// whitespace preservation can be useful for some clients that want to lex
107 /// the file in raw mode and get every character from the file.
108 ///
109 /// When this is set to 2 it returns comments and whitespace. When set to 1
110 /// it returns comments, when it is set to 0 it returns normal tokens only.
111 unsigned char ExtendedTokenMode;
112
113 //===--------------------------------------------------------------------===//
114 // Context that changes as the file is lexed.
115 // NOTE: any state that mutates when in raw mode must have save/restore code
116 // in Lexer::isNextPPTokenLParen.
117
118 // BufferPtr - Current pointer into the buffer. This is the next character
119 // to be lexed.
120 const char *BufferPtr;
121
122 // IsAtStartOfLine - True if the next lexed token should get the "start of
123 // line" flag set on it.
124 bool IsAtStartOfLine;
125
126 bool IsAtPhysicalStartOfLine;
127
128 bool HasLeadingSpace;
129
130 bool HasLeadingEmptyMacro;
131
132 // CurrentConflictMarkerState - The kind of conflict marker we are handling.
133 ConflictMarkerKind CurrentConflictMarkerState;
134
135 void InitLexer(const char *BufStart, const char *BufPtr, const char *BufEnd);
136
137public:
138 /// Lexer constructor - Create a new lexer object for the specified buffer
139 /// with the specified preprocessor managing the lexing process. This lexer
140 /// assumes that the associated file buffer and Preprocessor objects will
141 /// outlive it, so it doesn't take ownership of either of them.
142 Lexer(FileID FID, const llvm::MemoryBuffer *InputFile, Preprocessor &PP);
143
144 /// Lexer constructor - Create a new raw lexer object. This object is only
145 /// suitable for calls to 'LexFromRawLexer'. This lexer assumes that the
146 /// text range will outlive it, so it doesn't take ownership of it.
147 Lexer(SourceLocation FileLoc, const LangOptions &LangOpts,
148 const char *BufStart, const char *BufPtr, const char *BufEnd);
149
150 /// Lexer constructor - Create a new raw lexer object. This object is only
151 /// suitable for calls to 'LexFromRawLexer'. This lexer assumes that the
152 /// text range will outlive it, so it doesn't take ownership of it.
153 Lexer(FileID FID, const llvm::MemoryBuffer *FromFile,
154 const SourceManager &SM, const LangOptions &LangOpts);
155
156 Lexer(const Lexer &) = delete;
157 Lexer &operator=(const Lexer &) = delete;
158
159 /// Create_PragmaLexer: Lexer constructor - Create a new lexer object for
160 /// _Pragma expansion. This has a variety of magic semantics that this method
161 /// sets up. It returns a new'd Lexer that must be delete'd when done.
162 static Lexer *Create_PragmaLexer(SourceLocation SpellingLoc,
163 SourceLocation ExpansionLocStart,
164 SourceLocation ExpansionLocEnd,
165 unsigned TokLen, Preprocessor &PP);
166
167 /// getLangOpts - Return the language features currently enabled.
168 /// NOTE: this lexer modifies features as a file is parsed!
169 const LangOptions &getLangOpts() const { return LangOpts; }
170
171 /// getFileLoc - Return the File Location for the file we are lexing out of.
172 /// The physical location encodes the location where the characters come from,
173 /// the virtual location encodes where we should *claim* the characters came
174 /// from. Currently this is only used by _Pragma handling.
175 SourceLocation getFileLoc() const { return FileLoc; }
176
177private:
178 /// Lex - Return the next token in the file. If this is the end of file, it
179 /// return the tok::eof token. This implicitly involves the preprocessor.
180 bool Lex(Token &Result);
181
182public:
183 /// isPragmaLexer - Returns true if this Lexer is being used to lex a pragma.
184 bool isPragmaLexer() const { return Is_PragmaLexer; }
185
186private:
187 /// IndirectLex - An indirect call to 'Lex' that can be invoked via
188 /// the PreprocessorLexer interface.
189 void IndirectLex(Token &Result) override { Lex(Result); }
190
191public:
192 /// LexFromRawLexer - Lex a token from a designated raw lexer (one with no
193 /// associated preprocessor object. Return true if the 'next character to
194 /// read' pointer points at the end of the lexer buffer, false otherwise.
195 bool LexFromRawLexer(Token &Result) {
196 assert(LexingRawMode && "Not already in raw mode!")((LexingRawMode && "Not already in raw mode!") ? static_cast
<void> (0) : __assert_fail ("LexingRawMode && \"Not already in raw mode!\""
, "/build/llvm-toolchain-snapshot-8~svn350071/tools/clang/include/clang/Lex/Lexer.h"
, 196, __PRETTY_FUNCTION__))
;
197 Lex(Result);
198 // Note that lexing to the end of the buffer doesn't implicitly delete the
199 // lexer when in raw mode.
200 return BufferPtr == BufferEnd;
201 }
202
203 /// isKeepWhitespaceMode - Return true if the lexer should return tokens for
204 /// every character in the file, including whitespace and comments. This
205 /// should only be used in raw mode, as the preprocessor is not prepared to
206 /// deal with the excess tokens.
207 bool isKeepWhitespaceMode() const {
208 return ExtendedTokenMode > 1;
209 }
210
211 /// SetKeepWhitespaceMode - This method lets clients enable or disable
212 /// whitespace retention mode.
213 void SetKeepWhitespaceMode(bool Val) {
214 assert((!Val || LexingRawMode || LangOpts.TraditionalCPP) &&(((!Val || LexingRawMode || LangOpts.TraditionalCPP) &&
"Can only retain whitespace in raw mode or -traditional-cpp"
) ? static_cast<void> (0) : __assert_fail ("(!Val || LexingRawMode || LangOpts.TraditionalCPP) && \"Can only retain whitespace in raw mode or -traditional-cpp\""
, "/build/llvm-toolchain-snapshot-8~svn350071/tools/clang/include/clang/Lex/Lexer.h"
, 215, __PRETTY_FUNCTION__))
215 "Can only retain whitespace in raw mode or -traditional-cpp")(((!Val || LexingRawMode || LangOpts.TraditionalCPP) &&
"Can only retain whitespace in raw mode or -traditional-cpp"
) ? static_cast<void> (0) : __assert_fail ("(!Val || LexingRawMode || LangOpts.TraditionalCPP) && \"Can only retain whitespace in raw mode or -traditional-cpp\""
, "/build/llvm-toolchain-snapshot-8~svn350071/tools/clang/include/clang/Lex/Lexer.h"
, 215, __PRETTY_FUNCTION__))
;
216 ExtendedTokenMode = Val ? 2 : 0;
217 }
218
219 /// inKeepCommentMode - Return true if the lexer should return comments as
220 /// tokens.
221 bool inKeepCommentMode() const {
222 return ExtendedTokenMode > 0;
223 }
224
225 /// SetCommentRetentionMode - Change the comment retention mode of the lexer
226 /// to the specified mode. This is really only useful when lexing in raw
227 /// mode, because otherwise the lexer needs to manage this.
228 void SetCommentRetentionState(bool Mode) {
229 assert(!isKeepWhitespaceMode() &&((!isKeepWhitespaceMode() && "Can't play with comment retention state when retaining whitespace"
) ? static_cast<void> (0) : __assert_fail ("!isKeepWhitespaceMode() && \"Can't play with comment retention state when retaining whitespace\""
, "/build/llvm-toolchain-snapshot-8~svn350071/tools/clang/include/clang/Lex/Lexer.h"
, 230, __PRETTY_FUNCTION__))
230 "Can't play with comment retention state when retaining whitespace")((!isKeepWhitespaceMode() && "Can't play with comment retention state when retaining whitespace"
) ? static_cast<void> (0) : __assert_fail ("!isKeepWhitespaceMode() && \"Can't play with comment retention state when retaining whitespace\""
, "/build/llvm-toolchain-snapshot-8~svn350071/tools/clang/include/clang/Lex/Lexer.h"
, 230, __PRETTY_FUNCTION__))
;
231 ExtendedTokenMode = Mode ? 1 : 0;
232 }
233
234 /// Sets the extended token mode back to its initial value, according to the
235 /// language options and preprocessor. This controls whether the lexer
236 /// produces comment and whitespace tokens.
237 ///
238 /// This requires the lexer to have an associated preprocessor. A standalone
239 /// lexer has nothing to reset to.
240 void resetExtendedTokenMode();
241
242 /// Gets source code buffer.
243 StringRef getBuffer() const {
244 return StringRef(BufferStart, BufferEnd - BufferStart);
245 }
246
247 /// ReadToEndOfLine - Read the rest of the current preprocessor line as an
248 /// uninterpreted string. This switches the lexer out of directive mode.
249 void ReadToEndOfLine(SmallVectorImpl<char> *Result = nullptr);
250
251
252 /// Diag - Forwarding function for diagnostics. This translate a source
253 /// position in the current buffer into a SourceLocation object for rendering.
254 DiagnosticBuilder Diag(const char *Loc, unsigned DiagID) const;
255
256 /// getSourceLocation - Return a source location identifier for the specified
257 /// offset in the current file.
258 SourceLocation getSourceLocation(const char *Loc, unsigned TokLen = 1) const;
259
260 /// getSourceLocation - Return a source location for the next character in
261 /// the current file.
262 SourceLocation getSourceLocation() override {
263 return getSourceLocation(BufferPtr);
264 }
265
266 /// Return the current location in the buffer.
267 const char *getBufferLocation() const { return BufferPtr; }
268
269 /// Stringify - Convert the specified string into a C string by i) escaping
270 /// '\\' and " characters and ii) replacing newline character(s) with "\\n".
271 /// If Charify is true, this escapes the ' character instead of ".
272 static std::string Stringify(StringRef Str, bool Charify = false);
273
274 /// Stringify - Convert the specified string into a C string by i) escaping
275 /// '\\' and " characters and ii) replacing newline character(s) with "\\n".
276 static void Stringify(SmallVectorImpl<char> &Str);
277
278 /// getSpelling - This method is used to get the spelling of a token into a
279 /// preallocated buffer, instead of as an std::string. The caller is required
280 /// to allocate enough space for the token, which is guaranteed to be at least
281 /// Tok.getLength() bytes long. The length of the actual result is returned.
282 ///
283 /// Note that this method may do two possible things: it may either fill in
284 /// the buffer specified with characters, or it may *change the input pointer*
285 /// to point to a constant buffer with the data already in it (avoiding a
286 /// copy). The caller is not allowed to modify the returned buffer pointer
287 /// if an internal buffer is returned.
288 static unsigned getSpelling(const Token &Tok, const char *&Buffer,
289 const SourceManager &SourceMgr,
290 const LangOptions &LangOpts,
291 bool *Invalid = nullptr);
292
293 /// getSpelling() - Return the 'spelling' of the Tok token. The spelling of a
294 /// token is the characters used to represent the token in the source file
295 /// after trigraph expansion and escaped-newline folding. In particular, this
296 /// wants to get the true, uncanonicalized, spelling of things like digraphs
297 /// UCNs, etc.
298 static std::string getSpelling(const Token &Tok,
299 const SourceManager &SourceMgr,
300 const LangOptions &LangOpts,
301 bool *Invalid = nullptr);
302
303 /// getSpelling - This method is used to get the spelling of the
304 /// token at the given source location. If, as is usually true, it
305 /// is not necessary to copy any data, then the returned string may
306 /// not point into the provided buffer.
307 ///
308 /// This method lexes at the expansion depth of the given
309 /// location and does not jump to the expansion or spelling
310 /// location.
311 static StringRef getSpelling(SourceLocation loc,
312 SmallVectorImpl<char> &buffer,
313 const SourceManager &SM,
314 const LangOptions &options,
315 bool *invalid = nullptr);
316
317 /// MeasureTokenLength - Relex the token at the specified location and return
318 /// its length in bytes in the input file. If the token needs cleaning (e.g.
319 /// includes a trigraph or an escaped newline) then this count includes bytes
320 /// that are part of that.
321 static unsigned MeasureTokenLength(SourceLocation Loc,
322 const SourceManager &SM,
323 const LangOptions &LangOpts);
324
325 /// Relex the token at the specified location.
326 /// \returns true if there was a failure, false on success.
327 static bool getRawToken(SourceLocation Loc, Token &Result,
328 const SourceManager &SM,
329 const LangOptions &LangOpts,
330 bool IgnoreWhiteSpace = false);
331
332 /// Given a location any where in a source buffer, find the location
333 /// that corresponds to the beginning of the token in which the original
334 /// source location lands.
335 static SourceLocation GetBeginningOfToken(SourceLocation Loc,
336 const SourceManager &SM,
337 const LangOptions &LangOpts);
338
339 /// Get the physical length (including trigraphs and escaped newlines) of the
340 /// first \p Characters characters of the token starting at TokStart.
341 static unsigned getTokenPrefixLength(SourceLocation TokStart,
342 unsigned CharNo,
343 const SourceManager &SM,
344 const LangOptions &LangOpts);
345
346 /// AdvanceToTokenCharacter - If the current SourceLocation specifies a
347 /// location at the start of a token, return a new location that specifies a
348 /// character within the token. This handles trigraphs and escaped newlines.
349 static SourceLocation AdvanceToTokenCharacter(SourceLocation TokStart,
350 unsigned Characters,
351 const SourceManager &SM,
352 const LangOptions &LangOpts) {
353 return TokStart.getLocWithOffset(
354 getTokenPrefixLength(TokStart, Characters, SM, LangOpts));
355 }
356
357 /// Computes the source location just past the end of the
358 /// token at this source location.
359 ///
360 /// This routine can be used to produce a source location that
361 /// points just past the end of the token referenced by \p Loc, and
362 /// is generally used when a diagnostic needs to point just after a
363 /// token where it expected something different that it received. If
364 /// the returned source location would not be meaningful (e.g., if
365 /// it points into a macro), this routine returns an invalid
366 /// source location.
367 ///
368 /// \param Offset an offset from the end of the token, where the source
369 /// location should refer to. The default offset (0) produces a source
370 /// location pointing just past the end of the token; an offset of 1 produces
371 /// a source location pointing to the last character in the token, etc.
372 static SourceLocation getLocForEndOfToken(SourceLocation Loc, unsigned Offset,
373 const SourceManager &SM,
374 const LangOptions &LangOpts);
375
376 /// Given a token range, produce a corresponding CharSourceRange that
377 /// is not a token range. This allows the source range to be used by
378 /// components that don't have access to the lexer and thus can't find the
379 /// end of the range for themselves.
380 static CharSourceRange getAsCharRange(SourceRange Range,
381 const SourceManager &SM,
382 const LangOptions &LangOpts) {
383 SourceLocation End = getLocForEndOfToken(Range.getEnd(), 0, SM, LangOpts);
384 return End.isInvalid() ? CharSourceRange()
385 : CharSourceRange::getCharRange(
386 Range.getBegin(), End.getLocWithOffset(-1));
387 }
388 static CharSourceRange getAsCharRange(CharSourceRange Range,
389 const SourceManager &SM,
390 const LangOptions &LangOpts) {
391 return Range.isTokenRange()
392 ? getAsCharRange(Range.getAsRange(), SM, LangOpts)
393 : Range;
394 }
395
396 /// Returns true if the given MacroID location points at the first
397 /// token of the macro expansion.
398 ///
399 /// \param MacroBegin If non-null and function returns true, it is set to
400 /// begin location of the macro.
401 static bool isAtStartOfMacroExpansion(SourceLocation loc,
402 const SourceManager &SM,
403 const LangOptions &LangOpts,
404 SourceLocation *MacroBegin = nullptr);
405
406 /// Returns true if the given MacroID location points at the last
407 /// token of the macro expansion.
408 ///
409 /// \param MacroEnd If non-null and function returns true, it is set to
410 /// end location of the macro.
411 static bool isAtEndOfMacroExpansion(SourceLocation loc,
412 const SourceManager &SM,
413 const LangOptions &LangOpts,
414 SourceLocation *MacroEnd = nullptr);
415
416 /// Accepts a range and returns a character range with file locations.
417 ///
418 /// Returns a null range if a part of the range resides inside a macro
419 /// expansion or the range does not reside on the same FileID.
420 ///
421 /// This function is trying to deal with macros and return a range based on
422 /// file locations. The cases where it can successfully handle macros are:
423 ///
424 /// -begin or end range lies at the start or end of a macro expansion, in
425 /// which case the location will be set to the expansion point, e.g:
426 /// \#define M 1 2
427 /// a M
428 /// If you have a range [a, 2] (where 2 came from the macro), the function
429 /// will return a range for "a M"
430 /// if you have range [a, 1], the function will fail because the range
431 /// overlaps with only a part of the macro
432 ///
433 /// -The macro is a function macro and the range can be mapped to the macro
434 /// arguments, e.g:
435 /// \#define M 1 2
436 /// \#define FM(x) x
437 /// FM(a b M)
438 /// if you have range [b, 2], the function will return the file range "b M"
439 /// inside the macro arguments.
440 /// if you have range [a, 2], the function will return the file range
441 /// "FM(a b M)" since the range includes all of the macro expansion.
442 static CharSourceRange makeFileCharRange(CharSourceRange Range,
443 const SourceManager &SM,
444 const LangOptions &LangOpts);
445
446 /// Returns a string for the source that the range encompasses.
447 static StringRef getSourceText(CharSourceRange Range,
448 const SourceManager &SM,
449 const LangOptions &LangOpts,
450 bool *Invalid = nullptr);
451
452 /// Retrieve the name of the immediate macro expansion.
453 ///
454 /// This routine starts from a source location, and finds the name of the macro
455 /// responsible for its immediate expansion. It looks through any intervening
456 /// macro argument expansions to compute this. It returns a StringRef which
457 /// refers to the SourceManager-owned buffer of the source where that macro
458 /// name is spelled. Thus, the result shouldn't out-live that SourceManager.
459 static StringRef getImmediateMacroName(SourceLocation Loc,
460 const SourceManager &SM,
461 const LangOptions &LangOpts);
462
463 /// Retrieve the name of the immediate macro expansion.
464 ///
465 /// This routine starts from a source location, and finds the name of the
466 /// macro responsible for its immediate expansion. It looks through any
467 /// intervening macro argument expansions to compute this. It returns a
468 /// StringRef which refers to the SourceManager-owned buffer of the source
469 /// where that macro name is spelled. Thus, the result shouldn't out-live
470 /// that SourceManager.
471 ///
472 /// This differs from Lexer::getImmediateMacroName in that any macro argument
473 /// location will result in the topmost function macro that accepted it.
474 /// e.g.
475 /// \code
476 /// MAC1( MAC2(foo) )
477 /// \endcode
478 /// for location of 'foo' token, this function will return "MAC1" while
479 /// Lexer::getImmediateMacroName will return "MAC2".
480 static StringRef getImmediateMacroNameForDiagnostics(
481 SourceLocation Loc, const SourceManager &SM, const LangOptions &LangOpts);
482
483 /// Compute the preamble of the given file.
484 ///
485 /// The preamble of a file contains the initial comments, include directives,
486 /// and other preprocessor directives that occur before the code in this
487 /// particular file actually begins. The preamble of the main source file is
488 /// a potential prefix header.
489 ///
490 /// \param Buffer The memory buffer containing the file's contents.
491 ///
492 /// \param MaxLines If non-zero, restrict the length of the preamble
493 /// to fewer than this number of lines.
494 ///
495 /// \returns The offset into the file where the preamble ends and the rest
496 /// of the file begins along with a boolean value indicating whether
497 /// the preamble ends at the beginning of a new line.
498 static PreambleBounds ComputePreamble(StringRef Buffer,
499 const LangOptions &LangOpts,
500 unsigned MaxLines = 0);
501
502 /// Finds the token that comes right after the given location.
503 ///
504 /// Returns the next token, or none if the location is inside a macro.
505 static Optional<Token> findNextToken(SourceLocation Loc,
506 const SourceManager &SM,
507 const LangOptions &LangOpts);
508
509 /// Checks that the given token is the first token that occurs after
510 /// the given location (this excludes comments and whitespace). Returns the
511 /// location immediately after the specified token. If the token is not found
512 /// or the location is inside a macro, the returned source location will be
513 /// invalid.
514 static SourceLocation findLocationAfterToken(SourceLocation loc,
515 tok::TokenKind TKind,
516 const SourceManager &SM,
517 const LangOptions &LangOpts,
518 bool SkipTrailingWhitespaceAndNewLine);
519
520 /// Returns true if the given character could appear in an identifier.
521 static bool isIdentifierBodyChar(char c, const LangOptions &LangOpts);
522
523 /// Checks whether new line pointed by Str is preceded by escape
524 /// sequence.
525 static bool isNewLineEscaped(const char *BufferStart, const char *Str);
526
527 /// getCharAndSizeNoWarn - Like the getCharAndSize method, but does not ever
528 /// emit a warning.
529 static inline char getCharAndSizeNoWarn(const char *Ptr, unsigned &Size,
530 const LangOptions &LangOpts) {
531 // If this is not a trigraph and not a UCN or escaped newline, return
532 // quickly.
533 if (isObviouslySimpleCharacter(Ptr[0])) {
534 Size = 1;
535 return *Ptr;
536 }
537
538 Size = 0;
539 return getCharAndSizeSlowNoWarn(Ptr, Size, LangOpts);
540 }
541
542 /// Returns the leading whitespace for line that corresponds to the given
543 /// location \p Loc.
544 static StringRef getIndentationForLine(SourceLocation Loc,
545 const SourceManager &SM);
546
547private:
548 //===--------------------------------------------------------------------===//
549 // Internal implementation interfaces.
550
551 /// LexTokenInternal - Internal interface to lex a preprocessing token. Called
552 /// by Lex.
553 ///
554 bool LexTokenInternal(Token &Result, bool TokAtPhysicalStartOfLine);
555
556 bool CheckUnicodeWhitespace(Token &Result, uint32_t C, const char *CurPtr);
557
558 /// Given that a token begins with the Unicode character \p C, figure out
559 /// what kind of token it is and dispatch to the appropriate lexing helper
560 /// function.
561 bool LexUnicode(Token &Result, uint32_t C, const char *CurPtr);
562
563 /// FormTokenWithChars - When we lex a token, we have identified a span
564 /// starting at BufferPtr, going to TokEnd that forms the token. This method
565 /// takes that range and assigns it to the token as its location and size. In
566 /// addition, since tokens cannot overlap, this also updates BufferPtr to be
567 /// TokEnd.
568 void FormTokenWithChars(Token &Result, const char *TokEnd,
569 tok::TokenKind Kind) {
570 unsigned TokLen = TokEnd-BufferPtr;
571 Result.setLength(TokLen);
572 Result.setLocation(getSourceLocation(BufferPtr, TokLen));
573 Result.setKind(Kind);
574 BufferPtr = TokEnd;
575 }
576
577 /// isNextPPTokenLParen - Return 1 if the next unexpanded token will return a
578 /// tok::l_paren token, 0 if it is something else and 2 if there are no more
579 /// tokens in the buffer controlled by this lexer.
580 unsigned isNextPPTokenLParen();
581
582 //===--------------------------------------------------------------------===//
583 // Lexer character reading interfaces.
584
585 // This lexer is built on two interfaces for reading characters, both of which
586 // automatically provide phase 1/2 translation. getAndAdvanceChar is used
587 // when we know that we will be reading a character from the input buffer and
588 // that this character will be part of the result token. This occurs in (f.e.)
589 // string processing, because we know we need to read until we find the
590 // closing '"' character.
591 //
592 // The second interface is the combination of getCharAndSize with
593 // ConsumeChar. getCharAndSize reads a phase 1/2 translated character,
594 // returning it and its size. If the lexer decides that this character is
595 // part of the current token, it calls ConsumeChar on it. This two stage
596 // approach allows us to emit diagnostics for characters (e.g. warnings about
597 // trigraphs), knowing that they only are emitted if the character is
598 // consumed.
599
600 /// isObviouslySimpleCharacter - Return true if the specified character is
601 /// obviously the same in translation phase 1 and translation phase 3. This
602 /// can return false for characters that end up being the same, but it will
603 /// never return true for something that needs to be mapped.
604 static bool isObviouslySimpleCharacter(char C) {
605 return C != '?' && C != '\\';
606 }
607
608 /// getAndAdvanceChar - Read a single 'character' from the specified buffer,
609 /// advance over it, and return it. This is tricky in several cases. Here we
610 /// just handle the trivial case and fall-back to the non-inlined
611 /// getCharAndSizeSlow method to handle the hard case.
612 inline char getAndAdvanceChar(const char *&Ptr, Token &Tok) {
613 // If this is not a trigraph and not a UCN or escaped newline, return
614 // quickly.
615 if (isObviouslySimpleCharacter(Ptr[0])) return *Ptr++;
6
Taking false branch
616
617 unsigned Size = 0;
618 char C = getCharAndSizeSlow(Ptr, Size, &Tok);
7
Calling 'Lexer::getCharAndSizeSlow'
619 Ptr += Size;
620 return C;
621 }
622
623 /// ConsumeChar - When a character (identified by getCharAndSize) is consumed
624 /// and added to a given token, check to see if there are diagnostics that
625 /// need to be emitted or flags that need to be set on the token. If so, do
626 /// it.
627 const char *ConsumeChar(const char *Ptr, unsigned Size, Token &Tok) {
628 // Normal case, we consumed exactly one token. Just return it.
629 if (Size == 1)
630 return Ptr+Size;
631
632 // Otherwise, re-lex the character with a current token, allowing
633 // diagnostics to be emitted and flags to be set.
634 Size = 0;
635 getCharAndSizeSlow(Ptr, Size, &Tok);
636 return Ptr+Size;
637 }
638
639 /// getCharAndSize - Peek a single 'character' from the specified buffer,
640 /// get its size, and return it. This is tricky in several cases. Here we
641 /// just handle the trivial case and fall-back to the non-inlined
642 /// getCharAndSizeSlow method to handle the hard case.
643 inline char getCharAndSize(const char *Ptr, unsigned &Size) {
644 // If this is not a trigraph and not a UCN or escaped newline, return
645 // quickly.
646 if (isObviouslySimpleCharacter(Ptr[0])) {
647 Size = 1;
648 return *Ptr;
649 }
650
651 Size = 0;
652 return getCharAndSizeSlow(Ptr, Size);
653 }
654
655 /// getCharAndSizeSlow - Handle the slow/uncommon case of the getCharAndSize
656 /// method.
657 char getCharAndSizeSlow(const char *Ptr, unsigned &Size,
658 Token *Tok = nullptr);
659
660 /// getEscapedNewLineSize - Return the size of the specified escaped newline,
661 /// or 0 if it is not an escaped newline. P[-1] is known to be a "\" on entry
662 /// to this function.
663 static unsigned getEscapedNewLineSize(const char *P);
664
665 /// SkipEscapedNewLines - If P points to an escaped newline (or a series of
666 /// them), skip over them and return the first non-escaped-newline found,
667 /// otherwise return P.
668 static const char *SkipEscapedNewLines(const char *P);
669
670 /// getCharAndSizeSlowNoWarn - Same as getCharAndSizeSlow, but never emits a
671 /// diagnostic.
672 static char getCharAndSizeSlowNoWarn(const char *Ptr, unsigned &Size,
673 const LangOptions &LangOpts);
674
675 //===--------------------------------------------------------------------===//
676 // Other lexer functions.
677
678 void SetByteOffset(unsigned Offset, bool StartOfLine);
679
680 void PropagateLineStartLeadingSpaceInfo(Token &Result);
681
682 const char *LexUDSuffix(Token &Result, const char *CurPtr,
683 bool IsStringLiteral);
684
685 // Helper functions to lex the remainder of a token of the specific type.
686 bool LexIdentifier (Token &Result, const char *CurPtr);
687 bool LexNumericConstant (Token &Result, const char *CurPtr);
688 bool LexStringLiteral (Token &Result, const char *CurPtr,
689 tok::TokenKind Kind);
690 bool LexRawStringLiteral (Token &Result, const char *CurPtr,
691 tok::TokenKind Kind);
692 bool LexAngledStringLiteral(Token &Result, const char *CurPtr);
693 bool LexCharConstant (Token &Result, const char *CurPtr,
694 tok::TokenKind Kind);
695 bool LexEndOfFile (Token &Result, const char *CurPtr);
696 bool SkipWhitespace (Token &Result, const char *CurPtr,
697 bool &TokAtPhysicalStartOfLine);
698 bool SkipLineComment (Token &Result, const char *CurPtr,
699 bool &TokAtPhysicalStartOfLine);
700 bool SkipBlockComment (Token &Result, const char *CurPtr,
701 bool &TokAtPhysicalStartOfLine);
702 bool SaveLineComment (Token &Result, const char *CurPtr);
703
704 bool IsStartOfConflictMarker(const char *CurPtr);
705 bool HandleEndOfConflictMarker(const char *CurPtr);
706
707 bool lexEditorPlaceholder(Token &Result, const char *CurPtr);
708
709 bool isCodeCompletionPoint(const char *CurPtr) const;
710 void cutOffLexing() { BufferPtr = BufferEnd; }
711
712 bool isHexaLiteral(const char *Start, const LangOptions &LangOpts);
713
714 void codeCompleteIncludedFile(const char *PathStart,
715 const char *CompletionPoint, bool IsAngled);
716
717 /// Read a universal character name.
718 ///
719 /// \param StartPtr The position in the source buffer after the initial '\'.
720 /// If the UCN is syntactically well-formed (but not
721 /// necessarily valid), this parameter will be updated to
722 /// point to the character after the UCN.
723 /// \param SlashLoc The position in the source buffer of the '\'.
724 /// \param Result The token being formed. Pass \c nullptr to suppress
725 /// diagnostics and handle token formation in the caller.
726 ///
727 /// \return The Unicode codepoint specified by the UCN, or 0 if the UCN is
728 /// invalid.
729 uint32_t tryReadUCN(const char *&StartPtr, const char *SlashLoc, Token *Result);
730
731 /// Try to consume a UCN as part of an identifier at the current
732 /// location.
733 /// \param CurPtr Initially points to the range of characters in the source
734 /// buffer containing the '\'. Updated to point past the end of
735 /// the UCN on success.
736 /// \param Size The number of characters occupied by the '\' (including
737 /// trigraphs and escaped newlines).
738 /// \param Result The token being produced. Marked as containing a UCN on
739 /// success.
740 /// \return \c true if a UCN was lexed and it produced an acceptable
741 /// identifier character, \c false otherwise.
742 bool tryConsumeIdentifierUCN(const char *&CurPtr, unsigned Size,
743 Token &Result);
744
745 /// Try to consume an identifier character encoded in UTF-8.
746 /// \param CurPtr Points to the start of the (potential) UTF-8 code unit
747 /// sequence. On success, updated to point past the end of it.
748 /// \return \c true if a UTF-8 sequence mapping to an acceptable identifier
749 /// character was lexed, \c false otherwise.
750 bool tryConsumeIdentifierUTF8Char(const char *&CurPtr);
751};
752
753} // namespace clang
754
755#endif // LLVM_CLANG_LEX_LEXER_H

/build/llvm-toolchain-snapshot-8~svn350071/tools/clang/include/clang/Lex/Token.h

1//===--- Token.h - Token interface ------------------------------*- C++ -*-===//
2//
3// The LLVM Compiler Infrastructure
4//
5// This file is distributed under the University of Illinois Open Source
6// License. See LICENSE.TXT for details.
7//
8//===----------------------------------------------------------------------===//
9//
10// This file defines the Token interface.
11//
12//===----------------------------------------------------------------------===//
13
14#ifndef LLVM_CLANG_LEX_TOKEN_H
15#define LLVM_CLANG_LEX_TOKEN_H
16
17#include "clang/Basic/SourceLocation.h"
18#include "clang/Basic/TokenKinds.h"
19#include "llvm/ADT/StringRef.h"
20#include <cassert>
21
22namespace clang {
23
24class IdentifierInfo;
25
26/// Token - This structure provides full information about a lexed token.
27/// It is not intended to be space efficient, it is intended to return as much
28/// information as possible about each returned token. This is expected to be
29/// compressed into a smaller form if memory footprint is important.
30///
31/// The parser can create a special "annotation token" representing a stream of
32/// tokens that were parsed and semantically resolved, e.g.: "foo::MyClass<int>"
33/// can be represented by a single typename annotation token that carries
34/// information about the SourceRange of the tokens and the type object.
35class Token {
36 /// The location of the token. This is actually a SourceLocation.
37 unsigned Loc;
38
39 // Conceptually these next two fields could be in a union. However, this
40 // causes gcc 4.2 to pessimize LexTokenInternal, a very performance critical
41 // routine. Keeping as separate members with casts until a more beautiful fix
42 // presents itself.
43
44 /// UintData - This holds either the length of the token text, when
45 /// a normal token, or the end of the SourceRange when an annotation
46 /// token.
47 unsigned UintData;
48
49 /// PtrData - This is a union of four different pointer types, which depends
50 /// on what type of token this is:
51 /// Identifiers, keywords, etc:
52 /// This is an IdentifierInfo*, which contains the uniqued identifier
53 /// spelling.
54 /// Literals: isLiteral() returns true.
55 /// This is a pointer to the start of the token in a text buffer, which
56 /// may be dirty (have trigraphs / escaped newlines).
57 /// Annotations (resolved type names, C++ scopes, etc): isAnnotation().
58 /// This is a pointer to sema-specific data for the annotation token.
59 /// Eof:
60 // This is a pointer to a Decl.
61 /// Other:
62 /// This is null.
63 void *PtrData;
64
65 /// Kind - The actual flavor of token this is.
66 tok::TokenKind Kind;
67
68 /// Flags - Bits we track about this token, members of the TokenFlags enum.
69 unsigned short Flags;
70
71public:
72 // Various flags set per token:
73 enum TokenFlags {
74 StartOfLine = 0x01, // At start of line or only after whitespace
75 // (considering the line after macro expansion).
76 LeadingSpace = 0x02, // Whitespace exists before this token (considering
77 // whitespace after macro expansion).
78 DisableExpand = 0x04, // This identifier may never be macro expanded.
79 NeedsCleaning = 0x08, // Contained an escaped newline or trigraph.
80 LeadingEmptyMacro = 0x10, // Empty macro exists before this token.
81 HasUDSuffix = 0x20, // This string or character literal has a ud-suffix.
82 HasUCN = 0x40, // This identifier contains a UCN.
83 IgnoredComma = 0x80, // This comma is not a macro argument separator (MS).
84 StringifiedInMacro = 0x100, // This string or character literal is formed by
85 // macro stringizing or charizing operator.
86 CommaAfterElided = 0x200, // The comma following this token was elided (MS).
87 IsEditorPlaceholder = 0x400, // This identifier is a placeholder.
88 };
89
90 tok::TokenKind getKind() const { return Kind; }
91 void setKind(tok::TokenKind K) { Kind = K; }
92
93 /// is/isNot - Predicates to check if this token is a specific kind, as in
94 /// "if (Tok.is(tok::l_brace)) {...}".
95 bool is(tok::TokenKind K) const { return Kind == K; }
96 bool isNot(tok::TokenKind K) const { return Kind != K; }
97 bool isOneOf(tok::TokenKind K1, tok::TokenKind K2) const {
98 return is(K1) || is(K2);
99 }
100 template <typename... Ts>
101 bool isOneOf(tok::TokenKind K1, tok::TokenKind K2, Ts... Ks) const {
102 return is(K1) || isOneOf(K2, Ks...);
103 }
104
105 /// Return true if this is a raw identifier (when lexing
106 /// in raw mode) or a non-keyword identifier (when lexing in non-raw mode).
107 bool isAnyIdentifier() const {
108 return tok::isAnyIdentifier(getKind());
109 }
110
111 /// Return true if this is a "literal", like a numeric
112 /// constant, string, etc.
113 bool isLiteral() const {
114 return tok::isLiteral(getKind());
115 }
116
117 /// Return true if this is any of tok::annot_* kind tokens.
118 bool isAnnotation() const {
119 return tok::isAnnotation(getKind());
120 }
121
122 /// Return a source location identifier for the specified
123 /// offset in the current file.
124 SourceLocation getLocation() const {
125 return SourceLocation::getFromRawEncoding(Loc);
126 }
127 unsigned getLength() const {
128 assert(!isAnnotation() && "Annotation tokens have no length field")((!isAnnotation() && "Annotation tokens have no length field"
) ? static_cast<void> (0) : __assert_fail ("!isAnnotation() && \"Annotation tokens have no length field\""
, "/build/llvm-toolchain-snapshot-8~svn350071/tools/clang/include/clang/Lex/Token.h"
, 128, __PRETTY_FUNCTION__))
;
129 return UintData;
130 }
131
132 void setLocation(SourceLocation L) { Loc = L.getRawEncoding(); }
133 void setLength(unsigned Len) {
134 assert(!isAnnotation() && "Annotation tokens have no length field")((!isAnnotation() && "Annotation tokens have no length field"
) ? static_cast<void> (0) : __assert_fail ("!isAnnotation() && \"Annotation tokens have no length field\""
, "/build/llvm-toolchain-snapshot-8~svn350071/tools/clang/include/clang/Lex/Token.h"
, 134, __PRETTY_FUNCTION__))
;
135 UintData = Len;
136 }
137
138 SourceLocation getAnnotationEndLoc() const {
139 assert(isAnnotation() && "Used AnnotEndLocID on non-annotation token")((isAnnotation() && "Used AnnotEndLocID on non-annotation token"
) ? static_cast<void> (0) : __assert_fail ("isAnnotation() && \"Used AnnotEndLocID on non-annotation token\""
, "/build/llvm-toolchain-snapshot-8~svn350071/tools/clang/include/clang/Lex/Token.h"
, 139, __PRETTY_FUNCTION__))
;
140 return SourceLocation::getFromRawEncoding(UintData ? UintData : Loc);
141 }
142 void setAnnotationEndLoc(SourceLocation L) {
143 assert(isAnnotation() && "Used AnnotEndLocID on non-annotation token")((isAnnotation() && "Used AnnotEndLocID on non-annotation token"
) ? static_cast<void> (0) : __assert_fail ("isAnnotation() && \"Used AnnotEndLocID on non-annotation token\""
, "/build/llvm-toolchain-snapshot-8~svn350071/tools/clang/include/clang/Lex/Token.h"
, 143, __PRETTY_FUNCTION__))
;
144 UintData = L.getRawEncoding();
145 }
146
147 SourceLocation getLastLoc() const {
148 return isAnnotation() ? getAnnotationEndLoc() : getLocation();
149 }
150
151 SourceLocation getEndLoc() const {
152 return isAnnotation() ? getAnnotationEndLoc()
153 : getLocation().getLocWithOffset(getLength());
154 }
155
156 /// SourceRange of the group of tokens that this annotation token
157 /// represents.
158 SourceRange getAnnotationRange() const {
159 return SourceRange(getLocation(), getAnnotationEndLoc());
160 }
161 void setAnnotationRange(SourceRange R) {
162 setLocation(R.getBegin());
163 setAnnotationEndLoc(R.getEnd());
164 }
165
166 const char *getName() const { return tok::getTokenName(Kind); }
167
168 /// Reset all flags to cleared.
169 void startToken() {
170 Kind = tok::unknown;
171 Flags = 0;
172 PtrData = nullptr;
173 UintData = 0;
174 Loc = SourceLocation().getRawEncoding();
175 }
176
177 IdentifierInfo *getIdentifierInfo() const {
178 assert(isNot(tok::raw_identifier) &&((isNot(tok::raw_identifier) && "getIdentifierInfo() on a tok::raw_identifier token!"
) ? static_cast<void> (0) : __assert_fail ("isNot(tok::raw_identifier) && \"getIdentifierInfo() on a tok::raw_identifier token!\""
, "/build/llvm-toolchain-snapshot-8~svn350071/tools/clang/include/clang/Lex/Token.h"
, 179, __PRETTY_FUNCTION__))
179 "getIdentifierInfo() on a tok::raw_identifier token!")((isNot(tok::raw_identifier) && "getIdentifierInfo() on a tok::raw_identifier token!"
) ? static_cast<void> (0) : __assert_fail ("isNot(tok::raw_identifier) && \"getIdentifierInfo() on a tok::raw_identifier token!\""
, "/build/llvm-toolchain-snapshot-8~svn350071/tools/clang/include/clang/Lex/Token.h"
, 179, __PRETTY_FUNCTION__))
;
180 assert(!isAnnotation() &&((!isAnnotation() && "getIdentifierInfo() on an annotation token!"
) ? static_cast<void> (0) : __assert_fail ("!isAnnotation() && \"getIdentifierInfo() on an annotation token!\""
, "/build/llvm-toolchain-snapshot-8~svn350071/tools/clang/include/clang/Lex/Token.h"
, 181, __PRETTY_FUNCTION__))
181 "getIdentifierInfo() on an annotation token!")((!isAnnotation() && "getIdentifierInfo() on an annotation token!"
) ? static_cast<void> (0) : __assert_fail ("!isAnnotation() && \"getIdentifierInfo() on an annotation token!\""
, "/build/llvm-toolchain-snapshot-8~svn350071/tools/clang/include/clang/Lex/Token.h"
, 181, __PRETTY_FUNCTION__))
;
182 if (isLiteral()) return nullptr;
183 if (is(tok::eof)) return nullptr;
184 return (IdentifierInfo*) PtrData;
185 }
186 void setIdentifierInfo(IdentifierInfo *II) {
187 PtrData = (void*) II;
188 }
189
190 const void *getEofData() const {
191 assert(is(tok::eof))((is(tok::eof)) ? static_cast<void> (0) : __assert_fail
("is(tok::eof)", "/build/llvm-toolchain-snapshot-8~svn350071/tools/clang/include/clang/Lex/Token.h"
, 191, __PRETTY_FUNCTION__))
;
192 return reinterpret_cast<const void *>(PtrData);
193 }
194 void setEofData(const void *D) {
195 assert(is(tok::eof))((is(tok::eof)) ? static_cast<void> (0) : __assert_fail
("is(tok::eof)", "/build/llvm-toolchain-snapshot-8~svn350071/tools/clang/include/clang/Lex/Token.h"
, 195, __PRETTY_FUNCTION__))
;
196 assert(!PtrData)((!PtrData) ? static_cast<void> (0) : __assert_fail ("!PtrData"
, "/build/llvm-toolchain-snapshot-8~svn350071/tools/clang/include/clang/Lex/Token.h"
, 196, __PRETTY_FUNCTION__))
;
197 PtrData = const_cast<void *>(D);
198 }
199
200 /// getRawIdentifier - For a raw identifier token (i.e., an identifier
201 /// lexed in raw mode), returns a reference to the text substring in the
202 /// buffer if known.
203 StringRef getRawIdentifier() const {
204 assert(is(tok::raw_identifier))((is(tok::raw_identifier)) ? static_cast<void> (0) : __assert_fail
("is(tok::raw_identifier)", "/build/llvm-toolchain-snapshot-8~svn350071/tools/clang/include/clang/Lex/Token.h"
, 204, __PRETTY_FUNCTION__))
;
205 return StringRef(reinterpret_cast<const char *>(PtrData), getLength());
206 }
207 void setRawIdentifierData(const char *Ptr) {
208 assert(is(tok::raw_identifier))((is(tok::raw_identifier)) ? static_cast<void> (0) : __assert_fail
("is(tok::raw_identifier)", "/build/llvm-toolchain-snapshot-8~svn350071/tools/clang/include/clang/Lex/Token.h"
, 208, __PRETTY_FUNCTION__))
;
209 PtrData = const_cast<char*>(Ptr);
210 }
211
212 /// getLiteralData - For a literal token (numeric constant, string, etc), this
213 /// returns a pointer to the start of it in the text buffer if known, null
214 /// otherwise.
215 const char *getLiteralData() const {
216 assert(isLiteral() && "Cannot get literal data of non-literal")((isLiteral() && "Cannot get literal data of non-literal"
) ? static_cast<void> (0) : __assert_fail ("isLiteral() && \"Cannot get literal data of non-literal\""
, "/build/llvm-toolchain-snapshot-8~svn350071/tools/clang/include/clang/Lex/Token.h"
, 216, __PRETTY_FUNCTION__))
;
217 return reinterpret_cast<const char*>(PtrData);
218 }
219 void setLiteralData(const char *Ptr) {
220 assert(isLiteral() && "Cannot set literal data of non-literal")((isLiteral() && "Cannot set literal data of non-literal"
) ? static_cast<void> (0) : __assert_fail ("isLiteral() && \"Cannot set literal data of non-literal\""
, "/build/llvm-toolchain-snapshot-8~svn350071/tools/clang/include/clang/Lex/Token.h"
, 220, __PRETTY_FUNCTION__))
;
221 PtrData = const_cast<char*>(Ptr);
222 }
223
224 void *getAnnotationValue() const {
225 assert(isAnnotation() && "Used AnnotVal on non-annotation token")((isAnnotation() && "Used AnnotVal on non-annotation token"
) ? static_cast<void> (0) : __assert_fail ("isAnnotation() && \"Used AnnotVal on non-annotation token\""
, "/build/llvm-toolchain-snapshot-8~svn350071/tools/clang/include/clang/Lex/Token.h"
, 225, __PRETTY_FUNCTION__))
;
226 return PtrData;
227 }
228 void setAnnotationValue(void *val) {
229 assert(isAnnotation() && "Used AnnotVal on non-annotation token")((isAnnotation() && "Used AnnotVal on non-annotation token"
) ? static_cast<void> (0) : __assert_fail ("isAnnotation() && \"Used AnnotVal on non-annotation token\""
, "/build/llvm-toolchain-snapshot-8~svn350071/tools/clang/include/clang/Lex/Token.h"
, 229, __PRETTY_FUNCTION__))
;
230 PtrData = val;
231 }
232
233 /// Set the specified flag.
234 void setFlag(TokenFlags Flag) {
235 Flags |= Flag;
14
The left expression of the compound assignment is an uninitialized value. The computed value will also be garbage
236 }
237
238 /// Get the specified flag.
239 bool getFlag(TokenFlags Flag) const {
240 return (Flags & Flag) != 0;
241 }
242
243 /// Unset the specified flag.
244 void clearFlag(TokenFlags Flag) {
245 Flags &= ~Flag;
246 }
247
248 /// Return the internal represtation of the flags.
249 ///
250 /// This is only intended for low-level operations such as writing tokens to
251 /// disk.
252 unsigned getFlags() const {
253 return Flags;
254 }
255
256 /// Set a flag to either true or false.
257 void setFlagValue(TokenFlags Flag, bool Val) {
258 if (Val)
259 setFlag(Flag);
260 else
261 clearFlag(Flag);
262 }
263
264 /// isAtStartOfLine - Return true if this token is at the start of a line.
265 ///
266 bool isAtStartOfLine() const { return getFlag(StartOfLine); }
267
268 /// Return true if this token has whitespace before it.
269 ///
270 bool hasLeadingSpace() const { return getFlag(LeadingSpace); }
271
272 /// Return true if this identifier token should never
273 /// be expanded in the future, due to C99 6.10.3.4p2.
274 bool isExpandDisabled() const { return getFlag(DisableExpand); }
275
276 /// Return true if we have an ObjC keyword identifier.
277 bool isObjCAtKeyword(tok::ObjCKeywordKind objcKey) const;
278
279 /// Return the ObjC keyword kind.
280 tok::ObjCKeywordKind getObjCKeywordID() const;
281
282 /// Return true if this token has trigraphs or escaped newlines in it.
283 bool needsCleaning() const { return getFlag(NeedsCleaning); }
284
285 /// Return true if this token has an empty macro before it.
286 ///
287 bool hasLeadingEmptyMacro() const { return getFlag(LeadingEmptyMacro); }
288
289 /// Return true if this token is a string or character literal which
290 /// has a ud-suffix.
291 bool hasUDSuffix() const { return getFlag(HasUDSuffix); }
292
293 /// Returns true if this token contains a universal character name.
294 bool hasUCN() const { return getFlag(HasUCN); }
295
296 /// Returns true if this token is formed by macro by stringizing or charizing
297 /// operator.
298 bool stringifiedInMacro() const { return getFlag(StringifiedInMacro); }
299
300 /// Returns true if the comma after this token was elided.
301 bool commaAfterElided() const { return getFlag(CommaAfterElided); }
302
303 /// Returns true if this token is an editor placeholder.
304 ///
305 /// Editor placeholders are produced by the code-completion engine and are
306 /// represented as characters between '<#' and '#>' in the source code. The
307 /// lexer uses identifier tokens to represent placeholders.
308 bool isEditorPlaceholder() const { return getFlag(IsEditorPlaceholder); }
309};
310
311/// Information about the conditional stack (\#if directives)
312/// currently active.
313struct PPConditionalInfo {
314 /// Location where the conditional started.
315 SourceLocation IfLoc;
316
317 /// True if this was contained in a skipping directive, e.g.,
318 /// in a "\#if 0" block.
319 bool WasSkipping;
320
321 /// True if we have emitted tokens already, and now we're in
322 /// an \#else block or something. Only useful in Skipping blocks.
323 bool FoundNonSkip;
324
325 /// True if we've seen a \#else in this block. If so,
326 /// \#elif/\#else directives are not allowed.
327 bool FoundElse;
328};
329
330} // end namespace clang
331
332namespace llvm {
333 template <>
334 struct isPodLike<clang::Token> { static const bool value = true; };
335} // end namespace llvm
336
337#endif // LLVM_CLANG_LEX_TOKEN_H