Bug Summary

File: tools/clang/include/clang/Lex/Token.h
Warning: line 237, column 11
The left expression of the compound assignment is an uninitialized value; the computed value will also be garbage.

Annotated Source Code

Press '?' to see keyboard shortcuts

clang -cc1 -triple x86_64-pc-linux-gnu -analyze -disable-free -disable-llvm-verifier -discard-value-names -main-file-name Lexer.cpp -analyzer-store=region -analyzer-opt-analyze-nested-blocks -analyzer-checker=core -analyzer-checker=apiModeling -analyzer-checker=unix -analyzer-checker=deadcode -analyzer-checker=cplusplus -analyzer-checker=security.insecureAPI.UncheckedReturn -analyzer-checker=security.insecureAPI.getpw -analyzer-checker=security.insecureAPI.gets -analyzer-checker=security.insecureAPI.mktemp -analyzer-checker=security.insecureAPI.mkstemp -analyzer-checker=security.insecureAPI.vfork -analyzer-checker=nullability.NullPassedToNonnull -analyzer-checker=nullability.NullReturnedFromNonnull -analyzer-output plist -w -analyzer-config-compatibility-mode=true -mrelocation-model pic -pic-level 2 -mthread-model posix -relaxed-aliasing -fmath-errno -masm-verbose -mconstructor-aliases -munwind-tables -fuse-init-array -target-cpu x86-64 -dwarf-column-info -debugger-tuning=gdb -momit-leaf-frame-pointer -ffunction-sections -fdata-sections -resource-dir /usr/lib/llvm-9/lib/clang/9.0.0 -D CLANG_VENDOR="Debian " -D _DEBUG -D _GNU_SOURCE -D __STDC_CONSTANT_MACROS -D __STDC_FORMAT_MACROS -D __STDC_LIMIT_MACROS -I /build/llvm-toolchain-snapshot-9~svn361465/build-llvm/tools/clang/lib/Lex -I /build/llvm-toolchain-snapshot-9~svn361465/tools/clang/lib/Lex -I /build/llvm-toolchain-snapshot-9~svn361465/tools/clang/include -I /build/llvm-toolchain-snapshot-9~svn361465/build-llvm/tools/clang/include -I /build/llvm-toolchain-snapshot-9~svn361465/build-llvm/include -I /build/llvm-toolchain-snapshot-9~svn361465/include -U NDEBUG -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/6.3.0/../../../../include/c++/6.3.0 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/6.3.0/../../../../include/x86_64-linux-gnu/c++/6.3.0 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/6.3.0/../../../../include/x86_64-linux-gnu/c++/6.3.0 -internal-isystem 
/usr/lib/gcc/x86_64-linux-gnu/6.3.0/../../../../include/c++/6.3.0/backward -internal-isystem /usr/include/clang/9.0.0/include/ -internal-isystem /usr/local/include -internal-isystem /usr/lib/llvm-9/lib/clang/9.0.0/include -internal-externc-isystem /usr/include/x86_64-linux-gnu -internal-externc-isystem /include -internal-externc-isystem /usr/include -O2 -Wno-unused-parameter -Wwrite-strings -Wno-missing-field-initializers -Wno-long-long -Wno-maybe-uninitialized -Wno-comment -std=c++11 -fdeprecated-macro -fdebug-compilation-dir /build/llvm-toolchain-snapshot-9~svn361465/build-llvm/tools/clang/lib/Lex -fdebug-prefix-map=/build/llvm-toolchain-snapshot-9~svn361465=. -ferror-limit 19 -fmessage-length 0 -fvisibility-inlines-hidden -stack-protector 2 -fobjc-runtime=gcc -fno-common -fdiagnostics-show-option -vectorize-loops -vectorize-slp -analyzer-output=html -analyzer-config stable-report-filename=true -o /tmp/scan-build-2019-05-24-031927-21217-1 -x c++ /build/llvm-toolchain-snapshot-9~svn361465/tools/clang/lib/Lex/Lexer.cpp -faddrsig

/build/llvm-toolchain-snapshot-9~svn361465/tools/clang/lib/Lex/Lexer.cpp

1//===- Lexer.cpp - C Language Family Lexer --------------------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file implements the Lexer and Token interfaces.
10//
11//===----------------------------------------------------------------------===//
12
13#include "clang/Lex/Lexer.h"
14#include "UnicodeCharSets.h"
15#include "clang/Basic/CharInfo.h"
16#include "clang/Basic/IdentifierTable.h"
17#include "clang/Basic/LangOptions.h"
18#include "clang/Basic/SourceLocation.h"
19#include "clang/Basic/SourceManager.h"
20#include "clang/Basic/TokenKinds.h"
21#include "clang/Lex/LexDiagnostic.h"
22#include "clang/Lex/LiteralSupport.h"
23#include "clang/Lex/MultipleIncludeOpt.h"
24#include "clang/Lex/Preprocessor.h"
25#include "clang/Lex/PreprocessorOptions.h"
26#include "clang/Lex/Token.h"
27#include "clang/Basic/Diagnostic.h"
28#include "clang/Basic/LLVM.h"
29#include "clang/Basic/TokenKinds.h"
30#include "llvm/ADT/None.h"
31#include "llvm/ADT/Optional.h"
32#include "llvm/ADT/StringExtras.h"
33#include "llvm/ADT/StringSwitch.h"
34#include "llvm/ADT/StringRef.h"
35#include "llvm/Support/Compiler.h"
36#include "llvm/Support/ConvertUTF.h"
37#include "llvm/Support/MathExtras.h"
38#include "llvm/Support/MemoryBuffer.h"
39#include "llvm/Support/NativeFormatting.h"
40#include "llvm/Support/UnicodeCharRanges.h"
41#include <algorithm>
42#include <cassert>
43#include <cstddef>
44#include <cstdint>
45#include <cstring>
46#include <string>
47#include <tuple>
48#include <utility>
49
50using namespace clang;
51
52//===----------------------------------------------------------------------===//
53// Token Class Implementation
54//===----------------------------------------------------------------------===//
55
56/// isObjCAtKeyword - Return true if we have an ObjC keyword identifier.
57bool Token::isObjCAtKeyword(tok::ObjCKeywordKind objcKey) const {
58 if (isAnnotation())
59 return false;
60 if (IdentifierInfo *II = getIdentifierInfo())
61 return II->getObjCKeywordID() == objcKey;
62 return false;
63}
64
65/// getObjCKeywordID - Return the ObjC keyword kind.
66tok::ObjCKeywordKind Token::getObjCKeywordID() const {
67 if (isAnnotation())
68 return tok::objc_not_keyword;
69 IdentifierInfo *specId = getIdentifierInfo();
70 return specId ? specId->getObjCKeywordID() : tok::objc_not_keyword;
71}
72
73//===----------------------------------------------------------------------===//
74// Lexer Class Implementation
75//===----------------------------------------------------------------------===//
76
77void Lexer::anchor() {}
78
79void Lexer::InitLexer(const char *BufStart, const char *BufPtr,
80 const char *BufEnd) {
81 BufferStart = BufStart;
82 BufferPtr = BufPtr;
83 BufferEnd = BufEnd;
84
85 assert(BufEnd[0] == 0 &&((BufEnd[0] == 0 && "We assume that the input buffer has a null character at the end"
" to simplify lexing!") ? static_cast<void> (0) : __assert_fail
("BufEnd[0] == 0 && \"We assume that the input buffer has a null character at the end\" \" to simplify lexing!\""
, "/build/llvm-toolchain-snapshot-9~svn361465/tools/clang/lib/Lex/Lexer.cpp"
, 87, __PRETTY_FUNCTION__))
86 "We assume that the input buffer has a null character at the end"((BufEnd[0] == 0 && "We assume that the input buffer has a null character at the end"
" to simplify lexing!") ? static_cast<void> (0) : __assert_fail
("BufEnd[0] == 0 && \"We assume that the input buffer has a null character at the end\" \" to simplify lexing!\""
, "/build/llvm-toolchain-snapshot-9~svn361465/tools/clang/lib/Lex/Lexer.cpp"
, 87, __PRETTY_FUNCTION__))
87 " to simplify lexing!")((BufEnd[0] == 0 && "We assume that the input buffer has a null character at the end"
" to simplify lexing!") ? static_cast<void> (0) : __assert_fail
("BufEnd[0] == 0 && \"We assume that the input buffer has a null character at the end\" \" to simplify lexing!\""
, "/build/llvm-toolchain-snapshot-9~svn361465/tools/clang/lib/Lex/Lexer.cpp"
, 87, __PRETTY_FUNCTION__))
;
88
89 // Check whether we have a BOM in the beginning of the buffer. If yes - act
90 // accordingly. Right now we support only UTF-8 with and without BOM, so, just
91 // skip the UTF-8 BOM if it's present.
92 if (BufferStart == BufferPtr) {
93 // Determine the size of the BOM.
94 StringRef Buf(BufferStart, BufferEnd - BufferStart);
95 size_t BOMLength = llvm::StringSwitch<size_t>(Buf)
96 .StartsWith("\xEF\xBB\xBF", 3) // UTF-8 BOM
97 .Default(0);
98
99 // Skip the BOM.
100 BufferPtr += BOMLength;
101 }
102
103 Is_PragmaLexer = false;
104 CurrentConflictMarkerState = CMK_None;
105
106 // Start of the file is a start of line.
107 IsAtStartOfLine = true;
108 IsAtPhysicalStartOfLine = true;
109
110 HasLeadingSpace = false;
111 HasLeadingEmptyMacro = false;
112
113 // We are not after parsing a #.
114 ParsingPreprocessorDirective = false;
115
116 // We are not after parsing #include.
117 ParsingFilename = false;
118
119 // We are not in raw mode. Raw mode disables diagnostics and interpretation
120 // of tokens (e.g. identifiers, thus disabling macro expansion). It is used
121 // to quickly lex the tokens of the buffer, e.g. when handling a "#if 0" block
122 // or otherwise skipping over tokens.
123 LexingRawMode = false;
124
125 // Default to not keeping comments.
126 ExtendedTokenMode = 0;
127}
128
129/// Lexer constructor - Create a new lexer object for the specified buffer
130/// with the specified preprocessor managing the lexing process. This lexer
131/// assumes that the associated file buffer and Preprocessor objects will
132/// outlive it, so it doesn't take ownership of either of them.
133Lexer::Lexer(FileID FID, const llvm::MemoryBuffer *InputFile, Preprocessor &PP)
134 : PreprocessorLexer(&PP, FID),
135 FileLoc(PP.getSourceManager().getLocForStartOfFile(FID)),
136 LangOpts(PP.getLangOpts()) {
137 InitLexer(InputFile->getBufferStart(), InputFile->getBufferStart(),
138 InputFile->getBufferEnd());
139
140 resetExtendedTokenMode();
141}
142
143/// Lexer constructor - Create a new raw lexer object. This object is only
144/// suitable for calls to 'LexFromRawLexer'. This lexer assumes that the text
145/// range will outlive it, so it doesn't take ownership of it.
146Lexer::Lexer(SourceLocation fileloc, const LangOptions &langOpts,
147 const char *BufStart, const char *BufPtr, const char *BufEnd)
148 : FileLoc(fileloc), LangOpts(langOpts) {
149 InitLexer(BufStart, BufPtr, BufEnd);
150
151 // We *are* in raw mode.
152 LexingRawMode = true;
153}
154
155/// Lexer constructor - Create a new raw lexer object. This object is only
156/// suitable for calls to 'LexFromRawLexer'. This lexer assumes that the text
157/// range will outlive it, so it doesn't take ownership of it.
158Lexer::Lexer(FileID FID, const llvm::MemoryBuffer *FromFile,
159 const SourceManager &SM, const LangOptions &langOpts)
160 : Lexer(SM.getLocForStartOfFile(FID), langOpts, FromFile->getBufferStart(),
161 FromFile->getBufferStart(), FromFile->getBufferEnd()) {}
162
163void Lexer::resetExtendedTokenMode() {
164 assert(PP && "Cannot reset token mode without a preprocessor")((PP && "Cannot reset token mode without a preprocessor"
) ? static_cast<void> (0) : __assert_fail ("PP && \"Cannot reset token mode without a preprocessor\""
, "/build/llvm-toolchain-snapshot-9~svn361465/tools/clang/lib/Lex/Lexer.cpp"
, 164, __PRETTY_FUNCTION__))
;
165 if (LangOpts.TraditionalCPP)
166 SetKeepWhitespaceMode(true);
167 else
168 SetCommentRetentionState(PP->getCommentRetentionState());
169}
170
171/// Create_PragmaLexer: Lexer constructor - Create a new lexer object for
172/// _Pragma expansion. This has a variety of magic semantics that this method
173/// sets up. It returns a new'd Lexer that must be delete'd when done.
174///
175/// On entrance to this routine, TokStartLoc is a macro location which has a
176/// spelling loc that indicates the bytes to be lexed for the token and an
177/// expansion location that indicates where all lexed tokens should be
178/// "expanded from".
179///
180/// TODO: It would really be nice to make _Pragma just be a wrapper around a
181/// normal lexer that remaps tokens as they fly by. This would require making
182/// Preprocessor::Lex virtual. Given that, we could just dump in a magic lexer
183/// interface that could handle this stuff. This would pull GetMappedTokenLoc
184/// out of the critical path of the lexer!
185///
186Lexer *Lexer::Create_PragmaLexer(SourceLocation SpellingLoc,
187 SourceLocation ExpansionLocStart,
188 SourceLocation ExpansionLocEnd,
189 unsigned TokLen, Preprocessor &PP) {
190 SourceManager &SM = PP.getSourceManager();
191
192 // Create the lexer as if we were going to lex the file normally.
193 FileID SpellingFID = SM.getFileID(SpellingLoc);
194 const llvm::MemoryBuffer *InputFile = SM.getBuffer(SpellingFID);
195 Lexer *L = new Lexer(SpellingFID, InputFile, PP);
196
197 // Now that the lexer is created, change the start/end locations so that we
198 // just lex the subsection of the file that we want. This is lexing from a
199 // scratch buffer.
200 const char *StrData = SM.getCharacterData(SpellingLoc);
201
202 L->BufferPtr = StrData;
203 L->BufferEnd = StrData+TokLen;
204 assert(L->BufferEnd[0] == 0 && "Buffer is not nul terminated!")((L->BufferEnd[0] == 0 && "Buffer is not nul terminated!"
) ? static_cast<void> (0) : __assert_fail ("L->BufferEnd[0] == 0 && \"Buffer is not nul terminated!\""
, "/build/llvm-toolchain-snapshot-9~svn361465/tools/clang/lib/Lex/Lexer.cpp"
, 204, __PRETTY_FUNCTION__))
;
205
206 // Set the SourceLocation with the remapping information. This ensures that
207 // GetMappedTokenLoc will remap the tokens as they are lexed.
208 L->FileLoc = SM.createExpansionLoc(SM.getLocForStartOfFile(SpellingFID),
209 ExpansionLocStart,
210 ExpansionLocEnd, TokLen);
211
212 // Ensure that the lexer thinks it is inside a directive, so that end \n will
213 // return an EOD token.
214 L->ParsingPreprocessorDirective = true;
215
216 // This lexer really is for _Pragma.
217 L->Is_PragmaLexer = true;
218 return L;
219}
220
/// Escape \p Str in place so it can be re-lexed inside a quoted literal:
/// backslashes and occurrences of \p Quote gain a preceding '\\', and each
/// newline / carriage return (including the two-character "\r\n" and "\n\r"
/// pairs) is rewritten as the two characters '\\' and 'n'.
///
/// \tparam T a string-like container (std::string or SmallVectorImpl<char>)
///           supporting size(), operator[], begin(), and insert().
template <typename T> static void StringifyImpl(T &Str, char Quote) {
  typename T::size_type i = 0, e = Str.size();
  while (i < e) {
    if (Str[i] == '\\' || Str[i] == Quote) {
      // Escape the character by inserting a backslash before it.
      Str.insert(Str.begin() + i, '\\');
      i += 2;
      ++e;
    } else if (Str[i] == '\n' || Str[i] == '\r') {
      // Replace '\r\n' and '\n\r' to '\\' followed by 'n'.
      if ((i < e - 1) && (Str[i + 1] == '\n' || Str[i + 1] == '\r') &&
          Str[i] != Str[i + 1]) {
        Str[i] = '\\';
        Str[i + 1] = 'n';
      } else {
        // Replace '\n' and '\r' to '\\' followed by 'n'.
        Str[i] = '\\';
        Str.insert(Str.begin() + i + 1, 'n');
        ++e;
      }
      i += 2;
    } else
      ++i;
  }
}
245
246std::string Lexer::Stringify(StringRef Str, bool Charify) {
247 std::string Result = Str;
248 char Quote = Charify ? '\'' : '"';
249 StringifyImpl(Result, Quote);
250 return Result;
251}
252
253void Lexer::Stringify(SmallVectorImpl<char> &Str) { StringifyImpl(Str, '"'); }
254
255//===----------------------------------------------------------------------===//
256// Token Spelling
257//===----------------------------------------------------------------------===//
258
259/// Slow case of getSpelling. Extract the characters comprising the
260/// spelling of this token from the provided input buffer.
261static size_t getSpellingSlow(const Token &Tok, const char *BufPtr,
262 const LangOptions &LangOpts, char *Spelling) {
263 assert(Tok.needsCleaning() && "getSpellingSlow called on simple token")((Tok.needsCleaning() && "getSpellingSlow called on simple token"
) ? static_cast<void> (0) : __assert_fail ("Tok.needsCleaning() && \"getSpellingSlow called on simple token\""
, "/build/llvm-toolchain-snapshot-9~svn361465/tools/clang/lib/Lex/Lexer.cpp"
, 263, __PRETTY_FUNCTION__))
;
264
265 size_t Length = 0;
266 const char *BufEnd = BufPtr + Tok.getLength();
267
268 if (tok::isStringLiteral(Tok.getKind())) {
269 // Munch the encoding-prefix and opening double-quote.
270 while (BufPtr < BufEnd) {
271 unsigned Size;
272 Spelling[Length++] = Lexer::getCharAndSizeNoWarn(BufPtr, Size, LangOpts);
273 BufPtr += Size;
274
275 if (Spelling[Length - 1] == '"')
276 break;
277 }
278
279 // Raw string literals need special handling; trigraph expansion and line
280 // splicing do not occur within their d-char-sequence nor within their
281 // r-char-sequence.
282 if (Length >= 2 &&
283 Spelling[Length - 2] == 'R' && Spelling[Length - 1] == '"') {
284 // Search backwards from the end of the token to find the matching closing
285 // quote.
286 const char *RawEnd = BufEnd;
287 do --RawEnd; while (*RawEnd != '"');
288 size_t RawLength = RawEnd - BufPtr + 1;
289
290 // Everything between the quotes is included verbatim in the spelling.
291 memcpy(Spelling + Length, BufPtr, RawLength);
292 Length += RawLength;
293 BufPtr += RawLength;
294
295 // The rest of the token is lexed normally.
296 }
297 }
298
299 while (BufPtr < BufEnd) {
300 unsigned Size;
301 Spelling[Length++] = Lexer::getCharAndSizeNoWarn(BufPtr, Size, LangOpts);
302 BufPtr += Size;
303 }
304
305 assert(Length < Tok.getLength() &&((Length < Tok.getLength() && "NeedsCleaning flag set on token that didn't need cleaning!"
) ? static_cast<void> (0) : __assert_fail ("Length < Tok.getLength() && \"NeedsCleaning flag set on token that didn't need cleaning!\""
, "/build/llvm-toolchain-snapshot-9~svn361465/tools/clang/lib/Lex/Lexer.cpp"
, 306, __PRETTY_FUNCTION__))
306 "NeedsCleaning flag set on token that didn't need cleaning!")((Length < Tok.getLength() && "NeedsCleaning flag set on token that didn't need cleaning!"
) ? static_cast<void> (0) : __assert_fail ("Length < Tok.getLength() && \"NeedsCleaning flag set on token that didn't need cleaning!\""
, "/build/llvm-toolchain-snapshot-9~svn361465/tools/clang/lib/Lex/Lexer.cpp"
, 306, __PRETTY_FUNCTION__))
;
307 return Length;
308}
309
310/// getSpelling() - Return the 'spelling' of this token. The spelling of a
311/// token are the characters used to represent the token in the source file
312/// after trigraph expansion and escaped-newline folding. In particular, this
313/// wants to get the true, uncanonicalized, spelling of things like digraphs
314/// UCNs, etc.
315StringRef Lexer::getSpelling(SourceLocation loc,
316 SmallVectorImpl<char> &buffer,
317 const SourceManager &SM,
318 const LangOptions &options,
319 bool *invalid) {
320 // Break down the source location.
321 std::pair<FileID, unsigned> locInfo = SM.getDecomposedLoc(loc);
322
323 // Try to the load the file buffer.
324 bool invalidTemp = false;
325 StringRef file = SM.getBufferData(locInfo.first, &invalidTemp);
326 if (invalidTemp) {
327 if (invalid) *invalid = true;
328 return {};
329 }
330
331 const char *tokenBegin = file.data() + locInfo.second;
332
333 // Lex from the start of the given location.
334 Lexer lexer(SM.getLocForStartOfFile(locInfo.first), options,
335 file.begin(), tokenBegin, file.end());
336 Token token;
337 lexer.LexFromRawLexer(token);
338
339 unsigned length = token.getLength();
340
341 // Common case: no need for cleaning.
342 if (!token.needsCleaning())
343 return StringRef(tokenBegin, length);
344
345 // Hard case, we need to relex the characters into the string.
346 buffer.resize(length);
347 buffer.resize(getSpellingSlow(token, tokenBegin, options, buffer.data()));
348 return StringRef(buffer.data(), buffer.size());
349}
350
351/// getSpelling() - Return the 'spelling' of this token. The spelling of a
352/// token are the characters used to represent the token in the source file
353/// after trigraph expansion and escaped-newline folding. In particular, this
354/// wants to get the true, uncanonicalized, spelling of things like digraphs
355/// UCNs, etc.
356std::string Lexer::getSpelling(const Token &Tok, const SourceManager &SourceMgr,
357 const LangOptions &LangOpts, bool *Invalid) {
358 assert((int)Tok.getLength() >= 0 && "Token character range is bogus!")(((int)Tok.getLength() >= 0 && "Token character range is bogus!"
) ? static_cast<void> (0) : __assert_fail ("(int)Tok.getLength() >= 0 && \"Token character range is bogus!\""
, "/build/llvm-toolchain-snapshot-9~svn361465/tools/clang/lib/Lex/Lexer.cpp"
, 358, __PRETTY_FUNCTION__))
;
359
360 bool CharDataInvalid = false;
361 const char *TokStart = SourceMgr.getCharacterData(Tok.getLocation(),
362 &CharDataInvalid);
363 if (Invalid)
364 *Invalid = CharDataInvalid;
365 if (CharDataInvalid)
366 return {};
367
368 // If this token contains nothing interesting, return it directly.
369 if (!Tok.needsCleaning())
370 return std::string(TokStart, TokStart + Tok.getLength());
371
372 std::string Result;
373 Result.resize(Tok.getLength());
374 Result.resize(getSpellingSlow(Tok, TokStart, LangOpts, &*Result.begin()));
375 return Result;
376}
377
378/// getSpelling - This method is used to get the spelling of a token into a
379/// preallocated buffer, instead of as an std::string. The caller is required
380/// to allocate enough space for the token, which is guaranteed to be at least
381/// Tok.getLength() bytes long. The actual length of the token is returned.
382///
383/// Note that this method may do two possible things: it may either fill in
384/// the buffer specified with characters, or it may *change the input pointer*
385/// to point to a constant buffer with the data already in it (avoiding a
386/// copy). The caller is not allowed to modify the returned buffer pointer
387/// if an internal buffer is returned.
388unsigned Lexer::getSpelling(const Token &Tok, const char *&Buffer,
389 const SourceManager &SourceMgr,
390 const LangOptions &LangOpts, bool *Invalid) {
391 assert((int)Tok.getLength() >= 0 && "Token character range is bogus!")(((int)Tok.getLength() >= 0 && "Token character range is bogus!"
) ? static_cast<void> (0) : __assert_fail ("(int)Tok.getLength() >= 0 && \"Token character range is bogus!\""
, "/build/llvm-toolchain-snapshot-9~svn361465/tools/clang/lib/Lex/Lexer.cpp"
, 391, __PRETTY_FUNCTION__))
;
392
393 const char *TokStart = nullptr;
394 // NOTE: this has to be checked *before* testing for an IdentifierInfo.
395 if (Tok.is(tok::raw_identifier))
396 TokStart = Tok.getRawIdentifier().data();
397 else if (!Tok.hasUCN()) {
398 if (const IdentifierInfo *II = Tok.getIdentifierInfo()) {
399 // Just return the string from the identifier table, which is very quick.
400 Buffer = II->getNameStart();
401 return II->getLength();
402 }
403 }
404
405 // NOTE: this can be checked even after testing for an IdentifierInfo.
406 if (Tok.isLiteral())
407 TokStart = Tok.getLiteralData();
408
409 if (!TokStart) {
410 // Compute the start of the token in the input lexer buffer.
411 bool CharDataInvalid = false;
412 TokStart = SourceMgr.getCharacterData(Tok.getLocation(), &CharDataInvalid);
413 if (Invalid)
414 *Invalid = CharDataInvalid;
415 if (CharDataInvalid) {
416 Buffer = "";
417 return 0;
418 }
419 }
420
421 // If this token contains nothing interesting, return it directly.
422 if (!Tok.needsCleaning()) {
423 Buffer = TokStart;
424 return Tok.getLength();
425 }
426
427 // Otherwise, hard case, relex the characters into the string.
428 return getSpellingSlow(Tok, TokStart, LangOpts, const_cast<char*>(Buffer));
429}
430
431/// MeasureTokenLength - Relex the token at the specified location and return
432/// its length in bytes in the input file. If the token needs cleaning (e.g.
433/// includes a trigraph or an escaped newline) then this count includes bytes
434/// that are part of that.
435unsigned Lexer::MeasureTokenLength(SourceLocation Loc,
436 const SourceManager &SM,
437 const LangOptions &LangOpts) {
438 Token TheTok;
439 if (getRawToken(Loc, TheTok, SM, LangOpts))
440 return 0;
441 return TheTok.getLength();
442}
443
444/// Relex the token at the specified location.
445/// \returns true if there was a failure, false on success.
446bool Lexer::getRawToken(SourceLocation Loc, Token &Result,
447 const SourceManager &SM,
448 const LangOptions &LangOpts,
449 bool IgnoreWhiteSpace) {
450 // TODO: this could be special cased for common tokens like identifiers, ')',
451 // etc to make this faster, if it mattered. Just look at StrData[0] to handle
452 // all obviously single-char tokens. This could use
453 // Lexer::isObviouslySimpleCharacter for example to handle identifiers or
454 // something.
455
456 // If this comes from a macro expansion, we really do want the macro name, not
457 // the token this macro expanded to.
458 Loc = SM.getExpansionLoc(Loc);
459 std::pair<FileID, unsigned> LocInfo = SM.getDecomposedLoc(Loc);
460 bool Invalid = false;
461 StringRef Buffer = SM.getBufferData(LocInfo.first, &Invalid);
462 if (Invalid)
463 return true;
464
465 const char *StrData = Buffer.data()+LocInfo.second;
466
467 if (!IgnoreWhiteSpace && isWhitespace(StrData[0]))
468 return true;
469
470 // Create a lexer starting at the beginning of this token.
471 Lexer TheLexer(SM.getLocForStartOfFile(LocInfo.first), LangOpts,
472 Buffer.begin(), StrData, Buffer.end());
473 TheLexer.SetCommentRetentionState(true);
474 TheLexer.LexFromRawLexer(Result);
475 return false;
476}
477
478/// Returns the pointer that points to the beginning of line that contains
479/// the given offset, or null if the offset if invalid.
480static const char *findBeginningOfLine(StringRef Buffer, unsigned Offset) {
481 const char *BufStart = Buffer.data();
482 if (Offset >= Buffer.size())
483 return nullptr;
484
485 const char *LexStart = BufStart + Offset;
486 for (; LexStart != BufStart; --LexStart) {
487 if (isVerticalWhitespace(LexStart[0]) &&
488 !Lexer::isNewLineEscaped(BufStart, LexStart)) {
489 // LexStart should point at first character of logical line.
490 ++LexStart;
491 break;
492 }
493 }
494 return LexStart;
495}
496
497static SourceLocation getBeginningOfFileToken(SourceLocation Loc,
498 const SourceManager &SM,
499 const LangOptions &LangOpts) {
500 assert(Loc.isFileID())((Loc.isFileID()) ? static_cast<void> (0) : __assert_fail
("Loc.isFileID()", "/build/llvm-toolchain-snapshot-9~svn361465/tools/clang/lib/Lex/Lexer.cpp"
, 500, __PRETTY_FUNCTION__))
;
501 std::pair<FileID, unsigned> LocInfo = SM.getDecomposedLoc(Loc);
502 if (LocInfo.first.isInvalid())
503 return Loc;
504
505 bool Invalid = false;
506 StringRef Buffer = SM.getBufferData(LocInfo.first, &Invalid);
507 if (Invalid)
508 return Loc;
509
510 // Back up from the current location until we hit the beginning of a line
511 // (or the buffer). We'll relex from that point.
512 const char *StrData = Buffer.data() + LocInfo.second;
513 const char *LexStart = findBeginningOfLine(Buffer, LocInfo.second);
514 if (!LexStart || LexStart == StrData)
515 return Loc;
516
517 // Create a lexer starting at the beginning of this token.
518 SourceLocation LexerStartLoc = Loc.getLocWithOffset(-LocInfo.second);
519 Lexer TheLexer(LexerStartLoc, LangOpts, Buffer.data(), LexStart,
520 Buffer.end());
521 TheLexer.SetCommentRetentionState(true);
522
523 // Lex tokens until we find the token that contains the source location.
524 Token TheTok;
525 do {
526 TheLexer.LexFromRawLexer(TheTok);
527
528 if (TheLexer.getBufferLocation() > StrData) {
529 // Lexing this token has taken the lexer past the source location we're
530 // looking for. If the current token encompasses our source location,
531 // return the beginning of that token.
532 if (TheLexer.getBufferLocation() - TheTok.getLength() <= StrData)
533 return TheTok.getLocation();
534
535 // We ended up skipping over the source location entirely, which means
536 // that it points into whitespace. We're done here.
537 break;
538 }
539 } while (TheTok.getKind() != tok::eof);
540
541 // We've passed our source location; just return the original source location.
542 return Loc;
543}
544
545SourceLocation Lexer::GetBeginningOfToken(SourceLocation Loc,
546 const SourceManager &SM,
547 const LangOptions &LangOpts) {
548 if (Loc.isFileID())
549 return getBeginningOfFileToken(Loc, SM, LangOpts);
550
551 if (!SM.isMacroArgExpansion(Loc))
552 return Loc;
553
554 SourceLocation FileLoc = SM.getSpellingLoc(Loc);
555 SourceLocation BeginFileLoc = getBeginningOfFileToken(FileLoc, SM, LangOpts);
556 std::pair<FileID, unsigned> FileLocInfo = SM.getDecomposedLoc(FileLoc);
557 std::pair<FileID, unsigned> BeginFileLocInfo =
558 SM.getDecomposedLoc(BeginFileLoc);
559 assert(FileLocInfo.first == BeginFileLocInfo.first &&((FileLocInfo.first == BeginFileLocInfo.first && FileLocInfo
.second >= BeginFileLocInfo.second) ? static_cast<void>
(0) : __assert_fail ("FileLocInfo.first == BeginFileLocInfo.first && FileLocInfo.second >= BeginFileLocInfo.second"
, "/build/llvm-toolchain-snapshot-9~svn361465/tools/clang/lib/Lex/Lexer.cpp"
, 560, __PRETTY_FUNCTION__))
560 FileLocInfo.second >= BeginFileLocInfo.second)((FileLocInfo.first == BeginFileLocInfo.first && FileLocInfo
.second >= BeginFileLocInfo.second) ? static_cast<void>
(0) : __assert_fail ("FileLocInfo.first == BeginFileLocInfo.first && FileLocInfo.second >= BeginFileLocInfo.second"
, "/build/llvm-toolchain-snapshot-9~svn361465/tools/clang/lib/Lex/Lexer.cpp"
, 560, __PRETTY_FUNCTION__))
;
561 return Loc.getLocWithOffset(BeginFileLocInfo.second - FileLocInfo.second);
562}
563
namespace {

/// Classification of a preprocessor directive encountered while scanning for
/// the end of a file's preamble (see Lexer::ComputePreamble).
enum PreambleDirectiveKind {
  PDK_Skipped,  // Directive may appear in a preamble; keep scanning.
  PDK_Unknown   // Unrecognized directive; the preamble ends at its '#'.
};

} // namespace
572
573PreambleBounds Lexer::ComputePreamble(StringRef Buffer,
574 const LangOptions &LangOpts,
575 unsigned MaxLines) {
576 // Create a lexer starting at the beginning of the file. Note that we use a
577 // "fake" file source location at offset 1 so that the lexer will track our
578 // position within the file.
579 const unsigned StartOffset = 1;
580 SourceLocation FileLoc = SourceLocation::getFromRawEncoding(StartOffset);
581 Lexer TheLexer(FileLoc, LangOpts, Buffer.begin(), Buffer.begin(),
582 Buffer.end());
583 TheLexer.SetCommentRetentionState(true);
584
585 bool InPreprocessorDirective = false;
586 Token TheTok;
587 SourceLocation ActiveCommentLoc;
588
589 unsigned MaxLineOffset = 0;
590 if (MaxLines) {
591 const char *CurPtr = Buffer.begin();
592 unsigned CurLine = 0;
593 while (CurPtr != Buffer.end()) {
594 char ch = *CurPtr++;
595 if (ch == '\n') {
596 ++CurLine;
597 if (CurLine == MaxLines)
598 break;
599 }
600 }
601 if (CurPtr != Buffer.end())
602 MaxLineOffset = CurPtr - Buffer.begin();
603 }
604
605 do {
606 TheLexer.LexFromRawLexer(TheTok);
607
608 if (InPreprocessorDirective) {
609 // If we've hit the end of the file, we're done.
610 if (TheTok.getKind() == tok::eof) {
611 break;
612 }
613
614 // If we haven't hit the end of the preprocessor directive, skip this
615 // token.
616 if (!TheTok.isAtStartOfLine())
617 continue;
618
619 // We've passed the end of the preprocessor directive, and will look
620 // at this token again below.
621 InPreprocessorDirective = false;
622 }
623
624 // Keep track of the # of lines in the preamble.
625 if (TheTok.isAtStartOfLine()) {
626 unsigned TokOffset = TheTok.getLocation().getRawEncoding() - StartOffset;
627
628 // If we were asked to limit the number of lines in the preamble,
629 // and we're about to exceed that limit, we're done.
630 if (MaxLineOffset && TokOffset >= MaxLineOffset)
631 break;
632 }
633
634 // Comments are okay; skip over them.
635 if (TheTok.getKind() == tok::comment) {
636 if (ActiveCommentLoc.isInvalid())
637 ActiveCommentLoc = TheTok.getLocation();
638 continue;
639 }
640
641 if (TheTok.isAtStartOfLine() && TheTok.getKind() == tok::hash) {
642 // This is the start of a preprocessor directive.
643 Token HashTok = TheTok;
644 InPreprocessorDirective = true;
645 ActiveCommentLoc = SourceLocation();
646
647 // Figure out which directive this is. Since we're lexing raw tokens,
648 // we don't have an identifier table available. Instead, just look at
649 // the raw identifier to recognize and categorize preprocessor directives.
650 TheLexer.LexFromRawLexer(TheTok);
651 if (TheTok.getKind() == tok::raw_identifier && !TheTok.needsCleaning()) {
652 StringRef Keyword = TheTok.getRawIdentifier();
653 PreambleDirectiveKind PDK
654 = llvm::StringSwitch<PreambleDirectiveKind>(Keyword)
655 .Case("include", PDK_Skipped)
656 .Case("__include_macros", PDK_Skipped)
657 .Case("define", PDK_Skipped)
658 .Case("undef", PDK_Skipped)
659 .Case("line", PDK_Skipped)
660 .Case("error", PDK_Skipped)
661 .Case("pragma", PDK_Skipped)
662 .Case("import", PDK_Skipped)
663 .Case("include_next", PDK_Skipped)
664 .Case("warning", PDK_Skipped)
665 .Case("ident", PDK_Skipped)
666 .Case("sccs", PDK_Skipped)
667 .Case("assert", PDK_Skipped)
668 .Case("unassert", PDK_Skipped)
669 .Case("if", PDK_Skipped)
670 .Case("ifdef", PDK_Skipped)
671 .Case("ifndef", PDK_Skipped)
672 .Case("elif", PDK_Skipped)
673 .Case("else", PDK_Skipped)
674 .Case("endif", PDK_Skipped)
675 .Default(PDK_Unknown);
676
677 switch (PDK) {
678 case PDK_Skipped:
679 continue;
680
681 case PDK_Unknown:
682 // We don't know what this directive is; stop at the '#'.
683 break;
684 }
685 }
686
687 // We only end up here if we didn't recognize the preprocessor
688 // directive or it was one that can't occur in the preamble at this
689 // point. Roll back the current token to the location of the '#'.
690 InPreprocessorDirective = false;
691 TheTok = HashTok;
692 }
693
694 // We hit a token that we don't recognize as being in the
695 // "preprocessing only" part of the file, so we're no longer in
696 // the preamble.
697 break;
698 } while (true);
699
700 SourceLocation End;
701 if (ActiveCommentLoc.isValid())
702 End = ActiveCommentLoc; // don't truncate a decl comment.
703 else
704 End = TheTok.getLocation();
705
706 return PreambleBounds(End.getRawEncoding() - FileLoc.getRawEncoding(),
707 TheTok.isAtStartOfLine());
708}
709
710unsigned Lexer::getTokenPrefixLength(SourceLocation TokStart, unsigned CharNo,
711 const SourceManager &SM,
712 const LangOptions &LangOpts) {
713 // Figure out how many physical characters away the specified expansion
714 // character is. This needs to take into consideration newlines and
715 // trigraphs.
716 bool Invalid = false;
717 const char *TokPtr = SM.getCharacterData(TokStart, &Invalid);
718
719 // If they request the first char of the token, we're trivially done.
720 if (Invalid || (CharNo == 0 && Lexer::isObviouslySimpleCharacter(*TokPtr)))
721 return 0;
722
723 unsigned PhysOffset = 0;
724
725 // The usual case is that tokens don't contain anything interesting. Skip
726 // over the uninteresting characters. If a token only consists of simple
727 // chars, this method is extremely fast.
728 while (Lexer::isObviouslySimpleCharacter(*TokPtr)) {
729 if (CharNo == 0)
730 return PhysOffset;
731 ++TokPtr;
732 --CharNo;
733 ++PhysOffset;
734 }
735
736 // If we have a character that may be a trigraph or escaped newline, use a
737 // lexer to parse it correctly.
738 for (; CharNo; --CharNo) {
739 unsigned Size;
740 Lexer::getCharAndSizeNoWarn(TokPtr, Size, LangOpts);
741 TokPtr += Size;
742 PhysOffset += Size;
743 }
744
745 // Final detail: if we end up on an escaped newline, we want to return the
746 // location of the actual byte of the token. For example foo\<newline>bar
747 // advanced by 3 should return the location of b, not of \\. One compounding
748 // detail of this is that the escape may be made by a trigraph.
749 if (!Lexer::isObviouslySimpleCharacter(*TokPtr))
750 PhysOffset += Lexer::SkipEscapedNewLines(TokPtr)-TokPtr;
751
752 return PhysOffset;
753}
754
755/// Computes the source location just past the end of the
756/// token at this source location.
757///
758/// This routine can be used to produce a source location that
759/// points just past the end of the token referenced by \p Loc, and
760/// is generally used when a diagnostic needs to point just after a
761/// token where it expected something different that it received. If
762/// the returned source location would not be meaningful (e.g., if
763/// it points into a macro), this routine returns an invalid
764/// source location.
765///
766/// \param Offset an offset from the end of the token, where the source
767/// location should refer to. The default offset (0) produces a source
768/// location pointing just past the end of the token; an offset of 1 produces
769/// a source location pointing to the last character in the token, etc.
770SourceLocation Lexer::getLocForEndOfToken(SourceLocation Loc, unsigned Offset,
771 const SourceManager &SM,
772 const LangOptions &LangOpts) {
773 if (Loc.isInvalid())
774 return {};
775
776 if (Loc.isMacroID()) {
777 if (Offset > 0 || !isAtEndOfMacroExpansion(Loc, SM, LangOpts, &Loc))
778 return {}; // Points inside the macro expansion.
779 }
780
781 unsigned Len = Lexer::MeasureTokenLength(Loc, SM, LangOpts);
782 if (Len > Offset)
783 Len = Len - Offset;
784 else
785 return Loc;
786
787 return Loc.getLocWithOffset(Len);
788}
789
790/// Returns true if the given MacroID location points at the first
791/// token of the macro expansion.
792bool Lexer::isAtStartOfMacroExpansion(SourceLocation loc,
793 const SourceManager &SM,
794 const LangOptions &LangOpts,
795 SourceLocation *MacroBegin) {
796 assert(loc.isValid() && loc.isMacroID() && "Expected a valid macro loc")((loc.isValid() && loc.isMacroID() && "Expected a valid macro loc"
) ? static_cast<void> (0) : __assert_fail ("loc.isValid() && loc.isMacroID() && \"Expected a valid macro loc\""
, "/build/llvm-toolchain-snapshot-9~svn361465/tools/clang/lib/Lex/Lexer.cpp"
, 796, __PRETTY_FUNCTION__))
;
797
798 SourceLocation expansionLoc;
799 if (!SM.isAtStartOfImmediateMacroExpansion(loc, &expansionLoc))
800 return false;
801
802 if (expansionLoc.isFileID()) {
803 // No other macro expansions, this is the first.
804 if (MacroBegin)
805 *MacroBegin = expansionLoc;
806 return true;
807 }
808
809 return isAtStartOfMacroExpansion(expansionLoc, SM, LangOpts, MacroBegin);
810}
811
812/// Returns true if the given MacroID location points at the last
813/// token of the macro expansion.
814bool Lexer::isAtEndOfMacroExpansion(SourceLocation loc,
815 const SourceManager &SM,
816 const LangOptions &LangOpts,
817 SourceLocation *MacroEnd) {
818 assert(loc.isValid() && loc.isMacroID() && "Expected a valid macro loc")((loc.isValid() && loc.isMacroID() && "Expected a valid macro loc"
) ? static_cast<void> (0) : __assert_fail ("loc.isValid() && loc.isMacroID() && \"Expected a valid macro loc\""
, "/build/llvm-toolchain-snapshot-9~svn361465/tools/clang/lib/Lex/Lexer.cpp"
, 818, __PRETTY_FUNCTION__))
;
819
820 SourceLocation spellLoc = SM.getSpellingLoc(loc);
821 unsigned tokLen = MeasureTokenLength(spellLoc, SM, LangOpts);
822 if (tokLen == 0)
823 return false;
824
825 SourceLocation afterLoc = loc.getLocWithOffset(tokLen);
826 SourceLocation expansionLoc;
827 if (!SM.isAtEndOfImmediateMacroExpansion(afterLoc, &expansionLoc))
828 return false;
829
830 if (expansionLoc.isFileID()) {
831 // No other macro expansions.
832 if (MacroEnd)
833 *MacroEnd = expansionLoc;
834 return true;
835 }
836
837 return isAtEndOfMacroExpansion(expansionLoc, SM, LangOpts, MacroEnd);
838}
839
840static CharSourceRange makeRangeFromFileLocs(CharSourceRange Range,
841 const SourceManager &SM,
842 const LangOptions &LangOpts) {
843 SourceLocation Begin = Range.getBegin();
844 SourceLocation End = Range.getEnd();
845 assert(Begin.isFileID() && End.isFileID())((Begin.isFileID() && End.isFileID()) ? static_cast<
void> (0) : __assert_fail ("Begin.isFileID() && End.isFileID()"
, "/build/llvm-toolchain-snapshot-9~svn361465/tools/clang/lib/Lex/Lexer.cpp"
, 845, __PRETTY_FUNCTION__))
;
846 if (Range.isTokenRange()) {
847 End = Lexer::getLocForEndOfToken(End, 0, SM,LangOpts);
848 if (End.isInvalid())
849 return {};
850 }
851
852 // Break down the source locations.
853 FileID FID;
854 unsigned BeginOffs;
855 std::tie(FID, BeginOffs) = SM.getDecomposedLoc(Begin);
856 if (FID.isInvalid())
857 return {};
858
859 unsigned EndOffs;
860 if (!SM.isInFileID(End, FID, &EndOffs) ||
861 BeginOffs > EndOffs)
862 return {};
863
864 return CharSourceRange::getCharRange(Begin, End);
865}
866
867CharSourceRange Lexer::makeFileCharRange(CharSourceRange Range,
868 const SourceManager &SM,
869 const LangOptions &LangOpts) {
870 SourceLocation Begin = Range.getBegin();
871 SourceLocation End = Range.getEnd();
872 if (Begin.isInvalid() || End.isInvalid())
873 return {};
874
875 if (Begin.isFileID() && End.isFileID())
876 return makeRangeFromFileLocs(Range, SM, LangOpts);
877
878 if (Begin.isMacroID() && End.isFileID()) {
879 if (!isAtStartOfMacroExpansion(Begin, SM, LangOpts, &Begin))
880 return {};
881 Range.setBegin(Begin);
882 return makeRangeFromFileLocs(Range, SM, LangOpts);
883 }
884
885 if (Begin.isFileID() && End.isMacroID()) {
886 if ((Range.isTokenRange() && !isAtEndOfMacroExpansion(End, SM, LangOpts,
887 &End)) ||
888 (Range.isCharRange() && !isAtStartOfMacroExpansion(End, SM, LangOpts,
889 &End)))
890 return {};
891 Range.setEnd(End);
892 return makeRangeFromFileLocs(Range, SM, LangOpts);
893 }
894
895 assert(Begin.isMacroID() && End.isMacroID())((Begin.isMacroID() && End.isMacroID()) ? static_cast
<void> (0) : __assert_fail ("Begin.isMacroID() && End.isMacroID()"
, "/build/llvm-toolchain-snapshot-9~svn361465/tools/clang/lib/Lex/Lexer.cpp"
, 895, __PRETTY_FUNCTION__))
;
896 SourceLocation MacroBegin, MacroEnd;
897 if (isAtStartOfMacroExpansion(Begin, SM, LangOpts, &MacroBegin) &&
898 ((Range.isTokenRange() && isAtEndOfMacroExpansion(End, SM, LangOpts,
899 &MacroEnd)) ||
900 (Range.isCharRange() && isAtStartOfMacroExpansion(End, SM, LangOpts,
901 &MacroEnd)))) {
902 Range.setBegin(MacroBegin);
903 Range.setEnd(MacroEnd);
904 return makeRangeFromFileLocs(Range, SM, LangOpts);
905 }
906
907 bool Invalid = false;
908 const SrcMgr::SLocEntry &BeginEntry = SM.getSLocEntry(SM.getFileID(Begin),
909 &Invalid);
910 if (Invalid)
911 return {};
912
913 if (BeginEntry.getExpansion().isMacroArgExpansion()) {
914 const SrcMgr::SLocEntry &EndEntry = SM.getSLocEntry(SM.getFileID(End),
915 &Invalid);
916 if (Invalid)
917 return {};
918
919 if (EndEntry.getExpansion().isMacroArgExpansion() &&
920 BeginEntry.getExpansion().getExpansionLocStart() ==
921 EndEntry.getExpansion().getExpansionLocStart()) {
922 Range.setBegin(SM.getImmediateSpellingLoc(Begin));
923 Range.setEnd(SM.getImmediateSpellingLoc(End));
924 return makeFileCharRange(Range, SM, LangOpts);
925 }
926 }
927
928 return {};
929}
930
931StringRef Lexer::getSourceText(CharSourceRange Range,
932 const SourceManager &SM,
933 const LangOptions &LangOpts,
934 bool *Invalid) {
935 Range = makeFileCharRange(Range, SM, LangOpts);
936 if (Range.isInvalid()) {
937 if (Invalid) *Invalid = true;
938 return {};
939 }
940
941 // Break down the source location.
942 std::pair<FileID, unsigned> beginInfo = SM.getDecomposedLoc(Range.getBegin());
943 if (beginInfo.first.isInvalid()) {
944 if (Invalid) *Invalid = true;
945 return {};
946 }
947
948 unsigned EndOffs;
949 if (!SM.isInFileID(Range.getEnd(), beginInfo.first, &EndOffs) ||
950 beginInfo.second > EndOffs) {
951 if (Invalid) *Invalid = true;
952 return {};
953 }
954
955 // Try to the load the file buffer.
956 bool invalidTemp = false;
957 StringRef file = SM.getBufferData(beginInfo.first, &invalidTemp);
958 if (invalidTemp) {
959 if (Invalid) *Invalid = true;
960 return {};
961 }
962
963 if (Invalid) *Invalid = false;
964 return file.substr(beginInfo.second, EndOffs - beginInfo.second);
965}
966
967StringRef Lexer::getImmediateMacroName(SourceLocation Loc,
968 const SourceManager &SM,
969 const LangOptions &LangOpts) {
970 assert(Loc.isMacroID() && "Only reasonable to call this on macros")((Loc.isMacroID() && "Only reasonable to call this on macros"
) ? static_cast<void> (0) : __assert_fail ("Loc.isMacroID() && \"Only reasonable to call this on macros\""
, "/build/llvm-toolchain-snapshot-9~svn361465/tools/clang/lib/Lex/Lexer.cpp"
, 970, __PRETTY_FUNCTION__))
;
971
972 // Find the location of the immediate macro expansion.
973 while (true) {
974 FileID FID = SM.getFileID(Loc);
975 const SrcMgr::SLocEntry *E = &SM.getSLocEntry(FID);
976 const SrcMgr::ExpansionInfo &Expansion = E->getExpansion();
977 Loc = Expansion.getExpansionLocStart();
978 if (!Expansion.isMacroArgExpansion())
979 break;
980
981 // For macro arguments we need to check that the argument did not come
982 // from an inner macro, e.g: "MAC1( MAC2(foo) )"
983
984 // Loc points to the argument id of the macro definition, move to the
985 // macro expansion.
986 Loc = SM.getImmediateExpansionRange(Loc).getBegin();
987 SourceLocation SpellLoc = Expansion.getSpellingLoc();
988 if (SpellLoc.isFileID())
989 break; // No inner macro.
990
991 // If spelling location resides in the same FileID as macro expansion
992 // location, it means there is no inner macro.
993 FileID MacroFID = SM.getFileID(Loc);
994 if (SM.isInFileID(SpellLoc, MacroFID))
995 break;
996
997 // Argument came from inner macro.
998 Loc = SpellLoc;
999 }
1000
1001 // Find the spelling location of the start of the non-argument expansion
1002 // range. This is where the macro name was spelled in order to begin
1003 // expanding this macro.
1004 Loc = SM.getSpellingLoc(Loc);
1005
1006 // Dig out the buffer where the macro name was spelled and the extents of the
1007 // name so that we can render it into the expansion note.
1008 std::pair<FileID, unsigned> ExpansionInfo = SM.getDecomposedLoc(Loc);
1009 unsigned MacroTokenLength = Lexer::MeasureTokenLength(Loc, SM, LangOpts);
1010 StringRef ExpansionBuffer = SM.getBufferData(ExpansionInfo.first);
1011 return ExpansionBuffer.substr(ExpansionInfo.second, MacroTokenLength);
1012}
1013
1014StringRef Lexer::getImmediateMacroNameForDiagnostics(
1015 SourceLocation Loc, const SourceManager &SM, const LangOptions &LangOpts) {
1016 assert(Loc.isMacroID() && "Only reasonable to call this on macros")((Loc.isMacroID() && "Only reasonable to call this on macros"
) ? static_cast<void> (0) : __assert_fail ("Loc.isMacroID() && \"Only reasonable to call this on macros\""
, "/build/llvm-toolchain-snapshot-9~svn361465/tools/clang/lib/Lex/Lexer.cpp"
, 1016, __PRETTY_FUNCTION__))
;
1017 // Walk past macro argument expansions.
1018 while (SM.isMacroArgExpansion(Loc))
1019 Loc = SM.getImmediateExpansionRange(Loc).getBegin();
1020
1021 // If the macro's spelling has no FileID, then it's actually a token paste
1022 // or stringization (or similar) and not a macro at all.
1023 if (!SM.getFileEntryForID(SM.getFileID(SM.getSpellingLoc(Loc))))
1024 return {};
1025
1026 // Find the spelling location of the start of the non-argument expansion
1027 // range. This is where the macro name was spelled in order to begin
1028 // expanding this macro.
1029 Loc = SM.getSpellingLoc(SM.getImmediateExpansionRange(Loc).getBegin());
1030
1031 // Dig out the buffer where the macro name was spelled and the extents of the
1032 // name so that we can render it into the expansion note.
1033 std::pair<FileID, unsigned> ExpansionInfo = SM.getDecomposedLoc(Loc);
1034 unsigned MacroTokenLength = Lexer::MeasureTokenLength(Loc, SM, LangOpts);
1035 StringRef ExpansionBuffer = SM.getBufferData(ExpansionInfo.first);
1036 return ExpansionBuffer.substr(ExpansionInfo.second, MacroTokenLength);
1037}
1038
1039bool Lexer::isIdentifierBodyChar(char c, const LangOptions &LangOpts) {
1040 return isIdentifierBody(c, LangOpts.DollarIdents);
1041}
1042
1043bool Lexer::isNewLineEscaped(const char *BufferStart, const char *Str) {
1044 assert(isVerticalWhitespace(Str[0]))((isVerticalWhitespace(Str[0])) ? static_cast<void> (0)
: __assert_fail ("isVerticalWhitespace(Str[0])", "/build/llvm-toolchain-snapshot-9~svn361465/tools/clang/lib/Lex/Lexer.cpp"
, 1044, __PRETTY_FUNCTION__))
;
1045 if (Str - 1 < BufferStart)
1046 return false;
1047
1048 if ((Str[0] == '\n' && Str[-1] == '\r') ||
1049 (Str[0] == '\r' && Str[-1] == '\n')) {
1050 if (Str - 2 < BufferStart)
1051 return false;
1052 --Str;
1053 }
1054 --Str;
1055
1056 // Rewind to first non-space character:
1057 while (Str > BufferStart && isHorizontalWhitespace(*Str))
1058 --Str;
1059
1060 return *Str == '\\';
1061}
1062
1063StringRef Lexer::getIndentationForLine(SourceLocation Loc,
1064 const SourceManager &SM) {
1065 if (Loc.isInvalid() || Loc.isMacroID())
1066 return {};
1067 std::pair<FileID, unsigned> LocInfo = SM.getDecomposedLoc(Loc);
1068 if (LocInfo.first.isInvalid())
1069 return {};
1070 bool Invalid = false;
1071 StringRef Buffer = SM.getBufferData(LocInfo.first, &Invalid);
1072 if (Invalid)
1073 return {};
1074 const char *Line = findBeginningOfLine(Buffer, LocInfo.second);
1075 if (!Line)
1076 return {};
1077 StringRef Rest = Buffer.substr(Line - Buffer.data());
1078 size_t NumWhitespaceChars = Rest.find_first_not_of(" \t");
1079 return NumWhitespaceChars == StringRef::npos
1080 ? ""
1081 : Rest.take_front(NumWhitespaceChars);
1082}
1083
1084//===----------------------------------------------------------------------===//
1085// Diagnostics forwarding code.
1086//===----------------------------------------------------------------------===//
1087
1088/// GetMappedTokenLoc - If lexing out of a 'mapped buffer', where we pretend the
1089/// lexer buffer was all expanded at a single point, perform the mapping.
1090/// This is currently only used for _Pragma implementation, so it is the slow
1091/// path of the hot getSourceLocation method. Do not allow it to be inlined.
1092static LLVM_ATTRIBUTE_NOINLINE__attribute__((noinline)) SourceLocation GetMappedTokenLoc(
1093 Preprocessor &PP, SourceLocation FileLoc, unsigned CharNo, unsigned TokLen);
1094static SourceLocation GetMappedTokenLoc(Preprocessor &PP,
1095 SourceLocation FileLoc,
1096 unsigned CharNo, unsigned TokLen) {
1097 assert(FileLoc.isMacroID() && "Must be a macro expansion")((FileLoc.isMacroID() && "Must be a macro expansion")
? static_cast<void> (0) : __assert_fail ("FileLoc.isMacroID() && \"Must be a macro expansion\""
, "/build/llvm-toolchain-snapshot-9~svn361465/tools/clang/lib/Lex/Lexer.cpp"
, 1097, __PRETTY_FUNCTION__))
;
1098
1099 // Otherwise, we're lexing "mapped tokens". This is used for things like
1100 // _Pragma handling. Combine the expansion location of FileLoc with the
1101 // spelling location.
1102 SourceManager &SM = PP.getSourceManager();
1103
1104 // Create a new SLoc which is expanded from Expansion(FileLoc) but whose
1105 // characters come from spelling(FileLoc)+Offset.
1106 SourceLocation SpellingLoc = SM.getSpellingLoc(FileLoc);
1107 SpellingLoc = SpellingLoc.getLocWithOffset(CharNo);
1108
1109 // Figure out the expansion loc range, which is the range covered by the
1110 // original _Pragma(...) sequence.
1111 CharSourceRange II = SM.getImmediateExpansionRange(FileLoc);
1112
1113 return SM.createExpansionLoc(SpellingLoc, II.getBegin(), II.getEnd(), TokLen);
1114}
1115
1116/// getSourceLocation - Return a source location identifier for the specified
1117/// offset in the current file.
1118SourceLocation Lexer::getSourceLocation(const char *Loc,
1119 unsigned TokLen) const {
1120 assert(Loc >= BufferStart && Loc <= BufferEnd &&((Loc >= BufferStart && Loc <= BufferEnd &&
"Location out of range for this buffer!") ? static_cast<void
> (0) : __assert_fail ("Loc >= BufferStart && Loc <= BufferEnd && \"Location out of range for this buffer!\""
, "/build/llvm-toolchain-snapshot-9~svn361465/tools/clang/lib/Lex/Lexer.cpp"
, 1121, __PRETTY_FUNCTION__))
1121 "Location out of range for this buffer!")((Loc >= BufferStart && Loc <= BufferEnd &&
"Location out of range for this buffer!") ? static_cast<void
> (0) : __assert_fail ("Loc >= BufferStart && Loc <= BufferEnd && \"Location out of range for this buffer!\""
, "/build/llvm-toolchain-snapshot-9~svn361465/tools/clang/lib/Lex/Lexer.cpp"
, 1121, __PRETTY_FUNCTION__))
;
1122
1123 // In the normal case, we're just lexing from a simple file buffer, return
1124 // the file id from FileLoc with the offset specified.
1125 unsigned CharNo = Loc-BufferStart;
1126 if (FileLoc.isFileID())
1127 return FileLoc.getLocWithOffset(CharNo);
1128
1129 // Otherwise, this is the _Pragma lexer case, which pretends that all of the
1130 // tokens are lexed from where the _Pragma was defined.
1131 assert(PP && "This doesn't work on raw lexers")((PP && "This doesn't work on raw lexers") ? static_cast
<void> (0) : __assert_fail ("PP && \"This doesn't work on raw lexers\""
, "/build/llvm-toolchain-snapshot-9~svn361465/tools/clang/lib/Lex/Lexer.cpp"
, 1131, __PRETTY_FUNCTION__))
;
1132 return GetMappedTokenLoc(*PP, FileLoc, CharNo, TokLen);
1133}
1134
1135/// Diag - Forwarding function for diagnostics. This translate a source
1136/// position in the current buffer into a SourceLocation object for rendering.
1137DiagnosticBuilder Lexer::Diag(const char *Loc, unsigned DiagID) const {
1138 return PP->Diag(getSourceLocation(Loc), DiagID);
1139}
1140
1141//===----------------------------------------------------------------------===//
1142// Trigraph and Escaped Newline Handling Code.
1143//===----------------------------------------------------------------------===//
1144
/// GetTrigraphCharForLetter - Given a character that occurs after a ?? pair,
/// return the decoded trigraph letter it corresponds to, or '\0' if nothing.
static char GetTrigraphCharForLetter(char Letter) {
  switch (Letter) {
  case '=':  return '#';
  case ')':  return ']';
  case '(':  return '[';
  case '!':  return '|';
  case '\'': return '^';
  case '>':  return '}';
  case '/':  return '\\';
  case '<':  return '{';
  case '-':  return '~';
  default:   return 0;   // Not a valid trigraph final character.
  }
}
1161
1162/// DecodeTrigraphChar - If the specified character is a legal trigraph when
1163/// prefixed with ??, emit a trigraph warning. If trigraphs are enabled,
1164/// return the result character. Finally, emit a warning about trigraph use
1165/// whether trigraphs are enabled or not.
1166static char DecodeTrigraphChar(const char *CP, Lexer *L) {
1167 char Res = GetTrigraphCharForLetter(*CP);
1168 if (!Res || !L) return Res;
1169
1170 if (!L->getLangOpts().Trigraphs) {
1171 if (!L->isLexingRawMode())
1172 L->Diag(CP-2, diag::trigraph_ignored);
1173 return 0;
1174 }
1175
1176 if (!L->isLexingRawMode())
1177 L->Diag(CP-2, diag::trigraph_converted) << StringRef(&Res, 1);
1178 return Res;
1179}
1180
1181/// getEscapedNewLineSize - Return the size of the specified escaped newline,
1182/// or 0 if it is not an escaped newline. P[-1] is known to be a "\" or a
1183/// trigraph equivalent on entry to this function.
1184unsigned Lexer::getEscapedNewLineSize(const char *Ptr) {
1185 unsigned Size = 0;
1186 while (isWhitespace(Ptr[Size])) {
1187 ++Size;
1188
1189 if (Ptr[Size-1] != '\n' && Ptr[Size-1] != '\r')
1190 continue;
1191
1192 // If this is a \r\n or \n\r, skip the other half.
1193 if ((Ptr[Size] == '\r' || Ptr[Size] == '\n') &&
1194 Ptr[Size-1] != Ptr[Size])
1195 ++Size;
1196
1197 return Size;
1198 }
1199
1200 // Not an escaped newline, must be a \t or something else.
1201 return 0;
1202}
1203
1204/// SkipEscapedNewLines - If P points to an escaped newline (or a series of
1205/// them), skip over them and return the first non-escaped-newline found,
1206/// otherwise return P.
1207const char *Lexer::SkipEscapedNewLines(const char *P) {
1208 while (true) {
1209 const char *AfterEscape;
1210 if (*P == '\\') {
1211 AfterEscape = P+1;
1212 } else if (*P == '?') {
1213 // If not a trigraph for escape, bail out.
1214 if (P[1] != '?' || P[2] != '/')
1215 return P;
1216 // FIXME: Take LangOpts into account; the language might not
1217 // support trigraphs.
1218 AfterEscape = P+3;
1219 } else {
1220 return P;
1221 }
1222
1223 unsigned NewLineSize = Lexer::getEscapedNewLineSize(AfterEscape);
1224 if (NewLineSize == 0) return P;
1225 P = AfterEscape+NewLineSize;
1226 }
1227}
1228
1229Optional<Token> Lexer::findNextToken(SourceLocation Loc,
1230 const SourceManager &SM,
1231 const LangOptions &LangOpts) {
1232 if (Loc.isMacroID()) {
1233 if (!Lexer::isAtEndOfMacroExpansion(Loc, SM, LangOpts, &Loc))
1234 return None;
1235 }
1236 Loc = Lexer::getLocForEndOfToken(Loc, 0, SM, LangOpts);
1237
1238 // Break down the source location.
1239 std::pair<FileID, unsigned> LocInfo = SM.getDecomposedLoc(Loc);
1240
1241 // Try to load the file buffer.
1242 bool InvalidTemp = false;
1243 StringRef File = SM.getBufferData(LocInfo.first, &InvalidTemp);
1244 if (InvalidTemp)
1245 return None;
1246
1247 const char *TokenBegin = File.data() + LocInfo.second;
1248
1249 // Lex from the start of the given location.
1250 Lexer lexer(SM.getLocForStartOfFile(LocInfo.first), LangOpts, File.begin(),
1251 TokenBegin, File.end());
1252 // Find the token.
1253 Token Tok;
1254 lexer.LexFromRawLexer(Tok);
1255 return Tok;
1256}
1257
1258/// Checks that the given token is the first token that occurs after the
1259/// given location (this excludes comments and whitespace). Returns the location
1260/// immediately after the specified token. If the token is not found or the
1261/// location is inside a macro, the returned source location will be invalid.
1262SourceLocation Lexer::findLocationAfterToken(
1263 SourceLocation Loc, tok::TokenKind TKind, const SourceManager &SM,
1264 const LangOptions &LangOpts, bool SkipTrailingWhitespaceAndNewLine) {
1265 Optional<Token> Tok = findNextToken(Loc, SM, LangOpts);
1266 if (!Tok || Tok->isNot(TKind))
1267 return {};
1268 SourceLocation TokenLoc = Tok->getLocation();
1269
1270 // Calculate how much whitespace needs to be skipped if any.
1271 unsigned NumWhitespaceChars = 0;
1272 if (SkipTrailingWhitespaceAndNewLine) {
1273 const char *TokenEnd = SM.getCharacterData(TokenLoc) + Tok->getLength();
1274 unsigned char C = *TokenEnd;
1275 while (isHorizontalWhitespace(C)) {
1276 C = *(++TokenEnd);
1277 NumWhitespaceChars++;
1278 }
1279
1280 // Skip \r, \n, \r\n, or \n\r
1281 if (C == '\n' || C == '\r') {
1282 char PrevC = C;
1283 C = *(++TokenEnd);
1284 NumWhitespaceChars++;
1285 if ((C == '\n' || C == '\r') && C != PrevC)
1286 NumWhitespaceChars++;
1287 }
1288 }
1289
1290 return TokenLoc.getLocWithOffset(Tok->getLength() + NumWhitespaceChars);
1291}
1292
1293/// getCharAndSizeSlow - Peek a single 'character' from the specified buffer,
1294/// get its size, and return it. This is tricky in several cases:
1295/// 1. If currently at the start of a trigraph, we warn about the trigraph,
1296/// then either return the trigraph (skipping 3 chars) or the '?',
1297/// depending on whether trigraphs are enabled or not.
1298/// 2. If this is an escaped newline (potentially with whitespace between
1299/// the backslash and newline), implicitly skip the newline and return
1300/// the char after it.
1301///
1302/// This handles the slow/uncommon case of the getCharAndSize method. Here we
1303/// know that we can accumulate into Size, and that we have already incremented
1304/// Ptr by Size bytes.
1305///
1306/// NOTE: When this method is updated, getCharAndSizeSlowNoWarn (below) should
1307/// be updated to match.
1308char Lexer::getCharAndSizeSlow(const char *Ptr, unsigned &Size,
1309 Token *Tok) {
1310 // If we have a slash, look for an escaped newline.
1311 if (Ptr[0] == '\\') {
8
Taking true branch
1312 ++Size;
1313 ++Ptr;
1314Slash:
1315 // Common case, backslash-char where the char is not whitespace.
1316 if (!isWhitespace(Ptr[0])) return '\\';
9
Assuming the condition is false
10
Taking false branch
1317
1318 // See if we have optional whitespace characters between the slash and
1319 // newline.
1320 if (unsigned EscapedNewLineSize = getEscapedNewLineSize(Ptr)) {
11
Assuming 'EscapedNewLineSize' is not equal to 0
12
Taking true branch
1321 // Remember that this token needs to be cleaned.
1322 if (Tok) Tok->setFlag(Token::NeedsCleaning);
13
Taking true branch
14
Calling 'Token::setFlag'
1323
1324 // Warn if there was whitespace between the backslash and newline.
1325 if (Ptr[0] != '\n' && Ptr[0] != '\r' && Tok && !isLexingRawMode())
1326 Diag(Ptr, diag::backslash_newline_space);
1327
1328 // Found backslash<whitespace><newline>. Parse the char after it.
1329 Size += EscapedNewLineSize;
1330 Ptr += EscapedNewLineSize;
1331
1332 // Use slow version to accumulate a correct size field.
1333 return getCharAndSizeSlow(Ptr, Size, Tok);
1334 }
1335
1336 // Otherwise, this is not an escaped newline, just return the slash.
1337 return '\\';
1338 }
1339
1340 // If this is a trigraph, process it.
1341 if (Ptr[0] == '?' && Ptr[1] == '?') {
1342 // If this is actually a legal trigraph (not something like "??x"), emit
1343 // a trigraph warning. If so, and if trigraphs are enabled, return it.
1344 if (char C = DecodeTrigraphChar(Ptr+2, Tok ? this : nullptr)) {
1345 // Remember that this token needs to be cleaned.
1346 if (Tok) Tok->setFlag(Token::NeedsCleaning);
1347
1348 Ptr += 3;
1349 Size += 3;
1350 if (C == '\\') goto Slash;
1351 return C;
1352 }
1353 }
1354
1355 // If this is neither, return a single character.
1356 ++Size;
1357 return *Ptr;
1358}
1359
1360/// getCharAndSizeSlowNoWarn - Handle the slow/uncommon case of the
1361/// getCharAndSizeNoWarn method. Here we know that we can accumulate into Size,
1362/// and that we have already incremented Ptr by Size bytes.
1363///
1364/// NOTE: When this method is updated, getCharAndSizeSlow (above) should
1365/// be updated to match.
1366char Lexer::getCharAndSizeSlowNoWarn(const char *Ptr, unsigned &Size,
1367 const LangOptions &LangOpts) {
1368 // If we have a slash, look for an escaped newline.
1369 if (Ptr[0] == '\\') {
1370 ++Size;
1371 ++Ptr;
1372Slash:
1373 // Common case, backslash-char where the char is not whitespace.
1374 if (!isWhitespace(Ptr[0])) return '\\';
1375
1376 // See if we have optional whitespace characters followed by a newline.
1377 if (unsigned EscapedNewLineSize = getEscapedNewLineSize(Ptr)) {
1378 // Found backslash<whitespace><newline>. Parse the char after it.
1379 Size += EscapedNewLineSize;
1380 Ptr += EscapedNewLineSize;
1381
1382 // Use slow version to accumulate a correct size field.
1383 return getCharAndSizeSlowNoWarn(Ptr, Size, LangOpts);
1384 }
1385
1386 // Otherwise, this is not an escaped newline, just return the slash.
1387 return '\\';
1388 }
1389
1390 // If this is a trigraph, process it.
1391 if (LangOpts.Trigraphs && Ptr[0] == '?' && Ptr[1] == '?') {
1392 // If this is actually a legal trigraph (not something like "??x"), return
1393 // it.
1394 if (char C = GetTrigraphCharForLetter(Ptr[2])) {
1395 Ptr += 3;
1396 Size += 3;
1397 if (C == '\\') goto Slash;
1398 return C;
1399 }
1400 }
1401
1402 // If this is neither, return a single character.
1403 ++Size;
1404 return *Ptr;
1405}
1406
1407//===----------------------------------------------------------------------===//
1408// Helper methods for lexing.
1409//===----------------------------------------------------------------------===//
1410
1411/// Routine that indiscriminately sets the offset into the source file.
1412void Lexer::SetByteOffset(unsigned Offset, bool StartOfLine) {
1413 BufferPtr = BufferStart + Offset;
1414 if (BufferPtr > BufferEnd)
1415 BufferPtr = BufferEnd;
1416 // FIXME: What exactly does the StartOfLine bit mean? There are two
1417 // possible meanings for the "start" of the line: the first token on the
1418 // unexpanded line, or the first token on the expanded line.
1419 IsAtStartOfLine = StartOfLine;
1420 IsAtPhysicalStartOfLine = StartOfLine;
1421}
1422
1423static bool isAllowedIDChar(uint32_t C, const LangOptions &LangOpts) {
1424 if (LangOpts.AsmPreprocessor) {
1425 return false;
1426 } else if (LangOpts.CPlusPlus11 || LangOpts.C11) {
1427 static const llvm::sys::UnicodeCharSet C11AllowedIDChars(
1428 C11AllowedIDCharRanges);
1429 return C11AllowedIDChars.contains(C);
1430 } else if (LangOpts.CPlusPlus) {
1431 static const llvm::sys::UnicodeCharSet CXX03AllowedIDChars(
1432 CXX03AllowedIDCharRanges);
1433 return CXX03AllowedIDChars.contains(C);
1434 } else {
1435 static const llvm::sys::UnicodeCharSet C99AllowedIDChars(
1436 C99AllowedIDCharRanges);
1437 return C99AllowedIDChars.contains(C);
1438 }
1439}
1440
1441static bool isAllowedInitiallyIDChar(uint32_t C, const LangOptions &LangOpts) {
1442 assert(isAllowedIDChar(C, LangOpts))((isAllowedIDChar(C, LangOpts)) ? static_cast<void> (0)
: __assert_fail ("isAllowedIDChar(C, LangOpts)", "/build/llvm-toolchain-snapshot-9~svn361465/tools/clang/lib/Lex/Lexer.cpp"
, 1442, __PRETTY_FUNCTION__))
;
1443 if (LangOpts.AsmPreprocessor) {
1444 return false;
1445 } else if (LangOpts.CPlusPlus11 || LangOpts.C11) {
1446 static const llvm::sys::UnicodeCharSet C11DisallowedInitialIDChars(
1447 C11DisallowedInitialIDCharRanges);
1448 return !C11DisallowedInitialIDChars.contains(C);
1449 } else if (LangOpts.CPlusPlus) {
1450 return true;
1451 } else {
1452 static const llvm::sys::UnicodeCharSet C99DisallowedInitialIDChars(
1453 C99DisallowedInitialIDCharRanges);
1454 return !C99DisallowedInitialIDChars.contains(C);
1455 }
1456}
1457
1458static inline CharSourceRange makeCharRange(Lexer &L, const char *Begin,
1459 const char *End) {
1460 return CharSourceRange::getCharRange(L.getSourceLocation(Begin),
1461 L.getSourceLocation(End));
1462}
1463
1464static void maybeDiagnoseIDCharCompat(DiagnosticsEngine &Diags, uint32_t C,
1465 CharSourceRange Range, bool IsFirst) {
1466 // Check C99 compatibility.
1467 if (!Diags.isIgnored(diag::warn_c99_compat_unicode_id, Range.getBegin())) {
1468 enum {
1469 CannotAppearInIdentifier = 0,
1470 CannotStartIdentifier
1471 };
1472
1473 static const llvm::sys::UnicodeCharSet C99AllowedIDChars(
1474 C99AllowedIDCharRanges);
1475 static const llvm::sys::UnicodeCharSet C99DisallowedInitialIDChars(
1476 C99DisallowedInitialIDCharRanges);
1477 if (!C99AllowedIDChars.contains(C)) {
1478 Diags.Report(Range.getBegin(), diag::warn_c99_compat_unicode_id)
1479 << Range
1480 << CannotAppearInIdentifier;
1481 } else if (IsFirst && C99DisallowedInitialIDChars.contains(C)) {
1482 Diags.Report(Range.getBegin(), diag::warn_c99_compat_unicode_id)
1483 << Range
1484 << CannotStartIdentifier;
1485 }
1486 }
1487
1488 // Check C++98 compatibility.
1489 if (!Diags.isIgnored(diag::warn_cxx98_compat_unicode_id, Range.getBegin())) {
1490 static const llvm::sys::UnicodeCharSet CXX03AllowedIDChars(
1491 CXX03AllowedIDCharRanges);
1492 if (!CXX03AllowedIDChars.contains(C)) {
1493 Diags.Report(Range.getBegin(), diag::warn_cxx98_compat_unicode_id)
1494 << Range;
1495 }
1496 }
1497}
1498
/// After encountering UTF-8 character C and interpreting it as an identifier
/// character, check whether it's a homoglyph for a common non-identifier
/// source character that is unlikely to be an intentional identifier
/// character and warn if so.
static void maybeDiagnoseUTF8Homoglyph(DiagnosticsEngine &Diags, uint32_t C,
                                       CharSourceRange Range) {
  // FIXME: Handle Unicode quotation marks (smart quotes, fullwidth quotes).
  // Pair of (code point, ASCII look-alike); LooksLike == 0 marks an invisible
  // (zero-width) character with no visible ASCII counterpart.
  struct HomoglyphPair {
    uint32_t Character;
    char LooksLike;
    bool operator<(HomoglyphPair R) const { return Character < R.Character; }
  };
  // Must stay sorted by Character: it is searched with std::lower_bound below.
  static constexpr HomoglyphPair SortedHomoglyphs[] = {
    {U'\u00ad', 0},   // SOFT HYPHEN
    {U'\u01c3', '!'}, // LATIN LETTER RETROFLEX CLICK
    {U'\u037e', ';'}, // GREEK QUESTION MARK
    {U'\u200b', 0},   // ZERO WIDTH SPACE
    {U'\u200c', 0},   // ZERO WIDTH NON-JOINER
    {U'\u200d', 0},   // ZERO WIDTH JOINER
    {U'\u2060', 0},   // WORD JOINER
    {U'\u2061', 0},   // FUNCTION APPLICATION
    {U'\u2062', 0},   // INVISIBLE TIMES
    {U'\u2063', 0},   // INVISIBLE SEPARATOR
    {U'\u2064', 0},   // INVISIBLE PLUS
    {U'\u2212', '-'}, // MINUS SIGN
    {U'\u2215', '/'}, // DIVISION SLASH
    {U'\u2216', '\\'}, // SET MINUS
    {U'\u2217', '*'}, // ASTERISK OPERATOR
    {U'\u2223', '|'}, // DIVIDES
    {U'\u2227', '^'}, // LOGICAL AND
    {U'\u2236', ':'}, // RATIO
    {U'\u223c', '~'}, // TILDE OPERATOR
    {U'\ua789', ':'}, // MODIFIER LETTER COLON
    {U'\ufeff', 0},   // ZERO WIDTH NO-BREAK SPACE
    {U'\uff01', '!'}, // FULLWIDTH EXCLAMATION MARK
    {U'\uff03', '#'}, // FULLWIDTH NUMBER SIGN
    {U'\uff04', '$'}, // FULLWIDTH DOLLAR SIGN
    {U'\uff05', '%'}, // FULLWIDTH PERCENT SIGN
    {U'\uff06', '&'}, // FULLWIDTH AMPERSAND
    {U'\uff08', '('}, // FULLWIDTH LEFT PARENTHESIS
    {U'\uff09', ')'}, // FULLWIDTH RIGHT PARENTHESIS
    {U'\uff0a', '*'}, // FULLWIDTH ASTERISK
    {U'\uff0b', '+'}, // FULLWIDTH ASTERISK
    {U'\uff0c', ','}, // FULLWIDTH COMMA
    {U'\uff0d', '-'}, // FULLWIDTH HYPHEN-MINUS
    {U'\uff0e', '.'}, // FULLWIDTH FULL STOP
    {U'\uff0f', '/'}, // FULLWIDTH SOLIDUS
    {U'\uff1a', ':'}, // FULLWIDTH COLON
    {U'\uff1b', ';'}, // FULLWIDTH SEMICOLON
    {U'\uff1c', '<'}, // FULLWIDTH LESS-THAN SIGN
    {U'\uff1d', '='}, // FULLWIDTH EQUALS SIGN
    {U'\uff1e', '>'}, // FULLWIDTH GREATER-THAN SIGN
    {U'\uff1f', '?'}, // FULLWIDTH QUESTION MARK
    {U'\uff20', '@'}, // FULLWIDTH COMMERCIAL AT
    {U'\uff3b', '['}, // FULLWIDTH LEFT SQUARE BRACKET
    {U'\uff3c', '\\'}, // FULLWIDTH REVERSE SOLIDUS
    {U'\uff3d', ']'}, // FULLWIDTH RIGHT SQUARE BRACKET
    {U'\uff3e', '^'}, // FULLWIDTH CIRCUMFLEX ACCENT
    {U'\uff5b', '{'}, // FULLWIDTH LEFT CURLY BRACKET
    {U'\uff5c', '|'}, // FULLWIDTH VERTICAL LINE
    {U'\uff5d', '}'}, // FULLWIDTH RIGHT CURLY BRACKET
    {U'\uff5e', '~'}, // FULLWIDTH TILDE
    {0, 0}            // Sentinel; excluded from the search below.
  };
  // Binary-search the table (minus the {0,0} sentinel) for C.
  auto Homoglyph =
      std::lower_bound(std::begin(SortedHomoglyphs),
                       std::end(SortedHomoglyphs) - 1, HomoglyphPair{C, '\0'});
  if (Homoglyph->Character == C) {
    // Format the code point as a 4-digit uppercase hex string for the note.
    llvm::SmallString<5> CharBuf;
    {
      llvm::raw_svector_ostream CharOS(CharBuf);
      llvm::write_hex(CharOS, C, llvm::HexPrintStyle::Upper, 4);
    }
    if (Homoglyph->LooksLike) {
      // Visible look-alike: tell the user what it resembles.
      const char LooksLikeStr[] = {Homoglyph->LooksLike, 0};
      Diags.Report(Range.getBegin(), diag::warn_utf8_symbol_homoglyph)
          << Range << CharBuf << LooksLikeStr;
    } else {
      // Zero-width/invisible character.
      Diags.Report(Range.getBegin(), diag::warn_utf8_symbol_zero_width)
          << Range << CharBuf;
    }
  }
}
1582
/// Try to consume a universal-character-name (\uXXXX or \UXXXXXXXX) starting
/// at CurPtr + Size as an identifier character.  On success, advances CurPtr
/// past the UCN, sets Token::HasUCN on Result, and returns true; on failure,
/// leaves CurPtr unchanged and returns false.
bool Lexer::tryConsumeIdentifierUCN(const char *&CurPtr, unsigned Size,
                                    Token &Result) {
  const char *UCNPtr = CurPtr + Size;
  uint32_t CodePoint = tryReadUCN(UCNPtr, CurPtr, /*Token=*/nullptr);
  // Reject if it wasn't a UCN or names a code point not valid in identifiers.
  if (CodePoint == 0 || !isAllowedIDChar(CodePoint, LangOpts))
    return false;

  if (!isLexingRawMode())
    maybeDiagnoseIDCharCompat(PP->getDiagnostics(), CodePoint,
                              makeCharRange(*this, CurPtr, UCNPtr),
                              /*IsFirst=*/false);

  Result.setFlag(Token::HasUCN);
  // If the UCN was written in its plain form (exactly "\uXXXX" = 6 chars or
  // "\UXXXXXXXX" = 10 chars, no trigraphs/escaped newlines), skip it in one
  // step; otherwise re-lex it char by char so Token flags (e.g. NeedsCleaning)
  // are updated by getAndAdvanceChar.
  if ((UCNPtr - CurPtr == 6 && CurPtr[1] == 'u') ||
      (UCNPtr - CurPtr == 10 && CurPtr[1] == 'U'))
    CurPtr = UCNPtr;
  else
    while (CurPtr != UCNPtr)
      (void)getAndAdvanceChar(CurPtr, Result);
  return true;
}
1604
1605bool Lexer::tryConsumeIdentifierUTF8Char(const char *&CurPtr) {
1606 const char *UnicodePtr = CurPtr;
1607 llvm::UTF32 CodePoint;
1608 llvm::ConversionResult Result =
1609 llvm::convertUTF8Sequence((const llvm::UTF8 **)&UnicodePtr,
1610 (const llvm::UTF8 *)BufferEnd,
1611 &CodePoint,
1612 llvm::strictConversion);
1613 if (Result != llvm::conversionOK ||
1614 !isAllowedIDChar(static_cast<uint32_t>(CodePoint), LangOpts))
1615 return false;
1616
1617 if (!isLexingRawMode()) {
1618 maybeDiagnoseIDCharCompat(PP->getDiagnostics(), CodePoint,
1619 makeCharRange(*this, CurPtr, UnicodePtr),
1620 /*IsFirst=*/false);
1621 maybeDiagnoseUTF8Homoglyph(PP->getDiagnostics(), CodePoint,
1622 makeCharRange(*this, CurPtr, UnicodePtr));
1623 }
1624
1625 CurPtr = UnicodePtr;
1626 return true;
1627}
1628
/// Lex the remainder of an identifier whose first character has already been
/// consumed (BufferPtr points at it; CurPtr points just past it).  Forms a
/// raw_identifier token in Result and, outside raw mode, resolves it via the
/// preprocessor (which may macro-expand it).  Returns true if a token was
/// produced.
bool Lexer::LexIdentifier(Token &Result, const char *CurPtr) {
  // Match [_A-Za-z0-9]*, we have already matched [_A-Za-z$]
  unsigned Size;
  unsigned char C = *CurPtr++;
  while (isIdentifierBody(C))
    C = *CurPtr++;

  --CurPtr;   // Back up over the skipped character.

  // Fast path, no $,\,? in identifier found.  '\' might be an escaped newline
  // or UCN, and ? might be a trigraph for '\', an escaped newline or UCN.
  //
  // TODO: Could merge these checks into an InfoTable flag to make the
  // comparison cheaper
  if (isASCII(C) && C != '\\' && C != '?' &&
      (C != '$' || !LangOpts.DollarIdents)) {
FinishIdentifier:
    // Common exit: [BufferPtr, CurPtr) is the full raw identifier.
    const char *IdStart = BufferPtr;
    FormTokenWithChars(Result, CurPtr, tok::raw_identifier);
    Result.setRawIdentifierData(IdStart);

    // If we are in raw mode, return this identifier raw.  There is no need to
    // look up identifier information or attempt to macro expand it.
    if (LexingRawMode)
      return true;

    // Fill in Result.IdentifierInfo and update the token kind,
    // looking up the identifier in the identifier table.
    IdentifierInfo *II = PP->LookUpIdentifierInfo(Result);
    // Note that we have to call PP->LookUpIdentifierInfo() even for code
    // completion, it writes IdentifierInfo into Result, and callers rely on it.

    // If the completion point is at the end of an identifier, we want to treat
    // the identifier as incomplete even if it resolves to a macro or a keyword.
    // This allows e.g. 'class^' to complete to 'classifier'.
    if (isCodeCompletionPoint(CurPtr)) {
      // Return the code-completion token.
      Result.setKind(tok::code_completion);
      // Skip the code-completion char and all immediate identifier characters.
      // This ensures we get consistent behavior when completing at any point in
      // an identifier (i.e. at the start, in the middle, at the end). Note that
      // only simple cases (i.e. [a-zA-Z0-9_]) are supported to keep the code
      // simpler.
      assert(*CurPtr == 0 && "Completion character must be 0");
      ++CurPtr;
      // Note that code completion token is not added as a separate character
      // when the completion point is at the end of the buffer. Therefore, we need
      // to check if the buffer has ended.
      if (CurPtr < BufferEnd) {
        while (isIdentifierBody(*CurPtr))
          ++CurPtr;
      }
      BufferPtr = CurPtr;
      return true;
    }

    // Finally, now that we know we have an identifier, pass this off to the
    // preprocessor, which may macro expand it or something.
    if (II->isHandleIdentifierCase())
      return PP->HandleIdentifier(Result);

    return true;
  }

  // Otherwise, $,\,? in identifier found.  Enter slower path.
  // Size is set by getCharAndSize before every use below.

  C = getCharAndSize(CurPtr, Size);
  while (true) {
    if (C == '$') {
      // If we hit a $ and they are not supported in identifiers, we are done.
      if (!LangOpts.DollarIdents) goto FinishIdentifier;

      // Otherwise, emit a diagnostic and continue.
      if (!isLexingRawMode())
        Diag(CurPtr, diag::ext_dollar_in_identifier);
      CurPtr = ConsumeChar(CurPtr, Size, Result);
      C = getCharAndSize(CurPtr, Size);
      continue;
    } else if (C == '\\' && tryConsumeIdentifierUCN(CurPtr, Size, Result)) {
      // Consumed a \uXXXX / \UXXXXXXXX identifier character.
      C = getCharAndSize(CurPtr, Size);
      continue;
    } else if (!isASCII(C) && tryConsumeIdentifierUTF8Char(CurPtr)) {
      // Consumed a raw UTF-8 identifier character.
      C = getCharAndSize(CurPtr, Size);
      continue;
    } else if (!isIdentifierBody(C)) {
      // Neither $, UCN, UTF-8, nor a plain identifier char: identifier ends.
      goto FinishIdentifier;
    }

    // Otherwise, this character is good, consume it.
    CurPtr = ConsumeChar(CurPtr, Size, Result);

    C = getCharAndSize(CurPtr, Size);
    while (isIdentifierBody(C)) {
      CurPtr = ConsumeChar(CurPtr, Size, Result);
      C = getCharAndSize(CurPtr, Size);
    }
  }
}
1727
1728/// isHexaLiteral - Return true if Start points to a hex constant.
1729/// in microsoft mode (where this is supposed to be several different tokens).
1730bool Lexer::isHexaLiteral(const char *Start, const LangOptions &LangOpts) {
1731 unsigned Size;
1732 char C1 = Lexer::getCharAndSizeNoWarn(Start, Size, LangOpts);
1733 if (C1 != '0')
1734 return false;
1735 char C2 = Lexer::getCharAndSizeNoWarn(Start + Size, Size, LangOpts);
1736 return (C2 == 'x' || C2 == 'X');
1737}
1738
/// LexNumericConstant - Lex the remainder of a integer or floating point
/// constant. From[-1] is the first character lexed.  Return the end of the
/// constant.
///
/// Recurses to continue the pp-number after exponent signs (1e+12), hex-float
/// binary exponents (0x1p+3), C++14 digit separators, and UCN/UTF-8 ud-suffix
/// characters.  Always forms a numeric_constant token and returns true.
bool Lexer::LexNumericConstant(Token &Result, const char *CurPtr) {
  unsigned Size;
  char C = getCharAndSize(CurPtr, Size);
  char PrevCh = 0;
  // Consume the maximal run of pp-number body characters, remembering the
  // previous one so we can recognize e+/e-/p+/p- continuations below.
  while (isPreprocessingNumberBody(C)) {
    CurPtr = ConsumeChar(CurPtr, Size, Result);
    PrevCh = C;
    C = getCharAndSize(CurPtr, Size);
  }

  // If we fell out, check for a sign, due to 1e+12.  If we have one, continue.
  if ((C == '-' || C == '+') && (PrevCh == 'E' || PrevCh == 'e')) {
    // If we are in Microsoft mode, don't continue if the constant is hex.
    // For example, MSVC will accept the following as 3 tokens: 0x1234567e+1
    if (!LangOpts.MicrosoftExt || !isHexaLiteral(BufferPtr, LangOpts))
      return LexNumericConstant(Result, ConsumeChar(CurPtr, Size, Result));
  }

  // If we have a hex FP constant, continue.
  if ((C == '-' || C == '+') && (PrevCh == 'P' || PrevCh == 'p')) {
    // Outside C99 and C++17, we accept hexadecimal floating point numbers as a
    // not-quite-conforming extension. Only do so if this looks like it's
    // actually meant to be a hexfloat, and not if it has a ud-suffix.
    bool IsHexFloat = true;
    if (!LangOpts.C99) {
      if (!isHexaLiteral(BufferPtr, LangOpts))
        IsHexFloat = false;
      else if (!getLangOpts().CPlusPlus17 &&
               std::find(BufferPtr, CurPtr, '_') != CurPtr)
        // An '_' before the exponent suggests a ud-suffix, not a hexfloat.
        IsHexFloat = false;
    }
    if (IsHexFloat)
      return LexNumericConstant(Result, ConsumeChar(CurPtr, Size, Result));
  }

  // If we have a digit separator, continue.
  if (C == '\'' && getLangOpts().CPlusPlus14) {
    unsigned NextSize;
    char Next = getCharAndSizeNoWarn(CurPtr + Size, NextSize, getLangOpts());
    // Only a separator if an identifier-body character follows the quote;
    // otherwise the quote starts a character literal.
    if (isIdentifierBody(Next)) {
      if (!isLexingRawMode())
        Diag(CurPtr, diag::warn_cxx11_compat_digit_separator);
      CurPtr = ConsumeChar(CurPtr, Size, Result);
      CurPtr = ConsumeChar(CurPtr, NextSize, Result);
      return LexNumericConstant(Result, CurPtr);
    }
  }

  // If we have a UCN or UTF-8 character (perhaps in a ud-suffix), continue.
  if (C == '\\' && tryConsumeIdentifierUCN(CurPtr, Size, Result))
    return LexNumericConstant(Result, CurPtr);
  if (!isASCII(C) && tryConsumeIdentifierUTF8Char(CurPtr))
    return LexNumericConstant(Result, CurPtr);

  // Update the location of token as well as BufferPtr.
  const char *TokStart = BufferPtr;
  FormTokenWithChars(Result, CurPtr, tok::numeric_constant);
  Result.setLiteralData(TokStart);
  return true;
}
1802
/// LexUDSuffix - Lex the ud-suffix production for user-defined literal suffixes
/// in C++11, or warn on a ud-suffix in C++98.
///
/// CurPtr points just past the closing quote of the literal; IsStringLiteral
/// distinguishes string from character/numeric literals (it widens the set of
/// accepted standard suffixes in C++14).  Returns the pointer past any
/// consumed suffix, setting Token::HasUDSuffix when one was lexed.
const char *Lexer::LexUDSuffix(Token &Result, const char *CurPtr,
                               bool IsStringLiteral) {
  assert(getLangOpts().CPlusPlus);

  // Maximally munch an identifier.
  unsigned Size;
  char C = getCharAndSize(CurPtr, Size);
  bool Consumed = false;

  if (!isIdentifierHead(C)) {
    // A suffix may begin with a UCN or UTF-8 character; otherwise there is
    // no ud-suffix here at all.
    if (C == '\\' && tryConsumeIdentifierUCN(CurPtr, Size, Result))
      Consumed = true;
    else if (!isASCII(C) && tryConsumeIdentifierUTF8Char(CurPtr))
      Consumed = true;
    else
      return CurPtr;
  }

  if (!getLangOpts().CPlusPlus11) {
    // Pre-C++11: a ud-suffix is not lexed as part of the literal; just warn
    // about the incompatibility and leave the identifier for the next token.
    if (!isLexingRawMode())
      Diag(CurPtr,
           C == '_' ? diag::warn_cxx11_compat_user_defined_literal
                    : diag::warn_cxx11_compat_reserved_user_defined_literal)
        << FixItHint::CreateInsertion(getSourceLocation(CurPtr), " ");
    return CurPtr;
  }

  // C++11 [lex.ext]p10, [usrlit.suffix]p1: A program containing a ud-suffix
  // that does not start with an underscore is ill-formed. As a conforming
  // extension, we treat all such suffixes as if they had whitespace before
  // them. We assume a suffix beginning with a UCN or UTF-8 character is more
  // likely to be a ud-suffix than a macro, however, and accept that.
  if (!Consumed) {
    bool IsUDSuffix = false;
    if (C == '_')
      IsUDSuffix = true;
    else if (IsStringLiteral && getLangOpts().CPlusPlus14) {
      // In C++1y, we need to look ahead a few characters to see if this is a
      // valid suffix for a string literal or a numeric literal (this could be
      // the 'operator""if' defining a numeric literal operator).
      const unsigned MaxStandardSuffixLength = 3;
      char Buffer[MaxStandardSuffixLength] = { C };
      unsigned Consumed = Size;
      unsigned Chars = 1;
      while (true) {
        unsigned NextSize;
        char Next = getCharAndSizeNoWarn(CurPtr + Consumed, NextSize,
                                         getLangOpts());
        if (!isIdentifierBody(Next)) {
          // End of suffix. Check whether this is on the whitelist.
          const StringRef CompleteSuffix(Buffer, Chars);
          IsUDSuffix = StringLiteralParser::isValidUDSuffix(getLangOpts(),
                                                            CompleteSuffix);
          break;
        }

        if (Chars == MaxStandardSuffixLength)
          // Too long: can't be a standard suffix.
          break;

        Buffer[Chars++] = Next;
        Consumed += NextSize;
      }
    }

    if (!IsUDSuffix) {
      // Not a valid ud-suffix: warn (reserved suffix) and treat it as if
      // whitespace preceded it, leaving the identifier unconsumed.
      if (!isLexingRawMode())
        Diag(CurPtr, getLangOpts().MSVCCompat
                         ? diag::ext_ms_reserved_user_defined_literal
                         : diag::ext_reserved_user_defined_literal)
          << FixItHint::CreateInsertion(getSourceLocation(CurPtr), " ");
      return CurPtr;
    }

    CurPtr = ConsumeChar(CurPtr, Size, Result);
  }

  Result.setFlag(Token::HasUDSuffix);
  // Consume the rest of the suffix identifier (plain chars, UCNs, or UTF-8).
  while (true) {
    C = getCharAndSize(CurPtr, Size);
    if (isIdentifierBody(C)) { CurPtr = ConsumeChar(CurPtr, Size, Result); }
    else if (C == '\\' && tryConsumeIdentifierUCN(CurPtr, Size, Result)) {}
    else if (!isASCII(C) && tryConsumeIdentifierUTF8Char(CurPtr)) {}
    else break;
  }

  return CurPtr;
}
1893
/// LexStringLiteral - Lex the remainder of a string literal, after having lexed
/// either " or L" or u8" or u" or U".
///
/// Forms a token of kind \p Kind (or tok::unknown if unterminated) and
/// returns true.  Outside raw mode, warns on embedded NUL characters and on
/// unicode literal kinds for compatibility modes.
bool Lexer::LexStringLiteral(Token &Result, const char *CurPtr,
                             tok::TokenKind Kind) {
  const char *AfterQuote = CurPtr;
  // Does this string contain the \0 character?
  const char *NulCharacter = nullptr;

  if (!isLexingRawMode() &&
      (Kind == tok::utf8_string_literal ||
       Kind == tok::utf16_string_literal ||
       Kind == tok::utf32_string_literal))
    Diag(BufferPtr, getLangOpts().CPlusPlus
                        ? diag::warn_cxx98_compat_unicode_literal
                        : diag::warn_c99_compat_unicode_literal);

  char C = getAndAdvanceChar(CurPtr, Result);
  while (C != '"') {
    // Skip escaped characters.  Escaped newlines will already be processed by
    // getAndAdvanceChar.
    if (C == '\\')
      C = getAndAdvanceChar(CurPtr, Result);

    if (C == '\n' || C == '\r' ||             // Newline.
        (C == 0 && CurPtr-1 == BufferEnd)) {  // End of file.
      // Unterminated: emit an unknown token covering what we scanned.
      if (!isLexingRawMode() && !LangOpts.AsmPreprocessor)
        Diag(BufferPtr, diag::ext_unterminated_char_or_string) << 1;
      FormTokenWithChars(Result, CurPtr-1, tok::unknown);
      return true;
    }

    if (C == 0) {
      // A NUL may be the code-completion marker rather than file content.
      if (isCodeCompletionPoint(CurPtr-1)) {
        if (ParsingFilename)
          codeCompleteIncludedFile(AfterQuote, CurPtr - 1, /*IsAngled=*/false);
        else
          PP->CodeCompleteNaturalLanguage();
        FormTokenWithChars(Result, CurPtr - 1, tok::unknown);
        cutOffLexing();
        return true;
      }

      NulCharacter = CurPtr-1;
    }
    C = getAndAdvanceChar(CurPtr, Result);
  }

  // If we are in C++11, lex the optional ud-suffix.
  if (getLangOpts().CPlusPlus)
    CurPtr = LexUDSuffix(Result, CurPtr, true);

  // If a nul character existed in the string, warn about it.
  if (NulCharacter && !isLexingRawMode())
    Diag(NulCharacter, diag::null_in_char_or_string) << 1;

  // Update the location of the token as well as the BufferPtr instance var.
  const char *TokStart = BufferPtr;
  FormTokenWithChars(Result, CurPtr, Kind);
  Result.setLiteralData(TokStart);
  return true;
}
1955
/// LexRawStringLiteral - Lex the remainder of a raw string literal, after
/// having lexed R", LR", u8R", uR", or UR".
///
/// Scans the d-char-sequence delimiter, then searches for the matching
/// )delimiter" terminator.  Forms a token of kind \p Kind (or tok::unknown on
/// a malformed delimiter or EOF) and returns true.
bool Lexer::LexRawStringLiteral(Token &Result, const char *CurPtr,
                                tok::TokenKind Kind) {
  // This function doesn't use getAndAdvanceChar because C++0x [lex.pptoken]p3:
  //  Between the initial and final double quote characters of the raw string,
  //  any transformations performed in phases 1 and 2 (trigraphs,
  //  universal-character-names, and line splicing) are reverted.

  if (!isLexingRawMode())
    Diag(BufferPtr, diag::warn_cxx98_compat_raw_string_literal);

  unsigned PrefixLen = 0;

  // The delimiter is at most 16 characters of raw-string d-chars.
  while (PrefixLen != 16 && isRawStringDelimBody(CurPtr[PrefixLen]))
    ++PrefixLen;

  // If the last character was not a '(', then we didn't lex a valid delimiter.
  if (CurPtr[PrefixLen] != '(') {
    if (!isLexingRawMode()) {
      const char *PrefixEnd = &CurPtr[PrefixLen];
      if (PrefixLen == 16) {
        Diag(PrefixEnd, diag::err_raw_delim_too_long);
      } else {
        Diag(PrefixEnd, diag::err_invalid_char_raw_delim)
          << StringRef(PrefixEnd, 1);
      }
    }

    // Search for the next '"' in hopes of salvaging the lexer. Unfortunately,
    // it's possible the '"' was intended to be part of the raw string, but
    // there's not much we can do about that.
    while (true) {
      char C = *CurPtr++;

      if (C == '"')
        break;
      if (C == 0 && CurPtr-1 == BufferEnd) {
        --CurPtr;
        break;
      }
    }

    FormTokenWithChars(Result, CurPtr, tok::unknown);
    return true;
  }

  // Save prefix and move CurPtr past it
  const char *Prefix = CurPtr;
  CurPtr += PrefixLen + 1; // skip over prefix and '('

  // Scan for ')' followed by the delimiter and a closing '"'.
  while (true) {
    char C = *CurPtr++;

    if (C == ')') {
      // Check for prefix match and closing quote.
      if (strncmp(CurPtr, Prefix, PrefixLen) == 0 && CurPtr[PrefixLen] == '"') {
        CurPtr += PrefixLen + 1; // skip over prefix and '"'
        break;
      }
    } else if (C == 0 && CurPtr-1 == BufferEnd) { // End of file.
      if (!isLexingRawMode())
        Diag(BufferPtr, diag::err_unterminated_raw_string)
          << StringRef(Prefix, PrefixLen);
      FormTokenWithChars(Result, CurPtr-1, tok::unknown);
      return true;
    }
  }

  // If we are in C++11, lex the optional ud-suffix.
  if (getLangOpts().CPlusPlus)
    CurPtr = LexUDSuffix(Result, CurPtr, true);

  // Update the location of token as well as BufferPtr.
  const char *TokStart = BufferPtr;
  FormTokenWithChars(Result, CurPtr, Kind);
  Result.setLiteralData(TokStart);
  return true;
}
2035
/// LexAngledStringLiteral - Lex the remainder of an angled string literal,
/// after having lexed the '<' character.  This is used for #include filenames.
///
/// Forms a header_name token, or falls back to tok::less if the filename is
/// unterminated by a newline/EOF.  Returns true in all cases.
bool Lexer::LexAngledStringLiteral(Token &Result, const char *CurPtr) {
  // Does this string contain the \0 character?
  const char *NulCharacter = nullptr;
  const char *AfterLessPos = CurPtr;
  char C = getAndAdvanceChar(CurPtr, Result);
  while (C != '>') {
    // Skip escaped characters.  Escaped newlines will already be processed by
    // getAndAdvanceChar.
    if (C == '\\')
      C = getAndAdvanceChar(CurPtr, Result);

    if (C == '\n' || C == '\r' ||                // Newline.
        (C == 0 && (CurPtr - 1 == BufferEnd))) { // End of file.
      // If the filename is unterminated, then it must just be a lone <
      // character.  Return this as such.
      FormTokenWithChars(Result, AfterLessPos, tok::less);
      return true;
    }

    if (C == 0) {
      // A NUL may be the code-completion marker rather than file content.
      if (isCodeCompletionPoint(CurPtr - 1)) {
        codeCompleteIncludedFile(AfterLessPos, CurPtr - 1, /*IsAngled=*/true);
        cutOffLexing();
        FormTokenWithChars(Result, CurPtr - 1, tok::unknown);
        return true;
      }
      NulCharacter = CurPtr-1;
    }
    C = getAndAdvanceChar(CurPtr, Result);
  }

  // If a nul character existed in the string, warn about it.
  if (NulCharacter && !isLexingRawMode())
    Diag(NulCharacter, diag::null_in_char_or_string) << 1;

  // Update the location of token as well as BufferPtr.
  const char *TokStart = BufferPtr;
  FormTokenWithChars(Result, CurPtr, tok::header_name);
  Result.setLiteralData(TokStart);
  return true;
}
2079
/// Set up code completion for an #include filename, where [PathStart,
/// CompletionPoint) is the partially-typed path inside the quotes/angles.
/// Splits the path at the last slash (backslash too in MSVC mode), registers
/// the trailing filename fragment as the completion filter, and reports the
/// range to be replaced (extending to the closing quote/angle if present).
void Lexer::codeCompleteIncludedFile(const char *PathStart,
                                     const char *CompletionPoint,
                                     bool IsAngled) {
  // Completion only applies to the filename, after the last slash.
  StringRef PartialPath(PathStart, CompletionPoint - PathStart);
  auto Slash = PartialPath.find_last_of(LangOpts.MSVCCompat ? "/\\" : "/");
  StringRef Dir =
      (Slash == StringRef::npos) ? "" : PartialPath.take_front(Slash);
  const char *StartOfFilename =
      (Slash == StringRef::npos) ? PathStart : PathStart + Slash + 1;
  // Code completion filter range is the filename only, up to completion point.
  PP->setCodeCompletionIdentifierInfo(&PP->getIdentifierTable().get(
      StringRef(StartOfFilename, CompletionPoint - StartOfFilename)));
  // We should replace the characters up to the closing quote, if any.
  // NOTE(review): reads *(CompletionPoint + 1) when CompletionPoint ==
  // BufferEnd - 1; presumably safe because lexer buffers are NUL-terminated —
  // confirm against the buffer-setup code.
  while (CompletionPoint < BufferEnd) {
    char Next = *(CompletionPoint + 1);
    if (Next == 0 || Next == '\r' || Next == '\n')
      break;
    ++CompletionPoint;
    if (Next == (IsAngled ? '>' : '"'))
      break;
  }
  PP->setCodeCompletionTokenRange(
      FileLoc.getLocWithOffset(StartOfFilename - BufferStart),
      FileLoc.getLocWithOffset(CompletionPoint - BufferStart));
  PP->CodeCompleteIncludedFile(Dir, IsAngled);
}
2107
/// LexCharConstant - Lex the remainder of a character constant, after having
/// lexed either ' or L' or u8' or u' or U'.
///
/// Forms a token of kind \p Kind, or tok::unknown for an empty ('') or
/// unterminated constant.  Returns true in all cases.
bool Lexer::LexCharConstant(Token &Result, const char *CurPtr,
                            tok::TokenKind Kind) {
  // Does this character contain the \0 character?
  const char *NulCharacter = nullptr;

  if (!isLexingRawMode()) {
    if (Kind == tok::utf16_char_constant || Kind == tok::utf32_char_constant)
      Diag(BufferPtr, getLangOpts().CPlusPlus
                          ? diag::warn_cxx98_compat_unicode_literal
                          : diag::warn_c99_compat_unicode_literal);
    else if (Kind == tok::utf8_char_constant)
      Diag(BufferPtr, diag::warn_cxx14_compat_u8_character_literal);
  }

  char C = getAndAdvanceChar(CurPtr, Result);
  if (C == '\'') {
    // Empty character constant ('') is an extension error.
    if (!isLexingRawMode() && !LangOpts.AsmPreprocessor)
      Diag(BufferPtr, diag::ext_empty_character);
    FormTokenWithChars(Result, CurPtr, tok::unknown);
    return true;
  }

  while (C != '\'') {
    // Skip escaped characters.
    if (C == '\\')
      C = getAndAdvanceChar(CurPtr, Result);

    if (C == '\n' || C == '\r' ||             // Newline.
        (C == 0 && CurPtr-1 == BufferEnd)) {  // End of file.
      // Unterminated: emit an unknown token covering what we scanned.
      if (!isLexingRawMode() && !LangOpts.AsmPreprocessor)
        Diag(BufferPtr, diag::ext_unterminated_char_or_string) << 0;
      FormTokenWithChars(Result, CurPtr-1, tok::unknown);
      return true;
    }

    if (C == 0) {
      // A NUL may be the code-completion marker rather than file content.
      if (isCodeCompletionPoint(CurPtr-1)) {
        PP->CodeCompleteNaturalLanguage();
        FormTokenWithChars(Result, CurPtr-1, tok::unknown);
        cutOffLexing();
        return true;
      }

      NulCharacter = CurPtr-1;
    }
    C = getAndAdvanceChar(CurPtr, Result);
  }

  // If we are in C++11, lex the optional ud-suffix.
  if (getLangOpts().CPlusPlus)
    CurPtr = LexUDSuffix(Result, CurPtr, false);

  // If a nul character existed in the character, warn about it.
  if (NulCharacter && !isLexingRawMode())
    Diag(NulCharacter, diag::null_in_char_or_string) << 0;

  // Update the location of token as well as BufferPtr.
  const char *TokStart = BufferPtr;
  FormTokenWithChars(Result, CurPtr, Kind);
  Result.setLiteralData(TokStart);
  return true;
}
2172
2173/// SkipWhitespace - Efficiently skip over a series of whitespace characters.
2174/// Update BufferPtr to point to the next non-whitespace character and return.
2175///
2176/// This method forms a token and returns true if KeepWhitespaceMode is enabled.
2177bool Lexer::SkipWhitespace(Token &Result, const char *CurPtr,
2178 bool &TokAtPhysicalStartOfLine) {
2179 // Whitespace - Skip it, then return the token after the whitespace.
2180 bool SawNewline = isVerticalWhitespace(CurPtr[-1]);
2181
2182 unsigned char Char = *CurPtr;
2183
2184 // Skip consecutive spaces efficiently.
2185 while (true) {
2186 // Skip horizontal whitespace very aggressively.
2187 while (isHorizontalWhitespace(Char))
2188 Char = *++CurPtr;
2189
2190 // Otherwise if we have something other than whitespace, we're done.
2191 if (!isVerticalWhitespace(Char))
2192 break;
2193
2194 if (ParsingPreprocessorDirective) {
2195 // End of preprocessor directive line, let LexTokenInternal handle this.
2196 BufferPtr = CurPtr;
2197 return false;
2198 }
2199
2200 // OK, but handle newline.
2201 SawNewline = true;
2202 Char = *++CurPtr;
2203 }
2204
2205 // If the client wants us to return whitespace, return it now.
2206 if (isKeepWhitespaceMode()) {
2207 FormTokenWithChars(Result, CurPtr, tok::unknown);
2208 if (SawNewline) {
2209 IsAtStartOfLine = true;
2210 IsAtPhysicalStartOfLine = true;
2211 }
2212 // FIXME: The next token will not have LeadingSpace set.
2213 return true;
2214 }
2215
2216 // If this isn't immediately after a newline, there is leading space.
2217 char PrevChar = CurPtr[-1];
2218 bool HasLeadingSpace = !isVerticalWhitespace(PrevChar);
2219
2220 Result.setFlagValue(Token::LeadingSpace, HasLeadingSpace);
2221 if (SawNewline) {
2222 Result.setFlag(Token::StartOfLine);
2223 TokAtPhysicalStartOfLine = true;
2224 }
2225
2226 BufferPtr = CurPtr;
2227 return false;
2228}
2229
2230/// We have just read the // characters from input. Skip until we find the
2231/// newline character that terminates the comment. Then update BufferPtr and
2232/// return.
2233///
2234/// If we're in KeepCommentMode or any CommentHandler has inserted
2235/// some tokens, this will store the first token and return true.
2236bool Lexer::SkipLineComment(Token &Result, const char *CurPtr,
2237 bool &TokAtPhysicalStartOfLine) {
2238 // If Line comments aren't explicitly enabled for this language, emit an
2239 // extension warning.
2240 if (!LangOpts.LineComment && !isLexingRawMode()) {
2241 Diag(BufferPtr, diag::ext_line_comment);
2242
2243 // Mark them enabled so we only emit one warning for this translation
2244 // unit.
2245 LangOpts.LineComment = true;
2246 }
2247
2248 // Scan over the body of the comment. The common case, when scanning, is that
2249 // the comment contains normal ascii characters with nothing interesting in
2250 // them. As such, optimize for this case with the inner loop.
2251 //
2252 // This loop terminates with CurPtr pointing at the newline (or end of buffer)
2253 // character that ends the line comment.
2254 char C;
2255 while (true) {
2256 C = *CurPtr;
2257 // Skip over characters in the fast loop.
2258 while (C != 0 && // Potentially EOF.
2259 C != '\n' && C != '\r') // Newline or DOS-style newline.
2260 C = *++CurPtr;
2261
2262 const char *NextLine = CurPtr;
2263 if (C != 0) {
2264 // We found a newline, see if it's escaped.
2265 const char *EscapePtr = CurPtr-1;
2266 bool HasSpace = false;
2267 while (isHorizontalWhitespace(*EscapePtr)) { // Skip whitespace.
2268 --EscapePtr;
2269 HasSpace = true;
2270 }
2271
2272 if (*EscapePtr == '\\')
2273 // Escaped newline.
2274 CurPtr = EscapePtr;
2275 else if (EscapePtr[0] == '/' && EscapePtr[-1] == '?' &&
2276 EscapePtr[-2] == '?' && LangOpts.Trigraphs)
2277 // Trigraph-escaped newline.
2278 CurPtr = EscapePtr-2;
2279 else
2280 break; // This is a newline, we're done.
2281
2282 // If there was space between the backslash and newline, warn about it.
2283 if (HasSpace && !isLexingRawMode())
2284 Diag(EscapePtr, diag::backslash_newline_space);
2285 }
2286
2287 // Otherwise, this is a hard case. Fall back on getAndAdvanceChar to
2288 // properly decode the character. Read it in raw mode to avoid emitting
2289 // diagnostics about things like trigraphs. If we see an escaped newline,
2290 // we'll handle it below.
2291 const char *OldPtr = CurPtr;
2292 bool OldRawMode = isLexingRawMode();
2293 LexingRawMode = true;
2294 C = getAndAdvanceChar(CurPtr, Result);
2295 LexingRawMode = OldRawMode;
2296
2297 // If we only read only one character, then no special handling is needed.
2298 // We're done and can skip forward to the newline.
2299 if (C != 0 && CurPtr == OldPtr+1) {
2300 CurPtr = NextLine;
2301 break;
2302 }
2303
2304 // If we read multiple characters, and one of those characters was a \r or
2305 // \n, then we had an escaped newline within the comment. Emit diagnostic
2306 // unless the next line is also a // comment.
2307 if (CurPtr != OldPtr + 1 && C != '/' &&
2308 (CurPtr == BufferEnd + 1 || CurPtr[0] != '/')) {
2309 for (; OldPtr != CurPtr; ++OldPtr)
2310 if (OldPtr[0] == '\n' || OldPtr[0] == '\r') {
2311 // Okay, we found a // comment that ends in a newline, if the next
2312 // line is also a // comment, but has spaces, don't emit a diagnostic.
2313 if (isWhitespace(C)) {
2314 const char *ForwardPtr = CurPtr;
2315 while (isWhitespace(*ForwardPtr)) // Skip whitespace.
2316 ++ForwardPtr;
2317 if (ForwardPtr[0] == '/' && ForwardPtr[1] == '/')
2318 break;
2319 }
2320
2321 if (!isLexingRawMode())
2322 Diag(OldPtr-1, diag::ext_multi_line_line_comment);
2323 break;
2324 }
2325 }
2326
2327 if (C == '\r' || C == '\n' || CurPtr == BufferEnd + 1) {
2328 --CurPtr;
2329 break;
2330 }
2331
2332 if (C == '\0' && isCodeCompletionPoint(CurPtr-1)) {
2333 PP->CodeCompleteNaturalLanguage();
2334 cutOffLexing();
2335 return false;
2336 }
2337 }
2338
2339 // Found but did not consume the newline. Notify comment handlers about the
2340 // comment unless we're in a #if 0 block.
2341 if (PP && !isLexingRawMode() &&
2342 PP->HandleComment(Result, SourceRange(getSourceLocation(BufferPtr),
2343 getSourceLocation(CurPtr)))) {
2344 BufferPtr = CurPtr;
2345 return true; // A token has to be returned.
2346 }
2347
2348 // If we are returning comments as tokens, return this comment as a token.
2349 if (inKeepCommentMode())
2350 return SaveLineComment(Result, CurPtr);
2351
2352 // If we are inside a preprocessor directive and we see the end of line,
2353 // return immediately, so that the lexer can return this as an EOD token.
2354 if (ParsingPreprocessorDirective || CurPtr == BufferEnd) {
2355 BufferPtr = CurPtr;
2356 return false;
2357 }
2358
2359 // Otherwise, eat the \n character. We don't care if this is a \n\r or
2360 // \r\n sequence. This is an efficiency hack (because we know the \n can't
2361 // contribute to another token), it isn't needed for correctness. Note that
2362 // this is ok even in KeepWhitespaceMode, because we would have returned the
2363 /// comment above in that mode.
2364 ++CurPtr;
2365
2366 // The next returned token is at the start of the line.
2367 Result.setFlag(Token::StartOfLine);
2368 TokAtPhysicalStartOfLine = true;
2369 // No leading whitespace seen so far.
2370 Result.clearFlag(Token::LeadingSpace);
2371 BufferPtr = CurPtr;
2372 return false;
2373}
2374
2375/// If in save-comment mode, package up this Line comment in an appropriate
2376/// way and return it.
2377bool Lexer::SaveLineComment(Token &Result, const char *CurPtr) {
2378 // If we're not in a preprocessor directive, just return the // comment
2379 // directly.
2380 FormTokenWithChars(Result, CurPtr, tok::comment);
2381
2382 if (!ParsingPreprocessorDirective || LexingRawMode)
2383 return true;
2384
2385 // If this Line-style comment is in a macro definition, transmogrify it into
2386 // a C-style block comment.
2387 bool Invalid = false;
2388 std::string Spelling = PP->getSpelling(Result, &Invalid);
2389 if (Invalid)
2390 return true;
2391
2392 assert(Spelling[0] == '/' && Spelling[1] == '/' && "Not line comment?")((Spelling[0] == '/' && Spelling[1] == '/' &&
"Not line comment?") ? static_cast<void> (0) : __assert_fail
("Spelling[0] == '/' && Spelling[1] == '/' && \"Not line comment?\""
, "/build/llvm-toolchain-snapshot-9~svn361465/tools/clang/lib/Lex/Lexer.cpp"
, 2392, __PRETTY_FUNCTION__))
;
2393 Spelling[1] = '*'; // Change prefix to "/*".
2394 Spelling += "*/"; // add suffix.
2395
2396 Result.setKind(tok::comment);
2397 PP->CreateString(Spelling, Result,
2398 Result.getLocation(), Result.getLocation());
2399 return true;
2400}
2401
2402/// isBlockCommentEndOfEscapedNewLine - Return true if the specified newline
2403/// character (either \\n or \\r) is part of an escaped newline sequence. Issue
2404/// a diagnostic if so. We know that the newline is inside of a block comment.
2405static bool isEndOfBlockCommentWithEscapedNewLine(const char *CurPtr,
2406 Lexer *L) {
2407 assert(CurPtr[0] == '\n' || CurPtr[0] == '\r')((CurPtr[0] == '\n' || CurPtr[0] == '\r') ? static_cast<void
> (0) : __assert_fail ("CurPtr[0] == '\\n' || CurPtr[0] == '\\r'"
, "/build/llvm-toolchain-snapshot-9~svn361465/tools/clang/lib/Lex/Lexer.cpp"
, 2407, __PRETTY_FUNCTION__))
;
2408
2409 // Back up off the newline.
2410 --CurPtr;
2411
2412 // If this is a two-character newline sequence, skip the other character.
2413 if (CurPtr[0] == '\n' || CurPtr[0] == '\r') {
2414 // \n\n or \r\r -> not escaped newline.
2415 if (CurPtr[0] == CurPtr[1])
2416 return false;
2417 // \n\r or \r\n -> skip the newline.
2418 --CurPtr;
2419 }
2420
2421 // If we have horizontal whitespace, skip over it. We allow whitespace
2422 // between the slash and newline.
2423 bool HasSpace = false;
2424 while (isHorizontalWhitespace(*CurPtr) || *CurPtr == 0) {
2425 --CurPtr;
2426 HasSpace = true;
2427 }
2428
2429 // If we have a slash, we know this is an escaped newline.
2430 if (*CurPtr == '\\') {
2431 if (CurPtr[-1] != '*') return false;
2432 } else {
2433 // It isn't a slash, is it the ?? / trigraph?
2434 if (CurPtr[0] != '/' || CurPtr[-1] != '?' || CurPtr[-2] != '?' ||
2435 CurPtr[-3] != '*')
2436 return false;
2437
2438 // This is the trigraph ending the comment. Emit a stern warning!
2439 CurPtr -= 2;
2440
2441 // If no trigraphs are enabled, warn that we ignored this trigraph and
2442 // ignore this * character.
2443 if (!L->getLangOpts().Trigraphs) {
2444 if (!L->isLexingRawMode())
2445 L->Diag(CurPtr, diag::trigraph_ignored_block_comment);
2446 return false;
2447 }
2448 if (!L->isLexingRawMode())
2449 L->Diag(CurPtr, diag::trigraph_ends_block_comment);
2450 }
2451
2452 // Warn about having an escaped newline between the */ characters.
2453 if (!L->isLexingRawMode())
2454 L->Diag(CurPtr, diag::escaped_newline_block_comment_end);
2455
2456 // If there was space between the backslash and newline, warn about it.
2457 if (HasSpace && !L->isLexingRawMode())
2458 L->Diag(CurPtr, diag::backslash_newline_space);
2459
2460 return true;
2461}
2462
#ifdef __SSE2__
#include <emmintrin.h>
#elif __ALTIVEC__
#include <altivec.h>
#undef bool
#endif
2469
2470/// We have just read from input the / and * characters that started a comment.
2471/// Read until we find the * and / characters that terminate the comment.
2472/// Note that we don't bother decoding trigraphs or escaped newlines in block
2473/// comments, because they cannot cause the comment to end. The only thing
2474/// that can happen is the comment could end with an escaped newline between
2475/// the terminating * and /.
2476///
2477/// If we're in KeepCommentMode or any CommentHandler has inserted
2478/// some tokens, this will store the first token and return true.
2479bool Lexer::SkipBlockComment(Token &Result, const char *CurPtr,
2480 bool &TokAtPhysicalStartOfLine) {
2481 // Scan one character past where we should, looking for a '/' character. Once
2482 // we find it, check to see if it was preceded by a *. This common
2483 // optimization helps people who like to put a lot of * characters in their
2484 // comments.
2485
2486 // The first character we get with newlines and trigraphs skipped to handle
2487 // the degenerate /*/ case below correctly if the * has an escaped newline
2488 // after it.
2489 unsigned CharSize;
2490 unsigned char C = getCharAndSize(CurPtr, CharSize);
2491 CurPtr += CharSize;
2492 if (C == 0 && CurPtr == BufferEnd+1) {
2493 if (!isLexingRawMode())
2494 Diag(BufferPtr, diag::err_unterminated_block_comment);
2495 --CurPtr;
2496
2497 // KeepWhitespaceMode should return this broken comment as a token. Since
2498 // it isn't a well formed comment, just return it as an 'unknown' token.
2499 if (isKeepWhitespaceMode()) {
2500 FormTokenWithChars(Result, CurPtr, tok::unknown);
2501 return true;
2502 }
2503
2504 BufferPtr = CurPtr;
2505 return false;
2506 }
2507
2508 // Check to see if the first character after the '/*' is another /. If so,
2509 // then this slash does not end the block comment, it is part of it.
2510 if (C == '/')
2511 C = *CurPtr++;
2512
2513 while (true) {
2514 // Skip over all non-interesting characters until we find end of buffer or a
2515 // (probably ending) '/' character.
2516 if (CurPtr + 24 < BufferEnd &&
2517 // If there is a code-completion point avoid the fast scan because it
2518 // doesn't check for '\0'.
2519 !(PP && PP->getCodeCompletionFileLoc() == FileLoc)) {
2520 // While not aligned to a 16-byte boundary.
2521 while (C != '/' && ((intptr_t)CurPtr & 0x0F) != 0)
2522 C = *CurPtr++;
2523
2524 if (C == '/') goto FoundSlash;
2525
2526#ifdef __SSE2__1
2527 __m128i Slashes = _mm_set1_epi8('/');
2528 while (CurPtr+16 <= BufferEnd) {
2529 int cmp = _mm_movemask_epi8(_mm_cmpeq_epi8(*(const __m128i*)CurPtr,
2530 Slashes));
2531 if (cmp != 0) {
2532 // Adjust the pointer to point directly after the first slash. It's
2533 // not necessary to set C here, it will be overwritten at the end of
2534 // the outer loop.
2535 CurPtr += llvm::countTrailingZeros<unsigned>(cmp) + 1;
2536 goto FoundSlash;
2537 }
2538 CurPtr += 16;
2539 }
2540#elif __ALTIVEC__
2541 __vector unsigned char Slashes = {
2542 '/', '/', '/', '/', '/', '/', '/', '/',
2543 '/', '/', '/', '/', '/', '/', '/', '/'
2544 };
2545 while (CurPtr+16 <= BufferEnd &&
2546 !vec_any_eq(*(const vector unsigned char*)CurPtr, Slashes))
2547 CurPtr += 16;
2548#else
2549 // Scan for '/' quickly. Many block comments are very large.
2550 while (CurPtr[0] != '/' &&
2551 CurPtr[1] != '/' &&
2552 CurPtr[2] != '/' &&
2553 CurPtr[3] != '/' &&
2554 CurPtr+4 < BufferEnd) {
2555 CurPtr += 4;
2556 }
2557#endif
2558
2559 // It has to be one of the bytes scanned, increment to it and read one.
2560 C = *CurPtr++;
2561 }
2562
2563 // Loop to scan the remainder.
2564 while (C != '/' && C != '\0')
2565 C = *CurPtr++;
2566
2567 if (C == '/') {
2568 FoundSlash:
2569 if (CurPtr[-2] == '*') // We found the final */. We're done!
2570 break;
2571
2572 if ((CurPtr[-2] == '\n' || CurPtr[-2] == '\r')) {
2573 if (isEndOfBlockCommentWithEscapedNewLine(CurPtr-2, this)) {
2574 // We found the final */, though it had an escaped newline between the
2575 // * and /. We're done!
2576 break;
2577 }
2578 }
2579 if (CurPtr[0] == '*' && CurPtr[1] != '/') {
2580 // If this is a /* inside of the comment, emit a warning. Don't do this
2581 // if this is a /*/, which will end the comment. This misses cases with
2582 // embedded escaped newlines, but oh well.
2583 if (!isLexingRawMode())
2584 Diag(CurPtr-1, diag::warn_nested_block_comment);
2585 }
2586 } else if (C == 0 && CurPtr == BufferEnd+1) {
2587 if (!isLexingRawMode())
2588 Diag(BufferPtr, diag::err_unterminated_block_comment);
2589 // Note: the user probably forgot a */. We could continue immediately
2590 // after the /*, but this would involve lexing a lot of what really is the
2591 // comment, which surely would confuse the parser.
2592 --CurPtr;
2593
2594 // KeepWhitespaceMode should return this broken comment as a token. Since
2595 // it isn't a well formed comment, just return it as an 'unknown' token.
2596 if (isKeepWhitespaceMode()) {
2597 FormTokenWithChars(Result, CurPtr, tok::unknown);
2598 return true;
2599 }
2600
2601 BufferPtr = CurPtr;
2602 return false;
2603 } else if (C == '\0' && isCodeCompletionPoint(CurPtr-1)) {
2604 PP->CodeCompleteNaturalLanguage();
2605 cutOffLexing();
2606 return false;
2607 }
2608
2609 C = *CurPtr++;
2610 }
2611
2612 // Notify comment handlers about the comment unless we're in a #if 0 block.
2613 if (PP && !isLexingRawMode() &&
2614 PP->HandleComment(Result, SourceRange(getSourceLocation(BufferPtr),
2615 getSourceLocation(CurPtr)))) {
2616 BufferPtr = CurPtr;
2617 return true; // A token has to be returned.
2618 }
2619
2620 // If we are returning comments as tokens, return this comment as a token.
2621 if (inKeepCommentMode()) {
2622 FormTokenWithChars(Result, CurPtr, tok::comment);
2623 return true;
2624 }
2625
2626 // It is common for the tokens immediately after a /**/ comment to be
2627 // whitespace. Instead of going through the big switch, handle it
2628 // efficiently now. This is safe even in KeepWhitespaceMode because we would
2629 // have already returned above with the comment as a token.
2630 if (isHorizontalWhitespace(*CurPtr)) {
2631 SkipWhitespace(Result, CurPtr+1, TokAtPhysicalStartOfLine);
2632 return false;
2633 }
2634
2635 // Otherwise, just return so that the next character will be lexed as a token.
2636 BufferPtr = CurPtr;
2637 Result.setFlag(Token::LeadingSpace);
2638 return false;
2639}
2640
2641//===----------------------------------------------------------------------===//
2642// Primary Lexing Entry Points
2643//===----------------------------------------------------------------------===//
2644
2645/// ReadToEndOfLine - Read the rest of the current preprocessor line as an
2646/// uninterpreted string. This switches the lexer out of directive mode.
2647void Lexer::ReadToEndOfLine(SmallVectorImpl<char> *Result) {
2648 assert(ParsingPreprocessorDirective && ParsingFilename == false &&((ParsingPreprocessorDirective && ParsingFilename == false
&& "Must be in a preprocessing directive!") ? static_cast
<void> (0) : __assert_fail ("ParsingPreprocessorDirective && ParsingFilename == false && \"Must be in a preprocessing directive!\""
, "/build/llvm-toolchain-snapshot-9~svn361465/tools/clang/lib/Lex/Lexer.cpp"
, 2649, __PRETTY_FUNCTION__))
1
Assuming the condition is true
2
Assuming the condition is true
3
'?' condition is true
2649 "Must be in a preprocessing directive!")((ParsingPreprocessorDirective && ParsingFilename == false
&& "Must be in a preprocessing directive!") ? static_cast
<void> (0) : __assert_fail ("ParsingPreprocessorDirective && ParsingFilename == false && \"Must be in a preprocessing directive!\""
, "/build/llvm-toolchain-snapshot-9~svn361465/tools/clang/lib/Lex/Lexer.cpp"
, 2649, __PRETTY_FUNCTION__))
;
2650 Token Tmp;
2651
2652 // CurPtr - Cache BufferPtr in an automatic variable.
2653 const char *CurPtr = BufferPtr;
2654 while (true) {
4
Loop condition is true. Entering loop body
2655 char Char = getAndAdvanceChar(CurPtr, Tmp);
5
Calling 'Lexer::getAndAdvanceChar'
2656 switch (Char) {
2657 default:
2658 if (Result)
2659 Result->push_back(Char);
2660 break;
2661 case 0: // Null.
2662 // Found end of file?
2663 if (CurPtr-1 != BufferEnd) {
2664 if (isCodeCompletionPoint(CurPtr-1)) {
2665 PP->CodeCompleteNaturalLanguage();
2666 cutOffLexing();
2667 return;
2668 }
2669
2670 // Nope, normal character, continue.
2671 if (Result)
2672 Result->push_back(Char);
2673 break;
2674 }
2675 // FALL THROUGH.
2676 LLVM_FALLTHROUGH[[clang::fallthrough]];
2677 case '\r':
2678 case '\n':
2679 // Okay, we found the end of the line. First, back up past the \0, \r, \n.
2680 assert(CurPtr[-1] == Char && "Trigraphs for newline?")((CurPtr[-1] == Char && "Trigraphs for newline?") ? static_cast
<void> (0) : __assert_fail ("CurPtr[-1] == Char && \"Trigraphs for newline?\""
, "/build/llvm-toolchain-snapshot-9~svn361465/tools/clang/lib/Lex/Lexer.cpp"
, 2680, __PRETTY_FUNCTION__))
;
2681 BufferPtr = CurPtr-1;
2682
2683 // Next, lex the character, which should handle the EOD transition.
2684 Lex(Tmp);
2685 if (Tmp.is(tok::code_completion)) {
2686 if (PP)
2687 PP->CodeCompleteNaturalLanguage();
2688 Lex(Tmp);
2689 }
2690 assert(Tmp.is(tok::eod) && "Unexpected token!")((Tmp.is(tok::eod) && "Unexpected token!") ? static_cast
<void> (0) : __assert_fail ("Tmp.is(tok::eod) && \"Unexpected token!\""
, "/build/llvm-toolchain-snapshot-9~svn361465/tools/clang/lib/Lex/Lexer.cpp"
, 2690, __PRETTY_FUNCTION__))
;
2691
2692 // Finally, we're done;
2693 return;
2694 }
2695 }
2696}
2697
2698/// LexEndOfFile - CurPtr points to the end of this file. Handle this
2699/// condition, reporting diagnostics and handling other edge cases as required.
2700/// This returns true if Result contains a token, false if PP.Lex should be
2701/// called again.
2702bool Lexer::LexEndOfFile(Token &Result, const char *CurPtr) {
2703 // If we hit the end of the file while parsing a preprocessor directive,
2704 // end the preprocessor directive first. The next token returned will
2705 // then be the end of file.
2706 if (ParsingPreprocessorDirective) {
2707 // Done parsing the "line".
2708 ParsingPreprocessorDirective = false;
2709 // Update the location of token as well as BufferPtr.
2710 FormTokenWithChars(Result, CurPtr, tok::eod);
2711
2712 // Restore comment saving mode, in case it was disabled for directive.
2713 if (PP)
2714 resetExtendedTokenMode();
2715 return true; // Have a token.
2716 }
2717
2718 // If we are in raw mode, return this event as an EOF token. Let the caller
2719 // that put us in raw mode handle the event.
2720 if (isLexingRawMode()) {
2721 Result.startToken();
2722 BufferPtr = BufferEnd;
2723 FormTokenWithChars(Result, BufferEnd, tok::eof);
2724 return true;
2725 }
2726
2727 if (PP->isRecordingPreamble() && PP->isInPrimaryFile()) {
2728 PP->setRecordedPreambleConditionalStack(ConditionalStack);
2729 ConditionalStack.clear();
2730 }
2731
2732 // Issue diagnostics for unterminated #if and missing newline.
2733
2734 // If we are in a #if directive, emit an error.
2735 while (!ConditionalStack.empty()) {
2736 if (PP->getCodeCompletionFileLoc() != FileLoc)
2737 PP->Diag(ConditionalStack.back().IfLoc,
2738 diag::err_pp_unterminated_conditional);
2739 ConditionalStack.pop_back();
2740 }
2741
2742 // C99 5.1.1.2p2: If the file is non-empty and didn't end in a newline, issue
2743 // a pedwarn.
2744 if (CurPtr != BufferStart && (CurPtr[-1] != '\n' && CurPtr[-1] != '\r')) {
2745 DiagnosticsEngine &Diags = PP->getDiagnostics();
2746 SourceLocation EndLoc = getSourceLocation(BufferEnd);
2747 unsigned DiagID;
2748
2749 if (LangOpts.CPlusPlus11) {
2750 // C++11 [lex.phases] 2.2 p2
2751 // Prefer the C++98 pedantic compatibility warning over the generic,
2752 // non-extension, user-requested "missing newline at EOF" warning.
2753 if (!Diags.isIgnored(diag::warn_cxx98_compat_no_newline_eof, EndLoc)) {
2754 DiagID = diag::warn_cxx98_compat_no_newline_eof;
2755 } else {
2756 DiagID = diag::warn_no_newline_eof;
2757 }
2758 } else {
2759 DiagID = diag::ext_no_newline_eof;
2760 }
2761
2762 Diag(BufferEnd, DiagID)
2763 << FixItHint::CreateInsertion(EndLoc, "\n");
2764 }
2765
2766 BufferPtr = CurPtr;
2767
2768 // Finally, let the preprocessor handle this.
2769 return PP->HandleEndOfFile(Result, isPragmaLexer());
2770}
2771
2772/// isNextPPTokenLParen - Return 1 if the next unexpanded token lexed from
2773/// the specified lexer will return a tok::l_paren token, 0 if it is something
2774/// else and 2 if there are no more tokens in the buffer controlled by the
2775/// lexer.
2776unsigned Lexer::isNextPPTokenLParen() {
2777 assert(!LexingRawMode && "How can we expand a macro from a skipping buffer?")((!LexingRawMode && "How can we expand a macro from a skipping buffer?"
) ? static_cast<void> (0) : __assert_fail ("!LexingRawMode && \"How can we expand a macro from a skipping buffer?\""
, "/build/llvm-toolchain-snapshot-9~svn361465/tools/clang/lib/Lex/Lexer.cpp"
, 2777, __PRETTY_FUNCTION__))
;
2778
2779 // Switch to 'skipping' mode. This will ensure that we can lex a token
2780 // without emitting diagnostics, disables macro expansion, and will cause EOF
2781 // to return an EOF token instead of popping the include stack.
2782 LexingRawMode = true;
2783
2784 // Save state that can be changed while lexing so that we can restore it.
2785 const char *TmpBufferPtr = BufferPtr;
2786 bool inPPDirectiveMode = ParsingPreprocessorDirective;
2787 bool atStartOfLine = IsAtStartOfLine;
2788 bool atPhysicalStartOfLine = IsAtPhysicalStartOfLine;
2789 bool leadingSpace = HasLeadingSpace;
2790
2791 Token Tok;
2792 Lex(Tok);
2793
2794 // Restore state that may have changed.
2795 BufferPtr = TmpBufferPtr;
2796 ParsingPreprocessorDirective = inPPDirectiveMode;
2797 HasLeadingSpace = leadingSpace;
2798 IsAtStartOfLine = atStartOfLine;
2799 IsAtPhysicalStartOfLine = atPhysicalStartOfLine;
2800
2801 // Restore the lexer back to non-skipping mode.
2802 LexingRawMode = false;
2803
2804 if (Tok.is(tok::eof))
2805 return 2;
2806 return Tok.is(tok::l_paren);
2807}
2808
2809/// Find the end of a version control conflict marker.
2810static const char *FindConflictEnd(const char *CurPtr, const char *BufferEnd,
2811 ConflictMarkerKind CMK) {
2812 const char *Terminator = CMK == CMK_Perforce ? "<<<<\n" : ">>>>>>>";
2813 size_t TermLen = CMK == CMK_Perforce ? 5 : 7;
2814 auto RestOfBuffer = StringRef(CurPtr, BufferEnd - CurPtr).substr(TermLen);
2815 size_t Pos = RestOfBuffer.find(Terminator);
2816 while (Pos != StringRef::npos) {
2817 // Must occur at start of line.
2818 if (Pos == 0 ||
2819 (RestOfBuffer[Pos - 1] != '\r' && RestOfBuffer[Pos - 1] != '\n')) {
2820 RestOfBuffer = RestOfBuffer.substr(Pos+TermLen);
2821 Pos = RestOfBuffer.find(Terminator);
2822 continue;
2823 }
2824 return RestOfBuffer.data()+Pos;
2825 }
2826 return nullptr;
2827}
2828
2829/// IsStartOfConflictMarker - If the specified pointer is the start of a version
2830/// control conflict marker like '<<<<<<<', recognize it as such, emit an error
2831/// and recover nicely. This returns true if it is a conflict marker and false
2832/// if not.
2833bool Lexer::IsStartOfConflictMarker(const char *CurPtr) {
2834 // Only a conflict marker if it starts at the beginning of a line.
2835 if (CurPtr != BufferStart &&
2836 CurPtr[-1] != '\n' && CurPtr[-1] != '\r')
2837 return false;
2838
2839 // Check to see if we have <<<<<<< or >>>>.
2840 if (!StringRef(CurPtr, BufferEnd - CurPtr).startswith("<<<<<<<") &&
2841 !StringRef(CurPtr, BufferEnd - CurPtr).startswith(">>>> "))
2842 return false;
2843
2844 // If we have a situation where we don't care about conflict markers, ignore
2845 // it.
2846 if (CurrentConflictMarkerState || isLexingRawMode())
2847 return false;
2848
2849 ConflictMarkerKind Kind = *CurPtr == '<' ? CMK_Normal : CMK_Perforce;
2850
2851 // Check to see if there is an ending marker somewhere in the buffer at the
2852 // start of a line to terminate this conflict marker.
2853 if (FindConflictEnd(CurPtr, BufferEnd, Kind)) {
2854 // We found a match. We are really in a conflict marker.
2855 // Diagnose this, and ignore to the end of line.
2856 Diag(CurPtr, diag::err_conflict_marker);
2857 CurrentConflictMarkerState = Kind;
2858
2859 // Skip ahead to the end of line. We know this exists because the
2860 // end-of-conflict marker starts with \r or \n.
2861 while (*CurPtr != '\r' && *CurPtr != '\n') {
2862 assert(CurPtr != BufferEnd && "Didn't find end of line")((CurPtr != BufferEnd && "Didn't find end of line") ?
static_cast<void> (0) : __assert_fail ("CurPtr != BufferEnd && \"Didn't find end of line\""
, "/build/llvm-toolchain-snapshot-9~svn361465/tools/clang/lib/Lex/Lexer.cpp"
, 2862, __PRETTY_FUNCTION__))
;
2863 ++CurPtr;
2864 }
2865 BufferPtr = CurPtr;
2866 return true;
2867 }
2868
2869 // No end of conflict marker found.
2870 return false;
2871}
2872
2873/// HandleEndOfConflictMarker - If this is a '====' or '||||' or '>>>>', or if
2874/// it is '<<<<' and the conflict marker started with a '>>>>' marker, then it
2875/// is the end of a conflict marker. Handle it by ignoring up until the end of
2876/// the line. This returns true if it is a conflict marker and false if not.
2877bool Lexer::HandleEndOfConflictMarker(const char *CurPtr) {
2878 // Only a conflict marker if it starts at the beginning of a line.
2879 if (CurPtr != BufferStart &&
2880 CurPtr[-1] != '\n' && CurPtr[-1] != '\r')
2881 return false;
2882
2883 // If we have a situation where we don't care about conflict markers, ignore
2884 // it.
2885 if (!CurrentConflictMarkerState || isLexingRawMode())
2886 return false;
2887
2888 // Check to see if we have the marker (4 characters in a row).
2889 for (unsigned i = 1; i != 4; ++i)
2890 if (CurPtr[i] != CurPtr[0])
2891 return false;
2892
2893 // If we do have it, search for the end of the conflict marker. This could
2894 // fail if it got skipped with a '#if 0' or something. Note that CurPtr might
2895 // be the end of conflict marker.
2896 if (const char *End = FindConflictEnd(CurPtr, BufferEnd,
2897 CurrentConflictMarkerState)) {
2898 CurPtr = End;
2899
2900 // Skip ahead to the end of line.
2901 while (CurPtr != BufferEnd && *CurPtr != '\r' && *CurPtr != '\n')
2902 ++CurPtr;
2903
2904 BufferPtr = CurPtr;
2905
2906 // No longer in the conflict marker.
2907 CurrentConflictMarkerState = CMK_None;
2908 return true;
2909 }
2910
2911 return false;
2912}
2913
/// Scan for the "#>" sequence that terminates an editor placeholder; return
/// the pointer just past it, or null if the buffer ends first.
static const char *findPlaceholderEnd(const char *CurPtr,
                                      const char *BufferEnd) {
  if (CurPtr == BufferEnd)
    return nullptr;
  // Stop at the second-to-last character so Ptr[1] stays in bounds.
  for (const char *Ptr = CurPtr, *Last = BufferEnd - 1; Ptr != Last; ++Ptr) {
    if (Ptr[0] == '#' && Ptr[1] == '>')
      return Ptr + 2;
  }
  return nullptr;
}
2925
2926bool Lexer::lexEditorPlaceholder(Token &Result, const char *CurPtr) {
2927 assert(CurPtr[-1] == '<' && CurPtr[0] == '#' && "Not a placeholder!")((CurPtr[-1] == '<' && CurPtr[0] == '#' &&
"Not a placeholder!") ? static_cast<void> (0) : __assert_fail
("CurPtr[-1] == '<' && CurPtr[0] == '#' && \"Not a placeholder!\""
, "/build/llvm-toolchain-snapshot-9~svn361465/tools/clang/lib/Lex/Lexer.cpp"
, 2927, __PRETTY_FUNCTION__))
;
2928 if (!PP || !PP->getPreprocessorOpts().LexEditorPlaceholders || LexingRawMode)
2929 return false;
2930 const char *End = findPlaceholderEnd(CurPtr + 1, BufferEnd);
2931 if (!End)
2932 return false;
2933 const char *Start = CurPtr - 1;
2934 if (!LangOpts.AllowEditorPlaceholders)
2935 Diag(Start, diag::err_placeholder_in_source);
2936 Result.startToken();
2937 FormTokenWithChars(Result, End, tok::raw_identifier);
2938 Result.setRawIdentifierData(Start);
2939 PP->LookUpIdentifierInfo(Result);
2940 Result.setFlag(Token::IsEditorPlaceholder);
2941 BufferPtr = End;
2942 return true;
2943}
2944
2945bool Lexer::isCodeCompletionPoint(const char *CurPtr) const {
2946 if (PP && PP->isCodeCompletionEnabled()) {
2947 SourceLocation Loc = FileLoc.getLocWithOffset(CurPtr-BufferStart);
2948 return Loc == PP->getCodeCompletionLoc();
2949 }
2950
2951 return false;
2952}
2953
2954uint32_t Lexer::tryReadUCN(const char *&StartPtr, const char *SlashLoc,
2955 Token *Result) {
2956 unsigned CharSize;
2957 char Kind = getCharAndSize(StartPtr, CharSize);
2958
2959 unsigned NumHexDigits;
2960 if (Kind == 'u')
2961 NumHexDigits = 4;
2962 else if (Kind == 'U')
2963 NumHexDigits = 8;
2964 else
2965 return 0;
2966
2967 if (!LangOpts.CPlusPlus && !LangOpts.C99) {
2968 if (Result && !isLexingRawMode())
2969 Diag(SlashLoc, diag::warn_ucn_not_valid_in_c89);
2970 return 0;
2971 }
2972
2973 const char *CurPtr = StartPtr + CharSize;
2974 const char *KindLoc = &CurPtr[-1];
2975
2976 uint32_t CodePoint = 0;
2977 for (unsigned i = 0; i < NumHexDigits; ++i) {
2978 char C = getCharAndSize(CurPtr, CharSize);
2979
2980 unsigned Value = llvm::hexDigitValue(C);
2981 if (Value == -1U) {
2982 if (Result && !isLexingRawMode()) {
2983 if (i == 0) {
2984 Diag(BufferPtr, diag::warn_ucn_escape_no_digits)
2985 << StringRef(KindLoc, 1);
2986 } else {
2987 Diag(BufferPtr, diag::warn_ucn_escape_incomplete);
2988
2989 // If the user wrote \U1234, suggest a fixit to \u.
2990 if (i == 4 && NumHexDigits == 8) {
2991 CharSourceRange URange = makeCharRange(*this, KindLoc, KindLoc + 1);
2992 Diag(KindLoc, diag::note_ucn_four_not_eight)
2993 << FixItHint::CreateReplacement(URange, "u");
2994 }
2995 }
2996 }
2997
2998 return 0;
2999 }
3000
3001 CodePoint <<= 4;
3002 CodePoint += Value;
3003
3004 CurPtr += CharSize;
3005 }
3006
3007 if (Result) {
3008 Result->setFlag(Token::HasUCN);
3009 if (CurPtr - StartPtr == (ptrdiff_t)NumHexDigits + 2)
3010 StartPtr = CurPtr;
3011 else
3012 while (StartPtr != CurPtr)
3013 (void)getAndAdvanceChar(StartPtr, *Result);
3014 } else {
3015 StartPtr = CurPtr;
3016 }
3017
3018 // Don't apply C family restrictions to UCNs in assembly mode
3019 if (LangOpts.AsmPreprocessor)
3020 return CodePoint;
3021
3022 // C99 6.4.3p2: A universal character name shall not specify a character whose
3023 // short identifier is less than 00A0 other than 0024 ($), 0040 (@), or
3024 // 0060 (`), nor one in the range D800 through DFFF inclusive.)
3025 // C++11 [lex.charset]p2: If the hexadecimal value for a
3026 // universal-character-name corresponds to a surrogate code point (in the
3027 // range 0xD800-0xDFFF, inclusive), the program is ill-formed. Additionally,
3028 // if the hexadecimal value for a universal-character-name outside the
3029 // c-char-sequence, s-char-sequence, or r-char-sequence of a character or
3030 // string literal corresponds to a control character (in either of the
3031 // ranges 0x00-0x1F or 0x7F-0x9F, both inclusive) or to a character in the
3032 // basic source character set, the program is ill-formed.
3033 if (CodePoint < 0xA0) {
3034 if (CodePoint == 0x24 || CodePoint == 0x40 || CodePoint == 0x60)
3035 return CodePoint;
3036
3037 // We don't use isLexingRawMode() here because we need to warn about bad
3038 // UCNs even when skipping preprocessing tokens in a #if block.
3039 if (Result && PP) {
3040 if (CodePoint < 0x20 || CodePoint >= 0x7F)
3041 Diag(BufferPtr, diag::err_ucn_control_character);
3042 else {
3043 char C = static_cast<char>(CodePoint);
3044 Diag(BufferPtr, diag::err_ucn_escape_basic_scs) << StringRef(&C, 1);
3045 }
3046 }
3047
3048 return 0;
3049 } else if (CodePoint >= 0xD800 && CodePoint <= 0xDFFF) {
3050 // C++03 allows UCNs representing surrogate characters. C99 and C++11 don't.
3051 // We don't use isLexingRawMode() here because we need to diagnose bad
3052 // UCNs even when skipping preprocessing tokens in a #if block.
3053 if (Result && PP) {
3054 if (LangOpts.CPlusPlus && !LangOpts.CPlusPlus11)
3055 Diag(BufferPtr, diag::warn_ucn_escape_surrogate);
3056 else
3057 Diag(BufferPtr, diag::err_ucn_escape_invalid);
3058 }
3059 return 0;
3060 }
3061
3062 return CodePoint;
3063}
3064
3065bool Lexer::CheckUnicodeWhitespace(Token &Result, uint32_t C,
3066 const char *CurPtr) {
3067 static const llvm::sys::UnicodeCharSet UnicodeWhitespaceChars(
3068 UnicodeWhitespaceCharRanges);
3069 if (!isLexingRawMode() && !PP->isPreprocessedOutput() &&
3070 UnicodeWhitespaceChars.contains(C)) {
3071 Diag(BufferPtr, diag::ext_unicode_whitespace)
3072 << makeCharRange(*this, BufferPtr, CurPtr);
3073
3074 Result.setFlag(Token::LeadingSpace);
3075 return true;
3076 }
3077 return false;
3078}
3079
3080bool Lexer::LexUnicode(Token &Result, uint32_t C, const char *CurPtr) {
3081 if (isAllowedIDChar(C, LangOpts) && isAllowedInitiallyIDChar(C, LangOpts)) {
3082 if (!isLexingRawMode() && !ParsingPreprocessorDirective &&
3083 !PP->isPreprocessedOutput()) {
3084 maybeDiagnoseIDCharCompat(PP->getDiagnostics(), C,
3085 makeCharRange(*this, BufferPtr, CurPtr),
3086 /*IsFirst=*/true);
3087 maybeDiagnoseUTF8Homoglyph(PP->getDiagnostics(), C,
3088 makeCharRange(*this, BufferPtr, CurPtr));
3089 }
3090
3091 MIOpt.ReadToken();
3092 return LexIdentifier(Result, CurPtr);
3093 }
3094
3095 if (!isLexingRawMode() && !ParsingPreprocessorDirective &&
3096 !PP->isPreprocessedOutput() &&
3097 !isASCII(*BufferPtr) && !isAllowedIDChar(C, LangOpts)) {
3098 // Non-ASCII characters tend to creep into source code unintentionally.
3099 // Instead of letting the parser complain about the unknown token,
3100 // just drop the character.
3101 // Note that we can /only/ do this when the non-ASCII character is actually
3102 // spelled as Unicode, not written as a UCN. The standard requires that
3103 // we not throw away any possible preprocessor tokens, but there's a
3104 // loophole in the mapping of Unicode characters to basic character set
3105 // characters that allows us to map these particular characters to, say,
3106 // whitespace.
3107 Diag(BufferPtr, diag::err_non_ascii)
3108 << FixItHint::CreateRemoval(makeCharRange(*this, BufferPtr, CurPtr));
3109
3110 BufferPtr = CurPtr;
3111 return false;
3112 }
3113
3114 // Otherwise, we have an explicit UCN or a character that's unlikely to show
3115 // up by accident.
3116 MIOpt.ReadToken();
3117 FormTokenWithChars(Result, CurPtr, tok::unknown);
3118 return true;
3119}
3120
3121void Lexer::PropagateLineStartLeadingSpaceInfo(Token &Result) {
3122 IsAtStartOfLine = Result.isAtStartOfLine();
3123 HasLeadingSpace = Result.hasLeadingSpace();
3124 HasLeadingEmptyMacro = Result.hasLeadingEmptyMacro();
3125 // Note that this doesn't affect IsAtPhysicalStartOfLine.
3126}
3127
3128bool Lexer::Lex(Token &Result) {
3129 // Start a new token.
3130 Result.startToken();
3131
3132 // Set up misc whitespace flags for LexTokenInternal.
3133 if (IsAtStartOfLine) {
3134 Result.setFlag(Token::StartOfLine);
3135 IsAtStartOfLine = false;
3136 }
3137
3138 if (HasLeadingSpace) {
3139 Result.setFlag(Token::LeadingSpace);
3140 HasLeadingSpace = false;
3141 }
3142
3143 if (HasLeadingEmptyMacro) {
3144 Result.setFlag(Token::LeadingEmptyMacro);
3145 HasLeadingEmptyMacro = false;
3146 }
3147
3148 bool atPhysicalStartOfLine = IsAtPhysicalStartOfLine;
3149 IsAtPhysicalStartOfLine = false;
3150 bool isRawLex = isLexingRawMode();
3151 (void) isRawLex;
3152 bool returnedToken = LexTokenInternal(Result, atPhysicalStartOfLine);
3153 // (After the LexTokenInternal call, the lexer might be destroyed.)
3154 assert((returnedToken || !isRawLex) && "Raw lex must succeed")(((returnedToken || !isRawLex) && "Raw lex must succeed"
) ? static_cast<void> (0) : __assert_fail ("(returnedToken || !isRawLex) && \"Raw lex must succeed\""
, "/build/llvm-toolchain-snapshot-9~svn361465/tools/clang/lib/Lex/Lexer.cpp"
, 3154, __PRETTY_FUNCTION__))
;
3155 return returnedToken;
3156}
3157
3158/// LexTokenInternal - This implements a simple C family lexer. It is an
3159/// extremely performance critical piece of code. This assumes that the buffer
3160/// has a null character at the end of the file. This returns a preprocessing
3161/// token, not a normal token, as such, it is an internal interface. It assumes
3162/// that the Flags of result have been cleared before calling this.
3163bool Lexer::LexTokenInternal(Token &Result, bool TokAtPhysicalStartOfLine) {
3164LexNextToken:
3165 // New token, can't need cleaning yet.
3166 Result.clearFlag(Token::NeedsCleaning);
3167 Result.setIdentifierInfo(nullptr);
3168
3169 // CurPtr - Cache BufferPtr in an automatic variable.
3170 const char *CurPtr = BufferPtr;
3171
3172 // Small amounts of horizontal whitespace is very common between tokens.
3173 if ((*CurPtr == ' ') || (*CurPtr == '\t')) {
3174 ++CurPtr;
3175 while ((*CurPtr == ' ') || (*CurPtr == '\t'))
3176 ++CurPtr;
3177
3178 // If we are keeping whitespace and other tokens, just return what we just
3179 // skipped. The next lexer invocation will return the token after the
3180 // whitespace.
3181 if (isKeepWhitespaceMode()) {
3182 FormTokenWithChars(Result, CurPtr, tok::unknown);
3183 // FIXME: The next token will not have LeadingSpace set.
3184 return true;
3185 }
3186
3187 BufferPtr = CurPtr;
3188 Result.setFlag(Token::LeadingSpace);
3189 }
3190
3191 unsigned SizeTmp, SizeTmp2; // Temporaries for use in cases below.
3192
3193 // Read a character, advancing over it.
3194 char Char = getAndAdvanceChar(CurPtr, Result);
3195 tok::TokenKind Kind;
3196
3197 switch (Char) {
3198 case 0: // Null.
3199 // Found end of file?
3200 if (CurPtr-1 == BufferEnd)
3201 return LexEndOfFile(Result, CurPtr-1);
3202
3203 // Check if we are performing code completion.
3204 if (isCodeCompletionPoint(CurPtr-1)) {
3205 // Return the code-completion token.
3206 Result.startToken();
3207 FormTokenWithChars(Result, CurPtr, tok::code_completion);
3208 return true;
3209 }
3210
3211 if (!isLexingRawMode())
3212 Diag(CurPtr-1, diag::null_in_file);
3213 Result.setFlag(Token::LeadingSpace);
3214 if (SkipWhitespace(Result, CurPtr, TokAtPhysicalStartOfLine))
3215 return true; // KeepWhitespaceMode
3216
3217 // We know the lexer hasn't changed, so just try again with this lexer.
3218 // (We manually eliminate the tail call to avoid recursion.)
3219 goto LexNextToken;
3220
3221 case 26: // DOS & CP/M EOF: "^Z".
3222 // If we're in Microsoft extensions mode, treat this as end of file.
3223 if (LangOpts.MicrosoftExt) {
3224 if (!isLexingRawMode())
3225 Diag(CurPtr-1, diag::ext_ctrl_z_eof_microsoft);
3226 return LexEndOfFile(Result, CurPtr-1);
3227 }
3228
3229 // If Microsoft extensions are disabled, this is just random garbage.
3230 Kind = tok::unknown;
3231 break;
3232
3233 case '\r':
3234 if (CurPtr[0] == '\n')
3235 Char = getAndAdvanceChar(CurPtr, Result);
3236 LLVM_FALLTHROUGH[[clang::fallthrough]];
3237 case '\n':
3238 // If we are inside a preprocessor directive and we see the end of line,
3239 // we know we are done with the directive, so return an EOD token.
3240 if (ParsingPreprocessorDirective) {
3241 // Done parsing the "line".
3242 ParsingPreprocessorDirective = false;
3243
3244 // Restore comment saving mode, in case it was disabled for directive.
3245 if (PP)
3246 resetExtendedTokenMode();
3247
3248 // Since we consumed a newline, we are back at the start of a line.
3249 IsAtStartOfLine = true;
3250 IsAtPhysicalStartOfLine = true;
3251
3252 Kind = tok::eod;
3253 break;
3254 }
3255
3256 // No leading whitespace seen so far.
3257 Result.clearFlag(Token::LeadingSpace);
3258
3259 if (SkipWhitespace(Result, CurPtr, TokAtPhysicalStartOfLine))
3260 return true; // KeepWhitespaceMode
3261
3262 // We only saw whitespace, so just try again with this lexer.
3263 // (We manually eliminate the tail call to avoid recursion.)
3264 goto LexNextToken;
3265 case ' ':
3266 case '\t':
3267 case '\f':
3268 case '\v':
3269 SkipHorizontalWhitespace:
3270 Result.setFlag(Token::LeadingSpace);
3271 if (SkipWhitespace(Result, CurPtr, TokAtPhysicalStartOfLine))
3272 return true; // KeepWhitespaceMode
3273
3274 SkipIgnoredUnits:
3275 CurPtr = BufferPtr;
3276
3277 // If the next token is obviously a // or /* */ comment, skip it efficiently
3278 // too (without going through the big switch stmt).
3279 if (CurPtr[0] == '/' && CurPtr[1] == '/' && !inKeepCommentMode() &&
3280 LangOpts.LineComment &&
3281 (LangOpts.CPlusPlus || !LangOpts.TraditionalCPP)) {
3282 if (SkipLineComment(Result, CurPtr+2, TokAtPhysicalStartOfLine))
3283 return true; // There is a token to return.
3284 goto SkipIgnoredUnits;
3285 } else if (CurPtr[0] == '/' && CurPtr[1] == '*' && !inKeepCommentMode()) {
3286 if (SkipBlockComment(Result, CurPtr+2, TokAtPhysicalStartOfLine))
3287 return true; // There is a token to return.
3288 goto SkipIgnoredUnits;
3289 } else if (isHorizontalWhitespace(*CurPtr)) {
3290 goto SkipHorizontalWhitespace;
3291 }
3292 // We only saw whitespace, so just try again with this lexer.
3293 // (We manually eliminate the tail call to avoid recursion.)
3294 goto LexNextToken;
3295
3296 // C99 6.4.4.1: Integer Constants.
3297 // C99 6.4.4.2: Floating Constants.
3298 case '0': case '1': case '2': case '3': case '4':
3299 case '5': case '6': case '7': case '8': case '9':
3300 // Notify MIOpt that we read a non-whitespace/non-comment token.
3301 MIOpt.ReadToken();
3302 return LexNumericConstant(Result, CurPtr);
3303
3304 case 'u': // Identifier (uber) or C11/C++11 UTF-8 or UTF-16 string literal
3305 // Notify MIOpt that we read a non-whitespace/non-comment token.
3306 MIOpt.ReadToken();
3307
3308 if (LangOpts.CPlusPlus11 || LangOpts.C11) {
3309 Char = getCharAndSize(CurPtr, SizeTmp);
3310
3311 // UTF-16 string literal
3312 if (Char == '"')
3313 return LexStringLiteral(Result, ConsumeChar(CurPtr, SizeTmp, Result),
3314 tok::utf16_string_literal);
3315
3316 // UTF-16 character constant
3317 if (Char == '\'')
3318 return LexCharConstant(Result, ConsumeChar(CurPtr, SizeTmp, Result),
3319 tok::utf16_char_constant);
3320
3321 // UTF-16 raw string literal
3322 if (Char == 'R' && LangOpts.CPlusPlus11 &&
3323 getCharAndSize(CurPtr + SizeTmp, SizeTmp2) == '"')
3324 return LexRawStringLiteral(Result,
3325 ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
3326 SizeTmp2, Result),
3327 tok::utf16_string_literal);
3328
3329 if (Char == '8') {
3330 char Char2 = getCharAndSize(CurPtr + SizeTmp, SizeTmp2);
3331
3332 // UTF-8 string literal
3333 if (Char2 == '"')
3334 return LexStringLiteral(Result,
3335 ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
3336 SizeTmp2, Result),
3337 tok::utf8_string_literal);
3338 if (Char2 == '\'' && LangOpts.CPlusPlus17)
3339 return LexCharConstant(
3340 Result, ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
3341 SizeTmp2, Result),
3342 tok::utf8_char_constant);
3343
3344 if (Char2 == 'R' && LangOpts.CPlusPlus11) {
3345 unsigned SizeTmp3;
3346 char Char3 = getCharAndSize(CurPtr + SizeTmp + SizeTmp2, SizeTmp3);
3347 // UTF-8 raw string literal
3348 if (Char3 == '"') {
3349 return LexRawStringLiteral(Result,
3350 ConsumeChar(ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
3351 SizeTmp2, Result),
3352 SizeTmp3, Result),
3353 tok::utf8_string_literal);
3354 }
3355 }
3356 }
3357 }
3358
3359 // treat u like the start of an identifier.
3360 return LexIdentifier(Result, CurPtr);
3361
3362 case 'U': // Identifier (Uber) or C11/C++11 UTF-32 string literal
3363 // Notify MIOpt that we read a non-whitespace/non-comment token.
3364 MIOpt.ReadToken();
3365
3366 if (LangOpts.CPlusPlus11 || LangOpts.C11) {
3367 Char = getCharAndSize(CurPtr, SizeTmp);
3368
3369 // UTF-32 string literal
3370 if (Char == '"')
3371 return LexStringLiteral(Result, ConsumeChar(CurPtr, SizeTmp, Result),
3372 tok::utf32_string_literal);
3373
3374 // UTF-32 character constant
3375 if (Char == '\'')
3376 return LexCharConstant(Result, ConsumeChar(CurPtr, SizeTmp, Result),
3377 tok::utf32_char_constant);
3378
3379 // UTF-32 raw string literal
3380 if (Char == 'R' && LangOpts.CPlusPlus11 &&
3381 getCharAndSize(CurPtr + SizeTmp, SizeTmp2) == '"')
3382 return LexRawStringLiteral(Result,
3383 ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
3384 SizeTmp2, Result),
3385 tok::utf32_string_literal);
3386 }
3387
3388 // treat U like the start of an identifier.
3389 return LexIdentifier(Result, CurPtr);
3390
3391 case 'R': // Identifier or C++0x raw string literal
3392 // Notify MIOpt that we read a non-whitespace/non-comment token.
3393 MIOpt.ReadToken();
3394
3395 if (LangOpts.CPlusPlus11) {
3396 Char = getCharAndSize(CurPtr, SizeTmp);
3397
3398 if (Char == '"')
3399 return LexRawStringLiteral(Result,
3400 ConsumeChar(CurPtr, SizeTmp, Result),
3401 tok::string_literal);
3402 }
3403
3404 // treat R like the start of an identifier.
3405 return LexIdentifier(Result, CurPtr);
3406
3407 case 'L': // Identifier (Loony) or wide literal (L'x' or L"xyz").
3408 // Notify MIOpt that we read a non-whitespace/non-comment token.
3409 MIOpt.ReadToken();
3410 Char = getCharAndSize(CurPtr, SizeTmp);
3411
3412 // Wide string literal.
3413 if (Char == '"')
3414 return LexStringLiteral(Result, ConsumeChar(CurPtr, SizeTmp, Result),
3415 tok::wide_string_literal);
3416
3417 // Wide raw string literal.
3418 if (LangOpts.CPlusPlus11 && Char == 'R' &&
3419 getCharAndSize(CurPtr + SizeTmp, SizeTmp2) == '"')
3420 return LexRawStringLiteral(Result,
3421 ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
3422 SizeTmp2, Result),
3423 tok::wide_string_literal);
3424
3425 // Wide character constant.
3426 if (Char == '\'')
3427 return LexCharConstant(Result, ConsumeChar(CurPtr, SizeTmp, Result),
3428 tok::wide_char_constant);
3429 // FALL THROUGH, treating L like the start of an identifier.
3430 LLVM_FALLTHROUGH[[clang::fallthrough]];
3431
3432 // C99 6.4.2: Identifiers.
3433 case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': case 'G':
3434 case 'H': case 'I': case 'J': case 'K': /*'L'*/case 'M': case 'N':
3435 case 'O': case 'P': case 'Q': /*'R'*/case 'S': case 'T': /*'U'*/
3436 case 'V': case 'W': case 'X': case 'Y': case 'Z':
3437 case 'a': case 'b': case 'c': case 'd': case 'e': case 'f': case 'g':
3438 case 'h': case 'i': case 'j': case 'k': case 'l': case 'm': case 'n':
3439 case 'o': case 'p': case 'q': case 'r': case 's': case 't': /*'u'*/
3440 case 'v': case 'w': case 'x': case 'y': case 'z':
3441 case '_':
3442 // Notify MIOpt that we read a non-whitespace/non-comment token.
3443 MIOpt.ReadToken();
3444 return LexIdentifier(Result, CurPtr);
3445
3446 case '$': // $ in identifiers.
3447 if (LangOpts.DollarIdents) {
3448 if (!isLexingRawMode())
3449 Diag(CurPtr-1, diag::ext_dollar_in_identifier);
3450 // Notify MIOpt that we read a non-whitespace/non-comment token.
3451 MIOpt.ReadToken();
3452 return LexIdentifier(Result, CurPtr);
3453 }
3454
3455 Kind = tok::unknown;
3456 break;
3457
3458 // C99 6.4.4: Character Constants.
3459 case '\'':
3460 // Notify MIOpt that we read a non-whitespace/non-comment token.
3461 MIOpt.ReadToken();
3462 return LexCharConstant(Result, CurPtr, tok::char_constant);
3463
3464 // C99 6.4.5: String Literals.
3465 case '"':
3466 // Notify MIOpt that we read a non-whitespace/non-comment token.
3467 MIOpt.ReadToken();
3468 return LexStringLiteral(Result, CurPtr,
3469 ParsingFilename ? tok::header_name
3470 : tok::string_literal);
3471
3472 // C99 6.4.6: Punctuators.
3473 case '?':
3474 Kind = tok::question;
3475 break;
3476 case '[':
3477 Kind = tok::l_square;
3478 break;
3479 case ']':
3480 Kind = tok::r_square;
3481 break;
3482 case '(':
3483 Kind = tok::l_paren;
3484 break;
3485 case ')':
3486 Kind = tok::r_paren;
3487 break;
3488 case '{':
3489 Kind = tok::l_brace;
3490 break;
3491 case '}':
3492 Kind = tok::r_brace;
3493 break;
3494 case '.':
3495 Char = getCharAndSize(CurPtr, SizeTmp);
3496 if (Char >= '0' && Char <= '9') {
3497 // Notify MIOpt that we read a non-whitespace/non-comment token.
3498 MIOpt.ReadToken();
3499
3500 return LexNumericConstant(Result, ConsumeChar(CurPtr, SizeTmp, Result));
3501 } else if (LangOpts.CPlusPlus && Char == '*') {
3502 Kind = tok::periodstar;
3503 CurPtr += SizeTmp;
3504 } else if (Char == '.' &&
3505 getCharAndSize(CurPtr+SizeTmp, SizeTmp2) == '.') {
3506 Kind = tok::ellipsis;
3507 CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
3508 SizeTmp2, Result);
3509 } else {
3510 Kind = tok::period;
3511 }
3512 break;
3513 case '&':
3514 Char = getCharAndSize(CurPtr, SizeTmp);
3515 if (Char == '&') {
3516 Kind = tok::ampamp;
3517 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
3518 } else if (Char == '=') {
3519 Kind = tok::ampequal;
3520 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
3521 } else {
3522 Kind = tok::amp;
3523 }
3524 break;
3525 case '*':
3526 if (getCharAndSize(CurPtr, SizeTmp) == '=') {
3527 Kind = tok::starequal;
3528 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
3529 } else {
3530 Kind = tok::star;
3531 }
3532 break;
3533 case '+':
3534 Char = getCharAndSize(CurPtr, SizeTmp);
3535 if (Char == '+') {
3536 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
3537 Kind = tok::plusplus;
3538 } else if (Char == '=') {
3539 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
3540 Kind = tok::plusequal;
3541 } else {
3542 Kind = tok::plus;
3543 }
3544 break;
3545 case '-':
3546 Char = getCharAndSize(CurPtr, SizeTmp);
3547 if (Char == '-') { // --
3548 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
3549 Kind = tok::minusminus;
3550 } else if (Char == '>' && LangOpts.CPlusPlus &&
3551 getCharAndSize(CurPtr+SizeTmp, SizeTmp2) == '*') { // C++ ->*
3552 CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
3553 SizeTmp2, Result);
3554 Kind = tok::arrowstar;
3555 } else if (Char == '>') { // ->
3556 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
3557 Kind = tok::arrow;
3558 } else if (Char == '=') { // -=
3559 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
3560 Kind = tok::minusequal;
3561 } else {
3562 Kind = tok::minus;
3563 }
3564 break;
3565 case '~':
3566 Kind = tok::tilde;
3567 break;
3568 case '!':
3569 if (getCharAndSize(CurPtr, SizeTmp) == '=') {
3570 Kind = tok::exclaimequal;
3571 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
3572 } else {
3573 Kind = tok::exclaim;
3574 }
3575 break;
3576 case '/':
3577 // 6.4.9: Comments
3578 Char = getCharAndSize(CurPtr, SizeTmp);
3579 if (Char == '/') { // Line comment.
3580 // Even if Line comments are disabled (e.g. in C89 mode), we generally
3581 // want to lex this as a comment. There is one problem with this though,
3582 // that in one particular corner case, this can change the behavior of the
3583 // resultant program. For example, In "foo //**/ bar", C89 would lex
3584 // this as "foo / bar" and languages with Line comments would lex it as
3585 // "foo". Check to see if the character after the second slash is a '*'.
3586 // If so, we will lex that as a "/" instead of the start of a comment.
3587 // However, we never do this if we are just preprocessing.
3588 bool TreatAsComment = LangOpts.LineComment &&
3589 (LangOpts.CPlusPlus || !LangOpts.TraditionalCPP);
3590 if (!TreatAsComment)
3591 if (!(PP && PP->isPreprocessedOutput()))
3592 TreatAsComment = getCharAndSize(CurPtr+SizeTmp, SizeTmp2) != '*';
3593
3594 if (TreatAsComment) {
3595 if (SkipLineComment(Result, ConsumeChar(CurPtr, SizeTmp, Result),
3596 TokAtPhysicalStartOfLine))
3597 return true; // There is a token to return.
3598
3599 // It is common for the tokens immediately after a // comment to be
3600 // whitespace (indentation for the next line). Instead of going through
3601 // the big switch, handle it efficiently now.
3602 goto SkipIgnoredUnits;
3603 }
3604 }
3605
3606 if (Char == '*') { // /**/ comment.
3607 if (SkipBlockComment(Result, ConsumeChar(CurPtr, SizeTmp, Result),
3608 TokAtPhysicalStartOfLine))
3609 return true; // There is a token to return.
3610
3611 // We only saw whitespace, so just try again with this lexer.
3612 // (We manually eliminate the tail call to avoid recursion.)
3613 goto LexNextToken;
3614 }
3615
3616 if (Char == '=') {
3617 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
3618 Kind = tok::slashequal;
3619 } else {
3620 Kind = tok::slash;
3621 }
3622 break;
3623 case '%':
3624 Char = getCharAndSize(CurPtr, SizeTmp);
3625 if (Char == '=') {
3626 Kind = tok::percentequal;
3627 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
3628 } else if (LangOpts.Digraphs && Char == '>') {
3629 Kind = tok::r_brace; // '%>' -> '}'
3630 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
3631 } else if (LangOpts.Digraphs && Char == ':') {
3632 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
3633 Char = getCharAndSize(CurPtr, SizeTmp);
3634 if (Char == '%' && getCharAndSize(CurPtr+SizeTmp, SizeTmp2) == ':') {
3635 Kind = tok::hashhash; // '%:%:' -> '##'
3636 CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
3637 SizeTmp2, Result);
3638 } else if (Char == '@' && LangOpts.MicrosoftExt) {// %:@ -> #@ -> Charize
3639 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
3640 if (!isLexingRawMode())
3641 Diag(BufferPtr, diag::ext_charize_microsoft);
3642 Kind = tok::hashat;
3643 } else { // '%:' -> '#'
3644 // We parsed a # character. If this occurs at the start of the line,
3645 // it's actually the start of a preprocessing directive. Callback to
3646 // the preprocessor to handle it.
3647 // TODO: -fpreprocessed mode??
3648 if (TokAtPhysicalStartOfLine && !LexingRawMode && !Is_PragmaLexer)
3649 goto HandleDirective;
3650
3651 Kind = tok::hash;
3652 }
3653 } else {
3654 Kind = tok::percent;
3655 }
3656 break;
3657 case '<':
3658 Char = getCharAndSize(CurPtr, SizeTmp);
3659 if (ParsingFilename) {
3660 return LexAngledStringLiteral(Result, CurPtr);
3661 } else if (Char == '<') {
3662 char After = getCharAndSize(CurPtr+SizeTmp, SizeTmp2);
3663 if (After == '=') {
3664 Kind = tok::lesslessequal;
3665 CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
3666 SizeTmp2, Result);
3667 } else if (After == '<' && IsStartOfConflictMarker(CurPtr-1)) {
3668 // If this is actually a '<<<<<<<' version control conflict marker,
3669 // recognize it as such and recover nicely.
3670 goto LexNextToken;
3671 } else if (After == '<' && HandleEndOfConflictMarker(CurPtr-1)) {
3672 // If this is '<<<<' and we're in a Perforce-style conflict marker,
3673 // ignore it.
3674 goto LexNextToken;
3675 } else if (LangOpts.CUDA && After == '<') {
3676 Kind = tok::lesslessless;
3677 CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
3678 SizeTmp2, Result);
3679 } else {
3680 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
3681 Kind = tok::lessless;
3682 }
3683 } else if (Char == '=') {
3684 char After = getCharAndSize(CurPtr+SizeTmp, SizeTmp2);
3685 if (After == '>') {
3686 if (getLangOpts().CPlusPlus2a) {
3687 if (!isLexingRawMode())
3688 Diag(BufferPtr, diag::warn_cxx17_compat_spaceship);
3689 CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
3690 SizeTmp2, Result);
3691 Kind = tok::spaceship;
3692 break;
3693 }
3694 // Suggest adding a space between the '<=' and the '>' to avoid a
3695 // change in semantics if this turns up in C++ <=17 mode.
3696 if (getLangOpts().CPlusPlus && !isLexingRawMode()) {
3697 Diag(BufferPtr, diag::warn_cxx2a_compat_spaceship)
3698 << FixItHint::CreateInsertion(
3699 getSourceLocation(CurPtr + SizeTmp, SizeTmp2), " ");
3700 }
3701 }
3702 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
3703 Kind = tok::lessequal;
3704 } else if (LangOpts.Digraphs && Char == ':') { // '<:' -> '['
3705 if (LangOpts.CPlusPlus11 &&
3706 getCharAndSize(CurPtr + SizeTmp, SizeTmp2) == ':') {
3707 // C++0x [lex.pptoken]p3:
3708 // Otherwise, if the next three characters are <:: and the subsequent
3709 // character is neither : nor >, the < is treated as a preprocessor
3710 // token by itself and not as the first character of the alternative
3711 // token <:.
3712 unsigned SizeTmp3;
3713 char After = getCharAndSize(CurPtr + SizeTmp + SizeTmp2, SizeTmp3);
3714 if (After != ':' && After != '>') {
3715 Kind = tok::less;
3716 if (!isLexingRawMode())
3717 Diag(BufferPtr, diag::warn_cxx98_compat_less_colon_colon);
3718 break;
3719 }
3720 }
3721
3722 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
3723 Kind = tok::l_square;
3724 } else if (LangOpts.Digraphs && Char == '%') { // '<%' -> '{'
3725 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
3726 Kind = tok::l_brace;
3727 } else if (Char == '#' && /*Not a trigraph*/ SizeTmp == 1 &&
3728 lexEditorPlaceholder(Result, CurPtr)) {
3729 return true;
3730 } else {
3731 Kind = tok::less;
3732 }
3733 break;
3734 case '>':
3735 Char = getCharAndSize(CurPtr, SizeTmp);
3736 if (Char == '=') {
3737 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
3738 Kind = tok::greaterequal;
3739 } else if (Char == '>') {
3740 char After = getCharAndSize(CurPtr+SizeTmp, SizeTmp2);
3741 if (After == '=') {
3742 CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
3743 SizeTmp2, Result);
3744 Kind = tok::greatergreaterequal;
3745 } else if (After == '>' && IsStartOfConflictMarker(CurPtr-1)) {
3746 // If this is actually a '>>>>' conflict marker, recognize it as such
3747 // and recover nicely.
3748 goto LexNextToken;
3749 } else if (After == '>' && HandleEndOfConflictMarker(CurPtr-1)) {
3750 // If this is '>>>>>>>' and we're in a conflict marker, ignore it.
3751 goto LexNextToken;
3752 } else if (LangOpts.CUDA && After == '>') {
3753 Kind = tok::greatergreatergreater;
3754 CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
3755 SizeTmp2, Result);
3756 } else {
3757 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
3758 Kind = tok::greatergreater;
3759 }
3760 } else {
3761 Kind = tok::greater;
3762 }
3763 break;
3764 case '^':
3765 Char = getCharAndSize(CurPtr, SizeTmp);
3766 if (Char == '=') {
3767 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
3768 Kind = tok::caretequal;
3769 } else if (LangOpts.OpenCL && Char == '^') {
3770 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
3771 Kind = tok::caretcaret;
3772 } else {
3773 Kind = tok::caret;
3774 }
3775 break;
3776 case '|':
3777 Char = getCharAndSize(CurPtr, SizeTmp);
3778 if (Char == '=') {
3779 Kind = tok::pipeequal;
3780 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
3781 } else if (Char == '|') {
3782 // If this is '|||||||' and we're in a conflict marker, ignore it.
3783 if (CurPtr[1] == '|' && HandleEndOfConflictMarker(CurPtr-1))
3784 goto LexNextToken;
3785 Kind = tok::pipepipe;
3786 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
3787 } else {
3788 Kind = tok::pipe;
3789 }
3790 break;
3791 case ':':
3792 Char = getCharAndSize(CurPtr, SizeTmp);
3793 if (LangOpts.Digraphs && Char == '>') {
3794 Kind = tok::r_square; // ':>' -> ']'
3795 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
3796 } else if ((LangOpts.CPlusPlus ||
3797 LangOpts.DoubleSquareBracketAttributes) &&
3798 Char == ':') {
3799 Kind = tok::coloncolon;
3800 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
3801 } else {
3802 Kind = tok::colon;
3803 }
3804 break;
3805 case ';':
3806 Kind = tok::semi;
3807 break;
3808 case '=':
3809 Char = getCharAndSize(CurPtr, SizeTmp);
3810 if (Char == '=') {
3811 // If this is '====' and we're in a conflict marker, ignore it.
3812 if (CurPtr[1] == '=' && HandleEndOfConflictMarker(CurPtr-1))
3813 goto LexNextToken;
3814
3815 Kind = tok::equalequal;
3816 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
3817 } else {
3818 Kind = tok::equal;
3819 }
3820 break;
3821 case ',':
3822 Kind = tok::comma;
3823 break;
3824 case '#':
3825 Char = getCharAndSize(CurPtr, SizeTmp);
3826 if (Char == '#') {
3827 Kind = tok::hashhash;
3828 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
3829 } else if (Char == '@' && LangOpts.MicrosoftExt) { // #@ -> Charize
3830 Kind = tok::hashat;
3831 if (!isLexingRawMode())
3832 Diag(BufferPtr, diag::ext_charize_microsoft);
3833 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
3834 } else {
3835 // We parsed a # character. If this occurs at the start of the line,
3836 // it's actually the start of a preprocessing directive. Callback to
3837 // the preprocessor to handle it.
3838 // TODO: -fpreprocessed mode??
3839 if (TokAtPhysicalStartOfLine && !LexingRawMode && !Is_PragmaLexer)
3840 goto HandleDirective;
3841
3842 Kind = tok::hash;
3843 }
3844 break;
3845
3846 case '@':
3847 // Objective C support.
3848 if (CurPtr[-1] == '@' && LangOpts.ObjC)
3849 Kind = tok::at;
3850 else
3851 Kind = tok::unknown;
3852 break;
3853
3854 // UCNs (C99 6.4.3, C++11 [lex.charset]p2)
3855 case '\\':
3856 if (!LangOpts.AsmPreprocessor) {
3857 if (uint32_t CodePoint = tryReadUCN(CurPtr, BufferPtr, &Result)) {
3858 if (CheckUnicodeWhitespace(Result, CodePoint, CurPtr)) {
3859 if (SkipWhitespace(Result, CurPtr, TokAtPhysicalStartOfLine))
3860 return true; // KeepWhitespaceMode
3861
3862 // We only saw whitespace, so just try again with this lexer.
3863 // (We manually eliminate the tail call to avoid recursion.)
3864 goto LexNextToken;
3865 }
3866
3867 return LexUnicode(Result, CodePoint, CurPtr);
3868 }
3869 }
3870
3871 Kind = tok::unknown;
3872 break;
3873
3874 default: {
3875 if (isASCII(Char)) {
3876 Kind = tok::unknown;
3877 break;
3878 }
3879
3880 llvm::UTF32 CodePoint;
3881
3882 // We can't just reset CurPtr to BufferPtr because BufferPtr may point to
3883 // an escaped newline.
3884 --CurPtr;
3885 llvm::ConversionResult Status =
3886 llvm::convertUTF8Sequence((const llvm::UTF8 **)&CurPtr,
3887 (const llvm::UTF8 *)BufferEnd,
3888 &CodePoint,
3889 llvm::strictConversion);
3890 if (Status == llvm::conversionOK) {
3891 if (CheckUnicodeWhitespace(Result, CodePoint, CurPtr)) {
3892 if (SkipWhitespace(Result, CurPtr, TokAtPhysicalStartOfLine))
3893 return true; // KeepWhitespaceMode
3894
3895 // We only saw whitespace, so just try again with this lexer.
3896 // (We manually eliminate the tail call to avoid recursion.)
3897 goto LexNextToken;
3898 }
3899 return LexUnicode(Result, CodePoint, CurPtr);
3900 }
3901
3902 if (isLexingRawMode() || ParsingPreprocessorDirective ||
3903 PP->isPreprocessedOutput()) {
3904 ++CurPtr;
3905 Kind = tok::unknown;
3906 break;
3907 }
3908
3909 // Non-ASCII characters tend to creep into source code unintentionally.
3910 // Instead of letting the parser complain about the unknown token,
3911 // just diagnose the invalid UTF-8, then drop the character.
3912 Diag(CurPtr, diag::err_invalid_utf8);
3913
3914 BufferPtr = CurPtr+1;
3915 // We're pretending the character didn't exist, so just try again with
3916 // this lexer.
3917 // (We manually eliminate the tail call to avoid recursion.)
3918 goto LexNextToken;
3919 }
3920 }
3921
3922 // Notify MIOpt that we read a non-whitespace/non-comment token.
3923 MIOpt.ReadToken();
3924
3925 // Update the location of token as well as BufferPtr.
3926 FormTokenWithChars(Result, CurPtr, Kind);
3927 return true;
3928
3929HandleDirective:
3930 // We parsed a # character and it's the start of a preprocessing directive.
3931
3932 FormTokenWithChars(Result, CurPtr, tok::hash);
3933 PP->HandleDirective(Result);
3934
3935 if (PP->hadModuleLoaderFatalFailure()) {
3936 // With a fatal failure in the module loader, we abort parsing.
3937 assert(Result.is(tok::eof) && "Preprocessor did not set tok:eof")((Result.is(tok::eof) && "Preprocessor did not set tok:eof"
) ? static_cast<void> (0) : __assert_fail ("Result.is(tok::eof) && \"Preprocessor did not set tok:eof\""
, "/build/llvm-toolchain-snapshot-9~svn361465/tools/clang/lib/Lex/Lexer.cpp"
, 3937, __PRETTY_FUNCTION__))
;
3938 return true;
3939 }
3940
3941 // We parsed the directive; lex a token with the new state.
3942 return false;
3943}

/build/llvm-toolchain-snapshot-9~svn361465/tools/clang/include/clang/Lex/Lexer.h

1//===- Lexer.h - C Language Family Lexer ------------------------*- C++ -*-===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file defines the Lexer interface.
10//
11//===----------------------------------------------------------------------===//
12
13#ifndef LLVM_CLANG_LEX_LEXER_H
14#define LLVM_CLANG_LEX_LEXER_H
15
16#include "clang/Basic/LangOptions.h"
17#include "clang/Basic/SourceLocation.h"
18#include "clang/Basic/TokenKinds.h"
19#include "clang/Lex/PreprocessorLexer.h"
20#include "clang/Lex/Token.h"
21#include "llvm/ADT/Optional.h"
22#include "llvm/ADT/SmallVector.h"
23#include "llvm/ADT/StringRef.h"
24#include <cassert>
25#include <cstdint>
26#include <string>
27
28namespace llvm {
29
30class MemoryBuffer;
31
32} // namespace llvm
33
34namespace clang {
35
36class DiagnosticBuilder;
37class Preprocessor;
38class SourceManager;
39
/// ConflictMarkerKind - Kinds of conflict marker which the lexer might be
/// recovering from.
enum ConflictMarkerKind {
  /// Not within a conflict marker.
  CMK_None,

  /// A normal or diff3 conflict marker, initiated by at least 7 "<"s,
  /// separated by at least 7 "="s or "|"s, and terminated by at least 7 ">"s.
  CMK_Normal,

  /// A Perforce-style conflict marker, initiated by 4 ">"s,
  /// separated by 4 "="s, and terminated by 4 "<"s.
  CMK_Perforce
};
54
/// Describes the bounds (start, size) of the preamble and a flag required by
/// PreprocessorOptions::PrecompiledPreambleBytes.
/// The preamble includes the BOM, if any.
struct PreambleBounds {
  /// Size of the preamble in bytes.
  unsigned Size;

  /// Whether the preamble ends at the start of a new line.
  ///
  /// Used to inform the lexer as to whether it's starting at the beginning of
  /// a line after skipping the preamble.
  bool PreambleEndsAtStartOfLine;

  PreambleBounds(unsigned Size, bool PreambleEndsAtStartOfLine)
      : Size(Size), PreambleEndsAtStartOfLine(PreambleEndsAtStartOfLine) {}
};
71
72/// Lexer - This provides a simple interface that turns a text buffer into a
73/// stream of tokens. This provides no support for file reading or buffering,
74/// or buffering/seeking of tokens, only forward lexing is supported. It relies
75/// on the specified Preprocessor object to handle preprocessor directives, etc.
76class Lexer : public PreprocessorLexer {
77 friend class Preprocessor;
78
79 void anchor() override;
80
81 //===--------------------------------------------------------------------===//
82 // Constant configuration values for this lexer.
83
84 // Start of the buffer.
85 const char *BufferStart;
86
87 // End of the buffer.
88 const char *BufferEnd;
89
90 // Location for start of file.
91 SourceLocation FileLoc;
92
93 // LangOpts enabled by this language (cache).
94 LangOptions LangOpts;
95
96 // True if lexer for _Pragma handling.
97 bool Is_PragmaLexer;
98
99 //===--------------------------------------------------------------------===//
100 // Context-specific lexing flags set by the preprocessor.
101 //
102
103 /// ExtendedTokenMode - The lexer can optionally keep comments and whitespace
104 /// and return them as tokens. This is used for -C and -CC modes, and
105 /// whitespace preservation can be useful for some clients that want to lex
106 /// the file in raw mode and get every character from the file.
107 ///
108 /// When this is set to 2 it returns comments and whitespace. When set to 1
109 /// it returns comments, when it is set to 0 it returns normal tokens only.
110 unsigned char ExtendedTokenMode;
111
112 //===--------------------------------------------------------------------===//
113 // Context that changes as the file is lexed.
114 // NOTE: any state that mutates when in raw mode must have save/restore code
115 // in Lexer::isNextPPTokenLParen.
116
117 // BufferPtr - Current pointer into the buffer. This is the next character
118 // to be lexed.
119 const char *BufferPtr;
120
121 // IsAtStartOfLine - True if the next lexed token should get the "start of
122 // line" flag set on it.
123 bool IsAtStartOfLine;
124
125 bool IsAtPhysicalStartOfLine;
126
127 bool HasLeadingSpace;
128
129 bool HasLeadingEmptyMacro;
130
131 // CurrentConflictMarkerState - The kind of conflict marker we are handling.
132 ConflictMarkerKind CurrentConflictMarkerState;
133
134 void InitLexer(const char *BufStart, const char *BufPtr, const char *BufEnd);
135
136public:
137 /// Lexer constructor - Create a new lexer object for the specified buffer
138 /// with the specified preprocessor managing the lexing process. This lexer
139 /// assumes that the associated file buffer and Preprocessor objects will
140 /// outlive it, so it doesn't take ownership of either of them.
141 Lexer(FileID FID, const llvm::MemoryBuffer *InputFile, Preprocessor &PP);
142
143 /// Lexer constructor - Create a new raw lexer object. This object is only
144 /// suitable for calls to 'LexFromRawLexer'. This lexer assumes that the
145 /// text range will outlive it, so it doesn't take ownership of it.
146 Lexer(SourceLocation FileLoc, const LangOptions &LangOpts,
147 const char *BufStart, const char *BufPtr, const char *BufEnd);
148
149 /// Lexer constructor - Create a new raw lexer object. This object is only
150 /// suitable for calls to 'LexFromRawLexer'. This lexer assumes that the
151 /// text range will outlive it, so it doesn't take ownership of it.
152 Lexer(FileID FID, const llvm::MemoryBuffer *FromFile,
153 const SourceManager &SM, const LangOptions &LangOpts);
154
155 Lexer(const Lexer &) = delete;
156 Lexer &operator=(const Lexer &) = delete;
157
158 /// Create_PragmaLexer: Lexer constructor - Create a new lexer object for
159 /// _Pragma expansion. This has a variety of magic semantics that this method
160 /// sets up. It returns a new'd Lexer that must be delete'd when done.
161 static Lexer *Create_PragmaLexer(SourceLocation SpellingLoc,
162 SourceLocation ExpansionLocStart,
163 SourceLocation ExpansionLocEnd,
164 unsigned TokLen, Preprocessor &PP);
165
166 /// getLangOpts - Return the language features currently enabled.
167 /// NOTE: this lexer modifies features as a file is parsed!
168 const LangOptions &getLangOpts() const { return LangOpts; }
169
170 /// getFileLoc - Return the File Location for the file we are lexing out of.
171 /// The physical location encodes the location where the characters come from,
172 /// the virtual location encodes where we should *claim* the characters came
173 /// from. Currently this is only used by _Pragma handling.
174 SourceLocation getFileLoc() const { return FileLoc; }
175
176private:
177 /// Lex - Return the next token in the file. If this is the end of file, it
178 /// return the tok::eof token. This implicitly involves the preprocessor.
179 bool Lex(Token &Result);
180
181public:
182 /// isPragmaLexer - Returns true if this Lexer is being used to lex a pragma.
183 bool isPragmaLexer() const { return Is_PragmaLexer; }
184
185private:
186 /// IndirectLex - An indirect call to 'Lex' that can be invoked via
187 /// the PreprocessorLexer interface.
188 void IndirectLex(Token &Result) override { Lex(Result); }
189
190public:
191 /// LexFromRawLexer - Lex a token from a designated raw lexer (one with no
192 /// associated preprocessor object. Return true if the 'next character to
193 /// read' pointer points at the end of the lexer buffer, false otherwise.
194 bool LexFromRawLexer(Token &Result) {
195 assert(LexingRawMode && "Not already in raw mode!")((LexingRawMode && "Not already in raw mode!") ? static_cast
<void> (0) : __assert_fail ("LexingRawMode && \"Not already in raw mode!\""
, "/build/llvm-toolchain-snapshot-9~svn361465/tools/clang/include/clang/Lex/Lexer.h"
, 195, __PRETTY_FUNCTION__))
;
196 Lex(Result);
197 // Note that lexing to the end of the buffer doesn't implicitly delete the
198 // lexer when in raw mode.
199 return BufferPtr == BufferEnd;
200 }
201
202 /// isKeepWhitespaceMode - Return true if the lexer should return tokens for
203 /// every character in the file, including whitespace and comments. This
204 /// should only be used in raw mode, as the preprocessor is not prepared to
205 /// deal with the excess tokens.
206 bool isKeepWhitespaceMode() const {
207 return ExtendedTokenMode > 1;
208 }
209
210 /// SetKeepWhitespaceMode - This method lets clients enable or disable
211 /// whitespace retention mode.
212 void SetKeepWhitespaceMode(bool Val) {
213 assert((!Val || LexingRawMode || LangOpts.TraditionalCPP) &&(((!Val || LexingRawMode || LangOpts.TraditionalCPP) &&
"Can only retain whitespace in raw mode or -traditional-cpp"
) ? static_cast<void> (0) : __assert_fail ("(!Val || LexingRawMode || LangOpts.TraditionalCPP) && \"Can only retain whitespace in raw mode or -traditional-cpp\""
, "/build/llvm-toolchain-snapshot-9~svn361465/tools/clang/include/clang/Lex/Lexer.h"
, 214, __PRETTY_FUNCTION__))
214 "Can only retain whitespace in raw mode or -traditional-cpp")(((!Val || LexingRawMode || LangOpts.TraditionalCPP) &&
"Can only retain whitespace in raw mode or -traditional-cpp"
) ? static_cast<void> (0) : __assert_fail ("(!Val || LexingRawMode || LangOpts.TraditionalCPP) && \"Can only retain whitespace in raw mode or -traditional-cpp\""
, "/build/llvm-toolchain-snapshot-9~svn361465/tools/clang/include/clang/Lex/Lexer.h"
, 214, __PRETTY_FUNCTION__))
;
215 ExtendedTokenMode = Val ? 2 : 0;
216 }
217
218 /// inKeepCommentMode - Return true if the lexer should return comments as
219 /// tokens.
220 bool inKeepCommentMode() const {
221 return ExtendedTokenMode > 0;
222 }
223
224 /// SetCommentRetentionMode - Change the comment retention mode of the lexer
225 /// to the specified mode. This is really only useful when lexing in raw
226 /// mode, because otherwise the lexer needs to manage this.
227 void SetCommentRetentionState(bool Mode) {
228 assert(!isKeepWhitespaceMode() &&((!isKeepWhitespaceMode() && "Can't play with comment retention state when retaining whitespace"
) ? static_cast<void> (0) : __assert_fail ("!isKeepWhitespaceMode() && \"Can't play with comment retention state when retaining whitespace\""
, "/build/llvm-toolchain-snapshot-9~svn361465/tools/clang/include/clang/Lex/Lexer.h"
, 229, __PRETTY_FUNCTION__))
229 "Can't play with comment retention state when retaining whitespace")((!isKeepWhitespaceMode() && "Can't play with comment retention state when retaining whitespace"
) ? static_cast<void> (0) : __assert_fail ("!isKeepWhitespaceMode() && \"Can't play with comment retention state when retaining whitespace\""
, "/build/llvm-toolchain-snapshot-9~svn361465/tools/clang/include/clang/Lex/Lexer.h"
, 229, __PRETTY_FUNCTION__))
;
230 ExtendedTokenMode = Mode ? 1 : 0;
231 }
232
233 /// Sets the extended token mode back to its initial value, according to the
234 /// language options and preprocessor. This controls whether the lexer
235 /// produces comment and whitespace tokens.
236 ///
237 /// This requires the lexer to have an associated preprocessor. A standalone
238 /// lexer has nothing to reset to.
239 void resetExtendedTokenMode();
240
241 /// Gets source code buffer.
242 StringRef getBuffer() const {
243 return StringRef(BufferStart, BufferEnd - BufferStart);
244 }
245
246 /// ReadToEndOfLine - Read the rest of the current preprocessor line as an
247 /// uninterpreted string. This switches the lexer out of directive mode.
248 void ReadToEndOfLine(SmallVectorImpl<char> *Result = nullptr);
249
250
251 /// Diag - Forwarding function for diagnostics. This translate a source
252 /// position in the current buffer into a SourceLocation object for rendering.
253 DiagnosticBuilder Diag(const char *Loc, unsigned DiagID) const;
254
255 /// getSourceLocation - Return a source location identifier for the specified
256 /// offset in the current file.
257 SourceLocation getSourceLocation(const char *Loc, unsigned TokLen = 1) const;
258
259 /// getSourceLocation - Return a source location for the next character in
260 /// the current file.
261 SourceLocation getSourceLocation() override {
262 return getSourceLocation(BufferPtr);
263 }
264
265 /// Return the current location in the buffer.
266 const char *getBufferLocation() const { return BufferPtr; }
267
268 /// Stringify - Convert the specified string into a C string by i) escaping
269 /// '\\' and " characters and ii) replacing newline character(s) with "\\n".
270 /// If Charify is true, this escapes the ' character instead of ".
271 static std::string Stringify(StringRef Str, bool Charify = false);
272
273 /// Stringify - Convert the specified string into a C string by i) escaping
274 /// '\\' and " characters and ii) replacing newline character(s) with "\\n".
275 static void Stringify(SmallVectorImpl<char> &Str);
276
277 /// getSpelling - This method is used to get the spelling of a token into a
278 /// preallocated buffer, instead of as an std::string. The caller is required
279 /// to allocate enough space for the token, which is guaranteed to be at least
280 /// Tok.getLength() bytes long. The length of the actual result is returned.
281 ///
282 /// Note that this method may do two possible things: it may either fill in
283 /// the buffer specified with characters, or it may *change the input pointer*
284 /// to point to a constant buffer with the data already in it (avoiding a
285 /// copy). The caller is not allowed to modify the returned buffer pointer
286 /// if an internal buffer is returned.
287 static unsigned getSpelling(const Token &Tok, const char *&Buffer,
288 const SourceManager &SourceMgr,
289 const LangOptions &LangOpts,
290 bool *Invalid = nullptr);
291
292 /// getSpelling() - Return the 'spelling' of the Tok token. The spelling of a
293 /// token is the characters used to represent the token in the source file
294 /// after trigraph expansion and escaped-newline folding. In particular, this
295 /// wants to get the true, uncanonicalized, spelling of things like digraphs
296 /// UCNs, etc.
297 static std::string getSpelling(const Token &Tok,
298 const SourceManager &SourceMgr,
299 const LangOptions &LangOpts,
300 bool *Invalid = nullptr);
301
302 /// getSpelling - This method is used to get the spelling of the
303 /// token at the given source location. If, as is usually true, it
304 /// is not necessary to copy any data, then the returned string may
305 /// not point into the provided buffer.
306 ///
307 /// This method lexes at the expansion depth of the given
308 /// location and does not jump to the expansion or spelling
309 /// location.
310 static StringRef getSpelling(SourceLocation loc,
311 SmallVectorImpl<char> &buffer,
312 const SourceManager &SM,
313 const LangOptions &options,
314 bool *invalid = nullptr);
315
316 /// MeasureTokenLength - Relex the token at the specified location and return
317 /// its length in bytes in the input file. If the token needs cleaning (e.g.
318 /// includes a trigraph or an escaped newline) then this count includes bytes
319 /// that are part of that.
320 static unsigned MeasureTokenLength(SourceLocation Loc,
321 const SourceManager &SM,
322 const LangOptions &LangOpts);
323
324 /// Relex the token at the specified location.
325 /// \returns true if there was a failure, false on success.
326 static bool getRawToken(SourceLocation Loc, Token &Result,
327 const SourceManager &SM,
328 const LangOptions &LangOpts,
329 bool IgnoreWhiteSpace = false);
330
331 /// Given a location any where in a source buffer, find the location
332 /// that corresponds to the beginning of the token in which the original
333 /// source location lands.
334 static SourceLocation GetBeginningOfToken(SourceLocation Loc,
335 const SourceManager &SM,
336 const LangOptions &LangOpts);
337
338 /// Get the physical length (including trigraphs and escaped newlines) of the
339 /// first \p Characters characters of the token starting at TokStart.
340 static unsigned getTokenPrefixLength(SourceLocation TokStart,
341 unsigned CharNo,
342 const SourceManager &SM,
343 const LangOptions &LangOpts);
344
345 /// AdvanceToTokenCharacter - If the current SourceLocation specifies a
346 /// location at the start of a token, return a new location that specifies a
347 /// character within the token. This handles trigraphs and escaped newlines.
348 static SourceLocation AdvanceToTokenCharacter(SourceLocation TokStart,
349 unsigned Characters,
350 const SourceManager &SM,
351 const LangOptions &LangOpts) {
352 return TokStart.getLocWithOffset(
353 getTokenPrefixLength(TokStart, Characters, SM, LangOpts));
354 }
355
356 /// Computes the source location just past the end of the
357 /// token at this source location.
358 ///
359 /// This routine can be used to produce a source location that
360 /// points just past the end of the token referenced by \p Loc, and
361 /// is generally used when a diagnostic needs to point just after a
362 /// token where it expected something different that it received. If
363 /// the returned source location would not be meaningful (e.g., if
364 /// it points into a macro), this routine returns an invalid
365 /// source location.
366 ///
367 /// \param Offset an offset from the end of the token, where the source
368 /// location should refer to. The default offset (0) produces a source
369 /// location pointing just past the end of the token; an offset of 1 produces
370 /// a source location pointing to the last character in the token, etc.
371 static SourceLocation getLocForEndOfToken(SourceLocation Loc, unsigned Offset,
372 const SourceManager &SM,
373 const LangOptions &LangOpts);
374
375 /// Given a token range, produce a corresponding CharSourceRange that
376 /// is not a token range. This allows the source range to be used by
377 /// components that don't have access to the lexer and thus can't find the
378 /// end of the range for themselves.
379 static CharSourceRange getAsCharRange(SourceRange Range,
380 const SourceManager &SM,
381 const LangOptions &LangOpts) {
382 SourceLocation End = getLocForEndOfToken(Range.getEnd(), 0, SM, LangOpts);
383 return End.isInvalid() ? CharSourceRange()
384 : CharSourceRange::getCharRange(
385 Range.getBegin(), End);
386 }
387 static CharSourceRange getAsCharRange(CharSourceRange Range,
388 const SourceManager &SM,
389 const LangOptions &LangOpts) {
390 return Range.isTokenRange()
391 ? getAsCharRange(Range.getAsRange(), SM, LangOpts)
392 : Range;
393 }
394
395 /// Returns true if the given MacroID location points at the first
396 /// token of the macro expansion.
397 ///
398 /// \param MacroBegin If non-null and function returns true, it is set to
399 /// begin location of the macro.
400 static bool isAtStartOfMacroExpansion(SourceLocation loc,
401 const SourceManager &SM,
402 const LangOptions &LangOpts,
403 SourceLocation *MacroBegin = nullptr);
404
405 /// Returns true if the given MacroID location points at the last
406 /// token of the macro expansion.
407 ///
408 /// \param MacroEnd If non-null and function returns true, it is set to
409 /// end location of the macro.
410 static bool isAtEndOfMacroExpansion(SourceLocation loc,
411 const SourceManager &SM,
412 const LangOptions &LangOpts,
413 SourceLocation *MacroEnd = nullptr);
414
415 /// Accepts a range and returns a character range with file locations.
416 ///
417 /// Returns a null range if a part of the range resides inside a macro
418 /// expansion or the range does not reside on the same FileID.
419 ///
420 /// This function is trying to deal with macros and return a range based on
421 /// file locations. The cases where it can successfully handle macros are:
422 ///
423 /// -begin or end range lies at the start or end of a macro expansion, in
424 /// which case the location will be set to the expansion point, e.g:
425 /// \#define M 1 2
426 /// a M
427 /// If you have a range [a, 2] (where 2 came from the macro), the function
428 /// will return a range for "a M"
429 /// if you have range [a, 1], the function will fail because the range
430 /// overlaps with only a part of the macro
431 ///
432 /// -The macro is a function macro and the range can be mapped to the macro
433 /// arguments, e.g:
434 /// \#define M 1 2
435 /// \#define FM(x) x
436 /// FM(a b M)
437 /// if you have range [b, 2], the function will return the file range "b M"
438 /// inside the macro arguments.
439 /// if you have range [a, 2], the function will return the file range
440 /// "FM(a b M)" since the range includes all of the macro expansion.
441 static CharSourceRange makeFileCharRange(CharSourceRange Range,
442 const SourceManager &SM,
443 const LangOptions &LangOpts);
444
445 /// Returns a string for the source that the range encompasses.
446 static StringRef getSourceText(CharSourceRange Range,
447 const SourceManager &SM,
448 const LangOptions &LangOpts,
449 bool *Invalid = nullptr);
450
451 /// Retrieve the name of the immediate macro expansion.
452 ///
453 /// This routine starts from a source location, and finds the name of the macro
454 /// responsible for its immediate expansion. It looks through any intervening
455 /// macro argument expansions to compute this. It returns a StringRef which
456 /// refers to the SourceManager-owned buffer of the source where that macro
457 /// name is spelled. Thus, the result shouldn't out-live that SourceManager.
458 static StringRef getImmediateMacroName(SourceLocation Loc,
459 const SourceManager &SM,
460 const LangOptions &LangOpts);
461
462 /// Retrieve the name of the immediate macro expansion.
463 ///
464 /// This routine starts from a source location, and finds the name of the
465 /// macro responsible for its immediate expansion. It looks through any
466 /// intervening macro argument expansions to compute this. It returns a
467 /// StringRef which refers to the SourceManager-owned buffer of the source
468 /// where that macro name is spelled. Thus, the result shouldn't out-live
469 /// that SourceManager.
470 ///
471 /// This differs from Lexer::getImmediateMacroName in that any macro argument
472 /// location will result in the topmost function macro that accepted it.
473 /// e.g.
474 /// \code
475 /// MAC1( MAC2(foo) )
476 /// \endcode
477 /// for location of 'foo' token, this function will return "MAC1" while
478 /// Lexer::getImmediateMacroName will return "MAC2".
479 static StringRef getImmediateMacroNameForDiagnostics(
480 SourceLocation Loc, const SourceManager &SM, const LangOptions &LangOpts);
481
482 /// Compute the preamble of the given file.
483 ///
484 /// The preamble of a file contains the initial comments, include directives,
485 /// and other preprocessor directives that occur before the code in this
486 /// particular file actually begins. The preamble of the main source file is
487 /// a potential prefix header.
488 ///
489 /// \param Buffer The memory buffer containing the file's contents.
490 ///
491 /// \param MaxLines If non-zero, restrict the length of the preamble
492 /// to fewer than this number of lines.
493 ///
494 /// \returns The offset into the file where the preamble ends and the rest
495 /// of the file begins along with a boolean value indicating whether
496 /// the preamble ends at the beginning of a new line.
497 static PreambleBounds ComputePreamble(StringRef Buffer,
498 const LangOptions &LangOpts,
499 unsigned MaxLines = 0);
500
501 /// Finds the token that comes right after the given location.
502 ///
503 /// Returns the next token, or none if the location is inside a macro.
504 static Optional<Token> findNextToken(SourceLocation Loc,
505 const SourceManager &SM,
506 const LangOptions &LangOpts);
507
508 /// Checks that the given token is the first token that occurs after
509 /// the given location (this excludes comments and whitespace). Returns the
510 /// location immediately after the specified token. If the token is not found
511 /// or the location is inside a macro, the returned source location will be
512 /// invalid.
513 static SourceLocation findLocationAfterToken(SourceLocation loc,
514 tok::TokenKind TKind,
515 const SourceManager &SM,
516 const LangOptions &LangOpts,
517 bool SkipTrailingWhitespaceAndNewLine);
518
519 /// Returns true if the given character could appear in an identifier.
520 static bool isIdentifierBodyChar(char c, const LangOptions &LangOpts);
521
522 /// Checks whether new line pointed by Str is preceded by escape
523 /// sequence.
524 static bool isNewLineEscaped(const char *BufferStart, const char *Str);
525
526 /// getCharAndSizeNoWarn - Like the getCharAndSize method, but does not ever
527 /// emit a warning.
528 static inline char getCharAndSizeNoWarn(const char *Ptr, unsigned &Size,
529 const LangOptions &LangOpts) {
530 // If this is not a trigraph and not a UCN or escaped newline, return
531 // quickly.
532 if (isObviouslySimpleCharacter(Ptr[0])) {
533 Size = 1;
534 return *Ptr;
535 }
536
537 Size = 0;
538 return getCharAndSizeSlowNoWarn(Ptr, Size, LangOpts);
539 }
540
541 /// Returns the leading whitespace for line that corresponds to the given
542 /// location \p Loc.
543 static StringRef getIndentationForLine(SourceLocation Loc,
544 const SourceManager &SM);
545
546private:
547 //===--------------------------------------------------------------------===//
548 // Internal implementation interfaces.
549
550 /// LexTokenInternal - Internal interface to lex a preprocessing token. Called
551 /// by Lex.
552 ///
553 bool LexTokenInternal(Token &Result, bool TokAtPhysicalStartOfLine);
554
555 bool CheckUnicodeWhitespace(Token &Result, uint32_t C, const char *CurPtr);
556
557 /// Given that a token begins with the Unicode character \p C, figure out
558 /// what kind of token it is and dispatch to the appropriate lexing helper
559 /// function.
560 bool LexUnicode(Token &Result, uint32_t C, const char *CurPtr);
561
562 /// FormTokenWithChars - When we lex a token, we have identified a span
563 /// starting at BufferPtr, going to TokEnd that forms the token. This method
564 /// takes that range and assigns it to the token as its location and size. In
565 /// addition, since tokens cannot overlap, this also updates BufferPtr to be
566 /// TokEnd.
567 void FormTokenWithChars(Token &Result, const char *TokEnd,
568 tok::TokenKind Kind) {
569 unsigned TokLen = TokEnd-BufferPtr;
570 Result.setLength(TokLen);
571 Result.setLocation(getSourceLocation(BufferPtr, TokLen));
572 Result.setKind(Kind);
573 BufferPtr = TokEnd;
574 }
575
576 /// isNextPPTokenLParen - Return 1 if the next unexpanded token will return a
577 /// tok::l_paren token, 0 if it is something else and 2 if there are no more
578 /// tokens in the buffer controlled by this lexer.
579 unsigned isNextPPTokenLParen();
580
581 //===--------------------------------------------------------------------===//
582 // Lexer character reading interfaces.
583
584 // This lexer is built on two interfaces for reading characters, both of which
585 // automatically provide phase 1/2 translation. getAndAdvanceChar is used
586 // when we know that we will be reading a character from the input buffer and
587 // that this character will be part of the result token. This occurs in (f.e.)
588 // string processing, because we know we need to read until we find the
589 // closing '"' character.
590 //
591 // The second interface is the combination of getCharAndSize with
592 // ConsumeChar. getCharAndSize reads a phase 1/2 translated character,
593 // returning it and its size. If the lexer decides that this character is
594 // part of the current token, it calls ConsumeChar on it. This two stage
595 // approach allows us to emit diagnostics for characters (e.g. warnings about
596 // trigraphs), knowing that they only are emitted if the character is
597 // consumed.
598
/// isObviouslySimpleCharacter - Return true if the specified character is
/// obviously the same in translation phase 1 and translation phase 3. This
/// can return false for characters that end up being the same, but it will
/// never return true for something that needs to be mapped ('?' may begin a
/// trigraph; '\\' may begin an escaped newline or UCN).
static bool isObviouslySimpleCharacter(char C) {
  return !(C == '?' || C == '\\');
}
606
607 /// getAndAdvanceChar - Read a single 'character' from the specified buffer,
608 /// advance over it, and return it. This is tricky in several cases. Here we
609 /// just handle the trivial case and fall-back to the non-inlined
610 /// getCharAndSizeSlow method to handle the hard case.
611 inline char getAndAdvanceChar(const char *&Ptr, Token &Tok) {
612 // If this is not a trigraph and not a UCN or escaped newline, return
613 // quickly.
614 if (isObviouslySimpleCharacter(Ptr[0])) return *Ptr++;
6
Taking false branch
615
616 unsigned Size = 0;
617 char C = getCharAndSizeSlow(Ptr, Size, &Tok);
7
Calling 'Lexer::getCharAndSizeSlow'
618 Ptr += Size;
619 return C;
620 }
621
622 /// ConsumeChar - When a character (identified by getCharAndSize) is consumed
623 /// and added to a given token, check to see if there are diagnostics that
624 /// need to be emitted or flags that need to be set on the token. If so, do
625 /// it.
626 const char *ConsumeChar(const char *Ptr, unsigned Size, Token &Tok) {
627 // Normal case, we consumed exactly one token. Just return it.
628 if (Size == 1)
629 return Ptr+Size;
630
631 // Otherwise, re-lex the character with a current token, allowing
632 // diagnostics to be emitted and flags to be set.
633 Size = 0;
634 getCharAndSizeSlow(Ptr, Size, &Tok);
635 return Ptr+Size;
636 }
637
638 /// getCharAndSize - Peek a single 'character' from the specified buffer,
639 /// get its size, and return it. This is tricky in several cases. Here we
640 /// just handle the trivial case and fall-back to the non-inlined
641 /// getCharAndSizeSlow method to handle the hard case.
642 inline char getCharAndSize(const char *Ptr, unsigned &Size) {
643 // If this is not a trigraph and not a UCN or escaped newline, return
644 // quickly.
645 if (isObviouslySimpleCharacter(Ptr[0])) {
646 Size = 1;
647 return *Ptr;
648 }
649
650 Size = 0;
651 return getCharAndSizeSlow(Ptr, Size);
652 }
653
654 /// getCharAndSizeSlow - Handle the slow/uncommon case of the getCharAndSize
655 /// method.
656 char getCharAndSizeSlow(const char *Ptr, unsigned &Size,
657 Token *Tok = nullptr);
658
659 /// getEscapedNewLineSize - Return the size of the specified escaped newline,
660 /// or 0 if it is not an escaped newline. P[-1] is known to be a "\" on entry
661 /// to this function.
662 static unsigned getEscapedNewLineSize(const char *P);
663
664 /// SkipEscapedNewLines - If P points to an escaped newline (or a series of
665 /// them), skip over them and return the first non-escaped-newline found,
666 /// otherwise return P.
667 static const char *SkipEscapedNewLines(const char *P);
668
669 /// getCharAndSizeSlowNoWarn - Same as getCharAndSizeSlow, but never emits a
670 /// diagnostic.
671 static char getCharAndSizeSlowNoWarn(const char *Ptr, unsigned &Size,
672 const LangOptions &LangOpts);
673
674 //===--------------------------------------------------------------------===//
675 // Other lexer functions.
676
677 void SetByteOffset(unsigned Offset, bool StartOfLine);
678
679 void PropagateLineStartLeadingSpaceInfo(Token &Result);
680
681 const char *LexUDSuffix(Token &Result, const char *CurPtr,
682 bool IsStringLiteral);
683
684 // Helper functions to lex the remainder of a token of the specific type.
685 bool LexIdentifier (Token &Result, const char *CurPtr);
686 bool LexNumericConstant (Token &Result, const char *CurPtr);
687 bool LexStringLiteral (Token &Result, const char *CurPtr,
688 tok::TokenKind Kind);
689 bool LexRawStringLiteral (Token &Result, const char *CurPtr,
690 tok::TokenKind Kind);
691 bool LexAngledStringLiteral(Token &Result, const char *CurPtr);
692 bool LexCharConstant (Token &Result, const char *CurPtr,
693 tok::TokenKind Kind);
694 bool LexEndOfFile (Token &Result, const char *CurPtr);
695 bool SkipWhitespace (Token &Result, const char *CurPtr,
696 bool &TokAtPhysicalStartOfLine);
697 bool SkipLineComment (Token &Result, const char *CurPtr,
698 bool &TokAtPhysicalStartOfLine);
699 bool SkipBlockComment (Token &Result, const char *CurPtr,
700 bool &TokAtPhysicalStartOfLine);
701 bool SaveLineComment (Token &Result, const char *CurPtr);
702
703 bool IsStartOfConflictMarker(const char *CurPtr);
704 bool HandleEndOfConflictMarker(const char *CurPtr);
705
706 bool lexEditorPlaceholder(Token &Result, const char *CurPtr);
707
708 bool isCodeCompletionPoint(const char *CurPtr) const;
709 void cutOffLexing() { BufferPtr = BufferEnd; }
710
711 bool isHexaLiteral(const char *Start, const LangOptions &LangOpts);
712
713 void codeCompleteIncludedFile(const char *PathStart,
714 const char *CompletionPoint, bool IsAngled);
715
716 /// Read a universal character name.
717 ///
718 /// \param StartPtr The position in the source buffer after the initial '\'.
719 /// If the UCN is syntactically well-formed (but not
720 /// necessarily valid), this parameter will be updated to
721 /// point to the character after the UCN.
722 /// \param SlashLoc The position in the source buffer of the '\'.
723 /// \param Result The token being formed. Pass \c nullptr to suppress
724 /// diagnostics and handle token formation in the caller.
725 ///
726 /// \return The Unicode codepoint specified by the UCN, or 0 if the UCN is
727 /// invalid.
728 uint32_t tryReadUCN(const char *&StartPtr, const char *SlashLoc, Token *Result);
729
730 /// Try to consume a UCN as part of an identifier at the current
731 /// location.
732 /// \param CurPtr Initially points to the range of characters in the source
733 /// buffer containing the '\'. Updated to point past the end of
734 /// the UCN on success.
735 /// \param Size The number of characters occupied by the '\' (including
736 /// trigraphs and escaped newlines).
737 /// \param Result The token being produced. Marked as containing a UCN on
738 /// success.
739 /// \return \c true if a UCN was lexed and it produced an acceptable
740 /// identifier character, \c false otherwise.
741 bool tryConsumeIdentifierUCN(const char *&CurPtr, unsigned Size,
742 Token &Result);
743
744 /// Try to consume an identifier character encoded in UTF-8.
745 /// \param CurPtr Points to the start of the (potential) UTF-8 code unit
746 /// sequence. On success, updated to point past the end of it.
747 /// \return \c true if a UTF-8 sequence mapping to an acceptable identifier
748 /// character was lexed, \c false otherwise.
749 bool tryConsumeIdentifierUTF8Char(const char *&CurPtr);
750};
751
752} // namespace clang
753
754#endif // LLVM_CLANG_LEX_LEXER_H

/build/llvm-toolchain-snapshot-9~svn361465/tools/clang/include/clang/Lex/Token.h

1//===--- Token.h - Token interface ------------------------------*- C++ -*-===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file defines the Token interface.
10//
11//===----------------------------------------------------------------------===//
12
13#ifndef LLVM_CLANG_LEX_TOKEN_H
14#define LLVM_CLANG_LEX_TOKEN_H
15
16#include "clang/Basic/SourceLocation.h"
17#include "clang/Basic/TokenKinds.h"
18#include "llvm/ADT/StringRef.h"
19#include <cassert>
20
21namespace clang {
22
23class IdentifierInfo;
24
25/// Token - This structure provides full information about a lexed token.
26/// It is not intended to be space efficient, it is intended to return as much
27/// information as possible about each returned token. This is expected to be
28/// compressed into a smaller form if memory footprint is important.
29///
30/// The parser can create a special "annotation token" representing a stream of
31/// tokens that were parsed and semantically resolved, e.g.: "foo::MyClass<int>"
32/// can be represented by a single typename annotation token that carries
33/// information about the SourceRange of the tokens and the type object.
34class Token {
35 /// The location of the token. This is actually a SourceLocation.
36 unsigned Loc;
37
38 // Conceptually these next two fields could be in a union. However, this
39 // causes gcc 4.2 to pessimize LexTokenInternal, a very performance critical
40 // routine. Keeping as separate members with casts until a more beautiful fix
41 // presents itself.
42
43 /// UintData - This holds either the length of the token text, when
44 /// a normal token, or the end of the SourceRange when an annotation
45 /// token.
46 unsigned UintData;
47
48 /// PtrData - This is a union of four different pointer types, which depends
49 /// on what type of token this is:
50 /// Identifiers, keywords, etc:
51 /// This is an IdentifierInfo*, which contains the uniqued identifier
52 /// spelling.
53 /// Literals: isLiteral() returns true.
54 /// This is a pointer to the start of the token in a text buffer, which
55 /// may be dirty (have trigraphs / escaped newlines).
56 /// Annotations (resolved type names, C++ scopes, etc): isAnnotation().
57 /// This is a pointer to sema-specific data for the annotation token.
58 /// Eof:
59 // This is a pointer to a Decl.
60 /// Other:
61 /// This is null.
62 void *PtrData;
63
64 /// Kind - The actual flavor of token this is.
65 tok::TokenKind Kind;
66
67 /// Flags - Bits we track about this token, members of the TokenFlags enum.
68 unsigned short Flags;
69
70public:
71 // Various flags set per token:
72 enum TokenFlags {
73 StartOfLine = 0x01, // At start of line or only after whitespace
74 // (considering the line after macro expansion).
75 LeadingSpace = 0x02, // Whitespace exists before this token (considering
76 // whitespace after macro expansion).
77 DisableExpand = 0x04, // This identifier may never be macro expanded.
78 NeedsCleaning = 0x08, // Contained an escaped newline or trigraph.
79 LeadingEmptyMacro = 0x10, // Empty macro exists before this token.
80 HasUDSuffix = 0x20, // This string or character literal has a ud-suffix.
81 HasUCN = 0x40, // This identifier contains a UCN.
82 IgnoredComma = 0x80, // This comma is not a macro argument separator (MS).
83 StringifiedInMacro = 0x100, // This string or character literal is formed by
84 // macro stringizing or charizing operator.
85 CommaAfterElided = 0x200, // The comma following this token was elided (MS).
86 IsEditorPlaceholder = 0x400, // This identifier is a placeholder.
87 IsReinjected = 0x800, // A phase 4 token that was produced before and
88 // re-added, e.g. via EnterTokenStream. Annotation
89 // tokens are *not* reinjected.
90 };
91
92 tok::TokenKind getKind() const { return Kind; }
93 void setKind(tok::TokenKind K) { Kind = K; }
94
95 /// is/isNot - Predicates to check if this token is a specific kind, as in
96 /// "if (Tok.is(tok::l_brace)) {...}".
97 bool is(tok::TokenKind K) const { return Kind == K; }
98 bool isNot(tok::TokenKind K) const { return Kind != K; }
99 bool isOneOf(tok::TokenKind K1, tok::TokenKind K2) const {
100 return is(K1) || is(K2);
101 }
102 template <typename... Ts>
103 bool isOneOf(tok::TokenKind K1, tok::TokenKind K2, Ts... Ks) const {
104 return is(K1) || isOneOf(K2, Ks...);
105 }
106
107 /// Return true if this is a raw identifier (when lexing
108 /// in raw mode) or a non-keyword identifier (when lexing in non-raw mode).
109 bool isAnyIdentifier() const {
110 return tok::isAnyIdentifier(getKind());
111 }
112
113 /// Return true if this is a "literal", like a numeric
114 /// constant, string, etc.
115 bool isLiteral() const {
116 return tok::isLiteral(getKind());
117 }
118
119 /// Return true if this is any of tok::annot_* kind tokens.
120 bool isAnnotation() const {
121 return tok::isAnnotation(getKind());
122 }
123
124 /// Return a source location identifier for the specified
125 /// offset in the current file.
126 SourceLocation getLocation() const {
127 return SourceLocation::getFromRawEncoding(Loc);
128 }
129 unsigned getLength() const {
130 assert(!isAnnotation() && "Annotation tokens have no length field")((!isAnnotation() && "Annotation tokens have no length field"
) ? static_cast<void> (0) : __assert_fail ("!isAnnotation() && \"Annotation tokens have no length field\""
, "/build/llvm-toolchain-snapshot-9~svn361465/tools/clang/include/clang/Lex/Token.h"
, 130, __PRETTY_FUNCTION__))
;
131 return UintData;
132 }
133
134 void setLocation(SourceLocation L) { Loc = L.getRawEncoding(); }
135 void setLength(unsigned Len) {
136 assert(!isAnnotation() && "Annotation tokens have no length field")((!isAnnotation() && "Annotation tokens have no length field"
) ? static_cast<void> (0) : __assert_fail ("!isAnnotation() && \"Annotation tokens have no length field\""
, "/build/llvm-toolchain-snapshot-9~svn361465/tools/clang/include/clang/Lex/Token.h"
, 136, __PRETTY_FUNCTION__))
;
137 UintData = Len;
138 }
139
140 SourceLocation getAnnotationEndLoc() const {
141 assert(isAnnotation() && "Used AnnotEndLocID on non-annotation token")((isAnnotation() && "Used AnnotEndLocID on non-annotation token"
) ? static_cast<void> (0) : __assert_fail ("isAnnotation() && \"Used AnnotEndLocID on non-annotation token\""
, "/build/llvm-toolchain-snapshot-9~svn361465/tools/clang/include/clang/Lex/Token.h"
, 141, __PRETTY_FUNCTION__))
;
142 return SourceLocation::getFromRawEncoding(UintData ? UintData : Loc);
143 }
144 void setAnnotationEndLoc(SourceLocation L) {
145 assert(isAnnotation() && "Used AnnotEndLocID on non-annotation token")((isAnnotation() && "Used AnnotEndLocID on non-annotation token"
) ? static_cast<void> (0) : __assert_fail ("isAnnotation() && \"Used AnnotEndLocID on non-annotation token\""
, "/build/llvm-toolchain-snapshot-9~svn361465/tools/clang/include/clang/Lex/Token.h"
, 145, __PRETTY_FUNCTION__))
;
146 UintData = L.getRawEncoding();
147 }
148
149 SourceLocation getLastLoc() const {
150 return isAnnotation() ? getAnnotationEndLoc() : getLocation();
151 }
152
153 SourceLocation getEndLoc() const {
154 return isAnnotation() ? getAnnotationEndLoc()
155 : getLocation().getLocWithOffset(getLength());
156 }
157
158 /// SourceRange of the group of tokens that this annotation token
159 /// represents.
160 SourceRange getAnnotationRange() const {
161 return SourceRange(getLocation(), getAnnotationEndLoc());
162 }
163 void setAnnotationRange(SourceRange R) {
164 setLocation(R.getBegin());
165 setAnnotationEndLoc(R.getEnd());
166 }
167
168 const char *getName() const { return tok::getTokenName(Kind); }
169
170 /// Reset all flags to cleared.
171 void startToken() {
172 Kind = tok::unknown;
173 Flags = 0;
174 PtrData = nullptr;
175 UintData = 0;
176 Loc = SourceLocation().getRawEncoding();
177 }
178
179 IdentifierInfo *getIdentifierInfo() const {
180 assert(isNot(tok::raw_identifier) &&((isNot(tok::raw_identifier) && "getIdentifierInfo() on a tok::raw_identifier token!"
) ? static_cast<void> (0) : __assert_fail ("isNot(tok::raw_identifier) && \"getIdentifierInfo() on a tok::raw_identifier token!\""
, "/build/llvm-toolchain-snapshot-9~svn361465/tools/clang/include/clang/Lex/Token.h"
, 181, __PRETTY_FUNCTION__))
181 "getIdentifierInfo() on a tok::raw_identifier token!")((isNot(tok::raw_identifier) && "getIdentifierInfo() on a tok::raw_identifier token!"
) ? static_cast<void> (0) : __assert_fail ("isNot(tok::raw_identifier) && \"getIdentifierInfo() on a tok::raw_identifier token!\""
, "/build/llvm-toolchain-snapshot-9~svn361465/tools/clang/include/clang/Lex/Token.h"
, 181, __PRETTY_FUNCTION__))
;
182 assert(!isAnnotation() &&((!isAnnotation() && "getIdentifierInfo() on an annotation token!"
) ? static_cast<void> (0) : __assert_fail ("!isAnnotation() && \"getIdentifierInfo() on an annotation token!\""
, "/build/llvm-toolchain-snapshot-9~svn361465/tools/clang/include/clang/Lex/Token.h"
, 183, __PRETTY_FUNCTION__))
183 "getIdentifierInfo() on an annotation token!")((!isAnnotation() && "getIdentifierInfo() on an annotation token!"
) ? static_cast<void> (0) : __assert_fail ("!isAnnotation() && \"getIdentifierInfo() on an annotation token!\""
, "/build/llvm-toolchain-snapshot-9~svn361465/tools/clang/include/clang/Lex/Token.h"
, 183, __PRETTY_FUNCTION__))
;
184 if (isLiteral()) return nullptr;
185 if (is(tok::eof)) return nullptr;
186 return (IdentifierInfo*) PtrData;
187 }
188 void setIdentifierInfo(IdentifierInfo *II) {
189 PtrData = (void*) II;
190 }
191
192 const void *getEofData() const {
193 assert(is(tok::eof))((is(tok::eof)) ? static_cast<void> (0) : __assert_fail
("is(tok::eof)", "/build/llvm-toolchain-snapshot-9~svn361465/tools/clang/include/clang/Lex/Token.h"
, 193, __PRETTY_FUNCTION__))
;
194 return reinterpret_cast<const void *>(PtrData);
195 }
196 void setEofData(const void *D) {
197 assert(is(tok::eof))((is(tok::eof)) ? static_cast<void> (0) : __assert_fail
("is(tok::eof)", "/build/llvm-toolchain-snapshot-9~svn361465/tools/clang/include/clang/Lex/Token.h"
, 197, __PRETTY_FUNCTION__))
;
198 assert(!PtrData)((!PtrData) ? static_cast<void> (0) : __assert_fail ("!PtrData"
, "/build/llvm-toolchain-snapshot-9~svn361465/tools/clang/include/clang/Lex/Token.h"
, 198, __PRETTY_FUNCTION__))
;
199 PtrData = const_cast<void *>(D);
200 }
201
202 /// getRawIdentifier - For a raw identifier token (i.e., an identifier
203 /// lexed in raw mode), returns a reference to the text substring in the
204 /// buffer if known.
205 StringRef getRawIdentifier() const {
206 assert(is(tok::raw_identifier))((is(tok::raw_identifier)) ? static_cast<void> (0) : __assert_fail
("is(tok::raw_identifier)", "/build/llvm-toolchain-snapshot-9~svn361465/tools/clang/include/clang/Lex/Token.h"
, 206, __PRETTY_FUNCTION__))
;
207 return StringRef(reinterpret_cast<const char *>(PtrData), getLength());
208 }
209 void setRawIdentifierData(const char *Ptr) {
210 assert(is(tok::raw_identifier))((is(tok::raw_identifier)) ? static_cast<void> (0) : __assert_fail
("is(tok::raw_identifier)", "/build/llvm-toolchain-snapshot-9~svn361465/tools/clang/include/clang/Lex/Token.h"
, 210, __PRETTY_FUNCTION__))
;
211 PtrData = const_cast<char*>(Ptr);
212 }
213
214 /// getLiteralData - For a literal token (numeric constant, string, etc), this
215 /// returns a pointer to the start of it in the text buffer if known, null
216 /// otherwise.
217 const char *getLiteralData() const {
218 assert(isLiteral() && "Cannot get literal data of non-literal")((isLiteral() && "Cannot get literal data of non-literal"
) ? static_cast<void> (0) : __assert_fail ("isLiteral() && \"Cannot get literal data of non-literal\""
, "/build/llvm-toolchain-snapshot-9~svn361465/tools/clang/include/clang/Lex/Token.h"
, 218, __PRETTY_FUNCTION__))
;
219 return reinterpret_cast<const char*>(PtrData);
220 }
221 void setLiteralData(const char *Ptr) {
222 assert(isLiteral() && "Cannot set literal data of non-literal")((isLiteral() && "Cannot set literal data of non-literal"
) ? static_cast<void> (0) : __assert_fail ("isLiteral() && \"Cannot set literal data of non-literal\""
, "/build/llvm-toolchain-snapshot-9~svn361465/tools/clang/include/clang/Lex/Token.h"
, 222, __PRETTY_FUNCTION__))
;
223 PtrData = const_cast<char*>(Ptr);
224 }
225
226 void *getAnnotationValue() const {
227 assert(isAnnotation() && "Used AnnotVal on non-annotation token")((isAnnotation() && "Used AnnotVal on non-annotation token"
) ? static_cast<void> (0) : __assert_fail ("isAnnotation() && \"Used AnnotVal on non-annotation token\""
, "/build/llvm-toolchain-snapshot-9~svn361465/tools/clang/include/clang/Lex/Token.h"
, 227, __PRETTY_FUNCTION__))
;
228 return PtrData;
229 }
230 void setAnnotationValue(void *val) {
231 assert(isAnnotation() && "Used AnnotVal on non-annotation token")((isAnnotation() && "Used AnnotVal on non-annotation token"
) ? static_cast<void> (0) : __assert_fail ("isAnnotation() && \"Used AnnotVal on non-annotation token\""
, "/build/llvm-toolchain-snapshot-9~svn361465/tools/clang/include/clang/Lex/Token.h"
, 231, __PRETTY_FUNCTION__))
;
232 PtrData = val;
233 }
234
235 /// Set the specified flag.
236 void setFlag(TokenFlags Flag) {
237 Flags |= Flag;
15
The left expression of the compound assignment is an uninitialized value. The computed value will also be garbage
238 }
239
240 /// Get the specified flag.
241 bool getFlag(TokenFlags Flag) const {
242 return (Flags & Flag) != 0;
243 }
244
245 /// Unset the specified flag.
246 void clearFlag(TokenFlags Flag) {
247 Flags &= ~Flag;
248 }
249
250 /// Return the internal represtation of the flags.
251 ///
252 /// This is only intended for low-level operations such as writing tokens to
253 /// disk.
254 unsigned getFlags() const {
255 return Flags;
256 }
257
258 /// Set a flag to either true or false.
259 void setFlagValue(TokenFlags Flag, bool Val) {
260 if (Val)
261 setFlag(Flag);
262 else
263 clearFlag(Flag);
264 }
265
266 /// isAtStartOfLine - Return true if this token is at the start of a line.
267 ///
268 bool isAtStartOfLine() const { return getFlag(StartOfLine); }
269
270 /// Return true if this token has whitespace before it.
271 ///
272 bool hasLeadingSpace() const { return getFlag(LeadingSpace); }
273
274 /// Return true if this identifier token should never
275 /// be expanded in the future, due to C99 6.10.3.4p2.
276 bool isExpandDisabled() const { return getFlag(DisableExpand); }
277
278 /// Return true if we have an ObjC keyword identifier.
279 bool isObjCAtKeyword(tok::ObjCKeywordKind objcKey) const;
280
281 /// Return the ObjC keyword kind.
282 tok::ObjCKeywordKind getObjCKeywordID() const;
283
284 /// Return true if this token has trigraphs or escaped newlines in it.
285 bool needsCleaning() const { return getFlag(NeedsCleaning); }
286
287 /// Return true if this token has an empty macro before it.
288 ///
289 bool hasLeadingEmptyMacro() const { return getFlag(LeadingEmptyMacro); }
290
291 /// Return true if this token is a string or character literal which
292 /// has a ud-suffix.
293 bool hasUDSuffix() const { return getFlag(HasUDSuffix); }
294
295 /// Returns true if this token contains a universal character name.
296 bool hasUCN() const { return getFlag(HasUCN); }
297
298 /// Returns true if this token is formed by macro by stringizing or charizing
299 /// operator.
300 bool stringifiedInMacro() const { return getFlag(StringifiedInMacro); }
301
302 /// Returns true if the comma after this token was elided.
303 bool commaAfterElided() const { return getFlag(CommaAfterElided); }
304
305 /// Returns true if this token is an editor placeholder.
306 ///
307 /// Editor placeholders are produced by the code-completion engine and are
308 /// represented as characters between '<#' and '#>' in the source code. The
309 /// lexer uses identifier tokens to represent placeholders.
310 bool isEditorPlaceholder() const { return getFlag(IsEditorPlaceholder); }
311};
312
/// Information about the conditional stack (\#if directives)
/// currently active.
///
/// One of these is pushed per open preprocessor conditional; the booleans
/// track which directives have been seen so far inside that region.
/// NOTE(review): fields carry no in-class initializers; the code that pushes
/// an entry is expected to fill in all four members — confirm at call sites.
struct PPConditionalInfo {
  /// Location where the conditional started.
  SourceLocation IfLoc;

  /// True if this was contained in a skipping directive, e.g.,
  /// in a "\#if 0" block.
  bool WasSkipping;

  /// True if we have emitted tokens already, and now we're in
  /// an \#else block or something. Only useful in Skipping blocks.
  bool FoundNonSkip;

  /// True if we've seen a \#else in this block. If so,
  /// \#elif/\#else directives are not allowed.
  bool FoundElse;
};
331
332} // end namespace clang
333
334#endif // LLVM_CLANG_LEX_TOKEN_H