Bug Summary

File: build/llvm-toolchain-snapshot-16~++20220904122748+c444af1c20b3/clang/lib/Lex/Lexer.cpp
Warning: line 1168, column 10 — called C++ object pointer is null

Annotated Source Code

Press '?' to see keyboard shortcuts

clang -cc1 -cc1 -triple x86_64-pc-linux-gnu -analyze -disable-free -clear-ast-before-backend -disable-llvm-verifier -discard-value-names -main-file-name Lexer.cpp -analyzer-checker=core -analyzer-checker=apiModeling -analyzer-checker=unix -analyzer-checker=deadcode -analyzer-checker=cplusplus -analyzer-checker=security.insecureAPI.UncheckedReturn -analyzer-checker=security.insecureAPI.getpw -analyzer-checker=security.insecureAPI.gets -analyzer-checker=security.insecureAPI.mktemp -analyzer-checker=security.insecureAPI.mkstemp -analyzer-checker=security.insecureAPI.vfork -analyzer-checker=nullability.NullPassedToNonnull -analyzer-checker=nullability.NullReturnedFromNonnull -analyzer-output plist -w -setup-static-analyzer -analyzer-config-compatibility-mode=true -mrelocation-model pic -pic-level 2 -mframe-pointer=none -relaxed-aliasing -fmath-errno -ffp-contract=on -fno-rounding-math -mconstructor-aliases -funwind-tables=2 -target-cpu x86-64 -tune-cpu generic -debugger-tuning=gdb -ffunction-sections -fdata-sections -fcoverage-compilation-dir=/build/llvm-toolchain-snapshot-16~++20220904122748+c444af1c20b3/build-llvm -resource-dir /usr/lib/llvm-16/lib/clang/16.0.0 -D _DEBUG -D _GNU_SOURCE -D __STDC_CONSTANT_MACROS -D __STDC_FORMAT_MACROS -D __STDC_LIMIT_MACROS -I tools/clang/lib/Lex -I /build/llvm-toolchain-snapshot-16~++20220904122748+c444af1c20b3/clang/lib/Lex -I /build/llvm-toolchain-snapshot-16~++20220904122748+c444af1c20b3/clang/include -I tools/clang/include -I include -I /build/llvm-toolchain-snapshot-16~++20220904122748+c444af1c20b3/llvm/include -D _FORTIFY_SOURCE=2 -D NDEBUG -U NDEBUG -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/10/../../../../include/c++/10 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/10/../../../../include/x86_64-linux-gnu/c++/10 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/10/../../../../include/c++/10/backward -internal-isystem /usr/lib/llvm-16/lib/clang/16.0.0/include -internal-isystem /usr/local/include -internal-isystem 
/usr/lib/gcc/x86_64-linux-gnu/10/../../../../x86_64-linux-gnu/include -internal-externc-isystem /usr/include/x86_64-linux-gnu -internal-externc-isystem /include -internal-externc-isystem /usr/include -fmacro-prefix-map=/build/llvm-toolchain-snapshot-16~++20220904122748+c444af1c20b3/build-llvm=build-llvm -fmacro-prefix-map=/build/llvm-toolchain-snapshot-16~++20220904122748+c444af1c20b3/= -fcoverage-prefix-map=/build/llvm-toolchain-snapshot-16~++20220904122748+c444af1c20b3/build-llvm=build-llvm -fcoverage-prefix-map=/build/llvm-toolchain-snapshot-16~++20220904122748+c444af1c20b3/= -O3 -Wno-unused-command-line-argument -Wno-unused-parameter -Wwrite-strings -Wno-missing-field-initializers -Wno-long-long -Wno-maybe-uninitialized -Wno-class-memaccess -Wno-redundant-move -Wno-pessimizing-move -Wno-noexcept-type -Wno-comment -Wno-misleading-indentation -std=c++17 -fdeprecated-macro -fdebug-compilation-dir=/build/llvm-toolchain-snapshot-16~++20220904122748+c444af1c20b3/build-llvm -fdebug-prefix-map=/build/llvm-toolchain-snapshot-16~++20220904122748+c444af1c20b3/build-llvm=build-llvm -fdebug-prefix-map=/build/llvm-toolchain-snapshot-16~++20220904122748+c444af1c20b3/= -ferror-limit 19 -fvisibility-inlines-hidden -stack-protector 2 -fgnuc-version=4.2.1 -fcolor-diagnostics -vectorize-loops -vectorize-slp -analyzer-output=html -analyzer-config stable-report-filename=true -faddrsig -D__GCC_HAVE_DWARF2_CFI_ASM=1 -o /tmp/scan-build-2022-09-04-125545-48738-1 -x c++ /build/llvm-toolchain-snapshot-16~++20220904122748+c444af1c20b3/clang/lib/Lex/Lexer.cpp

/build/llvm-toolchain-snapshot-16~++20220904122748+c444af1c20b3/clang/lib/Lex/Lexer.cpp

1//===- Lexer.cpp - C Language Family Lexer --------------------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file implements the Lexer and Token interfaces.
10//
11//===----------------------------------------------------------------------===//
12
13#include "clang/Lex/Lexer.h"
14#include "UnicodeCharSets.h"
15#include "clang/Basic/CharInfo.h"
16#include "clang/Basic/Diagnostic.h"
17#include "clang/Basic/IdentifierTable.h"
18#include "clang/Basic/LLVM.h"
19#include "clang/Basic/LangOptions.h"
20#include "clang/Basic/SourceLocation.h"
21#include "clang/Basic/SourceManager.h"
22#include "clang/Basic/TokenKinds.h"
23#include "clang/Lex/LexDiagnostic.h"
24#include "clang/Lex/LiteralSupport.h"
25#include "clang/Lex/MultipleIncludeOpt.h"
26#include "clang/Lex/Preprocessor.h"
27#include "clang/Lex/PreprocessorOptions.h"
28#include "clang/Lex/Token.h"
29#include "llvm/ADT/None.h"
30#include "llvm/ADT/Optional.h"
31#include "llvm/ADT/STLExtras.h"
32#include "llvm/ADT/StringExtras.h"
33#include "llvm/ADT/StringRef.h"
34#include "llvm/ADT/StringSwitch.h"
35#include "llvm/Support/Compiler.h"
36#include "llvm/Support/ConvertUTF.h"
37#include "llvm/Support/MathExtras.h"
38#include "llvm/Support/MemoryBufferRef.h"
39#include "llvm/Support/NativeFormatting.h"
40#include "llvm/Support/Unicode.h"
41#include "llvm/Support/UnicodeCharRanges.h"
42#include <algorithm>
43#include <cassert>
44#include <cstddef>
45#include <cstdint>
46#include <cstring>
47#include <string>
48#include <tuple>
49#include <utility>
50
51using namespace clang;
52
53//===----------------------------------------------------------------------===//
54// Token Class Implementation
55//===----------------------------------------------------------------------===//
56
57/// isObjCAtKeyword - Return true if we have an ObjC keyword identifier.
58bool Token::isObjCAtKeyword(tok::ObjCKeywordKind objcKey) const {
59 if (isAnnotation())
60 return false;
61 if (IdentifierInfo *II = getIdentifierInfo())
62 return II->getObjCKeywordID() == objcKey;
63 return false;
64}
65
66/// getObjCKeywordID - Return the ObjC keyword kind.
67tok::ObjCKeywordKind Token::getObjCKeywordID() const {
68 if (isAnnotation())
69 return tok::objc_not_keyword;
70 IdentifierInfo *specId = getIdentifierInfo();
71 return specId ? specId->getObjCKeywordID() : tok::objc_not_keyword;
72}
73
74//===----------------------------------------------------------------------===//
75// Lexer Class Implementation
76//===----------------------------------------------------------------------===//
77
78void Lexer::anchor() {}
79
80void Lexer::InitLexer(const char *BufStart, const char *BufPtr,
81 const char *BufEnd) {
82 BufferStart = BufStart;
83 BufferPtr = BufPtr;
84 BufferEnd = BufEnd;
85
86 assert(BufEnd[0] == 0 &&(static_cast <bool> (BufEnd[0] == 0 && "We assume that the input buffer has a null character at the end"
" to simplify lexing!") ? void (0) : __assert_fail ("BufEnd[0] == 0 && \"We assume that the input buffer has a null character at the end\" \" to simplify lexing!\""
, "clang/lib/Lex/Lexer.cpp", 88, __extension__ __PRETTY_FUNCTION__
))
87 "We assume that the input buffer has a null character at the end"(static_cast <bool> (BufEnd[0] == 0 && "We assume that the input buffer has a null character at the end"
" to simplify lexing!") ? void (0) : __assert_fail ("BufEnd[0] == 0 && \"We assume that the input buffer has a null character at the end\" \" to simplify lexing!\""
, "clang/lib/Lex/Lexer.cpp", 88, __extension__ __PRETTY_FUNCTION__
))
88 " to simplify lexing!")(static_cast <bool> (BufEnd[0] == 0 && "We assume that the input buffer has a null character at the end"
" to simplify lexing!") ? void (0) : __assert_fail ("BufEnd[0] == 0 && \"We assume that the input buffer has a null character at the end\" \" to simplify lexing!\""
, "clang/lib/Lex/Lexer.cpp", 88, __extension__ __PRETTY_FUNCTION__
))
;
89
90 // Check whether we have a BOM in the beginning of the buffer. If yes - act
91 // accordingly. Right now we support only UTF-8 with and without BOM, so, just
92 // skip the UTF-8 BOM if it's present.
93 if (BufferStart == BufferPtr) {
94 // Determine the size of the BOM.
95 StringRef Buf(BufferStart, BufferEnd - BufferStart);
96 size_t BOMLength = llvm::StringSwitch<size_t>(Buf)
97 .StartsWith("\xEF\xBB\xBF", 3) // UTF-8 BOM
98 .Default(0);
99
100 // Skip the BOM.
101 BufferPtr += BOMLength;
102 }
103
104 Is_PragmaLexer = false;
105 CurrentConflictMarkerState = CMK_None;
106
107 // Start of the file is a start of line.
108 IsAtStartOfLine = true;
109 IsAtPhysicalStartOfLine = true;
110
111 HasLeadingSpace = false;
112 HasLeadingEmptyMacro = false;
113
114 // We are not after parsing a #.
115 ParsingPreprocessorDirective = false;
116
117 // We are not after parsing #include.
118 ParsingFilename = false;
119
120 // We are not in raw mode. Raw mode disables diagnostics and interpretation
121 // of tokens (e.g. identifiers, thus disabling macro expansion). It is used
122 // to quickly lex the tokens of the buffer, e.g. when handling a "#if 0" block
123 // or otherwise skipping over tokens.
124 LexingRawMode = false;
125
126 // Default to not keeping comments.
127 ExtendedTokenMode = 0;
128
129 NewLinePtr = nullptr;
130}
131
132/// Lexer constructor - Create a new lexer object for the specified buffer
133/// with the specified preprocessor managing the lexing process. This lexer
134/// assumes that the associated file buffer and Preprocessor objects will
135/// outlive it, so it doesn't take ownership of either of them.
136Lexer::Lexer(FileID FID, const llvm::MemoryBufferRef &InputFile,
137 Preprocessor &PP, bool IsFirstIncludeOfFile)
138 : PreprocessorLexer(&PP, FID),
139 FileLoc(PP.getSourceManager().getLocForStartOfFile(FID)),
140 LangOpts(PP.getLangOpts()), LineComment(LangOpts.LineComment),
141 IsFirstTimeLexingFile(IsFirstIncludeOfFile) {
  // Lex the entire buffer starting from its beginning; InitLexer also skips
  // over a UTF-8 BOM if one is present.
142 InitLexer(InputFile.getBufferStart(), InputFile.getBufferStart(),
143 InputFile.getBufferEnd());
144
  // Adopt the preprocessor's comment-retention / whitespace-keeping settings.
145 resetExtendedTokenMode();
146}
147
148/// Lexer constructor - Create a new raw lexer object. This object is only
149/// suitable for calls to 'LexFromRawLexer'. This lexer assumes that the text
150/// range will outlive it, so it doesn't take ownership of it.
151Lexer::Lexer(SourceLocation fileloc, const LangOptions &langOpts,
152 const char *BufStart, const char *BufPtr, const char *BufEnd,
153 bool IsFirstIncludeOfFile)
154 : FileLoc(fileloc), LangOpts(langOpts), LineComment(LangOpts.LineComment),
155 IsFirstTimeLexingFile(IsFirstIncludeOfFile) {
156 InitLexer(BufStart, BufPtr, BufEnd);
157
  // No Preprocessor is attached to this lexer, so diagnostics and macro
  // expansion are unavailable; raw mode below reflects that.
158 // We *are* in raw mode.
159 LexingRawMode = true;
160}
161
162/// Lexer constructor - Create a new raw lexer object. This object is only
163/// suitable for calls to 'LexFromRawLexer'. This lexer assumes that the text
164/// range will outlive it, so it doesn't take ownership of it.
///
/// Convenience overload: delegates to the raw-lexer constructor above,
/// lexing FromFile's entire buffer from its beginning.
165Lexer::Lexer(FileID FID, const llvm::MemoryBufferRef &FromFile,
166 const SourceManager &SM, const LangOptions &langOpts,
167 bool IsFirstIncludeOfFile)
168 : Lexer(SM.getLocForStartOfFile(FID), langOpts, FromFile.getBufferStart(),
169 FromFile.getBufferStart(), FromFile.getBufferEnd(),
170 IsFirstIncludeOfFile) {}
171
172void Lexer::resetExtendedTokenMode() {
173 assert(PP && "Cannot reset token mode without a preprocessor")(static_cast <bool> (PP && "Cannot reset token mode without a preprocessor"
) ? void (0) : __assert_fail ("PP && \"Cannot reset token mode without a preprocessor\""
, "clang/lib/Lex/Lexer.cpp", 173, __extension__ __PRETTY_FUNCTION__
))
;
174 if (LangOpts.TraditionalCPP)
175 SetKeepWhitespaceMode(true);
176 else
177 SetCommentRetentionState(PP->getCommentRetentionState());
178}
179
180/// Create_PragmaLexer: Lexer constructor - Create a new lexer object for
181/// _Pragma expansion. This has a variety of magic semantics that this method
182/// sets up. It returns a new'd Lexer that must be delete'd when done.
183///
184/// On entrance to this routine, TokStartLoc is a macro location which has a
185/// spelling loc that indicates the bytes to be lexed for the token and an
186/// expansion location that indicates where all lexed tokens should be
187/// "expanded from".
188///
189/// TODO: It would really be nice to make _Pragma just be a wrapper around a
190/// normal lexer that remaps tokens as they fly by. This would require making
191/// Preprocessor::Lex virtual. Given that, we could just dump in a magic lexer
192/// interface that could handle this stuff. This would pull GetMappedTokenLoc
193/// out of the critical path of the lexer!
194///
195Lexer *Lexer::Create_PragmaLexer(SourceLocation SpellingLoc,
196 SourceLocation ExpansionLocStart,
197 SourceLocation ExpansionLocEnd,
198 unsigned TokLen, Preprocessor &PP) {
199 SourceManager &SM = PP.getSourceManager();
200
201 // Create the lexer as if we were going to lex the file normally.
202 FileID SpellingFID = SM.getFileID(SpellingLoc);
203 llvm::MemoryBufferRef InputFile = SM.getBufferOrFake(SpellingFID);
204 Lexer *L = new Lexer(SpellingFID, InputFile, PP);
205
206 // Now that the lexer is created, change the start/end locations so that we
207 // just lex the subsection of the file that we want. This is lexing from a
208 // scratch buffer.
209 const char *StrData = SM.getCharacterData(SpellingLoc);
210
211 L->BufferPtr = StrData;
212 L->BufferEnd = StrData+TokLen;
213 assert(L->BufferEnd[0] == 0 && "Buffer is not nul terminated!")(static_cast <bool> (L->BufferEnd[0] == 0 &&
"Buffer is not nul terminated!") ? void (0) : __assert_fail (
"L->BufferEnd[0] == 0 && \"Buffer is not nul terminated!\""
, "clang/lib/Lex/Lexer.cpp", 213, __extension__ __PRETTY_FUNCTION__
))
;
214
215 // Set the SourceLocation with the remapping information. This ensures that
216 // GetMappedTokenLoc will remap the tokens as they are lexed.
217 L->FileLoc = SM.createExpansionLoc(SM.getLocForStartOfFile(SpellingFID),
218 ExpansionLocStart,
219 ExpansionLocEnd, TokLen);
220
221 // Ensure that the lexer thinks it is inside a directive, so that end \n will
222 // return an EOD token.
223 L->ParsingPreprocessorDirective = true;
224
225 // This lexer really is for _Pragma.
226 L->Is_PragmaLexer = true;
227 return L;
228}
229
230void Lexer::seek(unsigned Offset, bool IsAtStartOfLine) {
231 this->IsAtPhysicalStartOfLine = IsAtStartOfLine;
232 this->IsAtStartOfLine = IsAtStartOfLine;
233 assert((BufferStart + Offset) <= BufferEnd)(static_cast <bool> ((BufferStart + Offset) <= BufferEnd
) ? void (0) : __assert_fail ("(BufferStart + Offset) <= BufferEnd"
, "clang/lib/Lex/Lexer.cpp", 233, __extension__ __PRETTY_FUNCTION__
))
;
234 BufferPtr = BufferStart + Offset;
235}
236
/// Escape Str in place: backslashes and the Quote character get a '\' prefix,
/// and every newline (or CR/LF pair) is rewritten as the two characters
/// '\' 'n'.
template <typename T> static void StringifyImpl(T &Str, char Quote) {
  typename T::size_type Idx = 0, End = Str.size();
  while (Idx < End) {
    char C = Str[Idx];
    if (C == '\\' || C == Quote) {
      // Insert the escaping backslash and step past both characters.
      Str.insert(Str.begin() + Idx, '\\');
      Idx += 2;
      ++End;
    } else if (C == '\n' || C == '\r') {
      if ((Idx < End - 1) && (Str[Idx + 1] == '\n' || Str[Idx + 1] == '\r') &&
          Str[Idx] != Str[Idx + 1]) {
        // A mixed "\r\n" / "\n\r" pair collapses to '\' 'n' in place.
        Str[Idx] = '\\';
        Str[Idx + 1] = 'n';
      } else {
        // A lone '\n' or '\r' becomes '\' 'n', growing the string by one.
        Str[Idx] = '\\';
        Str.insert(Str.begin() + Idx + 1, 'n');
        ++End;
      }
      Idx += 2;
    } else {
      ++Idx;
    }
  }
}
261
262std::string Lexer::Stringify(StringRef Str, bool Charify) {
263 std::string Result = std::string(Str);
264 char Quote = Charify ? '\'' : '"';
265 StringifyImpl(Result, Quote);
266 return Result;
267}
268
269void Lexer::Stringify(SmallVectorImpl<char> &Str) { StringifyImpl(Str, '"'); }
270
271//===----------------------------------------------------------------------===//
272// Token Spelling
273//===----------------------------------------------------------------------===//
274
275/// Slow case of getSpelling. Extract the characters comprising the
276/// spelling of this token from the provided input buffer.
277static size_t getSpellingSlow(const Token &Tok, const char *BufPtr,
278 const LangOptions &LangOpts, char *Spelling) {
279 assert(Tok.needsCleaning() && "getSpellingSlow called on simple token")(static_cast <bool> (Tok.needsCleaning() && "getSpellingSlow called on simple token"
) ? void (0) : __assert_fail ("Tok.needsCleaning() && \"getSpellingSlow called on simple token\""
, "clang/lib/Lex/Lexer.cpp", 279, __extension__ __PRETTY_FUNCTION__
))
;
280
281 size_t Length = 0;
282 const char *BufEnd = BufPtr + Tok.getLength();
283
284 if (tok::isStringLiteral(Tok.getKind())) {
285 // Munch the encoding-prefix and opening double-quote.
286 while (BufPtr < BufEnd) {
287 unsigned Size;
288 Spelling[Length++] = Lexer::getCharAndSizeNoWarn(BufPtr, Size, LangOpts);
289 BufPtr += Size;
290
291 if (Spelling[Length - 1] == '"')
292 break;
293 }
294
295 // Raw string literals need special handling; trigraph expansion and line
296 // splicing do not occur within their d-char-sequence nor within their
297 // r-char-sequence.
298 if (Length >= 2 &&
299 Spelling[Length - 2] == 'R' && Spelling[Length - 1] == '"') {
300 // Search backwards from the end of the token to find the matching closing
301 // quote.
302 const char *RawEnd = BufEnd;
303 do --RawEnd; while (*RawEnd != '"');
304 size_t RawLength = RawEnd - BufPtr + 1;
305
306 // Everything between the quotes is included verbatim in the spelling.
307 memcpy(Spelling + Length, BufPtr, RawLength);
308 Length += RawLength;
309 BufPtr += RawLength;
310
311 // The rest of the token is lexed normally.
312 }
313 }
314
315 while (BufPtr < BufEnd) {
316 unsigned Size;
317 Spelling[Length++] = Lexer::getCharAndSizeNoWarn(BufPtr, Size, LangOpts);
318 BufPtr += Size;
319 }
320
321 assert(Length < Tok.getLength() &&(static_cast <bool> (Length < Tok.getLength() &&
"NeedsCleaning flag set on token that didn't need cleaning!"
) ? void (0) : __assert_fail ("Length < Tok.getLength() && \"NeedsCleaning flag set on token that didn't need cleaning!\""
, "clang/lib/Lex/Lexer.cpp", 322, __extension__ __PRETTY_FUNCTION__
))
322 "NeedsCleaning flag set on token that didn't need cleaning!")(static_cast <bool> (Length < Tok.getLength() &&
"NeedsCleaning flag set on token that didn't need cleaning!"
) ? void (0) : __assert_fail ("Length < Tok.getLength() && \"NeedsCleaning flag set on token that didn't need cleaning!\""
, "clang/lib/Lex/Lexer.cpp", 322, __extension__ __PRETTY_FUNCTION__
))
;
323 return Length;
324}
325
326/// getSpelling() - Return the 'spelling' of this token. The spelling of a
327/// token are the characters used to represent the token in the source file
328/// after trigraph expansion and escaped-newline folding. In particular, this
329/// wants to get the true, uncanonicalized, spelling of things like digraphs
330/// UCNs, etc.
331StringRef Lexer::getSpelling(SourceLocation loc,
332 SmallVectorImpl<char> &buffer,
333 const SourceManager &SM,
334 const LangOptions &options,
335 bool *invalid) {
336 // Break down the source location.
337 std::pair<FileID, unsigned> locInfo = SM.getDecomposedLoc(loc);
338
339 // Try to the load the file buffer.
340 bool invalidTemp = false;
341 StringRef file = SM.getBufferData(locInfo.first, &invalidTemp);
342 if (invalidTemp) {
343 if (invalid) *invalid = true;
344 return {};
345 }
346
347 const char *tokenBegin = file.data() + locInfo.second;
348
349 // Lex from the start of the given location.
350 Lexer lexer(SM.getLocForStartOfFile(locInfo.first), options,
351 file.begin(), tokenBegin, file.end());
352 Token token;
353 lexer.LexFromRawLexer(token);
354
355 unsigned length = token.getLength();
356
357 // Common case: no need for cleaning.
358 if (!token.needsCleaning())
359 return StringRef(tokenBegin, length);
360
361 // Hard case, we need to relex the characters into the string.
362 buffer.resize(length);
363 buffer.resize(getSpellingSlow(token, tokenBegin, options, buffer.data()));
364 return StringRef(buffer.data(), buffer.size());
365}
366
367/// getSpelling() - Return the 'spelling' of this token. The spelling of a
368/// token are the characters used to represent the token in the source file
369/// after trigraph expansion and escaped-newline folding. In particular, this
370/// wants to get the true, uncanonicalized, spelling of things like digraphs
371/// UCNs, etc.
372std::string Lexer::getSpelling(const Token &Tok, const SourceManager &SourceMgr,
373 const LangOptions &LangOpts, bool *Invalid) {
374 assert((int)Tok.getLength() >= 0 && "Token character range is bogus!")(static_cast <bool> ((int)Tok.getLength() >= 0 &&
"Token character range is bogus!") ? void (0) : __assert_fail
("(int)Tok.getLength() >= 0 && \"Token character range is bogus!\""
, "clang/lib/Lex/Lexer.cpp", 374, __extension__ __PRETTY_FUNCTION__
))
;
375
376 bool CharDataInvalid = false;
377 const char *TokStart = SourceMgr.getCharacterData(Tok.getLocation(),
378 &CharDataInvalid);
379 if (Invalid)
380 *Invalid = CharDataInvalid;
381 if (CharDataInvalid)
382 return {};
383
384 // If this token contains nothing interesting, return it directly.
385 if (!Tok.needsCleaning())
386 return std::string(TokStart, TokStart + Tok.getLength());
387
388 std::string Result;
389 Result.resize(Tok.getLength());
390 Result.resize(getSpellingSlow(Tok, TokStart, LangOpts, &*Result.begin()));
391 return Result;
392}
393
394/// getSpelling - This method is used to get the spelling of a token into a
395/// preallocated buffer, instead of as an std::string. The caller is required
396/// to allocate enough space for the token, which is guaranteed to be at least
397/// Tok.getLength() bytes long. The actual length of the token is returned.
398///
399/// Note that this method may do two possible things: it may either fill in
400/// the buffer specified with characters, or it may *change the input pointer*
401/// to point to a constant buffer with the data already in it (avoiding a
402/// copy). The caller is not allowed to modify the returned buffer pointer
403/// if an internal buffer is returned.
404unsigned Lexer::getSpelling(const Token &Tok, const char *&Buffer,
405 const SourceManager &SourceMgr,
406 const LangOptions &LangOpts, bool *Invalid) {
407 assert((int)Tok.getLength() >= 0 && "Token character range is bogus!")(static_cast <bool> ((int)Tok.getLength() >= 0 &&
"Token character range is bogus!") ? void (0) : __assert_fail
("(int)Tok.getLength() >= 0 && \"Token character range is bogus!\""
, "clang/lib/Lex/Lexer.cpp", 407, __extension__ __PRETTY_FUNCTION__
))
;
408
409 const char *TokStart = nullptr;
410 // NOTE: this has to be checked *before* testing for an IdentifierInfo.
411 if (Tok.is(tok::raw_identifier))
412 TokStart = Tok.getRawIdentifier().data();
413 else if (!Tok.hasUCN()) {
414 if (const IdentifierInfo *II = Tok.getIdentifierInfo()) {
415 // Just return the string from the identifier table, which is very quick.
416 Buffer = II->getNameStart();
417 return II->getLength();
418 }
419 }
420
421 // NOTE: this can be checked even after testing for an IdentifierInfo.
422 if (Tok.isLiteral())
423 TokStart = Tok.getLiteralData();
424
425 if (!TokStart) {
426 // Compute the start of the token in the input lexer buffer.
427 bool CharDataInvalid = false;
428 TokStart = SourceMgr.getCharacterData(Tok.getLocation(), &CharDataInvalid);
429 if (Invalid)
430 *Invalid = CharDataInvalid;
431 if (CharDataInvalid) {
432 Buffer = "";
433 return 0;
434 }
435 }
436
437 // If this token contains nothing interesting, return it directly.
438 if (!Tok.needsCleaning()) {
439 Buffer = TokStart;
440 return Tok.getLength();
441 }
442
443 // Otherwise, hard case, relex the characters into the string.
444 return getSpellingSlow(Tok, TokStart, LangOpts, const_cast<char*>(Buffer));
445}
446
447/// MeasureTokenLength - Relex the token at the specified location and return
448/// its length in bytes in the input file. If the token needs cleaning (e.g.
449/// includes a trigraph or an escaped newline) then this count includes bytes
450/// that are part of that.
451unsigned Lexer::MeasureTokenLength(SourceLocation Loc,
452 const SourceManager &SM,
453 const LangOptions &LangOpts) {
454 Token TheTok;
455 if (getRawToken(Loc, TheTok, SM, LangOpts))
456 return 0;
457 return TheTok.getLength();
458}
459
460/// Relex the token at the specified location.
461/// \returns true if there was a failure, false on success.
462bool Lexer::getRawToken(SourceLocation Loc, Token &Result,
463 const SourceManager &SM,
464 const LangOptions &LangOpts,
465 bool IgnoreWhiteSpace) {
466 // TODO: this could be special cased for common tokens like identifiers, ')',
467 // etc to make this faster, if it mattered. Just look at StrData[0] to handle
468 // all obviously single-char tokens. This could use
469 // Lexer::isObviouslySimpleCharacter for example to handle identifiers or
470 // something.
471
472 // If this comes from a macro expansion, we really do want the macro name, not
473 // the token this macro expanded to.
474 Loc = SM.getExpansionLoc(Loc);
475 std::pair<FileID, unsigned> LocInfo = SM.getDecomposedLoc(Loc);
476 bool Invalid = false;
477 StringRef Buffer = SM.getBufferData(LocInfo.first, &Invalid);
478 if (Invalid)
479 return true;
480
481 const char *StrData = Buffer.data()+LocInfo.second;
482
483 if (!IgnoreWhiteSpace && isWhitespace(StrData[0]))
484 return true;
485
486 // Create a lexer starting at the beginning of this token.
487 Lexer TheLexer(SM.getLocForStartOfFile(LocInfo.first), LangOpts,
488 Buffer.begin(), StrData, Buffer.end());
489 TheLexer.SetCommentRetentionState(true);
490 TheLexer.LexFromRawLexer(Result);
491 return false;
492}
493
494/// Returns the pointer that points to the beginning of line that contains
495/// the given offset, or null if the offset if invalid.
496static const char *findBeginningOfLine(StringRef Buffer, unsigned Offset) {
497 const char *BufStart = Buffer.data();
498 if (Offset >= Buffer.size())
499 return nullptr;
500
501 const char *LexStart = BufStart + Offset;
502 for (; LexStart != BufStart; --LexStart) {
503 if (isVerticalWhitespace(LexStart[0]) &&
504 !Lexer::isNewLineEscaped(BufStart, LexStart)) {
505 // LexStart should point at first character of logical line.
506 ++LexStart;
507 break;
508 }
509 }
510 return LexStart;
511}
512
513static SourceLocation getBeginningOfFileToken(SourceLocation Loc,
514 const SourceManager &SM,
515 const LangOptions &LangOpts) {
516 assert(Loc.isFileID())(static_cast <bool> (Loc.isFileID()) ? void (0) : __assert_fail
("Loc.isFileID()", "clang/lib/Lex/Lexer.cpp", 516, __extension__
__PRETTY_FUNCTION__))
;
517 std::pair<FileID, unsigned> LocInfo = SM.getDecomposedLoc(Loc);
518 if (LocInfo.first.isInvalid())
519 return Loc;
520
521 bool Invalid = false;
522 StringRef Buffer = SM.getBufferData(LocInfo.first, &Invalid);
523 if (Invalid)
524 return Loc;
525
526 // Back up from the current location until we hit the beginning of a line
527 // (or the buffer). We'll relex from that point.
528 const char *StrData = Buffer.data() + LocInfo.second;
529 const char *LexStart = findBeginningOfLine(Buffer, LocInfo.second);
530 if (!LexStart || LexStart == StrData)
531 return Loc;
532
533 // Create a lexer starting at the beginning of this token.
534 SourceLocation LexerStartLoc = Loc.getLocWithOffset(-LocInfo.second);
535 Lexer TheLexer(LexerStartLoc, LangOpts, Buffer.data(), LexStart,
536 Buffer.end());
537 TheLexer.SetCommentRetentionState(true);
538
539 // Lex tokens until we find the token that contains the source location.
540 Token TheTok;
541 do {
542 TheLexer.LexFromRawLexer(TheTok);
543
544 if (TheLexer.getBufferLocation() > StrData) {
545 // Lexing this token has taken the lexer past the source location we're
546 // looking for. If the current token encompasses our source location,
547 // return the beginning of that token.
548 if (TheLexer.getBufferLocation() - TheTok.getLength() <= StrData)
549 return TheTok.getLocation();
550
551 // We ended up skipping over the source location entirely, which means
552 // that it points into whitespace. We're done here.
553 break;
554 }
555 } while (TheTok.getKind() != tok::eof);
556
557 // We've passed our source location; just return the original source location.
558 return Loc;
559}
560
561SourceLocation Lexer::GetBeginningOfToken(SourceLocation Loc,
562 const SourceManager &SM,
563 const LangOptions &LangOpts) {
564 if (Loc.isFileID())
565 return getBeginningOfFileToken(Loc, SM, LangOpts);
566
567 if (!SM.isMacroArgExpansion(Loc))
568 return Loc;
569
570 SourceLocation FileLoc = SM.getSpellingLoc(Loc);
571 SourceLocation BeginFileLoc = getBeginningOfFileToken(FileLoc, SM, LangOpts);
572 std::pair<FileID, unsigned> FileLocInfo = SM.getDecomposedLoc(FileLoc);
573 std::pair<FileID, unsigned> BeginFileLocInfo =
574 SM.getDecomposedLoc(BeginFileLoc);
575 assert(FileLocInfo.first == BeginFileLocInfo.first &&(static_cast <bool> (FileLocInfo.first == BeginFileLocInfo
.first && FileLocInfo.second >= BeginFileLocInfo.second
) ? void (0) : __assert_fail ("FileLocInfo.first == BeginFileLocInfo.first && FileLocInfo.second >= BeginFileLocInfo.second"
, "clang/lib/Lex/Lexer.cpp", 576, __extension__ __PRETTY_FUNCTION__
))
576 FileLocInfo.second >= BeginFileLocInfo.second)(static_cast <bool> (FileLocInfo.first == BeginFileLocInfo
.first && FileLocInfo.second >= BeginFileLocInfo.second
) ? void (0) : __assert_fail ("FileLocInfo.first == BeginFileLocInfo.first && FileLocInfo.second >= BeginFileLocInfo.second"
, "clang/lib/Lex/Lexer.cpp", 576, __extension__ __PRETTY_FUNCTION__
))
;
577 return Loc.getLocWithOffset(BeginFileLocInfo.second - FileLocInfo.second);
578}
579
580namespace {
581
// Classifies a preprocessor directive encountered while computing a preamble:
// directives permitted in a preamble are skipped over; an unrecognized
// directive terminates the preamble scan.
582enum PreambleDirectiveKind {
583  PDK_Skipped,
584  PDK_Unknown
585};
586
587} // namespace
588
589PreambleBounds Lexer::ComputePreamble(StringRef Buffer,
590 const LangOptions &LangOpts,
591 unsigned MaxLines) {
592 // Create a lexer starting at the beginning of the file. Note that we use a
593 // "fake" file source location at offset 1 so that the lexer will track our
594 // position within the file.
595 const SourceLocation::UIntTy StartOffset = 1;
596 SourceLocation FileLoc = SourceLocation::getFromRawEncoding(StartOffset);
597 Lexer TheLexer(FileLoc, LangOpts, Buffer.begin(), Buffer.begin(),
598 Buffer.end());
599 TheLexer.SetCommentRetentionState(true);
600
601 bool InPreprocessorDirective = false;
602 Token TheTok;
603 SourceLocation ActiveCommentLoc;
604
605 unsigned MaxLineOffset = 0;
606 if (MaxLines) {
607 const char *CurPtr = Buffer.begin();
608 unsigned CurLine = 0;
609 while (CurPtr != Buffer.end()) {
610 char ch = *CurPtr++;
611 if (ch == '\n') {
612 ++CurLine;
613 if (CurLine == MaxLines)
614 break;
615 }
616 }
617 if (CurPtr != Buffer.end())
618 MaxLineOffset = CurPtr - Buffer.begin();
619 }
620
621 do {
622 TheLexer.LexFromRawLexer(TheTok);
623
624 if (InPreprocessorDirective) {
625 // If we've hit the end of the file, we're done.
626 if (TheTok.getKind() == tok::eof) {
627 break;
628 }
629
630 // If we haven't hit the end of the preprocessor directive, skip this
631 // token.
632 if (!TheTok.isAtStartOfLine())
633 continue;
634
635 // We've passed the end of the preprocessor directive, and will look
636 // at this token again below.
637 InPreprocessorDirective = false;
638 }
639
640 // Keep track of the # of lines in the preamble.
641 if (TheTok.isAtStartOfLine()) {
642 unsigned TokOffset = TheTok.getLocation().getRawEncoding() - StartOffset;
643
644 // If we were asked to limit the number of lines in the preamble,
645 // and we're about to exceed that limit, we're done.
646 if (MaxLineOffset && TokOffset >= MaxLineOffset)
647 break;
648 }
649
650 // Comments are okay; skip over them.
651 if (TheTok.getKind() == tok::comment) {
652 if (ActiveCommentLoc.isInvalid())
653 ActiveCommentLoc = TheTok.getLocation();
654 continue;
655 }
656
657 if (TheTok.isAtStartOfLine() && TheTok.getKind() == tok::hash) {
658 // This is the start of a preprocessor directive.
659 Token HashTok = TheTok;
660 InPreprocessorDirective = true;
661 ActiveCommentLoc = SourceLocation();
662
663 // Figure out which directive this is. Since we're lexing raw tokens,
664 // we don't have an identifier table available. Instead, just look at
665 // the raw identifier to recognize and categorize preprocessor directives.
666 TheLexer.LexFromRawLexer(TheTok);
667 if (TheTok.getKind() == tok::raw_identifier && !TheTok.needsCleaning()) {
668 StringRef Keyword = TheTok.getRawIdentifier();
669 PreambleDirectiveKind PDK
670 = llvm::StringSwitch<PreambleDirectiveKind>(Keyword)
671 .Case("include", PDK_Skipped)
672 .Case("__include_macros", PDK_Skipped)
673 .Case("define", PDK_Skipped)
674 .Case("undef", PDK_Skipped)
675 .Case("line", PDK_Skipped)
676 .Case("error", PDK_Skipped)
677 .Case("pragma", PDK_Skipped)
678 .Case("import", PDK_Skipped)
679 .Case("include_next", PDK_Skipped)
680 .Case("warning", PDK_Skipped)
681 .Case("ident", PDK_Skipped)
682 .Case("sccs", PDK_Skipped)
683 .Case("assert", PDK_Skipped)
684 .Case("unassert", PDK_Skipped)
685 .Case("if", PDK_Skipped)
686 .Case("ifdef", PDK_Skipped)
687 .Case("ifndef", PDK_Skipped)
688 .Case("elif", PDK_Skipped)
689 .Case("elifdef", PDK_Skipped)
690 .Case("elifndef", PDK_Skipped)
691 .Case("else", PDK_Skipped)
692 .Case("endif", PDK_Skipped)
693 .Default(PDK_Unknown);
694
695 switch (PDK) {
696 case PDK_Skipped:
697 continue;
698
699 case PDK_Unknown:
700 // We don't know what this directive is; stop at the '#'.
701 break;
702 }
703 }
704
705 // We only end up here if we didn't recognize the preprocessor
706 // directive or it was one that can't occur in the preamble at this
707 // point. Roll back the current token to the location of the '#'.
708 TheTok = HashTok;
709 }
710
711 // We hit a token that we don't recognize as being in the
712 // "preprocessing only" part of the file, so we're no longer in
713 // the preamble.
714 break;
715 } while (true);
716
717 SourceLocation End;
718 if (ActiveCommentLoc.isValid())
719 End = ActiveCommentLoc; // don't truncate a decl comment.
720 else
721 End = TheTok.getLocation();
722
723 return PreambleBounds(End.getRawEncoding() - FileLoc.getRawEncoding(),
724 TheTok.isAtStartOfLine());
725}
726
unsigned Lexer::getTokenPrefixLength(SourceLocation TokStart, unsigned CharNo,
                                     const SourceManager &SM,
                                     const LangOptions &LangOpts) {
  // Figure out how many physical characters away the specified expansion
  // character is. This needs to take into consideration newlines and
  // trigraphs: one "logical" token character may occupy several physical
  // bytes in the buffer.
  bool Invalid = false;
  const char *TokPtr = SM.getCharacterData(TokStart, &Invalid);

  // If they request the first char of the token, we're trivially done.
  if (Invalid || (CharNo == 0 && Lexer::isObviouslySimpleCharacter(*TokPtr)))
    return 0;

  unsigned PhysOffset = 0;

  // The usual case is that tokens don't contain anything interesting. Skip
  // over the uninteresting characters. If a token only consists of simple
  // chars, this method is extremely fast.
  while (Lexer::isObviouslySimpleCharacter(*TokPtr)) {
    if (CharNo == 0)
      return PhysOffset;
    ++TokPtr;
    --CharNo;
    ++PhysOffset;
  }

  // If we have a character that may be a trigraph or escaped newline, use a
  // lexer to parse it correctly; Size accounts for the full physical width
  // of each logical character.
  for (; CharNo; --CharNo) {
    unsigned Size;
    Lexer::getCharAndSizeNoWarn(TokPtr, Size, LangOpts);
    TokPtr += Size;
    PhysOffset += Size;
  }

  // Final detail: if we end up on an escaped newline, we want to return the
  // location of the actual byte of the token. For example foo\<newline>bar
  // advanced by 3 should return the location of b, not of \\. One compounding
  // detail of this is that the escape may be made by a trigraph.
  if (!Lexer::isObviouslySimpleCharacter(*TokPtr))
    PhysOffset += Lexer::SkipEscapedNewLines(TokPtr)-TokPtr;

  return PhysOffset;
}
771
772/// Computes the source location just past the end of the
773/// token at this source location.
774///
775/// This routine can be used to produce a source location that
776/// points just past the end of the token referenced by \p Loc, and
777/// is generally used when a diagnostic needs to point just after a
778/// token where it expected something different that it received. If
779/// the returned source location would not be meaningful (e.g., if
780/// it points into a macro), this routine returns an invalid
781/// source location.
782///
783/// \param Offset an offset from the end of the token, where the source
784/// location should refer to. The default offset (0) produces a source
785/// location pointing just past the end of the token; an offset of 1 produces
786/// a source location pointing to the last character in the token, etc.
787SourceLocation Lexer::getLocForEndOfToken(SourceLocation Loc, unsigned Offset,
788 const SourceManager &SM,
789 const LangOptions &LangOpts) {
790 if (Loc.isInvalid())
791 return {};
792
793 if (Loc.isMacroID()) {
794 if (Offset > 0 || !isAtEndOfMacroExpansion(Loc, SM, LangOpts, &Loc))
795 return {}; // Points inside the macro expansion.
796 }
797
798 unsigned Len = Lexer::MeasureTokenLength(Loc, SM, LangOpts);
799 if (Len > Offset)
800 Len = Len - Offset;
801 else
802 return Loc;
803
804 return Loc.getLocWithOffset(Len);
805}
806
807/// Returns true if the given MacroID location points at the first
808/// token of the macro expansion.
809bool Lexer::isAtStartOfMacroExpansion(SourceLocation loc,
810 const SourceManager &SM,
811 const LangOptions &LangOpts,
812 SourceLocation *MacroBegin) {
813 assert(loc.isValid() && loc.isMacroID() && "Expected a valid macro loc")(static_cast <bool> (loc.isValid() && loc.isMacroID
() && "Expected a valid macro loc") ? void (0) : __assert_fail
("loc.isValid() && loc.isMacroID() && \"Expected a valid macro loc\""
, "clang/lib/Lex/Lexer.cpp", 813, __extension__ __PRETTY_FUNCTION__
))
;
814
815 SourceLocation expansionLoc;
816 if (!SM.isAtStartOfImmediateMacroExpansion(loc, &expansionLoc))
817 return false;
818
819 if (expansionLoc.isFileID()) {
820 // No other macro expansions, this is the first.
821 if (MacroBegin)
822 *MacroBegin = expansionLoc;
823 return true;
824 }
825
826 return isAtStartOfMacroExpansion(expansionLoc, SM, LangOpts, MacroBegin);
827}
828
829/// Returns true if the given MacroID location points at the last
830/// token of the macro expansion.
831bool Lexer::isAtEndOfMacroExpansion(SourceLocation loc,
832 const SourceManager &SM,
833 const LangOptions &LangOpts,
834 SourceLocation *MacroEnd) {
835 assert(loc.isValid() && loc.isMacroID() && "Expected a valid macro loc")(static_cast <bool> (loc.isValid() && loc.isMacroID
() && "Expected a valid macro loc") ? void (0) : __assert_fail
("loc.isValid() && loc.isMacroID() && \"Expected a valid macro loc\""
, "clang/lib/Lex/Lexer.cpp", 835, __extension__ __PRETTY_FUNCTION__
))
;
836
837 SourceLocation spellLoc = SM.getSpellingLoc(loc);
838 unsigned tokLen = MeasureTokenLength(spellLoc, SM, LangOpts);
839 if (tokLen == 0)
840 return false;
841
842 SourceLocation afterLoc = loc.getLocWithOffset(tokLen);
843 SourceLocation expansionLoc;
844 if (!SM.isAtEndOfImmediateMacroExpansion(afterLoc, &expansionLoc))
845 return false;
846
847 if (expansionLoc.isFileID()) {
848 // No other macro expansions.
849 if (MacroEnd)
850 *MacroEnd = expansionLoc;
851 return true;
852 }
853
854 return isAtEndOfMacroExpansion(expansionLoc, SM, LangOpts, MacroEnd);
855}
856
857static CharSourceRange makeRangeFromFileLocs(CharSourceRange Range,
858 const SourceManager &SM,
859 const LangOptions &LangOpts) {
860 SourceLocation Begin = Range.getBegin();
861 SourceLocation End = Range.getEnd();
862 assert(Begin.isFileID() && End.isFileID())(static_cast <bool> (Begin.isFileID() && End.isFileID
()) ? void (0) : __assert_fail ("Begin.isFileID() && End.isFileID()"
, "clang/lib/Lex/Lexer.cpp", 862, __extension__ __PRETTY_FUNCTION__
))
;
863 if (Range.isTokenRange()) {
864 End = Lexer::getLocForEndOfToken(End, 0, SM,LangOpts);
865 if (End.isInvalid())
866 return {};
867 }
868
869 // Break down the source locations.
870 FileID FID;
871 unsigned BeginOffs;
872 std::tie(FID, BeginOffs) = SM.getDecomposedLoc(Begin);
873 if (FID.isInvalid())
874 return {};
875
876 unsigned EndOffs;
877 if (!SM.isInFileID(End, FID, &EndOffs) ||
878 BeginOffs > EndOffs)
879 return {};
880
881 return CharSourceRange::getCharRange(Begin, End);
882}
883
884// Assumes that `Loc` is in an expansion.
885static bool isInExpansionTokenRange(const SourceLocation Loc,
886 const SourceManager &SM) {
887 return SM.getSLocEntry(SM.getFileID(Loc))
888 .getExpansion()
889 .isExpansionTokenRange();
890}
891
CharSourceRange Lexer::makeFileCharRange(CharSourceRange Range,
                                         const SourceManager &SM,
                                         const LangOptions &LangOpts) {
  // Reduce an arbitrary (possibly macro-located) range to a plain file
  // character range, or return an invalid range if that is not possible.
  SourceLocation Begin = Range.getBegin();
  SourceLocation End = Range.getEnd();
  if (Begin.isInvalid() || End.isInvalid())
    return {};

  // Case 1: both endpoints already in a file.
  if (Begin.isFileID() && End.isFileID())
    return makeRangeFromFileLocs(Range, SM, LangOpts);

  // Case 2: only Begin is in a macro; it must be the first token of its
  // expansion so we can substitute the expansion start.
  if (Begin.isMacroID() && End.isFileID()) {
    if (!isAtStartOfMacroExpansion(Begin, SM, LangOpts, &Begin))
      return {};
    Range.setBegin(Begin);
    return makeRangeFromFileLocs(Range, SM, LangOpts);
  }

  // Case 3: only End is in a macro. A token range needs the expansion's
  // last token; a char range needs its first.
  if (Begin.isFileID() && End.isMacroID()) {
    if (Range.isTokenRange()) {
      if (!isAtEndOfMacroExpansion(End, SM, LangOpts, &End))
        return {};
      // Use the *original* end, not the expanded one in `End`.
      Range.setTokenRange(isInExpansionTokenRange(Range.getEnd(), SM));
    } else if (!isAtStartOfMacroExpansion(End, SM, LangOpts, &End))
      return {};
    Range.setEnd(End);
    return makeRangeFromFileLocs(Range, SM, LangOpts);
  }

  // Case 4: both endpoints are in macros. Accept only if they bracket a
  // whole expansion (start at Begin, end/start at End depending on range
  // kind).
  assert(Begin.isMacroID() && End.isMacroID());
  SourceLocation MacroBegin, MacroEnd;
  if (isAtStartOfMacroExpansion(Begin, SM, LangOpts, &MacroBegin) &&
      ((Range.isTokenRange() && isAtEndOfMacroExpansion(End, SM, LangOpts,
                                                        &MacroEnd)) ||
       (Range.isCharRange() && isAtStartOfMacroExpansion(End, SM, LangOpts,
                                                         &MacroEnd)))) {
    Range.setBegin(MacroBegin);
    Range.setEnd(MacroEnd);
    // Use the *original* `End`, not the expanded one in `MacroEnd`.
    if (Range.isTokenRange())
      Range.setTokenRange(isInExpansionTokenRange(End, SM));
    return makeRangeFromFileLocs(Range, SM, LangOpts);
  }

  // Last resort: if both endpoints are expansions of the *same* macro
  // argument, retry one spelling level up (inside the argument text).
  bool Invalid = false;
  const SrcMgr::SLocEntry &BeginEntry = SM.getSLocEntry(SM.getFileID(Begin),
                                                        &Invalid);
  if (Invalid)
    return {};

  if (BeginEntry.getExpansion().isMacroArgExpansion()) {
    const SrcMgr::SLocEntry &EndEntry = SM.getSLocEntry(SM.getFileID(End),
                                                        &Invalid);
    if (Invalid)
      return {};

    if (EndEntry.getExpansion().isMacroArgExpansion() &&
        BeginEntry.getExpansion().getExpansionLocStart() ==
            EndEntry.getExpansion().getExpansionLocStart()) {
      Range.setBegin(SM.getImmediateSpellingLoc(Begin));
      Range.setEnd(SM.getImmediateSpellingLoc(End));
      return makeFileCharRange(Range, SM, LangOpts);
    }
  }

  return {};
}
960
961StringRef Lexer::getSourceText(CharSourceRange Range,
962 const SourceManager &SM,
963 const LangOptions &LangOpts,
964 bool *Invalid) {
965 Range = makeFileCharRange(Range, SM, LangOpts);
966 if (Range.isInvalid()) {
967 if (Invalid) *Invalid = true;
968 return {};
969 }
970
971 // Break down the source location.
972 std::pair<FileID, unsigned> beginInfo = SM.getDecomposedLoc(Range.getBegin());
973 if (beginInfo.first.isInvalid()) {
974 if (Invalid) *Invalid = true;
975 return {};
976 }
977
978 unsigned EndOffs;
979 if (!SM.isInFileID(Range.getEnd(), beginInfo.first, &EndOffs) ||
980 beginInfo.second > EndOffs) {
981 if (Invalid) *Invalid = true;
982 return {};
983 }
984
985 // Try to the load the file buffer.
986 bool invalidTemp = false;
987 StringRef file = SM.getBufferData(beginInfo.first, &invalidTemp);
988 if (invalidTemp) {
989 if (Invalid) *Invalid = true;
990 return {};
991 }
992
993 if (Invalid) *Invalid = false;
994 return file.substr(beginInfo.second, EndOffs - beginInfo.second);
995}
996
StringRef Lexer::getImmediateMacroName(SourceLocation Loc,
                                       const SourceManager &SM,
                                       const LangOptions &LangOpts) {
  // Returns the spelled name of the macro whose expansion immediately
  // produced Loc, sliced directly out of the source buffer.
  assert(Loc.isMacroID() && "Only reasonable to call this on macros");

  // Find the location of the immediate macro expansion.
  while (true) {
    FileID FID = SM.getFileID(Loc);
    const SrcMgr::SLocEntry *E = &SM.getSLocEntry(FID);
    const SrcMgr::ExpansionInfo &Expansion = E->getExpansion();
    Loc = Expansion.getExpansionLocStart();
    if (!Expansion.isMacroArgExpansion())
      break;

    // For macro arguments we need to check that the argument did not come
    // from an inner macro, e.g: "MAC1( MAC2(foo) )"

    // Loc points to the argument id of the macro definition, move to the
    // macro expansion.
    Loc = SM.getImmediateExpansionRange(Loc).getBegin();
    SourceLocation SpellLoc = Expansion.getSpellingLoc();
    if (SpellLoc.isFileID())
      break; // No inner macro.

    // If spelling location resides in the same FileID as macro expansion
    // location, it means there is no inner macro.
    FileID MacroFID = SM.getFileID(Loc);
    if (SM.isInFileID(SpellLoc, MacroFID))
      break;

    // Argument came from inner macro; keep walking inward.
    Loc = SpellLoc;
  }

  // Find the spelling location of the start of the non-argument expansion
  // range. This is where the macro name was spelled in order to begin
  // expanding this macro.
  Loc = SM.getSpellingLoc(Loc);

  // Dig out the buffer where the macro name was spelled and the extents of the
  // name so that we can render it into the expansion note.
  std::pair<FileID, unsigned> ExpansionInfo = SM.getDecomposedLoc(Loc);
  unsigned MacroTokenLength = Lexer::MeasureTokenLength(Loc, SM, LangOpts);
  StringRef ExpansionBuffer = SM.getBufferData(ExpansionInfo.first);
  return ExpansionBuffer.substr(ExpansionInfo.second, MacroTokenLength);
}
1043
StringRef Lexer::getImmediateMacroNameForDiagnostics(
    SourceLocation Loc, const SourceManager &SM, const LangOptions &LangOpts) {
  // Like getImmediateMacroName, but returns an empty StringRef for
  // synthesized locations (token pastes, stringization) so diagnostics can
  // skip them.
  assert(Loc.isMacroID() && "Only reasonable to call this on macros");
  // Walk past macro argument expansions.
  while (SM.isMacroArgExpansion(Loc))
    Loc = SM.getImmediateExpansionRange(Loc).getBegin();

  // If the macro's spelling has no FileID, then it's actually a token paste
  // or stringization (or similar) and not a macro at all.
  if (!SM.getFileEntryForID(SM.getFileID(SM.getSpellingLoc(Loc))))
    return {};

  // Find the spelling location of the start of the non-argument expansion
  // range. This is where the macro name was spelled in order to begin
  // expanding this macro.
  Loc = SM.getSpellingLoc(SM.getImmediateExpansionRange(Loc).getBegin());

  // Dig out the buffer where the macro name was spelled and the extents of the
  // name so that we can render it into the expansion note.
  std::pair<FileID, unsigned> ExpansionInfo = SM.getDecomposedLoc(Loc);
  unsigned MacroTokenLength = Lexer::MeasureTokenLength(Loc, SM, LangOpts);
  StringRef ExpansionBuffer = SM.getBufferData(ExpansionInfo.first);
  return ExpansionBuffer.substr(ExpansionInfo.second, MacroTokenLength);
}
1068
1069bool Lexer::isAsciiIdentifierContinueChar(char c, const LangOptions &LangOpts) {
1070 return isAsciiIdentifierContinue(c, LangOpts.DollarIdents);
1071}
1072
1073bool Lexer::isNewLineEscaped(const char *BufferStart, const char *Str) {
1074 assert(isVerticalWhitespace(Str[0]))(static_cast <bool> (isVerticalWhitespace(Str[0])) ? void
(0) : __assert_fail ("isVerticalWhitespace(Str[0])", "clang/lib/Lex/Lexer.cpp"
, 1074, __extension__ __PRETTY_FUNCTION__))
;
1075 if (Str - 1 < BufferStart)
1076 return false;
1077
1078 if ((Str[0] == '\n' && Str[-1] == '\r') ||
1079 (Str[0] == '\r' && Str[-1] == '\n')) {
1080 if (Str - 2 < BufferStart)
1081 return false;
1082 --Str;
1083 }
1084 --Str;
1085
1086 // Rewind to first non-space character:
1087 while (Str > BufferStart && isHorizontalWhitespace(*Str))
1088 --Str;
1089
1090 return *Str == '\\';
1091}
1092
1093StringRef Lexer::getIndentationForLine(SourceLocation Loc,
1094 const SourceManager &SM) {
1095 if (Loc.isInvalid() || Loc.isMacroID())
1096 return {};
1097 std::pair<FileID, unsigned> LocInfo = SM.getDecomposedLoc(Loc);
1098 if (LocInfo.first.isInvalid())
1099 return {};
1100 bool Invalid = false;
1101 StringRef Buffer = SM.getBufferData(LocInfo.first, &Invalid);
1102 if (Invalid)
1103 return {};
1104 const char *Line = findBeginningOfLine(Buffer, LocInfo.second);
1105 if (!Line)
1106 return {};
1107 StringRef Rest = Buffer.substr(Line - Buffer.data());
1108 size_t NumWhitespaceChars = Rest.find_first_not_of(" \t");
1109 return NumWhitespaceChars == StringRef::npos
1110 ? ""
1111 : Rest.take_front(NumWhitespaceChars);
1112}
1113
1114//===----------------------------------------------------------------------===//
1115// Diagnostics forwarding code.
1116//===----------------------------------------------------------------------===//
1117
1118/// GetMappedTokenLoc - If lexing out of a 'mapped buffer', where we pretend the
1119/// lexer buffer was all expanded at a single point, perform the mapping.
1120/// This is currently only used for _Pragma implementation, so it is the slow
1121/// path of the hot getSourceLocation method. Do not allow it to be inlined.
1122static LLVM_ATTRIBUTE_NOINLINE__attribute__((noinline)) SourceLocation GetMappedTokenLoc(
1123 Preprocessor &PP, SourceLocation FileLoc, unsigned CharNo, unsigned TokLen);
1124static SourceLocation GetMappedTokenLoc(Preprocessor &PP,
1125 SourceLocation FileLoc,
1126 unsigned CharNo, unsigned TokLen) {
1127 assert(FileLoc.isMacroID() && "Must be a macro expansion")(static_cast <bool> (FileLoc.isMacroID() && "Must be a macro expansion"
) ? void (0) : __assert_fail ("FileLoc.isMacroID() && \"Must be a macro expansion\""
, "clang/lib/Lex/Lexer.cpp", 1127, __extension__ __PRETTY_FUNCTION__
))
;
1128
1129 // Otherwise, we're lexing "mapped tokens". This is used for things like
1130 // _Pragma handling. Combine the expansion location of FileLoc with the
1131 // spelling location.
1132 SourceManager &SM = PP.getSourceManager();
1133
1134 // Create a new SLoc which is expanded from Expansion(FileLoc) but whose
1135 // characters come from spelling(FileLoc)+Offset.
1136 SourceLocation SpellingLoc = SM.getSpellingLoc(FileLoc);
1137 SpellingLoc = SpellingLoc.getLocWithOffset(CharNo);
1138
1139 // Figure out the expansion loc range, which is the range covered by the
1140 // original _Pragma(...) sequence.
1141 CharSourceRange II = SM.getImmediateExpansionRange(FileLoc);
1142
1143 return SM.createExpansionLoc(SpellingLoc, II.getBegin(), II.getEnd(), TokLen);
1144}
1145
1146/// getSourceLocation - Return a source location identifier for the specified
1147/// offset in the current file.
1148SourceLocation Lexer::getSourceLocation(const char *Loc,
1149 unsigned TokLen) const {
1150 assert(Loc >= BufferStart && Loc <= BufferEnd &&(static_cast <bool> (Loc >= BufferStart && Loc
<= BufferEnd && "Location out of range for this buffer!"
) ? void (0) : __assert_fail ("Loc >= BufferStart && Loc <= BufferEnd && \"Location out of range for this buffer!\""
, "clang/lib/Lex/Lexer.cpp", 1151, __extension__ __PRETTY_FUNCTION__
))
1151 "Location out of range for this buffer!")(static_cast <bool> (Loc >= BufferStart && Loc
<= BufferEnd && "Location out of range for this buffer!"
) ? void (0) : __assert_fail ("Loc >= BufferStart && Loc <= BufferEnd && \"Location out of range for this buffer!\""
, "clang/lib/Lex/Lexer.cpp", 1151, __extension__ __PRETTY_FUNCTION__
))
;
1152
1153 // In the normal case, we're just lexing from a simple file buffer, return
1154 // the file id from FileLoc with the offset specified.
1155 unsigned CharNo = Loc-BufferStart;
1156 if (FileLoc.isFileID())
1157 return FileLoc.getLocWithOffset(CharNo);
1158
1159 // Otherwise, this is the _Pragma lexer case, which pretends that all of the
1160 // tokens are lexed from where the _Pragma was defined.
1161 assert(PP && "This doesn't work on raw lexers")(static_cast <bool> (PP && "This doesn't work on raw lexers"
) ? void (0) : __assert_fail ("PP && \"This doesn't work on raw lexers\""
, "clang/lib/Lex/Lexer.cpp", 1161, __extension__ __PRETTY_FUNCTION__
))
;
1162 return GetMappedTokenLoc(*PP, FileLoc, CharNo, TokLen);
1163}
1164
1165/// Diag - Forwarding function for diagnostics. This translate a source
1166/// position in the current buffer into a SourceLocation object for rendering.
1167DiagnosticBuilder Lexer::Diag(const char *Loc, unsigned DiagID) const {
1168 return PP->Diag(getSourceLocation(Loc), DiagID);
30
Called C++ object pointer is null
1169}
1170
1171//===----------------------------------------------------------------------===//
1172// Trigraph and Escaped Newline Handling Code.
1173//===----------------------------------------------------------------------===//
1174
1175/// GetTrigraphCharForLetter - Given a character that occurs after a ?? pair,
1176/// return the decoded trigraph letter it corresponds to, or '\0' if nothing.
/// GetTrigraphCharForLetter - Given a character that occurs after a ?? pair,
/// return the decoded trigraph letter it corresponds to, or '\0' if nothing.
static char GetTrigraphCharForLetter(char Letter) {
  switch (Letter) {
  case '=':  return '#';
  case ')':  return ']';
  case '(':  return '[';
  case '!':  return '|';
  case '\'': return '^';
  case '>':  return '}';
  case '/':  return '\\';
  case '<':  return '{';
  case '-':  return '~';
  default:   return 0; // Not a valid trigraph letter.
  }
}
1191
1192/// DecodeTrigraphChar - If the specified character is a legal trigraph when
1193/// prefixed with ??, emit a trigraph warning. If trigraphs are enabled,
1194/// return the result character. Finally, emit a warning about trigraph use
1195/// whether trigraphs are enabled or not.
1196static char DecodeTrigraphChar(const char *CP, Lexer *L, bool Trigraphs) {
1197 char Res = GetTrigraphCharForLetter(*CP);
1198 if (!Res || !L) return Res;
1199
1200 if (!Trigraphs) {
1201 if (!L->isLexingRawMode())
1202 L->Diag(CP-2, diag::trigraph_ignored);
1203 return 0;
1204 }
1205
1206 if (!L->isLexingRawMode())
1207 L->Diag(CP-2, diag::trigraph_converted) << StringRef(&Res, 1);
1208 return Res;
1209}
1210
1211/// getEscapedNewLineSize - Return the size of the specified escaped newline,
1212/// or 0 if it is not an escaped newline. P[-1] is known to be a "\" or a
1213/// trigraph equivalent on entry to this function.
unsigned Lexer::getEscapedNewLineSize(const char *Ptr) {
  // Scan forward over whitespace looking for the newline that terminates
  // the escape; Size counts how many bytes the escape sequence consumes
  // after the backslash (or trigraph equivalent).
  unsigned Size = 0;
  while (isWhitespace(Ptr[Size])) {
    ++Size;

    // Keep scanning until the whitespace char just consumed is a newline.
    if (Ptr[Size-1] != '\n' && Ptr[Size-1] != '\r')
      continue;

    // If this is a \r\n or \n\r, skip the other half.
    if ((Ptr[Size] == '\r' || Ptr[Size] == '\n') &&
        Ptr[Size-1] != Ptr[Size])
      ++Size;

    return Size;
  }

  // Not an escaped newline, must be a \t or something else.
  return 0;
}
1233
1234/// SkipEscapedNewLines - If P points to an escaped newline (or a series of
1235/// them), skip over them and return the first non-escaped-newline found,
1236/// otherwise return P.
1237const char *Lexer::SkipEscapedNewLines(const char *P) {
1238 while (true) {
1239 const char *AfterEscape;
1240 if (*P == '\\') {
1241 AfterEscape = P+1;
1242 } else if (*P == '?') {
1243 // If not a trigraph for escape, bail out.
1244 if (P[1] != '?' || P[2] != '/')
1245 return P;
1246 // FIXME: Take LangOpts into account; the language might not
1247 // support trigraphs.
1248 AfterEscape = P+3;
1249 } else {
1250 return P;
1251 }
1252
1253 unsigned NewLineSize = Lexer::getEscapedNewLineSize(AfterEscape);
1254 if (NewLineSize == 0) return P;
1255 P = AfterEscape+NewLineSize;
1256 }
1257}
1258
1259Optional<Token> Lexer::findNextToken(SourceLocation Loc,
1260 const SourceManager &SM,
1261 const LangOptions &LangOpts) {
1262 if (Loc.isMacroID()) {
1263 if (!Lexer::isAtEndOfMacroExpansion(Loc, SM, LangOpts, &Loc))
1264 return None;
1265 }
1266 Loc = Lexer::getLocForEndOfToken(Loc, 0, SM, LangOpts);
1267
1268 // Break down the source location.
1269 std::pair<FileID, unsigned> LocInfo = SM.getDecomposedLoc(Loc);
1270
1271 // Try to load the file buffer.
1272 bool InvalidTemp = false;
1273 StringRef File = SM.getBufferData(LocInfo.first, &InvalidTemp);
1274 if (InvalidTemp)
1275 return None;
1276
1277 const char *TokenBegin = File.data() + LocInfo.second;
1278
1279 // Lex from the start of the given location.
1280 Lexer lexer(SM.getLocForStartOfFile(LocInfo.first), LangOpts, File.begin(),
1281 TokenBegin, File.end());
1282 // Find the token.
1283 Token Tok;
1284 lexer.LexFromRawLexer(Tok);
1285 return Tok;
1286}
1287
1288/// Checks that the given token is the first token that occurs after the
1289/// given location (this excludes comments and whitespace). Returns the location
1290/// immediately after the specified token. If the token is not found or the
1291/// location is inside a macro, the returned source location will be invalid.
SourceLocation Lexer::findLocationAfterToken(
    SourceLocation Loc, tok::TokenKind TKind, const SourceManager &SM,
    const LangOptions &LangOpts, bool SkipTrailingWhitespaceAndNewLine) {
  // The next token must exist and be of the expected kind.
  Optional<Token> Tok = findNextToken(Loc, SM, LangOpts);
  if (!Tok || Tok->isNot(TKind))
    return {};
  SourceLocation TokenLoc = Tok->getLocation();

  // Calculate how much whitespace needs to be skipped if any.
  unsigned NumWhitespaceChars = 0;
  if (SkipTrailingWhitespaceAndNewLine) {
    // Walk the raw buffer from just past the token over horizontal
    // whitespace.
    const char *TokenEnd = SM.getCharacterData(TokenLoc) + Tok->getLength();
    unsigned char C = *TokenEnd;
    while (isHorizontalWhitespace(C)) {
      C = *(++TokenEnd);
      NumWhitespaceChars++;
    }

    // Skip \r, \n, \r\n, or \n\r — at most one logical newline.
    if (C == '\n' || C == '\r') {
      char PrevC = C;
      C = *(++TokenEnd);
      NumWhitespaceChars++;
      if ((C == '\n' || C == '\r') && C != PrevC)
        NumWhitespaceChars++;
    }
  }

  // Result points just past the token plus any skipped whitespace.
  return TokenLoc.getLocWithOffset(Tok->getLength() + NumWhitespaceChars);
}
1322
1323/// getCharAndSizeSlow - Peek a single 'character' from the specified buffer,
1324/// get its size, and return it. This is tricky in several cases:
1325/// 1. If currently at the start of a trigraph, we warn about the trigraph,
1326/// then either return the trigraph (skipping 3 chars) or the '?',
1327/// depending on whether trigraphs are enabled or not.
1328/// 2. If this is an escaped newline (potentially with whitespace between
1329/// the backslash and newline), implicitly skip the newline and return
1330/// the char after it.
1331///
1332/// This handles the slow/uncommon case of the getCharAndSize method. Here we
1333/// know that we can accumulate into Size, and that we have already incremented
1334/// Ptr by Size bytes.
1335///
1336/// NOTE: When this method is updated, getCharAndSizeSlowNoWarn (below) should
1337/// be updated to match.
char Lexer::getCharAndSizeSlow(const char *Ptr, unsigned &Size,
                               Token *Tok) {
  // If we have a slash, look for an escaped newline.
  if (Ptr[0] == '\\') {
    ++Size;
    ++Ptr;
Slash:
    // Common case, backslash-char where the char is not whitespace.
    if (!isWhitespace(Ptr[0])) return '\\';

    // See if we have optional whitespace characters between the slash and
    // newline.
    if (unsigned EscapedNewLineSize = getEscapedNewLineSize(Ptr)) {
      // Remember that this token needs to be cleaned.
      if (Tok) Tok->setFlag(Token::NeedsCleaning);

      // Warn if there was whitespace between the backslash and newline.
      if (Ptr[0] != '\n' && Ptr[0] != '\r' && Tok && !isLexingRawMode())
        Diag(Ptr, diag::backslash_newline_space);

      // Found backslash<whitespace><newline>. Parse the char after it.
      Size += EscapedNewLineSize;
      Ptr += EscapedNewLineSize;

      // Use slow version to accumulate a correct size field; the next char
      // may itself be a trigraph or another escaped newline.
      return getCharAndSizeSlow(Ptr, Size, Tok);
    }

    // Otherwise, this is not an escaped newline, just return the slash.
    return '\\';
  }

  // If this is a trigraph, process it.
  if (Ptr[0] == '?' && Ptr[1] == '?') {
    // If this is actually a legal trigraph (not something like "??x"), emit
    // a trigraph warning. If so, and if trigraphs are enabled, return it.
    if (char C = DecodeTrigraphChar(Ptr + 2, Tok ? this : nullptr,
                                    LangOpts.Trigraphs)) {
      // Remember that this token needs to be cleaned.
      if (Tok) Tok->setFlag(Token::NeedsCleaning);

      Ptr += 3;
      Size += 3;
      // A trigraph-decoded backslash may still start an escaped newline.
      if (C == '\\') goto Slash;
      return C;
    }
  }

  // If this is neither, return a single character.
  ++Size;
  return *Ptr;
}
1390
1391/// getCharAndSizeSlowNoWarn - Handle the slow/uncommon case of the
1392/// getCharAndSizeNoWarn method. Here we know that we can accumulate into Size,
1393/// and that we have already incremented Ptr by Size bytes.
1394///
1395/// NOTE: When this method is updated, getCharAndSizeSlow (above) should
1396/// be updated to match.
char Lexer::getCharAndSizeSlowNoWarn(const char *Ptr, unsigned &Size,
                                     const LangOptions &LangOpts) {
  // If we have a slash, look for an escaped newline.
  if (Ptr[0] == '\\') {
    ++Size;
    ++Ptr;
Slash:
    // Common case, backslash-char where the char is not whitespace.
    if (!isWhitespace(Ptr[0])) return '\\';

    // See if we have optional whitespace characters followed by a newline.
    if (unsigned EscapedNewLineSize = getEscapedNewLineSize(Ptr)) {
      // Found backslash<whitespace><newline>. Parse the char after it.
      Size += EscapedNewLineSize;
      Ptr += EscapedNewLineSize;

      // Use slow version to accumulate a correct size field; the next char
      // may itself be a trigraph or another escaped newline.
      return getCharAndSizeSlowNoWarn(Ptr, Size, LangOpts);
    }

    // Otherwise, this is not an escaped newline, just return the slash.
    return '\\';
  }

  // If this is a trigraph, process it.
  if (LangOpts.Trigraphs && Ptr[0] == '?' && Ptr[1] == '?') {
    // If this is actually a legal trigraph (not something like "??x"), return
    // it.
    if (char C = GetTrigraphCharForLetter(Ptr[2])) {
      Ptr += 3;
      Size += 3;
      // A trigraph-decoded backslash may still start an escaped newline.
      if (C == '\\') goto Slash;
      return C;
    }
  }

  // If this is neither, return a single character.
  ++Size;
  return *Ptr;
}
1437
1438//===----------------------------------------------------------------------===//
1439// Helper methods for lexing.
1440//===----------------------------------------------------------------------===//
1441
1442/// Routine that indiscriminately sets the offset into the source file.
1443void Lexer::SetByteOffset(unsigned Offset, bool StartOfLine) {
1444 BufferPtr = BufferStart + Offset;
1445 if (BufferPtr > BufferEnd)
1446 BufferPtr = BufferEnd;
1447 // FIXME: What exactly does the StartOfLine bit mean? There are two
1448 // possible meanings for the "start" of the line: the first token on the
1449 // unexpanded line, or the first token on the expanded line.
1450 IsAtStartOfLine = StartOfLine;
1451 IsAtPhysicalStartOfLine = StartOfLine;
1452}
1453
1454static bool isUnicodeWhitespace(uint32_t Codepoint) {
1455 static const llvm::sys::UnicodeCharSet UnicodeWhitespaceChars(
1456 UnicodeWhitespaceCharRanges);
1457 return UnicodeWhitespaceChars.contains(Codepoint);
1458}
1459
1460static bool isAllowedIDChar(uint32_t C, const LangOptions &LangOpts) {
1461 if (LangOpts.AsmPreprocessor) {
1462 return false;
1463 } else if (LangOpts.DollarIdents && '$' == C) {
1464 return true;
1465 } else if (LangOpts.CPlusPlus || LangOpts.C2x) {
1466 // A non-leading codepoint must have the XID_Continue property.
1467 // XIDContinueRanges doesn't contains characters also in XIDStartRanges,
1468 // so we need to check both tables.
1469 // '_' doesn't have the XID_Continue property but is allowed in C and C++.
1470 static const llvm::sys::UnicodeCharSet XIDStartChars(XIDStartRanges);
1471 static const llvm::sys::UnicodeCharSet XIDContinueChars(XIDContinueRanges);
1472 return C == '_' || XIDStartChars.contains(C) ||
1473 XIDContinueChars.contains(C);
1474 } else if (LangOpts.C11) {
1475 static const llvm::sys::UnicodeCharSet C11AllowedIDChars(
1476 C11AllowedIDCharRanges);
1477 return C11AllowedIDChars.contains(C);
1478 } else {
1479 static const llvm::sys::UnicodeCharSet C99AllowedIDChars(
1480 C99AllowedIDCharRanges);
1481 return C99AllowedIDChars.contains(C);
1482 }
1483}
1484
/// Return true if codepoint \p C may appear as the *first* character of an
/// identifier under the rules selected by \p LangOpts.  Only called for
/// non-ASCII codepoints (see the assert).
static bool isAllowedInitiallyIDChar(uint32_t C, const LangOptions &LangOpts) {
  assert(C > 0x7F && "isAllowedInitiallyIDChar called with an ASCII codepoint");
  if (LangOpts.AsmPreprocessor) {
    return false;
  }
  // C++ / C2x use the XID_Start property directly.
  if (LangOpts.CPlusPlus || LangOpts.C2x) {
    static const llvm::sys::UnicodeCharSet XIDStartChars(XIDStartRanges);
    return XIDStartChars.contains(C);
  }
  // C99/C11 model: the character must be a valid continue character that is
  // additionally not in the "disallowed initially" table for that standard.
  if (!isAllowedIDChar(C, LangOpts))
    return false;
  if (LangOpts.C11) {
    static const llvm::sys::UnicodeCharSet C11DisallowedInitialIDChars(
        C11DisallowedInitialIDCharRanges);
    return !C11DisallowedInitialIDChars.contains(C);
  }
  static const llvm::sys::UnicodeCharSet C99DisallowedInitialIDChars(
      C99DisallowedInitialIDCharRanges);
  return !C99DisallowedInitialIDChars.contains(C);
}
1505
1506static inline CharSourceRange makeCharRange(Lexer &L, const char *Begin,
1507 const char *End) {
1508 return CharSourceRange::getCharRange(L.getSourceLocation(Begin),
1509 L.getSourceLocation(End));
1510}
1511
1512static void maybeDiagnoseIDCharCompat(DiagnosticsEngine &Diags, uint32_t C,
1513 CharSourceRange Range, bool IsFirst) {
1514 // Check C99 compatibility.
1515 if (!Diags.isIgnored(diag::warn_c99_compat_unicode_id, Range.getBegin())) {
1516 enum {
1517 CannotAppearInIdentifier = 0,
1518 CannotStartIdentifier
1519 };
1520
1521 static const llvm::sys::UnicodeCharSet C99AllowedIDChars(
1522 C99AllowedIDCharRanges);
1523 static const llvm::sys::UnicodeCharSet C99DisallowedInitialIDChars(
1524 C99DisallowedInitialIDCharRanges);
1525 if (!C99AllowedIDChars.contains(C)) {
1526 Diags.Report(Range.getBegin(), diag::warn_c99_compat_unicode_id)
1527 << Range
1528 << CannotAppearInIdentifier;
1529 } else if (IsFirst && C99DisallowedInitialIDChars.contains(C)) {
1530 Diags.Report(Range.getBegin(), diag::warn_c99_compat_unicode_id)
1531 << Range
1532 << CannotStartIdentifier;
1533 }
1534 }
1535}
1536
/// After encountering UTF-8 character C and interpreting it as an identifier
/// character, check whether it's a homoglyph for a common non-identifier
/// source character that is unlikely to be an intentional identifier
/// character and warn if so.
static void maybeDiagnoseUTF8Homoglyph(DiagnosticsEngine &Diags, uint32_t C,
                                       CharSourceRange Range) {
  // FIXME: Handle Unicode quotation marks (smart quotes, fullwidth quotes).
  struct HomoglyphPair {
    uint32_t Character;
    // The ASCII character this codepoint resembles, or 0 for an invisible
    // (zero-width) character.
    char LooksLike;
    bool operator<(HomoglyphPair R) const { return Character < R.Character; }
  };
  // Must stay sorted by Character: looked up with std::lower_bound below.
  // The trailing {0, 0} sentinel is excluded from the search range.
  static constexpr HomoglyphPair SortedHomoglyphs[] = {
    {U'\u00ad', 0},   // SOFT HYPHEN
    {U'\u01c3', '!'}, // LATIN LETTER RETROFLEX CLICK
    {U'\u037e', ';'}, // GREEK QUESTION MARK
    {U'\u200b', 0},   // ZERO WIDTH SPACE
    {U'\u200c', 0},   // ZERO WIDTH NON-JOINER
    {U'\u200d', 0},   // ZERO WIDTH JOINER
    {U'\u2060', 0},   // WORD JOINER
    {U'\u2061', 0},   // FUNCTION APPLICATION
    {U'\u2062', 0},   // INVISIBLE TIMES
    {U'\u2063', 0},   // INVISIBLE SEPARATOR
    {U'\u2064', 0},   // INVISIBLE PLUS
    {U'\u2212', '-'}, // MINUS SIGN
    {U'\u2215', '/'}, // DIVISION SLASH
    {U'\u2216', '\\'}, // SET MINUS
    {U'\u2217', '*'}, // ASTERISK OPERATOR
    {U'\u2223', '|'}, // DIVIDES
    {U'\u2227', '^'}, // LOGICAL AND
    {U'\u2236', ':'}, // RATIO
    {U'\u223c', '~'}, // TILDE OPERATOR
    {U'\ua789', ':'}, // MODIFIER LETTER COLON
    {U'\ufeff', 0},   // ZERO WIDTH NO-BREAK SPACE
    {U'\uff01', '!'}, // FULLWIDTH EXCLAMATION MARK
    {U'\uff03', '#'}, // FULLWIDTH NUMBER SIGN
    {U'\uff04', '$'}, // FULLWIDTH DOLLAR SIGN
    {U'\uff05', '%'}, // FULLWIDTH PERCENT SIGN
    {U'\uff06', '&'}, // FULLWIDTH AMPERSAND
    {U'\uff08', '('}, // FULLWIDTH LEFT PARENTHESIS
    {U'\uff09', ')'}, // FULLWIDTH RIGHT PARENTHESIS
    {U'\uff0a', '*'}, // FULLWIDTH ASTERISK
    {U'\uff0b', '+'}, // FULLWIDTH PLUS SIGN
    {U'\uff0c', ','}, // FULLWIDTH COMMA
    {U'\uff0d', '-'}, // FULLWIDTH HYPHEN-MINUS
    {U'\uff0e', '.'}, // FULLWIDTH FULL STOP
    {U'\uff0f', '/'}, // FULLWIDTH SOLIDUS
    {U'\uff1a', ':'}, // FULLWIDTH COLON
    {U'\uff1b', ';'}, // FULLWIDTH SEMICOLON
    {U'\uff1c', '<'}, // FULLWIDTH LESS-THAN SIGN
    {U'\uff1d', '='}, // FULLWIDTH EQUALS SIGN
    {U'\uff1e', '>'}, // FULLWIDTH GREATER-THAN SIGN
    {U'\uff1f', '?'}, // FULLWIDTH QUESTION MARK
    {U'\uff20', '@'}, // FULLWIDTH COMMERCIAL AT
    {U'\uff3b', '['}, // FULLWIDTH LEFT SQUARE BRACKET
    {U'\uff3c', '\\'}, // FULLWIDTH REVERSE SOLIDUS
    {U'\uff3d', ']'}, // FULLWIDTH RIGHT SQUARE BRACKET
    {U'\uff3e', '^'}, // FULLWIDTH CIRCUMFLEX ACCENT
    {U'\uff5b', '{'}, // FULLWIDTH LEFT CURLY BRACKET
    {U'\uff5c', '|'}, // FULLWIDTH VERTICAL LINE
    {U'\uff5d', '}'}, // FULLWIDTH RIGHT CURLY BRACKET
    {U'\uff5e', '~'}, // FULLWIDTH TILDE
    {0, 0}
  };
  // Binary search; if C is larger than every entry, lower_bound lands on the
  // {0, 0} sentinel whose Character never matches C.
  auto Homoglyph =
      std::lower_bound(std::begin(SortedHomoglyphs),
                       std::end(SortedHomoglyphs) - 1, HomoglyphPair{C, '\0'});
  if (Homoglyph->Character == C) {
    // Render the codepoint as a 4-digit uppercase hex string for the note.
    llvm::SmallString<5> CharBuf;
    {
      llvm::raw_svector_ostream CharOS(CharBuf);
      llvm::write_hex(CharOS, C, llvm::HexPrintStyle::Upper, 4);
    }
    if (Homoglyph->LooksLike) {
      const char LooksLikeStr[] = {Homoglyph->LooksLike, 0};
      Diags.Report(Range.getBegin(), diag::warn_utf8_symbol_homoglyph)
          << Range << CharBuf << LooksLikeStr;
    } else {
      Diags.Report(Range.getBegin(), diag::warn_utf8_symbol_zero_width)
          << Range << CharBuf;
    }
  }
}
1620
/// Diagnose a non-ASCII codepoint that appeared in an identifier but is not
/// valid there (at position \p IsFirst).  Emits an error with a fix-it that
/// removes the offending character; ASCII codepoints are never diagnosed here.
static void diagnoseInvalidUnicodeCodepointInIdentifier(
    DiagnosticsEngine &Diags, const LangOptions &LangOpts, uint32_t CodePoint,
    CharSourceRange Range, bool IsFirst) {
  if (isASCII(CodePoint))
    return;

  bool IsIDStart = isAllowedInitiallyIDChar(CodePoint, LangOpts);
  bool IsIDContinue = IsIDStart || isAllowedIDChar(CodePoint, LangOpts);

  // Valid at this position: nothing to report.
  if ((IsFirst && IsIDStart) || (!IsFirst && IsIDContinue))
    return;

  // True when the codepoint would be fine later in the identifier but cannot
  // start one; selects the more specific diagnostic wording below.
  bool InvalidOnlyAtStart = IsFirst && !IsIDStart && IsIDContinue;

  // Format the codepoint as 4-digit uppercase hex for the diagnostic text.
  llvm::SmallString<5> CharBuf;
  llvm::raw_svector_ostream CharOS(CharBuf);
  llvm::write_hex(CharOS, CodePoint, llvm::HexPrintStyle::Upper, 4);

  if (!IsFirst || InvalidOnlyAtStart) {
    Diags.Report(Range.getBegin(), diag::err_character_not_allowed_identifier)
        << Range << CharBuf << int(InvalidOnlyAtStart)
        << FixItHint::CreateRemoval(Range);
  } else {
    Diags.Report(Range.getBegin(), diag::err_character_not_allowed)
        << Range << CharBuf << FixItHint::CreateRemoval(Range);
  }
}
1648
/// Try to consume a universal-character-name (\uXXXX / \UXXXXXXXX) as an
/// identifier-continue character.  \p CurPtr points at the backslash and
/// \p Size is the (possibly escaped) size of that backslash.  On success,
/// advances CurPtr past the UCN and returns true; otherwise returns false
/// and leaves CurPtr untouched.
bool Lexer::tryConsumeIdentifierUCN(const char *&CurPtr, unsigned Size,
                                    Token &Result) {
  const char *UCNPtr = CurPtr + Size;
  uint32_t CodePoint = tryReadUCN(UCNPtr, CurPtr, /*Token=*/nullptr);
  if (CodePoint == 0) {
    return false;
  }

  if (!isAllowedIDChar(CodePoint, LangOpts)) {
    if (isASCII(CodePoint) || isUnicodeWhitespace(CodePoint))
      return false;
    // NOTE: PP is only dereferenced when not lexing in raw mode; the raw-mode
    // check must stay first in this condition.
    if (!isLexingRawMode() && !ParsingPreprocessorDirective &&
        !PP->isPreprocessedOutput())
      diagnoseInvalidUnicodeCodepointInIdentifier(
          PP->getDiagnostics(), LangOpts, CodePoint,
          makeCharRange(*this, CurPtr, UCNPtr),
          /*IsFirst=*/false);

    // We got a unicode codepoint that is neither a space nor a
    // a valid identifier part.
    // Carry on as if the codepoint was valid for recovery purposes.
  } else if (!isLexingRawMode())
    maybeDiagnoseIDCharCompat(PP->getDiagnostics(), CodePoint,
                              makeCharRange(*this, CurPtr, UCNPtr),
                              /*IsFirst=*/false);

  Result.setFlag(Token::HasUCN);
  // A cleanly spelled UCN (exactly "\uXXXX" or "\UXXXXXXXX") can be skipped
  // in one step; otherwise (trigraphs/escaped newlines inside the UCN) we
  // re-lex character by character so Token flags are updated correctly.
  if ((UCNPtr - CurPtr == 6 && CurPtr[1] == 'u') ||
      (UCNPtr - CurPtr == 10 && CurPtr[1] == 'U'))
    CurPtr = UCNPtr;
  else
    while (CurPtr != UCNPtr)
      (void)getAndAdvanceChar(CurPtr, Result);
  return true;
}
1684
/// Try to consume a raw UTF-8 encoded codepoint at \p CurPtr as an
/// identifier-continue character.  On success advances CurPtr past the
/// multi-byte sequence and returns true; returns false (CurPtr unchanged)
/// if the bytes are not valid UTF-8 or decode to ASCII/whitespace that is
/// not an identifier character.
bool Lexer::tryConsumeIdentifierUTF8Char(const char *&CurPtr) {
  const char *UnicodePtr = CurPtr;
  llvm::UTF32 CodePoint;
  // Decode strictly; UnicodePtr is advanced past the sequence on success.
  llvm::ConversionResult Result =
      llvm::convertUTF8Sequence((const llvm::UTF8 **)&UnicodePtr,
                                (const llvm::UTF8 *)BufferEnd,
                                &CodePoint,
                                llvm::strictConversion);
  if (Result != llvm::conversionOK)
    return false;

  if (!isAllowedIDChar(static_cast<uint32_t>(CodePoint), LangOpts)) {
    if (isASCII(CodePoint) || isUnicodeWhitespace(CodePoint))
      return false;

    // NOTE: PP is only dereferenced when not lexing in raw mode.
    if (!isLexingRawMode() && !ParsingPreprocessorDirective &&
        !PP->isPreprocessedOutput())
      diagnoseInvalidUnicodeCodepointInIdentifier(
          PP->getDiagnostics(), LangOpts, CodePoint,
          makeCharRange(*this, CurPtr, UnicodePtr), /*IsFirst=*/false);
    // We got a unicode codepoint that is neither a space nor a
    // a valid identifier part. Carry on as if the codepoint was
    // valid for recovery purposes.
  } else if (!isLexingRawMode()) {
    maybeDiagnoseIDCharCompat(PP->getDiagnostics(), CodePoint,
                              makeCharRange(*this, CurPtr, UnicodePtr),
                              /*IsFirst=*/false);
    maybeDiagnoseUTF8Homoglyph(PP->getDiagnostics(), CodePoint,
                               makeCharRange(*this, CurPtr, UnicodePtr));
  }

  CurPtr = UnicodePtr;
  return true;
}
1719
1720bool Lexer::LexUnicodeIdentifierStart(Token &Result, uint32_t C,
1721 const char *CurPtr) {
1722 if (isAllowedInitiallyIDChar(C, LangOpts)) {
1723 if (!isLexingRawMode() && !ParsingPreprocessorDirective &&
1724 !PP->isPreprocessedOutput()) {
1725 maybeDiagnoseIDCharCompat(PP->getDiagnostics(), C,
1726 makeCharRange(*this, BufferPtr, CurPtr),
1727 /*IsFirst=*/true);
1728 maybeDiagnoseUTF8Homoglyph(PP->getDiagnostics(), C,
1729 makeCharRange(*this, BufferPtr, CurPtr));
1730 }
1731
1732 MIOpt.ReadToken();
1733 return LexIdentifierContinue(Result, CurPtr);
1734 }
1735
1736 if (!isLexingRawMode() && !ParsingPreprocessorDirective &&
1737 !PP->isPreprocessedOutput() && !isASCII(*BufferPtr) &&
1738 !isAllowedInitiallyIDChar(C, LangOpts) && !isUnicodeWhitespace(C)) {
1739 // Non-ASCII characters tend to creep into source code unintentionally.
1740 // Instead of letting the parser complain about the unknown token,
1741 // just drop the character.
1742 // Note that we can /only/ do this when the non-ASCII character is actually
1743 // spelled as Unicode, not written as a UCN. The standard requires that
1744 // we not throw away any possible preprocessor tokens, but there's a
1745 // loophole in the mapping of Unicode characters to basic character set
1746 // characters that allows us to map these particular characters to, say,
1747 // whitespace.
1748 diagnoseInvalidUnicodeCodepointInIdentifier(
1749 PP->getDiagnostics(), LangOpts, C,
1750 makeCharRange(*this, BufferPtr, CurPtr), /*IsStart*/ true);
1751 BufferPtr = CurPtr;
1752 return false;
1753 }
1754
1755 // Otherwise, we have an explicit UCN or a character that's unlikely to show
1756 // up by accident.
1757 MIOpt.ReadToken();
1758 FormTokenWithChars(Result, CurPtr, tok::unknown);
1759 return true;
1760}
1761
/// Lex the remainder of an identifier whose first character has already been
/// consumed.  Handles the ASCII fast path plus trigraphs, '$', UCNs and raw
/// UTF-8 codepoints, then forms a raw_identifier token and (outside raw
/// mode) resolves it through the preprocessor.
bool Lexer::LexIdentifierContinue(Token &Result, const char *CurPtr) {
  // Match [_A-Za-z0-9]*, we have already matched an identifier start.
  while (true) {
    unsigned char C = *CurPtr;
    // Fast path: plain ASCII identifier character, no size bookkeeping.
    if (isAsciiIdentifierContinue(C)) {
      ++CurPtr;
      continue;
    }

    unsigned Size;
    // Slow path: handle trigraph, unicode codepoints, UCNs.
    C = getCharAndSize(CurPtr, Size);
    if (isAsciiIdentifierContinue(C)) {
      CurPtr = ConsumeChar(CurPtr, Size, Result);
      continue;
    }
    if (C == '$') {
      // If we hit a $ and they are not supported in identifiers, we are done.
      if (!LangOpts.DollarIdents)
        break;
      // Otherwise, emit a diagnostic and continue.
      if (!isLexingRawMode())
        Diag(CurPtr, diag::ext_dollar_in_identifier);
      CurPtr = ConsumeChar(CurPtr, Size, Result);
      continue;
    }
    if (C == '\\' && tryConsumeIdentifierUCN(CurPtr, Size, Result))
      continue;
    if (!isASCII(C) && tryConsumeIdentifierUTF8Char(CurPtr))
      continue;
    // Neither an expected Unicode codepoint nor a UCN.
    break;
  }

  const char *IdStart = BufferPtr;
  FormTokenWithChars(Result, CurPtr, tok::raw_identifier);
  Result.setRawIdentifierData(IdStart);

  // If we are in raw mode, return this identifier raw. There is no need to
  // look up identifier information or attempt to macro expand it.
  if (LexingRawMode)
    return true;

  // Fill in Result.IdentifierInfo and update the token kind,
  // looking up the identifier in the identifier table.
  IdentifierInfo *II = PP->LookUpIdentifierInfo(Result);
  // Note that we have to call PP->LookUpIdentifierInfo() even for code
  // completion, it writes IdentifierInfo into Result, and callers rely on it.

  // If the completion point is at the end of an identifier, we want to treat
  // the identifier as incomplete even if it resolves to a macro or a keyword.
  // This allows e.g. 'class^' to complete to 'classifier'.
  if (isCodeCompletionPoint(CurPtr)) {
    // Return the code-completion token.
    Result.setKind(tok::code_completion);
    // Skip the code-completion char and all immediate identifier characters.
    // This ensures we get consistent behavior when completing at any point in
    // an identifier (i.e. at the start, in the middle, at the end). Note that
    // only simple cases (i.e. [a-zA-Z0-9_]) are supported to keep the code
    // simpler.
    assert(*CurPtr == 0 && "Completion character must be 0");
    ++CurPtr;
    // Note that code completion token is not added as a separate character
    // when the completion point is at the end of the buffer. Therefore, we need
    // to check if the buffer has ended.
    if (CurPtr < BufferEnd) {
      while (isAsciiIdentifierContinue(*CurPtr))
        ++CurPtr;
    }
    BufferPtr = CurPtr;
    return true;
  }

  // Finally, now that we know we have an identifier, pass this off to the
  // preprocessor, which may macro expand it or something.
  if (II->isHandleIdentifierCase())
    return PP->HandleIdentifier(Result);

  return true;
}
1843
1844/// isHexaLiteral - Return true if Start points to a hex constant.
1845/// in microsoft mode (where this is supposed to be several different tokens).
1846bool Lexer::isHexaLiteral(const char *Start, const LangOptions &LangOpts) {
1847 unsigned Size;
1848 char C1 = Lexer::getCharAndSizeNoWarn(Start, Size, LangOpts);
1849 if (C1 != '0')
1850 return false;
1851 char C2 = Lexer::getCharAndSizeNoWarn(Start + Size, Size, LangOpts);
1852 return (C2 == 'x' || C2 == 'X');
1853}
1854
/// LexNumericConstant - Lex the remainder of a integer or floating point
/// constant. From[-1] is the first character lexed.  Return the end of the
/// constant.  Recurses to continue the constant after exponent signs, digit
/// separators, UCNs and UTF-8 characters.
bool Lexer::LexNumericConstant(Token &Result, const char *CurPtr) {
  unsigned Size;
  char C = getCharAndSize(CurPtr, Size);
  char PrevCh = 0;
  // Consume the maximal pp-number body; PrevCh tracks the previous character
  // so exponent signs (e+/p-) can be recognized below.
  while (isPreprocessingNumberBody(C)) {
    CurPtr = ConsumeChar(CurPtr, Size, Result);
    PrevCh = C;
    C = getCharAndSize(CurPtr, Size);
  }

  // If we fell out, check for a sign, due to 1e+12. If we have one, continue.
  if ((C == '-' || C == '+') && (PrevCh == 'E' || PrevCh == 'e')) {
    // If we are in Microsoft mode, don't continue if the constant is hex.
    // For example, MSVC will accept the following as 3 tokens: 0x1234567e+1
    if (!LangOpts.MicrosoftExt || !isHexaLiteral(BufferPtr, LangOpts))
      return LexNumericConstant(Result, ConsumeChar(CurPtr, Size, Result));
  }

  // If we have a hex FP constant, continue.
  if ((C == '-' || C == '+') && (PrevCh == 'P' || PrevCh == 'p')) {
    // Outside C99 and C++17, we accept hexadecimal floating point numbers as a
    // not-quite-conforming extension. Only do so if this looks like it's
    // actually meant to be a hexfloat, and not if it has a ud-suffix.
    bool IsHexFloat = true;
    if (!LangOpts.C99) {
      if (!isHexaLiteral(BufferPtr, LangOpts))
        IsHexFloat = false;
      else if (!LangOpts.CPlusPlus17 &&
               std::find(BufferPtr, CurPtr, '_') != CurPtr)
        IsHexFloat = false;
    }
    if (IsHexFloat)
      return LexNumericConstant(Result, ConsumeChar(CurPtr, Size, Result));
  }

  // If we have a digit separator, continue.
  if (C == '\'' && (LangOpts.CPlusPlus14 || LangOpts.C2x)) {
    unsigned NextSize;
    // A ' only continues the constant if it is followed by another
    // identifier-continue character (e.g. 1'000); otherwise it starts a
    // character literal.
    char Next = getCharAndSizeNoWarn(CurPtr + Size, NextSize, LangOpts);
    if (isAsciiIdentifierContinue(Next)) {
      if (!isLexingRawMode())
        Diag(CurPtr, LangOpts.CPlusPlus
                         ? diag::warn_cxx11_compat_digit_separator
                         : diag::warn_c2x_compat_digit_separator);
      CurPtr = ConsumeChar(CurPtr, Size, Result);
      CurPtr = ConsumeChar(CurPtr, NextSize, Result);
      return LexNumericConstant(Result, CurPtr);
    }
  }

  // If we have a UCN or UTF-8 character (perhaps in a ud-suffix), continue.
  if (C == '\\' && tryConsumeIdentifierUCN(CurPtr, Size, Result))
    return LexNumericConstant(Result, CurPtr);
  if (!isASCII(C) && tryConsumeIdentifierUTF8Char(CurPtr))
    return LexNumericConstant(Result, CurPtr);

  // Update the location of token as well as BufferPtr.
  const char *TokStart = BufferPtr;
  FormTokenWithChars(Result, CurPtr, tok::numeric_constant);
  Result.setLiteralData(TokStart);
  return true;
}
1920
/// LexUDSuffix - Lex the ud-suffix production for user-defined literal suffixes
/// in C++11, or warn on a ud-suffix in C++98.  Returns the updated CurPtr
/// (past the suffix if one was consumed).
const char *Lexer::LexUDSuffix(Token &Result, const char *CurPtr,
                               bool IsStringLiteral) {
  assert(LangOpts.CPlusPlus);

  // Maximally munch an identifier.
  unsigned Size;
  char C = getCharAndSize(CurPtr, Size);
  // Tracks whether the first suffix character was already consumed via a
  // UCN/UTF-8 path below (in which case the reserved-suffix heuristics are
  // skipped: such spellings are assumed to be genuine ud-suffixes).
  bool Consumed = false;

  if (!isAsciiIdentifierStart(C)) {
    if (C == '\\' && tryConsumeIdentifierUCN(CurPtr, Size, Result))
      Consumed = true;
    else if (!isASCII(C) && tryConsumeIdentifierUTF8Char(CurPtr))
      Consumed = true;
    else
      return CurPtr;  // Not a suffix at all.
  }

  // Pre-C++11: a suffix is just a following token; warn about the C++11
  // meaning change but consume nothing.
  if (!LangOpts.CPlusPlus11) {
    if (!isLexingRawMode())
      Diag(CurPtr,
           C == '_' ? diag::warn_cxx11_compat_user_defined_literal
                    : diag::warn_cxx11_compat_reserved_user_defined_literal)
          << FixItHint::CreateInsertion(getSourceLocation(CurPtr), " ");
    return CurPtr;
  }

  // C++11 [lex.ext]p10, [usrlit.suffix]p1: A program containing a ud-suffix
  // that does not start with an underscore is ill-formed. As a conforming
  // extension, we treat all such suffixes as if they had whitespace before
  // them. We assume a suffix beginning with a UCN or UTF-8 character is more
  // likely to be a ud-suffix than a macro, however, and accept that.
  if (!Consumed) {
    bool IsUDSuffix = false;
    if (C == '_')
      IsUDSuffix = true;
    else if (IsStringLiteral && LangOpts.CPlusPlus14) {
      // In C++1y, we need to look ahead a few characters to see if this is a
      // valid suffix for a string literal or a numeric literal (this could be
      // the 'operator""if' defining a numeric literal operator).
      const unsigned MaxStandardSuffixLength = 3;
      char Buffer[MaxStandardSuffixLength] = { C };
      unsigned Consumed = Size;  // NB: shadows the outer 'Consumed' flag.
      unsigned Chars = 1;
      while (true) {
        unsigned NextSize;
        char Next = getCharAndSizeNoWarn(CurPtr + Consumed, NextSize, LangOpts);
        if (!isAsciiIdentifierContinue(Next)) {
          // End of suffix. Check whether this is on the allowed list.
          const StringRef CompleteSuffix(Buffer, Chars);
          IsUDSuffix =
              StringLiteralParser::isValidUDSuffix(LangOpts, CompleteSuffix);
          break;
        }

        if (Chars == MaxStandardSuffixLength)
          // Too long: can't be a standard suffix.
          break;

        Buffer[Chars++] = Next;
        Consumed += NextSize;
      }
    }

    if (!IsUDSuffix) {
      if (!isLexingRawMode())
        Diag(CurPtr, LangOpts.MSVCCompat
                         ? diag::ext_ms_reserved_user_defined_literal
                         : diag::ext_reserved_user_defined_literal)
            << FixItHint::CreateInsertion(getSourceLocation(CurPtr), " ");
      return CurPtr;
    }

    CurPtr = ConsumeChar(CurPtr, Size, Result);
  }

  Result.setFlag(Token::HasUDSuffix);
  // Consume the rest of the suffix: ASCII id-continue, UCNs, or UTF-8.
  while (true) {
    C = getCharAndSize(CurPtr, Size);
    if (isAsciiIdentifierContinue(C)) {
      CurPtr = ConsumeChar(CurPtr, Size, Result);
    } else if (C == '\\' && tryConsumeIdentifierUCN(CurPtr, Size, Result)) {
    } else if (!isASCII(C) && tryConsumeIdentifierUTF8Char(CurPtr)) {
    } else
      break;
  }

  return CurPtr;
}
2012
/// LexStringLiteral - Lex the remainder of a string literal, after having lexed
/// either " or L" or u8" or u" or U".  Forms \p Kind on success, or
/// tok::unknown for an unterminated literal.
bool Lexer::LexStringLiteral(Token &Result, const char *CurPtr,
                             tok::TokenKind Kind) {
  const char *AfterQuote = CurPtr;
  // Does this string contain the \0 character?
  const char *NulCharacter = nullptr;

  if (!isLexingRawMode() &&
      (Kind == tok::utf8_string_literal ||
       Kind == tok::utf16_string_literal ||
       Kind == tok::utf32_string_literal))
    Diag(BufferPtr, LangOpts.CPlusPlus ? diag::warn_cxx98_compat_unicode_literal
                                       : diag::warn_c99_compat_unicode_literal);

  char C = getAndAdvanceChar(CurPtr, Result);
  while (C != '"') {
    // Skip escaped characters. Escaped newlines will already be processed by
    // getAndAdvanceChar.
    if (C == '\\')
      C = getAndAdvanceChar(CurPtr, Result);

    if (C == '\n' || C == '\r' ||              // Newline.
        (C == 0 && CurPtr-1 == BufferEnd)) {   // End of file.
      // Unterminated string: diagnose (outside asm/raw mode) and emit an
      // unknown token covering what we scanned.
      if (!isLexingRawMode() && !LangOpts.AsmPreprocessor)
        Diag(BufferPtr, diag::ext_unterminated_char_or_string) << 1;
      FormTokenWithChars(Result, CurPtr-1, tok::unknown);
      return true;
    }

    if (C == 0) {
      // An embedded NUL may be the code-completion marker; otherwise just
      // remember it so we can warn once after the literal.
      if (isCodeCompletionPoint(CurPtr-1)) {
        if (ParsingFilename)
          codeCompleteIncludedFile(AfterQuote, CurPtr - 1, /*IsAngled=*/false);
        else
          PP->CodeCompleteNaturalLanguage();
        FormTokenWithChars(Result, CurPtr - 1, tok::unknown);
        cutOffLexing();
        return true;
      }

      NulCharacter = CurPtr-1;
    }
    C = getAndAdvanceChar(CurPtr, Result);
  }

  // If we are in C++11, lex the optional ud-suffix.
  if (LangOpts.CPlusPlus)
    CurPtr = LexUDSuffix(Result, CurPtr, true);

  // If a nul character existed in the string, warn about it.
  if (NulCharacter && !isLexingRawMode())
    Diag(NulCharacter, diag::null_in_char_or_string) << 1;

  // Update the location of the token as well as the BufferPtr instance var.
  const char *TokStart = BufferPtr;
  FormTokenWithChars(Result, CurPtr, Kind);
  Result.setLiteralData(TokStart);
  return true;
}
2073
/// LexRawStringLiteral - Lex the remainder of a raw string literal, after
/// having lexed R", LR", u8R", uR", or UR".  Forms \p Kind on success, or
/// tok::unknown for a bad delimiter / unterminated literal.
bool Lexer::LexRawStringLiteral(Token &Result, const char *CurPtr,
                                tok::TokenKind Kind) {
  // This function doesn't use getAndAdvanceChar because C++0x [lex.pptoken]p3:
  // Between the initial and final double quote characters of the raw string,
  // any transformations performed in phases 1 and 2 (trigraphs,
  // universal-character-names, and line splicing) are reverted.

  if (!isLexingRawMode())
    Diag(BufferPtr, diag::warn_cxx98_compat_raw_string_literal);

  // Scan the d-char-sequence; the standard caps it at 16 characters.
  unsigned PrefixLen = 0;

  while (PrefixLen != 16 && isRawStringDelimBody(CurPtr[PrefixLen]))
    ++PrefixLen;

  // If the last character was not a '(', then we didn't lex a valid delimiter.
  if (CurPtr[PrefixLen] != '(') {
    if (!isLexingRawMode()) {
      const char *PrefixEnd = &CurPtr[PrefixLen];
      if (PrefixLen == 16) {
        Diag(PrefixEnd, diag::err_raw_delim_too_long);
      } else {
        Diag(PrefixEnd, diag::err_invalid_char_raw_delim)
          << StringRef(PrefixEnd, 1);
      }
    }

    // Search for the next '"' in hopes of salvaging the lexer. Unfortunately,
    // it's possible the '"' was intended to be part of the raw string, but
    // there's not much we can do about that.
    while (true) {
      char C = *CurPtr++;

      if (C == '"')
        break;
      if (C == 0 && CurPtr-1 == BufferEnd) {
        --CurPtr;
        break;
      }
    }

    FormTokenWithChars(Result, CurPtr, tok::unknown);
    return true;
  }

  // Save prefix and move CurPtr past it
  const char *Prefix = CurPtr;
  CurPtr += PrefixLen + 1; // skip over prefix and '('

  // Scan for the matching ")prefix"" terminator.
  while (true) {
    char C = *CurPtr++;

    if (C == ')') {
      // Check for prefix match and closing quote.
      if (strncmp(CurPtr, Prefix, PrefixLen) == 0 && CurPtr[PrefixLen] == '"') {
        CurPtr += PrefixLen + 1; // skip over prefix and '"'
        break;
      }
    } else if (C == 0 && CurPtr-1 == BufferEnd) { // End of file.
      if (!isLexingRawMode())
        Diag(BufferPtr, diag::err_unterminated_raw_string)
          << StringRef(Prefix, PrefixLen);
      FormTokenWithChars(Result, CurPtr-1, tok::unknown);
      return true;
    }
  }

  // If we are in C++11, lex the optional ud-suffix.
  if (LangOpts.CPlusPlus)
    CurPtr = LexUDSuffix(Result, CurPtr, true);

  // Update the location of token as well as BufferPtr.
  const char *TokStart = BufferPtr;
  FormTokenWithChars(Result, CurPtr, Kind);
  Result.setLiteralData(TokStart);
  return true;
}
2153
/// LexAngledStringLiteral - Lex the remainder of an angled string literal,
/// after having lexed the '<' character. This is used for #include filenames.
/// Forms tok::header_name on success; an unterminated name degrades to a
/// plain tok::less.
bool Lexer::LexAngledStringLiteral(Token &Result, const char *CurPtr) {
  // Does this string contain the \0 character?
  const char *NulCharacter = nullptr;
  const char *AfterLessPos = CurPtr;
  char C = getAndAdvanceChar(CurPtr, Result);
  while (C != '>') {
    // Skip escaped characters. Escaped newlines will already be processed by
    // getAndAdvanceChar.
    if (C == '\\')
      C = getAndAdvanceChar(CurPtr, Result);

    if (isVerticalWhitespace(C) ||               // Newline.
        (C == 0 && (CurPtr - 1 == BufferEnd))) { // End of file.
      // If the filename is unterminated, then it must just be a lone <
      // character. Return this as such.
      FormTokenWithChars(Result, AfterLessPos, tok::less);
      return true;
    }

    if (C == 0) {
      // An embedded NUL may be the code-completion marker; otherwise remember
      // it so we can warn after the literal.
      if (isCodeCompletionPoint(CurPtr - 1)) {
        codeCompleteIncludedFile(AfterLessPos, CurPtr - 1, /*IsAngled=*/true);
        cutOffLexing();
        FormTokenWithChars(Result, CurPtr - 1, tok::unknown);
        return true;
      }
      NulCharacter = CurPtr-1;
    }
    C = getAndAdvanceChar(CurPtr, Result);
  }

  // If a nul character existed in the string, warn about it.
  if (NulCharacter && !isLexingRawMode())
    Diag(NulCharacter, diag::null_in_char_or_string) << 1;

  // Update the location of token as well as BufferPtr.
  const char *TokStart = BufferPtr;
  FormTokenWithChars(Result, CurPtr, tok::header_name);
  Result.setLiteralData(TokStart);
  return true;
}
2197
/// Set up preprocessor state for code completion inside an #include filename.
/// \p PathStart points just past the opening quote/'<', \p CompletionPoint at
/// the completion marker, and \p IsAngled selects <...> vs "..." form.
void Lexer::codeCompleteIncludedFile(const char *PathStart,
                                     const char *CompletionPoint,
                                     bool IsAngled) {
  // Completion only applies to the filename, after the last slash.
  StringRef PartialPath(PathStart, CompletionPoint - PathStart);
  // MSVC-compatible mode also accepts backslash as a path separator.
  llvm::StringRef SlashChars = LangOpts.MSVCCompat ? "/\\" : "/";
  auto Slash = PartialPath.find_last_of(SlashChars);
  StringRef Dir =
      (Slash == StringRef::npos) ? "" : PartialPath.take_front(Slash);
  const char *StartOfFilename =
      (Slash == StringRef::npos) ? PathStart : PathStart + Slash + 1;
  // Code completion filter range is the filename only, up to completion point.
  PP->setCodeCompletionIdentifierInfo(&PP->getIdentifierTable().get(
      StringRef(StartOfFilename, CompletionPoint - StartOfFilename)));
  // We should replace the characters up to the closing quote or closest slash,
  // if any.
  while (CompletionPoint < BufferEnd) {
    char Next = *(CompletionPoint + 1);
    if (Next == 0 || Next == '\r' || Next == '\n')
      break;
    ++CompletionPoint;
    if (Next == (IsAngled ? '>' : '"'))
      break;
    if (llvm::is_contained(SlashChars, Next))
      break;
  }

  PP->setCodeCompletionTokenRange(
      FileLoc.getLocWithOffset(StartOfFilename - BufferStart),
      FileLoc.getLocWithOffset(CompletionPoint - BufferStart));
  PP->CodeCompleteIncludedFile(Dir, IsAngled);
}
2230
/// LexCharConstant - Lex the remainder of a character constant, after having
/// lexed either ' or L' or u8' or u' or U'.  Forms \p Kind on success, or
/// tok::unknown for an empty or unterminated constant.
bool Lexer::LexCharConstant(Token &Result, const char *CurPtr,
                            tok::TokenKind Kind) {
  // Does this character contain the \0 character?
  const char *NulCharacter = nullptr;

  if (!isLexingRawMode()) {
    if (Kind == tok::utf16_char_constant || Kind == tok::utf32_char_constant)
      Diag(BufferPtr, LangOpts.CPlusPlus
                          ? diag::warn_cxx98_compat_unicode_literal
                          : diag::warn_c99_compat_unicode_literal);
    else if (Kind == tok::utf8_char_constant)
      Diag(BufferPtr, diag::warn_cxx14_compat_u8_character_literal);
  }

  char C = getAndAdvanceChar(CurPtr, Result);
  // '' is invalid; emit an unknown token rather than looping below.
  if (C == '\'') {
    if (!isLexingRawMode() && !LangOpts.AsmPreprocessor)
      Diag(BufferPtr, diag::ext_empty_character);
    FormTokenWithChars(Result, CurPtr, tok::unknown);
    return true;
  }

  while (C != '\'') {
    // Skip escaped characters.
    if (C == '\\')
      C = getAndAdvanceChar(CurPtr, Result);

    if (C == '\n' || C == '\r' ||              // Newline.
        (C == 0 && CurPtr-1 == BufferEnd)) {   // End of file.
      // Unterminated character constant.
      if (!isLexingRawMode() && !LangOpts.AsmPreprocessor)
        Diag(BufferPtr, diag::ext_unterminated_char_or_string) << 0;
      FormTokenWithChars(Result, CurPtr-1, tok::unknown);
      return true;
    }

    if (C == 0) {
      // An embedded NUL may be the code-completion marker; otherwise remember
      // it so we can warn after the constant.
      if (isCodeCompletionPoint(CurPtr-1)) {
        PP->CodeCompleteNaturalLanguage();
        FormTokenWithChars(Result, CurPtr-1, tok::unknown);
        cutOffLexing();
        return true;
      }

      NulCharacter = CurPtr-1;
    }
    C = getAndAdvanceChar(CurPtr, Result);
  }

  // If we are in C++11, lex the optional ud-suffix.
  if (LangOpts.CPlusPlus)
    CurPtr = LexUDSuffix(Result, CurPtr, false);

  // If a nul character existed in the character, warn about it.
  if (NulCharacter && !isLexingRawMode())
    Diag(NulCharacter, diag::null_in_char_or_string) << 0;

  // Update the location of token as well as BufferPtr.
  const char *TokStart = BufferPtr;
  FormTokenWithChars(Result, CurPtr, Kind);
  Result.setLiteralData(TokStart);
  return true;
}
2295
2296/// SkipWhitespace - Efficiently skip over a series of whitespace characters.
2297/// Update BufferPtr to point to the next non-whitespace character and return.
2298///
2299/// This method forms a token and returns true if KeepWhitespaceMode is enabled.
2300bool Lexer::SkipWhitespace(Token &Result, const char *CurPtr,
2301 bool &TokAtPhysicalStartOfLine) {
2302 // Whitespace - Skip it, then return the token after the whitespace.
2303 bool SawNewline = isVerticalWhitespace(CurPtr[-1]);
2304
2305 unsigned char Char = *CurPtr;
2306
2307 const char *lastNewLine = nullptr;
2308 auto setLastNewLine = [&](const char *Ptr) {
2309 lastNewLine = Ptr;
2310 if (!NewLinePtr)
2311 NewLinePtr = Ptr;
2312 };
2313 if (SawNewline)
2314 setLastNewLine(CurPtr - 1);
2315
2316 // Skip consecutive spaces efficiently.
2317 while (true) {
2318 // Skip horizontal whitespace very aggressively.
2319 while (isHorizontalWhitespace(Char))
2320 Char = *++CurPtr;
2321
2322 // Otherwise if we have something other than whitespace, we're done.
2323 if (!isVerticalWhitespace(Char))
2324 break;
2325
2326 if (ParsingPreprocessorDirective) {
2327 // End of preprocessor directive line, let LexTokenInternal handle this.
2328 BufferPtr = CurPtr;
2329 return false;
2330 }
2331
2332 // OK, but handle newline.
2333 if (*CurPtr == '\n')
2334 setLastNewLine(CurPtr);
2335 SawNewline = true;
2336 Char = *++CurPtr;
2337 }
2338
2339 // If the client wants us to return whitespace, return it now.
2340 if (isKeepWhitespaceMode()) {
2341 FormTokenWithChars(Result, CurPtr, tok::unknown);
2342 if (SawNewline) {
2343 IsAtStartOfLine = true;
2344 IsAtPhysicalStartOfLine = true;
2345 }
2346 // FIXME: The next token will not have LeadingSpace set.
2347 return true;
2348 }
2349
2350 // If this isn't immediately after a newline, there is leading space.
2351 char PrevChar = CurPtr[-1];
2352 bool HasLeadingSpace = !isVerticalWhitespace(PrevChar);
2353
2354 Result.setFlagValue(Token::LeadingSpace, HasLeadingSpace);
2355 if (SawNewline) {
2356 Result.setFlag(Token::StartOfLine);
2357 TokAtPhysicalStartOfLine = true;
2358
2359 if (NewLinePtr && lastNewLine && NewLinePtr != lastNewLine && PP) {
2360 if (auto *Handler = PP->getEmptylineHandler())
2361 Handler->HandleEmptyline(SourceRange(getSourceLocation(NewLinePtr + 1),
2362 getSourceLocation(lastNewLine)));
2363 }
2364 }
2365
2366 BufferPtr = CurPtr;
2367 return false;
2368}
2369
2370/// We have just read the // characters from input. Skip until we find the
2371/// newline character that terminates the comment. Then update BufferPtr and
2372/// return.
2373///
2374/// If we're in KeepCommentMode or any CommentHandler has inserted
2375/// some tokens, this will store the first token and return true.
2376bool Lexer::SkipLineComment(Token &Result, const char *CurPtr,
2377 bool &TokAtPhysicalStartOfLine) {
2378 // If Line comments aren't explicitly enabled for this language, emit an
2379 // extension warning.
2380 if (!LineComment) {
2381 if (!isLexingRawMode()) // There's no PP in raw mode, so can't emit diags.
2382 Diag(BufferPtr, diag::ext_line_comment);
2383
2384 // Mark them enabled so we only emit one warning for this translation
2385 // unit.
2386 LineComment = true;
2387 }
2388
2389 // Scan over the body of the comment. The common case, when scanning, is that
2390 // the comment contains normal ascii characters with nothing interesting in
2391 // them. As such, optimize for this case with the inner loop.
2392 //
2393 // This loop terminates with CurPtr pointing at the newline (or end of buffer)
2394 // character that ends the line comment.
2395
2396 // C++23 [lex.phases] p1
2397 // Diagnose invalid UTF-8 if the corresponding warning is enabled, emitting a
2398 // diagnostic only once per entire ill-formed subsequence to avoid
2399 // emiting to many diagnostics (see http://unicode.org/review/pr-121.html).
2400 bool UnicodeDecodingAlreadyDiagnosed = false;
2401
2402 char C;
2403 while (true) {
2404 C = *CurPtr;
2405 // Skip over characters in the fast loop.
2406 while (isASCII(C) && C != 0 && // Potentially EOF.
2407 C != '\n' && C != '\r') { // Newline or DOS-style newline.
2408 C = *++CurPtr;
2409 UnicodeDecodingAlreadyDiagnosed = false;
2410 }
2411
2412 if (!isASCII(C)) {
2413 unsigned Length = llvm::getUTF8SequenceSize(
2414 (const llvm::UTF8 *)CurPtr, (const llvm::UTF8 *)BufferEnd);
2415 if (Length == 0) {
2416 if (!UnicodeDecodingAlreadyDiagnosed && !isLexingRawMode())
2417 Diag(CurPtr, diag::warn_invalid_utf8_in_comment);
2418 UnicodeDecodingAlreadyDiagnosed = true;
2419 ++CurPtr;
2420 } else {
2421 UnicodeDecodingAlreadyDiagnosed = false;
2422 CurPtr += Length;
2423 }
2424 continue;
2425 }
2426
2427 const char *NextLine = CurPtr;
2428 if (C != 0) {
2429 // We found a newline, see if it's escaped.
2430 const char *EscapePtr = CurPtr-1;
2431 bool HasSpace = false;
2432 while (isHorizontalWhitespace(*EscapePtr)) { // Skip whitespace.
2433 --EscapePtr;
2434 HasSpace = true;
2435 }
2436
2437 if (*EscapePtr == '\\')
2438 // Escaped newline.
2439 CurPtr = EscapePtr;
2440 else if (EscapePtr[0] == '/' && EscapePtr[-1] == '?' &&
2441 EscapePtr[-2] == '?' && LangOpts.Trigraphs)
2442 // Trigraph-escaped newline.
2443 CurPtr = EscapePtr-2;
2444 else
2445 break; // This is a newline, we're done.
2446
2447 // If there was space between the backslash and newline, warn about it.
2448 if (HasSpace && !isLexingRawMode())
2449 Diag(EscapePtr, diag::backslash_newline_space);
2450 }
2451
2452 // Otherwise, this is a hard case. Fall back on getAndAdvanceChar to
2453 // properly decode the character. Read it in raw mode to avoid emitting
2454 // diagnostics about things like trigraphs. If we see an escaped newline,
2455 // we'll handle it below.
2456 const char *OldPtr = CurPtr;
2457 bool OldRawMode = isLexingRawMode();
2458 LexingRawMode = true;
2459 C = getAndAdvanceChar(CurPtr, Result);
2460 LexingRawMode = OldRawMode;
2461
2462 // If we only read only one character, then no special handling is needed.
2463 // We're done and can skip forward to the newline.
2464 if (C != 0 && CurPtr == OldPtr+1) {
2465 CurPtr = NextLine;
2466 break;
2467 }
2468
2469 // If we read multiple characters, and one of those characters was a \r or
2470 // \n, then we had an escaped newline within the comment. Emit diagnostic
2471 // unless the next line is also a // comment.
2472 if (CurPtr != OldPtr + 1 && C != '/' &&
2473 (CurPtr == BufferEnd + 1 || CurPtr[0] != '/')) {
2474 for (; OldPtr != CurPtr; ++OldPtr)
2475 if (OldPtr[0] == '\n' || OldPtr[0] == '\r') {
2476 // Okay, we found a // comment that ends in a newline, if the next
2477 // line is also a // comment, but has spaces, don't emit a diagnostic.
2478 if (isWhitespace(C)) {
2479 const char *ForwardPtr = CurPtr;
2480 while (isWhitespace(*ForwardPtr)) // Skip whitespace.
2481 ++ForwardPtr;
2482 if (ForwardPtr[0] == '/' && ForwardPtr[1] == '/')
2483 break;
2484 }
2485
2486 if (!isLexingRawMode())
2487 Diag(OldPtr-1, diag::ext_multi_line_line_comment);
2488 break;
2489 }
2490 }
2491
2492 if (C == '\r' || C == '\n' || CurPtr == BufferEnd + 1) {
2493 --CurPtr;
2494 break;
2495 }
2496
2497 if (C == '\0' && isCodeCompletionPoint(CurPtr-1)) {
2498 PP->CodeCompleteNaturalLanguage();
2499 cutOffLexing();
2500 return false;
2501 }
2502 }
2503
2504 // Found but did not consume the newline. Notify comment handlers about the
2505 // comment unless we're in a #if 0 block.
2506 if (PP && !isLexingRawMode() &&
2507 PP->HandleComment(Result, SourceRange(getSourceLocation(BufferPtr),
2508 getSourceLocation(CurPtr)))) {
2509 BufferPtr = CurPtr;
2510 return true; // A token has to be returned.
2511 }
2512
2513 // If we are returning comments as tokens, return this comment as a token.
2514 if (inKeepCommentMode())
2515 return SaveLineComment(Result, CurPtr);
2516
2517 // If we are inside a preprocessor directive and we see the end of line,
2518 // return immediately, so that the lexer can return this as an EOD token.
2519 if (ParsingPreprocessorDirective || CurPtr == BufferEnd) {
2520 BufferPtr = CurPtr;
2521 return false;
2522 }
2523
2524 // Otherwise, eat the \n character. We don't care if this is a \n\r or
2525 // \r\n sequence. This is an efficiency hack (because we know the \n can't
2526 // contribute to another token), it isn't needed for correctness. Note that
2527 // this is ok even in KeepWhitespaceMode, because we would have returned the
2528 /// comment above in that mode.
2529 NewLinePtr = CurPtr++;
2530
2531 // The next returned token is at the start of the line.
2532 Result.setFlag(Token::StartOfLine);
2533 TokAtPhysicalStartOfLine = true;
2534 // No leading whitespace seen so far.
2535 Result.clearFlag(Token::LeadingSpace);
2536 BufferPtr = CurPtr;
2537 return false;
2538}
2539
2540/// If in save-comment mode, package up this Line comment in an appropriate
2541/// way and return it.
2542bool Lexer::SaveLineComment(Token &Result, const char *CurPtr) {
2543 // If we're not in a preprocessor directive, just return the // comment
2544 // directly.
2545 FormTokenWithChars(Result, CurPtr, tok::comment);
2546
2547 if (!ParsingPreprocessorDirective || LexingRawMode)
2548 return true;
2549
2550 // If this Line-style comment is in a macro definition, transmogrify it into
2551 // a C-style block comment.
2552 bool Invalid = false;
2553 std::string Spelling = PP->getSpelling(Result, &Invalid);
2554 if (Invalid)
2555 return true;
2556
2557 assert(Spelling[0] == '/' && Spelling[1] == '/' && "Not line comment?")(static_cast <bool> (Spelling[0] == '/' && Spelling
[1] == '/' && "Not line comment?") ? void (0) : __assert_fail
("Spelling[0] == '/' && Spelling[1] == '/' && \"Not line comment?\""
, "clang/lib/Lex/Lexer.cpp", 2557, __extension__ __PRETTY_FUNCTION__
))
;
2558 Spelling[1] = '*'; // Change prefix to "/*".
2559 Spelling += "*/"; // add suffix.
2560
2561 Result.setKind(tok::comment);
2562 PP->CreateString(Spelling, Result,
2563 Result.getLocation(), Result.getLocation());
2564 return true;
2565}
2566
2567/// isBlockCommentEndOfEscapedNewLine - Return true if the specified newline
2568/// character (either \\n or \\r) is part of an escaped newline sequence. Issue
2569/// a diagnostic if so. We know that the newline is inside of a block comment.
2570static bool isEndOfBlockCommentWithEscapedNewLine(const char *CurPtr, Lexer *L,
2571 bool Trigraphs) {
2572 assert(CurPtr[0] == '\n' || CurPtr[0] == '\r')(static_cast <bool> (CurPtr[0] == '\n' || CurPtr[0] == '\r'
) ? void (0) : __assert_fail ("CurPtr[0] == '\\n' || CurPtr[0] == '\\r'"
, "clang/lib/Lex/Lexer.cpp", 2572, __extension__ __PRETTY_FUNCTION__
))
;
2573
2574 // Position of the first trigraph in the ending sequence.
2575 const char *TrigraphPos = nullptr;
2576 // Position of the first whitespace after a '\' in the ending sequence.
2577 const char *SpacePos = nullptr;
2578
2579 while (true) {
2580 // Back up off the newline.
2581 --CurPtr;
2582
2583 // If this is a two-character newline sequence, skip the other character.
2584 if (CurPtr[0] == '\n' || CurPtr[0] == '\r') {
2585 // \n\n or \r\r -> not escaped newline.
2586 if (CurPtr[0] == CurPtr[1])
2587 return false;
2588 // \n\r or \r\n -> skip the newline.
2589 --CurPtr;
2590 }
2591
2592 // If we have horizontal whitespace, skip over it. We allow whitespace
2593 // between the slash and newline.
2594 while (isHorizontalWhitespace(*CurPtr) || *CurPtr == 0) {
2595 SpacePos = CurPtr;
2596 --CurPtr;
2597 }
2598
2599 // If we have a slash, this is an escaped newline.
2600 if (*CurPtr == '\\') {
2601 --CurPtr;
2602 } else if (CurPtr[0] == '/' && CurPtr[-1] == '?' && CurPtr[-2] == '?') {
2603 // This is a trigraph encoding of a slash.
2604 TrigraphPos = CurPtr - 2;
2605 CurPtr -= 3;
2606 } else {
2607 return false;
2608 }
2609
2610 // If the character preceding the escaped newline is a '*', then after line
2611 // splicing we have a '*/' ending the comment.
2612 if (*CurPtr == '*')
2613 break;
2614
2615 if (*CurPtr != '\n' && *CurPtr != '\r')
2616 return false;
2617 }
2618
2619 if (TrigraphPos) {
2620 // If no trigraphs are enabled, warn that we ignored this trigraph and
2621 // ignore this * character.
2622 if (!Trigraphs) {
2623 if (!L->isLexingRawMode())
2624 L->Diag(TrigraphPos, diag::trigraph_ignored_block_comment);
2625 return false;
2626 }
2627 if (!L->isLexingRawMode())
2628 L->Diag(TrigraphPos, diag::trigraph_ends_block_comment);
2629 }
2630
2631 // Warn about having an escaped newline between the */ characters.
2632 if (!L->isLexingRawMode())
2633 L->Diag(CurPtr + 1, diag::escaped_newline_block_comment_end);
2634
2635 // If there was space between the backslash and newline, warn about it.
2636 if (SpacePos && !L->isLexingRawMode())
2637 L->Diag(SpacePos, diag::backslash_newline_space);
2638
2639 return true;
2640}
2641
// Pull in the SIMD intrinsics used by SkipBlockComment's fast scanning loop.
// NOTE(review): the dump spelled this "__SSE2__1", a macro that is never
// defined and would silently disable the SSE2 fast path; the correct
// compiler-defined macro is __SSE2__.
#ifdef __SSE2__
#include <emmintrin.h>
#elif __ALTIVEC__
#include <altivec.h>
#undef bool
#endif
2648
2649/// We have just read from input the / and * characters that started a comment.
2650/// Read until we find the * and / characters that terminate the comment.
2651/// Note that we don't bother decoding trigraphs or escaped newlines in block
2652/// comments, because they cannot cause the comment to end. The only thing
2653/// that can happen is the comment could end with an escaped newline between
2654/// the terminating * and /.
2655///
2656/// If we're in KeepCommentMode or any CommentHandler has inserted
2657/// some tokens, this will store the first token and return true.
2658bool Lexer::SkipBlockComment(Token &Result, const char *CurPtr,
2659 bool &TokAtPhysicalStartOfLine) {
2660 // Scan one character past where we should, looking for a '/' character. Once
2661 // we find it, check to see if it was preceded by a *. This common
2662 // optimization helps people who like to put a lot of * characters in their
2663 // comments.
2664
2665 // The first character we get with newlines and trigraphs skipped to handle
2666 // the degenerate /*/ case below correctly if the * has an escaped newline
2667 // after it.
2668 unsigned CharSize;
2669 unsigned char C = getCharAndSize(CurPtr, CharSize);
11
Calling 'Lexer::getCharAndSize'
14
Returning from 'Lexer::getCharAndSize'
2670 CurPtr += CharSize;
2671 if (C == 0 && CurPtr == BufferEnd+1) {
15
Assuming 'C' is not equal to 0
2672 if (!isLexingRawMode())
2673 Diag(BufferPtr, diag::err_unterminated_block_comment);
2674 --CurPtr;
2675
2676 // KeepWhitespaceMode should return this broken comment as a token. Since
2677 // it isn't a well formed comment, just return it as an 'unknown' token.
2678 if (isKeepWhitespaceMode()) {
2679 FormTokenWithChars(Result, CurPtr, tok::unknown);
2680 return true;
2681 }
2682
2683 BufferPtr = CurPtr;
2684 return false;
2685 }
2686
2687 // Check to see if the first character after the '/*' is another /. If so,
2688 // then this slash does not end the block comment, it is part of it.
2689 if (C == '/')
16
Assuming the condition is false
17
Taking false branch
2690 C = *CurPtr++;
2691
2692 // C++23 [lex.phases] p1
2693 // Diagnose invalid UTF-8 if the corresponding warning is enabled, emitting a
2694 // diagnostic only once per entire ill-formed subsequence to avoid
2695 // emiting to many diagnostics (see http://unicode.org/review/pr-121.html).
2696 bool UnicodeDecodingAlreadyDiagnosed = false;
2697
2698 while (true) {
2699 // Skip over all non-interesting characters until we find end of buffer or a
2700 // (probably ending) '/' character.
2701 if (CurPtr + 24 < BufferEnd &&
18
Assuming the condition is true
2702 // If there is a code-completion point avoid the fast scan because it
2703 // doesn't check for '\0'.
2704 !(PP && PP->getCodeCompletionFileLoc() == FileLoc)) {
19
Assuming field 'PP' is null
2705 // While not aligned to a 16-byte boundary.
2706 while (C != '/' && (intptr_t)CurPtr % 16 != 0) {
20
Loop condition is true. Entering loop body
22
Assuming the condition is true
23
Loop condition is true. Entering loop body
2707 if (!isASCII(C))
21
Taking false branch
24
Taking true branch
2708 goto MultiByteUTF8;
25
Control jumps to line 2784
2709 C = *CurPtr++;
2710 }
2711 if (C == '/') goto FoundSlash;
2712
2713#ifdef __SSE2__1
2714 __m128i Slashes = _mm_set1_epi8('/');
2715 while (CurPtr + 16 < BufferEnd) {
2716 int Mask = _mm_movemask_epi8(*(const __m128i *)CurPtr);
2717 if (LLVM_UNLIKELY(Mask != 0)__builtin_expect((bool)(Mask != 0), false)) {
2718 goto MultiByteUTF8;
2719 }
2720 // look for slashes
2721 int cmp = _mm_movemask_epi8(_mm_cmpeq_epi8(*(const __m128i*)CurPtr,
2722 Slashes));
2723 if (cmp != 0) {
2724 // Adjust the pointer to point directly after the first slash. It's
2725 // not necessary to set C here, it will be overwritten at the end of
2726 // the outer loop.
2727 CurPtr += llvm::countTrailingZeros<unsigned>(cmp) + 1;
2728 goto FoundSlash;
2729 }
2730 CurPtr += 16;
2731 }
2732#elif __ALTIVEC__
2733 __vector unsigned char LongUTF = {0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
2734 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
2735 0x80, 0x80, 0x80, 0x80};
2736 __vector unsigned char Slashes = {
2737 '/', '/', '/', '/', '/', '/', '/', '/',
2738 '/', '/', '/', '/', '/', '/', '/', '/'
2739 };
2740 while (CurPtr + 16 < BufferEnd) {
2741 if (LLVM_UNLIKELY(__builtin_expect((bool)(vec_any_ge(*(const __vector unsigned char
*)CurPtr, LongUTF)), false)
2742 vec_any_ge(*(const __vector unsigned char *)CurPtr, LongUTF))__builtin_expect((bool)(vec_any_ge(*(const __vector unsigned char
*)CurPtr, LongUTF)), false)
)
2743 goto MultiByteUTF8;
2744 if (vec_any_eq(*(const __vector unsigned char *)CurPtr, Slashes)) {
2745 break;
2746 }
2747 CurPtr += 16;
2748 }
2749
2750#else
2751 while (CurPtr + 16 < BufferEnd) {
2752 bool HasNonASCII = false;
2753 for (unsigned I = 0; I < 16; ++I)
2754 HasNonASCII |= !isASCII(CurPtr[I]);
2755
2756 if (LLVM_UNLIKELY(HasNonASCII)__builtin_expect((bool)(HasNonASCII), false))
2757 goto MultiByteUTF8;
2758
2759 bool HasSlash = false;
2760 for (unsigned I = 0; I < 16; ++I)
2761 HasSlash |= CurPtr[I] == '/';
2762 if (HasSlash)
2763 break;
2764 CurPtr += 16;
2765 }
2766#endif
2767
2768 // It has to be one of the bytes scanned, increment to it and read one.
2769 C = *CurPtr++;
2770 }
2771
2772 // Loop to scan the remainder, warning on invalid UTF-8
2773 // if the corresponding warning is enabled, emitting a diagnostic only once
2774 // per sequence that cannot be decoded.
2775 while (C != '/' && C != '\0') {
2776 if (isASCII(C)) {
2777 UnicodeDecodingAlreadyDiagnosed = false;
2778 C = *CurPtr++;
2779 continue;
2780 }
2781 MultiByteUTF8:
2782 // CurPtr is 1 code unit past C, so to decode
2783 // the codepoint, we need to read from the previous position.
2784 unsigned Length = llvm::getUTF8SequenceSize(
2785 (const llvm::UTF8 *)CurPtr - 1, (const llvm::UTF8 *)BufferEnd);
2786 if (Length == 0) {
26
Assuming 'Length' is equal to 0
2787 if (!UnicodeDecodingAlreadyDiagnosed
26.1
'UnicodeDecodingAlreadyDiagnosed' is false
26.1
'UnicodeDecodingAlreadyDiagnosed' is false
&& !isLexingRawMode())
27
Assuming the condition is true
28
Taking true branch
2788 Diag(CurPtr - 1, diag::warn_invalid_utf8_in_comment);
29
Calling 'Lexer::Diag'
2789 UnicodeDecodingAlreadyDiagnosed = true;
2790 } else {
2791 UnicodeDecodingAlreadyDiagnosed = false;
2792 CurPtr += Length - 1;
2793 }
2794 C = *CurPtr++;
2795 }
2796
2797 if (C == '/') {
2798 FoundSlash:
2799 if (CurPtr[-2] == '*') // We found the final */. We're done!
2800 break;
2801
2802 if ((CurPtr[-2] == '\n' || CurPtr[-2] == '\r')) {
2803 if (isEndOfBlockCommentWithEscapedNewLine(CurPtr - 2, this,
2804 LangOpts.Trigraphs)) {
2805 // We found the final */, though it had an escaped newline between the
2806 // * and /. We're done!
2807 break;
2808 }
2809 }
2810 if (CurPtr[0] == '*' && CurPtr[1] != '/') {
2811 // If this is a /* inside of the comment, emit a warning. Don't do this
2812 // if this is a /*/, which will end the comment. This misses cases with
2813 // embedded escaped newlines, but oh well.
2814 if (!isLexingRawMode())
2815 Diag(CurPtr-1, diag::warn_nested_block_comment);
2816 }
2817 } else if (C == 0 && CurPtr == BufferEnd+1) {
2818 if (!isLexingRawMode())
2819 Diag(BufferPtr, diag::err_unterminated_block_comment);
2820 // Note: the user probably forgot a */. We could continue immediately
2821 // after the /*, but this would involve lexing a lot of what really is the
2822 // comment, which surely would confuse the parser.
2823 --CurPtr;
2824
2825 // KeepWhitespaceMode should return this broken comment as a token. Since
2826 // it isn't a well formed comment, just return it as an 'unknown' token.
2827 if (isKeepWhitespaceMode()) {
2828 FormTokenWithChars(Result, CurPtr, tok::unknown);
2829 return true;
2830 }
2831
2832 BufferPtr = CurPtr;
2833 return false;
2834 } else if (C == '\0' && isCodeCompletionPoint(CurPtr-1)) {
2835 PP->CodeCompleteNaturalLanguage();
2836 cutOffLexing();
2837 return false;
2838 }
2839
2840 C = *CurPtr++;
2841 }
2842
2843 // Notify comment handlers about the comment unless we're in a #if 0 block.
2844 if (PP && !isLexingRawMode() &&
2845 PP->HandleComment(Result, SourceRange(getSourceLocation(BufferPtr),
2846 getSourceLocation(CurPtr)))) {
2847 BufferPtr = CurPtr;
2848 return true; // A token has to be returned.
2849 }
2850
2851 // If we are returning comments as tokens, return this comment as a token.
2852 if (inKeepCommentMode()) {
2853 FormTokenWithChars(Result, CurPtr, tok::comment);
2854 return true;
2855 }
2856
2857 // It is common for the tokens immediately after a /**/ comment to be
2858 // whitespace. Instead of going through the big switch, handle it
2859 // efficiently now. This is safe even in KeepWhitespaceMode because we would
2860 // have already returned above with the comment as a token.
2861 if (isHorizontalWhitespace(*CurPtr)) {
2862 SkipWhitespace(Result, CurPtr+1, TokAtPhysicalStartOfLine);
2863 return false;
2864 }
2865
2866 // Otherwise, just return so that the next character will be lexed as a token.
2867 BufferPtr = CurPtr;
2868 Result.setFlag(Token::LeadingSpace);
2869 return false;
2870}
2871
2872//===----------------------------------------------------------------------===//
2873// Primary Lexing Entry Points
2874//===----------------------------------------------------------------------===//
2875
2876/// ReadToEndOfLine - Read the rest of the current preprocessor line as an
2877/// uninterpreted string. This switches the lexer out of directive mode.
2878void Lexer::ReadToEndOfLine(SmallVectorImpl<char> *Result) {
2879 assert(ParsingPreprocessorDirective && ParsingFilename == false &&(static_cast <bool> (ParsingPreprocessorDirective &&
ParsingFilename == false && "Must be in a preprocessing directive!"
) ? void (0) : __assert_fail ("ParsingPreprocessorDirective && ParsingFilename == false && \"Must be in a preprocessing directive!\""
, "clang/lib/Lex/Lexer.cpp", 2880, __extension__ __PRETTY_FUNCTION__
))
2880 "Must be in a preprocessing directive!")(static_cast <bool> (ParsingPreprocessorDirective &&
ParsingFilename == false && "Must be in a preprocessing directive!"
) ? void (0) : __assert_fail ("ParsingPreprocessorDirective && ParsingFilename == false && \"Must be in a preprocessing directive!\""
, "clang/lib/Lex/Lexer.cpp", 2880, __extension__ __PRETTY_FUNCTION__
))
;
2881 Token Tmp;
2882 Tmp.startToken();
2883
2884 // CurPtr - Cache BufferPtr in an automatic variable.
2885 const char *CurPtr = BufferPtr;
2886 while (true) {
2887 char Char = getAndAdvanceChar(CurPtr, Tmp);
2888 switch (Char) {
2889 default:
2890 if (Result)
2891 Result->push_back(Char);
2892 break;
2893 case 0: // Null.
2894 // Found end of file?
2895 if (CurPtr-1 != BufferEnd) {
2896 if (isCodeCompletionPoint(CurPtr-1)) {
2897 PP->CodeCompleteNaturalLanguage();
2898 cutOffLexing();
2899 return;
2900 }
2901
2902 // Nope, normal character, continue.
2903 if (Result)
2904 Result->push_back(Char);
2905 break;
2906 }
2907 // FALL THROUGH.
2908 [[fallthrough]];
2909 case '\r':
2910 case '\n':
2911 // Okay, we found the end of the line. First, back up past the \0, \r, \n.
2912 assert(CurPtr[-1] == Char && "Trigraphs for newline?")(static_cast <bool> (CurPtr[-1] == Char && "Trigraphs for newline?"
) ? void (0) : __assert_fail ("CurPtr[-1] == Char && \"Trigraphs for newline?\""
, "clang/lib/Lex/Lexer.cpp", 2912, __extension__ __PRETTY_FUNCTION__
))
;
2913 BufferPtr = CurPtr-1;
2914
2915 // Next, lex the character, which should handle the EOD transition.
2916 Lex(Tmp);
2917 if (Tmp.is(tok::code_completion)) {
2918 if (PP)
2919 PP->CodeCompleteNaturalLanguage();
2920 Lex(Tmp);
2921 }
2922 assert(Tmp.is(tok::eod) && "Unexpected token!")(static_cast <bool> (Tmp.is(tok::eod) && "Unexpected token!"
) ? void (0) : __assert_fail ("Tmp.is(tok::eod) && \"Unexpected token!\""
, "clang/lib/Lex/Lexer.cpp", 2922, __extension__ __PRETTY_FUNCTION__
))
;
2923
2924 // Finally, we're done;
2925 return;
2926 }
2927 }
2928}
2929
2930/// LexEndOfFile - CurPtr points to the end of this file. Handle this
2931/// condition, reporting diagnostics and handling other edge cases as required.
2932/// This returns true if Result contains a token, false if PP.Lex should be
2933/// called again.
2934bool Lexer::LexEndOfFile(Token &Result, const char *CurPtr) {
2935 // If we hit the end of the file while parsing a preprocessor directive,
2936 // end the preprocessor directive first. The next token returned will
2937 // then be the end of file.
2938 if (ParsingPreprocessorDirective) {
2939 // Done parsing the "line".
2940 ParsingPreprocessorDirective = false;
2941 // Update the location of token as well as BufferPtr.
2942 FormTokenWithChars(Result, CurPtr, tok::eod);
2943
2944 // Restore comment saving mode, in case it was disabled for directive.
2945 if (PP)
2946 resetExtendedTokenMode();
2947 return true; // Have a token.
2948 }
2949
2950 // If we are in raw mode, return this event as an EOF token. Let the caller
2951 // that put us in raw mode handle the event.
2952 if (isLexingRawMode()) {
2953 Result.startToken();
2954 BufferPtr = BufferEnd;
2955 FormTokenWithChars(Result, BufferEnd, tok::eof);
2956 return true;
2957 }
2958
2959 if (PP->isRecordingPreamble() && PP->isInPrimaryFile()) {
2960 PP->setRecordedPreambleConditionalStack(ConditionalStack);
2961 // If the preamble cuts off the end of a header guard, consider it guarded.
2962 // The guard is valid for the preamble content itself, and for tools the
2963 // most useful answer is "yes, this file has a header guard".
2964 if (!ConditionalStack.empty())
2965 MIOpt.ExitTopLevelConditional();
2966 ConditionalStack.clear();
2967 }
2968
2969 // Issue diagnostics for unterminated #if and missing newline.
2970
2971 // If we are in a #if directive, emit an error.
2972 while (!ConditionalStack.empty()) {
2973 if (PP->getCodeCompletionFileLoc() != FileLoc)
2974 PP->Diag(ConditionalStack.back().IfLoc,
2975 diag::err_pp_unterminated_conditional);
2976 ConditionalStack.pop_back();
2977 }
2978
2979 // C99 5.1.1.2p2: If the file is non-empty and didn't end in a newline, issue
2980 // a pedwarn.
2981 if (CurPtr != BufferStart && (CurPtr[-1] != '\n' && CurPtr[-1] != '\r')) {
2982 DiagnosticsEngine &Diags = PP->getDiagnostics();
2983 SourceLocation EndLoc = getSourceLocation(BufferEnd);
2984 unsigned DiagID;
2985
2986 if (LangOpts.CPlusPlus11) {
2987 // C++11 [lex.phases] 2.2 p2
2988 // Prefer the C++98 pedantic compatibility warning over the generic,
2989 // non-extension, user-requested "missing newline at EOF" warning.
2990 if (!Diags.isIgnored(diag::warn_cxx98_compat_no_newline_eof, EndLoc)) {
2991 DiagID = diag::warn_cxx98_compat_no_newline_eof;
2992 } else {
2993 DiagID = diag::warn_no_newline_eof;
2994 }
2995 } else {
2996 DiagID = diag::ext_no_newline_eof;
2997 }
2998
2999 Diag(BufferEnd, DiagID)
3000 << FixItHint::CreateInsertion(EndLoc, "\n");
3001 }
3002
3003 BufferPtr = CurPtr;
3004
3005 // Finally, let the preprocessor handle this.
3006 return PP->HandleEndOfFile(Result, isPragmaLexer());
3007}
3008
3009/// isNextPPTokenLParen - Return 1 if the next unexpanded token lexed from
3010/// the specified lexer will return a tok::l_paren token, 0 if it is something
3011/// else and 2 if there are no more tokens in the buffer controlled by the
3012/// lexer.
3013unsigned Lexer::isNextPPTokenLParen() {
3014 assert(!LexingRawMode && "How can we expand a macro from a skipping buffer?")(static_cast <bool> (!LexingRawMode && "How can we expand a macro from a skipping buffer?"
) ? void (0) : __assert_fail ("!LexingRawMode && \"How can we expand a macro from a skipping buffer?\""
, "clang/lib/Lex/Lexer.cpp", 3014, __extension__ __PRETTY_FUNCTION__
))
;
3015
3016 if (isDependencyDirectivesLexer()) {
3017 if (NextDepDirectiveTokenIndex == DepDirectives.front().Tokens.size())
3018 return 2;
3019 return DepDirectives.front().Tokens[NextDepDirectiveTokenIndex].is(
3020 tok::l_paren);
3021 }
3022
3023 // Switch to 'skipping' mode. This will ensure that we can lex a token
3024 // without emitting diagnostics, disables macro expansion, and will cause EOF
3025 // to return an EOF token instead of popping the include stack.
3026 LexingRawMode = true;
3027
3028 // Save state that can be changed while lexing so that we can restore it.
3029 const char *TmpBufferPtr = BufferPtr;
3030 bool inPPDirectiveMode = ParsingPreprocessorDirective;
3031 bool atStartOfLine = IsAtStartOfLine;
3032 bool atPhysicalStartOfLine = IsAtPhysicalStartOfLine;
3033 bool leadingSpace = HasLeadingSpace;
3034
3035 Token Tok;
3036 Lex(Tok);
3037
3038 // Restore state that may have changed.
3039 BufferPtr = TmpBufferPtr;
3040 ParsingPreprocessorDirective = inPPDirectiveMode;
3041 HasLeadingSpace = leadingSpace;
3042 IsAtStartOfLine = atStartOfLine;
3043 IsAtPhysicalStartOfLine = atPhysicalStartOfLine;
3044
3045 // Restore the lexer back to non-skipping mode.
3046 LexingRawMode = false;
3047
3048 if (Tok.is(tok::eof))
3049 return 2;
3050 return Tok.is(tok::l_paren);
3051}
3052
3053/// Find the end of a version control conflict marker.
3054static const char *FindConflictEnd(const char *CurPtr, const char *BufferEnd,
3055 ConflictMarkerKind CMK) {
3056 const char *Terminator = CMK == CMK_Perforce ? "<<<<\n" : ">>>>>>>";
3057 size_t TermLen = CMK == CMK_Perforce ? 5 : 7;
3058 auto RestOfBuffer = StringRef(CurPtr, BufferEnd - CurPtr).substr(TermLen);
3059 size_t Pos = RestOfBuffer.find(Terminator);
3060 while (Pos != StringRef::npos) {
3061 // Must occur at start of line.
3062 if (Pos == 0 ||
3063 (RestOfBuffer[Pos - 1] != '\r' && RestOfBuffer[Pos - 1] != '\n')) {
3064 RestOfBuffer = RestOfBuffer.substr(Pos+TermLen);
3065 Pos = RestOfBuffer.find(Terminator);
3066 continue;
3067 }
3068 return RestOfBuffer.data()+Pos;
3069 }
3070 return nullptr;
3071}
3072
3073/// IsStartOfConflictMarker - If the specified pointer is the start of a version
3074/// control conflict marker like '<<<<<<<', recognize it as such, emit an error
3075/// and recover nicely. This returns true if it is a conflict marker and false
3076/// if not.
3077bool Lexer::IsStartOfConflictMarker(const char *CurPtr) {
3078 // Only a conflict marker if it starts at the beginning of a line.
3079 if (CurPtr != BufferStart &&
3080 CurPtr[-1] != '\n' && CurPtr[-1] != '\r')
3081 return false;
3082
3083 // Check to see if we have <<<<<<< or >>>>.
3084 if (!StringRef(CurPtr, BufferEnd - CurPtr).startswith("<<<<<<<") &&
3085 !StringRef(CurPtr, BufferEnd - CurPtr).startswith(">>>> "))
3086 return false;
3087
3088 // If we have a situation where we don't care about conflict markers, ignore
3089 // it.
3090 if (CurrentConflictMarkerState || isLexingRawMode())
3091 return false;
3092
3093 ConflictMarkerKind Kind = *CurPtr == '<' ? CMK_Normal : CMK_Perforce;
3094
3095 // Check to see if there is an ending marker somewhere in the buffer at the
3096 // start of a line to terminate this conflict marker.
3097 if (FindConflictEnd(CurPtr, BufferEnd, Kind)) {
3098 // We found a match. We are really in a conflict marker.
3099 // Diagnose this, and ignore to the end of line.
3100 Diag(CurPtr, diag::err_conflict_marker);
3101 CurrentConflictMarkerState = Kind;
3102
3103 // Skip ahead to the end of line. We know this exists because the
3104 // end-of-conflict marker starts with \r or \n.
3105 while (*CurPtr != '\r' && *CurPtr != '\n') {
3106 assert(CurPtr != BufferEnd && "Didn't find end of line")(static_cast <bool> (CurPtr != BufferEnd && "Didn't find end of line"
) ? void (0) : __assert_fail ("CurPtr != BufferEnd && \"Didn't find end of line\""
, "clang/lib/Lex/Lexer.cpp", 3106, __extension__ __PRETTY_FUNCTION__
))
;
3107 ++CurPtr;
3108 }
3109 BufferPtr = CurPtr;
3110 return true;
3111 }
3112
3113 // No end of conflict marker found.
3114 return false;
3115}
3116
3117/// HandleEndOfConflictMarker - If this is a '====' or '||||' or '>>>>', or if
3118/// it is '<<<<' and the conflict marker started with a '>>>>' marker, then it
3119/// is the end of a conflict marker. Handle it by ignoring up until the end of
3120/// the line. This returns true if it is a conflict marker and false if not.
3121bool Lexer::HandleEndOfConflictMarker(const char *CurPtr) {
3122 // Only a conflict marker if it starts at the beginning of a line.
3123 if (CurPtr != BufferStart &&
3124 CurPtr[-1] != '\n' && CurPtr[-1] != '\r')
3125 return false;
3126
3127 // If we have a situation where we don't care about conflict markers, ignore
3128 // it.
3129 if (!CurrentConflictMarkerState || isLexingRawMode())
3130 return false;
3131
3132 // Check to see if we have the marker (4 characters in a row).
3133 for (unsigned i = 1; i != 4; ++i)
3134 if (CurPtr[i] != CurPtr[0])
3135 return false;
3136
3137 // If we do have it, search for the end of the conflict marker. This could
3138 // fail if it got skipped with a '#if 0' or something. Note that CurPtr might
3139 // be the end of conflict marker.
3140 if (const char *End = FindConflictEnd(CurPtr, BufferEnd,
3141 CurrentConflictMarkerState)) {
3142 CurPtr = End;
3143
3144 // Skip ahead to the end of line.
3145 while (CurPtr != BufferEnd && *CurPtr != '\r' && *CurPtr != '\n')
3146 ++CurPtr;
3147
3148 BufferPtr = CurPtr;
3149
3150 // No longer in the conflict marker.
3151 CurrentConflictMarkerState = CMK_None;
3152 return true;
3153 }
3154
3155 return false;
3156}
3157
/// Scan for the "#>" that terminates an editor placeholder.
/// Returns a pointer just past the "#>", or nullptr if the buffer ends
/// before a terminator is found.
static const char *findPlaceholderEnd(const char *CurPtr,
                                      const char *BufferEnd) {
  if (CurPtr == BufferEnd)
    return nullptr;
  // Stop one short of the end so reading CurPtr[1] stays in bounds.
  for (const char *Last = BufferEnd - 1; CurPtr != Last; ++CurPtr)
    if (CurPtr[0] == '#' && CurPtr[1] == '>')
      return CurPtr + 2;
  return nullptr;
}
3169
3170bool Lexer::lexEditorPlaceholder(Token &Result, const char *CurPtr) {
3171 assert(CurPtr[-1] == '<' && CurPtr[0] == '#' && "Not a placeholder!")(static_cast <bool> (CurPtr[-1] == '<' && CurPtr
[0] == '#' && "Not a placeholder!") ? void (0) : __assert_fail
("CurPtr[-1] == '<' && CurPtr[0] == '#' && \"Not a placeholder!\""
, "clang/lib/Lex/Lexer.cpp", 3171, __extension__ __PRETTY_FUNCTION__
))
;
3172 if (!PP || !PP->getPreprocessorOpts().LexEditorPlaceholders || LexingRawMode)
3173 return false;
3174 const char *End = findPlaceholderEnd(CurPtr + 1, BufferEnd);
3175 if (!End)
3176 return false;
3177 const char *Start = CurPtr - 1;
3178 if (!LangOpts.AllowEditorPlaceholders)
3179 Diag(Start, diag::err_placeholder_in_source);
3180 Result.startToken();
3181 FormTokenWithChars(Result, End, tok::raw_identifier);
3182 Result.setRawIdentifierData(Start);
3183 PP->LookUpIdentifierInfo(Result);
3184 Result.setFlag(Token::IsEditorPlaceholder);
3185 BufferPtr = End;
3186 return true;
3187}
3188
3189bool Lexer::isCodeCompletionPoint(const char *CurPtr) const {
3190 if (PP && PP->isCodeCompletionEnabled()) {
3191 SourceLocation Loc = FileLoc.getLocWithOffset(CurPtr-BufferStart);
3192 return Loc == PP->getCodeCompletionLoc();
3193 }
3194
3195 return false;
3196}
3197
3198llvm::Optional<uint32_t> Lexer::tryReadNumericUCN(const char *&StartPtr,
3199 const char *SlashLoc,
3200 Token *Result) {
3201 unsigned CharSize;
3202 char Kind = getCharAndSize(StartPtr, CharSize);
3203 assert((Kind == 'u' || Kind == 'U') && "expected a UCN")(static_cast <bool> ((Kind == 'u' || Kind == 'U') &&
"expected a UCN") ? void (0) : __assert_fail ("(Kind == 'u' || Kind == 'U') && \"expected a UCN\""
, "clang/lib/Lex/Lexer.cpp", 3203, __extension__ __PRETTY_FUNCTION__
))
;
3204
3205 unsigned NumHexDigits;
3206 if (Kind == 'u')
3207 NumHexDigits = 4;
3208 else if (Kind == 'U')
3209 NumHexDigits = 8;
3210
3211 bool Delimited = false;
3212 bool FoundEndDelimiter = false;
3213 unsigned Count = 0;
3214 bool Diagnose = Result && !isLexingRawMode();
3215
3216 if (!LangOpts.CPlusPlus && !LangOpts.C99) {
3217 if (Diagnose)
3218 Diag(SlashLoc, diag::warn_ucn_not_valid_in_c89);
3219 return llvm::None;
3220 }
3221
3222 const char *CurPtr = StartPtr + CharSize;
3223 const char *KindLoc = &CurPtr[-1];
3224
3225 uint32_t CodePoint = 0;
3226 while (Count != NumHexDigits || Delimited) {
3227 char C = getCharAndSize(CurPtr, CharSize);
3228 if (!Delimited && C == '{') {
3229 Delimited = true;
3230 CurPtr += CharSize;
3231 continue;
3232 }
3233
3234 if (Delimited && C == '}') {
3235 CurPtr += CharSize;
3236 FoundEndDelimiter = true;
3237 break;
3238 }
3239
3240 unsigned Value = llvm::hexDigitValue(C);
3241 if (Value == -1U) {
3242 if (!Delimited)
3243 break;
3244 if (Diagnose)
3245 Diag(BufferPtr, diag::warn_delimited_ucn_incomplete)
3246 << StringRef(KindLoc, 1);
3247 return llvm::None;
3248 }
3249
3250 if (CodePoint & 0xF000'0000) {
3251 if (Diagnose)
3252 Diag(KindLoc, diag::err_escape_too_large) << 0;
3253 return llvm::None;
3254 }
3255
3256 CodePoint <<= 4;
3257 CodePoint |= Value;
3258 CurPtr += CharSize;
3259 Count++;
3260 }
3261
3262 if (Count == 0) {
3263 if (Diagnose)
3264 Diag(StartPtr, FoundEndDelimiter ? diag::warn_delimited_ucn_empty
3265 : diag::warn_ucn_escape_no_digits)
3266 << StringRef(KindLoc, 1);
3267 return llvm::None;
3268 }
3269
3270 if (Delimited && Kind == 'U') {
3271 if (Diagnose)
3272 Diag(StartPtr, diag::err_hex_escape_no_digits) << StringRef(KindLoc, 1);
3273 return llvm::None;
3274 }
3275
3276 if (!Delimited && Count != NumHexDigits) {
3277 if (Diagnose) {
3278 Diag(BufferPtr, diag::warn_ucn_escape_incomplete);
3279 // If the user wrote \U1234, suggest a fixit to \u.
3280 if (Count == 4 && NumHexDigits == 8) {
3281 CharSourceRange URange = makeCharRange(*this, KindLoc, KindLoc + 1);
3282 Diag(KindLoc, diag::note_ucn_four_not_eight)
3283 << FixItHint::CreateReplacement(URange, "u");
3284 }
3285 }
3286 return llvm::None;
3287 }
3288
3289 if (Delimited && PP) {
3290 Diag(BufferPtr, PP->getLangOpts().CPlusPlus2b
3291 ? diag::warn_cxx2b_delimited_escape_sequence
3292 : diag::ext_delimited_escape_sequence)
3293 << /*delimited*/ 0 << (PP->getLangOpts().CPlusPlus ? 1 : 0);
3294 }
3295
3296 if (Result) {
3297 Result->setFlag(Token::HasUCN);
3298 if (CurPtr - StartPtr == (ptrdiff_t)(Count + 2 + (Delimited ? 2 : 0)))
3299 StartPtr = CurPtr;
3300 else
3301 while (StartPtr != CurPtr)
3302 (void)getAndAdvanceChar(StartPtr, *Result);
3303 } else {
3304 StartPtr = CurPtr;
3305 }
3306 return CodePoint;
3307}
3308
3309llvm::Optional<uint32_t> Lexer::tryReadNamedUCN(const char *&StartPtr,
3310 Token *Result) {
3311 unsigned CharSize;
3312 bool Diagnose = Result && !isLexingRawMode();
3313
3314 char C = getCharAndSize(StartPtr, CharSize);
3315 assert(C == 'N' && "expected \\N{...}")(static_cast <bool> (C == 'N' && "expected \\N{...}"
) ? void (0) : __assert_fail ("C == 'N' && \"expected \\\\N{...}\""
, "clang/lib/Lex/Lexer.cpp", 3315, __extension__ __PRETTY_FUNCTION__
))
;
3316
3317 const char *CurPtr = StartPtr + CharSize;
3318 const char *KindLoc = &CurPtr[-1];
3319
3320 C = getCharAndSize(CurPtr, CharSize);
3321 if (C != '{') {
3322 if (Diagnose)
3323 Diag(StartPtr, diag::warn_ucn_escape_incomplete);
3324 return llvm::None;
3325 }
3326 CurPtr += CharSize;
3327 const char *StartName = CurPtr;
3328 bool FoundEndDelimiter = false;
3329 llvm::SmallVector<char, 30> Buffer;
3330 while (C) {
3331 C = getCharAndSize(CurPtr, CharSize);
3332 CurPtr += CharSize;
3333 if (C == '}') {
3334 FoundEndDelimiter = true;
3335 break;
3336 }
3337
3338 if (!isAlphanumeric(C) && C != '_' && C != '-' && C != ' ')
3339 break;
3340 Buffer.push_back(C);
3341 }
3342
3343 if (!FoundEndDelimiter || Buffer.empty()) {
3344 if (Diagnose)
3345 Diag(StartPtr, FoundEndDelimiter ? diag::warn_delimited_ucn_empty
3346 : diag::warn_delimited_ucn_incomplete)
3347 << StringRef(KindLoc, 1);
3348 return llvm::None;
3349 }
3350
3351 StringRef Name(Buffer.data(), Buffer.size());
3352 llvm::Optional<char32_t> Res =
3353 llvm::sys::unicode::nameToCodepointStrict(Name);
3354 llvm::Optional<llvm::sys::unicode::LooseMatchingResult> LooseMatch;
3355 if (!Res) {
3356 if (!isLexingRawMode()) {
3357 Diag(StartPtr, diag::err_invalid_ucn_name)
3358 << StringRef(Buffer.data(), Buffer.size());
3359 LooseMatch = llvm::sys::unicode::nameToCodepointLooseMatching(Name);
3360 if (LooseMatch) {
3361 Diag(StartName, diag::note_invalid_ucn_name_loose_matching)
3362 << FixItHint::CreateReplacement(
3363 makeCharRange(*this, StartName, CurPtr - CharSize),
3364 LooseMatch->Name);
3365 }
3366 }
3367 // When finding a match using Unicode loose matching rules
3368 // recover after having emitted a diagnostic.
3369 if (!LooseMatch)
3370 return llvm::None;
3371 // We do not offer misspelled character names suggestions here
3372 // as the set of what would be a valid suggestion depends on context,
3373 // and we should not make invalid suggestions.
3374 }
3375
3376 if (Diagnose && PP && !LooseMatch)
3377 Diag(BufferPtr, PP->getLangOpts().CPlusPlus2b
3378 ? diag::warn_cxx2b_delimited_escape_sequence
3379 : diag::ext_delimited_escape_sequence)
3380 << /*named*/ 1 << (PP->getLangOpts().CPlusPlus ? 1 : 0);
3381
3382 if (LooseMatch)
3383 Res = LooseMatch->CodePoint;
3384
3385 if (Result) {
3386 Result->setFlag(Token::HasUCN);
3387 if (CurPtr - StartPtr == (ptrdiff_t)(Buffer.size() + 4))
3388 StartPtr = CurPtr;
3389 else
3390 while (StartPtr != CurPtr)
3391 (void)getAndAdvanceChar(StartPtr, *Result);
3392 } else {
3393 StartPtr = CurPtr;
3394 }
3395 return *Res;
3396}
3397
3398uint32_t Lexer::tryReadUCN(const char *&StartPtr, const char *SlashLoc,
3399 Token *Result) {
3400
3401 unsigned CharSize;
3402 llvm::Optional<uint32_t> CodePointOpt;
3403 char Kind = getCharAndSize(StartPtr, CharSize);
3404 if (Kind == 'u' || Kind == 'U')
3405 CodePointOpt = tryReadNumericUCN(StartPtr, SlashLoc, Result);
3406 else if (Kind == 'N')
3407 CodePointOpt = tryReadNamedUCN(StartPtr, Result);
3408
3409 if (!CodePointOpt)
3410 return 0;
3411
3412 uint32_t CodePoint = *CodePointOpt;
3413
3414 // Don't apply C family restrictions to UCNs in assembly mode
3415 if (LangOpts.AsmPreprocessor)
3416 return CodePoint;
3417
3418 // C99 6.4.3p2: A universal character name shall not specify a character whose
3419 // short identifier is less than 00A0 other than 0024 ($), 0040 (@), or
3420 // 0060 (`), nor one in the range D800 through DFFF inclusive.)
3421 // C++11 [lex.charset]p2: If the hexadecimal value for a
3422 // universal-character-name corresponds to a surrogate code point (in the
3423 // range 0xD800-0xDFFF, inclusive), the program is ill-formed. Additionally,
3424 // if the hexadecimal value for a universal-character-name outside the
3425 // c-char-sequence, s-char-sequence, or r-char-sequence of a character or
3426 // string literal corresponds to a control character (in either of the
3427 // ranges 0x00-0x1F or 0x7F-0x9F, both inclusive) or to a character in the
3428 // basic source character set, the program is ill-formed.
3429 if (CodePoint < 0xA0) {
3430 if (CodePoint == 0x24 || CodePoint == 0x40 || CodePoint == 0x60)
3431 return CodePoint;
3432
3433 // We don't use isLexingRawMode() here because we need to warn about bad
3434 // UCNs even when skipping preprocessing tokens in a #if block.
3435 if (Result && PP) {
3436 if (CodePoint < 0x20 || CodePoint >= 0x7F)
3437 Diag(BufferPtr, diag::err_ucn_control_character);
3438 else {
3439 char C = static_cast<char>(CodePoint);
3440 Diag(BufferPtr, diag::err_ucn_escape_basic_scs) << StringRef(&C, 1);
3441 }
3442 }
3443
3444 return 0;
3445 } else if (CodePoint >= 0xD800 && CodePoint <= 0xDFFF) {
3446 // C++03 allows UCNs representing surrogate characters. C99 and C++11 don't.
3447 // We don't use isLexingRawMode() here because we need to diagnose bad
3448 // UCNs even when skipping preprocessing tokens in a #if block.
3449 if (Result && PP) {
3450 if (LangOpts.CPlusPlus && !LangOpts.CPlusPlus11)
3451 Diag(BufferPtr, diag::warn_ucn_escape_surrogate);
3452 else
3453 Diag(BufferPtr, diag::err_ucn_escape_invalid);
3454 }
3455 return 0;
3456 }
3457
3458 return CodePoint;
3459}
3460
3461bool Lexer::CheckUnicodeWhitespace(Token &Result, uint32_t C,
3462 const char *CurPtr) {
3463 if (!isLexingRawMode() && !PP->isPreprocessedOutput() &&
3464 isUnicodeWhitespace(C)) {
3465 Diag(BufferPtr, diag::ext_unicode_whitespace)
3466 << makeCharRange(*this, BufferPtr, CurPtr);
3467
3468 Result.setFlag(Token::LeadingSpace);
3469 return true;
3470 }
3471 return false;
3472}
3473
3474void Lexer::PropagateLineStartLeadingSpaceInfo(Token &Result) {
3475 IsAtStartOfLine = Result.isAtStartOfLine();
3476 HasLeadingSpace = Result.hasLeadingSpace();
3477 HasLeadingEmptyMacro = Result.hasLeadingEmptyMacro();
3478 // Note that this doesn't affect IsAtPhysicalStartOfLine.
3479}
3480
3481bool Lexer::Lex(Token &Result) {
3482 assert(!isDependencyDirectivesLexer())(static_cast <bool> (!isDependencyDirectivesLexer()) ? void
(0) : __assert_fail ("!isDependencyDirectivesLexer()", "clang/lib/Lex/Lexer.cpp"
, 3482, __extension__ __PRETTY_FUNCTION__))
;
3483
3484 // Start a new token.
3485 Result.startToken();
3486
3487 // Set up misc whitespace flags for LexTokenInternal.
3488 if (IsAtStartOfLine) {
3489 Result.setFlag(Token::StartOfLine);
3490 IsAtStartOfLine = false;
3491 }
3492
3493 if (HasLeadingSpace) {
3494 Result.setFlag(Token::LeadingSpace);
3495 HasLeadingSpace = false;
3496 }
3497
3498 if (HasLeadingEmptyMacro) {
3499 Result.setFlag(Token::LeadingEmptyMacro);
3500 HasLeadingEmptyMacro = false;
3501 }
3502
3503 bool atPhysicalStartOfLine = IsAtPhysicalStartOfLine;
3504 IsAtPhysicalStartOfLine = false;
3505 bool isRawLex = isLexingRawMode();
3506 (void) isRawLex;
3507 bool returnedToken = LexTokenInternal(Result, atPhysicalStartOfLine);
3508 // (After the LexTokenInternal call, the lexer might be destroyed.)
3509 assert((returnedToken || !isRawLex) && "Raw lex must succeed")(static_cast <bool> ((returnedToken || !isRawLex) &&
"Raw lex must succeed") ? void (0) : __assert_fail ("(returnedToken || !isRawLex) && \"Raw lex must succeed\""
, "clang/lib/Lex/Lexer.cpp", 3509, __extension__ __PRETTY_FUNCTION__
))
;
3510 return returnedToken;
3511}
3512
3513/// LexTokenInternal - This implements a simple C family lexer. It is an
3514/// extremely performance critical piece of code. This assumes that the buffer
3515/// has a null character at the end of the file. This returns a preprocessing
3516/// token, not a normal token, as such, it is an internal interface. It assumes
3517/// that the Flags of result have been cleared before calling this.
3518bool Lexer::LexTokenInternal(Token &Result, bool TokAtPhysicalStartOfLine) {
3519LexNextToken:
3520 // New token, can't need cleaning yet.
3521 Result.clearFlag(Token::NeedsCleaning);
3522 Result.setIdentifierInfo(nullptr);
3523
3524 // CurPtr - Cache BufferPtr in an automatic variable.
3525 const char *CurPtr = BufferPtr;
3526
3527 // Small amounts of horizontal whitespace is very common between tokens.
3528 if (isHorizontalWhitespace(*CurPtr)) {
1
Assuming the condition is false
2
Taking false branch
3529 do {
3530 ++CurPtr;
3531 } while (isHorizontalWhitespace(*CurPtr));
3532
3533 // If we are keeping whitespace and other tokens, just return what we just
3534 // skipped. The next lexer invocation will return the token after the
3535 // whitespace.
3536 if (isKeepWhitespaceMode()) {
3537 FormTokenWithChars(Result, CurPtr, tok::unknown);
3538 // FIXME: The next token will not have LeadingSpace set.
3539 return true;
3540 }
3541
3542 BufferPtr = CurPtr;
3543 Result.setFlag(Token::LeadingSpace);
3544 }
3545
3546 unsigned SizeTmp, SizeTmp2; // Temporaries for use in cases below.
3547
3548 // Read a character, advancing over it.
3549 char Char = getAndAdvanceChar(CurPtr, Result);
3550 tok::TokenKind Kind;
3551
3552 if (!isVerticalWhitespace(Char))
3
Assuming the condition is false
4
Taking false branch
3553 NewLinePtr = nullptr;
3554
3555 switch (Char) {
5
Control jumps to 'case 47:' at line 3937
3556 case 0: // Null.
3557 // Found end of file?
3558 if (CurPtr-1 == BufferEnd)
3559 return LexEndOfFile(Result, CurPtr-1);
3560
3561 // Check if we are performing code completion.
3562 if (isCodeCompletionPoint(CurPtr-1)) {
3563 // Return the code-completion token.
3564 Result.startToken();
3565 FormTokenWithChars(Result, CurPtr, tok::code_completion);
3566 return true;
3567 }
3568
3569 if (!isLexingRawMode())
3570 Diag(CurPtr-1, diag::null_in_file);
3571 Result.setFlag(Token::LeadingSpace);
3572 if (SkipWhitespace(Result, CurPtr, TokAtPhysicalStartOfLine))
3573 return true; // KeepWhitespaceMode
3574
3575 // We know the lexer hasn't changed, so just try again with this lexer.
3576 // (We manually eliminate the tail call to avoid recursion.)
3577 goto LexNextToken;
3578
3579 case 26: // DOS & CP/M EOF: "^Z".
3580 // If we're in Microsoft extensions mode, treat this as end of file.
3581 if (LangOpts.MicrosoftExt) {
3582 if (!isLexingRawMode())
3583 Diag(CurPtr-1, diag::ext_ctrl_z_eof_microsoft);
3584 return LexEndOfFile(Result, CurPtr-1);
3585 }
3586
3587 // If Microsoft extensions are disabled, this is just random garbage.
3588 Kind = tok::unknown;
3589 break;
3590
3591 case '\r':
3592 if (CurPtr[0] == '\n')
3593 (void)getAndAdvanceChar(CurPtr, Result);
3594 [[fallthrough]];
3595 case '\n':
3596 // If we are inside a preprocessor directive and we see the end of line,
3597 // we know we are done with the directive, so return an EOD token.
3598 if (ParsingPreprocessorDirective) {
3599 // Done parsing the "line".
3600 ParsingPreprocessorDirective = false;
3601
3602 // Restore comment saving mode, in case it was disabled for directive.
3603 if (PP)
3604 resetExtendedTokenMode();
3605
3606 // Since we consumed a newline, we are back at the start of a line.
3607 IsAtStartOfLine = true;
3608 IsAtPhysicalStartOfLine = true;
3609 NewLinePtr = CurPtr - 1;
3610
3611 Kind = tok::eod;
3612 break;
3613 }
3614
3615 // No leading whitespace seen so far.
3616 Result.clearFlag(Token::LeadingSpace);
3617
3618 if (SkipWhitespace(Result, CurPtr, TokAtPhysicalStartOfLine))
3619 return true; // KeepWhitespaceMode
3620
3621 // We only saw whitespace, so just try again with this lexer.
3622 // (We manually eliminate the tail call to avoid recursion.)
3623 goto LexNextToken;
3624 case ' ':
3625 case '\t':
3626 case '\f':
3627 case '\v':
3628 SkipHorizontalWhitespace:
3629 Result.setFlag(Token::LeadingSpace);
3630 if (SkipWhitespace(Result, CurPtr, TokAtPhysicalStartOfLine))
3631 return true; // KeepWhitespaceMode
3632
3633 SkipIgnoredUnits:
3634 CurPtr = BufferPtr;
3635
3636 // If the next token is obviously a // or /* */ comment, skip it efficiently
3637 // too (without going through the big switch stmt).
3638 if (CurPtr[0] == '/' && CurPtr[1] == '/' && !inKeepCommentMode() &&
3639 LineComment && (LangOpts.CPlusPlus || !LangOpts.TraditionalCPP)) {
3640 if (SkipLineComment(Result, CurPtr+2, TokAtPhysicalStartOfLine))
3641 return true; // There is a token to return.
3642 goto SkipIgnoredUnits;
3643 } else if (CurPtr[0] == '/' && CurPtr[1] == '*' && !inKeepCommentMode()) {
3644 if (SkipBlockComment(Result, CurPtr+2, TokAtPhysicalStartOfLine))
3645 return true; // There is a token to return.
3646 goto SkipIgnoredUnits;
3647 } else if (isHorizontalWhitespace(*CurPtr)) {
3648 goto SkipHorizontalWhitespace;
3649 }
3650 // We only saw whitespace, so just try again with this lexer.
3651 // (We manually eliminate the tail call to avoid recursion.)
3652 goto LexNextToken;
3653
3654 // C99 6.4.4.1: Integer Constants.
3655 // C99 6.4.4.2: Floating Constants.
3656 case '0': case '1': case '2': case '3': case '4':
3657 case '5': case '6': case '7': case '8': case '9':
3658 // Notify MIOpt that we read a non-whitespace/non-comment token.
3659 MIOpt.ReadToken();
3660 return LexNumericConstant(Result, CurPtr);
3661
3662 // Identifier (e.g., uber), or
3663 // UTF-8 (C2x/C++17) or UTF-16 (C11/C++11) character literal, or
3664 // UTF-8 or UTF-16 string literal (C11/C++11).
3665 case 'u':
3666 // Notify MIOpt that we read a non-whitespace/non-comment token.
3667 MIOpt.ReadToken();
3668
3669 if (LangOpts.CPlusPlus11 || LangOpts.C11) {
3670 Char = getCharAndSize(CurPtr, SizeTmp);
3671
3672 // UTF-16 string literal
3673 if (Char == '"')
3674 return LexStringLiteral(Result, ConsumeChar(CurPtr, SizeTmp, Result),
3675 tok::utf16_string_literal);
3676
3677 // UTF-16 character constant
3678 if (Char == '\'')
3679 return LexCharConstant(Result, ConsumeChar(CurPtr, SizeTmp, Result),
3680 tok::utf16_char_constant);
3681
3682 // UTF-16 raw string literal
3683 if (Char == 'R' && LangOpts.CPlusPlus11 &&
3684 getCharAndSize(CurPtr + SizeTmp, SizeTmp2) == '"')
3685 return LexRawStringLiteral(Result,
3686 ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
3687 SizeTmp2, Result),
3688 tok::utf16_string_literal);
3689
3690 if (Char == '8') {
3691 char Char2 = getCharAndSize(CurPtr + SizeTmp, SizeTmp2);
3692
3693 // UTF-8 string literal
3694 if (Char2 == '"')
3695 return LexStringLiteral(Result,
3696 ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
3697 SizeTmp2, Result),
3698 tok::utf8_string_literal);
3699 if (Char2 == '\'' && (LangOpts.CPlusPlus17 || LangOpts.C2x))
3700 return LexCharConstant(
3701 Result, ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
3702 SizeTmp2, Result),
3703 tok::utf8_char_constant);
3704
3705 if (Char2 == 'R' && LangOpts.CPlusPlus11) {
3706 unsigned SizeTmp3;
3707 char Char3 = getCharAndSize(CurPtr + SizeTmp + SizeTmp2, SizeTmp3);
3708 // UTF-8 raw string literal
3709 if (Char3 == '"') {
3710 return LexRawStringLiteral(Result,
3711 ConsumeChar(ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
3712 SizeTmp2, Result),
3713 SizeTmp3, Result),
3714 tok::utf8_string_literal);
3715 }
3716 }
3717 }
3718 }
3719
3720 // treat u like the start of an identifier.
3721 return LexIdentifierContinue(Result, CurPtr);
3722
3723 case 'U': // Identifier (e.g. Uber) or C11/C++11 UTF-32 string literal
3724 // Notify MIOpt that we read a non-whitespace/non-comment token.
3725 MIOpt.ReadToken();
3726
3727 if (LangOpts.CPlusPlus11 || LangOpts.C11) {
3728 Char = getCharAndSize(CurPtr, SizeTmp);
3729
3730 // UTF-32 string literal
3731 if (Char == '"')
3732 return LexStringLiteral(Result, ConsumeChar(CurPtr, SizeTmp, Result),
3733 tok::utf32_string_literal);
3734
3735 // UTF-32 character constant
3736 if (Char == '\'')
3737 return LexCharConstant(Result, ConsumeChar(CurPtr, SizeTmp, Result),
3738 tok::utf32_char_constant);
3739
3740 // UTF-32 raw string literal
3741 if (Char == 'R' && LangOpts.CPlusPlus11 &&
3742 getCharAndSize(CurPtr + SizeTmp, SizeTmp2) == '"')
3743 return LexRawStringLiteral(Result,
3744 ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
3745 SizeTmp2, Result),
3746 tok::utf32_string_literal);
3747 }
3748
3749 // treat U like the start of an identifier.
3750 return LexIdentifierContinue(Result, CurPtr);
3751
3752 case 'R': // Identifier or C++0x raw string literal
3753 // Notify MIOpt that we read a non-whitespace/non-comment token.
3754 MIOpt.ReadToken();
3755
3756 if (LangOpts.CPlusPlus11) {
3757 Char = getCharAndSize(CurPtr, SizeTmp);
3758
3759 if (Char == '"')
3760 return LexRawStringLiteral(Result,
3761 ConsumeChar(CurPtr, SizeTmp, Result),
3762 tok::string_literal);
3763 }
3764
3765 // treat R like the start of an identifier.
3766 return LexIdentifierContinue(Result, CurPtr);
3767
3768 case 'L': // Identifier (Loony) or wide literal (L'x' or L"xyz").
3769 // Notify MIOpt that we read a non-whitespace/non-comment token.
3770 MIOpt.ReadToken();
3771 Char = getCharAndSize(CurPtr, SizeTmp);
3772
3773 // Wide string literal.
3774 if (Char == '"')
3775 return LexStringLiteral(Result, ConsumeChar(CurPtr, SizeTmp, Result),
3776 tok::wide_string_literal);
3777
3778 // Wide raw string literal.
3779 if (LangOpts.CPlusPlus11 && Char == 'R' &&
3780 getCharAndSize(CurPtr + SizeTmp, SizeTmp2) == '"')
3781 return LexRawStringLiteral(Result,
3782 ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
3783 SizeTmp2, Result),
3784 tok::wide_string_literal);
3785
3786 // Wide character constant.
3787 if (Char == '\'')
3788 return LexCharConstant(Result, ConsumeChar(CurPtr, SizeTmp, Result),
3789 tok::wide_char_constant);
3790 // FALL THROUGH, treating L like the start of an identifier.
3791 [[fallthrough]];
3792
3793 // C99 6.4.2: Identifiers.
3794 case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': case 'G':
3795 case 'H': case 'I': case 'J': case 'K': /*'L'*/case 'M': case 'N':
3796 case 'O': case 'P': case 'Q': /*'R'*/case 'S': case 'T': /*'U'*/
3797 case 'V': case 'W': case 'X': case 'Y': case 'Z':
3798 case 'a': case 'b': case 'c': case 'd': case 'e': case 'f': case 'g':
3799 case 'h': case 'i': case 'j': case 'k': case 'l': case 'm': case 'n':
3800 case 'o': case 'p': case 'q': case 'r': case 's': case 't': /*'u'*/
3801 case 'v': case 'w': case 'x': case 'y': case 'z':
3802 case '_':
3803 // Notify MIOpt that we read a non-whitespace/non-comment token.
3804 MIOpt.ReadToken();
3805 return LexIdentifierContinue(Result, CurPtr);
3806
3807 case '$': // $ in identifiers.
3808 if (LangOpts.DollarIdents) {
3809 if (!isLexingRawMode())
3810 Diag(CurPtr-1, diag::ext_dollar_in_identifier);
3811 // Notify MIOpt that we read a non-whitespace/non-comment token.
3812 MIOpt.ReadToken();
3813 return LexIdentifierContinue(Result, CurPtr);
3814 }
3815
3816 Kind = tok::unknown;
3817 break;
3818
3819 // C99 6.4.4: Character Constants.
3820 case '\'':
3821 // Notify MIOpt that we read a non-whitespace/non-comment token.
3822 MIOpt.ReadToken();
3823 return LexCharConstant(Result, CurPtr, tok::char_constant);
3824
3825 // C99 6.4.5: String Literals.
3826 case '"':
3827 // Notify MIOpt that we read a non-whitespace/non-comment token.
3828 MIOpt.ReadToken();
3829 return LexStringLiteral(Result, CurPtr,
3830 ParsingFilename ? tok::header_name
3831 : tok::string_literal);
3832
3833 // C99 6.4.6: Punctuators.
3834 case '?':
3835 Kind = tok::question;
3836 break;
3837 case '[':
3838 Kind = tok::l_square;
3839 break;
3840 case ']':
3841 Kind = tok::r_square;
3842 break;
3843 case '(':
3844 Kind = tok::l_paren;
3845 break;
3846 case ')':
3847 Kind = tok::r_paren;
3848 break;
3849 case '{':
3850 Kind = tok::l_brace;
3851 break;
3852 case '}':
3853 Kind = tok::r_brace;
3854 break;
3855 case '.':
3856 Char = getCharAndSize(CurPtr, SizeTmp);
3857 if (Char >= '0' && Char <= '9') {
3858 // Notify MIOpt that we read a non-whitespace/non-comment token.
3859 MIOpt.ReadToken();
3860
3861 return LexNumericConstant(Result, ConsumeChar(CurPtr, SizeTmp, Result));
3862 } else if (LangOpts.CPlusPlus && Char == '*') {
3863 Kind = tok::periodstar;
3864 CurPtr += SizeTmp;
3865 } else if (Char == '.' &&
3866 getCharAndSize(CurPtr+SizeTmp, SizeTmp2) == '.') {
3867 Kind = tok::ellipsis;
3868 CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
3869 SizeTmp2, Result);
3870 } else {
3871 Kind = tok::period;
3872 }
3873 break;
3874 case '&':
3875 Char = getCharAndSize(CurPtr, SizeTmp);
3876 if (Char == '&') {
3877 Kind = tok::ampamp;
3878 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
3879 } else if (Char == '=') {
3880 Kind = tok::ampequal;
3881 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
3882 } else {
3883 Kind = tok::amp;
3884 }
3885 break;
3886 case '*':
3887 if (getCharAndSize(CurPtr, SizeTmp) == '=') {
3888 Kind = tok::starequal;
3889 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
3890 } else {
3891 Kind = tok::star;
3892 }
3893 break;
3894 case '+':
3895 Char = getCharAndSize(CurPtr, SizeTmp);
3896 if (Char == '+') {
3897 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
3898 Kind = tok::plusplus;
3899 } else if (Char == '=') {
3900 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
3901 Kind = tok::plusequal;
3902 } else {
3903 Kind = tok::plus;
3904 }
3905 break;
3906 case '-':
3907 Char = getCharAndSize(CurPtr, SizeTmp);
3908 if (Char == '-') { // --
3909 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
3910 Kind = tok::minusminus;
3911 } else if (Char == '>' && LangOpts.CPlusPlus &&
3912 getCharAndSize(CurPtr+SizeTmp, SizeTmp2) == '*') { // C++ ->*
3913 CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
3914 SizeTmp2, Result);
3915 Kind = tok::arrowstar;
3916 } else if (Char == '>') { // ->
3917 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
3918 Kind = tok::arrow;
3919 } else if (Char == '=') { // -=
3920 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
3921 Kind = tok::minusequal;
3922 } else {
3923 Kind = tok::minus;
3924 }
3925 break;
3926 case '~':
3927 Kind = tok::tilde;
3928 break;
3929 case '!':
3930 if (getCharAndSize(CurPtr, SizeTmp) == '=') {
3931 Kind = tok::exclaimequal;
3932 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
3933 } else {
3934 Kind = tok::exclaim;
3935 }
3936 break;
3937 case '/':
3938 // 6.4.9: Comments
3939 Char = getCharAndSize(CurPtr, SizeTmp);
3940 if (Char == '/') { // Line comment.
6
Assuming the condition is false
7
Taking false branch
3941 // Even if Line comments are disabled (e.g. in C89 mode), we generally
3942 // want to lex this as a comment. There is one problem with this though,
3943 // that in one particular corner case, this can change the behavior of the
3944 // resultant program. For example, In "foo //**/ bar", C89 would lex
3945 // this as "foo / bar" and languages with Line comments would lex it as
3946 // "foo". Check to see if the character after the second slash is a '*'.
3947 // If so, we will lex that as a "/" instead of the start of a comment.
3948 // However, we never do this if we are just preprocessing.
3949 bool TreatAsComment =
3950 LineComment && (LangOpts.CPlusPlus || !LangOpts.TraditionalCPP);
3951 if (!TreatAsComment)
3952 if (!(PP && PP->isPreprocessedOutput()))
3953 TreatAsComment = getCharAndSize(CurPtr+SizeTmp, SizeTmp2) != '*';
3954
3955 if (TreatAsComment) {
3956 if (SkipLineComment(Result, ConsumeChar(CurPtr, SizeTmp, Result),
3957 TokAtPhysicalStartOfLine))
3958 return true; // There is a token to return.
3959
3960 // It is common for the tokens immediately after a // comment to be
3961 // whitespace (indentation for the next line). Instead of going through
3962 // the big switch, handle it efficiently now.
3963 goto SkipIgnoredUnits;
3964 }
3965 }
3966
3967 if (Char == '*') { // /**/ comment.
8
Assuming the condition is true
9
Taking true branch
3968 if (SkipBlockComment(Result, ConsumeChar(CurPtr, SizeTmp, Result),
10
Calling 'Lexer::SkipBlockComment'
3969 TokAtPhysicalStartOfLine))
3970 return true; // There is a token to return.
3971
3972 // We only saw whitespace, so just try again with this lexer.
3973 // (We manually eliminate the tail call to avoid recursion.)
3974 goto LexNextToken;
3975 }
3976
3977 if (Char == '=') {
3978 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
3979 Kind = tok::slashequal;
3980 } else {
3981 Kind = tok::slash;
3982 }
3983 break;
3984 case '%':
3985 Char = getCharAndSize(CurPtr, SizeTmp);
3986 if (Char == '=') {
3987 Kind = tok::percentequal;
3988 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
3989 } else if (LangOpts.Digraphs && Char == '>') {
3990 Kind = tok::r_brace; // '%>' -> '}'
3991 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
3992 } else if (LangOpts.Digraphs && Char == ':') {
3993 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
3994 Char = getCharAndSize(CurPtr, SizeTmp);
3995 if (Char == '%' && getCharAndSize(CurPtr+SizeTmp, SizeTmp2) == ':') {
3996 Kind = tok::hashhash; // '%:%:' -> '##'
3997 CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
3998 SizeTmp2, Result);
3999 } else if (Char == '@' && LangOpts.MicrosoftExt) {// %:@ -> #@ -> Charize
4000 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
4001 if (!isLexingRawMode())
4002 Diag(BufferPtr, diag::ext_charize_microsoft);
4003 Kind = tok::hashat;
4004 } else { // '%:' -> '#'
4005 // We parsed a # character. If this occurs at the start of the line,
4006 // it's actually the start of a preprocessing directive. Callback to
4007 // the preprocessor to handle it.
4008 // TODO: -fpreprocessed mode??
4009 if (TokAtPhysicalStartOfLine && !LexingRawMode && !Is_PragmaLexer)
4010 goto HandleDirective;
4011
4012 Kind = tok::hash;
4013 }
4014 } else {
4015 Kind = tok::percent;
4016 }
4017 break;
4018 case '<':
4019 Char = getCharAndSize(CurPtr, SizeTmp);
4020 if (ParsingFilename) {
4021 return LexAngledStringLiteral(Result, CurPtr);
4022 } else if (Char == '<') {
4023 char After = getCharAndSize(CurPtr+SizeTmp, SizeTmp2);
4024 if (After == '=') {
4025 Kind = tok::lesslessequal;
4026 CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
4027 SizeTmp2, Result);
4028 } else if (After == '<' && IsStartOfConflictMarker(CurPtr-1)) {
4029 // If this is actually a '<<<<<<<' version control conflict marker,
4030 // recognize it as such and recover nicely.
4031 goto LexNextToken;
4032 } else if (After == '<' && HandleEndOfConflictMarker(CurPtr-1)) {
4033 // If this is '<<<<' and we're in a Perforce-style conflict marker,
4034 // ignore it.
4035 goto LexNextToken;
4036 } else if (LangOpts.CUDA && After == '<') {
4037 Kind = tok::lesslessless;
4038 CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
4039 SizeTmp2, Result);
4040 } else {
4041 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
4042 Kind = tok::lessless;
4043 }
4044 } else if (Char == '=') {
4045 char After = getCharAndSize(CurPtr+SizeTmp, SizeTmp2);
4046 if (After == '>') {
4047 if (LangOpts.CPlusPlus20) {
4048 if (!isLexingRawMode())
4049 Diag(BufferPtr, diag::warn_cxx17_compat_spaceship);
4050 CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
4051 SizeTmp2, Result);
4052 Kind = tok::spaceship;
4053 break;
4054 }
4055 // Suggest adding a space between the '<=' and the '>' to avoid a
4056 // change in semantics if this turns up in C++ <=17 mode.
4057 if (LangOpts.CPlusPlus && !isLexingRawMode()) {
4058 Diag(BufferPtr, diag::warn_cxx20_compat_spaceship)
4059 << FixItHint::CreateInsertion(
4060 getSourceLocation(CurPtr + SizeTmp, SizeTmp2), " ");
4061 }
4062 }
4063 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
4064 Kind = tok::lessequal;
4065 } else if (LangOpts.Digraphs && Char == ':') { // '<:' -> '['
4066 if (LangOpts.CPlusPlus11 &&
4067 getCharAndSize(CurPtr + SizeTmp, SizeTmp2) == ':') {
4068 // C++0x [lex.pptoken]p3:
4069 // Otherwise, if the next three characters are <:: and the subsequent
4070 // character is neither : nor >, the < is treated as a preprocessor
4071 // token by itself and not as the first character of the alternative
4072 // token <:.
4073 unsigned SizeTmp3;
4074 char After = getCharAndSize(CurPtr + SizeTmp + SizeTmp2, SizeTmp3);
4075 if (After != ':' && After != '>') {
4076 Kind = tok::less;
4077 if (!isLexingRawMode())
4078 Diag(BufferPtr, diag::warn_cxx98_compat_less_colon_colon);
4079 break;
4080 }
4081 }
4082
4083 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
4084 Kind = tok::l_square;
4085 } else if (LangOpts.Digraphs && Char == '%') { // '<%' -> '{'
4086 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
4087 Kind = tok::l_brace;
4088 } else if (Char == '#' && /*Not a trigraph*/ SizeTmp == 1 &&
4089 lexEditorPlaceholder(Result, CurPtr)) {
4090 return true;
4091 } else {
4092 Kind = tok::less;
4093 }
4094 break;
4095 case '>':
4096 Char = getCharAndSize(CurPtr, SizeTmp);
4097 if (Char == '=') {
4098 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
4099 Kind = tok::greaterequal;
4100 } else if (Char == '>') {
4101 char After = getCharAndSize(CurPtr+SizeTmp, SizeTmp2);
4102 if (After == '=') {
4103 CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
4104 SizeTmp2, Result);
4105 Kind = tok::greatergreaterequal;
4106 } else if (After == '>' && IsStartOfConflictMarker(CurPtr-1)) {
4107 // If this is actually a '>>>>' conflict marker, recognize it as such
4108 // and recover nicely.
4109 goto LexNextToken;
4110 } else if (After == '>' && HandleEndOfConflictMarker(CurPtr-1)) {
4111 // If this is '>>>>>>>' and we're in a conflict marker, ignore it.
4112 goto LexNextToken;
4113 } else if (LangOpts.CUDA && After == '>') {
4114 Kind = tok::greatergreatergreater;
4115 CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
4116 SizeTmp2, Result);
4117 } else {
4118 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
4119 Kind = tok::greatergreater;
4120 }
4121 } else {
4122 Kind = tok::greater;
4123 }
4124 break;
4125 case '^':
4126 Char = getCharAndSize(CurPtr, SizeTmp);
4127 if (Char == '=') {
4128 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
4129 Kind = tok::caretequal;
4130 } else if (LangOpts.OpenCL && Char == '^') {
4131 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
4132 Kind = tok::caretcaret;
4133 } else {
4134 Kind = tok::caret;
4135 }
4136 break;
4137 case '|':
4138 Char = getCharAndSize(CurPtr, SizeTmp);
4139 if (Char == '=') {
4140 Kind = tok::pipeequal;
4141 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
4142 } else if (Char == '|') {
4143 // If this is '|||||||' and we're in a conflict marker, ignore it.
4144 if (CurPtr[1] == '|' && HandleEndOfConflictMarker(CurPtr-1))
4145 goto LexNextToken;
4146 Kind = tok::pipepipe;
4147 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
4148 } else {
4149 Kind = tok::pipe;
4150 }
4151 break;
4152 case ':':
4153 Char = getCharAndSize(CurPtr, SizeTmp);
4154 if (LangOpts.Digraphs && Char == '>') {
4155 Kind = tok::r_square; // ':>' -> ']'
4156 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
4157 } else if ((LangOpts.CPlusPlus ||
4158 LangOpts.DoubleSquareBracketAttributes) &&
4159 Char == ':') {
4160 Kind = tok::coloncolon;
4161 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
4162 } else {
4163 Kind = tok::colon;
4164 }
4165 break;
4166 case ';':
4167 Kind = tok::semi;
4168 break;
4169 case '=':
4170 Char = getCharAndSize(CurPtr, SizeTmp);
4171 if (Char == '=') {
4172 // If this is '====' and we're in a conflict marker, ignore it.
4173 if (CurPtr[1] == '=' && HandleEndOfConflictMarker(CurPtr-1))
4174 goto LexNextToken;
4175
4176 Kind = tok::equalequal;
4177 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
4178 } else {
4179 Kind = tok::equal;
4180 }
4181 break;
4182 case ',':
4183 Kind = tok::comma;
4184 break;
4185 case '#':
4186 Char = getCharAndSize(CurPtr, SizeTmp);
4187 if (Char == '#') {
4188 Kind = tok::hashhash;
4189 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
4190 } else if (Char == '@' && LangOpts.MicrosoftExt) { // #@ -> Charize
4191 Kind = tok::hashat;
4192 if (!isLexingRawMode())
4193 Diag(BufferPtr, diag::ext_charize_microsoft);
4194 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
4195 } else {
4196 // We parsed a # character. If this occurs at the start of the line,
4197 // it's actually the start of a preprocessing directive. Callback to
4198 // the preprocessor to handle it.
4199 // TODO: -fpreprocessed mode??
4200 if (TokAtPhysicalStartOfLine && !LexingRawMode && !Is_PragmaLexer)
4201 goto HandleDirective;
4202
4203 Kind = tok::hash;
4204 }
4205 break;
4206
4207 case '@':
4208 // Objective C support.
4209 if (CurPtr[-1] == '@' && LangOpts.ObjC)
4210 Kind = tok::at;
4211 else
4212 Kind = tok::unknown;
4213 break;
4214
4215 // UCNs (C99 6.4.3, C++11 [lex.charset]p2)
4216 case '\\':
4217 if (!LangOpts.AsmPreprocessor) {
4218 if (uint32_t CodePoint = tryReadUCN(CurPtr, BufferPtr, &Result)) {
4219 if (CheckUnicodeWhitespace(Result, CodePoint, CurPtr)) {
4220 if (SkipWhitespace(Result, CurPtr, TokAtPhysicalStartOfLine))
4221 return true; // KeepWhitespaceMode
4222
4223 // We only saw whitespace, so just try again with this lexer.
4224 // (We manually eliminate the tail call to avoid recursion.)
4225 goto LexNextToken;
4226 }
4227
4228 return LexUnicodeIdentifierStart(Result, CodePoint, CurPtr);
4229 }
4230 }
4231
4232 Kind = tok::unknown;
4233 break;
4234
4235 default: {
4236 if (isASCII(Char)) {
4237 Kind = tok::unknown;
4238 break;
4239 }
4240
4241 llvm::UTF32 CodePoint;
4242
4243 // We can't just reset CurPtr to BufferPtr because BufferPtr may point to
4244 // an escaped newline.
4245 --CurPtr;
4246 llvm::ConversionResult Status =
4247 llvm::convertUTF8Sequence((const llvm::UTF8 **)&CurPtr,
4248 (const llvm::UTF8 *)BufferEnd,
4249 &CodePoint,
4250 llvm::strictConversion);
4251 if (Status == llvm::conversionOK) {
4252 if (CheckUnicodeWhitespace(Result, CodePoint, CurPtr)) {
4253 if (SkipWhitespace(Result, CurPtr, TokAtPhysicalStartOfLine))
4254 return true; // KeepWhitespaceMode
4255
4256 // We only saw whitespace, so just try again with this lexer.
4257 // (We manually eliminate the tail call to avoid recursion.)
4258 goto LexNextToken;
4259 }
4260 return LexUnicodeIdentifierStart(Result, CodePoint, CurPtr);
4261 }
4262
4263 if (isLexingRawMode() || ParsingPreprocessorDirective ||
4264 PP->isPreprocessedOutput()) {
4265 ++CurPtr;
4266 Kind = tok::unknown;
4267 break;
4268 }
4269
4270 // Non-ASCII characters tend to creep into source code unintentionally.
4271 // Instead of letting the parser complain about the unknown token,
4272 // just diagnose the invalid UTF-8, then drop the character.
4273 Diag(CurPtr, diag::err_invalid_utf8);
4274
4275 BufferPtr = CurPtr+1;
4276 // We're pretending the character didn't exist, so just try again with
4277 // this lexer.
4278 // (We manually eliminate the tail call to avoid recursion.)
4279 goto LexNextToken;
4280 }
4281 }
4282
4283 // Notify MIOpt that we read a non-whitespace/non-comment token.
4284 MIOpt.ReadToken();
4285
4286 // Update the location of token as well as BufferPtr.
4287 FormTokenWithChars(Result, CurPtr, Kind);
4288 return true;
4289
4290HandleDirective:
4291 // We parsed a # character and it's the start of a preprocessing directive.
4292
4293 FormTokenWithChars(Result, CurPtr, tok::hash);
4294 PP->HandleDirective(Result);
4295
4296 if (PP->hadModuleLoaderFatalFailure()) {
4297 // With a fatal failure in the module loader, we abort parsing.
4298 assert(Result.is(tok::eof) && "Preprocessor did not set tok:eof")(static_cast <bool> (Result.is(tok::eof) && "Preprocessor did not set tok:eof"
) ? void (0) : __assert_fail ("Result.is(tok::eof) && \"Preprocessor did not set tok:eof\""
, "clang/lib/Lex/Lexer.cpp", 4298, __extension__ __PRETTY_FUNCTION__
))
;
4299 return true;
4300 }
4301
4302 // We parsed the directive; lex a token with the new state.
4303 return false;
4304}
4305
4306const char *Lexer::convertDependencyDirectiveToken(
4307 const dependency_directives_scan::Token &DDTok, Token &Result) {
4308 const char *TokPtr = BufferStart + DDTok.Offset;
4309 Result.startToken();
4310 Result.setLocation(getSourceLocation(TokPtr));
4311 Result.setKind(DDTok.Kind);
4312 Result.setFlag((Token::TokenFlags)DDTok.Flags);
4313 Result.setLength(DDTok.Length);
4314 BufferPtr = TokPtr + DDTok.Length;
4315 return TokPtr;
4316}
4317
4318bool Lexer::LexDependencyDirectiveToken(Token &Result) {
4319 assert(isDependencyDirectivesLexer())(static_cast <bool> (isDependencyDirectivesLexer()) ? void
(0) : __assert_fail ("isDependencyDirectivesLexer()", "clang/lib/Lex/Lexer.cpp"
, 4319, __extension__ __PRETTY_FUNCTION__))
;
4320
4321 using namespace dependency_directives_scan;
4322
4323 while (NextDepDirectiveTokenIndex == DepDirectives.front().Tokens.size()) {
4324 if (DepDirectives.front().Kind == pp_eof)
4325 return LexEndOfFile(Result, BufferEnd);
4326 NextDepDirectiveTokenIndex = 0;
4327 DepDirectives = DepDirectives.drop_front();
4328 }
4329
4330 const dependency_directives_scan::Token &DDTok =
4331 DepDirectives.front().Tokens[NextDepDirectiveTokenIndex++];
4332 if (NextDepDirectiveTokenIndex > 1 || DDTok.Kind != tok::hash) {
4333 // Read something other than a preprocessor directive hash.
4334 MIOpt.ReadToken();
4335 }
4336
4337 const char *TokPtr = convertDependencyDirectiveToken(DDTok, Result);
4338
4339 if (Result.is(tok::hash) && Result.isAtStartOfLine()) {
4340 PP->HandleDirective(Result);
4341 return false;
4342 }
4343 if (Result.is(tok::raw_identifier)) {
4344 Result.setRawIdentifierData(TokPtr);
4345 if (!isLexingRawMode()) {
4346 IdentifierInfo *II = PP->LookUpIdentifierInfo(Result);
4347 if (II->isHandleIdentifierCase())
4348 return PP->HandleIdentifier(Result);
4349 }
4350 return true;
4351 }
4352 if (Result.isLiteral()) {
4353 Result.setLiteralData(TokPtr);
4354 return true;
4355 }
4356 if (Result.is(tok::colon) &&
4357 (LangOpts.CPlusPlus || LangOpts.DoubleSquareBracketAttributes)) {
4358 // Convert consecutive colons to 'tok::coloncolon'.
4359 if (*BufferPtr == ':') {
4360 assert(DepDirectives.front().Tokens[NextDepDirectiveTokenIndex].is((static_cast <bool> (DepDirectives.front().Tokens[NextDepDirectiveTokenIndex
].is( tok::colon)) ? void (0) : __assert_fail ("DepDirectives.front().Tokens[NextDepDirectiveTokenIndex].is( tok::colon)"
, "clang/lib/Lex/Lexer.cpp", 4361, __extension__ __PRETTY_FUNCTION__
))
4361 tok::colon))(static_cast <bool> (DepDirectives.front().Tokens[NextDepDirectiveTokenIndex
].is( tok::colon)) ? void (0) : __assert_fail ("DepDirectives.front().Tokens[NextDepDirectiveTokenIndex].is( tok::colon)"
, "clang/lib/Lex/Lexer.cpp", 4361, __extension__ __PRETTY_FUNCTION__
))
;
4362 ++NextDepDirectiveTokenIndex;
4363 Result.setKind(tok::coloncolon);
4364 }
4365 return true;
4366 }
4367 if (Result.is(tok::eod))
4368 ParsingPreprocessorDirective = false;
4369
4370 return true;
4371}
4372
4373bool Lexer::LexDependencyDirectiveTokenWhileSkipping(Token &Result) {
4374 assert(isDependencyDirectivesLexer())(static_cast <bool> (isDependencyDirectivesLexer()) ? void
(0) : __assert_fail ("isDependencyDirectivesLexer()", "clang/lib/Lex/Lexer.cpp"
, 4374, __extension__ __PRETTY_FUNCTION__))
;
4375
4376 using namespace dependency_directives_scan;
4377
4378 bool Stop = false;
4379 unsigned NestedIfs = 0;
4380 do {
4381 DepDirectives = DepDirectives.drop_front();
4382 switch (DepDirectives.front().Kind) {
4383 case pp_none:
4384 llvm_unreachable("unexpected 'pp_none'")::llvm::llvm_unreachable_internal("unexpected 'pp_none'", "clang/lib/Lex/Lexer.cpp"
, 4384)
;
4385 case pp_include:
4386 case pp___include_macros:
4387 case pp_define:
4388 case pp_undef:
4389 case pp_import:
4390 case pp_pragma_import:
4391 case pp_pragma_once:
4392 case pp_pragma_push_macro:
4393 case pp_pragma_pop_macro:
4394 case pp_pragma_include_alias:
4395 case pp_include_next:
4396 case decl_at_import:
4397 case cxx_module_decl:
4398 case cxx_import_decl:
4399 case cxx_export_module_decl:
4400 case cxx_export_import_decl:
4401 break;
4402 case pp_if:
4403 case pp_ifdef:
4404 case pp_ifndef:
4405 ++NestedIfs;
4406 break;
4407 case pp_elif:
4408 case pp_elifdef:
4409 case pp_elifndef:
4410 case pp_else:
4411 if (!NestedIfs) {
4412 Stop = true;
4413 }
4414 break;
4415 case pp_endif:
4416 if (!NestedIfs) {
4417 Stop = true;
4418 } else {
4419 --NestedIfs;
4420 }
4421 break;
4422 case pp_eof:
4423 NextDepDirectiveTokenIndex = 0;
4424 return LexEndOfFile(Result, BufferEnd);
4425 }
4426 } while (!Stop);
4427
4428 const dependency_directives_scan::Token &DDTok =
4429 DepDirectives.front().Tokens.front();
4430 assert(DDTok.is(tok::hash))(static_cast <bool> (DDTok.is(tok::hash)) ? void (0) : __assert_fail
("DDTok.is(tok::hash)", "clang/lib/Lex/Lexer.cpp", 4430, __extension__
__PRETTY_FUNCTION__))
;
4431 NextDepDirectiveTokenIndex = 1;
4432
4433 convertDependencyDirectiveToken(DDTok, Result);
4434 return false;
4435}

/build/llvm-toolchain-snapshot-16~++20220904122748+c444af1c20b3/clang/include/clang/Lex/Lexer.h

1//===- Lexer.h - C Language Family Lexer ------------------------*- C++ -*-===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file defines the Lexer interface.
10//
11//===----------------------------------------------------------------------===//
12
13#ifndef LLVM_CLANG_LEX_LEXER_H
14#define LLVM_CLANG_LEX_LEXER_H
15
16#include "clang/Basic/LangOptions.h"
17#include "clang/Basic/SourceLocation.h"
18#include "clang/Basic/TokenKinds.h"
19#include "clang/Lex/DependencyDirectivesScanner.h"
20#include "clang/Lex/PreprocessorLexer.h"
21#include "clang/Lex/Token.h"
22#include "llvm/ADT/Optional.h"
23#include "llvm/ADT/SmallVector.h"
24#include "llvm/ADT/StringRef.h"
25#include <cassert>
26#include <cstdint>
27#include <string>
28
29namespace llvm {
30
31class MemoryBufferRef;
32
33} // namespace llvm
34
35namespace clang {
36
37class DiagnosticBuilder;
38class Preprocessor;
39class SourceManager;
40class LangOptions;
41
42/// ConflictMarkerKind - Kinds of conflict marker which the lexer might be
43/// recovering from.
44enum ConflictMarkerKind {
45 /// Not within a conflict marker.
46 CMK_None,
47
48 /// A normal or diff3 conflict marker, initiated by at least 7 "<"s,
49 /// separated by at least 7 "="s or "|"s, and terminated by at least 7 ">"s.
50 CMK_Normal,
51
52 /// A Perforce-style conflict marker, initiated by 4 ">"s,
53 /// separated by 4 "="s, and terminated by 4 "<"s.
54 CMK_Perforce
55};
56
57/// Describes the bounds (start, size) of the preamble and a flag required by
58/// PreprocessorOptions::PrecompiledPreambleBytes.
59/// The preamble includes the BOM, if any.
60struct PreambleBounds {
61 /// Size of the preamble in bytes.
62 unsigned Size;
63
64 /// Whether the preamble ends at the start of a new line.
65 ///
66 /// Used to inform the lexer as to whether it's starting at the beginning of
67 /// a line after skipping the preamble.
68 bool PreambleEndsAtStartOfLine;
69
70 PreambleBounds(unsigned Size, bool PreambleEndsAtStartOfLine)
71 : Size(Size), PreambleEndsAtStartOfLine(PreambleEndsAtStartOfLine) {}
72};
73
74/// Lexer - This provides a simple interface that turns a text buffer into a
75/// stream of tokens. This provides no support for file reading or buffering,
76/// or buffering/seeking of tokens, only forward lexing is supported. It relies
77/// on the specified Preprocessor object to handle preprocessor directives, etc.
78class Lexer : public PreprocessorLexer {
79 friend class Preprocessor;
80
81 void anchor() override;
82
83 //===--------------------------------------------------------------------===//
84 // Constant configuration values for this lexer.
85
86 // Start of the buffer.
87 const char *BufferStart;
88
89 // End of the buffer.
90 const char *BufferEnd;
91
92 // Location for start of file.
93 SourceLocation FileLoc;
94
95 // LangOpts enabled by this language.
96 // Storing LangOptions as reference here is important from performance point
97 // of view. Lack of reference means that LangOptions copy constructor would be
98 // called by Lexer(..., const LangOptions &LangOpts,...). Given that local
99 // Lexer objects are created thousands times (in Lexer::getRawToken,
100 // Preprocessor::EnterSourceFile and other places) during single module
101 // processing in frontend it would make std::vector<std::string> copy
102 // constructors surprisingly hot.
103 const LangOptions &LangOpts;
104
105 // True if '//' line comments are enabled.
106 bool LineComment;
107
108 // True if lexer for _Pragma handling.
109 bool Is_PragmaLexer;
110
111 //===--------------------------------------------------------------------===//
112 // Context-specific lexing flags set by the preprocessor.
113 //
114
115 /// ExtendedTokenMode - The lexer can optionally keep comments and whitespace
116 /// and return them as tokens. This is used for -C and -CC modes, and
117 /// whitespace preservation can be useful for some clients that want to lex
118 /// the file in raw mode and get every character from the file.
119 ///
120 /// When this is set to 2 it returns comments and whitespace. When set to 1
121 /// it returns comments, when it is set to 0 it returns normal tokens only.
122 unsigned char ExtendedTokenMode;
123
124 //===--------------------------------------------------------------------===//
125 // Context that changes as the file is lexed.
126 // NOTE: any state that mutates when in raw mode must have save/restore code
127 // in Lexer::isNextPPTokenLParen.
128
129 // BufferPtr - Current pointer into the buffer. This is the next character
130 // to be lexed.
131 const char *BufferPtr;
132
133 // IsAtStartOfLine - True if the next lexed token should get the "start of
134 // line" flag set on it.
135 bool IsAtStartOfLine;
136
137 bool IsAtPhysicalStartOfLine;
138
139 bool HasLeadingSpace;
140
141 bool HasLeadingEmptyMacro;
142
143 /// True if this is the first time we're lexing the input file.
144 bool IsFirstTimeLexingFile;
145
146 // NewLinePtr - A pointer to new line character '\n' being lexed. For '\r\n',
147 // it also points to '\n.'
148 const char *NewLinePtr;
149
150 // CurrentConflictMarkerState - The kind of conflict marker we are handling.
151 ConflictMarkerKind CurrentConflictMarkerState;
152
153 /// Non-empty if this \p Lexer is \p isDependencyDirectivesLexer().
154 ArrayRef<dependency_directives_scan::Directive> DepDirectives;
155
156 /// If this \p Lexer is \p isDependencyDirectivesLexer(), it represents the
157 /// next token to use from the current dependency directive.
158 unsigned NextDepDirectiveTokenIndex = 0;
159
160 void InitLexer(const char *BufStart, const char *BufPtr, const char *BufEnd);
161
162public:
163 /// Lexer constructor - Create a new lexer object for the specified buffer
164 /// with the specified preprocessor managing the lexing process. This lexer
165 /// assumes that the associated file buffer and Preprocessor objects will
166 /// outlive it, so it doesn't take ownership of either of them.
167 Lexer(FileID FID, const llvm::MemoryBufferRef &InputFile, Preprocessor &PP,
168 bool IsFirstIncludeOfFile = true);
169
170 /// Lexer constructor - Create a new raw lexer object. This object is only
171 /// suitable for calls to 'LexFromRawLexer'. This lexer assumes that the
172 /// text range will outlive it, so it doesn't take ownership of it.
173 Lexer(SourceLocation FileLoc, const LangOptions &LangOpts,
174 const char *BufStart, const char *BufPtr, const char *BufEnd,
175 bool IsFirstIncludeOfFile = true);
176
177 /// Lexer constructor - Create a new raw lexer object. This object is only
178 /// suitable for calls to 'LexFromRawLexer'. This lexer assumes that the
179 /// text range will outlive it, so it doesn't take ownership of it.
180 Lexer(FileID FID, const llvm::MemoryBufferRef &FromFile,
181 const SourceManager &SM, const LangOptions &LangOpts,
182 bool IsFirstIncludeOfFile = true);
183
184 Lexer(const Lexer &) = delete;
185 Lexer &operator=(const Lexer &) = delete;
186
187 /// Create_PragmaLexer: Lexer constructor - Create a new lexer object for
188 /// _Pragma expansion. This has a variety of magic semantics that this method
189 /// sets up. It returns a new'd Lexer that must be delete'd when done.
190 static Lexer *Create_PragmaLexer(SourceLocation SpellingLoc,
191 SourceLocation ExpansionLocStart,
192 SourceLocation ExpansionLocEnd,
193 unsigned TokLen, Preprocessor &PP);
194
195 /// getFileLoc - Return the File Location for the file we are lexing out of.
196 /// The physical location encodes the location where the characters come from,
197 /// the virtual location encodes where we should *claim* the characters came
198 /// from. Currently this is only used by _Pragma handling.
199 SourceLocation getFileLoc() const { return FileLoc; }
200
201private:
202 /// Lex - Return the next token in the file. If this is the end of file, it
203 /// return the tok::eof token. This implicitly involves the preprocessor.
204 bool Lex(Token &Result);
205
206 /// Called when the preprocessor is in 'dependency scanning lexing mode'.
207 bool LexDependencyDirectiveToken(Token &Result);
208
209 /// Called when the preprocessor is in 'dependency scanning lexing mode' and
210 /// is skipping a conditional block.
211 bool LexDependencyDirectiveTokenWhileSkipping(Token &Result);
212
213 /// True when the preprocessor is in 'dependency scanning lexing mode' and
214 /// created this \p Lexer for lexing a set of dependency directive tokens.
215 bool isDependencyDirectivesLexer() const { return !DepDirectives.empty(); }
216
217 /// Initializes \p Result with data from \p DDTok and advances \p BufferPtr to
218 /// the position just after the token.
219 /// \returns the buffer pointer at the beginning of the token.
220 const char *convertDependencyDirectiveToken(
221 const dependency_directives_scan::Token &DDTok, Token &Result);
222
223public:
224 /// isPragmaLexer - Returns true if this Lexer is being used to lex a pragma.
225 bool isPragmaLexer() const { return Is_PragmaLexer; }
226
227private:
228 /// IndirectLex - An indirect call to 'Lex' that can be invoked via
229 /// the PreprocessorLexer interface.
230 void IndirectLex(Token &Result) override { Lex(Result); }
231
232public:
233 /// LexFromRawLexer - Lex a token from a designated raw lexer (one with no
234 /// associated preprocessor object. Return true if the 'next character to
235 /// read' pointer points at the end of the lexer buffer, false otherwise.
236 bool LexFromRawLexer(Token &Result) {
237 assert(LexingRawMode && "Not already in raw mode!")(static_cast <bool> (LexingRawMode && "Not already in raw mode!"
) ? void (0) : __assert_fail ("LexingRawMode && \"Not already in raw mode!\""
, "clang/include/clang/Lex/Lexer.h", 237, __extension__ __PRETTY_FUNCTION__
))
;
238 Lex(Result);
239 // Note that lexing to the end of the buffer doesn't implicitly delete the
240 // lexer when in raw mode.
241 return BufferPtr == BufferEnd;
242 }
243
244 /// isKeepWhitespaceMode - Return true if the lexer should return tokens for
245 /// every character in the file, including whitespace and comments. This
246 /// should only be used in raw mode, as the preprocessor is not prepared to
247 /// deal with the excess tokens.
248 bool isKeepWhitespaceMode() const {
249 return ExtendedTokenMode > 1;
250 }
251
252 /// SetKeepWhitespaceMode - This method lets clients enable or disable
253 /// whitespace retention mode.
254 void SetKeepWhitespaceMode(bool Val) {
255 assert((!Val || LexingRawMode || LangOpts.TraditionalCPP) &&(static_cast <bool> ((!Val || LexingRawMode || LangOpts
.TraditionalCPP) && "Can only retain whitespace in raw mode or -traditional-cpp"
) ? void (0) : __assert_fail ("(!Val || LexingRawMode || LangOpts.TraditionalCPP) && \"Can only retain whitespace in raw mode or -traditional-cpp\""
, "clang/include/clang/Lex/Lexer.h", 256, __extension__ __PRETTY_FUNCTION__
))
256 "Can only retain whitespace in raw mode or -traditional-cpp")(static_cast <bool> ((!Val || LexingRawMode || LangOpts
.TraditionalCPP) && "Can only retain whitespace in raw mode or -traditional-cpp"
) ? void (0) : __assert_fail ("(!Val || LexingRawMode || LangOpts.TraditionalCPP) && \"Can only retain whitespace in raw mode or -traditional-cpp\""
, "clang/include/clang/Lex/Lexer.h", 256, __extension__ __PRETTY_FUNCTION__
))
;
257 ExtendedTokenMode = Val ? 2 : 0;
258 }
259
260 /// inKeepCommentMode - Return true if the lexer should return comments as
261 /// tokens.
262 bool inKeepCommentMode() const {
263 return ExtendedTokenMode > 0;
264 }
265
266 /// SetCommentRetentionMode - Change the comment retention mode of the lexer
267 /// to the specified mode. This is really only useful when lexing in raw
268 /// mode, because otherwise the lexer needs to manage this.
269 void SetCommentRetentionState(bool Mode) {
270 assert(!isKeepWhitespaceMode() &&(static_cast <bool> (!isKeepWhitespaceMode() &&
"Can't play with comment retention state when retaining whitespace"
) ? void (0) : __assert_fail ("!isKeepWhitespaceMode() && \"Can't play with comment retention state when retaining whitespace\""
, "clang/include/clang/Lex/Lexer.h", 271, __extension__ __PRETTY_FUNCTION__
))
271 "Can't play with comment retention state when retaining whitespace")(static_cast <bool> (!isKeepWhitespaceMode() &&
"Can't play with comment retention state when retaining whitespace"
) ? void (0) : __assert_fail ("!isKeepWhitespaceMode() && \"Can't play with comment retention state when retaining whitespace\""
, "clang/include/clang/Lex/Lexer.h", 271, __extension__ __PRETTY_FUNCTION__
))
;
272 ExtendedTokenMode = Mode ? 1 : 0;
273 }
274
275 /// Sets the extended token mode back to its initial value, according to the
276 /// language options and preprocessor. This controls whether the lexer
277 /// produces comment and whitespace tokens.
278 ///
279 /// This requires the lexer to have an associated preprocessor. A standalone
280 /// lexer has nothing to reset to.
281 void resetExtendedTokenMode();
282
283 /// Gets source code buffer.
284 StringRef getBuffer() const {
285 return StringRef(BufferStart, BufferEnd - BufferStart);
286 }
287
288 /// ReadToEndOfLine - Read the rest of the current preprocessor line as an
289 /// uninterpreted string. This switches the lexer out of directive mode.
290 void ReadToEndOfLine(SmallVectorImpl<char> *Result = nullptr);
291
292
293 /// Diag - Forwarding function for diagnostics. This translate a source
294 /// position in the current buffer into a SourceLocation object for rendering.
295 DiagnosticBuilder Diag(const char *Loc, unsigned DiagID) const;
296
297 /// getSourceLocation - Return a source location identifier for the specified
298 /// offset in the current file.
299 SourceLocation getSourceLocation(const char *Loc, unsigned TokLen = 1) const;
300
301 /// getSourceLocation - Return a source location for the next character in
302 /// the current file.
303 SourceLocation getSourceLocation() override {
304 return getSourceLocation(BufferPtr);
305 }
306
307 /// Return the current location in the buffer.
308 const char *getBufferLocation() const { return BufferPtr; }
309
310 /// Returns the current lexing offset.
311 unsigned getCurrentBufferOffset() {
312 assert(BufferPtr >= BufferStart && "Invalid buffer state")(static_cast <bool> (BufferPtr >= BufferStart &&
"Invalid buffer state") ? void (0) : __assert_fail ("BufferPtr >= BufferStart && \"Invalid buffer state\""
, "clang/include/clang/Lex/Lexer.h", 312, __extension__ __PRETTY_FUNCTION__
))
;
313 return BufferPtr - BufferStart;
314 }
315
316 /// Set the lexer's buffer pointer to \p Offset.
317 void seek(unsigned Offset, bool IsAtStartOfLine);
318
319 /// Stringify - Convert the specified string into a C string by i) escaping
320 /// '\\' and " characters and ii) replacing newline character(s) with "\\n".
321 /// If Charify is true, this escapes the ' character instead of ".
322 static std::string Stringify(StringRef Str, bool Charify = false);
323
324 /// Stringify - Convert the specified string into a C string by i) escaping
325 /// '\\' and " characters and ii) replacing newline character(s) with "\\n".
326 static void Stringify(SmallVectorImpl<char> &Str);
327
328 /// getSpelling - This method is used to get the spelling of a token into a
329 /// preallocated buffer, instead of as an std::string. The caller is required
330 /// to allocate enough space for the token, which is guaranteed to be at least
331 /// Tok.getLength() bytes long. The length of the actual result is returned.
332 ///
333 /// Note that this method may do two possible things: it may either fill in
334 /// the buffer specified with characters, or it may *change the input pointer*
335 /// to point to a constant buffer with the data already in it (avoiding a
336 /// copy). The caller is not allowed to modify the returned buffer pointer
337 /// if an internal buffer is returned.
338 static unsigned getSpelling(const Token &Tok, const char *&Buffer,
339 const SourceManager &SourceMgr,
340 const LangOptions &LangOpts,
341 bool *Invalid = nullptr);
342
343 /// getSpelling() - Return the 'spelling' of the Tok token. The spelling of a
344 /// token is the characters used to represent the token in the source file
345 /// after trigraph expansion and escaped-newline folding. In particular, this
346 /// wants to get the true, uncanonicalized, spelling of things like digraphs
347 /// UCNs, etc.
348 static std::string getSpelling(const Token &Tok,
349 const SourceManager &SourceMgr,
350 const LangOptions &LangOpts,
351 bool *Invalid = nullptr);
352
353 /// getSpelling - This method is used to get the spelling of the
354 /// token at the given source location. If, as is usually true, it
355 /// is not necessary to copy any data, then the returned string may
356 /// not point into the provided buffer.
357 ///
358 /// This method lexes at the expansion depth of the given
359 /// location and does not jump to the expansion or spelling
360 /// location.
361 static StringRef getSpelling(SourceLocation loc,
362 SmallVectorImpl<char> &buffer,
363 const SourceManager &SM,
364 const LangOptions &options,
365 bool *invalid = nullptr);
366
367 /// MeasureTokenLength - Relex the token at the specified location and return
368 /// its length in bytes in the input file. If the token needs cleaning (e.g.
369 /// includes a trigraph or an escaped newline) then this count includes bytes
370 /// that are part of that.
371 static unsigned MeasureTokenLength(SourceLocation Loc,
372 const SourceManager &SM,
373 const LangOptions &LangOpts);
374
375 /// Relex the token at the specified location.
376 /// \returns true if there was a failure, false on success.
377 static bool getRawToken(SourceLocation Loc, Token &Result,
378 const SourceManager &SM,
379 const LangOptions &LangOpts,
380 bool IgnoreWhiteSpace = false);
381
382 /// Given a location any where in a source buffer, find the location
383 /// that corresponds to the beginning of the token in which the original
384 /// source location lands.
385 static SourceLocation GetBeginningOfToken(SourceLocation Loc,
386 const SourceManager &SM,
387 const LangOptions &LangOpts);
388
389 /// Get the physical length (including trigraphs and escaped newlines) of the
390 /// first \p Characters characters of the token starting at TokStart.
391 static unsigned getTokenPrefixLength(SourceLocation TokStart,
392 unsigned CharNo,
393 const SourceManager &SM,
394 const LangOptions &LangOpts);
395
396 /// AdvanceToTokenCharacter - If the current SourceLocation specifies a
397 /// location at the start of a token, return a new location that specifies a
398 /// character within the token. This handles trigraphs and escaped newlines.
399 static SourceLocation AdvanceToTokenCharacter(SourceLocation TokStart,
400 unsigned Characters,
401 const SourceManager &SM,
402 const LangOptions &LangOpts) {
403 return TokStart.getLocWithOffset(
404 getTokenPrefixLength(TokStart, Characters, SM, LangOpts));
405 }
406
407 /// Computes the source location just past the end of the
408 /// token at this source location.
409 ///
410 /// This routine can be used to produce a source location that
411 /// points just past the end of the token referenced by \p Loc, and
412 /// is generally used when a diagnostic needs to point just after a
413 /// token where it expected something different that it received. If
414 /// the returned source location would not be meaningful (e.g., if
415 /// it points into a macro), this routine returns an invalid
416 /// source location.
417 ///
418 /// \param Offset an offset from the end of the token, where the source
419 /// location should refer to. The default offset (0) produces a source
420 /// location pointing just past the end of the token; an offset of 1 produces
421 /// a source location pointing to the last character in the token, etc.
422 static SourceLocation getLocForEndOfToken(SourceLocation Loc, unsigned Offset,
423 const SourceManager &SM,
424 const LangOptions &LangOpts);
425
426 /// Given a token range, produce a corresponding CharSourceRange that
427 /// is not a token range. This allows the source range to be used by
428 /// components that don't have access to the lexer and thus can't find the
429 /// end of the range for themselves.
430 static CharSourceRange getAsCharRange(SourceRange Range,
431 const SourceManager &SM,
432 const LangOptions &LangOpts) {
433 SourceLocation End = getLocForEndOfToken(Range.getEnd(), 0, SM, LangOpts);
434 return End.isInvalid() ? CharSourceRange()
435 : CharSourceRange::getCharRange(
436 Range.getBegin(), End);
437 }
438 static CharSourceRange getAsCharRange(CharSourceRange Range,
439 const SourceManager &SM,
440 const LangOptions &LangOpts) {
441 return Range.isTokenRange()
442 ? getAsCharRange(Range.getAsRange(), SM, LangOpts)
443 : Range;
444 }
445
446 /// Returns true if the given MacroID location points at the first
447 /// token of the macro expansion.
448 ///
449 /// \param MacroBegin If non-null and function returns true, it is set to
450 /// begin location of the macro.
451 static bool isAtStartOfMacroExpansion(SourceLocation loc,
452 const SourceManager &SM,
453 const LangOptions &LangOpts,
454 SourceLocation *MacroBegin = nullptr);
455
456 /// Returns true if the given MacroID location points at the last
457 /// token of the macro expansion.
458 ///
459 /// \param MacroEnd If non-null and function returns true, it is set to
460 /// end location of the macro.
461 static bool isAtEndOfMacroExpansion(SourceLocation loc,
462 const SourceManager &SM,
463 const LangOptions &LangOpts,
464 SourceLocation *MacroEnd = nullptr);
465
466 /// Accepts a range and returns a character range with file locations.
467 ///
468 /// Returns a null range if a part of the range resides inside a macro
469 /// expansion or the range does not reside on the same FileID.
470 ///
471 /// This function is trying to deal with macros and return a range based on
472 /// file locations. The cases where it can successfully handle macros are:
473 ///
474 /// -begin or end range lies at the start or end of a macro expansion, in
475 /// which case the location will be set to the expansion point, e.g:
476 /// \#define M 1 2
477 /// a M
478 /// If you have a range [a, 2] (where 2 came from the macro), the function
479 /// will return a range for "a M"
480 /// if you have range [a, 1], the function will fail because the range
481 /// overlaps with only a part of the macro
482 ///
483 /// -The macro is a function macro and the range can be mapped to the macro
484 /// arguments, e.g:
485 /// \#define M 1 2
486 /// \#define FM(x) x
487 /// FM(a b M)
488 /// if you have range [b, 2], the function will return the file range "b M"
489 /// inside the macro arguments.
490 /// if you have range [a, 2], the function will return the file range
491 /// "FM(a b M)" since the range includes all of the macro expansion.
492 static CharSourceRange makeFileCharRange(CharSourceRange Range,
493 const SourceManager &SM,
494 const LangOptions &LangOpts);
495
496 /// Returns a string for the source that the range encompasses.
497 static StringRef getSourceText(CharSourceRange Range,
498 const SourceManager &SM,
499 const LangOptions &LangOpts,
500 bool *Invalid = nullptr);
501
502 /// Retrieve the name of the immediate macro expansion.
503 ///
504 /// This routine starts from a source location, and finds the name of the macro
505 /// responsible for its immediate expansion. It looks through any intervening
506 /// macro argument expansions to compute this. It returns a StringRef which
507 /// refers to the SourceManager-owned buffer of the source where that macro
508 /// name is spelled. Thus, the result shouldn't out-live that SourceManager.
509 static StringRef getImmediateMacroName(SourceLocation Loc,
510 const SourceManager &SM,
511 const LangOptions &LangOpts);
512
513 /// Retrieve the name of the immediate macro expansion.
514 ///
515 /// This routine starts from a source location, and finds the name of the
516 /// macro responsible for its immediate expansion. It looks through any
517 /// intervening macro argument expansions to compute this. It returns a
518 /// StringRef which refers to the SourceManager-owned buffer of the source
519 /// where that macro name is spelled. Thus, the result shouldn't out-live
520 /// that SourceManager.
521 ///
522 /// This differs from Lexer::getImmediateMacroName in that any macro argument
523 /// location will result in the topmost function macro that accepted it.
524 /// e.g.
525 /// \code
526 /// MAC1( MAC2(foo) )
527 /// \endcode
528 /// for location of 'foo' token, this function will return "MAC1" while
529 /// Lexer::getImmediateMacroName will return "MAC2".
530 static StringRef getImmediateMacroNameForDiagnostics(
531 SourceLocation Loc, const SourceManager &SM, const LangOptions &LangOpts);
532
533 /// Compute the preamble of the given file.
534 ///
535 /// The preamble of a file contains the initial comments, include directives,
536 /// and other preprocessor directives that occur before the code in this
537 /// particular file actually begins. The preamble of the main source file is
538 /// a potential prefix header.
539 ///
540 /// \param Buffer The memory buffer containing the file's contents.
541 ///
542 /// \param MaxLines If non-zero, restrict the length of the preamble
543 /// to fewer than this number of lines.
544 ///
545 /// \returns The offset into the file where the preamble ends and the rest
546 /// of the file begins along with a boolean value indicating whether
547 /// the preamble ends at the beginning of a new line.
548 static PreambleBounds ComputePreamble(StringRef Buffer,
549 const LangOptions &LangOpts,
550 unsigned MaxLines = 0);
551
552 /// Finds the token that comes right after the given location.
553 ///
554 /// Returns the next token, or none if the location is inside a macro.
555 static Optional<Token> findNextToken(SourceLocation Loc,
556 const SourceManager &SM,
557 const LangOptions &LangOpts);
558
559 /// Checks that the given token is the first token that occurs after
560 /// the given location (this excludes comments and whitespace). Returns the
561 /// location immediately after the specified token. If the token is not found
562 /// or the location is inside a macro, the returned source location will be
563 /// invalid.
564 static SourceLocation findLocationAfterToken(SourceLocation loc,
565 tok::TokenKind TKind,
566 const SourceManager &SM,
567 const LangOptions &LangOpts,
568 bool SkipTrailingWhitespaceAndNewLine);
569
570 /// Returns true if the given character could appear in an identifier.
571 static bool isAsciiIdentifierContinueChar(char c,
572 const LangOptions &LangOpts);
573
574 /// Checks whether new line pointed by Str is preceded by escape
575 /// sequence.
576 static bool isNewLineEscaped(const char *BufferStart, const char *Str);
577
578 /// getCharAndSizeNoWarn - Like the getCharAndSize method, but does not ever
579 /// emit a warning.
580 static inline char getCharAndSizeNoWarn(const char *Ptr, unsigned &Size,
581 const LangOptions &LangOpts) {
582 // If this is not a trigraph and not a UCN or escaped newline, return
583 // quickly.
584 if (isObviouslySimpleCharacter(Ptr[0])) {
585 Size = 1;
586 return *Ptr;
587 }
588
589 Size = 0;
590 return getCharAndSizeSlowNoWarn(Ptr, Size, LangOpts);
591 }
592
593 /// Returns the leading whitespace for line that corresponds to the given
594 /// location \p Loc.
595 static StringRef getIndentationForLine(SourceLocation Loc,
596 const SourceManager &SM);
597
598 /// Check if this is the first time we're lexing the input file.
599 bool isFirstTimeLexingFile() const { return IsFirstTimeLexingFile; }
600
601private:
602 //===--------------------------------------------------------------------===//
603 // Internal implementation interfaces.
604
605 /// LexTokenInternal - Internal interface to lex a preprocessing token. Called
606 /// by Lex.
607 ///
608 bool LexTokenInternal(Token &Result, bool TokAtPhysicalStartOfLine);
609
610 bool CheckUnicodeWhitespace(Token &Result, uint32_t C, const char *CurPtr);
611
612 bool LexUnicodeIdentifierStart(Token &Result, uint32_t C, const char *CurPtr);
613
614 /// FormTokenWithChars - When we lex a token, we have identified a span
615 /// starting at BufferPtr, going to TokEnd that forms the token. This method
616 /// takes that range and assigns it to the token as its location and size. In
617 /// addition, since tokens cannot overlap, this also updates BufferPtr to be
618 /// TokEnd.
619 void FormTokenWithChars(Token &Result, const char *TokEnd,
620 tok::TokenKind Kind) {
621 unsigned TokLen = TokEnd-BufferPtr;
622 Result.setLength(TokLen);
623 Result.setLocation(getSourceLocation(BufferPtr, TokLen));
624 Result.setKind(Kind);
625 BufferPtr = TokEnd;
626 }
627
628 /// isNextPPTokenLParen - Return 1 if the next unexpanded token will return a
629 /// tok::l_paren token, 0 if it is something else and 2 if there are no more
630 /// tokens in the buffer controlled by this lexer.
631 unsigned isNextPPTokenLParen();
632
633 //===--------------------------------------------------------------------===//
634 // Lexer character reading interfaces.
635
636 // This lexer is built on two interfaces for reading characters, both of which
637 // automatically provide phase 1/2 translation. getAndAdvanceChar is used
638 // when we know that we will be reading a character from the input buffer and
639 // that this character will be part of the result token. This occurs in (f.e.)
640 // string processing, because we know we need to read until we find the
641 // closing '"' character.
642 //
643 // The second interface is the combination of getCharAndSize with
644 // ConsumeChar. getCharAndSize reads a phase 1/2 translated character,
645 // returning it and its size. If the lexer decides that this character is
646 // part of the current token, it calls ConsumeChar on it. This two stage
647 // approach allows us to emit diagnostics for characters (e.g. warnings about
648 // trigraphs), knowing that they only are emitted if the character is
649 // consumed.
650
/// isObviouslySimpleCharacter - Return true if the specified character is
/// obviously the same in translation phase 1 and translation phase 3.  This
/// can return false for characters that end up being the same, but it will
/// never return true for something that needs to be mapped.
static bool isObviouslySimpleCharacter(char C) {
  // Only '?' (possible trigraph introducer) and '\\' (possible escaped
  // newline or UCN) can change meaning during phase 1/2 translation.
  return !(C == '?' || C == '\\');
}
658
659 /// getAndAdvanceChar - Read a single 'character' from the specified buffer,
660 /// advance over it, and return it. This is tricky in several cases. Here we
661 /// just handle the trivial case and fall-back to the non-inlined
662 /// getCharAndSizeSlow method to handle the hard case.
663 inline char getAndAdvanceChar(const char *&Ptr, Token &Tok) {
664 // If this is not a trigraph and not a UCN or escaped newline, return
665 // quickly.
666 if (isObviouslySimpleCharacter(Ptr[0])) return *Ptr++;
667
668 unsigned Size = 0;
669 char C = getCharAndSizeSlow(Ptr, Size, &Tok);
670 Ptr += Size;
671 return C;
672 }
673
674 /// ConsumeChar - When a character (identified by getCharAndSize) is consumed
675 /// and added to a given token, check to see if there are diagnostics that
676 /// need to be emitted or flags that need to be set on the token. If so, do
677 /// it.
678 const char *ConsumeChar(const char *Ptr, unsigned Size, Token &Tok) {
679 // Normal case, we consumed exactly one token. Just return it.
680 if (Size == 1)
681 return Ptr+Size;
682
683 // Otherwise, re-lex the character with a current token, allowing
684 // diagnostics to be emitted and flags to be set.
685 Size = 0;
686 getCharAndSizeSlow(Ptr, Size, &Tok);
687 return Ptr+Size;
688 }
689
690 /// getCharAndSize - Peek a single 'character' from the specified buffer,
691 /// get its size, and return it. This is tricky in several cases. Here we
692 /// just handle the trivial case and fall-back to the non-inlined
693 /// getCharAndSizeSlow method to handle the hard case.
694 inline char getCharAndSize(const char *Ptr, unsigned &Size) {
695 // If this is not a trigraph and not a UCN or escaped newline, return
696 // quickly.
697 if (isObviouslySimpleCharacter(Ptr[0])) {
12
Taking false branch
698 Size = 1;
699 return *Ptr;
700 }
701
702 Size = 0;
703 return getCharAndSizeSlow(Ptr, Size);
13
Value assigned to field 'PP'
704 }
705
706 /// getCharAndSizeSlow - Handle the slow/uncommon case of the getCharAndSize
707 /// method.
708 char getCharAndSizeSlow(const char *Ptr, unsigned &Size,
709 Token *Tok = nullptr);
710
711 /// getEscapedNewLineSize - Return the size of the specified escaped newline,
712 /// or 0 if it is not an escaped newline. P[-1] is known to be a "\" on entry
713 /// to this function.
714 static unsigned getEscapedNewLineSize(const char *P);
715
716 /// SkipEscapedNewLines - If P points to an escaped newline (or a series of
717 /// them), skip over them and return the first non-escaped-newline found,
718 /// otherwise return P.
719 static const char *SkipEscapedNewLines(const char *P);
720
721 /// getCharAndSizeSlowNoWarn - Same as getCharAndSizeSlow, but never emits a
722 /// diagnostic.
723 static char getCharAndSizeSlowNoWarn(const char *Ptr, unsigned &Size,
724 const LangOptions &LangOpts);
725
726 //===--------------------------------------------------------------------===//
727 // Other lexer functions.
728
729 void SetByteOffset(unsigned Offset, bool StartOfLine);
730
731 void PropagateLineStartLeadingSpaceInfo(Token &Result);
732
733 const char *LexUDSuffix(Token &Result, const char *CurPtr,
734 bool IsStringLiteral);
735
736 // Helper functions to lex the remainder of a token of the specific type.
737
738 // This function handles both ASCII and Unicode identifiers after
739 // the first codepoint of the identifier has been parsed.
740 bool LexIdentifierContinue(Token &Result, const char *CurPtr);
741
742 bool LexNumericConstant (Token &Result, const char *CurPtr);
743 bool LexStringLiteral (Token &Result, const char *CurPtr,
744 tok::TokenKind Kind);
745 bool LexRawStringLiteral (Token &Result, const char *CurPtr,
746 tok::TokenKind Kind);
747 bool LexAngledStringLiteral(Token &Result, const char *CurPtr);
748 bool LexCharConstant (Token &Result, const char *CurPtr,
749 tok::TokenKind Kind);
750 bool LexEndOfFile (Token &Result, const char *CurPtr);
751 bool SkipWhitespace (Token &Result, const char *CurPtr,
752 bool &TokAtPhysicalStartOfLine);
753 bool SkipLineComment (Token &Result, const char *CurPtr,
754 bool &TokAtPhysicalStartOfLine);
755 bool SkipBlockComment (Token &Result, const char *CurPtr,
756 bool &TokAtPhysicalStartOfLine);
757 bool SaveLineComment (Token &Result, const char *CurPtr);
758
759 bool IsStartOfConflictMarker(const char *CurPtr);
760 bool HandleEndOfConflictMarker(const char *CurPtr);
761
762 bool lexEditorPlaceholder(Token &Result, const char *CurPtr);
763
764 bool isCodeCompletionPoint(const char *CurPtr) const;
765 void cutOffLexing() { BufferPtr = BufferEnd; }
766
767 bool isHexaLiteral(const char *Start, const LangOptions &LangOpts);
768
769 void codeCompleteIncludedFile(const char *PathStart,
770 const char *CompletionPoint, bool IsAngled);
771
772 llvm::Optional<uint32_t>
773 tryReadNumericUCN(const char *&StartPtr, const char *SlashLoc, Token *Result);
774 llvm::Optional<uint32_t> tryReadNamedUCN(const char *&StartPtr,
775 Token *Result);
776
777 /// Read a universal character name.
778 ///
779 /// \param StartPtr The position in the source buffer after the initial '\'.
780 /// If the UCN is syntactically well-formed (but not
781 /// necessarily valid), this parameter will be updated to
782 /// point to the character after the UCN.
783 /// \param SlashLoc The position in the source buffer of the '\'.
784 /// \param Result The token being formed. Pass \c nullptr to suppress
785 /// diagnostics and handle token formation in the caller.
786 ///
787 /// \return The Unicode codepoint specified by the UCN, or 0 if the UCN is
788 /// invalid.
789 uint32_t tryReadUCN(const char *&StartPtr, const char *SlashLoc, Token *Result);
790
791 /// Try to consume a UCN as part of an identifier at the current
792 /// location.
793 /// \param CurPtr Initially points to the range of characters in the source
794 /// buffer containing the '\'. Updated to point past the end of
795 /// the UCN on success.
796 /// \param Size The number of characters occupied by the '\' (including
797 /// trigraphs and escaped newlines).
798 /// \param Result The token being produced. Marked as containing a UCN on
799 /// success.
800 /// \return \c true if a UCN was lexed and it produced an acceptable
801 /// identifier character, \c false otherwise.
802 bool tryConsumeIdentifierUCN(const char *&CurPtr, unsigned Size,
803 Token &Result);
804
805 /// Try to consume an identifier character encoded in UTF-8.
806 /// \param CurPtr Points to the start of the (potential) UTF-8 code unit
807 /// sequence. On success, updated to point past the end of it.
808 /// \return \c true if a UTF-8 sequence mapping to an acceptable identifier
809 /// character was lexed, \c false otherwise.
810 bool tryConsumeIdentifierUTF8Char(const char *&CurPtr);
811};
812
813} // namespace clang
814
815#endif // LLVM_CLANG_LEX_LEXER_H