Bug Summary

File: build/source/clang/lib/Lex/Lexer.cpp
Warning: line 1169, column 10
Called C++ object pointer is null

Annotated Source Code

Press '?' to see keyboard shortcuts

clang -cc1 -cc1 -triple x86_64-pc-linux-gnu -analyze -disable-free -clear-ast-before-backend -disable-llvm-verifier -discard-value-names -main-file-name Lexer.cpp -analyzer-checker=core -analyzer-checker=apiModeling -analyzer-checker=unix -analyzer-checker=deadcode -analyzer-checker=cplusplus -analyzer-checker=security.insecureAPI.UncheckedReturn -analyzer-checker=security.insecureAPI.getpw -analyzer-checker=security.insecureAPI.gets -analyzer-checker=security.insecureAPI.mktemp -analyzer-checker=security.insecureAPI.mkstemp -analyzer-checker=security.insecureAPI.vfork -analyzer-checker=nullability.NullPassedToNonnull -analyzer-checker=nullability.NullReturnedFromNonnull -analyzer-output plist -w -setup-static-analyzer -analyzer-config-compatibility-mode=true -mrelocation-model pic -pic-level 2 -mframe-pointer=none -relaxed-aliasing -fmath-errno -ffp-contract=on -fno-rounding-math -mconstructor-aliases -funwind-tables=2 -target-cpu x86-64 -tune-cpu generic -debugger-tuning=gdb -ffunction-sections -fdata-sections -fcoverage-compilation-dir=/build/source/build-llvm/tools/clang/stage2-bins -resource-dir /usr/lib/llvm-17/lib/clang/17 -D CLANG_REPOSITORY_STRING="++20230510111145+7df43bdb42ae-1~exp1~20230510111303.1288" -D _DEBUG -D _GLIBCXX_ASSERTIONS -D _GNU_SOURCE -D _LIBCPP_ENABLE_ASSERTIONS -D __STDC_CONSTANT_MACROS -D __STDC_FORMAT_MACROS -D __STDC_LIMIT_MACROS -I tools/clang/lib/Lex -I /build/source/clang/lib/Lex -I /build/source/clang/include -I tools/clang/include -I include -I /build/source/llvm/include -D _FORTIFY_SOURCE=2 -D NDEBUG -U NDEBUG -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/10/../../../../include/c++/10 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/10/../../../../include/x86_64-linux-gnu/c++/10 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/10/../../../../include/c++/10/backward -internal-isystem /usr/lib/llvm-17/lib/clang/17/include -internal-isystem /usr/local/include -internal-isystem 
/usr/lib/gcc/x86_64-linux-gnu/10/../../../../x86_64-linux-gnu/include -internal-externc-isystem /usr/include/x86_64-linux-gnu -internal-externc-isystem /include -internal-externc-isystem /usr/include -fmacro-prefix-map=/build/source/build-llvm/tools/clang/stage2-bins=build-llvm/tools/clang/stage2-bins -fmacro-prefix-map=/build/source/= -fcoverage-prefix-map=/build/source/build-llvm/tools/clang/stage2-bins=build-llvm/tools/clang/stage2-bins -fcoverage-prefix-map=/build/source/= -source-date-epoch 1683717183 -O2 -Wno-unused-command-line-argument -Wno-unused-parameter -Wwrite-strings -Wno-missing-field-initializers -Wno-long-long -Wno-maybe-uninitialized -Wno-class-memaccess -Wno-redundant-move -Wno-pessimizing-move -Wno-noexcept-type -Wno-comment -Wno-misleading-indentation -std=c++17 -fdeprecated-macro -fdebug-compilation-dir=/build/source/build-llvm/tools/clang/stage2-bins -fdebug-prefix-map=/build/source/build-llvm/tools/clang/stage2-bins=build-llvm/tools/clang/stage2-bins -fdebug-prefix-map=/build/source/= -ferror-limit 19 -fvisibility-inlines-hidden -stack-protector 2 -fgnuc-version=4.2.1 -fcolor-diagnostics -vectorize-loops -vectorize-slp -analyzer-output=html -analyzer-config stable-report-filename=true -faddrsig -D__GCC_HAVE_DWARF2_CFI_ASM=1 -o /tmp/scan-build-2023-05-10-133810-16478-1 -x c++ /build/source/clang/lib/Lex/Lexer.cpp

/build/source/clang/lib/Lex/Lexer.cpp

1//===- Lexer.cpp - C Language Family Lexer --------------------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file implements the Lexer and Token interfaces.
10//
11//===----------------------------------------------------------------------===//
12
13#include "clang/Lex/Lexer.h"
14#include "UnicodeCharSets.h"
15#include "clang/Basic/CharInfo.h"
16#include "clang/Basic/Diagnostic.h"
17#include "clang/Basic/IdentifierTable.h"
18#include "clang/Basic/LLVM.h"
19#include "clang/Basic/LangOptions.h"
20#include "clang/Basic/SourceLocation.h"
21#include "clang/Basic/SourceManager.h"
22#include "clang/Basic/TokenKinds.h"
23#include "clang/Lex/LexDiagnostic.h"
24#include "clang/Lex/LiteralSupport.h"
25#include "clang/Lex/MultipleIncludeOpt.h"
26#include "clang/Lex/Preprocessor.h"
27#include "clang/Lex/PreprocessorOptions.h"
28#include "clang/Lex/Token.h"
29#include "llvm/ADT/STLExtras.h"
30#include "llvm/ADT/StringExtras.h"
31#include "llvm/ADT/StringRef.h"
32#include "llvm/ADT/StringSwitch.h"
33#include "llvm/Support/Compiler.h"
34#include "llvm/Support/ConvertUTF.h"
35#include "llvm/Support/MathExtras.h"
36#include "llvm/Support/MemoryBufferRef.h"
37#include "llvm/Support/NativeFormatting.h"
38#include "llvm/Support/Unicode.h"
39#include "llvm/Support/UnicodeCharRanges.h"
40#include <algorithm>
41#include <cassert>
42#include <cstddef>
43#include <cstdint>
44#include <cstring>
45#include <optional>
46#include <string>
47#include <tuple>
48#include <utility>
49
50using namespace clang;
51
52//===----------------------------------------------------------------------===//
53// Token Class Implementation
54//===----------------------------------------------------------------------===//
55
56/// isObjCAtKeyword - Return true if we have an ObjC keyword identifier.
57bool Token::isObjCAtKeyword(tok::ObjCKeywordKind objcKey) const {
58 if (isAnnotation())
59 return false;
60 if (IdentifierInfo *II = getIdentifierInfo())
61 return II->getObjCKeywordID() == objcKey;
62 return false;
63}
64
65/// getObjCKeywordID - Return the ObjC keyword kind.
66tok::ObjCKeywordKind Token::getObjCKeywordID() const {
67 if (isAnnotation())
68 return tok::objc_not_keyword;
69 IdentifierInfo *specId = getIdentifierInfo();
70 return specId ? specId->getObjCKeywordID() : tok::objc_not_keyword;
71}
72
73//===----------------------------------------------------------------------===//
74// Lexer Class Implementation
75//===----------------------------------------------------------------------===//
76
// Intentionally empty out-of-line member definition.
// NOTE(review): presumably the LLVM "anchor" idiom that pins the class's
// vtable to this translation unit -- confirm against the declaration in
// Lexer.h.
77void Lexer::anchor() {}
78
79void Lexer::InitLexer(const char *BufStart, const char *BufPtr,
80 const char *BufEnd) {
81 BufferStart = BufStart;
82 BufferPtr = BufPtr;
83 BufferEnd = BufEnd;
84
85 assert(BufEnd[0] == 0 &&(static_cast <bool> (BufEnd[0] == 0 && "We assume that the input buffer has a null character at the end"
" to simplify lexing!") ? void (0) : __assert_fail ("BufEnd[0] == 0 && \"We assume that the input buffer has a null character at the end\" \" to simplify lexing!\""
, "clang/lib/Lex/Lexer.cpp", 87, __extension__ __PRETTY_FUNCTION__
))
86 "We assume that the input buffer has a null character at the end"(static_cast <bool> (BufEnd[0] == 0 && "We assume that the input buffer has a null character at the end"
" to simplify lexing!") ? void (0) : __assert_fail ("BufEnd[0] == 0 && \"We assume that the input buffer has a null character at the end\" \" to simplify lexing!\""
, "clang/lib/Lex/Lexer.cpp", 87, __extension__ __PRETTY_FUNCTION__
))
87 " to simplify lexing!")(static_cast <bool> (BufEnd[0] == 0 && "We assume that the input buffer has a null character at the end"
" to simplify lexing!") ? void (0) : __assert_fail ("BufEnd[0] == 0 && \"We assume that the input buffer has a null character at the end\" \" to simplify lexing!\""
, "clang/lib/Lex/Lexer.cpp", 87, __extension__ __PRETTY_FUNCTION__
))
;
88
89 // Check whether we have a BOM in the beginning of the buffer. If yes - act
90 // accordingly. Right now we support only UTF-8 with and without BOM, so, just
91 // skip the UTF-8 BOM if it's present.
92 if (BufferStart == BufferPtr) {
93 // Determine the size of the BOM.
94 StringRef Buf(BufferStart, BufferEnd - BufferStart);
95 size_t BOMLength = llvm::StringSwitch<size_t>(Buf)
96 .StartsWith("\xEF\xBB\xBF", 3) // UTF-8 BOM
97 .Default(0);
98
99 // Skip the BOM.
100 BufferPtr += BOMLength;
101 }
102
103 Is_PragmaLexer = false;
104 CurrentConflictMarkerState = CMK_None;
105
106 // Start of the file is a start of line.
107 IsAtStartOfLine = true;
108 IsAtPhysicalStartOfLine = true;
109
110 HasLeadingSpace = false;
111 HasLeadingEmptyMacro = false;
112
113 // We are not after parsing a #.
114 ParsingPreprocessorDirective = false;
115
116 // We are not after parsing #include.
117 ParsingFilename = false;
118
119 // We are not in raw mode. Raw mode disables diagnostics and interpretation
120 // of tokens (e.g. identifiers, thus disabling macro expansion). It is used
121 // to quickly lex the tokens of the buffer, e.g. when handling a "#if 0" block
122 // or otherwise skipping over tokens.
123 LexingRawMode = false;
124
125 // Default to not keeping comments.
126 ExtendedTokenMode = 0;
127
128 NewLinePtr = nullptr;
129}
130
131/// Lexer constructor - Create a new lexer object for the specified buffer
132/// with the specified preprocessor managing the lexing process. This lexer
133/// assumes that the associated file buffer and Preprocessor objects will
134/// outlive it, so it doesn't take ownership of either of them.
135Lexer::Lexer(FileID FID, const llvm::MemoryBufferRef &InputFile,
136 Preprocessor &PP, bool IsFirstIncludeOfFile)
137 : PreprocessorLexer(&PP, FID),
138 FileLoc(PP.getSourceManager().getLocForStartOfFile(FID)),
139 LangOpts(PP.getLangOpts()), LineComment(LangOpts.LineComment),
140 IsFirstTimeLexingFile(IsFirstIncludeOfFile) {
141 InitLexer(InputFile.getBufferStart(), InputFile.getBufferStart(),
142 InputFile.getBufferEnd());
143
144 resetExtendedTokenMode();
145}
146
147/// Lexer constructor - Create a new raw lexer object. This object is only
148/// suitable for calls to 'LexFromRawLexer'. This lexer assumes that the text
149/// range will outlive it, so it doesn't take ownership of it.
150Lexer::Lexer(SourceLocation fileloc, const LangOptions &langOpts,
151 const char *BufStart, const char *BufPtr, const char *BufEnd,
152 bool IsFirstIncludeOfFile)
153 : FileLoc(fileloc), LangOpts(langOpts), LineComment(LangOpts.LineComment),
154 IsFirstTimeLexingFile(IsFirstIncludeOfFile) {
155 InitLexer(BufStart, BufPtr, BufEnd);
156
157 // We *are* in raw mode.
158 LexingRawMode = true;
159}
160
161/// Lexer constructor - Create a new raw lexer object. This object is only
162/// suitable for calls to 'LexFromRawLexer'. This lexer assumes that the text
163/// range will outlive it, so it doesn't take ownership of it.
164Lexer::Lexer(FileID FID, const llvm::MemoryBufferRef &FromFile,
165 const SourceManager &SM, const LangOptions &langOpts,
166 bool IsFirstIncludeOfFile)
167 : Lexer(SM.getLocForStartOfFile(FID), langOpts, FromFile.getBufferStart(),
168 FromFile.getBufferStart(), FromFile.getBufferEnd(),
169 IsFirstIncludeOfFile) {}
170
171void Lexer::resetExtendedTokenMode() {
172 assert(PP && "Cannot reset token mode without a preprocessor")(static_cast <bool> (PP && "Cannot reset token mode without a preprocessor"
) ? void (0) : __assert_fail ("PP && \"Cannot reset token mode without a preprocessor\""
, "clang/lib/Lex/Lexer.cpp", 172, __extension__ __PRETTY_FUNCTION__
))
;
173 if (LangOpts.TraditionalCPP)
174 SetKeepWhitespaceMode(true);
175 else
176 SetCommentRetentionState(PP->getCommentRetentionState());
177}
178
179/// Create_PragmaLexer: Lexer constructor - Create a new lexer object for
180/// _Pragma expansion. This has a variety of magic semantics that this method
181/// sets up. It returns a new'd Lexer that must be delete'd when done.
182///
183/// On entrance to this routine, TokStartLoc is a macro location which has a
184/// spelling loc that indicates the bytes to be lexed for the token and an
185/// expansion location that indicates where all lexed tokens should be
186/// "expanded from".
187///
188/// TODO: It would really be nice to make _Pragma just be a wrapper around a
189/// normal lexer that remaps tokens as they fly by. This would require making
190/// Preprocessor::Lex virtual. Given that, we could just dump in a magic lexer
191/// interface that could handle this stuff. This would pull GetMappedTokenLoc
192/// out of the critical path of the lexer!
193///
194Lexer *Lexer::Create_PragmaLexer(SourceLocation SpellingLoc,
195 SourceLocation ExpansionLocStart,
196 SourceLocation ExpansionLocEnd,
197 unsigned TokLen, Preprocessor &PP) {
198 SourceManager &SM = PP.getSourceManager();
199
200 // Create the lexer as if we were going to lex the file normally.
201 FileID SpellingFID = SM.getFileID(SpellingLoc);
202 llvm::MemoryBufferRef InputFile = SM.getBufferOrFake(SpellingFID);
203 Lexer *L = new Lexer(SpellingFID, InputFile, PP);
204
205 // Now that the lexer is created, change the start/end locations so that we
206 // just lex the subsection of the file that we want. This is lexing from a
207 // scratch buffer.
208 const char *StrData = SM.getCharacterData(SpellingLoc);
209
210 L->BufferPtr = StrData;
211 L->BufferEnd = StrData+TokLen;
212 assert(L->BufferEnd[0] == 0 && "Buffer is not nul terminated!")(static_cast <bool> (L->BufferEnd[0] == 0 &&
"Buffer is not nul terminated!") ? void (0) : __assert_fail (
"L->BufferEnd[0] == 0 && \"Buffer is not nul terminated!\""
, "clang/lib/Lex/Lexer.cpp", 212, __extension__ __PRETTY_FUNCTION__
))
;
213
214 // Set the SourceLocation with the remapping information. This ensures that
215 // GetMappedTokenLoc will remap the tokens as they are lexed.
216 L->FileLoc = SM.createExpansionLoc(SM.getLocForStartOfFile(SpellingFID),
217 ExpansionLocStart,
218 ExpansionLocEnd, TokLen);
219
220 // Ensure that the lexer thinks it is inside a directive, so that end \n will
221 // return an EOD token.
222 L->ParsingPreprocessorDirective = true;
223
224 // This lexer really is for _Pragma.
225 L->Is_PragmaLexer = true;
226 return L;
227}
228
229void Lexer::seek(unsigned Offset, bool IsAtStartOfLine) {
230 this->IsAtPhysicalStartOfLine = IsAtStartOfLine;
231 this->IsAtStartOfLine = IsAtStartOfLine;
232 assert((BufferStart + Offset) <= BufferEnd)(static_cast <bool> ((BufferStart + Offset) <= BufferEnd
) ? void (0) : __assert_fail ("(BufferStart + Offset) <= BufferEnd"
, "clang/lib/Lex/Lexer.cpp", 232, __extension__ __PRETTY_FUNCTION__
))
;
233 BufferPtr = BufferStart + Offset;
234}
235
/// Escape \p Str in place so it can be embedded in a string or character
/// literal delimited by \p Quote:
///   - a backslash or \p Quote character gets a backslash in front of it;
///   - a "\r\n" or "\n\r" pair becomes the two characters '\\' 'n';
///   - any other '\n' or '\r' becomes the two characters '\\' 'n'.
///
/// \tparam T a contiguous character container providing begin(), end(),
///         clear(), reserve() and push_back() (e.g. std::string or
///         SmallVectorImpl<char>).
///
/// The result is produced in a single forward pass over a snapshot of the
/// input rather than by inserting into the middle of \p Str for every escaped
/// character, which was quadratic for inputs containing many escapes. Inputs
/// that need no escaping are left untouched without allocating.
template <typename T> static void StringifyImpl(T &Str, char Quote) {
  const auto IsLineBreak = [](char C) { return C == '\n' || C == '\r'; };
  const auto NeedsEscape = [&](char C) {
    return C == '\\' || C == Quote || IsLineBreak(C);
  };

  // Fast path: nothing to rewrite.
  if (std::none_of(Str.begin(), Str.end(), NeedsEscape))
    return;

  const std::string Src(Str.begin(), Str.end());
  Str.clear();
  Str.reserve(Src.size() + 1);

  for (std::string::size_type I = 0, E = Src.size(); I != E; ++I) {
    const char C = Src[I];
    if (C == '\\' || C == Quote) {
      Str.push_back('\\');
      Str.push_back(C);
    } else if (IsLineBreak(C)) {
      Str.push_back('\\');
      Str.push_back('n');
      // A line break immediately followed by the *other* line-break character
      // is one logical newline; consume the second character as well.
      if (I + 1 != E && IsLineBreak(Src[I + 1]) && Src[I + 1] != C)
        ++I;
    } else {
      Str.push_back(C);
    }
  }
}
260
261std::string Lexer::Stringify(StringRef Str, bool Charify) {
262 std::string Result = std::string(Str);
263 char Quote = Charify ? '\'' : '"';
264 StringifyImpl(Result, Quote);
265 return Result;
266}
267
268void Lexer::Stringify(SmallVectorImpl<char> &Str) { StringifyImpl(Str, '"'); }
269
270//===----------------------------------------------------------------------===//
271// Token Spelling
272//===----------------------------------------------------------------------===//
273
274/// Slow case of getSpelling. Extract the characters comprising the
275/// spelling of this token from the provided input buffer.
276static size_t getSpellingSlow(const Token &Tok, const char *BufPtr,
277 const LangOptions &LangOpts, char *Spelling) {
278 assert(Tok.needsCleaning() && "getSpellingSlow called on simple token")(static_cast <bool> (Tok.needsCleaning() && "getSpellingSlow called on simple token"
) ? void (0) : __assert_fail ("Tok.needsCleaning() && \"getSpellingSlow called on simple token\""
, "clang/lib/Lex/Lexer.cpp", 278, __extension__ __PRETTY_FUNCTION__
))
;
279
280 size_t Length = 0;
281 const char *BufEnd = BufPtr + Tok.getLength();
282
283 if (tok::isStringLiteral(Tok.getKind())) {
284 // Munch the encoding-prefix and opening double-quote.
285 while (BufPtr < BufEnd) {
286 unsigned Size;
287 Spelling[Length++] = Lexer::getCharAndSizeNoWarn(BufPtr, Size, LangOpts);
288 BufPtr += Size;
289
290 if (Spelling[Length - 1] == '"')
291 break;
292 }
293
294 // Raw string literals need special handling; trigraph expansion and line
295 // splicing do not occur within their d-char-sequence nor within their
296 // r-char-sequence.
297 if (Length >= 2 &&
298 Spelling[Length - 2] == 'R' && Spelling[Length - 1] == '"') {
299 // Search backwards from the end of the token to find the matching closing
300 // quote.
301 const char *RawEnd = BufEnd;
302 do --RawEnd; while (*RawEnd != '"');
303 size_t RawLength = RawEnd - BufPtr + 1;
304
305 // Everything between the quotes is included verbatim in the spelling.
306 memcpy(Spelling + Length, BufPtr, RawLength);
307 Length += RawLength;
308 BufPtr += RawLength;
309
310 // The rest of the token is lexed normally.
311 }
312 }
313
314 while (BufPtr < BufEnd) {
315 unsigned Size;
316 Spelling[Length++] = Lexer::getCharAndSizeNoWarn(BufPtr, Size, LangOpts);
317 BufPtr += Size;
318 }
319
320 assert(Length < Tok.getLength() &&(static_cast <bool> (Length < Tok.getLength() &&
"NeedsCleaning flag set on token that didn't need cleaning!"
) ? void (0) : __assert_fail ("Length < Tok.getLength() && \"NeedsCleaning flag set on token that didn't need cleaning!\""
, "clang/lib/Lex/Lexer.cpp", 321, __extension__ __PRETTY_FUNCTION__
))
321 "NeedsCleaning flag set on token that didn't need cleaning!")(static_cast <bool> (Length < Tok.getLength() &&
"NeedsCleaning flag set on token that didn't need cleaning!"
) ? void (0) : __assert_fail ("Length < Tok.getLength() && \"NeedsCleaning flag set on token that didn't need cleaning!\""
, "clang/lib/Lex/Lexer.cpp", 321, __extension__ __PRETTY_FUNCTION__
))
;
322 return Length;
323}
324
325/// getSpelling() - Return the 'spelling' of this token. The spelling of a
326/// token are the characters used to represent the token in the source file
327/// after trigraph expansion and escaped-newline folding. In particular, this
328/// wants to get the true, uncanonicalized, spelling of things like digraphs
329/// UCNs, etc.
330StringRef Lexer::getSpelling(SourceLocation loc,
331 SmallVectorImpl<char> &buffer,
332 const SourceManager &SM,
333 const LangOptions &options,
334 bool *invalid) {
335 // Break down the source location.
336 std::pair<FileID, unsigned> locInfo = SM.getDecomposedLoc(loc);
337
338 // Try to the load the file buffer.
339 bool invalidTemp = false;
340 StringRef file = SM.getBufferData(locInfo.first, &invalidTemp);
341 if (invalidTemp) {
342 if (invalid) *invalid = true;
343 return {};
344 }
345
346 const char *tokenBegin = file.data() + locInfo.second;
347
348 // Lex from the start of the given location.
349 Lexer lexer(SM.getLocForStartOfFile(locInfo.first), options,
350 file.begin(), tokenBegin, file.end());
351 Token token;
352 lexer.LexFromRawLexer(token);
353
354 unsigned length = token.getLength();
355
356 // Common case: no need for cleaning.
357 if (!token.needsCleaning())
358 return StringRef(tokenBegin, length);
359
360 // Hard case, we need to relex the characters into the string.
361 buffer.resize(length);
362 buffer.resize(getSpellingSlow(token, tokenBegin, options, buffer.data()));
363 return StringRef(buffer.data(), buffer.size());
364}
365
366/// getSpelling() - Return the 'spelling' of this token. The spelling of a
367/// token are the characters used to represent the token in the source file
368/// after trigraph expansion and escaped-newline folding. In particular, this
369/// wants to get the true, uncanonicalized, spelling of things like digraphs
370/// UCNs, etc.
371std::string Lexer::getSpelling(const Token &Tok, const SourceManager &SourceMgr,
372 const LangOptions &LangOpts, bool *Invalid) {
373 assert((int)Tok.getLength() >= 0 && "Token character range is bogus!")(static_cast <bool> ((int)Tok.getLength() >= 0 &&
"Token character range is bogus!") ? void (0) : __assert_fail
("(int)Tok.getLength() >= 0 && \"Token character range is bogus!\""
, "clang/lib/Lex/Lexer.cpp", 373, __extension__ __PRETTY_FUNCTION__
))
;
374
375 bool CharDataInvalid = false;
376 const char *TokStart = SourceMgr.getCharacterData(Tok.getLocation(),
377 &CharDataInvalid);
378 if (Invalid)
379 *Invalid = CharDataInvalid;
380 if (CharDataInvalid)
381 return {};
382
383 // If this token contains nothing interesting, return it directly.
384 if (!Tok.needsCleaning())
385 return std::string(TokStart, TokStart + Tok.getLength());
386
387 std::string Result;
388 Result.resize(Tok.getLength());
389 Result.resize(getSpellingSlow(Tok, TokStart, LangOpts, &*Result.begin()));
390 return Result;
391}
392
393/// getSpelling - This method is used to get the spelling of a token into a
394/// preallocated buffer, instead of as an std::string. The caller is required
395/// to allocate enough space for the token, which is guaranteed to be at least
396/// Tok.getLength() bytes long. The actual length of the token is returned.
397///
398/// Note that this method may do two possible things: it may either fill in
399/// the buffer specified with characters, or it may *change the input pointer*
400/// to point to a constant buffer with the data already in it (avoiding a
401/// copy). The caller is not allowed to modify the returned buffer pointer
402/// if an internal buffer is returned.
403unsigned Lexer::getSpelling(const Token &Tok, const char *&Buffer,
404 const SourceManager &SourceMgr,
405 const LangOptions &LangOpts, bool *Invalid) {
406 assert((int)Tok.getLength() >= 0 && "Token character range is bogus!")(static_cast <bool> ((int)Tok.getLength() >= 0 &&
"Token character range is bogus!") ? void (0) : __assert_fail
("(int)Tok.getLength() >= 0 && \"Token character range is bogus!\""
, "clang/lib/Lex/Lexer.cpp", 406, __extension__ __PRETTY_FUNCTION__
))
;
407
408 const char *TokStart = nullptr;
409 // NOTE: this has to be checked *before* testing for an IdentifierInfo.
410 if (Tok.is(tok::raw_identifier))
411 TokStart = Tok.getRawIdentifier().data();
412 else if (!Tok.hasUCN()) {
413 if (const IdentifierInfo *II = Tok.getIdentifierInfo()) {
414 // Just return the string from the identifier table, which is very quick.
415 Buffer = II->getNameStart();
416 return II->getLength();
417 }
418 }
419
420 // NOTE: this can be checked even after testing for an IdentifierInfo.
421 if (Tok.isLiteral())
422 TokStart = Tok.getLiteralData();
423
424 if (!TokStart) {
425 // Compute the start of the token in the input lexer buffer.
426 bool CharDataInvalid = false;
427 TokStart = SourceMgr.getCharacterData(Tok.getLocation(), &CharDataInvalid);
428 if (Invalid)
429 *Invalid = CharDataInvalid;
430 if (CharDataInvalid) {
431 Buffer = "";
432 return 0;
433 }
434 }
435
436 // If this token contains nothing interesting, return it directly.
437 if (!Tok.needsCleaning()) {
438 Buffer = TokStart;
439 return Tok.getLength();
440 }
441
442 // Otherwise, hard case, relex the characters into the string.
443 return getSpellingSlow(Tok, TokStart, LangOpts, const_cast<char*>(Buffer));
444}
445
446/// MeasureTokenLength - Relex the token at the specified location and return
447/// its length in bytes in the input file. If the token needs cleaning (e.g.
448/// includes a trigraph or an escaped newline) then this count includes bytes
449/// that are part of that.
450unsigned Lexer::MeasureTokenLength(SourceLocation Loc,
451 const SourceManager &SM,
452 const LangOptions &LangOpts) {
453 Token TheTok;
454 if (getRawToken(Loc, TheTok, SM, LangOpts))
455 return 0;
456 return TheTok.getLength();
457}
458
459/// Relex the token at the specified location.
460/// \returns true if there was a failure, false on success.
461bool Lexer::getRawToken(SourceLocation Loc, Token &Result,
462 const SourceManager &SM,
463 const LangOptions &LangOpts,
464 bool IgnoreWhiteSpace) {
465 // TODO: this could be special cased for common tokens like identifiers, ')',
466 // etc to make this faster, if it mattered. Just look at StrData[0] to handle
467 // all obviously single-char tokens. This could use
468 // Lexer::isObviouslySimpleCharacter for example to handle identifiers or
469 // something.
470
471 // If this comes from a macro expansion, we really do want the macro name, not
472 // the token this macro expanded to.
473 Loc = SM.getExpansionLoc(Loc);
474 std::pair<FileID, unsigned> LocInfo = SM.getDecomposedLoc(Loc);
475 bool Invalid = false;
476 StringRef Buffer = SM.getBufferData(LocInfo.first, &Invalid);
477 if (Invalid)
478 return true;
479
480 const char *StrData = Buffer.data()+LocInfo.second;
481
482 if (!IgnoreWhiteSpace && isWhitespace(StrData[0]))
483 return true;
484
485 // Create a lexer starting at the beginning of this token.
486 Lexer TheLexer(SM.getLocForStartOfFile(LocInfo.first), LangOpts,
487 Buffer.begin(), StrData, Buffer.end());
488 TheLexer.SetCommentRetentionState(true);
489 TheLexer.LexFromRawLexer(Result);
490 return false;
491}
492
493/// Returns the pointer that points to the beginning of line that contains
494/// the given offset, or null if the offset if invalid.
495static const char *findBeginningOfLine(StringRef Buffer, unsigned Offset) {
496 const char *BufStart = Buffer.data();
497 if (Offset >= Buffer.size())
498 return nullptr;
499
500 const char *LexStart = BufStart + Offset;
501 for (; LexStart != BufStart; --LexStart) {
502 if (isVerticalWhitespace(LexStart[0]) &&
503 !Lexer::isNewLineEscaped(BufStart, LexStart)) {
504 // LexStart should point at first character of logical line.
505 ++LexStart;
506 break;
507 }
508 }
509 return LexStart;
510}
511
512static SourceLocation getBeginningOfFileToken(SourceLocation Loc,
513 const SourceManager &SM,
514 const LangOptions &LangOpts) {
515 assert(Loc.isFileID())(static_cast <bool> (Loc.isFileID()) ? void (0) : __assert_fail
("Loc.isFileID()", "clang/lib/Lex/Lexer.cpp", 515, __extension__
__PRETTY_FUNCTION__))
;
516 std::pair<FileID, unsigned> LocInfo = SM.getDecomposedLoc(Loc);
517 if (LocInfo.first.isInvalid())
518 return Loc;
519
520 bool Invalid = false;
521 StringRef Buffer = SM.getBufferData(LocInfo.first, &Invalid);
522 if (Invalid)
523 return Loc;
524
525 // Back up from the current location until we hit the beginning of a line
526 // (or the buffer). We'll relex from that point.
527 const char *StrData = Buffer.data() + LocInfo.second;
528 const char *LexStart = findBeginningOfLine(Buffer, LocInfo.second);
529 if (!LexStart || LexStart == StrData)
530 return Loc;
531
532 // Create a lexer starting at the beginning of this token.
533 SourceLocation LexerStartLoc = Loc.getLocWithOffset(-LocInfo.second);
534 Lexer TheLexer(LexerStartLoc, LangOpts, Buffer.data(), LexStart,
535 Buffer.end());
536 TheLexer.SetCommentRetentionState(true);
537
538 // Lex tokens until we find the token that contains the source location.
539 Token TheTok;
540 do {
541 TheLexer.LexFromRawLexer(TheTok);
542
543 if (TheLexer.getBufferLocation() > StrData) {
544 // Lexing this token has taken the lexer past the source location we're
545 // looking for. If the current token encompasses our source location,
546 // return the beginning of that token.
547 if (TheLexer.getBufferLocation() - TheTok.getLength() <= StrData)
548 return TheTok.getLocation();
549
550 // We ended up skipping over the source location entirely, which means
551 // that it points into whitespace. We're done here.
552 break;
553 }
554 } while (TheTok.getKind() != tok::eof);
555
556 // We've passed our source location; just return the original source location.
557 return Loc;
558}
559
560SourceLocation Lexer::GetBeginningOfToken(SourceLocation Loc,
561 const SourceManager &SM,
562 const LangOptions &LangOpts) {
563 if (Loc.isFileID())
564 return getBeginningOfFileToken(Loc, SM, LangOpts);
565
566 if (!SM.isMacroArgExpansion(Loc))
567 return Loc;
568
569 SourceLocation FileLoc = SM.getSpellingLoc(Loc);
570 SourceLocation BeginFileLoc = getBeginningOfFileToken(FileLoc, SM, LangOpts);
571 std::pair<FileID, unsigned> FileLocInfo = SM.getDecomposedLoc(FileLoc);
572 std::pair<FileID, unsigned> BeginFileLocInfo =
573 SM.getDecomposedLoc(BeginFileLoc);
574 assert(FileLocInfo.first == BeginFileLocInfo.first &&(static_cast <bool> (FileLocInfo.first == BeginFileLocInfo
.first && FileLocInfo.second >= BeginFileLocInfo.second
) ? void (0) : __assert_fail ("FileLocInfo.first == BeginFileLocInfo.first && FileLocInfo.second >= BeginFileLocInfo.second"
, "clang/lib/Lex/Lexer.cpp", 575, __extension__ __PRETTY_FUNCTION__
))
575 FileLocInfo.second >= BeginFileLocInfo.second)(static_cast <bool> (FileLocInfo.first == BeginFileLocInfo
.first && FileLocInfo.second >= BeginFileLocInfo.second
) ? void (0) : __assert_fail ("FileLocInfo.first == BeginFileLocInfo.first && FileLocInfo.second >= BeginFileLocInfo.second"
, "clang/lib/Lex/Lexer.cpp", 575, __extension__ __PRETTY_FUNCTION__
))
;
576 return Loc.getLocWithOffset(BeginFileLocInfo.second - FileLocInfo.second);
577}
578
namespace {

/// How Lexer::ComputePreamble treats a preprocessor directive it encounters.
enum PreambleDirectiveKind {
  /// A recognized directive; it is skipped and preamble scanning goes on.
  PDK_Skipped,

  /// An unrecognized directive; the preamble ends at its '#'.
  PDK_Unknown
};

} // namespace
587
588PreambleBounds Lexer::ComputePreamble(StringRef Buffer,
589 const LangOptions &LangOpts,
590 unsigned MaxLines) {
591 // Create a lexer starting at the beginning of the file. Note that we use a
592 // "fake" file source location at offset 1 so that the lexer will track our
593 // position within the file.
594 const SourceLocation::UIntTy StartOffset = 1;
595 SourceLocation FileLoc = SourceLocation::getFromRawEncoding(StartOffset);
596 Lexer TheLexer(FileLoc, LangOpts, Buffer.begin(), Buffer.begin(),
597 Buffer.end());
598 TheLexer.SetCommentRetentionState(true);
599
600 bool InPreprocessorDirective = false;
601 Token TheTok;
602 SourceLocation ActiveCommentLoc;
603
604 unsigned MaxLineOffset = 0;
605 if (MaxLines) {
606 const char *CurPtr = Buffer.begin();
607 unsigned CurLine = 0;
608 while (CurPtr != Buffer.end()) {
609 char ch = *CurPtr++;
610 if (ch == '\n') {
611 ++CurLine;
612 if (CurLine == MaxLines)
613 break;
614 }
615 }
616 if (CurPtr != Buffer.end())
617 MaxLineOffset = CurPtr - Buffer.begin();
618 }
619
620 do {
621 TheLexer.LexFromRawLexer(TheTok);
622
623 if (InPreprocessorDirective) {
624 // If we've hit the end of the file, we're done.
625 if (TheTok.getKind() == tok::eof) {
626 break;
627 }
628
629 // If we haven't hit the end of the preprocessor directive, skip this
630 // token.
631 if (!TheTok.isAtStartOfLine())
632 continue;
633
634 // We've passed the end of the preprocessor directive, and will look
635 // at this token again below.
636 InPreprocessorDirective = false;
637 }
638
639 // Keep track of the # of lines in the preamble.
640 if (TheTok.isAtStartOfLine()) {
641 unsigned TokOffset = TheTok.getLocation().getRawEncoding() - StartOffset;
642
643 // If we were asked to limit the number of lines in the preamble,
644 // and we're about to exceed that limit, we're done.
645 if (MaxLineOffset && TokOffset >= MaxLineOffset)
646 break;
647 }
648
649 // Comments are okay; skip over them.
650 if (TheTok.getKind() == tok::comment) {
651 if (ActiveCommentLoc.isInvalid())
652 ActiveCommentLoc = TheTok.getLocation();
653 continue;
654 }
655
656 if (TheTok.isAtStartOfLine() && TheTok.getKind() == tok::hash) {
657 // This is the start of a preprocessor directive.
658 Token HashTok = TheTok;
659 InPreprocessorDirective = true;
660 ActiveCommentLoc = SourceLocation();
661
662 // Figure out which directive this is. Since we're lexing raw tokens,
663 // we don't have an identifier table available. Instead, just look at
664 // the raw identifier to recognize and categorize preprocessor directives.
665 TheLexer.LexFromRawLexer(TheTok);
666 if (TheTok.getKind() == tok::raw_identifier && !TheTok.needsCleaning()) {
667 StringRef Keyword = TheTok.getRawIdentifier();
668 PreambleDirectiveKind PDK
669 = llvm::StringSwitch<PreambleDirectiveKind>(Keyword)
670 .Case("include", PDK_Skipped)
671 .Case("__include_macros", PDK_Skipped)
672 .Case("define", PDK_Skipped)
673 .Case("undef", PDK_Skipped)
674 .Case("line", PDK_Skipped)
675 .Case("error", PDK_Skipped)
676 .Case("pragma", PDK_Skipped)
677 .Case("import", PDK_Skipped)
678 .Case("include_next", PDK_Skipped)
679 .Case("warning", PDK_Skipped)
680 .Case("ident", PDK_Skipped)
681 .Case("sccs", PDK_Skipped)
682 .Case("assert", PDK_Skipped)
683 .Case("unassert", PDK_Skipped)
684 .Case("if", PDK_Skipped)
685 .Case("ifdef", PDK_Skipped)
686 .Case("ifndef", PDK_Skipped)
687 .Case("elif", PDK_Skipped)
688 .Case("elifdef", PDK_Skipped)
689 .Case("elifndef", PDK_Skipped)
690 .Case("else", PDK_Skipped)
691 .Case("endif", PDK_Skipped)
692 .Default(PDK_Unknown);
693
694 switch (PDK) {
695 case PDK_Skipped:
696 continue;
697
698 case PDK_Unknown:
699 // We don't know what this directive is; stop at the '#'.
700 break;
701 }
702 }
703
704 // We only end up here if we didn't recognize the preprocessor
705 // directive or it was one that can't occur in the preamble at this
706 // point. Roll back the current token to the location of the '#'.
707 TheTok = HashTok;
708 }
709
710 // We hit a token that we don't recognize as being in the
711 // "preprocessing only" part of the file, so we're no longer in
712 // the preamble.
713 break;
714 } while (true);
715
716 SourceLocation End;
717 if (ActiveCommentLoc.isValid())
718 End = ActiveCommentLoc; // don't truncate a decl comment.
719 else
720 End = TheTok.getLocation();
721
722 return PreambleBounds(End.getRawEncoding() - FileLoc.getRawEncoding(),
723 TheTok.isAtStartOfLine());
724}
725
726unsigned Lexer::getTokenPrefixLength(SourceLocation TokStart, unsigned CharNo,
727 const SourceManager &SM,
728 const LangOptions &LangOpts) {
729 // Figure out how many physical characters away the specified expansion
730 // character is. This needs to take into consideration newlines and
731 // trigraphs.
732 bool Invalid = false;
733 const char *TokPtr = SM.getCharacterData(TokStart, &Invalid);
734
735 // If they request the first char of the token, we're trivially done.
736 if (Invalid || (CharNo == 0 && Lexer::isObviouslySimpleCharacter(*TokPtr)))
737 return 0;
738
739 unsigned PhysOffset = 0;
740
741 // The usual case is that tokens don't contain anything interesting. Skip
742 // over the uninteresting characters. If a token only consists of simple
743 // chars, this method is extremely fast.
744 while (Lexer::isObviouslySimpleCharacter(*TokPtr)) {
745 if (CharNo == 0)
746 return PhysOffset;
747 ++TokPtr;
748 --CharNo;
749 ++PhysOffset;
750 }
751
752 // If we have a character that may be a trigraph or escaped newline, use a
753 // lexer to parse it correctly.
754 for (; CharNo; --CharNo) {
755 unsigned Size;
756 Lexer::getCharAndSizeNoWarn(TokPtr, Size, LangOpts);
757 TokPtr += Size;
758 PhysOffset += Size;
759 }
760
761 // Final detail: if we end up on an escaped newline, we want to return the
762 // location of the actual byte of the token. For example foo\<newline>bar
763 // advanced by 3 should return the location of b, not of \\. One compounding
764 // detail of this is that the escape may be made by a trigraph.
765 if (!Lexer::isObviouslySimpleCharacter(*TokPtr))
766 PhysOffset += Lexer::SkipEscapedNewLines(TokPtr)-TokPtr;
767
768 return PhysOffset;
769}
770
771/// Computes the source location just past the end of the
772/// token at this source location.
773///
774/// This routine can be used to produce a source location that
775/// points just past the end of the token referenced by \p Loc, and
776/// is generally used when a diagnostic needs to point just after a
777/// token where it expected something different that it received. If
778/// the returned source location would not be meaningful (e.g., if
779/// it points into a macro), this routine returns an invalid
780/// source location.
781///
782/// \param Offset an offset from the end of the token, where the source
783/// location should refer to. The default offset (0) produces a source
784/// location pointing just past the end of the token; an offset of 1 produces
785/// a source location pointing to the last character in the token, etc.
786SourceLocation Lexer::getLocForEndOfToken(SourceLocation Loc, unsigned Offset,
787 const SourceManager &SM,
788 const LangOptions &LangOpts) {
789 if (Loc.isInvalid())
790 return {};
791
792 if (Loc.isMacroID()) {
793 if (Offset > 0 || !isAtEndOfMacroExpansion(Loc, SM, LangOpts, &Loc))
794 return {}; // Points inside the macro expansion.
795 }
796
797 unsigned Len = Lexer::MeasureTokenLength(Loc, SM, LangOpts);
798 if (Len > Offset)
799 Len = Len - Offset;
800 else
801 return Loc;
802
803 return Loc.getLocWithOffset(Len);
804}
805
806/// Returns true if the given MacroID location points at the first
807/// token of the macro expansion.
808bool Lexer::isAtStartOfMacroExpansion(SourceLocation loc,
809 const SourceManager &SM,
810 const LangOptions &LangOpts,
811 SourceLocation *MacroBegin) {
812 assert(loc.isValid() && loc.isMacroID() && "Expected a valid macro loc")(static_cast <bool> (loc.isValid() && loc.isMacroID
() && "Expected a valid macro loc") ? void (0) : __assert_fail
("loc.isValid() && loc.isMacroID() && \"Expected a valid macro loc\""
, "clang/lib/Lex/Lexer.cpp", 812, __extension__ __PRETTY_FUNCTION__
))
;
813
814 SourceLocation expansionLoc;
815 if (!SM.isAtStartOfImmediateMacroExpansion(loc, &expansionLoc))
816 return false;
817
818 if (expansionLoc.isFileID()) {
819 // No other macro expansions, this is the first.
820 if (MacroBegin)
821 *MacroBegin = expansionLoc;
822 return true;
823 }
824
825 return isAtStartOfMacroExpansion(expansionLoc, SM, LangOpts, MacroBegin);
826}
827
828/// Returns true if the given MacroID location points at the last
829/// token of the macro expansion.
830bool Lexer::isAtEndOfMacroExpansion(SourceLocation loc,
831 const SourceManager &SM,
832 const LangOptions &LangOpts,
833 SourceLocation *MacroEnd) {
834 assert(loc.isValid() && loc.isMacroID() && "Expected a valid macro loc")(static_cast <bool> (loc.isValid() && loc.isMacroID
() && "Expected a valid macro loc") ? void (0) : __assert_fail
("loc.isValid() && loc.isMacroID() && \"Expected a valid macro loc\""
, "clang/lib/Lex/Lexer.cpp", 834, __extension__ __PRETTY_FUNCTION__
))
;
835
836 SourceLocation spellLoc = SM.getSpellingLoc(loc);
837 unsigned tokLen = MeasureTokenLength(spellLoc, SM, LangOpts);
838 if (tokLen == 0)
839 return false;
840
841 SourceLocation afterLoc = loc.getLocWithOffset(tokLen);
842 SourceLocation expansionLoc;
843 if (!SM.isAtEndOfImmediateMacroExpansion(afterLoc, &expansionLoc))
844 return false;
845
846 if (expansionLoc.isFileID()) {
847 // No other macro expansions.
848 if (MacroEnd)
849 *MacroEnd = expansionLoc;
850 return true;
851 }
852
853 return isAtEndOfMacroExpansion(expansionLoc, SM, LangOpts, MacroEnd);
854}
855
856static CharSourceRange makeRangeFromFileLocs(CharSourceRange Range,
857 const SourceManager &SM,
858 const LangOptions &LangOpts) {
859 SourceLocation Begin = Range.getBegin();
860 SourceLocation End = Range.getEnd();
861 assert(Begin.isFileID() && End.isFileID())(static_cast <bool> (Begin.isFileID() && End.isFileID
()) ? void (0) : __assert_fail ("Begin.isFileID() && End.isFileID()"
, "clang/lib/Lex/Lexer.cpp", 861, __extension__ __PRETTY_FUNCTION__
))
;
862 if (Range.isTokenRange()) {
863 End = Lexer::getLocForEndOfToken(End, 0, SM,LangOpts);
864 if (End.isInvalid())
865 return {};
866 }
867
868 // Break down the source locations.
869 FileID FID;
870 unsigned BeginOffs;
871 std::tie(FID, BeginOffs) = SM.getDecomposedLoc(Begin);
872 if (FID.isInvalid())
873 return {};
874
875 unsigned EndOffs;
876 if (!SM.isInFileID(End, FID, &EndOffs) ||
877 BeginOffs > EndOffs)
878 return {};
879
880 return CharSourceRange::getCharRange(Begin, End);
881}
882
883// Assumes that `Loc` is in an expansion.
884static bool isInExpansionTokenRange(const SourceLocation Loc,
885 const SourceManager &SM) {
886 return SM.getSLocEntry(SM.getFileID(Loc))
887 .getExpansion()
888 .isExpansionTokenRange();
889}
890
891CharSourceRange Lexer::makeFileCharRange(CharSourceRange Range,
892 const SourceManager &SM,
893 const LangOptions &LangOpts) {
894 SourceLocation Begin = Range.getBegin();
895 SourceLocation End = Range.getEnd();
896 if (Begin.isInvalid() || End.isInvalid())
897 return {};
898
899 if (Begin.isFileID() && End.isFileID())
900 return makeRangeFromFileLocs(Range, SM, LangOpts);
901
902 if (Begin.isMacroID() && End.isFileID()) {
903 if (!isAtStartOfMacroExpansion(Begin, SM, LangOpts, &Begin))
904 return {};
905 Range.setBegin(Begin);
906 return makeRangeFromFileLocs(Range, SM, LangOpts);
907 }
908
909 if (Begin.isFileID() && End.isMacroID()) {
910 if (Range.isTokenRange()) {
911 if (!isAtEndOfMacroExpansion(End, SM, LangOpts, &End))
912 return {};
913 // Use the *original* end, not the expanded one in `End`.
914 Range.setTokenRange(isInExpansionTokenRange(Range.getEnd(), SM));
915 } else if (!isAtStartOfMacroExpansion(End, SM, LangOpts, &End))
916 return {};
917 Range.setEnd(End);
918 return makeRangeFromFileLocs(Range, SM, LangOpts);
919 }
920
921 assert(Begin.isMacroID() && End.isMacroID())(static_cast <bool> (Begin.isMacroID() && End.isMacroID
()) ? void (0) : __assert_fail ("Begin.isMacroID() && End.isMacroID()"
, "clang/lib/Lex/Lexer.cpp", 921, __extension__ __PRETTY_FUNCTION__
))
;
922 SourceLocation MacroBegin, MacroEnd;
923 if (isAtStartOfMacroExpansion(Begin, SM, LangOpts, &MacroBegin) &&
924 ((Range.isTokenRange() && isAtEndOfMacroExpansion(End, SM, LangOpts,
925 &MacroEnd)) ||
926 (Range.isCharRange() && isAtStartOfMacroExpansion(End, SM, LangOpts,
927 &MacroEnd)))) {
928 Range.setBegin(MacroBegin);
929 Range.setEnd(MacroEnd);
930 // Use the *original* `End`, not the expanded one in `MacroEnd`.
931 if (Range.isTokenRange())
932 Range.setTokenRange(isInExpansionTokenRange(End, SM));
933 return makeRangeFromFileLocs(Range, SM, LangOpts);
934 }
935
936 bool Invalid = false;
937 const SrcMgr::SLocEntry &BeginEntry = SM.getSLocEntry(SM.getFileID(Begin),
938 &Invalid);
939 if (Invalid)
940 return {};
941
942 if (BeginEntry.getExpansion().isMacroArgExpansion()) {
943 const SrcMgr::SLocEntry &EndEntry = SM.getSLocEntry(SM.getFileID(End),
944 &Invalid);
945 if (Invalid)
946 return {};
947
948 if (EndEntry.getExpansion().isMacroArgExpansion() &&
949 BeginEntry.getExpansion().getExpansionLocStart() ==
950 EndEntry.getExpansion().getExpansionLocStart()) {
951 Range.setBegin(SM.getImmediateSpellingLoc(Begin));
952 Range.setEnd(SM.getImmediateSpellingLoc(End));
953 return makeFileCharRange(Range, SM, LangOpts);
954 }
955 }
956
957 return {};
958}
959
960StringRef Lexer::getSourceText(CharSourceRange Range,
961 const SourceManager &SM,
962 const LangOptions &LangOpts,
963 bool *Invalid) {
964 Range = makeFileCharRange(Range, SM, LangOpts);
965 if (Range.isInvalid()) {
966 if (Invalid) *Invalid = true;
967 return {};
968 }
969
970 // Break down the source location.
971 std::pair<FileID, unsigned> beginInfo = SM.getDecomposedLoc(Range.getBegin());
972 if (beginInfo.first.isInvalid()) {
973 if (Invalid) *Invalid = true;
974 return {};
975 }
976
977 unsigned EndOffs;
978 if (!SM.isInFileID(Range.getEnd(), beginInfo.first, &EndOffs) ||
979 beginInfo.second > EndOffs) {
980 if (Invalid) *Invalid = true;
981 return {};
982 }
983
984 // Try to the load the file buffer.
985 bool invalidTemp = false;
986 StringRef file = SM.getBufferData(beginInfo.first, &invalidTemp);
987 if (invalidTemp) {
988 if (Invalid) *Invalid = true;
989 return {};
990 }
991
992 if (Invalid) *Invalid = false;
993 return file.substr(beginInfo.second, EndOffs - beginInfo.second);
994}
995
996StringRef Lexer::getImmediateMacroName(SourceLocation Loc,
997 const SourceManager &SM,
998 const LangOptions &LangOpts) {
999 assert(Loc.isMacroID() && "Only reasonable to call this on macros")(static_cast <bool> (Loc.isMacroID() && "Only reasonable to call this on macros"
) ? void (0) : __assert_fail ("Loc.isMacroID() && \"Only reasonable to call this on macros\""
, "clang/lib/Lex/Lexer.cpp", 999, __extension__ __PRETTY_FUNCTION__
))
;
1000
1001 // Find the location of the immediate macro expansion.
1002 while (true) {
1003 FileID FID = SM.getFileID(Loc);
1004 const SrcMgr::SLocEntry *E = &SM.getSLocEntry(FID);
1005 const SrcMgr::ExpansionInfo &Expansion = E->getExpansion();
1006 Loc = Expansion.getExpansionLocStart();
1007 if (!Expansion.isMacroArgExpansion())
1008 break;
1009
1010 // For macro arguments we need to check that the argument did not come
1011 // from an inner macro, e.g: "MAC1( MAC2(foo) )"
1012
1013 // Loc points to the argument id of the macro definition, move to the
1014 // macro expansion.
1015 Loc = SM.getImmediateExpansionRange(Loc).getBegin();
1016 SourceLocation SpellLoc = Expansion.getSpellingLoc();
1017 if (SpellLoc.isFileID())
1018 break; // No inner macro.
1019
1020 // If spelling location resides in the same FileID as macro expansion
1021 // location, it means there is no inner macro.
1022 FileID MacroFID = SM.getFileID(Loc);
1023 if (SM.isInFileID(SpellLoc, MacroFID))
1024 break;
1025
1026 // Argument came from inner macro.
1027 Loc = SpellLoc;
1028 }
1029
1030 // Find the spelling location of the start of the non-argument expansion
1031 // range. This is where the macro name was spelled in order to begin
1032 // expanding this macro.
1033 Loc = SM.getSpellingLoc(Loc);
1034
1035 // Dig out the buffer where the macro name was spelled and the extents of the
1036 // name so that we can render it into the expansion note.
1037 std::pair<FileID, unsigned> ExpansionInfo = SM.getDecomposedLoc(Loc);
1038 unsigned MacroTokenLength = Lexer::MeasureTokenLength(Loc, SM, LangOpts);
1039 StringRef ExpansionBuffer = SM.getBufferData(ExpansionInfo.first);
1040 return ExpansionBuffer.substr(ExpansionInfo.second, MacroTokenLength);
1041}
1042
1043StringRef Lexer::getImmediateMacroNameForDiagnostics(
1044 SourceLocation Loc, const SourceManager &SM, const LangOptions &LangOpts) {
1045 assert(Loc.isMacroID() && "Only reasonable to call this on macros")(static_cast <bool> (Loc.isMacroID() && "Only reasonable to call this on macros"
) ? void (0) : __assert_fail ("Loc.isMacroID() && \"Only reasonable to call this on macros\""
, "clang/lib/Lex/Lexer.cpp", 1045, __extension__ __PRETTY_FUNCTION__
))
;
1046 // Walk past macro argument expansions.
1047 while (SM.isMacroArgExpansion(Loc))
1048 Loc = SM.getImmediateExpansionRange(Loc).getBegin();
1049
1050 // If the macro's spelling isn't FileID or from scratch space, then it's
1051 // actually a token paste or stringization (or similar) and not a macro at
1052 // all.
1053 SourceLocation SpellLoc = SM.getSpellingLoc(Loc);
1054 if (!SpellLoc.isFileID() || SM.isWrittenInScratchSpace(SpellLoc))
1055 return {};
1056
1057 // Find the spelling location of the start of the non-argument expansion
1058 // range. This is where the macro name was spelled in order to begin
1059 // expanding this macro.
1060 Loc = SM.getSpellingLoc(SM.getImmediateExpansionRange(Loc).getBegin());
1061
1062 // Dig out the buffer where the macro name was spelled and the extents of the
1063 // name so that we can render it into the expansion note.
1064 std::pair<FileID, unsigned> ExpansionInfo = SM.getDecomposedLoc(Loc);
1065 unsigned MacroTokenLength = Lexer::MeasureTokenLength(Loc, SM, LangOpts);
1066 StringRef ExpansionBuffer = SM.getBufferData(ExpansionInfo.first);
1067 return ExpansionBuffer.substr(ExpansionInfo.second, MacroTokenLength);
1068}
1069
1070bool Lexer::isAsciiIdentifierContinueChar(char c, const LangOptions &LangOpts) {
1071 return isAsciiIdentifierContinue(c, LangOpts.DollarIdents);
1072}
1073
1074bool Lexer::isNewLineEscaped(const char *BufferStart, const char *Str) {
1075 assert(isVerticalWhitespace(Str[0]))(static_cast <bool> (isVerticalWhitespace(Str[0])) ? void
(0) : __assert_fail ("isVerticalWhitespace(Str[0])", "clang/lib/Lex/Lexer.cpp"
, 1075, __extension__ __PRETTY_FUNCTION__))
;
1076 if (Str - 1 < BufferStart)
1077 return false;
1078
1079 if ((Str[0] == '\n' && Str[-1] == '\r') ||
1080 (Str[0] == '\r' && Str[-1] == '\n')) {
1081 if (Str - 2 < BufferStart)
1082 return false;
1083 --Str;
1084 }
1085 --Str;
1086
1087 // Rewind to first non-space character:
1088 while (Str > BufferStart && isHorizontalWhitespace(*Str))
1089 --Str;
1090
1091 return *Str == '\\';
1092}
1093
1094StringRef Lexer::getIndentationForLine(SourceLocation Loc,
1095 const SourceManager &SM) {
1096 if (Loc.isInvalid() || Loc.isMacroID())
1097 return {};
1098 std::pair<FileID, unsigned> LocInfo = SM.getDecomposedLoc(Loc);
1099 if (LocInfo.first.isInvalid())
1100 return {};
1101 bool Invalid = false;
1102 StringRef Buffer = SM.getBufferData(LocInfo.first, &Invalid);
1103 if (Invalid)
1104 return {};
1105 const char *Line = findBeginningOfLine(Buffer, LocInfo.second);
1106 if (!Line)
1107 return {};
1108 StringRef Rest = Buffer.substr(Line - Buffer.data());
1109 size_t NumWhitespaceChars = Rest.find_first_not_of(" \t");
1110 return NumWhitespaceChars == StringRef::npos
1111 ? ""
1112 : Rest.take_front(NumWhitespaceChars);
1113}
1114
1115//===----------------------------------------------------------------------===//
1116// Diagnostics forwarding code.
1117//===----------------------------------------------------------------------===//
1118
1119/// GetMappedTokenLoc - If lexing out of a 'mapped buffer', where we pretend the
1120/// lexer buffer was all expanded at a single point, perform the mapping.
1121/// This is currently only used for _Pragma implementation, so it is the slow
1122/// path of the hot getSourceLocation method. Do not allow it to be inlined.
1123static LLVM_ATTRIBUTE_NOINLINE__attribute__((noinline)) SourceLocation GetMappedTokenLoc(
1124 Preprocessor &PP, SourceLocation FileLoc, unsigned CharNo, unsigned TokLen);
1125static SourceLocation GetMappedTokenLoc(Preprocessor &PP,
1126 SourceLocation FileLoc,
1127 unsigned CharNo, unsigned TokLen) {
1128 assert(FileLoc.isMacroID() && "Must be a macro expansion")(static_cast <bool> (FileLoc.isMacroID() && "Must be a macro expansion"
) ? void (0) : __assert_fail ("FileLoc.isMacroID() && \"Must be a macro expansion\""
, "clang/lib/Lex/Lexer.cpp", 1128, __extension__ __PRETTY_FUNCTION__
))
;
1129
1130 // Otherwise, we're lexing "mapped tokens". This is used for things like
1131 // _Pragma handling. Combine the expansion location of FileLoc with the
1132 // spelling location.
1133 SourceManager &SM = PP.getSourceManager();
1134
1135 // Create a new SLoc which is expanded from Expansion(FileLoc) but whose
1136 // characters come from spelling(FileLoc)+Offset.
1137 SourceLocation SpellingLoc = SM.getSpellingLoc(FileLoc);
1138 SpellingLoc = SpellingLoc.getLocWithOffset(CharNo);
1139
1140 // Figure out the expansion loc range, which is the range covered by the
1141 // original _Pragma(...) sequence.
1142 CharSourceRange II = SM.getImmediateExpansionRange(FileLoc);
1143
1144 return SM.createExpansionLoc(SpellingLoc, II.getBegin(), II.getEnd(), TokLen);
1145}
1146
1147/// getSourceLocation - Return a source location identifier for the specified
1148/// offset in the current file.
1149SourceLocation Lexer::getSourceLocation(const char *Loc,
1150 unsigned TokLen) const {
1151 assert(Loc >= BufferStart && Loc <= BufferEnd &&(static_cast <bool> (Loc >= BufferStart && Loc
<= BufferEnd && "Location out of range for this buffer!"
) ? void (0) : __assert_fail ("Loc >= BufferStart && Loc <= BufferEnd && \"Location out of range for this buffer!\""
, "clang/lib/Lex/Lexer.cpp", 1152, __extension__ __PRETTY_FUNCTION__
))
1152 "Location out of range for this buffer!")(static_cast <bool> (Loc >= BufferStart && Loc
<= BufferEnd && "Location out of range for this buffer!"
) ? void (0) : __assert_fail ("Loc >= BufferStart && Loc <= BufferEnd && \"Location out of range for this buffer!\""
, "clang/lib/Lex/Lexer.cpp", 1152, __extension__ __PRETTY_FUNCTION__
))
;
1153
1154 // In the normal case, we're just lexing from a simple file buffer, return
1155 // the file id from FileLoc with the offset specified.
1156 unsigned CharNo = Loc-BufferStart;
1157 if (FileLoc.isFileID())
1158 return FileLoc.getLocWithOffset(CharNo);
1159
1160 // Otherwise, this is the _Pragma lexer case, which pretends that all of the
1161 // tokens are lexed from where the _Pragma was defined.
1162 assert(PP && "This doesn't work on raw lexers")(static_cast <bool> (PP && "This doesn't work on raw lexers"
) ? void (0) : __assert_fail ("PP && \"This doesn't work on raw lexers\""
, "clang/lib/Lex/Lexer.cpp", 1162, __extension__ __PRETTY_FUNCTION__
))
;
1163 return GetMappedTokenLoc(*PP, FileLoc, CharNo, TokLen);
1164}
1165
1166/// Diag - Forwarding function for diagnostics. This translate a source
1167/// position in the current buffer into a SourceLocation object for rendering.
1168DiagnosticBuilder Lexer::Diag(const char *Loc, unsigned DiagID) const {
1169 return PP->Diag(getSourceLocation(Loc), DiagID);
32
Called C++ object pointer is null
1170}
1171
1172//===----------------------------------------------------------------------===//
1173// Trigraph and Escaped Newline Handling Code.
1174//===----------------------------------------------------------------------===//
1175
/// GetTrigraphCharForLetter - Given the character that follows a "??" pair,
/// return the character the trigraph decodes to, or '\0' if the three
/// characters do not form a trigraph.
static char GetTrigraphCharForLetter(char Letter) {
  // Parallel tables: Suffixes[I] is the third trigraph character and
  // Decoded[I] is the character that "??" + Suffixes[I] stands for.
  static const char Suffixes[] = "=)(!'>/<-";
  static const char Decoded[] = "#][|^}\\{~";

  for (unsigned I = 0; Suffixes[I] != '\0'; ++I) {
    if (Suffixes[I] == Letter)
      return Decoded[I];
  }
  return 0;
}
1192
1193/// DecodeTrigraphChar - If the specified character is a legal trigraph when
1194/// prefixed with ??, emit a trigraph warning. If trigraphs are enabled,
1195/// return the result character. Finally, emit a warning about trigraph use
1196/// whether trigraphs are enabled or not.
1197static char DecodeTrigraphChar(const char *CP, Lexer *L, bool Trigraphs) {
1198 char Res = GetTrigraphCharForLetter(*CP);
1199 if (!Res)
1200 return Res;
1201
1202 if (!Trigraphs) {
1203 if (L && !L->isLexingRawMode())
1204 L->Diag(CP-2, diag::trigraph_ignored);
1205 return 0;
1206 }
1207
1208 if (L && !L->isLexingRawMode())
1209 L->Diag(CP-2, diag::trigraph_converted) << StringRef(&Res, 1);
1210 return Res;
1211}
1212
1213/// getEscapedNewLineSize - Return the size of the specified escaped newline,
1214/// or 0 if it is not an escaped newline. P[-1] is known to be a "\" or a
1215/// trigraph equivalent on entry to this function.
1216unsigned Lexer::getEscapedNewLineSize(const char *Ptr) {
1217 unsigned Size = 0;
1218 while (isWhitespace(Ptr[Size])) {
1219 ++Size;
1220
1221 if (Ptr[Size-1] != '\n' && Ptr[Size-1] != '\r')
1222 continue;
1223
1224 // If this is a \r\n or \n\r, skip the other half.
1225 if ((Ptr[Size] == '\r' || Ptr[Size] == '\n') &&
1226 Ptr[Size-1] != Ptr[Size])
1227 ++Size;
1228
1229 return Size;
1230 }
1231
1232 // Not an escaped newline, must be a \t or something else.
1233 return 0;
1234}
1235
1236/// SkipEscapedNewLines - If P points to an escaped newline (or a series of
1237/// them), skip over them and return the first non-escaped-newline found,
1238/// otherwise return P.
1239const char *Lexer::SkipEscapedNewLines(const char *P) {
1240 while (true) {
1241 const char *AfterEscape;
1242 if (*P == '\\') {
1243 AfterEscape = P+1;
1244 } else if (*P == '?') {
1245 // If not a trigraph for escape, bail out.
1246 if (P[1] != '?' || P[2] != '/')
1247 return P;
1248 // FIXME: Take LangOpts into account; the language might not
1249 // support trigraphs.
1250 AfterEscape = P+3;
1251 } else {
1252 return P;
1253 }
1254
1255 unsigned NewLineSize = Lexer::getEscapedNewLineSize(AfterEscape);
1256 if (NewLineSize == 0) return P;
1257 P = AfterEscape+NewLineSize;
1258 }
1259}
1260
1261std::optional<Token> Lexer::findNextToken(SourceLocation Loc,
1262 const SourceManager &SM,
1263 const LangOptions &LangOpts) {
1264 if (Loc.isMacroID()) {
1265 if (!Lexer::isAtEndOfMacroExpansion(Loc, SM, LangOpts, &Loc))
1266 return std::nullopt;
1267 }
1268 Loc = Lexer::getLocForEndOfToken(Loc, 0, SM, LangOpts);
1269
1270 // Break down the source location.
1271 std::pair<FileID, unsigned> LocInfo = SM.getDecomposedLoc(Loc);
1272
1273 // Try to load the file buffer.
1274 bool InvalidTemp = false;
1275 StringRef File = SM.getBufferData(LocInfo.first, &InvalidTemp);
1276 if (InvalidTemp)
1277 return std::nullopt;
1278
1279 const char *TokenBegin = File.data() + LocInfo.second;
1280
1281 // Lex from the start of the given location.
1282 Lexer lexer(SM.getLocForStartOfFile(LocInfo.first), LangOpts, File.begin(),
1283 TokenBegin, File.end());
1284 // Find the token.
1285 Token Tok;
1286 lexer.LexFromRawLexer(Tok);
1287 return Tok;
1288}
1289
1290/// Checks that the given token is the first token that occurs after the
1291/// given location (this excludes comments and whitespace). Returns the location
1292/// immediately after the specified token. If the token is not found or the
1293/// location is inside a macro, the returned source location will be invalid.
1294SourceLocation Lexer::findLocationAfterToken(
1295 SourceLocation Loc, tok::TokenKind TKind, const SourceManager &SM,
1296 const LangOptions &LangOpts, bool SkipTrailingWhitespaceAndNewLine) {
1297 std::optional<Token> Tok = findNextToken(Loc, SM, LangOpts);
1298 if (!Tok || Tok->isNot(TKind))
1299 return {};
1300 SourceLocation TokenLoc = Tok->getLocation();
1301
1302 // Calculate how much whitespace needs to be skipped if any.
1303 unsigned NumWhitespaceChars = 0;
1304 if (SkipTrailingWhitespaceAndNewLine) {
1305 const char *TokenEnd = SM.getCharacterData(TokenLoc) + Tok->getLength();
1306 unsigned char C = *TokenEnd;
1307 while (isHorizontalWhitespace(C)) {
1308 C = *(++TokenEnd);
1309 NumWhitespaceChars++;
1310 }
1311
1312 // Skip \r, \n, \r\n, or \n\r
1313 if (C == '\n' || C == '\r') {
1314 char PrevC = C;
1315 C = *(++TokenEnd);
1316 NumWhitespaceChars++;
1317 if ((C == '\n' || C == '\r') && C != PrevC)
1318 NumWhitespaceChars++;
1319 }
1320 }
1321
1322 return TokenLoc.getLocWithOffset(Tok->getLength() + NumWhitespaceChars);
1323}
1324
1325/// getCharAndSizeSlow - Peek a single 'character' from the specified buffer,
1326/// get its size, and return it. This is tricky in several cases:
1327/// 1. If currently at the start of a trigraph, we warn about the trigraph,
1328/// then either return the trigraph (skipping 3 chars) or the '?',
1329/// depending on whether trigraphs are enabled or not.
1330/// 2. If this is an escaped newline (potentially with whitespace between
1331/// the backslash and newline), implicitly skip the newline and return
1332/// the char after it.
1333///
1334/// This handles the slow/uncommon case of the getCharAndSize method. Here we
1335/// know that we can accumulate into Size, and that we have already incremented
1336/// Ptr by Size bytes.
1337///
1338/// NOTE: When this method is updated, getCharAndSizeSlowNoWarn (below) should
1339/// be updated to match.
1340char Lexer::getCharAndSizeSlow(const char *Ptr, unsigned &Size,
1341 Token *Tok) {
1342 // If we have a slash, look for an escaped newline.
1343 if (Ptr[0] == '\\') {
1344 ++Size;
1345 ++Ptr;
1346Slash:
1347 // Common case, backslash-char where the char is not whitespace.
1348 if (!isWhitespace(Ptr[0])) return '\\';
1349
1350 // See if we have optional whitespace characters between the slash and
1351 // newline.
1352 if (unsigned EscapedNewLineSize = getEscapedNewLineSize(Ptr)) {
1353 // Remember that this token needs to be cleaned.
1354 if (Tok) Tok->setFlag(Token::NeedsCleaning);
1355
1356 // Warn if there was whitespace between the backslash and newline.
1357 if (Ptr[0] != '\n' && Ptr[0] != '\r' && Tok && !isLexingRawMode())
1358 Diag(Ptr, diag::backslash_newline_space);
1359
1360 // Found backslash<whitespace><newline>. Parse the char after it.
1361 Size += EscapedNewLineSize;
1362 Ptr += EscapedNewLineSize;
1363
1364 // Use slow version to accumulate a correct size field.
1365 return getCharAndSizeSlow(Ptr, Size, Tok);
1366 }
1367
1368 // Otherwise, this is not an escaped newline, just return the slash.
1369 return '\\';
1370 }
1371
1372 // If this is a trigraph, process it.
1373 if (Ptr[0] == '?' && Ptr[1] == '?') {
1374 // If this is actually a legal trigraph (not something like "??x"), emit
1375 // a trigraph warning. If so, and if trigraphs are enabled, return it.
1376 if (char C = DecodeTrigraphChar(Ptr + 2, Tok ? this : nullptr,
1377 LangOpts.Trigraphs)) {
1378 // Remember that this token needs to be cleaned.
1379 if (Tok) Tok->setFlag(Token::NeedsCleaning);
1380
1381 Ptr += 3;
1382 Size += 3;
1383 if (C == '\\') goto Slash;
1384 return C;
1385 }
1386 }
1387
1388 // If this is neither, return a single character.
1389 ++Size;
1390 return *Ptr;
1391}
1392
1393/// getCharAndSizeSlowNoWarn - Handle the slow/uncommon case of the
1394/// getCharAndSizeNoWarn method. Here we know that we can accumulate into Size,
1395/// and that we have already incremented Ptr by Size bytes.
1396///
1397/// NOTE: When this method is updated, getCharAndSizeSlow (above) should
1398/// be updated to match.
1399char Lexer::getCharAndSizeSlowNoWarn(const char *Ptr, unsigned &Size,
1400 const LangOptions &LangOpts) {
1401 // If we have a slash, look for an escaped newline.
1402 if (Ptr[0] == '\\') {
1403 ++Size;
1404 ++Ptr;
1405Slash:
1406 // Common case, backslash-char where the char is not whitespace.
1407 if (!isWhitespace(Ptr[0])) return '\\';
1408
1409 // See if we have optional whitespace characters followed by a newline.
1410 if (unsigned EscapedNewLineSize = getEscapedNewLineSize(Ptr)) {
1411 // Found backslash<whitespace><newline>. Parse the char after it.
1412 Size += EscapedNewLineSize;
1413 Ptr += EscapedNewLineSize;
1414
1415 // Use slow version to accumulate a correct size field.
1416 return getCharAndSizeSlowNoWarn(Ptr, Size, LangOpts);
1417 }
1418
1419 // Otherwise, this is not an escaped newline, just return the slash.
1420 return '\\';
1421 }
1422
1423 // If this is a trigraph, process it.
1424 if (LangOpts.Trigraphs && Ptr[0] == '?' && Ptr[1] == '?') {
1425 // If this is actually a legal trigraph (not something like "??x"), return
1426 // it.
1427 if (char C = GetTrigraphCharForLetter(Ptr[2])) {
1428 Ptr += 3;
1429 Size += 3;
1430 if (C == '\\') goto Slash;
1431 return C;
1432 }
1433 }
1434
1435 // If this is neither, return a single character.
1436 ++Size;
1437 return *Ptr;
1438}
1439
1440//===----------------------------------------------------------------------===//
1441// Helper methods for lexing.
1442//===----------------------------------------------------------------------===//
1443
1444/// Routine that indiscriminately sets the offset into the source file.
1445void Lexer::SetByteOffset(unsigned Offset, bool StartOfLine) {
1446 BufferPtr = BufferStart + Offset;
1447 if (BufferPtr > BufferEnd)
1448 BufferPtr = BufferEnd;
1449 // FIXME: What exactly does the StartOfLine bit mean? There are two
1450 // possible meanings for the "start" of the line: the first token on the
1451 // unexpanded line, or the first token on the expanded line.
1452 IsAtStartOfLine = StartOfLine;
1453 IsAtPhysicalStartOfLine = StartOfLine;
1454}
1455
1456static bool isUnicodeWhitespace(uint32_t Codepoint) {
1457 static const llvm::sys::UnicodeCharSet UnicodeWhitespaceChars(
1458 UnicodeWhitespaceCharRanges);
1459 return UnicodeWhitespaceChars.contains(Codepoint);
1460}
1461
1462static llvm::SmallString<5> codepointAsHexString(uint32_t C) {
1463 llvm::SmallString<5> CharBuf;
1464 llvm::raw_svector_ostream CharOS(CharBuf);
1465 llvm::write_hex(CharOS, C, llvm::HexPrintStyle::Upper, 4);
1466 return CharBuf;
1467}
1468
1469// To mitigate https://github.com/llvm/llvm-project/issues/54732,
1470// we allow "Mathematical Notation Characters" in identifiers.
1471// This is a proposed profile that extends the XID_Start/XID_continue
1472// with mathematical symbols, superscipts and subscripts digits
1473// found in some production software.
1474// https://www.unicode.org/L2/L2022/22230-math-profile.pdf
1475static bool isMathematicalExtensionID(uint32_t C, const LangOptions &LangOpts,
1476 bool IsStart, bool &IsExtension) {
1477 static const llvm::sys::UnicodeCharSet MathStartChars(
1478 MathematicalNotationProfileIDStartRanges);
1479 static const llvm::sys::UnicodeCharSet MathContinueChars(
1480 MathematicalNotationProfileIDContinueRanges);
1481 if (MathStartChars.contains(C) ||
1482 (!IsStart && MathContinueChars.contains(C))) {
1483 IsExtension = true;
1484 return true;
1485 }
1486 return false;
1487}
1488
1489static bool isAllowedIDChar(uint32_t C, const LangOptions &LangOpts,
1490 bool &IsExtension) {
1491 if (LangOpts.AsmPreprocessor) {
1492 return false;
1493 } else if (LangOpts.DollarIdents && '$' == C) {
1494 return true;
1495 } else if (LangOpts.CPlusPlus || LangOpts.C2x) {
1496 // A non-leading codepoint must have the XID_Continue property.
1497 // XIDContinueRanges doesn't contains characters also in XIDStartRanges,
1498 // so we need to check both tables.
1499 // '_' doesn't have the XID_Continue property but is allowed in C and C++.
1500 static const llvm::sys::UnicodeCharSet XIDStartChars(XIDStartRanges);
1501 static const llvm::sys::UnicodeCharSet XIDContinueChars(XIDContinueRanges);
1502 if (C == '_' || XIDStartChars.contains(C) || XIDContinueChars.contains(C))
1503 return true;
1504 return isMathematicalExtensionID(C, LangOpts, /*IsStart=*/false,
1505 IsExtension);
1506 } else if (LangOpts.C11) {
1507 static const llvm::sys::UnicodeCharSet C11AllowedIDChars(
1508 C11AllowedIDCharRanges);
1509 return C11AllowedIDChars.contains(C);
1510 } else {
1511 static const llvm::sys::UnicodeCharSet C99AllowedIDChars(
1512 C99AllowedIDCharRanges);
1513 return C99AllowedIDChars.contains(C);
1514 }
1515}
1516
1517static bool isAllowedInitiallyIDChar(uint32_t C, const LangOptions &LangOpts,
1518 bool &IsExtension) {
1519 assert(C > 0x7F && "isAllowedInitiallyIDChar called with an ASCII codepoint")(static_cast <bool> (C > 0x7F && "isAllowedInitiallyIDChar called with an ASCII codepoint"
) ? void (0) : __assert_fail ("C > 0x7F && \"isAllowedInitiallyIDChar called with an ASCII codepoint\""
, "clang/lib/Lex/Lexer.cpp", 1519, __extension__ __PRETTY_FUNCTION__
))
;
1520 IsExtension = false;
1521 if (LangOpts.AsmPreprocessor) {
1522 return false;
1523 }
1524 if (LangOpts.CPlusPlus || LangOpts.C2x) {
1525 static const llvm::sys::UnicodeCharSet XIDStartChars(XIDStartRanges);
1526 if (XIDStartChars.contains(C))
1527 return true;
1528 return isMathematicalExtensionID(C, LangOpts, /*IsStart=*/true,
1529 IsExtension);
1530 }
1531 if (!isAllowedIDChar(C, LangOpts, IsExtension))
1532 return false;
1533 if (LangOpts.C11) {
1534 static const llvm::sys::UnicodeCharSet C11DisallowedInitialIDChars(
1535 C11DisallowedInitialIDCharRanges);
1536 return !C11DisallowedInitialIDChars.contains(C);
1537 }
1538 static const llvm::sys::UnicodeCharSet C99DisallowedInitialIDChars(
1539 C99DisallowedInitialIDCharRanges);
1540 return !C99DisallowedInitialIDChars.contains(C);
1541}
1542
1543static void diagnoseExtensionInIdentifier(DiagnosticsEngine &Diags, uint32_t C,
1544 CharSourceRange Range) {
1545
1546 static const llvm::sys::UnicodeCharSet MathStartChars(
1547 MathematicalNotationProfileIDStartRanges);
1548 static const llvm::sys::UnicodeCharSet MathContinueChars(
1549 MathematicalNotationProfileIDContinueRanges);
1550
1551 (void)MathStartChars;
1552 (void)MathContinueChars;
1553 assert((MathStartChars.contains(C) || MathContinueChars.contains(C)) &&(static_cast <bool> ((MathStartChars.contains(C) || MathContinueChars
.contains(C)) && "Unexpected mathematical notation codepoint"
) ? void (0) : __assert_fail ("(MathStartChars.contains(C) || MathContinueChars.contains(C)) && \"Unexpected mathematical notation codepoint\""
, "clang/lib/Lex/Lexer.cpp", 1554, __extension__ __PRETTY_FUNCTION__
))
1554 "Unexpected mathematical notation codepoint")(static_cast <bool> ((MathStartChars.contains(C) || MathContinueChars
.contains(C)) && "Unexpected mathematical notation codepoint"
) ? void (0) : __assert_fail ("(MathStartChars.contains(C) || MathContinueChars.contains(C)) && \"Unexpected mathematical notation codepoint\""
, "clang/lib/Lex/Lexer.cpp", 1554, __extension__ __PRETTY_FUNCTION__
))
;
1555 Diags.Report(Range.getBegin(), diag::ext_mathematical_notation)
1556 << codepointAsHexString(C) << Range;
1557}
1558
/// Builds a character range covering the buffer positions [Begin, End).
/// Both pointers are translated to source locations with
/// Lexer::getSourceLocation and combined via CharSourceRange::getCharRange.
// NOTE(review): the two getSourceLocation calls are function arguments, so
// their relative evaluation order is unspecified; keep them in this form.
1559static inline CharSourceRange makeCharRange(Lexer &L, const char *Begin,
1560 const char *End) {
1561 return CharSourceRange::getCharRange(L.getSourceLocation(Begin),
1562 L.getSourceLocation(End));
1563}
1564
1565static void maybeDiagnoseIDCharCompat(DiagnosticsEngine &Diags, uint32_t C,
1566 CharSourceRange Range, bool IsFirst) {
1567 // Check C99 compatibility.
1568 if (!Diags.isIgnored(diag::warn_c99_compat_unicode_id, Range.getBegin())) {
1569 enum {
1570 CannotAppearInIdentifier = 0,
1571 CannotStartIdentifier
1572 };
1573
1574 static const llvm::sys::UnicodeCharSet C99AllowedIDChars(
1575 C99AllowedIDCharRanges);
1576 static const llvm::sys::UnicodeCharSet C99DisallowedInitialIDChars(
1577 C99DisallowedInitialIDCharRanges);
1578 if (!C99AllowedIDChars.contains(C)) {
1579 Diags.Report(Range.getBegin(), diag::warn_c99_compat_unicode_id)
1580 << Range
1581 << CannotAppearInIdentifier;
1582 } else if (IsFirst && C99DisallowedInitialIDChars.contains(C)) {
1583 Diags.Report(Range.getBegin(), diag::warn_c99_compat_unicode_id)
1584 << Range
1585 << CannotStartIdentifier;
1586 }
1587 }
1588}
1589
1590/// After encountering UTF-8 character C and interpreting it as an identifier
1591/// character, check whether it's a homoglyph for a common non-identifier
1592/// source character that is unlikely to be an intentional identifier
1593/// character and warn if so.
///
/// Two diagnostics are possible at the start of Range:
/// warn_utf8_symbol_homoglyph when the table names a look-alike character, and
/// warn_utf8_symbol_zero_width when the table entry's look-alike is 0.
1594static void maybeDiagnoseUTF8Homoglyph(DiagnosticsEngine &Diags, uint32_t C,
1595 CharSourceRange Range) {
1596 // FIXME: Handle Unicode quotation marks (smart quotes, fullwidth quotes).
// One table row: a codepoint and the ASCII character it resembles. A
// LooksLike of 0 selects the zero-width diagnostic below.
1597 struct HomoglyphPair {
1598 uint32_t Character;
1599 char LooksLike;
1600 bool operator<(HomoglyphPair R) const { return Character < R.Character; }
1601 };
// The rows must stay sorted by Character: the lookup below is a binary search
// (std::lower_bound). The final {0, 0} row is a sentinel, not data.
1602 static constexpr HomoglyphPair SortedHomoglyphs[] = {
1603 {U'\u00ad', 0}, // SOFT HYPHEN
1604 {U'\u01c3', '!'}, // LATIN LETTER RETROFLEX CLICK
1605 {U'\u037e', ';'}, // GREEK QUESTION MARK
1606 {U'\u200b', 0}, // ZERO WIDTH SPACE
1607 {U'\u200c', 0}, // ZERO WIDTH NON-JOINER
1608 {U'\u200d', 0}, // ZERO WIDTH JOINER
1609 {U'\u2060', 0}, // WORD JOINER
1610 {U'\u2061', 0}, // FUNCTION APPLICATION
1611 {U'\u2062', 0}, // INVISIBLE TIMES
1612 {U'\u2063', 0}, // INVISIBLE SEPARATOR
1613 {U'\u2064', 0}, // INVISIBLE PLUS
1614 {U'\u2212', '-'}, // MINUS SIGN
1615 {U'\u2215', '/'}, // DIVISION SLASH
1616 {U'\u2216', '\\'}, // SET MINUS
1617 {U'\u2217', '*'}, // ASTERISK OPERATOR
1618 {U'\u2223', '|'}, // DIVIDES
1619 {U'\u2227', '^'}, // LOGICAL AND
1620 {U'\u2236', ':'}, // RATIO
1621 {U'\u223c', '~'}, // TILDE OPERATOR
1622 {U'\ua789', ':'}, // MODIFIER LETTER COLON
1623 {U'\ufeff', 0}, // ZERO WIDTH NO-BREAK SPACE
1624 {U'\uff01', '!'}, // FULLWIDTH EXCLAMATION MARK
1625 {U'\uff03', '#'}, // FULLWIDTH NUMBER SIGN
1626 {U'\uff04', '$'}, // FULLWIDTH DOLLAR SIGN
1627 {U'\uff05', '%'}, // FULLWIDTH PERCENT SIGN
1628 {U'\uff06', '&'}, // FULLWIDTH AMPERSAND
1629 {U'\uff08', '('}, // FULLWIDTH LEFT PARENTHESIS
1630 {U'\uff09', ')'}, // FULLWIDTH RIGHT PARENTHESIS
1631 {U'\uff0a', '*'}, // FULLWIDTH ASTERISK
1632 {U'\uff0b', '+'}, // FULLWIDTH PLUS SIGN
1633 {U'\uff0c', ','}, // FULLWIDTH COMMA
1634 {U'\uff0d', '-'}, // FULLWIDTH HYPHEN-MINUS
1635 {U'\uff0e', '.'}, // FULLWIDTH FULL STOP
1636 {U'\uff0f', '/'}, // FULLWIDTH SOLIDUS
1637 {U'\uff1a', ':'}, // FULLWIDTH COLON
1638 {U'\uff1b', ';'}, // FULLWIDTH SEMICOLON
1639 {U'\uff1c', '<'}, // FULLWIDTH LESS-THAN SIGN
1640 {U'\uff1d', '='}, // FULLWIDTH EQUALS SIGN
1641 {U'\uff1e', '>'}, // FULLWIDTH GREATER-THAN SIGN
1642 {U'\uff1f', '?'}, // FULLWIDTH QUESTION MARK
1643 {U'\uff20', '@'}, // FULLWIDTH COMMERCIAL AT
1644 {U'\uff3b', '['}, // FULLWIDTH LEFT SQUARE BRACKET
1645 {U'\uff3c', '\\'}, // FULLWIDTH REVERSE SOLIDUS
1646 {U'\uff3d', ']'}, // FULLWIDTH RIGHT SQUARE BRACKET
1647 {U'\uff3e', '^'}, // FULLWIDTH CIRCUMFLEX ACCENT
1648 {U'\uff5b', '{'}, // FULLWIDTH LEFT CURLY BRACKET
1649 {U'\uff5c', '|'}, // FULLWIDTH VERTICAL LINE
1650 {U'\uff5d', '}'}, // FULLWIDTH RIGHT CURLY BRACKET
1651 {U'\uff5e', '~'}, // FULLWIDTH TILDE
1652 {0, 0}
1653 };
// The search range stops one short of the array end, i.e. it excludes the
// sentinel. When C is greater than every real entry, lower_bound returns the
// end of that range, which is the sentinel itself, so the dereference below
// always reads a valid element.
1654 auto Homoglyph =
1655 std::lower_bound(std::begin(SortedHomoglyphs),
1656 std::end(SortedHomoglyphs) - 1, HomoglyphPair{C, '\0'});
1657 if (Homoglyph->Character == C) {
1658 if (Homoglyph->LooksLike) {
// Build a one-character, NUL-terminated string for the diagnostic argument.
1659 const char LooksLikeStr[] = {Homoglyph->LooksLike, 0};
1660 Diags.Report(Range.getBegin(), diag::warn_utf8_symbol_homoglyph)
1661 << Range << codepointAsHexString(C) << LooksLikeStr;
1662 } else {
1663 Diags.Report(Range.getBegin(), diag::warn_utf8_symbol_zero_width)
1664 << Range << codepointAsHexString(C);
1665 }
1666 }
1667}
1668
1669static void diagnoseInvalidUnicodeCodepointInIdentifier(
1670 DiagnosticsEngine &Diags, const LangOptions &LangOpts, uint32_t CodePoint,
1671 CharSourceRange Range, bool IsFirst) {
1672 if (isASCII(CodePoint))
1673 return;
1674
1675 bool IsExtension;
1676 bool IsIDStart = isAllowedInitiallyIDChar(CodePoint, LangOpts, IsExtension);
1677 bool IsIDContinue =
1678 IsIDStart || isAllowedIDChar(CodePoint, LangOpts, IsExtension);
1679
1680 if ((IsFirst && IsIDStart) || (!IsFirst && IsIDContinue))
1681 return;
1682
1683 bool InvalidOnlyAtStart = IsFirst && !IsIDStart && IsIDContinue;
1684
1685 if (!IsFirst || InvalidOnlyAtStart) {
1686 Diags.Report(Range.getBegin(), diag::err_character_not_allowed_identifier)
1687 << Range << codepointAsHexString(CodePoint) << int(InvalidOnlyAtStart)
1688 << FixItHint::CreateRemoval(Range);
1689 } else {
1690 Diags.Report(Range.getBegin(), diag::err_character_not_allowed)
1691 << Range << codepointAsHexString(CodePoint)
1692 << FixItHint::CreateRemoval(Range);
1693 }
1694}
1695
1696bool Lexer::tryConsumeIdentifierUCN(const char *&CurPtr, unsigned Size,
1697 Token &Result) {
1698 const char *UCNPtr = CurPtr + Size;
1699 uint32_t CodePoint = tryReadUCN(UCNPtr, CurPtr, /*Token=*/nullptr);
1700 if (CodePoint == 0) {
1701 return false;
1702 }
1703 bool IsExtension = false;
1704 if (!isAllowedIDChar(CodePoint, LangOpts, IsExtension)) {
1705 if (isASCII(CodePoint) || isUnicodeWhitespace(CodePoint))
1706 return false;
1707 if (!isLexingRawMode() && !ParsingPreprocessorDirective &&
1708 !PP->isPreprocessedOutput())
1709 diagnoseInvalidUnicodeCodepointInIdentifier(
1710 PP->getDiagnostics(), LangOpts, CodePoint,
1711 makeCharRange(*this, CurPtr, UCNPtr),
1712 /*IsFirst=*/false);
1713
1714 // We got a unicode codepoint that is neither a space nor a
1715 // a valid identifier part.
1716 // Carry on as if the codepoint was valid for recovery purposes.
1717 } else if (!isLexingRawMode()) {
1718 if (IsExtension)
1719 diagnoseExtensionInIdentifier(PP->getDiagnostics(), CodePoint,
1720 makeCharRange(*this, CurPtr, UCNPtr));
1721
1722 maybeDiagnoseIDCharCompat(PP->getDiagnostics(), CodePoint,
1723 makeCharRange(*this, CurPtr, UCNPtr),
1724 /*IsFirst=*/false);
1725 }
1726
1727 Result.setFlag(Token::HasUCN);
1728 if ((UCNPtr - CurPtr == 6 && CurPtr[1] == 'u') ||
1729 (UCNPtr - CurPtr == 10 && CurPtr[1] == 'U'))
1730 CurPtr = UCNPtr;
1731 else
1732 while (CurPtr != UCNPtr)
1733 (void)getAndAdvanceChar(CurPtr, Result);
1734 return true;
1735}
1736
1737bool Lexer::tryConsumeIdentifierUTF8Char(const char *&CurPtr) {
1738 const char *UnicodePtr = CurPtr;
1739 llvm::UTF32 CodePoint;
1740 llvm::ConversionResult Result =
1741 llvm::convertUTF8Sequence((const llvm::UTF8 **)&UnicodePtr,
1742 (const llvm::UTF8 *)BufferEnd,
1743 &CodePoint,
1744 llvm::strictConversion);
1745 if (Result != llvm::conversionOK)
1746 return false;
1747
1748 bool IsExtension = false;
1749 if (!isAllowedIDChar(static_cast<uint32_t>(CodePoint), LangOpts,
1750 IsExtension)) {
1751 if (isASCII(CodePoint) || isUnicodeWhitespace(CodePoint))
1752 return false;
1753
1754 if (!isLexingRawMode() && !ParsingPreprocessorDirective &&
1755 !PP->isPreprocessedOutput())
1756 diagnoseInvalidUnicodeCodepointInIdentifier(
1757 PP->getDiagnostics(), LangOpts, CodePoint,
1758 makeCharRange(*this, CurPtr, UnicodePtr), /*IsFirst=*/false);
1759 // We got a unicode codepoint that is neither a space nor a
1760 // a valid identifier part. Carry on as if the codepoint was
1761 // valid for recovery purposes.
1762 } else if (!isLexingRawMode()) {
1763 if (IsExtension)
1764 diagnoseExtensionInIdentifier(PP->getDiagnostics(), CodePoint,
1765 makeCharRange(*this, CurPtr, UnicodePtr));
1766 maybeDiagnoseIDCharCompat(PP->getDiagnostics(), CodePoint,
1767 makeCharRange(*this, CurPtr, UnicodePtr),
1768 /*IsFirst=*/false);
1769 maybeDiagnoseUTF8Homoglyph(PP->getDiagnostics(), CodePoint,
1770 makeCharRange(*this, CurPtr, UnicodePtr));
1771 }
1772
1773 CurPtr = UnicodePtr;
1774 return true;
1775}
1776
1777bool Lexer::LexUnicodeIdentifierStart(Token &Result, uint32_t C,
1778 const char *CurPtr) {
1779 bool IsExtension = false;
1780 if (isAllowedInitiallyIDChar(C, LangOpts, IsExtension)) {
1781 if (!isLexingRawMode() && !ParsingPreprocessorDirective &&
1782 !PP->isPreprocessedOutput()) {
1783 if (IsExtension)
1784 diagnoseExtensionInIdentifier(PP->getDiagnostics(), C,
1785 makeCharRange(*this, BufferPtr, CurPtr));
1786 maybeDiagnoseIDCharCompat(PP->getDiagnostics(), C,
1787 makeCharRange(*this, BufferPtr, CurPtr),
1788 /*IsFirst=*/true);
1789 maybeDiagnoseUTF8Homoglyph(PP->getDiagnostics(), C,
1790 makeCharRange(*this, BufferPtr, CurPtr));
1791 }
1792
1793 MIOpt.ReadToken();
1794 return LexIdentifierContinue(Result, CurPtr);
1795 }
1796
1797 if (!isLexingRawMode() && !ParsingPreprocessorDirective &&
1798 !PP->isPreprocessedOutput() && !isASCII(*BufferPtr) &&
1799 !isUnicodeWhitespace(C)) {
1800 // Non-ASCII characters tend to creep into source code unintentionally.
1801 // Instead of letting the parser complain about the unknown token,
1802 // just drop the character.
1803 // Note that we can /only/ do this when the non-ASCII character is actually
1804 // spelled as Unicode, not written as a UCN. The standard requires that
1805 // we not throw away any possible preprocessor tokens, but there's a
1806 // loophole in the mapping of Unicode characters to basic character set
1807 // characters that allows us to map these particular characters to, say,
1808 // whitespace.
1809 diagnoseInvalidUnicodeCodepointInIdentifier(
1810 PP->getDiagnostics(), LangOpts, C,
1811 makeCharRange(*this, BufferPtr, CurPtr), /*IsStart*/ true);
1812 BufferPtr = CurPtr;
1813 return false;
1814 }
1815
1816 // Otherwise, we have an explicit UCN or a character that's unlikely to show
1817 // up by accident.
1818 MIOpt.ReadToken();
1819 FormTokenWithChars(Result, CurPtr, tok::unknown);
1820 return true;
1821}
1822
/// Lexes the remainder of an identifier whose first character has already been
/// matched, forms a tok::raw_identifier token in Result and, outside raw mode,
/// resolves it through the preprocessor.
///
/// CurPtr points just past the part of the identifier matched so far.
/// Returns true on every path except the last one, which returns whatever
/// PP->HandleIdentifier returns.
1823bool Lexer::LexIdentifierContinue(Token &Result, const char *CurPtr) {
1824 // Match [_A-Za-z0-9]*, we have already matched an identifier start.
1825 while (true) {
1826 unsigned char C = *CurPtr;
1827 // Fast path.
1828 if (isAsciiIdentifierContinue(C)) {
1829 ++CurPtr;
1830 continue;
1831 }
1832
1833 unsigned Size;
1834 // Slow path: handle trigraph, unicode codepoints, UCNs.
1835 C = getCharAndSize(CurPtr, Size);
1836 if (isAsciiIdentifierContinue(C)) {
1837 CurPtr = ConsumeChar(CurPtr, Size, Result);
1838 continue;
1839 }
1840 if (C == '$') {
1841 // If we hit a $ and they are not supported in identifiers, we are done.
1842 if (!LangOpts.DollarIdents)
1843 break;
1844 // Otherwise, emit a diagnostic and continue.
1845 if (!isLexingRawMode())
1846 Diag(CurPtr, diag::ext_dollar_in_identifier);
1847 CurPtr = ConsumeChar(CurPtr, Size, Result);
1848 continue;
1849 }
// Both helpers advance CurPtr themselves when they succeed.
1850 if (C == '\\' && tryConsumeIdentifierUCN(CurPtr, Size, Result))
1851 continue;
1852 if (!isASCII(C) && tryConsumeIdentifierUTF8Char(CurPtr))
1853 continue;
1854 // Neither an expected Unicode codepoint nor a UCN.
1855 break;
1856 }
1857
// BufferPtr is captured before FormTokenWithChars is called so that the
// identifier's start can still be recorded on the token afterwards.
1858 const char *IdStart = BufferPtr;
1859 FormTokenWithChars(Result, CurPtr, tok::raw_identifier);
1860 Result.setRawIdentifierData(IdStart);
1861
1862 // If we are in raw mode, return this identifier raw. There is no need to
1863 // look up identifier information or attempt to macro expand it.
1864 if (LexingRawMode)
1865 return true;
1866
1867 // Fill in Result.IdentifierInfo and update the token kind,
1868 // looking up the identifier in the identifier table.
1869 IdentifierInfo *II = PP->LookUpIdentifierInfo(Result);
1870 // Note that we have to call PP->LookUpIdentifierInfo() even for code
1871 // completion, it writes IdentifierInfo into Result, and callers rely on it.
1872
1873 // If the completion point is at the end of an identifier, we want to treat
1874 // the identifier as incomplete even if it resolves to a macro or a keyword.
1875 // This allows e.g. 'class^' to complete to 'classifier'.
1876 if (isCodeCompletionPoint(CurPtr)) {
1877 // Return the code-completion token.
1878 Result.setKind(tok::code_completion);
1879 // Skip the code-completion char and all immediate identifier characters.
1880 // This ensures we get consistent behavior when completing at any point in
1881 // an identifier (i.e. at the start, in the middle, at the end). Note that
1882 // only simple cases (i.e. [a-zA-Z0-9_]) are supported to keep the code
1883 // simpler.
1884 assert(*CurPtr == 0 && "Completion character must be 0")(static_cast <bool> (*CurPtr == 0 && "Completion character must be 0"
) ? void (0) : __assert_fail ("*CurPtr == 0 && \"Completion character must be 0\""
, "clang/lib/Lex/Lexer.cpp", 1884, __extension__ __PRETTY_FUNCTION__
))
;
1885 ++CurPtr;
1886 // Note that code completion token is not added as a separate character
1887 // when the completion point is at the end of the buffer. Therefore, we need
1888 // to check if the buffer has ended.
1889 if (CurPtr < BufferEnd) {
1890 while (isAsciiIdentifierContinue(*CurPtr))
1891 ++CurPtr;
1892 }
1893 BufferPtr = CurPtr;
1894 return true;
1895 }
1896
1897 // Finally, now that we know we have an identifier, pass this off to the
1898 // preprocessor, which may macro expand it or something.
1899 if (II->isHandleIdentifierCase())
1900 return PP->HandleIdentifier(Result);
1901
1902 return true;
1903}
1904
1905/// isHexaLiteral - Return true if Start points to a hex constant.
1906/// in microsoft mode (where this is supposed to be several different tokens).
1907bool Lexer::isHexaLiteral(const char *Start, const LangOptions &LangOpts) {
1908 unsigned Size;
1909 char C1 = Lexer::getCharAndSizeNoWarn(Start, Size, LangOpts);
1910 if (C1 != '0')
1911 return false;
1912 char C2 = Lexer::getCharAndSizeNoWarn(Start + Size, Size, LangOpts);
1913 return (C2 == 'x' || C2 == 'X');
1914}
1915
1916/// LexNumericConstant - Lex the remainder of a integer or floating point
1917/// constant. From[-1] is the first character lexed. Return the end of the
1918/// constant.
1919bool Lexer::LexNumericConstant(Token &Result, const char *CurPtr) {
1920 unsigned Size;
1921 char C = getCharAndSize(CurPtr, Size);
1922 char PrevCh = 0;
1923 while (isPreprocessingNumberBody(C)) {
1924 CurPtr = ConsumeChar(CurPtr, Size, Result);
1925 PrevCh = C;
1926 C = getCharAndSize(CurPtr, Size);
1927 }
1928
1929 // If we fell out, check for a sign, due to 1e+12. If we have one, continue.
1930 if ((C == '-' || C == '+') && (PrevCh == 'E' || PrevCh == 'e')) {
1931 // If we are in Microsoft mode, don't continue if the constant is hex.
1932 // For example, MSVC will accept the following as 3 tokens: 0x1234567e+1
1933 if (!LangOpts.MicrosoftExt || !isHexaLiteral(BufferPtr, LangOpts))
1934 return LexNumericConstant(Result, ConsumeChar(CurPtr, Size, Result));
1935 }
1936
1937 // If we have a hex FP constant, continue.
1938 if ((C == '-' || C == '+') && (PrevCh == 'P' || PrevCh == 'p')) {
1939 // Outside C99 and C++17, we accept hexadecimal floating point numbers as a
1940 // not-quite-conforming extension. Only do so if this looks like it's
1941 // actually meant to be a hexfloat, and not if it has a ud-suffix.
1942 bool IsHexFloat = true;
1943 if (!LangOpts.C99) {
1944 if (!isHexaLiteral(BufferPtr, LangOpts))
1945 IsHexFloat = false;
1946 else if (!LangOpts.CPlusPlus17 &&
1947 std::find(BufferPtr, CurPtr, '_') != CurPtr)
1948 IsHexFloat = false;
1949 }
1950 if (IsHexFloat)
1951 return LexNumericConstant(Result, ConsumeChar(CurPtr, Size, Result));
1952 }
1953
1954 // If we have a digit separator, continue.
1955 if (C == '\'' && (LangOpts.CPlusPlus14 || LangOpts.C2x)) {
1956 unsigned NextSize;
1957 char Next = getCharAndSizeNoWarn(CurPtr + Size, NextSize, LangOpts);
1958 if (isAsciiIdentifierContinue(Next)) {
1959 if (!isLexingRawMode())
1960 Diag(CurPtr, LangOpts.CPlusPlus
1961 ? diag::warn_cxx11_compat_digit_separator
1962 : diag::warn_c2x_compat_digit_separator);
1963 CurPtr = ConsumeChar(CurPtr, Size, Result);
1964 CurPtr = ConsumeChar(CurPtr, NextSize, Result);
1965 return LexNumericConstant(Result, CurPtr);
1966 }
1967 }
1968
1969 // If we have a UCN or UTF-8 character (perhaps in a ud-suffix), continue.
1970 if (C == '\\' && tryConsumeIdentifierUCN(CurPtr, Size, Result))
1971 return LexNumericConstant(Result, CurPtr);
1972 if (!isASCII(C) && tryConsumeIdentifierUTF8Char(CurPtr))
1973 return LexNumericConstant(Result, CurPtr);
1974
1975 // Update the location of token as well as BufferPtr.
1976 const char *TokStart = BufferPtr;
1977 FormTokenWithChars(Result, CurPtr, tok::numeric_constant);
1978 Result.setLiteralData(TokStart);
1979 return true;
1980}
1981
1982/// LexUDSuffix - Lex the ud-suffix production for user-defined literal suffixes
1983/// in C++11, or warn on a ud-suffix in C++98.
1984const char *Lexer::LexUDSuffix(Token &Result, const char *CurPtr,
1985 bool IsStringLiteral) {
1986 assert(LangOpts.CPlusPlus)(static_cast <bool> (LangOpts.CPlusPlus) ? void (0) : __assert_fail
("LangOpts.CPlusPlus", "clang/lib/Lex/Lexer.cpp", 1986, __extension__
__PRETTY_FUNCTION__))
;
1987
1988 // Maximally munch an identifier.
1989 unsigned Size;
1990 char C = getCharAndSize(CurPtr, Size);
1991 bool Consumed = false;
1992
1993 if (!isAsciiIdentifierStart(C)) {
1994 if (C == '\\' && tryConsumeIdentifierUCN(CurPtr, Size, Result))
1995 Consumed = true;
1996 else if (!isASCII(C) && tryConsumeIdentifierUTF8Char(CurPtr))
1997 Consumed = true;
1998 else
1999 return CurPtr;
2000 }
2001
2002 if (!LangOpts.CPlusPlus11) {
2003 if (!isLexingRawMode())
2004 Diag(CurPtr,
2005 C == '_' ? diag::warn_cxx11_compat_user_defined_literal
2006 : diag::warn_cxx11_compat_reserved_user_defined_literal)
2007 << FixItHint::CreateInsertion(getSourceLocation(CurPtr), " ");
2008 return CurPtr;
2009 }
2010
2011 // C++11 [lex.ext]p10, [usrlit.suffix]p1: A program containing a ud-suffix
2012 // that does not start with an underscore is ill-formed. As a conforming
2013 // extension, we treat all such suffixes as if they had whitespace before
2014 // them. We assume a suffix beginning with a UCN or UTF-8 character is more
2015 // likely to be a ud-suffix than a macro, however, and accept that.
2016 if (!Consumed) {
2017 bool IsUDSuffix = false;
2018 if (C == '_')
2019 IsUDSuffix = true;
2020 else if (IsStringLiteral && LangOpts.CPlusPlus14) {
2021 // In C++1y, we need to look ahead a few characters to see if this is a
2022 // valid suffix for a string literal or a numeric literal (this could be
2023 // the 'operator""if' defining a numeric literal operator).
2024 const unsigned MaxStandardSuffixLength = 3;
2025 char Buffer[MaxStandardSuffixLength] = { C };
2026 unsigned Consumed = Size;
2027 unsigned Chars = 1;
2028 while (true) {
2029 unsigned NextSize;
2030 char Next = getCharAndSizeNoWarn(CurPtr + Consumed, NextSize, LangOpts);
2031 if (!isAsciiIdentifierContinue(Next)) {
2032 // End of suffix. Check whether this is on the allowed list.
2033 const StringRef CompleteSuffix(Buffer, Chars);
2034 IsUDSuffix =
2035 StringLiteralParser::isValidUDSuffix(LangOpts, CompleteSuffix);
2036 break;
2037 }
2038
2039 if (Chars == MaxStandardSuffixLength)
2040 // Too long: can't be a standard suffix.
2041 break;
2042
2043 Buffer[Chars++] = Next;
2044 Consumed += NextSize;
2045 }
2046 }
2047
2048 if (!IsUDSuffix) {
2049 if (!isLexingRawMode())
2050 Diag(CurPtr, LangOpts.MSVCCompat
2051 ? diag::ext_ms_reserved_user_defined_literal
2052 : diag::ext_reserved_user_defined_literal)
2053 << FixItHint::CreateInsertion(getSourceLocation(CurPtr), " ");
2054 return CurPtr;
2055 }
2056
2057 CurPtr = ConsumeChar(CurPtr, Size, Result);
2058 }
2059
2060 Result.setFlag(Token::HasUDSuffix);
2061 while (true) {
2062 C = getCharAndSize(CurPtr, Size);
2063 if (isAsciiIdentifierContinue(C)) {
2064 CurPtr = ConsumeChar(CurPtr, Size, Result);
2065 } else if (C == '\\' && tryConsumeIdentifierUCN(CurPtr, Size, Result)) {
2066 } else if (!isASCII(C) && tryConsumeIdentifierUTF8Char(CurPtr)) {
2067 } else
2068 break;
2069 }
2070
2071 return CurPtr;
2072}
2073
2074/// LexStringLiteral - Lex the remainder of a string literal, after having lexed
2075/// either " or L" or u8" or u" or U".
2076bool Lexer::LexStringLiteral(Token &Result, const char *CurPtr,
2077 tok::TokenKind Kind) {
2078 const char *AfterQuote = CurPtr;
2079 // Does this string contain the \0 character?
2080 const char *NulCharacter = nullptr;
2081
2082 if (!isLexingRawMode() &&
2083 (Kind == tok::utf8_string_literal ||
2084 Kind == tok::utf16_string_literal ||
2085 Kind == tok::utf32_string_literal))
2086 Diag(BufferPtr, LangOpts.CPlusPlus ? diag::warn_cxx98_compat_unicode_literal
2087 : diag::warn_c99_compat_unicode_literal);
2088
2089 char C = getAndAdvanceChar(CurPtr, Result);
2090 while (C != '"') {
2091 // Skip escaped characters. Escaped newlines will already be processed by
2092 // getAndAdvanceChar.
2093 if (C == '\\')
2094 C = getAndAdvanceChar(CurPtr, Result);
2095
2096 if (C == '\n' || C == '\r' || // Newline.
2097 (C == 0 && CurPtr-1 == BufferEnd)) { // End of file.
2098 if (!isLexingRawMode() && !LangOpts.AsmPreprocessor)
2099 Diag(BufferPtr, diag::ext_unterminated_char_or_string) << 1;
2100 FormTokenWithChars(Result, CurPtr-1, tok::unknown);
2101 return true;
2102 }
2103
2104 if (C == 0) {
2105 if (isCodeCompletionPoint(CurPtr-1)) {
2106 if (ParsingFilename)
2107 codeCompleteIncludedFile(AfterQuote, CurPtr - 1, /*IsAngled=*/false);
2108 else
2109 PP->CodeCompleteNaturalLanguage();
2110 FormTokenWithChars(Result, CurPtr - 1, tok::unknown);
2111 cutOffLexing();
2112 return true;
2113 }
2114
2115 NulCharacter = CurPtr-1;
2116 }
2117 C = getAndAdvanceChar(CurPtr, Result);
2118 }
2119
2120 // If we are in C++11, lex the optional ud-suffix.
2121 if (LangOpts.CPlusPlus)
2122 CurPtr = LexUDSuffix(Result, CurPtr, true);
2123
2124 // If a nul character existed in the string, warn about it.
2125 if (NulCharacter && !isLexingRawMode())
2126 Diag(NulCharacter, diag::null_in_char_or_string) << 1;
2127
2128 // Update the location of the token as well as the BufferPtr instance var.
2129 const char *TokStart = BufferPtr;
2130 FormTokenWithChars(Result, CurPtr, Kind);
2131 Result.setLiteralData(TokStart);
2132 return true;
2133}
2134
2135/// LexRawStringLiteral - Lex the remainder of a raw string literal, after
2136/// having lexed R", LR", u8R", uR", or UR".
2137bool Lexer::LexRawStringLiteral(Token &Result, const char *CurPtr,
2138 tok::TokenKind Kind) {
2139 // This function doesn't use getAndAdvanceChar because C++0x [lex.pptoken]p3:
2140 // Between the initial and final double quote characters of the raw string,
2141 // any transformations performed in phases 1 and 2 (trigraphs,
2142 // universal-character-names, and line splicing) are reverted.
2143
2144 if (!isLexingRawMode())
2145 Diag(BufferPtr, diag::warn_cxx98_compat_raw_string_literal);
2146
2147 unsigned PrefixLen = 0;
2148
2149 while (PrefixLen != 16 && isRawStringDelimBody(CurPtr[PrefixLen]))
2150 ++PrefixLen;
2151
2152 // If the last character was not a '(', then we didn't lex a valid delimiter.
2153 if (CurPtr[PrefixLen] != '(') {
2154 if (!isLexingRawMode()) {
2155 const char *PrefixEnd = &CurPtr[PrefixLen];
2156 if (PrefixLen == 16) {
2157 Diag(PrefixEnd, diag::err_raw_delim_too_long);
2158 } else {
2159 Diag(PrefixEnd, diag::err_invalid_char_raw_delim)
2160 << StringRef(PrefixEnd, 1);
2161 }
2162 }
2163
2164 // Search for the next '"' in hopes of salvaging the lexer. Unfortunately,
2165 // it's possible the '"' was intended to be part of the raw string, but
2166 // there's not much we can do about that.
2167 while (true) {
2168 char C = *CurPtr++;
2169
2170 if (C == '"')
2171 break;
2172 if (C == 0 && CurPtr-1 == BufferEnd) {
2173 --CurPtr;
2174 break;
2175 }
2176 }
2177
2178 FormTokenWithChars(Result, CurPtr, tok::unknown);
2179 return true;
2180 }
2181
2182 // Save prefix and move CurPtr past it
2183 const char *Prefix = CurPtr;
2184 CurPtr += PrefixLen + 1; // skip over prefix and '('
2185
2186 while (true) {
2187 char C = *CurPtr++;
2188
2189 if (C == ')') {
2190 // Check for prefix match and closing quote.
2191 if (strncmp(CurPtr, Prefix, PrefixLen) == 0 && CurPtr[PrefixLen] == '"') {
2192 CurPtr += PrefixLen + 1; // skip over prefix and '"'
2193 break;
2194 }
2195 } else if (C == 0 && CurPtr-1 == BufferEnd) { // End of file.
2196 if (!isLexingRawMode())
2197 Diag(BufferPtr, diag::err_unterminated_raw_string)
2198 << StringRef(Prefix, PrefixLen);
2199 FormTokenWithChars(Result, CurPtr-1, tok::unknown);
2200 return true;
2201 }
2202 }
2203
2204 // If we are in C++11, lex the optional ud-suffix.
2205 if (LangOpts.CPlusPlus)
2206 CurPtr = LexUDSuffix(Result, CurPtr, true);
2207
2208 // Update the location of token as well as BufferPtr.
2209 const char *TokStart = BufferPtr;
2210 FormTokenWithChars(Result, CurPtr, Kind);
2211 Result.setLiteralData(TokStart);
2212 return true;
2213}
2214
2215/// LexAngledStringLiteral - Lex the remainder of an angled string literal,
2216/// after having lexed the '<' character. This is used for #include filenames.
2217bool Lexer::LexAngledStringLiteral(Token &Result, const char *CurPtr) {
2218 // Does this string contain the \0 character?
2219 const char *NulCharacter = nullptr;
2220 const char *AfterLessPos = CurPtr;
2221 char C = getAndAdvanceChar(CurPtr, Result);
2222 while (C != '>') {
2223 // Skip escaped characters. Escaped newlines will already be processed by
2224 // getAndAdvanceChar.
2225 if (C == '\\')
2226 C = getAndAdvanceChar(CurPtr, Result);
2227
2228 if (isVerticalWhitespace(C) || // Newline.
2229 (C == 0 && (CurPtr - 1 == BufferEnd))) { // End of file.
2230 // If the filename is unterminated, then it must just be a lone <
2231 // character. Return this as such.
2232 FormTokenWithChars(Result, AfterLessPos, tok::less);
2233 return true;
2234 }
2235
2236 if (C == 0) {
2237 if (isCodeCompletionPoint(CurPtr - 1)) {
2238 codeCompleteIncludedFile(AfterLessPos, CurPtr - 1, /*IsAngled=*/true);
2239 cutOffLexing();
2240 FormTokenWithChars(Result, CurPtr - 1, tok::unknown);
2241 return true;
2242 }
2243 NulCharacter = CurPtr-1;
2244 }
2245 C = getAndAdvanceChar(CurPtr, Result);
2246 }
2247
2248 // If a nul character existed in the string, warn about it.
2249 if (NulCharacter && !isLexingRawMode())
2250 Diag(NulCharacter, diag::null_in_char_or_string) << 1;
2251
2252 // Update the location of token as well as BufferPtr.
2253 const char *TokStart = BufferPtr;
2254 FormTokenWithChars(Result, CurPtr, tok::header_name);
2255 Result.setLiteralData(TokStart);
2256 return true;
2257}
2258
/// Handle a code-completion point found inside an #include filename.
///
/// PathStart points at the first character of the partial path and
/// CompletionPoint at the completion position. IsAngled selects the closing
/// delimiter (angle bracket rather than double quote). The text before the
/// last slash is handed to the preprocessor as the directory, the text after
/// it as the completion filter. PP is dereferenced without a null check here.
2259void Lexer::codeCompleteIncludedFile(const char *PathStart,
2260 const char *CompletionPoint,
2261 bool IsAngled) {
2262 // Completion only applies to the filename, after the last slash.
2263 StringRef PartialPath(PathStart, CompletionPoint - PathStart);
 // A backslash counts as a path separator only when MSVCCompat is enabled.
2264 llvm::StringRef SlashChars = LangOpts.MSVCCompat ? "/\\" : "/";
2265 auto Slash = PartialPath.find_last_of(SlashChars);
2266 StringRef Dir =
2267 (Slash == StringRef::npos) ? "" : PartialPath.take_front(Slash);
2268 const char *StartOfFilename =
2269 (Slash == StringRef::npos) ? PathStart : PathStart + Slash + 1;
2270 // Code completion filter range is the filename only, up to completion point.
2271 PP->setCodeCompletionIdentifierInfo(&PP->getIdentifierTable().get(
2272 StringRef(StartOfFilename, CompletionPoint - StartOfFilename)));
2273 // We should replace the characters up to the closing quote or closest slash,
2274 // if any.
2275 while (CompletionPoint < BufferEnd) {
 // Peek at the character after the current point. A nul or a newline ends
 // the scan without being included in the range.
2276 char Next = *(CompletionPoint + 1);
2277 if (Next == 0 || Next == '\r' || Next == '\n')
2278 break;
2279 ++CompletionPoint;
 // The closing delimiter or a slash is included in the range and ends it.
2280 if (Next == (IsAngled ? '>' : '"'))
2281 break;
2282 if (SlashChars.contains(Next))
2283 break;
2284 }
2285
2286 PP->setCodeCompletionTokenRange(
2287 FileLoc.getLocWithOffset(StartOfFilename - BufferStart),
2288 FileLoc.getLocWithOffset(CompletionPoint - BufferStart));
2289 PP->CodeCompleteIncludedFile(Dir, IsAngled);
2290}
2291
2292/// LexCharConstant - Lex the remainder of a character constant, after having
2293/// lexed either ' or L' or u8' or u' or U'.
///
/// Kind is the token kind assigned on success. This always returns true:
/// Result is tok::unknown for an empty constant, for an unterminated constant
/// and at a code-completion point, and Kind otherwise.
2294bool Lexer::LexCharConstant(Token &Result, const char *CurPtr,
2295 tok::TokenKind Kind) {
2296 // Does this character contain the \0 character?
2297 const char *NulCharacter = nullptr;
2298
 // Compatibility warnings for the prefixed forms; skipped in raw mode.
2299 if (!isLexingRawMode()) {
2300 if (Kind == tok::utf16_char_constant || Kind == tok::utf32_char_constant)
2301 Diag(BufferPtr, LangOpts.CPlusPlus
2302 ? diag::warn_cxx98_compat_unicode_literal
2303 : diag::warn_c99_compat_unicode_literal);
2304 else if (Kind == tok::utf8_char_constant)
2305 Diag(BufferPtr, diag::warn_cxx14_compat_u8_character_literal);
2306 }
2307
2308 char C = getAndAdvanceChar(CurPtr, Result);
 // A closing quote right away means the constant is empty.
2309 if (C == '\'') {
2310 if (!isLexingRawMode() && !LangOpts.AsmPreprocessor)
2311 Diag(BufferPtr, diag::ext_empty_character);
2312 FormTokenWithChars(Result, CurPtr, tok::unknown);
2313 return true;
2314 }
2315
2316 while (C != '\'') {
2317 // Skip escaped characters.
2318 if (C == '\\')
2319 C = getAndAdvanceChar(CurPtr, Result);
2320
2321 if (C == '\n' || C == '\r' || // Newline.
2322 (C == 0 && CurPtr-1 == BufferEnd)) { // End of file.
2323 if (!isLexingRawMode() && !LangOpts.AsmPreprocessor)
2324 Diag(BufferPtr, diag::ext_unterminated_char_or_string) << 0;
2325 FormTokenWithChars(Result, CurPtr-1, tok::unknown);
2326 return true;
2327 }
2328
 // A nul that is not at the end of the buffer is either the code-completion
 // marker or an embedded nul character that is diagnosed after the loop.
2329 if (C == 0) {
2330 if (isCodeCompletionPoint(CurPtr-1)) {
2331 PP->CodeCompleteNaturalLanguage();
2332 FormTokenWithChars(Result, CurPtr-1, tok::unknown);
2333 cutOffLexing();
2334 return true;
2335 }
2336
2337 NulCharacter = CurPtr-1;
2338 }
2339 C = getAndAdvanceChar(CurPtr, Result);
2340 }
2341
2342 // If we are in C++11, lex the optional ud-suffix.
2343 if (LangOpts.CPlusPlus)
2344 CurPtr = LexUDSuffix(Result, CurPtr, false);
2345
2346 // If a nul character existed in the character, warn about it.
2347 if (NulCharacter && !isLexingRawMode())
2348 Diag(NulCharacter, diag::null_in_char_or_string) << 0;
2349
2350 // Update the location of token as well as BufferPtr.
2351 const char *TokStart = BufferPtr;
2352 FormTokenWithChars(Result, CurPtr, Kind);
2353 Result.setLiteralData(TokStart);
2354 return true;
2355}
2356
2357/// SkipWhitespace - Efficiently skip over a series of whitespace characters.
2358/// Update BufferPtr to point to the next non-whitespace character and return.
2359///
2360/// This method forms a token and returns true if KeepWhitespaceMode is enabled.
/// Otherwise it returns false after setting the LeadingSpace flag on Result
/// and, if a newline was seen, the StartOfLine flag and
/// TokAtPhysicalStartOfLine. While parsing a preprocessor directive it stops
/// at the first newline and returns false with BufferPtr left on that newline.
2361bool Lexer::SkipWhitespace(Token &Result, const char *CurPtr,
2362 bool &TokAtPhysicalStartOfLine) {
2363 // Whitespace - Skip it, then return the token after the whitespace.
2364 bool SawNewline = isVerticalWhitespace(CurPtr[-1]);
2365
2366 unsigned char Char = *CurPtr;
2367
2368 const char *lastNewLine = nullptr;
 // Record Ptr as the most recent newline, and also store it in NewLinePtr
 // when NewLinePtr is still null.
2369 auto setLastNewLine = [&](const char *Ptr) {
2370 lastNewLine = Ptr;
2371 if (!NewLinePtr)
2372 NewLinePtr = Ptr;
2373 };
2374 if (SawNewline)
2375 setLastNewLine(CurPtr - 1);
2376
2377 // Skip consecutive spaces efficiently.
2378 while (true) {
2379 // Skip horizontal whitespace very aggressively.
2380 while (isHorizontalWhitespace(Char))
2381 Char = *++CurPtr;
2382
2383 // Otherwise if we have something other than whitespace, we're done.
2384 if (!isVerticalWhitespace(Char))
2385 break;
2386
2387 if (ParsingPreprocessorDirective) {
2388 // End of preprocessor directive line, let LexTokenInternal handle this.
2389 BufferPtr = CurPtr;
2390 return false;
2391 }
2392
2393 // OK, but handle newline.
 // Only a line feed is recorded as the last newline here; any other vertical
 // whitespace character still sets SawNewline.
2394 if (*CurPtr == '\n')
2395 setLastNewLine(CurPtr);
2396 SawNewline = true;
2397 Char = *++CurPtr;
2398 }
2399
2400 // If the client wants us to return whitespace, return it now.
2401 if (isKeepWhitespaceMode()) {
2402 FormTokenWithChars(Result, CurPtr, tok::unknown);
2403 if (SawNewline) {
2404 IsAtStartOfLine = true;
2405 IsAtPhysicalStartOfLine = true;
2406 }
2407 // FIXME: The next token will not have LeadingSpace set.
2408 return true;
2409 }
2410
2411 // If this isn't immediately after a newline, there is leading space.
2412 char PrevChar = CurPtr[-1];
2413 bool HasLeadingSpace = !isVerticalWhitespace(PrevChar);
2414
2415 Result.setFlagValue(Token::LeadingSpace, HasLeadingSpace);
2416 if (SawNewline) {
2417 Result.setFlag(Token::StartOfLine);
2418 TokAtPhysicalStartOfLine = true;
2419
 // The recorded NewLinePtr differs from the last newline seen here: hand the
 // range between them to the empty-line handler, if one is installed.
2420 if (NewLinePtr && lastNewLine && NewLinePtr != lastNewLine && PP) {
2421 if (auto *Handler = PP->getEmptylineHandler())
2422 Handler->HandleEmptyline(SourceRange(getSourceLocation(NewLinePtr + 1),
2423 getSourceLocation(lastNewLine)));
2424 }
2425 }
2426
2427 BufferPtr = CurPtr;
2428 return false;
2429}
2430
2431/// We have just read the // characters from input. Skip until we find the
2432/// newline character that terminates the comment. Then update BufferPtr and
2433/// return.
2434///
2435/// If we're in KeepCommentMode or any CommentHandler has inserted
2436/// some tokens, this will store the first token and return true.
/// In every other case, including a code-completion point inside the comment,
/// this returns false. TokAtPhysicalStartOfLine is set to true when the
/// terminating newline is consumed here.
2437bool Lexer::SkipLineComment(Token &Result, const char *CurPtr,
2438 bool &TokAtPhysicalStartOfLine) {
2439 // If Line comments aren't explicitly enabled for this language, emit an
2440 // extension warning.
2441 if (!LineComment) {
2442 if (!isLexingRawMode()) // There's no PP in raw mode, so can't emit diags.
2443 Diag(BufferPtr, diag::ext_line_comment);
2444
2445 // Mark them enabled so we only emit one warning for this translation
2446 // unit.
2447 LineComment = true;
2448 }
2449
2450 // Scan over the body of the comment. The common case, when scanning, is that
2451 // the comment contains normal ascii characters with nothing interesting in
2452 // them. As such, optimize for this case with the inner loop.
2453 //
2454 // This loop terminates with CurPtr pointing at the newline (or end of buffer)
2455 // character that ends the line comment.
2456
2457 // C++23 [lex.phases] p1
2458 // Diagnose invalid UTF-8 if the corresponding warning is enabled, emitting a
2459 // diagnostic only once per entire ill-formed subsequence to avoid
2460 // emitting too many diagnostics (see http://unicode.org/review/pr-121.html).
2461 bool UnicodeDecodingAlreadyDiagnosed = false;
2462
2463 char C;
2464 while (true) {
2465 C = *CurPtr;
2466 // Skip over characters in the fast loop.
2467 while (isASCII(C) && C != 0 && // Potentially EOF.
2468 C != '\n' && C != '\r') { // Newline or DOS-style newline.
2469 C = *++CurPtr;
2470 UnicodeDecodingAlreadyDiagnosed = false;
2471 }
2472
 // Non-ASCII byte: step over a whole well-formed UTF-8 sequence, or over a
 // single byte (with at most one warning per bad run) when it is ill-formed.
2473 if (!isASCII(C)) {
2474 unsigned Length = llvm::getUTF8SequenceSize(
2475 (const llvm::UTF8 *)CurPtr, (const llvm::UTF8 *)BufferEnd);
2476 if (Length == 0) {
2477 if (!UnicodeDecodingAlreadyDiagnosed && !isLexingRawMode())
2478 Diag(CurPtr, diag::warn_invalid_utf8_in_comment);
2479 UnicodeDecodingAlreadyDiagnosed = true;
2480 ++CurPtr;
2481 } else {
2482 UnicodeDecodingAlreadyDiagnosed = false;
2483 CurPtr += Length;
2484 }
2485 continue;
2486 }
2487
 // Remember the position of the newline or nul that stopped the fast loop.
2488 const char *NextLine = CurPtr;
2489 if (C != 0) {
2490 // We found a newline, see if it's escaped.
2491 const char *EscapePtr = CurPtr-1;
2492 bool HasSpace = false;
2493 while (isHorizontalWhitespace(*EscapePtr)) { // Skip whitespace.
2494 --EscapePtr;
2495 HasSpace = true;
2496 }
2497
2498 if (*EscapePtr == '\\')
2499 // Escaped newline.
2500 CurPtr = EscapePtr;
2501 else if (EscapePtr[0] == '/' && EscapePtr[-1] == '?' &&
2502 EscapePtr[-2] == '?' && LangOpts.Trigraphs)
2503 // Trigraph-escaped newline.
2504 CurPtr = EscapePtr-2;
2505 else
2506 break; // This is a newline, we're done.
2507
2508 // If there was space between the backslash and newline, warn about it.
2509 if (HasSpace && !isLexingRawMode())
2510 Diag(EscapePtr, diag::backslash_newline_space);
2511 }
2512
2513 // Otherwise, this is a hard case. Fall back on getAndAdvanceChar to
2514 // properly decode the character. Read it in raw mode to avoid emitting
2515 // diagnostics about things like trigraphs. If we see an escaped newline,
2516 // we'll handle it below.
2517 const char *OldPtr = CurPtr;
2518 bool OldRawMode = isLexingRawMode();
2519 LexingRawMode = true;
2520 C = getAndAdvanceChar(CurPtr, Result);
2521 LexingRawMode = OldRawMode;
2522
2523 // If we read only one character, then no special handling is needed.
2524 // We're done and can skip forward to the newline.
2525 if (C != 0 && CurPtr == OldPtr+1) {
2526 CurPtr = NextLine;
2527 break;
2528 }
2529
2530 // If we read multiple characters, and one of those characters was a \r or
2531 // \n, then we had an escaped newline within the comment. Emit diagnostic
2532 // unless the next line is also a // comment.
2533 if (CurPtr != OldPtr + 1 && C != '/' &&
2534 (CurPtr == BufferEnd + 1 || CurPtr[0] != '/')) {
2535 for (; OldPtr != CurPtr; ++OldPtr)
2536 if (OldPtr[0] == '\n' || OldPtr[0] == '\r') {
2537 // Okay, we found a // comment that ends in a newline, if the next
2538 // line is also a // comment, but has spaces, don't emit a diagnostic.
2539 if (isWhitespace(C)) {
2540 const char *ForwardPtr = CurPtr;
2541 while (isWhitespace(*ForwardPtr)) // Skip whitespace.
2542 ++ForwardPtr;
2543 if (ForwardPtr[0] == '/' && ForwardPtr[1] == '/')
2544 break;
2545 }
2546
2547 if (!isLexingRawMode())
2548 Diag(OldPtr-1, diag::ext_multi_line_line_comment);
2549 break;
2550 }
2551 }
2552
 // The decoded character ends the comment (newline or end of buffer): back
 // up so that CurPtr points at it.
2553 if (C == '\r' || C == '\n' || CurPtr == BufferEnd + 1) {
2554 --CurPtr;
2555 break;
2556 }
2557
2558 if (C == '\0' && isCodeCompletionPoint(CurPtr-1)) {
2559 PP->CodeCompleteNaturalLanguage();
2560 cutOffLexing();
2561 return false;
2562 }
2563 }
2564
2565 // Found but did not consume the newline. Notify comment handlers about the
2566 // comment unless we're in a #if 0 block.
2567 if (PP && !isLexingRawMode() &&
2568 PP->HandleComment(Result, SourceRange(getSourceLocation(BufferPtr),
2569 getSourceLocation(CurPtr)))) {
2570 BufferPtr = CurPtr;
2571 return true; // A token has to be returned.
2572 }
2573
2574 // If we are returning comments as tokens, return this comment as a token.
2575 if (inKeepCommentMode())
2576 return SaveLineComment(Result, CurPtr);
2577
2578 // If we are inside a preprocessor directive and we see the end of line,
2579 // return immediately, so that the lexer can return this as an EOD token.
2580 if (ParsingPreprocessorDirective || CurPtr == BufferEnd) {
2581 BufferPtr = CurPtr;
2582 return false;
2583 }
2584
2585 // Otherwise, eat the \n character. We don't care if this is a \n\r or
2586 // \r\n sequence. This is an efficiency hack (because we know the \n can't
2587 // contribute to another token), it isn't needed for correctness. Note that
2588 // this is ok even in KeepWhitespaceMode, because we would have returned the
2589 // comment above in that mode.
2590 NewLinePtr = CurPtr++;
2591
2592 // The next returned token is at the start of the line.
2593 Result.setFlag(Token::StartOfLine);
2594 TokAtPhysicalStartOfLine = true;
2595 // No leading whitespace seen so far.
2596 Result.clearFlag(Token::LeadingSpace);
2597 BufferPtr = CurPtr;
2598 return false;
2599}
2600
2601/// If in save-comment mode, package up this Line comment in an appropriate
2602/// way and return it.
///
/// CurPtr marks the end of the comment. This always returns true. Outside a
/// preprocessor directive, or in raw mode, Result is the comment token as
/// lexed; inside a directive its spelling is rewritten to block-comment form
/// unless the spelling could not be retrieved.
2603bool Lexer::SaveLineComment(Token &Result, const char *CurPtr) {
2604 // If we're not in a preprocessor directive, just return the // comment
2605 // directly.
2606 FormTokenWithChars(Result, CurPtr, tok::comment);
2607
2608 if (!ParsingPreprocessorDirective || LexingRawMode)
2609 return true;
2610
2611 // If this Line-style comment is in a macro definition, transmogrify it into
2612 // a C-style block comment.
2613 bool Invalid = false;
2614 std::string Spelling = PP->getSpelling(Result, &Invalid);
2615 if (Invalid)
2616 return true;
2617
2618 assert(Spelling[0] == '/' && Spelling[1] == '/' && "Not line comment?")(static_cast <bool> (Spelling[0] == '/' && Spelling
[1] == '/' && "Not line comment?") ? void (0) : __assert_fail
("Spelling[0] == '/' && Spelling[1] == '/' && \"Not line comment?\""
, "clang/lib/Lex/Lexer.cpp", 2618, __extension__ __PRETTY_FUNCTION__
))
;
2619 Spelling[1] = '*'; // Change prefix to "/*".
2620 Spelling += "*/"; // add suffix.
2621
2622 Result.setKind(tok::comment);
2623 PP->CreateString(Spelling, Result,
2624 Result.getLocation(), Result.getLocation());
2625 return true;
2626}
2627
2628/// isEndOfBlockCommentWithEscapedNewLine - Return true if the specified newline
2629/// character (either \\n or \\r) is part of an escaped newline sequence. Issue
2630/// a diagnostic if so. We know that the newline is inside of a block comment.
///
/// More precisely, this returns true only when the run of escaped newlines
/// ending at CurPtr is immediately preceded by a star, so that after line
/// splicing the star and the slash following the newline end the comment.
/// L is used to query raw mode and to emit diagnostics. Trigraphs tells
/// whether a trigraph-encoded backslash is honoured as an escape.
2631static bool isEndOfBlockCommentWithEscapedNewLine(const char *CurPtr, Lexer *L,
2632 bool Trigraphs) {
2633 assert(CurPtr[0] == '\n' || CurPtr[0] == '\r')(static_cast <bool> (CurPtr[0] == '\n' || CurPtr[0] == '\r'
) ? void (0) : __assert_fail ("CurPtr[0] == '\\n' || CurPtr[0] == '\\r'"
, "clang/lib/Lex/Lexer.cpp", 2633, __extension__ __PRETTY_FUNCTION__
))
;
2634
2635 // Position of the first trigraph in the ending sequence.
2636 const char *TrigraphPos = nullptr;
2637 // Position of the first whitespace after a '\' in the ending sequence.
2638 const char *SpacePos = nullptr;
2639
 // Walk backwards over one escaped newline per iteration.
2640 while (true) {
2641 // Back up off the newline.
2642 --CurPtr;
2643
2644 // If this is a two-character newline sequence, skip the other character.
2645 if (CurPtr[0] == '\n' || CurPtr[0] == '\r') {
2646 // \n\n or \r\r -> not escaped newline.
2647 if (CurPtr[0] == CurPtr[1])
2648 return false;
2649 // \n\r or \r\n -> skip the newline.
2650 --CurPtr;
2651 }
2652
2653 // If we have horizontal whitespace, skip over it. We allow whitespace
2654 // between the slash and newline.
2655 while (isHorizontalWhitespace(*CurPtr) || *CurPtr == 0) {
2656 SpacePos = CurPtr;
2657 --CurPtr;
2658 }
2659
2660 // If we have a slash, this is an escaped newline.
2661 if (*CurPtr == '\\') {
2662 --CurPtr;
2663 } else if (CurPtr[0] == '/' && CurPtr[-1] == '?' && CurPtr[-2] == '?') {
2664 // This is a trigraph encoding of a slash.
2665 TrigraphPos = CurPtr - 2;
2666 CurPtr -= 3;
2667 } else {
2668 return false;
2669 }
2670
2671 // If the character preceding the escaped newline is a '*', then after line
2672 // splicing we have a '*/' ending the comment.
2673 if (*CurPtr == '*')
2674 break;
2675
 // Anything other than another newline before the escape means this is not
 // a comment terminator; another newline means keep walking backwards.
2676 if (*CurPtr != '\n' && *CurPtr != '\r')
2677 return false;
2678 }
2679
2680 if (TrigraphPos) {
2681 // If no trigraphs are enabled, warn that we ignored this trigraph and
2682 // ignore this * character.
2683 if (!Trigraphs) {
2684 if (!L->isLexingRawMode())
2685 L->Diag(TrigraphPos, diag::trigraph_ignored_block_comment);
2686 return false;
2687 }
2688 if (!L->isLexingRawMode())
2689 L->Diag(TrigraphPos, diag::trigraph_ends_block_comment);
2690 }
2691
2692 // Warn about having an escaped newline between the */ characters.
2693 if (!L->isLexingRawMode())
2694 L->Diag(CurPtr + 1, diag::escaped_newline_block_comment_end);
2695
2696 // If there was space between the backslash and newline, warn about it.
2697 if (SpacePos && !L->isLexingRawMode())
2698 L->Diag(SpacePos, diag::backslash_newline_space);
2699
2700 return true;
2701}
2702
2703#ifdef __SSE2__1
2704#include <emmintrin.h>
2705#elif __ALTIVEC__
2706#include <altivec.h>
2707#undef bool
2708#endif
2709
2710/// We have just read from input the / and * characters that started a comment.
2711/// Read until we find the * and / characters that terminate the comment.
2712/// Note that we don't bother decoding trigraphs or escaped newlines in block
2713/// comments, because they cannot cause the comment to end. The only thing
2714/// that can happen is the comment could end with an escaped newline between
2715/// the terminating * and /.
2716///
2717/// If we're in KeepCommentMode or any CommentHandler has inserted
2718/// some tokens, this will store the first token and return true.
/// An unterminated comment is returned as a tok::unknown token (true) in
/// KeepWhitespaceMode and otherwise skipped to the end of the buffer (false).
/// A code-completion point inside the comment cuts off lexing and returns
/// false.
///
/// NOTE(review): the analyzer path recorded in this listing assumes PP is null
/// while isLexingRawMode() is false, which would make the Diag calls below
/// dereference a null PP. Presumably a lexer without a preprocessor is always
/// in raw mode, which would make that path infeasible -- TODO confirm.
2719bool Lexer::SkipBlockComment(Token &Result, const char *CurPtr,
2720 bool &TokAtPhysicalStartOfLine) {
2721 // Scan one character past where we should, looking for a '/' character. Once
2722 // we find it, check to see if it was preceded by a *. This common
2723 // optimization helps people who like to put a lot of * characters in their
2724 // comments.
2725
2726 // The first character we get with newlines and trigraphs skipped to handle
2727 // the degenerate /*/ case below correctly if the * has an escaped newline
2728 // after it.
2729 unsigned CharSize;
2730 unsigned char C = getCharAndSize(CurPtr, CharSize);
13
Calling 'Lexer::getCharAndSize'
16
Returning from 'Lexer::getCharAndSize'
2731 CurPtr += CharSize;
2732 if (C == 0 && CurPtr == BufferEnd+1) {
17
Assuming 'C' is not equal to 0
2733 if (!isLexingRawMode())
2734 Diag(BufferPtr, diag::err_unterminated_block_comment);
2735 --CurPtr;
2736
2737 // KeepWhitespaceMode should return this broken comment as a token. Since
2738 // it isn't a well formed comment, just return it as an 'unknown' token.
2739 if (isKeepWhitespaceMode()) {
2740 FormTokenWithChars(Result, CurPtr, tok::unknown);
2741 return true;
2742 }
2743
2744 BufferPtr = CurPtr;
2745 return false;
2746 }
2747
2748 // Check to see if the first character after the '/*' is another /. If so,
2749 // then this slash does not end the block comment, it is part of it.
2750 if (C == '/')
18
Assuming the condition is false
19
Taking false branch
2751 C = *CurPtr++;
2752
2753 // C++23 [lex.phases] p1
2754 // Diagnose invalid UTF-8 if the corresponding warning is enabled, emitting a
2755 // diagnostic only once per entire ill-formed subsequence to avoid
2756 // emitting too many diagnostics (see http://unicode.org/review/pr-121.html).
2757 bool UnicodeDecodingAlreadyDiagnosed = false;
2758
2759 while (true) {
2760 // Skip over all non-interesting characters until we find end of buffer or a
2761 // (probably ending) '/' character.
2762 if (CurPtr + 24 < BufferEnd &&
20
Assuming the condition is true
2763 // If there is a code-completion point avoid the fast scan because it
2764 // doesn't check for '\0'.
2765 !(PP && PP->getCodeCompletionFileLoc() == FileLoc)) {
21
Assuming field 'PP' is null
2766 // While not aligned to a 16-byte boundary.
2767 while (C != '/' && (intptr_t)CurPtr % 16 != 0) {
22
Loop condition is true. Entering loop body
24
Assuming the condition is true
25
Loop condition is true. Entering loop body
2768 if (!isASCII(C))
23
Taking false branch
26
Taking true branch
2769 goto MultiByteUTF8;
27
Control jumps to line 2845
2770 C = *CurPtr++;
2771 }
2772 if (C == '/') goto FoundSlash;
2773
2774#ifdef __SSE2__1
2775 __m128i Slashes = _mm_set1_epi8('/');
2776 while (CurPtr + 16 < BufferEnd) {
2777 int Mask = _mm_movemask_epi8(*(const __m128i *)CurPtr);
2778 if (LLVM_UNLIKELY(Mask != 0)__builtin_expect((bool)(Mask != 0), false)) {
2779 goto MultiByteUTF8;
2780 }
2781 // look for slashes
2782 int cmp = _mm_movemask_epi8(_mm_cmpeq_epi8(*(const __m128i*)CurPtr,
2783 Slashes));
2784 if (cmp != 0) {
2785 // Adjust the pointer to point directly after the first slash. It's
2786 // not necessary to set C here, it will be overwritten at the end of
2787 // the outer loop.
2788 CurPtr += llvm::countr_zero<unsigned>(cmp) + 1;
2789 goto FoundSlash;
2790 }
2791 CurPtr += 16;
2792 }
2793#elif __ALTIVEC__
2794 __vector unsigned char LongUTF = {0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
2795 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
2796 0x80, 0x80, 0x80, 0x80};
2797 __vector unsigned char Slashes = {
2798 '/', '/', '/', '/', '/', '/', '/', '/',
2799 '/', '/', '/', '/', '/', '/', '/', '/'
2800 };
2801 while (CurPtr + 16 < BufferEnd) {
2802 if (LLVM_UNLIKELY(__builtin_expect((bool)(vec_any_ge(*(const __vector unsigned char
*)CurPtr, LongUTF)), false)
2803 vec_any_ge(*(const __vector unsigned char *)CurPtr, LongUTF))__builtin_expect((bool)(vec_any_ge(*(const __vector unsigned char
*)CurPtr, LongUTF)), false)
)
2804 goto MultiByteUTF8;
2805 if (vec_any_eq(*(const __vector unsigned char *)CurPtr, Slashes)) {
2806 break;
2807 }
2808 CurPtr += 16;
2809 }
2810
2811#else
2812 while (CurPtr + 16 < BufferEnd) {
2813 bool HasNonASCII = false;
2814 for (unsigned I = 0; I < 16; ++I)
2815 HasNonASCII |= !isASCII(CurPtr[I]);
2816
2817 if (LLVM_UNLIKELY(HasNonASCII)__builtin_expect((bool)(HasNonASCII), false))
2818 goto MultiByteUTF8;
2819
2820 bool HasSlash = false;
2821 for (unsigned I = 0; I < 16; ++I)
2822 HasSlash |= CurPtr[I] == '/';
2823 if (HasSlash)
2824 break;
2825 CurPtr += 16;
2826 }
2827#endif
2828
2829 // It has to be one of the bytes scanned, increment to it and read one.
2830 C = *CurPtr++;
2831 }
2832
2833 // Loop to scan the remainder, warning on invalid UTF-8
2834 // if the corresponding warning is enabled, emitting a diagnostic only once
2835 // per sequence that cannot be decoded.
2836 while (C != '/' && C != '\0') {
2837 if (isASCII(C)) {
2838 UnicodeDecodingAlreadyDiagnosed = false;
2839 C = *CurPtr++;
2840 continue;
2841 }
2842 MultiByteUTF8:
2843 // CurPtr is 1 code unit past C, so to decode
2844 // the codepoint, we need to read from the previous position.
2845 unsigned Length = llvm::getUTF8SequenceSize(
2846 (const llvm::UTF8 *)CurPtr - 1, (const llvm::UTF8 *)BufferEnd);
2847 if (Length == 0) {
28
Assuming 'Length' is equal to 0
2848 if (!UnicodeDecodingAlreadyDiagnosed
28.1
'UnicodeDecodingAlreadyDiagnosed' is false
28.1
'UnicodeDecodingAlreadyDiagnosed' is false
&& !isLexingRawMode())
29
Assuming the condition is true
30
Taking true branch
2849 Diag(CurPtr - 1, diag::warn_invalid_utf8_in_comment);
31
Calling 'Lexer::Diag'
2850 UnicodeDecodingAlreadyDiagnosed = true;
2851 } else {
2852 UnicodeDecodingAlreadyDiagnosed = false;
2853 CurPtr += Length - 1;
2854 }
2855 C = *CurPtr++;
2856 }
2857
2858 if (C == '/') {
2859 FoundSlash:
2860 if (CurPtr[-2] == '*') // We found the final */. We're done!
2861 break;
2862
2863 if ((CurPtr[-2] == '\n' || CurPtr[-2] == '\r')) {
2864 if (isEndOfBlockCommentWithEscapedNewLine(CurPtr - 2, this,
2865 LangOpts.Trigraphs)) {
2866 // We found the final */, though it had an escaped newline between the
2867 // * and /. We're done!
2868 break;
2869 }
2870 }
2871 if (CurPtr[0] == '*' && CurPtr[1] != '/') {
2872 // If this is a /* inside of the comment, emit a warning. Don't do this
2873 // if this is a /*/, which will end the comment. This misses cases with
2874 // embedded escaped newlines, but oh well.
2875 if (!isLexingRawMode())
2876 Diag(CurPtr-1, diag::warn_nested_block_comment);
2877 }
2878 } else if (C == 0 && CurPtr == BufferEnd+1) {
2879 if (!isLexingRawMode())
2880 Diag(BufferPtr, diag::err_unterminated_block_comment);
2881 // Note: the user probably forgot a */. We could continue immediately
2882 // after the /*, but this would involve lexing a lot of what really is the
2883 // comment, which surely would confuse the parser.
2884 --CurPtr;
2885
2886 // KeepWhitespaceMode should return this broken comment as a token. Since
2887 // it isn't a well formed comment, just return it as an 'unknown' token.
2888 if (isKeepWhitespaceMode()) {
2889 FormTokenWithChars(Result, CurPtr, tok::unknown);
2890 return true;
2891 }
2892
2893 BufferPtr = CurPtr;
2894 return false;
2895 } else if (C == '\0' && isCodeCompletionPoint(CurPtr-1)) {
2896 PP->CodeCompleteNaturalLanguage();
2897 cutOffLexing();
2898 return false;
2899 }
2900
2901 C = *CurPtr++;
2902 }
2903
2904 // Notify comment handlers about the comment unless we're in a #if 0 block.
2905 if (PP && !isLexingRawMode() &&
2906 PP->HandleComment(Result, SourceRange(getSourceLocation(BufferPtr),
2907 getSourceLocation(CurPtr)))) {
2908 BufferPtr = CurPtr;
2909 return true; // A token has to be returned.
2910 }
2911
2912 // If we are returning comments as tokens, return this comment as a token.
2913 if (inKeepCommentMode()) {
2914 FormTokenWithChars(Result, CurPtr, tok::comment);
2915 return true;
2916 }
2917
2918 // It is common for the tokens immediately after a /**/ comment to be
2919 // whitespace. Instead of going through the big switch, handle it
2920 // efficiently now. This is safe even in KeepWhitespaceMode because we would
2921 // have already returned above with the comment as a token.
2922 if (isHorizontalWhitespace(*CurPtr)) {
2923 SkipWhitespace(Result, CurPtr+1, TokAtPhysicalStartOfLine);
2924 return false;
2925 }
2926
2927 // Otherwise, just return so that the next character will be lexed as a token.
2928 BufferPtr = CurPtr;
2929 Result.setFlag(Token::LeadingSpace);
2930 return false;
2931}
2932
2933//===----------------------------------------------------------------------===//
2934// Primary Lexing Entry Points
2935//===----------------------------------------------------------------------===//
2936
2937/// ReadToEndOfLine - Read the rest of the current preprocessor line as an
2938/// uninterpreted string. This switches the lexer out of directive mode.
///
/// Result may be null, in which case the characters are consumed without being
/// stored. The function returns early, without lexing the end-of-directive
/// token, when a code-completion point is found on the line.
2939void Lexer::ReadToEndOfLine(SmallVectorImpl<char> *Result) {
2940 assert(ParsingPreprocessorDirective && ParsingFilename == false &&(static_cast <bool> (ParsingPreprocessorDirective &&
ParsingFilename == false && "Must be in a preprocessing directive!"
) ? void (0) : __assert_fail ("ParsingPreprocessorDirective && ParsingFilename == false && \"Must be in a preprocessing directive!\""
, "clang/lib/Lex/Lexer.cpp", 2941, __extension__ __PRETTY_FUNCTION__
))
2941 "Must be in a preprocessing directive!")(static_cast <bool> (ParsingPreprocessorDirective &&
ParsingFilename == false && "Must be in a preprocessing directive!"
) ? void (0) : __assert_fail ("ParsingPreprocessorDirective && ParsingFilename == false && \"Must be in a preprocessing directive!\""
, "clang/lib/Lex/Lexer.cpp", 2941, __extension__ __PRETTY_FUNCTION__
))
;
 // Scratch token passed to getAndAdvanceChar and used to lex the end of the
 // directive below.
2942 Token Tmp;
2943 Tmp.startToken();
2944
2945 // CurPtr - Cache BufferPtr in an automatic variable.
2946 const char *CurPtr = BufferPtr;
2947 while (true) {
2948 char Char = getAndAdvanceChar(CurPtr, Tmp);
2949 switch (Char) {
2950 default:
2951 if (Result)
2952 Result->push_back(Char);
2953 break;
2954 case 0: // Null.
2955 // Found end of file?
2956 if (CurPtr-1 != BufferEnd) {
2957 if (isCodeCompletionPoint(CurPtr-1)) {
2958 PP->CodeCompleteNaturalLanguage();
2959 cutOffLexing();
2960 return;
2961 }
2962
2963 // Nope, normal character, continue.
2964 if (Result)
2965 Result->push_back(Char);
2966 break;
2967 }
2968 // FALL THROUGH.
2969 [[fallthrough]];
2970 case '\r':
2971 case '\n':
2972 // Okay, we found the end of the line. First, back up past the \0, \r, \n.
2973 assert(CurPtr[-1] == Char && "Trigraphs for newline?")(static_cast <bool> (CurPtr[-1] == Char && "Trigraphs for newline?"
) ? void (0) : __assert_fail ("CurPtr[-1] == Char && \"Trigraphs for newline?\""
, "clang/lib/Lex/Lexer.cpp", 2973, __extension__ __PRETTY_FUNCTION__
))
;
2974 BufferPtr = CurPtr-1;
2975
2976 // Next, lex the character, which should handle the EOD transition.
2977 Lex(Tmp);
 // A code-completion token may come first; report it and lex again to
 // reach the end-of-directive token.
2978 if (Tmp.is(tok::code_completion)) {
2979 if (PP)
2980 PP->CodeCompleteNaturalLanguage();
2981 Lex(Tmp);
2982 }
2983 assert(Tmp.is(tok::eod) && "Unexpected token!")(static_cast <bool> (Tmp.is(tok::eod) && "Unexpected token!"
) ? void (0) : __assert_fail ("Tmp.is(tok::eod) && \"Unexpected token!\""
, "clang/lib/Lex/Lexer.cpp", 2983, __extension__ __PRETTY_FUNCTION__
))
;
2984
2985 // Finally, we're done;
2986 return;
2987 }
2988 }
2989}
2990
2991/// LexEndOfFile - CurPtr points to the end of this file. Handle this
2992/// condition, reporting diagnostics and handling other edge cases as required.
2993/// This returns true if Result contains a token, false if PP.Lex should be
2994/// called again.
///
/// Three cases are handled in order: an open preprocessor directive is closed
/// with a tok::eod token; a raw-mode lexer returns a tok::eof token; otherwise
/// unterminated conditionals and a missing final newline are diagnosed and the
/// result of PP->HandleEndOfFile is returned.
2995bool Lexer::LexEndOfFile(Token &Result, const char *CurPtr) {
2996 // If we hit the end of the file while parsing a preprocessor directive,
2997 // end the preprocessor directive first. The next token returned will
2998 // then be the end of file.
2999 if (ParsingPreprocessorDirective) {
3000 // Done parsing the "line".
3001 ParsingPreprocessorDirective = false;
3002 // Update the location of token as well as BufferPtr.
3003 FormTokenWithChars(Result, CurPtr, tok::eod);
3004
3005 // Restore comment saving mode, in case it was disabled for directive.
3006 if (PP)
3007 resetExtendedTokenMode();
3008 return true; // Have a token.
3009 }
3010
3011 // If we are in raw mode, return this event as an EOF token. Let the caller
3012 // that put us in raw mode handle the event.
3013 if (isLexingRawMode()) {
3014 Result.startToken();
3015 BufferPtr = BufferEnd;
3016 FormTokenWithChars(Result, BufferEnd, tok::eof);
3017 return true;
3018 }
3019
 // NOTE(review): PP is dereferenced without a null check from here on;
 // presumably a lexer without a preprocessor is always in raw mode and has
 // returned above -- TODO confirm.
3020 if (PP->isRecordingPreamble() && PP->isInPrimaryFile()) {
3021 PP->setRecordedPreambleConditionalStack(ConditionalStack);
3022 // If the preamble cuts off the end of a header guard, consider it guarded.
3023 // The guard is valid for the preamble content itself, and for tools the
3024 // most useful answer is "yes, this file has a header guard".
3025 if (!ConditionalStack.empty())
3026 MIOpt.ExitTopLevelConditional();
3027 ConditionalStack.clear();
3028 }
3029
3030 // Issue diagnostics for unterminated #if and missing newline.
3031
3032 // If we are in a #if directive, emit an error.
3033 while (!ConditionalStack.empty()) {
 // The error is suppressed for the file that holds the code-completion point.
3034 if (PP->getCodeCompletionFileLoc() != FileLoc)
3035 PP->Diag(ConditionalStack.back().IfLoc,
3036 diag::err_pp_unterminated_conditional);
3037 ConditionalStack.pop_back();
3038 }
3039
3040 // C99 5.1.1.2p2: If the file is non-empty and didn't end in a newline, issue
3041 // a pedwarn.
3042 if (CurPtr != BufferStart && (CurPtr[-1] != '\n' && CurPtr[-1] != '\r')) {
3043 DiagnosticsEngine &Diags = PP->getDiagnostics();
3044 SourceLocation EndLoc = getSourceLocation(BufferEnd);
3045 unsigned DiagID;
3046
3047 if (LangOpts.CPlusPlus11) {
3048 // C++11 [lex.phases] 2.2 p2
3049 // Prefer the C++98 pedantic compatibility warning over the generic,
3050 // non-extension, user-requested "missing newline at EOF" warning.
3051 if (!Diags.isIgnored(diag::warn_cxx98_compat_no_newline_eof, EndLoc)) {
3052 DiagID = diag::warn_cxx98_compat_no_newline_eof;
3053 } else {
3054 DiagID = diag::warn_no_newline_eof;
3055 }
3056 } else {
3057 DiagID = diag::ext_no_newline_eof;
3058 }
3059
3060 Diag(BufferEnd, DiagID)
3061 << FixItHint::CreateInsertion(EndLoc, "\n");
3062 }
3063
3064 BufferPtr = CurPtr;
3065
3066 // Finally, let the preprocessor handle this.
3067 return PP->HandleEndOfFile(Result, isPragmaLexer());
3068}
3069
3070/// isNextPPTokenLParen - Return 1 if the next unexpanded token lexed from
3071/// the specified lexer will return a tok::l_paren token, 0 if it is something
3072/// else and 2 if there are no more tokens in the buffer controlled by the
3073/// lexer.
///
/// The lexer state touched by the look-ahead (BufferPtr, directive mode and
/// the start-of-line and leading-space flags) is saved and restored, so the
/// peeked token is not consumed.
3074unsigned Lexer::isNextPPTokenLParen() {
3075 assert(!LexingRawMode && "How can we expand a macro from a skipping buffer?")(static_cast <bool> (!LexingRawMode && "How can we expand a macro from a skipping buffer?"
) ? void (0) : __assert_fail ("!LexingRawMode && \"How can we expand a macro from a skipping buffer?\""
, "clang/lib/Lex/Lexer.cpp", 3075, __extension__ __PRETTY_FUNCTION__
))
;
3076
 // A dependency-directives lexer answers from its token list instead of
 // lexing the buffer.
3077 if (isDependencyDirectivesLexer()) {
3078 if (NextDepDirectiveTokenIndex == DepDirectives.front().Tokens.size())
3079 return 2;
3080 return DepDirectives.front().Tokens[NextDepDirectiveTokenIndex].is(
3081 tok::l_paren);
3082 }
3083
3084 // Switch to 'skipping' mode. This will ensure that we can lex a token
3085 // without emitting diagnostics, disables macro expansion, and will cause EOF
3086 // to return an EOF token instead of popping the include stack.
3087 LexingRawMode = true;
3088
3089 // Save state that can be changed while lexing so that we can restore it.
3090 const char *TmpBufferPtr = BufferPtr;
3091 bool inPPDirectiveMode = ParsingPreprocessorDirective;
3092 bool atStartOfLine = IsAtStartOfLine;
3093 bool atPhysicalStartOfLine = IsAtPhysicalStartOfLine;
3094 bool leadingSpace = HasLeadingSpace;
3095
3096 Token Tok;
3097 Lex(Tok);
3098
3099 // Restore state that may have changed.
3100 BufferPtr = TmpBufferPtr;
3101 ParsingPreprocessorDirective = inPPDirectiveMode;
3102 HasLeadingSpace = leadingSpace;
3103 IsAtStartOfLine = atStartOfLine;
3104 IsAtPhysicalStartOfLine = atPhysicalStartOfLine;
3105
3106 // Restore the lexer back to non-skipping mode.
 // Raw mode is cleared unconditionally; the assertion on entry establishes
 // that it was off when this function was called.
3107 LexingRawMode = false;
3108
3109 if (Tok.is(tok::eof))
3110 return 2;
3111 return Tok.is(tok::l_paren);
3112}
3113
3114/// Find the end of a version control conflict marker.
3115static const char *FindConflictEnd(const char *CurPtr, const char *BufferEnd,
3116 ConflictMarkerKind CMK) {
3117 const char *Terminator = CMK == CMK_Perforce ? "<<<<\n" : ">>>>>>>";
3118 size_t TermLen = CMK == CMK_Perforce ? 5 : 7;
3119 auto RestOfBuffer = StringRef(CurPtr, BufferEnd - CurPtr).substr(TermLen);
3120 size_t Pos = RestOfBuffer.find(Terminator);
3121 while (Pos != StringRef::npos) {
3122 // Must occur at start of line.
3123 if (Pos == 0 ||
3124 (RestOfBuffer[Pos - 1] != '\r' && RestOfBuffer[Pos - 1] != '\n')) {
3125 RestOfBuffer = RestOfBuffer.substr(Pos+TermLen);
3126 Pos = RestOfBuffer.find(Terminator);
3127 continue;
3128 }
3129 return RestOfBuffer.data()+Pos;
3130 }
3131 return nullptr;
3132}
3133
3134/// IsStartOfConflictMarker - If the specified pointer is the start of a version
3135/// control conflict marker like '<<<<<<<', recognize it as such, emit an error
3136/// and recover nicely. This returns true if it is a conflict marker and false
3137/// if not.
3138bool Lexer::IsStartOfConflictMarker(const char *CurPtr) {
3139 // Only a conflict marker if it starts at the beginning of a line.
3140 if (CurPtr != BufferStart &&
3141 CurPtr[-1] != '\n' && CurPtr[-1] != '\r')
3142 return false;
3143
3144 // Check to see if we have <<<<<<< or >>>>.
3145 if (!StringRef(CurPtr, BufferEnd - CurPtr).startswith("<<<<<<<") &&
3146 !StringRef(CurPtr, BufferEnd - CurPtr).startswith(">>>> "))
3147 return false;
3148
3149 // If we have a situation where we don't care about conflict markers, ignore
3150 // it.
3151 if (CurrentConflictMarkerState || isLexingRawMode())
3152 return false;
3153
3154 ConflictMarkerKind Kind = *CurPtr == '<' ? CMK_Normal : CMK_Perforce;
3155
3156 // Check to see if there is an ending marker somewhere in the buffer at the
3157 // start of a line to terminate this conflict marker.
3158 if (FindConflictEnd(CurPtr, BufferEnd, Kind)) {
3159 // We found a match. We are really in a conflict marker.
3160 // Diagnose this, and ignore to the end of line.
3161 Diag(CurPtr, diag::err_conflict_marker);
3162 CurrentConflictMarkerState = Kind;
3163
3164 // Skip ahead to the end of line. We know this exists because the
3165 // end-of-conflict marker starts with \r or \n.
3166 while (*CurPtr != '\r' && *CurPtr != '\n') {
3167 assert(CurPtr != BufferEnd && "Didn't find end of line")(static_cast <bool> (CurPtr != BufferEnd && "Didn't find end of line"
) ? void (0) : __assert_fail ("CurPtr != BufferEnd && \"Didn't find end of line\""
, "clang/lib/Lex/Lexer.cpp", 3167, __extension__ __PRETTY_FUNCTION__
))
;
3168 ++CurPtr;
3169 }
3170 BufferPtr = CurPtr;
3171 return true;
3172 }
3173
3174 // No end of conflict marker found.
3175 return false;
3176}
3177
3178/// HandleEndOfConflictMarker - If this is a '====' or '||||' or '>>>>', or if
3179/// it is '<<<<' and the conflict marker started with a '>>>>' marker, then it
3180/// is the end of a conflict marker. Handle it by ignoring up until the end of
3181/// the line. This returns true if it is a conflict marker and false if not.
3182bool Lexer::HandleEndOfConflictMarker(const char *CurPtr) {
3183 // Only a conflict marker if it starts at the beginning of a line.
3184 if (CurPtr != BufferStart &&
3185 CurPtr[-1] != '\n' && CurPtr[-1] != '\r')
3186 return false;
3187
3188 // If we have a situation where we don't care about conflict markers, ignore
3189 // it.
3190 if (!CurrentConflictMarkerState || isLexingRawMode())
3191 return false;
3192
3193 // Check to see if we have the marker (4 characters in a row).
3194 for (unsigned i = 1; i != 4; ++i)
3195 if (CurPtr[i] != CurPtr[0])
3196 return false;
3197
3198 // If we do have it, search for the end of the conflict marker. This could
3199 // fail if it got skipped with a '#if 0' or something. Note that CurPtr might
3200 // be the end of conflict marker.
3201 if (const char *End = FindConflictEnd(CurPtr, BufferEnd,
3202 CurrentConflictMarkerState)) {
3203 CurPtr = End;
3204
3205 // Skip ahead to the end of line.
3206 while (CurPtr != BufferEnd && *CurPtr != '\r' && *CurPtr != '\n')
3207 ++CurPtr;
3208
3209 BufferPtr = CurPtr;
3210
3211 // No longer in the conflict marker.
3212 CurrentConflictMarkerState = CMK_None;
3213 return true;
3214 }
3215
3216 return false;
3217}
3218
/// Scan [CurPtr, BufferEnd) for the two-character sequence "#>" that closes
/// an editor placeholder.
///
/// \param CurPtr    first character to examine.
/// \param BufferEnd one past the last character that may be examined.
/// \returns a pointer just past the "#>" if found, nullptr otherwise.  An
/// empty or inverted range (CurPtr >= BufferEnd) yields nullptr.
static const char *findPlaceholderEnd(const char *CurPtr,
                                      const char *BufferEnd) {
  // Ordered comparison rather than equality: a start pointer that is already
  // at or past the end must not begin a scan that can never meet its bound.
  if (CurPtr >= BufferEnd)
    return nullptr;

  // A match needs two characters, so the last position at which "#>" can
  // begin is the second-to-last character of the range.
  const char *const LastStart = BufferEnd - 1;
  for (; CurPtr < LastStart; ++CurPtr) {
    if (CurPtr[0] == '#' && CurPtr[1] == '>')
      return CurPtr + 2;
  }
  return nullptr;
}
3230
3231bool Lexer::lexEditorPlaceholder(Token &Result, const char *CurPtr) {
3232 assert(CurPtr[-1] == '<' && CurPtr[0] == '#' && "Not a placeholder!")(static_cast <bool> (CurPtr[-1] == '<' && CurPtr
[0] == '#' && "Not a placeholder!") ? void (0) : __assert_fail
("CurPtr[-1] == '<' && CurPtr[0] == '#' && \"Not a placeholder!\""
, "clang/lib/Lex/Lexer.cpp", 3232, __extension__ __PRETTY_FUNCTION__
))
;
3233 if (!PP || !PP->getPreprocessorOpts().LexEditorPlaceholders || LexingRawMode)
3234 return false;
3235 const char *End = findPlaceholderEnd(CurPtr + 1, BufferEnd);
3236 if (!End)
3237 return false;
3238 const char *Start = CurPtr - 1;
3239 if (!LangOpts.AllowEditorPlaceholders)
3240 Diag(Start, diag::err_placeholder_in_source);
3241 Result.startToken();
3242 FormTokenWithChars(Result, End, tok::raw_identifier);
3243 Result.setRawIdentifierData(Start);
3244 PP->LookUpIdentifierInfo(Result);
3245 Result.setFlag(Token::IsEditorPlaceholder);
3246 BufferPtr = End;
3247 return true;
3248}
3249
3250bool Lexer::isCodeCompletionPoint(const char *CurPtr) const {
3251 if (PP && PP->isCodeCompletionEnabled()) {
3252 SourceLocation Loc = FileLoc.getLocWithOffset(CurPtr-BufferStart);
3253 return Loc == PP->getCodeCompletionLoc();
3254 }
3255
3256 return false;
3257}
3258
3259std::optional<uint32_t> Lexer::tryReadNumericUCN(const char *&StartPtr,
3260 const char *SlashLoc,
3261 Token *Result) {
3262 unsigned CharSize;
3263 char Kind = getCharAndSize(StartPtr, CharSize);
3264 assert((Kind == 'u' || Kind == 'U') && "expected a UCN")(static_cast <bool> ((Kind == 'u' || Kind == 'U') &&
"expected a UCN") ? void (0) : __assert_fail ("(Kind == 'u' || Kind == 'U') && \"expected a UCN\""
, "clang/lib/Lex/Lexer.cpp", 3264, __extension__ __PRETTY_FUNCTION__
))
;
3265
3266 unsigned NumHexDigits;
3267 if (Kind == 'u')
3268 NumHexDigits = 4;
3269 else if (Kind == 'U')
3270 NumHexDigits = 8;
3271
3272 bool Delimited = false;
3273 bool FoundEndDelimiter = false;
3274 unsigned Count = 0;
3275 bool Diagnose = Result && !isLexingRawMode();
3276
3277 if (!LangOpts.CPlusPlus && !LangOpts.C99) {
3278 if (Diagnose)
3279 Diag(SlashLoc, diag::warn_ucn_not_valid_in_c89);
3280 return std::nullopt;
3281 }
3282
3283 const char *CurPtr = StartPtr + CharSize;
3284 const char *KindLoc = &CurPtr[-1];
3285
3286 uint32_t CodePoint = 0;
3287 while (Count != NumHexDigits || Delimited) {
3288 char C = getCharAndSize(CurPtr, CharSize);
3289 if (!Delimited && Count == 0 && C == '{') {
3290 Delimited = true;
3291 CurPtr += CharSize;
3292 continue;
3293 }
3294
3295 if (Delimited && C == '}') {
3296 CurPtr += CharSize;
3297 FoundEndDelimiter = true;
3298 break;
3299 }
3300
3301 unsigned Value = llvm::hexDigitValue(C);
3302 if (Value == -1U) {
3303 if (!Delimited)
3304 break;
3305 if (Diagnose)
3306 Diag(SlashLoc, diag::warn_delimited_ucn_incomplete)
3307 << StringRef(KindLoc, 1);
3308 return std::nullopt;
3309 }
3310
3311 if (CodePoint & 0xF000'0000) {
3312 if (Diagnose)
3313 Diag(KindLoc, diag::err_escape_too_large) << 0;
3314 return std::nullopt;
3315 }
3316
3317 CodePoint <<= 4;
3318 CodePoint |= Value;
3319 CurPtr += CharSize;
3320 Count++;
3321 }
3322
3323 if (Count == 0) {
3324 if (Diagnose)
3325 Diag(SlashLoc, FoundEndDelimiter ? diag::warn_delimited_ucn_empty
3326 : diag::warn_ucn_escape_no_digits)
3327 << StringRef(KindLoc, 1);
3328 return std::nullopt;
3329 }
3330
3331 if (Delimited && Kind == 'U') {
3332 if (Diagnose)
3333 Diag(SlashLoc, diag::err_hex_escape_no_digits) << StringRef(KindLoc, 1);
3334 return std::nullopt;
3335 }
3336
3337 if (!Delimited && Count != NumHexDigits) {
3338 if (Diagnose) {
3339 Diag(SlashLoc, diag::warn_ucn_escape_incomplete);
3340 // If the user wrote \U1234, suggest a fixit to \u.
3341 if (Count == 4 && NumHexDigits == 8) {
3342 CharSourceRange URange = makeCharRange(*this, KindLoc, KindLoc + 1);
3343 Diag(KindLoc, diag::note_ucn_four_not_eight)
3344 << FixItHint::CreateReplacement(URange, "u");
3345 }
3346 }
3347 return std::nullopt;
3348 }
3349
3350 if (Delimited && PP) {
3351 Diag(SlashLoc, PP->getLangOpts().CPlusPlus23
3352 ? diag::warn_cxx23_delimited_escape_sequence
3353 : diag::ext_delimited_escape_sequence)
3354 << /*delimited*/ 0 << (PP->getLangOpts().CPlusPlus ? 1 : 0);
3355 }
3356
3357 if (Result) {
3358 Result->setFlag(Token::HasUCN);
3359 // If the UCN contains either a trigraph or a line splicing,
3360 // we need to call getAndAdvanceChar again to set the appropriate flags
3361 // on Result.
3362 if (CurPtr - StartPtr == (ptrdiff_t)(Count + 1 + (Delimited ? 2 : 0)))
3363 StartPtr = CurPtr;
3364 else
3365 while (StartPtr != CurPtr)
3366 (void)getAndAdvanceChar(StartPtr, *Result);
3367 } else {
3368 StartPtr = CurPtr;
3369 }
3370 return CodePoint;
3371}
3372
3373std::optional<uint32_t> Lexer::tryReadNamedUCN(const char *&StartPtr,
3374 const char *SlashLoc,
3375 Token *Result) {
3376 unsigned CharSize;
3377 bool Diagnose = Result && !isLexingRawMode();
3378
3379 char C = getCharAndSize(StartPtr, CharSize);
3380 assert(C == 'N' && "expected \\N{...}")(static_cast <bool> (C == 'N' && "expected \\N{...}"
) ? void (0) : __assert_fail ("C == 'N' && \"expected \\\\N{...}\""
, "clang/lib/Lex/Lexer.cpp", 3380, __extension__ __PRETTY_FUNCTION__
))
;
3381
3382 const char *CurPtr = StartPtr + CharSize;
3383 const char *KindLoc = &CurPtr[-1];
3384
3385 C = getCharAndSize(CurPtr, CharSize);
3386 if (C != '{') {
3387 if (Diagnose)
3388 Diag(SlashLoc, diag::warn_ucn_escape_incomplete);
3389 return std::nullopt;
3390 }
3391 CurPtr += CharSize;
3392 const char *StartName = CurPtr;
3393 bool FoundEndDelimiter = false;
3394 llvm::SmallVector<char, 30> Buffer;
3395 while (C) {
3396 C = getCharAndSize(CurPtr, CharSize);
3397 CurPtr += CharSize;
3398 if (C == '}') {
3399 FoundEndDelimiter = true;
3400 break;
3401 }
3402
3403 if (isVerticalWhitespace(C))
3404 break;
3405 Buffer.push_back(C);
3406 }
3407
3408 if (!FoundEndDelimiter || Buffer.empty()) {
3409 if (Diagnose)
3410 Diag(SlashLoc, FoundEndDelimiter ? diag::warn_delimited_ucn_empty
3411 : diag::warn_delimited_ucn_incomplete)
3412 << StringRef(KindLoc, 1);
3413 return std::nullopt;
3414 }
3415
3416 StringRef Name(Buffer.data(), Buffer.size());
3417 std::optional<char32_t> Match =
3418 llvm::sys::unicode::nameToCodepointStrict(Name);
3419 std::optional<llvm::sys::unicode::LooseMatchingResult> LooseMatch;
3420 if (!Match) {
3421 LooseMatch = llvm::sys::unicode::nameToCodepointLooseMatching(Name);
3422 if (Diagnose) {
3423 Diag(StartName, diag::err_invalid_ucn_name)
3424 << StringRef(Buffer.data(), Buffer.size())
3425 << makeCharRange(*this, StartName, CurPtr - CharSize);
3426 if (LooseMatch) {
3427 Diag(StartName, diag::note_invalid_ucn_name_loose_matching)
3428 << FixItHint::CreateReplacement(
3429 makeCharRange(*this, StartName, CurPtr - CharSize),
3430 LooseMatch->Name);
3431 }
3432 }
3433 // We do not offer misspelled character names suggestions here
3434 // as the set of what would be a valid suggestion depends on context,
3435 // and we should not make invalid suggestions.
3436 }
3437
3438 if (Diagnose && Match)
3439 Diag(SlashLoc, PP->getLangOpts().CPlusPlus23
3440 ? diag::warn_cxx23_delimited_escape_sequence
3441 : diag::ext_delimited_escape_sequence)
3442 << /*named*/ 1 << (PP->getLangOpts().CPlusPlus ? 1 : 0);
3443
3444 // If no diagnostic has been emitted yet, likely because we are doing a
3445 // tentative lexing, we do not want to recover here to make sure the token
3446 // will not be incorrectly considered valid. This function will be called
3447 // again and a diagnostic emitted then.
3448 if (LooseMatch && Diagnose)
3449 Match = LooseMatch->CodePoint;
3450
3451 if (Result) {
3452 Result->setFlag(Token::HasUCN);
3453 // If the UCN contains either a trigraph or a line splicing,
3454 // we need to call getAndAdvanceChar again to set the appropriate flags
3455 // on Result.
3456 if (CurPtr - StartPtr == (ptrdiff_t)(Buffer.size() + 3))
3457 StartPtr = CurPtr;
3458 else
3459 while (StartPtr != CurPtr)
3460 (void)getAndAdvanceChar(StartPtr, *Result);
3461 } else {
3462 StartPtr = CurPtr;
3463 }
3464 return Match ? std::optional<uint32_t>(*Match) : std::nullopt;
3465}
3466
3467uint32_t Lexer::tryReadUCN(const char *&StartPtr, const char *SlashLoc,
3468 Token *Result) {
3469
3470 unsigned CharSize;
3471 std::optional<uint32_t> CodePointOpt;
3472 char Kind = getCharAndSize(StartPtr, CharSize);
3473 if (Kind == 'u' || Kind == 'U')
3474 CodePointOpt = tryReadNumericUCN(StartPtr, SlashLoc, Result);
3475 else if (Kind == 'N')
3476 CodePointOpt = tryReadNamedUCN(StartPtr, SlashLoc, Result);
3477
3478 if (!CodePointOpt)
3479 return 0;
3480
3481 uint32_t CodePoint = *CodePointOpt;
3482
3483 // Don't apply C family restrictions to UCNs in assembly mode
3484 if (LangOpts.AsmPreprocessor)
3485 return CodePoint;
3486
3487 // C99 6.4.3p2: A universal character name shall not specify a character whose
3488 // short identifier is less than 00A0 other than 0024 ($), 0040 (@), or
3489 // 0060 (`), nor one in the range D800 through DFFF inclusive.)
3490 // C++11 [lex.charset]p2: If the hexadecimal value for a
3491 // universal-character-name corresponds to a surrogate code point (in the
3492 // range 0xD800-0xDFFF, inclusive), the program is ill-formed. Additionally,
3493 // if the hexadecimal value for a universal-character-name outside the
3494 // c-char-sequence, s-char-sequence, or r-char-sequence of a character or
3495 // string literal corresponds to a control character (in either of the
3496 // ranges 0x00-0x1F or 0x7F-0x9F, both inclusive) or to a character in the
3497 // basic source character set, the program is ill-formed.
3498 if (CodePoint < 0xA0) {
3499 if (CodePoint == 0x24 || CodePoint == 0x40 || CodePoint == 0x60)
3500 return CodePoint;
3501
3502 // We don't use isLexingRawMode() here because we need to warn about bad
3503 // UCNs even when skipping preprocessing tokens in a #if block.
3504 if (Result && PP) {
3505 if (CodePoint < 0x20 || CodePoint >= 0x7F)
3506 Diag(BufferPtr, diag::err_ucn_control_character);
3507 else {
3508 char C = static_cast<char>(CodePoint);
3509 Diag(BufferPtr, diag::err_ucn_escape_basic_scs) << StringRef(&C, 1);
3510 }
3511 }
3512
3513 return 0;
3514 } else if (CodePoint >= 0xD800 && CodePoint <= 0xDFFF) {
3515 // C++03 allows UCNs representing surrogate characters. C99 and C++11 don't.
3516 // We don't use isLexingRawMode() here because we need to diagnose bad
3517 // UCNs even when skipping preprocessing tokens in a #if block.
3518 if (Result && PP) {
3519 if (LangOpts.CPlusPlus && !LangOpts.CPlusPlus11)
3520 Diag(BufferPtr, diag::warn_ucn_escape_surrogate);
3521 else
3522 Diag(BufferPtr, diag::err_ucn_escape_invalid);
3523 }
3524 return 0;
3525 }
3526
3527 return CodePoint;
3528}
3529
3530bool Lexer::CheckUnicodeWhitespace(Token &Result, uint32_t C,
3531 const char *CurPtr) {
3532 if (!isLexingRawMode() && !PP->isPreprocessedOutput() &&
3533 isUnicodeWhitespace(C)) {
3534 Diag(BufferPtr, diag::ext_unicode_whitespace)
3535 << makeCharRange(*this, BufferPtr, CurPtr);
3536
3537 Result.setFlag(Token::LeadingSpace);
3538 return true;
3539 }
3540 return false;
3541}
3542
3543void Lexer::PropagateLineStartLeadingSpaceInfo(Token &Result) {
3544 IsAtStartOfLine = Result.isAtStartOfLine();
3545 HasLeadingSpace = Result.hasLeadingSpace();
3546 HasLeadingEmptyMacro = Result.hasLeadingEmptyMacro();
3547 // Note that this doesn't affect IsAtPhysicalStartOfLine.
3548}
3549
3550bool Lexer::Lex(Token &Result) {
3551 assert(!isDependencyDirectivesLexer())(static_cast <bool> (!isDependencyDirectivesLexer()) ? void
(0) : __assert_fail ("!isDependencyDirectivesLexer()", "clang/lib/Lex/Lexer.cpp"
, 3551, __extension__ __PRETTY_FUNCTION__))
;
3552
3553 // Start a new token.
3554 Result.startToken();
3555
3556 // Set up misc whitespace flags for LexTokenInternal.
3557 if (IsAtStartOfLine) {
3558 Result.setFlag(Token::StartOfLine);
3559 IsAtStartOfLine = false;
3560 }
3561
3562 if (HasLeadingSpace) {
3563 Result.setFlag(Token::LeadingSpace);
3564 HasLeadingSpace = false;
3565 }
3566
3567 if (HasLeadingEmptyMacro) {
3568 Result.setFlag(Token::LeadingEmptyMacro);
3569 HasLeadingEmptyMacro = false;
3570 }
3571
3572 bool atPhysicalStartOfLine = IsAtPhysicalStartOfLine;
3573 IsAtPhysicalStartOfLine = false;
3574 bool isRawLex = isLexingRawMode();
3575 (void) isRawLex;
3576 bool returnedToken = LexTokenInternal(Result, atPhysicalStartOfLine);
3577 // (After the LexTokenInternal call, the lexer might be destroyed.)
3578 assert((returnedToken || !isRawLex) && "Raw lex must succeed")(static_cast <bool> ((returnedToken || !isRawLex) &&
"Raw lex must succeed") ? void (0) : __assert_fail ("(returnedToken || !isRawLex) && \"Raw lex must succeed\""
, "clang/lib/Lex/Lexer.cpp", 3578, __extension__ __PRETTY_FUNCTION__
))
;
3579 return returnedToken;
3580}
3581
3582/// LexTokenInternal - This implements a simple C family lexer. It is an
3583/// extremely performance critical piece of code. This assumes that the buffer
3584/// has a null character at the end of the file. This returns a preprocessing
3585/// token, not a normal token, as such, it is an internal interface. It assumes
3586/// that the Flags of result have been cleared before calling this.
3587bool Lexer::LexTokenInternal(Token &Result, bool TokAtPhysicalStartOfLine) {
3588LexStart:
3589 assert(!Result.needsCleaning() && "Result needs cleaning")(static_cast <bool> (!Result.needsCleaning() &&
"Result needs cleaning") ? void (0) : __assert_fail ("!Result.needsCleaning() && \"Result needs cleaning\""
, "clang/lib/Lex/Lexer.cpp", 3589, __extension__ __PRETTY_FUNCTION__
))
;
1
'?' condition is true
3590 assert(!Result.hasPtrData() && "Result has not been reset")(static_cast <bool> (!Result.hasPtrData() && "Result has not been reset"
) ? void (0) : __assert_fail ("!Result.hasPtrData() && \"Result has not been reset\""
, "clang/lib/Lex/Lexer.cpp", 3590, __extension__ __PRETTY_FUNCTION__
))
;
2
'?' condition is true
3591
3592 // CurPtr - Cache BufferPtr in an automatic variable.
3593 const char *CurPtr = BufferPtr;
3594
3595 // Small amounts of horizontal whitespace is very common between tokens.
3596 if (isHorizontalWhitespace(*CurPtr)) {
3
Assuming the condition is false
4
Taking false branch
3597 do {
3598 ++CurPtr;
3599 } while (isHorizontalWhitespace(*CurPtr));
3600
3601 // If we are keeping whitespace and other tokens, just return what we just
3602 // skipped. The next lexer invocation will return the token after the
3603 // whitespace.
3604 if (isKeepWhitespaceMode()) {
3605 FormTokenWithChars(Result, CurPtr, tok::unknown);
3606 // FIXME: The next token will not have LeadingSpace set.
3607 return true;
3608 }
3609
3610 BufferPtr = CurPtr;
3611 Result.setFlag(Token::LeadingSpace);
3612 }
3613
3614 unsigned SizeTmp, SizeTmp2; // Temporaries for use in cases below.
3615
3616 // Read a character, advancing over it.
3617 char Char = getAndAdvanceChar(CurPtr, Result);
3618 tok::TokenKind Kind;
3619
3620 if (!isVerticalWhitespace(Char))
5
Assuming the condition is false
6
Taking false branch
3621 NewLinePtr = nullptr;
3622
3623 switch (Char) {
7
Control jumps to 'case 47:' at line 4005
3624 case 0: // Null.
3625 // Found end of file?
3626 if (CurPtr-1 == BufferEnd)
3627 return LexEndOfFile(Result, CurPtr-1);
3628
3629 // Check if we are performing code completion.
3630 if (isCodeCompletionPoint(CurPtr-1)) {
3631 // Return the code-completion token.
3632 Result.startToken();
3633 FormTokenWithChars(Result, CurPtr, tok::code_completion);
3634 return true;
3635 }
3636
3637 if (!isLexingRawMode())
3638 Diag(CurPtr-1, diag::null_in_file);
3639 Result.setFlag(Token::LeadingSpace);
3640 if (SkipWhitespace(Result, CurPtr, TokAtPhysicalStartOfLine))
3641 return true; // KeepWhitespaceMode
3642
3643 // We know the lexer hasn't changed, so just try again with this lexer.
3644 // (We manually eliminate the tail call to avoid recursion.)
3645 goto LexNextToken;
3646
3647 case 26: // DOS & CP/M EOF: "^Z".
3648 // If we're in Microsoft extensions mode, treat this as end of file.
3649 if (LangOpts.MicrosoftExt) {
3650 if (!isLexingRawMode())
3651 Diag(CurPtr-1, diag::ext_ctrl_z_eof_microsoft);
3652 return LexEndOfFile(Result, CurPtr-1);
3653 }
3654
3655 // If Microsoft extensions are disabled, this is just random garbage.
3656 Kind = tok::unknown;
3657 break;
3658
3659 case '\r':
3660 if (CurPtr[0] == '\n')
3661 (void)getAndAdvanceChar(CurPtr, Result);
3662 [[fallthrough]];
3663 case '\n':
3664 // If we are inside a preprocessor directive and we see the end of line,
3665 // we know we are done with the directive, so return an EOD token.
3666 if (ParsingPreprocessorDirective) {
3667 // Done parsing the "line".
3668 ParsingPreprocessorDirective = false;
3669
3670 // Restore comment saving mode, in case it was disabled for directive.
3671 if (PP)
3672 resetExtendedTokenMode();
3673
3674 // Since we consumed a newline, we are back at the start of a line.
3675 IsAtStartOfLine = true;
3676 IsAtPhysicalStartOfLine = true;
3677 NewLinePtr = CurPtr - 1;
3678
3679 Kind = tok::eod;
3680 break;
3681 }
3682
3683 // No leading whitespace seen so far.
3684 Result.clearFlag(Token::LeadingSpace);
3685
3686 if (SkipWhitespace(Result, CurPtr, TokAtPhysicalStartOfLine))
3687 return true; // KeepWhitespaceMode
3688
3689 // We only saw whitespace, so just try again with this lexer.
3690 // (We manually eliminate the tail call to avoid recursion.)
3691 goto LexNextToken;
3692 case ' ':
3693 case '\t':
3694 case '\f':
3695 case '\v':
3696 SkipHorizontalWhitespace:
3697 Result.setFlag(Token::LeadingSpace);
3698 if (SkipWhitespace(Result, CurPtr, TokAtPhysicalStartOfLine))
3699 return true; // KeepWhitespaceMode
3700
3701 SkipIgnoredUnits:
3702 CurPtr = BufferPtr;
3703
3704 // If the next token is obviously a // or /* */ comment, skip it efficiently
3705 // too (without going through the big switch stmt).
3706 if (CurPtr[0] == '/' && CurPtr[1] == '/' && !inKeepCommentMode() &&
3707 LineComment && (LangOpts.CPlusPlus || !LangOpts.TraditionalCPP)) {
3708 if (SkipLineComment(Result, CurPtr+2, TokAtPhysicalStartOfLine))
3709 return true; // There is a token to return.
3710 goto SkipIgnoredUnits;
3711 } else if (CurPtr[0] == '/' && CurPtr[1] == '*' && !inKeepCommentMode()) {
3712 if (SkipBlockComment(Result, CurPtr+2, TokAtPhysicalStartOfLine))
3713 return true; // There is a token to return.
3714 goto SkipIgnoredUnits;
3715 } else if (isHorizontalWhitespace(*CurPtr)) {
3716 goto SkipHorizontalWhitespace;
3717 }
3718 // We only saw whitespace, so just try again with this lexer.
3719 // (We manually eliminate the tail call to avoid recursion.)
3720 goto LexNextToken;
3721
3722 // C99 6.4.4.1: Integer Constants.
3723 // C99 6.4.4.2: Floating Constants.
3724 case '0': case '1': case '2': case '3': case '4':
3725 case '5': case '6': case '7': case '8': case '9':
3726 // Notify MIOpt that we read a non-whitespace/non-comment token.
3727 MIOpt.ReadToken();
3728 return LexNumericConstant(Result, CurPtr);
3729
3730 // Identifier (e.g., uber), or
3731 // UTF-8 (C2x/C++17) or UTF-16 (C11/C++11) character literal, or
3732 // UTF-8 or UTF-16 string literal (C11/C++11).
3733 case 'u':
3734 // Notify MIOpt that we read a non-whitespace/non-comment token.
3735 MIOpt.ReadToken();
3736
3737 if (LangOpts.CPlusPlus11 || LangOpts.C11) {
3738 Char = getCharAndSize(CurPtr, SizeTmp);
3739
3740 // UTF-16 string literal
3741 if (Char == '"')
3742 return LexStringLiteral(Result, ConsumeChar(CurPtr, SizeTmp, Result),
3743 tok::utf16_string_literal);
3744
3745 // UTF-16 character constant
3746 if (Char == '\'')
3747 return LexCharConstant(Result, ConsumeChar(CurPtr, SizeTmp, Result),
3748 tok::utf16_char_constant);
3749
3750 // UTF-16 raw string literal
3751 if (Char == 'R' && LangOpts.CPlusPlus11 &&
3752 getCharAndSize(CurPtr + SizeTmp, SizeTmp2) == '"')
3753 return LexRawStringLiteral(Result,
3754 ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
3755 SizeTmp2, Result),
3756 tok::utf16_string_literal);
3757
3758 if (Char == '8') {
3759 char Char2 = getCharAndSize(CurPtr + SizeTmp, SizeTmp2);
3760
3761 // UTF-8 string literal
3762 if (Char2 == '"')
3763 return LexStringLiteral(Result,
3764 ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
3765 SizeTmp2, Result),
3766 tok::utf8_string_literal);
3767 if (Char2 == '\'' && (LangOpts.CPlusPlus17 || LangOpts.C2x))
3768 return LexCharConstant(
3769 Result, ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
3770 SizeTmp2, Result),
3771 tok::utf8_char_constant);
3772
3773 if (Char2 == 'R' && LangOpts.CPlusPlus11) {
3774 unsigned SizeTmp3;
3775 char Char3 = getCharAndSize(CurPtr + SizeTmp + SizeTmp2, SizeTmp3);
3776 // UTF-8 raw string literal
3777 if (Char3 == '"') {
3778 return LexRawStringLiteral(Result,
3779 ConsumeChar(ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
3780 SizeTmp2, Result),
3781 SizeTmp3, Result),
3782 tok::utf8_string_literal);
3783 }
3784 }
3785 }
3786 }
3787
3788 // treat u like the start of an identifier.
3789 return LexIdentifierContinue(Result, CurPtr);
3790
3791 case 'U': // Identifier (e.g. Uber) or C11/C++11 UTF-32 string literal
3792 // Notify MIOpt that we read a non-whitespace/non-comment token.
3793 MIOpt.ReadToken();
3794
3795 if (LangOpts.CPlusPlus11 || LangOpts.C11) {
3796 Char = getCharAndSize(CurPtr, SizeTmp);
3797
3798 // UTF-32 string literal
3799 if (Char == '"')
3800 return LexStringLiteral(Result, ConsumeChar(CurPtr, SizeTmp, Result),
3801 tok::utf32_string_literal);
3802
3803 // UTF-32 character constant
3804 if (Char == '\'')
3805 return LexCharConstant(Result, ConsumeChar(CurPtr, SizeTmp, Result),
3806 tok::utf32_char_constant);
3807
3808 // UTF-32 raw string literal
3809 if (Char == 'R' && LangOpts.CPlusPlus11 &&
3810 getCharAndSize(CurPtr + SizeTmp, SizeTmp2) == '"')
3811 return LexRawStringLiteral(Result,
3812 ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
3813 SizeTmp2, Result),
3814 tok::utf32_string_literal);
3815 }
3816
3817 // treat U like the start of an identifier.
3818 return LexIdentifierContinue(Result, CurPtr);
3819
3820 case 'R': // Identifier or C++0x raw string literal
3821 // Notify MIOpt that we read a non-whitespace/non-comment token.
3822 MIOpt.ReadToken();
3823
3824 if (LangOpts.CPlusPlus11) {
3825 Char = getCharAndSize(CurPtr, SizeTmp);
3826
3827 if (Char == '"')
3828 return LexRawStringLiteral(Result,
3829 ConsumeChar(CurPtr, SizeTmp, Result),
3830 tok::string_literal);
3831 }
3832
3833 // treat R like the start of an identifier.
3834 return LexIdentifierContinue(Result, CurPtr);
3835
3836 case 'L': // Identifier (Loony) or wide literal (L'x' or L"xyz").
3837 // Notify MIOpt that we read a non-whitespace/non-comment token.
3838 MIOpt.ReadToken();
3839 Char = getCharAndSize(CurPtr, SizeTmp);
3840
3841 // Wide string literal.
3842 if (Char == '"')
3843 return LexStringLiteral(Result, ConsumeChar(CurPtr, SizeTmp, Result),
3844 tok::wide_string_literal);
3845
3846 // Wide raw string literal.
3847 if (LangOpts.CPlusPlus11 && Char == 'R' &&
3848 getCharAndSize(CurPtr + SizeTmp, SizeTmp2) == '"')
3849 return LexRawStringLiteral(Result,
3850 ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
3851 SizeTmp2, Result),
3852 tok::wide_string_literal);
3853
3854 // Wide character constant.
3855 if (Char == '\'')
3856 return LexCharConstant(Result, ConsumeChar(CurPtr, SizeTmp, Result),
3857 tok::wide_char_constant);
3858 // FALL THROUGH, treating L like the start of an identifier.
3859 [[fallthrough]];
3860
3861 // C99 6.4.2: Identifiers.
3862 case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': case 'G':
3863 case 'H': case 'I': case 'J': case 'K': /*'L'*/case 'M': case 'N':
3864 case 'O': case 'P': case 'Q': /*'R'*/case 'S': case 'T': /*'U'*/
3865 case 'V': case 'W': case 'X': case 'Y': case 'Z':
3866 case 'a': case 'b': case 'c': case 'd': case 'e': case 'f': case 'g':
3867 case 'h': case 'i': case 'j': case 'k': case 'l': case 'm': case 'n':
3868 case 'o': case 'p': case 'q': case 'r': case 's': case 't': /*'u'*/
3869 case 'v': case 'w': case 'x': case 'y': case 'z':
3870 case '_':
3871 // Notify MIOpt that we read a non-whitespace/non-comment token.
3872 MIOpt.ReadToken();
3873 return LexIdentifierContinue(Result, CurPtr);
3874
3875 case '$': // $ in identifiers.
3876 if (LangOpts.DollarIdents) {
3877 if (!isLexingRawMode())
3878 Diag(CurPtr-1, diag::ext_dollar_in_identifier);
3879 // Notify MIOpt that we read a non-whitespace/non-comment token.
3880 MIOpt.ReadToken();
3881 return LexIdentifierContinue(Result, CurPtr);
3882 }
3883
3884 Kind = tok::unknown;
3885 break;
3886
3887 // C99 6.4.4: Character Constants.
3888 case '\'':
3889 // Notify MIOpt that we read a non-whitespace/non-comment token.
3890 MIOpt.ReadToken();
3891 return LexCharConstant(Result, CurPtr, tok::char_constant);
3892
3893 // C99 6.4.5: String Literals.
3894 case '"':
3895 // Notify MIOpt that we read a non-whitespace/non-comment token.
3896 MIOpt.ReadToken();
3897 return LexStringLiteral(Result, CurPtr,
3898 ParsingFilename ? tok::header_name
3899 : tok::string_literal);
3900
3901 // C99 6.4.6: Punctuators.
3902 case '?':
3903 Kind = tok::question;
3904 break;
3905 case '[':
3906 Kind = tok::l_square;
3907 break;
3908 case ']':
3909 Kind = tok::r_square;
3910 break;
3911 case '(':
3912 Kind = tok::l_paren;
3913 break;
3914 case ')':
3915 Kind = tok::r_paren;
3916 break;
3917 case '{':
3918 Kind = tok::l_brace;
3919 break;
3920 case '}':
3921 Kind = tok::r_brace;
3922 break;
3923 case '.':
3924 Char = getCharAndSize(CurPtr, SizeTmp);
3925 if (Char >= '0' && Char <= '9') {
3926 // Notify MIOpt that we read a non-whitespace/non-comment token.
3927 MIOpt.ReadToken();
3928
3929 return LexNumericConstant(Result, ConsumeChar(CurPtr, SizeTmp, Result));
3930 } else if (LangOpts.CPlusPlus && Char == '*') {
3931 Kind = tok::periodstar;
3932 CurPtr += SizeTmp;
3933 } else if (Char == '.' &&
3934 getCharAndSize(CurPtr+SizeTmp, SizeTmp2) == '.') {
3935 Kind = tok::ellipsis;
3936 CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
3937 SizeTmp2, Result);
3938 } else {
3939 Kind = tok::period;
3940 }
3941 break;
3942 case '&':
3943 Char = getCharAndSize(CurPtr, SizeTmp);
3944 if (Char == '&') {
3945 Kind = tok::ampamp;
3946 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
3947 } else if (Char == '=') {
3948 Kind = tok::ampequal;
3949 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
3950 } else {
3951 Kind = tok::amp;
3952 }
3953 break;
3954 case '*':
3955 if (getCharAndSize(CurPtr, SizeTmp) == '=') {
3956 Kind = tok::starequal;
3957 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
3958 } else {
3959 Kind = tok::star;
3960 }
3961 break;
3962 case '+':
3963 Char = getCharAndSize(CurPtr, SizeTmp);
3964 if (Char == '+') {
3965 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
3966 Kind = tok::plusplus;
3967 } else if (Char == '=') {
3968 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
3969 Kind = tok::plusequal;
3970 } else {
3971 Kind = tok::plus;
3972 }
3973 break;
3974 case '-':
3975 Char = getCharAndSize(CurPtr, SizeTmp);
3976 if (Char == '-') { // --
3977 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
3978 Kind = tok::minusminus;
3979 } else if (Char == '>' && LangOpts.CPlusPlus &&
3980 getCharAndSize(CurPtr+SizeTmp, SizeTmp2) == '*') { // C++ ->*
3981 CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
3982 SizeTmp2, Result);
3983 Kind = tok::arrowstar;
3984 } else if (Char == '>') { // ->
3985 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
3986 Kind = tok::arrow;
3987 } else if (Char == '=') { // -=
3988 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
3989 Kind = tok::minusequal;
3990 } else {
3991 Kind = tok::minus;
3992 }
3993 break;
3994 case '~':
3995 Kind = tok::tilde;
3996 break;
3997 case '!':
3998 if (getCharAndSize(CurPtr, SizeTmp) == '=') {
3999 Kind = tok::exclaimequal;
4000 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
4001 } else {
4002 Kind = tok::exclaim;
4003 }
4004 break;
4005 case '/':
4006 // 6.4.9: Comments
4007 Char = getCharAndSize(CurPtr, SizeTmp);
4008 if (Char == '/') { // Line comment.
8
Assuming the condition is false
9
Taking false branch
4009 // Even if Line comments are disabled (e.g. in C89 mode), we generally
4010 // want to lex this as a comment. There is one problem with this though,
4011 // that in one particular corner case, this can change the behavior of the
4012 // resultant program. For example, In "foo //**/ bar", C89 would lex
4013 // this as "foo / bar" and languages with Line comments would lex it as
4014 // "foo". Check to see if the character after the second slash is a '*'.
4015 // If so, we will lex that as a "/" instead of the start of a comment.
4016 // However, we never do this if we are just preprocessing.
4017 bool TreatAsComment =
4018 LineComment && (LangOpts.CPlusPlus || !LangOpts.TraditionalCPP);
4019 if (!TreatAsComment)
4020 if (!(PP && PP->isPreprocessedOutput()))
4021 TreatAsComment = getCharAndSize(CurPtr+SizeTmp, SizeTmp2) != '*';
4022
4023 if (TreatAsComment) {
4024 if (SkipLineComment(Result, ConsumeChar(CurPtr, SizeTmp, Result),
4025 TokAtPhysicalStartOfLine))
4026 return true; // There is a token to return.
4027
4028 // It is common for the tokens immediately after a // comment to be
4029 // whitespace (indentation for the next line). Instead of going through
4030 // the big switch, handle it efficiently now.
4031 goto SkipIgnoredUnits;
4032 }
4033 }
4034
4035 if (Char == '*') { // /**/ comment.
10
Assuming the condition is true
11
Taking true branch
4036 if (SkipBlockComment(Result, ConsumeChar(CurPtr, SizeTmp, Result),
12
Calling 'Lexer::SkipBlockComment'
4037 TokAtPhysicalStartOfLine))
4038 return true; // There is a token to return.
4039
4040 // We only saw whitespace, so just try again with this lexer.
4041 // (We manually eliminate the tail call to avoid recursion.)
4042 goto LexNextToken;
4043 }
4044
4045 if (Char == '=') {
4046 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
4047 Kind = tok::slashequal;
4048 } else {
4049 Kind = tok::slash;
4050 }
4051 break;
4052 case '%':
4053 Char = getCharAndSize(CurPtr, SizeTmp);
4054 if (Char == '=') {
4055 Kind = tok::percentequal;
4056 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
4057 } else if (LangOpts.Digraphs && Char == '>') {
4058 Kind = tok::r_brace; // '%>' -> '}'
4059 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
4060 } else if (LangOpts.Digraphs && Char == ':') {
4061 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
4062 Char = getCharAndSize(CurPtr, SizeTmp);
4063 if (Char == '%' && getCharAndSize(CurPtr+SizeTmp, SizeTmp2) == ':') {
4064 Kind = tok::hashhash; // '%:%:' -> '##'
4065 CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
4066 SizeTmp2, Result);
4067 } else if (Char == '@' && LangOpts.MicrosoftExt) {// %:@ -> #@ -> Charize
4068 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
4069 if (!isLexingRawMode())
4070 Diag(BufferPtr, diag::ext_charize_microsoft);
4071 Kind = tok::hashat;
4072 } else { // '%:' -> '#'
4073 // We parsed a # character. If this occurs at the start of the line,
4074 // it's actually the start of a preprocessing directive. Callback to
4075 // the preprocessor to handle it.
4076 // TODO: -fpreprocessed mode??
4077 if (TokAtPhysicalStartOfLine && !LexingRawMode && !Is_PragmaLexer)
4078 goto HandleDirective;
4079
4080 Kind = tok::hash;
4081 }
4082 } else {
4083 Kind = tok::percent;
4084 }
4085 break;
4086 case '<':
4087 Char = getCharAndSize(CurPtr, SizeTmp);
4088 if (ParsingFilename) {
4089 return LexAngledStringLiteral(Result, CurPtr);
4090 } else if (Char == '<') {
4091 char After = getCharAndSize(CurPtr+SizeTmp, SizeTmp2);
4092 if (After == '=') {
4093 Kind = tok::lesslessequal;
4094 CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
4095 SizeTmp2, Result);
4096 } else if (After == '<' && IsStartOfConflictMarker(CurPtr-1)) {
4097 // If this is actually a '<<<<<<<' version control conflict marker,
4098 // recognize it as such and recover nicely.
4099 goto LexNextToken;
4100 } else if (After == '<' && HandleEndOfConflictMarker(CurPtr-1)) {
4101 // If this is '<<<<' and we're in a Perforce-style conflict marker,
4102 // ignore it.
4103 goto LexNextToken;
4104 } else if (LangOpts.CUDA && After == '<') {
4105 Kind = tok::lesslessless;
4106 CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
4107 SizeTmp2, Result);
4108 } else {
4109 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
4110 Kind = tok::lessless;
4111 }
4112 } else if (Char == '=') {
4113 char After = getCharAndSize(CurPtr+SizeTmp, SizeTmp2);
4114 if (After == '>') {
4115 if (LangOpts.CPlusPlus20) {
4116 if (!isLexingRawMode())
4117 Diag(BufferPtr, diag::warn_cxx17_compat_spaceship);
4118 CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
4119 SizeTmp2, Result);
4120 Kind = tok::spaceship;
4121 break;
4122 }
4123 // Suggest adding a space between the '<=' and the '>' to avoid a
4124 // change in semantics if this turns up in C++ <=17 mode.
4125 if (LangOpts.CPlusPlus && !isLexingRawMode()) {
4126 Diag(BufferPtr, diag::warn_cxx20_compat_spaceship)
4127 << FixItHint::CreateInsertion(
4128 getSourceLocation(CurPtr + SizeTmp, SizeTmp2), " ");
4129 }
4130 }
4131 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
4132 Kind = tok::lessequal;
4133 } else if (LangOpts.Digraphs && Char == ':') { // '<:' -> '['
4134 if (LangOpts.CPlusPlus11 &&
4135 getCharAndSize(CurPtr + SizeTmp, SizeTmp2) == ':') {
4136 // C++0x [lex.pptoken]p3:
4137 // Otherwise, if the next three characters are <:: and the subsequent
4138 // character is neither : nor >, the < is treated as a preprocessor
4139 // token by itself and not as the first character of the alternative
4140 // token <:.
4141 unsigned SizeTmp3;
4142 char After = getCharAndSize(CurPtr + SizeTmp + SizeTmp2, SizeTmp3);
4143 if (After != ':' && After != '>') {
4144 Kind = tok::less;
4145 if (!isLexingRawMode())
4146 Diag(BufferPtr, diag::warn_cxx98_compat_less_colon_colon);
4147 break;
4148 }
4149 }
4150
4151 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
4152 Kind = tok::l_square;
4153 } else if (LangOpts.Digraphs && Char == '%') { // '<%' -> '{'
4154 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
4155 Kind = tok::l_brace;
4156 } else if (Char == '#' && /*Not a trigraph*/ SizeTmp == 1 &&
4157 lexEditorPlaceholder(Result, CurPtr)) {
4158 return true;
4159 } else {
4160 Kind = tok::less;
4161 }
4162 break;
4163 case '>':
4164 Char = getCharAndSize(CurPtr, SizeTmp);
4165 if (Char == '=') {
4166 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
4167 Kind = tok::greaterequal;
4168 } else if (Char == '>') {
4169 char After = getCharAndSize(CurPtr+SizeTmp, SizeTmp2);
4170 if (After == '=') {
4171 CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
4172 SizeTmp2, Result);
4173 Kind = tok::greatergreaterequal;
4174 } else if (After == '>' && IsStartOfConflictMarker(CurPtr-1)) {
4175 // If this is actually a '>>>>' conflict marker, recognize it as such
4176 // and recover nicely.
4177 goto LexNextToken;
4178 } else if (After == '>' && HandleEndOfConflictMarker(CurPtr-1)) {
4179 // If this is '>>>>>>>' and we're in a conflict marker, ignore it.
4180 goto LexNextToken;
4181 } else if (LangOpts.CUDA && After == '>') {
4182 Kind = tok::greatergreatergreater;
4183 CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
4184 SizeTmp2, Result);
4185 } else {
4186 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
4187 Kind = tok::greatergreater;
4188 }
4189 } else {
4190 Kind = tok::greater;
4191 }
4192 break;
4193 case '^':
4194 Char = getCharAndSize(CurPtr, SizeTmp);
4195 if (Char == '=') {
4196 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
4197 Kind = tok::caretequal;
4198 } else if (LangOpts.OpenCL && Char == '^') {
4199 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
4200 Kind = tok::caretcaret;
4201 } else {
4202 Kind = tok::caret;
4203 }
4204 break;
4205 case '|':
4206 Char = getCharAndSize(CurPtr, SizeTmp);
4207 if (Char == '=') {
4208 Kind = tok::pipeequal;
4209 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
4210 } else if (Char == '|') {
4211 // If this is '|||||||' and we're in a conflict marker, ignore it.
4212 if (CurPtr[1] == '|' && HandleEndOfConflictMarker(CurPtr-1))
4213 goto LexNextToken;
4214 Kind = tok::pipepipe;
4215 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
4216 } else {
4217 Kind = tok::pipe;
4218 }
4219 break;
4220 case ':':
4221 Char = getCharAndSize(CurPtr, SizeTmp);
4222 if (LangOpts.Digraphs && Char == '>') {
4223 Kind = tok::r_square; // ':>' -> ']'
4224 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
4225 } else if ((LangOpts.CPlusPlus ||
4226 LangOpts.DoubleSquareBracketAttributes) &&
4227 Char == ':') {
4228 Kind = tok::coloncolon;
4229 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
4230 } else {
4231 Kind = tok::colon;
4232 }
4233 break;
4234 case ';':
4235 Kind = tok::semi;
4236 break;
4237 case '=':
4238 Char = getCharAndSize(CurPtr, SizeTmp);
4239 if (Char == '=') {
4240 // If this is '====' and we're in a conflict marker, ignore it.
4241 if (CurPtr[1] == '=' && HandleEndOfConflictMarker(CurPtr-1))
4242 goto LexNextToken;
4243
4244 Kind = tok::equalequal;
4245 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
4246 } else {
4247 Kind = tok::equal;
4248 }
4249 break;
4250 case ',':
4251 Kind = tok::comma;
4252 break;
4253 case '#':
4254 Char = getCharAndSize(CurPtr, SizeTmp);
4255 if (Char == '#') {
4256 Kind = tok::hashhash;
4257 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
4258 } else if (Char == '@' && LangOpts.MicrosoftExt) { // #@ -> Charize
4259 Kind = tok::hashat;
4260 if (!isLexingRawMode())
4261 Diag(BufferPtr, diag::ext_charize_microsoft);
4262 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
4263 } else {
4264 // We parsed a # character. If this occurs at the start of the line,
4265 // it's actually the start of a preprocessing directive. Callback to
4266 // the preprocessor to handle it.
4267 // TODO: -fpreprocessed mode??
4268 if (TokAtPhysicalStartOfLine && !LexingRawMode && !Is_PragmaLexer)
4269 goto HandleDirective;
4270
4271 Kind = tok::hash;
4272 }
4273 break;
4274
4275 case '@':
4276 // Objective C support.
4277 if (CurPtr[-1] == '@' && LangOpts.ObjC)
4278 Kind = tok::at;
4279 else
4280 Kind = tok::unknown;
4281 break;
4282
4283 // UCNs (C99 6.4.3, C++11 [lex.charset]p2)
4284 case '\\':
4285 if (!LangOpts.AsmPreprocessor) {
4286 if (uint32_t CodePoint = tryReadUCN(CurPtr, BufferPtr, &Result)) {
4287 if (CheckUnicodeWhitespace(Result, CodePoint, CurPtr)) {
4288 if (SkipWhitespace(Result, CurPtr, TokAtPhysicalStartOfLine))
4289 return true; // KeepWhitespaceMode
4290
4291 // We only saw whitespace, so just try again with this lexer.
4292 // (We manually eliminate the tail call to avoid recursion.)
4293 goto LexNextToken;
4294 }
4295
4296 return LexUnicodeIdentifierStart(Result, CodePoint, CurPtr);
4297 }
4298 }
4299
4300 Kind = tok::unknown;
4301 break;
4302
4303 default: {
4304 if (isASCII(Char)) {
4305 Kind = tok::unknown;
4306 break;
4307 }
4308
4309 llvm::UTF32 CodePoint;
4310
4311 // We can't just reset CurPtr to BufferPtr because BufferPtr may point to
4312 // an escaped newline.
4313 --CurPtr;
4314 llvm::ConversionResult Status =
4315 llvm::convertUTF8Sequence((const llvm::UTF8 **)&CurPtr,
4316 (const llvm::UTF8 *)BufferEnd,
4317 &CodePoint,
4318 llvm::strictConversion);
4319 if (Status == llvm::conversionOK) {
4320 if (CheckUnicodeWhitespace(Result, CodePoint, CurPtr)) {
4321 if (SkipWhitespace(Result, CurPtr, TokAtPhysicalStartOfLine))
4322 return true; // KeepWhitespaceMode
4323
4324 // We only saw whitespace, so just try again with this lexer.
4325 // (We manually eliminate the tail call to avoid recursion.)
4326 goto LexNextToken;
4327 }
4328 return LexUnicodeIdentifierStart(Result, CodePoint, CurPtr);
4329 }
4330
4331 if (isLexingRawMode() || ParsingPreprocessorDirective ||
4332 PP->isPreprocessedOutput()) {
4333 ++CurPtr;
4334 Kind = tok::unknown;
4335 break;
4336 }
4337
4338 // Non-ASCII characters tend to creep into source code unintentionally.
4339 // Instead of letting the parser complain about the unknown token,
4340 // just diagnose the invalid UTF-8, then drop the character.
4341 Diag(CurPtr, diag::err_invalid_utf8);
4342
4343 BufferPtr = CurPtr+1;
4344 // We're pretending the character didn't exist, so just try again with
4345 // this lexer.
4346 // (We manually eliminate the tail call to avoid recursion.)
4347 goto LexNextToken;
4348 }
4349 }
4350
4351 // Notify MIOpt that we read a non-whitespace/non-comment token.
4352 MIOpt.ReadToken();
4353
4354 // Update the location of token as well as BufferPtr.
4355 FormTokenWithChars(Result, CurPtr, Kind);
4356 return true;
4357
4358HandleDirective:
4359 // We parsed a # character and it's the start of a preprocessing directive.
4360
4361 FormTokenWithChars(Result, CurPtr, tok::hash);
4362 PP->HandleDirective(Result);
4363
4364 if (PP->hadModuleLoaderFatalFailure())
4365 // With a fatal failure in the module loader, we abort parsing.
4366 return true;
4367
4368 // We parsed the directive; lex a token with the new state.
4369 return false;
4370
4371LexNextToken:
4372 Result.clearFlag(Token::NeedsCleaning);
4373 goto LexStart;
4374}
4375
4376const char *Lexer::convertDependencyDirectiveToken(
4377 const dependency_directives_scan::Token &DDTok, Token &Result) {
4378 const char *TokPtr = BufferStart + DDTok.Offset;
4379 Result.startToken();
4380 Result.setLocation(getSourceLocation(TokPtr));
4381 Result.setKind(DDTok.Kind);
4382 Result.setFlag((Token::TokenFlags)DDTok.Flags);
4383 Result.setLength(DDTok.Length);
4384 BufferPtr = TokPtr + DDTok.Length;
4385 return TokPtr;
4386}
4387
4388bool Lexer::LexDependencyDirectiveToken(Token &Result) {
4389 assert(isDependencyDirectivesLexer())(static_cast <bool> (isDependencyDirectivesLexer()) ? void
(0) : __assert_fail ("isDependencyDirectivesLexer()", "clang/lib/Lex/Lexer.cpp"
, 4389, __extension__ __PRETTY_FUNCTION__))
;
4390
4391 using namespace dependency_directives_scan;
4392
4393 while (NextDepDirectiveTokenIndex == DepDirectives.front().Tokens.size()) {
4394 if (DepDirectives.front().Kind == pp_eof)
4395 return LexEndOfFile(Result, BufferEnd);
4396 if (DepDirectives.front().Kind == tokens_present_before_eof)
4397 MIOpt.ReadToken();
4398 NextDepDirectiveTokenIndex = 0;
4399 DepDirectives = DepDirectives.drop_front();
4400 }
4401
4402 const dependency_directives_scan::Token &DDTok =
4403 DepDirectives.front().Tokens[NextDepDirectiveTokenIndex++];
4404 if (NextDepDirectiveTokenIndex > 1 || DDTok.Kind != tok::hash) {
4405 // Read something other than a preprocessor directive hash.
4406 MIOpt.ReadToken();
4407 }
4408
4409 if (ParsingFilename && DDTok.is(tok::less)) {
4410 BufferPtr = BufferStart + DDTok.Offset;
4411 LexAngledStringLiteral(Result, BufferPtr + 1);
4412 if (Result.isNot(tok::header_name))
4413 return true;
4414 // Advance the index of lexed tokens.
4415 while (true) {
4416 const dependency_directives_scan::Token &NextTok =
4417 DepDirectives.front().Tokens[NextDepDirectiveTokenIndex];
4418 if (BufferStart + NextTok.Offset >= BufferPtr)
4419 break;
4420 ++NextDepDirectiveTokenIndex;
4421 }
4422 return true;
4423 }
4424
4425 const char *TokPtr = convertDependencyDirectiveToken(DDTok, Result);
4426
4427 if (Result.is(tok::hash) && Result.isAtStartOfLine()) {
4428 PP->HandleDirective(Result);
4429 return false;
4430 }
4431 if (Result.is(tok::raw_identifier)) {
4432 Result.setRawIdentifierData(TokPtr);
4433 if (!isLexingRawMode()) {
4434 IdentifierInfo *II = PP->LookUpIdentifierInfo(Result);
4435 if (II->isHandleIdentifierCase())
4436 return PP->HandleIdentifier(Result);
4437 }
4438 return true;
4439 }
4440 if (Result.isLiteral()) {
4441 Result.setLiteralData(TokPtr);
4442 return true;
4443 }
4444 if (Result.is(tok::colon) &&
4445 (LangOpts.CPlusPlus || LangOpts.DoubleSquareBracketAttributes)) {
4446 // Convert consecutive colons to 'tok::coloncolon'.
4447 if (*BufferPtr == ':') {
4448 assert(DepDirectives.front().Tokens[NextDepDirectiveTokenIndex].is((static_cast <bool> (DepDirectives.front().Tokens[NextDepDirectiveTokenIndex
].is( tok::colon)) ? void (0) : __assert_fail ("DepDirectives.front().Tokens[NextDepDirectiveTokenIndex].is( tok::colon)"
, "clang/lib/Lex/Lexer.cpp", 4449, __extension__ __PRETTY_FUNCTION__
))
4449 tok::colon))(static_cast <bool> (DepDirectives.front().Tokens[NextDepDirectiveTokenIndex
].is( tok::colon)) ? void (0) : __assert_fail ("DepDirectives.front().Tokens[NextDepDirectiveTokenIndex].is( tok::colon)"
, "clang/lib/Lex/Lexer.cpp", 4449, __extension__ __PRETTY_FUNCTION__
))
;
4450 ++NextDepDirectiveTokenIndex;
4451 Result.setKind(tok::coloncolon);
4452 }
4453 return true;
4454 }
4455 if (Result.is(tok::eod))
4456 ParsingPreprocessorDirective = false;
4457
4458 return true;
4459}
4460
4461bool Lexer::LexDependencyDirectiveTokenWhileSkipping(Token &Result) {
4462 assert(isDependencyDirectivesLexer())(static_cast <bool> (isDependencyDirectivesLexer()) ? void
(0) : __assert_fail ("isDependencyDirectivesLexer()", "clang/lib/Lex/Lexer.cpp"
, 4462, __extension__ __PRETTY_FUNCTION__))
;
4463
4464 using namespace dependency_directives_scan;
4465
4466 bool Stop = false;
4467 unsigned NestedIfs = 0;
4468 do {
4469 DepDirectives = DepDirectives.drop_front();
4470 switch (DepDirectives.front().Kind) {
4471 case pp_none:
4472 llvm_unreachable("unexpected 'pp_none'")::llvm::llvm_unreachable_internal("unexpected 'pp_none'", "clang/lib/Lex/Lexer.cpp"
, 4472)
;
4473 case pp_include:
4474 case pp___include_macros:
4475 case pp_define:
4476 case pp_undef:
4477 case pp_import:
4478 case pp_pragma_import:
4479 case pp_pragma_once:
4480 case pp_pragma_push_macro:
4481 case pp_pragma_pop_macro:
4482 case pp_pragma_include_alias:
4483 case pp_pragma_system_header:
4484 case pp_include_next:
4485 case decl_at_import:
4486 case cxx_module_decl:
4487 case cxx_import_decl:
4488 case cxx_export_module_decl:
4489 case cxx_export_import_decl:
4490 case tokens_present_before_eof:
4491 break;
4492 case pp_if:
4493 case pp_ifdef:
4494 case pp_ifndef:
4495 ++NestedIfs;
4496 break;
4497 case pp_elif:
4498 case pp_elifdef:
4499 case pp_elifndef:
4500 case pp_else:
4501 if (!NestedIfs) {
4502 Stop = true;
4503 }
4504 break;
4505 case pp_endif:
4506 if (!NestedIfs) {
4507 Stop = true;
4508 } else {
4509 --NestedIfs;
4510 }
4511 break;
4512 case pp_eof:
4513 NextDepDirectiveTokenIndex = 0;
4514 return LexEndOfFile(Result, BufferEnd);
4515 }
4516 } while (!Stop);
4517
4518 const dependency_directives_scan::Token &DDTok =
4519 DepDirectives.front().Tokens.front();
4520 assert(DDTok.is(tok::hash))(static_cast <bool> (DDTok.is(tok::hash)) ? void (0) : __assert_fail
("DDTok.is(tok::hash)", "clang/lib/Lex/Lexer.cpp", 4520, __extension__
__PRETTY_FUNCTION__))
;
4521 NextDepDirectiveTokenIndex = 1;
4522
4523 convertDependencyDirectiveToken(DDTok, Result);
4524 return false;
4525}

/build/source/clang/include/clang/Lex/Lexer.h

1//===- Lexer.h - C Language Family Lexer ------------------------*- C++ -*-===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file defines the Lexer interface.
10//
11//===----------------------------------------------------------------------===//
12
13#ifndef LLVM_CLANG_LEX_LEXER_H
14#define LLVM_CLANG_LEX_LEXER_H
15
16#include "clang/Basic/LangOptions.h"
17#include "clang/Basic/SourceLocation.h"
18#include "clang/Basic/TokenKinds.h"
19#include "clang/Lex/DependencyDirectivesScanner.h"
20#include "clang/Lex/PreprocessorLexer.h"
21#include "clang/Lex/Token.h"
22#include "llvm/ADT/SmallVector.h"
23#include "llvm/ADT/StringRef.h"
24#include <cassert>
25#include <cstdint>
26#include <optional>
27#include <string>
28
29namespace llvm {
30
31class MemoryBufferRef;
32
33} // namespace llvm
34
35namespace clang {
36
37class DiagnosticBuilder;
38class Preprocessor;
39class SourceManager;
40class LangOptions;
41
/// ConflictMarkerKind - The flavour of version-control conflict marker, if
/// any, that the lexer is currently recovering from.
enum ConflictMarkerKind {
  /// The lexer is not inside a conflict marker.
  CMK_None = 0,

  /// A normal or diff3 conflict marker: opened by at least 7 "<"s, split by
  /// at least 7 "="s or "|"s, and closed by at least 7 ">"s.
  CMK_Normal = 1,

  /// A Perforce-style conflict marker: opened by 4 ">"s, split by 4 "="s,
  /// and closed by 4 "<"s.
  CMK_Perforce = 2
};
56
/// The extent of a preamble (its size; it always starts at the beginning of
/// the buffer) plus the flag needed by
/// PreprocessorOptions::PrecompiledPreambleBytes.
/// A BOM, when present, is counted as part of the preamble.
struct PreambleBounds {
  /// Number of bytes occupied by the preamble.
  unsigned Size;

  /// True when the preamble stops exactly at the beginning of a new line.
  ///
  /// Tells the lexer whether it is positioned at the start of a line once
  /// the preamble has been skipped.
  bool PreambleEndsAtStartOfLine;

  PreambleBounds(unsigned Size, bool PreambleEndsAtStartOfLine)
      : Size{Size}, PreambleEndsAtStartOfLine{PreambleEndsAtStartOfLine} {}
};
73
74/// Lexer - This provides a simple interface that turns a text buffer into a
75/// stream of tokens. This provides no support for file reading or buffering,
76/// or buffering/seeking of tokens, only forward lexing is supported. It relies
77/// on the specified Preprocessor object to handle preprocessor directives, etc.
78class Lexer : public PreprocessorLexer {
79 friend class Preprocessor;
80
81 void anchor() override;
82
83 //===--------------------------------------------------------------------===//
84 // Constant configuration values for this lexer.
85
86 // Start of the buffer.
87 const char *BufferStart;
88
89 // End of the buffer.
90 const char *BufferEnd;
91
92 // Location for start of file.
93 SourceLocation FileLoc;
94
95 // LangOpts enabled by this language.
96 // Storing LangOptions as reference here is important from performance point
97 // of view. Lack of reference means that LangOptions copy constructor would be
98 // called by Lexer(..., const LangOptions &LangOpts,...). Given that local
99 // Lexer objects are created thousands times (in Lexer::getRawToken,
100 // Preprocessor::EnterSourceFile and other places) during single module
101 // processing in frontend it would make std::vector<std::string> copy
102 // constructors surprisingly hot.
103 const LangOptions &LangOpts;
104
105 // True if '//' line comments are enabled.
106 bool LineComment;
107
108 // True if lexer for _Pragma handling.
109 bool Is_PragmaLexer;
110
111 //===--------------------------------------------------------------------===//
112 // Context-specific lexing flags set by the preprocessor.
113 //
114
115 /// ExtendedTokenMode - The lexer can optionally keep comments and whitespace
116 /// and return them as tokens. This is used for -C and -CC modes, and
117 /// whitespace preservation can be useful for some clients that want to lex
118 /// the file in raw mode and get every character from the file.
119 ///
120 /// When this is set to 2 it returns comments and whitespace. When set to 1
121 /// it returns comments, when it is set to 0 it returns normal tokens only.
122 unsigned char ExtendedTokenMode;
123
124 //===--------------------------------------------------------------------===//
125 // Context that changes as the file is lexed.
126 // NOTE: any state that mutates when in raw mode must have save/restore code
127 // in Lexer::isNextPPTokenLParen.
128
129 // BufferPtr - Current pointer into the buffer. This is the next character
130 // to be lexed.
131 const char *BufferPtr;
132
133 // IsAtStartOfLine - True if the next lexed token should get the "start of
134 // line" flag set on it.
135 bool IsAtStartOfLine;
136
137 bool IsAtPhysicalStartOfLine;
138
139 bool HasLeadingSpace;
140
141 bool HasLeadingEmptyMacro;
142
143 /// True if this is the first time we're lexing the input file.
144 bool IsFirstTimeLexingFile;
145
146 // NewLinePtr - A pointer to new line character '\n' being lexed. For '\r\n',
147 // it also points to '\n'.
148 const char *NewLinePtr;
149
150 // CurrentConflictMarkerState - The kind of conflict marker we are handling.
151 ConflictMarkerKind CurrentConflictMarkerState;
152
153 /// Non-empty if this \p Lexer is \p isDependencyDirectivesLexer().
154 ArrayRef<dependency_directives_scan::Directive> DepDirectives;
155
156 /// If this \p Lexer is \p isDependencyDirectivesLexer(), it represents the
157 /// next token to use from the current dependency directive.
158 unsigned NextDepDirectiveTokenIndex = 0;
159
160 void InitLexer(const char *BufStart, const char *BufPtr, const char *BufEnd);
161
162public:
163 /// Lexer constructor - Create a new lexer object for the specified buffer
164 /// with the specified preprocessor managing the lexing process. This lexer
165 /// assumes that the associated file buffer and Preprocessor objects will
166 /// outlive it, so it doesn't take ownership of either of them.
167 Lexer(FileID FID, const llvm::MemoryBufferRef &InputFile, Preprocessor &PP,
168 bool IsFirstIncludeOfFile = true);
169
170 /// Lexer constructor - Create a new raw lexer object. This object is only
171 /// suitable for calls to 'LexFromRawLexer'. This lexer assumes that the
172 /// text range will outlive it, so it doesn't take ownership of it.
173 Lexer(SourceLocation FileLoc, const LangOptions &LangOpts,
174 const char *BufStart, const char *BufPtr, const char *BufEnd,
175 bool IsFirstIncludeOfFile = true);
176
177 /// Lexer constructor - Create a new raw lexer object. This object is only
178 /// suitable for calls to 'LexFromRawLexer'. This lexer assumes that the
179 /// text range will outlive it, so it doesn't take ownership of it.
180 Lexer(FileID FID, const llvm::MemoryBufferRef &FromFile,
181 const SourceManager &SM, const LangOptions &LangOpts,
182 bool IsFirstIncludeOfFile = true);
183
184 Lexer(const Lexer &) = delete;
185 Lexer &operator=(const Lexer &) = delete;
186
187 /// Create_PragmaLexer: Lexer constructor - Create a new lexer object for
188 /// _Pragma expansion. This has a variety of magic semantics that this method
189 /// sets up. It returns a new'd Lexer that must be delete'd when done.
190 static Lexer *Create_PragmaLexer(SourceLocation SpellingLoc,
191 SourceLocation ExpansionLocStart,
192 SourceLocation ExpansionLocEnd,
193 unsigned TokLen, Preprocessor &PP);
194
195 /// getFileLoc - Return the File Location for the file we are lexing out of.
196 /// The physical location encodes the location where the characters come from,
197 /// the virtual location encodes where we should *claim* the characters came
198 /// from. Currently this is only used by _Pragma handling.
199 SourceLocation getFileLoc() const { return FileLoc; }
200
201private:
202 /// Lex - Return the next token in the file. If this is the end of file, it
203 /// returns the tok::eof token. This implicitly involves the preprocessor.
204 bool Lex(Token &Result);
205
206 /// Called when the preprocessor is in 'dependency scanning lexing mode'.
207 bool LexDependencyDirectiveToken(Token &Result);
208
209 /// Called when the preprocessor is in 'dependency scanning lexing mode' and
210 /// is skipping a conditional block.
211 bool LexDependencyDirectiveTokenWhileSkipping(Token &Result);
212
213 /// True when the preprocessor is in 'dependency scanning lexing mode' and
214 /// created this \p Lexer for lexing a set of dependency directive tokens.
215 bool isDependencyDirectivesLexer() const { return !DepDirectives.empty(); }
216
217 /// Initializes \p Result with data from \p DDTok and advances \p BufferPtr to
218 /// the position just after the token.
219 /// \returns the buffer pointer at the beginning of the token.
220 const char *convertDependencyDirectiveToken(
221 const dependency_directives_scan::Token &DDTok, Token &Result);
222
223public:
224 /// isPragmaLexer - Returns true if this Lexer is being used to lex a pragma.
225 bool isPragmaLexer() const { return Is_PragmaLexer; }
226
227private:
228 /// IndirectLex - An indirect call to 'Lex' that can be invoked via
229 /// the PreprocessorLexer interface.
230 void IndirectLex(Token &Result) override { Lex(Result); }
231
232public:
233 /// LexFromRawLexer - Lex a token from a designated raw lexer (one with no
234 /// associated preprocessor object). Return true if the 'next character to
235 /// read' pointer points at the end of the lexer buffer, false otherwise.
236 bool LexFromRawLexer(Token &Result) {
237 assert(LexingRawMode && "Not already in raw mode!")(static_cast <bool> (LexingRawMode && "Not already in raw mode!"
) ? void (0) : __assert_fail ("LexingRawMode && \"Not already in raw mode!\""
, "clang/include/clang/Lex/Lexer.h", 237, __extension__ __PRETTY_FUNCTION__
))
;
238 Lex(Result);
239 // Note that lexing to the end of the buffer doesn't implicitly delete the
240 // lexer when in raw mode.
241 return BufferPtr == BufferEnd;
242 }
243
244 /// isKeepWhitespaceMode - Return true if the lexer should return tokens for
245 /// every character in the file, including whitespace and comments. This
246 /// should only be used in raw mode, as the preprocessor is not prepared to
247 /// deal with the excess tokens.
248 bool isKeepWhitespaceMode() const {
249 return ExtendedTokenMode > 1;
250 }
251
252 /// SetKeepWhitespaceMode - This method lets clients enable or disable
253 /// whitespace retention mode. Enabling it is asserted to happen only in raw mode or under LangOpts.TraditionalCPP.
254 void SetKeepWhitespaceMode(bool Val) {
255 assert((!Val || LexingRawMode || LangOpts.TraditionalCPP) &&(static_cast <bool> ((!Val || LexingRawMode || LangOpts
.TraditionalCPP) && "Can only retain whitespace in raw mode or -traditional-cpp"
) ? void (0) : __assert_fail ("(!Val || LexingRawMode || LangOpts.TraditionalCPP) && \"Can only retain whitespace in raw mode or -traditional-cpp\""
, "clang/include/clang/Lex/Lexer.h", 256, __extension__ __PRETTY_FUNCTION__
))
256 "Can only retain whitespace in raw mode or -traditional-cpp")(static_cast <bool> ((!Val || LexingRawMode || LangOpts
.TraditionalCPP) && "Can only retain whitespace in raw mode or -traditional-cpp"
) ? void (0) : __assert_fail ("(!Val || LexingRawMode || LangOpts.TraditionalCPP) && \"Can only retain whitespace in raw mode or -traditional-cpp\""
, "clang/include/clang/Lex/Lexer.h", 256, __extension__ __PRETTY_FUNCTION__
))
;
257 ExtendedTokenMode = Val ? 2 : 0; // 2 satisfies isKeepWhitespaceMode(); 0 also makes inKeepCommentMode() false
258 }
259
260 /// inKeepCommentMode - Return true if the lexer should return comments as
261 /// tokens.
262 bool inKeepCommentMode() const {
263 return ExtendedTokenMode > 0;
264 }
265
266 /// SetCommentRetentionState - Change the comment retention mode of the lexer
267 /// to the specified mode. This is really only useful when lexing in raw
268 /// mode, because otherwise the lexer needs to manage this.
269 void SetCommentRetentionState(bool Mode) {
270 assert(!isKeepWhitespaceMode() &&(static_cast <bool> (!isKeepWhitespaceMode() &&
"Can't play with comment retention state when retaining whitespace"
) ? void (0) : __assert_fail ("!isKeepWhitespaceMode() && \"Can't play with comment retention state when retaining whitespace\""
, "clang/include/clang/Lex/Lexer.h", 271, __extension__ __PRETTY_FUNCTION__
))
271 "Can't play with comment retention state when retaining whitespace")(static_cast <bool> (!isKeepWhitespaceMode() &&
"Can't play with comment retention state when retaining whitespace"
) ? void (0) : __assert_fail ("!isKeepWhitespaceMode() && \"Can't play with comment retention state when retaining whitespace\""
, "clang/include/clang/Lex/Lexer.h", 271, __extension__ __PRETTY_FUNCTION__
))
;
272 ExtendedTokenMode = Mode ? 1 : 0; // 1 makes inKeepCommentMode() true while isKeepWhitespaceMode() stays false
273 }
274
275 /// Sets the extended token mode back to its initial value, according to the
276 /// language options and preprocessor. This controls whether the lexer
277 /// produces comment and whitespace tokens.
278 ///
279 /// This requires the lexer to have an associated preprocessor. A standalone
280 /// lexer has nothing to reset to.
281 void resetExtendedTokenMode();
282
283 /// Gets source code buffer.
284 StringRef getBuffer() const {
285 return StringRef(BufferStart, BufferEnd - BufferStart);
286 }
287
288 /// ReadToEndOfLine - Read the rest of the current preprocessor line as an
289 /// uninterpreted string. This switches the lexer out of directive mode.
290 void ReadToEndOfLine(SmallVectorImpl<char> *Result = nullptr);
291
292
293 /// Diag - Forwarding function for diagnostics. This translates a source
294 /// position in the current buffer into a SourceLocation object for rendering.
295 DiagnosticBuilder Diag(const char *Loc, unsigned DiagID) const;
296
297 /// getSourceLocation - Return a source location identifier for the specified
298 /// offset in the current file.
299 SourceLocation getSourceLocation(const char *Loc, unsigned TokLen = 1) const;
300
301 /// getSourceLocation - Return a source location for the next character in
302 /// the current file.
303 SourceLocation getSourceLocation() override {
304 return getSourceLocation(BufferPtr);
305 }
306
307 /// Return the current location in the buffer.
308 const char *getBufferLocation() const { return BufferPtr; }
309
310 /// Returns the current lexing offset: the distance from BufferStart to BufferPtr.
311 unsigned getCurrentBufferOffset() {
312 assert(BufferPtr >= BufferStart && "Invalid buffer state")(static_cast <bool> (BufferPtr >= BufferStart &&
"Invalid buffer state") ? void (0) : __assert_fail ("BufferPtr >= BufferStart && \"Invalid buffer state\""
, "clang/include/clang/Lex/Lexer.h", 312, __extension__ __PRETTY_FUNCTION__
))
;
313 return BufferPtr - BufferStart; // pointer difference, converted to unsigned by the return type
314 }
315
316 /// Set the lexer's buffer pointer to \p Offset.
317 void seek(unsigned Offset, bool IsAtStartOfLine);
318
319 /// Stringify - Convert the specified string into a C string by i) escaping
320 /// '\\' and " characters and ii) replacing newline character(s) with "\\n".
321 /// If Charify is true, this escapes the ' character instead of ".
322 static std::string Stringify(StringRef Str, bool Charify = false);
323
324 /// Stringify - Convert the specified string into a C string by i) escaping
325 /// '\\' and " characters and ii) replacing newline character(s) with "\\n".
326 static void Stringify(SmallVectorImpl<char> &Str);
327
328 /// getSpelling - This method is used to get the spelling of a token into a
329 /// preallocated buffer, instead of as an std::string. The caller is required
330 /// to allocate enough space for the token, which is guaranteed to be at least
331 /// Tok.getLength() bytes long. The length of the actual result is returned.
332 ///
333 /// Note that this method may do two possible things: it may either fill in
334 /// the buffer specified with characters, or it may *change the input pointer*
335 /// to point to a constant buffer with the data already in it (avoiding a
336 /// copy). The caller is not allowed to modify the returned buffer pointer
337 /// if an internal buffer is returned.
338 static unsigned getSpelling(const Token &Tok, const char *&Buffer,
339 const SourceManager &SourceMgr,
340 const LangOptions &LangOpts,
341 bool *Invalid = nullptr);
342
343 /// getSpelling() - Return the 'spelling' of the Tok token. The spelling of a
344 /// token is the characters used to represent the token in the source file
345 /// after trigraph expansion and escaped-newline folding. In particular, this
346 /// wants to get the true, uncanonicalized, spelling of things like digraphs,
347 /// UCNs, etc.
348 static std::string getSpelling(const Token &Tok,
349 const SourceManager &SourceMgr,
350 const LangOptions &LangOpts,
351 bool *Invalid = nullptr);
352
353 /// getSpelling - This method is used to get the spelling of the
354 /// token at the given source location. If, as is usually true, it
355 /// is not necessary to copy any data, then the returned string may
356 /// not point into the provided buffer.
357 ///
358 /// This method lexes at the expansion depth of the given
359 /// location and does not jump to the expansion or spelling
360 /// location.
361 static StringRef getSpelling(SourceLocation loc,
362 SmallVectorImpl<char> &buffer,
363 const SourceManager &SM,
364 const LangOptions &options,
365 bool *invalid = nullptr);
366
367 /// MeasureTokenLength - Relex the token at the specified location and return
368 /// its length in bytes in the input file. If the token needs cleaning (e.g.
369 /// includes a trigraph or an escaped newline) then this count includes bytes
370 /// that are part of that.
371 static unsigned MeasureTokenLength(SourceLocation Loc,
372 const SourceManager &SM,
373 const LangOptions &LangOpts);
374
375 /// Relex the token at the specified location.
376 /// \returns true if there was a failure, false on success.
377 static bool getRawToken(SourceLocation Loc, Token &Result,
378 const SourceManager &SM,
379 const LangOptions &LangOpts,
380 bool IgnoreWhiteSpace = false);
381
382 /// Given a location anywhere in a source buffer, find the location
383 /// that corresponds to the beginning of the token in which the original
384 /// source location lands.
385 static SourceLocation GetBeginningOfToken(SourceLocation Loc,
386 const SourceManager &SM,
387 const LangOptions &LangOpts);
388
389 /// Get the physical length (including trigraphs and escaped newlines) of the
390 /// first \p CharNo characters of the token starting at TokStart.
391 static unsigned getTokenPrefixLength(SourceLocation TokStart,
392 unsigned CharNo,
393 const SourceManager &SM,
394 const LangOptions &LangOpts);
395
396 /// AdvanceToTokenCharacter - If the current SourceLocation specifies a
397 /// location at the start of a token, return a new location that specifies a
398 /// character within the token. This handles trigraphs and escaped newlines.
399 static SourceLocation AdvanceToTokenCharacter(SourceLocation TokStart,
400 unsigned Characters,
401 const SourceManager &SM,
402 const LangOptions &LangOpts) {
403 return TokStart.getLocWithOffset(
404 getTokenPrefixLength(TokStart, Characters, SM, LangOpts));
405 }
406
407 /// Computes the source location just past the end of the
408 /// token at this source location.
409 ///
410 /// This routine can be used to produce a source location that
411 /// points just past the end of the token referenced by \p Loc, and
412 /// is generally used when a diagnostic needs to point just after a
413 /// token where it expected something different than it received. If
414 /// the returned source location would not be meaningful (e.g., if
415 /// it points into a macro), this routine returns an invalid
416 /// source location.
417 ///
418 /// \param Offset an offset from the end of the token, where the source
419 /// location should refer to. The default offset (0) produces a source
420 /// location pointing just past the end of the token; an offset of 1 produces
421 /// a source location pointing to the last character in the token, etc.
422 static SourceLocation getLocForEndOfToken(SourceLocation Loc, unsigned Offset,
423 const SourceManager &SM,
424 const LangOptions &LangOpts);
425
426 /// Given a token range, produce a corresponding CharSourceRange that
427 /// is not a token range. This allows the source range to be used by
428 /// components that don't have access to the lexer and thus can't find the
429 /// end of the range for themselves.
430 static CharSourceRange getAsCharRange(SourceRange Range,
431 const SourceManager &SM,
432 const LangOptions &LangOpts) {
433 SourceLocation End = getLocForEndOfToken(Range.getEnd(), 0, SM, LangOpts);
434 return End.isInvalid() ? CharSourceRange()
435 : CharSourceRange::getCharRange(
436 Range.getBegin(), End);
437 }
438 static CharSourceRange getAsCharRange(CharSourceRange Range,
439 const SourceManager &SM,
440 const LangOptions &LangOpts) {
441 return Range.isTokenRange()
442 ? getAsCharRange(Range.getAsRange(), SM, LangOpts)
443 : Range;
444 }
445
446 /// Returns true if the given MacroID location points at the first
447 /// token of the macro expansion.
448 ///
449 /// \param MacroBegin If non-null and function returns true, it is set to
450 /// begin location of the macro.
451 static bool isAtStartOfMacroExpansion(SourceLocation loc,
452 const SourceManager &SM,
453 const LangOptions &LangOpts,
454 SourceLocation *MacroBegin = nullptr);
455
456 /// Returns true if the given MacroID location points at the last
457 /// token of the macro expansion.
458 ///
459 /// \param MacroEnd If non-null and function returns true, it is set to
460 /// end location of the macro.
461 static bool isAtEndOfMacroExpansion(SourceLocation loc,
462 const SourceManager &SM,
463 const LangOptions &LangOpts,
464 SourceLocation *MacroEnd = nullptr);
465
466 /// Accepts a range and returns a character range with file locations.
467 ///
468 /// Returns a null range if a part of the range resides inside a macro
469 /// expansion or the range does not reside on the same FileID.
470 ///
471 /// This function is trying to deal with macros and return a range based on
472 /// file locations. The cases where it can successfully handle macros are:
473 ///
474 /// -begin or end range lies at the start or end of a macro expansion, in
475 /// which case the location will be set to the expansion point, e.g:
476 /// \#define M 1 2
477 /// a M
478 /// If you have a range [a, 2] (where 2 came from the macro), the function
479 /// will return a range for "a M"
480 /// if you have range [a, 1], the function will fail because the range
481 /// overlaps with only a part of the macro
482 ///
483 /// -The macro is a function macro and the range can be mapped to the macro
484 /// arguments, e.g:
485 /// \#define M 1 2
486 /// \#define FM(x) x
487 /// FM(a b M)
488 /// if you have range [b, 2], the function will return the file range "b M"
489 /// inside the macro arguments.
490 /// if you have range [a, 2], the function will return the file range
491 /// "FM(a b M)" since the range includes all of the macro expansion.
492 static CharSourceRange makeFileCharRange(CharSourceRange Range,
493 const SourceManager &SM,
494 const LangOptions &LangOpts);
495
496 /// Returns a string for the source that the range encompasses.
497 static StringRef getSourceText(CharSourceRange Range,
498 const SourceManager &SM,
499 const LangOptions &LangOpts,
500 bool *Invalid = nullptr);
501
502 /// Retrieve the name of the immediate macro expansion.
503 ///
504 /// This routine starts from a source location, and finds the name of the macro
505 /// responsible for its immediate expansion. It looks through any intervening
506 /// macro argument expansions to compute this. It returns a StringRef which
507 /// refers to the SourceManager-owned buffer of the source where that macro
508 /// name is spelled. Thus, the result shouldn't out-live that SourceManager.
509 static StringRef getImmediateMacroName(SourceLocation Loc,
510 const SourceManager &SM,
511 const LangOptions &LangOpts);
512
513 /// Retrieve the name of the immediate macro expansion.
514 ///
515 /// This routine starts from a source location, and finds the name of the
516 /// macro responsible for its immediate expansion. It looks through any
517 /// intervening macro argument expansions to compute this. It returns a
518 /// StringRef which refers to the SourceManager-owned buffer of the source
519 /// where that macro name is spelled. Thus, the result shouldn't out-live
520 /// that SourceManager.
521 ///
522 /// This differs from Lexer::getImmediateMacroName in that any macro argument
523 /// location will result in the topmost function macro that accepted it.
524 /// e.g.
525 /// \code
526 /// MAC1( MAC2(foo) )
527 /// \endcode
528 /// for location of 'foo' token, this function will return "MAC1" while
529 /// Lexer::getImmediateMacroName will return "MAC2".
530 static StringRef getImmediateMacroNameForDiagnostics(
531 SourceLocation Loc, const SourceManager &SM, const LangOptions &LangOpts);
532
533 /// Compute the preamble of the given file.
534 ///
535 /// The preamble of a file contains the initial comments, include directives,
536 /// and other preprocessor directives that occur before the code in this
537 /// particular file actually begins. The preamble of the main source file is
538 /// a potential prefix header.
539 ///
540 /// \param Buffer The memory buffer containing the file's contents.
541 ///
542 /// \param MaxLines If non-zero, restrict the length of the preamble
543 /// to fewer than this number of lines.
544 ///
545 /// \returns The offset into the file where the preamble ends and the rest
546 /// of the file begins along with a boolean value indicating whether
547 /// the preamble ends at the beginning of a new line.
548 static PreambleBounds ComputePreamble(StringRef Buffer,
549 const LangOptions &LangOpts,
550 unsigned MaxLines = 0);
551
552 /// Finds the token that comes right after the given location.
553 ///
554 /// Returns the next token, or std::nullopt if the location is inside a macro.
555 static std::optional<Token> findNextToken(SourceLocation Loc,
556 const SourceManager &SM,
557 const LangOptions &LangOpts);
558
559 /// Checks that the given token is the first token that occurs after
560 /// the given location (this excludes comments and whitespace). Returns the
561 /// location immediately after the specified token. If the token is not found
562 /// or the location is inside a macro, the returned source location will be
563 /// invalid.
564 static SourceLocation findLocationAfterToken(SourceLocation loc,
565 tok::TokenKind TKind,
566 const SourceManager &SM,
567 const LangOptions &LangOpts,
568 bool SkipTrailingWhitespaceAndNewLine);
569
570 /// Returns true if the given character could appear in an identifier.
571 static bool isAsciiIdentifierContinueChar(char c,
572 const LangOptions &LangOpts);
573
574 /// Checks whether new line pointed by Str is preceded by escape
575 /// sequence.
576 static bool isNewLineEscaped(const char *BufferStart, const char *Str);
577
578 /// getCharAndSizeNoWarn - Like the getCharAndSize method, but does not ever
579 /// emit a warning.
580 static inline char getCharAndSizeNoWarn(const char *Ptr, unsigned &Size,
581 const LangOptions &LangOpts) {
582 // If this is not a trigraph and not a UCN or escaped newline, return
583 // quickly.
584 if (isObviouslySimpleCharacter(Ptr[0])) {
585 Size = 1;
586 return *Ptr;
587 }
588
589 Size = 0;
590 return getCharAndSizeSlowNoWarn(Ptr, Size, LangOpts);
591 }
592
593 /// Returns the leading whitespace for line that corresponds to the given
594 /// location \p Loc.
595 static StringRef getIndentationForLine(SourceLocation Loc,
596 const SourceManager &SM);
597
598 /// Check if this is the first time we're lexing the input file.
599 bool isFirstTimeLexingFile() const { return IsFirstTimeLexingFile; }
600
601private:
602 //===--------------------------------------------------------------------===//
603 // Internal implementation interfaces.
604
605 /// LexTokenInternal - Internal interface to lex a preprocessing token. Called
606 /// by Lex.
607 ///
608 bool LexTokenInternal(Token &Result, bool TokAtPhysicalStartOfLine);
609
610 bool CheckUnicodeWhitespace(Token &Result, uint32_t C, const char *CurPtr);
611
612 bool LexUnicodeIdentifierStart(Token &Result, uint32_t C, const char *CurPtr);
613
614 /// FormTokenWithChars - When we lex a token, we have identified a span
615 /// starting at BufferPtr, going to TokEnd that forms the token. This method
616 /// takes that range and assigns it to the token as its location and size. In
617 /// addition, since tokens cannot overlap, this also updates BufferPtr to be
618 /// TokEnd.
619 void FormTokenWithChars(Token &Result, const char *TokEnd,
620 tok::TokenKind Kind) {
621 unsigned TokLen = TokEnd-BufferPtr;
622 Result.setLength(TokLen);
623 Result.setLocation(getSourceLocation(BufferPtr, TokLen));
624 Result.setKind(Kind);
625 BufferPtr = TokEnd;
626 }
627
628 /// isNextPPTokenLParen - Return 1 if the next unexpanded token will return a
629 /// tok::l_paren token, 0 if it is something else and 2 if there are no more
630 /// tokens in the buffer controlled by this lexer.
631 unsigned isNextPPTokenLParen();
632
633 //===--------------------------------------------------------------------===//
634 // Lexer character reading interfaces.
635
636 // This lexer is built on two interfaces for reading characters, both of which
637 // automatically provide phase 1/2 translation. getAndAdvanceChar is used
638 // when we know that we will be reading a character from the input buffer and
639 // that this character will be part of the result token. This occurs in (f.e.)
640 // string processing, because we know we need to read until we find the
641 // closing '"' character.
642 //
643 // The second interface is the combination of getCharAndSize with
644 // ConsumeChar. getCharAndSize reads a phase 1/2 translated character,
645 // returning it and its size. If the lexer decides that this character is
646 // part of the current token, it calls ConsumeChar on it. This two stage
647 // approach allows us to emit diagnostics for characters (e.g. warnings about
648 // trigraphs), knowing that they only are emitted if the character is
649 // consumed.
650
/// isObviouslySimpleCharacter - Conservative check for characters that are
/// certainly identical in translation phase 1 and translation phase 3. It
/// may answer false for a character that turns out to be unchanged, but it
/// never answers true for one that needs to be mapped.
static bool isObviouslySimpleCharacter(char C) {
  // Only '?' and '\\' are treated as possibly needing the slow path.
  const bool MayNeedMapping = (C == '?' || C == '\\');
  return !MayNeedMapping;
}
658
659 /// getAndAdvanceChar - Read a single 'character' from the specified buffer,
660 /// advance over it, and return it. This is tricky in several cases. Here we
661 /// just handle the trivial case and fall-back to the non-inlined
662 /// getCharAndSizeSlow method to handle the hard case.
663 inline char getAndAdvanceChar(const char *&Ptr, Token &Tok) {
664 // If this is not a trigraph and not a UCN or escaped newline, return
665 // quickly.
666 if (isObviouslySimpleCharacter(Ptr[0])) return *Ptr++;
667
668 unsigned Size = 0;
669 char C = getCharAndSizeSlow(Ptr, Size, &Tok);
670 Ptr += Size;
671 return C;
672 }
673
674 /// ConsumeChar - When a character (identified by getCharAndSize) is consumed
675 /// and added to a given token, check to see if there are diagnostics that
676 /// need to be emitted or flags that need to be set on the token. If so, do
677 /// it.
678 const char *ConsumeChar(const char *Ptr, unsigned Size, Token &Tok) {
679 // Normal case, we consumed exactly one token. Just return it.
680 if (Size == 1)
681 return Ptr+Size;
682
683 // Otherwise, re-lex the character with a current token, allowing
684 // diagnostics to be emitted and flags to be set.
685 Size = 0;
686 getCharAndSizeSlow(Ptr, Size, &Tok);
687 return Ptr+Size;
688 }
689
690 /// getCharAndSize - Peek a single 'character' from the specified buffer,
691 /// get its size, and return it. This is tricky in several cases. Here we
692 /// just handle the trivial case and fall-back to the non-inlined
693 /// getCharAndSizeSlow method to handle the hard case. Ptr is taken by value, so the caller's pointer is not advanced.
694 inline char getCharAndSize(const char *Ptr, unsigned &Size) {
695 // If this is not a trigraph and not a UCN or escaped newline, return
696 // quickly.
697 if (isObviouslySimpleCharacter(Ptr[0])) {
14
Taking false branch
698 Size = 1; // fast path: the character is reported as one byte wide
699 return *Ptr;
700 }
701
702 Size = 0; // the slow path is handed a Size that starts from zero
703 return getCharAndSizeSlow(Ptr, Size); // no Token is passed, so the Tok parameter takes its nullptr default
15
Value assigned to field 'PP'
704 }
705
706 /// getCharAndSizeSlow - Handle the slow/uncommon case of the getCharAndSize
707 /// method.
708 char getCharAndSizeSlow(const char *Ptr, unsigned &Size,
709 Token *Tok = nullptr);
710
711 /// getEscapedNewLineSize - Return the size of the specified escaped newline,
712 /// or 0 if it is not an escaped newline. P[-1] is known to be a "\" on entry
713 /// to this function.
714 static unsigned getEscapedNewLineSize(const char *P);
715
716 /// SkipEscapedNewLines - If P points to an escaped newline (or a series of
717 /// them), skip over them and return the first non-escaped-newline found,
718 /// otherwise return P.
719 static const char *SkipEscapedNewLines(const char *P);
720
721 /// getCharAndSizeSlowNoWarn - Same as getCharAndSizeSlow, but never emits a
722 /// diagnostic.
723 static char getCharAndSizeSlowNoWarn(const char *Ptr, unsigned &Size,
724 const LangOptions &LangOpts);
725
726 //===--------------------------------------------------------------------===//
727 // Other lexer functions.
728
729 void SetByteOffset(unsigned Offset, bool StartOfLine);
730
731 void PropagateLineStartLeadingSpaceInfo(Token &Result);
732
733 const char *LexUDSuffix(Token &Result, const char *CurPtr,
734 bool IsStringLiteral);
735
736 // Helper functions to lex the remainder of a token of the specific type.
737
738 // This function handles both ASCII and Unicode identifiers after
739 // the first codepoint of the identifier has been parsed.
740 bool LexIdentifierContinue(Token &Result, const char *CurPtr);
741
742 bool LexNumericConstant (Token &Result, const char *CurPtr);
743 bool LexStringLiteral (Token &Result, const char *CurPtr,
744 tok::TokenKind Kind);
745 bool LexRawStringLiteral (Token &Result, const char *CurPtr,
746 tok::TokenKind Kind);
747 bool LexAngledStringLiteral(Token &Result, const char *CurPtr);
748 bool LexCharConstant (Token &Result, const char *CurPtr,
749 tok::TokenKind Kind);
750 bool LexEndOfFile (Token &Result, const char *CurPtr);
751 bool SkipWhitespace (Token &Result, const char *CurPtr,
752 bool &TokAtPhysicalStartOfLine);
753 bool SkipLineComment (Token &Result, const char *CurPtr,
754 bool &TokAtPhysicalStartOfLine);
755 bool SkipBlockComment (Token &Result, const char *CurPtr,
756 bool &TokAtPhysicalStartOfLine);
757 bool SaveLineComment (Token &Result, const char *CurPtr);
758
759 bool IsStartOfConflictMarker(const char *CurPtr);
760 bool HandleEndOfConflictMarker(const char *CurPtr);
761
762 bool lexEditorPlaceholder(Token &Result, const char *CurPtr);
763
764 bool isCodeCompletionPoint(const char *CurPtr) const;
765 void cutOffLexing() { BufferPtr = BufferEnd; }
766
767 bool isHexaLiteral(const char *Start, const LangOptions &LangOpts);
768
769 void codeCompleteIncludedFile(const char *PathStart,
770 const char *CompletionPoint, bool IsAngled);
771
772 std::optional<uint32_t>
773 tryReadNumericUCN(const char *&StartPtr, const char *SlashLoc, Token *Result);
774 std::optional<uint32_t> tryReadNamedUCN(const char *&StartPtr,
775 const char *SlashLoc, Token *Result);
776
777 /// Read a universal character name.
778 ///
779 /// \param StartPtr The position in the source buffer after the initial '\'.
780 /// If the UCN is syntactically well-formed (but not
781 /// necessarily valid), this parameter will be updated to
782 /// point to the character after the UCN.
783 /// \param SlashLoc The position in the source buffer of the '\'.
784 /// \param Result The token being formed. Pass \c nullptr to suppress
785 /// diagnostics and handle token formation in the caller.
786 ///
787 /// \return The Unicode codepoint specified by the UCN, or 0 if the UCN is
788 /// invalid.
789 uint32_t tryReadUCN(const char *&StartPtr, const char *SlashLoc, Token *Result);
790
791 /// Try to consume a UCN as part of an identifier at the current
792 /// location.
793 /// \param CurPtr Initially points to the range of characters in the source
794 /// buffer containing the '\'. Updated to point past the end of
795 /// the UCN on success.
796 /// \param Size The number of characters occupied by the '\' (including
797 /// trigraphs and escaped newlines).
798 /// \param Result The token being produced. Marked as containing a UCN on
799 /// success.
800 /// \return \c true if a UCN was lexed and it produced an acceptable
801 /// identifier character, \c false otherwise.
802 bool tryConsumeIdentifierUCN(const char *&CurPtr, unsigned Size,
803 Token &Result);
804
805 /// Try to consume an identifier character encoded in UTF-8.
806 /// \param CurPtr Points to the start of the (potential) UTF-8 code unit
807 /// sequence. On success, updated to point past the end of it.
808 /// \return \c true if a UTF-8 sequence mapping to an acceptable identifier
809 /// character was lexed, \c false otherwise.
810 bool tryConsumeIdentifierUTF8Char(const char *&CurPtr);
811};
812
813} // namespace clang
814
815#endif // LLVM_CLANG_LEX_LEXER_H