/build/llvm-toolchain-snapshot-9~svn362543/tools/clang/lib/Lex/Lexer.cpp

Bug Summary

File:	tools/clang/lib/Lex/Lexer.cpp
Warning:	line 690, column 7: Value stored to 'InPreprocessorDirective' is never read

Annotated Source Code

Press '?' to see keyboard shortcuts

Show analyzer invocation

clang -cc1 -triple x86_64-pc-linux-gnu -analyze -disable-free -disable-llvm-verifier -discard-value-names -main-file-name Lexer.cpp -analyzer-store=region -analyzer-opt-analyze-nested-blocks -analyzer-checker=core -analyzer-checker=apiModeling -analyzer-checker=unix -analyzer-checker=deadcode -analyzer-checker=cplusplus -analyzer-checker=security.insecureAPI.UncheckedReturn -analyzer-checker=security.insecureAPI.getpw -analyzer-checker=security.insecureAPI.gets -analyzer-checker=security.insecureAPI.mktemp -analyzer-checker=security.insecureAPI.mkstemp -analyzer-checker=security.insecureAPI.vfork -analyzer-checker=nullability.NullPassedToNonnull -analyzer-checker=nullability.NullReturnedFromNonnull -analyzer-output plist -w -analyzer-config-compatibility-mode=true -mrelocation-model pic -pic-level 2 -mthread-model posix -relaxed-aliasing -fmath-errno -masm-verbose -mconstructor-aliases -munwind-tables -fuse-init-array -target-cpu x86-64 -dwarf-column-info -debugger-tuning=gdb -momit-leaf-frame-pointer -ffunction-sections -fdata-sections -resource-dir /usr/lib/llvm-9/lib/clang/9.0.0 -D CLANG_VENDOR="Debian " -D _DEBUG -D _GNU_SOURCE -D __STDC_CONSTANT_MACROS -D __STDC_FORMAT_MACROS -D __STDC_LIMIT_MACROS -I /build/llvm-toolchain-snapshot-9~svn362543/build-llvm/tools/clang/lib/Lex -I /build/llvm-toolchain-snapshot-9~svn362543/tools/clang/lib/Lex -I /build/llvm-toolchain-snapshot-9~svn362543/tools/clang/include -I /build/llvm-toolchain-snapshot-9~svn362543/build-llvm/tools/clang/include -I /build/llvm-toolchain-snapshot-9~svn362543/build-llvm/include -I /build/llvm-toolchain-snapshot-9~svn362543/include -U NDEBUG -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/6.3.0/../../../../include/c++/6.3.0 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/6.3.0/../../../../include/x86_64-linux-gnu/c++/6.3.0 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/6.3.0/../../../../include/x86_64-linux-gnu/c++/6.3.0 -internal-isystem 
/usr/lib/gcc/x86_64-linux-gnu/6.3.0/../../../../include/c++/6.3.0/backward -internal-isystem /usr/include/clang/9.0.0/include/ -internal-isystem /usr/local/include -internal-isystem /usr/lib/llvm-9/lib/clang/9.0.0/include -internal-externc-isystem /usr/include/x86_64-linux-gnu -internal-externc-isystem /include -internal-externc-isystem /usr/include -O2 -Wno-unused-parameter -Wwrite-strings -Wno-missing-field-initializers -Wno-long-long -Wno-maybe-uninitialized -Wno-comment -std=c++11 -fdeprecated-macro -fdebug-compilation-dir /build/llvm-toolchain-snapshot-9~svn362543/build-llvm/tools/clang/lib/Lex -fdebug-prefix-map=/build/llvm-toolchain-snapshot-9~svn362543=. -ferror-limit 19 -fmessage-length 0 -fvisibility-inlines-hidden -stack-protector 2 -fobjc-runtime=gcc -fno-common -fdiagnostics-show-option -vectorize-loops -vectorize-slp -analyzer-output=html -analyzer-config stable-report-filename=true -o /tmp/scan-build-2019-06-05-060531-1271-1 -x c++ /build/llvm-toolchain-snapshot-9~svn362543/tools/clang/lib/Lex/Lexer.cpp -faddrsig

1	//===- Lexer.cpp - C Language Family Lexer --------------------------------===//
2	//
3	// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4	// See https://llvm.org/LICENSE.txt for license information.
5	// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6	//
7	//===----------------------------------------------------------------------===//
8	//
9	// This file implements the Lexer and Token interfaces.
10	//
11	//===----------------------------------------------------------------------===//
12
13	#include "clang/Lex/Lexer.h"
14	#include "UnicodeCharSets.h"
15	#include "clang/Basic/CharInfo.h"
16	#include "clang/Basic/IdentifierTable.h"
17	#include "clang/Basic/LangOptions.h"
18	#include "clang/Basic/SourceLocation.h"
19	#include "clang/Basic/SourceManager.h"
20	#include "clang/Basic/TokenKinds.h"
21	#include "clang/Lex/LexDiagnostic.h"
22	#include "clang/Lex/LiteralSupport.h"
23	#include "clang/Lex/MultipleIncludeOpt.h"
24	#include "clang/Lex/Preprocessor.h"
25	#include "clang/Lex/PreprocessorOptions.h"
26	#include "clang/Lex/Token.h"
27	#include "clang/Basic/Diagnostic.h"
28	#include "clang/Basic/LLVM.h"
29	#include "clang/Basic/TokenKinds.h"
30	#include "llvm/ADT/None.h"
31	#include "llvm/ADT/Optional.h"
32	#include "llvm/ADT/StringExtras.h"
33	#include "llvm/ADT/StringSwitch.h"
34	#include "llvm/ADT/StringRef.h"
35	#include "llvm/Support/Compiler.h"
36	#include "llvm/Support/ConvertUTF.h"
37	#include "llvm/Support/MathExtras.h"
38	#include "llvm/Support/MemoryBuffer.h"
39	#include "llvm/Support/NativeFormatting.h"
40	#include "llvm/Support/UnicodeCharRanges.h"
41	#include <algorithm>
42	#include <cassert>
43	#include <cstddef>
44	#include <cstdint>
45	#include <cstring>
46	#include <string>
47	#include <tuple>
48	#include <utility>
49
50	using namespace clang;
51
52	//===----------------------------------------------------------------------===//
53	// Token Class Implementation
54	//===----------------------------------------------------------------------===//
55
56	/// isObjCAtKeyword - Return true if we have an ObjC keyword identifier.
57	bool Token::isObjCAtKeyword(tok::ObjCKeywordKind objcKey) const {
58	if (isAnnotation())
59	return false;
60	if (IdentifierInfo *II = getIdentifierInfo())
61	return II->getObjCKeywordID() == objcKey;
62	return false;
63	}
64
65	/// getObjCKeywordID - Return the ObjC keyword kind.
66	tok::ObjCKeywordKind Token::getObjCKeywordID() const {
67	if (isAnnotation())
68	return tok::objc_not_keyword;
69	IdentifierInfo *specId = getIdentifierInfo();
70	return specId ? specId->getObjCKeywordID() : tok::objc_not_keyword;
71	}
72
73	//===----------------------------------------------------------------------===//
74	// Lexer Class Implementation
75	//===----------------------------------------------------------------------===//
76
77	void Lexer::anchor() {}
78
79	void Lexer::InitLexer(const char *BufStart, const char *BufPtr,
80	const char *BufEnd) {
81	BufferStart = BufStart;
82	BufferPtr = BufPtr;
83	BufferEnd = BufEnd;
84
85	assert(BufEnd[0] == 0 &&
86	"We assume that the input buffer has a null character at the end"
87	" to simplify lexing!");
88
89	// Check whether we have a BOM in the beginning of the buffer. If yes - act
90	// accordingly. Right now we support only UTF-8 with and without BOM, so, just
91	// skip the UTF-8 BOM if it's present.
92	if (BufferStart == BufferPtr) {
93	// Determine the size of the BOM.
94	StringRef Buf(BufferStart, BufferEnd - BufferStart);
95	size_t BOMLength = llvm::StringSwitch<size_t>(Buf)
96	.StartsWith("\xEF\xBB\xBF", 3) // UTF-8 BOM
97	.Default(0);
98
99	// Skip the BOM.
100	BufferPtr += BOMLength;
101	}
102
103	Is_PragmaLexer = false;
104	CurrentConflictMarkerState = CMK_None;
105
106	// Start of the file is a start of line.
107	IsAtStartOfLine = true;
108	IsAtPhysicalStartOfLine = true;
109
110	HasLeadingSpace = false;
111	HasLeadingEmptyMacro = false;
112
113	// We are not after parsing a #.
114	ParsingPreprocessorDirective = false;
115
116	// We are not after parsing #include.
117	ParsingFilename = false;
118
119	// We are not in raw mode. Raw mode disables diagnostics and interpretation
120	// of tokens (e.g. identifiers, thus disabling macro expansion). It is used
121	// to quickly lex the tokens of the buffer, e.g. when handling a "#if 0" block
122	// or otherwise skipping over tokens.
123	LexingRawMode = false;
124
125	// Default to not keeping comments.
126	ExtendedTokenMode = 0;
127	}
128
129	/// Lexer constructor - Create a new lexer object for the specified buffer
130	/// with the specified preprocessor managing the lexing process. This lexer
131	/// assumes that the associated file buffer and Preprocessor objects will
132	/// outlive it, so it doesn't take ownership of either of them.
133	Lexer::Lexer(FileID FID, const llvm::MemoryBuffer *InputFile, Preprocessor &PP)
134	: PreprocessorLexer(&PP, FID),
135	FileLoc(PP.getSourceManager().getLocForStartOfFile(FID)),
136	LangOpts(PP.getLangOpts()) {
137	InitLexer(InputFile->getBufferStart(), InputFile->getBufferStart(),
138	InputFile->getBufferEnd());
139
140	resetExtendedTokenMode();
141	}
142
143	/// Lexer constructor - Create a new raw lexer object. This object is only
144	/// suitable for calls to 'LexFromRawLexer'. This lexer assumes that the text
145	/// range will outlive it, so it doesn't take ownership of it.
146	Lexer::Lexer(SourceLocation fileloc, const LangOptions &langOpts,
147	const char *BufStart, const char *BufPtr, const char *BufEnd)
148	: FileLoc(fileloc), LangOpts(langOpts) {
149	InitLexer(BufStart, BufPtr, BufEnd);
150
151	// We are in raw mode.
152	LexingRawMode = true;
153	}
154
155	/// Lexer constructor - Create a new raw lexer object. This object is only
156	/// suitable for calls to 'LexFromRawLexer'. This lexer assumes that the text
157	/// range will outlive it, so it doesn't take ownership of it.
158	Lexer::Lexer(FileID FID, const llvm::MemoryBuffer *FromFile,
159	const SourceManager &SM, const LangOptions &langOpts)
160	: Lexer(SM.getLocForStartOfFile(FID), langOpts, FromFile->getBufferStart(),
161	FromFile->getBufferStart(), FromFile->getBufferEnd()) {}
162
163	void Lexer::resetExtendedTokenMode() {
164	assert(PP && "Cannot reset token mode without a preprocessor");
165	if (LangOpts.TraditionalCPP)
166	SetKeepWhitespaceMode(true);
167	else
168	SetCommentRetentionState(PP->getCommentRetentionState());
169	}
170
171	/// Create_PragmaLexer: Lexer constructor - Create a new lexer object for
172	/// _Pragma expansion. This has a variety of magic semantics that this method
173	/// sets up. It returns a new'd Lexer that must be delete'd when done.
174	///
175	/// On entrance to this routine, TokStartLoc is a macro location which has a
176	/// spelling loc that indicates the bytes to be lexed for the token and an
177	/// expansion location that indicates where all lexed tokens should be
178	/// "expanded from".
179	///
180	/// TODO: It would really be nice to make _Pragma just be a wrapper around a
181	/// normal lexer that remaps tokens as they fly by. This would require making
182	/// Preprocessor::Lex virtual. Given that, we could just dump in a magic lexer
183	/// interface that could handle this stuff. This would pull GetMappedTokenLoc
184	/// out of the critical path of the lexer!
185	///
186	Lexer *Lexer::Create_PragmaLexer(SourceLocation SpellingLoc,
187	SourceLocation ExpansionLocStart,
188	SourceLocation ExpansionLocEnd,
189	unsigned TokLen, Preprocessor &PP) {
190	SourceManager &SM = PP.getSourceManager();
191
192	// Create the lexer as if we were going to lex the file normally.
193	FileID SpellingFID = SM.getFileID(SpellingLoc);
194	const llvm::MemoryBuffer *InputFile = SM.getBuffer(SpellingFID);
195	Lexer *L = new Lexer(SpellingFID, InputFile, PP);
196
197	// Now that the lexer is created, change the start/end locations so that we
198	// just lex the subsection of the file that we want. This is lexing from a
199	// scratch buffer.
200	const char *StrData = SM.getCharacterData(SpellingLoc);
201
202	L->BufferPtr = StrData;
203	L->BufferEnd = StrData+TokLen;
204	assert(L->BufferEnd[0] == 0 && "Buffer is not nul terminated!");
205
206	// Set the SourceLocation with the remapping information. This ensures that
207	// GetMappedTokenLoc will remap the tokens as they are lexed.
208	L->FileLoc = SM.createExpansionLoc(SM.getLocForStartOfFile(SpellingFID),
209	ExpansionLocStart,
210	ExpansionLocEnd, TokLen);
211
212	// Ensure that the lexer thinks it is inside a directive, so that end \n will
213	// return an EOD token.
214	L->ParsingPreprocessorDirective = true;
215
216	// This lexer really is for _Pragma.
217	L->Is_PragmaLexer = true;
218	return L;
219	}
220
221	template <typename T> static void StringifyImpl(T &Str, char Quote) {
222	typename T::size_type i = 0, e = Str.size();
223	while (i < e) {
224	if (Str[i] == '\\' || Str[i] == Quote) {
225	Str.insert(Str.begin() + i, '\\');
226	i += 2;
227	++e;
228	} else if (Str[i] == '\n' || Str[i] == '\r') {
229	// Replace '\r\n' and '\n\r' with '\\' followed by 'n'.
230	if ((i < e - 1) && (Str[i + 1] == '\n' || Str[i + 1] == '\r') &&
231	Str[i] != Str[i + 1]) {
232	Str[i] = '\\';
233	Str[i + 1] = 'n';
234	} else {
235	// Replace '\n' and '\r' with '\\' followed by 'n'.
236	Str[i] = '\\';
237	Str.insert(Str.begin() + i + 1, 'n');
238	++e;
239	}
240	i += 2;
241	} else
242	++i;
243	}
244	}
245
246	std::string Lexer::Stringify(StringRef Str, bool Charify) {
247	std::string Result = Str;
248	char Quote = Charify ? '\'' : '"';
249	StringifyImpl(Result, Quote);
250	return Result;
251	}
252
253	void Lexer::Stringify(SmallVectorImpl<char> &Str) { StringifyImpl(Str, '"'); }
254
255	//===----------------------------------------------------------------------===//
256	// Token Spelling
257	//===----------------------------------------------------------------------===//
258
259	/// Slow case of getSpelling. Extract the characters comprising the
260	/// spelling of this token from the provided input buffer.
261	static size_t getSpellingSlow(const Token &Tok, const char *BufPtr,
262	const LangOptions &LangOpts, char *Spelling) {
263	assert(Tok.needsCleaning() && "getSpellingSlow called on simple token");
264
265	size_t Length = 0;
266	const char *BufEnd = BufPtr + Tok.getLength();
267
268	if (tok::isStringLiteral(Tok.getKind())) {
269	// Munch the encoding-prefix and opening double-quote.
270	while (BufPtr < BufEnd) {
271	unsigned Size;
272	Spelling[Length++] = Lexer::getCharAndSizeNoWarn(BufPtr, Size, LangOpts);
273	BufPtr += Size;
274
275	if (Spelling[Length - 1] == '"')
276	break;
277	}
278
279	// Raw string literals need special handling; trigraph expansion and line
280	// splicing do not occur within their d-char-sequence nor within their
281	// r-char-sequence.
282	if (Length >= 2 &&
283	Spelling[Length - 2] == 'R' && Spelling[Length - 1] == '"') {
284	// Search backwards from the end of the token to find the matching closing
285	// quote.
286	const char *RawEnd = BufEnd;
287	do --RawEnd; while (*RawEnd != '"');
288	size_t RawLength = RawEnd - BufPtr + 1;
289
290	// Everything between the quotes is included verbatim in the spelling.
291	memcpy(Spelling + Length, BufPtr, RawLength);
292	Length += RawLength;
293	BufPtr += RawLength;
294
295	// The rest of the token is lexed normally.
296	}
297	}
298
299	while (BufPtr < BufEnd) {
300	unsigned Size;
301	Spelling[Length++] = Lexer::getCharAndSizeNoWarn(BufPtr, Size, LangOpts);
302	BufPtr += Size;
303	}
304
305	assert(Length < Tok.getLength() &&
306	"NeedsCleaning flag set on token that didn't need cleaning!");
307	return Length;
308	}
309
310	/// getSpelling() - Return the 'spelling' of this token. The spelling of a
311	/// token are the characters used to represent the token in the source file
312	/// after trigraph expansion and escaped-newline folding. In particular, this
313	/// wants to get the true, uncanonicalized, spelling of things like digraphs,
314	/// UCNs, etc.
315	StringRef Lexer::getSpelling(SourceLocation loc,
316	SmallVectorImpl<char> &buffer,
317	const SourceManager &SM,
318	const LangOptions &options,
319	bool *invalid) {
320	// Break down the source location.
321	std::pair<FileID, unsigned> locInfo = SM.getDecomposedLoc(loc);
322
323	// Try to load the file buffer.
324	bool invalidTemp = false;
325	StringRef file = SM.getBufferData(locInfo.first, &invalidTemp);
326	if (invalidTemp) {
327	if (invalid) *invalid = true;
328	return {};
329	}
330
331	const char *tokenBegin = file.data() + locInfo.second;
332
333	// Lex from the start of the given location.
334	Lexer lexer(SM.getLocForStartOfFile(locInfo.first), options,
335	file.begin(), tokenBegin, file.end());
336	Token token;
337	lexer.LexFromRawLexer(token);
338
339	unsigned length = token.getLength();
340
341	// Common case: no need for cleaning.
342	if (!token.needsCleaning())
343	return StringRef(tokenBegin, length);
344
345	// Hard case, we need to relex the characters into the string.
346	buffer.resize(length);
347	buffer.resize(getSpellingSlow(token, tokenBegin, options, buffer.data()));
348	return StringRef(buffer.data(), buffer.size());
349	}
350
351	/// getSpelling() - Return the 'spelling' of this token. The spelling of a
352	/// token are the characters used to represent the token in the source file
353	/// after trigraph expansion and escaped-newline folding. In particular, this
354	/// wants to get the true, uncanonicalized, spelling of things like digraphs,
355	/// UCNs, etc.
356	std::string Lexer::getSpelling(const Token &Tok, const SourceManager &SourceMgr,
357	const LangOptions &LangOpts, bool *Invalid) {
358	assert((int)Tok.getLength() >= 0 && "Token character range is bogus!");
359
360	bool CharDataInvalid = false;
361	const char *TokStart = SourceMgr.getCharacterData(Tok.getLocation(),
362	&CharDataInvalid);
363	if (Invalid)
364	*Invalid = CharDataInvalid;
365	if (CharDataInvalid)
366	return {};
367
368	// If this token contains nothing interesting, return it directly.
369	if (!Tok.needsCleaning())
370	return std::string(TokStart, TokStart + Tok.getLength());
371
372	std::string Result;
373	Result.resize(Tok.getLength());
374	Result.resize(getSpellingSlow(Tok, TokStart, LangOpts, &*Result.begin()));
375	return Result;
376	}
377
378	/// getSpelling - This method is used to get the spelling of a token into a
379	/// preallocated buffer, instead of as an std::string. The caller is required
380	/// to allocate enough space for the token, which is guaranteed to be at least
381	/// Tok.getLength() bytes long. The actual length of the token is returned.
382	///
383	/// Note that this method may do two possible things: it may either fill in
384	/// the buffer specified with characters, or it may change the input pointer
385	/// to point to a constant buffer with the data already in it (avoiding a
386	/// copy). The caller is not allowed to modify the returned buffer pointer
387	/// if an internal buffer is returned.
388	unsigned Lexer::getSpelling(const Token &Tok, const char *&Buffer,
389	const SourceManager &SourceMgr,
390	const LangOptions &LangOpts, bool *Invalid) {
391	assert((int)Tok.getLength() >= 0 && "Token character range is bogus!");
392
393	const char *TokStart = nullptr;
394	// NOTE: this has to be checked before testing for an IdentifierInfo.
395	if (Tok.is(tok::raw_identifier))
396	TokStart = Tok.getRawIdentifier().data();
397	else if (!Tok.hasUCN()) {
398	if (const IdentifierInfo *II = Tok.getIdentifierInfo()) {
399	// Just return the string from the identifier table, which is very quick.
400	Buffer = II->getNameStart();
401	return II->getLength();
402	}
403	}
404
405	// NOTE: this can be checked even after testing for an IdentifierInfo.
406	if (Tok.isLiteral())
407	TokStart = Tok.getLiteralData();
408
409	if (!TokStart) {
410	// Compute the start of the token in the input lexer buffer.
411	bool CharDataInvalid = false;
412	TokStart = SourceMgr.getCharacterData(Tok.getLocation(), &CharDataInvalid);
413	if (Invalid)
414	*Invalid = CharDataInvalid;
415	if (CharDataInvalid) {
416	Buffer = "";
417	return 0;
418	}
419	}
420
421	// If this token contains nothing interesting, return it directly.
422	if (!Tok.needsCleaning()) {
423	Buffer = TokStart;
424	return Tok.getLength();
425	}
426
427	// Otherwise, hard case, relex the characters into the string.
428	return getSpellingSlow(Tok, TokStart, LangOpts, const_cast<char*>(Buffer));
429	}
430
431	/// MeasureTokenLength - Relex the token at the specified location and return
432	/// its length in bytes in the input file. If the token needs cleaning (e.g.
433	/// includes a trigraph or an escaped newline) then this count includes bytes
434	/// that are part of that.
435	unsigned Lexer::MeasureTokenLength(SourceLocation Loc,
436	const SourceManager &SM,
437	const LangOptions &LangOpts) {
438	Token TheTok;
439	if (getRawToken(Loc, TheTok, SM, LangOpts))
440	return 0;
441	return TheTok.getLength();
442	}
443
444	/// Relex the token at the specified location.
445	/// \returns true if there was a failure, false on success.
446	bool Lexer::getRawToken(SourceLocation Loc, Token &Result,
447	const SourceManager &SM,
448	const LangOptions &LangOpts,
449	bool IgnoreWhiteSpace) {
450	// TODO: this could be special cased for common tokens like identifiers, ')',
451	// etc to make this faster, if it mattered. Just look at StrData[0] to handle
452	// all obviously single-char tokens. This could use
453	// Lexer::isObviouslySimpleCharacter for example to handle identifiers or
454	// something.
455
456	// If this comes from a macro expansion, we really do want the macro name, not
457	// the token this macro expanded to.
458	Loc = SM.getExpansionLoc(Loc);
459	std::pair<FileID, unsigned> LocInfo = SM.getDecomposedLoc(Loc);
460	bool Invalid = false;
461	StringRef Buffer = SM.getBufferData(LocInfo.first, &Invalid);
462	if (Invalid)
463	return true;
464
465	const char *StrData = Buffer.data()+LocInfo.second;
466
467	if (!IgnoreWhiteSpace && isWhitespace(StrData[0]))
468	return true;
469
470	// Create a lexer starting at the beginning of this token.
471	Lexer TheLexer(SM.getLocForStartOfFile(LocInfo.first), LangOpts,
472	Buffer.begin(), StrData, Buffer.end());
473	TheLexer.SetCommentRetentionState(true);
474	TheLexer.LexFromRawLexer(Result);
475	return false;
476	}
477
478	/// Returns the pointer that points to the beginning of line that contains
479	/// the given offset, or null if the offset is invalid.
480	static const char *findBeginningOfLine(StringRef Buffer, unsigned Offset) {
481	const char *BufStart = Buffer.data();
482	if (Offset >= Buffer.size())
483	return nullptr;
484
485	const char *LexStart = BufStart + Offset;
486	for (; LexStart != BufStart; --LexStart) {
487	if (isVerticalWhitespace(LexStart[0]) &&
488	!Lexer::isNewLineEscaped(BufStart, LexStart)) {
489	// LexStart should point at first character of logical line.
490	++LexStart;
491	break;
492	}
493	}
494	return LexStart;
495	}
496
497	static SourceLocation getBeginningOfFileToken(SourceLocation Loc,
498	const SourceManager &SM,
499	const LangOptions &LangOpts) {
500	assert(Loc.isFileID());
501	std::pair<FileID, unsigned> LocInfo = SM.getDecomposedLoc(Loc);
502	if (LocInfo.first.isInvalid())
503	return Loc;
504
505	bool Invalid = false;
506	StringRef Buffer = SM.getBufferData(LocInfo.first, &Invalid);
507	if (Invalid)
508	return Loc;
509
510	// Back up from the current location until we hit the beginning of a line
511	// (or the buffer). We'll relex from that point.
512	const char *StrData = Buffer.data() + LocInfo.second;
513	const char *LexStart = findBeginningOfLine(Buffer, LocInfo.second);
514	if (!LexStart || LexStart == StrData)
515	return Loc;
516
517	// Create a lexer starting at the beginning of this token.
518	SourceLocation LexerStartLoc = Loc.getLocWithOffset(-LocInfo.second);
519	Lexer TheLexer(LexerStartLoc, LangOpts, Buffer.data(), LexStart,
520	Buffer.end());
521	TheLexer.SetCommentRetentionState(true);
522
523	// Lex tokens until we find the token that contains the source location.
524	Token TheTok;
525	do {
526	TheLexer.LexFromRawLexer(TheTok);
527
528	if (TheLexer.getBufferLocation() > StrData) {
529	// Lexing this token has taken the lexer past the source location we're
530	// looking for. If the current token encompasses our source location,
531	// return the beginning of that token.
532	if (TheLexer.getBufferLocation() - TheTok.getLength() <= StrData)
533	return TheTok.getLocation();
534
535	// We ended up skipping over the source location entirely, which means
536	// that it points into whitespace. We're done here.
537	break;
538	}
539	} while (TheTok.getKind() != tok::eof);
540
541	// We've passed our source location; just return the original source location.
542	return Loc;
543	}
544
545	SourceLocation Lexer::GetBeginningOfToken(SourceLocation Loc,
546	const SourceManager &SM,
547	const LangOptions &LangOpts) {
548	if (Loc.isFileID())
549	return getBeginningOfFileToken(Loc, SM, LangOpts);
550
551	if (!SM.isMacroArgExpansion(Loc))
552	return Loc;
553
554	SourceLocation FileLoc = SM.getSpellingLoc(Loc);
555	SourceLocation BeginFileLoc = getBeginningOfFileToken(FileLoc, SM, LangOpts);
556	std::pair<FileID, unsigned> FileLocInfo = SM.getDecomposedLoc(FileLoc);
557	std::pair<FileID, unsigned> BeginFileLocInfo =
558	SM.getDecomposedLoc(BeginFileLoc);
559	assert(FileLocInfo.first == BeginFileLocInfo.first &&
560	FileLocInfo.second >= BeginFileLocInfo.second);
561	return Loc.getLocWithOffset(BeginFileLocInfo.second - FileLocInfo.second);
562	}
563
564	namespace {
565
566	enum PreambleDirectiveKind {
567	PDK_Skipped,
568	PDK_Unknown
569	};
570
571	} // namespace
572
573	PreambleBounds Lexer::ComputePreamble(StringRef Buffer,
574	const LangOptions &LangOpts,
575	unsigned MaxLines) {
576	// Create a lexer starting at the beginning of the file. Note that we use a
577	// "fake" file source location at offset 1 so that the lexer will track our
578	// position within the file.
579	const unsigned StartOffset = 1;
580	SourceLocation FileLoc = SourceLocation::getFromRawEncoding(StartOffset);
581	Lexer TheLexer(FileLoc, LangOpts, Buffer.begin(), Buffer.begin(),
582	Buffer.end());
583	TheLexer.SetCommentRetentionState(true);
584
585	bool InPreprocessorDirective = false;
586	Token TheTok;
587	SourceLocation ActiveCommentLoc;
588
589	unsigned MaxLineOffset = 0;
590	if (MaxLines) {
591	const char *CurPtr = Buffer.begin();
592	unsigned CurLine = 0;
593	while (CurPtr != Buffer.end()) {
594	char ch = *CurPtr++;
595	if (ch == '\n') {
596	++CurLine;
597	if (CurLine == MaxLines)
598	break;
599	}
600	}
601	if (CurPtr != Buffer.end())
602	MaxLineOffset = CurPtr - Buffer.begin();
603	}
604
605	do {
606	TheLexer.LexFromRawLexer(TheTok);
607
608	if (InPreprocessorDirective) {
609	// If we've hit the end of the file, we're done.
610	if (TheTok.getKind() == tok::eof) {
611	break;
612	}
613
614	// If we haven't hit the end of the preprocessor directive, skip this
615	// token.
616	if (!TheTok.isAtStartOfLine())
617	continue;
618
619	// We've passed the end of the preprocessor directive, and will look
620	// at this token again below.
621	InPreprocessorDirective = false;
622	}
623
624	// Keep track of the # of lines in the preamble.
625	if (TheTok.isAtStartOfLine()) {
626	unsigned TokOffset = TheTok.getLocation().getRawEncoding() - StartOffset;
627
628	// If we were asked to limit the number of lines in the preamble,
629	// and we're about to exceed that limit, we're done.
630	if (MaxLineOffset && TokOffset >= MaxLineOffset)
631	break;
632	}
633
634	// Comments are okay; skip over them.
635	if (TheTok.getKind() == tok::comment) {
636	if (ActiveCommentLoc.isInvalid())
637	ActiveCommentLoc = TheTok.getLocation();
638	continue;
639	}
640
641	if (TheTok.isAtStartOfLine() && TheTok.getKind() == tok::hash) {
642	// This is the start of a preprocessor directive.
643	Token HashTok = TheTok;
644	InPreprocessorDirective = true;
645	ActiveCommentLoc = SourceLocation();
646
647	// Figure out which directive this is. Since we're lexing raw tokens,
648	// we don't have an identifier table available. Instead, just look at
649	// the raw identifier to recognize and categorize preprocessor directives.
650	TheLexer.LexFromRawLexer(TheTok);
651	if (TheTok.getKind() == tok::raw_identifier && !TheTok.needsCleaning()) {
652	StringRef Keyword = TheTok.getRawIdentifier();
653	PreambleDirectiveKind PDK
654	= llvm::StringSwitch<PreambleDirectiveKind>(Keyword)
655	.Case("include", PDK_Skipped)
656	.Case("__include_macros", PDK_Skipped)
657	.Case("define", PDK_Skipped)
658	.Case("undef", PDK_Skipped)
659	.Case("line", PDK_Skipped)
660	.Case("error", PDK_Skipped)
661	.Case("pragma", PDK_Skipped)
662	.Case("import", PDK_Skipped)
663	.Case("include_next", PDK_Skipped)
664	.Case("warning", PDK_Skipped)
665	.Case("ident", PDK_Skipped)
666	.Case("sccs", PDK_Skipped)
667	.Case("assert", PDK_Skipped)
668	.Case("unassert", PDK_Skipped)
669	.Case("if", PDK_Skipped)
670	.Case("ifdef", PDK_Skipped)
671	.Case("ifndef", PDK_Skipped)
672	.Case("elif", PDK_Skipped)
673	.Case("else", PDK_Skipped)
674	.Case("endif", PDK_Skipped)
675	.Default(PDK_Unknown);
676
677	switch (PDK) {
678	case PDK_Skipped:
679	continue;
680
681	case PDK_Unknown:
682	// We don't know what this directive is; stop at the '#'.
683	break;
684	}
685	}
686
687	// We only end up here if we didn't recognize the preprocessor
688	// directive or it was one that can't occur in the preamble at this
689	// point. Roll back the current token to the location of the '#'.
690	InPreprocessorDirective = false;
	Value stored to 'InPreprocessorDirective' is never read
691	TheTok = HashTok;
692	}
693
694	// We hit a token that we don't recognize as being in the
695	// "preprocessing only" part of the file, so we're no longer in
696	// the preamble.
697	break;
698	} while (true);
699
700	SourceLocation End;
701	if (ActiveCommentLoc.isValid())
702	End = ActiveCommentLoc; // don't truncate a decl comment.
703	else
704	End = TheTok.getLocation();
705
706	return PreambleBounds(End.getRawEncoding() - FileLoc.getRawEncoding(),
707	TheTok.isAtStartOfLine());
708	}
709
710	unsigned Lexer::getTokenPrefixLength(SourceLocation TokStart, unsigned CharNo,
711	const SourceManager &SM,
712	const LangOptions &LangOpts) {
713	// Figure out how many physical characters away the specified expansion
714	// character is. This needs to take into consideration newlines and
715	// trigraphs.
716	bool Invalid = false;
717	const char *TokPtr = SM.getCharacterData(TokStart, &Invalid);
718
719	// If they request the first char of the token, we're trivially done.
720	if (Invalid \|\| (CharNo == 0 && Lexer::isObviouslySimpleCharacter(*TokPtr)))
721	return 0;
722
723	unsigned PhysOffset = 0;
724
725	// The usual case is that tokens don't contain anything interesting. Skip
726	// over the uninteresting characters. If a token only consists of simple
727	// chars, this method is extremely fast.
728	while (Lexer::isObviouslySimpleCharacter(*TokPtr)) {
729	if (CharNo == 0)
730	return PhysOffset;
731	++TokPtr;
732	--CharNo;
733	++PhysOffset;
734	}
735
736	// If we have a character that may be a trigraph or escaped newline, use a
737	// lexer to parse it correctly.
738	for (; CharNo; --CharNo) {
739	unsigned Size;
740	Lexer::getCharAndSizeNoWarn(TokPtr, Size, LangOpts);
741	TokPtr += Size;
742	PhysOffset += Size;
743	}
744
745	// Final detail: if we end up on an escaped newline, we want to return the
746	// location of the actual byte of the token. For example foo\<newline>bar
747	// advanced by 3 should return the location of b, not of \\. One compounding
748	// detail of this is that the escape may be made by a trigraph.
749	if (!Lexer::isObviouslySimpleCharacter(*TokPtr))
750	PhysOffset += Lexer::SkipEscapedNewLines(TokPtr)-TokPtr;
751
752	return PhysOffset;
753	}
754
755	/// Computes the source location just past the end of the
756	/// token at this source location.
757	///
758	/// This routine can be used to produce a source location that
759	/// points just past the end of the token referenced by \p Loc, and
760	/// is generally used when a diagnostic needs to point just after a
761	/// token where it expected something different that it received. If
762	/// the returned source location would not be meaningful (e.g., if
763	/// it points into a macro), this routine returns an invalid
764	/// source location.
765	///
766	/// \param Offset an offset from the end of the token, where the source
767	/// location should refer to. The default offset (0) produces a source
768	/// location pointing just past the end of the token; an offset of 1 produces
769	/// a source location pointing to the last character in the token, etc.
770	SourceLocation Lexer::getLocForEndOfToken(SourceLocation Loc, unsigned Offset,
771	const SourceManager &SM,
772	const LangOptions &LangOpts) {
773	if (Loc.isInvalid())
774	return {};
775
776	if (Loc.isMacroID()) {
777	if (Offset > 0 \|\| !isAtEndOfMacroExpansion(Loc, SM, LangOpts, &Loc))
778	return {}; // Points inside the macro expansion.
779	}
780
781	unsigned Len = Lexer::MeasureTokenLength(Loc, SM, LangOpts);
782	if (Len > Offset)
783	Len = Len - Offset;
784	else
785	return Loc;
786
787	return Loc.getLocWithOffset(Len);
788	}
789
790	/// Returns true if the given MacroID location points at the first
791	/// token of the macro expansion.
792	bool Lexer::isAtStartOfMacroExpansion(SourceLocation loc,
793	const SourceManager &SM,
794	const LangOptions &LangOpts,
795	SourceLocation *MacroBegin) {
796	assert(loc.isValid() && loc.isMacroID() && "Expected a valid macro loc")((loc.isValid() && loc.isMacroID() && "Expected a valid macro loc" ) ? static_cast<void> (0) : __assert_fail ("loc.isValid() && loc.isMacroID() && \"Expected a valid macro loc\"" , "/build/llvm-toolchain-snapshot-9~svn362543/tools/clang/lib/Lex/Lexer.cpp" , 796, __PRETTY_FUNCTION__));
797
798	SourceLocation expansionLoc;
799	if (!SM.isAtStartOfImmediateMacroExpansion(loc, &expansionLoc))
800	return false;
801
802	if (expansionLoc.isFileID()) {
803	// No other macro expansions, this is the first.
804	if (MacroBegin)
805	*MacroBegin = expansionLoc;
806	return true;
807	}
808
809	return isAtStartOfMacroExpansion(expansionLoc, SM, LangOpts, MacroBegin);
810	}
811
812	/// Returns true if the given MacroID location points at the last
813	/// token of the macro expansion.
814	bool Lexer::isAtEndOfMacroExpansion(SourceLocation loc,
815	const SourceManager &SM,
816	const LangOptions &LangOpts,
817	SourceLocation *MacroEnd) {
818	assert(loc.isValid() && loc.isMacroID() && "Expected a valid macro loc")((loc.isValid() && loc.isMacroID() && "Expected a valid macro loc" ) ? static_cast<void> (0) : __assert_fail ("loc.isValid() && loc.isMacroID() && \"Expected a valid macro loc\"" , "/build/llvm-toolchain-snapshot-9~svn362543/tools/clang/lib/Lex/Lexer.cpp" , 818, __PRETTY_FUNCTION__));
819
820	SourceLocation spellLoc = SM.getSpellingLoc(loc);
821	unsigned tokLen = MeasureTokenLength(spellLoc, SM, LangOpts);
822	if (tokLen == 0)
823	return false;
824
825	SourceLocation afterLoc = loc.getLocWithOffset(tokLen);
826	SourceLocation expansionLoc;
827	if (!SM.isAtEndOfImmediateMacroExpansion(afterLoc, &expansionLoc))
828	return false;
829
830	if (expansionLoc.isFileID()) {
831	// No other macro expansions.
832	if (MacroEnd)
833	*MacroEnd = expansionLoc;
834	return true;
835	}
836
837	return isAtEndOfMacroExpansion(expansionLoc, SM, LangOpts, MacroEnd);
838	}
839
840	static CharSourceRange makeRangeFromFileLocs(CharSourceRange Range,
841	const SourceManager &SM,
842	const LangOptions &LangOpts) {
843	SourceLocation Begin = Range.getBegin();
844	SourceLocation End = Range.getEnd();
845	assert(Begin.isFileID() && End.isFileID())((Begin.isFileID() && End.isFileID()) ? static_cast< void> (0) : __assert_fail ("Begin.isFileID() && End.isFileID()" , "/build/llvm-toolchain-snapshot-9~svn362543/tools/clang/lib/Lex/Lexer.cpp" , 845, __PRETTY_FUNCTION__));
846	if (Range.isTokenRange()) {
847	End = Lexer::getLocForEndOfToken(End, 0, SM,LangOpts);
848	if (End.isInvalid())
849	return {};
850	}
851
852	// Break down the source locations.
853	FileID FID;
854	unsigned BeginOffs;
855	std::tie(FID, BeginOffs) = SM.getDecomposedLoc(Begin);
856	if (FID.isInvalid())
857	return {};
858
859	unsigned EndOffs;
860	if (!SM.isInFileID(End, FID, &EndOffs) \|\|
861	BeginOffs > EndOffs)
862	return {};
863
864	return CharSourceRange::getCharRange(Begin, End);
865	}
866
867	CharSourceRange Lexer::makeFileCharRange(CharSourceRange Range,
868	const SourceManager &SM,
869	const LangOptions &LangOpts) {
870	SourceLocation Begin = Range.getBegin();
871	SourceLocation End = Range.getEnd();
872	if (Begin.isInvalid() \|\| End.isInvalid())
873	return {};
874
875	if (Begin.isFileID() && End.isFileID())
876	return makeRangeFromFileLocs(Range, SM, LangOpts);
877
878	if (Begin.isMacroID() && End.isFileID()) {
879	if (!isAtStartOfMacroExpansion(Begin, SM, LangOpts, &Begin))
880	return {};
881	Range.setBegin(Begin);
882	return makeRangeFromFileLocs(Range, SM, LangOpts);
883	}
884
885	if (Begin.isFileID() && End.isMacroID()) {
886	if ((Range.isTokenRange() && !isAtEndOfMacroExpansion(End, SM, LangOpts,
887	&End)) \|\|
888	(Range.isCharRange() && !isAtStartOfMacroExpansion(End, SM, LangOpts,
889	&End)))
890	return {};
891	Range.setEnd(End);
892	return makeRangeFromFileLocs(Range, SM, LangOpts);
893	}
894
895	assert(Begin.isMacroID() && End.isMacroID())((Begin.isMacroID() && End.isMacroID()) ? static_cast <void> (0) : __assert_fail ("Begin.isMacroID() && End.isMacroID()" , "/build/llvm-toolchain-snapshot-9~svn362543/tools/clang/lib/Lex/Lexer.cpp" , 895, __PRETTY_FUNCTION__));
896	SourceLocation MacroBegin, MacroEnd;
897	if (isAtStartOfMacroExpansion(Begin, SM, LangOpts, &MacroBegin) &&
898	((Range.isTokenRange() && isAtEndOfMacroExpansion(End, SM, LangOpts,
899	&MacroEnd)) \|\|
900	(Range.isCharRange() && isAtStartOfMacroExpansion(End, SM, LangOpts,
901	&MacroEnd)))) {
902	Range.setBegin(MacroBegin);
903	Range.setEnd(MacroEnd);
904	return makeRangeFromFileLocs(Range, SM, LangOpts);
905	}
906
907	bool Invalid = false;
908	const SrcMgr::SLocEntry &BeginEntry = SM.getSLocEntry(SM.getFileID(Begin),
909	&Invalid);
910	if (Invalid)
911	return {};
912
913	if (BeginEntry.getExpansion().isMacroArgExpansion()) {
914	const SrcMgr::SLocEntry &EndEntry = SM.getSLocEntry(SM.getFileID(End),
915	&Invalid);
916	if (Invalid)
917	return {};
918
919	if (EndEntry.getExpansion().isMacroArgExpansion() &&
920	BeginEntry.getExpansion().getExpansionLocStart() ==
921	EndEntry.getExpansion().getExpansionLocStart()) {
922	Range.setBegin(SM.getImmediateSpellingLoc(Begin));
923	Range.setEnd(SM.getImmediateSpellingLoc(End));
924	return makeFileCharRange(Range, SM, LangOpts);
925	}
926	}
927
928	return {};
929	}
930
931	StringRef Lexer::getSourceText(CharSourceRange Range,
932	const SourceManager &SM,
933	const LangOptions &LangOpts,
934	bool *Invalid) {
935	Range = makeFileCharRange(Range, SM, LangOpts);
936	if (Range.isInvalid()) {
937	if (Invalid) *Invalid = true;
938	return {};
939	}
940
941	// Break down the source location.
942	std::pair<FileID, unsigned> beginInfo = SM.getDecomposedLoc(Range.getBegin());
943	if (beginInfo.first.isInvalid()) {
944	if (Invalid) *Invalid = true;
945	return {};
946	}
947
948	unsigned EndOffs;
949	if (!SM.isInFileID(Range.getEnd(), beginInfo.first, &EndOffs) \|\|
950	beginInfo.second > EndOffs) {
951	if (Invalid) *Invalid = true;
952	return {};
953	}
954
955	// Try to the load the file buffer.
956	bool invalidTemp = false;
957	StringRef file = SM.getBufferData(beginInfo.first, &invalidTemp);
958	if (invalidTemp) {
959	if (Invalid) *Invalid = true;
960	return {};
961	}
962
963	if (Invalid) *Invalid = false;
964	return file.substr(beginInfo.second, EndOffs - beginInfo.second);
965	}
966
967	StringRef Lexer::getImmediateMacroName(SourceLocation Loc,
968	const SourceManager &SM,
969	const LangOptions &LangOpts) {
970	assert(Loc.isMacroID() && "Only reasonable to call this on macros")((Loc.isMacroID() && "Only reasonable to call this on macros" ) ? static_cast<void> (0) : __assert_fail ("Loc.isMacroID() && \"Only reasonable to call this on macros\"" , "/build/llvm-toolchain-snapshot-9~svn362543/tools/clang/lib/Lex/Lexer.cpp" , 970, __PRETTY_FUNCTION__));
971
972	// Find the location of the immediate macro expansion.
973	while (true) {
974	FileID FID = SM.getFileID(Loc);
975	const SrcMgr::SLocEntry *E = &SM.getSLocEntry(FID);
976	const SrcMgr::ExpansionInfo &Expansion = E->getExpansion();
977	Loc = Expansion.getExpansionLocStart();
978	if (!Expansion.isMacroArgExpansion())
979	break;
980
981	// For macro arguments we need to check that the argument did not come
982	// from an inner macro, e.g: "MAC1( MAC2(foo) )"
983
984	// Loc points to the argument id of the macro definition, move to the
985	// macro expansion.
986	Loc = SM.getImmediateExpansionRange(Loc).getBegin();
987	SourceLocation SpellLoc = Expansion.getSpellingLoc();
988	if (SpellLoc.isFileID())
989	break; // No inner macro.
990
991	// If spelling location resides in the same FileID as macro expansion
992	// location, it means there is no inner macro.
993	FileID MacroFID = SM.getFileID(Loc);
994	if (SM.isInFileID(SpellLoc, MacroFID))
995	break;
996
997	// Argument came from inner macro.
998	Loc = SpellLoc;
999	}
1000
1001	// Find the spelling location of the start of the non-argument expansion
1002	// range. This is where the macro name was spelled in order to begin
1003	// expanding this macro.
1004	Loc = SM.getSpellingLoc(Loc);
1005
1006	// Dig out the buffer where the macro name was spelled and the extents of the
1007	// name so that we can render it into the expansion note.
1008	std::pair<FileID, unsigned> ExpansionInfo = SM.getDecomposedLoc(Loc);
1009	unsigned MacroTokenLength = Lexer::MeasureTokenLength(Loc, SM, LangOpts);
1010	StringRef ExpansionBuffer = SM.getBufferData(ExpansionInfo.first);
1011	return ExpansionBuffer.substr(ExpansionInfo.second, MacroTokenLength);
1012	}
1013
1014	StringRef Lexer::getImmediateMacroNameForDiagnostics(
1015	SourceLocation Loc, const SourceManager &SM, const LangOptions &LangOpts) {
1016	assert(Loc.isMacroID() && "Only reasonable to call this on macros")((Loc.isMacroID() && "Only reasonable to call this on macros" ) ? static_cast<void> (0) : __assert_fail ("Loc.isMacroID() && \"Only reasonable to call this on macros\"" , "/build/llvm-toolchain-snapshot-9~svn362543/tools/clang/lib/Lex/Lexer.cpp" , 1016, __PRETTY_FUNCTION__));
1017	// Walk past macro argument expansions.
1018	while (SM.isMacroArgExpansion(Loc))
1019	Loc = SM.getImmediateExpansionRange(Loc).getBegin();
1020
1021	// If the macro's spelling has no FileID, then it's actually a token paste
1022	// or stringization (or similar) and not a macro at all.
1023	if (!SM.getFileEntryForID(SM.getFileID(SM.getSpellingLoc(Loc))))
1024	return {};
1025
1026	// Find the spelling location of the start of the non-argument expansion
1027	// range. This is where the macro name was spelled in order to begin
1028	// expanding this macro.
1029	Loc = SM.getSpellingLoc(SM.getImmediateExpansionRange(Loc).getBegin());
1030
1031	// Dig out the buffer where the macro name was spelled and the extents of the
1032	// name so that we can render it into the expansion note.
1033	std::pair<FileID, unsigned> ExpansionInfo = SM.getDecomposedLoc(Loc);
1034	unsigned MacroTokenLength = Lexer::MeasureTokenLength(Loc, SM, LangOpts);
1035	StringRef ExpansionBuffer = SM.getBufferData(ExpansionInfo.first);
1036	return ExpansionBuffer.substr(ExpansionInfo.second, MacroTokenLength);
1037	}
1038
1039	bool Lexer::isIdentifierBodyChar(char c, const LangOptions &LangOpts) {
1040	return isIdentifierBody(c, LangOpts.DollarIdents);
1041	}
1042
1043	bool Lexer::isNewLineEscaped(const char BufferStart, const char Str) {
1044	assert(isVerticalWhitespace(Str[0]))((isVerticalWhitespace(Str[0])) ? static_cast<void> (0) : __assert_fail ("isVerticalWhitespace(Str[0])", "/build/llvm-toolchain-snapshot-9~svn362543/tools/clang/lib/Lex/Lexer.cpp" , 1044, __PRETTY_FUNCTION__));
1045	if (Str - 1 < BufferStart)
1046	return false;
1047
1048	if ((Str[0] == '\n' && Str[-1] == '\r') \|\|
1049	(Str[0] == '\r' && Str[-1] == '\n')) {
1050	if (Str - 2 < BufferStart)
1051	return false;
1052	--Str;
1053	}
1054	--Str;
1055
1056	// Rewind to first non-space character:
1057	while (Str > BufferStart && isHorizontalWhitespace(*Str))
1058	--Str;
1059
1060	return *Str == '\\';
1061	}
1062
1063	StringRef Lexer::getIndentationForLine(SourceLocation Loc,
1064	const SourceManager &SM) {
1065	if (Loc.isInvalid() \|\| Loc.isMacroID())
1066	return {};
1067	std::pair<FileID, unsigned> LocInfo = SM.getDecomposedLoc(Loc);
1068	if (LocInfo.first.isInvalid())
1069	return {};
1070	bool Invalid = false;
1071	StringRef Buffer = SM.getBufferData(LocInfo.first, &Invalid);
1072	if (Invalid)
1073	return {};
1074	const char *Line = findBeginningOfLine(Buffer, LocInfo.second);
1075	if (!Line)
1076	return {};
1077	StringRef Rest = Buffer.substr(Line - Buffer.data());
1078	size_t NumWhitespaceChars = Rest.find_first_not_of(" \t");
1079	return NumWhitespaceChars == StringRef::npos
1080	? ""
1081	: Rest.take_front(NumWhitespaceChars);
1082	}
1083
1084	//===----------------------------------------------------------------------===//
1085	// Diagnostics forwarding code.
1086	//===----------------------------------------------------------------------===//
1087
1088	/// GetMappedTokenLoc - If lexing out of a 'mapped buffer', where we pretend the
1089	/// lexer buffer was all expanded at a single point, perform the mapping.
1090	/// This is currently only used for _Pragma implementation, so it is the slow
1091	/// path of the hot getSourceLocation method. Do not allow it to be inlined.
1092	static LLVM_ATTRIBUTE_NOINLINE__attribute__((noinline)) SourceLocation GetMappedTokenLoc(
1093	Preprocessor &PP, SourceLocation FileLoc, unsigned CharNo, unsigned TokLen);
1094	static SourceLocation GetMappedTokenLoc(Preprocessor &PP,
1095	SourceLocation FileLoc,
1096	unsigned CharNo, unsigned TokLen) {
1097	assert(FileLoc.isMacroID() && "Must be a macro expansion")((FileLoc.isMacroID() && "Must be a macro expansion") ? static_cast<void> (0) : __assert_fail ("FileLoc.isMacroID() && \"Must be a macro expansion\"" , "/build/llvm-toolchain-snapshot-9~svn362543/tools/clang/lib/Lex/Lexer.cpp" , 1097, __PRETTY_FUNCTION__));
1098
1099	// Otherwise, we're lexing "mapped tokens". This is used for things like
1100	// _Pragma handling. Combine the expansion location of FileLoc with the
1101	// spelling location.
1102	SourceManager &SM = PP.getSourceManager();
1103
1104	// Create a new SLoc which is expanded from Expansion(FileLoc) but whose
1105	// characters come from spelling(FileLoc)+Offset.
1106	SourceLocation SpellingLoc = SM.getSpellingLoc(FileLoc);
1107	SpellingLoc = SpellingLoc.getLocWithOffset(CharNo);
1108
1109	// Figure out the expansion loc range, which is the range covered by the
1110	// original _Pragma(...) sequence.
1111	CharSourceRange II = SM.getImmediateExpansionRange(FileLoc);
1112
1113	return SM.createExpansionLoc(SpellingLoc, II.getBegin(), II.getEnd(), TokLen);
1114	}
1115
1116	/// getSourceLocation - Return a source location identifier for the specified
1117	/// offset in the current file.
1118	SourceLocation Lexer::getSourceLocation(const char *Loc,
1119	unsigned TokLen) const {
1120	assert(Loc >= BufferStart && Loc <= BufferEnd &&((Loc >= BufferStart && Loc <= BufferEnd && "Location out of range for this buffer!") ? static_cast<void > (0) : __assert_fail ("Loc >= BufferStart && Loc <= BufferEnd && \"Location out of range for this buffer!\"" , "/build/llvm-toolchain-snapshot-9~svn362543/tools/clang/lib/Lex/Lexer.cpp" , 1121, __PRETTY_FUNCTION__))
1121	"Location out of range for this buffer!")((Loc >= BufferStart && Loc <= BufferEnd && "Location out of range for this buffer!") ? static_cast<void > (0) : __assert_fail ("Loc >= BufferStart && Loc <= BufferEnd && \"Location out of range for this buffer!\"" , "/build/llvm-toolchain-snapshot-9~svn362543/tools/clang/lib/Lex/Lexer.cpp" , 1121, __PRETTY_FUNCTION__));
1122
1123	// In the normal case, we're just lexing from a simple file buffer, return
1124	// the file id from FileLoc with the offset specified.
1125	unsigned CharNo = Loc-BufferStart;
1126	if (FileLoc.isFileID())
1127	return FileLoc.getLocWithOffset(CharNo);
1128
1129	// Otherwise, this is the _Pragma lexer case, which pretends that all of the
1130	// tokens are lexed from where the _Pragma was defined.
1131	assert(PP && "This doesn't work on raw lexers")((PP && "This doesn't work on raw lexers") ? static_cast <void> (0) : __assert_fail ("PP && \"This doesn't work on raw lexers\"" , "/build/llvm-toolchain-snapshot-9~svn362543/tools/clang/lib/Lex/Lexer.cpp" , 1131, __PRETTY_FUNCTION__));
1132	return GetMappedTokenLoc(*PP, FileLoc, CharNo, TokLen);
1133	}
1134
1135	/// Diag - Forwarding function for diagnostics. This translate a source
1136	/// position in the current buffer into a SourceLocation object for rendering.
1137	DiagnosticBuilder Lexer::Diag(const char *Loc, unsigned DiagID) const {
1138	return PP->Diag(getSourceLocation(Loc), DiagID);
1139	}
1140
1141	//===----------------------------------------------------------------------===//
1142	// Trigraph and Escaped Newline Handling Code.
1143	//===----------------------------------------------------------------------===//
1144
/// GetTrigraphCharForLetter - Given a character that occurs after a ?? pair,
/// return the decoded trigraph letter it corresponds to, or '\0' if nothing.
static char GetTrigraphCharForLetter(char Letter) {
  switch (Letter) {
  default:   return 0;
  case '=':  return '#';
  case ')':  return ']';
  case '(':  return '[';
  case '!':  return '|';
  case '\'': return '^';
  case '>':  return '}';
  case '/':  return '\\';
  case '<':  return '{';
  case '-':  return '~';
  }
}
1161
1162	/// DecodeTrigraphChar - If the specified character is a legal trigraph when
1163	/// prefixed with ??, emit a trigraph warning. If trigraphs are enabled,
1164	/// return the result character. Finally, emit a warning about trigraph use
1165	/// whether trigraphs are enabled or not.
1166	static char DecodeTrigraphChar(const char CP, Lexer L) {
1167	char Res = GetTrigraphCharForLetter(*CP);
1168	if (!Res \|\| !L) return Res;
1169
1170	if (!L->getLangOpts().Trigraphs) {
1171	if (!L->isLexingRawMode())
1172	L->Diag(CP-2, diag::trigraph_ignored);
1173	return 0;
1174	}
1175
1176	if (!L->isLexingRawMode())
1177	L->Diag(CP-2, diag::trigraph_converted) << StringRef(&Res, 1);
1178	return Res;
1179	}
1180
1181	/// getEscapedNewLineSize - Return the size of the specified escaped newline,
1182	/// or 0 if it is not an escaped newline. P[-1] is known to be a "\" or a
1183	/// trigraph equivalent on entry to this function.
1184	unsigned Lexer::getEscapedNewLineSize(const char *Ptr) {
1185	unsigned Size = 0;
1186	while (isWhitespace(Ptr[Size])) {
1187	++Size;
1188
1189	if (Ptr[Size-1] != '\n' && Ptr[Size-1] != '\r')
1190	continue;
1191
1192	// If this is a \r\n or \n\r, skip the other half.
1193	if ((Ptr[Size] == '\r' \|\| Ptr[Size] == '\n') &&
1194	Ptr[Size-1] != Ptr[Size])
1195	++Size;
1196
1197	return Size;
1198	}
1199
1200	// Not an escaped newline, must be a \t or something else.
1201	return 0;
1202	}
1203
1204	/// SkipEscapedNewLines - If P points to an escaped newline (or a series of
1205	/// them), skip over them and return the first non-escaped-newline found,
1206	/// otherwise return P.
1207	const char Lexer::SkipEscapedNewLines(const char P) {
1208	while (true) {
1209	const char *AfterEscape;
1210	if (*P == '\\') {
1211	AfterEscape = P+1;
1212	} else if (*P == '?') {
1213	// If not a trigraph for escape, bail out.
1214	if (P[1] != '?' \|\| P[2] != '/')
1215	return P;
1216	// FIXME: Take LangOpts into account; the language might not
1217	// support trigraphs.
1218	AfterEscape = P+3;
1219	} else {
1220	return P;
1221	}
1222
1223	unsigned NewLineSize = Lexer::getEscapedNewLineSize(AfterEscape);
1224	if (NewLineSize == 0) return P;
1225	P = AfterEscape+NewLineSize;
1226	}
1227	}
1228
1229	Optional<Token> Lexer::findNextToken(SourceLocation Loc,
1230	const SourceManager &SM,
1231	const LangOptions &LangOpts) {
1232	if (Loc.isMacroID()) {
1233	if (!Lexer::isAtEndOfMacroExpansion(Loc, SM, LangOpts, &Loc))
1234	return None;
1235	}
1236	Loc = Lexer::getLocForEndOfToken(Loc, 0, SM, LangOpts);
1237
1238	// Break down the source location.
1239	std::pair<FileID, unsigned> LocInfo = SM.getDecomposedLoc(Loc);
1240
1241	// Try to load the file buffer.
1242	bool InvalidTemp = false;
1243	StringRef File = SM.getBufferData(LocInfo.first, &InvalidTemp);
1244	if (InvalidTemp)
1245	return None;
1246
1247	const char *TokenBegin = File.data() + LocInfo.second;
1248
1249	// Lex from the start of the given location.
1250	Lexer lexer(SM.getLocForStartOfFile(LocInfo.first), LangOpts, File.begin(),
1251	TokenBegin, File.end());
1252	// Find the token.
1253	Token Tok;
1254	lexer.LexFromRawLexer(Tok);
1255	return Tok;
1256	}
1257
1258	/// Checks that the given token is the first token that occurs after the
1259	/// given location (this excludes comments and whitespace). Returns the location
1260	/// immediately after the specified token. If the token is not found or the
1261	/// location is inside a macro, the returned source location will be invalid.
1262	SourceLocation Lexer::findLocationAfterToken(
1263	SourceLocation Loc, tok::TokenKind TKind, const SourceManager &SM,
1264	const LangOptions &LangOpts, bool SkipTrailingWhitespaceAndNewLine) {
1265	Optional<Token> Tok = findNextToken(Loc, SM, LangOpts);
1266	if (!Tok \|\| Tok->isNot(TKind))
1267	return {};
1268	SourceLocation TokenLoc = Tok->getLocation();
1269
1270	// Calculate how much whitespace needs to be skipped if any.
1271	unsigned NumWhitespaceChars = 0;
1272	if (SkipTrailingWhitespaceAndNewLine) {
1273	const char *TokenEnd = SM.getCharacterData(TokenLoc) + Tok->getLength();
1274	unsigned char C = *TokenEnd;
1275	while (isHorizontalWhitespace(C)) {
1276	C = *(++TokenEnd);
1277	NumWhitespaceChars++;
1278	}
1279
1280	// Skip \r, \n, \r\n, or \n\r
1281	if (C == '\n' \|\| C == '\r') {
1282	char PrevC = C;
1283	C = *(++TokenEnd);
1284	NumWhitespaceChars++;
1285	if ((C == '\n' \|\| C == '\r') && C != PrevC)
1286	NumWhitespaceChars++;
1287	}
1288	}
1289
1290	return TokenLoc.getLocWithOffset(Tok->getLength() + NumWhitespaceChars);
1291	}
1292
1293	/// getCharAndSizeSlow - Peek a single 'character' from the specified buffer,
1294	/// get its size, and return it. This is tricky in several cases:
1295	/// 1. If currently at the start of a trigraph, we warn about the trigraph,
1296	/// then either return the trigraph (skipping 3 chars) or the '?',
1297	/// depending on whether trigraphs are enabled or not.
1298	/// 2. If this is an escaped newline (potentially with whitespace between
1299	/// the backslash and newline), implicitly skip the newline and return
1300	/// the char after it.
1301	///
1302	/// This handles the slow/uncommon case of the getCharAndSize method. Here we
1303	/// know that we can accumulate into Size, and that we have already incremented
1304	/// Ptr by Size bytes.
1305	///
1306	/// NOTE: When this method is updated, getCharAndSizeSlowNoWarn (below) should
1307	/// be updated to match.
1308	char Lexer::getCharAndSizeSlow(const char *Ptr, unsigned &Size,
1309	Token *Tok) {
1310	// If we have a slash, look for an escaped newline.
1311	if (Ptr[0] == '\\') {
1312	++Size;
1313	++Ptr;
1314	Slash:
1315	// Common case, backslash-char where the char is not whitespace.
1316	if (!isWhitespace(Ptr[0])) return '\\';
1317
1318	// See if we have optional whitespace characters between the slash and
1319	// newline.
1320	if (unsigned EscapedNewLineSize = getEscapedNewLineSize(Ptr)) {
1321	// Remember that this token needs to be cleaned.
1322	if (Tok) Tok->setFlag(Token::NeedsCleaning);
1323
1324	// Warn if there was whitespace between the backslash and newline.
1325	if (Ptr[0] != '\n' && Ptr[0] != '\r' && Tok && !isLexingRawMode())
1326	Diag(Ptr, diag::backslash_newline_space);
1327
1328	// Found backslash<whitespace><newline>. Parse the char after it.
1329	Size += EscapedNewLineSize;
1330	Ptr += EscapedNewLineSize;
1331
1332	// Use slow version to accumulate a correct size field.
1333	return getCharAndSizeSlow(Ptr, Size, Tok);
1334	}
1335
1336	// Otherwise, this is not an escaped newline, just return the slash.
1337	return '\\';
1338	}
1339
1340	// If this is a trigraph, process it.
1341	if (Ptr[0] == '?' && Ptr[1] == '?') {
1342	// If this is actually a legal trigraph (not something like "??x"), emit
1343	// a trigraph warning. If so, and if trigraphs are enabled, return it.
1344	if (char C = DecodeTrigraphChar(Ptr+2, Tok ? this : nullptr)) {
1345	// Remember that this token needs to be cleaned.
1346	if (Tok) Tok->setFlag(Token::NeedsCleaning);
1347
1348	Ptr += 3;
1349	Size += 3;
1350	if (C == '\\') goto Slash;
1351	return C;
1352	}
1353	}
1354
1355	// If this is neither, return a single character.
1356	++Size;
1357	return *Ptr;
1358	}
1359
1360	/// getCharAndSizeSlowNoWarn - Handle the slow/uncommon case of the
1361	/// getCharAndSizeNoWarn method. Here we know that we can accumulate into Size,
1362	/// and that we have already incremented Ptr by Size bytes.
1363	///
1364	/// NOTE: When this method is updated, getCharAndSizeSlow (above) should
1365	/// be updated to match.
1366	char Lexer::getCharAndSizeSlowNoWarn(const char *Ptr, unsigned &Size,
1367	const LangOptions &LangOpts) {
1368	// If we have a slash, look for an escaped newline.
1369	if (Ptr[0] == '\\') {
1370	++Size;
1371	++Ptr;
1372	Slash:
1373	// Common case, backslash-char where the char is not whitespace.
1374	if (!isWhitespace(Ptr[0])) return '\\';
1375
1376	// See if we have optional whitespace characters followed by a newline.
1377	if (unsigned EscapedNewLineSize = getEscapedNewLineSize(Ptr)) {
1378	// Found backslash<whitespace><newline>. Parse the char after it.
1379	Size += EscapedNewLineSize;
1380	Ptr += EscapedNewLineSize;
1381
1382	// Use slow version to accumulate a correct size field.
1383	return getCharAndSizeSlowNoWarn(Ptr, Size, LangOpts);
1384	}
1385
1386	// Otherwise, this is not an escaped newline, just return the slash.
1387	return '\\';
1388	}
1389
1390	// If this is a trigraph, process it.
1391	if (LangOpts.Trigraphs && Ptr[0] == '?' && Ptr[1] == '?') {
1392	// If this is actually a legal trigraph (not something like "??x"), return
1393	// it.
1394	if (char C = GetTrigraphCharForLetter(Ptr[2])) {
1395	Ptr += 3;
1396	Size += 3;
1397	if (C == '\\') goto Slash;
1398	return C;
1399	}
1400	}
1401
1402	// If this is neither, return a single character.
1403	++Size;
1404	return *Ptr;
1405	}
1406
1407	//===----------------------------------------------------------------------===//
1408	// Helper methods for lexing.
1409	//===----------------------------------------------------------------------===//
1410
1411	/// Routine that indiscriminately sets the offset into the source file.
1412	void Lexer::SetByteOffset(unsigned Offset, bool StartOfLine) {
1413	BufferPtr = BufferStart + Offset;
1414	if (BufferPtr > BufferEnd)
1415	BufferPtr = BufferEnd;
1416	// FIXME: What exactly does the StartOfLine bit mean? There are two
1417	// possible meanings for the "start" of the line: the first token on the
1418	// unexpanded line, or the first token on the expanded line.
1419	IsAtStartOfLine = StartOfLine;
1420	IsAtPhysicalStartOfLine = StartOfLine;
1421	}
1422
1423	static bool isAllowedIDChar(uint32_t C, const LangOptions &LangOpts) {
1424	if (LangOpts.AsmPreprocessor) {
1425	return false;
1426	} else if (LangOpts.CPlusPlus11 \|\| LangOpts.C11) {
1427	static const llvm::sys::UnicodeCharSet C11AllowedIDChars(
1428	C11AllowedIDCharRanges);
1429	return C11AllowedIDChars.contains(C);
1430	} else if (LangOpts.CPlusPlus) {
1431	static const llvm::sys::UnicodeCharSet CXX03AllowedIDChars(
1432	CXX03AllowedIDCharRanges);
1433	return CXX03AllowedIDChars.contains(C);
1434	} else {
1435	static const llvm::sys::UnicodeCharSet C99AllowedIDChars(
1436	C99AllowedIDCharRanges);
1437	return C99AllowedIDChars.contains(C);
1438	}
1439	}
1440
1441	static bool isAllowedInitiallyIDChar(uint32_t C, const LangOptions &LangOpts) {
1442	assert(isAllowedIDChar(C, LangOpts))((isAllowedIDChar(C, LangOpts)) ? static_cast<void> (0) : __assert_fail ("isAllowedIDChar(C, LangOpts)", "/build/llvm-toolchain-snapshot-9~svn362543/tools/clang/lib/Lex/Lexer.cpp" , 1442, __PRETTY_FUNCTION__));
1443	if (LangOpts.AsmPreprocessor) {
1444	return false;
1445	} else if (LangOpts.CPlusPlus11 \|\| LangOpts.C11) {
1446	static const llvm::sys::UnicodeCharSet C11DisallowedInitialIDChars(
1447	C11DisallowedInitialIDCharRanges);
1448	return !C11DisallowedInitialIDChars.contains(C);
1449	} else if (LangOpts.CPlusPlus) {
1450	return true;
1451	} else {
1452	static const llvm::sys::UnicodeCharSet C99DisallowedInitialIDChars(
1453	C99DisallowedInitialIDCharRanges);
1454	return !C99DisallowedInitialIDChars.contains(C);
1455	}
1456	}
1457
1458	static inline CharSourceRange makeCharRange(Lexer &L, const char *Begin,
1459	const char *End) {
1460	return CharSourceRange::getCharRange(L.getSourceLocation(Begin),
1461	L.getSourceLocation(End));
1462	}
1463
1464	static void maybeDiagnoseIDCharCompat(DiagnosticsEngine &Diags, uint32_t C,
1465	CharSourceRange Range, bool IsFirst) {
1466	// Check C99 compatibility.
1467	if (!Diags.isIgnored(diag::warn_c99_compat_unicode_id, Range.getBegin())) {
1468	enum {
1469	CannotAppearInIdentifier = 0,
1470	CannotStartIdentifier
1471	};
1472
1473	static const llvm::sys::UnicodeCharSet C99AllowedIDChars(
1474	C99AllowedIDCharRanges);
1475	static const llvm::sys::UnicodeCharSet C99DisallowedInitialIDChars(
1476	C99DisallowedInitialIDCharRanges);
1477	if (!C99AllowedIDChars.contains(C)) {
1478	Diags.Report(Range.getBegin(), diag::warn_c99_compat_unicode_id)
1479	<< Range
1480	<< CannotAppearInIdentifier;
1481	} else if (IsFirst && C99DisallowedInitialIDChars.contains(C)) {
1482	Diags.Report(Range.getBegin(), diag::warn_c99_compat_unicode_id)
1483	<< Range
1484	<< CannotStartIdentifier;
1485	}
1486	}
1487
1488	// Check C++98 compatibility.
1489	if (!Diags.isIgnored(diag::warn_cxx98_compat_unicode_id, Range.getBegin())) {
1490	static const llvm::sys::UnicodeCharSet CXX03AllowedIDChars(
1491	CXX03AllowedIDCharRanges);
1492	if (!CXX03AllowedIDChars.contains(C)) {
1493	Diags.Report(Range.getBegin(), diag::warn_cxx98_compat_unicode_id)
1494	<< Range;
1495	}
1496	}
1497	}
1498
1499	/// After encountering UTF-8 character C and interpreting it as an identifier
1500	/// character, check whether it's a homoglyph for a common non-identifier
1501	/// source character that is unlikely to be an intentional identifier
1502	/// character and warn if so.
1503	static void maybeDiagnoseUTF8Homoglyph(DiagnosticsEngine &Diags, uint32_t C,
1504	CharSourceRange Range) {
1505	// FIXME: Handle Unicode quotation marks (smart quotes, fullwidth quotes).
1506	struct HomoglyphPair {
1507	uint32_t Character;
1508	char LooksLike;
1509	bool operator<(HomoglyphPair R) const { return Character < R.Character; }
1510	};
1511	static constexpr HomoglyphPair SortedHomoglyphs[] = {
1512	{U'\u00ad', 0}, // SOFT HYPHEN
1513	{U'\u01c3', '!'}, // LATIN LETTER RETROFLEX CLICK
1514	{U'\u037e', ';'}, // GREEK QUESTION MARK
1515	{U'\u200b', 0}, // ZERO WIDTH SPACE
1516	{U'\u200c', 0}, // ZERO WIDTH NON-JOINER
1517	{U'\u200d', 0}, // ZERO WIDTH JOINER
1518	{U'\u2060', 0}, // WORD JOINER
1519	{U'\u2061', 0}, // FUNCTION APPLICATION
1520	{U'\u2062', 0}, // INVISIBLE TIMES
1521	{U'\u2063', 0}, // INVISIBLE SEPARATOR
1522	{U'\u2064', 0}, // INVISIBLE PLUS
1523	{U'\u2212', '-'}, // MINUS SIGN
1524	{U'\u2215', '/'}, // DIVISION SLASH
1525	{U'\u2216', '\\'}, // SET MINUS
1526	{U'\u2217', '*'}, // ASTERISK OPERATOR
1527	{U'\u2223', '\|'}, // DIVIDES
1528	{U'\u2227', '^'}, // LOGICAL AND
1529	{U'\u2236', ':'}, // RATIO
1530	{U'\u223c', '~'}, // TILDE OPERATOR
1531	{U'\ua789', ':'}, // MODIFIER LETTER COLON
1532	{U'\ufeff', 0}, // ZERO WIDTH NO-BREAK SPACE
1533	{U'\uff01', '!'}, // FULLWIDTH EXCLAMATION MARK
1534	{U'\uff03', '#'}, // FULLWIDTH NUMBER SIGN
1535	{U'\uff04', '$'}, // FULLWIDTH DOLLAR SIGN
1536	{U'\uff05', '%'}, // FULLWIDTH PERCENT SIGN
1537	{U'\uff06', '&'}, // FULLWIDTH AMPERSAND
1538	{U'\uff08', '('}, // FULLWIDTH LEFT PARENTHESIS
1539	{U'\uff09', ')'}, // FULLWIDTH RIGHT PARENTHESIS
1540	{U'\uff0a', '*'}, // FULLWIDTH ASTERISK
1541	{U'\uff0b', '+'}, // FULLWIDTH ASTERISK
1542	{U'\uff0c', ','}, // FULLWIDTH COMMA
1543	{U'\uff0d', '-'}, // FULLWIDTH HYPHEN-MINUS
1544	{U'\uff0e', '.'}, // FULLWIDTH FULL STOP
1545	{U'\uff0f', '/'}, // FULLWIDTH SOLIDUS
1546	{U'\uff1a', ':'}, // FULLWIDTH COLON
1547	{U'\uff1b', ';'}, // FULLWIDTH SEMICOLON
1548	{U'\uff1c', '<'}, // FULLWIDTH LESS-THAN SIGN
1549	{U'\uff1d', '='}, // FULLWIDTH EQUALS SIGN
1550	{U'\uff1e', '>'}, // FULLWIDTH GREATER-THAN SIGN
1551	{U'\uff1f', '?'}, // FULLWIDTH QUESTION MARK
1552	{U'\uff20', '@'}, // FULLWIDTH COMMERCIAL AT
1553	{U'\uff3b', '['}, // FULLWIDTH LEFT SQUARE BRACKET
1554	{U'\uff3c', '\\'}, // FULLWIDTH REVERSE SOLIDUS
1555	{U'\uff3d', ']'}, // FULLWIDTH RIGHT SQUARE BRACKET
1556	{U'\uff3e', '^'}, // FULLWIDTH CIRCUMFLEX ACCENT
1557	{U'\uff5b', '{'}, // FULLWIDTH LEFT CURLY BRACKET
1558	{U'\uff5c', '\|'}, // FULLWIDTH VERTICAL LINE
1559	{U'\uff5d', '}'}, // FULLWIDTH RIGHT CURLY BRACKET
1560	{U'\uff5e', '~'}, // FULLWIDTH TILDE
1561	{0, 0}
1562	};
1563	auto Homoglyph =
1564	std::lower_bound(std::begin(SortedHomoglyphs),
1565	std::end(SortedHomoglyphs) - 1, HomoglyphPair{C, '\0'});
1566	if (Homoglyph->Character == C) {
1567	llvm::SmallString<5> CharBuf;
1568	{
1569	llvm::raw_svector_ostream CharOS(CharBuf);
1570	llvm::write_hex(CharOS, C, llvm::HexPrintStyle::Upper, 4);
1571	}
1572	if (Homoglyph->LooksLike) {
1573	const char LooksLikeStr[] = {Homoglyph->LooksLike, 0};
1574	Diags.Report(Range.getBegin(), diag::warn_utf8_symbol_homoglyph)
1575	<< Range << CharBuf << LooksLikeStr;
1576	} else {
1577	Diags.Report(Range.getBegin(), diag::warn_utf8_symbol_zero_width)
1578	<< Range << CharBuf;
1579	}
1580	}
1581	}
1582
1583	bool Lexer::tryConsumeIdentifierUCN(const char *&CurPtr, unsigned Size,
1584	Token &Result) {
1585	const char *UCNPtr = CurPtr + Size;
1586	uint32_t CodePoint = tryReadUCN(UCNPtr, CurPtr, /Token=/nullptr);
1587	if (CodePoint == 0 \|\| !isAllowedIDChar(CodePoint, LangOpts))
1588	return false;
1589
1590	if (!isLexingRawMode())
1591	maybeDiagnoseIDCharCompat(PP->getDiagnostics(), CodePoint,
1592	makeCharRange(*this, CurPtr, UCNPtr),
1593	/IsFirst=/false);
1594
1595	Result.setFlag(Token::HasUCN);
1596	if ((UCNPtr - CurPtr == 6 && CurPtr[1] == 'u') \|\|
1597	(UCNPtr - CurPtr == 10 && CurPtr[1] == 'U'))
1598	CurPtr = UCNPtr;
1599	else
1600	while (CurPtr != UCNPtr)
1601	(void)getAndAdvanceChar(CurPtr, Result);
1602	return true;
1603	}
1604
1605	bool Lexer::tryConsumeIdentifierUTF8Char(const char *&CurPtr) {
1606	const char *UnicodePtr = CurPtr;
1607	llvm::UTF32 CodePoint;
1608	llvm::ConversionResult Result =
1609	llvm::convertUTF8Sequence((const llvm::UTF8 **)&UnicodePtr,
1610	(const llvm::UTF8 *)BufferEnd,
1611	&CodePoint,
1612	llvm::strictConversion);
1613	if (Result != llvm::conversionOK \|\|
1614	!isAllowedIDChar(static_cast<uint32_t>(CodePoint), LangOpts))
1615	return false;
1616
1617	if (!isLexingRawMode()) {
1618	maybeDiagnoseIDCharCompat(PP->getDiagnostics(), CodePoint,
1619	makeCharRange(*this, CurPtr, UnicodePtr),
1620	/IsFirst=/false);
1621	maybeDiagnoseUTF8Homoglyph(PP->getDiagnostics(), CodePoint,
1622	makeCharRange(*this, CurPtr, UnicodePtr));
1623	}
1624
1625	CurPtr = UnicodePtr;
1626	return true;
1627	}
1628
1629	bool Lexer::LexIdentifier(Token &Result, const char *CurPtr) {
1630	// Match [_A-Za-z0-9]*, we have already matched [_A-Za-z$]
1631	unsigned Size;
1632	unsigned char C = *CurPtr++;
1633	while (isIdentifierBody(C))
1634	C = *CurPtr++;
1635
1636	--CurPtr; // Back up over the skipped character.
1637
1638	// Fast path, no $,\,? in identifier found. '\' might be an escaped newline
1639	// or UCN, and ? might be a trigraph for '\', an escaped newline or UCN.
1640	//
1641	// TODO: Could merge these checks into an InfoTable flag to make the
1642	// comparison cheaper
1643	if (isASCII(C) && C != '\\' && C != '?' &&
1644	(C != '$' \|\| !LangOpts.DollarIdents)) {
1645	FinishIdentifier:
1646	const char *IdStart = BufferPtr;
1647	FormTokenWithChars(Result, CurPtr, tok::raw_identifier);
1648	Result.setRawIdentifierData(IdStart);
1649
1650	// If we are in raw mode, return this identifier raw. There is no need to
1651	// look up identifier information or attempt to macro expand it.
1652	if (LexingRawMode)
1653	return true;
1654
1655	// Fill in Result.IdentifierInfo and update the token kind,
1656	// looking up the identifier in the identifier table.
1657	IdentifierInfo *II = PP->LookUpIdentifierInfo(Result);
1658	// Note that we have to call PP->LookUpIdentifierInfo() even for code
1659	// completion, it writes IdentifierInfo into Result, and callers rely on it.
1660
1661	// If the completion point is at the end of an identifier, we want to treat
1662	// the identifier as incomplete even if it resolves to a macro or a keyword.
1663	// This allows e.g. 'class^' to complete to 'classifier'.
1664	if (isCodeCompletionPoint(CurPtr)) {
1665	// Return the code-completion token.
1666	Result.setKind(tok::code_completion);
1667	// Skip the code-completion char and all immediate identifier characters.
1668	// This ensures we get consistent behavior when completing at any point in
1669	// an identifier (i.e. at the start, in the middle, at the end). Note that
1670	// only simple cases (i.e. [a-zA-Z0-9_]) are supported to keep the code
1671	// simpler.
1672	assert(CurPtr == 0 && "Completion character must be 0")((CurPtr == 0 && "Completion character must be 0") ? static_cast<void> (0) : __assert_fail ("*CurPtr == 0 && \"Completion character must be 0\"" , "/build/llvm-toolchain-snapshot-9~svn362543/tools/clang/lib/Lex/Lexer.cpp" , 1672, __PRETTY_FUNCTION__));
1673	++CurPtr;
1674	// Note that code completion token is not added as a separate character
1675	// when the completion point is at the end of the buffer. Therefore, we need
1676	// to check if the buffer has ended.
1677	if (CurPtr < BufferEnd) {
1678	while (isIdentifierBody(*CurPtr))
1679	++CurPtr;
1680	}
1681	BufferPtr = CurPtr;
1682	return true;
1683	}
1684
1685	// Finally, now that we know we have an identifier, pass this off to the
1686	// preprocessor, which may macro expand it or something.
1687	if (II->isHandleIdentifierCase())
1688	return PP->HandleIdentifier(Result);
1689
1690	return true;
1691	}
1692
1693	// Otherwise, $,\,? in identifier found. Enter slower path.
1694
1695	C = getCharAndSize(CurPtr, Size);
1696	while (true) {
1697	if (C == '$') {
1698	// If we hit a $ and they are not supported in identifiers, we are done.
1699	if (!LangOpts.DollarIdents) goto FinishIdentifier;
1700
1701	// Otherwise, emit a diagnostic and continue.
1702	if (!isLexingRawMode())
1703	Diag(CurPtr, diag::ext_dollar_in_identifier);
1704	CurPtr = ConsumeChar(CurPtr, Size, Result);
1705	C = getCharAndSize(CurPtr, Size);
1706	continue;
1707	} else if (C == '\\' && tryConsumeIdentifierUCN(CurPtr, Size, Result)) {
1708	C = getCharAndSize(CurPtr, Size);
1709	continue;
1710	} else if (!isASCII(C) && tryConsumeIdentifierUTF8Char(CurPtr)) {
1711	C = getCharAndSize(CurPtr, Size);
1712	continue;
1713	} else if (!isIdentifierBody(C)) {
1714	goto FinishIdentifier;
1715	}
1716
1717	// Otherwise, this character is good, consume it.
1718	CurPtr = ConsumeChar(CurPtr, Size, Result);
1719
1720	C = getCharAndSize(CurPtr, Size);
1721	while (isIdentifierBody(C)) {
1722	CurPtr = ConsumeChar(CurPtr, Size, Result);
1723	C = getCharAndSize(CurPtr, Size);
1724	}
1725	}
1726	}
1727
1728	/// isHexaLiteral - Return true if Start points to a hex constant.
1729	/// in microsoft mode (where this is supposed to be several different tokens).
1730	bool Lexer::isHexaLiteral(const char *Start, const LangOptions &LangOpts) {
1731	unsigned Size;
1732	char C1 = Lexer::getCharAndSizeNoWarn(Start, Size, LangOpts);
1733	if (C1 != '0')
1734	return false;
1735	char C2 = Lexer::getCharAndSizeNoWarn(Start + Size, Size, LangOpts);
1736	return (C2 == 'x' \|\| C2 == 'X');
1737	}
1738
1739	/// LexNumericConstant - Lex the remainder of a integer or floating point
1740	/// constant. From[-1] is the first character lexed. Return the end of the
1741	/// constant.
1742	bool Lexer::LexNumericConstant(Token &Result, const char *CurPtr) {
1743	unsigned Size;
1744	char C = getCharAndSize(CurPtr, Size);
1745	char PrevCh = 0;
1746	while (isPreprocessingNumberBody(C)) {
1747	CurPtr = ConsumeChar(CurPtr, Size, Result);
1748	PrevCh = C;
1749	C = getCharAndSize(CurPtr, Size);
1750	}
1751
1752	// If we fell out, check for a sign, due to 1e+12. If we have one, continue.
1753	if ((C == '-' \|\| C == '+') && (PrevCh == 'E' \|\| PrevCh == 'e')) {
1754	// If we are in Microsoft mode, don't continue if the constant is hex.
1755	// For example, MSVC will accept the following as 3 tokens: 0x1234567e+1
1756	if (!LangOpts.MicrosoftExt \|\| !isHexaLiteral(BufferPtr, LangOpts))
1757	return LexNumericConstant(Result, ConsumeChar(CurPtr, Size, Result));
1758	}
1759
1760	// If we have a hex FP constant, continue.
1761	if ((C == '-' \|\| C == '+') && (PrevCh == 'P' \|\| PrevCh == 'p')) {
1762	// Outside C99 and C++17, we accept hexadecimal floating point numbers as a
1763	// not-quite-conforming extension. Only do so if this looks like it's
1764	// actually meant to be a hexfloat, and not if it has a ud-suffix.
1765	bool IsHexFloat = true;
1766	if (!LangOpts.C99) {
1767	if (!isHexaLiteral(BufferPtr, LangOpts))
1768	IsHexFloat = false;
1769	else if (!getLangOpts().CPlusPlus17 &&
1770	std::find(BufferPtr, CurPtr, '_') != CurPtr)
1771	IsHexFloat = false;
1772	}
1773	if (IsHexFloat)
1774	return LexNumericConstant(Result, ConsumeChar(CurPtr, Size, Result));
1775	}
1776
1777	// If we have a digit separator, continue.
1778	if (C == '\'' && getLangOpts().CPlusPlus14) {
1779	unsigned NextSize;
1780	char Next = getCharAndSizeNoWarn(CurPtr + Size, NextSize, getLangOpts());
1781	if (isIdentifierBody(Next)) {
1782	if (!isLexingRawMode())
1783	Diag(CurPtr, diag::warn_cxx11_compat_digit_separator);
1784	CurPtr = ConsumeChar(CurPtr, Size, Result);
1785	CurPtr = ConsumeChar(CurPtr, NextSize, Result);
1786	return LexNumericConstant(Result, CurPtr);
1787	}
1788	}
1789
1790	// If we have a UCN or UTF-8 character (perhaps in a ud-suffix), continue.
1791	if (C == '\\' && tryConsumeIdentifierUCN(CurPtr, Size, Result))
1792	return LexNumericConstant(Result, CurPtr);
1793	if (!isASCII(C) && tryConsumeIdentifierUTF8Char(CurPtr))
1794	return LexNumericConstant(Result, CurPtr);
1795
1796	// Update the location of token as well as BufferPtr.
1797	const char *TokStart = BufferPtr;
1798	FormTokenWithChars(Result, CurPtr, tok::numeric_constant);
1799	Result.setLiteralData(TokStart);
1800	return true;
1801	}
1802
1803	/// LexUDSuffix - Lex the ud-suffix production for user-defined literal suffixes
1804	/// in C++11, or warn on a ud-suffix in C++98.
1805	const char Lexer::LexUDSuffix(Token &Result, const char CurPtr,
1806	bool IsStringLiteral) {
1807	assert(getLangOpts().CPlusPlus)((getLangOpts().CPlusPlus) ? static_cast<void> (0) : __assert_fail ("getLangOpts().CPlusPlus", "/build/llvm-toolchain-snapshot-9~svn362543/tools/clang/lib/Lex/Lexer.cpp" , 1807, __PRETTY_FUNCTION__));
1808
1809	// Maximally munch an identifier.
1810	unsigned Size;
1811	char C = getCharAndSize(CurPtr, Size);
1812	bool Consumed = false;
1813
1814	if (!isIdentifierHead(C)) {
1815	if (C == '\\' && tryConsumeIdentifierUCN(CurPtr, Size, Result))
1816	Consumed = true;
1817	else if (!isASCII(C) && tryConsumeIdentifierUTF8Char(CurPtr))
1818	Consumed = true;
1819	else
1820	return CurPtr;
1821	}
1822
1823	if (!getLangOpts().CPlusPlus11) {
1824	if (!isLexingRawMode())
1825	Diag(CurPtr,
1826	C == '_' ? diag::warn_cxx11_compat_user_defined_literal
1827	: diag::warn_cxx11_compat_reserved_user_defined_literal)
1828	<< FixItHint::CreateInsertion(getSourceLocation(CurPtr), " ");
1829	return CurPtr;
1830	}
1831
1832	// C++11 [lex.ext]p10, [usrlit.suffix]p1: A program containing a ud-suffix
1833	// that does not start with an underscore is ill-formed. As a conforming
1834	// extension, we treat all such suffixes as if they had whitespace before
1835	// them. We assume a suffix beginning with a UCN or UTF-8 character is more
1836	// likely to be a ud-suffix than a macro, however, and accept that.
1837	if (!Consumed) {
1838	bool IsUDSuffix = false;
1839	if (C == '_')
1840	IsUDSuffix = true;
1841	else if (IsStringLiteral && getLangOpts().CPlusPlus14) {
1842	// In C++1y, we need to look ahead a few characters to see if this is a
1843	// valid suffix for a string literal or a numeric literal (this could be
1844	// the 'operator""if' defining a numeric literal operator).
1845	const unsigned MaxStandardSuffixLength = 3;
1846	char Buffer[MaxStandardSuffixLength] = { C };
1847	unsigned Consumed = Size;
1848	unsigned Chars = 1;
1849	while (true) {
1850	unsigned NextSize;
1851	char Next = getCharAndSizeNoWarn(CurPtr + Consumed, NextSize,
1852	getLangOpts());
1853	if (!isIdentifierBody(Next)) {
1854	// End of suffix. Check whether this is on the whitelist.
1855	const StringRef CompleteSuffix(Buffer, Chars);
1856	IsUDSuffix = StringLiteralParser::isValidUDSuffix(getLangOpts(),
1857	CompleteSuffix);
1858	break;
1859	}
1860
1861	if (Chars == MaxStandardSuffixLength)
1862	// Too long: can't be a standard suffix.
1863	break;
1864
1865	Buffer[Chars++] = Next;
1866	Consumed += NextSize;
1867	}
1868	}
1869
1870	if (!IsUDSuffix) {
1871	if (!isLexingRawMode())
1872	Diag(CurPtr, getLangOpts().MSVCCompat
1873	? diag::ext_ms_reserved_user_defined_literal
1874	: diag::ext_reserved_user_defined_literal)
1875	<< FixItHint::CreateInsertion(getSourceLocation(CurPtr), " ");
1876	return CurPtr;
1877	}
1878
1879	CurPtr = ConsumeChar(CurPtr, Size, Result);
1880	}
1881
1882	Result.setFlag(Token::HasUDSuffix);
1883	while (true) {
1884	C = getCharAndSize(CurPtr, Size);
1885	if (isIdentifierBody(C)) { CurPtr = ConsumeChar(CurPtr, Size, Result); }
1886	else if (C == '\\' && tryConsumeIdentifierUCN(CurPtr, Size, Result)) {}
1887	else if (!isASCII(C) && tryConsumeIdentifierUTF8Char(CurPtr)) {}
1888	else break;
1889	}
1890
1891	return CurPtr;
1892	}
1893
1894	/// LexStringLiteral - Lex the remainder of a string literal, after having lexed
1895	/// either " or L" or u8" or u" or U".
1896	bool Lexer::LexStringLiteral(Token &Result, const char *CurPtr,
1897	tok::TokenKind Kind) {
1898	const char *AfterQuote = CurPtr;
1899	// Does this string contain the \0 character?
1900	const char *NulCharacter = nullptr;
1901
1902	if (!isLexingRawMode() &&
1903	(Kind == tok::utf8_string_literal \|\|
1904	Kind == tok::utf16_string_literal \|\|
1905	Kind == tok::utf32_string_literal))
1906	Diag(BufferPtr, getLangOpts().CPlusPlus
1907	? diag::warn_cxx98_compat_unicode_literal
1908	: diag::warn_c99_compat_unicode_literal);
1909
1910	char C = getAndAdvanceChar(CurPtr, Result);
1911	while (C != '"') {
1912	// Skip escaped characters. Escaped newlines will already be processed by
1913	// getAndAdvanceChar.
1914	if (C == '\\')
1915	C = getAndAdvanceChar(CurPtr, Result);
1916
1917	if (C == '\n' \|\| C == '\r' \|\| // Newline.
1918	(C == 0 && CurPtr-1 == BufferEnd)) { // End of file.
1919	if (!isLexingRawMode() && !LangOpts.AsmPreprocessor)
1920	Diag(BufferPtr, diag::ext_unterminated_char_or_string) << 1;
1921	FormTokenWithChars(Result, CurPtr-1, tok::unknown);
1922	return true;
1923	}
1924
1925	if (C == 0) {
1926	if (isCodeCompletionPoint(CurPtr-1)) {
1927	if (ParsingFilename)
1928	codeCompleteIncludedFile(AfterQuote, CurPtr - 1, /IsAngled=/false);
1929	else
1930	PP->CodeCompleteNaturalLanguage();
1931	FormTokenWithChars(Result, CurPtr - 1, tok::unknown);
1932	cutOffLexing();
1933	return true;
1934	}
1935
1936	NulCharacter = CurPtr-1;
1937	}
1938	C = getAndAdvanceChar(CurPtr, Result);
1939	}
1940
1941	// If we are in C++11, lex the optional ud-suffix.
1942	if (getLangOpts().CPlusPlus)
1943	CurPtr = LexUDSuffix(Result, CurPtr, true);
1944
1945	// If a nul character existed in the string, warn about it.
1946	if (NulCharacter && !isLexingRawMode())
1947	Diag(NulCharacter, diag::null_in_char_or_string) << 1;
1948
1949	// Update the location of the token as well as the BufferPtr instance var.
1950	const char *TokStart = BufferPtr;
1951	FormTokenWithChars(Result, CurPtr, Kind);
1952	Result.setLiteralData(TokStart);
1953	return true;
1954	}
1955
1956	/// LexRawStringLiteral - Lex the remainder of a raw string literal, after
1957	/// having lexed R", LR", u8R", uR", or UR".
1958	bool Lexer::LexRawStringLiteral(Token &Result, const char *CurPtr,
1959	tok::TokenKind Kind) {
1960	// This function doesn't use getAndAdvanceChar because C++0x [lex.pptoken]p3:
1961	// Between the initial and final double quote characters of the raw string,
1962	// any transformations performed in phases 1 and 2 (trigraphs,
1963	// universal-character-names, and line splicing) are reverted.
1964
1965	if (!isLexingRawMode())
1966	Diag(BufferPtr, diag::warn_cxx98_compat_raw_string_literal);
1967
1968	unsigned PrefixLen = 0;
1969
1970	while (PrefixLen != 16 && isRawStringDelimBody(CurPtr[PrefixLen]))
1971	++PrefixLen;
1972
1973	// If the last character was not a '(', then we didn't lex a valid delimiter.
1974	if (CurPtr[PrefixLen] != '(') {
1975	if (!isLexingRawMode()) {
1976	const char *PrefixEnd = &CurPtr[PrefixLen];
1977	if (PrefixLen == 16) {
1978	Diag(PrefixEnd, diag::err_raw_delim_too_long);
1979	} else {
1980	Diag(PrefixEnd, diag::err_invalid_char_raw_delim)
1981	<< StringRef(PrefixEnd, 1);
1982	}
1983	}
1984
1985	// Search for the next '"' in hopes of salvaging the lexer. Unfortunately,
1986	// it's possible the '"' was intended to be part of the raw string, but
1987	// there's not much we can do about that.
1988	while (true) {
1989	char C = *CurPtr++;
1990
1991	if (C == '"')
1992	break;
1993	if (C == 0 && CurPtr-1 == BufferEnd) {
1994	--CurPtr;
1995	break;
1996	}
1997	}
1998
1999	FormTokenWithChars(Result, CurPtr, tok::unknown);
2000	return true;
2001	}
2002
2003	// Save prefix and move CurPtr past it
2004	const char *Prefix = CurPtr;
2005	CurPtr += PrefixLen + 1; // skip over prefix and '('
2006
2007	while (true) {
2008	char C = *CurPtr++;
2009
2010	if (C == ')') {
2011	// Check for prefix match and closing quote.
2012	if (strncmp(CurPtr, Prefix, PrefixLen) == 0 && CurPtr[PrefixLen] == '"') {
2013	CurPtr += PrefixLen + 1; // skip over prefix and '"'
2014	break;
2015	}
2016	} else if (C == 0 && CurPtr-1 == BufferEnd) { // End of file.
2017	if (!isLexingRawMode())
2018	Diag(BufferPtr, diag::err_unterminated_raw_string)
2019	<< StringRef(Prefix, PrefixLen);
2020	FormTokenWithChars(Result, CurPtr-1, tok::unknown);
2021	return true;
2022	}
2023	}
2024
2025	// If we are in C++11, lex the optional ud-suffix.
2026	if (getLangOpts().CPlusPlus)
2027	CurPtr = LexUDSuffix(Result, CurPtr, true);
2028
2029	// Update the location of token as well as BufferPtr.
2030	const char *TokStart = BufferPtr;
2031	FormTokenWithChars(Result, CurPtr, Kind);
2032	Result.setLiteralData(TokStart);
2033	return true;
2034	}
2035
2036	/// LexAngledStringLiteral - Lex the remainder of an angled string literal,
2037	/// after having lexed the '<' character. This is used for #include filenames.
2038	bool Lexer::LexAngledStringLiteral(Token &Result, const char *CurPtr) {
2039	// Does this string contain the \0 character?
2040	const char *NulCharacter = nullptr;
2041	const char *AfterLessPos = CurPtr;
2042	char C = getAndAdvanceChar(CurPtr, Result);
2043	while (C != '>') {
2044	// Skip escaped characters. Escaped newlines will already be processed by
2045	// getAndAdvanceChar.
2046	if (C == '\\')
2047	C = getAndAdvanceChar(CurPtr, Result);
2048
2049	if (C == '\n' \|\| C == '\r' \|\| // Newline.
2050	(C == 0 && (CurPtr - 1 == BufferEnd))) { // End of file.
2051	// If the filename is unterminated, then it must just be a lone <
2052	// character. Return this as such.
2053	FormTokenWithChars(Result, AfterLessPos, tok::less);
2054	return true;
2055	}
2056
2057	if (C == 0) {
2058	if (isCodeCompletionPoint(CurPtr - 1)) {
2059	codeCompleteIncludedFile(AfterLessPos, CurPtr - 1, /IsAngled=/true);
2060	cutOffLexing();
2061	FormTokenWithChars(Result, CurPtr - 1, tok::unknown);
2062	return true;
2063	}
2064	NulCharacter = CurPtr-1;
2065	}
2066	C = getAndAdvanceChar(CurPtr, Result);
2067	}
2068
2069	// If a nul character existed in the string, warn about it.
2070	if (NulCharacter && !isLexingRawMode())
2071	Diag(NulCharacter, diag::null_in_char_or_string) << 1;
2072
2073	// Update the location of token as well as BufferPtr.
2074	const char *TokStart = BufferPtr;
2075	FormTokenWithChars(Result, CurPtr, tok::header_name);
2076	Result.setLiteralData(TokStart);
2077	return true;
2078	}
2079
2080	void Lexer::codeCompleteIncludedFile(const char *PathStart,
2081	const char *CompletionPoint,
2082	bool IsAngled) {
2083	// Completion only applies to the filename, after the last slash.
2084	StringRef PartialPath(PathStart, CompletionPoint - PathStart);
2085	auto Slash = PartialPath.find_last_of(LangOpts.MSVCCompat ? "/\\" : "/");
2086	StringRef Dir =
2087	(Slash == StringRef::npos) ? "" : PartialPath.take_front(Slash);
2088	const char *StartOfFilename =
2089	(Slash == StringRef::npos) ? PathStart : PathStart + Slash + 1;
2090	// Code completion filter range is the filename only, up to completion point.
2091	PP->setCodeCompletionIdentifierInfo(&PP->getIdentifierTable().get(
2092	StringRef(StartOfFilename, CompletionPoint - StartOfFilename)));
2093	// We should replace the characters up to the closing quote, if any.
2094	while (CompletionPoint < BufferEnd) {
2095	char Next = *(CompletionPoint + 1);
2096	if (Next == 0 \|\| Next == '\r' \|\| Next == '\n')
2097	break;
2098	++CompletionPoint;
2099	if (Next == (IsAngled ? '>' : '"'))
2100	break;
2101	}
2102	PP->setCodeCompletionTokenRange(
2103	FileLoc.getLocWithOffset(StartOfFilename - BufferStart),
2104	FileLoc.getLocWithOffset(CompletionPoint - BufferStart));
2105	PP->CodeCompleteIncludedFile(Dir, IsAngled);
2106	}
2107
2108	/// LexCharConstant - Lex the remainder of a character constant, after having
2109	/// lexed either ' or L' or u8' or u' or U'.
2110	bool Lexer::LexCharConstant(Token &Result, const char *CurPtr,
2111	tok::TokenKind Kind) {
2112	// Does this character contain the \0 character?
2113	const char *NulCharacter = nullptr;
2114
2115	if (!isLexingRawMode()) {
2116	if (Kind == tok::utf16_char_constant \|\| Kind == tok::utf32_char_constant)
2117	Diag(BufferPtr, getLangOpts().CPlusPlus
2118	? diag::warn_cxx98_compat_unicode_literal
2119	: diag::warn_c99_compat_unicode_literal);
2120	else if (Kind == tok::utf8_char_constant)
2121	Diag(BufferPtr, diag::warn_cxx14_compat_u8_character_literal);
2122	}
2123
2124	char C = getAndAdvanceChar(CurPtr, Result);
2125	if (C == '\'') {
2126	if (!isLexingRawMode() && !LangOpts.AsmPreprocessor)
2127	Diag(BufferPtr, diag::ext_empty_character);
2128	FormTokenWithChars(Result, CurPtr, tok::unknown);
2129	return true;
2130	}
2131
2132	while (C != '\'') {
2133	// Skip escaped characters.
2134	if (C == '\\')
2135	C = getAndAdvanceChar(CurPtr, Result);
2136
2137	if (C == '\n' \|\| C == '\r' \|\| // Newline.
2138	(C == 0 && CurPtr-1 == BufferEnd)) { // End of file.
2139	if (!isLexingRawMode() && !LangOpts.AsmPreprocessor)
2140	Diag(BufferPtr, diag::ext_unterminated_char_or_string) << 0;
2141	FormTokenWithChars(Result, CurPtr-1, tok::unknown);
2142	return true;
2143	}
2144
2145	if (C == 0) {
2146	if (isCodeCompletionPoint(CurPtr-1)) {
2147	PP->CodeCompleteNaturalLanguage();
2148	FormTokenWithChars(Result, CurPtr-1, tok::unknown);
2149	cutOffLexing();
2150	return true;
2151	}
2152
2153	NulCharacter = CurPtr-1;
2154	}
2155	C = getAndAdvanceChar(CurPtr, Result);
2156	}
2157
2158	// If we are in C++11, lex the optional ud-suffix.
2159	if (getLangOpts().CPlusPlus)
2160	CurPtr = LexUDSuffix(Result, CurPtr, false);
2161
2162	// If a nul character existed in the character, warn about it.
2163	if (NulCharacter && !isLexingRawMode())
2164	Diag(NulCharacter, diag::null_in_char_or_string) << 0;
2165
2166	// Update the location of token as well as BufferPtr.
2167	const char *TokStart = BufferPtr;
2168	FormTokenWithChars(Result, CurPtr, Kind);
2169	Result.setLiteralData(TokStart);
2170	return true;
2171	}
2172
2173	/// SkipWhitespace - Efficiently skip over a series of whitespace characters.
2174	/// Update BufferPtr to point to the next non-whitespace character and return.
2175	///
2176	/// This method forms a token and returns true if KeepWhitespaceMode is enabled.
2177	bool Lexer::SkipWhitespace(Token &Result, const char *CurPtr,
2178	bool &TokAtPhysicalStartOfLine) {
2179	// Whitespace - Skip it, then return the token after the whitespace.
2180	bool SawNewline = isVerticalWhitespace(CurPtr[-1]);
2181
2182	unsigned char Char = *CurPtr;
2183
2184	// Skip consecutive spaces efficiently.
2185	while (true) {
2186	// Skip horizontal whitespace very aggressively.
2187	while (isHorizontalWhitespace(Char))
2188	Char = *++CurPtr;
2189
2190	// Otherwise if we have something other than whitespace, we're done.
2191	if (!isVerticalWhitespace(Char))
2192	break;
2193
2194	if (ParsingPreprocessorDirective) {
2195	// End of preprocessor directive line, let LexTokenInternal handle this.
2196	BufferPtr = CurPtr;
2197	return false;
2198	}
2199
2200	// OK, but handle newline.
2201	SawNewline = true;
2202	Char = *++CurPtr;
2203	}
2204
2205	// If the client wants us to return whitespace, return it now.
2206	if (isKeepWhitespaceMode()) {
2207	FormTokenWithChars(Result, CurPtr, tok::unknown);
2208	if (SawNewline) {
2209	IsAtStartOfLine = true;
2210	IsAtPhysicalStartOfLine = true;
2211	}
2212	// FIXME: The next token will not have LeadingSpace set.
2213	return true;
2214	}
2215
2216	// If this isn't immediately after a newline, there is leading space.
2217	char PrevChar = CurPtr[-1];
2218	bool HasLeadingSpace = !isVerticalWhitespace(PrevChar);
2219
2220	Result.setFlagValue(Token::LeadingSpace, HasLeadingSpace);
2221	if (SawNewline) {
2222	Result.setFlag(Token::StartOfLine);
2223	TokAtPhysicalStartOfLine = true;
2224	}
2225
2226	BufferPtr = CurPtr;
2227	return false;
2228	}
2229
2230	/// We have just read the // characters from input. Skip until we find the
2231	/// newline character that terminates the comment. Then update BufferPtr and
2232	/// return.
2233	///
2234	/// If we're in KeepCommentMode or any CommentHandler has inserted
2235	/// some tokens, this will store the first token and return true.
2236	bool Lexer::SkipLineComment(Token &Result, const char *CurPtr,
2237	bool &TokAtPhysicalStartOfLine) {
2238	// If Line comments aren't explicitly enabled for this language, emit an
2239	// extension warning.
2240	if (!LangOpts.LineComment && !isLexingRawMode()) {
2241	Diag(BufferPtr, diag::ext_line_comment);
2242
2243	// Mark them enabled so we only emit one warning for this translation
2244	// unit.
2245	LangOpts.LineComment = true;
2246	}
2247
2248	// Scan over the body of the comment. The common case, when scanning, is that
2249	// the comment contains normal ascii characters with nothing interesting in
2250	// them. As such, optimize for this case with the inner loop.
2251	//
2252	// This loop terminates with CurPtr pointing at the newline (or end of buffer)
2253	// character that ends the line comment.
2254	char C;
2255	while (true) {
2256	C = *CurPtr;
2257	// Skip over characters in the fast loop.
2258	while (C != 0 && // Potentially EOF.
2259	C != '\n' && C != '\r') // Newline or DOS-style newline.
2260	C = *++CurPtr;
2261
2262	const char *NextLine = CurPtr;
2263	if (C != 0) {
2264	// We found a newline, see if it's escaped.
2265	const char *EscapePtr = CurPtr-1;
2266	bool HasSpace = false;
2267	while (isHorizontalWhitespace(*EscapePtr)) { // Skip whitespace.
2268	--EscapePtr;
2269	HasSpace = true;
2270	}
2271
2272	if (*EscapePtr == '\\')
2273	// Escaped newline.
2274	CurPtr = EscapePtr;
2275	else if (EscapePtr[0] == '/' && EscapePtr[-1] == '?' &&
2276	EscapePtr[-2] == '?' && LangOpts.Trigraphs)
2277	// Trigraph-escaped newline.
2278	CurPtr = EscapePtr-2;
2279	else
2280	break; // This is a newline, we're done.
2281
2282	// If there was space between the backslash and newline, warn about it.
2283	if (HasSpace && !isLexingRawMode())
2284	Diag(EscapePtr, diag::backslash_newline_space);
2285	}
2286
2287	// Otherwise, this is a hard case. Fall back on getAndAdvanceChar to
2288	// properly decode the character. Read it in raw mode to avoid emitting
2289	// diagnostics about things like trigraphs. If we see an escaped newline,
2290	// we'll handle it below.
2291	const char *OldPtr = CurPtr;
2292	bool OldRawMode = isLexingRawMode();
2293	LexingRawMode = true;
2294	C = getAndAdvanceChar(CurPtr, Result);
2295	LexingRawMode = OldRawMode;
2296
2297	// If we only read only one character, then no special handling is needed.
2298	// We're done and can skip forward to the newline.
2299	if (C != 0 && CurPtr == OldPtr+1) {
2300	CurPtr = NextLine;
2301	break;
2302	}
2303
2304	// If we read multiple characters, and one of those characters was a \r or
2305	// \n, then we had an escaped newline within the comment. Emit diagnostic
2306	// unless the next line is also a // comment.
2307	if (CurPtr != OldPtr + 1 && C != '/' &&
2308	(CurPtr == BufferEnd + 1 \|\| CurPtr[0] != '/')) {
2309	for (; OldPtr != CurPtr; ++OldPtr)
2310	if (OldPtr[0] == '\n' \|\| OldPtr[0] == '\r') {
2311	// Okay, we found a // comment that ends in a newline, if the next
2312	// line is also a // comment, but has spaces, don't emit a diagnostic.
2313	if (isWhitespace(C)) {
2314	const char *ForwardPtr = CurPtr;
2315	while (isWhitespace(*ForwardPtr)) // Skip whitespace.
2316	++ForwardPtr;
2317	if (ForwardPtr[0] == '/' && ForwardPtr[1] == '/')
2318	break;
2319	}
2320
2321	if (!isLexingRawMode())
2322	Diag(OldPtr-1, diag::ext_multi_line_line_comment);
2323	break;
2324	}
2325	}
2326
2327	if (C == '\r' \|\| C == '\n' \|\| CurPtr == BufferEnd + 1) {
2328	--CurPtr;
2329	break;
2330	}
2331
2332	if (C == '\0' && isCodeCompletionPoint(CurPtr-1)) {
2333	PP->CodeCompleteNaturalLanguage();
2334	cutOffLexing();
2335	return false;
2336	}
2337	}
2338
2339	// Found but did not consume the newline. Notify comment handlers about the
2340	// comment unless we're in a #if 0 block.
2341	if (PP && !isLexingRawMode() &&
2342	PP->HandleComment(Result, SourceRange(getSourceLocation(BufferPtr),
2343	getSourceLocation(CurPtr)))) {
2344	BufferPtr = CurPtr;
2345	return true; // A token has to be returned.
2346	}
2347
2348	// If we are returning comments as tokens, return this comment as a token.
2349	if (inKeepCommentMode())
2350	return SaveLineComment(Result, CurPtr);
2351
2352	// If we are inside a preprocessor directive and we see the end of line,
2353	// return immediately, so that the lexer can return this as an EOD token.
2354	if (ParsingPreprocessorDirective \|\| CurPtr == BufferEnd) {
2355	BufferPtr = CurPtr;
2356	return false;
2357	}
2358
2359	// Otherwise, eat the \n character. We don't care if this is a \n\r or
2360	// \r\n sequence. This is an efficiency hack (because we know the \n can't
2361	// contribute to another token), it isn't needed for correctness. Note that
2362	// this is ok even in KeepWhitespaceMode, because we would have returned the
2363	/// comment above in that mode.
2364	++CurPtr;
2365
2366	// The next returned token is at the start of the line.
2367	Result.setFlag(Token::StartOfLine);
2368	TokAtPhysicalStartOfLine = true;
2369	// No leading whitespace seen so far.
2370	Result.clearFlag(Token::LeadingSpace);
2371	BufferPtr = CurPtr;
2372	return false;
2373	}
2374
2375	/// If in save-comment mode, package up this Line comment in an appropriate
2376	/// way and return it.
2377	bool Lexer::SaveLineComment(Token &Result, const char *CurPtr) {
2378	// If we're not in a preprocessor directive, just return the // comment
2379	// directly.
2380	FormTokenWithChars(Result, CurPtr, tok::comment);
2381
2382	if (!ParsingPreprocessorDirective \|\| LexingRawMode)
2383	return true;
2384
2385	// If this Line-style comment is in a macro definition, transmogrify it into
2386	// a C-style block comment.
2387	bool Invalid = false;
2388	std::string Spelling = PP->getSpelling(Result, &Invalid);
2389	if (Invalid)
2390	return true;
2391
2392	assert(Spelling[0] == '/' && Spelling[1] == '/' && "Not line comment?")((Spelling[0] == '/' && Spelling[1] == '/' && "Not line comment?") ? static_cast<void> (0) : __assert_fail ("Spelling[0] == '/' && Spelling[1] == '/' && \"Not line comment?\"" , "/build/llvm-toolchain-snapshot-9~svn362543/tools/clang/lib/Lex/Lexer.cpp" , 2392, __PRETTY_FUNCTION__));
2393	Spelling[1] = ''; // Change prefix to "/".
2394	Spelling += "*/"; // add suffix.
2395
2396	Result.setKind(tok::comment);
2397	PP->CreateString(Spelling, Result,
2398	Result.getLocation(), Result.getLocation());
2399	return true;
2400	}
2401
2402	/// isBlockCommentEndOfEscapedNewLine - Return true if the specified newline
2403	/// character (either \\n or \\r) is part of an escaped newline sequence. Issue
2404	/// a diagnostic if so. We know that the newline is inside of a block comment.
2405	static bool isEndOfBlockCommentWithEscapedNewLine(const char *CurPtr,
2406	Lexer *L) {
2407	assert(CurPtr[0] == '\n' \|\| CurPtr[0] == '\r')((CurPtr[0] == '\n' \|\| CurPtr[0] == '\r') ? static_cast<void > (0) : __assert_fail ("CurPtr[0] == '\\n' \|\| CurPtr[0] == '\\r'" , "/build/llvm-toolchain-snapshot-9~svn362543/tools/clang/lib/Lex/Lexer.cpp" , 2407, __PRETTY_FUNCTION__));
2408
2409	// Back up off the newline.
2410	--CurPtr;
2411
2412	// If this is a two-character newline sequence, skip the other character.
2413	if (CurPtr[0] == '\n' \|\| CurPtr[0] == '\r') {
2414	// \n\n or \r\r -> not escaped newline.
2415	if (CurPtr[0] == CurPtr[1])
2416	return false;
2417	// \n\r or \r\n -> skip the newline.
2418	--CurPtr;
2419	}
2420
2421	// If we have horizontal whitespace, skip over it. We allow whitespace
2422	// between the slash and newline.
2423	bool HasSpace = false;
2424	while (isHorizontalWhitespace(CurPtr) \|\| CurPtr == 0) {
2425	--CurPtr;
2426	HasSpace = true;
2427	}
2428
2429	// If we have a slash, we know this is an escaped newline.
2430	if (*CurPtr == '\\') {
2431	if (CurPtr[-1] != '*') return false;
2432	} else {
2433	// It isn't a slash, is it the ?? / trigraph?
2434	if (CurPtr[0] != '/' \|\| CurPtr[-1] != '?' \|\| CurPtr[-2] != '?' \|\|
2435	CurPtr[-3] != '*')
2436	return false;
2437
2438	// This is the trigraph ending the comment. Emit a stern warning!
2439	CurPtr -= 2;
2440
2441	// If no trigraphs are enabled, warn that we ignored this trigraph and
2442	// ignore this * character.
2443	if (!L->getLangOpts().Trigraphs) {
2444	if (!L->isLexingRawMode())
2445	L->Diag(CurPtr, diag::trigraph_ignored_block_comment);
2446	return false;
2447	}
2448	if (!L->isLexingRawMode())
2449	L->Diag(CurPtr, diag::trigraph_ends_block_comment);
2450	}
2451
2452	// Warn about having an escaped newline between the */ characters.
2453	if (!L->isLexingRawMode())
2454	L->Diag(CurPtr, diag::escaped_newline_block_comment_end);
2455
2456	// If there was space between the backslash and newline, warn about it.
2457	if (HasSpace && !L->isLexingRawMode())
2458	L->Diag(CurPtr, diag::backslash_newline_space);
2459
2460	return true;
2461	}
2462
2463	#ifdef __SSE2__1
2464	#include <emmintrin.h>
2465	#elif __ALTIVEC__
2466	#include <altivec.h>
2467	#undef bool
2468	#endif
2469
2470	/// We have just read from input the / and * characters that started a comment.
2471	/// Read until we find the * and / characters that terminate the comment.
2472	/// Note that we don't bother decoding trigraphs or escaped newlines in block
2473	/// comments, because they cannot cause the comment to end. The only thing
2474	/// that can happen is the comment could end with an escaped newline between
2475	/// the terminating * and /.
2476	///
2477	/// If we're in KeepCommentMode or any CommentHandler has inserted
2478	/// some tokens, this will store the first token and return true.
2479	bool Lexer::SkipBlockComment(Token &Result, const char *CurPtr,
2480	bool &TokAtPhysicalStartOfLine) {
2481	// Scan one character past where we should, looking for a '/' character. Once
2482	// we find it, check to see if it was preceded by a *. This common
2483	// optimization helps people who like to put a lot of * characters in their
2484	// comments.
2485
2486	// The first character we get with newlines and trigraphs skipped to handle
2487	// the degenerate // case below correctly if the has an escaped newline
2488	// after it.
2489	unsigned CharSize;
2490	unsigned char C = getCharAndSize(CurPtr, CharSize);
2491	CurPtr += CharSize;
2492	if (C == 0 && CurPtr == BufferEnd+1) {
2493	if (!isLexingRawMode())
2494	Diag(BufferPtr, diag::err_unterminated_block_comment);
2495	--CurPtr;
2496
2497	// KeepWhitespaceMode should return this broken comment as a token. Since
2498	// it isn't a well formed comment, just return it as an 'unknown' token.
2499	if (isKeepWhitespaceMode()) {
2500	FormTokenWithChars(Result, CurPtr, tok::unknown);
2501	return true;
2502	}
2503
2504	BufferPtr = CurPtr;
2505	return false;
2506	}
2507
2508	// Check to see if the first character after the '/*' is another /. If so,
2509	// then this slash does not end the block comment, it is part of it.
2510	if (C == '/')
2511	C = *CurPtr++;
2512
2513	while (true) {
2514	// Skip over all non-interesting characters until we find end of buffer or a
2515	// (probably ending) '/' character.
2516	if (CurPtr + 24 < BufferEnd &&
2517	// If there is a code-completion point avoid the fast scan because it
2518	// doesn't check for '\0'.
2519	!(PP && PP->getCodeCompletionFileLoc() == FileLoc)) {
2520	// While not aligned to a 16-byte boundary.
2521	while (C != '/' && ((intptr_t)CurPtr & 0x0F) != 0)
2522	C = *CurPtr++;
2523
2524	if (C == '/') goto FoundSlash;
2525
2526	#ifdef __SSE2__1
2527	__m128i Slashes = _mm_set1_epi8('/');
2528	while (CurPtr+16 <= BufferEnd) {
2529	int cmp = _mm_movemask_epi8(_mm_cmpeq_epi8((const __m128i)CurPtr,
2530	Slashes));
2531	if (cmp != 0) {
2532	// Adjust the pointer to point directly after the first slash. It's
2533	// not necessary to set C here, it will be overwritten at the end of
2534	// the outer loop.
2535	CurPtr += llvm::countTrailingZeros<unsigned>(cmp) + 1;
2536	goto FoundSlash;
2537	}
2538	CurPtr += 16;
2539	}
2540	#elif __ALTIVEC__
2541	__vector unsigned char Slashes = {
2542	'/', '/', '/', '/', '/', '/', '/', '/',
2543	'/', '/', '/', '/', '/', '/', '/', '/'
2544	};
2545	while (CurPtr+16 <= BufferEnd &&
2546	!vec_any_eq((const vector unsigned char)CurPtr, Slashes))
2547	CurPtr += 16;
2548	#else
2549	// Scan for '/' quickly. Many block comments are very large.
2550	while (CurPtr[0] != '/' &&
2551	CurPtr[1] != '/' &&
2552	CurPtr[2] != '/' &&
2553	CurPtr[3] != '/' &&
2554	CurPtr+4 < BufferEnd) {
2555	CurPtr += 4;
2556	}
2557	#endif
2558
2559	// It has to be one of the bytes scanned, increment to it and read one.
2560	C = *CurPtr++;
2561	}
2562
2563	// Loop to scan the remainder.
2564	while (C != '/' && C != '\0')
2565	C = *CurPtr++;
2566
2567	if (C == '/') {
2568	FoundSlash:
2569	if (CurPtr[-2] == '') // We found the final /. We're done!
2570	break;
2571
2572	if ((CurPtr[-2] == '\n' \|\| CurPtr[-2] == '\r')) {
2573	if (isEndOfBlockCommentWithEscapedNewLine(CurPtr-2, this)) {
2574	// We found the final */, though it had an escaped newline between the
2575	// * and /. We're done!
2576	break;
2577	}
2578	}
2579	if (CurPtr[0] == '*' && CurPtr[1] != '/') {
2580	// If this is a /* inside of the comment, emit a warning. Don't do this
2581	// if this is a /*/, which will end the comment. This misses cases with
2582	// embedded escaped newlines, but oh well.
2583	if (!isLexingRawMode())
2584	Diag(CurPtr-1, diag::warn_nested_block_comment);
2585	}
2586	} else if (C == 0 && CurPtr == BufferEnd+1) {
2587	if (!isLexingRawMode())
2588	Diag(BufferPtr, diag::err_unterminated_block_comment);
2589	// Note: the user probably forgot a */. We could continue immediately
2590	// after the /*, but this would involve lexing a lot of what really is the
2591	// comment, which surely would confuse the parser.
2592	--CurPtr;
2593
2594	// KeepWhitespaceMode should return this broken comment as a token. Since
2595	// it isn't a well formed comment, just return it as an 'unknown' token.
2596	if (isKeepWhitespaceMode()) {
2597	FormTokenWithChars(Result, CurPtr, tok::unknown);
2598	return true;
2599	}
2600
2601	BufferPtr = CurPtr;
2602	return false;
2603	} else if (C == '\0' && isCodeCompletionPoint(CurPtr-1)) {
2604	PP->CodeCompleteNaturalLanguage();
2605	cutOffLexing();
2606	return false;
2607	}
2608
2609	C = *CurPtr++;
2610	}
2611
2612	// Notify comment handlers about the comment unless we're in a #if 0 block.
2613	if (PP && !isLexingRawMode() &&
2614	PP->HandleComment(Result, SourceRange(getSourceLocation(BufferPtr),
2615	getSourceLocation(CurPtr)))) {
2616	BufferPtr = CurPtr;
2617	return true; // A token has to be returned.
2618	}
2619
2620	// If we are returning comments as tokens, return this comment as a token.
2621	if (inKeepCommentMode()) {
2622	FormTokenWithChars(Result, CurPtr, tok::comment);
2623	return true;
2624	}
2625
2626	// It is common for the tokens immediately after a /**/ comment to be
2627	// whitespace. Instead of going through the big switch, handle it
2628	// efficiently now. This is safe even in KeepWhitespaceMode because we would
2629	// have already returned above with the comment as a token.
2630	if (isHorizontalWhitespace(*CurPtr)) {
2631	SkipWhitespace(Result, CurPtr+1, TokAtPhysicalStartOfLine);
2632	return false;
2633	}
2634
2635	// Otherwise, just return so that the next character will be lexed as a token.
2636	BufferPtr = CurPtr;
2637	Result.setFlag(Token::LeadingSpace);
2638	return false;
2639	}
2640
2641	//===----------------------------------------------------------------------===//
2642	// Primary Lexing Entry Points
2643	//===----------------------------------------------------------------------===//
2644
2645	/// ReadToEndOfLine - Read the rest of the current preprocessor line as an
2646	/// uninterpreted string. This switches the lexer out of directive mode.
2647	void Lexer::ReadToEndOfLine(SmallVectorImpl<char> *Result) {
2648	assert(ParsingPreprocessorDirective && ParsingFilename == false &&((ParsingPreprocessorDirective && ParsingFilename == false && "Must be in a preprocessing directive!") ? static_cast <void> (0) : __assert_fail ("ParsingPreprocessorDirective && ParsingFilename == false && \"Must be in a preprocessing directive!\"" , "/build/llvm-toolchain-snapshot-9~svn362543/tools/clang/lib/Lex/Lexer.cpp" , 2649, __PRETTY_FUNCTION__))
2649	"Must be in a preprocessing directive!")((ParsingPreprocessorDirective && ParsingFilename == false && "Must be in a preprocessing directive!") ? static_cast <void> (0) : __assert_fail ("ParsingPreprocessorDirective && ParsingFilename == false && \"Must be in a preprocessing directive!\"" , "/build/llvm-toolchain-snapshot-9~svn362543/tools/clang/lib/Lex/Lexer.cpp" , 2649, __PRETTY_FUNCTION__));
2650	Token Tmp;
2651
2652	// CurPtr - Cache BufferPtr in an automatic variable.
2653	const char *CurPtr = BufferPtr;
2654	while (true) {
2655	char Char = getAndAdvanceChar(CurPtr, Tmp);
2656	switch (Char) {
2657	default:
2658	if (Result)
2659	Result->push_back(Char);
2660	break;
2661	case 0: // Null.
2662	// Found end of file?
2663	if (CurPtr-1 != BufferEnd) {
2664	if (isCodeCompletionPoint(CurPtr-1)) {
2665	PP->CodeCompleteNaturalLanguage();
2666	cutOffLexing();
2667	return;
2668	}
2669
2670	// Nope, normal character, continue.
2671	if (Result)
2672	Result->push_back(Char);
2673	break;
2674	}
2675	// FALL THROUGH.
2676	LLVM_FALLTHROUGH[[clang::fallthrough]];
2677	case '\r':
2678	case '\n':
2679	// Okay, we found the end of the line. First, back up past the \0, \r, \n.
2680	assert(CurPtr[-1] == Char && "Trigraphs for newline?")((CurPtr[-1] == Char && "Trigraphs for newline?") ? static_cast <void> (0) : __assert_fail ("CurPtr[-1] == Char && \"Trigraphs for newline?\"" , "/build/llvm-toolchain-snapshot-9~svn362543/tools/clang/lib/Lex/Lexer.cpp" , 2680, __PRETTY_FUNCTION__));
2681	BufferPtr = CurPtr-1;
2682
2683	// Next, lex the character, which should handle the EOD transition.
2684	Lex(Tmp);
2685	if (Tmp.is(tok::code_completion)) {
2686	if (PP)
2687	PP->CodeCompleteNaturalLanguage();
2688	Lex(Tmp);
2689	}
2690	assert(Tmp.is(tok::eod) && "Unexpected token!")((Tmp.is(tok::eod) && "Unexpected token!") ? static_cast <void> (0) : __assert_fail ("Tmp.is(tok::eod) && \"Unexpected token!\"" , "/build/llvm-toolchain-snapshot-9~svn362543/tools/clang/lib/Lex/Lexer.cpp" , 2690, __PRETTY_FUNCTION__));
2691
2692	// Finally, we're done;
2693	return;
2694	}
2695	}
2696	}
2697
2698	/// LexEndOfFile - CurPtr points to the end of this file. Handle this
2699	/// condition, reporting diagnostics and handling other edge cases as required.
2700	/// This returns true if Result contains a token, false if PP.Lex should be
2701	/// called again.
2702	bool Lexer::LexEndOfFile(Token &Result, const char *CurPtr) {
2703	// If we hit the end of the file while parsing a preprocessor directive,
2704	// end the preprocessor directive first. The next token returned will
2705	// then be the end of file.
2706	if (ParsingPreprocessorDirective) {
2707	// Done parsing the "line".
2708	ParsingPreprocessorDirective = false;
2709	// Update the location of token as well as BufferPtr.
2710	FormTokenWithChars(Result, CurPtr, tok::eod);
2711
2712	// Restore comment saving mode, in case it was disabled for directive.
2713	if (PP)
2714	resetExtendedTokenMode();
2715	return true; // Have a token.
2716	}
2717
2718	// If we are in raw mode, return this event as an EOF token. Let the caller
2719	// that put us in raw mode handle the event.
2720	if (isLexingRawMode()) {
2721	Result.startToken();
2722	BufferPtr = BufferEnd;
2723	FormTokenWithChars(Result, BufferEnd, tok::eof);
2724	return true;
2725	}
2726
2727	if (PP->isRecordingPreamble() && PP->isInPrimaryFile()) {
2728	PP->setRecordedPreambleConditionalStack(ConditionalStack);
2729	ConditionalStack.clear();
2730	}
2731
2732	// Issue diagnostics for unterminated #if and missing newline.
2733
2734	// If we are in a #if directive, emit an error.
2735	while (!ConditionalStack.empty()) {
2736	if (PP->getCodeCompletionFileLoc() != FileLoc)
2737	PP->Diag(ConditionalStack.back().IfLoc,
2738	diag::err_pp_unterminated_conditional);
2739	ConditionalStack.pop_back();
2740	}
2741
2742	// C99 5.1.1.2p2: If the file is non-empty and didn't end in a newline, issue
2743	// a pedwarn.
2744	if (CurPtr != BufferStart && (CurPtr[-1] != '\n' && CurPtr[-1] != '\r')) {
2745	DiagnosticsEngine &Diags = PP->getDiagnostics();
2746	SourceLocation EndLoc = getSourceLocation(BufferEnd);
2747	unsigned DiagID;
2748
2749	if (LangOpts.CPlusPlus11) {
2750	// C++11 [lex.phases] 2.2 p2
2751	// Prefer the C++98 pedantic compatibility warning over the generic,
2752	// non-extension, user-requested "missing newline at EOF" warning.
2753	if (!Diags.isIgnored(diag::warn_cxx98_compat_no_newline_eof, EndLoc)) {
2754	DiagID = diag::warn_cxx98_compat_no_newline_eof;
2755	} else {
2756	DiagID = diag::warn_no_newline_eof;
2757	}
2758	} else {
2759	DiagID = diag::ext_no_newline_eof;
2760	}
2761
2762	Diag(BufferEnd, DiagID)
2763	<< FixItHint::CreateInsertion(EndLoc, "\n");
2764	}
2765
2766	BufferPtr = CurPtr;
2767
2768	// Finally, let the preprocessor handle this.
2769	return PP->HandleEndOfFile(Result, isPragmaLexer());
2770	}
2771
2772	/// isNextPPTokenLParen - Return 1 if the next unexpanded token lexed from
2773	/// the specified lexer will return a tok::l_paren token, 0 if it is something
2774	/// else and 2 if there are no more tokens in the buffer controlled by the
2775	/// lexer.
2776	unsigned Lexer::isNextPPTokenLParen() {
2777	assert(!LexingRawMode && "How can we expand a macro from a skipping buffer?")((!LexingRawMode && "How can we expand a macro from a skipping buffer?" ) ? static_cast<void> (0) : __assert_fail ("!LexingRawMode && \"How can we expand a macro from a skipping buffer?\"" , "/build/llvm-toolchain-snapshot-9~svn362543/tools/clang/lib/Lex/Lexer.cpp" , 2777, __PRETTY_FUNCTION__));
2778
2779	// Switch to 'skipping' mode. This will ensure that we can lex a token
2780	// without emitting diagnostics, disables macro expansion, and will cause EOF
2781	// to return an EOF token instead of popping the include stack.
2782	LexingRawMode = true;
2783
2784	// Save state that can be changed while lexing so that we can restore it.
2785	const char *TmpBufferPtr = BufferPtr;
2786	bool inPPDirectiveMode = ParsingPreprocessorDirective;
2787	bool atStartOfLine = IsAtStartOfLine;
2788	bool atPhysicalStartOfLine = IsAtPhysicalStartOfLine;
2789	bool leadingSpace = HasLeadingSpace;
2790
2791	Token Tok;
2792	Lex(Tok);
2793
2794	// Restore state that may have changed.
2795	BufferPtr = TmpBufferPtr;
2796	ParsingPreprocessorDirective = inPPDirectiveMode;
2797	HasLeadingSpace = leadingSpace;
2798	IsAtStartOfLine = atStartOfLine;
2799	IsAtPhysicalStartOfLine = atPhysicalStartOfLine;
2800
2801	// Restore the lexer back to non-skipping mode.
2802	LexingRawMode = false;
2803
2804	if (Tok.is(tok::eof))
2805	return 2;
2806	return Tok.is(tok::l_paren);
2807	}
2808
2809	/// Find the end of a version control conflict marker.
2810	static const char FindConflictEnd(const char CurPtr, const char *BufferEnd,
2811	ConflictMarkerKind CMK) {
2812	const char *Terminator = CMK == CMK_Perforce ? "<<<<\n" : ">>>>>>>";
2813	size_t TermLen = CMK == CMK_Perforce ? 5 : 7;
2814	auto RestOfBuffer = StringRef(CurPtr, BufferEnd - CurPtr).substr(TermLen);
2815	size_t Pos = RestOfBuffer.find(Terminator);
2816	while (Pos != StringRef::npos) {
2817	// Must occur at start of line.
2818	if (Pos == 0 \|\|
2819	(RestOfBuffer[Pos - 1] != '\r' && RestOfBuffer[Pos - 1] != '\n')) {
2820	RestOfBuffer = RestOfBuffer.substr(Pos+TermLen);
2821	Pos = RestOfBuffer.find(Terminator);
2822	continue;
2823	}
2824	return RestOfBuffer.data()+Pos;
2825	}
2826	return nullptr;
2827	}
2828
2829	/// IsStartOfConflictMarker - If the specified pointer is the start of a version
2830	/// control conflict marker like '<<<<<<<', recognize it as such, emit an error
2831	/// and recover nicely. This returns true if it is a conflict marker and false
2832	/// if not.
2833	bool Lexer::IsStartOfConflictMarker(const char *CurPtr) {
2834	// Only a conflict marker if it starts at the beginning of a line.
2835	if (CurPtr != BufferStart &&
2836	CurPtr[-1] != '\n' && CurPtr[-1] != '\r')
2837	return false;
2838
2839	// Check to see if we have <<<<<<< or >>>>.
2840	if (!StringRef(CurPtr, BufferEnd - CurPtr).startswith("<<<<<<<") &&
2841	!StringRef(CurPtr, BufferEnd - CurPtr).startswith(">>>> "))
2842	return false;
2843
2844	// If we have a situation where we don't care about conflict markers, ignore
2845	// it.
2846	if (CurrentConflictMarkerState \|\| isLexingRawMode())
2847	return false;
2848
2849	ConflictMarkerKind Kind = *CurPtr == '<' ? CMK_Normal : CMK_Perforce;
2850
2851	// Check to see if there is an ending marker somewhere in the buffer at the
2852	// start of a line to terminate this conflict marker.
2853	if (FindConflictEnd(CurPtr, BufferEnd, Kind)) {
2854	// We found a match. We are really in a conflict marker.
2855	// Diagnose this, and ignore to the end of line.
2856	Diag(CurPtr, diag::err_conflict_marker);
2857	CurrentConflictMarkerState = Kind;
2858
2859	// Skip ahead to the end of line. We know this exists because the
2860	// end-of-conflict marker starts with \r or \n.
2861	while (CurPtr != '\r' && CurPtr != '\n') {
2862	assert(CurPtr != BufferEnd && "Didn't find end of line")((CurPtr != BufferEnd && "Didn't find end of line") ? static_cast<void> (0) : __assert_fail ("CurPtr != BufferEnd && \"Didn't find end of line\"" , "/build/llvm-toolchain-snapshot-9~svn362543/tools/clang/lib/Lex/Lexer.cpp" , 2862, __PRETTY_FUNCTION__));
2863	++CurPtr;
2864	}
2865	BufferPtr = CurPtr;
2866	return true;
2867	}
2868
2869	// No end of conflict marker found.
2870	return false;
2871	}
2872
2873	/// HandleEndOfConflictMarker - If this is a '====' or '\|\|\|\|' or '>>>>', or if
2874	/// it is '<<<<' and the conflict marker started with a '>>>>' marker, then it
2875	/// is the end of a conflict marker. Handle it by ignoring up until the end of
2876	/// the line. This returns true if it is a conflict marker and false if not.
2877	bool Lexer::HandleEndOfConflictMarker(const char *CurPtr) {
2878	// Only a conflict marker if it starts at the beginning of a line.
2879	if (CurPtr != BufferStart &&
2880	CurPtr[-1] != '\n' && CurPtr[-1] != '\r')
2881	return false;
2882
2883	// If we have a situation where we don't care about conflict markers, ignore
2884	// it.
2885	if (!CurrentConflictMarkerState \|\| isLexingRawMode())
2886	return false;
2887
2888	// Check to see if we have the marker (4 characters in a row).
2889	for (unsigned i = 1; i != 4; ++i)
2890	if (CurPtr[i] != CurPtr[0])
2891	return false;
2892
2893	// If we do have it, search for the end of the conflict marker. This could
2894	// fail if it got skipped with a '#if 0' or something. Note that CurPtr might
2895	// be the end of conflict marker.
2896	if (const char *End = FindConflictEnd(CurPtr, BufferEnd,
2897	CurrentConflictMarkerState)) {
2898	CurPtr = End;
2899
2900	// Skip ahead to the end of line.
2901	while (CurPtr != BufferEnd && CurPtr != '\r' && CurPtr != '\n')
2902	++CurPtr;
2903
2904	BufferPtr = CurPtr;
2905
2906	// No longer in the conflict marker.
2907	CurrentConflictMarkerState = CMK_None;
2908	return true;
2909	}
2910
2911	return false;
2912	}
2913
/// Find the end of an editor placeholder by scanning [CurPtr, BufferEnd) for
/// the two-character terminator "#>".
///
/// \returns a pointer just past the terminator, or nullptr if the range is
///          empty or contains no terminator.
static const char *findPlaceholderEnd(const char *CurPtr,
                                      const char *BufferEnd) {
  if (CurPtr == BufferEnd)
    return nullptr;
  // Stop one character early: each step also inspects CurPtr[1].
  BufferEnd -= 1; // Scan until the second last character.
  for (; CurPtr != BufferEnd; ++CurPtr) {
    if (CurPtr[0] == '#' && CurPtr[1] == '>')
      return CurPtr + 2;
  }
  return nullptr;
}
2925
2926	bool Lexer::lexEditorPlaceholder(Token &Result, const char *CurPtr) {
2927	assert(CurPtr[-1] == '<' && CurPtr[0] == '#' && "Not a placeholder!")((CurPtr[-1] == '<' && CurPtr[0] == '#' && "Not a placeholder!") ? static_cast<void> (0) : __assert_fail ("CurPtr[-1] == '<' && CurPtr[0] == '#' && \"Not a placeholder!\"" , "/build/llvm-toolchain-snapshot-9~svn362543/tools/clang/lib/Lex/Lexer.cpp" , 2927, __PRETTY_FUNCTION__));
2928	if (!PP \|\| !PP->getPreprocessorOpts().LexEditorPlaceholders \|\| LexingRawMode)
2929	return false;
2930	const char *End = findPlaceholderEnd(CurPtr + 1, BufferEnd);
2931	if (!End)
2932	return false;
2933	const char *Start = CurPtr - 1;
2934	if (!LangOpts.AllowEditorPlaceholders)
2935	Diag(Start, diag::err_placeholder_in_source);
2936	Result.startToken();
2937	FormTokenWithChars(Result, End, tok::raw_identifier);
2938	Result.setRawIdentifierData(Start);
2939	PP->LookUpIdentifierInfo(Result);
2940	Result.setFlag(Token::IsEditorPlaceholder);
2941	BufferPtr = End;
2942	return true;
2943	}
2944
2945	bool Lexer::isCodeCompletionPoint(const char *CurPtr) const {
2946	if (PP && PP->isCodeCompletionEnabled()) {
2947	SourceLocation Loc = FileLoc.getLocWithOffset(CurPtr-BufferStart);
2948	return Loc == PP->getCodeCompletionLoc();
2949	}
2950
2951	return false;
2952	}
2953
2954	uint32_t Lexer::tryReadUCN(const char &StartPtr, const char SlashLoc,
2955	Token *Result) {
2956	unsigned CharSize;
2957	char Kind = getCharAndSize(StartPtr, CharSize);
2958
2959	unsigned NumHexDigits;
2960	if (Kind == 'u')
2961	NumHexDigits = 4;
2962	else if (Kind == 'U')
2963	NumHexDigits = 8;
2964	else
2965	return 0;
2966
2967	if (!LangOpts.CPlusPlus && !LangOpts.C99) {
2968	if (Result && !isLexingRawMode())
2969	Diag(SlashLoc, diag::warn_ucn_not_valid_in_c89);
2970	return 0;
2971	}
2972
2973	const char *CurPtr = StartPtr + CharSize;
2974	const char *KindLoc = &CurPtr[-1];
2975
2976	uint32_t CodePoint = 0;
2977	for (unsigned i = 0; i < NumHexDigits; ++i) {
2978	char C = getCharAndSize(CurPtr, CharSize);
2979
2980	unsigned Value = llvm::hexDigitValue(C);
2981	if (Value == -1U) {
2982	if (Result && !isLexingRawMode()) {
2983	if (i == 0) {
2984	Diag(BufferPtr, diag::warn_ucn_escape_no_digits)
2985	<< StringRef(KindLoc, 1);
2986	} else {
2987	Diag(BufferPtr, diag::warn_ucn_escape_incomplete);
2988
2989	// If the user wrote \U1234, suggest a fixit to \u.
2990	if (i == 4 && NumHexDigits == 8) {
2991	CharSourceRange URange = makeCharRange(*this, KindLoc, KindLoc + 1);
2992	Diag(KindLoc, diag::note_ucn_four_not_eight)
2993	<< FixItHint::CreateReplacement(URange, "u");
2994	}
2995	}
2996	}
2997
2998	return 0;
2999	}
3000
3001	CodePoint <<= 4;
3002	CodePoint += Value;
3003
3004	CurPtr += CharSize;
3005	}
3006
3007	if (Result) {
3008	Result->setFlag(Token::HasUCN);
3009	if (CurPtr - StartPtr == (ptrdiff_t)NumHexDigits + 2)
3010	StartPtr = CurPtr;
3011	else
3012	while (StartPtr != CurPtr)
3013	(void)getAndAdvanceChar(StartPtr, *Result);
3014	} else {
3015	StartPtr = CurPtr;
3016	}
3017
3018	// Don't apply C family restrictions to UCNs in assembly mode
3019	if (LangOpts.AsmPreprocessor)
3020	return CodePoint;
3021
3022	// C99 6.4.3p2: A universal character name shall not specify a character whose
3023	// short identifier is less than 00A0 other than 0024 ($), 0040 (@), or
3024	// 0060 (`), nor one in the range D800 through DFFF inclusive.)
3025	// C++11 [lex.charset]p2: If the hexadecimal value for a
3026	// universal-character-name corresponds to a surrogate code point (in the
3027	// range 0xD800-0xDFFF, inclusive), the program is ill-formed. Additionally,
3028	// if the hexadecimal value for a universal-character-name outside the
3029	// c-char-sequence, s-char-sequence, or r-char-sequence of a character or
3030	// string literal corresponds to a control character (in either of the
3031	// ranges 0x00-0x1F or 0x7F-0x9F, both inclusive) or to a character in the
3032	// basic source character set, the program is ill-formed.
3033	if (CodePoint < 0xA0) {
3034	if (CodePoint == 0x24 \|\| CodePoint == 0x40 \|\| CodePoint == 0x60)
3035	return CodePoint;
3036
3037	// We don't use isLexingRawMode() here because we need to warn about bad
3038	// UCNs even when skipping preprocessing tokens in a #if block.
3039	if (Result && PP) {
3040	if (CodePoint < 0x20 \|\| CodePoint >= 0x7F)
3041	Diag(BufferPtr, diag::err_ucn_control_character);
3042	else {
3043	char C = static_cast<char>(CodePoint);
3044	Diag(BufferPtr, diag::err_ucn_escape_basic_scs) << StringRef(&C, 1);
3045	}
3046	}
3047
3048	return 0;
3049	} else if (CodePoint >= 0xD800 && CodePoint <= 0xDFFF) {
3050	// C++03 allows UCNs representing surrogate characters. C99 and C++11 don't.
3051	// We don't use isLexingRawMode() here because we need to diagnose bad
3052	// UCNs even when skipping preprocessing tokens in a #if block.
3053	if (Result && PP) {
3054	if (LangOpts.CPlusPlus && !LangOpts.CPlusPlus11)
3055	Diag(BufferPtr, diag::warn_ucn_escape_surrogate);
3056	else
3057	Diag(BufferPtr, diag::err_ucn_escape_invalid);
3058	}
3059	return 0;
3060	}
3061
3062	return CodePoint;
3063	}
3064
3065	bool Lexer::CheckUnicodeWhitespace(Token &Result, uint32_t C,
3066	const char *CurPtr) {
3067	static const llvm::sys::UnicodeCharSet UnicodeWhitespaceChars(
3068	UnicodeWhitespaceCharRanges);
3069	if (!isLexingRawMode() && !PP->isPreprocessedOutput() &&
3070	UnicodeWhitespaceChars.contains(C)) {
3071	Diag(BufferPtr, diag::ext_unicode_whitespace)
3072	<< makeCharRange(*this, BufferPtr, CurPtr);
3073
3074	Result.setFlag(Token::LeadingSpace);
3075	return true;
3076	}
3077	return false;
3078	}
3079
3080	bool Lexer::LexUnicode(Token &Result, uint32_t C, const char *CurPtr) {
3081	if (isAllowedIDChar(C, LangOpts) && isAllowedInitiallyIDChar(C, LangOpts)) {
3082	if (!isLexingRawMode() && !ParsingPreprocessorDirective &&
3083	!PP->isPreprocessedOutput()) {
3084	maybeDiagnoseIDCharCompat(PP->getDiagnostics(), C,
3085	makeCharRange(*this, BufferPtr, CurPtr),
3086	/IsFirst=/true);
3087	maybeDiagnoseUTF8Homoglyph(PP->getDiagnostics(), C,
3088	makeCharRange(*this, BufferPtr, CurPtr));
3089	}
3090
3091	MIOpt.ReadToken();
3092	return LexIdentifier(Result, CurPtr);
3093	}
3094
3095	if (!isLexingRawMode() && !ParsingPreprocessorDirective &&
3096	!PP->isPreprocessedOutput() &&
3097	!isASCII(*BufferPtr) && !isAllowedIDChar(C, LangOpts)) {
3098	// Non-ASCII characters tend to creep into source code unintentionally.
3099	// Instead of letting the parser complain about the unknown token,
3100	// just drop the character.
3101	// Note that we can /only/ do this when the non-ASCII character is actually
3102	// spelled as Unicode, not written as a UCN. The standard requires that
3103	// we not throw away any possible preprocessor tokens, but there's a
3104	// loophole in the mapping of Unicode characters to basic character set
3105	// characters that allows us to map these particular characters to, say,
3106	// whitespace.
3107	Diag(BufferPtr, diag::err_non_ascii)
3108	<< FixItHint::CreateRemoval(makeCharRange(*this, BufferPtr, CurPtr));
3109
3110	BufferPtr = CurPtr;
3111	return false;
3112	}
3113
3114	// Otherwise, we have an explicit UCN or a character that's unlikely to show
3115	// up by accident.
3116	MIOpt.ReadToken();
3117	FormTokenWithChars(Result, CurPtr, tok::unknown);
3118	return true;
3119	}
3120
3121	void Lexer::PropagateLineStartLeadingSpaceInfo(Token &Result) {
3122	IsAtStartOfLine = Result.isAtStartOfLine();
3123	HasLeadingSpace = Result.hasLeadingSpace();
3124	HasLeadingEmptyMacro = Result.hasLeadingEmptyMacro();
3125	// Note that this doesn't affect IsAtPhysicalStartOfLine.
3126	}
3127
3128	bool Lexer::Lex(Token &Result) {
3129	// Start a new token.
3130	Result.startToken();
3131
3132	// Set up misc whitespace flags for LexTokenInternal.
3133	if (IsAtStartOfLine) {
3134	Result.setFlag(Token::StartOfLine);
3135	IsAtStartOfLine = false;
3136	}
3137
3138	if (HasLeadingSpace) {
3139	Result.setFlag(Token::LeadingSpace);
3140	HasLeadingSpace = false;
3141	}
3142
3143	if (HasLeadingEmptyMacro) {
3144	Result.setFlag(Token::LeadingEmptyMacro);
3145	HasLeadingEmptyMacro = false;
3146	}
3147
3148	bool atPhysicalStartOfLine = IsAtPhysicalStartOfLine;
3149	IsAtPhysicalStartOfLine = false;
3150	bool isRawLex = isLexingRawMode();
3151	(void) isRawLex;
3152	bool returnedToken = LexTokenInternal(Result, atPhysicalStartOfLine);
3153	// (After the LexTokenInternal call, the lexer might be destroyed.)
3154	assert((returnedToken \|\| !isRawLex) && "Raw lex must succeed")(((returnedToken \|\| !isRawLex) && "Raw lex must succeed" ) ? static_cast<void> (0) : __assert_fail ("(returnedToken \|\| !isRawLex) && \"Raw lex must succeed\"" , "/build/llvm-toolchain-snapshot-9~svn362543/tools/clang/lib/Lex/Lexer.cpp" , 3154, __PRETTY_FUNCTION__));
3155	return returnedToken;
3156	}
3157
3158	/// LexTokenInternal - This implements a simple C family lexer. It is an
3159	/// extremely performance critical piece of code. This assumes that the buffer
3160	/// has a null character at the end of the file. This returns a preprocessing
3161	/// token, not a normal token, as such, it is an internal interface. It assumes
3162	/// that the Flags of result have been cleared before calling this.
3163	bool Lexer::LexTokenInternal(Token &Result, bool TokAtPhysicalStartOfLine) {
3164	LexNextToken:
3165	// New token, can't need cleaning yet.
3166	Result.clearFlag(Token::NeedsCleaning);
3167	Result.setIdentifierInfo(nullptr);
3168
3169	// CurPtr - Cache BufferPtr in an automatic variable.
3170	const char *CurPtr = BufferPtr;
3171
3172	// Small amounts of horizontal whitespace is very common between tokens.
3173	if ((CurPtr == ' ') \|\| (CurPtr == '\t')) {
3174	++CurPtr;
3175	while ((CurPtr == ' ') \|\| (CurPtr == '\t'))
3176	++CurPtr;
3177
3178	// If we are keeping whitespace and other tokens, just return what we just
3179	// skipped. The next lexer invocation will return the token after the
3180	// whitespace.
3181	if (isKeepWhitespaceMode()) {
3182	FormTokenWithChars(Result, CurPtr, tok::unknown);
3183	// FIXME: The next token will not have LeadingSpace set.
3184	return true;
3185	}
3186
3187	BufferPtr = CurPtr;
3188	Result.setFlag(Token::LeadingSpace);
3189	}
3190
3191	unsigned SizeTmp, SizeTmp2; // Temporaries for use in cases below.
3192
3193	// Read a character, advancing over it.
3194	char Char = getAndAdvanceChar(CurPtr, Result);
3195	tok::TokenKind Kind;
3196
3197	switch (Char) {
3198	case 0: // Null.
3199	// Found end of file?
3200	if (CurPtr-1 == BufferEnd)
3201	return LexEndOfFile(Result, CurPtr-1);
3202
3203	// Check if we are performing code completion.
3204	if (isCodeCompletionPoint(CurPtr-1)) {
3205	// Return the code-completion token.
3206	Result.startToken();
3207	FormTokenWithChars(Result, CurPtr, tok::code_completion);
3208	return true;
3209	}
3210
3211	if (!isLexingRawMode())
3212	Diag(CurPtr-1, diag::null_in_file);
3213	Result.setFlag(Token::LeadingSpace);
3214	if (SkipWhitespace(Result, CurPtr, TokAtPhysicalStartOfLine))
3215	return true; // KeepWhitespaceMode
3216
3217	// We know the lexer hasn't changed, so just try again with this lexer.
3218	// (We manually eliminate the tail call to avoid recursion.)
3219	goto LexNextToken;
3220
3221	case 26: // DOS & CP/M EOF: "^Z".
3222	// If we're in Microsoft extensions mode, treat this as end of file.
3223	if (LangOpts.MicrosoftExt) {
3224	if (!isLexingRawMode())
3225	Diag(CurPtr-1, diag::ext_ctrl_z_eof_microsoft);
3226	return LexEndOfFile(Result, CurPtr-1);
3227	}
3228
3229	// If Microsoft extensions are disabled, this is just random garbage.
3230	Kind = tok::unknown;
3231	break;
3232
3233	case '\r':
3234	if (CurPtr[0] == '\n')
3235	Char = getAndAdvanceChar(CurPtr, Result);
3236	LLVM_FALLTHROUGH[[clang::fallthrough]];
3237	case '\n':
3238	// If we are inside a preprocessor directive and we see the end of line,
3239	// we know we are done with the directive, so return an EOD token.
3240	if (ParsingPreprocessorDirective) {
3241	// Done parsing the "line".
3242	ParsingPreprocessorDirective = false;
3243
3244	// Restore comment saving mode, in case it was disabled for directive.
3245	if (PP)
3246	resetExtendedTokenMode();
3247
3248	// Since we consumed a newline, we are back at the start of a line.
3249	IsAtStartOfLine = true;
3250	IsAtPhysicalStartOfLine = true;
3251
3252	Kind = tok::eod;
3253	break;
3254	}
3255
3256	// No leading whitespace seen so far.
3257	Result.clearFlag(Token::LeadingSpace);
3258
3259	if (SkipWhitespace(Result, CurPtr, TokAtPhysicalStartOfLine))
3260	return true; // KeepWhitespaceMode
3261
3262	// We only saw whitespace, so just try again with this lexer.
3263	// (We manually eliminate the tail call to avoid recursion.)
3264	goto LexNextToken;
3265	case ' ':
3266	case '\t':
3267	case '\f':
3268	case '\v':
3269	SkipHorizontalWhitespace:
3270	Result.setFlag(Token::LeadingSpace);
3271	if (SkipWhitespace(Result, CurPtr, TokAtPhysicalStartOfLine))
3272	return true; // KeepWhitespaceMode
3273
3274	SkipIgnoredUnits:
3275	CurPtr = BufferPtr;
3276
3277	// If the next token is obviously a // or /* */ comment, skip it efficiently
3278	// too (without going through the big switch stmt).
3279	if (CurPtr[0] == '/' && CurPtr[1] == '/' && !inKeepCommentMode() &&
3280	LangOpts.LineComment &&
3281	(LangOpts.CPlusPlus \|\| !LangOpts.TraditionalCPP)) {
3282	if (SkipLineComment(Result, CurPtr+2, TokAtPhysicalStartOfLine))
3283	return true; // There is a token to return.
3284	goto SkipIgnoredUnits;
3285	} else if (CurPtr[0] == '/' && CurPtr[1] == '*' && !inKeepCommentMode()) {
3286	if (SkipBlockComment(Result, CurPtr+2, TokAtPhysicalStartOfLine))
3287	return true; // There is a token to return.
3288	goto SkipIgnoredUnits;
3289	} else if (isHorizontalWhitespace(*CurPtr)) {
3290	goto SkipHorizontalWhitespace;
3291	}
3292	// We only saw whitespace, so just try again with this lexer.
3293	// (We manually eliminate the tail call to avoid recursion.)
3294	goto LexNextToken;
3295
3296	// C99 6.4.4.1: Integer Constants.
3297	// C99 6.4.4.2: Floating Constants.
3298	case '0': case '1': case '2': case '3': case '4':
3299	case '5': case '6': case '7': case '8': case '9':
3300	// Notify MIOpt that we read a non-whitespace/non-comment token.
3301	MIOpt.ReadToken();
3302	return LexNumericConstant(Result, CurPtr);
3303
3304	case 'u': // Identifier (uber) or C11/C++11 UTF-8 or UTF-16 string literal
3305	// Notify MIOpt that we read a non-whitespace/non-comment token.
3306	MIOpt.ReadToken();
3307
3308	if (LangOpts.CPlusPlus11 \|\| LangOpts.C11) {
3309	Char = getCharAndSize(CurPtr, SizeTmp);
3310
3311	// UTF-16 string literal
3312	if (Char == '"')
3313	return LexStringLiteral(Result, ConsumeChar(CurPtr, SizeTmp, Result),
3314	tok::utf16_string_literal);
3315
3316	// UTF-16 character constant
3317	if (Char == '\'')
3318	return LexCharConstant(Result, ConsumeChar(CurPtr, SizeTmp, Result),
3319	tok::utf16_char_constant);
3320
3321	// UTF-16 raw string literal
3322	if (Char == 'R' && LangOpts.CPlusPlus11 &&
3323	getCharAndSize(CurPtr + SizeTmp, SizeTmp2) == '"')
3324	return LexRawStringLiteral(Result,
3325	ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
3326	SizeTmp2, Result),
3327	tok::utf16_string_literal);
3328
3329	if (Char == '8') {
3330	char Char2 = getCharAndSize(CurPtr + SizeTmp, SizeTmp2);
3331
3332	// UTF-8 string literal
3333	if (Char2 == '"')
3334	return LexStringLiteral(Result,
3335	ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
3336	SizeTmp2, Result),
3337	tok::utf8_string_literal);
3338	if (Char2 == '\'' && LangOpts.CPlusPlus17)
3339	return LexCharConstant(
3340	Result, ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
3341	SizeTmp2, Result),
3342	tok::utf8_char_constant);
3343
3344	if (Char2 == 'R' && LangOpts.CPlusPlus11) {
3345	unsigned SizeTmp3;
3346	char Char3 = getCharAndSize(CurPtr + SizeTmp + SizeTmp2, SizeTmp3);
3347	// UTF-8 raw string literal
3348	if (Char3 == '"') {
3349	return LexRawStringLiteral(Result,
3350	ConsumeChar(ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
3351	SizeTmp2, Result),
3352	SizeTmp3, Result),
3353	tok::utf8_string_literal);
3354	}
3355	}
3356	}
3357	}
3358
3359	// treat u like the start of an identifier.
3360	return LexIdentifier(Result, CurPtr);
3361
3362	case 'U': // Identifier (Uber) or C11/C++11 UTF-32 string literal
3363	// Notify MIOpt that we read a non-whitespace/non-comment token.
3364	MIOpt.ReadToken();
3365
3366	if (LangOpts.CPlusPlus11 \|\| LangOpts.C11) {
3367	Char = getCharAndSize(CurPtr, SizeTmp);
3368
3369	// UTF-32 string literal
3370	if (Char == '"')
3371	return LexStringLiteral(Result, ConsumeChar(CurPtr, SizeTmp, Result),
3372	tok::utf32_string_literal);
3373
3374	// UTF-32 character constant
3375	if (Char == '\'')
3376	return LexCharConstant(Result, ConsumeChar(CurPtr, SizeTmp, Result),
3377	tok::utf32_char_constant);
3378
3379	// UTF-32 raw string literal
3380	if (Char == 'R' && LangOpts.CPlusPlus11 &&
3381	getCharAndSize(CurPtr + SizeTmp, SizeTmp2) == '"')
3382	return LexRawStringLiteral(Result,
3383	ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
3384	SizeTmp2, Result),
3385	tok::utf32_string_literal);
3386	}
3387
3388	// treat U like the start of an identifier.
3389	return LexIdentifier(Result, CurPtr);
3390
3391	case 'R': // Identifier or C++0x raw string literal
3392	// Notify MIOpt that we read a non-whitespace/non-comment token.
3393	MIOpt.ReadToken();
3394
3395	if (LangOpts.CPlusPlus11) {
3396	Char = getCharAndSize(CurPtr, SizeTmp);
3397
3398	if (Char == '"')
3399	return LexRawStringLiteral(Result,
3400	ConsumeChar(CurPtr, SizeTmp, Result),
3401	tok::string_literal);
3402	}
3403
3404	// treat R like the start of an identifier.
3405	return LexIdentifier(Result, CurPtr);
3406
3407	case 'L': // Identifier (Loony) or wide literal (L'x' or L"xyz").
3408	// Notify MIOpt that we read a non-whitespace/non-comment token.
3409	MIOpt.ReadToken();
3410	Char = getCharAndSize(CurPtr, SizeTmp);
3411
3412	// Wide string literal.
3413	if (Char == '"')
3414	return LexStringLiteral(Result, ConsumeChar(CurPtr, SizeTmp, Result),
3415	tok::wide_string_literal);
3416
3417	// Wide raw string literal.
3418	if (LangOpts.CPlusPlus11 && Char == 'R' &&
3419	getCharAndSize(CurPtr + SizeTmp, SizeTmp2) == '"')
3420	return LexRawStringLiteral(Result,
3421	ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
3422	SizeTmp2, Result),
3423	tok::wide_string_literal);
3424
3425	// Wide character constant.
3426	if (Char == '\'')
3427	return LexCharConstant(Result, ConsumeChar(CurPtr, SizeTmp, Result),
3428	tok::wide_char_constant);
3429	// FALL THROUGH, treating L like the start of an identifier.
3430	LLVM_FALLTHROUGH[[clang::fallthrough]];
3431
3432	// C99 6.4.2: Identifiers.
3433	case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': case 'G':
3434	case 'H': case 'I': case 'J': case 'K': /'L'/case 'M': case 'N':
3435	case 'O': case 'P': case 'Q': /'R'/case 'S': case 'T': /'U'/
3436	case 'V': case 'W': case 'X': case 'Y': case 'Z':
3437	case 'a': case 'b': case 'c': case 'd': case 'e': case 'f': case 'g':
3438	case 'h': case 'i': case 'j': case 'k': case 'l': case 'm': case 'n':
3439	case 'o': case 'p': case 'q': case 'r': case 's': case 't': /'u'/
3440	case 'v': case 'w': case 'x': case 'y': case 'z':
3441	case '_':
3442	// Notify MIOpt that we read a non-whitespace/non-comment token.
3443	MIOpt.ReadToken();
3444	return LexIdentifier(Result, CurPtr);
3445
3446	case '$': // $ in identifiers.
3447	if (LangOpts.DollarIdents) {
3448	if (!isLexingRawMode())
3449	Diag(CurPtr-1, diag::ext_dollar_in_identifier);
3450	// Notify MIOpt that we read a non-whitespace/non-comment token.
3451	MIOpt.ReadToken();
3452	return LexIdentifier(Result, CurPtr);
3453	}
3454
3455	Kind = tok::unknown;
3456	break;
3457
3458	// C99 6.4.4: Character Constants.
3459	case '\'':
3460	// Notify MIOpt that we read a non-whitespace/non-comment token.
3461	MIOpt.ReadToken();
3462	return LexCharConstant(Result, CurPtr, tok::char_constant);
3463
3464	// C99 6.4.5: String Literals.
3465	case '"':
3466	// Notify MIOpt that we read a non-whitespace/non-comment token.
3467	MIOpt.ReadToken();
3468	return LexStringLiteral(Result, CurPtr,
3469	ParsingFilename ? tok::header_name
3470	: tok::string_literal);
3471
3472	// C99 6.4.6: Punctuators.
3473	case '?':
3474	Kind = tok::question;
3475	break;
3476	case '[':
3477	Kind = tok::l_square;
3478	break;
3479	case ']':
3480	Kind = tok::r_square;
3481	break;
3482	case '(':
3483	Kind = tok::l_paren;
3484	break;
3485	case ')':
3486	Kind = tok::r_paren;
3487	break;
3488	case '{':
3489	Kind = tok::l_brace;
3490	break;
3491	case '}':
3492	Kind = tok::r_brace;
3493	break;
3494	case '.':
3495	Char = getCharAndSize(CurPtr, SizeTmp);
3496	if (Char >= '0' && Char <= '9') {
3497	// Notify MIOpt that we read a non-whitespace/non-comment token.
3498	MIOpt.ReadToken();
3499
3500	return LexNumericConstant(Result, ConsumeChar(CurPtr, SizeTmp, Result));
3501	} else if (LangOpts.CPlusPlus && Char == '*') {
3502	Kind = tok::periodstar;
3503	CurPtr += SizeTmp;
3504	} else if (Char == '.' &&
3505	getCharAndSize(CurPtr+SizeTmp, SizeTmp2) == '.') {
3506	Kind = tok::ellipsis;
3507	CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
3508	SizeTmp2, Result);
3509	} else {
3510	Kind = tok::period;
3511	}
3512	break;
3513	case '&':
3514	Char = getCharAndSize(CurPtr, SizeTmp);
3515	if (Char == '&') {
3516	Kind = tok::ampamp;
3517	CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
3518	} else if (Char == '=') {
3519	Kind = tok::ampequal;
3520	CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
3521	} else {
3522	Kind = tok::amp;
3523	}
3524	break;
3525	case '*':
3526	if (getCharAndSize(CurPtr, SizeTmp) == '=') {
3527	Kind = tok::starequal;
3528	CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
3529	} else {
3530	Kind = tok::star;
3531	}
3532	break;
3533	case '+':
3534	Char = getCharAndSize(CurPtr, SizeTmp);
3535	if (Char == '+') {
3536	CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
3537	Kind = tok::plusplus;
3538	} else if (Char == '=') {
3539	CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
3540	Kind = tok::plusequal;
3541	} else {
3542	Kind = tok::plus;
3543	}
3544	break;
3545	case '-':
3546	Char = getCharAndSize(CurPtr, SizeTmp);
3547	if (Char == '-') { // --
3548	CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
3549	Kind = tok::minusminus;
3550	} else if (Char == '>' && LangOpts.CPlusPlus &&
3551	getCharAndSize(CurPtr+SizeTmp, SizeTmp2) == '') { // C++ ->
3552	CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
3553	SizeTmp2, Result);
3554	Kind = tok::arrowstar;
3555	} else if (Char == '>') { // ->
3556	CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
3557	Kind = tok::arrow;
3558	} else if (Char == '=') { // -=
3559	CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
3560	Kind = tok::minusequal;
3561	} else {
3562	Kind = tok::minus;
3563	}
3564	break;
3565	case '~':
3566	Kind = tok::tilde;
3567	break;
3568	case '!':
3569	if (getCharAndSize(CurPtr, SizeTmp) == '=') {
3570	Kind = tok::exclaimequal;
3571	CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
3572	} else {
3573	Kind = tok::exclaim;
3574	}
3575	break;
3576	case '/':
3577	// 6.4.9: Comments
3578	Char = getCharAndSize(CurPtr, SizeTmp);
3579	if (Char == '/') { // Line comment.
3580	// Even if Line comments are disabled (e.g. in C89 mode), we generally
3581	// want to lex this as a comment. There is one problem with this though,
3582	// that in one particular corner case, this can change the behavior of the
3583	// resultant program. For example, In "foo //**/ bar", C89 would lex
3584	// this as "foo / bar" and languages with Line comments would lex it as
3585	// "foo". Check to see if the character after the second slash is a '*'.
3586	// If so, we will lex that as a "/" instead of the start of a comment.
3587	// However, we never do this if we are just preprocessing.
3588	bool TreatAsComment = LangOpts.LineComment &&
3589	(LangOpts.CPlusPlus \|\| !LangOpts.TraditionalCPP);
3590	if (!TreatAsComment)
3591	if (!(PP && PP->isPreprocessedOutput()))
3592	TreatAsComment = getCharAndSize(CurPtr+SizeTmp, SizeTmp2) != '*';
3593
3594	if (TreatAsComment) {
3595	if (SkipLineComment(Result, ConsumeChar(CurPtr, SizeTmp, Result),
3596	TokAtPhysicalStartOfLine))
3597	return true; // There is a token to return.
3598
3599	// It is common for the tokens immediately after a // comment to be
3600	// whitespace (indentation for the next line). Instead of going through
3601	// the big switch, handle it efficiently now.
3602	goto SkipIgnoredUnits;
3603	}
3604	}
3605
3606	if (Char == '') { // /*/ comment.
3607	if (SkipBlockComment(Result, ConsumeChar(CurPtr, SizeTmp, Result),
3608	TokAtPhysicalStartOfLine))
3609	return true; // There is a token to return.
3610
3611	// We only saw whitespace, so just try again with this lexer.
3612	// (We manually eliminate the tail call to avoid recursion.)
3613	goto LexNextToken;
3614	}
3615
3616	if (Char == '=') {
3617	CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
3618	Kind = tok::slashequal;
3619	} else {
3620	Kind = tok::slash;
3621	}
3622	break;
3623	case '%':
3624	Char = getCharAndSize(CurPtr, SizeTmp);
3625	if (Char == '=') {
3626	Kind = tok::percentequal;
3627	CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
3628	} else if (LangOpts.Digraphs && Char == '>') {
3629	Kind = tok::r_brace; // '%>' -> '}'
3630	CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
3631	} else if (LangOpts.Digraphs && Char == ':') {
3632	CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
3633	Char = getCharAndSize(CurPtr, SizeTmp);
3634	if (Char == '%' && getCharAndSize(CurPtr+SizeTmp, SizeTmp2) == ':') {
3635	Kind = tok::hashhash; // '%:%:' -> '##'
3636	CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
3637	SizeTmp2, Result);
3638	} else if (Char == '@' && LangOpts.MicrosoftExt) {// %:@ -> #@ -> Charize
3639	CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
3640	if (!isLexingRawMode())
3641	Diag(BufferPtr, diag::ext_charize_microsoft);
3642	Kind = tok::hashat;
3643	} else { // '%:' -> '#'
3644	// We parsed a # character. If this occurs at the start of the line,
3645	// it's actually the start of a preprocessing directive. Callback to
3646	// the preprocessor to handle it.
3647	// TODO: -fpreprocessed mode??
3648	if (TokAtPhysicalStartOfLine && !LexingRawMode && !Is_PragmaLexer)
3649	goto HandleDirective;
3650
3651	Kind = tok::hash;
3652	}
3653	} else {
3654	Kind = tok::percent;
3655	}
3656	break;
3657	case '<':
3658	Char = getCharAndSize(CurPtr, SizeTmp);
3659	if (ParsingFilename) {
3660	return LexAngledStringLiteral(Result, CurPtr);
3661	} else if (Char == '<') {
3662	char After = getCharAndSize(CurPtr+SizeTmp, SizeTmp2);
3663	if (After == '=') {
3664	Kind = tok::lesslessequal;
3665	CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
3666	SizeTmp2, Result);
3667	} else if (After == '<' && IsStartOfConflictMarker(CurPtr-1)) {
3668	// If this is actually a '<<<<<<<' version control conflict marker,
3669	// recognize it as such and recover nicely.
3670	goto LexNextToken;
3671	} else if (After == '<' && HandleEndOfConflictMarker(CurPtr-1)) {
3672	// If this is '<<<<' and we're in a Perforce-style conflict marker,
3673	// ignore it.
3674	goto LexNextToken;
3675	} else if (LangOpts.CUDA && After == '<') {
3676	Kind = tok::lesslessless;
3677	CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
3678	SizeTmp2, Result);
3679	} else {
3680	CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
3681	Kind = tok::lessless;
3682	}
3683	} else if (Char == '=') {
3684	char After = getCharAndSize(CurPtr+SizeTmp, SizeTmp2);
3685	if (After == '>') {
3686	if (getLangOpts().CPlusPlus2a) {
3687	if (!isLexingRawMode())
3688	Diag(BufferPtr, diag::warn_cxx17_compat_spaceship);
3689	CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
3690	SizeTmp2, Result);
3691	Kind = tok::spaceship;
3692	break;
3693	}
3694	// Suggest adding a space between the '<=' and the '>' to avoid a
3695	// change in semantics if this turns up in C++ <=17 mode.
3696	if (getLangOpts().CPlusPlus && !isLexingRawMode()) {
3697	Diag(BufferPtr, diag::warn_cxx2a_compat_spaceship)
3698	<< FixItHint::CreateInsertion(
3699	getSourceLocation(CurPtr + SizeTmp, SizeTmp2), " ");
3700	}
3701	}
3702	CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
3703	Kind = tok::lessequal;
3704	} else if (LangOpts.Digraphs && Char == ':') { // '<:' -> '['
3705	if (LangOpts.CPlusPlus11 &&
3706	getCharAndSize(CurPtr + SizeTmp, SizeTmp2) == ':') {
3707	// C++0x [lex.pptoken]p3:
3708	// Otherwise, if the next three characters are <:: and the subsequent
3709	// character is neither : nor >, the < is treated as a preprocessor
3710	// token by itself and not as the first character of the alternative
3711	// token <:.
3712	unsigned SizeTmp3;
3713	char After = getCharAndSize(CurPtr + SizeTmp + SizeTmp2, SizeTmp3);
3714	if (After != ':' && After != '>') {
3715	Kind = tok::less;
3716	if (!isLexingRawMode())
3717	Diag(BufferPtr, diag::warn_cxx98_compat_less_colon_colon);
3718	break;
3719	}
3720	}
3721
3722	CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
3723	Kind = tok::l_square;
3724	} else if (LangOpts.Digraphs && Char == '%') { // '<%' -> '{'
3725	CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
3726	Kind = tok::l_brace;
3727	} else if (Char == '#' && /Not a trigraph/ SizeTmp == 1 &&
3728	lexEditorPlaceholder(Result, CurPtr)) {
3729	return true;
3730	} else {
3731	Kind = tok::less;
3732	}
3733	break;
3734	case '>':
3735	Char = getCharAndSize(CurPtr, SizeTmp);
3736	if (Char == '=') {
3737	CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
3738	Kind = tok::greaterequal;
3739	} else if (Char == '>') {
3740	char After = getCharAndSize(CurPtr+SizeTmp, SizeTmp2);
3741	if (After == '=') {
3742	CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
3743	SizeTmp2, Result);
3744	Kind = tok::greatergreaterequal;
3745	} else if (After == '>' && IsStartOfConflictMarker(CurPtr-1)) {
3746	// If this is actually a '>>>>' conflict marker, recognize it as such
3747	// and recover nicely.
3748	goto LexNextToken;
3749	} else if (After == '>' && HandleEndOfConflictMarker(CurPtr-1)) {
3750	// If this is '>>>>>>>' and we're in a conflict marker, ignore it.
3751	goto LexNextToken;
3752	} else if (LangOpts.CUDA && After == '>') {
3753	Kind = tok::greatergreatergreater;
3754	CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
3755	SizeTmp2, Result);
3756	} else {
3757	CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
3758	Kind = tok::greatergreater;
3759	}
3760	} else {
3761	Kind = tok::greater;
3762	}
3763	break;
3764	case '^':
3765	Char = getCharAndSize(CurPtr, SizeTmp);
3766	if (Char == '=') {
3767	CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
3768	Kind = tok::caretequal;
3769	} else if (LangOpts.OpenCL && Char == '^') {
3770	CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
3771	Kind = tok::caretcaret;
3772	} else {
3773	Kind = tok::caret;
3774	}
3775	break;
3776	case '\|':
3777	Char = getCharAndSize(CurPtr, SizeTmp);
3778	if (Char == '=') {
3779	Kind = tok::pipeequal;
3780	CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
3781	} else if (Char == '\|') {
3782	// If this is '\|\|\|\|\|\|\|' and we're in a conflict marker, ignore it.
3783	if (CurPtr[1] == '\|' && HandleEndOfConflictMarker(CurPtr-1))
3784	goto LexNextToken;
3785	Kind = tok::pipepipe;
3786	CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
3787	} else {
3788	Kind = tok::pipe;
3789	}
3790	break;
3791	case ':':
3792	Char = getCharAndSize(CurPtr, SizeTmp);
3793	if (LangOpts.Digraphs && Char == '>') {
3794	Kind = tok::r_square; // ':>' -> ']'
3795	CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
3796	} else if ((LangOpts.CPlusPlus \|\|
3797	LangOpts.DoubleSquareBracketAttributes) &&
3798	Char == ':') {
3799	Kind = tok::coloncolon;
3800	CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
3801	} else {
3802	Kind = tok::colon;
3803	}
3804	break;
3805	case ';':
3806	Kind = tok::semi;
3807	break;
3808	case '=':
3809	Char = getCharAndSize(CurPtr, SizeTmp);
3810	if (Char == '=') {
3811	// If this is '====' and we're in a conflict marker, ignore it.
3812	if (CurPtr[1] == '=' && HandleEndOfConflictMarker(CurPtr-1))
3813	goto LexNextToken;
3814
3815	Kind = tok::equalequal;
3816	CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
3817	} else {
3818	Kind = tok::equal;
3819	}
3820	break;
3821	case ',':
3822	Kind = tok::comma;
3823	break;
3824	case '#':
3825	Char = getCharAndSize(CurPtr, SizeTmp);
3826	if (Char == '#') {
3827	Kind = tok::hashhash;
3828	CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
3829	} else if (Char == '@' && LangOpts.MicrosoftExt) { // #@ -> Charize
3830	Kind = tok::hashat;
3831	if (!isLexingRawMode())
3832	Diag(BufferPtr, diag::ext_charize_microsoft);
3833	CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
3834	} else {
3835	// We parsed a # character. If this occurs at the start of the line,
3836	// it's actually the start of a preprocessing directive. Callback to
3837	// the preprocessor to handle it.
3838	// TODO: -fpreprocessed mode??
3839	if (TokAtPhysicalStartOfLine && !LexingRawMode && !Is_PragmaLexer)
3840	goto HandleDirective;
3841
3842	Kind = tok::hash;
3843	}
3844	break;
3845
3846	case '@':
3847	// Objective C support.
3848	if (CurPtr[-1] == '@' && LangOpts.ObjC)
3849	Kind = tok::at;
3850	else
3851	Kind = tok::unknown;
3852	break;
3853
3854	// UCNs (C99 6.4.3, C++11 [lex.charset]p2)
3855	case '\\':
3856	if (!LangOpts.AsmPreprocessor) {
3857	if (uint32_t CodePoint = tryReadUCN(CurPtr, BufferPtr, &Result)) {
3858	if (CheckUnicodeWhitespace(Result, CodePoint, CurPtr)) {
3859	if (SkipWhitespace(Result, CurPtr, TokAtPhysicalStartOfLine))
3860	return true; // KeepWhitespaceMode
3861
3862	// We only saw whitespace, so just try again with this lexer.
3863	// (We manually eliminate the tail call to avoid recursion.)
3864	goto LexNextToken;
3865	}
3866
3867	return LexUnicode(Result, CodePoint, CurPtr);
3868	}
3869	}
3870
3871	Kind = tok::unknown;
3872	break;
3873
3874	default: {
3875	if (isASCII(Char)) {
3876	Kind = tok::unknown;
3877	break;
3878	}
3879
3880	llvm::UTF32 CodePoint;
3881
3882	// We can't just reset CurPtr to BufferPtr because BufferPtr may point to
3883	// an escaped newline.
3884	--CurPtr;
3885	llvm::ConversionResult Status =
3886	llvm::convertUTF8Sequence((const llvm::UTF8 **)&CurPtr,
3887	(const llvm::UTF8 *)BufferEnd,
3888	&CodePoint,
3889	llvm::strictConversion);
3890	if (Status == llvm::conversionOK) {
3891	if (CheckUnicodeWhitespace(Result, CodePoint, CurPtr)) {
3892	if (SkipWhitespace(Result, CurPtr, TokAtPhysicalStartOfLine))
3893	return true; // KeepWhitespaceMode
3894
3895	// We only saw whitespace, so just try again with this lexer.
3896	// (We manually eliminate the tail call to avoid recursion.)
3897	goto LexNextToken;
3898	}
3899	return LexUnicode(Result, CodePoint, CurPtr);
3900	}
3901
3902	if (isLexingRawMode() \|\| ParsingPreprocessorDirective \|\|
3903	PP->isPreprocessedOutput()) {
3904	++CurPtr;
3905	Kind = tok::unknown;
3906	break;
3907	}
3908
3909	// Non-ASCII characters tend to creep into source code unintentionally.
3910	// Instead of letting the parser complain about the unknown token,
3911	// just diagnose the invalid UTF-8, then drop the character.
3912	Diag(CurPtr, diag::err_invalid_utf8);
3913
3914	BufferPtr = CurPtr+1;
3915	// We're pretending the character didn't exist, so just try again with
3916	// this lexer.
3917	// (We manually eliminate the tail call to avoid recursion.)
3918	goto LexNextToken;
3919	}
3920	}
3921
3922	// Notify MIOpt that we read a non-whitespace/non-comment token.
3923	MIOpt.ReadToken();
3924
3925	// Update the location of token as well as BufferPtr.
3926	FormTokenWithChars(Result, CurPtr, Kind);
3927	return true;
3928
3929	HandleDirective:
3930	// We parsed a # character and it's the start of a preprocessing directive.
3931
3932	FormTokenWithChars(Result, CurPtr, tok::hash);
3933	PP->HandleDirective(Result);
3934
3935	if (PP->hadModuleLoaderFatalFailure()) {
3936	// With a fatal failure in the module loader, we abort parsing.
3937	assert(Result.is(tok::eof) && "Preprocessor did not set tok:eof")((Result.is(tok::eof) && "Preprocessor did not set tok:eof" ) ? static_cast<void> (0) : __assert_fail ("Result.is(tok::eof) && \"Preprocessor did not set tok:eof\"" , "/build/llvm-toolchain-snapshot-9~svn362543/tools/clang/lib/Lex/Lexer.cpp" , 3937, __PRETTY_FUNCTION__));
3938	return true;
3939	}
3940
3941	// We parsed the directive; lex a token with the new state.
3942	return false;
3943	}