#include "llvm/Support/Regex.h"

FormatTokenLexer::FormatTokenLexer(const SourceManager &SourceMgr, FileID ID,
                                   unsigned Column, const FormatStyle &Style,
                                   encoding::Encoding Encoding)
    : Column(Column), TrailingWhitespace(0), SourceMgr(SourceMgr), ID(ID),
      Keywords(IdentTable), Encoding(Encoding), FirstInLineIndex(0) /* ... */ {
  // ... (construction of the raw Lexer over the file buffer elided)
  Lex->SetKeepWhitespaceMode(true);

  for (const std::string &ForEachMacro : Style.ForEachMacros)
    ForEachMacros.push_back(&IdentTable.get(ForEachMacro));
  llvm::sort(ForEachMacros.begin(), ForEachMacros.end());
}
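
// Lexes the entire buffer: repeatedly pulls the next token, applies the
// language-specific fix-ups (JavaScript regex literals and template strings,
// '#' line comments) and token merging, and remembers the first token of
// each line, until an eof token is produced.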
ArrayRef<FormatToken *> FormatTokenLexer::lex() {
  assert(Tokens.empty());
  assert(FirstInLineIndex == 0);
  do {
    Tokens.push_back(getNextToken());
    if (Style.Language == FormatStyle::LK_JavaScript) {
      tryParseJSRegexLiteral();
      handleTemplateStrings();
    }
    if (Style.Language == FormatStyle::LK_TextProto)
      tryParsePythonComment();
    tryMergePreviousTokens();
    if (Tokens.back()->NewlinesBefore > 0 || Tokens.back()->IsMultiline)
      FirstInLineIndex = Tokens.size() - 1;
  } while (Tokens.back()->Tok.isNot(tok::eof));
  return Tokens;
}
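
// Folds token sequences that the raw lexer splits apart but that formatting
// treats as one token, e.g. "===" in JavaScript or ">>>=" in Java.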
void FormatTokenLexer::tryMergePreviousTokens() {
  if (tryMerge_TMacro())
    return;
  if (tryMergeConflictMarkers())
    return;
  if (tryMergeLessLess())
    return;
  if (tryMergeNSStringLiteral())
    return;

  if (Style.Language == FormatStyle::LK_JavaScript) {
    static const tok::TokenKind JSIdentity[] = {tok::equalequal, tok::equal};
    static const tok::TokenKind JSNotIdentity[] = {tok::exclaimequal, tok::equal};
    static const tok::TokenKind JSShiftEqual[] = {tok::greater, tok::greater,
                                                  tok::greaterequal};
    static const tok::TokenKind JSRightArrow[] = {tok::equal, tok::greater};
    static const tok::TokenKind JSExponentiation[] = {tok::star, tok::star};
    static const tok::TokenKind JSExponentiationEqual[] = {tok::star, tok::starequal};

    if (tryMergeTokens(JSIdentity, TT_BinaryOperator))
      return;
    if (tryMergeTokens(JSNotIdentity, TT_BinaryOperator))
      return;
    if (tryMergeTokens(JSShiftEqual, TT_BinaryOperator))
      return;
    if (tryMergeTokens(JSRightArrow, TT_JsFatArrow))
      return;
    if (tryMergeTokens(JSExponentiation, TT_JsExponentiation))
      return;
    if (tryMergeTokens(JSExponentiationEqual, TT_JsExponentiationEqual)) {
      Tokens.back()->Tok.setKind(tok::starequal);
      return;
    }
  }
  if (Style.Language == FormatStyle::LK_Java) {
    static const tok::TokenKind JavaRightLogicalShiftAssign[] = {
        tok::greater, tok::greater, tok::greaterequal};
    if (tryMergeTokens(JavaRightLogicalShiftAssign, TT_BinaryOperator))
      return;
  }
}
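
// Merges an '@' token and a directly following string literal into a single
// TT_ObjCStringLiteral token, e.g. '@' + "abc" -> @"abc".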
bool FormatTokenLexer::tryMergeNSStringLiteral() {
  if (Tokens.size() < 2)
    return false;
  auto &At = *(Tokens.end() - 2);
  auto &String = *(Tokens.end() - 1);
  if (!At->is(tok::at) || !String->is(tok::string_literal))
    return false;
  At->Tok.setKind(tok::string_literal);
  At->TokenText = StringRef(At->TokenText.begin(),
                            String->TokenText.end() - At->TokenText.begin());
  At->ColumnWidth += String->ColumnWidth;
  At->Type = TT_ObjCStringLiteral;
  Tokens.erase(Tokens.end() - 1);
  return true;
}
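
// Merges two adjacent '<' tokens back into a single '<<' token, unless a
// third '<' is adjacent (e.g. nested templates) or whitespace separates the
// two.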
bool FormatTokenLexer::tryMergeLessLess() {
  if (Tokens.size() < 3)
    return false;

  bool FourthTokenIsLess = false;
  if (Tokens.size() > 3)
    FourthTokenIsLess = (Tokens.end() - 4)[0]->is(tok::less);

  auto First = Tokens.end() - 3;
  if (First[2]->is(tok::less) || First[1]->isNot(tok::less) ||
      First[0]->isNot(tok::less) || FourthTokenIsLess)
    return false;

  // Only merge if there currently is no whitespace between the two "<".
  if (First[1]->WhitespaceRange.getBegin() !=
      First[1]->WhitespaceRange.getEnd())
    return false;

  First[0]->Tok.setKind(tok::lessless);
  First[0]->TokenText = "<<";
  First[0]->ColumnWidth += 1;
  Tokens.erase(Tokens.end() - 2);
  return true;
}
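
// Generic merge helper: if the last Kinds.size() tokens match the given kinds
// and are not separated by whitespace, they are collapsed into the first
// token, which receives NewType.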
bool FormatTokenLexer::tryMergeTokens(ArrayRef<tok::TokenKind> Kinds,
                                      TokenType NewType) {
  if (Tokens.size() < Kinds.size())
    return false;

  SmallVectorImpl<FormatToken *>::const_iterator First =
      Tokens.end() - Kinds.size();
  if (!First[0]->is(Kinds[0]))
    return false;
  unsigned AddLength = 0;
  for (unsigned i = 1; i < Kinds.size(); ++i) {
    if (!First[i]->is(Kinds[i]) || First[i]->WhitespaceRange.getBegin() !=
                                       First[i]->WhitespaceRange.getEnd())
      return false;
    AddLength += First[i]->TokenText.size();
  }
  Tokens.resize(Tokens.size() - Kinds.size() + 1);
  First[0]->TokenText = StringRef(First[0]->TokenText.data(),
                                  First[0]->TokenText.size() + AddLength);
  First[0]->ColumnWidth += AddLength;
  First[0]->Type = NewType;
  return true;
}
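
// JavaScript regex heuristic: returns true if Tok is a token after which an
// operand (rather than a binary operator) is expected.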
bool FormatTokenLexer::precedesOperand(FormatToken *Tok) {
  return Tok->isOneOf(tok::period, tok::l_paren, tok::comma, tok::l_brace,
                      tok::r_brace, tok::l_square, tok::semi, tok::exclaim,
                      tok::colon, tok::question, tok::tilde) ||
         Tok->isOneOf(tok::kw_return, tok::kw_do, tok::kw_case, tok::kw_throw,
                      tok::kw_else, tok::kw_new, tok::kw_delete, tok::kw_void,
                      tok::kw_typeof, Keywords.kw_instanceof, Keywords.kw_in) ||
         Tok->isBinaryOperator();
}
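
// Returns true if a '/' that follows Prev can start a regex literal instead
// of being a division operator.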
bool FormatTokenLexer::canPrecedeRegexLiteral(FormatToken *Prev) {
  if (!Prev)
    return true;

  if (Prev->isOneOf(tok::plusplus, tok::minusminus, tok::exclaim))
    return (Tokens.size() < 3 || precedesOperand(Tokens[Tokens.size() - 3]));

  if (!precedesOperand(Prev))
    return false;

  return true;
}
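
// If the current token is a '/' or '/=' in a position where JavaScript allows
// a regex literal, re-lexes the regex by hand up to the closing unescaped '/'
// and turns the token into a single TT_RegexLiteral string token.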
void FormatTokenLexer::tryParseJSRegexLiteral() {
  FormatToken *RegexToken = Tokens.back();
  if (!RegexToken->isOneOf(tok::slash, tok::slashequal))
    return;

  // Find the previous non-comment token to decide whether the slash can
  // start a regex literal at all.
  FormatToken *Prev = nullptr;
  for (auto I = Tokens.rbegin() + 1, E = Tokens.rend(); I != E; ++I) {
    if ((*I)->isNot(tok::comment)) {
      Prev = *I;
      break;
    }
  }

  if (!canPrecedeRegexLiteral(Prev))
    return;

  // 'Manually' lex ahead in the current file buffer.
  const char *Offset = Lex->getBufferLocation();
  const char *RegexBegin = Offset - RegexToken->TokenText.size();
  StringRef Buffer = Lex->getBuffer();
  bool InCharacterClass = false;
  bool HaveClosingSlash = false;
  for (; !HaveClosingSlash && Offset != Buffer.end(); ++Offset) {
    // A regex is terminated by an unescaped '/' that is not inside a
    // character class ('[' ... ']').
    switch (*Offset) {
    case '\\':
      ++Offset; // Skip the escaped character.
      break;
    case '[':
      InCharacterClass = true;
      break;
    case ']':
      InCharacterClass = false;
      break;
    case '/':
      if (!InCharacterClass)
        HaveClosingSlash = true;
      break;
    }
  }

  RegexToken->Type = TT_RegexLiteral;
  // Treat regex literals like other string_literals.
  RegexToken->Tok.setKind(tok::string_literal);
  RegexToken->TokenText = StringRef(RegexBegin, Offset - RegexBegin);
  RegexToken->ColumnWidth = RegexToken->TokenText.size();

  resetLexer(SourceMgr.getFileOffset(Lex->getSourceLocation(Offset)));
}
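
// Handles JavaScript template strings such as `text ${expr} more text`:
// StateStack tracks backtick and '${' ... '}' nesting, and the literal parts
// are turned into string_literal tokens typed TT_TemplateString.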
void FormatTokenLexer::handleTemplateStrings() {
  FormatToken *BacktickToken = Tokens.back();

  if (BacktickToken->is(tok::l_brace)) {
    StateStack.push(LexerState::NORMAL);
    return;
  }
  if (BacktickToken->is(tok::r_brace)) {
    if (StateStack.size() == 1)
      return;
    StateStack.pop();
    if (StateStack.top() != LexerState::TEMPLATE_STRING)
      return;
    // A '}' closed an interpolation; continue lexing the template string.
  } else if (BacktickToken->is(tok::unknown) &&
             BacktickToken->TokenText == "`") {
    StateStack.push(LexerState::TEMPLATE_STRING);
  } else {
    return; // Not actually a template string.
  }

  // 'Manually' lex ahead in the current file buffer.
  const char *Offset = Lex->getBufferLocation();
  const char *TmplBegin = Offset - BacktickToken->TokenText.size(); // at "`"
  for (; Offset != Lex->getBuffer().end(); ++Offset) {
    if (Offset[0] == '`') {
      StateStack.pop();
      break;
    }
    if (Offset[0] == '\\') {
      ++Offset; // Skip the escaped character.
    } else if (Offset + 1 < Lex->getBuffer().end() && Offset[0] == '$' &&
               Offset[1] == '{') {
      // '${' introduces an expression interpolation in the template string.
      StateStack.push(LexerState::NORMAL);
      ++Offset;
      break;
    }
  }

  StringRef LiteralText(TmplBegin, Offset - TmplBegin + 1);
  BacktickToken->Type = TT_TemplateString;
  BacktickToken->Tok.setKind(tok::string_literal);
  BacktickToken->TokenText = LiteralText;

  // Adjust width for potentially multiline string literals.
  size_t FirstBreak = LiteralText.find('\n');
  StringRef FirstLineText = FirstBreak == StringRef::npos
                                ? LiteralText
                                : LiteralText.substr(0, FirstBreak);
  BacktickToken->ColumnWidth = encoding::columnWidthWithTabs(
      FirstLineText, BacktickToken->OriginalColumn, Style.TabWidth, Encoding);

  size_t LastBreak = LiteralText.rfind('\n');
  if (LastBreak != StringRef::npos) {
    BacktickToken->IsMultiline = true;
    unsigned StartColumn = 0; // The template tail spans the entire line.
    BacktickToken->LastLineColumnWidth = encoding::columnWidthWithTabs(
        LiteralText.substr(LastBreak + 1, LiteralText.size()), StartColumn,
        Style.TabWidth, Encoding);
  }

  SourceLocation loc = Offset < Lex->getBuffer().end()
                           ? Lex->getSourceLocation(Offset + 1)
                           : SourceMgr.getLocForEndOfFile(ID);
  resetLexer(SourceMgr.getFileOffset(loc));
}
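
// Turns a '#' token and the rest of its physical line into a single
// TT_LineComment token (Python-style comments, as used in text protos).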
void FormatTokenLexer::tryParsePythonComment() {
  FormatToken *HashToken = Tokens.back();
  if (!HashToken->isOneOf(tok::hash, tok::hashhash))
    return;
  // Turn the remainder of this line into a comment.
  const char *CommentBegin =
      Lex->getBufferLocation() - HashToken->TokenText.size(); // at "#"
  size_t From = CommentBegin - Lex->getBuffer().begin();
  size_t To = Lex->getBuffer().find_first_of('\n', From);
  if (To == StringRef::npos)
    To = Lex->getBuffer().size();
  size_t Len = To - From;
  HashToken->Type = TT_LineComment;
  HashToken->Tok.setKind(tok::comment);
  HashToken->TokenText = Lex->getBuffer().substr(From, Len);
  SourceLocation Loc = To < Lex->getBuffer().size()
                           ? Lex->getSourceLocation(CommentBegin + Len)
                           : SourceMgr.getLocForEndOfFile(ID);
  resetLexer(SourceMgr.getFileOffset(Loc));
}
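
// Merges the four tokens of a _T("...") invocation into one string token so
// that the formatter treats the whole macro call like a string literal.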
bool FormatTokenLexer::tryMerge_TMacro() {
  if (Tokens.size() < 4)
    return false;
  FormatToken *Last = Tokens.back();
  if (!Last->is(tok::r_paren))
    return false;
  FormatToken *String = Tokens[Tokens.size() - 2];
  if (!Tokens[Tokens.size() - 3]->is(tok::l_paren))
    return false;
  FormatToken *Macro = Tokens[Tokens.size() - 4];
  if (Macro->TokenText != "_T")
    return false;

  const char *Start = Macro->TokenText.data();
  const char *End = Last->TokenText.data() + Last->TokenText.size();
  String->TokenText = StringRef(Start, End - Start);
  // ... (String takes over Macro's whitespace and column bookkeeping)

  Tokens.pop_back();
  Tokens.pop_back();
  Tokens.pop_back();
  Tokens.back() = String;
  return true;
}
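
// Detects version-control conflict markers at the start of a line, e.g.
//   <<<<<<< HEAD
//   =======
//   >>>>>>> branch
// and collapses that line into a single token the parser can skip over.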
bool FormatTokenLexer::tryMergeConflictMarkers() {
  if (Tokens.back()->NewlinesBefore == 0 && Tokens.back()->isNot(tok::eof))
    return false;

  // Conflict lines look like "<marker> <text from the vcs>", for example
  // ">>>>>>> /file/in/file/system at revision 1234". All tokens in such a
  // line are merged into a single token whose special type lets the
  // unwrapped-line parser rebuild the underlying code correctly.

  FileID ID;
  // Get the position of the first token in the line.
  unsigned FirstInLineOffset;
  std::tie(ID, FirstInLineOffset) = SourceMgr.getDecomposedLoc(
      Tokens[FirstInLineIndex]->getStartOfNonWhitespace());
  StringRef Buffer = SourceMgr.getBuffer(ID)->getBuffer();
  // Calculate the offset of the start of the current line.
  auto LineOffset = Buffer.rfind('\n', FirstInLineOffset);
  if (LineOffset == StringRef::npos) {
    LineOffset = 0;
  } else {
    ++LineOffset;
  }

  auto FirstSpace = Buffer.find_first_of(" \n", LineOffset);
  StringRef LineStart;
  if (FirstSpace == StringRef::npos) {
    LineStart = Buffer.substr(LineOffset);
  } else {
    LineStart = Buffer.substr(LineOffset, FirstSpace - LineOffset);
  }

  TokenType Type = TT_Unknown;
  if (LineStart == "<<<<<<<" || LineStart == ">>>>") {
    Type = TT_ConflictStart;
  } else if (LineStart == "|||||||" || LineStart == "=======" ||
             LineStart == "====") {
    Type = TT_ConflictAlternative;
  } else if (LineStart == ">>>>>>>" || LineStart == "<<<<") {
    Type = TT_ConflictEnd;
  }

  if (Type != TT_Unknown) {
    FormatToken *Next = Tokens.back();

    Tokens.resize(FirstInLineIndex + 1);
    // We do not need to build a complete token here, as it will be skipped
    // during parsing anyway (as long as it is not eof).
    Tokens.back()->Type = Type;
    Tokens.back()->Tok.setKind(tok::kw___unknown_anytype);

    Tokens.push_back(Next);
    return true;
  }

  return false;
}
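
// Produces the synthesized second half of a '>>' or '<<' token that was
// split by getNextToken() (see the TOKEN_STASHED state below).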
FormatToken *FormatTokenLexer::getStashedToken() {
  // Create a synthesized second '>' or '<' token.
  Token Tok = FormatTok->Tok;
  StringRef TokenText = FormatTok->TokenText;

  FormatTok = new (Allocator.Allocate()) FormatToken;
  FormatTok->Tok = Tok;
  // ... (location, text and column information are copied onto the new token)
  return FormatTok;
}
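
// Core driver: reads the next raw token, folds surrounding whitespace into
// NewlinesBefore/Column bookkeeping, resolves raw identifiers to keywords,
// splits '>>' and '<<' for later re-merging, computes column widths, and
// classifies for-each and macro-block macros.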
FormatToken *FormatTokenLexer::getNextToken() {
  if (StateStack.top() == LexerState::TOKEN_STASHED) {
    StateStack.pop();
    return getStashedToken();
  }

  FormatTok = new (Allocator.Allocate()) FormatToken;
  readRawToken(*FormatTok);
  SourceLocation WhitespaceStart =
      FormatTok->Tok.getLocation().getLocWithOffset(-TrailingWhitespace);
  FormatTok->IsFirst = IsFirstToken;
  IsFirstToken = false;
  // Consume and record whitespace until we find a significant token.
  unsigned WhitespaceLength = TrailingWhitespace;
  while (FormatTok->Tok.is(tok::unknown)) {
    StringRef Text = FormatTok->TokenText;
    auto EscapesNewline = [&](int pos) {
      // A '\r' here is just part of '\r\n'. Skip it.
      if (pos >= 0 && Text[pos] == '\r')
        --pos;
      // See whether there is an odd number of '\' before this position.
      unsigned count = 0;
      for (; pos >= 0; --pos, ++count)
        if (Text[pos] != '\\')
          break;
      return count & 1;
    };
    for (int i = 0, e = Text.size(); i != e; ++i) {
      switch (Text[i]) {
      // ... (' ', '\t' and (un)escaped newlines, detected via
      //      EscapesNewline(), update NewlinesBefore, Column and
      //      WhitespaceLength)
      case '\\':
        if (i + 1 == e || (Text[i + 1] != '\r' && Text[i + 1] != '\n'))
          FormatTok->Type = TT_ImplicitStringLiteral;
        break;
      default:
        FormatTok->Type = TT_ImplicitStringLiteral;
        break;
      }
      if (FormatTok->Type == TT_ImplicitStringLiteral)
        break;
    }

    if (FormatTok->is(TT_ImplicitStringLiteral))
      break;
    WhitespaceLength += FormatTok->Tok.getLength();

    readRawToken(*FormatTok);
  }
  // A backslash at the end of a "//" comment would make the next line part of
  // the comment in JavaScript; truncate the comment token at the backslash and
  // restart the lexer right behind it.
  if (Style.Language == FormatStyle::LK_JavaScript &&
      FormatTok->is(tok::comment) && FormatTok->TokenText.startswith("//")) {
    size_t BackslashPos = FormatTok->TokenText.find('\\');
    while (BackslashPos != StringRef::npos) {
      if (BackslashPos + 1 < FormatTok->TokenText.size() &&
          FormatTok->TokenText[BackslashPos + 1] == '\n') {
        const char *Offset = Lex->getBufferLocation();
        Offset -= FormatTok->TokenText.size();
        Offset += BackslashPos + 1;
        resetLexer(SourceMgr.getFileOffset(Lex->getSourceLocation(Offset)));
        FormatTok->TokenText = FormatTok->TokenText.substr(0, BackslashPos + 1);
        break;
      }
      BackslashPos = FormatTok->TokenText.find('\\', BackslashPos + 1);
    }
  }

  // In case the token starts with escaped newlines, take them into account as
  // whitespace - this pattern is quite frequent in macro definitions.
  while (FormatTok->TokenText.size() > 1 && FormatTok->TokenText[0] == '\\') {
    unsigned SkippedWhitespace = 0;
    if (FormatTok->TokenText.size() > 2 &&
        (FormatTok->TokenText[1] == '\r' && FormatTok->TokenText[2] == '\n'))
      SkippedWhitespace = 3;
    else if (FormatTok->TokenText[1] == '\n')
      SkippedWhitespace = 2;
    else
      break;

    ++FormatTok->NewlinesBefore;
    WhitespaceLength += SkippedWhitespace;
    FormatTok->LastNewlineOffset = SkippedWhitespace;
    Column = 0;
    FormatTok->TokenText = FormatTok->TokenText.substr(SkippedWhitespace);
  }
  FormatTok->WhitespaceRange = SourceRange(
      WhitespaceStart, WhitespaceStart.getLocWithOffset(WhitespaceLength));

  FormatTok->OriginalColumn = Column;

  TrailingWhitespace = 0;
  if (FormatTok->Tok.is(tok::comment)) {
    // Strip trailing whitespace off comments; it is re-added as whitespace
    // before the next token.
    StringRef UntrimmedText = FormatTok->TokenText;
    FormatTok->TokenText = FormatTok->TokenText.rtrim(" \t\v\f");
    TrailingWhitespace = UntrimmedText.size() - FormatTok->TokenText.size();
  } else if (FormatTok->Tok.is(tok::raw_identifier)) {
    IdentifierInfo &Info = IdentTable.get(FormatTok->TokenText);
    FormatTok->Tok.setIdentifierInfo(&Info);
    FormatTok->Tok.setKind(Info.getTokenID());
    if (Style.Language == FormatStyle::LK_Java &&
        FormatTok->isOneOf(tok::kw_struct, tok::kw_union, tok::kw_delete,
                           tok::kw_operator)) {
      FormatTok->Tok.setKind(tok::identifier);
      FormatTok->Tok.setIdentifierInfo(nullptr);
    } else if (Style.Language == FormatStyle::LK_JavaScript &&
               FormatTok->isOneOf(tok::kw_struct, tok::kw_union,
                                  tok::kw_operator)) {
      FormatTok->Tok.setKind(tok::identifier);
      FormatTok->Tok.setIdentifierInfo(nullptr);
    }
  } else if (FormatTok->Tok.is(tok::greatergreater)) {
    FormatTok->Tok.setKind(tok::greater);
    FormatTok->TokenText = FormatTok->TokenText.substr(0, 1);
    ++Column;
    StateStack.push(LexerState::TOKEN_STASHED);
  } else if (FormatTok->Tok.is(tok::lessless)) {
    FormatTok->Tok.setKind(tok::less);
    FormatTok->TokenText = FormatTok->TokenText.substr(0, 1);
    ++Column;
    StateStack.push(LexerState::TOKEN_STASHED);
  }
  // Now FormatTok is the next non-whitespace token.

  StringRef Text = FormatTok->TokenText;
  size_t FirstNewlinePos = Text.find('\n');
  if (FirstNewlinePos == StringRef::npos) {
    FormatTok->ColumnWidth =
        encoding::columnWidthWithTabs(Text, Column, Style.TabWidth, Encoding);
    Column += FormatTok->ColumnWidth;
  } else {
    FormatTok->IsMultiline = true;
    FormatTok->ColumnWidth = encoding::columnWidthWithTabs(
        Text.substr(0, FirstNewlinePos), Column, Style.TabWidth, Encoding);
    // The last line of the token always starts in column 0.
    FormatTok->LastLineColumnWidth = encoding::columnWidthWithTabs(
        Text.substr(Text.find_last_of('\n') + 1), 0, Style.TabWidth, Encoding);
    Column = FormatTok->LastLineColumnWidth;
  }

  if (Style.isCpp()) {
    if (!(Tokens.size() > 0 && Tokens.back()->Tok.getIdentifierInfo() &&
          Tokens.back()->Tok.getIdentifierInfo()->getPPKeywordID() ==
              tok::pp_define) &&
        std::find(ForEachMacros.begin(), ForEachMacros.end(),
                  FormatTok->Tok.getIdentifierInfo()) != ForEachMacros.end()) {
      FormatTok->Type = TT_ForEachMacro;
    } else if (FormatTok->is(tok::identifier)) {
      if (MacroBlockBeginRegex.match(Text)) {
        FormatTok->Type = TT_MacroBlockBegin;
      } else if (MacroBlockEndRegex.match(Text)) {
        FormatTok->Type = TT_MacroBlockEnd;
      }
    }
  }

  return FormatTok;
}
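
// Thin wrapper around the raw Lexer: reads one token, adjusts a few token
// kinds for formatting purposes, and toggles FormattingDisabled when it sees
// "clang-format off"/"clang-format on" comments.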
void FormatTokenLexer::readRawToken(FormatToken &Tok) {
  Lex->LexFromRawLexer(Tok.Tok);
  Tok.TokenText = StringRef(SourceMgr.getCharacterData(Tok.Tok.getLocation()),
                            Tok.Tok.getLength());
  // For formatting, treat unterminated string literals like normal string
  // literals.
  if (Tok.is(tok::unknown)) {
    if (!Tok.TokenText.empty() && Tok.TokenText[0] == '"') {
      Tok.Tok.setKind(tok::string_literal);
      Tok.IsUnterminatedLiteral = true;
    } else if (Style.Language == FormatStyle::LK_JavaScript &&
               Tok.TokenText == "''") {
      Tok.Tok.setKind(tok::string_literal);
    }
  }

  if (Style.Language == FormatStyle::LK_JavaScript &&
      Tok.is(tok::char_constant)) {
    Tok.Tok.setKind(tok::string_literal);
  }

  if (Tok.is(tok::comment) && (Tok.TokenText == "// clang-format on" ||
                               Tok.TokenText == "/* clang-format on */")) {
    FormattingDisabled = false;
  }

  Tok.Finalized = FormattingDisabled;

  if (Tok.is(tok::comment) && (Tok.TokenText == "// clang-format off" ||
                               Tok.TokenText == "/* clang-format off */")) {
    FormattingDisabled = true;
  }
}
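
// Restarts raw lexing at the given file offset; used after the manual
// re-lexing done for regex literals, template strings and '#' comments.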
void FormatTokenLexer::resetLexer(unsigned Offset) {
  StringRef Buffer = SourceMgr.getBufferData(ID);
  Lex.reset(new Lexer(SourceMgr.getLocForStartOfFile(ID),
                      getFormattingLangOpts(Style), Buffer.begin(),
                      Buffer.begin() + Offset, Buffer.end()));
  Lex->SetKeepWhitespaceMode(true);
  TrailingWhitespace = 0;
}