20 #include "llvm/Support/Regex.h" 29 Column(Column), TrailingWhitespace(0), SourceMgr(SourceMgr), ID(ID),
31 Keywords(IdentTable), Encoding(Encoding), FirstInLineIndex(0),
32 FormattingDisabled(
false), MacroBlockBeginRegex(Style.MacroBlockBegin),
33 MacroBlockEndRegex(Style.MacroBlockEnd) {
36 Lex->SetKeepWhitespaceMode(
true);
38 for (
const std::string &ForEachMacro : Style.ForEachMacros)
39 Macros.insert({&IdentTable.
get(ForEachMacro), TT_ForEachMacro});
40 for (
const std::string &StatementMacro : Style.StatementMacros)
41 Macros.insert({&IdentTable.
get(StatementMacro), TT_StatementMacro});
42 for (
const std::string &TypenameMacro : Style.TypenameMacros)
43 Macros.insert({&IdentTable.
get(TypenameMacro), TT_TypenameMacro});
44 for (
const std::string &NamespaceMacro : Style.NamespaceMacros)
45 Macros.insert({&IdentTable.
get(NamespaceMacro), TT_NamespaceMacro});
49 assert(Tokens.empty());
50 assert(FirstInLineIndex == 0);
52 Tokens.push_back(getNextToken());
53 if (Style.Language == FormatStyle::LK_JavaScript) {
54 tryParseJSRegexLiteral();
55 handleTemplateStrings();
57 if (Style.Language == FormatStyle::LK_TextProto)
58 tryParsePythonComment();
59 tryMergePreviousTokens();
60 if (Tokens.back()->NewlinesBefore > 0 || Tokens.back()->IsMultiline)
61 FirstInLineIndex = Tokens.size() - 1;
62 }
while (Tokens.back()->Tok.isNot(
tok::eof));
66 void FormatTokenLexer::tryMergePreviousTokens() {
67 if (tryMerge_TMacro())
69 if (tryMergeConflictMarkers())
71 if (tryMergeLessLess())
74 if (Style.isCSharp()) {
75 if (tryMergeCSharpKeywordVariables())
77 if (tryMergeCSharpVerbatimStringLiteral())
79 if (tryMergeCSharpDoubleQuestion())
81 if (tryMergeCSharpNullConditionals())
83 static const tok::TokenKind JSRightArrow[] = {tok::equal, tok::greater};
84 if (tryMergeTokens(JSRightArrow, TT_JsFatArrow))
88 if (tryMergeNSStringLiteral())
91 if (Style.Language == FormatStyle::LK_JavaScript) {
92 static const tok::TokenKind JSIdentity[] = {tok::equalequal, tok::equal};
95 static const tok::TokenKind JSShiftEqual[] = {tok::greater, tok::greater,
97 static const tok::TokenKind JSRightArrow[] = {tok::equal, tok::greater};
98 static const tok::TokenKind JSExponentiation[] = {tok::star, tok::star};
103 if (tryMergeTokens(JSIdentity, TT_BinaryOperator))
105 if (tryMergeTokens(JSNotIdentity, TT_BinaryOperator))
107 if (tryMergeTokens(JSShiftEqual, TT_BinaryOperator))
109 if (tryMergeTokens(JSRightArrow, TT_JsFatArrow))
111 if (tryMergeTokens(JSExponentiation, TT_JsExponentiation))
113 if (tryMergeTokens(JSExponentiationEqual, TT_JsExponentiationEqual)) {
114 Tokens.back()->Tok.setKind(tok::starequal);
117 if (tryMergeJSPrivateIdentifier())
121 if (Style.Language == FormatStyle::LK_Java) {
123 tok::greater, tok::greater, tok::greaterequal};
124 if (tryMergeTokens(JavaRightLogicalShiftAssign, TT_BinaryOperator))
// Folds a trailing `@` token and the string literal that follows it into a
// single token typed TT_ObjCStringLiteral.
// NOTE(review): this listing omits some original lines (the embedded numbers
// jump, e.g. 130 -> 132). The statements controlled by the two `if`s below and
// the end of the function are not visible here -- presumably early returns and
// a final `return true`; confirm against the upstream file.
129 bool FormatTokenLexer::tryMergeNSStringLiteral() {
// Two tokens are needed: the `@` and the string literal.
130 if (Tokens.size() < 2)
132 auto &At = *(Tokens.end() - 2);
133 auto &String = *(Tokens.end() - 1);
// Only an `@` immediately followed by a string literal qualifies.
134 if (!At->is(tok::at) || !String->is(tok::string_literal))
// Reuse the `@` token as the merged token: it becomes a string_literal whose
// text starts at the `@` and runs through the end of the string's text.
136 At->Tok.setKind(tok::string_literal);
137 At->TokenText = StringRef(At->TokenText.begin(),
138 String->TokenText.end() - At->TokenText.begin());
139 At->ColumnWidth += String->ColumnWidth;
140 At->Type = TT_ObjCStringLiteral;
// Drop the now-redundant string token (the last element).
141 Tokens.erase(Tokens.end() - 1);
// Folds a `#` token and the identifier after it into one identifier token
// typed TT_JsPrivateIdentifier.
// NOTE(review): several original lines are missing from this listing. The
// declaration of `Identifier` is not visible -- presumably
// `*(Tokens.end() - 1)`, by analogy with `Hash`; confirm upstream. The
// statements controlled by the `if`s are also not visible.
145 bool FormatTokenLexer::tryMergeJSPrivateIdentifier() {
// Two tokens are needed: the `#` and the identifier.
148 if (Tokens.size() < 2)
150 auto &Hash = *(Tokens.end() - 2);
// Only a `#` followed by an identifier qualifies.
152 if (!Hash->is(tok::hash) || !
Identifier->is(tok::identifier))
154 Hash->Tok.setKind(tok::identifier);
// The StringRef below spans from the start of `#` to the end of the
// identifier's text. NOTE(review): the left-hand side of this assignment is
// not visible here -- presumably `Hash->TokenText =`; confirm upstream.
156 StringRef(Hash->TokenText.begin(),
157 Identifier->TokenText.end() - Hash->TokenText.begin());
159 Hash->Type = TT_JsPrivateIdentifier;
// Drop the last token; `Hash` now represents the merged identifier.
160 Tokens.erase(Tokens.end() - 1);
// Merges the tokens of a prefixed C# string literal into a single
// string_literal token typed TT_CSharpStringLiteral. Two shapes are handled:
// a two-token form (`@` or `$` followed by a string literal) and a three-token
// form (`$`, `@`, string literal).
// NOTE(review): this listing omits some original lines (the embedded numbers
// jump, e.g. 168 -> 170). The statements controlled by the guard `if`s and the
// function's closing lines are not visible here -- presumably early returns;
// confirm against the upstream file.
167 bool FormatTokenLexer::tryMergeCSharpVerbatimStringLiteral() {
// Two tokens are the minimum: a prefix and the string literal.
168 if (Tokens.size() < 2)
170 auto &At = *(Tokens.end() - 2);
171 auto &String = *(Tokens.end() - 1);
// The prefix must be `@` or `$`, and it must be followed by a string literal.
174 if (!(At->is(tok::at) || At->TokenText ==
"$") ||
175 !String->is(tok::string_literal))
// The three-token probe reads Tokens.end() - 3, so at least three tokens must
// be present. A `>= 2` check is not sufficient: with exactly two tokens the
// read would land before Tokens.begin().
178 if (Tokens.size() > 2 && At->is(tok::at)) {
179 auto &Dollar = *(Tokens.end() - 3);
180 if (Dollar->TokenText ==
"$") {
// `$` + `@` + string: make the `$` token the merged literal, covering the text
// from the `$` through the end of the string.
182 Dollar->Tok.setKind(tok::string_literal);
184 StringRef(Dollar->TokenText.begin(),
185 String->TokenText.end() - Dollar->TokenText.begin());
186 Dollar->ColumnWidth += (At->ColumnWidth + String->ColumnWidth);
187 Dollar->Type = TT_CSharpStringLiteral;
// Remove the `@` token, then the string token; `Dollar` remains as the last
// element.
188 Tokens.erase(Tokens.end() - 2);
189 Tokens.erase(Tokens.end() - 1);
// Two-token form: the prefix token becomes the merged literal.
195 At->Tok.setKind(tok::string_literal);
196 At->TokenText = StringRef(At->TokenText.begin(),
197 String->TokenText.end() - At->TokenText.begin());
198 At->ColumnWidth += String->ColumnWidth;
199 At->Type = TT_CSharpStringLiteral;
// Drop the string token (the last element).
200 Tokens.erase(Tokens.end() - 1);
// Merges two consecutive `?` tokens into one token typed
// TT_CSharpNullCoalescing.
// NOTE(review): this listing omits some original lines (205 -> 207,
// 209 -> 211). The statements controlled by the two `if`s and the end of the
// function are not visible -- presumably early returns and a final
// `return true`; confirm against the upstream file.
204 bool FormatTokenLexer::tryMergeCSharpDoubleQuestion() {
// Two tokens are needed.
205 if (Tokens.size() < 2)
207 auto &FirstQuestion = *(Tokens.end() - 2);
208 auto &SecondQuestion = *(Tokens.end() - 1);
// Both of the last two tokens must be `?`.
209 if (!FirstQuestion->is(tok::question) || !SecondQuestion->is(tok::question))
// The token kind stays tok::question (the guard above already required it);
// the merged token is distinguished by its Type, set below.
211 FirstQuestion->Tok.setKind(tok::question);
// Extend the first token's text through the end of the second `?`.
212 FirstQuestion->TokenText = StringRef(FirstQuestion->TokenText.begin(),
213 SecondQuestion->TokenText.end() -
214 FirstQuestion->TokenText.begin());
215 FirstQuestion->ColumnWidth += SecondQuestion->ColumnWidth;
216 FirstQuestion->Type = TT_CSharpNullCoalescing;
// Drop the second `?` (the last element).
217 Tokens.erase(Tokens.end() - 1);
// Merges an `@` token with the token after it into a single identifier token,
// keeping the second token's Type.
// NOTE(review): original lines 227-230 are missing from this listing, so any
// further condition on `Keyword` (the name suggests a check that it is a C#
// keyword) is not visible here -- confirm against the upstream file. The
// statements controlled by the visible `if`s are likewise not shown.
221 bool FormatTokenLexer::tryMergeCSharpKeywordVariables() {
// Two tokens are needed.
222 if (Tokens.size() < 2)
224 auto &At = *(Tokens.end() - 2);
225 auto &Keyword = *(Tokens.end() - 1);
// The first of the two must be `@`.
226 if (!At->is(tok::at))
// The `@` token becomes the merged identifier, spanning from the `@` through
// the end of the second token's text.
231 At->Tok.setKind(tok::identifier);
232 At->TokenText = StringRef(At->TokenText.begin(),
233 Keyword->TokenText.end() - At->TokenText.begin());
234 At->ColumnWidth += Keyword->ColumnWidth;
// The merged token takes over the Type of the token it absorbed.
235 At->Type = Keyword->Type;
// Drop the absorbed token (the last element).
236 Tokens.erase(Tokens.end() - 1);
241 bool FormatTokenLexer::tryMergeCSharpNullConditionals() {
242 if (Tokens.size() < 2)
245 auto &Question = *(Tokens.end() - 1);
246 if (!
Identifier->isOneOf(tok::r_square, tok::identifier) ||
247 !Question->is(tok::question))
251 Question->TokenText.end() -
Identifier->TokenText.begin());
252 Identifier->ColumnWidth += Question->ColumnWidth;
253 Tokens.erase(Tokens.end() - 1);
// Rewrites two adjacent `<` tokens as a single `<<` (tok::lessless) token.
// It inspects the last three tokens, plus a fourth from the end when present.
// NOTE(review): this listing omits some original lines (e.g. 259 -> 262,
// 268 -> 272). The statements controlled by the guard `if`s and the end of the
// function are not visible -- presumably early returns and a final
// `return true`; confirm against the upstream file.
257 bool FormatTokenLexer::tryMergeLessLess() {
// Three tokens are needed: `<`, `<`, and the token that follows them.
259 if (Tokens.size() < 3)
// Record whether the token just before the candidate pair is also `<`.
262 bool FourthTokenIsLess =
false;
263 if (Tokens.size() > 3)
264 FourthTokenIsLess = (Tokens.end() - 4)[0]->is(tok::less);
// First[0] and First[1] are the candidate pair; First[2] is the newest token.
266 auto First = Tokens.end() - 3;
// The guard fires unless the pair is exactly two `<`: First[0] and First[1]
// are `<`, First[2] is not `<`, and the token before First[0] is not `<`.
267 if (
First[2]->is(tok::less) ||
First[1]->isNot(tok::less) ||
268 First[0]->isNot(tok::less) || FourthTokenIsLess)
// The guard also fires when the second `<` has a non-empty WhitespaceRange
// (its begin and end differ).
272 if (
First[1]->WhitespaceRange.getBegin() !=
273 First[1]->WhitespaceRange.getEnd())
// First[0] becomes the `<<` token, one column wider than the single `<`.
276 First[0]->Tok.setKind(tok::lessless);
277 First[0]->TokenText =
"<<";
278 First[0]->ColumnWidth += 1;
// Remove First[1] (the second `<`); the newest token stays last.
279 Tokens.erase(Tokens.end() - 2);
285 if (Tokens.size() < Kinds.size())
289 Tokens.end() - Kinds.size();
290 if (!First[0]->is(Kinds[0]))
292 unsigned AddLength = 0;
293 for (
unsigned i = 1;
i < Kinds.size(); ++
i) {
294 if (!First[
i]->is(Kinds[
i]) || First[
i]->WhitespaceRange.getBegin() !=
295 First[
i]->WhitespaceRange.getEnd())
297 AddLength += First[
i]->TokenText.size();
299 Tokens.resize(Tokens.size() - Kinds.size() + 1);
300 First[0]->TokenText = StringRef(First[0]->TokenText.data(),
301 First[0]->TokenText.size() + AddLength);
302 First[0]->ColumnWidth += AddLength;
303 First[0]->Type = NewType;
312 return Tok->
isOneOf(tok::period, tok::l_paren, tok::comma, tok::l_brace,
313 tok::r_brace, tok::l_square, tok::semi, tok::exclaim,
314 tok::colon, tok::question, tok::tilde) ||
315 Tok->
isOneOf(tok::kw_return, tok::kw_do, tok::kw_case, tok::kw_throw,
316 tok::kw_else, tok::kw_new, tok::kw_delete, tok::kw_void,
321 bool FormatTokenLexer::canPrecedeRegexLiteral(
FormatToken *Prev) {
331 if (Prev->
isOneOf(tok::plusplus, tok::minusminus, tok::exclaim))
332 return (Tokens.size() < 3 || precedesOperand(Tokens[Tokens.size() - 3]));
336 if (!precedesOperand(Prev))
346 void FormatTokenLexer::tryParseJSRegexLiteral() {
348 if (!RegexToken->
isOneOf(tok::slash, tok::slashequal))
352 for (
auto I = Tokens.rbegin() + 1, E = Tokens.rend(); I != E; ++I) {
355 if ((*I)->isNot(tok::comment)) {
361 if (!canPrecedeRegexLiteral(Prev))
365 const char *
Offset = Lex->getBufferLocation();
366 const char *RegexBegin = Offset - RegexToken->
TokenText.size();
367 StringRef Buffer = Lex->getBuffer();
368 bool InCharacterClass =
false;
369 bool HaveClosingSlash =
false;
370 for (; !HaveClosingSlash && Offset != Buffer.end(); ++
Offset) {
380 InCharacterClass =
true;
383 InCharacterClass =
false;
386 if (!InCharacterClass)
387 HaveClosingSlash =
true;
392 RegexToken->
Type = TT_RegexLiteral;
395 RegexToken->
TokenText = StringRef(RegexBegin, Offset - RegexBegin);
398 resetLexer(SourceMgr.
getFileOffset(Lex->getSourceLocation(Offset)));
401 void FormatTokenLexer::handleTemplateStrings() {
404 if (BacktickToken->
is(tok::l_brace)) {
408 if (BacktickToken->
is(tok::r_brace)) {
409 if (StateStack.size() == 1)
415 }
else if (BacktickToken->
is(tok::unknown) &&
423 const char *
Offset = Lex->getBufferLocation();
424 const char *TmplBegin = Offset - BacktickToken->
TokenText.size();
425 for (; Offset != Lex->getBuffer().end(); ++
Offset) {
426 if (Offset[0] ==
'`') {
430 if (Offset[0] ==
'\\') {
432 }
else if (Offset + 1 < Lex->getBuffer().end() && Offset[0] ==
'$' &&
441 StringRef LiteralText(TmplBegin, Offset - TmplBegin + 1);
442 BacktickToken->
Type = TT_TemplateString;
443 BacktickToken->
Tok.
setKind(tok::string_literal);
447 size_t FirstBreak = LiteralText.find(
'\n');
448 StringRef FirstLineText = FirstBreak == StringRef::npos
450 : LiteralText.substr(0, FirstBreak);
452 FirstLineText, BacktickToken->
OriginalColumn, Style.TabWidth, Encoding);
453 size_t LastBreak = LiteralText.rfind(
'\n');
454 if (LastBreak != StringRef::npos) {
456 unsigned StartColumn = 0;
458 LiteralText.substr(LastBreak + 1, LiteralText.size()), StartColumn,
459 Style.TabWidth, Encoding);
463 ? Lex->getSourceLocation(Offset + 1)
// Turns a `#` or `##` token into a line comment (TT_LineComment) whose text
// runs from the hash to the next '\n', or to the end of the buffer when there
// is no newline.
// NOTE(review): this listing omits some original lines. The declaration of
// `HashToken`, the statement controlled by the first `if`, and the code around
// the trailing `? Lex->getSourceLocation(...)` fragment are not visible --
// confirm against the upstream file.
468 void FormatTokenLexer::tryParsePythonComment() {
// Only `#` and `##` tokens are candidates.
470 if (!HashToken->
isOneOf(tok::hash, tok::hashhash))
// The lexer's current buffer location minus the token's length gives the
// address where the hash token starts.
473 const char *CommentBegin =
474 Lex->getBufferLocation() - HashToken->
TokenText.size();
// From/To are offsets into the lexer buffer: comment start, and the first
// '\n' at or after it.
475 size_t From = CommentBegin - Lex->getBuffer().begin();
476 size_t To = Lex->getBuffer().find_first_of(
'\n', From);
// No newline found: the comment extends to the end of the buffer.
477 if (To == StringRef::npos)
478 To = Lex->getBuffer().size();
479 size_t Len = To - From;
480 HashToken->
Type = TT_LineComment;
// The token's text becomes the comment text; it stops before the '\n'.
482 HashToken->
TokenText = Lex->getBuffer().substr(From, Len);
484 ? Lex->getSourceLocation(CommentBegin + Len)
489 bool FormatTokenLexer::tryMerge_TMacro() {
490 if (Tokens.size() < 4)
493 if (!Last->
is(tok::r_paren))
500 if (!Tokens[Tokens.size() - 3]->is(tok::l_paren))
507 const char *Start = Macro->
TokenText.data();
509 String->
TokenText = StringRef(Start, End - Start);
522 Tokens.back() = String;
// Detects a version-control conflict marker at the start of the current line
// and, when one is found, truncates the token list to the line's first token
// and tags that token with the matching conflict Type.
// NOTE(review): many original lines are missing from this listing (e.g.
// 527 -> 541, 547 -> 553). The declarations of `LineStart`, `Type` and `Next`,
// the statements controlled by several `if`s, and the end of the function are
// not visible -- confirm details against the upstream file.
526 bool FormatTokenLexer::tryMergeConflictMarkers() {
// This guard tests for a newest token that is neither preceded by a newline
// nor EOF (the statement it controls is not visible).
527 if (Tokens.back()->NewlinesBefore == 0 && Tokens.back()->isNot(
tok::eof))
541 unsigned FirstInLineOffset;
543 Tokens[FirstInLineIndex]->getStartOfNonWhitespace());
544 StringRef Buffer = SourceMgr.
getBuffer(ID)->getBuffer();
// Search backwards from the first token of the line for the preceding '\n'.
546 auto LineOffset = Buffer.rfind(
'\n', FirstInLineOffset);
547 if (LineOffset == StringRef::npos) {
// The candidate marker ends at the first space or newline at or after
// LineOffset, or at the end of the buffer if neither occurs.
553 auto FirstSpace = Buffer.find_first_of(
" \n", LineOffset);
555 if (FirstSpace == StringRef::npos) {
556 LineStart = Buffer.substr(LineOffset);
558 LineStart = Buffer.substr(LineOffset, FirstSpace - LineOffset);
// Classify the marker. Both a 7-character and a 4-character spelling are
// accepted for each role; note that ">>>>" is a start marker and "<<<<" an
// end marker in the 4-character spelling.
562 if (LineStart ==
"<<<<<<<" || LineStart ==
">>>>") {
563 Type = TT_ConflictStart;
564 }
else if (LineStart ==
"|||||||" || LineStart ==
"=======" ||
565 LineStart ==
"====") {
566 Type = TT_ConflictAlternative;
567 }
else if (LineStart ==
">>>>>>>" || LineStart ==
"<<<<") {
568 Type = TT_ConflictEnd;
// A marker was recognized: keep tokens only up to the first token of this
// line, retag that token, then append `Next`.
571 if (Type != TT_Unknown) {
574 Tokens.resize(FirstInLineIndex + 1);
578 Tokens.back()->Type = Type;
579 Tokens.back()->Tok.setKind(tok::kw___unknown_anytype);
581 Tokens.push_back(Next);
591 StringRef TokenText = FormatTok->
TokenText;
594 FormatTok =
new (Allocator.Allocate())
FormatToken;
610 return getStashedToken();
613 FormatTok =
new (Allocator.Allocate())
FormatToken;
614 readRawToken(*FormatTok);
617 FormatTok->
IsFirst = IsFirstToken;
618 IsFirstToken =
false;
621 unsigned WhitespaceLength = TrailingWhitespace;
622 while (FormatTok->
Tok.
is(tok::unknown)) {
624 auto EscapesNewline = [&](
int pos) {
626 if (pos >= 0 && Text[pos] ==
'\r')
633 for (; pos >= 0; --pos, ++count)
634 if (Text[pos] !=
'\\')
640 for (
int i = 0, e = Text.size();
i != e; ++
i) {
660 Column += Style.TabWidth - Column % Style.TabWidth;
663 if (i + 1 == e || (Text[i + 1] !=
'\r' && Text[i + 1] !=
'\n'))
664 FormatTok->
Type = TT_ImplicitStringLiteral;
667 FormatTok->
Type = TT_ImplicitStringLiteral;
670 if (FormatTok->
Type == TT_ImplicitStringLiteral)
674 if (FormatTok->
is(TT_ImplicitStringLiteral))
678 readRawToken(*FormatTok);
688 if ((Style.Language == FormatStyle::LK_JavaScript ||
689 Style.Language == FormatStyle::LK_Java) &&
690 FormatTok->
is(tok::comment) && FormatTok->
TokenText.startswith(
"//")) {
691 size_t BackslashPos = FormatTok->
TokenText.find(
'\\');
692 while (BackslashPos != StringRef::npos) {
693 if (BackslashPos + 1 < FormatTok->
TokenText.size() &&
694 FormatTok->
TokenText[BackslashPos + 1] ==
'\n') {
695 const char *
Offset = Lex->getBufferLocation();
697 Offset += BackslashPos + 1;
698 resetLexer(SourceMgr.
getFileOffset(Lex->getSourceLocation(Offset)));
705 BackslashPos = FormatTok->
TokenText.find(
'\\', BackslashPos + 1);
714 unsigned SkippedWhitespace = 0;
717 SkippedWhitespace = 3;
718 else if (FormatTok->
TokenText[1] ==
'\n')
719 SkippedWhitespace = 2;
724 WhitespaceLength += SkippedWhitespace;
735 TrailingWhitespace = 0;
736 if (FormatTok->
Tok.
is(tok::comment)) {
738 StringRef UntrimmedText = FormatTok->
TokenText;
740 TrailingWhitespace = UntrimmedText.size() - FormatTok->
TokenText.size();
741 }
else if (FormatTok->
Tok.
is(tok::raw_identifier)) {
745 if (Style.Language == FormatStyle::LK_Java &&
746 FormatTok->
isOneOf(tok::kw_struct, tok::kw_union, tok::kw_delete,
750 }
else if (Style.Language == FormatStyle::LK_JavaScript &&
751 FormatTok->
isOneOf(tok::kw_struct, tok::kw_union,
756 }
else if (FormatTok->
Tok.
is(tok::greatergreater)) {
761 }
else if (FormatTok->
Tok.
is(tok::lessless)) {
771 size_t FirstNewlinePos = Text.find(
'\n');
772 if (FirstNewlinePos == StringRef::npos) {
783 Text.substr(0, FirstNewlinePos), Column, Style.TabWidth, Encoding);
788 Text.substr(Text.find_last_of(
'\n') + 1), 0, Style.TabWidth, Encoding);
794 if (!(Tokens.size() > 0 && Tokens.back()->Tok.getIdentifierInfo() &&
795 Tokens.back()->Tok.getIdentifierInfo()->getPPKeywordID() ==
797 it != Macros.end()) {
798 FormatTok->
Type = it->second;
799 }
else if (FormatTok->
is(tok::identifier)) {
800 if (MacroBlockBeginRegex.match(Text)) {
801 FormatTok->
Type = TT_MacroBlockBegin;
802 }
else if (MacroBlockEndRegex.match(Text)) {
803 FormatTok->
Type = TT_MacroBlockEnd;
// Lexes the next raw token into `Tok` and applies post-processing that depends
// on the token kind and the configured language, including tracking of the
// "clang-format on/off" marker comments.
// NOTE(review): many original lines are missing from this listing (e.g.
// 812 -> 817, 821 -> 827), so the bodies of most branches below are not
// visible -- confirm details against the upstream file.
811 void FormatTokenLexer::readRawToken(
FormatToken &Tok) {
812 Lex->LexFromRawLexer(Tok.
Tok);
// Branch for tokens the raw lexer reports as tok::unknown (body not visible).
817 if (Tok.
is(tok::unknown)) {
821 }
else if (Style.Language == FormatStyle::LK_JavaScript &&
// Branch for char_constant tokens in JavaScript, Proto and TextProto (body not
// visible).
827 if ((Style.Language == FormatStyle::LK_JavaScript ||
828 Style.Language == FormatStyle::LK_Proto ||
829 Style.Language == FormatStyle::LK_TextProto) &&
830 Tok.
is(tok::char_constant)) {
// A comment whose text is exactly one of the two "on" spellings clears
// FormattingDisabled.
834 if (Tok.
is(tok::comment) && (Tok.
TokenText ==
"// clang-format on" ||
835 Tok.
TokenText ==
"/* clang-format on */")) {
836 FormattingDisabled =
false;
// A comment whose text is exactly one of the two "off" spellings sets
// FormattingDisabled.
841 if (Tok.
is(tok::comment) && (Tok.
TokenText ==
"// clang-format off" ||
842 Tok.
TokenText ==
"/* clang-format off */")) {
843 FormattingDisabled =
true;
847 void FormatTokenLexer::resetLexer(
unsigned Offset) {
851 Buffer.begin() +
Offset, Buffer.end()));
852 Lex->SetKeepWhitespaceMode(
true);
853 TrailingWhitespace = 0;
SourceLocation getLocForStartOfFile(FileID FID) const
Return the source location corresponding to the first byte of the specified file. ...
Lexer - This provides a simple interface that turns a text buffer into a stream of tokens...
SourceLocation getLocWithOffset(int Offset) const
Return a source location with the specified offset from this SourceLocation.
SourceLocation getLocForEndOfFile(FileID FID) const
Return the source location corresponding to the last byte of the specified file.
bool is(tok::TokenKind K) const
is/isNot - Predicates to check if this token is a specific kind, as in "if (Tok.is(tok::l_brace)) {...
Defines the SourceManager interface.
const char * getCharacterData(SourceLocation SL, bool *Invalid=nullptr) const
Return a pointer to the start of the specified location in the appropriate spelling MemoryBuffer...
The base class of the type hierarchy.
StringRef getBufferData(FileID FID, bool *Invalid=nullptr) const
Return a StringRef to the source buffer data for the specified FileID.
One of these records is kept for each identifier that is lexed.
Token - This structure provides full information about a lexed token.
void setKind(tok::TokenKind K)
tok::TokenKind getTokenID() const
If this is a source-language token (e.g. 'for'), this API can be used to cause the lexer to map identifiers to source-language tokens.
SourceLocation getLocation() const
Return a source location identifier for the specified offset in the current file. ...
unsigned getFileOffset(SourceLocation SpellingLoc) const
Returns the offset from the start of the file that the specified SourceLocation represents.
Encodes a location in the source.
IdentifierInfo & get(StringRef Name)
Return the identifier token info for the specified named identifier.
IdentifierInfo * getIdentifierInfo() const
void setIdentifierInfo(IdentifierInfo *II)
TokenKind
Provides a simple uniform namespace for tokens from all C languages.
An opaque identifier used by SourceManager which refers to a source file (MemoryBuffer) along with it...
const llvm::MemoryBuffer * getBuffer(FileID FID, SourceLocation Loc, bool *Invalid=nullptr) const
Return the buffer for the specified FileID.
Dataflow Directional Tag Classes.
unsigned getLength() const
Defines the clang::SourceLocation class and associated facilities.
void setLocation(SourceLocation L)
A trivial tuple used to represent a source range.
This class handles loading and caching of source files into memory.
std::pair< FileID, unsigned > getDecomposedLoc(SourceLocation Loc) const
Decompose the specified location into a raw FileID + Offset pair.