clang  9.0.0
FormatTokenLexer.cpp
Go to the documentation of this file.
1 //===--- FormatTokenLexer.cpp - Lex FormatTokens -------------*- C++ ----*-===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 ///
9 /// \file
10 /// This file implements FormatTokenLexer, which tokenizes a source file
11 /// into a FormatToken stream suitable for ClangFormat.
12 ///
13 //===----------------------------------------------------------------------===//
14 
15 #include "FormatTokenLexer.h"
16 #include "FormatToken.h"
19 #include "clang/Format/Format.h"
20 #include "llvm/Support/Regex.h"
21 
22 namespace clang {
23 namespace format {
24 
26  unsigned Column, const FormatStyle &Style,
28  : FormatTok(nullptr), IsFirstToken(true), StateStack({LexerState::NORMAL}),
29  Column(Column), TrailingWhitespace(0), SourceMgr(SourceMgr), ID(ID),
30  Style(Style), IdentTable(getFormattingLangOpts(Style)),
31  Keywords(IdentTable), Encoding(Encoding), FirstInLineIndex(0),
32  FormattingDisabled(false), MacroBlockBeginRegex(Style.MacroBlockBegin),
33  MacroBlockEndRegex(Style.MacroBlockEnd) {
34  Lex.reset(new Lexer(ID, SourceMgr.getBuffer(ID), SourceMgr,
35  getFormattingLangOpts(Style)));
36  Lex->SetKeepWhitespaceMode(true);
37 
38  for (const std::string &ForEachMacro : Style.ForEachMacros)
39  Macros.insert({&IdentTable.get(ForEachMacro), TT_ForEachMacro});
40  for (const std::string &StatementMacro : Style.StatementMacros)
41  Macros.insert({&IdentTable.get(StatementMacro), TT_StatementMacro});
42  for (const std::string &TypenameMacro : Style.TypenameMacros)
43  Macros.insert({&IdentTable.get(TypenameMacro), TT_TypenameMacro});
44  for (const std::string &NamespaceMacro : Style.NamespaceMacros)
45  Macros.insert({&IdentTable.get(NamespaceMacro), TT_NamespaceMacro});
46 }
47 
49  assert(Tokens.empty());
50  assert(FirstInLineIndex == 0);
51  do {
52  Tokens.push_back(getNextToken());
53  if (Style.Language == FormatStyle::LK_JavaScript) {
54  tryParseJSRegexLiteral();
55  handleTemplateStrings();
56  }
57  if (Style.Language == FormatStyle::LK_TextProto)
58  tryParsePythonComment();
59  tryMergePreviousTokens();
60  if (Tokens.back()->NewlinesBefore > 0 || Tokens.back()->IsMultiline)
61  FirstInLineIndex = Tokens.size() - 1;
62  } while (Tokens.back()->Tok.isNot(tok::eof));
63  return Tokens;
64 }
65 
66 void FormatTokenLexer::tryMergePreviousTokens() {
67  if (tryMerge_TMacro())
68  return;
69  if (tryMergeConflictMarkers())
70  return;
71  if (tryMergeLessLess())
72  return;
73 
74  if (Style.isCSharp()) {
75  if (tryMergeCSharpKeywordVariables())
76  return;
77  if (tryMergeCSharpVerbatimStringLiteral())
78  return;
79  if (tryMergeCSharpDoubleQuestion())
80  return;
81  if (tryMergeCSharpNullConditionals())
82  return;
83  static const tok::TokenKind JSRightArrow[] = {tok::equal, tok::greater};
84  if (tryMergeTokens(JSRightArrow, TT_JsFatArrow))
85  return;
86  }
87 
88  if (tryMergeNSStringLiteral())
89  return;
90 
91  if (Style.Language == FormatStyle::LK_JavaScript) {
92  static const tok::TokenKind JSIdentity[] = {tok::equalequal, tok::equal};
93  static const tok::TokenKind JSNotIdentity[] = {tok::exclaimequal,
94  tok::equal};
95  static const tok::TokenKind JSShiftEqual[] = {tok::greater, tok::greater,
96  tok::greaterequal};
97  static const tok::TokenKind JSRightArrow[] = {tok::equal, tok::greater};
98  static const tok::TokenKind JSExponentiation[] = {tok::star, tok::star};
99  static const tok::TokenKind JSExponentiationEqual[] = {tok::star,
100  tok::starequal};
101 
102  // FIXME: Investigate what token type gives the correct operator priority.
103  if (tryMergeTokens(JSIdentity, TT_BinaryOperator))
104  return;
105  if (tryMergeTokens(JSNotIdentity, TT_BinaryOperator))
106  return;
107  if (tryMergeTokens(JSShiftEqual, TT_BinaryOperator))
108  return;
109  if (tryMergeTokens(JSRightArrow, TT_JsFatArrow))
110  return;
111  if (tryMergeTokens(JSExponentiation, TT_JsExponentiation))
112  return;
113  if (tryMergeTokens(JSExponentiationEqual, TT_JsExponentiationEqual)) {
114  Tokens.back()->Tok.setKind(tok::starequal);
115  return;
116  }
117  if (tryMergeJSPrivateIdentifier())
118  return;
119  }
120 
121  if (Style.Language == FormatStyle::LK_Java) {
122  static const tok::TokenKind JavaRightLogicalShiftAssign[] = {
123  tok::greater, tok::greater, tok::greaterequal};
124  if (tryMergeTokens(JavaRightLogicalShiftAssign, TT_BinaryOperator))
125  return;
126  }
127 }
128 
129 bool FormatTokenLexer::tryMergeNSStringLiteral() {
130  if (Tokens.size() < 2)
131  return false;
132  auto &At = *(Tokens.end() - 2);
133  auto &String = *(Tokens.end() - 1);
134  if (!At->is(tok::at) || !String->is(tok::string_literal))
135  return false;
136  At->Tok.setKind(tok::string_literal);
137  At->TokenText = StringRef(At->TokenText.begin(),
138  String->TokenText.end() - At->TokenText.begin());
139  At->ColumnWidth += String->ColumnWidth;
140  At->Type = TT_ObjCStringLiteral;
141  Tokens.erase(Tokens.end() - 1);
142  return true;
143 }
144 
145 bool FormatTokenLexer::tryMergeJSPrivateIdentifier() {
146  // Merges #idenfier into a single identifier with the text #identifier
147  // but the token tok::identifier.
148  if (Tokens.size() < 2)
149  return false;
150  auto &Hash = *(Tokens.end() - 2);
151  auto &Identifier = *(Tokens.end() - 1);
152  if (!Hash->is(tok::hash) || !Identifier->is(tok::identifier))
153  return false;
154  Hash->Tok.setKind(tok::identifier);
155  Hash->TokenText =
156  StringRef(Hash->TokenText.begin(),
157  Identifier->TokenText.end() - Hash->TokenText.begin());
158  Hash->ColumnWidth += Identifier->ColumnWidth;
159  Hash->Type = TT_JsPrivateIdentifier;
160  Tokens.erase(Tokens.end() - 1);
161  return true;
162 }
163 
164 // Search for verbatim or interpolated string literals @"ABC" or
165 // $"aaaaa{abc}aaaaa" i and mark the token as TT_CSharpStringLiteral, and to
166 // prevent splitting of @, $ and ".
167 bool FormatTokenLexer::tryMergeCSharpVerbatimStringLiteral() {
168  if (Tokens.size() < 2)
169  return false;
170  auto &At = *(Tokens.end() - 2);
171  auto &String = *(Tokens.end() - 1);
172 
173  // Look for $"aaaaaa" @"aaaaaa".
174  if (!(At->is(tok::at) || At->TokenText == "$") ||
175  !String->is(tok::string_literal))
176  return false;
177 
178  if (Tokens.size() >= 2 && At->is(tok::at)) {
179  auto &Dollar = *(Tokens.end() - 3);
180  if (Dollar->TokenText == "$") {
181  // This looks like $@"aaaaa" so we need to combine all 3 tokens.
182  Dollar->Tok.setKind(tok::string_literal);
183  Dollar->TokenText =
184  StringRef(Dollar->TokenText.begin(),
185  String->TokenText.end() - Dollar->TokenText.begin());
186  Dollar->ColumnWidth += (At->ColumnWidth + String->ColumnWidth);
187  Dollar->Type = TT_CSharpStringLiteral;
188  Tokens.erase(Tokens.end() - 2);
189  Tokens.erase(Tokens.end() - 1);
190  return true;
191  }
192  }
193 
194  // Convert back into just a string_literal.
195  At->Tok.setKind(tok::string_literal);
196  At->TokenText = StringRef(At->TokenText.begin(),
197  String->TokenText.end() - At->TokenText.begin());
198  At->ColumnWidth += String->ColumnWidth;
199  At->Type = TT_CSharpStringLiteral;
200  Tokens.erase(Tokens.end() - 1);
201  return true;
202 }
203 
204 bool FormatTokenLexer::tryMergeCSharpDoubleQuestion() {
205  if (Tokens.size() < 2)
206  return false;
207  auto &FirstQuestion = *(Tokens.end() - 2);
208  auto &SecondQuestion = *(Tokens.end() - 1);
209  if (!FirstQuestion->is(tok::question) || !SecondQuestion->is(tok::question))
210  return false;
211  FirstQuestion->Tok.setKind(tok::question);
212  FirstQuestion->TokenText = StringRef(FirstQuestion->TokenText.begin(),
213  SecondQuestion->TokenText.end() -
214  FirstQuestion->TokenText.begin());
215  FirstQuestion->ColumnWidth += SecondQuestion->ColumnWidth;
216  FirstQuestion->Type = TT_CSharpNullCoalescing;
217  Tokens.erase(Tokens.end() - 1);
218  return true;
219 }
220 
221 bool FormatTokenLexer::tryMergeCSharpKeywordVariables() {
222  if (Tokens.size() < 2)
223  return false;
224  auto &At = *(Tokens.end() - 2);
225  auto &Keyword = *(Tokens.end() - 1);
226  if (!At->is(tok::at))
227  return false;
228  if (!Keywords.isCSharpKeyword(*Keyword))
229  return false;
230 
231  At->Tok.setKind(tok::identifier);
232  At->TokenText = StringRef(At->TokenText.begin(),
233  Keyword->TokenText.end() - At->TokenText.begin());
234  At->ColumnWidth += Keyword->ColumnWidth;
235  At->Type = Keyword->Type;
236  Tokens.erase(Tokens.end() - 1);
237  return true;
238 }
239 
240 // In C# merge the Identifier and the ? together e.g. arg?.
241 bool FormatTokenLexer::tryMergeCSharpNullConditionals() {
242  if (Tokens.size() < 2)
243  return false;
244  auto &Identifier = *(Tokens.end() - 2);
245  auto &Question = *(Tokens.end() - 1);
246  if (!Identifier->isOneOf(tok::r_square, tok::identifier) ||
247  !Question->is(tok::question))
248  return false;
249  Identifier->TokenText =
250  StringRef(Identifier->TokenText.begin(),
251  Question->TokenText.end() - Identifier->TokenText.begin());
252  Identifier->ColumnWidth += Question->ColumnWidth;
253  Tokens.erase(Tokens.end() - 1);
254  return true;
255 }
256 
257 bool FormatTokenLexer::tryMergeLessLess() {
258  // Merge X,less,less,Y into X,lessless,Y unless X or Y is less.
259  if (Tokens.size() < 3)
260  return false;
261 
262  bool FourthTokenIsLess = false;
263  if (Tokens.size() > 3)
264  FourthTokenIsLess = (Tokens.end() - 4)[0]->is(tok::less);
265 
266  auto First = Tokens.end() - 3;
267  if (First[2]->is(tok::less) || First[1]->isNot(tok::less) ||
268  First[0]->isNot(tok::less) || FourthTokenIsLess)
269  return false;
270 
271  // Only merge if there currently is no whitespace between the two "<".
272  if (First[1]->WhitespaceRange.getBegin() !=
273  First[1]->WhitespaceRange.getEnd())
274  return false;
275 
276  First[0]->Tok.setKind(tok::lessless);
277  First[0]->TokenText = "<<";
278  First[0]->ColumnWidth += 1;
279  Tokens.erase(Tokens.end() - 2);
280  return true;
281 }
282 
283 bool FormatTokenLexer::tryMergeTokens(ArrayRef<tok::TokenKind> Kinds,
284  TokenType NewType) {
285  if (Tokens.size() < Kinds.size())
286  return false;
287 
289  Tokens.end() - Kinds.size();
290  if (!First[0]->is(Kinds[0]))
291  return false;
292  unsigned AddLength = 0;
293  for (unsigned i = 1; i < Kinds.size(); ++i) {
294  if (!First[i]->is(Kinds[i]) || First[i]->WhitespaceRange.getBegin() !=
295  First[i]->WhitespaceRange.getEnd())
296  return false;
297  AddLength += First[i]->TokenText.size();
298  }
299  Tokens.resize(Tokens.size() - Kinds.size() + 1);
300  First[0]->TokenText = StringRef(First[0]->TokenText.data(),
301  First[0]->TokenText.size() + AddLength);
302  First[0]->ColumnWidth += AddLength;
303  First[0]->Type = NewType;
304  return true;
305 }
306 
307 // Returns \c true if \p Tok can only be followed by an operand in JavaScript.
308 bool FormatTokenLexer::precedesOperand(FormatToken *Tok) {
309  // NB: This is not entirely correct, as an r_paren can introduce an operand
310  // location in e.g. `if (foo) /bar/.exec(...);`. That is a rare enough
311  // corner case to not matter in practice, though.
312  return Tok->isOneOf(tok::period, tok::l_paren, tok::comma, tok::l_brace,
313  tok::r_brace, tok::l_square, tok::semi, tok::exclaim,
314  tok::colon, tok::question, tok::tilde) ||
315  Tok->isOneOf(tok::kw_return, tok::kw_do, tok::kw_case, tok::kw_throw,
316  tok::kw_else, tok::kw_new, tok::kw_delete, tok::kw_void,
317  tok::kw_typeof, Keywords.kw_instanceof, Keywords.kw_in) ||
318  Tok->isBinaryOperator();
319 }
320 
321 bool FormatTokenLexer::canPrecedeRegexLiteral(FormatToken *Prev) {
322  if (!Prev)
323  return true;
324 
325  // Regex literals can only follow after prefix unary operators, not after
326  // postfix unary operators. If the '++' is followed by a non-operand
327  // introducing token, the slash here is the operand and not the start of a
328  // regex.
329  // `!` is an unary prefix operator, but also a post-fix operator that casts
330  // away nullability, so the same check applies.
331  if (Prev->isOneOf(tok::plusplus, tok::minusminus, tok::exclaim))
332  return (Tokens.size() < 3 || precedesOperand(Tokens[Tokens.size() - 3]));
333 
334  // The previous token must introduce an operand location where regex
335  // literals can occur.
336  if (!precedesOperand(Prev))
337  return false;
338 
339  return true;
340 }
341 
342 // Tries to parse a JavaScript Regex literal starting at the current token,
343 // if that begins with a slash and is in a location where JavaScript allows
344 // regex literals. Changes the current token to a regex literal and updates
345 // its text if successful.
346 void FormatTokenLexer::tryParseJSRegexLiteral() {
347  FormatToken *RegexToken = Tokens.back();
348  if (!RegexToken->isOneOf(tok::slash, tok::slashequal))
349  return;
350 
351  FormatToken *Prev = nullptr;
352  for (auto I = Tokens.rbegin() + 1, E = Tokens.rend(); I != E; ++I) {
353  // NB: Because previous pointers are not initialized yet, this cannot use
354  // Token.getPreviousNonComment.
355  if ((*I)->isNot(tok::comment)) {
356  Prev = *I;
357  break;
358  }
359  }
360 
361  if (!canPrecedeRegexLiteral(Prev))
362  return;
363 
364  // 'Manually' lex ahead in the current file buffer.
365  const char *Offset = Lex->getBufferLocation();
366  const char *RegexBegin = Offset - RegexToken->TokenText.size();
367  StringRef Buffer = Lex->getBuffer();
368  bool InCharacterClass = false;
369  bool HaveClosingSlash = false;
370  for (; !HaveClosingSlash && Offset != Buffer.end(); ++Offset) {
371  // Regular expressions are terminated with a '/', which can only be
372  // escaped using '\' or a character class between '[' and ']'.
373  // See http://www.ecma-international.org/ecma-262/5.1/#sec-7.8.5.
374  switch (*Offset) {
375  case '\\':
376  // Skip the escaped character.
377  ++Offset;
378  break;
379  case '[':
380  InCharacterClass = true;
381  break;
382  case ']':
383  InCharacterClass = false;
384  break;
385  case '/':
386  if (!InCharacterClass)
387  HaveClosingSlash = true;
388  break;
389  }
390  }
391 
392  RegexToken->Type = TT_RegexLiteral;
393  // Treat regex literals like other string_literals.
394  RegexToken->Tok.setKind(tok::string_literal);
395  RegexToken->TokenText = StringRef(RegexBegin, Offset - RegexBegin);
396  RegexToken->ColumnWidth = RegexToken->TokenText.size();
397 
398  resetLexer(SourceMgr.getFileOffset(Lex->getSourceLocation(Offset)));
399 }
400 
401 void FormatTokenLexer::handleTemplateStrings() {
402  FormatToken *BacktickToken = Tokens.back();
403 
404  if (BacktickToken->is(tok::l_brace)) {
405  StateStack.push(LexerState::NORMAL);
406  return;
407  }
408  if (BacktickToken->is(tok::r_brace)) {
409  if (StateStack.size() == 1)
410  return;
411  StateStack.pop();
412  if (StateStack.top() != LexerState::TEMPLATE_STRING)
413  return;
414  // If back in TEMPLATE_STRING, fallthrough and continue parsing the
415  } else if (BacktickToken->is(tok::unknown) &&
416  BacktickToken->TokenText == "`") {
417  StateStack.push(LexerState::TEMPLATE_STRING);
418  } else {
419  return; // Not actually a template
420  }
421 
422  // 'Manually' lex ahead in the current file buffer.
423  const char *Offset = Lex->getBufferLocation();
424  const char *TmplBegin = Offset - BacktickToken->TokenText.size(); // at "`"
425  for (; Offset != Lex->getBuffer().end(); ++Offset) {
426  if (Offset[0] == '`') {
427  StateStack.pop();
428  break;
429  }
430  if (Offset[0] == '\\') {
431  ++Offset; // Skip the escaped character.
432  } else if (Offset + 1 < Lex->getBuffer().end() && Offset[0] == '$' &&
433  Offset[1] == '{') {
434  // '${' introduces an expression interpolation in the template string.
435  StateStack.push(LexerState::NORMAL);
436  ++Offset;
437  break;
438  }
439  }
440 
441  StringRef LiteralText(TmplBegin, Offset - TmplBegin + 1);
442  BacktickToken->Type = TT_TemplateString;
443  BacktickToken->Tok.setKind(tok::string_literal);
444  BacktickToken->TokenText = LiteralText;
445 
446  // Adjust width for potentially multiline string literals.
447  size_t FirstBreak = LiteralText.find('\n');
448  StringRef FirstLineText = FirstBreak == StringRef::npos
449  ? LiteralText
450  : LiteralText.substr(0, FirstBreak);
452  FirstLineText, BacktickToken->OriginalColumn, Style.TabWidth, Encoding);
453  size_t LastBreak = LiteralText.rfind('\n');
454  if (LastBreak != StringRef::npos) {
455  BacktickToken->IsMultiline = true;
456  unsigned StartColumn = 0; // The template tail spans the entire line.
458  LiteralText.substr(LastBreak + 1, LiteralText.size()), StartColumn,
459  Style.TabWidth, Encoding);
460  }
461 
462  SourceLocation loc = Offset < Lex->getBuffer().end()
463  ? Lex->getSourceLocation(Offset + 1)
464  : SourceMgr.getLocForEndOfFile(ID);
465  resetLexer(SourceMgr.getFileOffset(loc));
466 }
467 
468 void FormatTokenLexer::tryParsePythonComment() {
469  FormatToken *HashToken = Tokens.back();
470  if (!HashToken->isOneOf(tok::hash, tok::hashhash))
471  return;
472  // Turn the remainder of this line into a comment.
473  const char *CommentBegin =
474  Lex->getBufferLocation() - HashToken->TokenText.size(); // at "#"
475  size_t From = CommentBegin - Lex->getBuffer().begin();
476  size_t To = Lex->getBuffer().find_first_of('\n', From);
477  if (To == StringRef::npos)
478  To = Lex->getBuffer().size();
479  size_t Len = To - From;
480  HashToken->Type = TT_LineComment;
481  HashToken->Tok.setKind(tok::comment);
482  HashToken->TokenText = Lex->getBuffer().substr(From, Len);
483  SourceLocation Loc = To < Lex->getBuffer().size()
484  ? Lex->getSourceLocation(CommentBegin + Len)
485  : SourceMgr.getLocForEndOfFile(ID);
486  resetLexer(SourceMgr.getFileOffset(Loc));
487 }
488 
489 bool FormatTokenLexer::tryMerge_TMacro() {
490  if (Tokens.size() < 4)
491  return false;
492  FormatToken *Last = Tokens.back();
493  if (!Last->is(tok::r_paren))
494  return false;
495 
496  FormatToken *String = Tokens[Tokens.size() - 2];
497  if (!String->is(tok::string_literal) || String->IsMultiline)
498  return false;
499 
500  if (!Tokens[Tokens.size() - 3]->is(tok::l_paren))
501  return false;
502 
503  FormatToken *Macro = Tokens[Tokens.size() - 4];
504  if (Macro->TokenText != "_T")
505  return false;
506 
507  const char *Start = Macro->TokenText.data();
508  const char *End = Last->TokenText.data() + Last->TokenText.size();
509  String->TokenText = StringRef(Start, End - Start);
510  String->IsFirst = Macro->IsFirst;
511  String->LastNewlineOffset = Macro->LastNewlineOffset;
512  String->WhitespaceRange = Macro->WhitespaceRange;
513  String->OriginalColumn = Macro->OriginalColumn;
515  String->TokenText, String->OriginalColumn, Style.TabWidth, Encoding);
516  String->NewlinesBefore = Macro->NewlinesBefore;
517  String->HasUnescapedNewline = Macro->HasUnescapedNewline;
518 
519  Tokens.pop_back();
520  Tokens.pop_back();
521  Tokens.pop_back();
522  Tokens.back() = String;
523  return true;
524 }
525 
526 bool FormatTokenLexer::tryMergeConflictMarkers() {
527  if (Tokens.back()->NewlinesBefore == 0 && Tokens.back()->isNot(tok::eof))
528  return false;
529 
530  // Conflict lines look like:
531  // <marker> <text from the vcs>
532  // For example:
533  // >>>>>>> /file/in/file/system at revision 1234
534  //
535  // We merge all tokens in a line that starts with a conflict marker
536  // into a single token with a special token type that the unwrapped line
537  // parser will use to correctly rebuild the underlying code.
538 
539  FileID ID;
540  // Get the position of the first token in the line.
541  unsigned FirstInLineOffset;
542  std::tie(ID, FirstInLineOffset) = SourceMgr.getDecomposedLoc(
543  Tokens[FirstInLineIndex]->getStartOfNonWhitespace());
544  StringRef Buffer = SourceMgr.getBuffer(ID)->getBuffer();
545  // Calculate the offset of the start of the current line.
546  auto LineOffset = Buffer.rfind('\n', FirstInLineOffset);
547  if (LineOffset == StringRef::npos) {
548  LineOffset = 0;
549  } else {
550  ++LineOffset;
551  }
552 
553  auto FirstSpace = Buffer.find_first_of(" \n", LineOffset);
554  StringRef LineStart;
555  if (FirstSpace == StringRef::npos) {
556  LineStart = Buffer.substr(LineOffset);
557  } else {
558  LineStart = Buffer.substr(LineOffset, FirstSpace - LineOffset);
559  }
560 
561  TokenType Type = TT_Unknown;
562  if (LineStart == "<<<<<<<" || LineStart == ">>>>") {
563  Type = TT_ConflictStart;
564  } else if (LineStart == "|||||||" || LineStart == "=======" ||
565  LineStart == "====") {
566  Type = TT_ConflictAlternative;
567  } else if (LineStart == ">>>>>>>" || LineStart == "<<<<") {
568  Type = TT_ConflictEnd;
569  }
570 
571  if (Type != TT_Unknown) {
572  FormatToken *Next = Tokens.back();
573 
574  Tokens.resize(FirstInLineIndex + 1);
575  // We do not need to build a complete token here, as we will skip it
576  // during parsing anyway (as we must not touch whitespace around conflict
577  // markers).
578  Tokens.back()->Type = Type;
579  Tokens.back()->Tok.setKind(tok::kw___unknown_anytype);
580 
581  Tokens.push_back(Next);
582  return true;
583  }
584 
585  return false;
586 }
587 
588 FormatToken *FormatTokenLexer::getStashedToken() {
589  // Create a synthesized second '>' or '<' token.
590  Token Tok = FormatTok->Tok;
591  StringRef TokenText = FormatTok->TokenText;
592 
593  unsigned OriginalColumn = FormatTok->OriginalColumn;
594  FormatTok = new (Allocator.Allocate()) FormatToken;
595  FormatTok->Tok = Tok;
596  SourceLocation TokLocation =
597  FormatTok->Tok.getLocation().getLocWithOffset(Tok.getLength() - 1);
598  FormatTok->Tok.setLocation(TokLocation);
599  FormatTok->WhitespaceRange = SourceRange(TokLocation, TokLocation);
600  FormatTok->TokenText = TokenText;
601  FormatTok->ColumnWidth = 1;
602  FormatTok->OriginalColumn = OriginalColumn + 1;
603 
604  return FormatTok;
605 }
606 
607 FormatToken *FormatTokenLexer::getNextToken() {
608  if (StateStack.top() == LexerState::TOKEN_STASHED) {
609  StateStack.pop();
610  return getStashedToken();
611  }
612 
613  FormatTok = new (Allocator.Allocate()) FormatToken;
614  readRawToken(*FormatTok);
615  SourceLocation WhitespaceStart =
616  FormatTok->Tok.getLocation().getLocWithOffset(-TrailingWhitespace);
617  FormatTok->IsFirst = IsFirstToken;
618  IsFirstToken = false;
619 
620  // Consume and record whitespace until we find a significant token.
621  unsigned WhitespaceLength = TrailingWhitespace;
622  while (FormatTok->Tok.is(tok::unknown)) {
623  StringRef Text = FormatTok->TokenText;
624  auto EscapesNewline = [&](int pos) {
625  // A '\r' here is just part of '\r\n'. Skip it.
626  if (pos >= 0 && Text[pos] == '\r')
627  --pos;
628  // See whether there is an odd number of '\' before this.
629  // FIXME: This is wrong. A '\' followed by a newline is always removed,
630  // regardless of whether there is another '\' before it.
631  // FIXME: Newlines can also be escaped by a '?' '?' '/' trigraph.
632  unsigned count = 0;
633  for (; pos >= 0; --pos, ++count)
634  if (Text[pos] != '\\')
635  break;
636  return count & 1;
637  };
638  // FIXME: This miscounts tok:unknown tokens that are not just
639  // whitespace, e.g. a '`' character.
640  for (int i = 0, e = Text.size(); i != e; ++i) {
641  switch (Text[i]) {
642  case '\n':
643  ++FormatTok->NewlinesBefore;
644  FormatTok->HasUnescapedNewline = !EscapesNewline(i - 1);
645  FormatTok->LastNewlineOffset = WhitespaceLength + i + 1;
646  Column = 0;
647  break;
648  case '\r':
649  FormatTok->LastNewlineOffset = WhitespaceLength + i + 1;
650  Column = 0;
651  break;
652  case '\f':
653  case '\v':
654  Column = 0;
655  break;
656  case ' ':
657  ++Column;
658  break;
659  case '\t':
660  Column += Style.TabWidth - Column % Style.TabWidth;
661  break;
662  case '\\':
663  if (i + 1 == e || (Text[i + 1] != '\r' && Text[i + 1] != '\n'))
664  FormatTok->Type = TT_ImplicitStringLiteral;
665  break;
666  default:
667  FormatTok->Type = TT_ImplicitStringLiteral;
668  break;
669  }
670  if (FormatTok->Type == TT_ImplicitStringLiteral)
671  break;
672  }
673 
674  if (FormatTok->is(TT_ImplicitStringLiteral))
675  break;
676  WhitespaceLength += FormatTok->Tok.getLength();
677 
678  readRawToken(*FormatTok);
679  }
680 
681  // JavaScript and Java do not allow to escape the end of the line with a
682  // backslash. Backslashes are syntax errors in plain source, but can occur in
683  // comments. When a single line comment ends with a \, it'll cause the next
684  // line of code to be lexed as a comment, breaking formatting. The code below
685  // finds comments that contain a backslash followed by a line break, truncates
686  // the comment token at the backslash, and resets the lexer to restart behind
687  // the backslash.
688  if ((Style.Language == FormatStyle::LK_JavaScript ||
689  Style.Language == FormatStyle::LK_Java) &&
690  FormatTok->is(tok::comment) && FormatTok->TokenText.startswith("//")) {
691  size_t BackslashPos = FormatTok->TokenText.find('\\');
692  while (BackslashPos != StringRef::npos) {
693  if (BackslashPos + 1 < FormatTok->TokenText.size() &&
694  FormatTok->TokenText[BackslashPos + 1] == '\n') {
695  const char *Offset = Lex->getBufferLocation();
696  Offset -= FormatTok->TokenText.size();
697  Offset += BackslashPos + 1;
698  resetLexer(SourceMgr.getFileOffset(Lex->getSourceLocation(Offset)));
699  FormatTok->TokenText = FormatTok->TokenText.substr(0, BackslashPos + 1);
701  FormatTok->TokenText, FormatTok->OriginalColumn, Style.TabWidth,
702  Encoding);
703  break;
704  }
705  BackslashPos = FormatTok->TokenText.find('\\', BackslashPos + 1);
706  }
707  }
708 
709  // In case the token starts with escaped newlines, we want to
710  // take them into account as whitespace - this pattern is quite frequent
711  // in macro definitions.
712  // FIXME: Add a more explicit test.
713  while (FormatTok->TokenText.size() > 1 && FormatTok->TokenText[0] == '\\') {
714  unsigned SkippedWhitespace = 0;
715  if (FormatTok->TokenText.size() > 2 &&
716  (FormatTok->TokenText[1] == '\r' && FormatTok->TokenText[2] == '\n'))
717  SkippedWhitespace = 3;
718  else if (FormatTok->TokenText[1] == '\n')
719  SkippedWhitespace = 2;
720  else
721  break;
722 
723  ++FormatTok->NewlinesBefore;
724  WhitespaceLength += SkippedWhitespace;
725  FormatTok->LastNewlineOffset = SkippedWhitespace;
726  Column = 0;
727  FormatTok->TokenText = FormatTok->TokenText.substr(SkippedWhitespace);
728  }
729 
730  FormatTok->WhitespaceRange = SourceRange(
731  WhitespaceStart, WhitespaceStart.getLocWithOffset(WhitespaceLength));
732 
733  FormatTok->OriginalColumn = Column;
734 
735  TrailingWhitespace = 0;
736  if (FormatTok->Tok.is(tok::comment)) {
737  // FIXME: Add the trimmed whitespace to Column.
738  StringRef UntrimmedText = FormatTok->TokenText;
739  FormatTok->TokenText = FormatTok->TokenText.rtrim(" \t\v\f");
740  TrailingWhitespace = UntrimmedText.size() - FormatTok->TokenText.size();
741  } else if (FormatTok->Tok.is(tok::raw_identifier)) {
742  IdentifierInfo &Info = IdentTable.get(FormatTok->TokenText);
743  FormatTok->Tok.setIdentifierInfo(&Info);
744  FormatTok->Tok.setKind(Info.getTokenID());
745  if (Style.Language == FormatStyle::LK_Java &&
746  FormatTok->isOneOf(tok::kw_struct, tok::kw_union, tok::kw_delete,
747  tok::kw_operator)) {
748  FormatTok->Tok.setKind(tok::identifier);
749  FormatTok->Tok.setIdentifierInfo(nullptr);
750  } else if (Style.Language == FormatStyle::LK_JavaScript &&
751  FormatTok->isOneOf(tok::kw_struct, tok::kw_union,
752  tok::kw_operator)) {
753  FormatTok->Tok.setKind(tok::identifier);
754  FormatTok->Tok.setIdentifierInfo(nullptr);
755  }
756  } else if (FormatTok->Tok.is(tok::greatergreater)) {
757  FormatTok->Tok.setKind(tok::greater);
758  FormatTok->TokenText = FormatTok->TokenText.substr(0, 1);
759  ++Column;
760  StateStack.push(LexerState::TOKEN_STASHED);
761  } else if (FormatTok->Tok.is(tok::lessless)) {
762  FormatTok->Tok.setKind(tok::less);
763  FormatTok->TokenText = FormatTok->TokenText.substr(0, 1);
764  ++Column;
765  StateStack.push(LexerState::TOKEN_STASHED);
766  }
767 
768  // Now FormatTok is the next non-whitespace token.
769 
770  StringRef Text = FormatTok->TokenText;
771  size_t FirstNewlinePos = Text.find('\n');
772  if (FirstNewlinePos == StringRef::npos) {
773  // FIXME: ColumnWidth actually depends on the start column, we need to
774  // take this into account when the token is moved.
775  FormatTok->ColumnWidth =
776  encoding::columnWidthWithTabs(Text, Column, Style.TabWidth, Encoding);
777  Column += FormatTok->ColumnWidth;
778  } else {
779  FormatTok->IsMultiline = true;
780  // FIXME: ColumnWidth actually depends on the start column, we need to
781  // take this into account when the token is moved.
783  Text.substr(0, FirstNewlinePos), Column, Style.TabWidth, Encoding);
784 
785  // The last line of the token always starts in column 0.
786  // Thus, the length can be precomputed even in the presence of tabs.
788  Text.substr(Text.find_last_of('\n') + 1), 0, Style.TabWidth, Encoding);
789  Column = FormatTok->LastLineColumnWidth;
790  }
791 
792  if (Style.isCpp()) {
793  auto it = Macros.find(FormatTok->Tok.getIdentifierInfo());
794  if (!(Tokens.size() > 0 && Tokens.back()->Tok.getIdentifierInfo() &&
795  Tokens.back()->Tok.getIdentifierInfo()->getPPKeywordID() ==
796  tok::pp_define) &&
797  it != Macros.end()) {
798  FormatTok->Type = it->second;
799  } else if (FormatTok->is(tok::identifier)) {
800  if (MacroBlockBeginRegex.match(Text)) {
801  FormatTok->Type = TT_MacroBlockBegin;
802  } else if (MacroBlockEndRegex.match(Text)) {
803  FormatTok->Type = TT_MacroBlockEnd;
804  }
805  }
806  }
807 
808  return FormatTok;
809 }
810 
811 void FormatTokenLexer::readRawToken(FormatToken &Tok) {
812  Lex->LexFromRawLexer(Tok.Tok);
813  Tok.TokenText = StringRef(SourceMgr.getCharacterData(Tok.Tok.getLocation()),
814  Tok.Tok.getLength());
815  // For formatting, treat unterminated string literals like normal string
816  // literals.
817  if (Tok.is(tok::unknown)) {
818  if (!Tok.TokenText.empty() && Tok.TokenText[0] == '"') {
819  Tok.Tok.setKind(tok::string_literal);
820  Tok.IsUnterminatedLiteral = true;
821  } else if (Style.Language == FormatStyle::LK_JavaScript &&
822  Tok.TokenText == "''") {
823  Tok.Tok.setKind(tok::string_literal);
824  }
825  }
826 
827  if ((Style.Language == FormatStyle::LK_JavaScript ||
828  Style.Language == FormatStyle::LK_Proto ||
829  Style.Language == FormatStyle::LK_TextProto) &&
830  Tok.is(tok::char_constant)) {
831  Tok.Tok.setKind(tok::string_literal);
832  }
833 
834  if (Tok.is(tok::comment) && (Tok.TokenText == "// clang-format on" ||
835  Tok.TokenText == "/* clang-format on */")) {
836  FormattingDisabled = false;
837  }
838 
839  Tok.Finalized = FormattingDisabled;
840 
841  if (Tok.is(tok::comment) && (Tok.TokenText == "// clang-format off" ||
842  Tok.TokenText == "/* clang-format off */")) {
843  FormattingDisabled = true;
844  }
845 }
846 
847 void FormatTokenLexer::resetLexer(unsigned Offset) {
848  StringRef Buffer = SourceMgr.getBufferData(ID);
849  Lex.reset(new Lexer(SourceMgr.getLocForStartOfFile(ID),
850  getFormattingLangOpts(Style), Buffer.begin(),
851  Buffer.begin() + Offset, Buffer.end()));
852  Lex->SetKeepWhitespaceMode(true);
853  TrailingWhitespace = 0;
854 }
855 
856 } // namespace format
857 } // namespace clang
SourceLocation getLocForStartOfFile(FileID FID) const
Return the source location corresponding to the first byte of the specified file. ...
StringRef Identifier
Definition: Format.cpp:1718
Lexer - This provides a simple interface that turns a text buffer into a stream of tokens...
Definition: Lexer.h:76
SourceLocation getLocWithOffset(int Offset) const
Return a source location with the specified offset from this SourceLocation.
SourceLocation getLocForEndOfFile(FileID FID) const
Return the source location corresponding to the last byte of the specified file.
Token Tok
The Token.
Definition: FormatToken.h:133
bool is(tok::TokenKind K) const
is/isNot - Predicates to check if this token is a specific kind, as in "if (Tok.is(tok::l_brace)) {...
Definition: Token.h:97
Defines the SourceManager interface.
unsigned OriginalColumn
The original 0-based column of this token, including expanded tabs.
Definition: FormatToken.h:220
const char * getCharacterData(SourceLocation SL, bool *Invalid=nullptr) const
Return a pointer to the start of the specified location in the appropriate spelling MemoryBuffer...
The base class of the type hierarchy.
Definition: Type.h:1433
StringRef getBufferData(FileID FID, bool *Invalid=nullptr) const
Return a StringRef to the source buffer data for the specified FileID.
bool IsMultiline
Whether the token text contains newlines (escaped or not).
Definition: FormatToken.h:162
bool IsFirst
Indicates that this is the first token of the file.
Definition: FormatToken.h:165
bool isBinaryOperator() const
Definition: FormatToken.h:418
unsigned NewlinesBefore
The number of newlines immediately before the Token.
Definition: FormatToken.h:139
long i
Definition: xmmintrin.h:1456
One of these records is kept for each identifier that is lexed.
Token - This structure provides full information about a lexed token.
Definition: Token.h:34
unsigned columnWidthWithTabs(StringRef Text, unsigned StartColumn, unsigned TabWidth, Encoding Encoding)
Returns the number of columns required to display the Text, starting from the StartColumn on a termin...
Definition: Encoding.h:61
void setKind(tok::TokenKind K)
Definition: Token.h:93
This file contains FormatTokenLexer, which tokenizes a source file into a token stream suitable for C...
tok::TokenKind getTokenID() const
If this is a source-language token (e.g.
const FormatToken & Tok
bool isCSharpKeyword(const FormatToken &Tok) const
Returns true if Tok is a C# keyword, returns false if it is anything else.
Definition: FormatToken.h:889
bool isOneOf(A K1, B K2) const
Definition: FormatToken.h:321
unsigned Offset
Definition: Format.cpp:1713
SourceLocation End
FormatTokenLexer(const SourceManager &SourceMgr, FileID ID, unsigned Column, const FormatStyle &Style, encoding::Encoding Encoding)
LangOptions getFormattingLangOpts(const FormatStyle &Style=getLLVMStyle())
Returns the LangOpts that the formatter expects you to set.
Definition: Format.cpp:2365
SourceLocation getLocation() const
Return a source location identifier for the specified offset in the current file. ...
Definition: Token.h:126
A wrapper around a Token storing information about the whitespace characters preceding it...
Definition: FormatToken.h:129
unsigned LastNewlineOffset
The offset just past the last '\n' in this token's leading whitespace (relative to WhiteSpaceStart)...
Definition: FormatToken.h:150
unsigned getFileOffset(SourceLocation SpellingLoc) const
Returns the offset from the start of the file that the specified SourceLocation represents.
Encodes a location in the source.
IdentifierInfo & get(StringRef Name)
Return the identifier token info for the specified named identifier.
bool is(tok::TokenKind Kind) const
Definition: FormatToken.h:312
Various functions to configurably format source code.
IdentifierInfo * getIdentifierInfo() const
Definition: Token.h:179
void setIdentifierInfo(IdentifierInfo *II)
Definition: Token.h:188
SourceRange WhitespaceRange
The range of the whitespace immediately preceding the Token.
Definition: FormatToken.h:146
TokenKind
Provides a simple uniform namespace for tokens from all C languages.
Definition: TokenKinds.h:24
bool IsUnterminatedLiteral
Set to true if this token is an unterminated literal.
Definition: FormatToken.h:180
StringRef TokenText
The raw text of the token.
Definition: FormatToken.h:177
The FormatStyle is used to configure the formatting to follow specific guidelines.
Definition: Format.h:49
An opaque identifier used by SourceManager which refers to a source file (MemoryBuffer) along with it...
const llvm::MemoryBuffer * getBuffer(FileID FID, SourceLocation Loc, bool *Invalid=nullptr) const
Return the buffer for the specified FileID.
Dataflow Directional Tag Classes.
unsigned ColumnWidth
The width of the non-whitespace parts of the token (or its first line for multi-line tokens) in colum...
Definition: FormatToken.h:155
bool Finalized
If true, this token has been fully formatted (indented and potentially re-formatted inside)...
Definition: FormatToken.h:310
unsigned getLength() const
Definition: Token.h:129
Defines the clang::SourceLocation class and associated facilities.
This file contains the declaration of the FormatToken, a wrapper around Token with additional informa...
StringRef Text
Definition: Format.cpp:1712
void setLocation(SourceLocation L)
Definition: Token.h:134
#define true
Definition: stdbool.h:16
A trivial tuple used to represent a source range.
bool HasUnescapedNewline
Whether there is at least one unescaped newline before the Token.
Definition: FormatToken.h:143
This class handles loading and caching of source files into memory.
ArrayRef< FormatToken * > lex()
std::pair< FileID, unsigned > getDecomposedLoc(SourceLocation Loc) const
Decompose the specified location into a raw FileID + Offset pair.
unsigned LastLineColumnWidth
Contains the width in columns of the last line of a multi-line token.
Definition: FormatToken.h:159
const encoding::Encoding Encoding
const FormatStyle & Style