clang  5.0.0
FormatTokenLexer.cpp
Go to the documentation of this file.
1 //===--- FormatTokenLexer.cpp - Lex FormatTokens -------------*- C++ ----*-===//
2 //
3 // The LLVM Compiler Infrastructure
4 //
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
7 //
8 //===----------------------------------------------------------------------===//
9 ///
10 /// \file
11 /// \brief This file implements FormatTokenLexer, which tokenizes a source file
12 /// into a FormatToken stream suitable for ClangFormat.
13 ///
14 //===----------------------------------------------------------------------===//
15 
16 #include "FormatTokenLexer.h"
17 #include "FormatToken.h"
20 #include "clang/Format/Format.h"
21 #include "llvm/Support/Regex.h"
22 
23 namespace clang {
24 namespace format {
25 
27  const FormatStyle &Style,
29  : FormatTok(nullptr), IsFirstToken(true), StateStack({LexerState::NORMAL}),
33  FormattingDisabled(false), MacroBlockBeginRegex(Style.MacroBlockBegin),
34  MacroBlockEndRegex(Style.MacroBlockEnd) {
35  Lex.reset(new Lexer(ID, SourceMgr.getBuffer(ID), SourceMgr,
37  Lex->SetKeepWhitespaceMode(true);
38 
39  for (const std::string &ForEachMacro : Style.ForEachMacros)
40  ForEachMacros.push_back(&IdentTable.get(ForEachMacro));
41  std::sort(ForEachMacros.begin(), ForEachMacros.end());
42 }
43 
45  assert(Tokens.empty());
46  assert(FirstInLineIndex == 0);
47  do {
48  Tokens.push_back(getNextToken());
49  if (Style.Language == FormatStyle::LK_JavaScript) {
50  tryParseJSRegexLiteral();
51  handleTemplateStrings();
52  }
53  tryMergePreviousTokens();
54  if (Tokens.back()->NewlinesBefore > 0 || Tokens.back()->IsMultiline)
55  FirstInLineIndex = Tokens.size() - 1;
56  } while (Tokens.back()->Tok.isNot(tok::eof));
57  return Tokens;
58 }
59 
60 void FormatTokenLexer::tryMergePreviousTokens() {
61  if (tryMerge_TMacro())
62  return;
63  if (tryMergeConflictMarkers())
64  return;
65  if (tryMergeLessLess())
66  return;
67  if (tryMergeNSStringLiteral())
68  return;
69 
70  if (Style.Language == FormatStyle::LK_JavaScript) {
71  static const tok::TokenKind JSIdentity[] = {tok::equalequal, tok::equal};
72  static const tok::TokenKind JSNotIdentity[] = {tok::exclaimequal,
73  tok::equal};
74  static const tok::TokenKind JSShiftEqual[] = {tok::greater, tok::greater,
75  tok::greaterequal};
76  static const tok::TokenKind JSRightArrow[] = {tok::equal, tok::greater};
77  static const tok::TokenKind JSExponentiation[] = {tok::star, tok::star};
78  static const tok::TokenKind JSExponentiationEqual[] = {tok::star,
79  tok::starequal};
80 
81  // FIXME: Investigate what token type gives the correct operator priority.
82  if (tryMergeTokens(JSIdentity, TT_BinaryOperator))
83  return;
84  if (tryMergeTokens(JSNotIdentity, TT_BinaryOperator))
85  return;
86  if (tryMergeTokens(JSShiftEqual, TT_BinaryOperator))
87  return;
88  if (tryMergeTokens(JSRightArrow, TT_JsFatArrow))
89  return;
90  if (tryMergeTokens(JSExponentiation, TT_JsExponentiation))
91  return;
92  if (tryMergeTokens(JSExponentiationEqual, TT_JsExponentiationEqual)) {
93  Tokens.back()->Tok.setKind(tok::starequal);
94  return;
95  }
96  }
97 
98  if (Style.Language == FormatStyle::LK_Java) {
99  static const tok::TokenKind JavaRightLogicalShift[] = {tok::greater,
100  tok::greater,
101  tok::greater};
102  static const tok::TokenKind JavaRightLogicalShiftAssign[] = {tok::greater,
103  tok::greater,
104  tok::greaterequal};
105  if (tryMergeTokens(JavaRightLogicalShift, TT_BinaryOperator))
106  return;
107  if (tryMergeTokens(JavaRightLogicalShiftAssign, TT_BinaryOperator))
108  return;
109  }
110 }
111 
112 bool FormatTokenLexer::tryMergeNSStringLiteral() {
113  if (Tokens.size() < 2)
114  return false;
115  auto &At = *(Tokens.end() - 2);
116  auto &String = *(Tokens.end() - 1);
117  if (!At->is(tok::at) || !String->is(tok::string_literal))
118  return false;
119  At->Tok.setKind(tok::string_literal);
120  At->TokenText = StringRef(At->TokenText.begin(),
121  String->TokenText.end() - At->TokenText.begin());
122  At->ColumnWidth += String->ColumnWidth;
123  At->Type = TT_ObjCStringLiteral;
124  Tokens.erase(Tokens.end() - 1);
125  return true;
126 }
127 
128 bool FormatTokenLexer::tryMergeLessLess() {
129  // Merge X,less,less,Y into X,lessless,Y unless X or Y is less.
130  if (Tokens.size() < 3)
131  return false;
132 
133  bool FourthTokenIsLess = false;
134  if (Tokens.size() > 3)
135  FourthTokenIsLess = (Tokens.end() - 4)[0]->is(tok::less);
136 
137  auto First = Tokens.end() - 3;
138  if (First[2]->is(tok::less) || First[1]->isNot(tok::less) ||
139  First[0]->isNot(tok::less) || FourthTokenIsLess)
140  return false;
141 
142  // Only merge if there currently is no whitespace between the two "<".
143  if (First[1]->WhitespaceRange.getBegin() !=
144  First[1]->WhitespaceRange.getEnd())
145  return false;
146 
147  First[0]->Tok.setKind(tok::lessless);
148  First[0]->TokenText = "<<";
149  First[0]->ColumnWidth += 1;
150  Tokens.erase(Tokens.end() - 2);
151  return true;
152 }
153 
154 bool FormatTokenLexer::tryMergeTokens(ArrayRef<tok::TokenKind> Kinds,
155  TokenType NewType) {
156  if (Tokens.size() < Kinds.size())
157  return false;
158 
159  SmallVectorImpl<FormatToken *>::const_iterator First =
160  Tokens.end() - Kinds.size();
161  if (!First[0]->is(Kinds[0]))
162  return false;
163  unsigned AddLength = 0;
164  for (unsigned i = 1; i < Kinds.size(); ++i) {
165  if (!First[i]->is(Kinds[i]) ||
166  First[i]->WhitespaceRange.getBegin() !=
167  First[i]->WhitespaceRange.getEnd())
168  return false;
169  AddLength += First[i]->TokenText.size();
170  }
171  Tokens.resize(Tokens.size() - Kinds.size() + 1);
172  First[0]->TokenText = StringRef(First[0]->TokenText.data(),
173  First[0]->TokenText.size() + AddLength);
174  First[0]->ColumnWidth += AddLength;
175  First[0]->Type = NewType;
176  return true;
177 }
178 
179 // Returns \c true if \p Tok can only be followed by an operand in JavaScript.
180 bool FormatTokenLexer::precedesOperand(FormatToken *Tok) {
181  // NB: This is not entirely correct, as an r_paren can introduce an operand
182  // location in e.g. `if (foo) /bar/.exec(...);`. That is a rare enough
183  // corner case to not matter in practice, though.
184  return Tok->isOneOf(tok::period, tok::l_paren, tok::comma, tok::l_brace,
185  tok::r_brace, tok::l_square, tok::semi, tok::exclaim,
186  tok::colon, tok::question, tok::tilde) ||
187  Tok->isOneOf(tok::kw_return, tok::kw_do, tok::kw_case, tok::kw_throw,
188  tok::kw_else, tok::kw_new, tok::kw_delete, tok::kw_void,
189  tok::kw_typeof, Keywords.kw_instanceof, Keywords.kw_in) ||
190  Tok->isBinaryOperator();
191 }
192 
193 bool FormatTokenLexer::canPrecedeRegexLiteral(FormatToken *Prev) {
194  if (!Prev)
195  return true;
196 
197  // Regex literals can only follow after prefix unary operators, not after
198  // postfix unary operators. If the '++' is followed by a non-operand
199  // introducing token, the slash here is the operand and not the start of a
200  // regex.
201  // `!` is an unary prefix operator, but also a post-fix operator that casts
202  // away nullability, so the same check applies.
203  if (Prev->isOneOf(tok::plusplus, tok::minusminus, tok::exclaim))
204  return (Tokens.size() < 3 || precedesOperand(Tokens[Tokens.size() - 3]));
205 
206  // The previous token must introduce an operand location where regex
207  // literals can occur.
208  if (!precedesOperand(Prev))
209  return false;
210 
211  return true;
212 }
213 
214 // Tries to parse a JavaScript Regex literal starting at the current token,
215 // if that begins with a slash and is in a location where JavaScript allows
216 // regex literals. Changes the current token to a regex literal and updates
217 // its text if successful.
218 void FormatTokenLexer::tryParseJSRegexLiteral() {
219  FormatToken *RegexToken = Tokens.back();
220  if (!RegexToken->isOneOf(tok::slash, tok::slashequal))
221  return;
222 
223  FormatToken *Prev = nullptr;
224  for (auto I = Tokens.rbegin() + 1, E = Tokens.rend(); I != E; ++I) {
225  // NB: Because previous pointers are not initialized yet, this cannot use
226  // Token.getPreviousNonComment.
227  if ((*I)->isNot(tok::comment)) {
228  Prev = *I;
229  break;
230  }
231  }
232 
233  if (!canPrecedeRegexLiteral(Prev))
234  return;
235 
236  // 'Manually' lex ahead in the current file buffer.
237  const char *Offset = Lex->getBufferLocation();
238  const char *RegexBegin = Offset - RegexToken->TokenText.size();
239  StringRef Buffer = Lex->getBuffer();
240  bool InCharacterClass = false;
241  bool HaveClosingSlash = false;
242  for (; !HaveClosingSlash && Offset != Buffer.end(); ++Offset) {
243  // Regular expressions are terminated with a '/', which can only be
244  // escaped using '\' or a character class between '[' and ']'.
245  // See http://www.ecma-international.org/ecma-262/5.1/#sec-7.8.5.
246  switch (*Offset) {
247  case '\\':
248  // Skip the escaped character.
249  ++Offset;
250  break;
251  case '[':
252  InCharacterClass = true;
253  break;
254  case ']':
255  InCharacterClass = false;
256  break;
257  case '/':
258  if (!InCharacterClass)
259  HaveClosingSlash = true;
260  break;
261  }
262  }
263 
264  RegexToken->Type = TT_RegexLiteral;
265  // Treat regex literals like other string_literals.
266  RegexToken->Tok.setKind(tok::string_literal);
267  RegexToken->TokenText = StringRef(RegexBegin, Offset - RegexBegin);
268  RegexToken->ColumnWidth = RegexToken->TokenText.size();
269 
270  resetLexer(SourceMgr.getFileOffset(Lex->getSourceLocation(Offset)));
271 }
272 
273 void FormatTokenLexer::handleTemplateStrings() {
274  FormatToken *BacktickToken = Tokens.back();
275 
276  if (BacktickToken->is(tok::l_brace)) {
277  StateStack.push(LexerState::NORMAL);
278  return;
279  }
280  if (BacktickToken->is(tok::r_brace)) {
281  if (StateStack.size() == 1)
282  return;
283  StateStack.pop();
284  if (StateStack.top() != LexerState::TEMPLATE_STRING)
285  return;
286  // If back in TEMPLATE_STRING, fallthrough and continue parsing the
287  } else if (BacktickToken->is(tok::unknown) &&
288  BacktickToken->TokenText == "`") {
289  StateStack.push(LexerState::TEMPLATE_STRING);
290  } else {
291  return; // Not actually a template
292  }
293 
294  // 'Manually' lex ahead in the current file buffer.
295  const char *Offset = Lex->getBufferLocation();
296  const char *TmplBegin = Offset - BacktickToken->TokenText.size(); // at "`"
297  for (; Offset != Lex->getBuffer().end(); ++Offset) {
298  if (Offset[0] == '`') {
299  StateStack.pop();
300  break;
301  }
302  if (Offset[0] == '\\') {
303  ++Offset; // Skip the escaped character.
304  } else if (Offset + 1 < Lex->getBuffer().end() && Offset[0] == '$' &&
305  Offset[1] == '{') {
306  // '${' introduces an expression interpolation in the template string.
307  StateStack.push(LexerState::NORMAL);
308  ++Offset;
309  break;
310  }
311  }
312 
313  StringRef LiteralText(TmplBegin, Offset - TmplBegin + 1);
314  BacktickToken->Type = TT_TemplateString;
315  BacktickToken->Tok.setKind(tok::string_literal);
316  BacktickToken->TokenText = LiteralText;
317 
318  // Adjust width for potentially multiline string literals.
319  size_t FirstBreak = LiteralText.find('\n');
320  StringRef FirstLineText = FirstBreak == StringRef::npos
321  ? LiteralText
322  : LiteralText.substr(0, FirstBreak);
323  BacktickToken->ColumnWidth = encoding::columnWidthWithTabs(
324  FirstLineText, BacktickToken->OriginalColumn, Style.TabWidth, Encoding);
325  size_t LastBreak = LiteralText.rfind('\n');
326  if (LastBreak != StringRef::npos) {
327  BacktickToken->IsMultiline = true;
328  unsigned StartColumn = 0; // The template tail spans the entire line.
329  BacktickToken->LastLineColumnWidth = encoding::columnWidthWithTabs(
330  LiteralText.substr(LastBreak + 1, LiteralText.size()), StartColumn,
331  Style.TabWidth, Encoding);
332  }
333 
334  SourceLocation loc = Offset < Lex->getBuffer().end()
335  ? Lex->getSourceLocation(Offset + 1)
336  : SourceMgr.getLocForEndOfFile(ID);
337  resetLexer(SourceMgr.getFileOffset(loc));
338 }
339 
340 bool FormatTokenLexer::tryMerge_TMacro() {
341  if (Tokens.size() < 4)
342  return false;
343  FormatToken *Last = Tokens.back();
344  if (!Last->is(tok::r_paren))
345  return false;
346 
347  FormatToken *String = Tokens[Tokens.size() - 2];
348  if (!String->is(tok::string_literal) || String->IsMultiline)
349  return false;
350 
351  if (!Tokens[Tokens.size() - 3]->is(tok::l_paren))
352  return false;
353 
354  FormatToken *Macro = Tokens[Tokens.size() - 4];
355  if (Macro->TokenText != "_T")
356  return false;
357 
358  const char *Start = Macro->TokenText.data();
359  const char *End = Last->TokenText.data() + Last->TokenText.size();
360  String->TokenText = StringRef(Start, End - Start);
361  String->IsFirst = Macro->IsFirst;
362  String->LastNewlineOffset = Macro->LastNewlineOffset;
363  String->WhitespaceRange = Macro->WhitespaceRange;
364  String->OriginalColumn = Macro->OriginalColumn;
365  String->ColumnWidth = encoding::columnWidthWithTabs(
366  String->TokenText, String->OriginalColumn, Style.TabWidth, Encoding);
367  String->NewlinesBefore = Macro->NewlinesBefore;
368  String->HasUnescapedNewline = Macro->HasUnescapedNewline;
369 
370  Tokens.pop_back();
371  Tokens.pop_back();
372  Tokens.pop_back();
373  Tokens.back() = String;
374  return true;
375 }
376 
377 bool FormatTokenLexer::tryMergeConflictMarkers() {
378  if (Tokens.back()->NewlinesBefore == 0 && Tokens.back()->isNot(tok::eof))
379  return false;
380 
381  // Conflict lines look like:
382  // <marker> <text from the vcs>
383  // For example:
384  // >>>>>>> /file/in/file/system at revision 1234
385  //
386  // We merge all tokens in a line that starts with a conflict marker
387  // into a single token with a special token type that the unwrapped line
388  // parser will use to correctly rebuild the underlying code.
389 
390  FileID ID;
391  // Get the position of the first token in the line.
392  unsigned FirstInLineOffset;
393  std::tie(ID, FirstInLineOffset) = SourceMgr.getDecomposedLoc(
394  Tokens[FirstInLineIndex]->getStartOfNonWhitespace());
395  StringRef Buffer = SourceMgr.getBuffer(ID)->getBuffer();
396  // Calculate the offset of the start of the current line.
397  auto LineOffset = Buffer.rfind('\n', FirstInLineOffset);
398  if (LineOffset == StringRef::npos) {
399  LineOffset = 0;
400  } else {
401  ++LineOffset;
402  }
403 
404  auto FirstSpace = Buffer.find_first_of(" \n", LineOffset);
405  StringRef LineStart;
406  if (FirstSpace == StringRef::npos) {
407  LineStart = Buffer.substr(LineOffset);
408  } else {
409  LineStart = Buffer.substr(LineOffset, FirstSpace - LineOffset);
410  }
411 
412  TokenType Type = TT_Unknown;
413  if (LineStart == "<<<<<<<" || LineStart == ">>>>") {
414  Type = TT_ConflictStart;
415  } else if (LineStart == "|||||||" || LineStart == "=======" ||
416  LineStart == "====") {
417  Type = TT_ConflictAlternative;
418  } else if (LineStart == ">>>>>>>" || LineStart == "<<<<") {
419  Type = TT_ConflictEnd;
420  }
421 
422  if (Type != TT_Unknown) {
423  FormatToken *Next = Tokens.back();
424 
425  Tokens.resize(FirstInLineIndex + 1);
426  // We do not need to build a complete token here, as we will skip it
427  // during parsing anyway (as we must not touch whitespace around conflict
428  // markers).
429  Tokens.back()->Type = Type;
430  Tokens.back()->Tok.setKind(tok::kw___unknown_anytype);
431 
432  Tokens.push_back(Next);
433  return true;
434  }
435 
436  return false;
437 }
438 
439 FormatToken *FormatTokenLexer::getStashedToken() {
440  // Create a synthesized second '>' or '<' token.
441  Token Tok = FormatTok->Tok;
442  StringRef TokenText = FormatTok->TokenText;
443 
444  unsigned OriginalColumn = FormatTok->OriginalColumn;
445  FormatTok = new (Allocator.Allocate()) FormatToken;
446  FormatTok->Tok = Tok;
447  SourceLocation TokLocation =
448  FormatTok->Tok.getLocation().getLocWithOffset(Tok.getLength() - 1);
449  FormatTok->Tok.setLocation(TokLocation);
450  FormatTok->WhitespaceRange = SourceRange(TokLocation, TokLocation);
451  FormatTok->TokenText = TokenText;
452  FormatTok->ColumnWidth = 1;
453  FormatTok->OriginalColumn = OriginalColumn + 1;
454 
455  return FormatTok;
456 }
457 
458 FormatToken *FormatTokenLexer::getNextToken() {
459  if (StateStack.top() == LexerState::TOKEN_STASHED) {
460  StateStack.pop();
461  return getStashedToken();
462  }
463 
464  FormatTok = new (Allocator.Allocate()) FormatToken;
465  readRawToken(*FormatTok);
466  SourceLocation WhitespaceStart =
467  FormatTok->Tok.getLocation().getLocWithOffset(-TrailingWhitespace);
468  FormatTok->IsFirst = IsFirstToken;
469  IsFirstToken = false;
470 
471  // Consume and record whitespace until we find a significant token.
472  unsigned WhitespaceLength = TrailingWhitespace;
473  while (FormatTok->Tok.is(tok::unknown)) {
474  StringRef Text = FormatTok->TokenText;
475  auto EscapesNewline = [&](int pos) {
476  // A '\r' here is just part of '\r\n'. Skip it.
477  if (pos >= 0 && Text[pos] == '\r')
478  --pos;
479  // See whether there is an odd number of '\' before this.
480  // FIXME: This is wrong. A '\' followed by a newline is always removed,
481  // regardless of whether there is another '\' before it.
482  // FIXME: Newlines can also be escaped by a '?' '?' '/' trigraph.
483  unsigned count = 0;
484  for (; pos >= 0; --pos, ++count)
485  if (Text[pos] != '\\')
486  break;
487  return count & 1;
488  };
489  // FIXME: This miscounts tok:unknown tokens that are not just
490  // whitespace, e.g. a '`' character.
491  for (int i = 0, e = Text.size(); i != e; ++i) {
492  switch (Text[i]) {
493  case '\n':
494  ++FormatTok->NewlinesBefore;
495  FormatTok->HasUnescapedNewline = !EscapesNewline(i - 1);
496  FormatTok->LastNewlineOffset = WhitespaceLength + i + 1;
497  Column = 0;
498  break;
499  case '\r':
500  FormatTok->LastNewlineOffset = WhitespaceLength + i + 1;
501  Column = 0;
502  break;
503  case '\f':
504  case '\v':
505  Column = 0;
506  break;
507  case ' ':
508  ++Column;
509  break;
510  case '\t':
511  Column += Style.TabWidth - Column % Style.TabWidth;
512  break;
513  case '\\':
514  if (i + 1 == e || (Text[i + 1] != '\r' && Text[i + 1] != '\n'))
515  FormatTok->Type = TT_ImplicitStringLiteral;
516  break;
517  default:
518  FormatTok->Type = TT_ImplicitStringLiteral;
519  break;
520  }
521  if (FormatTok->Type == TT_ImplicitStringLiteral)
522  break;
523  }
524 
525  if (FormatTok->is(TT_ImplicitStringLiteral))
526  break;
527  WhitespaceLength += FormatTok->Tok.getLength();
528 
529  readRawToken(*FormatTok);
530  }
531 
532  // In case the token starts with escaped newlines, we want to
533  // take them into account as whitespace - this pattern is quite frequent
534  // in macro definitions.
535  // FIXME: Add a more explicit test.
536  while (FormatTok->TokenText.size() > 1 && FormatTok->TokenText[0] == '\\' &&
537  FormatTok->TokenText[1] == '\n') {
538  ++FormatTok->NewlinesBefore;
539  WhitespaceLength += 2;
540  FormatTok->LastNewlineOffset = 2;
541  Column = 0;
542  FormatTok->TokenText = FormatTok->TokenText.substr(2);
543  }
544 
545  FormatTok->WhitespaceRange = SourceRange(
546  WhitespaceStart, WhitespaceStart.getLocWithOffset(WhitespaceLength));
547 
548  FormatTok->OriginalColumn = Column;
549 
550  TrailingWhitespace = 0;
551  if (FormatTok->Tok.is(tok::comment)) {
552  // FIXME: Add the trimmed whitespace to Column.
553  StringRef UntrimmedText = FormatTok->TokenText;
554  FormatTok->TokenText = FormatTok->TokenText.rtrim(" \t\v\f");
555  TrailingWhitespace = UntrimmedText.size() - FormatTok->TokenText.size();
556  } else if (FormatTok->Tok.is(tok::raw_identifier)) {
557  IdentifierInfo &Info = IdentTable.get(FormatTok->TokenText);
558  FormatTok->Tok.setIdentifierInfo(&Info);
559  FormatTok->Tok.setKind(Info.getTokenID());
560  if (Style.Language == FormatStyle::LK_Java &&
561  FormatTok->isOneOf(tok::kw_struct, tok::kw_union, tok::kw_delete,
562  tok::kw_operator)) {
563  FormatTok->Tok.setKind(tok::identifier);
564  FormatTok->Tok.setIdentifierInfo(nullptr);
565  } else if (Style.Language == FormatStyle::LK_JavaScript &&
566  FormatTok->isOneOf(tok::kw_struct, tok::kw_union,
567  tok::kw_operator)) {
568  FormatTok->Tok.setKind(tok::identifier);
569  FormatTok->Tok.setIdentifierInfo(nullptr);
570  }
571  } else if (FormatTok->Tok.is(tok::greatergreater)) {
572  FormatTok->Tok.setKind(tok::greater);
573  FormatTok->TokenText = FormatTok->TokenText.substr(0, 1);
574  ++Column;
575  StateStack.push(LexerState::TOKEN_STASHED);
576  } else if (FormatTok->Tok.is(tok::lessless)) {
577  FormatTok->Tok.setKind(tok::less);
578  FormatTok->TokenText = FormatTok->TokenText.substr(0, 1);
579  ++Column;
580  StateStack.push(LexerState::TOKEN_STASHED);
581  }
582 
583  // Now FormatTok is the next non-whitespace token.
584 
585  StringRef Text = FormatTok->TokenText;
586  size_t FirstNewlinePos = Text.find('\n');
587  if (FirstNewlinePos == StringRef::npos) {
588  // FIXME: ColumnWidth actually depends on the start column, we need to
589  // take this into account when the token is moved.
590  FormatTok->ColumnWidth =
591  encoding::columnWidthWithTabs(Text, Column, Style.TabWidth, Encoding);
592  Column += FormatTok->ColumnWidth;
593  } else {
594  FormatTok->IsMultiline = true;
595  // FIXME: ColumnWidth actually depends on the start column, we need to
596  // take this into account when the token is moved.
598  Text.substr(0, FirstNewlinePos), Column, Style.TabWidth, Encoding);
599 
600  // The last line of the token always starts in column 0.
601  // Thus, the length can be precomputed even in the presence of tabs.
603  Text.substr(Text.find_last_of('\n') + 1), 0, Style.TabWidth, Encoding);
604  Column = FormatTok->LastLineColumnWidth;
605  }
606 
607  if (Style.isCpp()) {
608  if (!(Tokens.size() > 0 && Tokens.back()->Tok.getIdentifierInfo() &&
609  Tokens.back()->Tok.getIdentifierInfo()->getPPKeywordID() ==
610  tok::pp_define) &&
611  std::find(ForEachMacros.begin(), ForEachMacros.end(),
612  FormatTok->Tok.getIdentifierInfo()) != ForEachMacros.end()) {
613  FormatTok->Type = TT_ForEachMacro;
614  } else if (FormatTok->is(tok::identifier)) {
615  if (MacroBlockBeginRegex.match(Text)) {
616  FormatTok->Type = TT_MacroBlockBegin;
617  } else if (MacroBlockEndRegex.match(Text)) {
618  FormatTok->Type = TT_MacroBlockEnd;
619  }
620  }
621  }
622 
623  return FormatTok;
624 }
625 
626 void FormatTokenLexer::readRawToken(FormatToken &Tok) {
627  Lex->LexFromRawLexer(Tok.Tok);
628  Tok.TokenText = StringRef(SourceMgr.getCharacterData(Tok.Tok.getLocation()),
629  Tok.Tok.getLength());
630  // For formatting, treat unterminated string literals like normal string
631  // literals.
632  if (Tok.is(tok::unknown)) {
633  if (!Tok.TokenText.empty() && Tok.TokenText[0] == '"') {
634  Tok.Tok.setKind(tok::string_literal);
635  Tok.IsUnterminatedLiteral = true;
636  } else if (Style.Language == FormatStyle::LK_JavaScript &&
637  Tok.TokenText == "''") {
638  Tok.Tok.setKind(tok::string_literal);
639  }
640  }
641 
642  if (Style.Language == FormatStyle::LK_JavaScript &&
643  Tok.is(tok::char_constant)) {
644  Tok.Tok.setKind(tok::string_literal);
645  }
646 
647  if (Tok.is(tok::comment) && (Tok.TokenText == "// clang-format on" ||
648  Tok.TokenText == "/* clang-format on */")) {
649  FormattingDisabled = false;
650  }
651 
652  Tok.Finalized = FormattingDisabled;
653 
654  if (Tok.is(tok::comment) && (Tok.TokenText == "// clang-format off" ||
655  Tok.TokenText == "/* clang-format off */")) {
656  FormattingDisabled = true;
657  }
658 }
659 
660 void FormatTokenLexer::resetLexer(unsigned Offset) {
661  StringRef Buffer = SourceMgr.getBufferData(ID);
662  Lex.reset(new Lexer(SourceMgr.getLocForStartOfFile(ID),
663  getFormattingLangOpts(Style), Buffer.begin(),
664  Buffer.begin() + Offset, Buffer.end()));
665  Lex->SetKeepWhitespaceMode(true);
666  TrailingWhitespace = 0;
667 }
668 
669 } // namespace format
670 } // namespace clang
Lexer - This provides a simple interface that turns a text buffer into a stream of tokens...
Definition: Lexer.h:46
Token Tok
The Token.
Definition: FormatToken.h:123
Defines the SourceManager interface.
const char * getCharacterData(SourceLocation SL, bool *Invalid=nullptr) const
Return a pointer to the start of the specified location in the appropriate spelling MemoryBuffer...
unsigned OriginalColumn
The original 0-based column of this token, including expanded tabs.
Definition: FormatToken.h:214
llvm::MemoryBuffer * getBuffer(FileID FID, SourceLocation Loc, bool *Invalid=nullptr) const
Return the buffer for the specified FileID.
bool isOneOf(A K1, B K2) const
Definition: FormatToken.h:305
std::unique_ptr< llvm::MemoryBuffer > Buffer
IdentTable(getFormattingLangOpts(Style))
bool IsMultiline
Whether the token text contains newlines (escaped or not).
Definition: FormatToken.h:152
bool IsFirst
Indicates that this is the first token of the file.
Definition: FormatToken.h:155
unsigned NewlinesBefore
The number of newlines immediately before the Token.
Definition: FormatToken.h:129
StringRef getBufferData(FileID FID, bool *Invalid=nullptr) const
Return a StringRef to the source buffer data for the specified FileID.
Should be used for Java.
Definition: Format.h:1114
unsigned columnWidthWithTabs(StringRef Text, unsigned StartColumn, unsigned TabWidth, Encoding Encoding)
Returns the number of columns required to display the Text, starting from the StartColumn on a termin...
Definition: Encoding.h:62
void setKind(tok::TokenKind K)
Definition: Token.h:91
This file contains FormatTokenLexer, which tokenizes a source file into a token stream suitable for C...
SourceLocation getLocWithOffset(int Offset) const
Return a source location with the specified offset from this SourceLocation.
uint32_t Offset
Definition: CacheTokens.cpp:43
detail::InMemoryDirectory::const_iterator I
SourceLocation getLocForEndOfFile(FileID FID) const
Return the source location corresponding to the last byte of the specified file.
const SmallVectorImpl< AnnotatedLine * >::const_iterator End
Should be used for JavaScript.
Definition: Format.h:1116
LangOptions getFormattingLangOpts(const FormatStyle &Style=getLLVMStyle())
Returns the LangOpts that the formatter expects you to set.
Definition: Format.cpp:1990
MacroBlockBeginRegex(Style.MacroBlockBegin)
FormatToken * Token
SourceLocation getLocation() const
Return a source location identifier for the specified offset in the current file. ...
Definition: Token.h:124
Keywords(IdentTable)
MacroBlockEndRegex(Style.MacroBlockEnd)
FormattingDisabled(false)
The l-value was considered opaque, so the alignment was determined from a type.
unsigned LastNewlineOffset
The offset just past the last ' ' in this token's leading whitespace (relative to WhiteSpaceStart)...
Definition: FormatToken.h:140
SmallVectorImpl< AnnotatedLine * >::const_iterator Next
IdentifierInfo & get(StringRef Name)
Return the identifier token info for the specified named identifier.
Various functions to configurably format source code.
void setIdentifierInfo(IdentifierInfo *II)
Definition: Token.h:186
SourceRange WhitespaceRange
The range of the whitespace immediately preceding the Token.
Definition: FormatToken.h:136
TokenKind
Provides a simple uniform namespace for tokens from all C languages.
Definition: TokenKinds.h:25
StringRef TokenText
The raw text of the token.
Definition: FormatToken.h:167
bool is(tok::TokenKind K) const
is/isNot - Predicates to check if this token is a specific kind, as in "if (Tok.is(tok::l_brace)) {...
Definition: Token.h:95
FormatTokenLexer(const SourceManager &SourceMgr, FileID ID, const FormatStyle &Style, encoding::Encoding Encoding)
The FormatStyle is used to configure the formatting to follow specific guidelines.
Definition: Format.h:46
LanguageKind Language
Language, this format style is targeted at.
Definition: Format.h:1131
An opaque identifier used by SourceManager which refers to a source file (MemoryBuffer) along with it...
unsigned ColumnWidth
The width of the non-whitespace parts of the token (or its first line for multi-line tokens) in colum...
Definition: FormatToken.h:145
detail::InMemoryDirectory::const_iterator E
SourceMgr(SourceMgr)
Encoding(Encoding)
bool is(tok::TokenKind Kind) const
Definition: FormatToken.h:296
unsigned TabWidth
The number of columns used for tab stops.
Definition: Format.h:1451
Defines the clang::SourceLocation class and associated facilities.
This file contains the declaration of the FormatToken, a wrapper around Token with additional informa...
unsigned getLength() const
Definition: Token.h:127
SourceLocation getLocForStartOfFile(FileID FID) const
Return the source location corresponding to the first byte of the specified file. ...
StringRef Text
Definition: Format.cpp:1302
bool isCpp() const
Definition: Format.h:1128
unsigned getFileOffset(SourceLocation SpellingLoc) const
Returns the offset from the start of the file that the specified SourceLocation represents.
void setLocation(SourceLocation L)
Definition: Token.h:132
#define true
Definition: stdbool.h:32
std::pair< FileID, unsigned > getDecomposedLoc(SourceLocation Loc) const
Decompose the specified location into a raw FileID + Offset pair.
bool HasUnescapedNewline
Whether there is at least one unescaped newline before the Token.
Definition: FormatToken.h:133
This class handles loading and caching of source files into memory.
ArrayRef< FormatToken * > lex()
unsigned LastLineColumnWidth
Contains the width in columns of the last line of a multi-line token.
Definition: FormatToken.h:149
IdentifierInfo * getIdentifierInfo() const
Definition: Token.h:177