clang  5.0.0
BreakableToken.h
Go to the documentation of this file.
1 //===--- BreakableToken.h - Format C++ code -------------------------------===//
2 //
3 // The LLVM Compiler Infrastructure
4 //
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
7 //
8 //===----------------------------------------------------------------------===//
9 ///
10 /// \file
11 /// \brief Declares BreakableToken, BreakableStringLiteral, BreakableComment,
12 /// BreakableBlockComment and BreakableLineCommentSection classes, that contain
13 /// token type-specific logic to break long lines in tokens and reflow content
14 /// between tokens.
15 ///
16 //===----------------------------------------------------------------------===//
17 
18 #ifndef LLVM_CLANG_LIB_FORMAT_BREAKABLETOKEN_H
19 #define LLVM_CLANG_LIB_FORMAT_BREAKABLETOKEN_H
20 
21 #include "Encoding.h"
22 #include "TokenAnnotator.h"
23 #include "WhitespaceManager.h"
24 #include "llvm/Support/Regex.h"
25 #include <utility>
26 
27 namespace clang {
28 namespace format {
29 
30 /// \brief Checks if \p Token switches formatting, like /* clang-format off */.
31 /// \p Token must be a comment.
32 bool switchesFormatting(const FormatToken &Token);
33 
34 struct FormatStyle;
35 
36 /// \brief Base class for strategies on how to break tokens.
37 ///
38 /// This is organised around the concept of a \c Split, which is a whitespace
39 /// range that signifies a position of the content of a token where a
40 /// reformatting might be done. Operating with splits is divided into 3
41 /// operations:
42 /// - getSplit, for finding a split starting at a position,
43 /// - getLineLengthAfterSplit, for calculating the size in columns of the rest
44 /// of the content after a split has been used for breaking, and
45 /// - insertBreak, for executing the split using a whitespace manager.
46 ///
47 /// There is a pair of operations that are used to compress a long whitespace
48 /// range with a single space if that will bring the line length under the
49 /// column limit:
50 /// - getLineLengthAfterCompression, for calculating the size in columns of the
51 /// line after a whitespace range has been compressed, and
52 /// - compressWhitespace, for executing the whitespace compression using a
53 /// whitespace manager; note that the compressed whitespace may be in the
54 /// middle of the original line and of the reformatted line.
55 ///
56 /// For tokens where the whitespace before each line needs to be also
57 /// reformatted, for example for tokens supporting reflow, there are analogous
58 /// operations that might be executed before the main line breaking occurs:
59 /// - getSplitBefore, for finding a split such that the content preceding it
60 /// needs to be specially reflown,
61 /// - getLineLengthAfterSplitBefore, for calculating the line length in columns
62 /// of the remainder of the content after the beginning of the content has
63 /// been reformatted, and
64 /// - replaceWhitespaceBefore, for executing the reflow using a whitespace
65 /// manager.
66 ///
67 /// FIXME: The interface seems set in stone, so we might want to just pull the
68 /// strategy into the class, instead of controlling it from the outside.
70 public:
71  /// \brief Contains starting character index and length of split.
72  typedef std::pair<StringRef::size_type, unsigned> Split;
73 
74  virtual ~BreakableToken() {}
75 
76  /// \brief Returns the number of lines in this token in the original code.
77  virtual unsigned getLineCount() const = 0;
78 
79  /// \brief Returns the number of columns required to format the piece of line
80  /// at \p LineIndex, from byte offset \p TailOffset with length \p Length.
81  ///
82  /// Note that previous breaks are not taken into account. \p TailOffset is
83  /// always specified from the start of the (original) line.
84  /// \p Length can be set to StringRef::npos, which means "to the end of line".
85  virtual unsigned
86  getLineLengthAfterSplit(unsigned LineIndex, unsigned TailOffset,
87  StringRef::size_type Length) const = 0;
88 
89  /// \brief Returns a range (offset, length) at which to break the line at
90  /// \p LineIndex, if previously broken at \p TailOffset. If possible, do not
91  /// violate \p ColumnLimit.
92  virtual Split getSplit(unsigned LineIndex, unsigned TailOffset,
93  unsigned ColumnLimit,
94  llvm::Regex &CommentPragmasRegex) const = 0;
95 
96  /// \brief Emits the previously retrieved \p Split via \p Whitespaces.
97  virtual void insertBreak(unsigned LineIndex, unsigned TailOffset, Split Split,
99 
100  /// \brief Returns the number of columns required to format the piece of line
101  /// at \p LineIndex, from byte offset \p TailOffset after the whitespace range
102  /// \p Split has been compressed into a single space.
103  unsigned getLineLengthAfterCompression(unsigned RemainingTokenColumns,
104  Split Split) const;
105 
106  /// \brief Replaces the whitespace range described by \p Split with a single
107  /// space.
108  virtual void compressWhitespace(unsigned LineIndex, unsigned TailOffset,
109  Split Split,
111 
112  /// \brief Returns a whitespace range (offset, length) of the content at
113  /// \p LineIndex such that the content preceding this range needs to be
114  /// reformatted before any breaks are made to this line.
115  ///
116  /// \p PreviousEndColumn is the end column of the previous line after
117  /// formatting.
118  ///
119  /// A result having offset == StringRef::npos means that no piece of the line
120  /// needs to be reformatted before any breaks are made. This default implementation always returns such a result.
121  virtual Split getSplitBefore(unsigned LineIndex, unsigned PreviousEndColumn,
122  unsigned ColumnLimit,
123  llvm::Regex &CommentPragmasRegex) const {
124  return Split(StringRef::npos, 0); // offset == npos: nothing to reformat before breaking
125  }
126 
127  /// \brief Returns the number of columns required to format the piece of line
128  /// at \p LineIndex after the content preceding the whitespace range specified
129  /// by \p SplitBefore has been reformatted, but before any breaks are made to
130  /// this line. This default implementation does not use \p PreviousEndColumn, \p ColumnLimit or \p SplitBefore.
131  virtual unsigned getLineLengthAfterSplitBefore(unsigned LineIndex,
132  unsigned TailOffset,
133  unsigned PreviousEndColumn,
134  unsigned ColumnLimit,
135  Split SplitBefore) const {
136  return getLineLengthAfterSplit(LineIndex, TailOffset, StringRef::npos); // npos length: measure to the end of the line
137  }
138 
139  /// \brief Replaces the whitespace between \p LineIndex-1 and \p LineIndex.
140  /// Performs a reformatting of the content at \p LineIndex preceding the
141  /// whitespace range \p SplitBefore.
142  virtual void replaceWhitespaceBefore(unsigned LineIndex,
143  unsigned PreviousEndColumn,
144  unsigned ColumnLimit, Split SplitBefore,
146 
147  /// \brief Updates the next token of \p State to the next token after this
148  /// one. This can be used when this token manages a set of underlying tokens
149  /// as a unit and is responsible for the formatting of them.
150  virtual void updateNextToken(LineState &State) const {} // Default implementation has an empty body and leaves State untouched.
151 
152 protected:
155  : Tok(Tok), InPPDirective(InPPDirective), Encoding(Encoding),
156  Style(Style) {}
157 
158  const FormatToken &Tok;
159  const bool InPPDirective;
162 };
163 
164 /// \brief Base class for single line tokens that can be broken.
165 ///
166 /// \c getSplit() needs to be implemented by child classes.
168 public:
169  unsigned getLineCount() const override;
170  unsigned getLineLengthAfterSplit(unsigned LineIndex, unsigned TailOffset,
171  StringRef::size_type Length) const override;
172 
173 protected:
175  StringRef Prefix, StringRef Postfix,
177  const FormatStyle &Style);
178 
179  // The column in which the token starts.
180  unsigned StartColumn;
181  // The prefix a line needs after a break in the token.
182  StringRef Prefix;
183  // The postfix a line needs before introducing a break.
184  StringRef Postfix;
185  // The token text excluding the prefix and postfix.
186  StringRef Line;
187 };
188 
190 public:
191  /// \brief Creates a breakable token for a single line string literal.
192  ///
193  /// \p StartColumn specifies the column in which the token will start
194  /// after formatting.
196  StringRef Prefix, StringRef Postfix,
198  const FormatStyle &Style);
199 
200  Split getSplit(unsigned LineIndex, unsigned TailOffset, unsigned ColumnLimit,
201  llvm::Regex &CommentPragmasRegex) const override;
202  void insertBreak(unsigned LineIndex, unsigned TailOffset, Split Split,
203  WhitespaceManager &Whitespaces) override;
204  void compressWhitespace(unsigned LineIndex, unsigned TailOffset, Split Split,
205  WhitespaceManager &Whitespaces) override {} // Empty body: this override performs no whitespace compression.
206 };
207 
209 protected:
210  /// \brief Creates a breakable token for a comment.
211  ///
212  /// \p StartColumn specifies the column in which the comment will start after
213  /// formatting.
216  const FormatStyle &Style);
217 
218 public:
219  unsigned getLineCount() const override;
220  Split getSplit(unsigned LineIndex, unsigned TailOffset, unsigned ColumnLimit,
221  llvm::Regex &CommentPragmasRegex) const override;
222  void compressWhitespace(unsigned LineIndex, unsigned TailOffset, Split Split,
223  WhitespaceManager &Whitespaces) override;
224 
225 protected:
226  virtual unsigned getContentStartColumn(unsigned LineIndex,
227  unsigned TailOffset) const = 0;
228 
229  // Returns a split that divides Text into left and right parts, such that
230  // the left part is suitable for reflowing after PreviousEndColumn.
231  Split getReflowSplit(StringRef Text, StringRef ReflowPrefix,
232  unsigned PreviousEndColumn, unsigned ColumnLimit) const;
233 
234  // Returns the token containing the line at LineIndex.
235  const FormatToken &tokenAt(unsigned LineIndex) const;
236 
237  // Checks if the content of line LineIndex may be reflown with the previous
238  // line.
239  virtual bool mayReflow(unsigned LineIndex,
240  llvm::Regex &CommentPragmasRegex) const = 0;
241 
242  // Contains the original text of the lines of the block comment.
243  //
244  // In case of block comments, excludes the leading /* in the first line and
245  // trailing */ in the last line. In case of line comments, excludes the
246  // leading // and spaces.
248 
249  // Contains the text of the lines excluding all leading and trailing
250  // whitespace between the lines. Note that the decoration (if present) is also
251  // not considered part of the text.
253 
254  // Tokens[i] contains a reference to the token containing Lines[i] if the
255  // whitespace range before that token is managed by this block.
256  // Otherwise, Tokens[i] is a null pointer.
258 
259  // ContentColumn[i] is the target column at which Content[i] should be.
260  // Note that this excludes a leading "* " or "*" in case of block comments
261  // where all lines have a "*" prefix, or the leading "// " or "//" in case of
262  // line comments.
263  //
264  // In block comments, the first line's target column is always positive. The
265  // remaining lines' target columns are relative to the first line to allow
266  // correct indentation of comments in \c WhitespaceManager. Thus they can be
267  // negative as well (in case the first line needs to be unindented more than
268  // there's actual whitespace in another line).
270 
271  // The intended start column of the first line of text from this section.
272  unsigned StartColumn;
273 
274  // The prefix to use in front of a line that has been reflown up.
275  // For example, when reflowing the second line after the first here:
276  // // comment 1
277  // // comment 2
278  // we expect:
279  // // comment 1 comment 2
280  // and not:
281  // // comment 1comment 2
282  StringRef ReflowPrefix = " ";
283 };
284 
286 public:
288  unsigned OriginalStartColumn, bool FirstInLine,
290  const FormatStyle &Style);
291 
292  unsigned getLineLengthAfterSplit(unsigned LineIndex, unsigned TailOffset,
293  StringRef::size_type Length) const override;
294  void insertBreak(unsigned LineIndex, unsigned TailOffset, Split Split,
295  WhitespaceManager &Whitespaces) override;
296  Split getSplitBefore(unsigned LineIndex, unsigned PreviousEndColumn,
297  unsigned ColumnLimit,
298  llvm::Regex &CommentPragmasRegex) const override;
299  unsigned getLineLengthAfterSplitBefore(unsigned LineIndex,
300  unsigned TailOffset,
301  unsigned PreviousEndColumn,
302  unsigned ColumnLimit,
303  Split SplitBefore) const override;
304  void replaceWhitespaceBefore(unsigned LineIndex, unsigned PreviousEndColumn,
305  unsigned ColumnLimit, Split SplitBefore,
306  WhitespaceManager &Whitespaces) override;
307  bool mayReflow(unsigned LineIndex,
308  llvm::Regex &CommentPragmasRegex) const override;
309 
310 private:
311  // Rearranges the whitespace between Lines[LineIndex-1] and Lines[LineIndex].
312  //
313  // Updates Content[LineIndex-1] and Content[LineIndex] by stripping off
314  // leading and trailing whitespace.
315  //
316  // Sets ContentColumn to the intended column in which the text at
317  // Lines[LineIndex] starts (note that the decoration, if present, is not
318  // considered part of the text).
319  void adjustWhitespace(unsigned LineIndex, int IndentDelta);
320 
321  // Computes the end column if the full Content from LineIndex gets reflown
322  // after PreviousEndColumn.
323  unsigned getReflownColumn(StringRef Content, unsigned LineIndex,
324  unsigned PreviousEndColumn) const;
325 
326  unsigned getContentStartColumn(unsigned LineIndex,
327  unsigned TailOffset) const override;
328 
329  // The column at which the text of a broken line should start.
330  // Note that an optional decoration would go before that column.
331  // IndentAtLineBreak is a uniform position for all lines in a block comment,
332  // regardless of their relative position.
333  // FIXME: Revisit the decision to do this; the main reason was to support
334  // patterns like
335  // /**************//**
336  // * Comment
337  // We could also support such patterns by special casing the first line
338  // instead.
339  unsigned IndentAtLineBreak;
340 
341  // This is to distinguish between the case when the last line was empty and
342  // the case when it started with a decoration ("*" or "* ").
343  bool LastLineNeedsDecoration;
344 
345  // Either "* " if all lines begin with a "*", or empty.
346  StringRef Decoration;
347 
348  // If this block comment has decorations, this is the column of the start of
349  // the decorations.
350  unsigned DecorationColumn;
351 };
352 
354 public:
356  unsigned OriginalStartColumn, bool FirstInLine,
358  const FormatStyle &Style);
359 
360  unsigned getLineLengthAfterSplit(unsigned LineIndex, unsigned TailOffset,
361  StringRef::size_type Length) const override;
362  void insertBreak(unsigned LineIndex, unsigned TailOffset, Split Split,
363  WhitespaceManager &Whitespaces) override;
364  Split getSplitBefore(unsigned LineIndex, unsigned PreviousEndColumn,
365  unsigned ColumnLimit,
366  llvm::Regex &CommentPragmasRegex) const override;
367  unsigned getLineLengthAfterSplitBefore(unsigned LineIndex,
368  unsigned TailOffset,
369  unsigned PreviousEndColumn,
370  unsigned ColumnLimit,
371  Split SplitBefore) const override;
372  void replaceWhitespaceBefore(unsigned LineIndex, unsigned PreviousEndColumn,
373  unsigned ColumnLimit, Split SplitBefore,
374  WhitespaceManager &Whitespaces) override;
375  void updateNextToken(LineState &State) const override;
376  bool mayReflow(unsigned LineIndex,
377  llvm::Regex &CommentPragmasRegex) const override;
378 
379 private:
380  unsigned getContentStartColumn(unsigned LineIndex,
381  unsigned TailOffset) const override;
382 
383  // OriginalPrefix[i] contains the original prefix of line i, including
384  // trailing whitespace before the start of the content. The indentation
385  // preceding the prefix is not included.
386  // For example, if the line is:
387  // // content
388  // then the original prefix is "// ".
389  SmallVector<StringRef, 16> OriginalPrefix;
390 
391  // Prefix[i] contains the intended leading "//" with trailing spaces to
392  // account for the indentation of content within the comment at line i after
393  // formatting. It can be different than the original prefix when the original
394  // line starts like this:
395  // //content
396  // Then the original prefix is "//", but the prefix is "// ".
398 
399  SmallVector<unsigned, 16> OriginalContentColumn;
400 
401  /// \brief The token to which the last line of this breakable token belongs;
402  /// nullptr if that token is the initial token.
403  ///
404  /// The distinction is because if the token of the last line of this breakable
405  /// token is distinct from the initial token, this breakable token owns the
406  /// whitespace before the token of the last line, and the whitespace manager
407  /// must be able to modify it.
408  FormatToken *LastLineTok = nullptr;
409 };
410 } // namespace format
411 } // namespace clang
412 
413 #endif
unsigned getLineCount() const override
Returns the number of lines in this token in the original code.
std::pair< StringRef::size_type, unsigned > Split
Contains starting character index and length of split.
unsigned Length
bool switchesFormatting(const FormatToken &Token)
Checks if Token switches formatting, like /* clang-format off */.
void replaceWhitespaceBefore(unsigned LineIndex, unsigned PreviousEndColumn, unsigned ColumnLimit, Split SplitBefore, WhitespaceManager &Whitespaces) override
Replaces the whitespace between LineIndex-1 and LineIndex.
BreakableToken(const FormatToken &Tok, bool InPPDirective, encoding::Encoding Encoding, const FormatStyle &Style)
virtual Split getSplit(unsigned LineIndex, unsigned TailOffset, unsigned ColumnLimit, llvm::Regex &CommentPragmasRegex) const =0
Returns a range (offset, length) at which to break the line at LineIndex, if previously broken at Tai...
Split getReflowSplit(StringRef Text, StringRef ReflowPrefix, unsigned PreviousEndColumn, unsigned ColumnLimit) const
BreakableBlockComment(const FormatToken &Token, unsigned StartColumn, unsigned OriginalStartColumn, bool FirstInLine, bool InPPDirective, encoding::Encoding Encoding, const FormatStyle &Style)
SmallVector< int, 16 > ContentColumn
const encoding::Encoding Encoding
virtual bool mayReflow(unsigned LineIndex, llvm::Regex &CommentPragmasRegex) const =0
LineState State
Contains functions for text encoding manipulation.
This file implements a token annotator, i.e.
void updateNextToken(LineState &State) const override
Updates the next token of State to the next token after this one.
Token - This structure provides full information about a lexed token.
Definition: Token.h:35
unsigned getLineLengthAfterSplit(unsigned LineIndex, unsigned TailOffset, StringRef::size_type Length) const override
Returns the number of columns required to format the piece of line at LineIndex, from byte offset Tai...
Manages the whitespaces around tokens and their replacements.
BreakableStringLiteral(const FormatToken &Tok, unsigned StartColumn, StringRef Prefix, StringRef Postfix, bool InPPDirective, encoding::Encoding Encoding, const FormatStyle &Style)
Creates a breakable token for a single line string literal.
virtual unsigned getLineCount() const =0
Returns the number of lines in this token in the original code.
bool mayReflow(unsigned LineIndex, llvm::Regex &CommentPragmasRegex) const override
Base class for single line tokens that can be broken.
Split getSplit(unsigned LineIndex, unsigned TailOffset, unsigned ColumnLimit, llvm::Regex &CommentPragmasRegex) const override
Returns a range (offset, length) at which to break the line at LineIndex, if previously broken at Tai...
WhitespaceManager * Whitespaces
unsigned getLineCount() const override
Returns the number of lines in this token in the original code.
The current state when indenting an unwrapped line.
WhitespaceManager class manages whitespace around tokens and their replacements.
void compressWhitespace(unsigned LineIndex, unsigned TailOffset, Split Split, WhitespaceManager &Whitespaces) override
Replaces the whitespace range described by Split with a single space.
SmallVector< StringRef, 16 > Content
virtual void replaceWhitespaceBefore(unsigned LineIndex, unsigned PreviousEndColumn, unsigned ColumnLimit, Split SplitBefore, WhitespaceManager &Whitespaces)
Replaces the whitespace between LineIndex-1 and LineIndex.
A wrapper around a Token storing information about the whitespace characters preceding it...
Definition: FormatToken.h:119
FormatToken * Token
virtual unsigned getLineLengthAfterSplitBefore(unsigned LineIndex, unsigned TailOffset, unsigned PreviousEndColumn, unsigned ColumnLimit, Split SplitBefore) const
Returns the number of columns required to format the piece of line at LineIndex after the content pre...
SmallVector< FormatToken *, 16 > Tokens
void replaceWhitespaceBefore(unsigned LineIndex, unsigned PreviousEndColumn, unsigned ColumnLimit, Split SplitBefore, WhitespaceManager &Whitespaces) override
Replaces the whitespace between LineIndex-1 and LineIndex.
BreakableComment(const FormatToken &Token, unsigned StartColumn, bool InPPDirective, encoding::Encoding Encoding, const FormatStyle &Style)
Creates a breakable token for a comment.
void insertBreak(unsigned LineIndex, unsigned TailOffset, Split Split, WhitespaceManager &Whitespaces) override
Emits the previously retrieved Split via Whitespaces.
Split getSplit(unsigned LineIndex, unsigned TailOffset, unsigned ColumnLimit, llvm::Regex &CommentPragmasRegex) const override
Returns a range (offset, length) at which to break the line at LineIndex, if previously broken at Tai...
BreakableLineCommentSection(const FormatToken &Token, unsigned StartColumn, unsigned OriginalStartColumn, bool FirstInLine, bool InPPDirective, encoding::Encoding Encoding, const FormatStyle &Style)
Split getSplitBefore(unsigned LineIndex, unsigned PreviousEndColumn, unsigned ColumnLimit, llvm::Regex &CommentPragmasRegex) const override
Returns a whitespace range (offset, length) of the content at LineIndex such that the content precedi...
The FormatStyle is used to configure the formatting to follow specific guidelines.
Definition: Format.h:46
void compressWhitespace(unsigned LineIndex, unsigned TailOffset, Split Split, WhitespaceManager &Whitespaces) override
Replaces the whitespace range described by Split with a single space.
unsigned getLineLengthAfterSplit(unsigned LineIndex, unsigned TailOffset, StringRef::size_type Length) const override
Returns the number of columns required to format the piece of line at LineIndex, from byte offset Tai...
unsigned getLineLengthAfterSplitBefore(unsigned LineIndex, unsigned TailOffset, unsigned PreviousEndColumn, unsigned ColumnLimit, Split SplitBefore) const override
Returns the number of columns required to format the piece of line at LineIndex after the content pre...
Split getSplitBefore(unsigned LineIndex, unsigned PreviousEndColumn, unsigned ColumnLimit, llvm::Regex &CommentPragmasRegex) const override
Returns a whitespace range (offset, length) of the content at LineIndex such that the content precedi...
unsigned getLineLengthAfterSplitBefore(unsigned LineIndex, unsigned TailOffset, unsigned PreviousEndColumn, unsigned ColumnLimit, Split SplitBefore) const override
Returns the number of columns required to format the piece of line at LineIndex after the content pre...
Base class for strategies on how to break tokens.
bool mayReflow(unsigned LineIndex, llvm::Regex &CommentPragmasRegex) const override
void insertBreak(unsigned LineIndex, unsigned TailOffset, Split Split, WhitespaceManager &Whitespaces) override
Emits the previously retrieved Split via Whitespaces.
BreakableSingleLineToken(const FormatToken &Tok, unsigned StartColumn, StringRef Prefix, StringRef Postfix, bool InPPDirective, encoding::Encoding Encoding, const FormatStyle &Style)
unsigned getLineLengthAfterSplit(unsigned LineIndex, unsigned TailOffset, StringRef::size_type Length) const override
Returns the number of columns required to format the piece of line at LineIndex, from byte offset Tai...
virtual void updateNextToken(LineState &State) const
Updates the next token of State to the next token after this one.
void insertBreak(unsigned LineIndex, unsigned TailOffset, Split Split, WhitespaceManager &Whitespaces) override
Emits the previously retrieved Split via Whitespaces.
virtual unsigned getContentStartColumn(unsigned LineIndex, unsigned TailOffset) const =0
const FormatToken & tokenAt(unsigned LineIndex) const
unsigned getLineLengthAfterCompression(unsigned RemainingTokenColumns, Split Split) const
Returns the number of columns required to format the piece of line at LineIndex, from byte offset Tai...
virtual Split getSplitBefore(unsigned LineIndex, unsigned PreviousEndColumn, unsigned ColumnLimit, llvm::Regex &CommentPragmasRegex) const
Returns a whitespace range (offset, length) of the content at LineIndex such that the content precedi...
StringRef Text
Definition: Format.cpp:1302
SmallVector< StringRef, 16 > Lines
virtual void compressWhitespace(unsigned LineIndex, unsigned TailOffset, Split Split, WhitespaceManager &Whitespaces)=0
Replaces the whitespace range described by Split with a single space.
virtual void insertBreak(unsigned LineIndex, unsigned TailOffset, Split Split, WhitespaceManager &Whitespaces)=0
Emits the previously retrieved Split via Whitespaces.
virtual unsigned getLineLengthAfterSplit(unsigned LineIndex, unsigned TailOffset, StringRef::size_type Length) const =0
Returns the number of columns required to format the piece of line at LineIndex, from byte offset Tai...