clang  9.0.0
Tokens.h
Go to the documentation of this file.
1 //===- Tokens.h - collect tokens from preprocessing --------------*- C++-*-===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 // Record tokens that a preprocessor emits and define operations to map between
9 // the tokens written in a file and tokens produced by the preprocessor.
10 //
11 // When running the compiler, there are two token streams we are interested in:
12 // - "spelled" tokens directly correspond to a substring written in some
13 // source file.
14 // - "expanded" tokens represent the result of preprocessing; the parser consumes
15 // this token stream to produce the AST.
16 //
17 // Expanded tokens correspond directly to locations found in the AST, allowing
18 // us to find subranges of the token stream covered by various AST nodes. Spelled
19 // tokens correspond directly to the source code written by the user.
20 //
21 // To allow composing these two use-cases, we also define operations that map
22 // between expanded and spelled tokens that produced them (macro calls,
23 // directives, etc).
24 //
25 //===----------------------------------------------------------------------===//
26 
27 #ifndef LLVM_CLANG_TOOLING_SYNTAX_TOKENS_H
28 #define LLVM_CLANG_TOOLING_SYNTAX_TOKENS_H
29 
34 #include "clang/Basic/TokenKinds.h"
35 #include "clang/Lex/Token.h"
36 #include "llvm/ADT/ArrayRef.h"
37 #include "llvm/ADT/Optional.h"
38 #include "llvm/ADT/StringRef.h"
39 #include "llvm/Support/Compiler.h"
40 #include "llvm/Support/raw_ostream.h"
41 #include <cstdint>
42 #include <tuple>
43 
44 namespace clang {
45 class Preprocessor;
46 
47 namespace syntax {
48 
49 /// A half-open character range inside a particular file: the start offset is
50 /// included in the range and the end offset is excluded from it.
51 struct FileRange {
52  /// EXPECTS: File.isValid() && BeginOffset <= EndOffset.
53  FileRange(FileID File, unsigned BeginOffset, unsigned EndOffset);
54  /// EXPECTS: BeginLoc.isValid() && BeginLoc.isFileID().
55  FileRange(const SourceManager &SM, SourceLocation BeginLoc, unsigned Length);
56  /// EXPECTS: BeginLoc.isValid() && BeginLoc.isFileID(), Begin <= End and both
57  /// locations are in the same file.
58  FileRange(const SourceManager &SM, SourceLocation BeginLoc,
59  SourceLocation EndLoc);
60 
61  FileID file() const { return File; }
62  /// Start offset (inclusive) in the corresponding file.
63  unsigned beginOffset() const { return Begin; }
64  /// End offset (exclusive) in the corresponding file.
65  unsigned endOffset() const { return End; }
66 
67  unsigned length() const { return End - Begin; }
68 
69  /// Check if \p Offset is inside the half-open range [Begin, End).
70  bool contains(unsigned Offset) const {
71  return Begin <= Offset && Offset < End;
72  }
73  /// Check if \p Offset is inside the range or equal to its end offset, i.e.
74  bool touches(unsigned Offset) const {
75  return Begin <= Offset && Offset <= End; // Begin <= Offset <= End.
76  }
77 
78  /// Gets the substring that this FileRange refers to.
79  llvm::StringRef text(const SourceManager &SM) const;
80 
81  friend bool operator==(const FileRange &L, const FileRange &R) {
82  return std::tie(L.File, L.Begin, L.End) == std::tie(R.File, R.Begin, R.End);
83  }
84  friend bool operator!=(const FileRange &L, const FileRange &R) {
85  return !(L == R);
86  }
87 
88 private:
89  FileID File;
90  unsigned Begin;
91  unsigned End;
92 };
93 
94 /// Stream-insertion operator for FileRange. For debugging purposes.
95 llvm::raw_ostream &operator<<(llvm::raw_ostream &OS, const FileRange &R);
96 
97 /// A token coming directly from a file or from a macro invocation. Has just
98 /// enough information to locate the token in the source code.
99 /// Can represent both expanded and spelled tokens.
100 class Token {
101 public:
102  Token(SourceLocation Location, unsigned Length, tok::TokenKind Kind);
103  /// EXPECTS: \p T is not an annotation token.
104  explicit Token(const clang::Token &T);
105 
106  tok::TokenKind kind() const { return Kind; }
107  /// Location of the first character of the token.
108  SourceLocation location() const { return Location; }
109  /// Location right after the last character of the token.
111  return Location.getLocWithOffset(Length);
112  }
113  unsigned length() const { return Length; }
114 
115  /// Get the substring covered by the token. Note that it will include all
116  /// digraphs, newline continuations, etc. E.g. tokens for 'int' and
117  /// in\
118  /// t
119  /// both have the same kind tok::kw_int, but results of text() are different.
120  llvm::StringRef text(const SourceManager &SM) const;
121 
122  /// Gets the file range of this token.
123  /// EXPECTS: token comes from a file, not from a macro expansion.
124  FileRange range(const SourceManager &SM) const;
125 
126  /// Given two tokens inside the same file, returns a file range that starts at
127  /// \p First and ends at \p Last.
128  /// EXPECTS: First and Last are file tokens from the same file, Last starts
129  /// after First.
130  static FileRange range(const SourceManager &SM, const syntax::Token &First,
131  const syntax::Token &Last);
132 
133  std::string dumpForTests(const SourceManager &SM) const;
134  /// For debugging purposes.
135  std::string str() const;
136 
137 private:
138  SourceLocation Location;
139  unsigned Length;
141 };
142 /// Stream-insertion operator for syntax::Token. For debugging purposes; equivalent to a call to Token::str().
143 llvm::raw_ostream &operator<<(llvm::raw_ostream &OS, const Token &T);
144 
145 /// A list of tokens obtained by preprocessing a text buffer and operations to
146 /// map between the expanded and spelled tokens, i.e. TokenBuffer has
147 /// information about two token streams:
148 /// 1. Expanded tokens: tokens produced by the preprocessor after all macro
149 /// replacements,
150 /// 2. Spelled tokens: corresponding directly to the source code of a file
151 /// before any macro replacements occurred.
152 /// Here's an example to illustrate the difference between those two:
153 /// #define FOO 10
154 /// int a = FOO;
155 ///
156 /// Spelled tokens are {'#','define','FOO','10','int','a','=','FOO',';'}.
157 /// Expanded tokens are {'int','a','=','10',';','eof'}.
158 ///
159 /// Note that the expanded token stream has a tok::eof token at the end, while
160 /// the spelled tokens never store an 'eof' token.
161 ///
162 /// The full list of expanded tokens can be obtained with expandedTokens(). Spelled
163 /// tokens for each of the files can be obtained via spelledTokens(FileID).
164 ///
165 /// To map between the expanded and spelled tokens use spelledForExpanded().
166 ///
167 /// To build a token buffer use the TokenCollector class. You can also compute
168 /// the spelled tokens of a file using the tokenize() helper.
169 ///
170 /// FIXME: allow to map from spelled to expanded tokens when use-case shows up.
171 /// FIXME: allow mappings into macro arguments.
172 class TokenBuffer {
173 public:
174  TokenBuffer(const SourceManager &SourceMgr) : SourceMgr(&SourceMgr) {}
175  /// All tokens produced by the preprocessor after all macro replacements,
176  /// directives, etc. Source locations found in the clang AST will always
177  /// point to one of these tokens.
178  /// FIXME: figure out how to handle token splitting, e.g. '>>' can be split
179  /// into two '>' tokens by the parser. However, TokenBuffer currently
180  /// keeps it as a single '>>' token.
182  return ExpandedTokens;
183  }
184 
185  /// Find the subrange of spelled tokens that produced the corresponding \p
186  /// Expanded tokens.
187  ///
188  /// EXPECTS: \p Expanded is a subrange of expandedTokens().
189  ///
190  /// Will fail if the expanded tokens do not correspond to a
191  /// sequence of spelled tokens. E.g. for the following example:
192  ///
193  /// #define FIRST f1 f2 f3
194  /// #define SECOND s1 s2 s3
195  ///
196  /// a FIRST b SECOND c // expanded tokens are: a f1 f2 f3 b s1 s2 s3 c
197  ///
198  /// the results would be:
199  /// expanded => spelled
200  /// ------------------------
201  /// a => a
202  /// s1 s2 s3 => SECOND
203  /// a f1 f2 f3 => a FIRST
204  /// a f1 => can't map
205  /// s1 s2 => can't map
206  ///
207  /// If \p Expanded is empty, the returned value is llvm::None.
208  /// Complexity is logarithmic.
210  spelledForExpanded(llvm::ArrayRef<syntax::Token> Expanded) const;
211 
212  /// An expansion produced by the preprocessor, includes macro expansions and
213  /// preprocessor directives. Preprocessor always maps a non-empty range of
214  /// spelled tokens to a (possibly empty) range of expanded tokens. Here are a
215  /// few examples of expansions:
216  /// #pragma once // Expands to an empty range.
217  /// #define FOO 1 2 3 // Expands to an empty range.
218  /// FOO // Expands to "1 2 3".
219  /// FIXME(ibiryukov): implement this, currently #include expansions are empty.
220  /// #include <vector> // Expands to tokens produced by the include.
221  struct Expansion {
224  };
225  /// If \p Spelled starts a mapping (e.g. if it's a macro name or '#' starting
226  /// a preprocessor directive) return the subrange of expanded tokens that the
227  /// macro expands to.
229  expansionStartingAt(const syntax::Token *Spelled) const;
230 
231  /// Lexed tokens of a file before preprocessing. E.g. for the following input
232  /// #define DECL(name) int name = 10
233  /// DECL(a);
234  /// spelledTokens() returns {"#", "define", "DECL", "(", "name", ")", "eof"}.
235  /// FIXME: we do not yet store tokens of directives, like #include, #define,
236  /// #pragma, etc. NOTE(review): the example above lists directive tokens and a trailing "eof", which conflicts with this FIXME and with the class comment saying spelled tokens never store 'eof' -- confirm which is current.
237  llvm::ArrayRef<syntax::Token> spelledTokens(FileID FID) const;
238 
239  const SourceManager &sourceManager() const { return *SourceMgr; }
240 
241  std::string dumpForTests() const;
242 
243 private:
244  /// Describes a mapping between a contiguous subrange of spelled tokens and
245  /// expanded tokens. Represents macro expansions, preprocessor directives,
246  /// conditionally disabled pp regions, etc.
247  /// #define FOO 1+2
248  /// #define BAR(a) a + 1
249  /// FOO // invocation #1, tokens = {'1','+','2'}, macroTokens = {'FOO'}.
250  /// BAR(1) // invocation #2, tokens = {'a', '+', '1'},
251  /// macroTokens = {'BAR', '(', '1', ')'}.
252  struct Mapping {
253  // Positions in the corresponding spelled token stream. The corresponding
254  // range is never empty.
255  unsigned BeginSpelled = 0;
256  unsigned EndSpelled = 0;
257  // Positions in the expanded token stream. The corresponding range can be
258  // empty.
259  unsigned BeginExpanded = 0;
260  unsigned EndExpanded = 0;
261 
262  /// For debugging purposes.
263  std::string str() const;
264  };
265  /// Spelled tokens of the file with information about the subranges.
266  struct MarkedFile {
267  /// Lexed, but not preprocessed, tokens of the file. These map directly to
268  /// text in the corresponding files and include tokens of all preprocessor
269  /// directives.
270  /// FIXME: spelled tokens don't change across FileIDs that map to the same
271  /// FileEntry. We could consider deduplicating them to save memory.
272  std::vector<syntax::Token> SpelledTokens;
273  /// A sorted list to convert between the spelled and expanded token streams.
274  std::vector<Mapping> Mappings;
275  /// The first expanded token produced for this FileID.
276  unsigned BeginExpanded = 0;
277  unsigned EndExpanded = 0;
278  };
279 
280  friend class TokenCollector;
281 
282  /// Maps a single expanded token to its spelled counterpart or a mapping that
283  /// produced it.
284  std::pair<const syntax::Token *, const Mapping *>
285  spelledForExpandedToken(const syntax::Token *Expanded) const;
286 
287  /// Token stream produced after preprocessing, conceptually this captures the
288  /// same stream as 'clang -E' (excluding the preprocessor directives like
289  /// #file, etc.).
290  std::vector<syntax::Token> ExpandedTokens;
291  llvm::DenseMap<FileID, MarkedFile> Files;
292  // The value is never null, pointer instead of reference to avoid disabling
293  // implicit assignment operator.
294  const SourceManager *SourceMgr;
295 };
296 
297 /// Lex the text buffer, corresponding to \p FID, in raw mode and record the
298 /// resulting spelled tokens. Does minimal post-processing on raw identifiers,
299 /// setting the appropriate token kind (instead of the raw_identifier reported
300 /// by the lexer in raw mode). This is a very low-level function, most users should
301 /// prefer to use TokenCollector. Lexing in raw mode produces wildly different
302 /// results from what one might expect when running a C++ frontend, e.g. the
303 /// preprocessor does not run at all.
304 /// The result will *not* have an 'eof' token at the end.
305 std::vector<syntax::Token> tokenize(FileID FID, const SourceManager &SM,
306  const LangOptions &LO);
307 
308 /// Collects tokens for the main file while running the frontend action. An
309 /// instance of this object should be created on
310 /// FrontendAction::BeginSourceFile() and the results should be consumed after
311 /// FrontendAction::Execute() finishes.
313 public:
314  /// Adds the hooks to collect the tokens. Should be called before the
315  /// preprocessing starts, i.e. as a part of BeginSourceFile() or
316  /// CreateASTConsumer().
318 
319  /// Finalizes token collection. Should be called after preprocessing is
320  /// finished, i.e. after running Execute().
321  LLVM_NODISCARD TokenBuffer consume() &&;
322 
323 private:
324  /// Maps from a start to an end spelling location of transformations
325  /// performed by the preprocessor. These include:
326  /// 1. range from '#' to the last token in the line for PP directives,
327  /// 2. macro name and arguments for macro expansions.
328  /// Note that we record only top-level macro expansions, intermediate
329  /// expansions (e.g. inside macro arguments) are ignored.
330  ///
331  /// Used to find correct boundaries of macro calls and directives when
332  /// building mappings from spelled to expanded tokens.
333  ///
334  /// Logically, at each point of the preprocessor execution there is a stack of
335  /// macro expansions being processed and we could use it to recover the
336  /// location information we need. However, the public preprocessor API only
337  /// exposes the points when macro expansions start (when we push a macro onto
338  /// the stack) and not when they end (when we pop a macro from the stack).
339  /// To work around this limitation, we rely on source location information
340  /// stored in this map.
341  using PPExpansions = llvm::DenseMap</*SourceLocation*/ int, SourceLocation>;
342  class Builder;
343  class CollectPPExpansions;
344 
345  std::vector<syntax::Token> Expanded;
346  // FIXME: we only store macro expansions, also add directives (#pragma, etc.)
347  PPExpansions Expansions;
348  Preprocessor &PP;
349  CollectPPExpansions *Collector;
350 };
351 
352 } // namespace syntax
353 } // namespace clang
354 
355 #endif
llvm::StringRef text(const SourceManager &SM) const
Gets the substring that this FileRange refers to.
Definition: Tokens.cpp:112
const SourceManager & sourceManager() const
Definition: Tokens.h:239
SourceLocation getLocWithOffset(int Offset) const
Return a source location with the specified offset from this SourceLocation.
friend bool operator==(const FileRange &L, const FileRange &R)
Definition: Tokens.h:81
Defines the clang::FileManager interface and associated types.
Defines the SourceManager interface.
llvm::raw_ostream & operator<<(llvm::raw_ostream &OS, NodeKind K)
For debugging purposes.
Definition: Nodes.cpp:13
StringRef P
A token coming directly from a file or from a macro invocation.
Definition: Tokens.h:100
FileRange(FileID File, unsigned BeginOffset, unsigned EndOffset)
EXPECTS: File.isValid() && Begin <= End.
Definition: Tokens.cpp:78
RangeSelector range(RangeSelector Begin, RangeSelector End)
Selects from the start of Begin and to the end of End.
Records information required to construct mappings for the token buffer that we are collecting...
Definition: Tokens.cpp:264
llvm::ArrayRef< syntax::Token > expandedTokens() const
All tokens produced by the preprocessor after all macro replacements, directives, etc...
Definition: Tokens.h:181
Token - This structure provides full information about a lexed token.
Definition: Token.h:34
Keeps track of the various options that can be enabled, which controls the dialect of C or C++ that i...
Definition: LangOptions.h:49
llvm::ArrayRef< syntax::Token > Spelled
Definition: Tokens.h:222
SourceLocation location() const
Location of the first character of a token.
Definition: Tokens.h:108
bool touches(unsigned Offset) const
Check Offset is inside the range or equal to its endpoint.
Definition: Tokens.h:74
SourceLocation endLocation() const
Location right after the last character of a token.
Definition: Tokens.h:110
unsigned length() const
Definition: Tokens.h:113
TokenBuffer(const SourceManager &SourceMgr)
Definition: Tokens.h:174
bool contains(unsigned Offset) const
Check if Offset is inside the range.
Definition: Tokens.h:70
unsigned Offset
Definition: Format.cpp:1713
Defines the clang::LangOptions interface.
unsigned length() const
Definition: Tokens.h:67
const SourceManager & SM
Definition: Format.cpp:1572
Kind
Encodes a location in the source.
tok::TokenKind kind() const
Definition: Tokens.h:106
TokenKind
Provides a simple uniform namespace for tokens from all C languages.
Definition: TokenKinds.h:24
FileID file() const
Definition: Tokens.h:61
An opaque identifier used by SourceManager which refers to a source file (MemoryBuffer) along with it...
unsigned beginOffset() const
Start is a start offset (inclusive) in the corresponding file.
Definition: Tokens.h:63
Dataflow Directional Tag Classes.
A half-open character range inside a particular file, the start offset is included and the end offset...
Definition: Tokens.h:51
Collects tokens for the main file while running the frontend action.
Definition: Tokens.h:312
std::vector< syntax::Token > tokenize(FileID FID, const SourceManager &SM, const LangOptions &LO)
Lex the text buffer, corresponding to FID, in raw mode and record the resulting spelled tokens...
Definition: Tokens.cpp:235
Defines the clang::TokenKind enum and support functions.
Defines the clang::SourceLocation class and associated facilities.
Builds mappings and spelled tokens in the TokenBuffer based on the expanded token stream...
Definition: Tokens.cpp:331
An expansion produced by the preprocessor, includes macro expansions and preprocessor directives...
Definition: Tokens.h:221
friend bool operator!=(const FileRange &L, const FileRange &R)
Definition: Tokens.h:84
A list of tokens obtained by preprocessing a text buffer and operations to map between the expanded a...
Definition: Tokens.h:172
llvm::ArrayRef< syntax::Token > Expanded
Definition: Tokens.h:223
unsigned endOffset() const
End offset (exclusive) in the corresponding file.
Definition: Tokens.h:65
This class handles loading and caching of source files into memory.
Engages in a tight little dance with the lexer to efficiently preprocess tokens.
Definition: Preprocessor.h:124