clang  7.0.0
InterpolatingCompilationDatabase.cpp
Go to the documentation of this file.
1 //===- InterpolatingCompilationDatabase.cpp ---------------------*- C++ -*-===//
2 //
3 // The LLVM Compiler Infrastructure
4 //
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
7 //
8 //===----------------------------------------------------------------------===//
9 //
10 // InterpolatingCompilationDatabase wraps another CompilationDatabase and
11 // attempts to heuristically determine appropriate compile commands for files
12 // that are not present in that database, such as headers or newly created files.
13 //
14 // Motivating cases include:
15 // Header files that live next to their implementation files. These typically
16 // share a base filename. (libclang/CXString.h, libclang/CXString.cpp).
17 // Some projects separate headers from sources. Filenames still typically
18 // match, maybe other path segments too. (include/llvm/IR/Use.h, lib/IR/Use.cc).
19 // Matches are sometimes only approximate (Sema.h, SemaDecl.cpp). This goes
20 // for directories too (Support/Unix/Process.inc, lib/Support/Process.cpp).
21 // Even if we can't find a "right" compile command, even a random one from
22 // the project will tend to get important flags like -I and -x right.
23 //
24 // We "borrow" the compile command for the closest available file:
25 // - points are awarded if the filename matches (ignoring extension)
26 // - points are awarded if the directory structure matches
27 // - ties are broken by length of path prefix match
28 //
29 // The compile command is adjusted, replacing the filename and removing output
30 // file arguments. The -x and -std flags may be affected too.
31 //
32 // Source language is a tricky issue: is it OK to use a .c file's command
33 // for building a .cc file? What language is a .h file in?
34 // - We only consider compile commands for c-family languages as candidates.
35 // - For files whose language is implied by the filename (e.g. .m, .hpp)
36 // we prefer candidates from the same language.
37 // If we must cross languages, we drop any -x and -std flags.
38 // - For .h files, candidates from any c-family language are acceptable.
39 // We use the candidate's language, inserting e.g. -x c++-header.
40 //
41 // This class is only useful when wrapping databases that can enumerate all
42 // their compile commands. If getAllFilenames() is empty, no inference occurs.
43 //
44 //===----------------------------------------------------------------------===//
45 
#include "clang/Driver/Options.h"
#include "clang/Driver/Types.h"
#include "clang/Frontend/LangStandard.h"
#include "clang/Tooling/CompilationDatabase.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/ADT/StringSwitch.h"
#include "llvm/Option/ArgList.h"
#include "llvm/Option/OptTable.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/Path.h"
#include "llvm/Support/StringSaver.h"
#include "llvm/Support/raw_ostream.h"
#include <memory>

60 
61 namespace clang {
62 namespace tooling {
63 namespace {
64 using namespace llvm;
65 namespace types = clang::driver::types;
66 namespace path = llvm::sys::path;
67 
68 // The length of the prefix these two strings have in common.
69 size_t matchingPrefix(StringRef L, StringRef R) {
70  size_t Limit = std::min(L.size(), R.size());
71  for (size_t I = 0; I < Limit; ++I)
72  if (L[I] != R[I])
73  return I;
74  return Limit;
75 }
76 
77 // A comparator for searching SubstringWithIndexes with std::equal_range etc.
78 // Optionaly prefix semantics: compares equal if the key is a prefix.
79 template <bool Prefix> struct Less {
80  bool operator()(StringRef Key, std::pair<StringRef, size_t> Value) const {
81  StringRef V = Prefix ? Value.first.substr(0, Key.size()) : Value.first;
82  return Key < V;
83  }
84  bool operator()(std::pair<StringRef, size_t> Value, StringRef Key) const {
85  StringRef V = Prefix ? Value.first.substr(0, Key.size()) : Value.first;
86  return V < Key;
87  }
88 };
89 
90 // Infer type from filename. If we might have gotten it wrong, set *Certain.
91 // *.h will be inferred as a C header, but not certain.
92 types::ID guessType(StringRef Filename, bool *Certain = nullptr) {
93  // path::extension is ".cpp", lookupTypeForExtension wants "cpp".
94  auto Lang =
95  types::lookupTypeForExtension(path::extension(Filename).substr(1));
96  if (Certain)
97  *Certain = Lang != types::TY_CHeader && Lang != types::TY_INVALID;
98  return Lang;
99 }
100 
101 // Return Lang as one of the canonical supported types.
102 // e.g. c-header --> c; fortran --> TY_INVALID
103 static types::ID foldType(types::ID Lang) {
104  switch (Lang) {
105  case types::TY_C:
106  case types::TY_CHeader:
107  return types::TY_C;
108  case types::TY_ObjC:
109  case types::TY_ObjCHeader:
110  return types::TY_ObjC;
111  case types::TY_CXX:
112  case types::TY_CXXHeader:
113  return types::TY_CXX;
114  case types::TY_ObjCXX:
115  case types::TY_ObjCXXHeader:
116  return types::TY_ObjCXX;
117  default:
118  return types::TY_INVALID;
119  }
120 }
121 
122 // A CompileCommand that can be applied to another file.
123 struct TransferableCommand {
124  // Flags that should not apply to all files are stripped from CommandLine.
125  CompileCommand Cmd;
126  // Language detected from -x or the filename.
128  // Standard specified by -std.
130 
131  TransferableCommand(CompileCommand C)
132  : Cmd(std::move(C)), Type(guessType(Cmd.Filename)) {
133  std::vector<std::string> NewArgs = {Cmd.CommandLine.front()};
134  // Parse the old args in order to strip out and record unwanted flags.
135  auto OptTable = clang::driver::createDriverOptTable();
136  std::vector<const char *> Argv;
137  for (unsigned I = 1; I < Cmd.CommandLine.size(); ++I)
138  Argv.push_back(Cmd.CommandLine[I].c_str());
139  unsigned MissingI, MissingC;
140  auto ArgList = OptTable->ParseArgs(Argv, MissingI, MissingC);
141  for (const auto *Arg : ArgList) {
142  const auto &option = Arg->getOption();
143  // Strip input and output files.
144  if (option.matches(clang::driver::options::OPT_INPUT) ||
145  option.matches(clang::driver::options::OPT_o)) {
146  continue;
147  }
148  // Strip -x, but record the overridden language.
149  if (option.matches(clang::driver::options::OPT_x)) {
150  for (const char *Value : Arg->getValues())
151  Type = types::lookupTypeForTypeSpecifier(Value);
152  continue;
153  }
154  // Strip --std, but record the value.
155  if (option.matches(clang::driver::options::OPT_std_EQ)) {
156  for (const char *Value : Arg->getValues()) {
157  Std = llvm::StringSwitch<LangStandard::Kind>(Value)
158 #define LANGSTANDARD(id, name, lang, desc, features) \
159  .Case(name, LangStandard::lang_##id)
160 #define LANGSTANDARD_ALIAS(id, alias) .Case(alias, LangStandard::lang_##id)
161 #include "clang/Frontend/LangStandards.def"
162  .Default(Std);
163  }
164  continue;
165  }
166  llvm::opt::ArgStringList ArgStrs;
167  Arg->render(ArgList, ArgStrs);
168  NewArgs.insert(NewArgs.end(), ArgStrs.begin(), ArgStrs.end());
169  }
170  Cmd.CommandLine = std::move(NewArgs);
171 
172  if (Std != LangStandard::lang_unspecified) // -std take precedence over -x
173  Type = toType(LangStandard::getLangStandardForKind(Std).getLanguage());
174  Type = foldType(Type);
175  }
176 
177  // Produce a CompileCommand for \p filename, based on this one.
178  CompileCommand transferTo(StringRef Filename) const {
179  CompileCommand Result = Cmd;
180  Result.Filename = Filename;
181  bool TypeCertain;
182  auto TargetType = guessType(Filename, &TypeCertain);
183  // If the filename doesn't determine the language (.h), transfer with -x.
184  if (!TypeCertain) {
185  TargetType = types::onlyPrecompileType(TargetType) // header?
187  : Type;
188  Result.CommandLine.push_back("-x");
189  Result.CommandLine.push_back(types::getTypeName(TargetType));
190  }
191  // --std flag may only be transferred if the language is the same.
192  // We may consider "translating" these, e.g. c++11 -> c11.
193  if (Std != LangStandard::lang_unspecified && foldType(TargetType) == Type) {
194  Result.CommandLine.push_back(
195  "-std=" +
196  std::string(LangStandard::getLangStandardForKind(Std).getName()));
197  }
198  Result.CommandLine.push_back(Filename);
199  return Result;
200  }
201 
202 private:
203  // Map the language from the --std flag to that of the -x flag.
204  static types::ID toType(InputKind::Language Lang) {
205  switch (Lang) {
206  case InputKind::C:
207  return types::TY_C;
208  case InputKind::CXX:
209  return types::TY_CXX;
210  case InputKind::ObjC:
211  return types::TY_ObjC;
212  case InputKind::ObjCXX:
213  return types::TY_ObjCXX;
214  default:
215  return types::TY_INVALID;
216  }
217  }
218 };
219 
220 // CommandIndex does the real work: given a filename, it produces the best
221 // matching TransferableCommand by matching filenames. Basic strategy:
222 // - Build indexes of each of the substrings we want to look up by.
223 // These indexes are just sorted lists of the substrings.
224 // - Forward requests to the inner CDB. If it fails, we must pick a proxy.
225 // - Each criterion corresponds to a range lookup into the index, so we only
226 // need O(log N) string comparisons to determine scores.
227 // - We then break ties among the candidates with the highest score.
228 class CommandIndex {
229 public:
230  CommandIndex(std::vector<TransferableCommand> AllCommands)
231  : Commands(std::move(AllCommands)), Strings(Arena) {
232  // Sort commands by filename for determinism (index is a tiebreaker later).
233  llvm::sort(
234  Commands.begin(), Commands.end(),
235  [](const TransferableCommand &Left, const TransferableCommand &Right) {
236  return Left.Cmd.Filename < Right.Cmd.Filename;
237  });
238  for (size_t I = 0; I < Commands.size(); ++I) {
239  StringRef Path =
240  Strings.save(StringRef(Commands[I].Cmd.Filename).lower());
241  Paths.push_back({Path, I});
242  Stems.emplace_back(sys::path::stem(Path), I);
243  auto Dir = ++sys::path::rbegin(Path), DirEnd = sys::path::rend(Path);
244  for (int J = 0; J < DirectorySegmentsIndexed && Dir != DirEnd; ++J, ++Dir)
245  if (Dir->size() > ShortDirectorySegment) // not trivial ones
246  Components.emplace_back(*Dir, I);
247  }
248  llvm::sort(Paths.begin(), Paths.end());
249  llvm::sort(Stems.begin(), Stems.end());
250  llvm::sort(Components.begin(), Components.end());
251  }
252 
253  bool empty() const { return Commands.empty(); }
254 
255  // Returns the command that best fits OriginalFilename.
256  // Candidates with PreferLanguage will be chosen over others (unless it's
257  // TY_INVALID, or all candidates are bad).
258  const TransferableCommand &chooseProxy(StringRef OriginalFilename,
259  types::ID PreferLanguage) const {
260  assert(!empty() && "need at least one candidate!");
261  std::string Filename = OriginalFilename.lower();
262  auto Candidates = scoreCandidates(Filename);
263  std::pair<size_t, int> Best =
264  pickWinner(Candidates, Filename, PreferLanguage);
265 
266  DEBUG_WITH_TYPE("interpolate",
267  llvm::dbgs()
268  << "interpolate: chose "
269  << Commands[Best.first].Cmd.Filename << " as proxy for "
270  << OriginalFilename << " preferring "
271  << (PreferLanguage == types::TY_INVALID
272  ? "none"
273  : types::getTypeName(PreferLanguage))
274  << " score=" << Best.second << "\n");
275  return Commands[Best.first];
276  }
277 
278 private:
279  using SubstringAndIndex = std::pair<StringRef, size_t>;
280  // Directory matching parameters: we look at the last two segments of the
281  // parent directory (usually the semantically significant ones in practice).
282  // We search only the last four of each candidate (for efficiency).
283  constexpr static int DirectorySegmentsIndexed = 4;
284  constexpr static int DirectorySegmentsQueried = 2;
285  constexpr static int ShortDirectorySegment = 1; // Only look at longer names.
286 
287  // Award points to candidate entries that should be considered for the file.
288  // Returned keys are indexes into paths, and the values are (nonzero) scores.
289  DenseMap<size_t, int> scoreCandidates(StringRef Filename) const {
290  // Decompose Filename into the parts we care about.
291  // /some/path/complicated/project/Interesting.h
292  // [-prefix--][---dir---] [-dir-] [--stem---]
293  StringRef Stem = sys::path::stem(Filename);
295  llvm::StringRef Prefix;
296  auto Dir = ++sys::path::rbegin(Filename),
297  DirEnd = sys::path::rend(Filename);
298  for (int I = 0; I < DirectorySegmentsQueried && Dir != DirEnd; ++I, ++Dir) {
299  if (Dir->size() > ShortDirectorySegment)
300  Dirs.push_back(*Dir);
301  Prefix = Filename.substr(0, Dir - DirEnd);
302  }
303 
304  // Now award points based on lookups into our various indexes.
305  DenseMap<size_t, int> Candidates; // Index -> score.
306  auto Award = [&](int Points, ArrayRef<SubstringAndIndex> Range) {
307  for (const auto &Entry : Range)
308  Candidates[Entry.second] += Points;
309  };
310  // Award one point if the file's basename is a prefix of the candidate,
311  // and another if it's an exact match (so exact matches get two points).
312  Award(1, indexLookup</*Prefix=*/true>(Stem, Stems));
313  Award(1, indexLookup</*Prefix=*/false>(Stem, Stems));
314  // For each of the last few directories in the Filename, award a point
315  // if it's present in the candidate.
316  for (StringRef Dir : Dirs)
317  Award(1, indexLookup</*Prefix=*/false>(Dir, Components));
318  // Award one more point if the whole rest of the path matches.
319  if (sys::path::root_directory(Prefix) != Prefix)
320  Award(1, indexLookup</*Prefix=*/true>(Prefix, Paths));
321  return Candidates;
322  }
323 
324  // Pick a single winner from the set of scored candidates.
325  // Returns (index, score).
326  std::pair<size_t, int> pickWinner(const DenseMap<size_t, int> &Candidates,
327  StringRef Filename,
328  types::ID PreferredLanguage) const {
329  struct ScoredCandidate {
330  size_t Index;
331  bool Preferred;
332  int Points;
333  size_t PrefixLength;
334  };
335  // Choose the best candidate by (preferred, points, prefix length, alpha).
336  ScoredCandidate Best = {size_t(-1), false, 0, 0};
337  for (const auto &Candidate : Candidates) {
338  ScoredCandidate S;
339  S.Index = Candidate.first;
340  S.Preferred = PreferredLanguage == types::TY_INVALID ||
341  PreferredLanguage == Commands[S.Index].Type;
342  S.Points = Candidate.second;
343  if (!S.Preferred && Best.Preferred)
344  continue;
345  if (S.Preferred == Best.Preferred) {
346  if (S.Points < Best.Points)
347  continue;
348  if (S.Points == Best.Points) {
349  S.PrefixLength = matchingPrefix(Filename, Paths[S.Index].first);
350  if (S.PrefixLength < Best.PrefixLength)
351  continue;
352  // hidden heuristics should at least be deterministic!
353  if (S.PrefixLength == Best.PrefixLength)
354  if (S.Index > Best.Index)
355  continue;
356  }
357  }
358  // PrefixLength was only set above if actually needed for a tiebreak.
359  // But it definitely needs to be set to break ties in the future.
360  S.PrefixLength = matchingPrefix(Filename, Paths[S.Index].first);
361  Best = S;
362  }
363  // Edge case: no candidate got any points.
364  // We ignore PreferredLanguage at this point (not ideal).
365  if (Best.Index == size_t(-1))
366  return {longestMatch(Filename, Paths).second, 0};
367  return {Best.Index, Best.Points};
368  }
369 
370  // Returns the range within a sorted index that compares equal to Key.
371  // If Prefix is true, it's instead the range starting with Key.
372  template <bool Prefix>
374  indexLookup(StringRef Key, const std::vector<SubstringAndIndex> &Idx) const {
375  // Use pointers as iteratiors to ease conversion of result to ArrayRef.
376  auto Range = std::equal_range(Idx.data(), Idx.data() + Idx.size(), Key,
377  Less<Prefix>());
378  return {Range.first, Range.second};
379  }
380 
381  // Performs a point lookup into a nonempty index, returning a longest match.
382  SubstringAndIndex
383  longestMatch(StringRef Key, const std::vector<SubstringAndIndex> &Idx) const {
384  assert(!Idx.empty());
385  // Longest substring match will be adjacent to a direct lookup.
386  auto It =
387  std::lower_bound(Idx.begin(), Idx.end(), SubstringAndIndex{Key, 0});
388  if (It == Idx.begin())
389  return *It;
390  if (It == Idx.end())
391  return *--It;
392  // Have to choose between It and It-1
393  size_t Prefix = matchingPrefix(Key, It->first);
394  size_t PrevPrefix = matchingPrefix(Key, (It - 1)->first);
395  return Prefix > PrevPrefix ? *It : *--It;
396  }
397 
398  std::vector<TransferableCommand> Commands; // Indexes point into this.
399  BumpPtrAllocator Arena;
400  StringSaver Strings;
401  // Indexes of candidates by certain substrings.
402  // String is lowercase and sorted, index points into OriginalPaths.
403  std::vector<SubstringAndIndex> Paths; // Full path.
404  std::vector<SubstringAndIndex> Stems; // Basename, without extension.
405  std::vector<SubstringAndIndex> Components; // Last path components.
406 };
407 
408 // The actual CompilationDatabase wrapper delegates to its inner database.
409 // If no match, looks up a command in CommandIndex and transfers it to the file.
410 class InterpolatingCompilationDatabase : public CompilationDatabase {
411 public:
412  InterpolatingCompilationDatabase(std::unique_ptr<CompilationDatabase> Inner)
413  : Inner(std::move(Inner)), Index(allCommands()) {}
414 
415  std::vector<CompileCommand>
416  getCompileCommands(StringRef Filename) const override {
417  auto Known = Inner->getCompileCommands(Filename);
418  if (Index.empty() || !Known.empty())
419  return Known;
420  bool TypeCertain;
421  auto Lang = guessType(Filename, &TypeCertain);
422  if (!TypeCertain)
423  Lang = types::TY_INVALID;
424  return {Index.chooseProxy(Filename, foldType(Lang)).transferTo(Filename)};
425  }
426 
427  std::vector<std::string> getAllFiles() const override {
428  return Inner->getAllFiles();
429  }
430 
431  std::vector<CompileCommand> getAllCompileCommands() const override {
432  return Inner->getAllCompileCommands();
433  }
434 
435 private:
436  std::vector<TransferableCommand> allCommands() {
437  std::vector<TransferableCommand> Result;
438  for (auto Command : Inner->getAllCompileCommands()) {
439  Result.emplace_back(std::move(Command));
440  if (Result.back().Type == types::TY_INVALID)
441  Result.pop_back();
442  }
443  return Result;
444  }
445 
446  std::unique_ptr<CompilationDatabase> Inner;
447  CommandIndex Index;
448 };
449 
450 } // namespace
451 
452 std::unique_ptr<CompilationDatabase>
453 inferMissingCompileCommands(std::unique_ptr<CompilationDatabase> Inner) {
454  return llvm::make_unique<InterpolatingCompilationDatabase>(std::move(Inner));
455 }
456 
457 } // namespace tooling
458 } // namespace clang
__SIZE_TYPE__ size_t
The unsigned integer type of the result of the sizeof operator.
Definition: opencl-c.h:60
DominatorTree GraphTraits specialization so the DominatorTree can be iterable by generic graph iterat...
Definition: Dominators.h:30
static const LangStandard & getLangStandardForKind(Kind K)
std::string getName(ArrayRef< StringRef > Parts) const
Get the platform-specific name separator.
Definition: Format.h:2031
Languages that the frontend can parse and compile.
CompileCommand Cmd
std::unique_ptr< llvm::opt::OptTable > createDriverOptTable()
StringRef Filename
Definition: Format.cpp:1605
ID lookupHeaderTypeForSourceType(ID Id)
Lookup header file input type that corresponds to given source file type (used for clang-cl emulation...
Definition: Types.cpp:310
const char * getTypeName(ID Id)
getTypeName - Return the name of the type for Id.
Definition: Types.cpp:39
std::unique_ptr< CompilationDatabase > inferMissingCompileCommands(std::unique_ptr< CompilationDatabase >)
Returns a wrapped CompilationDatabase that defers to the provided one, but getCompileCommands() will ...
Dataflow Directional Tag Classes.
Language
The language for the input, used to select and validate the language standard and possible actions...
ID lookupTypeForExtension(llvm::StringRef Ext)
lookupTypeForExtension - Lookup the type to use for the file extension Ext.
Definition: Types.cpp:195
bool onlyPrecompileType(ID Id)
onlyPrecompileType - Should this type only be precompiled.
Definition: Types.cpp:76
__DEVICE__ int min(int __a, int __b)
ID lookupTypeForTypeSpecifier(const char *Name)
lookupTypeForTypSpecifier - Lookup the type to use for a user specified type name.
Definition: Types.cpp:256
LangStandard::Kind Std
#define LANGSTANDARD(id, name, lang, desc, features)