1 //===- TargetTransformInfo.h ------------------------------------*- C++ -*-===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 /// \file
9 /// This pass exposes codegen information to IR-level passes. Every
10 /// transformation that uses codegen information is broken into three parts:
11 /// 1. The IR-level analysis pass.
12 /// 2. The IR-level transformation interface which provides the needed
13 /// information.
14 /// 3. Codegen-level implementation which uses target-specific hooks.
15 ///
16 /// This file defines #2, which is the interface that IR-level transformations
17 /// use for querying the codegen.
18 ///
19 //===----------------------------------------------------------------------===//
20 
21 #ifndef LLVM_ANALYSIS_TARGETTRANSFORMINFO_H
22 #define LLVM_ANALYSIS_TARGETTRANSFORMINFO_H
23 
24 #include "llvm/ADT/Optional.h"
25 #include "llvm/IR/Operator.h"
26 #include "llvm/IR/PassManager.h"
27 #include "llvm/Pass.h"
28 #include "llvm/Support/AtomicOrdering.h"
29 #include "llvm/Support/DataTypes.h"
30 #include "llvm/Analysis/LoopInfo.h"
32 #include "llvm/IR/Dominators.h"
34 #include <functional>
35 
36 namespace llvm {
37 
38 namespace Intrinsic {
39 enum ID : unsigned;
40 }
41 
42 class AssumptionCache;
43 class BranchInst;
44 class Function;
45 class GlobalValue;
46 class IntrinsicInst;
47 class LoadInst;
48 class Loop;
49 class SCEV;
50 class ScalarEvolution;
51 class StoreInst;
52 class SwitchInst;
53 class TargetLibraryInfo;
54 class Type;
55 class User;
56 class Value;
57 
58 /// Information about a load/store intrinsic defined by the target.
59 struct MemIntrinsicInfo {
60  /// This is the pointer that the intrinsic is loading from or storing to.
61  /// If this is non-null, then analysis/optimization passes can assume that
62  /// this intrinsic is functionally equivalent to a load/store from this
63  /// pointer.
64  Value *PtrVal = nullptr;
65 
66  // Ordering for atomic operations.
67  AtomicOrdering Ordering = AtomicOrdering::NotAtomic;
68 
69  // Same Id is set by the target for corresponding load/store intrinsics.
70  unsigned short MatchingId = 0;
71 
72  bool ReadMem = false;
73  bool WriteMem = false;
74  bool IsVolatile = false;
75 
76  bool isUnordered() const {
77  return (Ordering == AtomicOrdering::NotAtomic ||
78  Ordering == AtomicOrdering::Unordered) && !IsVolatile;
79  }
80 };
81 
82 /// Attributes of a target dependent hardware loop.
83 struct HardwareLoopInfo {
84  HardwareLoopInfo() = delete;
85  HardwareLoopInfo(Loop *L) : L(L) {}
86  Loop *L = nullptr;
87  BasicBlock *ExitBlock = nullptr;
88  BranchInst *ExitBranch = nullptr;
89  const SCEV *ExitCount = nullptr;
90  IntegerType *CountType = nullptr;
91  Value *LoopDecrement = nullptr; // Decrement the loop counter by this
92  // value in every iteration.
93  bool IsNestingLegal = false; // Can a hardware loop be a parent to
94  // another hardware loop?
95  bool CounterInReg = false; // Should loop counter be updated in
96  // the loop via a phi?
97  bool PerformEntryTest = false; // Generate the intrinsic which also performs
98  // icmp ne zero on the loop counter value and
99  // produces an i1 to guard the loop entry.
100  bool isHardwareLoopCandidate(ScalarEvolution &SE, LoopInfo &LI,
101  DominatorTree &DT, bool ForceNestedLoop = false,
102  bool ForceHardwareLoopPHI = false);
103  bool canAnalyze(LoopInfo &LI);
104 };
105 
106 /// This pass provides access to the codegen interfaces that are needed
107 /// for IR-level transformations.
108 class TargetTransformInfo {
109 public:
110  /// Construct a TTI object using a type implementing the \c Concept
111  /// API below.
112  ///
113  /// This is used by targets to construct a TTI wrapping their target-specific
114  /// implementation that encodes appropriate costs for their target.
115  template <typename T> TargetTransformInfo(T Impl);
116 
117  /// Construct a baseline TTI object using a minimal implementation of
118  /// the \c Concept API below.
119  ///
120  /// The TTI implementation will reflect the information in the DataLayout
121  /// provided if non-null.
122  explicit TargetTransformInfo(const DataLayout &DL);
123 
124  // Provide move semantics.
125  TargetTransformInfo(TargetTransformInfo &&Arg);
126  TargetTransformInfo &operator=(TargetTransformInfo &&RHS);
127 
128  // We need to define the destructor out-of-line to define our sub-classes
129  // out-of-line.
130  ~TargetTransformInfo();
131 
132  /// Handle the invalidation of this information.
133  ///
134  /// When used as a result of \c TargetIRAnalysis this method will be called
135  /// when the function this was computed for changes. When it returns false,
136  /// the information is preserved across those changes.
137  bool invalidate(Function &, const PreservedAnalyses &,
138  FunctionAnalysisManager::Invalidator &) {
139  // FIXME: We should probably in some way ensure that the subtarget
140  // information for a function hasn't changed.
141  return false;
142  }
143 
144  /// \name Generic Target Information
145  /// @{
146 
147  /// The kind of cost model.
148  ///
149  /// There are several different cost models that can be customized by the
150  /// target. The normalization of each cost model may be target specific.
151  enum TargetCostKind {
152  TCK_RecipThroughput, ///< Reciprocal throughput.
153  TCK_Latency, ///< The latency of the instruction.
154  TCK_CodeSize ///< Instruction code size.
155  };
156 
157  /// Query the cost of a specified instruction.
158  ///
159  /// Clients should use this interface to query the cost of an existing
160  /// instruction. The instruction must have a valid parent (basic block).
161  ///
162  /// Note, this method does not cache the cost calculation and it
163  /// can be expensive in some cases.
164  int getInstructionCost(const Instruction *I, enum TargetCostKind kind) const {
165  switch (kind){
166  case TCK_RecipThroughput:
167  return getInstructionThroughput(I);
168 
169  case TCK_Latency:
170  return getInstructionLatency(I);
171 
172  case TCK_CodeSize:
173  return getUserCost(I);
174  }
175  llvm_unreachable("Unknown instruction cost kind");
176  }
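  // Illustrative sketch only (not part of the original header): a client pass
  // might sum per-instruction costs over a basic block like this (TTI and BB
  // are assumed to be a TargetTransformInfo result and a BasicBlock):
  //
  //   int Cost = 0;
  //   for (const Instruction &I : BB)
  //     Cost += TTI.getInstructionCost(&I, TargetTransformInfo::TCK_CodeSize);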
177 
178  /// Underlying constants for 'cost' values in this interface.
179  ///
180  /// Many APIs in this interface return a cost. This enum defines the
181  /// fundamental values that should be used to interpret (and produce) those
182  /// costs. The costs are returned as an int rather than a member of this
183  /// enumeration because it is expected that the cost of one IR instruction
184  /// may have a multiplicative factor to it or otherwise won't fit directly
185  /// into the enum. Moreover, it is common to sum or average costs which works
186  /// better as simple integral values. Thus this enum only provides constants.
187  /// Also note that the returned costs are signed integers to make it natural
188  /// to add, subtract, and test with zero (a common boundary condition). It is
189  /// not expected that 2^32 is a realistic cost to be modeling at any point.
190  ///
191  /// Note that these costs should usually reflect the intersection of code-size
192  /// cost and execution cost. A free instruction is typically one that folds
193  /// into another instruction. For example, reg-to-reg moves can often be
194  /// skipped by renaming the registers in the CPU, but they still are encoded
195  /// and thus wouldn't be considered 'free' here.
196  enum TargetCostConstants {
197  TCC_Free = 0, ///< Expected to fold away in lowering.
198  TCC_Basic = 1, ///< The cost of a typical 'add' instruction.
199  TCC_Expensive = 4 ///< The cost of a 'div' instruction on x86.
200  };
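  // Illustrative note (not part of the original header): because costs are
  // plain signed integers, clients accumulate and compare them directly, e.g.
  //
  //   int Cost = 0;
  //   Cost += TargetTransformInfo::TCC_Basic;      // one 'add'-like operation
  //   Cost += 2 * TargetTransformInfo::TCC_Basic;  // twice as expensive
  //   bool IsFree = (Cost == TargetTransformInfo::TCC_Free);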
201 
202  /// Estimate the cost of a specific operation when lowered.
203  ///
204  /// Note that this is designed to work on an arbitrary synthetic opcode, and
205  /// thus work for hypothetical queries before an instruction has even been
206  /// formed. However, this does *not* work for GEPs, and must not be called
207  /// for a GEP instruction. Instead, use the dedicated getGEPCost interface as
208  /// analyzing a GEP's cost requires more information.
209  ///
210  /// Typically only the result type is required, and the operand type can be
211  /// omitted. However, if the opcode is one of the cast instructions, the
212  /// operand type is required.
213  ///
214  /// The returned cost is defined in terms of \c TargetCostConstants, see its
215  /// comments for a detailed explanation of the cost values.
216  int getOperationCost(unsigned Opcode, Type *Ty, Type *OpTy = nullptr) const;
217 
218  /// Estimate the cost of a GEP operation when lowered.
219  ///
220  /// The contract for this function is the same as \c getOperationCost except
221  /// that it supports an interface that provides extra information specific to
222  /// the GEP operation.
223  int getGEPCost(Type *PointeeType, const Value *Ptr,
224  ArrayRef<const Value *> Operands) const;
225 
226  /// Estimate the cost of an EXT operation when lowered.
227  ///
228  /// The contract for this function is the same as \c getOperationCost except
229  /// that it supports an interface that provides extra information specific to
230  /// the EXT operation.
231  int getExtCost(const Instruction *I, const Value *Src) const;
232 
233  /// Estimate the cost of a function call when lowered.
234  ///
235  /// The contract for this is the same as \c getOperationCost except that it
236  /// supports an interface that provides extra information specific to call
237  /// instructions.
238  ///
239  /// This is the most basic query for estimating call cost: it only knows the
240  /// function type and (potentially) the number of arguments at the call site.
241  /// The latter is only interesting for varargs function types.
242  int getCallCost(FunctionType *FTy, int NumArgs = -1,
243  const User *U = nullptr) const;
244 
245  /// Estimate the cost of calling a specific function when lowered.
246  ///
247  /// This overload adds the ability to reason about the particular function
248  /// being called in the event it is a library call with special lowering.
249  int getCallCost(const Function *F, int NumArgs = -1,
250  const User *U = nullptr) const;
251 
252  /// Estimate the cost of calling a specific function when lowered.
253  ///
254  /// This overload allows specifying a set of candidate argument values.
255  int getCallCost(const Function *F, ArrayRef<const Value *> Arguments,
256  const User *U = nullptr) const;
257 
258  /// \returns A value by which our inlining threshold should be multiplied.
259  /// This is primarily used to bump up the inlining threshold wholesale on
260  /// targets where calls are unusually expensive.
261  ///
262  /// TODO: This is a rather blunt instrument. Perhaps altering the costs of
263  /// individual classes of instructions would be better.
264  unsigned getInliningThresholdMultiplier() const;
265 
266  /// \returns Vector bonus in percent.
267  ///
268  /// Vector bonuses: We want to more aggressively inline vector-dense kernels
269  /// and apply this bonus based on the percentage of vector instructions. A
270  /// bonus is applied if the vector instructions exceed 50% of all instructions,
271  /// and half that amount is applied if they exceed 10%. Note that these bonuses
272  /// are somewhat arbitrary and evolved over time by accident as much as because
273  /// they are principled bonuses.
274  /// FIXME: It would be nice to base the bonus values on something more
275  /// scientific. A target may have no bonus on vector instructions.
276  int getInlinerVectorBonusPercent() const;
277 
278  /// Estimate the cost of an intrinsic when lowered.
279  ///
280  /// Mirrors the \c getCallCost method but uses an intrinsic identifier.
281  int getIntrinsicCost(Intrinsic::ID IID, Type *RetTy,
282  ArrayRef<Type *> ParamTys,
283  const User *U = nullptr) const;
284 
285  /// Estimate the cost of an intrinsic when lowered.
286  ///
287  /// Mirrors the \c getCallCost method but uses an intrinsic identifier.
288  int getIntrinsicCost(Intrinsic::ID IID, Type *RetTy,
289  ArrayRef<const Value *> Arguments,
290  const User *U = nullptr) const;
291 
292  /// \return the expected cost of a memcpy, which could e.g. depend on the
293  /// source/destination type and alignment and the number of bytes copied.
294  int getMemcpyCost(const Instruction *I) const;
295 
296  /// \return The estimated number of case clusters when lowering \p 'SI'.
297  /// \p JTSize is set to the jump table size only when \p SI is suitable for a
298  /// jump table.
299  unsigned getEstimatedNumberOfCaseClusters(const SwitchInst &SI,
300  unsigned &JTSize) const;
301 
302  /// Estimate the cost of a given IR user when lowered.
303  ///
304  /// This can estimate the cost of either a ConstantExpr or Instruction when
305  /// lowered. It has two primary advantages over the \c getOperationCost and
306  /// \c getGEPCost above, and one significant disadvantage: it can only be
307  /// used when the IR construct has already been formed.
308  ///
309  /// The advantages are that it can inspect the SSA use graph to reason more
310  /// accurately about the cost. For example, all-constant-GEPs can often be
311  /// folded into a load or other instruction, but if they are used in some
312  /// other context they may not be folded. This routine can distinguish such
313  /// cases.
314  ///
315  /// \p Operands is a list of operands which can be a result of transformations
316  /// of the current operands. The number of operands on the list must be
317  /// equal to the number of operands the IR user currently has, and their
318  /// order on the list must be the same as the order of the user's current
319  /// operands.
320  ///
321  /// The returned cost is defined in terms of \c TargetCostConstants, see its
322  /// comments for a detailed explanation of the cost values.
323  int getUserCost(const User *U, ArrayRef<const Value *> Operands) const;
324 
325  /// This is a helper function which calls the two-argument getUserCost
326  /// with \p Operands which are the current operands U has.
327  int getUserCost(const User *U) const {
328  SmallVector<const Value *, 4> Operands(U->value_op_begin(),
329  U->value_op_end());
330  return getUserCost(U, Operands);
331  }
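  // Illustrative sketch only (not part of the original header): querying the
  // cost of a user with substituted operands, e.g. when estimating what an
  // instruction would cost after known simplifications. The names TTI, I and
  // SimplifiedOps are assumed for the example.
  //
  //   SmallVector<const Value *, 4> SimplifiedOps(I->value_op_begin(),
  //                                               I->value_op_end());
  //   // ... replace some entries with their simplified values ...
  //   if (TTI.getUserCost(I, SimplifiedOps) == TargetTransformInfo::TCC_Free)
  //     ; // the instruction is expected to fold away when lowered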
332 
333  /// Return true if branch divergence exists.
334  ///
335  /// Branch divergence has a significantly negative impact on GPU performance
336  /// when threads in the same wavefront take different paths due to conditional
337  /// branches.
338  bool hasBranchDivergence() const;
339 
340  /// Returns whether V is a source of divergence.
341  ///
342  /// This function provides the target-dependent information for
343  /// the target-independent LegacyDivergenceAnalysis. LegacyDivergenceAnalysis first
344  /// builds the dependency graph, and then runs the reachability algorithm
345  /// starting with the sources of divergence.
346  bool isSourceOfDivergence(const Value *V) const;
347 
348  // Returns true for the target-specific set of operations
349  // which produce a uniform result even when given
350  // non-uniform arguments.
351  bool isAlwaysUniform(const Value *V) const;
352 
353  /// Returns the address space ID for a target's 'flat' address space. Note
354  /// this is not necessarily the same as addrspace(0), which LLVM sometimes
355  /// refers to as the generic address space. The flat address space is a
356  /// generic address space that can be used to access multiple segments of memory
357  /// with different address spaces. Access of a memory location through a
358  /// pointer with this address space is expected to be legal but slower
359  /// compared to the same memory location accessed through a pointer with a
360  /// different address space.
361  //
362  /// This is for targets with different pointer representations which can
363  /// be converted with the addrspacecast instruction. If a pointer is converted
364  /// to this address space, optimizations should attempt to replace the access
365  /// with the source address space.
366  ///
367  /// \returns ~0u if the target does not have such a flat address space to
368  /// optimize away.
369  unsigned getFlatAddressSpace() const;
370 
371  /// Test whether calls to a function lower to actual program function
372  /// calls.
373  ///
374  /// The idea is to test whether the program is likely to require a 'call'
375  /// instruction or equivalent in order to call the given function.
376  ///
377  /// FIXME: It's not clear that this is a good or useful query API. Clients
378  /// should probably move to simpler cost metrics using the above.
379  /// Alternatively, we could split the cost interface into distinct code-size
380  /// and execution-speed costs. This would allow modelling the core of this
381  /// query more accurately as a call is a single small instruction, but
382  /// incurs significant execution cost.
383  bool isLoweredToCall(const Function *F) const;
384 
385  struct LSRCost {
386  /// TODO: Some of these could be merged. Also, a lexical ordering
387  /// isn't always optimal.
388  unsigned Insns;
389  unsigned NumRegs;
390  unsigned AddRecCost;
391  unsigned NumIVMuls;
392  unsigned NumBaseAdds;
393  unsigned ImmCost;
394  unsigned SetupCost;
395  unsigned ScaleCost;
396  };
397 
398  /// Parameters that control the generic loop unrolling transformation.
399  struct UnrollingPreferences {
400  /// The cost threshold for the unrolled loop. Should be relative to the
401  /// getUserCost values returned by this API, and the expectation is that
402  /// the unrolled loop's instructions when run through that interface should
403  /// not exceed this cost. However, this is only an estimate. Also, specific
404  /// loops may be unrolled even with a cost above this threshold if deemed
405  /// profitable. Set this to UINT_MAX to disable the loop body cost
406  /// restriction.
407  unsigned Threshold;
408  /// If complete unrolling will reduce the cost of the loop, we will boost
409  /// the Threshold by a certain percent to allow more aggressive complete
410  /// unrolling. This value provides the maximum boost percentage that we
411  /// can apply to Threshold (The value should be no less than 100).
412  /// BoostedThreshold = Threshold * min(RolledCost / UnrolledCost,
413  /// MaxPercentThresholdBoost / 100)
414  /// E.g. if complete unrolling reduces the loop execution time by 50%
415  /// then we boost the threshold by the factor of 2x. If unrolling is not
416  /// expected to reduce the running time, then we do not increase the
417  /// threshold.
418  unsigned MaxPercentThresholdBoost;
419  /// The cost threshold for the unrolled loop when optimizing for size (set
420  /// to UINT_MAX to disable).
421  unsigned OptSizeThreshold;
422  /// The cost threshold for the unrolled loop, like Threshold, but used
423  /// for partial/runtime unrolling (set to UINT_MAX to disable).
424  unsigned PartialThreshold;
425  /// The cost threshold for the unrolled loop when optimizing for size, like
426  /// OptSizeThreshold, but used for partial/runtime unrolling (set to
427  /// UINT_MAX to disable).
428  unsigned PartialOptSizeThreshold;
429  /// A forced unrolling factor (the number of concatenated bodies of the
430  /// original loop in the unrolled loop body). When set to 0, the unrolling
431  /// transformation will select an unrolling factor based on the current cost
432  /// threshold and other factors.
433  unsigned Count;
434  /// A forced peeling factor (the number of bodies of the original loop
435  /// that should be peeled off before the loop body). When set to 0, the
436  /// unrolling transformation will select a peeling factor based on profile
437  /// information and other factors.
438  unsigned PeelCount;
439  /// Default unroll count for loops with run-time trip count.
440  unsigned DefaultUnrollRuntimeCount;
441  // Set the maximum unrolling factor. The unrolling factor may be selected
442  // using the appropriate cost threshold, but may not exceed this number
443  // (set to UINT_MAX to disable). This does not apply in cases where the
444  // loop is being fully unrolled.
445  unsigned MaxCount;
446  /// Set the maximum unrolling factor for full unrolling. Like MaxCount, but
447  /// applies even if full unrolling is selected. This allows a target to fall
448  /// back to Partial unrolling if full unrolling is above FullUnrollMaxCount.
449  unsigned FullUnrollMaxCount;
450  // Represents number of instructions optimized when "back edge"
451  // becomes "fall through" in unrolled loop.
452  // For now we count a conditional branch on a backedge and a comparison
453  // feeding it.
454  unsigned BEInsns;
455  /// Allow partial unrolling (unrolling of loops to expand the size of the
456  /// loop body, not only to eliminate small constant-trip-count loops).
457  bool Partial;
458  /// Allow runtime unrolling (unrolling of loops to expand the size of the
459  /// loop body even when the number of loop iterations is not known at
460  /// compile time).
461  bool Runtime;
462  /// Allow generation of a loop remainder (extra iterations after unroll).
463  bool AllowRemainder;
464  /// Allow emitting expensive instructions (such as divisions) when computing
465  /// the trip count of a loop for runtime unrolling.
466  bool AllowExpensiveTripCount;
467  /// Apply loop unroll on any kind of loop
468  /// (mainly to loops that fail runtime unrolling).
469  bool Force;
470  /// Allow using trip count upper bound to unroll loops.
471  bool UpperBound;
472  /// Allow peeling off loop iterations for loops with low dynamic tripcount.
473  bool AllowPeeling;
474  /// Allow unrolling of all the iterations of the runtime loop remainder.
475  bool UnrollRemainder;
476  /// Allow unroll and jam. Used to enable unroll and jam for the target.
477  bool UnrollAndJam;
478  /// Threshold for unroll and jam, for inner loop size. The 'Threshold'
479  /// value above is used during unroll and jam for the outer loop size.
480  /// This value is used in the same manner to limit the size of the inner
481  /// loop.
482  unsigned UnrollAndJamInnerLoopThreshold;
483  };
484 
485  /// Get target-customized preferences for the generic loop unrolling
486  /// transformation. The caller will initialize UP with the current
487  /// target-independent defaults.
488  void getUnrollingPreferences(Loop *L, ScalarEvolution &,
489  UnrollingPreferences &UP) const;
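  // Illustrative sketch only (not part of the original header): a hypothetical
  // target implementation (MyTargetTTIImpl is an assumed name) could tweak the
  // target-independent defaults it is handed, for example:
  //
  //   void MyTargetTTIImpl::getUnrollingPreferences(
  //       Loop *L, ScalarEvolution &SE,
  //       TargetTransformInfo::UnrollingPreferences &UP) {
  //     UP.Partial = UP.Runtime = true; // allow partial/runtime unrolling,
  //     UP.PartialThreshold = 60;       // but keep the unrolled body small
  //   }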
490 
491  /// Query the target whether it would be profitable to convert the given loop
492  /// into a hardware loop.
493  bool isHardwareLoopProfitable(Loop *L, ScalarEvolution &SE,
494  AssumptionCache &AC,
495  TargetLibraryInfo *LibInfo,
496  HardwareLoopInfo &HWLoopInfo) const;
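  // For illustration only (not part of the original header): how a pass might
  // combine HardwareLoopInfo with this query. The analysis objects SE, LI, DT,
  // AC and LibInfo are assumed to be available:
  //
  //   HardwareLoopInfo HWLoopInfo(L);
  //   if (HWLoopInfo.canAnalyze(LI) &&
  //       TTI.isHardwareLoopProfitable(L, SE, AC, LibInfo, HWLoopInfo) &&
  //       HWLoopInfo.isHardwareLoopCandidate(SE, LI, DT))
  //     ; // lower the loop using the target's hardware-loop intrinsics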
497 
498  /// @}
499 
500  /// \name Scalar Target Information
501  /// @{
502 
503  /// Flags indicating the kind of support for population count.
504  ///
505  /// Compared to the SW implementation, HW support is supposed to
506  /// significantly boost the performance when the population is dense, and it
507  /// may or may not degrade performance if the population is sparse. HW
508  /// support is considered "Fast" if it can outperform, or is on a par
509  /// with, the SW implementation when the population is sparse; otherwise, it
510  /// is considered "Slow".
511  enum PopcntSupportKind { PSK_Software, PSK_SlowHardware, PSK_FastHardware };
512 
513  /// Return true if the specified immediate is a legal add immediate, that
514  /// is the target has add instructions which can add a register with the
515  /// immediate without having to materialize the immediate into a register.
516  bool isLegalAddImmediate(int64_t Imm) const;
517 
518  /// Return true if the specified immediate is a legal icmp immediate,
519  /// that is the target has icmp instructions which can compare a register
520  /// against the immediate without having to materialize the immediate into a
521  /// register.
522  bool isLegalICmpImmediate(int64_t Imm) const;
523 
524  /// Return true if the addressing mode represented by AM is legal for
525  /// this target, for a load/store of the specified type.
526  /// The type may be VoidTy, in which case only return true if the addressing
527  /// mode is legal for a load/store of any legal type.
528  /// If target returns true in LSRWithInstrQueries(), I may be valid.
529  /// TODO: Handle pre/postinc as well.
530  bool isLegalAddressingMode(Type *Ty, GlobalValue *BaseGV, int64_t BaseOffset,
531  bool HasBaseReg, int64_t Scale,
532  unsigned AddrSpace = 0,
533  Instruction *I = nullptr) const;
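  // Illustrative note (not part of the original header): the addressing mode
  // being tested is the usual LLVM decomposition
  //
  //   BaseGV + BaseOffset + (HasBaseReg ? BaseReg : 0) + Scale * ScaleReg
  //
  // so, for example, a query with BaseGV == nullptr, BaseOffset == 16,
  // HasBaseReg == true and Scale == 4 asks whether "BaseReg + 4*ScaleReg + 16"
  // is a legal address for a load/store of type Ty on this target.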
534 
535  /// Return true if LSR cost of C1 is lower than C2.
536  bool isLSRCostLess(TargetTransformInfo::LSRCost &C1,
537  TargetTransformInfo::LSRCost &C2) const;
538 
539  /// Return true if the target can fuse a compare and branch.
540  /// Loop-strength-reduction (LSR) uses that knowledge to adjust its cost
541  /// calculation for the instructions in a loop.
542  bool canMacroFuseCmp() const;
543 
544  /// Return true if the target can save a compare for loop count, for example
545  /// hardware loop saves a compare.
546  bool canSaveCmp(Loop *L, BranchInst **BI, ScalarEvolution *SE, LoopInfo *LI,
547  DominatorTree *DT, AssumptionCache *AC,
548  TargetLibraryInfo *LibInfo) const;
549 
550  /// \return True if LSR should make efforts to create/preserve post-inc
551  /// addressing mode expressions.
552  bool shouldFavorPostInc() const;
553 
554  /// Return true if LSR should make efforts to generate indexed addressing
555  /// modes that operate across loop iterations.
556  bool shouldFavorBackedgeIndex(const Loop *L) const;
557 
558  /// Return true if the target supports masked store.
559  bool isLegalMaskedStore(Type *DataType) const;
560  /// Return true if the target supports masked load.
561  bool isLegalMaskedLoad(Type *DataType) const;
562 
563  /// Return true if the target supports nontemporal store.
564  bool isLegalNTStore(Type *DataType, unsigned Alignment) const;
565  /// Return true if the target supports nontemporal load.
566  bool isLegalNTLoad(Type *DataType, unsigned Alignment) const;
567 
568  /// Return true if the target supports masked scatter.
569  bool isLegalMaskedScatter(Type *DataType) const;
570  /// Return true if the target supports masked gather.
571  bool isLegalMaskedGather(Type *DataType) const;
572 
573  /// Return true if the target supports masked compress store.
574  bool isLegalMaskedCompressStore(Type *DataType) const;
575  /// Return true if the target supports masked expand load.
576  bool isLegalMaskedExpandLoad(Type *DataType) const;
577 
578  /// Return true if the target has a unified operation to calculate division
579  /// and remainder. If so, the additional implicit multiplication and
580  /// subtraction required to calculate a remainder from division are free. This
581  /// can enable more aggressive transformations for division and remainder than
582  /// would typically be allowed using throughput or size cost models.
583  bool hasDivRemOp(Type *DataType, bool IsSigned) const;
584 
585  /// Return true if the given instruction (assumed to be a memory access
586  /// instruction) has a volatile variant. If that's the case then we can avoid
587  /// addrspacecast to generic AS for volatile loads/stores. Default
588  /// implementation returns false, which prevents address space inference for
589  /// volatile loads/stores.
590  bool hasVolatileVariant(Instruction *I, unsigned AddrSpace) const;
591 
592  /// Return true if target doesn't mind addresses in vectors.
593  bool prefersVectorizedAddressing() const;
594 
595  /// Return the cost of the scaling factor used in the addressing
596  /// mode represented by AM for this target, for a load/store
597  /// of the specified type.
598  /// If the AM is supported, the return value must be >= 0.
599  /// If the AM is not supported, it returns a negative value.
600  /// TODO: Handle pre/postinc as well.
601  int getScalingFactorCost(Type *Ty, GlobalValue *BaseGV, int64_t BaseOffset,
602  bool HasBaseReg, int64_t Scale,
603  unsigned AddrSpace = 0) const;
604 
605  /// Return true if the loop strength reduce pass should make
606  /// Instruction* based TTI queries to isLegalAddressingMode(). This is
607  /// needed on SystemZ, where e.g. a memcpy can only have a 12 bit unsigned
608  /// immediate offset and no index register.
609  bool LSRWithInstrQueries() const;
610 
611  /// Return true if it's free to truncate a value of type Ty1 to type
612  /// Ty2. e.g. On x86 it's free to truncate a i32 value in register EAX to i16
613  /// by referencing its sub-register AX.
614  bool isTruncateFree(Type *Ty1, Type *Ty2) const;
615 
616  /// Return true if it is profitable to hoist an instruction from the
617  /// then/else blocks of an if to before the if.
618  bool isProfitableToHoist(Instruction *I) const;
619 
620  bool useAA() const;
621 
622  /// Return true if this type is legal.
623  bool isTypeLegal(Type *Ty) const;
624 
625  /// Returns the target's jmp_buf alignment in bytes.
626  unsigned getJumpBufAlignment() const;
627 
628  /// Returns the target's jmp_buf size in bytes.
629  unsigned getJumpBufSize() const;
630 
631  /// Return true if switches should be turned into lookup tables for the
632  /// target.
633  bool shouldBuildLookupTables() const;
634 
635  /// Return true if switches should be turned into lookup tables
636  /// containing this constant value for the target.
637  bool shouldBuildLookupTablesForConstant(Constant *C) const;
638 
639  /// Return true if the input function, which is cold at all call sites,
640  /// should use the coldcc calling convention.
641  bool useColdCCForColdCall(Function &F) const;
642 
643  unsigned getScalarizationOverhead(Type *Ty, bool Insert, bool Extract) const;
644 
645  unsigned getOperandsScalarizationOverhead(ArrayRef<const Value *> Args,
646  unsigned VF) const;
647 
648  /// If target has efficient vector element load/store instructions, it can
649  /// return true here so that insertion/extraction costs are not added to
650  /// the scalarization cost of a load/store.
651  bool supportsEfficientVectorElementLoadStore() const;
652 
653  /// Don't restrict interleaved unrolling to small loops.
654  bool enableAggressiveInterleaving(bool LoopHasReductions) const;
655 
656  /// Returns options for expansion of memcmp. IsZeroCmp is
657  /// true if this is the expansion of memcmp(p1, p2, s) == 0.
658  struct MemCmpExpansionOptions {
659  // Return true if memcmp expansion is enabled.
660  operator bool() const { return MaxNumLoads > 0; }
661 
662  // Maximum number of load operations.
663  unsigned MaxNumLoads = 0;
664 
665  // The list of available load sizes (in bytes), sorted in decreasing order.
666  SmallVector<unsigned, 8> LoadSizes;
667 
668  // For memcmp expansion when the memcmp result is only compared equal or
669  // not-equal to 0, allow up to this number of load pairs per block. As an
670  // example, this may allow 'memcmp(a, b, 3) == 0' in a single block:
671  // a0 = load2bytes &a[0]
672  // b0 = load2bytes &b[0]
673  // a2 = load1byte &a[2]
674  // b2 = load1byte &b[2]
675  // r = cmp eq (a0 ^ b0 | a2 ^ b2), 0
676  unsigned NumLoadsPerBlock = 1;
677 
678  // Set to true to allow overlapping loads. For example, 7-byte compares can
679  // be done with two 4-byte compares instead of 4+2+1-byte compares. This
680  // requires all loads in LoadSizes to be doable in an unaligned way.
681  bool AllowOverlappingLoads = false;
682  };
683  MemCmpExpansionOptions enableMemCmpExpansion(bool OptSize,
684  bool IsZeroCmp) const;
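  // Illustrative sketch only (not part of the original header): a client such
  // as a memcmp-expansion pass might query and use these options roughly as
  // follows (OptForSize and IsUsedOnlyForZeroCmp are assumed):
  //
  //   const auto Options = TTI.enableMemCmpExpansion(OptForSize,
  //                                                  IsUsedOnlyForZeroCmp);
  //   if (!Options)   // expansion is disabled for this target
  //     return false;
  //   // otherwise honour Options.MaxNumLoads, Options.LoadSizes, etc.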
685 
686  /// Enable matching of interleaved access groups.
687  bool enableInterleavedAccessVectorization() const;
688 
689  /// Enable matching of interleaved access groups that contain predicated
690  /// accesses or gaps and therefore vectorized using masked
691  /// vector loads/stores.
692  bool enableMaskedInterleavedAccessVectorization() const;
693 
694  /// Indicate that it is potentially unsafe to automatically vectorize
695  /// floating-point operations because the semantics of vector and scalar
696  /// floating-point operations may differ. For example, ARM NEON v7 SIMD math
697  /// does not support IEEE-754 denormal numbers, while depending on the
698  /// platform, scalar floating-point math does.
699  /// This applies to floating-point math operations and calls, not memory
700  /// operations, shuffles, or casts.
701  bool isFPVectorizationPotentiallyUnsafe() const;
702 
703  /// Determine if the target supports unaligned memory accesses.
704  bool allowsMisalignedMemoryAccesses(LLVMContext &Context,
705  unsigned BitWidth, unsigned AddressSpace = 0,
706  unsigned Alignment = 1,
707  bool *Fast = nullptr) const;
708 
709  /// Return hardware support for population count.
710  PopcntSupportKind getPopcntSupport(unsigned IntTyWidthInBit) const;
711 
712  /// Return true if the hardware has a fast square-root instruction.
713  bool haveFastSqrt(Type *Ty) const;
714 
715  /// Return true if it is faster to check if a floating-point value is NaN
716  /// (or not-NaN) versus a comparison against a constant FP zero value.
717  /// Targets should override this if materializing a 0.0 for comparison is
718  /// generally as cheap as checking for ordered/unordered.
719  bool isFCmpOrdCheaperThanFCmpZero(Type *Ty) const;
720 
721  /// Return the expected cost of supporting the floating point operation
722  /// of the specified type.
723  int getFPOpCost(Type *Ty) const;
724 
725  /// Return the expected cost of materializing for the given integer
726  /// immediate of the specified type.
727  int getIntImmCost(const APInt &Imm, Type *Ty) const;
728 
729  /// Return the expected cost of materialization for the given integer
730  /// immediate of the specified type for a given instruction. The cost can be
731  /// zero if the immediate can be folded into the specified instruction.
732  int getIntImmCost(unsigned Opc, unsigned Idx, const APInt &Imm,
733  Type *Ty) const;
734  int getIntImmCost(Intrinsic::ID IID, unsigned Idx, const APInt &Imm,
735  Type *Ty) const;
736 
737  /// Return the expected cost for the given integer when optimising
738  /// for size. This is different than the other integer immediate cost
739  /// functions in that it is subtarget agnostic. This is useful when you e.g.
740  /// target one ISA such as AArch32 but smaller encodings could be possible
741  /// with another such as Thumb. This return value is used as a penalty when
742  /// the total cost of a constant is calculated (the bigger the cost, the
743  /// more beneficial constant hoisting is).
744  int getIntImmCodeSizeCost(unsigned Opc, unsigned Idx, const APInt &Imm,
745  Type *Ty) const;
746  /// @}
747 
748  /// \name Vector Target Information
749  /// @{
750 
751  /// The various kinds of shuffle patterns for vector queries.
752  enum ShuffleKind {
753  SK_Broadcast, ///< Broadcast element 0 to all other elements.
754  SK_Reverse, ///< Reverse the order of the vector.
755  SK_Select, ///< Selects elements from the corresponding lane of
756  ///< either source operand. This is equivalent to a
757  ///< vector select with a constant condition operand.
758  SK_Transpose, ///< Transpose two vectors.
759  SK_InsertSubvector, ///< InsertSubvector. Index indicates start offset.
760  SK_ExtractSubvector, ///< ExtractSubvector. Index indicates start offset.
761  SK_PermuteTwoSrc, ///< Merge elements from two source vectors into one
762  ///< with any shuffle mask.
763  SK_PermuteSingleSrc ///< Shuffle elements of single source vector with any
764  ///< shuffle mask.
765  };
766 
767  /// Additional information about an operand's possible values.
768  enum OperandValueKind {
769  OK_AnyValue, // Operand can have any value.
770  OK_UniformValue, // Operand is uniform (splat of a value).
771  OK_UniformConstantValue, // Operand is uniform constant.
772  OK_NonUniformConstantValue // Operand is a non uniform constant value.
773  };
774 
775  /// Additional properties of an operand's values.
776  enum OperandValueProperties { OP_None = 0, OP_PowerOf2 = 1 };
777 
778  /// \return The number of scalar or vector registers that the target has.
779  /// If 'Vectors' is true, it returns the number of vector registers. If it is
780  /// set to false, it returns the number of scalar registers.
781  unsigned getNumberOfRegisters(bool Vector) const;
782 
783  /// \return The width of the largest scalar or vector register type.
784  unsigned getRegisterBitWidth(bool Vector) const;
785 
786  /// \return The width of the smallest vector register type.
787  unsigned getMinVectorRegisterBitWidth() const;
788 
789  /// \return True if the vectorization factor should be chosen to
790  /// make the vector of the smallest element type match the size of a
791  /// vector register. For wider element types, this could result in
792  /// creating vectors that span multiple vector registers.
793  /// If false, the vectorization factor will be chosen based on the
794  /// size of the widest element type.
795  bool shouldMaximizeVectorBandwidth(bool OptSize) const;
796 
797  /// \return The minimum vectorization factor for types of given element
798  /// bit width, or 0 if there is no minimum VF. The returned value only
799  /// applies when shouldMaximizeVectorBandwidth returns true.
800  unsigned getMinimumVF(unsigned ElemWidth) const;
801 
802  /// \return True if it should be considered for address type promotion.
803  /// \p AllowPromotionWithoutCommonHeader Set true if promoting \p I is
804  /// profitable without finding other extensions fed by the same input.
805  bool shouldConsiderAddressTypePromotion(
806  const Instruction &I, bool &AllowPromotionWithoutCommonHeader) const;
807 
808  /// \return The size of a cache line in bytes.
809  unsigned getCacheLineSize() const;
810 
811  /// The possible cache levels
812  enum class CacheLevel {
813  L1D, // The L1 data cache
814  L2D, // The L2 data cache
815 
816  // We currently do not model L3 caches, as their sizes differ widely between
817  // microarchitectures. Also, we currently do not have a use for L3 cache
818  // size modeling yet.
819  };
820 
821  /// \return The size of the cache level in bytes, if available.
822  llvm::Optional<unsigned> getCacheSize(CacheLevel Level) const;
823 
824  /// \return The associativity of the cache level, if available.
825  llvm::Optional<unsigned> getCacheAssociativity(CacheLevel Level) const;
826 
827  /// \return How much before a load we should place the prefetch instruction.
828  /// This is currently measured in number of instructions.
829  unsigned getPrefetchDistance() const;
830 
831  /// \return Some HW prefetchers can handle accesses up to a certain constant
832  /// stride. This is the minimum stride in bytes where it makes sense to start
833  /// adding SW prefetches. The default is 1, i.e. prefetch with any stride.
834  unsigned getMinPrefetchStride() const;
835 
836  /// \return The maximum number of iterations to prefetch ahead. If the
837  /// required number of iterations is more than this number, no prefetching is
838  /// performed.
839  unsigned getMaxPrefetchIterationsAhead() const;
840 
841  /// \return The maximum interleave factor that any transform should try to
842  /// perform for this target. This number depends on the level of parallelism
843  /// and the number of execution units in the CPU.
844  unsigned getMaxInterleaveFactor(unsigned VF) const;
845 
846  /// Collect properties of V used in cost analysis, e.g. OP_PowerOf2.
847  static OperandValueKind getOperandInfo(Value *V,
848  OperandValueProperties &OpProps);
849 
850  /// This is an approximation of reciprocal throughput of a math/logic op.
851  /// A higher cost indicates less expected throughput.
852  /// From Agner Fog's guides, reciprocal throughput is "the average number of
853  /// clock cycles per instruction when the instructions are not part of a
854  /// limiting dependency chain."
855  /// Therefore, costs should be scaled to account for multiple execution units
856  /// on the target that can process this type of instruction. For example, if
857  /// there are 5 scalar integer units and 2 vector integer units that can
858  /// calculate an 'add' in a single cycle, this model should indicate that the
859  /// cost of the vector add instruction is 2.5 times the cost of the scalar
860  /// add instruction.
861  /// \p Args is an optional argument which holds the instruction operands
862  /// values so the TTI can analyze those values searching for special
863  /// cases or optimizations based on those values.
864  int getArithmeticInstrCost(
865  unsigned Opcode, Type *Ty, OperandValueKind Opd1Info = OK_AnyValue,
866  OperandValueKind Opd2Info = OK_AnyValue,
867  OperandValueProperties Opd1PropInfo = OP_None,
868  OperandValueProperties Opd2PropInfo = OP_None,
869  ArrayRef<const Value *> Args = ArrayRef<const Value *>()) const;
870 
871  /// \return The cost of a shuffle instruction of kind Kind and of type Tp.
872  /// The index and subtype parameters are used by the subvector insertion and
873  /// extraction shuffle kinds to show the insert/extract point and the type of
874  /// the subvector being inserted/extracted.
875  /// NOTE: For subvector extractions Tp represents the source type.
876  int getShuffleCost(ShuffleKind Kind, Type *Tp, int Index = 0,
877  Type *SubTp = nullptr) const;
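  // Illustrative sketch only (not part of the original header): querying the
  // cost of extracting the low half of a wider vector, where VecTy and
  // SubVecTy are assumed to be the corresponding VectorType* values:
  //
  //   int Cost = TTI.getShuffleCost(TargetTransformInfo::SK_ExtractSubvector,
  //                                 VecTy, /*Index=*/0, SubVecTy);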
878 
879  /// \return The expected cost of cast instructions, such as bitcast, trunc,
880  /// zext, etc. If there is an existing instruction that holds Opcode, it
881  /// may be passed in the 'I' parameter.
882  int getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
883  const Instruction *I = nullptr) const;
884 
885  /// \return The expected cost of a sign- or zero-extended vector extract. Use
886  /// -1 to indicate that there is no information about the index value.
887  int getExtractWithExtendCost(unsigned Opcode, Type *Dst, VectorType *VecTy,
888  unsigned Index = -1) const;
889 
890  /// \return The expected cost of control-flow related instructions such as
891  /// Phi, Ret, Br.
892  int getCFInstrCost(unsigned Opcode) const;
893 
894  /// \returns The expected cost of compare and select instructions. If there
895  /// is an existing instruction that holds Opcode, it may be passed in the
896  /// 'I' parameter.
897  int getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
898  Type *CondTy = nullptr, const Instruction *I = nullptr) const;
899 
900  /// \return The expected cost of vector Insert and Extract.
901  /// Use -1 to indicate that there is no information on the index value.
902  int getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index = -1) const;
903 
904  /// \return The cost of Load and Store instructions.
905  int getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment,
906  unsigned AddressSpace, const Instruction *I = nullptr) const;
907 
908  /// \return The cost of masked Load and Store instructions.
909  int getMaskedMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment,
910  unsigned AddressSpace) const;
911 
912  /// \return The cost of Gather or Scatter operation
913  /// \p Opcode - is a type of memory access Load or Store
914  /// \p DataTy - a vector type of the data to be loaded or stored
915  /// \p Ptr - pointer [or vector of pointers] - address[es] in memory
916  /// \p VariableMask - true when the memory access is predicated with a mask
917  /// that is not a compile-time constant
918  /// \p Alignment - alignment of single element
919  int getGatherScatterOpCost(unsigned Opcode, Type *DataTy, Value *Ptr,
920  bool VariableMask, unsigned Alignment) const;
921 
922  /// \return The cost of the interleaved memory operation.
923  /// \p Opcode is the memory operation code
924  /// \p VecTy is the vector type of the interleaved access.
925  /// \p Factor is the interleave factor
926  /// \p Indices is the indices for interleaved load members (as interleaved
927  /// load allows gaps)
928  /// \p Alignment is the alignment of the memory operation
929  /// \p AddressSpace is address space of the pointer.
930  /// \p UseMaskForCond indicates if the memory access is predicated.
931  /// \p UseMaskForGaps indicates if gaps should be masked.
932  int getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor,
933  ArrayRef<unsigned> Indices, unsigned Alignment,
934  unsigned AddressSpace,
935  bool UseMaskForCond = false,
936  bool UseMaskForGaps = false) const;
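  // For illustration only (not part of the original header): the cost of an
  // interleaved load group with factor 2 where both members are used, e.g. a
  // strided pattern A[2*i] / A[2*i+1] loaded as one wide vector (WideVecTy is
  // an assumed VectorType*):
  //
  //   unsigned Indices[] = {0, 1};
  //   int Cost = TTI.getInterleavedMemoryOpCost(Instruction::Load, WideVecTy,
  //                                             /*Factor=*/2, Indices,
  //                                             /*Alignment=*/4,
  //                                             /*AddressSpace=*/0);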
937 
938  /// Calculate the cost of performing a vector reduction.
939  ///
940  /// This is the cost of reducing the vector value of type \p Ty to a scalar
941  /// value using the operation denoted by \p Opcode. The form of the reduction
942  /// can either be a pairwise reduction or a reduction that splits the vector
943  /// at every reduction level.
944  ///
945  /// Pairwise:
946  /// (v0, v1, v2, v3)
947  /// ((v0+v1), (v2+v3), undef, undef)
948  /// Split:
949  /// (v0, v1, v2, v3)
950  /// ((v0+v2), (v1+v3), undef, undef)
951  int getArithmeticReductionCost(unsigned Opcode, Type *Ty,
952  bool IsPairwiseForm) const;
953  int getMinMaxReductionCost(Type *Ty, Type *CondTy, bool IsPairwiseForm,
954  bool IsUnsigned) const;
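  // Illustrative sketch only (not part of the original header): the cost of a
  // non-pairwise (split) integer add reduction of a vector value of type
  // VecTy (an assumed VectorType*):
  //
  //   int Cost = TTI.getArithmeticReductionCost(Instruction::Add, VecTy,
  //                                             /*IsPairwiseForm=*/false);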
955 
956  /// \returns The cost of Intrinsic instructions. Analyses the real arguments.
957  /// Three cases are handled: 1. scalar instruction 2. vector instruction
958  /// 3. scalar instruction which is to be vectorized with VF.
959  int getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy,
960  ArrayRef<Value *> Args, FastMathFlags FMF,
961  unsigned VF = 1) const;
962 
963  /// \returns The cost of Intrinsic instructions. Types analysis only.
964  /// If ScalarizationCostPassed is UINT_MAX, the cost of scalarizing the
965  /// arguments and the return value will be computed based on types.
966  int getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy,
967  ArrayRef<Type *> Tys, FastMathFlags FMF,
968  unsigned ScalarizationCostPassed = UINT_MAX) const;
969 
970  /// \returns The cost of Call instructions.
971  int getCallInstrCost(Function *F, Type *RetTy, ArrayRef<Type *> Tys) const;
972 
973  /// \returns The number of pieces into which the provided type must be
974  /// split during legalization. Zero is returned when the answer is unknown.
975  unsigned getNumberOfParts(Type *Tp) const;
976 
977  /// \returns The cost of the address computation. For most targets this can be
978  /// merged into the instruction indexing mode. Some targets might want to
979  /// distinguish between address computation for memory operations on vector
980  /// types and scalar types. Such targets should override this function.
981  /// The 'SE' parameter holds a pointer to the scalar evolution object, which
982  /// is used to get the step value of 'Ptr' in the case of a constant stride.
983  /// The 'Ptr' parameter holds SCEV of the access pointer.
984  int getAddressComputationCost(Type *Ty, ScalarEvolution *SE = nullptr,
985  const SCEV *Ptr = nullptr) const;
986 
987  /// \returns The cost, if any, of keeping values of the given types alive
988  /// over a callsite.
989  ///
990  /// Some types may require the use of register classes that do not have
991  /// any callee-saved registers, so would require a spill and fill.
992  unsigned getCostOfKeepingLiveOverCall(ArrayRef<Type *> Tys) const;
993 
994  /// \returns True if the intrinsic is a supported memory intrinsic. Info
995  /// will contain additional information - whether the intrinsic may write
996  /// or read to memory, volatility and the pointer. Info is undefined
997  /// if false is returned.
998  bool getTgtMemIntrinsic(IntrinsicInst *Inst, MemIntrinsicInfo &Info) const;
999 
1000  /// \returns The maximum element size, in bytes, for an element
1001  /// unordered-atomic memory intrinsic.
1002  unsigned getAtomicMemIntrinsicMaxElementSize() const;
1003 
1004  /// \returns A value which is the result of the given memory intrinsic. New
1005  /// instructions may be created to extract the result from the given intrinsic
1006  /// memory operation. Returns nullptr if the target cannot create a result
1007  /// from the given intrinsic.
1008  Value *getOrCreateResultFromMemIntrinsic(IntrinsicInst *Inst,
1009  Type *ExpectedType) const;
1010 
1011  /// \returns The type to use in a loop expansion of a memcpy call.
1012  Type *getMemcpyLoopLoweringType(LLVMContext &Context, Value *Length,
1013  unsigned SrcAlign, unsigned DestAlign) const;
1014 
1015  /// \param[out] OpsOut The operand types to copy RemainingBytes of memory.
1016  /// \param RemainingBytes The number of bytes to copy.
1017  ///
1018  /// Calculates the operand types to use when copying \p RemainingBytes of
1019  /// memory, where source and destination alignments are \p SrcAlign and
1020  /// \p DestAlign respectively.
1021  void getMemcpyLoopResidualLoweringType(SmallVectorImpl<Type *> &OpsOut,
1022  LLVMContext &Context,
1023  unsigned RemainingBytes,
1024  unsigned SrcAlign,
1025  unsigned DestAlign) const;
1026 
1027  /// \returns True if the two functions have compatible attributes for inlining
1028  /// purposes.
1029  bool areInlineCompatible(const Function *Caller,
1030  const Function *Callee) const;
1031 
1032  /// \returns True if the caller and callee agree on how \p Args will be passed
1033  /// to the callee.
1034  /// \param[out] Args The list of compatible arguments. The implementation may
1035  /// filter out any incompatible args from this list.
1036  bool areFunctionArgsABICompatible(const Function *Caller,
1037  const Function *Callee,
1038  SmallPtrSetImpl<Argument *> &Args) const;
1039 
1040  /// The type of load/store indexing.
1041  enum MemIndexedMode {
1042  MIM_Unindexed, ///< No indexing.
1043  MIM_PreInc, ///< Pre-incrementing.
1044  MIM_PreDec, ///< Pre-decrementing.
1045  MIM_PostInc, ///< Post-incrementing.
1046  MIM_PostDec ///< Post-decrementing.
1047  };
1048 
1049  /// \returns True if the specified indexed load for the given type is legal.
1050  bool isIndexedLoadLegal(enum MemIndexedMode Mode, Type *Ty) const;
1051 
1052  /// \returns True if the specified indexed store for the given type is legal.
1053  bool isIndexedStoreLegal(enum MemIndexedMode Mode, Type *Ty) const;
1054 
1055  /// \returns The bitwidth of the largest vector type that should be used to
1056  /// load/store in the given address space.
1057  unsigned getLoadStoreVecRegBitWidth(unsigned AddrSpace) const;
1058 
1059  /// \returns True if the load instruction is legal to vectorize.
1060  bool isLegalToVectorizeLoad(LoadInst *LI) const;
1061 
1062  /// \returns True if the store instruction is legal to vectorize.
1063  bool isLegalToVectorizeStore(StoreInst *SI) const;
1064 
1065  /// \returns True if it is legal to vectorize the given load chain.
1066  bool isLegalToVectorizeLoadChain(unsigned ChainSizeInBytes,
1067  unsigned Alignment,
1068  unsigned AddrSpace) const;
1069 
1070  /// \returns True if it is legal to vectorize the given store chain.
1071  bool isLegalToVectorizeStoreChain(unsigned ChainSizeInBytes,
1072  unsigned Alignment,
1073  unsigned AddrSpace) const;
1074 
1075  /// \returns The new vector factor value if the target doesn't support \p
1076  /// SizeInBytes loads or has a better vector factor.
1077  unsigned getLoadVectorFactor(unsigned VF, unsigned LoadSize,
1078  unsigned ChainSizeInBytes,
1079  VectorType *VecTy) const;
1080 
1081  /// \returns The new vector factor value if the target doesn't support \p
1082  /// SizeInBytes stores or has a better vector factor.
1083  unsigned getStoreVectorFactor(unsigned VF, unsigned StoreSize,
1084  unsigned ChainSizeInBytes,
1085  VectorType *VecTy) const;
1086 
1087  /// Flags describing the kind of vector reduction.
1088  struct ReductionFlags {
1089  ReductionFlags() : IsMaxOp(false), IsSigned(false), NoNaN(false) {}
1090  bool IsMaxOp; ///< If the op is a min/max kind, true if it's a max operation.
1091  bool IsSigned; ///< Whether the operation is a signed int reduction.
1092  bool NoNaN; ///< If op is an fp min/max, whether NaNs may be present.
1093  };
1094 
1095  /// \returns True if the target wants to handle the given reduction idiom in
1096  /// the intrinsics form instead of the shuffle form.
1097  bool useReductionIntrinsic(unsigned Opcode, Type *Ty,
1098  ReductionFlags Flags) const;
1099 
1100  /// \returns True if the target wants to expand the given reduction intrinsic
1101  /// into a shuffle sequence.
1102  bool shouldExpandReduction(const IntrinsicInst *II) const;
1103 
1104  /// \returns the size cost of rematerializing a GlobalValue address relative
1105  /// to a stack reload.
1106  unsigned getGISelRematGlobalCost() const;
1107 
1108  /// @}
1109 
1110 private:
1111  /// Estimate the latency of specified instruction.
1112  /// Returns 1 as the default value.
1113  int getInstructionLatency(const Instruction *I) const;
1114 
1115  /// Returns the expected throughput cost of the instruction.
1116  /// Returns -1 if the cost is unknown.
1117  int getInstructionThroughput(const Instruction *I) const;
1118 
1119  /// The abstract base class used to type erase specific TTI
1120  /// implementations.
1121  class Concept;
1122 
1123  /// The template model for the base class which wraps a concrete
1124  /// implementation in a type erased interface.
1125  template <typename T> class Model;
1126 
1127  std::unique_ptr<Concept> TTIImpl;
1128 };
1129 
1130 class TargetTransformInfo::Concept {
1131 public:
1132  virtual ~Concept() = 0;
1133  virtual const DataLayout &getDataLayout() const = 0;
1134  virtual int getOperationCost(unsigned Opcode, Type *Ty, Type *OpTy) = 0;
1135  virtual int getGEPCost(Type *PointeeType, const Value *Ptr,
1136  ArrayRef<const Value *> Operands) = 0;
1137  virtual int getExtCost(const Instruction *I, const Value *Src) = 0;
1138  virtual int getCallCost(FunctionType *FTy, int NumArgs, const User *U) = 0;
1139  virtual int getCallCost(const Function *F, int NumArgs, const User *U) = 0;
1140  virtual int getCallCost(const Function *F,
1141  ArrayRef<const Value *> Arguments, const User *U) = 0;
1142  virtual unsigned getInliningThresholdMultiplier() = 0;
1143  virtual int getInlinerVectorBonusPercent() = 0;
1144  virtual int getIntrinsicCost(Intrinsic::ID IID, Type *RetTy,
1145  ArrayRef<Type *> ParamTys, const User *U) = 0;
1146  virtual int getIntrinsicCost(Intrinsic::ID IID, Type *RetTy,
1147  ArrayRef<const Value *> Arguments,
1148  const User *U) = 0;
1149  virtual int getMemcpyCost(const Instruction *I) = 0;
1150  virtual unsigned getEstimatedNumberOfCaseClusters(const SwitchInst &SI,
1151  unsigned &JTSize) = 0;
1152  virtual int
1153  getUserCost(const User *U, ArrayRef<const Value *> Operands) = 0;
1154  virtual bool hasBranchDivergence() = 0;
1155  virtual bool isSourceOfDivergence(const Value *V) = 0;
1156  virtual bool isAlwaysUniform(const Value *V) = 0;
1157  virtual unsigned getFlatAddressSpace() = 0;
1158  virtual bool isLoweredToCall(const Function *F) = 0;
1159  virtual void getUnrollingPreferences(Loop *L, ScalarEvolution &,
1160  UnrollingPreferences &UP) = 0;
1161  virtual bool isHardwareLoopProfitable(Loop *L, ScalarEvolution &SE,
1162  AssumptionCache &AC,
1163  TargetLibraryInfo *LibInfo,
1164  HardwareLoopInfo &HWLoopInfo) = 0;
1165  virtual bool isLegalAddImmediate(int64_t Imm) = 0;
1166  virtual bool isLegalICmpImmediate(int64_t Imm) = 0;
1167  virtual bool isLegalAddressingMode(Type *Ty, GlobalValue *BaseGV,
1168  int64_t BaseOffset, bool HasBaseReg,
1169  int64_t Scale,
1170  unsigned AddrSpace,
1171  Instruction *I) = 0;
1172  virtual bool isLSRCostLess(TargetTransformInfo::LSRCost &C1,
1173  TargetTransformInfo::LSRCost &C2) = 0;
1174  virtual bool canMacroFuseCmp() = 0;
1175  virtual bool canSaveCmp(Loop *L, BranchInst **BI, ScalarEvolution *SE,
1176  LoopInfo *LI, DominatorTree *DT, AssumptionCache *AC,
1177  TargetLibraryInfo *LibInfo) = 0;
1178  virtual bool shouldFavorPostInc() const = 0;
1179  virtual bool shouldFavorBackedgeIndex(const Loop *L) const = 0;
1180  virtual bool isLegalMaskedStore(Type *DataType) = 0;
1181  virtual bool isLegalMaskedLoad(Type *DataType) = 0;
1182  virtual bool isLegalNTStore(Type *DataType, unsigned Alignment) = 0;
1183  virtual bool isLegalNTLoad(Type *DataType, unsigned Alignment) = 0;
1184  virtual bool isLegalMaskedScatter(Type *DataType) = 0;
1185  virtual bool isLegalMaskedGather(Type *DataType) = 0;
1186  virtual bool isLegalMaskedCompressStore(Type *DataType) = 0;
1187  virtual bool isLegalMaskedExpandLoad(Type *DataType) = 0;
1188  virtual bool hasDivRemOp(Type *DataType, bool IsSigned) = 0;
1189  virtual bool hasVolatileVariant(Instruction *I, unsigned AddrSpace) = 0;
1190  virtual bool prefersVectorizedAddressing() = 0;
1191  virtual int getScalingFactorCost(Type *Ty, GlobalValue *BaseGV,
1192  int64_t BaseOffset, bool HasBaseReg,
1193  int64_t Scale, unsigned AddrSpace) = 0;
1194  virtual bool LSRWithInstrQueries() = 0;
1195  virtual bool isTruncateFree(Type *Ty1, Type *Ty2) = 0;
1196  virtual bool isProfitableToHoist(Instruction *I) = 0;
1197  virtual bool useAA() = 0;
1198  virtual bool isTypeLegal(Type *Ty) = 0;
1199  virtual unsigned getJumpBufAlignment() = 0;
1200  virtual unsigned getJumpBufSize() = 0;
1201  virtual bool shouldBuildLookupTables() = 0;
1202  virtual bool shouldBuildLookupTablesForConstant(Constant *C) = 0;
1203  virtual bool useColdCCForColdCall(Function &F) = 0;
1204  virtual unsigned
1205  getScalarizationOverhead(Type *Ty, bool Insert, bool Extract) = 0;
1206  virtual unsigned getOperandsScalarizationOverhead(ArrayRef<const Value *> Args,
1207  unsigned VF) = 0;
1208  virtual bool supportsEfficientVectorElementLoadStore() = 0;
1209  virtual bool enableAggressiveInterleaving(bool LoopHasReductions) = 0;
1210  virtual MemCmpExpansionOptions
1211  enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const = 0;
1212  virtual bool enableInterleavedAccessVectorization() = 0;
1213  virtual bool enableMaskedInterleavedAccessVectorization() = 0;
1214  virtual bool isFPVectorizationPotentiallyUnsafe() = 0;
1215  virtual bool allowsMisalignedMemoryAccesses(LLVMContext &Context,
1216  unsigned BitWidth,
1217  unsigned AddressSpace,
1218  unsigned Alignment,
1219  bool *Fast) = 0;
1220  virtual PopcntSupportKind getPopcntSupport(unsigned IntTyWidthInBit) = 0;
1221  virtual bool haveFastSqrt(Type *Ty) = 0;
1222  virtual bool isFCmpOrdCheaperThanFCmpZero(Type *Ty) = 0;
1223  virtual int getFPOpCost(Type *Ty) = 0;
1224  virtual int getIntImmCodeSizeCost(unsigned Opc, unsigned Idx, const APInt &Imm,
1225  Type *Ty) = 0;
1226  virtual int getIntImmCost(const APInt &Imm, Type *Ty) = 0;
1227  virtual int getIntImmCost(unsigned Opc, unsigned Idx, const APInt &Imm,
1228  Type *Ty) = 0;
1229  virtual int getIntImmCost(Intrinsic::ID IID, unsigned Idx, const APInt &Imm,
1230  Type *Ty) = 0;
1231  virtual unsigned getNumberOfRegisters(bool Vector) = 0;
1232  virtual unsigned getRegisterBitWidth(bool Vector) const = 0;
1233  virtual unsigned getMinVectorRegisterBitWidth() = 0;
1234  virtual bool shouldMaximizeVectorBandwidth(bool OptSize) const = 0;
1235  virtual unsigned getMinimumVF(unsigned ElemWidth) const = 0;
1236  virtual bool shouldConsiderAddressTypePromotion(
1237  const Instruction &I, bool &AllowPromotionWithoutCommonHeader) = 0;
1238  virtual unsigned getCacheLineSize() = 0;
1239  virtual llvm::Optional<unsigned> getCacheSize(CacheLevel Level) = 0;
1240  virtual llvm::Optional<unsigned> getCacheAssociativity(CacheLevel Level) = 0;
1241  virtual unsigned getPrefetchDistance() = 0;
1242  virtual unsigned getMinPrefetchStride() = 0;
1243  virtual unsigned getMaxPrefetchIterationsAhead() = 0;
1244  virtual unsigned getMaxInterleaveFactor(unsigned VF) = 0;
1245  virtual unsigned
1246  getArithmeticInstrCost(unsigned Opcode, Type *Ty, OperandValueKind Opd1Info,
1247  OperandValueKind Opd2Info,
1248  OperandValueProperties Opd1PropInfo,
1249  OperandValueProperties Opd2PropInfo,
1250  ArrayRef<const Value *> Args) = 0;
1251  virtual int getShuffleCost(ShuffleKind Kind, Type *Tp, int Index,
1252  Type *SubTp) = 0;
1253  virtual int getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
1254  const Instruction *I) = 0;
1255  virtual int getExtractWithExtendCost(unsigned Opcode, Type *Dst,
1256  VectorType *VecTy, unsigned Index) = 0;
1257  virtual int getCFInstrCost(unsigned Opcode) = 0;
1258  virtual int getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
1259  Type *CondTy, const Instruction *I) = 0;
1260  virtual int getVectorInstrCost(unsigned Opcode, Type *Val,
1261  unsigned Index) = 0;
1262  virtual int getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment,
1263  unsigned AddressSpace, const Instruction *I) = 0;
1264  virtual int getMaskedMemoryOpCost(unsigned Opcode, Type *Src,
1265  unsigned Alignment,
1266  unsigned AddressSpace) = 0;
1267  virtual int getGatherScatterOpCost(unsigned Opcode, Type *DataTy,
1268  Value *Ptr, bool VariableMask,
1269  unsigned Alignment) = 0;
1270  virtual int getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy,
1271  unsigned Factor,
1272  ArrayRef<unsigned> Indices,
1273  unsigned Alignment,
1274  unsigned AddressSpace,
1275  bool UseMaskForCond = false,
1276  bool UseMaskForGaps = false) = 0;
1277  virtual int getArithmeticReductionCost(unsigned Opcode, Type *Ty,
1278  bool IsPairwiseForm) = 0;
1279  virtual int getMinMaxReductionCost(Type *Ty, Type *CondTy,
1280  bool IsPairwiseForm, bool IsUnsigned) = 0;
1281  virtual int getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy,
1282  ArrayRef<Type *> Tys, FastMathFlags FMF,
1283  unsigned ScalarizationCostPassed) = 0;
1284  virtual int getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy,
1285  ArrayRef<Value *> Args, FastMathFlags FMF, unsigned VF) = 0;
1286  virtual int getCallInstrCost(Function *F, Type *RetTy,
1287  ArrayRef<Type *> Tys) = 0;
1288  virtual unsigned getNumberOfParts(Type *Tp) = 0;
1289  virtual int getAddressComputationCost(Type *Ty, ScalarEvolution *SE,
1290  const SCEV *Ptr) = 0;
1291  virtual unsigned getCostOfKeepingLiveOverCall(ArrayRef<Type *> Tys) = 0;
1292  virtual bool getTgtMemIntrinsic(IntrinsicInst *Inst,
1293  MemIntrinsicInfo &Info) = 0;
1294  virtual unsigned getAtomicMemIntrinsicMaxElementSize() const = 0;
1295  virtual Value *getOrCreateResultFromMemIntrinsic(IntrinsicInst *Inst,
1296  Type *ExpectedType) = 0;
1297  virtual Type *getMemcpyLoopLoweringType(LLVMContext &Context, Value *Length,
1298  unsigned SrcAlign,
1299  unsigned DestAlign) const = 0;
1300  virtual void getMemcpyLoopResidualLoweringType(
1301  SmallVectorImpl<Type *> &OpsOut, LLVMContext &Context,
1302  unsigned RemainingBytes, unsigned SrcAlign, unsigned DestAlign) const = 0;
1303  virtual bool areInlineCompatible(const Function *Caller,
1304  const Function *Callee) const = 0;
1305  virtual bool
1306  areFunctionArgsABICompatible(const Function *Caller, const Function *Callee,
1307  SmallPtrSetImpl<Argument *> &Args) const = 0;
1308  virtual bool isIndexedLoadLegal(MemIndexedMode Mode, Type *Ty) const = 0;
1309  virtual bool isIndexedStoreLegal(MemIndexedMode Mode, Type *Ty) const = 0;
1310  virtual unsigned getLoadStoreVecRegBitWidth(unsigned AddrSpace) const = 0;
1311  virtual bool isLegalToVectorizeLoad(LoadInst *LI) const = 0;
1312  virtual bool isLegalToVectorizeStore(StoreInst *SI) const = 0;
1313  virtual bool isLegalToVectorizeLoadChain(unsigned ChainSizeInBytes,
1314  unsigned Alignment,
1315  unsigned AddrSpace) const = 0;
1316  virtual bool isLegalToVectorizeStoreChain(unsigned ChainSizeInBytes,
1317  unsigned Alignment,
1318  unsigned AddrSpace) const = 0;
1319  virtual unsigned getLoadVectorFactor(unsigned VF, unsigned LoadSize,
1320  unsigned ChainSizeInBytes,
1321  VectorType *VecTy) const = 0;
1322  virtual unsigned getStoreVectorFactor(unsigned VF, unsigned StoreSize,
1323  unsigned ChainSizeInBytes,
1324  VectorType *VecTy) const = 0;
1325  virtual bool useReductionIntrinsic(unsigned Opcode, Type *Ty,
1326  ReductionFlags) const = 0;
1327  virtual bool shouldExpandReduction(const IntrinsicInst *II) const = 0;
1328  virtual unsigned getGISelRematGlobalCost() const = 0;
1329  virtual int getInstructionLatency(const Instruction *I) = 0;
1330 };
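// Note: the Concept interface above is type-erased by the Model template
// below; each concrete TTI implementation is wrapped in a Model<T> that
// forwards every virtual call to the wrapped implementation object.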
1331 
1332 template <typename T>
1333 class TargetTransformInfo::Model final : public TargetTransformInfo::Concept {
1334  T Impl;
1335 
1336 public:
1337  Model(T Impl) : Impl(std::move(Impl)) {}
1338  ~Model() override {}
1339 
1340  const DataLayout &getDataLayout() const override {
1341  return Impl.getDataLayout();
1342  }
1343 
1344  int getOperationCost(unsigned Opcode, Type *Ty, Type *OpTy) override {
1345  return Impl.getOperationCost(Opcode, Ty, OpTy);
1346  }
1347  int getGEPCost(Type *PointeeType, const Value *Ptr,
1348  ArrayRef<const Value *> Operands) override {
1349  return Impl.getGEPCost(PointeeType, Ptr, Operands);
1350  }
1351  int getExtCost(const Instruction *I, const Value *Src) override {
1352  return Impl.getExtCost(I, Src);
1353  }
1354  int getCallCost(FunctionType *FTy, int NumArgs, const User *U) override {
1355  return Impl.getCallCost(FTy, NumArgs, U);
1356  }
1357  int getCallCost(const Function *F, int NumArgs, const User *U) override {
1358  return Impl.getCallCost(F, NumArgs, U);
1359  }
1360  int getCallCost(const Function *F,
1361  ArrayRef<const Value *> Arguments, const User *U) override {
1362  return Impl.getCallCost(F, Arguments, U);
1363  }
1364  unsigned getInliningThresholdMultiplier() override {
1365  return Impl.getInliningThresholdMultiplier();
1366  }
1367  int getInlinerVectorBonusPercent() override {
1368  return Impl.getInlinerVectorBonusPercent();
1369  }
1370  int getIntrinsicCost(Intrinsic::ID IID, Type *RetTy,
1371  ArrayRef<Type *> ParamTys, const User *U = nullptr) override {
1372  return Impl.getIntrinsicCost(IID, RetTy, ParamTys, U);
1373  }
1374  int getIntrinsicCost(Intrinsic::ID IID, Type *RetTy,
1375  ArrayRef<const Value *> Arguments,
1376  const User *U = nullptr) override {
1377  return Impl.getIntrinsicCost(IID, RetTy, Arguments, U);
1378  }
1379  int getMemcpyCost(const Instruction *I) override {
1380  return Impl.getMemcpyCost(I);
1381  }
1382  int getUserCost(const User *U, ArrayRef<const Value *> Operands) override {
1383  return Impl.getUserCost(U, Operands);
1384  }
1385  bool hasBranchDivergence() override { return Impl.hasBranchDivergence(); }
1386  bool isSourceOfDivergence(const Value *V) override {
1387  return Impl.isSourceOfDivergence(V);
1388  }
1389 
1390  bool isAlwaysUniform(const Value *V) override {
1391  return Impl.isAlwaysUniform(V);
1392  }
1393 
1394  unsigned getFlatAddressSpace() override {
1395  return Impl.getFlatAddressSpace();
1396  }
1397 
1398  bool isLoweredToCall(const Function *F) override {
1399  return Impl.isLoweredToCall(F);
1400  }
1401  void getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
1402  UnrollingPreferences &UP) override {
1403  return Impl.getUnrollingPreferences(L, SE, UP);
1404  }
1405  bool isHardwareLoopProfitable(Loop *L, ScalarEvolution &SE,
1406  AssumptionCache &AC,
1407  TargetLibraryInfo *LibInfo,
1408  HardwareLoopInfo &HWLoopInfo) override {
1409  return Impl.isHardwareLoopProfitable(L, SE, AC, LibInfo, HWLoopInfo);
1410  }
1411  bool isLegalAddImmediate(int64_t Imm) override {
1412  return Impl.isLegalAddImmediate(Imm);
1413  }
1414  bool isLegalICmpImmediate(int64_t Imm) override {
1415  return Impl.isLegalICmpImmediate(Imm);
1416  }
1417  bool isLegalAddressingMode(Type *Ty, GlobalValue *BaseGV, int64_t BaseOffset,
1418  bool HasBaseReg, int64_t Scale,
1419  unsigned AddrSpace,
1420  Instruction *I) override {
1421  return Impl.isLegalAddressingMode(Ty, BaseGV, BaseOffset, HasBaseReg,
1422  Scale, AddrSpace, I);
1423  }
1424  bool isLSRCostLess(TargetTransformInfo::LSRCost &C1,
1425  TargetTransformInfo::LSRCost &C2) override {
1426  return Impl.isLSRCostLess(C1, C2);
1427  }
1428  bool canMacroFuseCmp() override {
1429  return Impl.canMacroFuseCmp();
1430  }
1431  bool canSaveCmp(Loop *L, BranchInst **BI,
1432  ScalarEvolution *SE,
1433  LoopInfo *LI, DominatorTree *DT, AssumptionCache *AC,
1434  TargetLibraryInfo *LibInfo) override {
1435  return Impl.canSaveCmp(L, BI, SE, LI, DT, AC, LibInfo);
1436  }
1437  bool shouldFavorPostInc() const override {
1438  return Impl.shouldFavorPostInc();
1439  }
1440  bool shouldFavorBackedgeIndex(const Loop *L) const override {
1441  return Impl.shouldFavorBackedgeIndex(L);
1442  }
1443  bool isLegalMaskedStore(Type *DataType) override {
1444  return Impl.isLegalMaskedStore(DataType);
1445  }
1446  bool isLegalMaskedLoad(Type *DataType) override {
1447  return Impl.isLegalMaskedLoad(DataType);
1448  }
1449  bool isLegalNTStore(Type *DataType, unsigned Alignment) override {
1450  return Impl.isLegalNTStore(DataType, Alignment);
1451  }
1452  bool isLegalNTLoad(Type *DataType, unsigned Alignment) override {
1453  return Impl.isLegalNTLoad(DataType, Alignment);
1454  }
1455  bool isLegalMaskedScatter(Type *DataType) override {
1456  return Impl.isLegalMaskedScatter(DataType);
1457  }
1458  bool isLegalMaskedGather(Type *DataType) override {
1459  return Impl.isLegalMaskedGather(DataType);
1460  }
1461  bool isLegalMaskedCompressStore(Type *DataType) override {
1462  return Impl.isLegalMaskedCompressStore(DataType);
1463  }
1464  bool isLegalMaskedExpandLoad(Type *DataType) override {
1465  return Impl.isLegalMaskedExpandLoad(DataType);
1466  }
1467  bool hasDivRemOp(Type *DataType, bool IsSigned) override {
1468  return Impl.hasDivRemOp(DataType, IsSigned);
1469  }
1470  bool hasVolatileVariant(Instruction *I, unsigned AddrSpace) override {
1471  return Impl.hasVolatileVariant(I, AddrSpace);
1472  }
1473  bool prefersVectorizedAddressing() override {
1474  return Impl.prefersVectorizedAddressing();
1475  }
1476  int getScalingFactorCost(Type *Ty, GlobalValue *BaseGV, int64_t BaseOffset,
1477  bool HasBaseReg, int64_t Scale,
1478  unsigned AddrSpace) override {
1479  return Impl.getScalingFactorCost(Ty, BaseGV, BaseOffset, HasBaseReg,
1480  Scale, AddrSpace);
1481  }
1482  bool LSRWithInstrQueries() override {
1483  return Impl.LSRWithInstrQueries();
1484  }
1485  bool isTruncateFree(Type *Ty1, Type *Ty2) override {
1486  return Impl.isTruncateFree(Ty1, Ty2);
1487  }
1488  bool isProfitableToHoist(Instruction *I) override {
1489  return Impl.isProfitableToHoist(I);
1490  }
1491  bool useAA() override { return Impl.useAA(); }
1492  bool isTypeLegal(Type *Ty) override { return Impl.isTypeLegal(Ty); }
1493  unsigned getJumpBufAlignment() override { return Impl.getJumpBufAlignment(); }
1494  unsigned getJumpBufSize() override { return Impl.getJumpBufSize(); }
1495  bool shouldBuildLookupTables() override {
1496  return Impl.shouldBuildLookupTables();
1497  }
1498  bool shouldBuildLookupTablesForConstant(Constant *C) override {
1499  return Impl.shouldBuildLookupTablesForConstant(C);
1500  }
1501  bool useColdCCForColdCall(Function &F) override {
1502  return Impl.useColdCCForColdCall(F);
1503  }
1504 
1505  unsigned getScalarizationOverhead(Type *Ty, bool Insert,
1506  bool Extract) override {
1507  return Impl.getScalarizationOverhead(Ty, Insert, Extract);
1508  }
1509  unsigned getOperandsScalarizationOverhead(ArrayRef<const Value *> Args,
1510  unsigned VF) override {
1511  return Impl.getOperandsScalarizationOverhead(Args, VF);
1512  }
1513 
1514  bool supportsEfficientVectorElementLoadStore() override {
1515  return Impl.supportsEfficientVectorElementLoadStore();
1516  }
1517 
1518  bool enableAggressiveInterleaving(bool LoopHasReductions) override {
1519  return Impl.enableAggressiveInterleaving(LoopHasReductions);
1520  }
1521  MemCmpExpansionOptions enableMemCmpExpansion(bool OptSize,
1522  bool IsZeroCmp) const override {
1523  return Impl.enableMemCmpExpansion(OptSize, IsZeroCmp);
1524  }
1525  bool enableInterleavedAccessVectorization() override {
1526  return Impl.enableInterleavedAccessVectorization();
1527  }
1528  bool enableMaskedInterleavedAccessVectorization() override {
1529  return Impl.enableMaskedInterleavedAccessVectorization();
1530  }
1531  bool isFPVectorizationPotentiallyUnsafe() override {
1532  return Impl.isFPVectorizationPotentiallyUnsafe();
1533  }
1534  bool allowsMisalignedMemoryAccesses(LLVMContext &Context,
1535  unsigned BitWidth, unsigned AddressSpace,
1536  unsigned Alignment, bool *Fast) override {
1537  return Impl.allowsMisalignedMemoryAccesses(Context, BitWidth, AddressSpace,
1538  Alignment, Fast);
1539  }
1540  PopcntSupportKind getPopcntSupport(unsigned IntTyWidthInBit) override {
1541  return Impl.getPopcntSupport(IntTyWidthInBit);
1542  }
1543  bool haveFastSqrt(Type *Ty) override { return Impl.haveFastSqrt(Ty); }
1544 
1545  bool isFCmpOrdCheaperThanFCmpZero(Type *Ty) override {
1546  return Impl.isFCmpOrdCheaperThanFCmpZero(Ty);
1547  }
1548 
1549  int getFPOpCost(Type *Ty) override { return Impl.getFPOpCost(Ty); }
1550 
1551  int getIntImmCodeSizeCost(unsigned Opc, unsigned Idx, const APInt &Imm,
1552  Type *Ty) override {
1553  return Impl.getIntImmCodeSizeCost(Opc, Idx, Imm, Ty);
1554  }
1555  int getIntImmCost(const APInt &Imm, Type *Ty) override {
1556  return Impl.getIntImmCost(Imm, Ty);
1557  }
1558  int getIntImmCost(unsigned Opc, unsigned Idx, const APInt &Imm,
1559  Type *Ty) override {
1560  return Impl.getIntImmCost(Opc, Idx, Imm, Ty);
1561  }
1562  int getIntImmCost(Intrinsic::ID IID, unsigned Idx, const APInt &Imm,
1563  Type *Ty) override {
1564  return Impl.getIntImmCost(IID, Idx, Imm, Ty);
1565  }
1566  unsigned getNumberOfRegisters(bool Vector) override {
1567  return Impl.getNumberOfRegisters(Vector);
1568  }
1569  unsigned getRegisterBitWidth(bool Vector) const override {
1570  return Impl.getRegisterBitWidth(Vector);
1571  }
1572  unsigned getMinVectorRegisterBitWidth() override {
1573  return Impl.getMinVectorRegisterBitWidth();
1574  }
1575  bool shouldMaximizeVectorBandwidth(bool OptSize) const override {
1576  return Impl.shouldMaximizeVectorBandwidth(OptSize);
1577  }
1578  unsigned getMinimumVF(unsigned ElemWidth) const override {
1579  return Impl.getMinimumVF(ElemWidth);
1580  }
1581  bool shouldConsiderAddressTypePromotion(
1582  const Instruction &I, bool &AllowPromotionWithoutCommonHeader) override {
1583  return Impl.shouldConsiderAddressTypePromotion(
1584  I, AllowPromotionWithoutCommonHeader);
1585  }
1586  unsigned getCacheLineSize() override {
1587  return Impl.getCacheLineSize();
1588  }
1589  llvm::Optional<unsigned> getCacheSize(CacheLevel Level) override {
1590  return Impl.getCacheSize(Level);
1591  }
1592  llvm::Optional<unsigned> getCacheAssociativity(CacheLevel Level) override {
1593  return Impl.getCacheAssociativity(Level);
1594  }
1595  unsigned getPrefetchDistance() override { return Impl.getPrefetchDistance(); }
1596  unsigned getMinPrefetchStride() override {
1597  return Impl.getMinPrefetchStride();
1598  }
1599  unsigned getMaxPrefetchIterationsAhead() override {
1600  return Impl.getMaxPrefetchIterationsAhead();
1601  }
1602  unsigned getMaxInterleaveFactor(unsigned VF) override {
1603  return Impl.getMaxInterleaveFactor(VF);
1604  }
1605  unsigned getEstimatedNumberOfCaseClusters(const SwitchInst &SI,
1606  unsigned &JTSize) override {
1607  return Impl.getEstimatedNumberOfCaseClusters(SI, JTSize);
1608  }
1609  unsigned
1610  getArithmeticInstrCost(unsigned Opcode, Type *Ty, OperandValueKind Opd1Info,
1611  OperandValueKind Opd2Info,
1612  OperandValueProperties Opd1PropInfo,
1613  OperandValueProperties Opd2PropInfo,
1614  ArrayRef<const Value *> Args) override {
1615  return Impl.getArithmeticInstrCost(Opcode, Ty, Opd1Info, Opd2Info,
1616  Opd1PropInfo, Opd2PropInfo, Args);
1617  }
1618  int getShuffleCost(ShuffleKind Kind, Type *Tp, int Index,
1619  Type *SubTp) override {
1620  return Impl.getShuffleCost(Kind, Tp, Index, SubTp);
1621  }
1622  int getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
1623  const Instruction *I) override {
1624  return Impl.getCastInstrCost(Opcode, Dst, Src, I);
1625  }
1626  int getExtractWithExtendCost(unsigned Opcode, Type *Dst, VectorType *VecTy,
1627  unsigned Index) override {
1628  return Impl.getExtractWithExtendCost(Opcode, Dst, VecTy, Index);
1629  }
1630  int getCFInstrCost(unsigned Opcode) override {
1631  return Impl.getCFInstrCost(Opcode);
1632  }
1633  int getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy,
1634  const Instruction *I) override {
1635  return Impl.getCmpSelInstrCost(Opcode, ValTy, CondTy, I);
1636  }
1637  int getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index) override {
1638  return Impl.getVectorInstrCost(Opcode, Val, Index);
1639  }
1640  int getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment,
1641  unsigned AddressSpace, const Instruction *I) override {
1642  return Impl.getMemoryOpCost(Opcode, Src, Alignment, AddressSpace, I);
1643  }
1644  int getMaskedMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment,
1645  unsigned AddressSpace) override {
1646  return Impl.getMaskedMemoryOpCost(Opcode, Src, Alignment, AddressSpace);
1647  }
1648  int getGatherScatterOpCost(unsigned Opcode, Type *DataTy,
1649  Value *Ptr, bool VariableMask,
1650  unsigned Alignment) override {
1651  return Impl.getGatherScatterOpCost(Opcode, DataTy, Ptr, VariableMask,
1652  Alignment);
1653  }
1654  int getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor,
1655  ArrayRef<unsigned> Indices, unsigned Alignment,
1656  unsigned AddressSpace, bool UseMaskForCond,
1657  bool UseMaskForGaps) override {
1658  return Impl.getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
1659  Alignment, AddressSpace,
1660  UseMaskForCond, UseMaskForGaps);
1661  }
1662  int getArithmeticReductionCost(unsigned Opcode, Type *Ty,
1663  bool IsPairwiseForm) override {
1664  return Impl.getArithmeticReductionCost(Opcode, Ty, IsPairwiseForm);
1665  }
1666  int getMinMaxReductionCost(Type *Ty, Type *CondTy,
1667  bool IsPairwiseForm, bool IsUnsigned) override {
1668  return Impl.getMinMaxReductionCost(Ty, CondTy, IsPairwiseForm, IsUnsigned);
1669  }
1670  int getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy, ArrayRef<Type *> Tys,
1671  FastMathFlags FMF, unsigned ScalarizationCostPassed) override {
1672  return Impl.getIntrinsicInstrCost(ID, RetTy, Tys, FMF,
1673  ScalarizationCostPassed);
1674  }
1675  int getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy,
1676  ArrayRef<Value *> Args, FastMathFlags FMF, unsigned VF) override {
1677  return Impl.getIntrinsicInstrCost(ID, RetTy, Args, FMF, VF);
1678  }
1679  int getCallInstrCost(Function *F, Type *RetTy,
1680  ArrayRef<Type *> Tys) override {
1681  return Impl.getCallInstrCost(F, RetTy, Tys);
1682  }
1683  unsigned getNumberOfParts(Type *Tp) override {
1684  return Impl.getNumberOfParts(Tp);
1685  }
1686  int getAddressComputationCost(Type *Ty, ScalarEvolution *SE,
1687  const SCEV *Ptr) override {
1688  return Impl.getAddressComputationCost(Ty, SE, Ptr);
1689  }
1690  unsigned getCostOfKeepingLiveOverCall(ArrayRef<Type *> Tys) override {
1691  return Impl.getCostOfKeepingLiveOverCall(Tys);
1692  }
1693  bool getTgtMemIntrinsic(IntrinsicInst *Inst,
1694  MemIntrinsicInfo &Info) override {
1695  return Impl.getTgtMemIntrinsic(Inst, Info);
1696  }
1697  unsigned getAtomicMemIntrinsicMaxElementSize() const override {
1698  return Impl.getAtomicMemIntrinsicMaxElementSize();
1699  }
1700  Value *getOrCreateResultFromMemIntrinsic(IntrinsicInst *Inst,
1701  Type *ExpectedType) override {
1702  return Impl.getOrCreateResultFromMemIntrinsic(Inst, ExpectedType);
1703  }
1704  Type *getMemcpyLoopLoweringType(LLVMContext &Context, Value *Length,
1705  unsigned SrcAlign,
1706  unsigned DestAlign) const override {
1707  return Impl.getMemcpyLoopLoweringType(Context, Length, SrcAlign, DestAlign);
1708  }
1709  void getMemcpyLoopResidualLoweringType(SmallVectorImpl<Type *> &OpsOut,
1710  LLVMContext &Context,
1711  unsigned RemainingBytes,
1712  unsigned SrcAlign,
1713  unsigned DestAlign) const override {
1714  Impl.getMemcpyLoopResidualLoweringType(OpsOut, Context, RemainingBytes,
1715  SrcAlign, DestAlign);
1716  }
1717  bool areInlineCompatible(const Function *Caller,
1718  const Function *Callee) const override {
1719  return Impl.areInlineCompatible(Caller, Callee);
1720  }
1721  bool areFunctionArgsABICompatible(
1722  const Function *Caller, const Function *Callee,
1723  SmallPtrSetImpl<Argument *> &Args) const override {
1724  return Impl.areFunctionArgsABICompatible(Caller, Callee, Args);
1725  }
1726  bool isIndexedLoadLegal(MemIndexedMode Mode, Type *Ty) const override {
1727  return Impl.isIndexedLoadLegal(Mode, Ty, getDataLayout());
1728  }
1729  bool isIndexedStoreLegal(MemIndexedMode Mode, Type *Ty) const override {
1730  return Impl.isIndexedStoreLegal(Mode, Ty, getDataLayout());
1731  }
1732  unsigned getLoadStoreVecRegBitWidth(unsigned AddrSpace) const override {
1733  return Impl.getLoadStoreVecRegBitWidth(AddrSpace);
1734  }
1735  bool isLegalToVectorizeLoad(LoadInst *LI) const override {
1736  return Impl.isLegalToVectorizeLoad(LI);
1737  }
1738  bool isLegalToVectorizeStore(StoreInst *SI) const override {
1739  return Impl.isLegalToVectorizeStore(SI);
1740  }
1741  bool isLegalToVectorizeLoadChain(unsigned ChainSizeInBytes,
1742  unsigned Alignment,
1743  unsigned AddrSpace) const override {
1744  return Impl.isLegalToVectorizeLoadChain(ChainSizeInBytes, Alignment,
1745  AddrSpace);
1746  }
1747  bool isLegalToVectorizeStoreChain(unsigned ChainSizeInBytes,
1748  unsigned Alignment,
1749  unsigned AddrSpace) const override {
1750  return Impl.isLegalToVectorizeStoreChain(ChainSizeInBytes, Alignment,
1751  AddrSpace);
1752  }
1753  unsigned getLoadVectorFactor(unsigned VF, unsigned LoadSize,
1754  unsigned ChainSizeInBytes,
1755  VectorType *VecTy) const override {
1756  return Impl.getLoadVectorFactor(VF, LoadSize, ChainSizeInBytes, VecTy);
1757  }
1758  unsigned getStoreVectorFactor(unsigned VF, unsigned StoreSize,
1759  unsigned ChainSizeInBytes,
1760  VectorType *VecTy) const override {
1761  return Impl.getStoreVectorFactor(VF, StoreSize, ChainSizeInBytes, VecTy);
1762  }
1763  bool useReductionIntrinsic(unsigned Opcode, Type *Ty,
1764  ReductionFlags Flags) const override {
1765  return Impl.useReductionIntrinsic(Opcode, Ty, Flags);
1766  }
1767  bool shouldExpandReduction(const IntrinsicInst *II) const override {
1768  return Impl.shouldExpandReduction(II);
1769  }
1770 
1771  unsigned getGISelRematGlobalCost() const override {
1772  return Impl.getGISelRematGlobalCost();
1773  }
1774 
1775  int getInstructionLatency(const Instruction *I) override {
1776  return Impl.getInstructionLatency(I);
1777  }
1778 };
1779 
1780 template <typename T>
1781 TargetTransformInfo::TargetTransformInfo(T Impl)
1782  : TTIImpl(new Model<T>(Impl)) {}
1783 
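// Illustrative sketch, not part of this header: a target's TargetMachine
// subclass typically feeds its TTI implementation through the constructor
// above. The names MyTargetMachine and MyTTIImpl below are hypothetical.
//
//   TargetTransformInfo
//   MyTargetMachine::getTargetTransformInfo(const Function &F) {
//     return TargetTransformInfo(MyTTIImpl(this, F));
//   }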
1784 /// Analysis pass providing the \c TargetTransformInfo.
1785 ///
1786 /// The core idea of the TargetIRAnalysis is to expose an interface through
1787 /// which LLVM targets can analyze and provide information about the middle
1788 /// end's target-independent IR. This supports use cases such as target-aware
1789 /// cost modeling of IR constructs.
1790 ///
1791 /// This is a function analysis because much of the cost modeling for targets
1792 /// is done in a subtarget-specific way and LLVM supports compiling different
1793 /// functions targeting different subtargets in order to support runtime
1794 /// dispatch according to the observed subtarget.
1795 class TargetIRAnalysis : public AnalysisInfoMixin<TargetIRAnalysis> {
1796 public:
1797  typedef TargetTransformInfo Result;
1798 
1799  /// Default construct a target IR analysis.
1800  ///
1801  /// This will use the module's datalayout to construct a baseline
1802  /// conservative TTI result.
1803  TargetIRAnalysis();
1804 
1805  /// Construct an IR analysis pass around a target-provided callback.
1806  ///
1807  /// The callback will be called with a particular function for which the TTI
1808  /// is needed and must return a TTI object for that function.
1809  TargetIRAnalysis(std::function<Result(const Function &)> TTICallback);
1810 
1811  // Value semantics. We spell out the constructors for MSVC.
1812  TargetIRAnalysis(const TargetIRAnalysis &Arg)
1813  : TTICallback(Arg.TTICallback) {}
1814  TargetIRAnalysis(TargetIRAnalysis &&Arg)
1815  : TTICallback(std::move(Arg.TTICallback)) {}
1816  TargetIRAnalysis &operator=(const TargetIRAnalysis &RHS) {
1817  TTICallback = RHS.TTICallback;
1818  return *this;
1819  }
1820  TargetIRAnalysis &operator=(TargetIRAnalysis &&RHS) {
1821  TTICallback = std::move(RHS.TTICallback);
1822  return *this;
1823  }
1824 
1825  Result run(const Function &F, FunctionAnalysisManager &);
1826 
1827 private:
1828  friend AnalysisInfoMixin<TargetIRAnalysis>;
1829  static AnalysisKey Key;
1830 
1831  /// The callback used to produce a result.
1832  ///
1833  /// We use a completely opaque callback so that targets can provide whatever
1834  /// mechanism they desire for constructing the TTI for a given function.
1835  ///
1836  /// FIXME: Should we really use std::function? It's relatively inefficient.
1837  /// It might be possible to arrange for even stateful callbacks to outlive
1838  /// the analysis and thus use a function_ref which would be lighter weight.
1839  /// This may also be less error prone as the callback is likely to reference
1840  /// the external TargetMachine, and that reference needs to never dangle.
1841  std::function<Result(const Function &)> TTICallback;
1842 
1843  /// Helper function used as the callback in the default constructor.
1844  static Result getDefaultTTI(const Function &F);
1845 };
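// Illustrative sketch, not part of this header: under the new pass manager,
// TargetIRAnalysis is registered like any other function analysis and its
// result is the TargetTransformInfo for the queried function. TM is assumed
// to be a pointer to an initialized TargetMachine and F an existing Function.
//
//   FunctionAnalysisManager FAM;
//   FAM.registerPass([&] { return TM->getTargetIRAnalysis(); });
//   TargetTransformInfo &TTI = FAM.getResult<TargetIRAnalysis>(F);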
1846 
1847 /// Wrapper pass for TargetTransformInfo.
1848 ///
1849 /// This pass can be constructed from a TTI object which it stores internally
1850 /// and is queried by passes.
1851 class TargetTransformInfoWrapperPass : public ImmutablePass {
1852  TargetIRAnalysis TIRA;
1853  Optional<TargetTransformInfo> TTI;
1854 
1855  virtual void anchor();
1856 
1857 public:
1858  static char ID;
1859 
1860  /// We must provide a default constructor for the pass but it should
1861  /// never be used.
1862  ///
1863  /// Use the constructor below or call one of the creation routines.
1864  TargetTransformInfoWrapperPass();
1865 
1866  explicit TargetTransformInfoWrapperPass(TargetIRAnalysis TIRA);
1867 
1868  TargetTransformInfo &getTTI(const Function &F);
1869 };
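// Illustrative sketch, not part of this header: legacy pass manager clients
// declare a dependency on the wrapper pass and then query it per function.
// MyPass is a hypothetical legacy FunctionPass.
//
//   void MyPass::getAnalysisUsage(AnalysisUsage &AU) const {
//     AU.addRequired<TargetTransformInfoWrapperPass>();
//   }
//   bool MyPass::runOnFunction(Function &F) {
//     TargetTransformInfo &TTI =
//         getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
//     return false;
//   }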
1870 
1871 /// Create an analysis pass wrapper around a TTI object.
1872 ///
1873 /// This analysis pass just holds the TTI instance and makes it available to
1874 /// clients.
1875 ImmutablePass *createTargetTransformInfoWrapperPass(TargetIRAnalysis TIRA);
1876 
1877 } // End llvm namespace
1878 
1879 #endif