//===-- NVPTXTargetTransformInfo.cpp - NVPTX specific TTI -----------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
#include "NVPTXTargetTransformInfo.h"
#include "NVPTXUtilities.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/CodeGen/BasicTTIImpl.h"
#include "llvm/CodeGen/CostTable.h"
#include "llvm/CodeGen/TargetLowering.h"
#include "llvm/IR/IntrinsicsNVPTX.h"
#include "llvm/Support/Debug.h"
using namespace llvm;

#define DEBUG_TYPE "NVPTXtti"
22 
23 // Whether the given intrinsic reads threadIdx.x/y/z.
24 static bool readsThreadIndex(const IntrinsicInst *II) {
25  switch (II->getIntrinsicID()) {
26  default: return false;
27  case Intrinsic::nvvm_read_ptx_sreg_tid_x:
28  case Intrinsic::nvvm_read_ptx_sreg_tid_y:
29  case Intrinsic::nvvm_read_ptx_sreg_tid_z:
30  return true;
31  }
32 }
33 
34 static bool readsLaneId(const IntrinsicInst *II) {
35  return II->getIntrinsicID() == Intrinsic::nvvm_read_ptx_sreg_laneid;
36 }
37 
38 // Whether the given intrinsic is an atomic instruction in PTX.
39 static bool isNVVMAtomic(const IntrinsicInst *II) {
40  switch (II->getIntrinsicID()) {
41  default: return false;
42  case Intrinsic::nvvm_atomic_load_inc_32:
43  case Intrinsic::nvvm_atomic_load_dec_32:
44 
45  case Intrinsic::nvvm_atomic_add_gen_f_cta:
46  case Intrinsic::nvvm_atomic_add_gen_f_sys:
47  case Intrinsic::nvvm_atomic_add_gen_i_cta:
48  case Intrinsic::nvvm_atomic_add_gen_i_sys:
49  case Intrinsic::nvvm_atomic_and_gen_i_cta:
50  case Intrinsic::nvvm_atomic_and_gen_i_sys:
51  case Intrinsic::nvvm_atomic_cas_gen_i_cta:
52  case Intrinsic::nvvm_atomic_cas_gen_i_sys:
53  case Intrinsic::nvvm_atomic_dec_gen_i_cta:
54  case Intrinsic::nvvm_atomic_dec_gen_i_sys:
55  case Intrinsic::nvvm_atomic_inc_gen_i_cta:
56  case Intrinsic::nvvm_atomic_inc_gen_i_sys:
57  case Intrinsic::nvvm_atomic_max_gen_i_cta:
58  case Intrinsic::nvvm_atomic_max_gen_i_sys:
59  case Intrinsic::nvvm_atomic_min_gen_i_cta:
60  case Intrinsic::nvvm_atomic_min_gen_i_sys:
61  case Intrinsic::nvvm_atomic_or_gen_i_cta:
62  case Intrinsic::nvvm_atomic_or_gen_i_sys:
63  case Intrinsic::nvvm_atomic_exch_gen_i_cta:
64  case Intrinsic::nvvm_atomic_exch_gen_i_sys:
65  case Intrinsic::nvvm_atomic_xor_gen_i_cta:
66  case Intrinsic::nvvm_atomic_xor_gen_i_sys:
67  return true;
68  }
69 }
70 
72  // Without inter-procedural analysis, we conservatively assume that arguments
73  // to __device__ functions are divergent.
74  if (const Argument *Arg = dyn_cast<Argument>(V))
75  return !isKernelFunction(*Arg->getParent());
76 
77  if (const Instruction *I = dyn_cast<Instruction>(V)) {
78  // Without pointer analysis, we conservatively assume values loaded from
79  // generic or local address space are divergent.
80  if (const LoadInst *LI = dyn_cast<LoadInst>(I)) {
81  unsigned AS = LI->getPointerAddressSpace();
82  return AS == ADDRESS_SPACE_GENERIC || AS == ADDRESS_SPACE_LOCAL;
83  }
84  // Atomic instructions may cause divergence. Atomic instructions are
85  // executed sequentially across all threads in a warp. Therefore, an earlier
86  // executed thread may see different memory inputs than a later executed
87  // thread. For example, suppose *a = 0 initially.
88  //
89  // atom.global.add.s32 d, [a], 1
90  //
91  // returns 0 for the first thread that enters the critical region, and 1 for
92  // the second thread.
93  if (I->isAtomic())
94  return true;
95  if (const IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) {
96  // Instructions that read threadIdx are obviously divergent.
97  if (readsThreadIndex(II) || readsLaneId(II))
98  return true;
99  // Handle the NVPTX atomic instrinsics that cannot be represented as an
100  // atomic IR instruction.
101  if (isNVVMAtomic(II))
102  return true;
103  }
104  // Conservatively consider the return value of function calls as divergent.
105  // We could analyze callees with bodies more precisely using
106  // inter-procedural analysis.
107  if (isa<CallInst>(I))
108  return true;
109  }
110 
111  return false;
112 }
113 
114 // Convert NVVM intrinsics to target-generic LLVM code where possible.
116  // Each NVVM intrinsic we can simplify can be replaced with one of:
117  //
118  // * an LLVM intrinsic,
119  // * an LLVM cast operation,
120  // * an LLVM binary operation, or
121  // * ad-hoc LLVM IR for the particular operation.
122 
123  // Some transformations are only valid when the module's
124  // flush-denormals-to-zero (ftz) setting is true/false, whereas other
125  // transformations are valid regardless of the module's ftz setting.
126  enum FtzRequirementTy {
127  FTZ_Any, // Any ftz setting is ok.
128  FTZ_MustBeOn, // Transformation is valid only if ftz is on.
129  FTZ_MustBeOff, // Transformation is valid only if ftz is off.
130  };
131  // Classes of NVVM intrinsics that can't be replaced one-to-one with a
132  // target-generic intrinsic, cast op, or binary op but that we can nonetheless
133  // simplify.
134  enum SpecialCase {
135  SPC_Reciprocal,
136  };
137 
138  // SimplifyAction is a poor-man's variant (plus an additional flag) that
139  // represents how to replace an NVVM intrinsic with target-generic LLVM IR.
140  struct SimplifyAction {
141  // Invariant: At most one of these Optionals has a value.
145  Optional<SpecialCase> Special;
146 
147  FtzRequirementTy FtzRequirement = FTZ_Any;
148 
149  SimplifyAction() = default;
150 
151  SimplifyAction(Intrinsic::ID IID, FtzRequirementTy FtzReq)
152  : IID(IID), FtzRequirement(FtzReq) {}
153 
154  // Cast operations don't have anything to do with FTZ, so we skip that
155  // argument.
156  SimplifyAction(Instruction::CastOps CastOp) : CastOp(CastOp) {}
157 
158  SimplifyAction(Instruction::BinaryOps BinaryOp, FtzRequirementTy FtzReq)
159  : BinaryOp(BinaryOp), FtzRequirement(FtzReq) {}
160 
161  SimplifyAction(SpecialCase Special, FtzRequirementTy FtzReq)
162  : Special(Special), FtzRequirement(FtzReq) {}
163  };
164 
165  // Try to generate a SimplifyAction describing how to replace our
166  // IntrinsicInstr with target-generic LLVM IR.
167  const SimplifyAction Action = [II]() -> SimplifyAction {
168  switch (II->getIntrinsicID()) {
169  // NVVM intrinsics that map directly to LLVM intrinsics.
170  case Intrinsic::nvvm_ceil_d:
171  return {Intrinsic::ceil, FTZ_Any};
172  case Intrinsic::nvvm_ceil_f:
173  return {Intrinsic::ceil, FTZ_MustBeOff};
174  case Intrinsic::nvvm_ceil_ftz_f:
175  return {Intrinsic::ceil, FTZ_MustBeOn};
176  case Intrinsic::nvvm_fabs_d:
177  return {Intrinsic::fabs, FTZ_Any};
178  case Intrinsic::nvvm_fabs_f:
179  return {Intrinsic::fabs, FTZ_MustBeOff};
180  case Intrinsic::nvvm_fabs_ftz_f:
181  return {Intrinsic::fabs, FTZ_MustBeOn};
182  case Intrinsic::nvvm_floor_d:
183  return {Intrinsic::floor, FTZ_Any};
184  case Intrinsic::nvvm_floor_f:
185  return {Intrinsic::floor, FTZ_MustBeOff};
186  case Intrinsic::nvvm_floor_ftz_f:
187  return {Intrinsic::floor, FTZ_MustBeOn};
188  case Intrinsic::nvvm_fma_rn_d:
189  return {Intrinsic::fma, FTZ_Any};
190  case Intrinsic::nvvm_fma_rn_f:
191  return {Intrinsic::fma, FTZ_MustBeOff};
192  case Intrinsic::nvvm_fma_rn_ftz_f:
193  return {Intrinsic::fma, FTZ_MustBeOn};
194  case Intrinsic::nvvm_fmax_d:
195  return {Intrinsic::maxnum, FTZ_Any};
196  case Intrinsic::nvvm_fmax_f:
197  return {Intrinsic::maxnum, FTZ_MustBeOff};
198  case Intrinsic::nvvm_fmax_ftz_f:
199  return {Intrinsic::maxnum, FTZ_MustBeOn};
200  case Intrinsic::nvvm_fmin_d:
201  return {Intrinsic::minnum, FTZ_Any};
202  case Intrinsic::nvvm_fmin_f:
203  return {Intrinsic::minnum, FTZ_MustBeOff};
204  case Intrinsic::nvvm_fmin_ftz_f:
205  return {Intrinsic::minnum, FTZ_MustBeOn};
206  case Intrinsic::nvvm_round_d:
207  return {Intrinsic::round, FTZ_Any};
208  case Intrinsic::nvvm_round_f:
209  return {Intrinsic::round, FTZ_MustBeOff};
210  case Intrinsic::nvvm_round_ftz_f:
211  return {Intrinsic::round, FTZ_MustBeOn};
212  case Intrinsic::nvvm_sqrt_rn_d:
213  return {Intrinsic::sqrt, FTZ_Any};
214  case Intrinsic::nvvm_sqrt_f:
215  // nvvm_sqrt_f is a special case. For most intrinsics, foo_ftz_f is the
216  // ftz version, and foo_f is the non-ftz version. But nvvm_sqrt_f adopts
217  // the ftz-ness of the surrounding code. sqrt_rn_f and sqrt_rn_ftz_f are
218  // the versions with explicit ftz-ness.
219  return {Intrinsic::sqrt, FTZ_Any};
220  case Intrinsic::nvvm_sqrt_rn_f:
221  return {Intrinsic::sqrt, FTZ_MustBeOff};
222  case Intrinsic::nvvm_sqrt_rn_ftz_f:
223  return {Intrinsic::sqrt, FTZ_MustBeOn};
224  case Intrinsic::nvvm_trunc_d:
225  return {Intrinsic::trunc, FTZ_Any};
226  case Intrinsic::nvvm_trunc_f:
227  return {Intrinsic::trunc, FTZ_MustBeOff};
228  case Intrinsic::nvvm_trunc_ftz_f:
229  return {Intrinsic::trunc, FTZ_MustBeOn};
230 
231  // NVVM intrinsics that map to LLVM cast operations.
232  //
233  // Note that llvm's target-generic conversion operators correspond to the rz
234  // (round to zero) versions of the nvvm conversion intrinsics, even though
235  // most everything else here uses the rn (round to nearest even) nvvm ops.
236  case Intrinsic::nvvm_d2i_rz:
237  case Intrinsic::nvvm_f2i_rz:
238  case Intrinsic::nvvm_d2ll_rz:
239  case Intrinsic::nvvm_f2ll_rz:
240  return {Instruction::FPToSI};
241  case Intrinsic::nvvm_d2ui_rz:
242  case Intrinsic::nvvm_f2ui_rz:
243  case Intrinsic::nvvm_d2ull_rz:
244  case Intrinsic::nvvm_f2ull_rz:
245  return {Instruction::FPToUI};
246  case Intrinsic::nvvm_i2d_rz:
247  case Intrinsic::nvvm_i2f_rz:
248  case Intrinsic::nvvm_ll2d_rz:
249  case Intrinsic::nvvm_ll2f_rz:
250  return {Instruction::SIToFP};
251  case Intrinsic::nvvm_ui2d_rz:
252  case Intrinsic::nvvm_ui2f_rz:
253  case Intrinsic::nvvm_ull2d_rz:
254  case Intrinsic::nvvm_ull2f_rz:
255  return {Instruction::UIToFP};
256 
257  // NVVM intrinsics that map to LLVM binary ops.
258  case Intrinsic::nvvm_add_rn_d:
259  return {Instruction::FAdd, FTZ_Any};
260  case Intrinsic::nvvm_add_rn_f:
261  return {Instruction::FAdd, FTZ_MustBeOff};
262  case Intrinsic::nvvm_add_rn_ftz_f:
263  return {Instruction::FAdd, FTZ_MustBeOn};
264  case Intrinsic::nvvm_mul_rn_d:
265  return {Instruction::FMul, FTZ_Any};
266  case Intrinsic::nvvm_mul_rn_f:
267  return {Instruction::FMul, FTZ_MustBeOff};
268  case Intrinsic::nvvm_mul_rn_ftz_f:
269  return {Instruction::FMul, FTZ_MustBeOn};
270  case Intrinsic::nvvm_div_rn_d:
271  return {Instruction::FDiv, FTZ_Any};
272  case Intrinsic::nvvm_div_rn_f:
273  return {Instruction::FDiv, FTZ_MustBeOff};
274  case Intrinsic::nvvm_div_rn_ftz_f:
275  return {Instruction::FDiv, FTZ_MustBeOn};
276 
277  // The remainder of cases are NVVM intrinsics that map to LLVM idioms, but
278  // need special handling.
279  //
280  // We seem to be missing intrinsics for rcp.approx.{ftz.}f32, which is just
281  // as well.
282  case Intrinsic::nvvm_rcp_rn_d:
283  return {SPC_Reciprocal, FTZ_Any};
284  case Intrinsic::nvvm_rcp_rn_f:
285  return {SPC_Reciprocal, FTZ_MustBeOff};
286  case Intrinsic::nvvm_rcp_rn_ftz_f:
287  return {SPC_Reciprocal, FTZ_MustBeOn};
288 
289  // We do not currently simplify intrinsics that give an approximate
290  // answer. These include:
291  //
292  // - nvvm_cos_approx_{f,ftz_f}
293  // - nvvm_ex2_approx_{d,f,ftz_f}
294  // - nvvm_lg2_approx_{d,f,ftz_f}
295  // - nvvm_sin_approx_{f,ftz_f}
296  // - nvvm_sqrt_approx_{f,ftz_f}
297  // - nvvm_rsqrt_approx_{d,f,ftz_f}
298  // - nvvm_div_approx_{ftz_d,ftz_f,f}
299  // - nvvm_rcp_approx_ftz_d
300  //
301  // Ideally we'd encode them as e.g. "fast call @llvm.cos", where "fast"
302  // means that fastmath is enabled in the intrinsic. Unfortunately only
303  // binary operators (currently) have a fastmath bit in SelectionDAG, so
304  // this information gets lost and we can't select on it.
305  //
306  // TODO: div and rcp are lowered to a binary op, so these we could in
307  // theory lower them to "fast fdiv".
308 
309  default:
310  return {};
311  }
312  }();
313 
314  // If Action.FtzRequirementTy is not satisfied by the module's ftz state, we
315  // can bail out now. (Notice that in the case that IID is not an NVVM
316  // intrinsic, we don't have to look up any module metadata, as
317  // FtzRequirementTy will be FTZ_Any.)
318  if (Action.FtzRequirement != FTZ_Any) {
319  StringRef Attr = II->getFunction()
320  ->getFnAttribute("denormal-fp-math-f32")
321  .getValueAsString();
323  bool FtzEnabled = Mode.Output != DenormalMode::IEEE;
324 
325  if (FtzEnabled != (Action.FtzRequirement == FTZ_MustBeOn))
326  return nullptr;
327  }
328 
329  // Simplify to target-generic intrinsic.
330  if (Action.IID) {
332  // All the target-generic intrinsics currently of interest to us have one
333  // type argument, equal to that of the nvvm intrinsic's argument.
334  Type *Tys[] = {II->getArgOperand(0)->getType()};
335  return CallInst::Create(
336  Intrinsic::getDeclaration(II->getModule(), *Action.IID, Tys), Args);
337  }
338 
339  // Simplify to target-generic binary op.
340  if (Action.BinaryOp)
341  return BinaryOperator::Create(*Action.BinaryOp, II->getArgOperand(0),
342  II->getArgOperand(1), II->getName());
343 
344  // Simplify to target-generic cast op.
345  if (Action.CastOp)
346  return CastInst::Create(*Action.CastOp, II->getArgOperand(0), II->getType(),
347  II->getName());
348 
349  // All that's left are the special cases.
350  if (!Action.Special)
351  return nullptr;
352 
353  switch (*Action.Special) {
354  case SPC_Reciprocal:
355  // Simplify reciprocal.
356  return BinaryOperator::Create(
357  Instruction::FDiv, ConstantFP::get(II->getArgOperand(0)->getType(), 1),
358  II->getArgOperand(0), II->getName());
359  }
360  llvm_unreachable("All SpecialCase enumerators should be handled in switch.");
361 }
362 
365  if (Instruction *I = simplifyNvvmIntrinsic(&II, IC)) {
366  return I;
367  }
368  return None;
369 }
370 
372  unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
373  TTI::OperandValueKind Opd1Info, TTI::OperandValueKind Opd2Info,
374  TTI::OperandValueProperties Opd1PropInfo,
376  const Instruction *CxtI) {
377  // Legalize the type.
378  std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);
379 
380  int ISD = TLI->InstructionOpcodeToISD(Opcode);
381 
382  switch (ISD) {
383  default:
384  return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Opd1Info,
385  Opd2Info,
386  Opd1PropInfo, Opd2PropInfo);
387  case ISD::ADD:
388  case ISD::MUL:
389  case ISD::XOR:
390  case ISD::OR:
391  case ISD::AND:
392  // The machine code (SASS) simulates an i64 with two i32. Therefore, we
393  // estimate that arithmetic operations on i64 are twice as expensive as
394  // those on types that can fit into one machine register.
395  if (LT.second.SimpleTy == MVT::i64)
396  return 2 * LT.first;
397  // Delegate other cases to the basic TTI.
398  return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Opd1Info,
399  Opd2Info,
400  Opd1PropInfo, Opd2PropInfo);
401  }
402 }
403 
407  BaseT::getUnrollingPreferences(L, SE, UP, ORE);
408 
409  // Enable partial unrolling and runtime unrolling, but reduce the
410  // threshold. This partially unrolls small loops which are often
411  // unrolled by the PTX to SASS compiler and unrolling earlier can be
412  // beneficial.
413  UP.Partial = UP.Runtime = true;
414  UP.PartialThreshold = UP.Threshold / 4;
415 }
416 
419  BaseT::getPeelingPreferences(L, SE, PP);
420 }
llvm::InstructionCost
Definition: InstructionCost.h:29
llvm::Argument
This class represents an incoming formal argument to a Function.
Definition: Argument.h:29
llvm::BasicTTIImplBase< NVPTXTTIImpl >::DL
const DataLayout & DL
Definition: TargetTransformInfoImpl.h:38
llvm::TargetTransformInfo::UnrollingPreferences::Runtime
bool Runtime
Allow runtime unrolling (unrolling of loops to expand the size of the loop body even when the number ...
Definition: TargetTransformInfo.h:485
llvm::TargetTransformInfo::TargetCostKind
TargetCostKind
The kind of cost model.
Definition: TargetTransformInfo.h:211
llvm::TargetTransformInfo::UnrollingPreferences::PartialThreshold
unsigned PartialThreshold
The cost threshold for the unrolled loop, like Threshold, but used for partial/runtime unrolling (set...
Definition: TargetTransformInfo.h:453
llvm
This file implements support for optimizing divisions by a constant.
Definition: AllocatorList.h:23
isNVVMAtomic
static bool isNVVMAtomic(const IntrinsicInst *II)
Definition: NVPTXTargetTransformInfo.cpp:39
llvm::Instruction::getModule
const Module * getModule() const
Return the module owning the function this instruction belongs to or nullptr it the function does not...
Definition: Instruction.cpp:66
llvm::ISD::OR
@ OR
Definition: ISDOpcodes.h:633
llvm::Intrinsic::getDeclaration
Function * getDeclaration(Module *M, ID id, ArrayRef< Type * > Tys=None)
Create or insert an LLVM Function declaration for an intrinsic, and return it.
Definition: Function.cpp:1379
ceil
We have fiadd patterns now but the followings have the same cost and complexity We need a way to specify the later is more profitable def def The FP stackifier should handle simple permutates to reduce number of shuffle e g ceil
Definition: README-FPStack.txt:54
llvm::Loop
Represents a single loop in the control flow graph.
Definition: LoopInfo.h:530
llvm::IntrinsicInst::getIntrinsicID
Intrinsic::ID getIntrinsicID() const
Return the intrinsic ID of this intrinsic.
Definition: IntrinsicInst.h:52
llvm::SmallVector
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1168
llvm::ADDRESS_SPACE_LOCAL
@ ADDRESS_SPACE_LOCAL
Definition: NVPTXBaseInfo.h:26
llvm::NVPTXTTIImpl::getUnrollingPreferences
void getUnrollingPreferences(Loop *L, ScalarEvolution &SE, TTI::UnrollingPreferences &UP, OptimizationRemarkEmitter *ORE)
Definition: NVPTXTargetTransformInfo.cpp:404
llvm::CastInst::Create
static CastInst * Create(Instruction::CastOps, Value *S, Type *Ty, const Twine &Name="", Instruction *InsertBefore=nullptr)
Provides a way to construct any of the CastInst subclasses using an opcode instead of the subclass's ...
Definition: Instructions.cpp:3076
llvm::ScalarEvolution
The main scalar evolution driver.
Definition: ScalarEvolution.h:460
llvm::BasicTTIImplBase< NVPTXTTIImpl >::getArithmeticInstrCost
InstructionCost getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind, TTI::OperandValueKind Opd1Info=TTI::OK_AnyValue, TTI::OperandValueKind Opd2Info=TTI::OK_AnyValue, TTI::OperandValueProperties Opd1PropInfo=TTI::OP_None, TTI::OperandValueProperties Opd2PropInfo=TTI::OP_None, ArrayRef< const Value * > Args=ArrayRef< const Value * >(), const Instruction *CxtI=nullptr)
Definition: BasicTTIImpl.h:751
ValueTracking.h
readsThreadIndex
static bool readsThreadIndex(const IntrinsicInst *II)
Definition: NVPTXTargetTransformInfo.cpp:24
llvm::TargetTransformInfo::UnrollingPreferences::Partial
bool Partial
Allow partial unrolling (unrolling of loops to expand the size of the loop body, not only to eliminat...
Definition: TargetTransformInfo.h:481
llvm::NVPTXTTIImpl::getArithmeticInstrCost
InstructionCost getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind, TTI::OperandValueKind Opd1Info=TTI::OK_AnyValue, TTI::OperandValueKind Opd2Info=TTI::OK_AnyValue, TTI::OperandValueProperties Opd1PropInfo=TTI::OP_None, TTI::OperandValueProperties Opd2PropInfo=TTI::OP_None, ArrayRef< const Value * > Args=ArrayRef< const Value * >(), const Instruction *CxtI=nullptr)
Definition: NVPTXTargetTransformInfo.cpp:371
llvm::Type
The instances of the Type class are immutable: once they are created, they are never changed.
Definition: Type.h:45
llvm::TargetTransformInfo::PeelingPreferences
Definition: TargetTransformInfo.h:535
llvm::Optional
Definition: APInt.h:33
llvm::TargetLoweringBase::getTypeLegalizationCost
std::pair< InstructionCost, MVT > getTypeLegalizationCost(const DataLayout &DL, Type *Ty) const
Estimate the cost of type-legalization and the legalized type.
Definition: TargetLoweringBase.cpp:1840
NVPTXTargetTransformInfo.h
NVPTXUtilities.h
floor
We have fiadd patterns now but the followings have the same cost and complexity We need a way to specify the later is more profitable def def The FP stackifier should handle simple permutates to reduce number of shuffle e g floor
Definition: README-FPStack.txt:54
llvm::parseDenormalFPAttribute
DenormalMode parseDenormalFPAttribute(StringRef Str)
Returns the denormal mode to use for inputs and outputs.
Definition: FloatingPointMode.h:174
llvm::AArch64CC::LT
@ LT
Definition: AArch64BaseInfo.h:266
Arg
amdgpu Simplify well known AMD library false FunctionCallee Value * Arg
Definition: AMDGPULibCalls.cpp:206
TargetLowering.h
llvm::BasicTTIImplBase< NVPTXTTIImpl >::getUnrollingPreferences
void getUnrollingPreferences(Loop *L, ScalarEvolution &SE, TTI::UnrollingPreferences &UP, OptimizationRemarkEmitter *ORE)
Definition: BasicTTIImpl.h:488
round
static uint64_t round(uint64_t Acc, uint64_t Input)
Definition: xxhash.cpp:57
llvm::CallInst::Create
static CallInst * Create(FunctionType *Ty, Value *F, const Twine &NameStr="", Instruction *InsertBefore=nullptr)
Definition: Instructions.h:1518
llvm::Function::getFnAttribute
Attribute getFnAttribute(Attribute::AttrKind Kind) const
Return the attribute for the given attribute kind.
Definition: Function.cpp:652
llvm::Instruction::CastOps
CastOps
Definition: Instruction.h:799
llvm::Instruction
Definition: Instruction.h:45
llvm::ISD::AND
@ AND
Bitwise operators - logical and, logical or, logical xor.
Definition: ISDOpcodes.h:632
llvm::Attribute::getValueAsString
StringRef getValueAsString() const
Return the attribute's value as a string.
Definition: Attributes.cpp:301
llvm::None
const NoneType None
Definition: None.h:23
llvm::maxnum
LLVM_READONLY APFloat maxnum(const APFloat &A, const APFloat &B)
Implements IEEE maxNum semantics.
Definition: APFloat.h:1307
LoopInfo.h
llvm::isKernelFunction
bool isKernelFunction(const Function &F)
Definition: NVPTXUtilities.cpp:274
llvm::TargetTransformInfo::UnrollingPreferences
Parameters that control the generic loop unrolling transformation.
Definition: TargetTransformInfo.h:428
I
#define I(x, y, z)
Definition: MD5.cpp:59
llvm::TargetTransformInfo::OperandValueProperties
OperandValueProperties
Additional properties of an operand's values.
Definition: TargetTransformInfo.h:886
llvm::DenormalMode
Represent subnormal handling kind for floating point instruction inputs and outputs.
Definition: FloatingPointMode.h:67
readsLaneId
static bool readsLaneId(const IntrinsicInst *II)
Definition: NVPTXTargetTransformInfo.cpp:34
Mode
SI Whole Quad Mode
Definition: SIWholeQuadMode.cpp:262
llvm::DenormalMode::IEEE
@ IEEE
IEEE-754 denormal numbers preserved.
Definition: FloatingPointMode.h:74
llvm::TargetTransformInfo::OperandValueKind
OperandValueKind
Additional information about an operand's possible values.
Definition: TargetTransformInfo.h:878
llvm::NVPTXTTIImpl::getPeelingPreferences
void getPeelingPreferences(Loop *L, ScalarEvolution &SE, TTI::PeelingPreferences &PP)
Definition: NVPTXTargetTransformInfo.cpp:417
llvm::ArrayRef
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition: APInt.h:32
simplifyNvvmIntrinsic
static Instruction * simplifyNvvmIntrinsic(IntrinsicInst *II, InstCombiner &IC)
Definition: NVPTXTargetTransformInfo.cpp:115
llvm::OptimizationRemarkEmitter
The optimization diagnostic interface.
Definition: OptimizationRemarkEmitter.h:33
llvm::MVT::i64
@ i64
Definition: MachineValueType.h:47
llvm::StringRef
StringRef - Represent a constant reference to a string, i.e.
Definition: StringRef.h:58
llvm_unreachable
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
Definition: ErrorHandling.h:134
llvm::Value::getType
Type * getType() const
All values are typed, get the type of this value.
Definition: Value.h:256
llvm::Instruction::getFunction
const Function * getFunction() const
Return the function this instruction belongs to.
Definition: Instruction.cpp:70
CostKind
static cl::opt< TargetTransformInfo::TargetCostKind > CostKind("cost-kind", cl::desc("Target cost kind"), cl::init(TargetTransformInfo::TCK_RecipThroughput), cl::values(clEnumValN(TargetTransformInfo::TCK_RecipThroughput, "throughput", "Reciprocal throughput"), clEnumValN(TargetTransformInfo::TCK_Latency, "latency", "Instruction latency"), clEnumValN(TargetTransformInfo::TCK_CodeSize, "code-size", "Code size"), clEnumValN(TargetTransformInfo::TCK_SizeAndLatency, "size-latency", "Code size and latency")))
trunc
We have fiadd patterns now but the followings have the same cost and complexity We need a way to specify the later is more profitable def def The FP stackifier should handle simple permutates to reduce number of shuffle e g trunc
Definition: README-FPStack.txt:63
llvm::TargetLoweringBase::InstructionOpcodeToISD
int InstructionOpcodeToISD(unsigned Opcode) const
Get the ISD node that corresponds to the Instruction class opcode.
Definition: TargetLoweringBase.cpp:1760
llvm::ADDRESS_SPACE_GENERIC
@ ADDRESS_SPACE_GENERIC
Definition: NVPTXBaseInfo.h:22
llvm::BasicTTIImplBase< NVPTXTTIImpl >::getPeelingPreferences
void getPeelingPreferences(Loop *L, ScalarEvolution &SE, TTI::PeelingPreferences &PP)
Definition: BasicTTIImpl.h:560
llvm::Value::getName
StringRef getName() const
Return a constant reference to the value's name.
Definition: Value.cpp:309
llvm::LoadInst
An instruction for reading from memory.
Definition: Instructions.h:175
llvm::ISD::XOR
@ XOR
Definition: ISDOpcodes.h:634
llvm::minnum
LLVM_READONLY APFloat minnum(const APFloat &A, const APFloat &B)
Implements IEEE minNum semantics.
Definition: APFloat.h:1296
CostTable.h
llvm::ConstantFP::get
static Constant * get(Type *Ty, double V)
This returns a ConstantFP, or a vector containing a splat of a ConstantFP, for the specified value in...
Definition: Constants.cpp:972
llvm::InstCombiner
The core instruction combiner logic.
Definition: InstCombiner.h:45
llvm::ISD::ADD
@ ADD
Simple integer binary arithmetic operators.
Definition: ISDOpcodes.h:239
llvm::IntrinsicInst
A wrapper class for inspecting calls to intrinsic functions.
Definition: IntrinsicInst.h:45
llvm::Instruction::BinaryOps
BinaryOps
Definition: Instruction.h:785
llvm::ISD::MUL
@ MUL
Definition: ISDOpcodes.h:241
llvm::CallBase::getArgOperand
Value * getArgOperand(unsigned i) const
Definition: InstrTypes.h:1328
llvm::TargetTransformInfo::UnrollingPreferences::Threshold
unsigned Threshold
The cost threshold for the unrolled loop.
Definition: TargetTransformInfo.h:436
TargetTransformInfo.h
llvm::NVPTXTTIImpl::instCombineIntrinsic
Optional< Instruction * > instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const
Definition: NVPTXTargetTransformInfo.cpp:364
llvm::AMDGPU::HSAMD::Kernel::Key::Args
constexpr char Args[]
Key for Kernel::Metadata::mArgs.
Definition: AMDGPUMetadata.h:389
BasicTTIImpl.h
llvm::NVPTXTTIImpl::isSourceOfDivergence
bool isSourceOfDivergence(const Value *V)
Definition: NVPTXTargetTransformInfo.cpp:71
llvm::BinaryOperator::Create
static BinaryOperator * Create(BinaryOps Op, Value *S1, Value *S2, const Twine &Name=Twine(), Instruction *InsertBefore=nullptr)
Construct a binary instruction, given the opcode and the two operands.
Definition: Instructions.cpp:2674
llvm::Value
LLVM Value Representation.
Definition: Value.h:75
Debug.h
llvm::CallBase::args
iterator_range< User::op_iterator > args()
Iteration adapter for range-for loops.
Definition: InstrTypes.h:1319
llvm::Intrinsic::ID
unsigned ID
Definition: TargetTransformInfo.h:37