//===-- NVPTXTargetTransformInfo.cpp - NVPTX specific TTI -----------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "NVPTXTargetTransformInfo.h"
#include "NVPTXUtilities.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/CodeGen/BasicTTIImpl.h"
#include "llvm/CodeGen/TargetLowering.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/IntrinsicsNVPTX.h"
#include "llvm/IR/Value.h"
#include "llvm/Support/NVPTXAddrSpace.h"
#include "llvm/Transforms/InstCombine/InstCombiner.h"
#include <optional>
using namespace llvm;

#define DEBUG_TYPE "NVPTXtti"

// Whether the given intrinsic reads threadIdx.x/y/z.
static bool readsThreadIndex(const IntrinsicInst *II) {
  switch (II->getIntrinsicID()) {
  default: return false;
  case Intrinsic::nvvm_read_ptx_sreg_tid_x:
  case Intrinsic::nvvm_read_ptx_sreg_tid_y:
  case Intrinsic::nvvm_read_ptx_sreg_tid_z:
    return true;
  }
}

static bool readsLaneId(const IntrinsicInst *II) {
  return II->getIntrinsicID() == Intrinsic::nvvm_read_ptx_sreg_laneid;
}

// Whether the given intrinsic is an atomic instruction in PTX.
static bool isNVVMAtomic(const IntrinsicInst *II) {
  switch (II->getIntrinsicID()) {
  default:
    return false;
  case Intrinsic::nvvm_atomic_add_gen_f_cta:
  case Intrinsic::nvvm_atomic_add_gen_f_sys:
  case Intrinsic::nvvm_atomic_add_gen_i_cta:
  case Intrinsic::nvvm_atomic_add_gen_i_sys:
  case Intrinsic::nvvm_atomic_and_gen_i_cta:
  case Intrinsic::nvvm_atomic_and_gen_i_sys:
  case Intrinsic::nvvm_atomic_cas_gen_i_cta:
  case Intrinsic::nvvm_atomic_cas_gen_i_sys:
  case Intrinsic::nvvm_atomic_dec_gen_i_cta:
  case Intrinsic::nvvm_atomic_dec_gen_i_sys:
  case Intrinsic::nvvm_atomic_inc_gen_i_cta:
  case Intrinsic::nvvm_atomic_inc_gen_i_sys:
  case Intrinsic::nvvm_atomic_max_gen_i_cta:
  case Intrinsic::nvvm_atomic_max_gen_i_sys:
  case Intrinsic::nvvm_atomic_min_gen_i_cta:
  case Intrinsic::nvvm_atomic_min_gen_i_sys:
  case Intrinsic::nvvm_atomic_or_gen_i_cta:
  case Intrinsic::nvvm_atomic_or_gen_i_sys:
  case Intrinsic::nvvm_atomic_exch_gen_i_cta:
  case Intrinsic::nvvm_atomic_exch_gen_i_sys:
  case Intrinsic::nvvm_atomic_xor_gen_i_cta:
  case Intrinsic::nvvm_atomic_xor_gen_i_sys:
    return true;
  }
}

bool NVPTXTTIImpl::isSourceOfDivergence(const Value *V) const {
  // Without inter-procedural analysis, we conservatively assume that arguments
  // to __device__ functions are divergent.
  if (const Argument *Arg = dyn_cast<Argument>(V))
    return !isKernelFunction(*Arg->getParent());

  if (const Instruction *I = dyn_cast<Instruction>(V)) {
    // Without pointer analysis, we conservatively assume values loaded from
    // generic or local address space are divergent.
    if (const LoadInst *LI = dyn_cast<LoadInst>(I)) {
      unsigned AS = LI->getPointerAddressSpace();
      return AS == ADDRESS_SPACE_GENERIC || AS == ADDRESS_SPACE_LOCAL;
    }
    // Atomic instructions may cause divergence. Atomic instructions are
    // executed sequentially across all threads in a warp. Therefore, an
    // earlier executed thread may see different memory inputs than a later
    // executed thread. For example, suppose *a = 0 initially.
    //
    //   atom.global.add.s32 d, [a], 1
    //
    // returns 0 for the first thread that enters the critical region, and 1
    // for the second thread.
    if (I->isAtomic())
      return true;
    if (const IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) {
      // Instructions that read threadIdx are obviously divergent.
      if (readsThreadIndex(II) || readsLaneId(II))
        return true;
      // Handle the NVPTX atomic intrinsics that cannot be represented as an
      // atomic IR instruction.
      if (isNVVMAtomic(II))
        return true;
    }
    // Conservatively consider the return value of function calls as divergent.
    // We could analyze callees with bodies more precisely using
    // inter-procedural analysis.
    if (isa<CallInst>(I))
      return true;
  }

  return false;
}

// Convert NVVM intrinsics to target-generic LLVM code where possible.
static Instruction *convertNvvmIntrinsicToLlvm(InstCombiner &IC,
                                               IntrinsicInst *II) {
  // Each NVVM intrinsic we can simplify can be replaced with one of:
  //
  //  * an LLVM intrinsic,
  //  * an LLVM cast operation,
  //  * an LLVM binary operation, or
  //  * ad-hoc LLVM IR for the particular operation.

  // Some transformations are only valid when the module's
  // flush-denormals-to-zero (ftz) setting is true/false, whereas other
  // transformations are valid regardless of the module's ftz setting.
  enum FtzRequirementTy {
    FTZ_Any,       // Any ftz setting is ok.
    FTZ_MustBeOn,  // Transformation is valid only if ftz is on.
    FTZ_MustBeOff, // Transformation is valid only if ftz is off.
  };
  // Classes of NVVM intrinsics that can't be replaced one-to-one with a
  // target-generic intrinsic, cast op, or binary op but that we can
  // nonetheless simplify.
  enum SpecialCase {
    SPC_Reciprocal,
    SCP_FunnelShiftClamp,
  };

  // SimplifyAction is a poor-man's variant (plus an additional flag) that
  // represents how to replace an NVVM intrinsic with target-generic LLVM IR.
  struct SimplifyAction {
    // Invariant: At most one of these Optionals has a value.
    std::optional<Intrinsic::ID> IID;
    std::optional<Instruction::CastOps> CastOp;
    std::optional<Instruction::BinaryOps> BinaryOp;
    std::optional<SpecialCase> Special;

    FtzRequirementTy FtzRequirement = FTZ_Any;
    // Denormal handling is guarded by different attributes depending on the
    // type (denormal-fp-math vs denormal-fp-math-f32), so take note of halfs.
    bool IsHalfTy = false;

    SimplifyAction() = default;

    SimplifyAction(Intrinsic::ID IID, FtzRequirementTy FtzReq,
                   bool IsHalfTy = false)
        : IID(IID), FtzRequirement(FtzReq), IsHalfTy(IsHalfTy) {}

    // Cast operations don't have anything to do with FTZ, so we skip that
    // argument.
    SimplifyAction(Instruction::CastOps CastOp) : CastOp(CastOp) {}

    SimplifyAction(Instruction::BinaryOps BinaryOp, FtzRequirementTy FtzReq)
        : BinaryOp(BinaryOp), FtzRequirement(FtzReq) {}

    SimplifyAction(SpecialCase Special, FtzRequirementTy FtzReq)
        : Special(Special), FtzRequirement(FtzReq) {}
  };

  // Try to generate a SimplifyAction describing how to replace our
  // IntrinsicInstr with target-generic LLVM IR.
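  //
  // For example, llvm.nvvm.fmax.d maps unconditionally to llvm.maxnum.f64,
  // while llvm.nvvm.fmax.f maps to llvm.maxnum.f32 only when ftz is off and
  // llvm.nvvm.fmax.ftz.f only when ftz is on.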
  const SimplifyAction Action = [II]() -> SimplifyAction {
    switch (II->getIntrinsicID()) {
    // NVVM intrinsics that map directly to LLVM intrinsics.
    case Intrinsic::nvvm_ceil_d:
      return {Intrinsic::ceil, FTZ_Any};
    case Intrinsic::nvvm_ceil_f:
      return {Intrinsic::ceil, FTZ_MustBeOff};
    case Intrinsic::nvvm_ceil_ftz_f:
      return {Intrinsic::ceil, FTZ_MustBeOn};
    case Intrinsic::nvvm_floor_d:
      return {Intrinsic::floor, FTZ_Any};
    case Intrinsic::nvvm_floor_f:
      return {Intrinsic::floor, FTZ_MustBeOff};
    case Intrinsic::nvvm_floor_ftz_f:
      return {Intrinsic::floor, FTZ_MustBeOn};
    case Intrinsic::nvvm_fma_rn_d:
      return {Intrinsic::fma, FTZ_Any};
    case Intrinsic::nvvm_fma_rn_f:
      return {Intrinsic::fma, FTZ_MustBeOff};
    case Intrinsic::nvvm_fma_rn_ftz_f:
      return {Intrinsic::fma, FTZ_MustBeOn};
    case Intrinsic::nvvm_fma_rn_f16:
      return {Intrinsic::fma, FTZ_MustBeOff, true};
    case Intrinsic::nvvm_fma_rn_ftz_f16:
      return {Intrinsic::fma, FTZ_MustBeOn, true};
    case Intrinsic::nvvm_fma_rn_f16x2:
      return {Intrinsic::fma, FTZ_MustBeOff, true};
    case Intrinsic::nvvm_fma_rn_ftz_f16x2:
      return {Intrinsic::fma, FTZ_MustBeOn, true};
    case Intrinsic::nvvm_fma_rn_bf16:
      return {Intrinsic::fma, FTZ_MustBeOff, true};
    case Intrinsic::nvvm_fma_rn_bf16x2:
      return {Intrinsic::fma, FTZ_MustBeOff, true};
    case Intrinsic::nvvm_fmax_d:
      return {Intrinsic::maxnum, FTZ_Any};
    case Intrinsic::nvvm_fmax_f:
      return {Intrinsic::maxnum, FTZ_MustBeOff};
    case Intrinsic::nvvm_fmax_ftz_f:
      return {Intrinsic::maxnum, FTZ_MustBeOn};
    case Intrinsic::nvvm_fmax_nan_f:
      return {Intrinsic::maximum, FTZ_MustBeOff};
    case Intrinsic::nvvm_fmax_ftz_nan_f:
      return {Intrinsic::maximum, FTZ_MustBeOn};
    case Intrinsic::nvvm_fmax_f16:
      return {Intrinsic::maxnum, FTZ_MustBeOff, true};
    case Intrinsic::nvvm_fmax_ftz_f16:
      return {Intrinsic::maxnum, FTZ_MustBeOn, true};
    case Intrinsic::nvvm_fmax_f16x2:
      return {Intrinsic::maxnum, FTZ_MustBeOff, true};
    case Intrinsic::nvvm_fmax_ftz_f16x2:
      return {Intrinsic::maxnum, FTZ_MustBeOn, true};
    case Intrinsic::nvvm_fmax_nan_f16:
      return {Intrinsic::maximum, FTZ_MustBeOff, true};
    case Intrinsic::nvvm_fmax_ftz_nan_f16:
      return {Intrinsic::maximum, FTZ_MustBeOn, true};
    case Intrinsic::nvvm_fmax_nan_f16x2:
      return {Intrinsic::maximum, FTZ_MustBeOff, true};
    case Intrinsic::nvvm_fmax_ftz_nan_f16x2:
      return {Intrinsic::maximum, FTZ_MustBeOn, true};
    case Intrinsic::nvvm_fmin_d:
      return {Intrinsic::minnum, FTZ_Any};
    case Intrinsic::nvvm_fmin_f:
      return {Intrinsic::minnum, FTZ_MustBeOff};
    case Intrinsic::nvvm_fmin_ftz_f:
      return {Intrinsic::minnum, FTZ_MustBeOn};
    case Intrinsic::nvvm_fmin_nan_f:
      return {Intrinsic::minimum, FTZ_MustBeOff};
    case Intrinsic::nvvm_fmin_ftz_nan_f:
      return {Intrinsic::minimum, FTZ_MustBeOn};
    case Intrinsic::nvvm_fmin_f16:
      return {Intrinsic::minnum, FTZ_MustBeOff, true};
    case Intrinsic::nvvm_fmin_ftz_f16:
      return {Intrinsic::minnum, FTZ_MustBeOn, true};
    case Intrinsic::nvvm_fmin_f16x2:
      return {Intrinsic::minnum, FTZ_MustBeOff, true};
    case Intrinsic::nvvm_fmin_ftz_f16x2:
      return {Intrinsic::minnum, FTZ_MustBeOn, true};
    case Intrinsic::nvvm_fmin_nan_f16:
      return {Intrinsic::minimum, FTZ_MustBeOff, true};
    case Intrinsic::nvvm_fmin_ftz_nan_f16:
      return {Intrinsic::minimum, FTZ_MustBeOn, true};
    case Intrinsic::nvvm_fmin_nan_f16x2:
      return {Intrinsic::minimum, FTZ_MustBeOff, true};
    case Intrinsic::nvvm_fmin_ftz_nan_f16x2:
      return {Intrinsic::minimum, FTZ_MustBeOn, true};
    case Intrinsic::nvvm_sqrt_rn_d:
      return {Intrinsic::sqrt, FTZ_Any};
    case Intrinsic::nvvm_sqrt_f:
      // nvvm_sqrt_f is a special case. For most intrinsics, foo_ftz_f is the
      // ftz version, and foo_f is the non-ftz version. But nvvm_sqrt_f adopts
      // the ftz-ness of the surrounding code. sqrt_rn_f and sqrt_rn_ftz_f are
      // the versions with explicit ftz-ness.
      return {Intrinsic::sqrt, FTZ_Any};
    case Intrinsic::nvvm_trunc_d:
      return {Intrinsic::trunc, FTZ_Any};
    case Intrinsic::nvvm_trunc_f:
      return {Intrinsic::trunc, FTZ_MustBeOff};
    case Intrinsic::nvvm_trunc_ftz_f:
      return {Intrinsic::trunc, FTZ_MustBeOn};

    // NVVM intrinsics that map to LLVM cast operations.
    // Note - we cannot map intrinsics like nvvm_d2ll_rz to LLVM's
    // FPToSI, as NaN to int conversion with FPToSI is considered UB and is
    // eliminated. NVVM conversion intrinsics are translated to PTX cvt
    // instructions which define the outcome for NaN rather than leaving it UB.
    // Therefore, translate NVVM intrinsics to sitofp/uitofp, but not to
    // fptosi/fptoui.
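    //
    // For example, llvm.nvvm.i2f.rn(i32 %x) becomes "sitofp i32 %x to float",
    // whereas an intrinsic like llvm.nvvm.d2ll.rz is left alone, since fptosi
    // on a NaN input would be undefined.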
    case Intrinsic::nvvm_i2d_rn:
    case Intrinsic::nvvm_i2f_rn:
    case Intrinsic::nvvm_ll2d_rn:
    case Intrinsic::nvvm_ll2f_rn:
      return {Instruction::SIToFP};
    case Intrinsic::nvvm_ui2d_rn:
    case Intrinsic::nvvm_ui2f_rn:
    case Intrinsic::nvvm_ull2d_rn:
    case Intrinsic::nvvm_ull2f_rn:
      return {Instruction::UIToFP};

    // NVVM intrinsics that map to LLVM binary ops.
    case Intrinsic::nvvm_div_rn_d:
      return {Instruction::FDiv, FTZ_Any};

    // The remainder of cases are NVVM intrinsics that map to LLVM idioms, but
    // need special handling.
    //
    // We seem to be missing intrinsics for rcp.approx.{ftz.}f32, which is just
    // as well.
    case Intrinsic::nvvm_rcp_rn_d:
      return {SPC_Reciprocal, FTZ_Any};

    case Intrinsic::nvvm_fshl_clamp:
    case Intrinsic::nvvm_fshr_clamp:
      return {SCP_FunnelShiftClamp, FTZ_Any};

    // We do not currently simplify intrinsics that give an approximate
    // answer. These include:
    //
    //   - nvvm_cos_approx_{f,ftz_f}
    //   - nvvm_ex2_approx(_ftz)
    //   - nvvm_lg2_approx_{d,f,ftz_f}
    //   - nvvm_sin_approx_{f,ftz_f}
    //   - nvvm_sqrt_approx_{f,ftz_f}
    //   - nvvm_rsqrt_approx_{d,f,ftz_f}
    //   - nvvm_div_approx_{ftz_d,ftz_f,f}
    //   - nvvm_rcp_approx_ftz_d
    //
    // Ideally we'd encode them as e.g. "fast call @llvm.cos", where "fast"
    // means that fastmath is enabled in the intrinsic. Unfortunately only
    // binary operators (currently) have a fastmath bit in SelectionDAG, so
    // this information gets lost and we can't select on it.
    //
    // TODO: div and rcp are lowered to a binary op, so in theory we could
    // lower them to "fast fdiv".

    default:
      return {};
    }
  }();

  // If Action.FtzRequirementTy is not satisfied by the module's ftz state, we
  // can bail out now. (Notice that in the case that IID is not an NVVM
  // intrinsic, we don't have to look up any module metadata, as
  // FtzRequirementTy will be FTZ_Any.)
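  //
  // For example (illustrative): a function carrying
  // "denormal-fp-math-f32"="preserve-sign,preserve-sign" has an f32 output
  // mode of PreserveSign, so FtzEnabled below is true and only actions
  // requiring FTZ_MustBeOn pass this check.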
  if (Action.FtzRequirement != FTZ_Any) {
    // FIXME: Broken for f64
    DenormalMode Mode = II->getFunction()->getDenormalMode(
        Action.IsHalfTy ? APFloat::IEEEhalf() : APFloat::IEEEsingle());
    bool FtzEnabled = Mode.Output == DenormalMode::PreserveSign;

    if (FtzEnabled != (Action.FtzRequirement == FTZ_MustBeOn))
      return nullptr;
  }

  // Simplify to target-generic intrinsic.
  if (Action.IID) {
    SmallVector<Value *, 4> Args(II->args());
    // All the target-generic intrinsics currently of interest to us have one
    // type argument, equal to that of the nvvm intrinsic's argument.
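    // For example (illustrative), a call to llvm.nvvm.floor.f(float %x) is
    // rebuilt here as a call to @llvm.floor.f32(float %x).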
    Type *Tys[] = {II->getArgOperand(0)->getType()};
    return CallInst::Create(
        Intrinsic::getOrInsertDeclaration(II->getModule(), *Action.IID, Tys),
        Args);
  }

  // Simplify to target-generic binary op.
  if (Action.BinaryOp)
    return BinaryOperator::Create(*Action.BinaryOp, II->getArgOperand(0),
                                  II->getArgOperand(1), II->getName());

  // Simplify to target-generic cast op.
  if (Action.CastOp)
    return CastInst::Create(*Action.CastOp, II->getArgOperand(0), II->getType(),
                            II->getName());

  // All that's left are the special cases.
  if (!Action.Special)
    return nullptr;

  switch (*Action.Special) {
  case SPC_Reciprocal:
    // Simplify reciprocal.
    return BinaryOperator::Create(
        Instruction::FDiv, ConstantFP::get(II->getArgOperand(0)->getType(), 1),
        II->getArgOperand(0), II->getName());

  case SCP_FunnelShiftClamp: {
    // Canonicalize a clamping funnel shift to the generic llvm funnel shift
    // when possible, as this is easier for llvm to optimize further.
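    //
    // For example (illustrative), llvm.nvvm.fshl.clamp.i32(%a, %b, 8) becomes
    // @llvm.fshl.i32(%a, %b, 8), and a constant shift of 32 or more simply
    // yields %b (or %a for fshr.clamp), matching the clamp semantics below.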
    if (const auto *ShiftConst = dyn_cast<ConstantInt>(II->getArgOperand(2))) {
      const bool IsLeft = II->getIntrinsicID() == Intrinsic::nvvm_fshl_clamp;
      if (ShiftConst->getZExtValue() >= II->getType()->getIntegerBitWidth())
        return IC.replaceInstUsesWith(*II, II->getArgOperand(IsLeft ? 1 : 0));

      const unsigned FshIID = IsLeft ? Intrinsic::fshl : Intrinsic::fshr;
      return CallInst::Create(Intrinsic::getOrInsertDeclaration(
                                  II->getModule(), FshIID, II->getType()),
                              SmallVector<Value *, 3>(II->args()));
    }
    return nullptr;
  }
  }
  llvm_unreachable("All SpecialCase enumerators should be handled in switch.");
}

// Returns true/false when we know the answer, nullopt otherwise.
static std::optional<bool> evaluateIsSpace(Intrinsic::ID IID, unsigned AS) {
  if (AS == NVPTXAS::ADDRESS_SPACE_GENERIC ||
      AS == NVPTXAS::ADDRESS_SPACE_PARAM)
    return std::nullopt; // Got to check at run-time.
  switch (IID) {
  case Intrinsic::nvvm_isspacep_global:
    return AS == NVPTXAS::ADDRESS_SPACE_GLOBAL;
  case Intrinsic::nvvm_isspacep_local:
    return AS == NVPTXAS::ADDRESS_SPACE_LOCAL;
  case Intrinsic::nvvm_isspacep_shared:
    // If shared cluster this can't be evaluated at compile time.
    if (AS == NVPTXAS::ADDRESS_SPACE_SHARED_CLUSTER)
      return std::nullopt;
    return AS == NVPTXAS::ADDRESS_SPACE_SHARED;
  case Intrinsic::nvvm_isspacep_shared_cluster:
    return AS == NVPTXAS::ADDRESS_SPACE_SHARED_CLUSTER ||
           AS == NVPTXAS::ADDRESS_SPACE_SHARED;
  case Intrinsic::nvvm_isspacep_const:
    return AS == NVPTXAS::ADDRESS_SPACE_CONST;
  default:
    llvm_unreachable("Unexpected intrinsic");
  }
}

// Returns an instruction pointer (may be nullptr if we do not know the
// answer). Returns nullopt if `II` is not one of the `isspacep` intrinsics.
//
// TODO: If InferAddressSpaces were run early enough in the pipeline this could
// be removed in favor of the constant folding that occurs there through
// rewriteIntrinsicWithAddressSpace.
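//
// For example (illustrative): llvm.nvvm.isspacep.global on a pointer that is
// already addrspace(1) folds to true, while the same query on a plain generic
// pointer is left to the isspacep instruction at run time.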
static std::optional<Instruction *>
handleSpaceCheckIntrinsics(InstCombiner &IC, IntrinsicInst &II) {

  switch (auto IID = II.getIntrinsicID()) {
  case Intrinsic::nvvm_isspacep_global:
  case Intrinsic::nvvm_isspacep_local:
  case Intrinsic::nvvm_isspacep_shared:
  case Intrinsic::nvvm_isspacep_shared_cluster:
  case Intrinsic::nvvm_isspacep_const: {
    Value *Op0 = II.getArgOperand(0);
    unsigned AS = Op0->getType()->getPointerAddressSpace();
    // Peek through ASC to generic AS.
    // TODO: we could dig deeper through both ASCs and GEPs.
    if (AS == NVPTXAS::ADDRESS_SPACE_GENERIC)
      if (auto *ASCO = dyn_cast<AddrSpaceCastOperator>(Op0))
        AS = ASCO->getOperand(0)->getType()->getPointerAddressSpace();

    if (std::optional<bool> Answer = evaluateIsSpace(IID, AS))
      return IC.replaceInstUsesWith(II,
                                    ConstantInt::get(II.getType(), *Answer));
    return nullptr; // Don't know the answer, got to check at run time.
  }
  default:
    return std::nullopt;
  }
}

std::optional<Instruction *>
NVPTXTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
  if (std::optional<Instruction *> I = handleSpaceCheckIntrinsics(IC, II))
    return *I;
  if (Instruction *I = convertNvvmIntrinsicToLlvm(IC, &II))
    return I;

  return std::nullopt;
}

InstructionCost
NVPTXTTIImpl::getInstructionCost(const User *U,
                                 ArrayRef<const Value *> Operands,
                                 TTI::TargetCostKind CostKind) const {
  if (const auto *CI = dyn_cast<CallInst>(U))
    if (const auto *IA = dyn_cast<InlineAsm>(CI->getCalledOperand())) {
      // Without this implementation getCallCost() would return the number of
      // arguments + 1 as the cost, because the cost model treats inline asm as
      // a call (it is classified as a call in the IR). A better cost model is
      // to return the number of asm instructions embedded in the asm string.
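      //
      // For example (illustrative), the asm string
      // "mov.u32 %0, %1; add.u32 %0, %0, 1;" splits into two non-empty pieces
      // that start with a letter, so it is costed as two instructions.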
      StringRef AsmStr = IA->getAsmString();
      const unsigned InstCount =
          count_if(split(AsmStr, ';'), [](StringRef AsmInst) {
            // Trim off scopes denoted by '{' and '}' as these can be ignored.
            AsmInst = AsmInst.trim().ltrim("{} \t\n\v\f\r");
            // This is pretty coarse but does a reasonably good job of
            // identifying things that look like instructions, possibly with a
            // predicate ("@").
            return !AsmInst.empty() &&
                   (AsmInst[0] == '@' || isAlpha(AsmInst[0]) ||
                    AsmInst.contains(".pragma"));
          });
      return InstCount * TargetTransformInfo::TCC_Basic;
    }

  return BaseT::getInstructionCost(U, Operands, CostKind);
}

InstructionCost NVPTXTTIImpl::getArithmeticInstrCost(
    unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
    TTI::OperandValueInfo Op1Info, TTI::OperandValueInfo Op2Info,
    ArrayRef<const Value *> Args, const Instruction *CxtI) const {
  // Legalize the type.
  std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);

  int ISD = TLI->InstructionOpcodeToISD(Opcode);

  switch (ISD) {
  default:
    return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info,
                                         Op2Info);
  case ISD::ADD:
  case ISD::MUL:
  case ISD::XOR:
  case ISD::OR:
  case ISD::AND:
    // The machine code (SASS) simulates an i64 with two i32. Therefore, we
    // estimate that arithmetic operations on i64 are twice as expensive as
    // those on types that can fit into one machine register.
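    // For example, a legal scalar i64 add has LT.first == 1 and is costed at
    // 2, twice the cost of the same operation on i32.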
    if (LT.second.SimpleTy == MVT::i64)
      return 2 * LT.first;
    // Delegate other cases to the basic TTI.
    return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info,
                                         Op2Info);
  }
}

void NVPTXTTIImpl::getUnrollingPreferences(
    Loop *L, ScalarEvolution &SE, TTI::UnrollingPreferences &UP,
    OptimizationRemarkEmitter *ORE) const {
  BaseT::getUnrollingPreferences(L, SE, UP, ORE);

  // Enable partial unrolling and runtime unrolling, but reduce the
  // threshold. This partially unrolls small loops which are often
  // unrolled by the PTX to SASS compiler and unrolling earlier can be
  // beneficial.
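  //
  // For illustration: with the generic default Threshold of 300 (the current
  // upstream default, which may change), partial/runtime unrolling is capped
  // at a PartialThreshold of 75 while the full-unroll threshold is left
  // untouched.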
  UP.Partial = UP.Runtime = true;
  UP.PartialThreshold = UP.Threshold / 4;
}

void NVPTXTTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE,
                                         TTI::PeelingPreferences &PP) const {
  BaseT::getPeelingPreferences(L, SE, PP);
}

bool NVPTXTTIImpl::collectFlatAddressOperands(SmallVectorImpl<int> &OpIndexes,
                                              Intrinsic::ID IID) const {
  switch (IID) {
  case Intrinsic::nvvm_isspacep_const:
  case Intrinsic::nvvm_isspacep_global:
  case Intrinsic::nvvm_isspacep_local:
  case Intrinsic::nvvm_isspacep_shared:
  case Intrinsic::nvvm_isspacep_shared_cluster:
  case Intrinsic::nvvm_prefetch_tensormap: {
    OpIndexes.push_back(0);
    return true;
  }
  }
  return false;
}

Value *NVPTXTTIImpl::rewriteIntrinsicWithAddressSpace(IntrinsicInst *II,
                                                      Value *OldV,
                                                      Value *NewV) const {
  const Intrinsic::ID IID = II->getIntrinsicID();
  switch (IID) {
  case Intrinsic::nvvm_isspacep_const:
  case Intrinsic::nvvm_isspacep_global:
  case Intrinsic::nvvm_isspacep_local:
  case Intrinsic::nvvm_isspacep_shared:
  case Intrinsic::nvvm_isspacep_shared_cluster: {
    const unsigned NewAS = NewV->getType()->getPointerAddressSpace();
    if (const auto R = evaluateIsSpace(IID, NewAS))
      return ConstantInt::get(II->getType(), *R);
    return nullptr;
  }
  case Intrinsic::nvvm_prefetch_tensormap: {
    IRBuilder<> Builder(II);
    const unsigned NewAS = NewV->getType()->getPointerAddressSpace();
    if (NewAS == NVPTXAS::ADDRESS_SPACE_CONST ||
        NewAS == NVPTXAS::ADDRESS_SPACE_PARAM)
      return Builder.CreateUnaryIntrinsic(Intrinsic::nvvm_prefetch_tensormap,
                                          NewV);
    return nullptr;
  }
  }
  return nullptr;
}

bool NVPTXTTIImpl::isLegalMaskedStore(Type *DataTy, Align Alignment,
                                      unsigned AddrSpace,
                                      TTI::MaskKind MaskKind) const {
  if (MaskKind != TTI::MaskKind::ConstantMask)
    return false;

  // We currently only support this feature for 256-bit vectors, so the
  // alignment must be at least 32 bytes.
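  //
  // For example (illustrative), a constant-masked store of <8 x i32> with
  // align 32 to the global address space is legal on subtargets with 256-bit
  // vector load/store support, while a <4 x i32> store is rejected below.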
  if (Alignment < 32)
    return false;

  if (!ST->has256BitVectorLoadStore(AddrSpace))
    return false;

  auto *VTy = dyn_cast<FixedVectorType>(DataTy);
  if (!VTy)
    return false;

  auto *ElemTy = VTy->getScalarType();
  return (ElemTy->getScalarSizeInBits() == 32 && VTy->getNumElements() == 8) ||
         (ElemTy->getScalarSizeInBits() == 64 && VTy->getNumElements() == 4);
}

bool NVPTXTTIImpl::isLegalMaskedLoad(Type *DataTy, Align Alignment,
                                     unsigned /*AddrSpace*/,
                                     TTI::MaskKind MaskKind) const {
  if (MaskKind != TTI::MaskKind::ConstantMask)
    return false;

  if (Alignment < DL.getTypeStoreSize(DataTy))
    return false;

  // We do not support sub-byte element type masked loads.
  auto *VTy = dyn_cast<FixedVectorType>(DataTy);
  if (!VTy)
    return false;
  return VTy->getElementType()->getScalarSizeInBits() >= 8;
}

unsigned NVPTXTTIImpl::getLoadStoreVecRegBitWidth(unsigned AddrSpace) const {
  // 256-bit loads/stores are currently only supported for the global address
  // space.
  if (ST->has256BitVectorLoadStore(AddrSpace))
    return 256;
  return 128;
}

unsigned NVPTXTTIImpl::getAssumedAddrSpace(const Value *V) const {
  if (isa<AllocaInst>(V))
    return ADDRESS_SPACE_LOCAL;

  if (const Argument *Arg = dyn_cast<Argument>(V)) {
    if (isKernelFunction(*Arg->getParent())) {
      const NVPTXTargetMachine &TM =
          static_cast<const NVPTXTargetMachine &>(getTLI()->getTargetMachine());
      if (TM.getDrvInterface() == NVPTX::CUDA && !Arg->hasByValAttr())
        return ADDRESS_SPACE_GLOBAL;
    } else {
      // We assume that all device parameters that are passed byval will be
      // placed in the local AS. Very simple cases will be updated after ISel
      // to use the device param space where possible.
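      //
      // For example (illustrative), a byval struct parameter of a __device__
      // function is assumed to live in the local AS, so accesses through it
      // can be lowered to local-space loads and stores.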
      if (Arg->hasByValAttr())
        return ADDRESS_SPACE_LOCAL;
    }
  }

  return -1;
}

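// For illustration (assuming the usual CUDA lowering of launch bounds to
// maxntid metadata): a kernel annotated with __launch_bounds__(128) is
// reported below as {"maxntidx", 128}.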
void NVPTXTTIImpl::collectKernelLaunchBounds(
    const Function &F,
    SmallVectorImpl<std::pair<StringRef, int64_t>> &LB) const {
  if (const auto Val = getMaxClusterRank(F))
    LB.push_back({"maxclusterrank", *Val});

  const auto MaxNTID = getMaxNTID(F);
  if (MaxNTID.size() > 0)
    LB.push_back({"maxntidx", MaxNTID[0]});
  if (MaxNTID.size() > 1)
    LB.push_back({"maxntidy", MaxNTID[1]});
  if (MaxNTID.size() > 2)
    LB.push_back({"maxntidz", MaxNTID[2]});
}

InstructionUniformity
NVPTXTTIImpl::getInstructionUniformity(const Value *V) const {
  if (isSourceOfDivergence(V))
    return InstructionUniformity::NeverUniform;

  return InstructionUniformity::Default;
}