LLVM 23.0.0git
AMDGPUCodeGenPrepare.cpp
Go to the documentation of this file.
1//===-- AMDGPUCodeGenPrepare.cpp ------------------------------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file
10/// This pass does misc. AMDGPU optimizations on IR before instruction
11/// selection.
12//
13//===----------------------------------------------------------------------===//
14
15#include "AMDGPU.h"
16#include "AMDGPUTargetMachine.h"
18#include "llvm/ADT/SetVector.h"
26#include "llvm/IR/Dominators.h"
27#include "llvm/IR/IRBuilder.h"
28#include "llvm/IR/InstVisitor.h"
29#include "llvm/IR/IntrinsicsAMDGPU.h"
31#include "llvm/IR/ValueHandle.h"
33#include "llvm/Pass.h"
39
40#define DEBUG_TYPE "amdgpu-codegenprepare"
41
42using namespace llvm;
43using namespace llvm::PatternMatch;
44
45namespace {
46
48 "amdgpu-codegenprepare-widen-constant-loads",
49 cl::desc("Widen sub-dword constant address space loads in AMDGPUCodeGenPrepare"),
51 cl::init(false));
52
53static cl::opt<bool>
54 BreakLargePHIs("amdgpu-codegenprepare-break-large-phis",
55 cl::desc("Break large PHI nodes for DAGISel"),
57
58static cl::opt<bool>
59 ForceBreakLargePHIs("amdgpu-codegenprepare-force-break-large-phis",
60 cl::desc("For testing purposes, always break large "
61 "PHIs even if it isn't profitable."),
63
64static cl::opt<unsigned> BreakLargePHIsThreshold(
65 "amdgpu-codegenprepare-break-large-phis-threshold",
66 cl::desc("Minimum type size in bits for breaking large PHI nodes"),
68
69static cl::opt<bool> UseMul24Intrin(
70 "amdgpu-codegenprepare-mul24",
71 cl::desc("Introduce mul24 intrinsics in AMDGPUCodeGenPrepare"),
73 cl::init(true));
74
75// Legalize 64-bit division by using the generic IR expansion.
76static cl::opt<bool> ExpandDiv64InIR(
77 "amdgpu-codegenprepare-expand-div64",
78 cl::desc("Expand 64-bit division in AMDGPUCodeGenPrepare"),
80 cl::init(false));
81
82// Leave all division operations as they are. This supersedes ExpandDiv64InIR
83// and is used for testing the legalizer.
84static cl::opt<bool> DisableIDivExpand(
85 "amdgpu-codegenprepare-disable-idiv-expansion",
86 cl::desc("Prevent expanding integer division in AMDGPUCodeGenPrepare"),
88 cl::init(false));
89
90// Disable processing of fdiv so we can better test the backend implementations.
91static cl::opt<bool> DisableFDivExpand(
92 "amdgpu-codegenprepare-disable-fdiv-expansion",
93 cl::desc("Prevent expanding floating point division in AMDGPUCodeGenPrepare"),
95 cl::init(false));
96
97class AMDGPUCodeGenPrepareImpl
98 : public InstVisitor<AMDGPUCodeGenPrepareImpl, bool> {
99public:
100 Function &F;
101 const GCNSubtarget &ST;
102 const AMDGPUTargetMachine &TM;
103 const TargetLibraryInfo *TLI;
104 const UniformityInfo &UA;
105 const DataLayout &DL;
106 SimplifyQuery SQ;
107 const bool HasFP32DenormalFlush;
108 bool FlowChanged = false;
109 mutable Function *SqrtF32 = nullptr;
110 mutable Function *LdexpF32 = nullptr;
111 mutable SmallVector<WeakVH> DeadVals;
112
113 DenseMap<const PHINode *, bool> BreakPhiNodesCache;
114
115 AMDGPUCodeGenPrepareImpl(Function &F, const AMDGPUTargetMachine &TM,
116 const TargetLibraryInfo *TLI, AssumptionCache *AC,
117 const DominatorTree *DT, const UniformityInfo &UA)
118 : F(F), ST(TM.getSubtarget<GCNSubtarget>(F)), TM(TM), TLI(TLI), UA(UA),
119 DL(F.getDataLayout()), SQ(DL, TLI, DT, AC),
120 HasFP32DenormalFlush(SIModeRegisterDefaults(F, ST).FP32Denormals ==
122
123 Function *getSqrtF32() const {
124 if (SqrtF32)
125 return SqrtF32;
126
127 LLVMContext &Ctx = F.getContext();
129 F.getParent(), Intrinsic::amdgcn_sqrt, {Type::getFloatTy(Ctx)});
130 return SqrtF32;
131 }
132
133 Function *getLdexpF32() const {
134 if (LdexpF32)
135 return LdexpF32;
136
137 LLVMContext &Ctx = F.getContext();
139 F.getParent(), Intrinsic::ldexp,
140 {Type::getFloatTy(Ctx), Type::getInt32Ty(Ctx)});
141 return LdexpF32;
142 }
143
144 bool canBreakPHINode(const PHINode &I);
145
146 /// Return true if \p T is a legal scalar floating point type.
147 bool isLegalFloatingTy(const Type *T) const;
148
149 /// Wrapper to pass all the arguments to computeKnownFPClass
151 const Instruction *CtxI) const {
152 return llvm::computeKnownFPClass(V, Interested,
153 SQ.getWithInstruction(CtxI));
154 }
155
156 bool canIgnoreDenormalInput(const Value *V, const Instruction *CtxI) const {
157 return HasFP32DenormalFlush ||
159 }
160
161 /// \returns The minimum number of bits needed to store the value of \Op as an
162 /// unsigned integer. Truncating to this size and then zero-extending to
163 /// the original will not change the value.
164 unsigned numBitsUnsigned(Value *Op, const Instruction *CtxI) const;
165
166 /// \returns The minimum number of bits needed to store the value of \Op as a
167 /// signed integer. Truncating to this size and then sign-extending to
168 /// the original size will not change the value.
169 unsigned numBitsSigned(Value *Op, const Instruction *CtxI) const;
170
171 /// Replace mul instructions with llvm.amdgcn.mul.u24 or llvm.amdgcn.mul.s24.
172 /// SelectionDAG has an issue where an and asserting the bits are known
173 bool replaceMulWithMul24(BinaryOperator &I) const;
174
175 /// Perform same function as equivalently named function in DAGCombiner. Since
176 /// we expand some divisions here, we need to perform this before obscuring.
177 bool foldBinOpIntoSelect(BinaryOperator &I) const;
178
179 bool divHasSpecialOptimization(BinaryOperator &I,
180 Value *Num, Value *Den) const;
181 unsigned getDivNumBits(BinaryOperator &I, Value *Num, Value *Den,
182 unsigned MaxDivBits, bool Signed) const;
183
184 /// Expands 24 bit div or rem.
185 Value* expandDivRem24(IRBuilder<> &Builder, BinaryOperator &I,
186 Value *Num, Value *Den,
187 bool IsDiv, bool IsSigned) const;
188
189 Value *expandDivRem24Impl(IRBuilder<> &Builder, BinaryOperator &I,
190 Value *Num, Value *Den, unsigned NumBits,
191 bool IsDiv, bool IsSigned) const;
192
193 /// Expands 32 bit div or rem.
194 Value* expandDivRem32(IRBuilder<> &Builder, BinaryOperator &I,
195 Value *Num, Value *Den) const;
196
197 Value *shrinkDivRem64(IRBuilder<> &Builder, BinaryOperator &I,
198 Value *Num, Value *Den) const;
199 void expandDivRem64(BinaryOperator &I) const;
200
201 /// Widen a scalar load.
202 ///
203 /// \details \p Widen scalar load for uniform, small type loads from constant
204 // memory / to a full 32-bits and then truncate the input to allow a scalar
205 // load instead of a vector load.
206 //
207 /// \returns True.
208
209 bool canWidenScalarExtLoad(LoadInst &I) const;
210
211 Value *matchFractPatImpl(Value &V, const APFloat &C) const;
212 Value *matchFractPatNanAvoidant(Value &V);
213 Value *applyFractPat(IRBuilder<> &Builder, Value *FractArg);
214
215 bool canOptimizeWithRsq(FastMathFlags DivFMF, FastMathFlags SqrtFMF) const;
216
217 Value *optimizeWithRsq(IRBuilder<> &Builder, Value *Num, Value *Den,
218 FastMathFlags DivFMF, FastMathFlags SqrtFMF,
219 const Instruction *CtxI) const;
220
221 Value *optimizeWithRcp(IRBuilder<> &Builder, Value *Num, Value *Den,
222 FastMathFlags FMF, const Instruction *CtxI) const;
223 Value *optimizeWithFDivFast(IRBuilder<> &Builder, Value *Num, Value *Den,
224 float ReqdAccuracy) const;
225
226 Value *visitFDivElement(IRBuilder<> &Builder, Value *Num, Value *Den,
227 FastMathFlags DivFMF, FastMathFlags SqrtFMF,
228 Value *RsqOp, const Instruction *FDiv,
229 float ReqdAccuracy) const;
230
231 std::pair<Value *, Value *> getFrexpResults(IRBuilder<> &Builder,
232 Value *Src) const;
233
234 Value *emitRcpIEEE1ULP(IRBuilder<> &Builder, Value *Src,
235 bool IsNegative) const;
236 Value *emitFrexpDiv(IRBuilder<> &Builder, Value *LHS, Value *RHS,
237 FastMathFlags FMF) const;
238 Value *emitSqrtIEEE2ULP(IRBuilder<> &Builder, Value *Src,
239 FastMathFlags FMF) const;
240 Value *emitRsqF64(IRBuilder<> &Builder, Value *X, FastMathFlags SqrtFMF,
241 FastMathFlags DivFMF, const Instruction *CtxI,
242 bool IsNegative) const;
243
244 CallInst *createWorkitemIdX(IRBuilder<> &B) const;
245 void replaceWithWorkitemIdX(Instruction &I) const;
246 void replaceWithMaskedWorkitemIdX(Instruction &I, unsigned WaveSize) const;
247 bool tryReplaceWithWorkitemId(Instruction &I, unsigned Wave) const;
248
249 bool tryNarrowMathIfNoOverflow(Instruction *I);
250
251public:
252 bool visitFDiv(BinaryOperator &I);
253
254 bool visitInstruction(Instruction &I) { return false; }
255 bool visitBinaryOperator(BinaryOperator &I);
256 bool visitLoadInst(LoadInst &I);
257 bool visitSelectInst(SelectInst &I);
258 bool visitPHINode(PHINode &I);
259 bool visitAddrSpaceCastInst(AddrSpaceCastInst &I);
260
261 bool visitIntrinsicInst(IntrinsicInst &I);
262 bool visitFMinLike(IntrinsicInst &I);
263 bool visitSqrt(IntrinsicInst &I);
264 bool visitLog(FPMathOperator &Log, Intrinsic::ID IID);
265 bool visitMbcntLo(IntrinsicInst &I) const;
266 bool visitMbcntHi(IntrinsicInst &I) const;
267 bool run();
268};
269
270class AMDGPUCodeGenPrepare : public FunctionPass {
271public:
272 static char ID;
273 AMDGPUCodeGenPrepare() : FunctionPass(ID) {}
274 void getAnalysisUsage(AnalysisUsage &AU) const override {
278
279 // FIXME: Division expansion needs to preserve the dominator tree.
280 if (!ExpandDiv64InIR)
281 AU.setPreservesAll();
282 }
283 bool runOnFunction(Function &F) override;
284 StringRef getPassName() const override { return "AMDGPU IR optimizations"; }
285};
286
287} // end anonymous namespace
288
289bool AMDGPUCodeGenPrepareImpl::run() {
290 BreakPhiNodesCache.clear();
291 bool MadeChange = false;
292
293 // Need to use make_early_inc_range because integer division expansion is
294 // handled by Transform/Utils, and it can delete instructions such as the
295 // terminator of the BB.
296 for (BasicBlock &BB : reverse(F)) {
297 for (Instruction &I : make_early_inc_range(reverse(BB))) {
298 if (!isInstructionTriviallyDead(&I, TLI))
299 MadeChange |= visit(I);
300 }
301 }
302
303 while (!DeadVals.empty()) {
304 if (auto *I = dyn_cast_or_null<Instruction>(DeadVals.pop_back_val()))
306 }
307
308 return MadeChange;
309}
310
311bool AMDGPUCodeGenPrepareImpl::isLegalFloatingTy(const Type *Ty) const {
312 return Ty->isFloatTy() || Ty->isDoubleTy() ||
313 (Ty->isHalfTy() && ST.has16BitInsts());
314}
315
316bool AMDGPUCodeGenPrepareImpl::canWidenScalarExtLoad(LoadInst &I) const {
317 Type *Ty = I.getType();
318 int TySize = DL.getTypeSizeInBits(Ty);
319 Align Alignment = DL.getValueOrABITypeAlignment(I.getAlign(), Ty);
320
321 return I.isSimple() && TySize < 32 && Alignment >= 4 && UA.isUniform(&I);
322}
323
324unsigned
325AMDGPUCodeGenPrepareImpl::numBitsUnsigned(Value *Op,
326 const Instruction *CtxI) const {
327 return computeKnownBits(Op, SQ.getWithInstruction(CtxI)).countMaxActiveBits();
328}
329
330unsigned
331AMDGPUCodeGenPrepareImpl::numBitsSigned(Value *Op,
332 const Instruction *CtxI) const {
333 return ComputeMaxSignificantBits(Op, SQ.DL, SQ.AC, CtxI, SQ.DT);
334}
335
336static void extractValues(IRBuilder<> &Builder,
337 SmallVectorImpl<Value *> &Values, Value *V) {
338 auto *VT = dyn_cast<FixedVectorType>(V->getType());
339 if (!VT) {
340 Values.push_back(V);
341 return;
342 }
343
344 for (int I = 0, E = VT->getNumElements(); I != E; ++I)
345 Values.push_back(Builder.CreateExtractElement(V, I));
346}
347
349 Type *Ty,
350 SmallVectorImpl<Value *> &Values) {
351 if (!Ty->isVectorTy()) {
352 assert(Values.size() == 1);
353 return Values[0];
354 }
355
356 Value *NewVal = PoisonValue::get(Ty);
357 for (int I = 0, E = Values.size(); I != E; ++I)
358 NewVal = Builder.CreateInsertElement(NewVal, Values[I], I);
359
360 return NewVal;
361}
362
363bool AMDGPUCodeGenPrepareImpl::replaceMulWithMul24(BinaryOperator &I) const {
364 if (I.getOpcode() != Instruction::Mul)
365 return false;
366
367 Type *Ty = I.getType();
368 unsigned Size = Ty->getScalarSizeInBits();
369 if (Size <= 16 && ST.has16BitInsts())
370 return false;
371
372 // Prefer scalar if this could be s_mul_i32
373 if (UA.isUniform(&I))
374 return false;
375
376 Value *LHS = I.getOperand(0);
377 Value *RHS = I.getOperand(1);
378 IRBuilder<> Builder(&I);
379 Builder.SetCurrentDebugLocation(I.getDebugLoc());
380
381 unsigned LHSBits = 0, RHSBits = 0;
382 bool IsSigned = false;
383
384 if (ST.hasMulU24() && (LHSBits = numBitsUnsigned(LHS, &I)) <= 24 &&
385 (RHSBits = numBitsUnsigned(RHS, &I)) <= 24) {
386 IsSigned = false;
387
388 } else if (ST.hasMulI24() && (LHSBits = numBitsSigned(LHS, &I)) <= 24 &&
389 (RHSBits = numBitsSigned(RHS, &I)) <= 24) {
390 IsSigned = true;
391
392 } else
393 return false;
394
395 SmallVector<Value *, 4> LHSVals;
396 SmallVector<Value *, 4> RHSVals;
397 SmallVector<Value *, 4> ResultVals;
398 extractValues(Builder, LHSVals, LHS);
399 extractValues(Builder, RHSVals, RHS);
400
401 IntegerType *I32Ty = Builder.getInt32Ty();
402 IntegerType *IntrinTy = Size > 32 ? Builder.getInt64Ty() : I32Ty;
403 Type *DstTy = LHSVals[0]->getType();
404
405 for (int I = 0, E = LHSVals.size(); I != E; ++I) {
406 Value *LHS = IsSigned ? Builder.CreateSExtOrTrunc(LHSVals[I], I32Ty)
407 : Builder.CreateZExtOrTrunc(LHSVals[I], I32Ty);
408 Value *RHS = IsSigned ? Builder.CreateSExtOrTrunc(RHSVals[I], I32Ty)
409 : Builder.CreateZExtOrTrunc(RHSVals[I], I32Ty);
411 IsSigned ? Intrinsic::amdgcn_mul_i24 : Intrinsic::amdgcn_mul_u24;
412 Value *Result = Builder.CreateIntrinsic(ID, {IntrinTy}, {LHS, RHS});
413 Result = IsSigned ? Builder.CreateSExtOrTrunc(Result, DstTy)
414 : Builder.CreateZExtOrTrunc(Result, DstTy);
415 ResultVals.push_back(Result);
416 }
417
418 Value *NewVal = insertValues(Builder, Ty, ResultVals);
419 NewVal->takeName(&I);
420 I.replaceAllUsesWith(NewVal);
421 DeadVals.push_back(&I);
422
423 return true;
424}
425
426// Find a select instruction, which may have been casted. This is mostly to deal
427// with cases where i16 selects were promoted here to i32.
429 Cast = nullptr;
430 if (SelectInst *Sel = dyn_cast<SelectInst>(V))
431 return Sel;
432
433 if ((Cast = dyn_cast<CastInst>(V))) {
434 if (SelectInst *Sel = dyn_cast<SelectInst>(Cast->getOperand(0)))
435 return Sel;
436 }
437
438 return nullptr;
439}
440
441bool AMDGPUCodeGenPrepareImpl::foldBinOpIntoSelect(BinaryOperator &BO) const {
442 // Don't do this unless the old select is going away. We want to eliminate the
443 // binary operator, not replace a binop with a select.
444 int SelOpNo = 0;
445
446 CastInst *CastOp;
447
448 // TODO: Should probably try to handle some cases with multiple
449 // users. Duplicating the select may be profitable for division.
450 SelectInst *Sel = findSelectThroughCast(BO.getOperand(0), CastOp);
451 if (!Sel || !Sel->hasOneUse()) {
452 SelOpNo = 1;
453 Sel = findSelectThroughCast(BO.getOperand(1), CastOp);
454 }
455
456 if (!Sel || !Sel->hasOneUse())
457 return false;
458
461 Constant *CBO = dyn_cast<Constant>(BO.getOperand(SelOpNo ^ 1));
462 if (!CBO || !CT || !CF)
463 return false;
464
465 if (CastOp) {
466 if (!CastOp->hasOneUse())
467 return false;
468 CT = ConstantFoldCastOperand(CastOp->getOpcode(), CT, BO.getType(), DL);
469 CF = ConstantFoldCastOperand(CastOp->getOpcode(), CF, BO.getType(), DL);
470 }
471
472 // TODO: Handle special 0/-1 cases DAG combine does, although we only really
473 // need to handle divisions here.
474 Constant *FoldedT =
475 SelOpNo ? ConstantFoldBinaryOpOperands(BO.getOpcode(), CBO, CT, DL)
476 : ConstantFoldBinaryOpOperands(BO.getOpcode(), CT, CBO, DL);
477 if (!FoldedT || isa<ConstantExpr>(FoldedT))
478 return false;
479
480 Constant *FoldedF =
481 SelOpNo ? ConstantFoldBinaryOpOperands(BO.getOpcode(), CBO, CF, DL)
482 : ConstantFoldBinaryOpOperands(BO.getOpcode(), CF, CBO, DL);
483 if (!FoldedF || isa<ConstantExpr>(FoldedF))
484 return false;
485
486 IRBuilder<> Builder(&BO);
487 Builder.SetCurrentDebugLocation(BO.getDebugLoc());
488 if (const FPMathOperator *FPOp = dyn_cast<const FPMathOperator>(&BO))
489 Builder.setFastMathFlags(FPOp->getFastMathFlags());
490
491 Value *NewSelect = Builder.CreateSelect(Sel->getCondition(),
492 FoldedT, FoldedF);
493 NewSelect->takeName(&BO);
494 BO.replaceAllUsesWith(NewSelect);
495 DeadVals.push_back(&BO);
496 if (CastOp)
497 DeadVals.push_back(CastOp);
498 DeadVals.push_back(Sel);
499 return true;
500}
501
502std::pair<Value *, Value *>
503AMDGPUCodeGenPrepareImpl::getFrexpResults(IRBuilder<> &Builder,
504 Value *Src) const {
505 Type *Ty = Src->getType();
506 Value *Frexp = Builder.CreateIntrinsic(Intrinsic::frexp,
507 {Ty, Builder.getInt32Ty()}, Src);
508 Value *FrexpMant = Builder.CreateExtractValue(Frexp, {0});
509
510 // Bypass the bug workaround for the exponent result since it doesn't matter.
511 // TODO: Does the bug workaround even really need to consider the exponent
512 // result? It's unspecified by the spec.
513
514 Value *FrexpExp =
515 ST.hasFractBug()
516 ? Builder.CreateIntrinsic(Intrinsic::amdgcn_frexp_exp,
517 {Builder.getInt32Ty(), Ty}, Src)
518 : Builder.CreateExtractValue(Frexp, {1});
519 return {FrexpMant, FrexpExp};
520}
521
522/// Emit an expansion of 1.0 / Src good for 1ulp that supports denormals.
523Value *AMDGPUCodeGenPrepareImpl::emitRcpIEEE1ULP(IRBuilder<> &Builder,
524 Value *Src,
525 bool IsNegative) const {
526 // Same as for 1.0, but expand the sign out of the constant.
527 // -1.0 / x -> rcp (fneg x)
528 if (IsNegative)
529 Src = Builder.CreateFNeg(Src);
530
531 // The rcp instruction doesn't support denormals, so scale the input
532 // out of the denormal range and convert at the end.
533 //
534 // Expand as 2^-n * (1.0 / (x * 2^n))
535
536 // TODO: Skip scaling if input is known never denormal and the input
537 // range won't underflow to denormal. The hard part is knowing the
538 // result. We need a range check, the result could be denormal for
539 // 0x1p+126 < den <= 0x1p+127.
540 auto [FrexpMant, FrexpExp] = getFrexpResults(Builder, Src);
541 Value *ScaleFactor = Builder.CreateNeg(FrexpExp);
542 Value *Rcp = Builder.CreateUnaryIntrinsic(Intrinsic::amdgcn_rcp, FrexpMant);
543 return Builder.CreateCall(getLdexpF32(), {Rcp, ScaleFactor});
544}
545
546/// Emit a 2ulp expansion for fdiv by using frexp for input scaling.
547Value *AMDGPUCodeGenPrepareImpl::emitFrexpDiv(IRBuilder<> &Builder, Value *LHS,
548 Value *RHS,
549 FastMathFlags FMF) const {
550 // If we have have to work around the fract/frexp bug, we're worse off than
551 // using the fdiv.fast expansion. The full safe expansion is faster if we have
552 // fast FMA.
553 if (HasFP32DenormalFlush && ST.hasFractBug() && !ST.hasFastFMAF32() &&
554 (!FMF.noNaNs() || !FMF.noInfs()))
555 return nullptr;
556
557 // We're scaling the LHS to avoid a denormal input, and scale the denominator
558 // to avoid large values underflowing the result.
559 auto [FrexpMantRHS, FrexpExpRHS] = getFrexpResults(Builder, RHS);
560
561 Value *Rcp =
562 Builder.CreateUnaryIntrinsic(Intrinsic::amdgcn_rcp, FrexpMantRHS);
563
564 auto [FrexpMantLHS, FrexpExpLHS] = getFrexpResults(Builder, LHS);
565 Value *Mul = Builder.CreateFMul(FrexpMantLHS, Rcp);
566
567 // We multiplied by 2^N/2^M, so we need to multiply by 2^(N-M) to scale the
568 // result.
569 Value *ExpDiff = Builder.CreateSub(FrexpExpLHS, FrexpExpRHS);
570 return Builder.CreateCall(getLdexpF32(), {Mul, ExpDiff});
571}
572
573/// Emit a sqrt that handles denormals and is accurate to 2ulp.
574Value *AMDGPUCodeGenPrepareImpl::emitSqrtIEEE2ULP(IRBuilder<> &Builder,
575 Value *Src,
576 FastMathFlags FMF) const {
577 Type *Ty = Src->getType();
578 APFloat SmallestNormal =
580 Value *NeedScale =
581 Builder.CreateFCmpOLT(Src, ConstantFP::get(Ty, SmallestNormal));
582
583 ConstantInt *Zero = Builder.getInt32(0);
584 Value *InputScaleFactor =
585 Builder.CreateSelect(NeedScale, Builder.getInt32(32), Zero);
586
587 Value *Scaled = Builder.CreateCall(getLdexpF32(), {Src, InputScaleFactor});
588
589 Value *Sqrt = Builder.CreateCall(getSqrtF32(), Scaled);
590
591 Value *OutputScaleFactor =
592 Builder.CreateSelect(NeedScale, Builder.getInt32(-16), Zero);
593 return Builder.CreateCall(getLdexpF32(), {Sqrt, OutputScaleFactor});
594}
595
596/// Emit an expansion of 1.0 / sqrt(Src) good for 1ulp that supports denormals.
597static Value *emitRsqIEEE1ULP(IRBuilder<> &Builder, Value *Src,
598 bool IsNegative) {
599 // bool need_scale = x < 0x1p-126f;
600 // float input_scale = need_scale ? 0x1.0p+24f : 1.0f;
601 // float output_scale = need_scale ? 0x1.0p+12f : 1.0f;
602 // rsq(x * input_scale) * output_scale;
603
604 Type *Ty = Src->getType();
605 APFloat SmallestNormal =
606 APFloat::getSmallestNormalized(Ty->getFltSemantics());
607 Value *NeedScale =
608 Builder.CreateFCmpOLT(Src, ConstantFP::get(Ty, SmallestNormal));
609 Constant *One = ConstantFP::get(Ty, 1.0);
610 Constant *InputScale = ConstantFP::get(Ty, 0x1.0p+24);
611 Constant *OutputScale =
612 ConstantFP::get(Ty, IsNegative ? -0x1.0p+12 : 0x1.0p+12);
613
614 Value *InputScaleFactor = Builder.CreateSelect(NeedScale, InputScale, One);
615
616 Value *ScaledInput = Builder.CreateFMul(Src, InputScaleFactor);
617 Value *Rsq = Builder.CreateUnaryIntrinsic(Intrinsic::amdgcn_rsq, ScaledInput);
618 Value *OutputScaleFactor = Builder.CreateSelect(
619 NeedScale, OutputScale, IsNegative ? ConstantFP::get(Ty, -1.0) : One);
620
621 return Builder.CreateFMul(Rsq, OutputScaleFactor);
622}
623
/// Emit inverse sqrt expansion for f64 with a correction sequence on top of
/// v_rsq_f64. This should give a 1ulp result.
///
/// \p X is the sqrt operand, \p SqrtFMF / \p DivFMF are the fast-math flags of
/// the original sqrt and fdiv, \p CtxI is the context instruction for the
/// known-FP-class query, and \p IsNegative selects the -1.0/sqrt(x) form.
Value *AMDGPUCodeGenPrepareImpl::emitRsqF64(IRBuilder<> &Builder, Value *X,
                                            FastMathFlags SqrtFMF,
                                            FastMathFlags DivFMF,
                                            const Instruction *CtxI,
                                            bool IsNegative) const {
  // rsq(x):
  // double y0 = BUILTIN_AMDGPU_RSQRT_F64(x);
  // double e = MATH_MAD(-y0 * (x == PINF_F64 || x == 0.0 ? y0 : x), y0, 1.0);
  // return MATH_MAD(y0*e, MATH_MAD(e, 0.375, 0.5), y0);
  //
  // -rsq(x):
  // double y0 = BUILTIN_AMDGPU_RSQRT_F64(x);
  // double e = MATH_MAD(-y0 * (x == PINF_F64 || x == 0.0 ? y0 : x), y0, 1.0);
  // return MATH_MAD(-y0*e, MATH_MAD(e, 0.375, 0.5), -y0);
  //
  // The rsq instruction handles the special cases correctly. We need to check
  // for the edge case conditions to ensure the special case propagates through
  // the later instructions.

  Value *Y0 = Builder.CreateUnaryIntrinsic(Intrinsic::amdgcn_rsq, X);

  // Try to elide the edge case check.
  //
  // Fast math flags imply:
  // sqrt ninf => !isinf(x)
  // fdiv ninf => x != 0, !isinf(x)
  //
  // The zero check keys off DivFMF.noInfs(): a zero input would produce an
  // infinite quotient, which ninf on the fdiv rules out.
  bool MaybePosInf = !SqrtFMF.noInfs() && !DivFMF.noInfs();
  bool MaybeZero = !DivFMF.noInfs();

  // Only read on the MaybePosInf && MaybeZero path below, in which case the
  // Interested != fcNone block has initialized it.
  DenormalMode DenormMode;
  FPClassTest Interested = fcNone;
  if (MaybePosInf)
    Interested = fcPosInf;
  if (MaybeZero)
    Interested |= fcZero;

  if (Interested != fcNone) {
    // Use known-FP-class facts to prove the edge cases impossible.
    KnownFPClass KnownSrc = computeKnownFPClass(X, Interested, CtxI);
    if (KnownSrc.isKnownNeverPosInfinity())
      MaybePosInf = false;

    DenormMode = F.getDenormalMode(X->getType()->getFltSemantics());
    if (KnownSrc.isKnownNeverLogicalZero(DenormMode))
      MaybeZero = false;
  }

  // Operand for the -y0 * (...) product: x normally, or y0 for the special
  // inputs so the instruction's special-case result propagates through the
  // correction sequence.
  Value *SpecialOrRsq = X;
  if (MaybeZero || MaybePosInf) {
    Value *Cond;
    if (MaybePosInf && MaybeZero) {
      if (DenormMode.Input != DenormalMode::DenormalModeKind::Dynamic) {
        FPClassTest TestMask = fcPosInf | fcZero;
        // If denormal inputs are flushed to zero they act as zeros, so they
        // must take the special path too.
        if (DenormMode.inputsAreZero())
          TestMask |= fcSubnormal;

        Cond = Builder.createIsFPClass(X, TestMask);
      } else {
        // Avoid using llvm.is.fpclass for dynamic denormal mode, since it
        // doesn't respect the floating-point environment.
        Value *IsZero =
            Builder.CreateFCmpOEQ(X, ConstantFP::getZero(X->getType()));
        Value *IsInf =
            Builder.CreateFCmpOEQ(X, ConstantFP::getInfinity(X->getType()));
        Cond = Builder.CreateOr(IsZero, IsInf);
      }
    } else if (MaybeZero) {
      Cond = Builder.CreateFCmpOEQ(X, ConstantFP::getZero(X->getType()));
    } else {
      Cond = Builder.CreateFCmpOEQ(X, ConstantFP::getInfinity(X->getType()));
    }

    SpecialOrRsq = Builder.CreateSelect(Cond, Y0, X);
  }

  Value *NegY0 = Builder.CreateFNeg(Y0);
  Value *NegXY0 = Builder.CreateFMul(SpecialOrRsq, NegY0);

  // e = (-y0 * x) * y0 + 1.0, the residual of the estimate.
  // Could be fmuladd, but isFMAFasterThanFMulAndFAdd is always true for f64.
  Value *E = Builder.CreateFMA(NegXY0, Y0, ConstantFP::get(X->getType(), 1.0));

  Value *Y0E = Builder.CreateFMul(E, IsNegative ? NegY0 : Y0);

  Value *EFMA = Builder.CreateFMA(E, ConstantFP::get(X->getType(), 0.375),
                                  ConstantFP::get(X->getType(), 0.5));

  // Final correction step: (y0*e) * (e*0.375 + 0.5) + y0 (negated forms when
  // IsNegative, per the -rsq formula above).
  return Builder.CreateFMA(Y0E, EFMA, IsNegative ? NegY0 : Y0);
}
713
714bool AMDGPUCodeGenPrepareImpl::canOptimizeWithRsq(FastMathFlags DivFMF,
715 FastMathFlags SqrtFMF) const {
716 // The rsqrt contraction increases accuracy from ~2ulp to ~1ulp for f32 and
717 // f64.
718 return DivFMF.allowContract() && SqrtFMF.allowContract();
719}
720
/// Try to fold 1.0/sqrt(x) or -1.0/sqrt(x) into an rsq-based sequence.
/// \p Num is the fdiv numerator, \p Den the sqrt operand, and \p CtxI the
/// context instruction for value-tracking queries.
/// \returns the replacement value, or nullptr when not applicable.
Value *AMDGPUCodeGenPrepareImpl::optimizeWithRsq(
    IRBuilder<> &Builder, Value *Num, Value *Den, const FastMathFlags DivFMF,
    const FastMathFlags SqrtFMF, const Instruction *CtxI) const {
  // The rsqrt contraction increases accuracy from ~2ulp to ~1ulp.
  assert(DivFMF.allowContract() && SqrtFMF.allowContract());

  // rsq_f16 is accurate to 0.51 ulp.
  // rsq_f32 is accurate for !fpmath >= 1.0ulp and denormals are flushed.
  // rsq_f64 is never accurate.
  const ConstantFP *CLHS = dyn_cast<ConstantFP>(Num);
  if (!CLHS)
    return nullptr;

  bool IsNegative = false;

  // TODO: Handle other numerator values with arcp.
  // Note: IsNegative is set by the assignment inside the condition when the
  // numerator is exactly -1.0.
  if (CLHS->isExactlyValue(1.0) || (IsNegative = CLHS->isExactlyValue(-1.0))) {
    // Add in the sqrt flags. The guard restores the builder's flags on exit.
    IRBuilder<>::FastMathFlagGuard Guard(Builder);
    Builder.setFastMathFlags(DivFMF | SqrtFMF);

    if (Den->getType()->isFloatTy()) {
      // With afn on both operations, or when denormal inputs can be ignored,
      // the raw rsq instruction is accurate enough.
      if ((DivFMF.approxFunc() && SqrtFMF.approxFunc()) ||
          canIgnoreDenormalInput(Den, CtxI)) {
        Value *Result =
            Builder.CreateUnaryIntrinsic(Intrinsic::amdgcn_rsq, Den);
        // -1.0 / sqrt(x) -> fneg(rsq(x))
        return IsNegative ? Builder.CreateFNeg(Result) : Result;
      }

      // Otherwise use the denormal-safe 1ulp expansion.
      return emitRsqIEEE1ULP(Builder, Den, IsNegative);
    }

    // f64 uses a correction sequence on top of v_rsq_f64.
    if (Den->getType()->isDoubleTy())
      return emitRsqF64(Builder, Den, SqrtFMF, DivFMF, CtxI, IsNegative);
  }

  return nullptr;
}
760
// Optimize fdiv with rcp:
//
// 1/x -> rcp(x) when rcp is sufficiently accurate or inaccurate rcp is
// allowed with afn.
//
// a/b -> a*rcp(b) when arcp is allowed, and we only need provide ULP 1.0
//
// Returns the replacement value, or nullptr when no rcp form applies.
Value *
AMDGPUCodeGenPrepareImpl::optimizeWithRcp(IRBuilder<> &Builder, Value *Num,
                                          Value *Den, FastMathFlags FMF,
                                          const Instruction *CtxI) const {
  // rcp_f16 is accurate to 0.51 ulp.
  // rcp_f32 is accurate for !fpmath >= 1.0ulp and denormals are flushed.
  // rcp_f64 is never accurate.
  assert(Den->getType()->isFloatTy());

  if (const ConstantFP *CLHS = dyn_cast<ConstantFP>(Num)) {
    bool IsNegative = false;
    // Note: IsNegative is set by the assignment inside the condition when the
    // numerator is exactly -1.0.
    if (CLHS->isExactlyValue(1.0) ||
        (IsNegative = CLHS->isExactlyValue(-1.0))) {
      Value *Src = Den;

      if (HasFP32DenormalFlush || FMF.approxFunc()) {
        // -1.0 / x -> 1.0 / fneg(x)
        if (IsNegative)
          Src = Builder.CreateFNeg(Src);

        // v_rcp_f32 and v_rsq_f32 do not support denormals, and according to
        // the CI documentation has a worst case error of 1 ulp.
        // OpenCL requires <= 2.5 ulp for 1.0 / x, so it should always be OK
        // to use it as long as we aren't trying to use denormals.
        //
        // v_rcp_f16 and v_rsq_f16 DO support denormals.

        // NOTE: v_sqrt and v_rcp will be combined to v_rsq later. So we don't
        // insert rsq intrinsic here.

        // 1.0 / x -> rcp(x)
        return Builder.CreateUnaryIntrinsic(Intrinsic::amdgcn_rcp, Src);
      }

      // Denormals matter here, so emit the denormal-safe 1ulp expansion
      // instead of a raw rcp.
      //
      // TODO: If the input isn't denormal, and we know the input exponent isn't
      // big enough to introduce a denormal we can avoid the scaling.
      return emitRcpIEEE1ULP(Builder, Src, IsNegative);
    }
  }

  if (FMF.allowReciprocal()) {
    // x / y -> x * (1.0 / y)

    // TODO: Could avoid denormal scaling and use raw rcp if we knew the output
    // will never underflow.
    if (HasFP32DenormalFlush || FMF.approxFunc()) {
      Value *Recip = Builder.CreateUnaryIntrinsic(Intrinsic::amdgcn_rcp, Den);
      return Builder.CreateFMul(Num, Recip);
    }

    // Denormal-safe expansion of 1/y, then multiply by the numerator.
    Value *Recip = emitRcpIEEE1ULP(Builder, Den, false);
    return Builder.CreateFMul(Num, Recip);
  }

  return nullptr;
}
823
824// optimize with fdiv.fast:
825//
826// a/b -> fdiv.fast(a, b) when !fpmath >= 2.5ulp with denormals flushed.
827//
828// 1/x -> fdiv.fast(1,x) when !fpmath >= 2.5ulp.
829//
830// NOTE: optimizeWithRcp should be tried first because rcp is the preference.
831Value *AMDGPUCodeGenPrepareImpl::optimizeWithFDivFast(
832 IRBuilder<> &Builder, Value *Num, Value *Den, float ReqdAccuracy) const {
833 // fdiv.fast can achieve 2.5 ULP accuracy.
834 if (ReqdAccuracy < 2.5f)
835 return nullptr;
836
837 // Only have fdiv.fast for f32.
838 assert(Den->getType()->isFloatTy());
839
840 bool NumIsOne = false;
841 if (const ConstantFP *CNum = dyn_cast<ConstantFP>(Num)) {
842 if (CNum->isExactlyValue(+1.0) || CNum->isExactlyValue(-1.0))
843 NumIsOne = true;
844 }
845
846 // fdiv does not support denormals. But 1.0/x is always fine to use it.
847 //
848 // TODO: This works for any value with a specific known exponent range, don't
849 // just limit to constant 1.
850 if (!HasFP32DenormalFlush && !NumIsOne)
851 return nullptr;
852
853 return Builder.CreateIntrinsic(Intrinsic::amdgcn_fdiv_fast, {Num, Den});
854}
855
856Value *AMDGPUCodeGenPrepareImpl::visitFDivElement(
857 IRBuilder<> &Builder, Value *Num, Value *Den, FastMathFlags DivFMF,
858 FastMathFlags SqrtFMF, Value *RsqOp, const Instruction *FDivInst,
859 float ReqdDivAccuracy) const {
860 if (RsqOp) {
861 Value *Rsq =
862 optimizeWithRsq(Builder, Num, RsqOp, DivFMF, SqrtFMF, FDivInst);
863 if (Rsq)
864 return Rsq;
865 }
866
867 if (!Num->getType()->isFloatTy())
868 return nullptr;
869
870 Value *Rcp = optimizeWithRcp(Builder, Num, Den, DivFMF, FDivInst);
871 if (Rcp)
872 return Rcp;
873
874 // In the basic case fdiv_fast has the same instruction count as the frexp div
875 // expansion. Slightly prefer fdiv_fast since it ends in an fmul that can
876 // potentially be fused into a user. Also, materialization of the constants
877 // can be reused for multiple instances.
878 Value *FDivFast = optimizeWithFDivFast(Builder, Num, Den, ReqdDivAccuracy);
879 if (FDivFast)
880 return FDivFast;
881
882 return emitFrexpDiv(Builder, Num, Den, DivFMF);
883}
884
885// Optimizations is performed based on fpmath, fast math flags as well as
886// denormals to optimize fdiv with either rcp or fdiv.fast.
887//
888// With rcp:
889// 1/x -> rcp(x) when rcp is sufficiently accurate or inaccurate rcp is
890// allowed with afn.
891//
892// a/b -> a*rcp(b) when inaccurate rcp is allowed with afn.
893//
894// With fdiv.fast:
895// a/b -> fdiv.fast(a, b) when !fpmath >= 2.5ulp with denormals flushed.
896//
897// 1/x -> fdiv.fast(1,x) when !fpmath >= 2.5ulp.
898//
899// NOTE: rcp is the preference in cases that both are legal.
// Expand an fdiv into a faster sequence (rsq, rcp, fdiv.fast, or a
// frexp-based division) when the fast-math flags and requested !fpmath
// accuracy allow it. Returns true if the instruction was rewritten.
bool AMDGPUCodeGenPrepareImpl::visitFDiv(BinaryOperator &FDiv) {
  if (DisableFDivExpand)
    return false;

  // Only f32 and f64 (scalar or vector thereof) are handled here.
  Type *Ty = FDiv.getType()->getScalarType();
  const bool IsFloat = Ty->isFloatTy();
  if (!IsFloat && !Ty->isDoubleTy())
    return false;

  // The f64 rcp/rsq approximations are pretty inaccurate. We can do an
  // expansion around them in codegen. f16 is good enough to always use.

  const FPMathOperator *FPOp = cast<const FPMathOperator>(&FDiv);
  const FastMathFlags DivFMF = FPOp->getFastMathFlags();
  const float ReqdAccuracy = FPOp->getFPAccuracy();

  FastMathFlags SqrtFMF;

  Value *Num = FDiv.getOperand(0);
  Value *Den = FDiv.getOperand(1);

  // If the denominator is a single-use sqrt, the whole division may fold
  // into an rsq of the sqrt's operand.
  Value *RsqOp = nullptr;
  auto *DenII = dyn_cast<IntrinsicInst>(Den);
  if (DenII && DenII->getIntrinsicID() == Intrinsic::sqrt &&
      DenII->hasOneUse()) {
    const auto *SqrtOp = cast<FPMathOperator>(DenII);
    SqrtFMF = SqrtOp->getFastMathFlags();
    if (canOptimizeWithRsq(DivFMF, SqrtFMF))
      RsqOp = SqrtOp->getOperand(0);
  }

  // rcp path not yet implemented for f64.
  if (!IsFloat && !RsqOp)
    return false;

  // Inaccurate rcp is allowed with afn.
  //
  // Defer to codegen to handle this.
  //
  // TODO: Decide on an interpretation for interactions between afn + arcp +
  // !fpmath, and make it consistent between here and codegen. For now, defer
  // expansion of afn to codegen. The current interpretation is so aggressive we
  // don't need any pre-consideration here when we have better information. A
  // more conservative interpretation could use handling here.
  const bool AllowInaccurateRcp = DivFMF.approxFunc();
  if (!RsqOp && AllowInaccurateRcp)
    return false;

  // Defer the correct implementations to codegen.
  if (IsFloat && ReqdAccuracy < 1.0f)
    return false;

  IRBuilder<> Builder(FDiv.getParent(), std::next(FDiv.getIterator()));
  Builder.setFastMathFlags(DivFMF);
  Builder.SetCurrentDebugLocation(FDiv.getDebugLoc());

  // Vectors are processed elementwise; scalars yield single-entry lists.
  SmallVector<Value *, 4> NumVals;
  SmallVector<Value *, 4> DenVals;
  SmallVector<Value *, 4> RsqDenVals;
  extractValues(Builder, NumVals, Num);
  extractValues(Builder, DenVals, Den);

  if (RsqOp)
    extractValues(Builder, RsqDenVals, RsqOp);

  SmallVector<Value *, 4> ResultVals(NumVals.size());
  for (int I = 0, E = NumVals.size(); I != E; ++I) {
    Value *NumElt = NumVals[I];
    Value *DenElt = DenVals[I];
    Value *RsqDenElt = RsqOp ? RsqDenVals[I] : nullptr;

    Value *NewElt =
        visitFDivElement(Builder, NumElt, DenElt, DivFMF, SqrtFMF, RsqDenElt,
                         cast<Instruction>(FPOp), ReqdAccuracy);
    if (!NewElt) {
      // Keep the original, but scalarized.

      // This has the unfortunate side effect of sometimes scalarizing when
      // we're not going to do anything.
      NewElt = Builder.CreateFDiv(NumElt, DenElt);
      if (auto *NewEltInst = dyn_cast<Instruction>(NewElt))
        NewEltInst->copyMetadata(FDiv);
    }

    ResultVals[I] = NewElt;
  }

  Value *NewVal = insertValues(Builder, FDiv.getType(), ResultVals);

  if (NewVal) {
    FDiv.replaceAllUsesWith(NewVal);
    NewVal->takeName(&FDiv);
    DeadVals.push_back(&FDiv);
  }

  return true;
}
997
998static std::pair<Value*, Value*> getMul64(IRBuilder<> &Builder,
999 Value *LHS, Value *RHS) {
1000 Type *I32Ty = Builder.getInt32Ty();
1001 Type *I64Ty = Builder.getInt64Ty();
1002
1003 Value *LHS_EXT64 = Builder.CreateZExt(LHS, I64Ty);
1004 Value *RHS_EXT64 = Builder.CreateZExt(RHS, I64Ty);
1005 Value *MUL64 = Builder.CreateMul(LHS_EXT64, RHS_EXT64);
1006 Value *Lo = Builder.CreateTrunc(MUL64, I32Ty);
1007 Value *Hi = Builder.CreateLShr(MUL64, Builder.getInt64(32));
1008 Hi = Builder.CreateTrunc(Hi, I32Ty);
1009 return std::pair(Lo, Hi);
1010}
1011
1012static Value* getMulHu(IRBuilder<> &Builder, Value *LHS, Value *RHS) {
1013 return getMul64(Builder, LHS, RHS).second;
1014}
1015
1016/// Figure out how many bits are really needed for this division.
1017/// \p MaxDivBits is an optimization hint to bypass the second
1018/// ComputeNumSignBits/computeKnownBits call if the first one is
1019/// insufficient.
1020unsigned AMDGPUCodeGenPrepareImpl::getDivNumBits(BinaryOperator &I, Value *Num,
1021 Value *Den,
1022 unsigned MaxDivBits,
1023 bool IsSigned) const {
1025 Den->getType()->getScalarSizeInBits());
1026 unsigned SSBits = Num->getType()->getScalarSizeInBits();
1027 if (IsSigned) {
1028 unsigned RHSSignBits = ComputeNumSignBits(Den, SQ.DL, SQ.AC, &I, SQ.DT);
1029 // A sign bit needs to be reserved for shrinking.
1030 unsigned DivBits = SSBits - RHSSignBits + 1;
1031 if (DivBits > MaxDivBits)
1032 return SSBits;
1033
1034 unsigned LHSSignBits = ComputeNumSignBits(Num, SQ.DL, SQ.AC, &I);
1035
1036 unsigned SignBits = std::min(LHSSignBits, RHSSignBits);
1037 DivBits = SSBits - SignBits + 1;
1038 return DivBits;
1039 }
1040
1041 // All bits are used for unsigned division for Num or Den in range
1042 // (SignedMax, UnsignedMax].
1043 KnownBits Known = computeKnownBits(Den, SQ.getWithInstruction(&I));
1044 if (Known.isNegative() || !Known.isNonNegative())
1045 return SSBits;
1046 unsigned RHSSignBits = Known.countMinLeadingZeros();
1047 unsigned DivBits = SSBits - RHSSignBits;
1048 if (DivBits > MaxDivBits)
1049 return SSBits;
1050
1051 Known = computeKnownBits(Num, SQ.getWithInstruction(&I));
1052 if (Known.isNegative() || !Known.isNonNegative())
1053 return SSBits;
1054 unsigned LHSSignBits = Known.countMinLeadingZeros();
1055
1056 unsigned SignBits = std::min(LHSSignBits, RHSSignBits);
1057 DivBits = SSBits - SignBits;
1058 return DivBits;
1059}
1060
1061// The fractional part of a float is enough to accurately represent up to
1062// a 24-bit signed integer.
1063Value *AMDGPUCodeGenPrepareImpl::expandDivRem24(IRBuilder<> &Builder,
1064 BinaryOperator &I, Value *Num,
1065 Value *Den, bool IsDiv,
1066 bool IsSigned) const {
1067 unsigned DivBits = getDivNumBits(I, Num, Den, 24, IsSigned);
1068 if (DivBits > 24)
1069 return nullptr;
1070 return expandDivRem24Impl(Builder, I, Num, Den, DivBits, IsDiv, IsSigned);
1071}
1072
// Float-based expansion for a division/remainder known to fit in \p DivBits
// (<= 24) bits: compute q = (int)trunc(fa * rcp(fb)), add a +1 correction
// (jq) when the recomputed remainder is still >= |fb|, then sign/zero-extend
// the result in-register from DivBits back to 32 bits.
Value *AMDGPUCodeGenPrepareImpl::expandDivRem24Impl(
    IRBuilder<> &Builder, BinaryOperator &I, Value *Num, Value *Den,
    unsigned DivBits, bool IsDiv, bool IsSigned) const {
  Type *I32Ty = Builder.getInt32Ty();
  Num = Builder.CreateTrunc(Num, I32Ty);
  Den = Builder.CreateTrunc(Den, I32Ty);

  Type *F32Ty = Builder.getFloatTy();
  ConstantInt *One = Builder.getInt32(1);
  Value *JQ = One;

  if (IsSigned) {
    // For signed division the correction carries the sign of the quotient
    // (+1 or -1), derived from the operand signs.
    // char|short jq = ia ^ ib;
    JQ = Builder.CreateXor(Num, Den);

    // jq = jq >> (bitsize - 2)
    JQ = Builder.CreateAShr(JQ, Builder.getInt32(30));

    // jq = jq | 0x1
    JQ = Builder.CreateOr(JQ, One);
  }

  // int ia = (int)LHS;
  Value *IA = Num;

  // int ib = (int)RHS;
  Value *IB = Den;

  // float fa = (float)ia;
  Value *FA = IsSigned ? Builder.CreateSIToFP(IA, F32Ty)
                       : Builder.CreateUIToFP(IA, F32Ty);

  // float fb = (float)ib;
  Value *FB = IsSigned ? Builder.CreateSIToFP(IB, F32Ty)
                       : Builder.CreateUIToFP(IB, F32Ty);

  Value *RCP = Builder.CreateIntrinsic(Intrinsic::amdgcn_rcp,
                                       Builder.getFloatTy(), {FB});
  Value *FQM = Builder.CreateFMul(FA, RCP);

  // fq = trunc(fqm);
  CallInst *FQ = Builder.CreateUnaryIntrinsic(Intrinsic::trunc, FQM);
  FQ->copyFastMathFlags(Builder.getFastMathFlags());

  // float fqneg = -fq;
  Value *FQNeg = Builder.CreateFNeg(FQ);

  // float fr = mad(fqneg, fb, fa);
  // Use the denormal-flushing fmad when available; otherwise plain fma.
  auto FMAD = !ST.hasMadMacF32Insts()
                  ? Intrinsic::fma
                  : (Intrinsic::ID)Intrinsic::amdgcn_fmad_ftz;
  Value *FR = Builder.CreateIntrinsic(FMAD,
                                      {FQNeg->getType()}, {FQNeg, FB, FA}, FQ);

  // int iq = (int)fq;
  Value *IQ = IsSigned ? Builder.CreateFPToSI(FQ, I32Ty)
                       : Builder.CreateFPToUI(FQ, I32Ty);

  // fr = fabs(fr);
  FR = Builder.CreateUnaryIntrinsic(Intrinsic::fabs, FR, FQ);

  // fb = fabs(fb);
  FB = Builder.CreateUnaryIntrinsic(Intrinsic::fabs, FB, FQ);

  // int cv = fr >= fb;
  Value *CV = Builder.CreateFCmpOGE(FR, FB);

  // jq = (cv ? jq : 0);
  JQ = Builder.CreateSelect(CV, JQ, Builder.getInt32(0));

  // dst = iq + jq;
  Value *Div = Builder.CreateAdd(IQ, JQ);

  Value *Res = Div;
  if (!IsDiv) {
    // Rem needs compensation, it's easier to recompute it
    Value *Rem = Builder.CreateMul(Div, Den);
    Res = Builder.CreateSub(Num, Rem);
  }

  if (DivBits != 0 && DivBits < 32) {
    // Extend in register from the number of bits this divide really is.
    if (IsSigned) {
      int InRegBits = 32 - DivBits;

      Res = Builder.CreateShl(Res, InRegBits);
      Res = Builder.CreateAShr(Res, InRegBits);
    } else {
      ConstantInt *TruncMask
        = Builder.getInt32((UINT64_C(1) << DivBits) - 1);
      Res = Builder.CreateAnd(Res, TruncMask);
    }
  }

  return Res;
}
1169
1170// Try to recognize special cases the DAG will emit special, better expansions
1171// than the general expansion we do here.
1172
1173// TODO: It would be better to just directly handle those optimizations here.
1174bool AMDGPUCodeGenPrepareImpl::divHasSpecialOptimization(BinaryOperator &I,
1175 Value *Num,
1176 Value *Den) const {
1177 if (Constant *C = dyn_cast<Constant>(Den)) {
1178 // Arbitrary constants get a better expansion as long as a wider mulhi is
1179 // legal.
1180 if (C->getType()->getScalarSizeInBits() <= 32)
1181 return true;
1182
1183 // TODO: Sdiv check for not exact for some reason.
1184
1185 // If there's no wider mulhi, there's only a better expansion for powers of
1186 // two.
1187 // TODO: Should really know for each vector element.
1189 return true;
1190
1191 return false;
1192 }
1193
1194 if (BinaryOperator *BinOpDen = dyn_cast<BinaryOperator>(Den)) {
1195 // fold (udiv x, (shl c, y)) -> x >>u (log2(c)+y) iff c is power of 2
1196 if (BinOpDen->getOpcode() == Instruction::Shl &&
1197 isa<Constant>(BinOpDen->getOperand(0)) &&
1198 isKnownToBeAPowerOfTwo(BinOpDen->getOperand(0), true,
1199 SQ.getWithInstruction(&I))) {
1200 return true;
1201 }
1202 }
1203
1204 return false;
1205}
1206
1207static Value *getSign32(Value *V, IRBuilder<> &Builder, const DataLayout DL) {
1208 // Check whether the sign can be determined statically.
1209 KnownBits Known = computeKnownBits(V, DL);
1210 if (Known.isNegative())
1211 return Constant::getAllOnesValue(V->getType());
1212 if (Known.isNonNegative())
1213 return Constant::getNullValue(V->getType());
1214 return Builder.CreateAShr(V, Builder.getInt32(31));
1215}
1216
// Expand a <=32-bit integer division/remainder using the reciprocal-based
// Newton-Raphson algorithm described in the comment below. Signed operands
// are reduced to unsigned via sign/abs manipulation. Returns the replacement
// value, or null when the DAG has a better special-case expansion.
Value *AMDGPUCodeGenPrepareImpl::expandDivRem32(IRBuilder<> &Builder,
                                                BinaryOperator &I, Value *X,
                                                Value *Y) const {
  Instruction::BinaryOps Opc = I.getOpcode();
  assert(Opc == Instruction::URem || Opc == Instruction::UDiv ||
         Opc == Instruction::SRem || Opc == Instruction::SDiv);

  FastMathFlags FMF;
  FMF.setFast();
  Builder.setFastMathFlags(FMF);

  if (divHasSpecialOptimization(I, X, Y))
    return nullptr; // Keep it for later optimization.

  bool IsDiv = Opc == Instruction::UDiv || Opc == Instruction::SDiv;
  bool IsSigned = Opc == Instruction::SRem || Opc == Instruction::SDiv;

  Type *Ty = X->getType();
  Type *I32Ty = Builder.getInt32Ty();
  Type *F32Ty = Builder.getFloatTy();

  // Normalize narrower operands to i32 first.
  if (Ty->getScalarSizeInBits() != 32) {
    if (IsSigned) {
      X = Builder.CreateSExtOrTrunc(X, I32Ty);
      Y = Builder.CreateSExtOrTrunc(Y, I32Ty);
    } else {
      X = Builder.CreateZExtOrTrunc(X, I32Ty);
      Y = Builder.CreateZExtOrTrunc(Y, I32Ty);
    }
  }

  // Prefer the cheaper 24-bit float expansion if the operands fit.
  if (Value *Res = expandDivRem24(Builder, I, X, Y, IsDiv, IsSigned)) {
    return IsSigned ? Builder.CreateSExtOrTrunc(Res, Ty) :
                      Builder.CreateZExtOrTrunc(Res, Ty);
  }

  ConstantInt *Zero = Builder.getInt32(0);
  ConstantInt *One = Builder.getInt32(1);

  Value *Sign = nullptr;
  if (IsSigned) {
    Value *SignX = getSign32(X, Builder, DL);
    Value *SignY = getSign32(Y, Builder, DL);
    // Remainder sign is the same as LHS
    Sign = IsDiv ? Builder.CreateXor(SignX, SignY) : SignX;

    // abs(v) computed as (v + sign) ^ sign.
    X = Builder.CreateAdd(X, SignX);
    Y = Builder.CreateAdd(Y, SignY);

    X = Builder.CreateXor(X, SignX);
    Y = Builder.CreateXor(Y, SignY);
  }

  // The algorithm here is based on ideas from "Software Integer Division", Tom
  // Rodeheffer, August 2008.
  //
  // unsigned udiv(unsigned x, unsigned y) {
  //   // Initial estimate of inv(y). The constant is less than 2^32 to ensure
  //   // that this is a lower bound on inv(y), even if some of the calculations
  //   // round up.
  //   unsigned z = (unsigned)((4294967296.0 - 512.0) * v_rcp_f32((float)y));
  //
  //   // One round of UNR (Unsigned integer Newton-Raphson) to improve z.
  //   // Empirically this is guaranteed to give a "two-y" lower bound on
  //   // inv(y).
  //   z += umulh(z, -y * z);
  //
  //   // Quotient/remainder estimate.
  //   unsigned q = umulh(x, z);
  //   unsigned r = x - q * y;
  //
  //   // Two rounds of quotient/remainder refinement.
  //   if (r >= y) {
  //     ++q;
  //     r -= y;
  //   }
  //   if (r >= y) {
  //     ++q;
  //     r -= y;
  //   }
  //
  //   return q;
  // }

  // Initial estimate of inv(y).
  Value *FloatY = Builder.CreateUIToFP(Y, F32Ty);
  Value *RcpY = Builder.CreateIntrinsic(Intrinsic::amdgcn_rcp, F32Ty, {FloatY});
  // 0x4F7FFFFE is (4294967296.0 - 512.0) as an f32 bit pattern.
  Constant *Scale = ConstantFP::get(F32Ty, llvm::bit_cast<float>(0x4F7FFFFE));
  Value *ScaledY = Builder.CreateFMul(RcpY, Scale);
  Value *Z = Builder.CreateFPToUI(ScaledY, I32Ty);

  // One round of UNR.
  Value *NegY = Builder.CreateSub(Zero, Y);
  Value *NegYZ = Builder.CreateMul(NegY, Z);
  Z = Builder.CreateAdd(Z, getMulHu(Builder, Z, NegYZ));

  // Quotient/remainder estimate.
  Value *Q = getMulHu(Builder, X, Z);
  Value *R = Builder.CreateSub(X, Builder.CreateMul(Q, Y));

  // First quotient/remainder refinement.
  Value *Cond = Builder.CreateICmpUGE(R, Y);
  if (IsDiv)
    Q = Builder.CreateSelect(Cond, Builder.CreateAdd(Q, One), Q);
  R = Builder.CreateSelect(Cond, Builder.CreateSub(R, Y), R);

  // Second quotient/remainder refinement.
  Cond = Builder.CreateICmpUGE(R, Y);
  Value *Res;
  if (IsDiv)
    Res = Builder.CreateSelect(Cond, Builder.CreateAdd(Q, One), Q);
  else
    Res = Builder.CreateSelect(Cond, Builder.CreateSub(R, Y), R);

  if (IsSigned) {
    // Reapply the result sign: -v == (v ^ sign) - sign.
    Res = Builder.CreateXor(Res, Sign);
    Res = Builder.CreateSub(Res, Sign);
    Res = Builder.CreateSExtOrTrunc(Res, Ty);
  } else {
    Res = Builder.CreateZExtOrTrunc(Res, Ty);
  }
  return Res;
}
1340
1341Value *AMDGPUCodeGenPrepareImpl::shrinkDivRem64(IRBuilder<> &Builder,
1342 BinaryOperator &I, Value *Num,
1343 Value *Den) const {
1344 if (!ExpandDiv64InIR && divHasSpecialOptimization(I, Num, Den))
1345 return nullptr; // Keep it for later optimization.
1346
1347 Instruction::BinaryOps Opc = I.getOpcode();
1348
1349 bool IsDiv = Opc == Instruction::SDiv || Opc == Instruction::UDiv;
1350 bool IsSigned = Opc == Instruction::SDiv || Opc == Instruction::SRem;
1351
1352 unsigned NumDivBits = getDivNumBits(I, Num, Den, 32, IsSigned);
1353 if (NumDivBits > 32)
1354 return nullptr;
1355
1356 Value *Narrowed = nullptr;
1357 if (NumDivBits <= 24) {
1358 Narrowed = expandDivRem24Impl(Builder, I, Num, Den, NumDivBits,
1359 IsDiv, IsSigned);
1360 } else if (NumDivBits <= 32) {
1361 Narrowed = expandDivRem32(Builder, I, Num, Den);
1362 }
1363
1364 if (Narrowed) {
1365 return IsSigned ? Builder.CreateSExt(Narrowed, Num->getType()) :
1366 Builder.CreateZExt(Narrowed, Num->getType());
1367 }
1368
1369 return nullptr;
1370}
1371
1372void AMDGPUCodeGenPrepareImpl::expandDivRem64(BinaryOperator &I) const {
1373 Instruction::BinaryOps Opc = I.getOpcode();
1374 // Do the general expansion.
1375 if (Opc == Instruction::UDiv || Opc == Instruction::SDiv) {
1377 return;
1378 }
1379
1380 if (Opc == Instruction::URem || Opc == Instruction::SRem) {
1382 return;
1383 }
1384
1385 llvm_unreachable("not a division");
1386}
1387
/*
This can create an inconsistency in how non-byte-sized loads are extended,
for example:
```
  %load = load i1, ptr addrspace(4) %arg, align 4
  %zext = zext i1 %load to i64
  %add = add i64 %zext, ...
```
Instead of creating `s_and_b32 s0, s0, 1`,
it will create `s_and_b32 s0, s0, 0xff`.
We accept this change since the non-byte load assumes the upper bits
within the byte are all 0.
*/
1400bool AMDGPUCodeGenPrepareImpl::tryNarrowMathIfNoOverflow(Instruction *I) {
1401 unsigned Opc = I->getOpcode();
1402 Type *OldType = I->getType();
1403
1404 if (Opc != Instruction::Add && Opc != Instruction::Mul)
1405 return false;
1406
1407 unsigned OrigBit = OldType->getScalarSizeInBits();
1408
1409 if (Opc != Instruction::Add && Opc != Instruction::Mul)
1410 llvm_unreachable("Unexpected opcode, only valid for Instruction::Add and "
1411 "Instruction::Mul.");
1412
1413 unsigned MaxBitsNeeded = computeKnownBits(I, DL).countMaxActiveBits();
1414
1415 MaxBitsNeeded = std::max<unsigned>(bit_ceil(MaxBitsNeeded), 8);
1416 Type *NewType = DL.getSmallestLegalIntType(I->getContext(), MaxBitsNeeded);
1417 if (!NewType)
1418 return false;
1419 unsigned NewBit = NewType->getIntegerBitWidth();
1420 if (NewBit >= OrigBit)
1421 return false;
1422 NewType = I->getType()->getWithNewBitWidth(NewBit);
1423
1424 // Old cost
1425 const TargetTransformInfo &TTI = TM.getTargetTransformInfo(F);
1426 InstructionCost OldCost =
1428 // New cost of new op
1429 InstructionCost NewCost =
1431 // New cost of narrowing 2 operands (use trunc)
1432 int NumOfNonConstOps = 2;
1433 if (isa<Constant>(I->getOperand(0)) || isa<Constant>(I->getOperand(1))) {
1434 // Cannot be both constant, should be propagated
1435 NumOfNonConstOps = 1;
1436 }
1437 NewCost += NumOfNonConstOps * TTI.getCastInstrCost(Instruction::Trunc,
1438 NewType, OldType,
1441 // New cost of zext narrowed result to original type
1442 NewCost +=
1443 TTI.getCastInstrCost(Instruction::ZExt, OldType, NewType,
1445 if (NewCost >= OldCost)
1446 return false;
1447
1448 IRBuilder<> Builder(I);
1449 Value *Trunc0 = Builder.CreateTrunc(I->getOperand(0), NewType);
1450 Value *Trunc1 = Builder.CreateTrunc(I->getOperand(1), NewType);
1451 Value *Arith =
1452 Builder.CreateBinOp((Instruction::BinaryOps)Opc, Trunc0, Trunc1);
1453
1454 Value *Zext = Builder.CreateZExt(Arith, OldType);
1455 I->replaceAllUsesWith(Zext);
1456 DeadVals.push_back(I);
1457 return true;
1458}
1459
1460bool AMDGPUCodeGenPrepareImpl::visitBinaryOperator(BinaryOperator &I) {
1461 if (foldBinOpIntoSelect(I))
1462 return true;
1463
1464 if (UseMul24Intrin && replaceMulWithMul24(I))
1465 return true;
1466 if (tryNarrowMathIfNoOverflow(&I))
1467 return true;
1468
1469 bool Changed = false;
1470 Instruction::BinaryOps Opc = I.getOpcode();
1471 Type *Ty = I.getType();
1472 Value *NewDiv = nullptr;
1473 unsigned ScalarSize = Ty->getScalarSizeInBits();
1474
1476
1477 if ((Opc == Instruction::URem || Opc == Instruction::UDiv ||
1478 Opc == Instruction::SRem || Opc == Instruction::SDiv) &&
1479 ScalarSize <= 64 &&
1480 !DisableIDivExpand) {
1481 Value *Num = I.getOperand(0);
1482 Value *Den = I.getOperand(1);
1483 IRBuilder<> Builder(&I);
1484 Builder.SetCurrentDebugLocation(I.getDebugLoc());
1485
1486 if (auto *VT = dyn_cast<FixedVectorType>(Ty)) {
1487 NewDiv = PoisonValue::get(VT);
1488
1489 for (unsigned N = 0, E = VT->getNumElements(); N != E; ++N) {
1490 Value *NumEltN = Builder.CreateExtractElement(Num, N);
1491 Value *DenEltN = Builder.CreateExtractElement(Den, N);
1492
1493 Value *NewElt;
1494 if (ScalarSize <= 32) {
1495 NewElt = expandDivRem32(Builder, I, NumEltN, DenEltN);
1496 if (!NewElt)
1497 NewElt = Builder.CreateBinOp(Opc, NumEltN, DenEltN);
1498 } else {
1499 // See if this 64-bit division can be shrunk to 32/24-bits before
1500 // producing the general expansion.
1501 NewElt = shrinkDivRem64(Builder, I, NumEltN, DenEltN);
1502 if (!NewElt) {
1503 // The general 64-bit expansion introduces control flow and doesn't
1504 // return the new value. Just insert a scalar copy and defer
1505 // expanding it.
1506 NewElt = Builder.CreateBinOp(Opc, NumEltN, DenEltN);
1507 // CreateBinOp does constant folding. If the operands are constant,
1508 // it will return a Constant instead of a BinaryOperator.
1509 if (auto *NewEltBO = dyn_cast<BinaryOperator>(NewElt))
1510 Div64ToExpand.push_back(NewEltBO);
1511 }
1512 }
1513
1514 if (auto *NewEltI = dyn_cast<Instruction>(NewElt))
1515 NewEltI->copyIRFlags(&I);
1516
1517 NewDiv = Builder.CreateInsertElement(NewDiv, NewElt, N);
1518 }
1519 } else {
1520 if (ScalarSize <= 32)
1521 NewDiv = expandDivRem32(Builder, I, Num, Den);
1522 else {
1523 NewDiv = shrinkDivRem64(Builder, I, Num, Den);
1524 if (!NewDiv)
1525 Div64ToExpand.push_back(&I);
1526 }
1527 }
1528
1529 if (NewDiv) {
1530 I.replaceAllUsesWith(NewDiv);
1531 DeadVals.push_back(&I);
1532 Changed = true;
1533 }
1534 }
1535
1536 if (ExpandDiv64InIR) {
1537 // TODO: We get much worse code in specially handled constant cases.
1538 for (BinaryOperator *Div : Div64ToExpand) {
1539 expandDivRem64(*Div);
1540 FlowChanged = true;
1541 Changed = true;
1542 }
1543 }
1544
1545 return Changed;
1546}
1547
// Widen sub-dword loads from the constant address spaces to i32 (behind the
// WidenLoads flag), then truncate/bitcast back to the original type.
bool AMDGPUCodeGenPrepareImpl::visitLoadInst(LoadInst &I) {
  if (!WidenLoads)
    return false;

  if ((I.getPointerAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS ||
       I.getPointerAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT) &&
      canWidenScalarExtLoad(I)) {
    IRBuilder<> Builder(&I);
    Builder.SetCurrentDebugLocation(I.getDebugLoc());

    Type *I32Ty = Builder.getInt32Ty();
    LoadInst *WidenLoad = Builder.CreateLoad(I32Ty, I.getPointerOperand());
    WidenLoad->copyMetadata(I);

    // If we have range metadata, we need to convert the type, and not make
    // assumptions about the high bits.
    if (auto *Range = WidenLoad->getMetadata(LLVMContext::MD_range)) {
      ConstantInt *Lower =
        mdconst::extract<ConstantInt>(Range->getOperand(0));

      if (Lower->isNullValue()) {
        // A range starting at zero can't be widened faithfully; drop it.
        WidenLoad->setMetadata(LLVMContext::MD_range, nullptr);
      } else {
        Metadata *LowAndHigh[] = {
          ConstantAsMetadata::get(ConstantInt::get(I32Ty, Lower->getValue().zext(32))),
          // Don't make assumptions about the high bits.
          ConstantAsMetadata::get(ConstantInt::get(I32Ty, 0))
        };

        WidenLoad->setMetadata(LLVMContext::MD_range,
                               MDNode::get(F.getContext(), LowAndHigh));
      }
    }

    // Truncate the widened value back to the original width, then bitcast to
    // the original (possibly non-integer) type.
    int TySize = DL.getTypeSizeInBits(I.getType());
    Type *IntNTy = Builder.getIntNTy(TySize);
    Value *ValTrunc = Builder.CreateTrunc(WidenLoad, IntNTy);
    Value *ValOrig = Builder.CreateBitCast(ValTrunc, I.getType());
    I.replaceAllUsesWith(ValOrig);
    DeadVals.push_back(&I);
    return true;
  }

  return false;
}
1593
// Recognize select-based fract() idioms and replace them with the dedicated
// fract lowering (applyFractPat).
bool AMDGPUCodeGenPrepareImpl::visitSelectInst(SelectInst &I) {
  FPMathOperator *FPOp = dyn_cast<FPMathOperator>(&I);
  if (!FPOp)
    return false;

  Value *X;
  Value *Fract = nullptr;

  // Match:
  // (x - floor(x)) >= MIN_CONSTANT ? MIN_CONSTANT : (x - floor(x))
  //
  // This is the preferred way to implement fract.
  // TODO: Could also match with compare against 1.0
  const APFloat *C;
  // NOTE(review): the `if (match(...)) {` guard that opens this block was
  // lost in extraction; presumably it binds X and C from the pattern
  // described above — confirm against upstream before relying on this.
    Value *FractSrc = matchFractPatImpl(*X, *C);
    if (!FractSrc)
      return false;
    IRBuilder<> Builder(&I);
    Builder.setFastMathFlags(FPOp->getFastMathFlags());
    Fract = applyFractPat(Builder, FractSrc);
  } else {
    // Match patterns which may appear in legacy implementations of the fract()
    // function, built around the nan-avoidant minnum intrinsic. These are the
    // core pattern plus additional clamping of inf and nan values on the
    // result.
    Value *Cond = I.getCondition();
    Value *TrueVal = I.getTrueValue();
    Value *FalseVal = I.getFalseValue();
    Value *CmpVal;
    CmpPredicate IsNanPred;

    // Match fract pattern with nan check.
    if (!match(Cond, m_FCmp(IsNanPred, m_Value(CmpVal), m_NonNaN())))
      return false;

    IRBuilder<> Builder(&I);
    Builder.setFastMathFlags(FPOp->getFastMathFlags());

    if (IsNanPred == FCmpInst::FCMP_UNO && TrueVal == CmpVal &&
        CmpVal == matchFractPatNanAvoidant(*FalseVal)) {
      // isnan(x) ? x : fract(x)
      Fract = applyFractPat(Builder, CmpVal);
    } else if (IsNanPred == FCmpInst::FCMP_ORD && FalseVal == CmpVal) {
      if (CmpVal == matchFractPatNanAvoidant(*TrueVal)) {
        // !isnan(x) ? fract(x) : x
        Fract = applyFractPat(Builder, CmpVal);
      } else {
        // Match an intermediate clamp infinity to 0 pattern. i.e.
        // !isnan(x) ? (!isinf(x) ? fract(x) : 0.0) : x
        CmpPredicate PredInf;
        Value *IfNotInf;

        if (!match(TrueVal, m_Select(m_FCmp(PredInf, m_FAbs(m_Specific(CmpVal)),
                                            m_PosInf()),
                                     m_Value(IfNotInf), m_PosZeroFP())) ||
            PredInf != FCmpInst::FCMP_UNE ||
            CmpVal != matchFractPatNanAvoidant(*IfNotInf))
          return false;

        SelectInst *ClampInfSelect = cast<SelectInst>(TrueVal);

        // Insert before the fabs
        Value *InsertPt =
            cast<Instruction>(ClampInfSelect->getCondition())->getOperand(0);

        Builder.SetInsertPoint(cast<Instruction>(InsertPt));
        Value *NewFract = applyFractPat(Builder, CmpVal);
        NewFract->takeName(TrueVal);

        // Thread the new fract into the inf clamping sequence.
        DeadVals.push_back(ClampInfSelect->getOperand(1));
        ClampInfSelect->setOperand(1, NewFract);

        // The outer select nan handling is also absorbed into the fract.
        Fract = ClampInfSelect;
      }
    } else
      return false;
  }

  Fract->takeName(&I);
  I.replaceAllUsesWith(Fract);
  DeadVals.push_back(&I);
  return true;
}
1680
1681static bool areInSameBB(const Value *A, const Value *B) {
1682 const auto *IA = dyn_cast<Instruction>(A);
1683 const auto *IB = dyn_cast<Instruction>(B);
1684 return IA && IB && IA->getParent() == IB->getParent();
1685}
1686
1687// Helper for breaking large PHIs that returns true when an extractelement on V
1688// is likely to be folded away by the DAG combiner.
1690 const auto *FVT = dyn_cast<FixedVectorType>(V->getType());
1691 if (!FVT)
1692 return false;
1693
1694 const Value *CurVal = V;
1695
1696 // Check for insertelements, keeping track of the elements covered.
1697 BitVector EltsCovered(FVT->getNumElements());
1698 while (const auto *IE = dyn_cast<InsertElementInst>(CurVal)) {
1699 const auto *Idx = dyn_cast<ConstantInt>(IE->getOperand(2));
1700
1701 // Non constant index/out of bounds index -> folding is unlikely.
1702 // The latter is more of a sanity check because canonical IR should just
1703 // have replaced those with poison.
1704 if (!Idx || Idx->getZExtValue() >= FVT->getNumElements())
1705 return false;
1706
1707 const auto *VecSrc = IE->getOperand(0);
1708
1709 // If the vector source is another instruction, it must be in the same basic
1710 // block. Otherwise, the DAGCombiner won't see the whole thing and is
1711 // unlikely to be able to do anything interesting here.
1712 if (isa<Instruction>(VecSrc) && !areInSameBB(VecSrc, IE))
1713 return false;
1714
1715 CurVal = VecSrc;
1716 EltsCovered.set(Idx->getZExtValue());
1717
1718 // All elements covered.
1719 if (EltsCovered.all())
1720 return true;
1721 }
1722
1723 // We either didn't find a single insertelement, or the insertelement chain
1724 // ended before all elements were covered. Check for other interesting values.
1725
1726 // Constants are always interesting because we can just constant fold the
1727 // extractelements.
1728 if (isa<Constant>(CurVal))
1729 return true;
1730
1731 // shufflevector is likely to be profitable if either operand is a constant,
1732 // or if either source is in the same block.
1733 // This is because shufflevector is most often lowered as a series of
1734 // insert/extract elements anyway.
1735 if (const auto *SV = dyn_cast<ShuffleVectorInst>(CurVal)) {
1736 return isa<Constant>(SV->getOperand(1)) ||
1737 areInSameBB(SV, SV->getOperand(0)) ||
1738 areInSameBB(SV, SV->getOperand(1));
1739 }
1740
1741 return false;
1742}
1743
1744static void collectPHINodes(const PHINode &I,
1746 const auto [It, Inserted] = SeenPHIs.insert(&I);
1747 if (!Inserted)
1748 return;
1749
1750 for (const Value *Inc : I.incoming_values()) {
1751 if (const auto *PhiInc = dyn_cast<PHINode>(Inc))
1752 collectPHINodes(*PhiInc, SeenPHIs);
1753 }
1754
1755 for (const User *U : I.users()) {
1756 if (const auto *PhiU = dyn_cast<PHINode>(U))
1757 collectPHINodes(*PhiU, SeenPHIs);
1758 }
1759}
1760
// Decide — and cache, for every PHI in the chain — whether the PHI chain
// containing \p I is profitable to break into smaller pieces.
bool AMDGPUCodeGenPrepareImpl::canBreakPHINode(const PHINode &I) {
  // Check in the cache first.
  if (const auto It = BreakPhiNodesCache.find(&I);
      It != BreakPhiNodesCache.end())
    return It->second;

  // We consider PHI nodes as part of "chains", so given a PHI node I, we
  // recursively consider all its users and incoming values that are also PHI
  // nodes. We then make a decision about all of those PHIs at once. Either they
  // all get broken up, or none of them do. That way, we avoid cases where a
  // single PHI is/is not broken and we end up reforming/exploding a vector
  // multiple times, or even worse, doing it in a loop.
  SmallPtrSet<const PHINode *, 8> WorkList;
  collectPHINodes(I, WorkList);

#ifndef NDEBUG
  // Check that none of the PHI nodes in the worklist are in the map. If some of
  // them are, it means we're not good enough at collecting related PHIs.
  for (const PHINode *WLP : WorkList) {
    assert(BreakPhiNodesCache.count(WLP) == 0);
  }
#endif

  // To consider a PHI profitable to break, we need to see some interesting
  // incoming values. At least 2/3rd (rounded up) of all PHIs in the worklist
  // must have one to consider all PHIs breakable.
  //
  // This threshold has been determined through performance testing.
  //
  // Note that the computation below is equivalent to
  //
  //    (unsigned)ceil((K / 3.0) * 2)
  //
  // It's simply written this way to avoid mixing integral/FP arithmetic.
  const auto Threshold = (alignTo(WorkList.size() * 2, 3) / 3);
  unsigned NumBreakablePHIs = 0;
  bool CanBreak = false;
  for (const PHINode *Cur : WorkList) {
    // Don't break PHIs that have no interesting incoming values. That is, where
    // there is no clear opportunity to fold the "extractelement" instructions
    // we would add.
    //
    // Note: IC does not run after this pass, so we're only interested in the
    // foldings that the DAG combiner can do.
    if (any_of(Cur->incoming_values(), isInterestingPHIIncomingValue)) {
      if (++NumBreakablePHIs >= Threshold) {
        CanBreak = true;
        break;
      }
    }
  }

  // Record the verdict for the whole chain so each PHI is decided only once.
  for (const PHINode *Cur : WorkList)
    BreakPhiNodesCache[Cur] = CanBreak;

  return CanBreak;
}
1818
1819 /// Helper class for "break large PHIs" (visitPHINode).
1820 ///
1821 /// This represents a slice of a PHI's incoming value, which is made up of:
1822 /// - The type of the slice (Ty)
1823 /// - The index in the incoming value's vector where the slice starts (Idx)
1824 /// - The number of elements in the slice (NumElts).
1825 /// It also keeps track of the NewPHI node inserted for this particular slice.
1826 ///
1827 /// Slice examples:
1828 /// <4 x i64> -> Split into four i64 slices.
1829 /// -> [i64, 0, 1], [i64, 1, 1], [i64, 2, 1], [i64, 3, 1]
1830 /// <5 x i16> -> Split into 2 <2 x i16> slices + a i16 tail.
1831 /// -> [<2 x i16>, 0, 2], [<2 x i16>, 2, 2], [i16, 4, 1]
1833 public:
1834 VectorSlice(Type *Ty, unsigned Idx, unsigned NumElts)
1835 : Ty(Ty), Idx(Idx), NumElts(NumElts) {}
1836
// Scalar or vector type of this slice of the original vector.
1837 Type *Ty = nullptr;
// Index of the first element of the slice within the original vector.
1838 unsigned Idx = 0;
// Number of elements in the slice; 1 means a scalar slice.
1839 unsigned NumElts = 0;
// The replacement PHI node created for this slice by visitPHINode.
1840 PHINode *NewPHI = nullptr;
1841
1842 /// Slice \p Inc according to the information contained within this slice.
1843 /// This is cached, so if called multiple times for the same \p BB & \p Inc
1844 /// pair, it returns the same Sliced value as well.
1845 ///
1846 /// Note this *intentionally* does not return the same value for, say,
1847 /// [%bb.0, %0] & [%bb.1, %0] as:
1848 /// - It could cause issues with dominance (e.g. if bb.1 is seen first, then
1849 /// the value in bb.1 may not be reachable from bb.0 if it's its
1850 /// predecessor.)
1851 /// - We also want to make our extract instructions as local as possible so
1852 /// the DAG has better chances of folding them out. Duplicating them like
1853 /// that is beneficial in that regard.
1854 ///
1855 /// This is both a minor optimization to avoid creating duplicate
1856 /// instructions, but also a requirement for correctness. It is not forbidden
1857 /// for a PHI node to have the same [BB, Val] pair multiple times. If we
1858 /// returned a new value each time, those previously identical pairs would all
1859 /// have different incoming values (from the same block) and it'd cause a "PHI
1860 /// node has multiple entries for the same basic block with different incoming
1861 /// values!" verifier error.
1862 Value *getSlicedVal(BasicBlock *BB, Value *Inc, StringRef NewValName) {
// Reuse any extraction previously created for this exact (BB, Inc) pair
// (see the correctness note above).
1863 Value *&Res = SlicedVals[{BB, Inc}];
1864 if (Res)
1865 return Res;
1866
// Propagate the incoming instruction's debug location onto the new code.
1868 if (Instruction *IncInst = dyn_cast<Instruction>(Inc))
1869 B.SetCurrentDebugLocation(IncInst->getDebugLoc());
1870
// Multi-element slices are extracted with a shufflevector whose mask
// selects [Idx, Idx + NumElts); scalar slices use a plain extractelement.
1871 if (NumElts > 1) {
1873 for (unsigned K = Idx; K < (Idx + NumElts); ++K)
1874 Mask.push_back(K);
1875 Res = B.CreateShuffleVector(Inc, Mask, NewValName);
1876 } else
1877 Res = B.CreateExtractElement(Inc, Idx, NewValName);
1878
1879 return Res;
1880 }
1881
1882 private:
1884 };
1885
1886bool AMDGPUCodeGenPrepareImpl::visitPHINode(PHINode &I) {
1887 // Break-up fixed-vector PHIs into smaller pieces.
1888 // Default threshold is 32, so it breaks up any vector that's >32 bits into
1889 // its elements, or into 32-bit pieces (for 8/16 bit elts).
1890 //
1891 // This is only helpful for DAGISel because it doesn't handle large PHIs as
1892 // well as GlobalISel. DAGISel lowers PHIs by using CopyToReg/CopyFromReg.
1893 // With large, odd-sized PHIs we may end up needing many `build_vector`
1894 // operations with most elements being "undef". This inhibits a lot of
1895 // optimization opportunities and can result in unreasonably high register
1896 // pressure and the inevitable stack spilling.
1897 if (!BreakLargePHIs || getCGPassBuilderOption().EnableGlobalISelOption)
1898 return false;
1899
1900 FixedVectorType *FVT = dyn_cast<FixedVectorType>(I.getType());
1901 if (!FVT || FVT->getNumElements() == 1 ||
1902 DL.getTypeSizeInBits(FVT) <= BreakLargePHIsThreshold)
1903 return false;
1904
1905 if (!ForceBreakLargePHIs && !canBreakPHINode(I))
1906 return false;
1907
1908 std::vector<VectorSlice> Slices;
1909
1910 Type *EltTy = FVT->getElementType();
1911 {
1912 unsigned Idx = 0;
1913 // For 8/16 bits type, don't scalarize fully but break it up into as many
1914 // 32-bit slices as we can, and scalarize the tail.
1915 const unsigned EltSize = DL.getTypeSizeInBits(EltTy);
1916 const unsigned NumElts = FVT->getNumElements();
1917 if (EltSize == 8 || EltSize == 16) {
1918 const unsigned SubVecSize = (32 / EltSize);
1919 Type *SubVecTy = FixedVectorType::get(EltTy, SubVecSize);
1920 for (unsigned End = alignDown(NumElts, SubVecSize); Idx < End;
1921 Idx += SubVecSize)
1922 Slices.emplace_back(SubVecTy, Idx, SubVecSize);
1923 }
1924
1925 // Scalarize all remaining elements.
1926 for (; Idx < NumElts; ++Idx)
1927 Slices.emplace_back(EltTy, Idx, 1);
1928 }
1929
1930 assert(Slices.size() > 1);
1931
1932 // Create one PHI per vector piece. The "VectorSlice" class takes care of
1933 // creating the necessary instruction to extract the relevant slices of each
1934 // incoming value.
1935 IRBuilder<> B(I.getParent());
1936 B.SetCurrentDebugLocation(I.getDebugLoc());
1937
1938 unsigned IncNameSuffix = 0;
1939 for (VectorSlice &S : Slices) {
1940 // We need to reset the build on each iteration, because getSlicedVal may
1941 // have inserted something into I's BB.
1942 B.SetInsertPoint(I.getParent()->getFirstNonPHIIt());
1943 S.NewPHI = B.CreatePHI(S.Ty, I.getNumIncomingValues());
1944
1945 for (const auto &[Idx, BB] : enumerate(I.blocks())) {
1946 S.NewPHI->addIncoming(S.getSlicedVal(BB, I.getIncomingValue(Idx),
1947 "largephi.extractslice" +
1948 std::to_string(IncNameSuffix++)),
1949 BB);
1950 }
1951 }
1952
1953 // And replace this PHI with a vector of all the previous PHI values.
1954 Value *Vec = PoisonValue::get(FVT);
1955 unsigned NameSuffix = 0;
1956 for (VectorSlice &S : Slices) {
1957 const auto ValName = "largephi.insertslice" + std::to_string(NameSuffix++);
1958 if (S.NumElts > 1)
1959 Vec = B.CreateInsertVector(FVT, Vec, S.NewPHI, S.Idx, ValName);
1960 else
1961 Vec = B.CreateInsertElement(Vec, S.NewPHI, S.Idx, ValName);
1962 }
1963
1964 I.replaceAllUsesWith(Vec);
1965 DeadVals.push_back(&I);
1966 return true;
1967}
1968
1969 /// \param V Value to check
1970 /// \param DL DataLayout
1971 /// \param TM TargetMachine (TODO: remove once DL contains nullptr values)
1972 /// \param AS Target Address Space
1973 /// \return true if \p V cannot be the null value of \p AS, false otherwise.
1974 static bool isPtrKnownNeverNull(const Value *V, const DataLayout &DL,
1975 const AMDGPUTargetMachine &TM, unsigned AS) {
1976 // Pointer cannot be null if it's a block address, GV or alloca.
1977 // NOTE: We don't support extern_weak, but if we did, we'd need to check for
1978 // it as the symbol could be null in such cases.
1980 return true;
1981
1982 // Check nonnull arguments.
1983 if (const auto *Arg = dyn_cast<Argument>(V); Arg && Arg->hasNonNullAttr())
1984 return true;
1985
1986 // Check nonnull loads.
1987 if (const auto *Load = dyn_cast<LoadInst>(V);
1988 Load && Load->hasMetadata(LLVMContext::MD_nonnull))
1989 return true;
1990
1991 // getUnderlyingObject may have looked through another addrspacecast, although
1992 // the optimizable situations most likely folded out by now.
// Only reason about pointers that actually live in the queried AS.
1993 if (AS != cast<PointerType>(V->getType())->getAddressSpace())
1994 return false;
1995
1996 // TODO: Calls that return nonnull?
1997
1998 // For all other things, use KnownBits.
1999 // We either use 0 or all bits set to indicate null, so check whether the
2000 // value can be zero or all ones.
2001 //
2002 // TODO: Use ValueTracking's isKnownNeverNull if it becomes aware that some
2003 // address spaces have non-zero null values.
2004 auto SrcPtrKB = computeKnownBits(V, DL);
2005 const auto NullVal = AMDGPU::getNullPointerValue(AS);
2006
2007 assert(SrcPtrKB.getBitWidth() == DL.getPointerSizeInBits(AS));
2008 assert((NullVal == 0 || NullVal == -1) &&
2009 "don't know how to check for this null value!");
// If null is all-ones, the pointer must be provably not-all-ones; if null is
// zero, it must be provably nonzero.
2010 return NullVal ? !SrcPtrKB.getMaxValue().isAllOnes() : SrcPtrKB.isNonZero();
2011 }
2012
2013 bool AMDGPUCodeGenPrepareImpl::visitAddrSpaceCastInst(AddrSpaceCastInst &I) {
2014 // Intrinsic doesn't support vectors, also it seems that it's often difficult
2015 // to prove that a vector cannot have any nulls in it so it's unclear if it's
2016 // worth supporting.
2017 if (I.getType()->isVectorTy())
2018 return false;
2019
2020 // Check if this can be lowered to a amdgcn.addrspacecast.nonnull.
2021 // This is only worthwhile for casts from/to priv/local to flat.
2022 const unsigned SrcAS = I.getSrcAddressSpace();
2023 const unsigned DstAS = I.getDestAddressSpace();
2024
2025 bool CanLower = false;
2026 if (SrcAS == AMDGPUAS::FLAT_ADDRESS)
2027 CanLower = (DstAS == AMDGPUAS::LOCAL_ADDRESS ||
2028 DstAS == AMDGPUAS::PRIVATE_ADDRESS);
2029 else if (DstAS == AMDGPUAS::FLAT_ADDRESS)
2030 CanLower = (SrcAS == AMDGPUAS::LOCAL_ADDRESS ||
2031 SrcAS == AMDGPUAS::PRIVATE_ADDRESS);
2032 if (!CanLower)
2033 return false;
2034
// The cast is only safe to mark nonnull if every underlying object of the
// source pointer is provably never null in the source address space.
2036 getUnderlyingObjects(I.getOperand(0), WorkList);
2037 if (!all_of(WorkList, [&](const Value *V) {
2038 return isPtrKnownNeverNull(V, DL, TM, SrcAS);
2039 }))
2040 return false;
2041
// Replace the cast with the nonnull intrinsic and queue the original for
// deletion.
2042 IRBuilder<> B(&I);
2043 auto *Intrin = B.CreateIntrinsic(
2044 I.getType(), Intrinsic::amdgcn_addrspacecast_nonnull, {I.getOperand(0)});
2045 I.replaceAllUsesWith(Intrin);
2046 DeadVals.push_back(&I);
2047 return true;
2048 }
2049
2050bool AMDGPUCodeGenPrepareImpl::visitIntrinsicInst(IntrinsicInst &I) {
2051 Intrinsic::ID IID = I.getIntrinsicID();
2052 switch (IID) {
2053 case Intrinsic::minnum:
2054 case Intrinsic::minimumnum:
2055 case Intrinsic::minimum:
2056 return visitFMinLike(I);
2057 case Intrinsic::sqrt:
2058 return visitSqrt(I);
2059 case Intrinsic::log:
2060 case Intrinsic::log10:
2061 return visitLog(cast<FPMathOperator>(I), IID);
2062 case Intrinsic::log2:
2063 // No reason to handle log2.
2064 return false;
2065 case Intrinsic::amdgcn_mbcnt_lo:
2066 return visitMbcntLo(I);
2067 case Intrinsic::amdgcn_mbcnt_hi:
2068 return visitMbcntHi(I);
2069 default:
2070 return false;
2071 }
2072}
2073
2074/// Match the core sequence in the fract pattern (x - floor(x), which doesn't
2075/// need to consider edge case handling.
2076Value *AMDGPUCodeGenPrepareImpl::matchFractPatImpl(Value &FractSrc,
2077 const APFloat &C) const {
2078 if (ST.hasFractBug())
2079 return nullptr;
2080
2081 Type *Ty = FractSrc.getType();
2082 if (!isLegalFloatingTy(Ty->getScalarType()))
2083 return nullptr;
2084
2085 APFloat OneNextDown = APFloat::getOne(C.getSemantics());
2086 OneNextDown.next(true);
2087
2088 // Match nextafter(1.0, -1)
2089 if (OneNextDown != C)
2090 return nullptr;
2091
2092 Value *FloorSrc;
2093 if (match(&FractSrc, m_FSub(m_Value(FloorSrc), m_Intrinsic<Intrinsic::floor>(
2094 m_Deferred(FloorSrc)))))
2095 return FloorSrc;
2096 return nullptr;
2097}
2098
2099 /// Match non-nan fract pattern.
2100 ///   MIN_CONSTANT = nextafter(1.0, -1.0)
2101 /// minnum(fsub(x, floor(x)), MIN_CONSTANT)
2102 /// minimumnum(fsub(x, floor(x)), MIN_CONSTANT)
2103 /// minimum(fsub(x, floor(x)), MIN_CONSTANT)
2104 ///
2105 ///   x_sub_floor >= MIN_CONSTANT ? MIN_CONSTANT : x_sub_floor;
2106 ///
2107 /// If fract is a useful instruction for the subtarget. Does not account for the
2108 /// nan handling; the instruction has a nan check on the input value.
2109 Value *AMDGPUCodeGenPrepareImpl::matchFractPatNanAvoidant(Value &V) {
2110 Value *Arg0;
2111 const APFloat *C;
2112
2113 // The value is only used in contexts where we know the input isn't a nan, so
2114 // any of the fmin variants are fine.
2115 if (!match(&V,
2119 return nullptr;
2120
// Verify the constant and the fsub(x, floor(x)) core; returns x on success,
// nullptr otherwise.
2121 return matchFractPatImpl(*Arg0, *C);
2122 }
2123
2124Value *AMDGPUCodeGenPrepareImpl::applyFractPat(IRBuilder<> &Builder,
2125 Value *FractArg) {
2126 SmallVector<Value *, 4> FractVals;
2127 extractValues(Builder, FractVals, FractArg);
2128
2129 SmallVector<Value *, 4> ResultVals(FractVals.size());
2130
2131 Type *Ty = FractArg->getType()->getScalarType();
2132 for (unsigned I = 0, E = FractVals.size(); I != E; ++I) {
2133 ResultVals[I] =
2134 Builder.CreateIntrinsic(Intrinsic::amdgcn_fract, {Ty}, {FractVals[I]});
2135 }
2136
2137 return insertValues(Builder, FractArg->getType(), ResultVals);
2138}
2139
2140 bool AMDGPUCodeGenPrepareImpl::visitFMinLike(IntrinsicInst &I) {
2141 const APFloat *C;
2142 Value *FractArg;
2143
2144 // minimum(x - floor(x), MIN_CONSTANT)
2145 Value *X;
2146 if (!ST.hasFractBug() &&
2148 FractArg = matchFractPatImpl(*X, *C);
2149 if (!FractArg)
2150 return false;
2151 } else {
2152 // minnum(x - floor(x), MIN_CONSTANT)
2153 FractArg = matchFractPatNanAvoidant(I);
2154 if (!FractArg)
2155 return false;
2156
2157 // Match pattern for fract intrinsic in contexts where the nan check has
2158 // been optimized out (and hope the knowledge the source can't be nan wasn't
2159 // lost).
2160 if (!I.hasNoNaNs() && !isKnownNeverNaN(FractArg, SQ.getWithInstruction(&I)))
2161 return false;
2162 }
2163
// Rebuild as amdgcn.fract. The no-nans flag is set because the pattern
// matched above only fires when the input is known never-nan.
2164 IRBuilder<> Builder(&I);
2165 FastMathFlags FMF = I.getFastMathFlags();
2166 FMF.setNoNaNs();
2167 Builder.setFastMathFlags(FMF);
2168
2169 Value *Fract = applyFractPat(Builder, FractArg);
2170 Fract->takeName(&I);
2171 I.replaceAllUsesWith(Fract);
2172 DeadVals.push_back(&I);
2173 return true;
2174 }
2175
2176// Expand llvm.sqrt.f32 calls with !fpmath metadata in a semi-fast way.
2177bool AMDGPUCodeGenPrepareImpl::visitSqrt(IntrinsicInst &Sqrt) {
2178 Type *Ty = Sqrt.getType()->getScalarType();
2179 if (!Ty->isFloatTy() && (!Ty->isHalfTy() || ST.has16BitInsts()))
2180 return false;
2181
2182 const FPMathOperator *FPOp = cast<const FPMathOperator>(&Sqrt);
2183 FastMathFlags SqrtFMF = FPOp->getFastMathFlags();
2184
2185 // We're trying to handle the fast-but-not-that-fast case only. The lowering
2186 // of fast llvm.sqrt will give the raw instruction anyway.
2187 if (SqrtFMF.approxFunc())
2188 return false;
2189
2190 const float ReqdAccuracy = FPOp->getFPAccuracy();
2191
2192 // Defer correctly rounded expansion to codegen.
2193 if (ReqdAccuracy < 1.0f)
2194 return false;
2195
2196 Value *SrcVal = Sqrt.getOperand(0);
2197 bool CanTreatAsDAZ = canIgnoreDenormalInput(SrcVal, &Sqrt);
2198
2199 // The raw instruction is 1 ulp, but the correction for denormal handling
2200 // brings it to 2.
2201 if (!CanTreatAsDAZ && ReqdAccuracy < 2.0f)
2202 return false;
2203
2204 IRBuilder<> Builder(&Sqrt);
2205 SmallVector<Value *, 4> SrcVals;
2206 extractValues(Builder, SrcVals, SrcVal);
2207
2208 SmallVector<Value *, 4> ResultVals(SrcVals.size());
2209 for (int I = 0, E = SrcVals.size(); I != E; ++I) {
2210 if (CanTreatAsDAZ)
2211 ResultVals[I] = Builder.CreateCall(getSqrtF32(), SrcVals[I]);
2212 else
2213 ResultVals[I] = emitSqrtIEEE2ULP(Builder, SrcVals[I], SqrtFMF);
2214 }
2215
2216 Value *NewSqrt = insertValues(Builder, Sqrt.getType(), ResultVals);
2217 NewSqrt->takeName(&Sqrt);
2218 Sqrt.replaceAllUsesWith(NewSqrt);
2219 DeadVals.push_back(&Sqrt);
2220 return true;
2221}
2222
2223/// Replace log and log10 intrinsic calls based on fpmath metadata.
2224bool AMDGPUCodeGenPrepareImpl::visitLog(FPMathOperator &Log,
2225 Intrinsic::ID IID) {
2226 Type *Ty = Log.getType();
2227 if (!Ty->getScalarType()->isHalfTy() || !ST.has16BitInsts())
2228 return false;
2229
2230 FastMathFlags FMF = Log.getFastMathFlags();
2231
2232 // Defer fast math cases to codegen.
2233 if (FMF.approxFunc())
2234 return false;
2235
2236 // Limit experimentally determined from OpenCL conformance test (1.79)
2237 if (Log.getFPAccuracy() < 1.80f)
2238 return false;
2239
2240 IRBuilder<> Builder(&cast<CallInst>(Log));
2241
2242 // Use the generic intrinsic for convenience in the vector case. Codegen will
2243 // recognize the denormal handling is not necessary from the fpext.
2244 // TODO: Move to generic code
2245 Value *Log2 =
2246 Builder.CreateUnaryIntrinsic(Intrinsic::log2, Log.getOperand(0), FMF);
2247
2248 double Log2BaseInverted =
2249 IID == Intrinsic::log10 ? numbers::ln2 / numbers::ln10 : numbers::ln2;
2250 Value *Mul =
2251 Builder.CreateFMulFMF(Log2, ConstantFP::get(Ty, Log2BaseInverted), FMF);
2252
2253 Mul->takeName(&Log);
2254
2255 Log.replaceAllUsesWith(Mul);
2256 DeadVals.push_back(&Log);
2257 return true;
2258}
2259
2260bool AMDGPUCodeGenPrepare::runOnFunction(Function &F) {
2261 if (skipFunction(F))
2262 return false;
2263
2264 auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();
2265 if (!TPC)
2266 return false;
2267
2268 const AMDGPUTargetMachine &TM = TPC->getTM<AMDGPUTargetMachine>();
2269 const TargetLibraryInfo *TLI =
2270 &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F);
2271 AssumptionCache *AC =
2272 &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
2273 auto *DTWP = getAnalysisIfAvailable<DominatorTreeWrapperPass>();
2274 const DominatorTree *DT = DTWP ? &DTWP->getDomTree() : nullptr;
2275 const UniformityInfo &UA =
2276 getAnalysis<UniformityInfoWrapperPass>().getUniformityInfo();
2277 return AMDGPUCodeGenPrepareImpl(F, TM, TLI, AC, DT, UA).run();
2278}
2279
// Gather the analyses needed by the implementation. The dominator tree is
// only consumed if it was already computed (cached result).
2282 const AMDGPUTargetMachine &ATM = static_cast<const AMDGPUTargetMachine &>(TM);
2283 const TargetLibraryInfo *TLI = &FAM.getResult<TargetLibraryAnalysis>(F);
2284 AssumptionCache *AC = &FAM.getResult<AssumptionAnalysis>(F);
2285 const DominatorTree *DT = FAM.getCachedResult<DominatorTreeAnalysis>(F);
2286 const UniformityInfo &UA = FAM.getResult<UniformityInfoAnalysis>(F);
2287 AMDGPUCodeGenPrepareImpl Impl(F, ATM, TLI, AC, DT, UA);
// Nothing changed: all analyses remain valid.
2288 if (!Impl.run())
2289 return PreservedAnalyses::all();
// If control flow was untouched, CFG-based analyses can be preserved.
2291 if (!Impl.FlowChanged)
2293 return PA;
2294 }
2295
2296INITIALIZE_PASS_BEGIN(AMDGPUCodeGenPrepare, DEBUG_TYPE,
2297 "AMDGPU IR optimizations", false, false)
2301INITIALIZE_PASS_END(AMDGPUCodeGenPrepare, DEBUG_TYPE, "AMDGPU IR optimizations",
2303
2304/// Create a workitem.id.x intrinsic call with range metadata.
2305CallInst *AMDGPUCodeGenPrepareImpl::createWorkitemIdX(IRBuilder<> &B) const {
2306 CallInst *Tid = B.CreateIntrinsic(Intrinsic::amdgcn_workitem_id_x, {});
2307 ST.makeLIDRangeMetadata(Tid);
2308 return Tid;
2309}
2310
2311 /// Replace the instruction with a direct workitem.id.x call.
2312 void AMDGPUCodeGenPrepareImpl::replaceWithWorkitemIdX(Instruction &I) const {
// Build the intrinsic call right before I, then substitute it for I.
2313 IRBuilder<> B(&I);
2314 CallInst *Tid = createWorkitemIdX(B);
2316 ReplaceInstWithValue(BI, Tid);
2317 }
2318
2319 /// Replace the instruction with (workitem.id.x & mask).
2320 void AMDGPUCodeGenPrepareImpl::replaceWithMaskedWorkitemIdX(
2321 Instruction &I, unsigned WaveSize) const {
2322 IRBuilder<> B(&I);
2323 CallInst *Tid = createWorkitemIdX(B);
// Lane ID within the wave: workitem.id.x & (WaveSize - 1). WaveSize is a
// power of two, so the mask keeps exactly the low lane bits.
2324 Constant *Mask = ConstantInt::get(Tid->getType(), WaveSize - 1);
2325 Value *AndInst = B.CreateAnd(Tid, Mask);
2327 ReplaceInstWithValue(BI, AndInst);
2328 }
2329
2330/// Try to optimize mbcnt instruction by replacing with workitem.id.x when
2331/// work group size allows direct computation of lane ID.
2332/// Returns true if optimization was applied, false otherwise.
2333bool AMDGPUCodeGenPrepareImpl::tryReplaceWithWorkitemId(Instruction &I,
2334 unsigned Wave) const {
2335 std::optional<unsigned> MaybeX = ST.getReqdWorkGroupSize(F, 0);
2336 if (!MaybeX)
2337 return false;
2338
2339 // When work group size == wave_size, each work group contains exactly one
2340 // wave, so the instruction can be replaced with workitem.id.x directly.
2341 if (*MaybeX == Wave) {
2342 replaceWithWorkitemIdX(I);
2343 return true;
2344 }
2345
2346 // When work group evenly splits into waves, compute lane ID within wave
2347 // using bit masking: lane_id = workitem.id.x & (wave_size - 1).
2348 if (ST.hasWavefrontsEvenlySplittingXDim(F, /*RequiresUniformYZ=*/true)) {
2349 replaceWithMaskedWorkitemIdX(I, Wave);
2350 return true;
2351 }
2352
2353 return false;
2354}
2355
2356 /// Optimize mbcnt.lo calls on wave32 architectures for lane ID computation.
2357 bool AMDGPUCodeGenPrepareImpl::visitMbcntLo(IntrinsicInst &I) const {
2358 // This optimization only applies to wave32 targets where mbcnt.lo operates on
2359 // the full execution mask.
2360 if (!ST.isWave32())
2361 return false;
2362
2363 // Only optimize the pattern mbcnt.lo(~0, 0) which counts active lanes with
2364 // lower IDs.
2365 if (!match(&I,
2367 return false;
2368
// On wave32 this count equals the lane ID; try replacing it with a direct
// workitem.id.x-based computation.
2369 return tryReplaceWithWorkitemId(I, ST.getWavefrontSize());
2370 }
2371
2372 /// Optimize mbcnt.hi calls for lane ID computation.
2373 bool AMDGPUCodeGenPrepareImpl::visitMbcntHi(IntrinsicInst &I) const {
2374 // Abort if wave size is not known at compile time.
2375 if (!ST.isWaveSizeKnown())
2376 return false;
2377
2378 unsigned Wave = ST.getWavefrontSize();
2379
2380 // On wave32, the upper 32 bits of execution mask are always 0, so
2381 // mbcnt.hi(mask, val) always returns val unchanged.
2382 if (ST.isWave32()) {
2383 if (auto MaybeX = ST.getReqdWorkGroupSize(F, 0)) {
2384 // Replace mbcnt.hi(mask, val) with val only when work group size matches
2385 // wave size (single wave per work group).
2386 if (*MaybeX == Wave) {
2388 ReplaceInstWithValue(BI, I.getArgOperand(1));
2389 return true;
2390 }
2391 }
2392 }
2393
2394 // Optimize the complete lane ID computation pattern:
2395 // mbcnt.hi(~0, mbcnt.lo(~0, 0)) which counts all active lanes with lower IDs
2396 // across the full execution mask.
2397 using namespace PatternMatch;
2398
2399 // Check for pattern: mbcnt.hi(~0, mbcnt.lo(~0, 0))
2402 m_AllOnes(), m_Zero()))))
2403 return false;
2404
// The full count equals the lane ID; try replacing it with a direct
// workitem.id.x-based computation.
2405 return tryReplaceWithWorkitemId(I, Wave);
2406 }
2407
// Pass identification token used by the legacy pass manager.
2408 char AMDGPUCodeGenPrepare::ID = 0;
2409
// Factory for the legacy pass-manager wrapper pass.
2411 return new AMDGPUCodeGenPrepare();
2412 }
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
static Value * insertValues(IRBuilder<> &Builder, Type *Ty, SmallVectorImpl< Value * > &Values)
static void extractValues(IRBuilder<> &Builder, SmallVectorImpl< Value * > &Values, Value *V)
static Value * getMulHu(IRBuilder<> &Builder, Value *LHS, Value *RHS)
static bool isInterestingPHIIncomingValue(const Value *V)
static SelectInst * findSelectThroughCast(Value *V, CastInst *&Cast)
static std::pair< Value *, Value * > getMul64(IRBuilder<> &Builder, Value *LHS, Value *RHS)
static Value * emitRsqIEEE1ULP(IRBuilder<> &Builder, Value *Src, bool IsNegative)
Emit an expansion of 1.0 / sqrt(Src) good for 1ulp that supports denormals.
static Value * getSign32(Value *V, IRBuilder<> &Builder, const DataLayout DL)
static void collectPHINodes(const PHINode &I, SmallPtrSet< const PHINode *, 8 > &SeenPHIs)
static bool isPtrKnownNeverNull(const Value *V, const DataLayout &DL, const AMDGPUTargetMachine &TM, unsigned AS)
static bool areInSameBB(const Value *A, const Value *B)
static cl::opt< bool > WidenLoads("amdgpu-late-codegenprepare-widen-constant-loads", cl::desc("Widen sub-dword constant address space loads in " "AMDGPULateCodeGenPrepare"), cl::ReallyHidden, cl::init(true))
The AMDGPU TargetMachine interface definition for hw codegen targets.
@ Scaled
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
#define X(NUM, ENUM, NAME)
Definition ELF.h:851
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
dxil translate DXIL Translate Metadata
static bool runOnFunction(Function &F, bool PostInlining)
#define DEBUG_TYPE
#define F(x, y, z)
Definition MD5.cpp:54
#define I(x, y, z)
Definition MD5.cpp:57
#define T
ConstantRange Range(APInt(BitWidth, Low), APInt(BitWidth, High))
FunctionAnalysisManager FAM
#define INITIALIZE_PASS_DEPENDENCY(depName)
Definition PassSupport.h:42
#define INITIALIZE_PASS_END(passName, arg, name, cfg, analysis)
Definition PassSupport.h:44
#define INITIALIZE_PASS_BEGIN(passName, arg, name, cfg, analysis)
Definition PassSupport.h:39
const SmallVectorImpl< MachineOperand > & Cond
static void visit(BasicBlock &Start, std::function< bool(BasicBlock *)> op)
This file implements a set that has insertion order iteration characteristics.
static TableGen::Emitter::Opt Y("gen-skeleton-entry", EmitSkeleton, "Generate example skeleton entry")
static cl::opt< cl::boolOrDefault > EnableGlobalISelOption("global-isel", cl::Hidden, cl::desc("Enable the \"global\" instruction selector"))
Target-Independent Code Generator Pass Configuration Options pass.
This pass exposes codegen information to IR-level passes.
LLVM IR instance of the generic uniformity analysis.
static std::optional< unsigned > getOpcode(ArrayRef< VPValue * > Values)
Returns the opcode of Values or ~0 if they do not all agree.
Definition VPlanSLP.cpp:247
Value * RHS
Value * LHS
BinaryOperator * Mul
VectorSlice(Type *Ty, unsigned Idx, unsigned NumElts)
Value * getSlicedVal(BasicBlock *BB, Value *Inc, StringRef NewValName)
Slice Inc according to the information contained within this slice.
PreservedAnalyses run(Function &, FunctionAnalysisManager &)
std::optional< unsigned > getReqdWorkGroupSize(const Function &F, unsigned Dim) const
bool hasWavefrontsEvenlySplittingXDim(const Function &F, bool REquiresUniformYZ=false) const
unsigned getWavefrontSize() const
static APFloat getOne(const fltSemantics &Sem, bool Negative=false)
Factory for Positive and Negative One.
Definition APFloat.h:1143
static APFloat getSmallestNormalized(const fltSemantics &Sem, bool Negative=false)
Returns the smallest (by magnitude) normalized finite number in the given semantics.
Definition APFloat.h:1213
opStatus next(bool nextDown)
Definition APFloat.h:1309
This class represents a conversion between pointers from one address space to another.
Represent the analysis usage information of a pass.
AnalysisUsage & addRequired()
void setPreservesAll()
Set by analyses that do not transform their input at all.
A function analysis which provides an AssumptionCache.
An immutable pass that tracks lazily created AssumptionCache objects.
A cache of @llvm.assume calls within a function.
LLVM Basic Block Representation.
Definition BasicBlock.h:62
InstListType::iterator iterator
Instruction iterators...
Definition BasicBlock.h:170
const Instruction * getTerminator() const LLVM_READONLY
Returns the terminator instruction; assumes that the block is well-formed.
Definition BasicBlock.h:237
BinaryOps getOpcode() const
Definition InstrTypes.h:374
BitVector & set()
Definition BitVector.h:370
bool all() const
all - Returns true if all bits are set.
Definition BitVector.h:194
Represents analyses that only rely on functions' control flow.
Definition Analysis.h:73
This class represents a function call, abstracting a target machine's calling convention.
This is the base class for all instructions that perform data casts.
Definition InstrTypes.h:448
Instruction::CastOps getOpcode() const
Return the opcode of this CastInst.
Definition InstrTypes.h:610
TargetTransformInfo getTargetTransformInfo(const Function &F) const override
Get a TargetTransformInfo implementation for the target.
static ConstantAsMetadata * get(Constant *C)
Definition Metadata.h:537
static LLVM_ABI Constant * getInfinity(Type *Ty, bool Negative=false)
static LLVM_ABI Constant * getZero(Type *Ty, bool Negative=false)
LLVM_ABI bool isExactlyValue(const APFloat &V) const
We don't rely on operator== working on double values, as it returns true for things that are clearly ...
This is an important base class in LLVM.
Definition Constant.h:43
static LLVM_ABI Constant * getAllOnesValue(Type *Ty)
static LLVM_ABI Constant * getNullValue(Type *Ty)
Constructor to create a '0' constant of arbitrary type.
A parsed version of the target data layout string in and methods for querying it.
Definition DataLayout.h:64
Analysis pass which computes a DominatorTree.
Definition Dominators.h:278
Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree.
Definition Dominators.h:159
Utility class for floating point operations which can have information about relaxed accuracy require...
Definition Operator.h:200
FastMathFlags getFastMathFlags() const
Convenience function for getting all the fast-math flags.
Definition Operator.h:333
LLVM_ABI float getFPAccuracy() const
Get the maximum error permitted by this operation in ULPs.
Convenience struct for specifying and reasoning about fast-math flags.
Definition FMF.h:23
void setFast(bool B=true)
Definition FMF.h:99
bool noInfs() const
Definition FMF.h:69
bool allowReciprocal() const
Definition FMF.h:71
bool approxFunc() const
Definition FMF.h:73
void setNoNaNs(bool B=true)
Definition FMF.h:81
bool noNaNs() const
Definition FMF.h:68
bool allowContract() const
Definition FMF.h:72
unsigned getNumElements() const
static LLVM_ABI FixedVectorType * get(Type *ElementType, unsigned NumElts)
Definition Type.cpp:873
FunctionPass class - This class is used to implement most global optimizations.
Definition Pass.h:314
bool isWave32() const
bool isWaveSizeKnown() const
Returns if the wavesize of this subtarget is known reliable.
bool hasFractBug() const
bool isUniform(ConstValueRefT V) const
Whether V is uniform/non-divergent.
Value * CreateInsertElement(Type *VecTy, Value *NewElt, Value *Idx, const Twine &Name="")
Definition IRBuilder.h:2584
Value * CreateFDiv(Value *L, Value *R, const Twine &Name="", MDNode *FPMD=nullptr)
Definition IRBuilder.h:1696
Value * CreateSIToFP(Value *V, Type *DestTy, const Twine &Name="")
Definition IRBuilder.h:2150
Value * CreateExtractElement(Value *Vec, Value *Idx, const Twine &Name="")
Definition IRBuilder.h:2572
IntegerType * getIntNTy(unsigned N)
Fetch the type representing an N-bit integer.
Definition IRBuilder.h:592
Value * CreateZExtOrTrunc(Value *V, Type *DestTy, const Twine &Name="")
Create a ZExt or Trunc from the integer value V to DestTy.
Definition IRBuilder.h:2095
Value * CreateExtractValue(Value *Agg, ArrayRef< unsigned > Idxs, const Twine &Name="")
Definition IRBuilder.h:2631
LLVM_ABI Value * CreateSelect(Value *C, Value *True, Value *False, const Twine &Name="", Instruction *MDFrom=nullptr)
Value * CreateFPToUI(Value *V, Type *DestTy, const Twine &Name="")
Definition IRBuilder.h:2123
Value * CreateSExt(Value *V, Type *DestTy, const Twine &Name="")
Definition IRBuilder.h:2089
IntegerType * getInt32Ty()
Fetch the type representing a 32-bit integer.
Definition IRBuilder.h:579
Value * CreateUIToFP(Value *V, Type *DestTy, const Twine &Name="", bool IsNonNeg=false)
Definition IRBuilder.h:2137
void setFastMathFlags(FastMathFlags NewFMF)
Set the fast-math flags to be used with generated fp-math operators.
Definition IRBuilder.h:345
Value * CreateFCmpOLT(Value *LHS, Value *RHS, const Twine &Name="", MDNode *FPMathTag=nullptr)
Definition IRBuilder.h:2386
void SetCurrentDebugLocation(DebugLoc L)
Set location information used by debugging information.
Definition IRBuilder.h:247
Value * CreateNeg(Value *V, const Twine &Name="", bool HasNSW=false)
Definition IRBuilder.h:1811
LLVM_ABI Value * createIsFPClass(Value *FPNum, unsigned Test)
LLVM_ABI CallInst * CreateIntrinsic(Intrinsic::ID ID, ArrayRef< Type * > Types, ArrayRef< Value * > Args, FMFSource FMFSource={}, const Twine &Name="")
Create a call to intrinsic ID with Args, mangled using Types.
ConstantInt * getInt32(uint32_t C)
Get a constant 32-bit value.
Definition IRBuilder.h:522
Value * CreateSub(Value *LHS, Value *RHS, const Twine &Name="", bool HasNUW=false, bool HasNSW=false)
Definition IRBuilder.h:1446
Value * CreateFMA(Value *Factor1, Value *Factor2, Value *Summand, FMFSource FMFSource={}, const Twine &Name="")
Create call to the fma intrinsic.
Definition IRBuilder.h:1100
Value * CreateBitCast(Value *V, Type *DestTy, const Twine &Name="")
Definition IRBuilder.h:2199
LLVM_ABI CallInst * CreateUnaryIntrinsic(Intrinsic::ID ID, Value *V, FMFSource FMFSource={}, const Twine &Name="")
Create a call to intrinsic ID with 1 operand which is mangled on its type.
LoadInst * CreateLoad(Type *Ty, Value *Ptr, const char *Name)
Provided to resolve 'CreateLoad(Ty, Ptr, "...")' correctly, instead of converting the string to 'bool...
Definition IRBuilder.h:1877
Value * CreateShl(Value *LHS, Value *RHS, const Twine &Name="", bool HasNUW=false, bool HasNSW=false)
Definition IRBuilder.h:1518
FastMathFlags getFastMathFlags() const
Get the flags to be applied to created floating point ops.
Definition IRBuilder.h:334
Value * CreateZExt(Value *V, Type *DestTy, const Twine &Name="", bool IsNonNeg=false)
Definition IRBuilder.h:2077
Value * CreateFCmpOEQ(Value *LHS, Value *RHS, const Twine &Name="", MDNode *FPMathTag=nullptr)
Definition IRBuilder.h:2371
Value * CreateAnd(Value *LHS, Value *RHS, const Twine &Name="")
Definition IRBuilder.h:1577
Value * CreateAdd(Value *LHS, Value *RHS, const Twine &Name="", bool HasNUW=false, bool HasNSW=false)
Definition IRBuilder.h:1429
Type * getFloatTy()
Fetch the type representing a 32-bit floating point value.
Definition IRBuilder.h:607
CallInst * CreateCall(FunctionType *FTy, Value *Callee, ArrayRef< Value * > Args={}, const Twine &Name="", MDNode *FPMathTag=nullptr)
Definition IRBuilder.h:2510
Value * CreateTrunc(Value *V, Type *DestTy, const Twine &Name="", bool IsNUW=false, bool IsNSW=false)
Definition IRBuilder.h:2063
Value * CreateBinOp(Instruction::BinaryOps Opc, Value *LHS, Value *RHS, const Twine &Name="", MDNode *FPMathTag=nullptr)
Definition IRBuilder.h:1734
Value * CreateICmpUGE(Value *LHS, Value *RHS, const Twine &Name="")
Definition IRBuilder.h:2343
void SetInsertPoint(BasicBlock *TheBB)
This specifies that created instructions should be appended to the end of the specified block.
Definition IRBuilder.h:207
Value * CreateAShr(Value *LHS, Value *RHS, const Twine &Name="", bool isExact=false)
Definition IRBuilder.h:1558
Value * CreateXor(Value *LHS, Value *RHS, const Twine &Name="")
Definition IRBuilder.h:1625
Value * CreateFMul(Value *L, Value *R, const Twine &Name="", MDNode *FPMD=nullptr)
Definition IRBuilder.h:1677
Value * CreateFNeg(Value *V, const Twine &Name="", MDNode *FPMathTag=nullptr)
Definition IRBuilder.h:1820
Value * CreateOr(Value *LHS, Value *RHS, const Twine &Name="", bool IsDisjoint=false)
Definition IRBuilder.h:1599
Value * CreateSExtOrTrunc(Value *V, Type *DestTy, const Twine &Name="")
Create a SExt or Trunc from the integer value V to DestTy.
Definition IRBuilder.h:2110
Value * CreateFMulFMF(Value *L, Value *R, FMFSource FMFSource, const Twine &Name="", MDNode *FPMD=nullptr)
Definition IRBuilder.h:1682
Value * CreateMul(Value *LHS, Value *RHS, const Twine &Name="", bool HasNUW=false, bool HasNSW=false)
Definition IRBuilder.h:1463
Value * CreateFCmpOGE(Value *LHS, Value *RHS, const Twine &Name="", MDNode *FPMathTag=nullptr)
Definition IRBuilder.h:2381
Value * CreateFPToSI(Value *V, Type *DestTy, const Twine &Name="")
Definition IRBuilder.h:2130
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
Definition IRBuilder.h:2811
Base class for instruction visitors.
Definition InstVisitor.h:78
LLVM_ABI void copyFastMathFlags(FastMathFlags FMF)
Convenience function for transferring all fast-math flag values to this instruction,...
const DebugLoc & getDebugLoc() const
Return the debug location for this node as a DebugLoc.
A wrapper class for inspecting calls to intrinsic functions.
This is an important class for using LLVM in a threaded context.
Definition LLVMContext.h:68
An instruction for reading from memory.
static MDTuple * get(LLVMContext &Context, ArrayRef< Metadata * > MDs)
Definition Metadata.h:1572
static LLVM_ABI PoisonValue * get(Type *T)
Static factory methods - Return an 'poison' object of the specified type.
A set of analyses that are preserved following a run of a transformation pass.
Definition Analysis.h:112
static PreservedAnalyses none()
Convenience factory function for the empty preserved set.
Definition Analysis.h:115
static PreservedAnalyses all()
Construct a special preserved set that preserves all passes.
Definition Analysis.h:118
PreservedAnalyses & preserveSet()
Mark an analysis set as preserved.
Definition Analysis.h:151
This class represents the LLVM 'select' instruction.
const Value * getFalseValue() const
const Value * getCondition() const
const Value * getTrueValue() const
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
StringRef - Represent a constant reference to a string, i.e.
Definition StringRef.h:55
Analysis pass providing the TargetLibraryInfo.
Provides information about what library functions are available for the current target.
const STC & getSubtarget(const Function &F) const
This method returns a pointer to the specified type of TargetSubtargetInfo.
static LLVM_ABI CastContextHint getCastContextHint(const Instruction *I)
Calculates a CastContextHint from I.
LLVM_ABI InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, TTI::CastContextHint CCH, TTI::TargetCostKind CostKind=TTI::TCK_SizeAndLatency, const Instruction *I=nullptr) const
@ TCK_RecipThroughput
Reciprocal throughput.
LLVM_ABI InstructionCost getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, TTI::OperandValueInfo Opd1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Opd2Info={TTI::OK_AnyValue, TTI::OP_None}, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr, const TargetLibraryInfo *TLibInfo=nullptr) const
This is an approximation of reciprocal throughput of a math/logic op.
The instances of the Type class are immutable: once they are created, they are never changed.
Definition Type.h:46
static LLVM_ABI IntegerType * getInt64Ty(LLVMContext &C)
Definition Type.cpp:314
LLVM_ABI unsigned getIntegerBitWidth() const
static LLVM_ABI IntegerType * getInt32Ty(LLVMContext &C)
Definition Type.cpp:313
bool isFloatTy() const
Return true if this is 'float', a 32-bit IEEE fp type.
Definition Type.h:155
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
Definition Type.h:370
LLVM_ABI Type * getWithNewBitWidth(unsigned NewBitWidth) const
Given an integer or vector type, change the lane bitwidth to NewBitwidth, whilst keeping the old numb...
bool isHalfTy() const
Return true if this is 'half', a 16-bit IEEE fp type.
Definition Type.h:144
LLVM_ABI unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
Definition Type.cpp:236
bool isDoubleTy() const
Return true if this is 'double', a 64-bit IEEE fp type.
Definition Type.h:158
LLVM_ABI const fltSemantics & getFltSemantics() const
Definition Type.cpp:110
Analysis pass which computes UniformityInfo.
Legacy analysis pass which computes a CycleInfo.
void setOperand(unsigned i, Value *Val)
Definition User.h:212
Value * getOperand(unsigned i) const
Definition User.h:207
LLVM Value Representation.
Definition Value.h:75
Type * getType() const
All values are typed, get the type of this value.
Definition Value.h:255
bool hasOneUse() const
Return true if there is exactly one use of this value.
Definition Value.h:439
LLVM_ABI void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
Definition Value.cpp:549
LLVM_ABI void takeName(Value *V)
Transfer the name from V to this value.
Definition Value.cpp:399
Type * getElementType() const
const ParentTy * getParent() const
Definition ilist_node.h:34
self_iterator getIterator()
Definition ilist_node.h:123
Changed
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
@ CONSTANT_ADDRESS_32BIT
Address space for 32-bit constant memory.
@ LOCAL_ADDRESS
Address space for local memory.
@ CONSTANT_ADDRESS
Address space for constant memory (VTX2).
@ FLAT_ADDRESS
Address space for flat memory.
@ PRIVATE_ADDRESS
Address space for private memory.
constexpr char Align[]
Key for Kernel::Arg::Metadata::mAlign.
constexpr int64_t getNullPointerValue(unsigned AS)
Get the null pointer value for the given address space.
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
unsigned ID
LLVM IR allows to use arbitrary numbers as calling convention identifiers.
Definition CallingConv.h:24
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
@ FMAD
FMAD - Perform a * b + c, while getting the same result as the separately rounded operations.
Definition ISDOpcodes.h:522
LLVM_ABI Function * getOrInsertDeclaration(Module *M, ID id, ArrayRef< Type * > OverloadTys={})
Look up the Function declaration of the intrinsic id in the Module M.
match_combine_or< Ty... > m_CombineOr(const Ty &...Ps)
Combine pattern matchers matching any of Ps patterns.
cst_pred_ty< is_all_ones > m_AllOnes()
Match an integer or vector with all bits set.
MaxMin_match< FCmpInst, LHS, RHS, ufmin_pred_ty > m_UnordFMin(const LHS &L, const RHS &R)
Match an 'unordered' floating point minimum function.
CmpClass_match< LHS, RHS, FCmpInst > m_FCmp(CmpPredicate &Pred, const LHS &L, const RHS &R)
match_combine_or< typename m_Intrinsic_Ty< Opnd0, Opnd1 >::Ty, typename m_Intrinsic_Ty< Opnd0, Opnd1 >::Ty > m_FMinNum_or_FMinimumNum(const Opnd0 &Op0, const Opnd1 &Op1)
BinaryOp_match< LHS, RHS, Instruction::FSub > m_FSub(const LHS &L, const RHS &R)
bool match(Val *V, const Pattern &P)
specificval_ty m_Specific(const Value *V)
Match if we have a specific specified value.
ap_match< APFloat > m_APFloatAllowPoison(const APFloat *&Res)
Match APFloat while allowing poison in splat vector constants.
IntrinsicID_match m_Intrinsic()
Match intrinsic calls like this: m_Intrinsic<Intrinsic::fabs>(m_Value(X))
ThreeOps_match< Cond, LHS, RHS, Instruction::Select > m_Select(const Cond &C, const LHS &L, const RHS &R)
Matches SelectInst.
m_Intrinsic_Ty< Opnd0, Opnd1 >::Ty m_FMinimum(const Opnd0 &Op0, const Opnd1 &Op1)
auto m_Value()
Match an arbitrary value and ignore it.
deferredval_ty< Value > m_Deferred(Value *const &V)
Like m_Specific(), but works if the specific value to match is determined as part of the same match()...
cstfp_pred_ty< is_nonnan > m_NonNaN()
Match a non-NaN FP constant.
cstfp_pred_ty< is_signed_inf< false > > m_PosInf()
Match a positive infinity FP constant.
cstfp_pred_ty< is_pos_zero_fp > m_PosZeroFP()
Match a floating-point positive zero.
is_zero m_Zero()
Match any null constant or a vector with all elements equal to 0.
m_Intrinsic_Ty< Opnd0 >::Ty m_FAbs(const Opnd0 &Op0)
initializer< Ty > init(const Ty &Val)
std::enable_if_t< detail::IsValidPointer< X, Y >::value, X * > extract(Y &&MD)
Extract a Value from Metadata.
Definition Metadata.h:668
constexpr double ln2
constexpr double ln10
This is an optimization pass for GlobalISel generic memory operations.
GenericUniformityInfo< SSAContext > UniformityInfo
FunctionAddr VTableAddr Value
Definition InstrProf.h:137
LLVM_ABI KnownFPClass computeKnownFPClass(const Value *V, const APInt &DemandedElts, FPClassTest InterestedClasses, const SimplifyQuery &SQ, unsigned Depth=0)
Determine which floating-point classes are valid for V, and return them in KnownFPClass bit sets.
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1739
LLVM_ABI bool RecursivelyDeleteTriviallyDeadInstructions(Value *V, const TargetLibraryInfo *TLI=nullptr, MemorySSAUpdater *MSSAU=nullptr, std::function< void(Value *)> AboutToDeleteCallback=std::function< void(Value *)>())
If the specified value is a trivially dead instruction, delete it.
Definition Local.cpp:535
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B,...
Definition STLExtras.h:2554
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:643
LLVM_ABI bool expandRemainderUpTo64Bits(BinaryOperator *Rem)
Generate code to calculate the remainder of two integers, replacing Rem with the generated code.
iterator_range< early_inc_iterator_impl< detail::IterOfRange< RangeT > > > make_early_inc_range(RangeT &&Range)
Make a range that does early increment to allow mutation of the underlying range without disrupting i...
Definition STLExtras.h:634
constexpr T alignDown(U Value, V Align, W Skew=0)
Returns the largest unsigned integer less than or equal to Value and is Skew mod Align.
Definition MathExtras.h:546
LLVM_ABI void ReplaceInstWithValue(BasicBlock::iterator &BI, Value *V)
Replace all uses of an instruction (specified by BI) with a value, then remove and delete the origina...
T bit_ceil(T Value)
Returns the smallest integral power of two no smaller than Value if Value is nonzero.
Definition bit.h:360
auto dyn_cast_or_null(const Y &Val)
Definition Casting.h:753
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1746
LLVM_ABI bool isInstructionTriviallyDead(Instruction *I, const TargetLibraryInfo *TLI=nullptr)
Return true if the result produced by the instruction is not used, and the instruction will return.
Definition Local.cpp:403
auto reverse(ContainerTy &&C)
Definition STLExtras.h:408
LLVM_ABI bool expandDivisionUpTo64Bits(BinaryOperator *Div)
Generate code to divide two integers, replacing Div with the generated code.
FPClassTest
Floating-point class tests, supported by 'is_fpclass' intrinsic.
LLVM_ABI void computeKnownBits(const Value *V, KnownBits &Known, const DataLayout &DL, AssumptionCache *AC=nullptr, const Instruction *CxtI=nullptr, const DominatorTree *DT=nullptr, bool UseInstrInfo=true, unsigned Depth=0)
Determine which bits of V are known to be either zero or one and return them in the KnownZero/KnownOn...
constexpr uint64_t alignTo(uint64_t Size, Align A)
Returns a multiple of A needed to store Size bytes.
Definition Alignment.h:144
LLVM_ABI Constant * ConstantFoldCastOperand(unsigned Opcode, Constant *C, Type *DestTy, const DataLayout &DL)
Attempt to constant fold a cast with the specified operand.
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference s...
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition Casting.h:547
LLVM_ABI Constant * ConstantFoldBinaryOpOperands(unsigned Opcode, Constant *LHS, Constant *RHS, const DataLayout &DL)
Attempt to constant fold a binary operation with the specified operands.
TargetTransformInfo TTI
IRBuilder(LLVMContext &, FolderTy, InserterTy, MDNode *, ArrayRef< OperandBundleDef >) -> IRBuilder< FolderTy, InserterTy >
FunctionPass * createAMDGPUCodeGenPreparePass()
To bit_cast(const From &from) noexcept
Definition bit.h:90
DWARFExpression::Operation Op
LLVM_ABI unsigned ComputeNumSignBits(const Value *Op, const DataLayout &DL, AssumptionCache *AC=nullptr, const Instruction *CxtI=nullptr, const DominatorTree *DT=nullptr, bool UseInstrInfo=true, unsigned Depth=0)
Return the number of times the sign bit of the register is replicated into the other bits.
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:559
LLVM_ABI bool isKnownNeverNaN(const Value *V, const SimplifyQuery &SQ, unsigned Depth=0)
Return true if the floating-point scalar value is not a NaN or if the floating-point vector value has...
LLVM_ABI unsigned ComputeMaxSignificantBits(const Value *Op, const DataLayout &DL, AssumptionCache *AC=nullptr, const Instruction *CxtI=nullptr, const DominatorTree *DT=nullptr, unsigned Depth=0)
Get the upper bound on bit size for this Value Op as a signed integer.
unsigned Log2(Align A)
Returns the log2 of the alignment.
Definition Alignment.h:197
LLVM_ABI bool isKnownToBeAPowerOfTwo(const Value *V, const DataLayout &DL, bool OrZero=false, AssumptionCache *AC=nullptr, const Instruction *CxtI=nullptr, const DominatorTree *DT=nullptr, bool UseInstrInfo=true, unsigned Depth=0)
Return true if the given value is known to have exactly one bit set when defined.
AnalysisManager< Function > FunctionAnalysisManager
Convenience typedef for the Function analysis manager.
LLVM_ABI void getUnderlyingObjects(const Value *V, SmallVectorImpl< const Value * > &Objects, const LoopInfo *LI=nullptr, unsigned MaxLookup=MaxLookupSearchDepth)
This method is similar to getUnderlyingObject except that it can look through phi and select instruct...
LLVM_ABI CGPassBuilderOption getCGPassBuilderOption()
#define N
DenormalModeKind Input
Denormal treatment kind for floating point instruction inputs in the default floating-point environme...
constexpr bool inputsAreZero() const
Return true if input denormals must be implicitly treated as 0.
static constexpr DenormalMode getPreserveSign()
bool isNonNegative() const
Returns true if this value is known to be non-negative.
Definition KnownBits.h:108
unsigned countMinLeadingZeros() const
Returns the minimum number of leading zero bits.
Definition KnownBits.h:264
bool isNegative() const
Returns true if this value is known to be negative.
Definition KnownBits.h:105
bool isKnownNeverSubnormal() const
Return true if it's known this can never be a subnormal.
LLVM_ABI bool isKnownNeverLogicalZero(DenormalMode Mode) const
Return true if it's known this can never be interpreted as a zero.
bool isKnownNeverPosInfinity() const
Return true if it's known this can never be +infinity.
const DataLayout & DL
const DominatorTree * DT
SimplifyQuery getWithInstruction(const Instruction *I) const
AssumptionCache * AC