AMDGPUCodeGenPrepare.cpp
1//===-- AMDGPUCodeGenPrepare.cpp ------------------------------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file
10/// This pass does misc. AMDGPU optimizations on IR before instruction
11/// selection.
12//
13//===----------------------------------------------------------------------===//
14
15#include "AMDGPU.h"
16#include "AMDGPUTargetMachine.h"
17#include "SIModeRegisterDefaults.h"
18#include "llvm/ADT/SetVector.h"
26#include "llvm/IR/Dominators.h"
27#include "llvm/IR/IRBuilder.h"
28#include "llvm/IR/InstVisitor.h"
29#include "llvm/IR/IntrinsicsAMDGPU.h"
30#include "llvm/IR/PatternMatch.h"
31#include "llvm/IR/ValueHandle.h"
33#include "llvm/Pass.h"
38
39#define DEBUG_TYPE "amdgpu-codegenprepare"
40
41using namespace llvm;
42using namespace llvm::PatternMatch;
43
44namespace {
45
47 "amdgpu-codegenprepare-widen-constant-loads",
48 cl::desc("Widen sub-dword constant address space loads in AMDGPUCodeGenPrepare"),
50 cl::init(false));
51
52static cl::opt<bool>
53 BreakLargePHIs("amdgpu-codegenprepare-break-large-phis",
54 cl::desc("Break large PHI nodes for DAGISel"),
55 cl::ReallyHidden, cl::init(true));
56
57static cl::opt<bool>
58 ForceBreakLargePHIs("amdgpu-codegenprepare-force-break-large-phis",
59 cl::desc("For testing purposes, always break large "
60 "PHIs even if it isn't profitable."),
61 cl::ReallyHidden, cl::init(false));
62
63static cl::opt<unsigned> BreakLargePHIsThreshold(
64 "amdgpu-codegenprepare-break-large-phis-threshold",
65 cl::desc("Minimum type size in bits for breaking large PHI nodes"),
66 cl::ReallyHidden, cl::init(32));
67
68static cl::opt<bool> UseMul24Intrin(
69 "amdgpu-codegenprepare-mul24",
70 cl::desc("Introduce mul24 intrinsics in AMDGPUCodeGenPrepare"),
71 cl::ReallyHidden,
72 cl::init(true));
73
74// Legalize 64-bit division by using the generic IR expansion.
75static cl::opt<bool> ExpandDiv64InIR(
76 "amdgpu-codegenprepare-expand-div64",
77 cl::desc("Expand 64-bit division in AMDGPUCodeGenPrepare"),
78 cl::ReallyHidden,
79 cl::init(false));
80
81// Leave all division operations as they are. This supersedes ExpandDiv64InIR
82// and is used for testing the legalizer.
83static cl::opt<bool> DisableIDivExpand(
84 "amdgpu-codegenprepare-disable-idiv-expansion",
85 cl::desc("Prevent expanding integer division in AMDGPUCodeGenPrepare"),
86 cl::ReallyHidden,
87 cl::init(false));
88
89// Disable processing of fdiv so we can better test the backend implementations.
90static cl::opt<bool> DisableFDivExpand(
91 "amdgpu-codegenprepare-disable-fdiv-expansion",
92 cl::desc("Prevent expanding floating point division in AMDGPUCodeGenPrepare"),
93 cl::ReallyHidden,
94 cl::init(false));
95
96class AMDGPUCodeGenPrepareImpl
97 : public InstVisitor<AMDGPUCodeGenPrepareImpl, bool> {
98public:
99 Function &F;
100 const GCNSubtarget &ST;
101 const AMDGPUTargetMachine &TM;
102 const TargetLibraryInfo *TLI;
103 AssumptionCache *AC;
104 const DominatorTree *DT;
105 const UniformityInfo &UA;
106 const DataLayout &DL;
107 const bool HasFP32DenormalFlush;
108 bool FlowChanged = false;
109 mutable Function *SqrtF32 = nullptr;
110 mutable Function *LdexpF32 = nullptr;
111 mutable SmallVector<WeakVH> DeadVals;
112
113 DenseMap<const PHINode *, bool> BreakPhiNodesCache;
114
115 AMDGPUCodeGenPrepareImpl(Function &F, const AMDGPUTargetMachine &TM,
116 const TargetLibraryInfo *TLI, AssumptionCache *AC,
117 const DominatorTree *DT, const UniformityInfo &UA)
118 : F(F), ST(TM.getSubtarget<GCNSubtarget>(F)), TM(TM), TLI(TLI), AC(AC),
119 DT(DT), UA(UA), DL(F.getDataLayout()),
120 HasFP32DenormalFlush(SIModeRegisterDefaults(F, ST).FP32Denormals ==
121 DenormalMode::getPreserveSign()) {}
122
123 Function *getSqrtF32() const {
124 if (SqrtF32)
125 return SqrtF32;
126
127 LLVMContext &Ctx = F.getContext();
128 SqrtF32 = Intrinsic::getOrInsertDeclaration(
129 F.getParent(), Intrinsic::amdgcn_sqrt, {Type::getFloatTy(Ctx)});
130 return SqrtF32;
131 }
132
133 Function *getLdexpF32() const {
134 if (LdexpF32)
135 return LdexpF32;
136
137 LLVMContext &Ctx = F.getContext();
138 LdexpF32 = Intrinsic::getOrInsertDeclaration(
139 F.getParent(), Intrinsic::ldexp,
140 {Type::getFloatTy(Ctx), Type::getInt32Ty(Ctx)});
141 return LdexpF32;
142 }
143
144 bool canBreakPHINode(const PHINode &I);
145
146 /// Return true if \p T is a legal scalar floating point type.
147 bool isLegalFloatingTy(const Type *T) const;
148
149 /// Wrapper to pass all the arguments to computeKnownFPClass
150 KnownFPClass computeKnownFPClass(const Value *V, FPClassTest Interested,
151 const Instruction *CtxI) const {
152 return llvm::computeKnownFPClass(V, DL, Interested, TLI, AC, CtxI, DT);
153 }
154
155 bool canIgnoreDenormalInput(const Value *V, const Instruction *CtxI) const {
156 return HasFP32DenormalFlush ||
157 computeKnownFPClass(V, fcSubnormal, CtxI).isKnownNeverSubnormal();
158 }
159
160 /// \returns The minimum number of bits needed to store the value of \p Op as an
161 /// unsigned integer. Truncating to this size and then zero-extending to
162 /// the original size will not change the value.
163 unsigned numBitsUnsigned(Value *Op) const;
164
165 /// \returns The minimum number of bits needed to store the value of \p Op as a
166 /// signed integer. Truncating to this size and then sign-extending to
167 /// the original size will not change the value.
168 unsigned numBitsSigned(Value *Op) const;
169
170 /// Replace mul instructions with llvm.amdgcn.mul.u24 or llvm.amdgcn.mul.s24.
171 /// SelectionDAG has an issue where an and asserting the bits are known
172 bool replaceMulWithMul24(BinaryOperator &I) const;
173
174 /// Perform the same function as the equivalently named function in DAGCombiner.
175 /// Since we expand some divisions here, this must run before the expansion obscures the pattern.
176 bool foldBinOpIntoSelect(BinaryOperator &I) const;
177
178 bool divHasSpecialOptimization(BinaryOperator &I,
179 Value *Num, Value *Den) const;
180 unsigned getDivNumBits(BinaryOperator &I, Value *Num, Value *Den,
181 unsigned MaxDivBits, bool Signed) const;
182
183 /// Expands 24 bit div or rem.
184 Value* expandDivRem24(IRBuilder<> &Builder, BinaryOperator &I,
185 Value *Num, Value *Den,
186 bool IsDiv, bool IsSigned) const;
187
188 Value *expandDivRem24Impl(IRBuilder<> &Builder, BinaryOperator &I,
189 Value *Num, Value *Den, unsigned NumBits,
190 bool IsDiv, bool IsSigned) const;
191
192 /// Expands 32 bit div or rem.
193 Value* expandDivRem32(IRBuilder<> &Builder, BinaryOperator &I,
194 Value *Num, Value *Den) const;
195
196 Value *shrinkDivRem64(IRBuilder<> &Builder, BinaryOperator &I,
197 Value *Num, Value *Den) const;
198 void expandDivRem64(BinaryOperator &I) const;
199
200 /// Widen a scalar load.
201 ///
202 /// \details Widen a uniform, sub-dword load from constant memory to a full
203 /// 32 bits, then truncate the result, so that a scalar load can be selected
204 /// instead of a vector load.
205 ///
206 /// \returns True.
207
208 bool canWidenScalarExtLoad(LoadInst &I) const;
209
210 Value *matchFractPat(IntrinsicInst &I);
211 Value *applyFractPat(IRBuilder<> &Builder, Value *FractArg);
212
213 bool canOptimizeWithRsq(const FPMathOperator *SqrtOp, FastMathFlags DivFMF,
214 FastMathFlags SqrtFMF) const;
215
216 Value *optimizeWithRsq(IRBuilder<> &Builder, Value *Num, Value *Den,
217 FastMathFlags DivFMF, FastMathFlags SqrtFMF,
218 const Instruction *CtxI) const;
219
220 Value *optimizeWithRcp(IRBuilder<> &Builder, Value *Num, Value *Den,
221 FastMathFlags FMF, const Instruction *CtxI) const;
222 Value *optimizeWithFDivFast(IRBuilder<> &Builder, Value *Num, Value *Den,
223 float ReqdAccuracy) const;
224
225 Value *visitFDivElement(IRBuilder<> &Builder, Value *Num, Value *Den,
226 FastMathFlags DivFMF, FastMathFlags SqrtFMF,
227 Value *RsqOp, const Instruction *FDiv,
228 float ReqdAccuracy) const;
229
230 std::pair<Value *, Value *> getFrexpResults(IRBuilder<> &Builder,
231 Value *Src) const;
232
233 Value *emitRcpIEEE1ULP(IRBuilder<> &Builder, Value *Src,
234 bool IsNegative) const;
235 Value *emitFrexpDiv(IRBuilder<> &Builder, Value *LHS, Value *RHS,
236 FastMathFlags FMF) const;
237 Value *emitSqrtIEEE2ULP(IRBuilder<> &Builder, Value *Src,
238 FastMathFlags FMF) const;
239
240 bool tryNarrowMathIfNoOverflow(Instruction *I);
241
242public:
243 bool visitFDiv(BinaryOperator &I);
244
245 bool visitInstruction(Instruction &I) { return false; }
246 bool visitBinaryOperator(BinaryOperator &I);
247 bool visitLoadInst(LoadInst &I);
248 bool visitSelectInst(SelectInst &I);
249 bool visitPHINode(PHINode &I);
250 bool visitAddrSpaceCastInst(AddrSpaceCastInst &I);
251
252 bool visitIntrinsicInst(IntrinsicInst &I);
253 bool visitFMinLike(IntrinsicInst &I);
254 bool visitSqrt(IntrinsicInst &I);
255 bool run();
256};
257
258class AMDGPUCodeGenPrepare : public FunctionPass {
259public:
260 static char ID;
261 AMDGPUCodeGenPrepare() : FunctionPass(ID) {}
262 void getAnalysisUsage(AnalysisUsage &AU) const override {
263 AU.addRequired<AssumptionCacheTracker>();
264 AU.addRequired<UniformityInfoWrapperPass>();
265 AU.addRequired<TargetLibraryInfoWrapperPass>();
266
267 // FIXME: Division expansion needs to preserve the dominator tree.
268 if (!ExpandDiv64InIR)
269 AU.setPreservesAll();
270 }
271 bool runOnFunction(Function &F) override;
272 StringRef getPassName() const override { return "AMDGPU IR optimizations"; }
273};
274
275} // end anonymous namespace
276
277bool AMDGPUCodeGenPrepareImpl::run() {
278 BreakPhiNodesCache.clear();
279 bool MadeChange = false;
280
281 // Need to use make_early_inc_range because integer division expansion is
282 // handled by Transforms/Utils, and it can delete instructions such as the
283 // terminator of the BB.
284 for (BasicBlock &BB : reverse(F)) {
285 for (Instruction &I : make_early_inc_range(reverse(BB))) {
286 if (!isInstructionTriviallyDead(&I, TLI))
287 MadeChange |= visit(I);
288 }
289 }
290
291 while (!DeadVals.empty()) {
292 if (auto *I = dyn_cast_or_null<Instruction>(DeadVals.pop_back_val()))
293 RecursivelyDeleteTriviallyDeadInstructions(I);
294 }
295
296 return MadeChange;
297}
298
299bool AMDGPUCodeGenPrepareImpl::isLegalFloatingTy(const Type *Ty) const {
300 return Ty->isFloatTy() || Ty->isDoubleTy() ||
301 (Ty->isHalfTy() && ST.has16BitInsts());
302}
303
304bool AMDGPUCodeGenPrepareImpl::canWidenScalarExtLoad(LoadInst &I) const {
305 Type *Ty = I.getType();
306 int TySize = DL.getTypeSizeInBits(Ty);
307 Align Alignment = DL.getValueOrABITypeAlignment(I.getAlign(), Ty);
308
309 return I.isSimple() && TySize < 32 && Alignment >= 4 && UA.isUniform(&I);
310}
311
312unsigned AMDGPUCodeGenPrepareImpl::numBitsUnsigned(Value *Op) const {
313 return computeKnownBits(Op, DL, AC).countMaxActiveBits();
314}
315
316unsigned AMDGPUCodeGenPrepareImpl::numBitsSigned(Value *Op) const {
317 return ComputeMaxSignificantBits(Op, DL, AC);
318}
319
320static void extractValues(IRBuilder<> &Builder,
321 SmallVectorImpl<Value *> &Values, Value *V) {
322 auto *VT = dyn_cast<FixedVectorType>(V->getType());
323 if (!VT) {
324 Values.push_back(V);
325 return;
326 }
327
328 for (int I = 0, E = VT->getNumElements(); I != E; ++I)
329 Values.push_back(Builder.CreateExtractElement(V, I));
330}
331
332static Value *insertValues(IRBuilder<> &Builder,
333 Type *Ty,
334 SmallVectorImpl<Value *> &Values) {
335 if (!Ty->isVectorTy()) {
336 assert(Values.size() == 1);
337 return Values[0];
338 }
339
340 Value *NewVal = PoisonValue::get(Ty);
341 for (int I = 0, E = Values.size(); I != E; ++I)
342 NewVal = Builder.CreateInsertElement(NewVal, Values[I], I);
343
344 return NewVal;
345}
346
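// Illustrative sketch of the rewrite below (not the exact emitted IR): a
// divergent 32-bit multiply whose operands are known to fit in 24 bits, e.g.
//   %mul = mul i32 %a, %b                ; known: %a, %b < 2^24
// becomes
//   %mul = call i32 @llvm.amdgcn.mul.u24.i32(i32 %a, i32 %b)
// (llvm.amdgcn.mul.i24 for the signed case, and an i64 intrinsic return type
// when the original result is wider than 32 bits).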
347bool AMDGPUCodeGenPrepareImpl::replaceMulWithMul24(BinaryOperator &I) const {
348 if (I.getOpcode() != Instruction::Mul)
349 return false;
350
351 Type *Ty = I.getType();
352 unsigned Size = Ty->getScalarSizeInBits();
353 if (Size <= 16 && ST.has16BitInsts())
354 return false;
355
356 // Prefer scalar if this could be s_mul_i32
357 if (UA.isUniform(&I))
358 return false;
359
360 Value *LHS = I.getOperand(0);
361 Value *RHS = I.getOperand(1);
362 IRBuilder<> Builder(&I);
363 Builder.SetCurrentDebugLocation(I.getDebugLoc());
364
365 unsigned LHSBits = 0, RHSBits = 0;
366 bool IsSigned = false;
367
368 if (ST.hasMulU24() && (LHSBits = numBitsUnsigned(LHS)) <= 24 &&
369 (RHSBits = numBitsUnsigned(RHS)) <= 24) {
370 IsSigned = false;
371
372 } else if (ST.hasMulI24() && (LHSBits = numBitsSigned(LHS)) <= 24 &&
373 (RHSBits = numBitsSigned(RHS)) <= 24) {
374 IsSigned = true;
375
376 } else
377 return false;
378
379 SmallVector<Value *, 4> LHSVals;
380 SmallVector<Value *, 4> RHSVals;
381 SmallVector<Value *, 4> ResultVals;
382 extractValues(Builder, LHSVals, LHS);
383 extractValues(Builder, RHSVals, RHS);
384
385 IntegerType *I32Ty = Builder.getInt32Ty();
386 IntegerType *IntrinTy = Size > 32 ? Builder.getInt64Ty() : I32Ty;
387 Type *DstTy = LHSVals[0]->getType();
388
389 for (int I = 0, E = LHSVals.size(); I != E; ++I) {
390 Value *LHS = IsSigned ? Builder.CreateSExtOrTrunc(LHSVals[I], I32Ty)
391 : Builder.CreateZExtOrTrunc(LHSVals[I], I32Ty);
392 Value *RHS = IsSigned ? Builder.CreateSExtOrTrunc(RHSVals[I], I32Ty)
393 : Builder.CreateZExtOrTrunc(RHSVals[I], I32Ty);
394 Intrinsic::ID ID =
395 IsSigned ? Intrinsic::amdgcn_mul_i24 : Intrinsic::amdgcn_mul_u24;
396 Value *Result = Builder.CreateIntrinsic(ID, {IntrinTy}, {LHS, RHS});
397 Result = IsSigned ? Builder.CreateSExtOrTrunc(Result, DstTy)
398 : Builder.CreateZExtOrTrunc(Result, DstTy);
399 ResultVals.push_back(Result);
400 }
401
402 Value *NewVal = insertValues(Builder, Ty, ResultVals);
403 NewVal->takeName(&I);
404 I.replaceAllUsesWith(NewVal);
405 DeadVals.push_back(&I);
406
407 return true;
408}
409
410// Find a select instruction, which may have been cast. This is mostly to deal
411// with cases where i16 selects were promoted here to i32.
412static SelectInst *findSelectThroughCast(Value *V, CastInst *&Cast) {
413 Cast = nullptr;
414 if (SelectInst *Sel = dyn_cast<SelectInst>(V))
415 return Sel;
416
417 if ((Cast = dyn_cast<CastInst>(V))) {
418 if (SelectInst *Sel = dyn_cast<SelectInst>(Cast->getOperand(0)))
419 return Sel;
420 }
421
422 return nullptr;
423}
424
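// Illustrative sketch of the fold below (constants chosen arbitrarily):
//   %sel = select i1 %c, i32 16, i32 32     ; single use
//   %div = udiv i32 1024, %sel
// constant-folds both arms and becomes
//   %div = select i1 %c, i32 64, i32 32
// i.e. the binary operator disappears and only the select remains.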
425bool AMDGPUCodeGenPrepareImpl::foldBinOpIntoSelect(BinaryOperator &BO) const {
426 // Don't do this unless the old select is going away. We want to eliminate the
427 // binary operator, not replace a binop with a select.
428 int SelOpNo = 0;
429
430 CastInst *CastOp;
431
432 // TODO: Should probably try to handle some cases with multiple
433 // users. Duplicating the select may be profitable for division.
434 SelectInst *Sel = findSelectThroughCast(BO.getOperand(0), CastOp);
435 if (!Sel || !Sel->hasOneUse()) {
436 SelOpNo = 1;
437 Sel = findSelectThroughCast(BO.getOperand(1), CastOp);
438 }
439
440 if (!Sel || !Sel->hasOneUse())
441 return false;
442
443 Constant *CT = dyn_cast<Constant>(Sel->getTrueValue());
444 Constant *CF = dyn_cast<Constant>(Sel->getFalseValue());
445 Constant *CBO = dyn_cast<Constant>(BO.getOperand(SelOpNo ^ 1));
446 if (!CBO || !CT || !CF)
447 return false;
448
449 if (CastOp) {
450 if (!CastOp->hasOneUse())
451 return false;
452 CT = ConstantFoldCastOperand(CastOp->getOpcode(), CT, BO.getType(), DL);
453 CF = ConstantFoldCastOperand(CastOp->getOpcode(), CF, BO.getType(), DL);
454 }
455
456 // TODO: Handle special 0/-1 cases DAG combine does, although we only really
457 // need to handle divisions here.
458 Constant *FoldedT =
459 SelOpNo ? ConstantFoldBinaryOpOperands(BO.getOpcode(), CBO, CT, DL)
460 : ConstantFoldBinaryOpOperands(BO.getOpcode(), CT, CBO, DL);
461 if (!FoldedT || isa<ConstantExpr>(FoldedT))
462 return false;
463
464 Constant *FoldedF =
465 SelOpNo ? ConstantFoldBinaryOpOperands(BO.getOpcode(), CBO, CF, DL)
466 : ConstantFoldBinaryOpOperands(BO.getOpcode(), CF, CBO, DL);
467 if (!FoldedF || isa<ConstantExpr>(FoldedF))
468 return false;
469
470 IRBuilder<> Builder(&BO);
471 Builder.SetCurrentDebugLocation(BO.getDebugLoc());
472 if (const FPMathOperator *FPOp = dyn_cast<const FPMathOperator>(&BO))
473 Builder.setFastMathFlags(FPOp->getFastMathFlags());
474
475 Value *NewSelect = Builder.CreateSelect(Sel->getCondition(),
476 FoldedT, FoldedF);
477 NewSelect->takeName(&BO);
478 BO.replaceAllUsesWith(NewSelect);
479 DeadVals.push_back(&BO);
480 if (CastOp)
481 DeadVals.push_back(CastOp);
482 DeadVals.push_back(Sel);
483 return true;
484}
485
486std::pair<Value *, Value *>
487AMDGPUCodeGenPrepareImpl::getFrexpResults(IRBuilder<> &Builder,
488 Value *Src) const {
489 Type *Ty = Src->getType();
490 Value *Frexp = Builder.CreateIntrinsic(Intrinsic::frexp,
491 {Ty, Builder.getInt32Ty()}, Src);
492 Value *FrexpMant = Builder.CreateExtractValue(Frexp, {0});
493
494 // Bypass the bug workaround for the exponent result since it doesn't matter.
495 // TODO: Does the bug workaround even really need to consider the exponent
496 // result? It's unspecified by the spec.
497
498 Value *FrexpExp =
499 ST.hasFractBug()
500 ? Builder.CreateIntrinsic(Intrinsic::amdgcn_frexp_exp,
501 {Builder.getInt32Ty(), Ty}, Src)
502 : Builder.CreateExtractValue(Frexp, {1});
503 return {FrexpMant, FrexpExp};
504}
505
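// Pseudocode sketch of the expansion emitted below (not the exact IR):
//   (mant, exp) = frexp(src)
//   result      = ldexp(rcp(mant), -exp)
// so the rcp instruction itself only ever sees a mantissa in [0.5, 1.0) and
// never a denormal input.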
506/// Emit an expansion of 1.0 / Src good for 1ulp that supports denormals.
507Value *AMDGPUCodeGenPrepareImpl::emitRcpIEEE1ULP(IRBuilder<> &Builder,
508 Value *Src,
509 bool IsNegative) const {
510 // Same as for 1.0, but expand the sign out of the constant.
511 // -1.0 / x -> rcp (fneg x)
512 if (IsNegative)
513 Src = Builder.CreateFNeg(Src);
514
515 // The rcp instruction doesn't support denormals, so scale the input
516 // out of the denormal range and convert at the end.
517 //
518 // Expand as 2^-n * (1.0 / (x * 2^n))
519
520 // TODO: Skip scaling if input is known never denormal and the input
521 // range won't underflow to denormal. The hard part is knowing the
522 // result. We need a range check, the result could be denormal for
523 // 0x1p+126 < den <= 0x1p+127.
524 auto [FrexpMant, FrexpExp] = getFrexpResults(Builder, Src);
525 Value *ScaleFactor = Builder.CreateNeg(FrexpExp);
526 Value *Rcp = Builder.CreateUnaryIntrinsic(Intrinsic::amdgcn_rcp, FrexpMant);
527 return Builder.CreateCall(getLdexpF32(), {Rcp, ScaleFactor});
528}
529
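// Pseudocode sketch of the scaled division emitted below (not the exact IR):
//   (mant_l, exp_l) = frexp(lhs)
//   (mant_r, exp_r) = frexp(rhs)
//   result = ldexp(mant_l * rcp(mant_r), exp_l - exp_r)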
530/// Emit a 2ulp expansion for fdiv by using frexp for input scaling.
531Value *AMDGPUCodeGenPrepareImpl::emitFrexpDiv(IRBuilder<> &Builder, Value *LHS,
532 Value *RHS,
533 FastMathFlags FMF) const {
534 // If we have to work around the fract/frexp bug, we're worse off than
535 // using the fdiv.fast expansion. The full safe expansion is faster if we have
536 // fast FMA.
537 if (HasFP32DenormalFlush && ST.hasFractBug() && !ST.hasFastFMAF32() &&
538 (!FMF.noNaNs() || !FMF.noInfs()))
539 return nullptr;
540
541 // We're scaling the LHS to avoid a denormal input, and scale the denominator
542 // to avoid large values underflowing the result.
543 auto [FrexpMantRHS, FrexpExpRHS] = getFrexpResults(Builder, RHS);
544
545 Value *Rcp =
546 Builder.CreateUnaryIntrinsic(Intrinsic::amdgcn_rcp, FrexpMantRHS);
547
548 auto [FrexpMantLHS, FrexpExpLHS] = getFrexpResults(Builder, LHS);
549 Value *Mul = Builder.CreateFMul(FrexpMantLHS, Rcp);
550
551 // We multiplied by 2^N/2^M, so we need to multiply by 2^(N-M) to scale the
552 // result.
553 Value *ExpDiff = Builder.CreateSub(FrexpExpLHS, FrexpExpRHS);
554 return Builder.CreateCall(getLdexpF32(), {Mul, ExpDiff});
555}
556
557/// Emit a sqrt that handles denormals and is accurate to 2ulp.
558Value *AMDGPUCodeGenPrepareImpl::emitSqrtIEEE2ULP(IRBuilder<> &Builder,
559 Value *Src,
560 FastMathFlags FMF) const {
561 Type *Ty = Src->getType();
562 APFloat SmallestNormal =
563 APFloat::getSmallestNormalized(Ty->getFltSemantics());
564 Value *NeedScale =
565 Builder.CreateFCmpOLT(Src, ConstantFP::get(Ty, SmallestNormal));
566
567 ConstantInt *Zero = Builder.getInt32(0);
568 Value *InputScaleFactor =
569 Builder.CreateSelect(NeedScale, Builder.getInt32(32), Zero);
570
571 Value *Scaled = Builder.CreateCall(getLdexpF32(), {Src, InputScaleFactor});
572
573 Value *Sqrt = Builder.CreateCall(getSqrtF32(), Scaled);
574
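  // sqrt(x * 2^32) == sqrt(x) * 2^16, so undo the input scaling with 2^-16.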
575 Value *OutputScaleFactor =
576 Builder.CreateSelect(NeedScale, Builder.getInt32(-16), Zero);
577 return Builder.CreateCall(getLdexpF32(), {Sqrt, OutputScaleFactor});
578}
579
580/// Emit an expansion of 1.0 / sqrt(Src) good for 1ulp that supports denormals.
581static Value *emitRsqIEEE1ULP(IRBuilder<> &Builder, Value *Src,
582 bool IsNegative) {
583 // bool need_scale = x < 0x1p-126f;
584 // float input_scale = need_scale ? 0x1.0p+24f : 1.0f;
585 // float output_scale = need_scale ? 0x1.0p+12f : 1.0f;
586 // rsq(x * input_scale) * output_scale;
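  // rsq(x * 2^24) == rsq(x) * 2^-12, so the 2^12 output scale (negated for
  // the -1.0 numerator case) restores the unscaled result.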
587
588 Type *Ty = Src->getType();
589 APFloat SmallestNormal =
590 APFloat::getSmallestNormalized(Ty->getFltSemantics());
591 Value *NeedScale =
592 Builder.CreateFCmpOLT(Src, ConstantFP::get(Ty, SmallestNormal));
593 Constant *One = ConstantFP::get(Ty, 1.0);
594 Constant *InputScale = ConstantFP::get(Ty, 0x1.0p+24);
595 Constant *OutputScale =
596 ConstantFP::get(Ty, IsNegative ? -0x1.0p+12 : 0x1.0p+12);
597
598 Value *InputScaleFactor = Builder.CreateSelect(NeedScale, InputScale, One);
599
600 Value *ScaledInput = Builder.CreateFMul(Src, InputScaleFactor);
601 Value *Rsq = Builder.CreateUnaryIntrinsic(Intrinsic::amdgcn_rsq, ScaledInput);
602 Value *OutputScaleFactor = Builder.CreateSelect(
603 NeedScale, OutputScale, IsNegative ? ConstantFP::get(Ty, -1.0) : One);
604
605 return Builder.CreateFMul(Rsq, OutputScaleFactor);
606}
607
608bool AMDGPUCodeGenPrepareImpl::canOptimizeWithRsq(const FPMathOperator *SqrtOp,
609 FastMathFlags DivFMF,
610 FastMathFlags SqrtFMF) const {
611 // The rsqrt contraction increases accuracy from ~2ulp to ~1ulp.
612 if (!DivFMF.allowContract() || !SqrtFMF.allowContract())
613 return false;
614
615 // v_rsq_f32 gives 1ulp
616 return SqrtFMF.approxFunc() || SqrtOp->getFPAccuracy() >= 1.0f;
617}
618
619Value *AMDGPUCodeGenPrepareImpl::optimizeWithRsq(
620 IRBuilder<> &Builder, Value *Num, Value *Den, const FastMathFlags DivFMF,
621 const FastMathFlags SqrtFMF, const Instruction *CtxI) const {
622 // The rsqrt contraction increases accuracy from ~2ulp to ~1ulp.
623 assert(DivFMF.allowContract() && SqrtFMF.allowContract());
624
625 // rsq_f16 is accurate to 0.51 ulp.
626 // rsq_f32 is accurate for !fpmath >= 1.0ulp and denormals are flushed.
627 // rsq_f64 is never accurate.
628 const ConstantFP *CLHS = dyn_cast<ConstantFP>(Num);
629 if (!CLHS)
630 return nullptr;
631
632 assert(Den->getType()->isFloatTy());
633
634 bool IsNegative = false;
635
636 // TODO: Handle other numerator values with arcp.
637 if (CLHS->isExactlyValue(1.0) || (IsNegative = CLHS->isExactlyValue(-1.0))) {
638 // Add in the sqrt flags.
639 IRBuilder<>::FastMathFlagGuard Guard(Builder);
640 Builder.setFastMathFlags(DivFMF | SqrtFMF);
641
642 if ((DivFMF.approxFunc() && SqrtFMF.approxFunc()) ||
643 canIgnoreDenormalInput(Den, CtxI)) {
644 Value *Result = Builder.CreateUnaryIntrinsic(Intrinsic::amdgcn_rsq, Den);
645 // -1.0 / sqrt(x) -> fneg(rsq(x))
646 return IsNegative ? Builder.CreateFNeg(Result) : Result;
647 }
648
649 return emitRsqIEEE1ULP(Builder, Den, IsNegative);
650 }
651
652 return nullptr;
653}
654
655// Optimize fdiv with rcp:
656//
657// 1/x -> rcp(x) when rcp is sufficiently accurate or inaccurate rcp is
658// allowed with afn.
659//
660// a/b -> a*rcp(b) when arcp is allowed, and we only need to provide 1.0 ulp.
661Value *
662AMDGPUCodeGenPrepareImpl::optimizeWithRcp(IRBuilder<> &Builder, Value *Num,
663 Value *Den, FastMathFlags FMF,
664 const Instruction *CtxI) const {
665 // rcp_f16 is accurate to 0.51 ulp.
666 // rcp_f32 is accurate for !fpmath >= 1.0ulp and denormals are flushed.
667 // rcp_f64 is never accurate.
668 assert(Den->getType()->isFloatTy());
669
670 if (const ConstantFP *CLHS = dyn_cast<ConstantFP>(Num)) {
671 bool IsNegative = false;
672 if (CLHS->isExactlyValue(1.0) ||
673 (IsNegative = CLHS->isExactlyValue(-1.0))) {
674 Value *Src = Den;
675
676 if (HasFP32DenormalFlush || FMF.approxFunc()) {
677 // -1.0 / x -> 1.0 / fneg(x)
678 if (IsNegative)
679 Src = Builder.CreateFNeg(Src);
680
681 // v_rcp_f32 and v_rsq_f32 do not support denormals, and according to
682 // the CI documentation they have a worst case error of 1 ulp.
683 // OpenCL requires <= 2.5 ulp for 1.0 / x, so it should always be OK
684 // to use it as long as we aren't trying to use denormals.
685 //
686 // v_rcp_f16 and v_rsq_f16 DO support denormals.
687
688 // NOTE: v_sqrt and v_rcp will be combined to v_rsq later. So we don't
689 // insert rsq intrinsic here.
690
691 // 1.0 / x -> rcp(x)
692 return Builder.CreateUnaryIntrinsic(Intrinsic::amdgcn_rcp, Src);
693 }
694
695 // TODO: If the input isn't denormal, and we know the input exponent isn't
696 // big enough to introduce a denormal we can avoid the scaling.
697 return emitRcpIEEE1ULP(Builder, Src, IsNegative);
698 }
699 }
700
701 if (FMF.allowReciprocal()) {
702 // x / y -> x * (1.0 / y)
703
704 // TODO: Could avoid denormal scaling and use raw rcp if we knew the output
705 // will never underflow.
706 if (HasFP32DenormalFlush || FMF.approxFunc()) {
707 Value *Recip = Builder.CreateUnaryIntrinsic(Intrinsic::amdgcn_rcp, Den);
708 return Builder.CreateFMul(Num, Recip);
709 }
710
711 Value *Recip = emitRcpIEEE1ULP(Builder, Den, false);
712 return Builder.CreateFMul(Num, Recip);
713 }
714
715 return nullptr;
716}
717
718// optimize with fdiv.fast:
719//
720// a/b -> fdiv.fast(a, b) when !fpmath >= 2.5ulp with denormals flushed.
721//
722// 1/x -> fdiv.fast(1,x) when !fpmath >= 2.5ulp.
723//
724// NOTE: optimizeWithRcp should be tried first because rcp is the preference.
725Value *AMDGPUCodeGenPrepareImpl::optimizeWithFDivFast(
726 IRBuilder<> &Builder, Value *Num, Value *Den, float ReqdAccuracy) const {
727 // fdiv.fast can achieve 2.5 ULP accuracy.
728 if (ReqdAccuracy < 2.5f)
729 return nullptr;
730
731 // Only have fdiv.fast for f32.
732 assert(Den->getType()->isFloatTy());
733
734 bool NumIsOne = false;
735 if (const ConstantFP *CNum = dyn_cast<ConstantFP>(Num)) {
736 if (CNum->isExactlyValue(+1.0) || CNum->isExactlyValue(-1.0))
737 NumIsOne = true;
738 }
739
740 // fdiv.fast does not support denormals, but it is always fine to use for 1.0/x.
741 //
742 // TODO: This works for any value with a specific known exponent range, don't
743 // just limit to constant 1.
744 if (!HasFP32DenormalFlush && !NumIsOne)
745 return nullptr;
746
747 return Builder.CreateIntrinsic(Intrinsic::amdgcn_fdiv_fast, {Num, Den});
748}
749
750Value *AMDGPUCodeGenPrepareImpl::visitFDivElement(
751 IRBuilder<> &Builder, Value *Num, Value *Den, FastMathFlags DivFMF,
752 FastMathFlags SqrtFMF, Value *RsqOp, const Instruction *FDivInst,
753 float ReqdDivAccuracy) const {
754 if (RsqOp) {
755 Value *Rsq =
756 optimizeWithRsq(Builder, Num, RsqOp, DivFMF, SqrtFMF, FDivInst);
757 if (Rsq)
758 return Rsq;
759 }
760
761 Value *Rcp = optimizeWithRcp(Builder, Num, Den, DivFMF, FDivInst);
762 if (Rcp)
763 return Rcp;
764
765 // In the basic case fdiv_fast has the same instruction count as the frexp div
766 // expansion. Slightly prefer fdiv_fast since it ends in an fmul that can
767 // potentially be fused into a user. Also, materialization of the constants
768 // can be reused for multiple instances.
769 Value *FDivFast = optimizeWithFDivFast(Builder, Num, Den, ReqdDivAccuracy);
770 if (FDivFast)
771 return FDivFast;
772
773 return emitFrexpDiv(Builder, Num, Den, DivFMF);
774}
775
776// Optimization is performed based on fpmath, fast math flags, and denormal
777// handling to lower fdiv with either rcp or fdiv.fast.
778//
779// With rcp:
780// 1/x -> rcp(x) when rcp is sufficiently accurate or inaccurate rcp is
781// allowed with afn.
782//
783// a/b -> a*rcp(b) when inaccurate rcp is allowed with afn.
784//
785// With fdiv.fast:
786// a/b -> fdiv.fast(a, b) when !fpmath >= 2.5ulp with denormals flushed.
787//
788// 1/x -> fdiv.fast(1,x) when !fpmath >= 2.5ulp.
789//
790// NOTE: rcp is the preference in cases that both are legal.
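//
// Illustrative sketch of the two main rewrites (f32, assuming the required
// !fpmath accuracy and a denormal-flushing mode; not the exact emitted IR):
//   %r = fdiv float 1.0, %x
//     -> %r = call float @llvm.amdgcn.rcp.f32(float %x)
//   %s = call contract float @llvm.sqrt.f32(float %x)
//   %r = fdiv contract float 1.0, %s
//     -> %r = call float @llvm.amdgcn.rsq.f32(float %x)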
791bool AMDGPUCodeGenPrepareImpl::visitFDiv(BinaryOperator &FDiv) {
792 if (DisableFDivExpand)
793 return false;
794
795 Type *Ty = FDiv.getType()->getScalarType();
796 if (!Ty->isFloatTy())
797 return false;
798
799 // The f64 rcp/rsq approximations are pretty inaccurate. We can do an
800 // expansion around them in codegen. f16 is good enough to always use.
801
802 const FPMathOperator *FPOp = cast<const FPMathOperator>(&FDiv);
803 const FastMathFlags DivFMF = FPOp->getFastMathFlags();
804 const float ReqdAccuracy = FPOp->getFPAccuracy();
805
806 FastMathFlags SqrtFMF;
807
808 Value *Num = FDiv.getOperand(0);
809 Value *Den = FDiv.getOperand(1);
810
811 Value *RsqOp = nullptr;
812 auto *DenII = dyn_cast<IntrinsicInst>(Den);
813 if (DenII && DenII->getIntrinsicID() == Intrinsic::sqrt &&
814 DenII->hasOneUse()) {
815 const auto *SqrtOp = cast<FPMathOperator>(DenII);
816 SqrtFMF = SqrtOp->getFastMathFlags();
817 if (canOptimizeWithRsq(SqrtOp, DivFMF, SqrtFMF))
818 RsqOp = SqrtOp->getOperand(0);
819 }
820
821 // Inaccurate rcp is allowed with afn.
822 //
823 // Defer to codegen to handle this.
824 //
825 // TODO: Decide on an interpretation for interactions between afn + arcp +
826 // !fpmath, and make it consistent between here and codegen. For now, defer
827 // expansion of afn to codegen. The current interpretation is so aggressive we
828 // don't need any pre-consideration here when we have better information. A
829 // more conservative interpretation could use handling here.
830 const bool AllowInaccurateRcp = DivFMF.approxFunc();
831 if (!RsqOp && AllowInaccurateRcp)
832 return false;
833
834 // Defer the correct implementations to codegen.
835 if (ReqdAccuracy < 1.0f)
836 return false;
837
838 IRBuilder<> Builder(FDiv.getParent(), std::next(FDiv.getIterator()));
839 Builder.setFastMathFlags(DivFMF);
840 Builder.SetCurrentDebugLocation(FDiv.getDebugLoc());
841
842 SmallVector<Value *, 4> NumVals;
843 SmallVector<Value *, 4> DenVals;
844 SmallVector<Value *, 4> RsqDenVals;
845 extractValues(Builder, NumVals, Num);
846 extractValues(Builder, DenVals, Den);
847
848 if (RsqOp)
849 extractValues(Builder, RsqDenVals, RsqOp);
850
851 SmallVector<Value *, 4> ResultVals(NumVals.size());
852 for (int I = 0, E = NumVals.size(); I != E; ++I) {
853 Value *NumElt = NumVals[I];
854 Value *DenElt = DenVals[I];
855 Value *RsqDenElt = RsqOp ? RsqDenVals[I] : nullptr;
856
857 Value *NewElt =
858 visitFDivElement(Builder, NumElt, DenElt, DivFMF, SqrtFMF, RsqDenElt,
859 cast<Instruction>(FPOp), ReqdAccuracy);
860 if (!NewElt) {
861 // Keep the original, but scalarized.
862
863 // This has the unfortunate side effect of sometimes scalarizing when
864 // we're not going to do anything.
865 NewElt = Builder.CreateFDiv(NumElt, DenElt);
866 if (auto *NewEltInst = dyn_cast<Instruction>(NewElt))
867 NewEltInst->copyMetadata(FDiv);
868 }
869
870 ResultVals[I] = NewElt;
871 }
872
873 Value *NewVal = insertValues(Builder, FDiv.getType(), ResultVals);
874
875 if (NewVal) {
876 FDiv.replaceAllUsesWith(NewVal);
877 NewVal->takeName(&FDiv);
878 DeadVals.push_back(&FDiv);
879 }
880
881 return true;
882}
883
884static std::pair<Value*, Value*> getMul64(IRBuilder<> &Builder,
885 Value *LHS, Value *RHS) {
886 Type *I32Ty = Builder.getInt32Ty();
887 Type *I64Ty = Builder.getInt64Ty();
888
889 Value *LHS_EXT64 = Builder.CreateZExt(LHS, I64Ty);
890 Value *RHS_EXT64 = Builder.CreateZExt(RHS, I64Ty);
891 Value *MUL64 = Builder.CreateMul(LHS_EXT64, RHS_EXT64);
892 Value *Lo = Builder.CreateTrunc(MUL64, I32Ty);
893 Value *Hi = Builder.CreateLShr(MUL64, Builder.getInt64(32));
894 Hi = Builder.CreateTrunc(Hi, I32Ty);
895 return std::pair(Lo, Hi);
896}
897
898static Value* getMulHu(IRBuilder<> &Builder, Value *LHS, Value *RHS) {
899 return getMul64(Builder, LHS, RHS).second;
900}
901
902/// Figure out how many bits are really needed for this division.
903/// \p MaxDivBits is an optimization hint to bypass the second
904/// ComputeNumSignBits/computeKnownBits call if the first one is
905/// insufficient.
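/// For example, a udiv of i32 operands that are both known to fit in 16 bits
/// (a made-up case) yields 16, which is small enough for the 24-bit expansion.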
906unsigned AMDGPUCodeGenPrepareImpl::getDivNumBits(BinaryOperator &I, Value *Num,
907 Value *Den,
908 unsigned MaxDivBits,
909 bool IsSigned) const {
910 assert(Num->getType()->getScalarSizeInBits() ==
911 Den->getType()->getScalarSizeInBits());
912 unsigned SSBits = Num->getType()->getScalarSizeInBits();
913 if (IsSigned) {
914 unsigned RHSSignBits = ComputeNumSignBits(Den, DL, AC, &I);
915 // A sign bit needs to be reserved for shrinking.
916 unsigned DivBits = SSBits - RHSSignBits + 1;
917 if (DivBits > MaxDivBits)
918 return SSBits;
919
920 unsigned LHSSignBits = ComputeNumSignBits(Num, DL, AC, &I);
921
922 unsigned SignBits = std::min(LHSSignBits, RHSSignBits);
923 DivBits = SSBits - SignBits + 1;
924 return DivBits;
925 }
926
927 // All bits are used for unsigned division for Num or Den in range
928 // (SignedMax, UnsignedMax].
929 KnownBits Known = computeKnownBits(Den, DL, AC, &I);
930 if (Known.isNegative() || !Known.isNonNegative())
931 return SSBits;
932 unsigned RHSSignBits = Known.countMinLeadingZeros();
933 unsigned DivBits = SSBits - RHSSignBits;
934 if (DivBits > MaxDivBits)
935 return SSBits;
936
937 Known = computeKnownBits(Num, DL, AC, &I);
938 if (Known.isNegative() || !Known.isNonNegative())
939 return SSBits;
940 unsigned LHSSignBits = Known.countMinLeadingZeros();
941
942 unsigned SignBits = std::min(LHSSignBits, RHSSignBits);
943 DivBits = SSBits - SignBits;
944 return DivBits;
945}
946
947// The fractional part of a float is enough to accurately represent up to
948// a 24-bit signed integer.
949Value *AMDGPUCodeGenPrepareImpl::expandDivRem24(IRBuilder<> &Builder,
950 BinaryOperator &I, Value *Num,
951 Value *Den, bool IsDiv,
952 bool IsSigned) const {
953 unsigned DivBits = getDivNumBits(I, Num, Den, 24, IsSigned);
954 if (DivBits > 24)
955 return nullptr;
956 return expandDivRem24Impl(Builder, I, Num, Den, DivBits, IsDiv, IsSigned);
957}
958
959Value *AMDGPUCodeGenPrepareImpl::expandDivRem24Impl(
960 IRBuilder<> &Builder, BinaryOperator &I, Value *Num, Value *Den,
961 unsigned DivBits, bool IsDiv, bool IsSigned) const {
962 Type *I32Ty = Builder.getInt32Ty();
963 Num = Builder.CreateTrunc(Num, I32Ty);
964 Den = Builder.CreateTrunc(Den, I32Ty);
965
966 Type *F32Ty = Builder.getFloatTy();
967 ConstantInt *One = Builder.getInt32(1);
968 Value *JQ = One;
969
970 if (IsSigned) {
971 // char|short jq = ia ^ ib;
972 JQ = Builder.CreateXor(Num, Den);
973
974 // jq = jq >> (bitsize - 2)
975 JQ = Builder.CreateAShr(JQ, Builder.getInt32(30));
976
977 // jq = jq | 0x1
978 JQ = Builder.CreateOr(JQ, One);
979 }
980
981 // int ia = (int)LHS;
982 Value *IA = Num;
983
984 // int ib = (int)RHS;
985 Value *IB = Den;
986
987 // float fa = (float)ia;
988 Value *FA = IsSigned ? Builder.CreateSIToFP(IA, F32Ty)
989 : Builder.CreateUIToFP(IA, F32Ty);
990
991 // float fb = (float)ib;
992 Value *FB = IsSigned ? Builder.CreateSIToFP(IB,F32Ty)
993 : Builder.CreateUIToFP(IB,F32Ty);
994
995 Value *RCP = Builder.CreateIntrinsic(Intrinsic::amdgcn_rcp,
996 Builder.getFloatTy(), {FB});
997 Value *FQM = Builder.CreateFMul(FA, RCP);
998
999 // fq = trunc(fqm);
1000 CallInst *FQ = Builder.CreateUnaryIntrinsic(Intrinsic::trunc, FQM);
1001 FQ->copyFastMathFlags(Builder.getFastMathFlags());
1002
1003 // float fqneg = -fq;
1004 Value *FQNeg = Builder.CreateFNeg(FQ);
1005
1006 // float fr = mad(fqneg, fb, fa);
1007 auto FMAD = !ST.hasMadMacF32Insts()
1008 ? Intrinsic::fma
1009 : (Intrinsic::ID)Intrinsic::amdgcn_fmad_ftz;
1010 Value *FR = Builder.CreateIntrinsic(FMAD,
1011 {FQNeg->getType()}, {FQNeg, FB, FA}, FQ);
1012
1013 // int iq = (int)fq;
1014 Value *IQ = IsSigned ? Builder.CreateFPToSI(FQ, I32Ty)
1015 : Builder.CreateFPToUI(FQ, I32Ty);
1016
1017 // fr = fabs(fr);
1018 FR = Builder.CreateUnaryIntrinsic(Intrinsic::fabs, FR, FQ);
1019
1020 // fb = fabs(fb);
1021 FB = Builder.CreateUnaryIntrinsic(Intrinsic::fabs, FB, FQ);
1022
1023 // int cv = fr >= fb;
1024 Value *CV = Builder.CreateFCmpOGE(FR, FB);
1025
1026 // jq = (cv ? jq : 0);
1027 JQ = Builder.CreateSelect(CV, JQ, Builder.getInt32(0));
1028
1029 // dst = iq + jq;
1030 Value *Div = Builder.CreateAdd(IQ, JQ);
1031
1032 Value *Res = Div;
1033 if (!IsDiv) {
1034 // Rem needs compensation, it's easier to recompute it
1035 Value *Rem = Builder.CreateMul(Div, Den);
1036 Res = Builder.CreateSub(Num, Rem);
1037 }
1038
1039 if (DivBits != 0 && DivBits < 32) {
1040 // Extend in register from the number of bits this divide really is.
1041 if (IsSigned) {
1042 int InRegBits = 32 - DivBits;
1043
1044 Res = Builder.CreateShl(Res, InRegBits);
1045 Res = Builder.CreateAShr(Res, InRegBits);
1046 } else {
1047 ConstantInt *TruncMask
1048 = Builder.getInt32((UINT64_C(1) << DivBits) - 1);
1049 Res = Builder.CreateAnd(Res, TruncMask);
1050 }
1051 }
1052
1053 return Res;
1054}
1055
1056// Try to recognize special cases for which the DAG will emit better
1057// expansions than the general expansion we do here.
1058
1059// TODO: It would be better to just directly handle those optimizations here.
1060bool AMDGPUCodeGenPrepareImpl::divHasSpecialOptimization(BinaryOperator &I,
1061 Value *Num,
1062 Value *Den) const {
1063 if (Constant *C = dyn_cast<Constant>(Den)) {
1064 // Arbitrary constants get a better expansion as long as a wider mulhi is
1065 // legal.
1066 if (C->getType()->getScalarSizeInBits() <= 32)
1067 return true;
1068
1069 // TODO: Sdiv check for not exact for some reason.
1070
1071 // If there's no wider mulhi, there's only a better expansion for powers of
1072 // two.
1073 // TODO: Should really know for each vector element.
1074 if (isKnownToBeAPowerOfTwo(C, DL, true, AC, &I, DT))
1075 return true;
1076
1077 return false;
1078 }
1079
1080 if (BinaryOperator *BinOpDen = dyn_cast<BinaryOperator>(Den)) {
1081 // fold (udiv x, (shl c, y)) -> x >>u (log2(c)+y) iff c is power of 2
1082 if (BinOpDen->getOpcode() == Instruction::Shl &&
1083 isa<Constant>(BinOpDen->getOperand(0)) &&
1084 isKnownToBeAPowerOfTwo(BinOpDen->getOperand(0), DL, true, AC, &I, DT)) {
1085 return true;
1086 }
1087 }
1088
1089 return false;
1090}
1091
1092static Value *getSign32(Value *V, IRBuilder<> &Builder, const DataLayout &DL) {
1093 // Check whether the sign can be determined statically.
1094 KnownBits Known = computeKnownBits(V, DL);
1095 if (Known.isNegative())
1096 return Constant::getAllOnesValue(V->getType());
1097 if (Known.isNonNegative())
1098 return Constant::getNullValue(V->getType());
1099 return Builder.CreateAShr(V, Builder.getInt32(31));
1100}
1101
1102Value *AMDGPUCodeGenPrepareImpl::expandDivRem32(IRBuilder<> &Builder,
1103 BinaryOperator &I, Value *X,
1104 Value *Y) const {
1105 Instruction::BinaryOps Opc = I.getOpcode();
1106 assert(Opc == Instruction::URem || Opc == Instruction::UDiv ||
1107 Opc == Instruction::SRem || Opc == Instruction::SDiv);
1108
1109 FastMathFlags FMF;
1110 FMF.setFast();
1111 Builder.setFastMathFlags(FMF);
1112
1113 if (divHasSpecialOptimization(I, X, Y))
1114 return nullptr; // Keep it for later optimization.
1115
1116 bool IsDiv = Opc == Instruction::UDiv || Opc == Instruction::SDiv;
1117 bool IsSigned = Opc == Instruction::SRem || Opc == Instruction::SDiv;
1118
1119 Type *Ty = X->getType();
1120 Type *I32Ty = Builder.getInt32Ty();
1121 Type *F32Ty = Builder.getFloatTy();
1122
1123 if (Ty->getScalarSizeInBits() != 32) {
1124 if (IsSigned) {
1125 X = Builder.CreateSExtOrTrunc(X, I32Ty);
1126 Y = Builder.CreateSExtOrTrunc(Y, I32Ty);
1127 } else {
1128 X = Builder.CreateZExtOrTrunc(X, I32Ty);
1129 Y = Builder.CreateZExtOrTrunc(Y, I32Ty);
1130 }
1131 }
1132
1133 if (Value *Res = expandDivRem24(Builder, I, X, Y, IsDiv, IsSigned)) {
1134 return IsSigned ? Builder.CreateSExtOrTrunc(Res, Ty) :
1135 Builder.CreateZExtOrTrunc(Res, Ty);
1136 }
1137
1138 ConstantInt *Zero = Builder.getInt32(0);
1139 ConstantInt *One = Builder.getInt32(1);
1140
1141 Value *Sign = nullptr;
1142 if (IsSigned) {
1143 Value *SignX = getSign32(X, Builder, DL);
1144 Value *SignY = getSign32(Y, Builder, DL);
1145 // Remainder sign is the same as LHS
1146 Sign = IsDiv ? Builder.CreateXor(SignX, SignY) : SignX;
1147
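    // Branchless abs: (x + sign) ^ sign, where sign is 0 or -1.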
1148 X = Builder.CreateAdd(X, SignX);
1149 Y = Builder.CreateAdd(Y, SignY);
1150
1151 X = Builder.CreateXor(X, SignX);
1152 Y = Builder.CreateXor(Y, SignY);
1153 }
1154
1155 // The algorithm here is based on ideas from "Software Integer Division", Tom
1156 // Rodeheffer, August 2008.
1157 //
1158 // unsigned udiv(unsigned x, unsigned y) {
1159 // // Initial estimate of inv(y). The constant is less than 2^32 to ensure
1160 // // that this is a lower bound on inv(y), even if some of the calculations
1161 // // round up.
1162 // unsigned z = (unsigned)((4294967296.0 - 512.0) * v_rcp_f32((float)y));
1163 //
1164 // // One round of UNR (Unsigned integer Newton-Raphson) to improve z.
1165 // // Empirically this is guaranteed to give a "two-y" lower bound on
1166 // // inv(y).
1167 // z += umulh(z, -y * z);
1168 //
1169 // // Quotient/remainder estimate.
1170 // unsigned q = umulh(x, z);
1171 // unsigned r = x - q * y;
1172 //
1173 // // Two rounds of quotient/remainder refinement.
1174 // if (r >= y) {
1175 // ++q;
1176 // r -= y;
1177 // }
1178 // if (r >= y) {
1179 // ++q;
1180 // r -= y;
1181 // }
1182 //
1183 // return q;
1184 // }
1185
1186 // Initial estimate of inv(y).
1187 Value *FloatY = Builder.CreateUIToFP(Y, F32Ty);
1188 Value *RcpY = Builder.CreateIntrinsic(Intrinsic::amdgcn_rcp, F32Ty, {FloatY});
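  // 0x4F7FFFFE is (2^32 - 512) as a float, i.e. the constant from the
  // algorithm sketch above.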
1189 Constant *Scale = ConstantFP::get(F32Ty, llvm::bit_cast<float>(0x4F7FFFFE));
1190 Value *ScaledY = Builder.CreateFMul(RcpY, Scale);
1191 Value *Z = Builder.CreateFPToUI(ScaledY, I32Ty);
1192
1193 // One round of UNR.
1194 Value *NegY = Builder.CreateSub(Zero, Y);
1195 Value *NegYZ = Builder.CreateMul(NegY, Z);
1196 Z = Builder.CreateAdd(Z, getMulHu(Builder, Z, NegYZ));
1197
1198 // Quotient/remainder estimate.
1199 Value *Q = getMulHu(Builder, X, Z);
1200 Value *R = Builder.CreateSub(X, Builder.CreateMul(Q, Y));
1201
1202 // First quotient/remainder refinement.
1203 Value *Cond = Builder.CreateICmpUGE(R, Y);
1204 if (IsDiv)
1205 Q = Builder.CreateSelect(Cond, Builder.CreateAdd(Q, One), Q);
1206 R = Builder.CreateSelect(Cond, Builder.CreateSub(R, Y), R);
1207
1208 // Second quotient/remainder refinement.
1209 Cond = Builder.CreateICmpUGE(R, Y);
1210 Value *Res;
1211 if (IsDiv)
1212 Res = Builder.CreateSelect(Cond, Builder.CreateAdd(Q, One), Q);
1213 else
1214 Res = Builder.CreateSelect(Cond, Builder.CreateSub(R, Y), R);
1215
1216 if (IsSigned) {
1217 Res = Builder.CreateXor(Res, Sign);
1218 Res = Builder.CreateSub(Res, Sign);
1219 Res = Builder.CreateSExtOrTrunc(Res, Ty);
1220 } else {
1221 Res = Builder.CreateZExtOrTrunc(Res, Ty);
1222 }
1223 return Res;
1224}
1225
1226Value *AMDGPUCodeGenPrepareImpl::shrinkDivRem64(IRBuilder<> &Builder,
1227 BinaryOperator &I, Value *Num,
1228 Value *Den) const {
1229 if (!ExpandDiv64InIR && divHasSpecialOptimization(I, Num, Den))
1230 return nullptr; // Keep it for later optimization.
1231
1232 Instruction::BinaryOps Opc = I.getOpcode();
1233
1234 bool IsDiv = Opc == Instruction::SDiv || Opc == Instruction::UDiv;
1235 bool IsSigned = Opc == Instruction::SDiv || Opc == Instruction::SRem;
1236
1237 unsigned NumDivBits = getDivNumBits(I, Num, Den, 32, IsSigned);
1238 if (NumDivBits > 32)
1239 return nullptr;
1240
1241 Value *Narrowed = nullptr;
1242 if (NumDivBits <= 24) {
1243 Narrowed = expandDivRem24Impl(Builder, I, Num, Den, NumDivBits,
1244 IsDiv, IsSigned);
1245 } else if (NumDivBits <= 32) {
1246 Narrowed = expandDivRem32(Builder, I, Num, Den);
1247 }
1248
1249 if (Narrowed) {
1250 return IsSigned ? Builder.CreateSExt(Narrowed, Num->getType()) :
1251 Builder.CreateZExt(Narrowed, Num->getType());
1252 }
1253
1254 return nullptr;
1255}
1256
1257void AMDGPUCodeGenPrepareImpl::expandDivRem64(BinaryOperator &I) const {
1258 Instruction::BinaryOps Opc = I.getOpcode();
1259 // Do the general expansion.
1260 if (Opc == Instruction::UDiv || Opc == Instruction::SDiv) {
1261 expandDivisionUpTo64Bits(&I);
1262 return;
1263 }
1264
1265 if (Opc == Instruction::URem || Opc == Instruction::SRem) {
1266 expandRemainderUpTo64Bits(&I);
1267 return;
1268 }
1269
1270 llvm_unreachable("not a division");
1271}
1272
1273/*
1274This can cause an inconsistency for non-byte-sized loads, for example:
1275```
1276 %load = load i1, ptr addrspace(4) %arg, align 4
1277 %zext = zext i1 %load to i64
1278 %add = add i64 %zext, ...
1279```
1280Instead of creating `s_and_b32 s0, s0, 1`,
1281it will create `s_and_b32 s0, s0, 0xff`.
1282We accept this change since the non-byte load assumes the upper bits
1283within the byte are all 0.
1284*/
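/*
Illustrative sketch of the narrowing itself (values made up): when the result
of a 64-bit add/mul is known to fit in 32 bits, e.g.
  %r = mul i64 %a, %b
it is rewritten (subject to the cost check below) as
  %a32 = trunc i64 %a to i32
  %b32 = trunc i64 %b to i32
  %m   = mul i32 %a32, %b32
  %r   = zext i32 %m to i64
*/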
1285bool AMDGPUCodeGenPrepareImpl::tryNarrowMathIfNoOverflow(Instruction *I) {
1286 unsigned Opc = I->getOpcode();
1287 Type *OldType = I->getType();
1288
1289 if (Opc != Instruction::Add && Opc != Instruction::Mul)
1290 return false;
1291
1292 unsigned OrigBit = OldType->getScalarSizeInBits();
1293
1294 if (Opc != Instruction::Add && Opc != Instruction::Mul)
1295 llvm_unreachable("Unexpected opcode, only valid for Instruction::Add and "
1296 "Instruction::Mul.");
1297
1298 unsigned MaxBitsNeeded = computeKnownBits(I, DL).countMaxActiveBits();
1299
1300 MaxBitsNeeded = std::max<unsigned>(bit_ceil(MaxBitsNeeded), 8);
1301 Type *NewType = DL.getSmallestLegalIntType(I->getContext(), MaxBitsNeeded);
1302 if (!NewType)
1303 return false;
1304 unsigned NewBit = NewType->getIntegerBitWidth();
1305 if (NewBit >= OrigBit)
1306 return false;
1307 NewType = I->getType()->getWithNewBitWidth(NewBit);
1308
1309 // Old cost
1310 const TargetTransformInfo &TTI = TM.getTargetTransformInfo(F);
1311 InstructionCost OldCost =
1312 TTI.getArithmeticInstrCost(Opc, OldType, TargetTransformInfo::TCK_RecipThroughput);
1313 // New cost of new op
1314 InstructionCost NewCost =
1315 TTI.getArithmeticInstrCost(Opc, NewType, TargetTransformInfo::TCK_RecipThroughput);
1316 // New cost of narrowing 2 operands (use trunc)
1317 int NumOfNonConstOps = 2;
1318 if (isa<Constant>(I->getOperand(0)) || isa<Constant>(I->getOperand(1))) {
1319 // Cannot be both constant, should be propagated
1320 NumOfNonConstOps = 1;
1321 }
1322 NewCost += NumOfNonConstOps * TTI.getCastInstrCost(Instruction::Trunc,
1323 NewType, OldType,
1324 TargetTransformInfo::CastContextHint::None,
1325 TargetTransformInfo::TCK_RecipThroughput);
1326 // New cost of zext narrowed result to original type
1327 NewCost +=
1328 TTI.getCastInstrCost(Instruction::ZExt, OldType, NewType,
1329 TargetTransformInfo::CastContextHint::None, TargetTransformInfo::TCK_RecipThroughput);
1330 if (NewCost >= OldCost)
1331 return false;
1332
1333 IRBuilder<> Builder(I);
1334 Value *Trunc0 = Builder.CreateTrunc(I->getOperand(0), NewType);
1335 Value *Trunc1 = Builder.CreateTrunc(I->getOperand(1), NewType);
1336 Value *Arith =
1337 Builder.CreateBinOp((Instruction::BinaryOps)Opc, Trunc0, Trunc1);
1338
1339 Value *Zext = Builder.CreateZExt(Arith, OldType);
1340 I->replaceAllUsesWith(Zext);
1341 DeadVals.push_back(I);
1342 return true;
1343}
1344
1345bool AMDGPUCodeGenPrepareImpl::visitBinaryOperator(BinaryOperator &I) {
1346 if (foldBinOpIntoSelect(I))
1347 return true;
1348
1349 if (UseMul24Intrin && replaceMulWithMul24(I))
1350 return true;
1351 if (tryNarrowMathIfNoOverflow(&I))
1352 return true;
1353
1354 bool Changed = false;
1355 Instruction::BinaryOps Opc = I.getOpcode();
1356 Type *Ty = I.getType();
1357 Value *NewDiv = nullptr;
1358 unsigned ScalarSize = Ty->getScalarSizeInBits();
1359
1360 SmallVector<BinaryOperator *, 8> Div64ToExpand;
1361
1362 if ((Opc == Instruction::URem || Opc == Instruction::UDiv ||
1363 Opc == Instruction::SRem || Opc == Instruction::SDiv) &&
1364 ScalarSize <= 64 &&
1365 !DisableIDivExpand) {
1366 Value *Num = I.getOperand(0);
1367 Value *Den = I.getOperand(1);
1368 IRBuilder<> Builder(&I);
1369 Builder.SetCurrentDebugLocation(I.getDebugLoc());
1370
1371 if (auto *VT = dyn_cast<FixedVectorType>(Ty)) {
1372 NewDiv = PoisonValue::get(VT);
1373
1374 for (unsigned N = 0, E = VT->getNumElements(); N != E; ++N) {
1375 Value *NumEltN = Builder.CreateExtractElement(Num, N);
1376 Value *DenEltN = Builder.CreateExtractElement(Den, N);
1377
1378 Value *NewElt;
1379 if (ScalarSize <= 32) {
1380 NewElt = expandDivRem32(Builder, I, NumEltN, DenEltN);
1381 if (!NewElt)
1382 NewElt = Builder.CreateBinOp(Opc, NumEltN, DenEltN);
1383 } else {
1384 // See if this 64-bit division can be shrunk to 32/24-bits before
1385 // producing the general expansion.
1386 NewElt = shrinkDivRem64(Builder, I, NumEltN, DenEltN);
1387 if (!NewElt) {
1388 // The general 64-bit expansion introduces control flow and doesn't
1389 // return the new value. Just insert a scalar copy and defer
1390 // expanding it.
1391 NewElt = Builder.CreateBinOp(Opc, NumEltN, DenEltN);
1392 // CreateBinOp does constant folding. If the operands are constant,
1393 // it will return a Constant instead of a BinaryOperator.
1394 if (auto *NewEltBO = dyn_cast<BinaryOperator>(NewElt))
1395 Div64ToExpand.push_back(NewEltBO);
1396 }
1397 }
1398
1399 if (auto *NewEltI = dyn_cast<Instruction>(NewElt))
1400 NewEltI->copyIRFlags(&I);
1401
1402 NewDiv = Builder.CreateInsertElement(NewDiv, NewElt, N);
1403 }
1404 } else {
1405 if (ScalarSize <= 32)
1406 NewDiv = expandDivRem32(Builder, I, Num, Den);
1407 else {
1408 NewDiv = shrinkDivRem64(Builder, I, Num, Den);
1409 if (!NewDiv)
1410 Div64ToExpand.push_back(&I);
1411 }
1412 }
1413
1414 if (NewDiv) {
1415 I.replaceAllUsesWith(NewDiv);
1416 DeadVals.push_back(&I);
1417 Changed = true;
1418 }
1419 }
1420
1421 if (ExpandDiv64InIR) {
1422 // TODO: We get much worse code in specially handled constant cases.
1423 for (BinaryOperator *Div : Div64ToExpand) {
1424 expandDivRem64(*Div);
1425 FlowChanged = true;
1426 Changed = true;
1427 }
1428 }
1429
1430 return Changed;
1431}
1432
1433bool AMDGPUCodeGenPrepareImpl::visitLoadInst(LoadInst &I) {
1434 if (!WidenLoads)
1435 return false;
1436
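  // For example, a uniform, naturally aligned i8 or i16 load from constant
  // address space is loaded as a full i32 (so a scalar load can be used) and
  // the result is truncated back to the original type below.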
1437 if ((I.getPointerAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS ||
1438 I.getPointerAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT) &&
1439 canWidenScalarExtLoad(I)) {
1440 IRBuilder<> Builder(&I);
1441 Builder.SetCurrentDebugLocation(I.getDebugLoc());
1442
1443 Type *I32Ty = Builder.getInt32Ty();
1444 LoadInst *WidenLoad = Builder.CreateLoad(I32Ty, I.getPointerOperand());
1445 WidenLoad->copyMetadata(I);
1446
1447 // If we have range metadata, we need to convert the type, and not make
1448 // assumptions about the high bits.
1449 if (auto *Range = WidenLoad->getMetadata(LLVMContext::MD_range)) {
1450 ConstantInt *Lower =
1451 mdconst::extract<ConstantInt>(Range->getOperand(0));
1452
1453 if (Lower->isNullValue()) {
1454 WidenLoad->setMetadata(LLVMContext::MD_range, nullptr);
1455 } else {
1456 Metadata *LowAndHigh[] = {
1457 ConstantAsMetadata::get(ConstantInt::get(I32Ty, Lower->getValue().zext(32))),
1458 // Don't make assumptions about the high bits.
1459 ConstantAsMetadata::get(ConstantInt::get(I32Ty, 0))
1460 };
1461
1462 WidenLoad->setMetadata(LLVMContext::MD_range,
1463 MDNode::get(F.getContext(), LowAndHigh));
1464 }
1465 }
1466
1467 int TySize = DL.getTypeSizeInBits(I.getType());
1468 Type *IntNTy = Builder.getIntNTy(TySize);
1469 Value *ValTrunc = Builder.CreateTrunc(WidenLoad, IntNTy);
1470 Value *ValOrig = Builder.CreateBitCast(ValTrunc, I.getType());
1471 I.replaceAllUsesWith(ValOrig);
1472 DeadVals.push_back(&I);
1473 return true;
1474 }
1475
1476 return false;
1477}
1478
1479bool AMDGPUCodeGenPrepareImpl::visitSelectInst(SelectInst &I) {
1480 Value *Cond = I.getCondition();
1481 Value *TrueVal = I.getTrueValue();
1482 Value *FalseVal = I.getFalseValue();
1483 Value *CmpVal;
1484 CmpPredicate Pred;
1485
1486 // Match fract pattern with nan check.
1487 if (!match(Cond, m_FCmp(Pred, m_Value(CmpVal), m_NonNaN())))
1488 return false;
1489
1490 FPMathOperator *FPOp = dyn_cast<FPMathOperator>(&I);
1491 if (!FPOp)
1492 return false;
1493
1494 IRBuilder<> Builder(&I);
1495 Builder.setFastMathFlags(FPOp->getFastMathFlags());
1496
1497 auto *IITrue = dyn_cast<IntrinsicInst>(TrueVal);
1498 auto *IIFalse = dyn_cast<IntrinsicInst>(FalseVal);
1499
1500 Value *Fract = nullptr;
1501 if (Pred == FCmpInst::FCMP_UNO && TrueVal == CmpVal && IIFalse &&
1502 CmpVal == matchFractPat(*IIFalse)) {
1503 // isnan(x) ? x : fract(x)
1504 Fract = applyFractPat(Builder, CmpVal);
1505 } else if (Pred == FCmpInst::FCMP_ORD && FalseVal == CmpVal && IITrue &&
1506 CmpVal == matchFractPat(*IITrue)) {
1507 // !isnan(x) ? fract(x) : x
1508 Fract = applyFractPat(Builder, CmpVal);
1509 } else
1510 return false;
1511
1512 Fract->takeName(&I);
1513 I.replaceAllUsesWith(Fract);
1514 DeadVals.push_back(&I);
1515 return true;
1516}
1517
1518static bool areInSameBB(const Value *A, const Value *B) {
1519 const auto *IA = dyn_cast<Instruction>(A);
1520 const auto *IB = dyn_cast<Instruction>(B);
1521 return IA && IB && IA->getParent() == IB->getParent();
1522}
1523
1524// Helper for breaking large PHIs that returns true when an extractelement on V
1525// is likely to be folded away by the DAG combiner.
1526static bool isInterestingPHIIncomingValue(const Value *V) {
1527 const auto *FVT = dyn_cast<FixedVectorType>(V->getType());
1528 if (!FVT)
1529 return false;
1530
1531 const Value *CurVal = V;
1532
1533 // Check for insertelements, keeping track of the elements covered.
1534 BitVector EltsCovered(FVT->getNumElements());
1535 while (const auto *IE = dyn_cast<InsertElementInst>(CurVal)) {
1536 const auto *Idx = dyn_cast<ConstantInt>(IE->getOperand(2));
1537
1538 // Non constant index/out of bounds index -> folding is unlikely.
1539 // The latter is more of a sanity check because canonical IR should just
1540 // have replaced those with poison.
1541 if (!Idx || Idx->getZExtValue() >= FVT->getNumElements())
1542 return false;
1543
1544 const auto *VecSrc = IE->getOperand(0);
1545
1546 // If the vector source is another instruction, it must be in the same basic
1547 // block. Otherwise, the DAGCombiner won't see the whole thing and is
1548 // unlikely to be able to do anything interesting here.
1549 if (isa<Instruction>(VecSrc) && !areInSameBB(VecSrc, IE))
1550 return false;
1551
1552 CurVal = VecSrc;
1553 EltsCovered.set(Idx->getZExtValue());
1554
1555 // All elements covered.
1556 if (EltsCovered.all())
1557 return true;
1558 }
1559
1560 // We either didn't find a single insertelement, or the insertelement chain
1561 // ended before all elements were covered. Check for other interesting values.
1562
1563 // Constants are always interesting because we can just constant fold the
1564 // extractelements.
1565 if (isa<Constant>(CurVal))
1566 return true;
1567
1568 // shufflevector is likely to be profitable if either operand is a constant,
1569 // or if either source is in the same block.
1570 // This is because shufflevector is most often lowered as a series of
1571 // insert/extract elements anyway.
1572 if (const auto *SV = dyn_cast<ShuffleVectorInst>(CurVal)) {
1573 return isa<Constant>(SV->getOperand(1)) ||
1574 areInSameBB(SV, SV->getOperand(0)) ||
1575 areInSameBB(SV, SV->getOperand(1));
1576 }
1577
1578 return false;
1579}
1580
1581static void collectPHINodes(const PHINode &I,
1582 SmallPtrSetImpl<const PHINode *> &SeenPHIs) {
1583 const auto [It, Inserted] = SeenPHIs.insert(&I);
1584 if (!Inserted)
1585 return;
1586
1587 for (const Value *Inc : I.incoming_values()) {
1588 if (const auto *PhiInc = dyn_cast<PHINode>(Inc))
1589 collectPHINodes(*PhiInc, SeenPHIs);
1590 }
1591
1592 for (const User *U : I.users()) {
1593 if (const auto *PhiU = dyn_cast<PHINode>(U))
1594 collectPHINodes(*PhiU, SeenPHIs);
1595 }
1596}
1597
1598bool AMDGPUCodeGenPrepareImpl::canBreakPHINode(const PHINode &I) {
1599 // Check in the cache first.
1600 if (const auto It = BreakPhiNodesCache.find(&I);
1601 It != BreakPhiNodesCache.end())
1602 return It->second;
1603
1604 // We consider PHI nodes as part of "chains", so given a PHI node I, we
1605 // recursively consider all its users and incoming values that are also PHI
1606 // nodes. We then make a decision about all of those PHIs at once. Either they
1607 // all get broken up, or none of them do. That way, we avoid cases where a
1608 // single PHI is/is not broken and we end up reforming/exploding a vector
1609 // multiple times, or even worse, doing it in a loop.
1610 SmallPtrSet<const PHINode *, 8> WorkList;
1611 collectPHINodes(I, WorkList);
1612
1613#ifndef NDEBUG
1614 // Check that none of the PHI nodes in the worklist are in the map. If some of
1615 // them are, it means we're not good enough at collecting related PHIs.
1616 for (const PHINode *WLP : WorkList) {
1617 assert(BreakPhiNodesCache.count(WLP) == 0);
1618 }
1619#endif
1620
1621 // To consider a PHI profitable to break, we need to see some interesting
1622 // incoming values. At least 2/3rd (rounded up) of all PHIs in the worklist
1623 // must have one to consider all PHIs breakable.
1624 //
1625 // This threshold has been determined through performance testing.
1626 //
1627 // Note that the computation below is equivalent to
1628 //
1629 // (unsigned)ceil((K / 3.0) * 2)
1630 //
1631 // It's simply written this way to avoid mixing integral/FP arithmetic.
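  //
  // For example, a worklist of 5 PHIs gives alignTo(10, 3) / 3 == 4.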
1632 const auto Threshold = (alignTo(WorkList.size() * 2, 3) / 3);
1633 unsigned NumBreakablePHIs = 0;
1634 bool CanBreak = false;
1635 for (const PHINode *Cur : WorkList) {
1636 // Don't break PHIs that have no interesting incoming values. That is, where
1637 // there is no clear opportunity to fold the "extractelement" instructions
1638 // we would add.
1639 //
1640 // Note: InstCombine does not run after this pass, so we're only interested in the
1641 // foldings that the DAG combiner can do.
1642 if (any_of(Cur->incoming_values(), isInterestingPHIIncomingValue)) {
1643 if (++NumBreakablePHIs >= Threshold) {
1644 CanBreak = true;
1645 break;
1646 }
1647 }
1648 }
1649
1650 for (const PHINode *Cur : WorkList)
1651 BreakPhiNodesCache[Cur] = CanBreak;
1652
1653 return CanBreak;
1654}
1655
1656/// Helper class for "break large PHIs" (visitPHINode).
1657///
1658/// This represents a slice of a PHI's incoming value, which is made up of:
1659/// - The type of the slice (Ty)
1660/// - The index in the incoming value's vector where the slice starts (Idx)
1661/// - The number of elements in the slice (NumElts).
1662/// It also keeps track of the NewPHI node inserted for this particular slice.
1663///
1664/// Slice examples:
1665/// <4 x i64> -> Split into four i64 slices.
1666/// -> [i64, 0, 1], [i64, 1, 1], [i64, 2, 1], [i64, 3, 1]
1667/// <5 x i16> -> Split into two <2 x i16> slices + an i16 tail.
1668/// -> [<2 x i16>, 0, 2], [<2 x i16>, 2, 2], [i16, 4, 1]
1669class VectorSlice {
1670public:
1671 VectorSlice(Type *Ty, unsigned Idx, unsigned NumElts)
1672 : Ty(Ty), Idx(Idx), NumElts(NumElts) {}
1673
1674 Type *Ty = nullptr;
1675 unsigned Idx = 0;
1676 unsigned NumElts = 0;
1677 PHINode *NewPHI = nullptr;
1678
1679 /// Slice \p Inc according to the information contained within this slice.
1680 /// This is cached, so if called multiple times for the same \p BB & \p Inc
1681 /// pair, it returns the same Sliced value as well.
1682 ///
1683 /// Note this *intentionally* does not return the same value for, say,
1684 /// [%bb.0, %0] & [%bb.1, %0] as:
1685  ///   - It could cause issues with dominance (e.g. if bb.1 is seen first, the
1686  ///     value created in bb.1 may not be reachable from bb.0 when bb.1 is
1687  ///     bb.0's predecessor.)
1688 /// - We also want to make our extract instructions as local as possible so
1689 /// the DAG has better chances of folding them out. Duplicating them like
1690 /// that is beneficial in that regard.
1691 ///
1692  /// This is both a minor optimization to avoid creating duplicate
1693  /// instructions and a requirement for correctness. It is not forbidden
1694 /// for a PHI node to have the same [BB, Val] pair multiple times. If we
1695 /// returned a new value each time, those previously identical pairs would all
1696 /// have different incoming values (from the same block) and it'd cause a "PHI
1697 /// node has multiple entries for the same basic block with different incoming
1698 /// values!" verifier error.
1699 Value *getSlicedVal(BasicBlock *BB, Value *Inc, StringRef NewValName) {
1700 Value *&Res = SlicedVals[{BB, Inc}];
1701 if (Res)
1702 return Res;
1703
1704    IRBuilder<> B(BB->getTerminator());
1705    if (Instruction *IncInst = dyn_cast<Instruction>(Inc))
1706 B.SetCurrentDebugLocation(IncInst->getDebugLoc());
1707
1708 if (NumElts > 1) {
1709      SmallVector<int, 4> Mask;
1710      for (unsigned K = Idx; K < (Idx + NumElts); ++K)
1711 Mask.push_back(K);
1712 Res = B.CreateShuffleVector(Inc, Mask, NewValName);
1713 } else
1714 Res = B.CreateExtractElement(Inc, Idx, NewValName);
1715
1716 return Res;
1717 }
1718
1719private:
1720  SmallDenseMap<std::pair<BasicBlock *, Value *>, Value *> SlicedVals;
1721};
1722
1723bool AMDGPUCodeGenPrepareImpl::visitPHINode(PHINode &I) {
1724  // Break up fixed-vector PHIs into smaller pieces.
1725  // The default threshold is 32, so any vector wider than 32 bits is broken up
1726  // into its elements, or into 32-bit pieces (for 8/16-bit elements).
1727 //
1728 // This is only helpful for DAGISel because it doesn't handle large PHIs as
1729 // well as GlobalISel. DAGISel lowers PHIs by using CopyToReg/CopyFromReg.
1730 // With large, odd-sized PHIs we may end up needing many `build_vector`
1731 // operations with most elements being "undef". This inhibits a lot of
1732 // optimization opportunities and can result in unreasonably high register
1733 // pressure and the inevitable stack spilling.
1734 if (!BreakLargePHIs || getCGPassBuilderOption().EnableGlobalISelOption)
1735 return false;
1736
1737 FixedVectorType *FVT = dyn_cast<FixedVectorType>(I.getType());
1738 if (!FVT || FVT->getNumElements() == 1 ||
1739 DL.getTypeSizeInBits(FVT) <= BreakLargePHIsThreshold)
1740 return false;
1741
1742 if (!ForceBreakLargePHIs && !canBreakPHINode(I))
1743 return false;
1744
1745 std::vector<VectorSlice> Slices;
1746
1747 Type *EltTy = FVT->getElementType();
1748 {
1749 unsigned Idx = 0;
1750    // For 8/16-bit element types, don't scalarize fully but break the vector up
1751    // into as many 32-bit slices as we can, and scalarize the tail.
1752 const unsigned EltSize = DL.getTypeSizeInBits(EltTy);
1753 const unsigned NumElts = FVT->getNumElements();
1754 if (EltSize == 8 || EltSize == 16) {
1755 const unsigned SubVecSize = (32 / EltSize);
1756 Type *SubVecTy = FixedVectorType::get(EltTy, SubVecSize);
1757 for (unsigned End = alignDown(NumElts, SubVecSize); Idx < End;
1758 Idx += SubVecSize)
1759 Slices.emplace_back(SubVecTy, Idx, SubVecSize);
1760 }
1761
1762 // Scalarize all remaining elements.
1763 for (; Idx < NumElts; ++Idx)
1764 Slices.emplace_back(EltTy, Idx, 1);
1765 }
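  // e.g. a <9 x i8> PHI (with the default 32-bit threshold) ends up as two
  // <4 x i8> slices at indices 0 and 4 plus a scalar i8 tail at index 8, while
  // a <4 x i64> PHI is fully scalarized into four i64 slices.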
1766
1767 assert(Slices.size() > 1);
1768
1769 // Create one PHI per vector piece. The "VectorSlice" class takes care of
1770 // creating the necessary instruction to extract the relevant slices of each
1771 // incoming value.
1772 IRBuilder<> B(I.getParent());
1773 B.SetCurrentDebugLocation(I.getDebugLoc());
1774
1775 unsigned IncNameSuffix = 0;
1776 for (VectorSlice &S : Slices) {
1777    // We need to reset the builder's insert point on each iteration, because
1778    // getSlicedVal may have inserted something into I's BB.
1779 B.SetInsertPoint(I.getParent()->getFirstNonPHIIt());
1780 S.NewPHI = B.CreatePHI(S.Ty, I.getNumIncomingValues());
1781
1782 for (const auto &[Idx, BB] : enumerate(I.blocks())) {
1783 S.NewPHI->addIncoming(S.getSlicedVal(BB, I.getIncomingValue(Idx),
1784 "largephi.extractslice" +
1785 std::to_string(IncNameSuffix++)),
1786 BB);
1787 }
1788 }
1789
1790 // And replace this PHI with a vector of all the previous PHI values.
1791 Value *Vec = PoisonValue::get(FVT);
1792 unsigned NameSuffix = 0;
1793 for (VectorSlice &S : Slices) {
1794 const auto ValName = "largephi.insertslice" + std::to_string(NameSuffix++);
1795 if (S.NumElts > 1)
1796 Vec = B.CreateInsertVector(FVT, Vec, S.NewPHI, S.Idx, ValName);
1797 else
1798 Vec = B.CreateInsertElement(Vec, S.NewPHI, S.Idx, ValName);
1799 }
1800
1801 I.replaceAllUsesWith(Vec);
1802 DeadVals.push_back(&I);
1803 return true;
1804}
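// Illustrative result for a <4 x i64> PHI with two predecessors: the original
// phi becomes four i64 phis, each fed by extractelements created in the
// corresponding predecessor, and the four results are reassembled for the
// original users with insertelement into a <4 x i64> vector seeded by poison.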
1805
1806/// \param V Value to check
1807/// \param DL DataLayout
1808/// \param TM TargetMachine (TODO: remove once DL contains nullptr values)
1809/// \param AS Target Address Space
1810/// \return true if \p V cannot be the null value of \p AS, false otherwise.
1811static bool isPtrKnownNeverNull(const Value *V, const DataLayout &DL,
1812 const AMDGPUTargetMachine &TM, unsigned AS) {
1813 // Pointer cannot be null if it's a block address, GV or alloca.
1814 // NOTE: We don't support extern_weak, but if we did, we'd need to check for
1815 // it as the symbol could be null in such cases.
1816  if (isa<BlockAddress, GlobalValue, AllocaInst>(V))
1817    return true;
1818
1819 // Check nonnull arguments.
1820 if (const auto *Arg = dyn_cast<Argument>(V); Arg && Arg->hasNonNullAttr())
1821 return true;
1822
1823 // Check nonnull loads.
1824 if (const auto *Load = dyn_cast<LoadInst>(V);
1825 Load && Load->hasMetadata(LLVMContext::MD_nonnull))
1826 return true;
1827
1828  // getUnderlyingObject may have looked through another addrspacecast, although
1829  // the optimizable situations have most likely been folded out by now.
1830 if (AS != cast<PointerType>(V->getType())->getAddressSpace())
1831 return false;
1832
1833 // TODO: Calls that return nonnull?
1834
1835 // For all other things, use KnownBits.
1836 // We either use 0 or all bits set to indicate null, so check whether the
1837 // value can be zero or all ones.
1838 //
1839 // TODO: Use ValueTracking's isKnownNeverNull if it becomes aware that some
1840 // address spaces have non-zero null values.
1841 auto SrcPtrKB = computeKnownBits(V, DL);
1842 const auto NullVal = TM.getNullPointerValue(AS);
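  // On AMDGPU the private and local address spaces use all-ones (-1) as their
  // null value, while flat/global use 0, so both encodings are handled below.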
1843
1844 assert(SrcPtrKB.getBitWidth() == DL.getPointerSizeInBits(AS));
1845 assert((NullVal == 0 || NullVal == -1) &&
1846 "don't know how to check for this null value!");
1847 return NullVal ? !SrcPtrKB.getMaxValue().isAllOnes() : SrcPtrKB.isNonZero();
1848}
1849
1850bool AMDGPUCodeGenPrepareImpl::visitAddrSpaceCastInst(AddrSpaceCastInst &I) {
1851  // The intrinsic doesn't support vectors. It also seems difficult to prove
1852  // that a vector cannot contain any null elements, so it's unclear whether
1853  // supporting vectors would be worthwhile.
1854 if (I.getType()->isVectorTy())
1855 return false;
1856
1857  // Check if this can be lowered to an amdgcn.addrspacecast.nonnull.
1858  // This is only worthwhile for casts between flat and private/local.
1859 const unsigned SrcAS = I.getSrcAddressSpace();
1860 const unsigned DstAS = I.getDestAddressSpace();
1861
1862 bool CanLower = false;
1863 if (SrcAS == AMDGPUAS::FLAT_ADDRESS)
1864 CanLower = (DstAS == AMDGPUAS::LOCAL_ADDRESS ||
1865 DstAS == AMDGPUAS::PRIVATE_ADDRESS);
1866 else if (DstAS == AMDGPUAS::FLAT_ADDRESS)
1867 CanLower = (SrcAS == AMDGPUAS::LOCAL_ADDRESS ||
1868 SrcAS == AMDGPUAS::PRIVATE_ADDRESS);
1869 if (!CanLower)
1870 return false;
1871
1872  SmallVector<const Value *, 4> WorkList;
1873  getUnderlyingObjects(I.getOperand(0), WorkList);
1874 if (!all_of(WorkList, [&](const Value *V) {
1875 return isPtrKnownNeverNull(V, DL, TM, SrcAS);
1876 }))
1877 return false;
1878
1879 IRBuilder<> B(&I);
1880 auto *Intrin = B.CreateIntrinsic(
1881 I.getType(), Intrinsic::amdgcn_addrspacecast_nonnull, {I.getOperand(0)});
1882 I.replaceAllUsesWith(Intrin);
1883 DeadVals.push_back(&I);
1884 return true;
1885}
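// Illustrative IR (hypothetical values): for a flat cast of a known non-null
// private pointer %p,
//   %flat = addrspacecast ptr addrspace(5) %p to ptr
// is rewritten into a call to llvm.amdgcn.addrspacecast.nonnull (mangled on
// the pointer types), which lets codegen drop the null check.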
1886
1887bool AMDGPUCodeGenPrepareImpl::visitIntrinsicInst(IntrinsicInst &I) {
1888 switch (I.getIntrinsicID()) {
1889 case Intrinsic::minnum:
1890 case Intrinsic::minimumnum:
1891 case Intrinsic::minimum:
1892 return visitFMinLike(I);
1893 case Intrinsic::sqrt:
1894 return visitSqrt(I);
1895 default:
1896 return false;
1897 }
1898}
1899
1900/// Match non-nan fract pattern.
1901/// minnum(fsub(x, floor(x)), nextafter(1.0, -1.0))
1902/// minimumnum(fsub(x, floor(x)), nextafter(1.0, -1.0))
1903/// minimum(fsub(x, floor(x)), nextafter(1.0, -1.0))
1904///
1905/// Only applies when fract is a useful instruction for the subtarget. Does not
1906/// account for nan handling; the instruction has a nan check on the input value.
1907Value *AMDGPUCodeGenPrepareImpl::matchFractPat(IntrinsicInst &I) {
1908 if (ST.hasFractBug())
1909 return nullptr;
1910
1911 Intrinsic::ID IID = I.getIntrinsicID();
1912
1913 // The value is only used in contexts where we know the input isn't a nan, so
1914 // any of the fmin variants are fine.
1915 if (IID != Intrinsic::minnum && IID != Intrinsic::minimum &&
1916 IID != Intrinsic::minimumnum)
1917 return nullptr;
1918
1919 Type *Ty = I.getType();
1920 if (!isLegalFloatingTy(Ty->getScalarType()))
1921 return nullptr;
1922
1923 Value *Arg0 = I.getArgOperand(0);
1924 Value *Arg1 = I.getArgOperand(1);
1925
1926 const APFloat *C;
1927 if (!match(Arg1, m_APFloat(C)))
1928 return nullptr;
1929
1930 APFloat One(1.0);
1931 bool LosesInfo;
1932 One.convert(C->getSemantics(), APFloat::rmNearestTiesToEven, &LosesInfo);
1933
1934 // Match nextafter(1.0, -1)
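  // (for f32 that constant is 0x3F7FFFFF, roughly 0.99999994, the largest
  // float strictly below 1.0)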
1935 One.next(true);
1936 if (One != *C)
1937 return nullptr;
1938
1939 Value *FloorSrc;
1940  if (match(Arg0, m_FSub(m_Value(FloorSrc),
1941                         m_Intrinsic<Intrinsic::floor>(m_Deferred(FloorSrc)))))
1942 return FloorSrc;
1943 return nullptr;
1944}
1945
1946Value *AMDGPUCodeGenPrepareImpl::applyFractPat(IRBuilder<> &Builder,
1947 Value *FractArg) {
1948 SmallVector<Value *, 4> FractVals;
1949 extractValues(Builder, FractVals, FractArg);
1950
1951 SmallVector<Value *, 4> ResultVals(FractVals.size());
1952
1953 Type *Ty = FractArg->getType()->getScalarType();
1954 for (unsigned I = 0, E = FractVals.size(); I != E; ++I) {
1955 ResultVals[I] =
1956 Builder.CreateIntrinsic(Intrinsic::amdgcn_fract, {Ty}, {FractVals[I]});
1957 }
1958
1959 return insertValues(Builder, FractArg->getType(), ResultVals);
1960}
1961
1962bool AMDGPUCodeGenPrepareImpl::visitFMinLike(IntrinsicInst &I) {
1963 Value *FractArg = matchFractPat(I);
1964 if (!FractArg)
1965 return false;
1966
1967  // Match the fract pattern in contexts where the nan check has been optimized
1968  // out (and hope the knowledge that the source can't be nan wasn't lost).
1969 if (!I.hasNoNaNs() && !isKnownNeverNaN(FractArg, SimplifyQuery(DL, TLI)))
1970 return false;
1971
1972 IRBuilder<> Builder(&I);
1973 FastMathFlags FMF = I.getFastMathFlags();
1974 FMF.setNoNaNs();
1975 Builder.setFastMathFlags(FMF);
1976
1977 Value *Fract = applyFractPat(Builder, FractArg);
1978 Fract->takeName(&I);
1979 I.replaceAllUsesWith(Fract);
1980 DeadVals.push_back(&I);
1981 return true;
1982}
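// Illustrative transform (hypothetical f32 IR with a no-nan input %x):
//   %f = call float @llvm.floor.f32(float %x)
//   %s = fsub float %x, %f
//   %m = call float @llvm.minnum.f32(float %s, float 0x3FEFFFFFE0000000)
// collapses into a single call to @llvm.amdgcn.fract.f32(float %x).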
1983
1984// Expand llvm.sqrt.f32 calls with !fpmath metadata in a semi-fast way.
1985bool AMDGPUCodeGenPrepareImpl::visitSqrt(IntrinsicInst &Sqrt) {
1986 Type *Ty = Sqrt.getType()->getScalarType();
1987 if (!Ty->isFloatTy() && (!Ty->isHalfTy() || ST.has16BitInsts()))
1988 return false;
1989
1990 const FPMathOperator *FPOp = cast<const FPMathOperator>(&Sqrt);
1991 FastMathFlags SqrtFMF = FPOp->getFastMathFlags();
1992
1993 // We're trying to handle the fast-but-not-that-fast case only. The lowering
1994 // of fast llvm.sqrt will give the raw instruction anyway.
1995 if (SqrtFMF.approxFunc())
1996 return false;
1997
1998 const float ReqdAccuracy = FPOp->getFPAccuracy();
1999
2000 // Defer correctly rounded expansion to codegen.
2001 if (ReqdAccuracy < 1.0f)
2002 return false;
2003
2004 Value *SrcVal = Sqrt.getOperand(0);
2005 bool CanTreatAsDAZ = canIgnoreDenormalInput(SrcVal, &Sqrt);
2006
2007 // The raw instruction is 1 ulp, but the correction for denormal handling
2008 // brings it to 2.
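  // So with only 1 ulp of slack, the raw instruction is usable only when the
  // input can be treated as DAZ; otherwise at least 2 ulp of slack is needed
  // for the denormal-correcting expansion below.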
2009 if (!CanTreatAsDAZ && ReqdAccuracy < 2.0f)
2010 return false;
2011
2012 IRBuilder<> Builder(&Sqrt);
2013  SmallVector<Value *, 4> SrcVals;
2014  extractValues(Builder, SrcVals, SrcVal);
2015
2016 SmallVector<Value *, 4> ResultVals(SrcVals.size());
2017 for (int I = 0, E = SrcVals.size(); I != E; ++I) {
2018 if (CanTreatAsDAZ)
2019 ResultVals[I] = Builder.CreateCall(getSqrtF32(), SrcVals[I]);
2020 else
2021 ResultVals[I] = emitSqrtIEEE2ULP(Builder, SrcVals[I], SqrtFMF);
2022 }
2023
2024 Value *NewSqrt = insertValues(Builder, Sqrt.getType(), ResultVals);
2025 NewSqrt->takeName(&Sqrt);
2026 Sqrt.replaceAllUsesWith(NewSqrt);
2027 DeadVals.push_back(&Sqrt);
2028 return true;
2029}
2030
2031bool AMDGPUCodeGenPrepare::runOnFunction(Function &F) {
2032 if (skipFunction(F))
2033 return false;
2034
2035 auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();
2036 if (!TPC)
2037 return false;
2038
2039 const AMDGPUTargetMachine &TM = TPC->getTM<AMDGPUTargetMachine>();
2040 const TargetLibraryInfo *TLI =
2041 &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F);
2042 AssumptionCache *AC =
2043 &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
2044 auto *DTWP = getAnalysisIfAvailable<DominatorTreeWrapperPass>();
2045 const DominatorTree *DT = DTWP ? &DTWP->getDomTree() : nullptr;
2046 const UniformityInfo &UA =
2047 getAnalysis<UniformityInfoWrapperPass>().getUniformityInfo();
2048 return AMDGPUCodeGenPrepareImpl(F, TM, TLI, AC, DT, UA).run();
2049}
2050
2051PreservedAnalyses AMDGPUCodeGenPreparePass::run(Function &F,
2052                                                FunctionAnalysisManager &FAM) {
2053  const AMDGPUTargetMachine &ATM = static_cast<const AMDGPUTargetMachine &>(TM);
2054 const TargetLibraryInfo *TLI = &FAM.getResult<TargetLibraryAnalysis>(F);
2055 AssumptionCache *AC = &FAM.getResult<AssumptionAnalysis>(F);
2056 const DominatorTree *DT = FAM.getCachedResult<DominatorTreeAnalysis>(F);
2057 const UniformityInfo &UA = FAM.getResult<UniformityInfoAnalysis>(F);
2058 AMDGPUCodeGenPrepareImpl Impl(F, ATM, TLI, AC, DT, UA);
2059 if (!Impl.run())
2060 return PreservedAnalyses::all();
2061  PreservedAnalyses PA = PreservedAnalyses::none();
2062  if (!Impl.FlowChanged)
2063    PA.preserveSet<CFGAnalyses>();
2064  return PA;
2065}
2066
2067INITIALIZE_PASS_BEGIN(AMDGPUCodeGenPrepare, DEBUG_TYPE,
2068 "AMDGPU IR optimizations", false, false)
2069INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
2070INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
2071INITIALIZE_PASS_DEPENDENCY(UniformityInfoWrapperPass)
2072INITIALIZE_PASS_END(AMDGPUCodeGenPrepare, DEBUG_TYPE, "AMDGPU IR optimizations",
2073                    false, false)
2074
2075char AMDGPUCodeGenPrepare::ID = 0;
2076
2077FunctionPass *llvm::createAMDGPUCodeGenPreparePass() {
2078  return new AMDGPUCodeGenPrepare();
2079}