LLVM 23.0.0git
AMDGPUCodeGenPrepare.cpp
Go to the documentation of this file.
1//===-- AMDGPUCodeGenPrepare.cpp ------------------------------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file
10/// This pass does misc. AMDGPU optimizations on IR before instruction
11/// selection.
12//
13//===----------------------------------------------------------------------===//
14
15#include "AMDGPU.h"
16#include "AMDGPUMemoryUtils.h"
17#include "AMDGPUTargetMachine.h"
19#include "llvm/ADT/SetVector.h"
27#include "llvm/IR/Dominators.h"
28#include "llvm/IR/IRBuilder.h"
29#include "llvm/IR/InstVisitor.h"
30#include "llvm/IR/IntrinsicsAMDGPU.h"
32#include "llvm/IR/ValueHandle.h"
34#include "llvm/Pass.h"
40
41#define DEBUG_TYPE "amdgpu-codegenprepare"
42
43using namespace llvm;
44using namespace llvm::PatternMatch;
45
46namespace {
47
49 "amdgpu-codegenprepare-widen-constant-loads",
50 cl::desc("Widen sub-dword constant address space loads in AMDGPUCodeGenPrepare"),
52 cl::init(false));
53
54static cl::opt<bool>
55 BreakLargePHIs("amdgpu-codegenprepare-break-large-phis",
56 cl::desc("Break large PHI nodes for DAGISel"),
58
59static cl::opt<bool>
60 ForceBreakLargePHIs("amdgpu-codegenprepare-force-break-large-phis",
61 cl::desc("For testing purposes, always break large "
62 "PHIs even if it isn't profitable."),
64
65static cl::opt<unsigned> BreakLargePHIsThreshold(
66 "amdgpu-codegenprepare-break-large-phis-threshold",
67 cl::desc("Minimum type size in bits for breaking large PHI nodes"),
69
70static cl::opt<bool> UseMul24Intrin(
71 "amdgpu-codegenprepare-mul24",
72 cl::desc("Introduce mul24 intrinsics in AMDGPUCodeGenPrepare"),
74 cl::init(true));
75
76// Legalize 64-bit division by using the generic IR expansion.
77static cl::opt<bool> ExpandDiv64InIR(
78 "amdgpu-codegenprepare-expand-div64",
79 cl::desc("Expand 64-bit division in AMDGPUCodeGenPrepare"),
81 cl::init(false));
82
83// Leave all division operations as they are. This supersedes ExpandDiv64InIR
84// and is used for testing the legalizer.
85static cl::opt<bool> DisableIDivExpand(
86 "amdgpu-codegenprepare-disable-idiv-expansion",
87 cl::desc("Prevent expanding integer division in AMDGPUCodeGenPrepare"),
89 cl::init(false));
90
91// Disable processing of fdiv so we can better test the backend implementations.
92static cl::opt<bool> DisableFDivExpand(
93 "amdgpu-codegenprepare-disable-fdiv-expansion",
94 cl::desc("Prevent expanding floating point division in AMDGPUCodeGenPrepare"),
96 cl::init(false));
97
98class AMDGPUCodeGenPrepareImpl
99 : public InstVisitor<AMDGPUCodeGenPrepareImpl, bool> {
100public:
101 Function &F;
102 const GCNSubtarget &ST;
103 const AMDGPUTargetMachine &TM;
104 const TargetLibraryInfo *TLI;
105 const UniformityInfo &UA;
106 const DataLayout &DL;
107 SimplifyQuery SQ;
108 const bool HasFP32DenormalFlush;
109 bool FlowChanged = false;
110 mutable Function *SqrtF32 = nullptr;
111 mutable Function *LdexpF32 = nullptr;
112 mutable SmallVector<WeakVH> DeadVals;
113
114 DenseMap<const PHINode *, bool> BreakPhiNodesCache;
115
116 AMDGPUCodeGenPrepareImpl(Function &F, const AMDGPUTargetMachine &TM,
117 const TargetLibraryInfo *TLI, AssumptionCache *AC,
118 const DominatorTree *DT, const UniformityInfo &UA)
119 : F(F), ST(TM.getSubtarget<GCNSubtarget>(F)), TM(TM), TLI(TLI), UA(UA),
120 DL(F.getDataLayout()), SQ(DL, TLI, DT, AC),
121 HasFP32DenormalFlush(SIModeRegisterDefaults(F, ST).FP32Denormals ==
123
124 Function *getSqrtF32() const {
125 if (SqrtF32)
126 return SqrtF32;
127
128 LLVMContext &Ctx = F.getContext();
130 F.getParent(), Intrinsic::amdgcn_sqrt, {Type::getFloatTy(Ctx)});
131 return SqrtF32;
132 }
133
134 Function *getLdexpF32() const {
135 if (LdexpF32)
136 return LdexpF32;
137
138 LLVMContext &Ctx = F.getContext();
140 F.getParent(), Intrinsic::ldexp,
141 {Type::getFloatTy(Ctx), Type::getInt32Ty(Ctx)});
142 return LdexpF32;
143 }
144
145 bool canBreakPHINode(const PHINode &I);
146
147 /// Return true if \p T is a legal scalar floating point type.
148 bool isLegalFloatingTy(const Type *T) const;
149
150 /// Wrapper to pass all the arguments to computeKnownFPClass
152 const Instruction *CtxI) const {
153 return llvm::computeKnownFPClass(V, Interested,
154 SQ.getWithInstruction(CtxI));
155 }
156
157 bool canIgnoreDenormalInput(const Value *V, const Instruction *CtxI) const {
158 return HasFP32DenormalFlush ||
160 }
161
162 /// \returns The minimum number of bits needed to store the value of \Op as an
163 /// unsigned integer. Truncating to this size and then zero-extending to
164 /// the original will not change the value.
165 unsigned numBitsUnsigned(Value *Op, const Instruction *CtxI) const;
166
167 /// \returns The minimum number of bits needed to store the value of \Op as a
168 /// signed integer. Truncating to this size and then sign-extending to
169 /// the original size will not change the value.
170 unsigned numBitsSigned(Value *Op, const Instruction *CtxI) const;
171
172 /// Replace mul instructions with llvm.amdgcn.mul.u24 or llvm.amdgcn.mul.s24.
173 /// SelectionDAG has an issue where an and asserting the bits are known
174 bool replaceMulWithMul24(BinaryOperator &I) const;
175
176 /// Perform same function as equivalently named function in DAGCombiner. Since
177 /// we expand some divisions here, we need to perform this before obscuring.
178 bool foldBinOpIntoSelect(BinaryOperator &I) const;
179
180 bool divHasSpecialOptimization(BinaryOperator &I,
181 Value *Num, Value *Den) const;
182 unsigned getDivNumBits(BinaryOperator &I, Value *Num, Value *Den,
183 unsigned MaxDivBits, bool Signed) const;
184
185 /// Expands div or rem by using floating-point operations.
186 /// Operands must be in the range [-0x400000,0x3FFFFF]
187 Value *expandDivRemToFloat(IRBuilder<> &Builder, BinaryOperator &I,
188 Value *Num, Value *Den, bool IsDiv,
189 bool IsSigned) const;
190
191 Value *expandDivRemToFloatImpl(IRBuilder<> &Builder, BinaryOperator &I,
192 Value *Num, Value *Den, unsigned NumBits,
193 bool IsDiv, bool IsSigned) const;
194
195 /// Expands 32 bit div or rem.
196 Value* expandDivRem32(IRBuilder<> &Builder, BinaryOperator &I,
197 Value *Num, Value *Den) const;
198
199 Value *shrinkDivRem64(IRBuilder<> &Builder, BinaryOperator &I,
200 Value *Num, Value *Den) const;
201 void expandDivRem64(BinaryOperator &I) const;
202
203 /// Widen a scalar load.
204 ///
205 /// \details \p Widen scalar load for uniform, small type loads from constant
206 // memory / to a full 32-bits and then truncate the input to allow a scalar
207 // load instead of a vector load.
208 //
209 /// \returns True.
210
211 bool canWidenScalarExtLoad(LoadInst &I) const;
212
213 Value *matchFractPatImpl(Value &V, const APFloat &C) const;
214 Value *matchFractPatNanAvoidant(Value &V);
215 Value *applyFractPat(IRBuilder<> &Builder, Value *FractArg);
216
217 bool canOptimizeWithRsq(FastMathFlags DivFMF, FastMathFlags SqrtFMF) const;
218
219 Value *optimizeWithRsq(IRBuilder<> &Builder, Value *Num, Value *Den,
220 FastMathFlags DivFMF, FastMathFlags SqrtFMF,
221 const Instruction *CtxI) const;
222
223 Value *optimizeWithRcp(IRBuilder<> &Builder, Value *Num, Value *Den,
224 FastMathFlags FMF, const Instruction *CtxI) const;
225 Value *optimizeWithFDivFast(IRBuilder<> &Builder, Value *Num, Value *Den,
226 float ReqdAccuracy) const;
227
228 Value *visitFDivElement(IRBuilder<> &Builder, Value *Num, Value *Den,
229 FastMathFlags DivFMF, FastMathFlags SqrtFMF,
230 Value *RsqOp, const Instruction *FDiv,
231 float ReqdAccuracy) const;
232
233 std::pair<Value *, Value *> getFrexpResults(IRBuilder<> &Builder,
234 Value *Src) const;
235
236 Value *emitRcpIEEE1ULP(IRBuilder<> &Builder, Value *Src,
237 bool IsNegative) const;
238 Value *emitFrexpDiv(IRBuilder<> &Builder, Value *LHS, Value *RHS,
239 FastMathFlags FMF) const;
240 Value *emitSqrtIEEE2ULP(IRBuilder<> &Builder, Value *Src,
241 FastMathFlags FMF) const;
242 Value *emitRsqF64(IRBuilder<> &Builder, Value *X, FastMathFlags SqrtFMF,
243 FastMathFlags DivFMF, const Instruction *CtxI,
244 bool IsNegative) const;
245
246 CallInst *createWorkitemIdX(IRBuilder<> &B) const;
247 void replaceWithWorkitemIdX(Instruction &I) const;
248 void replaceWithMaskedWorkitemIdX(Instruction &I, unsigned WaveSize) const;
249 bool tryReplaceWithWorkitemId(Instruction &I, unsigned Wave) const;
250
251 bool tryNarrowMathIfNoOverflow(Instruction *I);
252
253public:
254 bool visitFDiv(BinaryOperator &I);
255
256 bool visitInstruction(Instruction &I) { return false; }
257 bool visitBinaryOperator(BinaryOperator &I);
258 bool visitLoadInst(LoadInst &I);
259 bool visitSelectInst(SelectInst &I);
260 bool visitPHINode(PHINode &I);
261 bool visitAddrSpaceCastInst(AddrSpaceCastInst &I);
262
263 bool visitIntrinsicInst(IntrinsicInst &I);
264 bool visitFMinLike(IntrinsicInst &I);
265 bool visitSqrt(IntrinsicInst &I);
266 bool visitLog(FPMathOperator &Log, Intrinsic::ID IID);
267 bool visitMbcntLo(IntrinsicInst &I) const;
268 bool visitMbcntHi(IntrinsicInst &I) const;
269 bool visitVectorReduceAdd(IntrinsicInst &I);
270 bool visitSaturatingAdd(IntrinsicInst &I);
271 bool run();
272};
273
274class AMDGPUCodeGenPrepare : public FunctionPass {
275public:
276 static char ID;
277 AMDGPUCodeGenPrepare() : FunctionPass(ID) {}
278 void getAnalysisUsage(AnalysisUsage &AU) const override {
282
283 // FIXME: Division expansion needs to preserve the dominator tree.
284 if (!ExpandDiv64InIR)
285 AU.setPreservesAll();
286 }
287 bool runOnFunction(Function &F) override;
288 StringRef getPassName() const override { return "AMDGPU IR optimizations"; }
289};
290
291} // end anonymous namespace
292
293bool AMDGPUCodeGenPrepareImpl::run() {
294 BreakPhiNodesCache.clear();
295 bool MadeChange = false;
296
297 // Need to use make_early_inc_range because integer division expansion is
298 // handled by Transform/Utils, and it can delete instructions such as the
299 // terminator of the BB.
300 for (BasicBlock &BB : reverse(F)) {
301 for (Instruction &I : make_early_inc_range(reverse(BB))) {
302 if (!isInstructionTriviallyDead(&I, TLI))
303 MadeChange |= visit(I);
304 }
305 }
306
307 while (!DeadVals.empty()) {
308 if (auto *I = dyn_cast_or_null<Instruction>(DeadVals.pop_back_val()))
310 }
311
312 return MadeChange;
313}
314
315bool AMDGPUCodeGenPrepareImpl::isLegalFloatingTy(const Type *Ty) const {
316 return Ty->isFloatTy() || Ty->isDoubleTy() ||
317 (Ty->isHalfTy() && ST.has16BitInsts());
318}
319
320bool AMDGPUCodeGenPrepareImpl::canWidenScalarExtLoad(LoadInst &I) const {
321 Type *Ty = I.getType();
322 int TySize = DL.getTypeSizeInBits(Ty);
323 Align Alignment = DL.getValueOrABITypeAlignment(I.getAlign(), Ty);
324
325 return I.isSimple() && TySize < 32 && Alignment >= 4 && UA.isUniformAtDef(&I);
326}
327
328unsigned
329AMDGPUCodeGenPrepareImpl::numBitsUnsigned(Value *Op,
330 const Instruction *CtxI) const {
331 return computeKnownBits(Op, SQ.getWithInstruction(CtxI)).countMaxActiveBits();
332}
333
334unsigned
335AMDGPUCodeGenPrepareImpl::numBitsSigned(Value *Op,
336 const Instruction *CtxI) const {
337 return ComputeMaxSignificantBits(Op, SQ.DL, SQ.AC, CtxI, SQ.DT);
338}
339
340static void extractValues(IRBuilder<> &Builder,
341 SmallVectorImpl<Value *> &Values, Value *V) {
342 auto *VT = dyn_cast<FixedVectorType>(V->getType());
343 if (!VT) {
344 Values.push_back(V);
345 return;
346 }
347
348 for (int I = 0, E = VT->getNumElements(); I != E; ++I)
349 Values.push_back(Builder.CreateExtractElement(V, I));
350}
351
353 Type *Ty,
354 SmallVectorImpl<Value *> &Values) {
355 if (!Ty->isVectorTy()) {
356 assert(Values.size() == 1);
357 return Values[0];
358 }
359
360 Value *NewVal = PoisonValue::get(Ty);
361 for (int I = 0, E = Values.size(); I != E; ++I)
362 NewVal = Builder.CreateInsertElement(NewVal, Values[I], I);
363
364 return NewVal;
365}
366
367bool AMDGPUCodeGenPrepareImpl::replaceMulWithMul24(BinaryOperator &I) const {
368 if (I.getOpcode() != Instruction::Mul)
369 return false;
370
371 Type *Ty = I.getType();
372 unsigned Size = Ty->getScalarSizeInBits();
373 if (Size <= 16 && ST.has16BitInsts())
374 return false;
375
376 // Prefer scalar if this could be s_mul_i32
377 if (UA.isUniformAtDef(&I))
378 return false;
379
380 Value *LHS = I.getOperand(0);
381 Value *RHS = I.getOperand(1);
382 IRBuilder<> Builder(&I);
383 Builder.SetCurrentDebugLocation(I.getDebugLoc());
384
385 unsigned LHSBits = 0, RHSBits = 0;
386 bool IsSigned = false;
387
388 if (ST.hasMulU24() && (LHSBits = numBitsUnsigned(LHS, &I)) <= 24 &&
389 (RHSBits = numBitsUnsigned(RHS, &I)) <= 24) {
390 IsSigned = false;
391
392 } else if (ST.hasMulI24() && (LHSBits = numBitsSigned(LHS, &I)) <= 24 &&
393 (RHSBits = numBitsSigned(RHS, &I)) <= 24) {
394 IsSigned = true;
395
396 } else
397 return false;
398
399 SmallVector<Value *, 4> LHSVals;
400 SmallVector<Value *, 4> RHSVals;
401 SmallVector<Value *, 4> ResultVals;
402 extractValues(Builder, LHSVals, LHS);
403 extractValues(Builder, RHSVals, RHS);
404
405 IntegerType *I32Ty = Builder.getInt32Ty();
406 IntegerType *IntrinTy = Size > 32 ? Builder.getInt64Ty() : I32Ty;
407 Type *DstTy = LHSVals[0]->getType();
408
409 for (int I = 0, E = LHSVals.size(); I != E; ++I) {
410 Value *LHS = IsSigned ? Builder.CreateSExtOrTrunc(LHSVals[I], I32Ty)
411 : Builder.CreateZExtOrTrunc(LHSVals[I], I32Ty);
412 Value *RHS = IsSigned ? Builder.CreateSExtOrTrunc(RHSVals[I], I32Ty)
413 : Builder.CreateZExtOrTrunc(RHSVals[I], I32Ty);
415 IsSigned ? Intrinsic::amdgcn_mul_i24 : Intrinsic::amdgcn_mul_u24;
416 Value *Result = Builder.CreateIntrinsic(ID, {IntrinTy}, {LHS, RHS});
417 Result = IsSigned ? Builder.CreateSExtOrTrunc(Result, DstTy)
418 : Builder.CreateZExtOrTrunc(Result, DstTy);
419 ResultVals.push_back(Result);
420 }
421
422 Value *NewVal = insertValues(Builder, Ty, ResultVals);
423 NewVal->takeName(&I);
424 I.replaceAllUsesWith(NewVal);
425 DeadVals.push_back(&I);
426
427 return true;
428}
429
430// Find a select instruction, which may have been casted. This is mostly to deal
431// with cases where i16 selects were promoted here to i32.
433 Cast = nullptr;
434 if (SelectInst *Sel = dyn_cast<SelectInst>(V))
435 return Sel;
436
437 if ((Cast = dyn_cast<CastInst>(V))) {
438 if (SelectInst *Sel = dyn_cast<SelectInst>(Cast->getOperand(0)))
439 return Sel;
440 }
441
442 return nullptr;
443}
444
445bool AMDGPUCodeGenPrepareImpl::foldBinOpIntoSelect(BinaryOperator &BO) const {
446 // Don't do this unless the old select is going away. We want to eliminate the
447 // binary operator, not replace a binop with a select.
448 int SelOpNo = 0;
449
450 CastInst *CastOp;
451
452 // TODO: Should probably try to handle some cases with multiple
453 // users. Duplicating the select may be profitable for division.
454 SelectInst *Sel = findSelectThroughCast(BO.getOperand(0), CastOp);
455 if (!Sel || !Sel->hasOneUse()) {
456 SelOpNo = 1;
457 Sel = findSelectThroughCast(BO.getOperand(1), CastOp);
458 }
459
460 if (!Sel || !Sel->hasOneUse())
461 return false;
462
465 Constant *CBO = dyn_cast<Constant>(BO.getOperand(SelOpNo ^ 1));
466 if (!CBO || !CT || !CF)
467 return false;
468
469 if (CastOp) {
470 if (!CastOp->hasOneUse())
471 return false;
472 CT = ConstantFoldCastOperand(CastOp->getOpcode(), CT, BO.getType(), DL);
473 CF = ConstantFoldCastOperand(CastOp->getOpcode(), CF, BO.getType(), DL);
474 }
475
476 // TODO: Handle special 0/-1 cases DAG combine does, although we only really
477 // need to handle divisions here.
478 Constant *FoldedT =
479 SelOpNo ? ConstantFoldBinaryOpOperands(BO.getOpcode(), CBO, CT, DL)
480 : ConstantFoldBinaryOpOperands(BO.getOpcode(), CT, CBO, DL);
481 if (!FoldedT || isa<ConstantExpr>(FoldedT))
482 return false;
483
484 Constant *FoldedF =
485 SelOpNo ? ConstantFoldBinaryOpOperands(BO.getOpcode(), CBO, CF, DL)
486 : ConstantFoldBinaryOpOperands(BO.getOpcode(), CF, CBO, DL);
487 if (!FoldedF || isa<ConstantExpr>(FoldedF))
488 return false;
489
490 IRBuilder<> Builder(&BO);
491 Builder.SetCurrentDebugLocation(BO.getDebugLoc());
492 if (const FPMathOperator *FPOp = dyn_cast<const FPMathOperator>(&BO))
493 Builder.setFastMathFlags(FPOp->getFastMathFlags());
494
495 Value *NewSelect = Builder.CreateSelect(Sel->getCondition(),
496 FoldedT, FoldedF);
497 NewSelect->takeName(&BO);
498 BO.replaceAllUsesWith(NewSelect);
499 DeadVals.push_back(&BO);
500 if (CastOp)
501 DeadVals.push_back(CastOp);
502 DeadVals.push_back(Sel);
503 return true;
504}
505
506std::pair<Value *, Value *>
507AMDGPUCodeGenPrepareImpl::getFrexpResults(IRBuilder<> &Builder,
508 Value *Src) const {
509 Type *Ty = Src->getType();
510 Value *Frexp = Builder.CreateIntrinsic(Intrinsic::frexp,
511 {Ty, Builder.getInt32Ty()}, Src);
512 Value *FrexpMant = Builder.CreateExtractValue(Frexp, {0});
513
514 // Bypass the bug workaround for the exponent result since it doesn't matter.
515 // TODO: Does the bug workaround even really need to consider the exponent
516 // result? It's unspecified by the spec.
517
518 Value *FrexpExp =
519 ST.hasFractBug()
520 ? Builder.CreateIntrinsic(Intrinsic::amdgcn_frexp_exp,
521 {Builder.getInt32Ty(), Ty}, Src)
522 : Builder.CreateExtractValue(Frexp, {1});
523 return {FrexpMant, FrexpExp};
524}
525
526/// Emit an expansion of 1.0 / Src good for 1ulp that supports denormals.
527Value *AMDGPUCodeGenPrepareImpl::emitRcpIEEE1ULP(IRBuilder<> &Builder,
528 Value *Src,
529 bool IsNegative) const {
530 // Same as for 1.0, but expand the sign out of the constant.
531 // -1.0 / x -> rcp (fneg x)
532 if (IsNegative)
533 Src = Builder.CreateFNeg(Src);
534
535 // The rcp instruction doesn't support denormals, so scale the input
536 // out of the denormal range and convert at the end.
537 //
538 // Expand as 2^-n * (1.0 / (x * 2^n))
539
540 // TODO: Skip scaling if input is known never denormal and the input
541 // range won't underflow to denormal. The hard part is knowing the
542 // result. We need a range check, the result could be denormal for
543 // 0x1p+126 < den <= 0x1p+127.
544 auto [FrexpMant, FrexpExp] = getFrexpResults(Builder, Src);
545 Value *ScaleFactor = Builder.CreateNeg(FrexpExp);
546 Value *Rcp = Builder.CreateUnaryIntrinsic(Intrinsic::amdgcn_rcp, FrexpMant);
547 return Builder.CreateCall(getLdexpF32(), {Rcp, ScaleFactor});
548}
549
550/// Emit a 2ulp expansion for fdiv by using frexp for input scaling.
551Value *AMDGPUCodeGenPrepareImpl::emitFrexpDiv(IRBuilder<> &Builder, Value *LHS,
552 Value *RHS,
553 FastMathFlags FMF) const {
554 // If we have have to work around the fract/frexp bug, we're worse off than
555 // using the fdiv.fast expansion. The full safe expansion is faster if we have
556 // fast FMA.
557 if (HasFP32DenormalFlush && ST.hasFractBug() && !ST.hasFastFMAF32() &&
558 (!FMF.noNaNs() || !FMF.noInfs()))
559 return nullptr;
560
561 // We're scaling the LHS to avoid a denormal input, and scale the denominator
562 // to avoid large values underflowing the result.
563 auto [FrexpMantRHS, FrexpExpRHS] = getFrexpResults(Builder, RHS);
564
565 Value *Rcp =
566 Builder.CreateUnaryIntrinsic(Intrinsic::amdgcn_rcp, FrexpMantRHS);
567
568 auto [FrexpMantLHS, FrexpExpLHS] = getFrexpResults(Builder, LHS);
569 Value *Mul = Builder.CreateFMul(FrexpMantLHS, Rcp);
570
571 // We multiplied by 2^N/2^M, so we need to multiply by 2^(N-M) to scale the
572 // result.
573 Value *ExpDiff = Builder.CreateSub(FrexpExpLHS, FrexpExpRHS);
574 return Builder.CreateCall(getLdexpF32(), {Mul, ExpDiff});
575}
576
577/// Emit a sqrt that handles denormals and is accurate to 2ulp.
578Value *AMDGPUCodeGenPrepareImpl::emitSqrtIEEE2ULP(IRBuilder<> &Builder,
579 Value *Src,
580 FastMathFlags FMF) const {
581 Type *Ty = Src->getType();
582 APFloat SmallestNormal =
584 Value *NeedScale =
585 Builder.CreateFCmpOLT(Src, ConstantFP::get(Ty, SmallestNormal));
586
587 ConstantInt *Zero = Builder.getInt32(0);
588 Value *InputScaleFactor =
589 Builder.CreateSelect(NeedScale, Builder.getInt32(32), Zero);
590
591 Value *Scaled = Builder.CreateCall(getLdexpF32(), {Src, InputScaleFactor});
592
593 Value *Sqrt = Builder.CreateCall(getSqrtF32(), Scaled);
594
595 Value *OutputScaleFactor =
596 Builder.CreateSelect(NeedScale, Builder.getInt32(-16), Zero);
597 return Builder.CreateCall(getLdexpF32(), {Sqrt, OutputScaleFactor});
598}
599
600/// Emit an expansion of 1.0 / sqrt(Src) good for 1ulp that supports denormals.
601static Value *emitRsqIEEE1ULP(IRBuilder<> &Builder, Value *Src,
602 bool IsNegative) {
603 // bool need_scale = x < 0x1p-126f;
604 // float input_scale = need_scale ? 0x1.0p+24f : 1.0f;
605 // float output_scale = need_scale ? 0x1.0p+12f : 1.0f;
606 // rsq(x * input_scale) * output_scale;
607
608 Type *Ty = Src->getType();
609 APFloat SmallestNormal =
610 APFloat::getSmallestNormalized(Ty->getFltSemantics());
611 Value *NeedScale =
612 Builder.CreateFCmpOLT(Src, ConstantFP::get(Ty, SmallestNormal));
613 Constant *One = ConstantFP::get(Ty, 1.0);
614 Constant *InputScale = ConstantFP::get(Ty, 0x1.0p+24);
615 Constant *OutputScale =
616 ConstantFP::get(Ty, IsNegative ? -0x1.0p+12 : 0x1.0p+12);
617
618 Value *InputScaleFactor = Builder.CreateSelect(NeedScale, InputScale, One);
619
620 Value *ScaledInput = Builder.CreateFMul(Src, InputScaleFactor);
621 Value *Rsq = Builder.CreateUnaryIntrinsic(Intrinsic::amdgcn_rsq, ScaledInput);
622 Value *OutputScaleFactor = Builder.CreateSelect(
623 NeedScale, OutputScale, IsNegative ? ConstantFP::get(Ty, -1.0) : One);
624
625 return Builder.CreateFMul(Rsq, OutputScaleFactor);
626}
627
628/// Emit inverse sqrt expansion for f64 with a correction sequence on top of
629/// v_rsq_f64. This should give a 1ulp result.
630Value *AMDGPUCodeGenPrepareImpl::emitRsqF64(IRBuilder<> &Builder, Value *X,
631 FastMathFlags SqrtFMF,
632 FastMathFlags DivFMF,
633 const Instruction *CtxI,
634 bool IsNegative) const {
635 // rsq(x):
636 // double y0 = BUILTIN_AMDGPU_RSQRT_F64(x);
637 // double e = MATH_MAD(-y0 * (x == PINF_F64 || x == 0.0 ? y0 : x), y0, 1.0);
638 // return MATH_MAD(y0*e, MATH_MAD(e, 0.375, 0.5), y0);
639 //
640 // -rsq(x):
641 // double y0 = BUILTIN_AMDGPU_RSQRT_F64(x);
642 // double e = MATH_MAD(-y0 * (x == PINF_F64 || x == 0.0 ? y0 : x), y0, 1.0);
643 // return MATH_MAD(-y0*e, MATH_MAD(e, 0.375, 0.5), -y0);
644 //
645 // The rsq instruction handles the special cases correctly. We need to check
646 // for the edge case conditions to ensure the special case propagates through
647 // the later instructions.
648
649 Value *Y0 = Builder.CreateUnaryIntrinsic(Intrinsic::amdgcn_rsq, X);
650
651 // Try to elide the edge case check.
652 //
653 // Fast math flags imply:
654 // sqrt ninf => !isinf(x)
655 // fdiv ninf => x != 0, !isinf(x)
656 bool MaybePosInf = !SqrtFMF.noInfs() && !DivFMF.noInfs();
657 bool MaybeZero = !DivFMF.noInfs();
658
659 DenormalMode DenormMode;
660 FPClassTest Interested = fcNone;
661 if (MaybePosInf)
662 Interested = fcPosInf;
663 if (MaybeZero)
664 Interested |= fcZero;
665
666 if (Interested != fcNone) {
667 KnownFPClass KnownSrc = computeKnownFPClass(X, Interested, CtxI);
668 if (KnownSrc.isKnownNeverPosInfinity())
669 MaybePosInf = false;
670
671 DenormMode = F.getDenormalMode(X->getType()->getFltSemantics());
672 if (KnownSrc.isKnownNeverLogicalZero(DenormMode))
673 MaybeZero = false;
674 }
675
676 Value *SpecialOrRsq = X;
677 if (MaybeZero || MaybePosInf) {
678 Value *Cond;
679 if (MaybePosInf && MaybeZero) {
680 if (DenormMode.Input != DenormalMode::DenormalModeKind::Dynamic) {
681 FPClassTest TestMask = fcPosInf | fcZero;
682 if (DenormMode.inputsAreZero())
683 TestMask |= fcSubnormal;
684
685 Cond = Builder.createIsFPClass(X, TestMask);
686 } else {
687 // Avoid using llvm.is.fpclass for dynamic denormal mode, since it
688 // doesn't respect the floating-point environment.
689 Value *IsZero =
690 Builder.CreateFCmpOEQ(X, ConstantFP::getZero(X->getType()));
691 Value *IsInf =
692 Builder.CreateFCmpOEQ(X, ConstantFP::getInfinity(X->getType()));
693 Cond = Builder.CreateOr(IsZero, IsInf);
694 }
695 } else if (MaybeZero) {
696 Cond = Builder.CreateFCmpOEQ(X, ConstantFP::getZero(X->getType()));
697 } else {
698 Cond = Builder.CreateFCmpOEQ(X, ConstantFP::getInfinity(X->getType()));
699 }
700
701 SpecialOrRsq = Builder.CreateSelect(Cond, Y0, X);
702 }
703
704 Value *NegY0 = Builder.CreateFNeg(Y0);
705 Value *NegXY0 = Builder.CreateFMul(SpecialOrRsq, NegY0);
706
707 // Could be fmuladd, but isFMAFasterThanFMulAndFAdd is always true for f64.
708 Value *E = Builder.CreateFMA(NegXY0, Y0, ConstantFP::get(X->getType(), 1.0));
709
710 Value *Y0E = Builder.CreateFMul(E, IsNegative ? NegY0 : Y0);
711
712 Value *EFMA = Builder.CreateFMA(E, ConstantFP::get(X->getType(), 0.375),
713 ConstantFP::get(X->getType(), 0.5));
714
715 return Builder.CreateFMA(Y0E, EFMA, IsNegative ? NegY0 : Y0);
716}
717
718bool AMDGPUCodeGenPrepareImpl::canOptimizeWithRsq(FastMathFlags DivFMF,
719 FastMathFlags SqrtFMF) const {
720 // The rsqrt contraction increases accuracy from ~2ulp to ~1ulp for f32 and
721 // f64.
722 return DivFMF.allowContract() && SqrtFMF.allowContract();
723}
724
725Value *AMDGPUCodeGenPrepareImpl::optimizeWithRsq(
726 IRBuilder<> &Builder, Value *Num, Value *Den, const FastMathFlags DivFMF,
727 const FastMathFlags SqrtFMF, const Instruction *CtxI) const {
728 // The rsqrt contraction increases accuracy from ~2ulp to ~1ulp.
729 assert(DivFMF.allowContract() && SqrtFMF.allowContract());
730
731 // rsq_f16 is accurate to 0.51 ulp.
732 // rsq_f32 is accurate for !fpmath >= 1.0ulp and denormals are flushed.
733 // rsq_f64 is never accurate.
734 const ConstantFP *CLHS = dyn_cast<ConstantFP>(Num);
735 if (!CLHS)
736 return nullptr;
737
738 bool IsNegative = false;
739
740 // TODO: Handle other numerator values with arcp.
741 if (CLHS->isExactlyValue(1.0) || (IsNegative = CLHS->isExactlyValue(-1.0))) {
742 // Add in the sqrt flags.
743 IRBuilder<>::FastMathFlagGuard Guard(Builder);
744 Builder.setFastMathFlags(DivFMF | SqrtFMF);
745
746 if (Den->getType()->isFloatTy()) {
747 if ((DivFMF.approxFunc() && SqrtFMF.approxFunc()) ||
748 canIgnoreDenormalInput(Den, CtxI)) {
749 Value *Result =
750 Builder.CreateUnaryIntrinsic(Intrinsic::amdgcn_rsq, Den);
751 // -1.0 / sqrt(x) -> fneg(rsq(x))
752 return IsNegative ? Builder.CreateFNeg(Result) : Result;
753 }
754
755 return emitRsqIEEE1ULP(Builder, Den, IsNegative);
756 }
757
758 if (Den->getType()->isDoubleTy())
759 return emitRsqF64(Builder, Den, SqrtFMF, DivFMF, CtxI, IsNegative);
760 }
761
762 return nullptr;
763}
764
765// Optimize fdiv with rcp:
766//
767// 1/x -> rcp(x) when rcp is sufficiently accurate or inaccurate rcp is
768// allowed with afn.
769//
770// a/b -> a*rcp(b) when arcp is allowed, and we only need provide ULP 1.0
771Value *
772AMDGPUCodeGenPrepareImpl::optimizeWithRcp(IRBuilder<> &Builder, Value *Num,
773 Value *Den, FastMathFlags FMF,
774 const Instruction *CtxI) const {
775 // rcp_f16 is accurate to 0.51 ulp.
776 // rcp_f32 is accurate for !fpmath >= 1.0ulp and denormals are flushed.
777 // rcp_f64 is never accurate.
778 assert(Den->getType()->isFloatTy());
779
780 if (const ConstantFP *CLHS = dyn_cast<ConstantFP>(Num)) {
781 bool IsNegative = false;
782 if (CLHS->isExactlyValue(1.0) ||
783 (IsNegative = CLHS->isExactlyValue(-1.0))) {
784 Value *Src = Den;
785
786 if (HasFP32DenormalFlush || FMF.approxFunc()) {
787 // -1.0 / x -> 1.0 / fneg(x)
788 if (IsNegative)
789 Src = Builder.CreateFNeg(Src);
790
791 // v_rcp_f32 and v_rsq_f32 do not support denormals, and according to
792 // the CI documentation has a worst case error of 1 ulp.
793 // OpenCL requires <= 2.5 ulp for 1.0 / x, so it should always be OK
794 // to use it as long as we aren't trying to use denormals.
795 //
796 // v_rcp_f16 and v_rsq_f16 DO support denormals.
797
798 // NOTE: v_sqrt and v_rcp will be combined to v_rsq later. So we don't
799 // insert rsq intrinsic here.
800
801 // 1.0 / x -> rcp(x)
802 return Builder.CreateUnaryIntrinsic(Intrinsic::amdgcn_rcp, Src);
803 }
804
805 // TODO: If the input isn't denormal, and we know the input exponent isn't
806 // big enough to introduce a denormal we can avoid the scaling.
807 return emitRcpIEEE1ULP(Builder, Src, IsNegative);
808 }
809 }
810
811 if (FMF.allowReciprocal()) {
812 // x / y -> x * (1.0 / y)
813
814 // TODO: Could avoid denormal scaling and use raw rcp if we knew the output
815 // will never underflow.
816 if (HasFP32DenormalFlush || FMF.approxFunc()) {
817 Value *Recip = Builder.CreateUnaryIntrinsic(Intrinsic::amdgcn_rcp, Den);
818 return Builder.CreateFMul(Num, Recip);
819 }
820
821 Value *Recip = emitRcpIEEE1ULP(Builder, Den, false);
822 return Builder.CreateFMul(Num, Recip);
823 }
824
825 return nullptr;
826}
827
828// optimize with fdiv.fast:
829//
830// a/b -> fdiv.fast(a, b) when !fpmath >= 2.5ulp with denormals flushed.
831//
832// 1/x -> fdiv.fast(1,x) when !fpmath >= 2.5ulp.
833//
834// NOTE: optimizeWithRcp should be tried first because rcp is the preference.
835Value *AMDGPUCodeGenPrepareImpl::optimizeWithFDivFast(
836 IRBuilder<> &Builder, Value *Num, Value *Den, float ReqdAccuracy) const {
837 // fdiv.fast can achieve 2.5 ULP accuracy.
838 if (ReqdAccuracy < 2.5f)
839 return nullptr;
840
841 // Only have fdiv.fast for f32.
842 assert(Den->getType()->isFloatTy());
843
844 bool NumIsOne = false;
845 if (const ConstantFP *CNum = dyn_cast<ConstantFP>(Num)) {
846 if (CNum->isExactlyValue(+1.0) || CNum->isExactlyValue(-1.0))
847 NumIsOne = true;
848 }
849
850 // fdiv does not support denormals. But 1.0/x is always fine to use it.
851 //
852 // TODO: This works for any value with a specific known exponent range, don't
853 // just limit to constant 1.
854 if (!HasFP32DenormalFlush && !NumIsOne)
855 return nullptr;
856
857 return Builder.CreateIntrinsic(Intrinsic::amdgcn_fdiv_fast, {Num, Den});
858}
859
860Value *AMDGPUCodeGenPrepareImpl::visitFDivElement(
861 IRBuilder<> &Builder, Value *Num, Value *Den, FastMathFlags DivFMF,
862 FastMathFlags SqrtFMF, Value *RsqOp, const Instruction *FDivInst,
863 float ReqdDivAccuracy) const {
864 if (RsqOp) {
865 Value *Rsq =
866 optimizeWithRsq(Builder, Num, RsqOp, DivFMF, SqrtFMF, FDivInst);
867 if (Rsq)
868 return Rsq;
869 }
870
871 if (!Num->getType()->isFloatTy())
872 return nullptr;
873
874 Value *Rcp = optimizeWithRcp(Builder, Num, Den, DivFMF, FDivInst);
875 if (Rcp)
876 return Rcp;
877
878 // In the basic case fdiv_fast has the same instruction count as the frexp div
879 // expansion. Slightly prefer fdiv_fast since it ends in an fmul that can
880 // potentially be fused into a user. Also, materialization of the constants
881 // can be reused for multiple instances.
882 Value *FDivFast = optimizeWithFDivFast(Builder, Num, Den, ReqdDivAccuracy);
883 if (FDivFast)
884 return FDivFast;
885
886 return emitFrexpDiv(Builder, Num, Den, DivFMF);
887}
888
// Optimizations are performed based on fpmath, fast math flags, and the
// denormal mode to optimize fdiv with either rcp or fdiv.fast.
891//
892// With rcp:
893// 1/x -> rcp(x) when rcp is sufficiently accurate or inaccurate rcp is
894// allowed with afn.
895//
896// a/b -> a*rcp(b) when inaccurate rcp is allowed with afn.
897//
898// With fdiv.fast:
899// a/b -> fdiv.fast(a, b) when !fpmath >= 2.5ulp with denormals flushed.
900//
901// 1/x -> fdiv.fast(1,x) when !fpmath >= 2.5ulp.
902//
903// NOTE: rcp is the preference in cases that both are legal.
904bool AMDGPUCodeGenPrepareImpl::visitFDiv(BinaryOperator &FDiv) {
905 if (DisableFDivExpand)
906 return false;
907
908 Type *Ty = FDiv.getType()->getScalarType();
909 const bool IsFloat = Ty->isFloatTy();
910 if (!IsFloat && !Ty->isDoubleTy())
911 return false;
912
913 // The f64 rcp/rsq approximations are pretty inaccurate. We can do an
914 // expansion around them in codegen. f16 is good enough to always use.
915
916 const FPMathOperator *FPOp = cast<const FPMathOperator>(&FDiv);
917 const FastMathFlags DivFMF = FPOp->getFastMathFlags();
918 const float ReqdAccuracy = FPOp->getFPAccuracy();
919
920 FastMathFlags SqrtFMF;
921
922 Value *Num = FDiv.getOperand(0);
923 Value *Den = FDiv.getOperand(1);
924
925 Value *RsqOp = nullptr;
926 auto *DenII = dyn_cast<IntrinsicInst>(Den);
927 if (DenII && DenII->getIntrinsicID() == Intrinsic::sqrt &&
928 DenII->hasOneUse()) {
929 const auto *SqrtOp = cast<FPMathOperator>(DenII);
930 SqrtFMF = SqrtOp->getFastMathFlags();
931 if (canOptimizeWithRsq(DivFMF, SqrtFMF))
932 RsqOp = SqrtOp->getOperand(0);
933 }
934
935 // rcp path not yet implemented for f64.
936 if (!IsFloat && !RsqOp)
937 return false;
938
939 // Inaccurate rcp is allowed with afn.
940 //
941 // Defer to codegen to handle this.
942 //
943 // TODO: Decide on an interpretation for interactions between afn + arcp +
944 // !fpmath, and make it consistent between here and codegen. For now, defer
945 // expansion of afn to codegen. The current interpretation is so aggressive we
946 // don't need any pre-consideration here when we have better information. A
947 // more conservative interpretation could use handling here.
948 const bool AllowInaccurateRcp = DivFMF.approxFunc();
949 if (!RsqOp && AllowInaccurateRcp)
950 return false;
951
952 // Defer the correct implementations to codegen.
953 if (IsFloat && ReqdAccuracy < 1.0f)
954 return false;
955
956 IRBuilder<> Builder(FDiv.getParent(), std::next(FDiv.getIterator()));
957 Builder.setFastMathFlags(DivFMF);
958 Builder.SetCurrentDebugLocation(FDiv.getDebugLoc());
959
960 SmallVector<Value *, 4> NumVals;
961 SmallVector<Value *, 4> DenVals;
962 SmallVector<Value *, 4> RsqDenVals;
963 extractValues(Builder, NumVals, Num);
964 extractValues(Builder, DenVals, Den);
965
966 if (RsqOp)
967 extractValues(Builder, RsqDenVals, RsqOp);
968
969 SmallVector<Value *, 4> ResultVals(NumVals.size());
970 for (int I = 0, E = NumVals.size(); I != E; ++I) {
971 Value *NumElt = NumVals[I];
972 Value *DenElt = DenVals[I];
973 Value *RsqDenElt = RsqOp ? RsqDenVals[I] : nullptr;
974
975 Value *NewElt =
976 visitFDivElement(Builder, NumElt, DenElt, DivFMF, SqrtFMF, RsqDenElt,
977 cast<Instruction>(FPOp), ReqdAccuracy);
978 if (!NewElt) {
979 // Keep the original, but scalarized.
980
981 // This has the unfortunate side effect of sometimes scalarizing when
982 // we're not going to do anything.
983 NewElt = Builder.CreateFDiv(NumElt, DenElt);
984 if (auto *NewEltInst = dyn_cast<Instruction>(NewElt))
985 NewEltInst->copyMetadata(FDiv);
986 }
987
988 ResultVals[I] = NewElt;
989 }
990
991 Value *NewVal = insertValues(Builder, FDiv.getType(), ResultVals);
992
993 if (NewVal) {
994 FDiv.replaceAllUsesWith(NewVal);
995 NewVal->takeName(&FDiv);
996 DeadVals.push_back(&FDiv);
997 }
998
999 return true;
1000}
1001
1002static std::pair<Value*, Value*> getMul64(IRBuilder<> &Builder,
1003 Value *LHS, Value *RHS) {
1004 Type *I32Ty = Builder.getInt32Ty();
1005 Type *I64Ty = Builder.getInt64Ty();
1006
1007 Value *LHS_EXT64 = Builder.CreateZExt(LHS, I64Ty);
1008 Value *RHS_EXT64 = Builder.CreateZExt(RHS, I64Ty);
1009 Value *MUL64 = Builder.CreateMul(LHS_EXT64, RHS_EXT64);
1010 Value *Lo = Builder.CreateTrunc(MUL64, I32Ty);
1011 Value *Hi = Builder.CreateLShr(MUL64, Builder.getInt64(32));
1012 Hi = Builder.CreateTrunc(Hi, I32Ty);
1013 return std::pair(Lo, Hi);
1014}
1015
1016static Value* getMulHu(IRBuilder<> &Builder, Value *LHS, Value *RHS) {
1017 return getMul64(Builder, LHS, RHS).second;
1018}
1019
1020/// Figure out how many bits are really needed for this division.
1021/// \p MaxDivBits is an optimization hint to bypass the second
1022/// ComputeNumSignBits/computeKnownBits call if the first one is
1023/// insufficient.
///
/// \returns the full scalar width of \p Num when \p Den alone already needs
/// more than \p MaxDivBits bits. Otherwise returns the number of bits needed
/// to hold both operands: for signed division this is derived from the smaller
/// sign-bit count plus one reserved sign bit, for unsigned division from the
/// larger count of possibly-active bits.
1024unsigned AMDGPUCodeGenPrepareImpl::getDivNumBits(BinaryOperator &I, Value *Num,
1025 Value *Den,
1026 unsigned MaxDivBits,
1027 bool IsSigned) const {
1029 Den->getType()->getScalarSizeInBits());
1030 unsigned SSBits = Num->getType()->getScalarSizeInBits();
1031 if (IsSigned) {
1032 unsigned RHSSignBits = ComputeNumSignBits(Den, SQ.DL, SQ.AC, &I, SQ.DT);
1033 // A sign bit needs to be reserved for shrinking.
1034 unsigned DivBits = SSBits - RHSSignBits + 1;
// Early out: the denominator alone is already too wide, so skip the
// numerator query.
1035 if (DivBits > MaxDivBits)
1036 return SSBits;
1037
// NOTE(review): unlike the Den query above, SQ.DT is not passed here; confirm
// whether that is intentional.
1038 unsigned LHSSignBits = ComputeNumSignBits(Num, SQ.DL, SQ.AC, &I);
1039
1040 unsigned SignBits = std::min(LHSSignBits, RHSSignBits);
1041 DivBits = SSBits - SignBits + 1;
1042 return DivBits;
1043 }
1044
1045 // All bits are used for unsigned division for Num or Den in range
1046 // (SignedMax, UnsignedMax].
1047 KnownBits Known = computeKnownBits(Den, SQ.getWithInstruction(&I));
1048 unsigned RHSBits = Known.countMaxActiveBits();
// Same early out as the signed path.
1049 if (RHSBits > MaxDivBits)
1050 return SSBits;
1051
1052 Known = computeKnownBits(Num, SQ.getWithInstruction(&I));
1053 unsigned LHSBits = Known.countMaxActiveBits();
1054
1055 unsigned DivBits = std::max(LHSBits, RHSBits);
1056 return DivBits;
1057}
1058
1059Value *AMDGPUCodeGenPrepareImpl::expandDivRemToFloat(IRBuilder<> &Builder,
1060 BinaryOperator &I,
1061 Value *Num, Value *Den,
1062 bool IsDiv,
1063 bool IsSigned) const {
1064 unsigned DivBits = getDivNumBits(I, Num, Den, 23, IsSigned);
1065
1066 if (DivBits > (IsSigned ? 23 : 22))
1067 return nullptr;
1068 return expandDivRemToFloatImpl(Builder, I, Num, Den, DivBits, IsDiv,
1069 IsSigned);
1070}
1071
/// Emits a division or remainder using a float reciprocal estimate followed by
/// a single correction step. \p DivBits is asserted to be at most 23 (signed)
/// or 22 (unsigned). Both operands are truncated to i32 first, and the
/// returned i32 is sign- or zero-extended in register from \p DivBits.
/// \p IsDiv selects quotient versus remainder; \p IsSigned selects signedness.
1072Value *AMDGPUCodeGenPrepareImpl::expandDivRemToFloatImpl(
1073 IRBuilder<> &Builder, BinaryOperator &I, Value *Num, Value *Den,
1074 unsigned DivBits, bool IsDiv, bool IsSigned) const {
1075
1076 // v_rcp_f32(float(X)) can have an error of 1 ulp.
1077 // This would cause incorrect calculation of Y/X if:
1078 // Y = (0x7FFFFF/X)*(X-0)-1
1079 // were allowed.
1080 //
1081 // For example,
1082 // (0x7FF6D3/0x000FE7) would erroneously produce 2060 instead of 2059.
1083 // (0x7FF8F5/0x007EFB) would erroneously produce 258 instead of 257.
1084 //
1085 // Thus, we conservatively restrict expandDivRemToFloatImpl to
1086 // [-0x400000,0x3FFFFF] for IsSigned
1087 // [0x000000,0x3FFFFF] for !IsSigned.
1088 assert(DivBits <= (IsSigned ? 23 : 22) &&
1089 "abs(Num) must be <= than 0x40000 for expandDivRemToFloatImpl to work "
1090 "correctly");
1091
1092 Type *I32Ty = Builder.getInt32Ty();
1093 Num = Builder.CreateTrunc(Num, I32Ty);
1094 Den = Builder.CreateTrunc(Den, I32Ty);
1095
1096 Type *F32Ty = Builder.getFloatTy();
1097 ConstantInt *One = Builder.getInt32(1);
// jq is the correction applied to the truncated quotient: +1 for unsigned,
// or +1/-1 depending on the operand signs for signed.
1098 Value *JQ = One;
1099
1100 if (IsSigned) {
1101 // char|short jq = ia ^ ib;
1102 JQ = Builder.CreateXor(Num, Den);
1103
1104 // jq = jq >> (bitsize - 2)
1105 JQ = Builder.CreateAShr(JQ, Builder.getInt32(30));
1106
1107 // jq = jq | 0x1
1108 JQ = Builder.CreateOr(JQ, One);
1109 }
1110
1111 // int ia = (int)LHS;
1112 Value *IA = Num;
1113
1114 // int ib = (int)RHS;
1115 Value *IB = Den;
1116
1117 // float fa = (float)ia;
1118 Value *FA = IsSigned ? Builder.CreateSIToFP(IA, F32Ty)
1119 : Builder.CreateUIToFP(IA, F32Ty);
1120
1121 // float fb = (float)ib;
1122 Value *FB = IsSigned ? Builder.CreateSIToFP(IB,F32Ty)
1123 : Builder.CreateUIToFP(IB,F32Ty);
1124
// float fqm = fa * rcp(fb);
1125 Value *RCP = Builder.CreateIntrinsic(Intrinsic::amdgcn_rcp,
1126 Builder.getFloatTy(), {FB});
1127 Value *FQM = Builder.CreateFMul(FA, RCP);
1128
1129 // fq = trunc(fqm);
1130 Value *FQ = Builder.CreateUnaryIntrinsic(Intrinsic::trunc, FQM);
1131 auto *FQI = dyn_cast<Instruction>(FQ);
1132 if (FQI)
1133 FQI->copyFastMathFlags(Builder.getFastMathFlags());
1134
1135 // float fqneg = -fq;
1136 Value *FQNeg = Builder.CreateFNeg(FQ);
1137
1138 // float fr = mad(fqneg, fb, fa);
// Use fma when the subtarget reports no mad/mac f32 instructions, otherwise
// the amdgcn fmad.ftz intrinsic.
1139 auto FMAD = !ST.hasMadMacF32Insts()
1140 ? Intrinsic::fma
1141 : (Intrinsic::ID)Intrinsic::amdgcn_fmad_ftz;
1142 Value *FR =
1143 Builder.CreateIntrinsic(FMAD, {FQNeg->getType()}, {FQNeg, FB, FA}, FQI);
1144
1145 // int iq = (int)fq;
1146 Value *IQ = IsSigned ? Builder.CreateFPToSI(FQ, I32Ty)
1147 : Builder.CreateFPToUI(FQ, I32Ty);
1148
1149 // fr = fabs(fr);
1150 FR = Builder.CreateFAbs(FR, FQI);
1151
1152 // fb = fabs(fb);
1153 FB = Builder.CreateFAbs(FB, FQI);
1154
1155 // int cv = fr >= fb;
1156 Value *CV = Builder.CreateFCmpOGE(FR, FB);
1157
1158 // jq = (cv ? jq : 0);
1159 JQ = Builder.CreateSelect(CV, JQ, Builder.getInt32(0));
1160
1161 // dst = iq + jq;
1162 Value *Div = Builder.CreateAdd(IQ, JQ);
1163
1164 Value *Res = Div;
1165 if (!IsDiv) {
1166 // Rem needs compensation, it's easier to recompute it
1167 Value *Rem = Builder.CreateMul(Div, Den);
1168 Res = Builder.CreateSub(Num, Rem);
1169 }
1170
1171 if (DivBits != 0 && DivBits < 32) {
1172 // Extend in register from the number of bits this divide really is.
1173 if (IsSigned) {
1174 int InRegBits = 32 - DivBits;
1175
1176 Res = Builder.CreateShl(Res, InRegBits);
1177 Res = Builder.CreateAShr(Res, InRegBits);
1178 } else {
1179 ConstantInt *TruncMask
1180 = Builder.getInt32((UINT64_C(1) << DivBits) - 1);
1181 Res = Builder.CreateAnd(Res, TruncMask);
1182 }
1183 }
1184
1185 return Res;
1186}
1187
1188// Try to recognize special cases where the DAG will emit special, better
1189// expansions than the general expansion we do here.
1190
1191// TODO: It would be better to just directly handle those optimizations here.
//
// Returns true when the denominator \p Den is a constant with a scalar width
// of at most 32 bits, or a shl whose shifted value is a constant known to be a
// power of two (or zero).
1192bool AMDGPUCodeGenPrepareImpl::divHasSpecialOptimization(BinaryOperator &I,
1193 Value *Num,
1194 Value *Den) const {
1195 if (Constant *C = dyn_cast<Constant>(Den)) {
1196 // Arbitrary constants get a better expansion as long as a wider mulhi is
1197 // legal.
1198 if (C->getType()->getScalarSizeInBits() <= 32)
1199 return true;
1200
1201 // TODO: Sdiv check for not exact for some reason.
1202
1203 // If there's no wider mulhi, there's only a better expansion for powers of
1204 // two.
1205 // TODO: Should really know for each vector element.
// NOTE(review): the condition guarding the following return is not visible in
// this listing; per the comment above it presumably tests for a power of two.
1207 return true;
1208
1209 return false;
1210 }
1211
1212 if (BinaryOperator *BinOpDen = dyn_cast<BinaryOperator>(Den)) {
1213 // fold (udiv x, (shl c, y)) -> x >>u (log2(c)+y) iff c is power of 2
1214 if (BinOpDen->getOpcode() == Instruction::Shl &&
1215 isa<Constant>(BinOpDen->getOperand(0)) &&
1216 isKnownToBeAPowerOfTwo(BinOpDen->getOperand(0), true,
1217 SQ.getWithInstruction(&I))) {
1218 return true;
1219 }
1220 }
1221
1222 return false;
1223}
1224
1225static Value *getSign32(Value *V, IRBuilder<> &Builder, const DataLayout DL) {
1226 // Check whether the sign can be determined statically.
1227 KnownBits Known = computeKnownBits(V, DL);
1228 if (Known.isNegative())
1229 return Constant::getAllOnesValue(V->getType());
1230 if (Known.isNonNegative())
1231 return Constant::getNullValue(V->getType());
1232 return Builder.CreateAShr(V, Builder.getInt32(31));
1233}
1234
/// Expands the udiv/sdiv/urem/srem \p I with operands \p X and \p Y.
///
/// Returns nullptr when divHasSpecialOptimization reports that the division
/// should be kept for a later, better expansion. Otherwise the operands are
/// converted to i32 if they are not already 32 bits wide. The float-based
/// expansion is tried first, and if it does not apply an integer sequence
/// built on v_rcp_f32 with one Newton-Raphson round and two refinement steps
/// is emitted. The result is converted back to the type of \p X.
1235Value *AMDGPUCodeGenPrepareImpl::expandDivRem32(IRBuilder<> &Builder,
1236 BinaryOperator &I, Value *X,
1237 Value *Y) const {
1238 Instruction::BinaryOps Opc = I.getOpcode();
1239 assert(Opc == Instruction::URem || Opc == Instruction::UDiv ||
1240 Opc == Instruction::SRem || Opc == Instruction::SDiv);
1241
// NOTE: the fast flags are installed on the caller's builder before the early
// return below, so they remain set on it even when nullptr is returned.
1242 FastMathFlags FMF;
1243 FMF.setFast();
1244 Builder.setFastMathFlags(FMF);
1245
1246 if (divHasSpecialOptimization(I, X, Y))
1247 return nullptr; // Keep it for later optimization.
1248
1249 bool IsDiv = Opc == Instruction::UDiv || Opc == Instruction::SDiv;
1250 bool IsSigned = Opc == Instruction::SRem || Opc == Instruction::SDiv;
1251
1252 Type *Ty = X->getType();
1253 Type *I32Ty = Builder.getInt32Ty();
1254 Type *F32Ty = Builder.getFloatTy();
1255
1256 if (Ty->getScalarSizeInBits() != 32) {
1257 if (IsSigned) {
1258 X = Builder.CreateSExtOrTrunc(X, I32Ty);
1259 Y = Builder.CreateSExtOrTrunc(Y, I32Ty);
1260 } else {
1261 X = Builder.CreateZExtOrTrunc(X, I32Ty);
1262 Y = Builder.CreateZExtOrTrunc(Y, I32Ty);
1263 }
1264 }
1265
1266 if (Value *Res = expandDivRemToFloat(Builder, I, X, Y, IsDiv, IsSigned)) {
1267 return IsSigned ? Builder.CreateSExtOrTrunc(Res, Ty) :
1268 Builder.CreateZExtOrTrunc(Res, Ty);
1269 }
1270
1271 ConstantInt *Zero = Builder.getInt32(0);
1272 ConstantInt *One = Builder.getInt32(1);
1273
1274 Value *Sign = nullptr;
1275 if (IsSigned) {
1276 Value *SignX = getSign32(X, Builder, DL);
1277 Value *SignY = getSign32(Y, Builder, DL);
1278 // Remainder sign is the same as LHS
1279 Sign = IsDiv ? Builder.CreateXor(SignX, SignY) : SignX;
1280
// Take absolute values: (v + s) ^ s with s being 0 or all ones.
1281 X = Builder.CreateAdd(X, SignX);
1282 Y = Builder.CreateAdd(Y, SignY);
1283
1284 X = Builder.CreateXor(X, SignX);
1285 Y = Builder.CreateXor(Y, SignY);
1286 }
1287
1288 // The algorithm here is based on ideas from "Software Integer Division", Tom
1289 // Rodeheffer, August 2008.
1290 //
1291 // unsigned udiv(unsigned x, unsigned y) {
1292 // // Initial estimate of inv(y). The constant is less than 2^32 to ensure
1293 // // that this is a lower bound on inv(y), even if some of the calculations
1294 // // round up.
1295 // unsigned z = (unsigned)((4294967296.0 - 512.0) * v_rcp_f32((float)y));
1296 //
1297 // // One round of UNR (Unsigned integer Newton-Raphson) to improve z.
1298 // // Empirically this is guaranteed to give a "two-y" lower bound on
1299 // // inv(y).
1300 // z += umulh(z, -y * z);
1301 //
1302 // // Quotient/remainder estimate.
1303 // unsigned q = umulh(x, z);
1304 // unsigned r = x - q * y;
1305 //
1306 // // Two rounds of quotient/remainder refinement.
1307 // if (r >= y) {
1308 // ++q;
1309 // r -= y;
1310 // }
1311 // if (r >= y) {
1312 // ++q;
1313 // r -= y;
1314 // }
1315 //
1316 // return q;
1317 // }
1318
1319 // Initial estimate of inv(y).
1320 Value *FloatY = Builder.CreateUIToFP(Y, F32Ty);
1321 Value *RcpY = Builder.CreateIntrinsic(Intrinsic::amdgcn_rcp, F32Ty, {FloatY});
// 0x4F7FFFFE is the f32 bit pattern of 4294967296.0 - 512.0 from the
// pseudo-code above.
1322 Constant *Scale = ConstantFP::get(F32Ty, llvm::bit_cast<float>(0x4F7FFFFE));
1323 Value *ScaledY = Builder.CreateFMul(RcpY, Scale);
1324 Value *Z = Builder.CreateFPToUI(ScaledY, I32Ty);
1325
1326 // One round of UNR.
1327 Value *NegY = Builder.CreateSub(Zero, Y);
1328 Value *NegYZ = Builder.CreateMul(NegY, Z);
1329 Z = Builder.CreateAdd(Z, getMulHu(Builder, Z, NegYZ));
1330
1331 // Quotient/remainder estimate.
1332 Value *Q = getMulHu(Builder, X, Z);
1333 Value *R = Builder.CreateSub(X, Builder.CreateMul(Q, Y));
1334
1335 // First quotient/remainder refinement.
1336 Value *Cond = Builder.CreateICmpUGE(R, Y);
1337 if (IsDiv)
1338 Q = Builder.CreateSelect(Cond, Builder.CreateAdd(Q, One), Q);
1339 R = Builder.CreateSelect(Cond, Builder.CreateSub(R, Y), R);
1340
1341 // Second quotient/remainder refinement.
1342 Cond = Builder.CreateICmpUGE(R, Y);
1343 Value *Res;
1344 if (IsDiv)
1345 Res = Builder.CreateSelect(Cond, Builder.CreateAdd(Q, One), Q);
1346 else
1347 Res = Builder.CreateSelect(Cond, Builder.CreateSub(R, Y), R);
1348
1349 if (IsSigned) {
// Re-apply the sign: (res ^ s) - s negates res when s is all ones.
1350 Res = Builder.CreateXor(Res, Sign);
1351 Res = Builder.CreateSub(Res, Sign);
1352 Res = Builder.CreateSExtOrTrunc(Res, Ty);
1353 } else {
1354 Res = Builder.CreateZExtOrTrunc(Res, Ty);
1355 }
1356 return Res;
1357}
1358
1359Value *AMDGPUCodeGenPrepareImpl::shrinkDivRem64(IRBuilder<> &Builder,
1360 BinaryOperator &I, Value *Num,
1361 Value *Den) const {
1362 if (!ExpandDiv64InIR && divHasSpecialOptimization(I, Num, Den))
1363 return nullptr; // Keep it for later optimization.
1364
1365 Instruction::BinaryOps Opc = I.getOpcode();
1366
1367 bool IsDiv = Opc == Instruction::SDiv || Opc == Instruction::UDiv;
1368 bool IsSigned = Opc == Instruction::SDiv || Opc == Instruction::SRem;
1369
1370 unsigned NumDivBits = getDivNumBits(I, Num, Den, 32, IsSigned);
1371 if (NumDivBits > 32)
1372 return nullptr;
1373
1374 Value *Narrowed = nullptr;
1375 if (NumDivBits <= (IsSigned ? 23 : 22)) {
1376 Narrowed = expandDivRemToFloatImpl(Builder, I, Num, Den, NumDivBits, IsDiv,
1377 IsSigned);
1378 } else if (NumDivBits <= 32) {
1379 Narrowed = expandDivRem32(Builder, I, Num, Den);
1380 }
1381
1382 if (Narrowed) {
1383 return IsSigned ? Builder.CreateSExt(Narrowed, Num->getType()) :
1384 Builder.CreateZExt(Narrowed, Num->getType());
1385 }
1386
1387 return nullptr;
1388}
1389
/// Performs the general expansion of the 64-bit division or remainder \p I.
/// Division and remainder opcodes are handled in separate branches; any other
/// opcode is treated as unreachable.
/// NOTE(review): the calls that do the actual expansion in each branch are not
/// visible in this listing.
1390void AMDGPUCodeGenPrepareImpl::expandDivRem64(BinaryOperator &I) const {
1391 Instruction::BinaryOps Opc = I.getOpcode();
1392 // Do the general expansion.
1393 if (Opc == Instruction::UDiv || Opc == Instruction::SDiv) {
1395 return;
1396 }
1397
1398 if (Opc == Instruction::URem || Opc == Instruction::SRem) {
1400 return;
1401 }
1402
1403 llvm_unreachable("not a division");
1404}
1405
1406/*
1407This will cause an inconsistency for non-byte loads, for example:
1408```
1409 %load = load i1, ptr addrspace(4) %arg, align 4
1410 %zext = zext i1 %load to i64
1411 %add = add i64 %zext, ...
1412```
1413Instead of creating `s_and_b32 s0, s0, 1`,
1414it will create `s_and_b32 s0, s0, 0xff`.
1415We accept this change since the non-byte load assumes the upper bits
1416within the byte are all 0.

Narrows an add or mul to a smaller legal integer type when known bits show
that the result fits and the narrowed form is strictly cheaper. The rewrite is
trunc(operands) -> narrow op -> zext back to the original type. Returns true
if the instruction was replaced.
1417*/
1418bool AMDGPUCodeGenPrepareImpl::tryNarrowMathIfNoOverflow(Instruction *I) {
1419 unsigned Opc = I->getOpcode();
1420 Type *OldType = I->getType();
1421
1422 if (Opc != Instruction::Add && Opc != Instruction::Mul)
1423 return false;
1424
1425 unsigned OrigBit = OldType->getScalarSizeInBits();
1426
// NOTE(review): this check repeats the early return above, so the
// llvm_unreachable can never be reached; it looks removable.
1427 if (Opc != Instruction::Add && Opc != Instruction::Mul)
1428 llvm_unreachable("Unexpected opcode, only valid for Instruction::Add and "
1429 "Instruction::Mul.");
1430
1431 unsigned MaxBitsNeeded = computeKnownBits(I, DL).countMaxActiveBits();
1432
// Round up to a power of two, with a floor of 8 bits.
1433 MaxBitsNeeded = std::max<unsigned>(bit_ceil(MaxBitsNeeded), 8);
1434 Type *NewType = DL.getSmallestLegalIntType(I->getContext(), MaxBitsNeeded);
1435 if (!NewType)
1436 return false;
1437 unsigned NewBit = NewType->getIntegerBitWidth();
1438 if (NewBit >= OrigBit)
1439 return false;
// Rebuild the narrow type from the original type and the new bit width.
1440 NewType = I->getType()->getWithNewBitWidth(NewBit);
1441
1442 // Old cost
1443 const TargetTransformInfo &TTI = TM.getTargetTransformInfo(F);
1444 InstructionCost OldCost =
1446 // New cost of new op
1447 InstructionCost NewCost =
1449 // New cost of narrowing 2 operands (use trunc)
1450 int NumOfNonConstOps = 2;
1451 if (isa<Constant>(I->getOperand(0)) || isa<Constant>(I->getOperand(1))) {
1452 // Cannot be both constant, should be propagated
1453 NumOfNonConstOps = 1;
1454 }
1455 NewCost += NumOfNonConstOps * TTI.getCastInstrCost(Instruction::Trunc,
1456 NewType, OldType,
1459 // New cost of zext narrowed result to original type
1460 NewCost +=
1461 TTI.getCastInstrCost(Instruction::ZExt, OldType, NewType,
// Only narrow when it is strictly cheaper.
1463 if (NewCost >= OldCost)
1464 return false;
1465
1466 IRBuilder<> Builder(I);
1467 Value *Trunc0 = Builder.CreateTrunc(I->getOperand(0), NewType);
1468 Value *Trunc1 = Builder.CreateTrunc(I->getOperand(1), NewType);
1469 Value *Arith =
1470 Builder.CreateBinOp((Instruction::BinaryOps)Opc, Trunc0, Trunc1);
1471
1472 Value *Zext = Builder.CreateZExt(Arith, OldType);
1473 I->replaceAllUsesWith(Zext);
1474 DeadVals.push_back(I);
1475 return true;
1476}
1477
/// Visits a binary operator and applies the first transformation that fits.
///
/// Tried in order, each returning early on success: folding the operation into
/// a select, replacing a mul with a mul24 intrinsic (when enabled), and
/// narrowing an add/mul. Otherwise integer div/rem with a scalar size of at
/// most 64 bits is expanded, element by element for fixed vectors. 64-bit
/// cases that could not be shrunk are queued in Div64ToExpand and get the
/// general expansion at the end when ExpandDiv64InIR is set.
///
/// \returns true if the IR was changed.
1478bool AMDGPUCodeGenPrepareImpl::visitBinaryOperator(BinaryOperator &I) {
1479 if (foldBinOpIntoSelect(I))
1480 return true;
1481
1482 if (UseMul24Intrin && replaceMulWithMul24(I))
1483 return true;
1484 if (tryNarrowMathIfNoOverflow(&I))
1485 return true;
1486
1487 bool Changed = false;
1488 Instruction::BinaryOps Opc = I.getOpcode();
1489 Type *Ty = I.getType();
1490 Value *NewDiv = nullptr;
1491 unsigned ScalarSize = Ty->getScalarSizeInBits();
1492
// NOTE(review): the declaration of Div64ToExpand is not visible in this
// listing.
1494
1495 if ((Opc == Instruction::URem || Opc == Instruction::UDiv ||
1496 Opc == Instruction::SRem || Opc == Instruction::SDiv) &&
1497 ScalarSize <= 64 &&
1498 !DisableIDivExpand) {
1499 Value *Num = I.getOperand(0);
1500 Value *Den = I.getOperand(1);
1501 IRBuilder<> Builder(&I);
1502 Builder.SetCurrentDebugLocation(I.getDebugLoc());
1503
1504 if (auto *VT = dyn_cast<FixedVectorType>(Ty)) {
// Vector case: expand each element separately and rebuild the vector.
1505 NewDiv = PoisonValue::get(VT);
1506
1507 for (unsigned N = 0, E = VT->getNumElements(); N != E; ++N) {
1508 Value *NumEltN = Builder.CreateExtractElement(Num, N);
1509 Value *DenEltN = Builder.CreateExtractElement(Den, N);
1510
1511 Value *NewElt;
1512 if (ScalarSize <= 32) {
1513 NewElt = expandDivRem32(Builder, I, NumEltN, DenEltN);
1514 if (!NewElt)
1515 NewElt = Builder.CreateBinOp(Opc, NumEltN, DenEltN);
1516 } else {
1517 // See if this 64-bit division can be shrunk to 32/24-bits before
1518 // producing the general expansion.
1519 NewElt = shrinkDivRem64(Builder, I, NumEltN, DenEltN);
1520 if (!NewElt) {
1521 // The general 64-bit expansion introduces control flow and doesn't
1522 // return the new value. Just insert a scalar copy and defer
1523 // expanding it.
1524 NewElt = Builder.CreateBinOp(Opc, NumEltN, DenEltN);
1525 // CreateBinOp does constant folding. If the operands are constant,
1526 // it will return a Constant instead of a BinaryOperator.
1527 if (auto *NewEltBO = dyn_cast<BinaryOperator>(NewElt))
1528 Div64ToExpand.push_back(NewEltBO);
1529 }
1530 }
1531
1532 if (auto *NewEltI = dyn_cast<Instruction>(NewElt))
1533 NewEltI->copyIRFlags(&I);
1534
1535 NewDiv = Builder.CreateInsertElement(NewDiv, NewElt, N);
1536 }
1537 } else {
1538 if (ScalarSize <= 32)
1539 NewDiv = expandDivRem32(Builder, I, Num, Den);
1540 else {
1541 NewDiv = shrinkDivRem64(Builder, I, Num, Den);
// Could not be shrunk: queue the original instruction for the general
// 64-bit expansion.
1542 if (!NewDiv)
1543 Div64ToExpand.push_back(&I);
1544 }
1545 }
1546
1547 if (NewDiv) {
1548 I.replaceAllUsesWith(NewDiv);
1549 DeadVals.push_back(&I);
1550 Changed = true;
1551 }
1552 }
1553
1554 if (ExpandDiv64InIR) {
1555 // TODO: We get much worse code in specially handled constant cases.
1556 for (BinaryOperator *Div : Div64ToExpand) {
1557 expandDivRem64(*Div);
1558 FlowChanged = true;
1559 Changed = true;
1560 }
1561 }
1562
1563 return Changed;
1564}
1565
/// Widens a load from the constant address spaces to a 32-bit load when
/// WidenLoads is enabled and canWidenScalarExtLoad accepts it. The widened
/// value is truncated to the original bit size and bitcast back to the
/// original type, which then replaces all uses of \p I.
///
/// \returns true if the load was replaced.
1566bool AMDGPUCodeGenPrepareImpl::visitLoadInst(LoadInst &I) {
1567 if (!WidenLoads)
1568 return false;
1569
1570 if ((I.getPointerAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS ||
1571 I.getPointerAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT) &&
1572 canWidenScalarExtLoad(I)) {
1573 IRBuilder<> Builder(&I);
1574 Builder.SetCurrentDebugLocation(I.getDebugLoc());
1575
1576 Type *I32Ty = Builder.getInt32Ty();
1577 LoadInst *WidenLoad = Builder.CreateLoad(I32Ty, I.getPointerOperand());
1579
1580 // The widened load reads the original bytes in the low bits, so a !range
1581 // lower bound still holds. Convert it to the new type and don't make
1582 // assumptions about the high bits.
1583 if (auto *Range = I.getMetadata(LLVMContext::MD_range)) {
1584 ConstantInt *Lower = mdconst::extract<ConstantInt>(Range->getOperand(0));
1585
// NOTE(review): when the lower bound is zero no !range is attached here.
// Whether any metadata was already copied onto WidenLoad is not visible in
// this listing; confirm no stale narrow-typed !range remains in that case.
1586 if (!Lower->isNullValue()) {
1587 Metadata *LowAndHigh[] = {
1588 ConstantAsMetadata::get(ConstantInt::get(I32Ty, Lower->getValue().zext(32))),
1589 // Don't make assumptions about the high bits.
1590 ConstantAsMetadata::get(ConstantInt::get(I32Ty, 0))
1591 };
1592
1593 WidenLoad->setMetadata(LLVMContext::MD_range,
1594 MDNode::get(F.getContext(), LowAndHigh));
1595 }
1596 }
1597
1598 int TySize = DL.getTypeSizeInBits(I.getType());
1599 Type *IntNTy = Builder.getIntNTy(TySize);
1600 Value *ValTrunc = Builder.CreateTrunc(WidenLoad, IntNTy);
1601 Value *ValOrig = Builder.CreateBitCast(ValTrunc, I.getType());
1602 I.replaceAllUsesWith(ValOrig);
1603 DeadVals.push_back(&I);
1604 return true;
1605 }
1606
1607 return false;
1608}
1609
/// Recognizes select-based implementations of fract on floating-point selects
/// and replaces them with the value produced by applyFractPat.
///
/// Two families are matched: the preferred compare-against-constant form, and
/// legacy forms built around a nan check, optionally with an intermediate
/// clamp of infinity to 0.
///
/// \returns true if the select was replaced.
1610bool AMDGPUCodeGenPrepareImpl::visitSelectInst(SelectInst &I) {
1611 FPMathOperator *FPOp = dyn_cast<FPMathOperator>(&I);
1612 if (!FPOp)
1613 return false;
1614
1615 Value *X;
1616 Value *Fract = nullptr;
1617
1618 // Match:
1619 // (x - floor(x)) >= MIN_CONSTANT ? MIN_CONSTANT : (x - floor(x))
1620 //
1621 // This is the preferred way to implement fract.
1622 // TODO: Could also match with compare against 1.0
1623 const APFloat *C;
// NOTE(review): the `if` that performs this match and binds X and C is not
// visible in this listing.
1625 Value *FractSrc = matchFractPatImpl(*X, *C);
1626 if (!FractSrc)
1627 return false;
1628 IRBuilder<> Builder(&I);
1629 Builder.setFastMathFlags(FPOp->getFastMathFlags());
1630 Fract = applyFractPat(Builder, FractSrc);
1631 } else {
1632 // Match patterns which may appear in legacy implementations of the fract()
1633 // function, built around the nan-avoidant minnum intrinsic. These are the
1634 // core pattern plus additional clamping of inf and nan values on the
1635 // result.
1636 Value *Cond = I.getCondition();
1637 Value *TrueVal = I.getTrueValue();
1638 Value *FalseVal = I.getFalseValue();
1639 Value *CmpVal;
1640 CmpPredicate IsNanPred;
1641
1642 // Match fract pattern with nan check.
1643 if (!match(Cond, m_FCmp(IsNanPred, m_Value(CmpVal), m_NonNaN())))
1644 return false;
1645
1646 IRBuilder<> Builder(&I);
1647 Builder.setFastMathFlags(FPOp->getFastMathFlags());
1648
1649 if (IsNanPred == FCmpInst::FCMP_UNO && TrueVal == CmpVal &&
1650 CmpVal == matchFractPatNanAvoidant(*FalseVal)) {
1651 // isnan(x) ? x : fract(x)
1652 Fract = applyFractPat(Builder, CmpVal);
1653 } else if (IsNanPred == FCmpInst::FCMP_ORD && FalseVal == CmpVal) {
1654 if (CmpVal == matchFractPatNanAvoidant(*TrueVal)) {
1655 // !isnan(x) ? fract(x) : x
1656 Fract = applyFractPat(Builder, CmpVal);
1657 } else {
1658 // Match an intermediate clamp infinity to 0 pattern. i.e.
1659 // !isnan(x) ? (!isinf(x) ? fract(x) : 0.0) : x
1660 CmpPredicate PredInf;
1661 Value *IfNotInf;
1662
1663 if (!match(TrueVal, m_Select(m_FCmp(PredInf, m_FAbs(m_Specific(CmpVal)),
1664 m_PosInf()),
1665 m_Value(IfNotInf), m_PosZeroFP())) ||
1666 PredInf != FCmpInst::FCMP_UNE ||
1667 CmpVal != matchFractPatNanAvoidant(*IfNotInf))
1668 return false;
1669
1670 SelectInst *ClampInfSelect = cast<SelectInst>(TrueVal);
1671
1672 // Insert before the fabs
1673 Value *InsertPt =
1674 cast<Instruction>(ClampInfSelect->getCondition())->getOperand(0);
1675
1676 Builder.SetInsertPoint(cast<Instruction>(InsertPt));
1677 Value *NewFract = applyFractPat(Builder, CmpVal);
1678 NewFract->takeName(TrueVal);
1679
1680 // Thread the new fract into the inf clamping sequence.
// The old operand is queued in DeadVals before it is overwritten.
1681 DeadVals.push_back(ClampInfSelect->getOperand(1));
1682 ClampInfSelect->setOperand(1, NewFract);
1683
1684 // The outer select nan handling is also absorbed into the fract.
1685 Fract = ClampInfSelect;
1686 }
1687 } else
1688 return false;
1689 }
1690
1691 Fract->takeName(&I);
1692 I.replaceAllUsesWith(Fract);
1693 DeadVals.push_back(&I);
1694 return true;
1695}
1696
1697static bool areInSameBB(const Value *A, const Value *B) {
1698 const auto *IA = dyn_cast<Instruction>(A);
1699 const auto *IB = dyn_cast<Instruction>(B);
1700 return IA && IB && IA->getParent() == IB->getParent();
1701}
1702
1703// Helper for breaking large PHIs that returns true when an extractelement on V
1704// is likely to be folded away by the DAG combiner.
//
// A value counts as interesting when it is a fixed vector and is one of:
//  - an insertelement chain with constant in-bounds indices that covers every
//    element,
//  - a constant (possibly at the bottom of a partial insertelement chain),
//  - a shufflevector with a constant second operand or with a source in the
//    same basic block.
// NOTE(review): the function signature line is not visible in this listing.
1706 const auto *FVT = dyn_cast<FixedVectorType>(V->getType());
1707 if (!FVT)
1708 return false;
1709
1710 const Value *CurVal = V;
1711
1712 // Check for insertelements, keeping track of the elements covered.
1713 BitVector EltsCovered(FVT->getNumElements());
1714 while (const auto *IE = dyn_cast<InsertElementInst>(CurVal)) {
1715 const auto *Idx = dyn_cast<ConstantInt>(IE->getOperand(2));
1716
1717 // Non constant index/out of bounds index -> folding is unlikely.
1718 // The latter is more of a sanity check because canonical IR should just
1719 // have replaced those with poison.
1720 if (!Idx || Idx->getZExtValue() >= FVT->getNumElements())
1721 return false;
1722
1723 const auto *VecSrc = IE->getOperand(0);
1724
1725 // If the vector source is another instruction, it must be in the same basic
1726 // block. Otherwise, the DAGCombiner won't see the whole thing and is
1727 // unlikely to be able to do anything interesting here.
1728 if (isa<Instruction>(VecSrc) && !areInSameBB(VecSrc, IE))
1729 return false;
1730
// Walk up the chain to the vector this insertelement was applied to.
1731 CurVal = VecSrc;
1732 EltsCovered.set(Idx->getZExtValue());
1733
1734 // All elements covered.
1735 if (EltsCovered.all())
1736 return true;
1737 }
1738
1739 // We either didn't find a single insertelement, or the insertelement chain
1740 // ended before all elements were covered. Check for other interesting values.
1741
1742 // Constants are always interesting because we can just constant fold the
1743 // extractelements.
1744 if (isa<Constant>(CurVal))
1745 return true;
1746
1747 // shufflevector is likely to be profitable if either operand is a constant,
1748 // or if either source is in the same block.
1749 // This is because shufflevector is most often lowered as a series of
1750 // insert/extract elements anyway.
1751 if (const auto *SV = dyn_cast<ShuffleVectorInst>(CurVal)) {
1752 return isa<Constant>(SV->getOperand(1)) ||
1753 areInSameBB(SV, SV->getOperand(0)) ||
1754 areInSameBB(SV, SV->getOperand(1));
1755 }
1756
1757 return false;
1758}
1759
/// Recursively collects \p I, plus every PHI node reachable from it through
/// incoming values or users, into SeenPHIs. A PHI that is already in the set
/// is not visited again, which terminates the recursion on cycles.
/// NOTE(review): the declaration of the SeenPHIs parameter is not visible in
/// this listing.
1760static void collectPHINodes(const PHINode &I,
1762 const auto [It, Inserted] = SeenPHIs.insert(&I);
1763 if (!Inserted)
1764 return;
1765
1766 for (const Value *Inc : I.incoming_values()) {
1767 if (const auto *PhiInc = dyn_cast<PHINode>(Inc))
1768 collectPHINodes(*PhiInc, SeenPHIs);
1769 }
1770
1771 for (const User *U : I.users()) {
1772 if (const auto *PhiU = dyn_cast<PHINode>(U))
1773 collectPHINodes(*PhiU, SeenPHIs);
1774 }
1775}
1776
1777bool AMDGPUCodeGenPrepareImpl::canBreakPHINode(const PHINode &I) {
1778 // Check in the cache first.
1779 if (const auto It = BreakPhiNodesCache.find(&I);
1780 It != BreakPhiNodesCache.end())
1781 return It->second;
1782
1783 // We consider PHI nodes as part of "chains", so given a PHI node I, we
1784 // recursively consider all its users and incoming values that are also PHI
1785 // nodes. We then make a decision about all of those PHIs at once. Either they
1786 // all get broken up, or none of them do. That way, we avoid cases where a
1787 // single PHI is/is not broken and we end up reforming/exploding a vector
1788 // multiple times, or even worse, doing it in a loop.
1789 SmallPtrSet<const PHINode *, 8> WorkList;
1790 collectPHINodes(I, WorkList);
1791
1792#ifndef NDEBUG
1793 // Check that none of the PHI nodes in the worklist are in the map. If some of
1794 // them are, it means we're not good enough at collecting related PHIs.
1795 for (const PHINode *WLP : WorkList) {
1796 assert(BreakPhiNodesCache.count(WLP) == 0);
1797 }
1798#endif
1799
1800 // To consider a PHI profitable to break, we need to see some interesting
1801 // incoming values. At least 2/3rd (rounded up) of all PHIs in the worklist
1802 // must have one to consider all PHIs breakable.
1803 //
1804 // This threshold has been determined through performance testing.
1805 //
1806 // Note that the computation below is equivalent to
1807 //
1808 // (unsigned)ceil((K / 3.0) * 2)
1809 //
1810 // It's simply written this way to avoid mixing integral/FP arithmetic.
1811 const auto Threshold = (alignTo(WorkList.size() * 2, 3) / 3);
1812 unsigned NumBreakablePHIs = 0;
1813 bool CanBreak = false;
1814 for (const PHINode *Cur : WorkList) {
1815 // Don't break PHIs that have no interesting incoming values. That is, where
1816 // there is no clear opportunity to fold the "extractelement" instructions
1817 // we would add.
1818 //
1819 // Note: IC does not run after this pass, so we're only interested in the
1820 // foldings that the DAG combiner can do.
1821 if (any_of(Cur->incoming_values(), isInterestingPHIIncomingValue)) {
1822 if (++NumBreakablePHIs >= Threshold) {
1823 CanBreak = true;
1824 break;
1825 }
1826 }
1827 }
1828
1829 for (const PHINode *Cur : WorkList)
1830 BreakPhiNodesCache[Cur] = CanBreak;
1831
1832 return CanBreak;
1833}
1834
1835/// Helper class for "break large PHIs" (visitPHINode).
1836///
1837/// This represents a slice of a PHI's incoming value, which is made up of:
1838/// - The type of the slice (Ty)
1839/// - The index in the incoming value's vector where the slice starts (Idx)
1840/// - The number of elements in the slice (NumElts).
1841/// It also keeps track of the NewPHI node inserted for this particular slice.
1842///
1843/// Slice examples:
1844/// <4 x i64> -> Split into four i64 slices.
1845/// -> [i64, 0, 1], [i64, 1, 1], [i64, 2, 1], [i64, 3, 1]
1846/// <5 x i16> -> Split into 2 <2 x i16> slices + an i16 tail.
1847/// -> [<2 x i16>, 0, 2], [<2 x i16>, 2, 2], [i16, 4, 1]
// NOTE(review): the class header line is not visible in this listing.
1849public:
1850 VectorSlice(Type *Ty, unsigned Idx, unsigned NumElts)
1851 : Ty(Ty), Idx(Idx), NumElts(NumElts) {}
1852
// Type of the slice.
1853 Type *Ty = nullptr;
// Index of the first element of the slice in the incoming vector.
1854 unsigned Idx = 0;
// Number of elements covered by the slice.
1855 unsigned NumElts = 0;
// PHI created for this slice; null until one is assigned.
1856 PHINode *NewPHI = nullptr;
1857
1858 /// Slice \p Inc according to the information contained within this slice.
1859 /// This is cached, so if called multiple times for the same \p BB & \p Inc
1860 /// pair, it returns the same Sliced value as well.
1861 ///
1862 /// Note this *intentionally* does not return the same value for, say,
1863 /// [%bb.0, %0] & [%bb.1, %0] as:
1864 /// - It could cause issues with dominance (e.g. if bb.1 is seen first, then
1865 /// the value in bb.1 may not be reachable from bb.0 if it's its
1866 /// predecessor.)
1867 /// - We also want to make our extract instructions as local as possible so
1868 /// the DAG has better chances of folding them out. Duplicating them like
1869 /// that is beneficial in that regard.
1870 ///
1871 /// This is both a minor optimization to avoid creating duplicate
1872 /// instructions, but also a requirement for correctness. It is not forbidden
1873 /// for a PHI node to have the same [BB, Val] pair multiple times. If we
1874 /// returned a new value each time, those previously identical pairs would all
1875 /// have different incoming values (from the same block) and it'd cause a "PHI
1876 /// node has multiple entries for the same basic block with different incoming
1877 /// values!" verifier error.
1878 Value *getSlicedVal(BasicBlock *BB, Value *Inc, StringRef NewValName) {
// Look up (or create) the cache slot for this [BB, Inc] pair.
1879 Value *&Res = SlicedVals[{BB, Inc}];
1880 if (Res)
1881 return Res;
1882
// NOTE(review): the construction of the builder B is not visible in this
// listing.
1884 if (Instruction *IncInst = dyn_cast<Instruction>(Inc))
1885 B.SetCurrentDebugLocation(IncInst->getDebugLoc());
1886
// Multi-element slices are taken with a shufflevector over the index range
// [Idx, Idx + NumElts); single-element slices with an extractelement.
1887 if (NumElts > 1) {
1889 for (unsigned K = Idx; K < (Idx + NumElts); ++K)
1890 Mask.push_back(K);
1891 Res = B.CreateShuffleVector(Inc, Mask, NewValName);
1892 } else
1893 Res = B.CreateExtractElement(Inc, Idx, NewValName);
1894
1895 return Res;
1896 }
1897
1898private:
1900};
1901
1902bool AMDGPUCodeGenPrepareImpl::visitPHINode(PHINode &I) {
1903 // Break-up fixed-vector PHIs into smaller pieces.
1904 // Default threshold is 32, so it breaks up any vector that's >32 bits into
1905 // its elements, or into 32-bit pieces (for 8/16 bit elts).
1906 //
1907 // This is only helpful for DAGISel because it doesn't handle large PHIs as
1908 // well as GlobalISel. DAGISel lowers PHIs by using CopyToReg/CopyFromReg.
1909 // With large, odd-sized PHIs we may end up needing many `build_vector`
1910 // operations with most elements being "undef". This inhibits a lot of
1911 // optimization opportunities and can result in unreasonably high register
1912 // pressure and the inevitable stack spilling.
1913 if (!BreakLargePHIs || getCGPassBuilderOption().EnableGlobalISelOption)
1914 return false;
1915
1916 FixedVectorType *FVT = dyn_cast<FixedVectorType>(I.getType());
1917 if (!FVT || FVT->getNumElements() == 1 ||
1918 DL.getTypeSizeInBits(FVT) <= BreakLargePHIsThreshold)
1919 return false;
1920
1921 if (!ForceBreakLargePHIs && !canBreakPHINode(I))
1922 return false;
1923
1924 std::vector<VectorSlice> Slices;
1925
1926 Type *EltTy = FVT->getElementType();
1927 {
1928 unsigned Idx = 0;
1929 // For 8/16 bits type, don't scalarize fully but break it up into as many
1930 // 32-bit slices as we can, and scalarize the tail.
1931 const unsigned EltSize = DL.getTypeSizeInBits(EltTy);
1932 const unsigned NumElts = FVT->getNumElements();
1933 if (EltSize == 8 || EltSize == 16) {
1934 const unsigned SubVecSize = (32 / EltSize);
1935 Type *SubVecTy = FixedVectorType::get(EltTy, SubVecSize);
1936 for (unsigned End = alignDown(NumElts, SubVecSize); Idx < End;
1937 Idx += SubVecSize)
1938 Slices.emplace_back(SubVecTy, Idx, SubVecSize);
1939 }
1940
1941 // Scalarize all remaining elements.
1942 for (; Idx < NumElts; ++Idx)
1943 Slices.emplace_back(EltTy, Idx, 1);
1944 }
1945
1946 assert(Slices.size() > 1);
1947
1948 // Create one PHI per vector piece. The "VectorSlice" class takes care of
1949 // creating the necessary instruction to extract the relevant slices of each
1950 // incoming value.
1951 IRBuilder<> B(I.getParent());
1952 B.SetCurrentDebugLocation(I.getDebugLoc());
1953
1954 unsigned IncNameSuffix = 0;
1955 for (VectorSlice &S : Slices) {
1956 // We need to reset the build on each iteration, because getSlicedVal may
1957 // have inserted something into I's BB.
1958 B.SetInsertPoint(I.getParent()->getFirstNonPHIIt());
1959 S.NewPHI = B.CreatePHI(S.Ty, I.getNumIncomingValues());
1960
1961 for (const auto &[Idx, BB] : enumerate(I.blocks())) {
1962 S.NewPHI->addIncoming(S.getSlicedVal(BB, I.getIncomingValue(Idx),
1963 "largephi.extractslice" +
1964 std::to_string(IncNameSuffix++)),
1965 BB);
1966 }
1967 }
1968
1969 // And replace this PHI with a vector of all the previous PHI values.
1970 Value *Vec = PoisonValue::get(FVT);
1971 unsigned NameSuffix = 0;
1972 for (VectorSlice &S : Slices) {
1973 const auto ValName = "largephi.insertslice" + std::to_string(NameSuffix++);
1974 if (S.NumElts > 1)
1975 Vec = B.CreateInsertVector(FVT, Vec, S.NewPHI, S.Idx, ValName);
1976 else
1977 Vec = B.CreateInsertElement(Vec, S.NewPHI, S.Idx, ValName);
1978 }
1979
1980 I.replaceAllUsesWith(Vec);
1981 DeadVals.push_back(&I);
1982 return true;
1983}
1984
1985/// \param V Value to check
1986/// \param DL DataLayout
1987/// \param TM TargetMachine (TODO: remove once DL contains nullptr values)
1988/// \param AS Target Address Space
1989/// \return true if \p V cannot be the null value of \p AS, false otherwise.
1990static bool isPtrKnownNeverNull(const Value *V, const DataLayout &DL,
1991 const AMDGPUTargetMachine &TM, unsigned AS) {
1992 // Pointer cannot be null if it's a block address, GV or alloca.
1993 // NOTE: We don't support extern_weak, but if we did, we'd need to check for
1994 // it as the symbol could be null in such cases.
1996 return true;
1997
1998 // Check nonnull arguments.
1999 if (const auto *Arg = dyn_cast<Argument>(V); Arg && Arg->hasNonNullAttr())
2000 return true;
2001
2002 // Check nonnull loads.
2003 if (const auto *Load = dyn_cast<LoadInst>(V);
2004 Load && Load->hasMetadata(LLVMContext::MD_nonnull))
2005 return true;
2006
2007 // getUnderlyingObject may have looked through another addrspacecast, although
2008 // the optimizable situations most likely folded out by now.
2009 if (AS != cast<PointerType>(V->getType())->getAddressSpace())
2010 return false;
2011
2012 // TODO: Calls that return nonnull?
2013
2014 // For all other things, use KnownBits.
2015 // We either use 0 or all bits set to indicate null, so check whether the
2016 // value can be zero or all ones.
2017 //
2018 // TODO: Use ValueTracking's isKnownNeverNull if it becomes aware that some
2019 // address spaces have non-zero null values.
2020 auto SrcPtrKB = computeKnownBits(V, DL);
2021 const auto NullVal = AMDGPU::getNullPointerValue(AS);
2022
2023 assert(SrcPtrKB.getBitWidth() == DL.getPointerSizeInBits(AS));
2024 assert((NullVal == 0 || NullVal == -1) &&
2025 "don't know how to check for this null value!");
2026 return NullVal ? !SrcPtrKB.getMaxValue().isAllOnes() : SrcPtrKB.isNonZero();
2027}
2028
2029bool AMDGPUCodeGenPrepareImpl::visitAddrSpaceCastInst(AddrSpaceCastInst &I) {
2030 // Intrinsic doesn't support vectors, also it seems that it's often difficult
2031 // to prove that a vector cannot have any nulls in it so it's unclear if it's
2032 // worth supporting.
2033 if (I.getType()->isVectorTy())
2034 return false;
2035
2036 // Check if this can be lowered to a amdgcn.addrspacecast.nonnull.
2037 // This is only worthwhile for casts from/to priv/local to flat.
2038 const unsigned SrcAS = I.getSrcAddressSpace();
2039 const unsigned DstAS = I.getDestAddressSpace();
2040
2041 bool CanLower = false;
2042 if (SrcAS == AMDGPUAS::FLAT_ADDRESS)
2043 CanLower = (DstAS == AMDGPUAS::LOCAL_ADDRESS ||
2044 DstAS == AMDGPUAS::PRIVATE_ADDRESS);
2045 else if (DstAS == AMDGPUAS::FLAT_ADDRESS)
2046 CanLower = (SrcAS == AMDGPUAS::LOCAL_ADDRESS ||
2047 SrcAS == AMDGPUAS::PRIVATE_ADDRESS);
2048 if (!CanLower)
2049 return false;
2050
2052 getUnderlyingObjects(I.getOperand(0), WorkList);
2053 if (!all_of(WorkList, [&](const Value *V) {
2054 return isPtrKnownNeverNull(V, DL, TM, SrcAS);
2055 }))
2056 return false;
2057
2058 IRBuilder<> B(&I);
2059 auto *Intrin = B.CreateIntrinsic(
2060 I.getType(), Intrinsic::amdgcn_addrspacecast_nonnull, {I.getOperand(0)});
2061 I.replaceAllUsesWith(Intrin);
2062 DeadVals.push_back(&I);
2063 return true;
2064}
2065
2066bool AMDGPUCodeGenPrepareImpl::visitIntrinsicInst(IntrinsicInst &I) {
2067 Intrinsic::ID IID = I.getIntrinsicID();
2068 switch (IID) {
2069 case Intrinsic::minnum:
2070 case Intrinsic::minimumnum:
2071 case Intrinsic::minimum:
2072 return visitFMinLike(I);
2073 case Intrinsic::sqrt:
2074 return visitSqrt(I);
2075 case Intrinsic::log:
2076 case Intrinsic::log10:
2077 return visitLog(cast<FPMathOperator>(I), IID);
2078 case Intrinsic::log2:
2079 // No reason to handle log2.
2080 return false;
2081 case Intrinsic::amdgcn_mbcnt_lo:
2082 return visitMbcntLo(I);
2083 case Intrinsic::amdgcn_mbcnt_hi:
2084 return visitMbcntHi(I);
2085 case Intrinsic::vector_reduce_add:
2086 return visitVectorReduceAdd(I);
2087 case Intrinsic::uadd_sat:
2088 case Intrinsic::sadd_sat:
2089 return visitSaturatingAdd(I);
2090 default:
2091 return false;
2092 }
2093}
2094
2095/// Match the core sequence in the fract pattern (x - floor(x), which doesn't
2096/// need to consider edge case handling.
2097Value *AMDGPUCodeGenPrepareImpl::matchFractPatImpl(Value &FractSrc,
2098 const APFloat &C) const {
2099 if (ST.hasFractBug())
2100 return nullptr;
2101
2102 Type *Ty = FractSrc.getType();
2103 if (!isLegalFloatingTy(Ty->getScalarType()))
2104 return nullptr;
2105
2106 APFloat OneNextDown = APFloat::getOne(C.getSemantics());
2107 OneNextDown.next(true);
2108
2109 // Match nextafter(1.0, -1)
2110 if (OneNextDown != C)
2111 return nullptr;
2112
2113 Value *FloorSrc;
2114 if (match(&FractSrc, m_FSub(m_Value(FloorSrc), m_Intrinsic<Intrinsic::floor>(
2115 m_Deferred(FloorSrc)))))
2116 return FloorSrc;
2117 return nullptr;
2118}
2119
2120/// Match non-nan fract pattern.
2121// MIN_CONSTANT = nextafter(1.0, -1.0)
2122/// minnum(fsub(x, floor(x)), MIN_CONSTANT)
2123/// minimumnum(fsub(x, floor(x)), MIN_CONSTANT)
2124/// minimum(fsub(x, floor(x)), MIN_CONSTANT)
2125
2126// x_sub_floor >= MIN_CONSTANT ? MIN_CONSTANT : x_sub_floor;
2127///
2128/// If fract is a useful instruction for the subtarget. Does not account for the
2129/// nan handling; the instruction has a nan check on the input value.
2130Value *AMDGPUCodeGenPrepareImpl::matchFractPatNanAvoidant(Value &V) {
2131 Value *Arg0;
2132 const APFloat *C;
2133
2134 // The value is only used in contexts where we know the input isn't a nan, so
2135 // any of the fmin variants are fine.
2136 if (!match(&V,
2140 return nullptr;
2141
2142 return matchFractPatImpl(*Arg0, *C);
2143}
2144
2145Value *AMDGPUCodeGenPrepareImpl::applyFractPat(IRBuilder<> &Builder,
2146 Value *FractArg) {
2147 SmallVector<Value *, 4> FractVals;
2148 extractValues(Builder, FractVals, FractArg);
2149
2150 SmallVector<Value *, 4> ResultVals(FractVals.size());
2151
2152 Type *Ty = FractArg->getType()->getScalarType();
2153 for (unsigned I = 0, E = FractVals.size(); I != E; ++I) {
2154 ResultVals[I] =
2155 Builder.CreateIntrinsic(Intrinsic::amdgcn_fract, {Ty}, {FractVals[I]});
2156 }
2157
2158 return insertValues(Builder, FractArg->getType(), ResultVals);
2159}
2160
2161bool AMDGPUCodeGenPrepareImpl::visitFMinLike(IntrinsicInst &I) {
2162 const APFloat *C;
2163 Value *FractArg;
2164
2165 // minimum(x - floor(x), MIN_CONSTANT)
2166 Value *X;
2167 if (!ST.hasFractBug() &&
2169 FractArg = matchFractPatImpl(*X, *C);
2170 if (!FractArg)
2171 return false;
2172 } else {
2173 // minnum(x - floor(x), MIN_CONSTANT)
2174 FractArg = matchFractPatNanAvoidant(I);
2175 if (!FractArg)
2176 return false;
2177
2178 // Match pattern for fract intrinsic in contexts where the nan check has
2179 // been optimized out (and hope the knowledge the source can't be nan wasn't
2180 // lost).
2181 if (!I.hasNoNaNs() && !isKnownNeverNaN(FractArg, SQ.getWithInstruction(&I)))
2182 return false;
2183 }
2184
2185 IRBuilder<> Builder(&I);
2186 FastMathFlags FMF = I.getFastMathFlags();
2187 FMF.setNoNaNs();
2188 Builder.setFastMathFlags(FMF);
2189
2190 Value *Fract = applyFractPat(Builder, FractArg);
2191 Fract->takeName(&I);
2192 I.replaceAllUsesWith(Fract);
2193 DeadVals.push_back(&I);
2194 return true;
2195}
2196
2197// Expand llvm.sqrt.f32 calls with !fpmath metadata in a semi-fast way.
2198bool AMDGPUCodeGenPrepareImpl::visitSqrt(IntrinsicInst &Sqrt) {
2199 Type *Ty = Sqrt.getType()->getScalarType();
2200 if (!Ty->isFloatTy() && (!Ty->isHalfTy() || ST.has16BitInsts()))
2201 return false;
2202
2203 const FPMathOperator *FPOp = cast<const FPMathOperator>(&Sqrt);
2204 FastMathFlags SqrtFMF = FPOp->getFastMathFlags();
2205
2206 // We're trying to handle the fast-but-not-that-fast case only. The lowering
2207 // of fast llvm.sqrt will give the raw instruction anyway.
2208 if (SqrtFMF.approxFunc())
2209 return false;
2210
2211 const float ReqdAccuracy = FPOp->getFPAccuracy();
2212
2213 // Defer correctly rounded expansion to codegen.
2214 if (ReqdAccuracy < 1.0f)
2215 return false;
2216
2217 Value *SrcVal = Sqrt.getOperand(0);
2218 bool CanTreatAsDAZ = canIgnoreDenormalInput(SrcVal, &Sqrt);
2219
2220 // The raw instruction is 1 ulp, but the correction for denormal handling
2221 // brings it to 2.
2222 if (!CanTreatAsDAZ && ReqdAccuracy < 2.0f)
2223 return false;
2224
2225 IRBuilder<> Builder(&Sqrt);
2226 SmallVector<Value *, 4> SrcVals;
2227 extractValues(Builder, SrcVals, SrcVal);
2228
2229 SmallVector<Value *, 4> ResultVals(SrcVals.size());
2230 for (int I = 0, E = SrcVals.size(); I != E; ++I) {
2231 if (CanTreatAsDAZ)
2232 ResultVals[I] = Builder.CreateCall(getSqrtF32(), SrcVals[I]);
2233 else
2234 ResultVals[I] = emitSqrtIEEE2ULP(Builder, SrcVals[I], SqrtFMF);
2235 }
2236
2237 Value *NewSqrt = insertValues(Builder, Sqrt.getType(), ResultVals);
2238 NewSqrt->takeName(&Sqrt);
2239 Sqrt.replaceAllUsesWith(NewSqrt);
2240 DeadVals.push_back(&Sqrt);
2241 return true;
2242}
2243
2244/// Replace log and log10 intrinsic calls based on fpmath metadata.
2245bool AMDGPUCodeGenPrepareImpl::visitLog(FPMathOperator &Log,
2246 Intrinsic::ID IID) {
2247 Type *Ty = Log.getType();
2248 if (!Ty->getScalarType()->isHalfTy() || !ST.has16BitInsts())
2249 return false;
2250
2251 FastMathFlags FMF = Log.getFastMathFlags();
2252
2253 // Defer fast math cases to codegen.
2254 if (FMF.approxFunc())
2255 return false;
2256
2257 // Limit experimentally determined from OpenCL conformance test (1.79)
2258 if (Log.getFPAccuracy() < 1.80f)
2259 return false;
2260
2261 IRBuilder<> Builder(&cast<CallInst>(Log));
2262
2263 // Use the generic intrinsic for convenience in the vector case. Codegen will
2264 // recognize the denormal handling is not necessary from the fpext.
2265 // TODO: Move to generic code
2266 Value *Log2 =
2267 Builder.CreateUnaryIntrinsic(Intrinsic::log2, Log.getOperand(0), FMF);
2268
2269 double Log2BaseInverted =
2270 IID == Intrinsic::log10 ? numbers::ln2 / numbers::ln10 : numbers::ln2;
2271 Value *Mul =
2272 Builder.CreateFMulFMF(Log2, ConstantFP::get(Ty, Log2BaseInverted), FMF);
2273
2274 Mul->takeName(&Log);
2275
2276 Log.replaceAllUsesWith(Mul);
2277 DeadVals.push_back(&Log);
2278 return true;
2279}
2280
2281bool AMDGPUCodeGenPrepare::runOnFunction(Function &F) {
2282 if (skipFunction(F))
2283 return false;
2284
2285 auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();
2286 if (!TPC)
2287 return false;
2288
2289 const AMDGPUTargetMachine &TM = TPC->getTM<AMDGPUTargetMachine>();
2290 const TargetLibraryInfo *TLI =
2291 &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F);
2292 AssumptionCache *AC =
2293 &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
2294 auto *DTWP = getAnalysisIfAvailable<DominatorTreeWrapperPass>();
2295 const DominatorTree *DT = DTWP ? &DTWP->getDomTree() : nullptr;
2296 const UniformityInfo &UA =
2297 getAnalysis<UniformityInfoWrapperPass>().getUniformityInfo();
2298 return AMDGPUCodeGenPrepareImpl(F, TM, TLI, AC, DT, UA).run();
2299}
2300
2303 const AMDGPUTargetMachine &ATM = static_cast<const AMDGPUTargetMachine &>(TM);
2304 const TargetLibraryInfo *TLI = &FAM.getResult<TargetLibraryAnalysis>(F);
2305 AssumptionCache *AC = &FAM.getResult<AssumptionAnalysis>(F);
2306 const DominatorTree *DT = FAM.getCachedResult<DominatorTreeAnalysis>(F);
2307 const UniformityInfo &UA = FAM.getResult<UniformityInfoAnalysis>(F);
2308 AMDGPUCodeGenPrepareImpl Impl(F, ATM, TLI, AC, DT, UA);
2309 if (!Impl.run())
2310 return PreservedAnalyses::all();
2312 if (!Impl.FlowChanged)
2314 return PA;
2315}
2316
2317INITIALIZE_PASS_BEGIN(AMDGPUCodeGenPrepare, DEBUG_TYPE,
2318 "AMDGPU IR optimizations", false, false)
2322INITIALIZE_PASS_END(AMDGPUCodeGenPrepare, DEBUG_TYPE, "AMDGPU IR optimizations",
2324
2325/// Create a workitem.id.x intrinsic call with range metadata.
2326CallInst *AMDGPUCodeGenPrepareImpl::createWorkitemIdX(IRBuilder<> &B) const {
2327 CallInst *Tid = B.CreateIntrinsic(Intrinsic::amdgcn_workitem_id_x, {});
2328 ST.makeLIDRangeMetadata(Tid);
2329 return Tid;
2330}
2331
2332/// Replace the instruction with a direct workitem.id.x call.
2333void AMDGPUCodeGenPrepareImpl::replaceWithWorkitemIdX(Instruction &I) const {
2334 IRBuilder<> B(&I);
2335 CallInst *Tid = createWorkitemIdX(B);
2337 ReplaceInstWithValue(BI, Tid);
2338}
2339
2340/// Replace the instruction with (workitem.id.x & mask).
2341void AMDGPUCodeGenPrepareImpl::replaceWithMaskedWorkitemIdX(
2342 Instruction &I, unsigned WaveSize) const {
2343 IRBuilder<> B(&I);
2344 CallInst *Tid = createWorkitemIdX(B);
2345 Constant *Mask = ConstantInt::get(Tid->getType(), WaveSize - 1);
2346 Value *AndInst = B.CreateAnd(Tid, Mask);
2348 ReplaceInstWithValue(BI, AndInst);
2349}
2350
2351/// Try to optimize mbcnt instruction by replacing with workitem.id.x when
2352/// work group size allows direct computation of lane ID.
2353/// Returns true if optimization was applied, false otherwise.
2354bool AMDGPUCodeGenPrepareImpl::tryReplaceWithWorkitemId(Instruction &I,
2355 unsigned Wave) const {
2356 std::optional<unsigned> MaybeX = ST.getReqdWorkGroupSize(F, 0);
2357 if (!MaybeX)
2358 return false;
2359
2360 // When work group size == wave_size, each work group contains exactly one
2361 // wave, so the instruction can be replaced with workitem.id.x directly.
2362 if (*MaybeX == Wave) {
2363 replaceWithWorkitemIdX(I);
2364 return true;
2365 }
2366
2367 // When work group evenly splits into waves, compute lane ID within wave
2368 // using bit masking: lane_id = workitem.id.x & (wave_size - 1).
2369 if (ST.hasWavefrontsEvenlySplittingXDim(F, /*RequiresUniformYZ=*/true)) {
2370 replaceWithMaskedWorkitemIdX(I, Wave);
2371 return true;
2372 }
2373
2374 return false;
2375}
2376
2377/// Optimize mbcnt.lo calls on wave32 architectures for lane ID computation.
2378bool AMDGPUCodeGenPrepareImpl::visitMbcntLo(IntrinsicInst &I) const {
2379 // This optimization only applies to wave32 targets where mbcnt.lo operates on
2380 // the full execution mask.
2381 if (!ST.isWave32())
2382 return false;
2383
2384 // Only optimize the pattern mbcnt.lo(~0, 0) which counts active lanes with
2385 // lower IDs.
2386 if (!match(&I,
2388 return false;
2389
2390 return tryReplaceWithWorkitemId(I, ST.getWavefrontSize());
2391}
2392
2393/// Optimize mbcnt.hi calls for lane ID computation.
2394bool AMDGPUCodeGenPrepareImpl::visitMbcntHi(IntrinsicInst &I) const {
2395 // Abort if wave size is not known at compile time.
2396 if (!ST.isWaveSizeKnown())
2397 return false;
2398
2399 unsigned Wave = ST.getWavefrontSize();
2400
2401 // On wave32, the upper 32 bits of execution mask are always 0, so
2402 // mbcnt.hi(mask, val) always returns val unchanged.
2403 if (ST.isWave32()) {
2404 if (auto MaybeX = ST.getReqdWorkGroupSize(F, 0)) {
2405 // Replace mbcnt.hi(mask, val) with val only when work group size matches
2406 // wave size (single wave per work group).
2407 if (*MaybeX == Wave) {
2409 ReplaceInstWithValue(BI, I.getArgOperand(1));
2410 return true;
2411 }
2412 }
2413 }
2414
2415 // Optimize the complete lane ID computation pattern:
2416 // mbcnt.hi(~0, mbcnt.lo(~0, 0)) which counts all active lanes with lower IDs
2417 // across the full execution mask.
2418 using namespace PatternMatch;
2419
2420 // Check for pattern: mbcnt.hi(~0, mbcnt.lo(~0, 0))
2423 m_AllOnes(), m_Zero()))))
2424 return false;
2425
2426 return tryReplaceWithWorkitemId(I, Wave);
2427}
2428
2429/// Check if type is <4 x i8>.
2430static bool isV4I8(Type *Ty) {
2432 return VTy && VTy->getNumElements() == 4 &&
2433 VTy->getElementType()->isIntegerTy(8);
2434}
2435
2436/// Helper to match the dot4 pattern: mul(zext/sext <4 x i8>, zext/sext <4 x
2437/// i8>) Returns true if pattern matches and signedness matches IsSigned.
2438/// Sets A, B to the <4 x i8> sources.
2439static bool matchDot4Pattern(Value *MulOp, Value *&A, Value *&B,
2440 bool IsSigned) {
2441 Value *Src0, *Src1;
2442 if (!match(MulOp, m_Mul(m_Value(Src0), m_Value(Src1))))
2443 return false;
2444
2445 // Check that result type is <4 x i32>
2447 if (!MulTy || MulTy->getNumElements() != 4 ||
2448 !MulTy->getElementType()->isIntegerTy(32))
2449 return false;
2450
2451 // Match zext or sext based on IsSigned
2452 Value *ExtSrc0, *ExtSrc1;
2453 if (IsSigned) {
2454 if (!match(Src0, m_SExt(m_Value(ExtSrc0))) || !isV4I8(ExtSrc0->getType()))
2455 return false;
2456 if (!match(Src1, m_SExt(m_Value(ExtSrc1))) || !isV4I8(ExtSrc1->getType()))
2457 return false;
2458 } else {
2459 if (!match(Src0, m_ZExt(m_Value(ExtSrc0))) || !isV4I8(ExtSrc0->getType()))
2460 return false;
2461 if (!match(Src1, m_ZExt(m_Value(ExtSrc1))) || !isV4I8(ExtSrc1->getType()))
2462 return false;
2463 }
2464
2465 A = ExtSrc0;
2466 B = ExtSrc1;
2467 return true;
2468}
2469
2470/// Try to convert vector.reduce.add(mul(zext/sext <4 x i8>, zext/sext <4 x
2471/// i8>)) to a dot4 intrinsic call (non-saturating case only).
2472bool AMDGPUCodeGenPrepareImpl::visitVectorReduceAdd(IntrinsicInst &I) {
2473 // Check if we have dot4 instructions available
2474 if (!ST.hasDot7Insts() || (!ST.hasDot1Insts() && !ST.hasDot8Insts()))
2475 return false;
2476
2477 Value *A = nullptr, *B = nullptr;
2478
2479 // Try unsigned first, then signed
2480 bool IsSigned = false;
2481 if (!matchDot4Pattern(I.getArgOperand(0), A, B, /*IsSigned=*/false)) {
2482 if (!matchDot4Pattern(I.getArgOperand(0), A, B, /*IsSigned=*/true))
2483 return false;
2484 IsSigned = true;
2485 }
2486
2487 LLVMContext &Ctx = I.getContext();
2488 Type *I32Ty = Type::getInt32Ty(Ctx);
2489 IRBuilder<> Builder(&I);
2490
2491 // Bitcast <4 x i8> to i32
2492 Value *ASrc = Builder.CreateBitCast(A, I32Ty);
2493 Value *BSrc = Builder.CreateBitCast(B, I32Ty);
2494
2495 // Non-saturating case: accumulator is 0, clamp is false
2496 Value *Acc = ConstantInt::get(I32Ty, 0);
2497 Value *Clamp = ConstantInt::getFalse(Ctx);
2498
2499 Intrinsic::ID DotIID =
2500 IsSigned ? Intrinsic::amdgcn_sdot4 : Intrinsic::amdgcn_udot4;
2501
2502 Value *Dot = Builder.CreateIntrinsic(DotIID, {}, {ASrc, BSrc, Acc, Clamp});
2503 Dot->takeName(&I);
2504
2505 I.replaceAllUsesWith(Dot);
2506 DeadVals.push_back(&I);
2507
2508 return true;
2509}
2510
2511/// Try to convert uadd.sat/sadd.sat(vector.reduce.add(mul(...)), c) to a
2512/// saturating dot4 intrinsic. This combine starts at the root (saturating add)
2513/// and looks at its operands.
2514bool AMDGPUCodeGenPrepareImpl::visitSaturatingAdd(IntrinsicInst &I) {
2515 // Check if we have dot4 instructions available
2516 if (!ST.hasDot7Insts() || (!ST.hasDot1Insts() && !ST.hasDot8Insts()))
2517 return false;
2518
2519 Intrinsic::ID IID = I.getIntrinsicID();
2520 bool IsSigned = (IID == Intrinsic::sadd_sat);
2521
2522 // Look for vector.reduce.add as one of the operands (commutative match)
2523 Value *Op0 = I.getArgOperand(0);
2524 Value *Op1 = I.getArgOperand(1);
2525 Value *MulOp = nullptr;
2526 Value *Accum = nullptr;
2527 IntrinsicInst *ReduceInst = nullptr;
2528
2530 ReduceInst = cast<IntrinsicInst>(Op0);
2531 Accum = Op1;
2532 } else if (match(Op1,
2534 ReduceInst = cast<IntrinsicInst>(Op1);
2535 Accum = Op0;
2536 } else {
2537 return false;
2538 }
2539
2540 Value *A = nullptr, *B = nullptr;
2541
2542 if (!matchDot4Pattern(MulOp, A, B, IsSigned))
2543 return false;
2544
2545 LLVMContext &Ctx = I.getContext();
2546 Type *I32Ty = Type::getInt32Ty(Ctx);
2547 IRBuilder<> Builder(&I);
2548
2549 // Bitcast <4 x i8> to i32
2550 Value *ASrc = Builder.CreateBitCast(A, I32Ty);
2551 Value *BSrc = Builder.CreateBitCast(B, I32Ty);
2552
2553 // Saturating case: use the accumulator and set clamp to true
2554 Value *Clamp = ConstantInt::getTrue(Ctx);
2555
2556 Intrinsic::ID DotIID =
2557 IsSigned ? Intrinsic::amdgcn_sdot4 : Intrinsic::amdgcn_udot4;
2558
2559 Value *Dot = Builder.CreateIntrinsic(DotIID, {}, {ASrc, BSrc, Accum, Clamp});
2560 Dot->takeName(&I);
2561
2562 I.replaceAllUsesWith(Dot);
2563 DeadVals.push_back(&I);
2564 // The reduce.add will be dead after this and cleaned up later
2565 if (ReduceInst->use_empty())
2566 DeadVals.push_back(ReduceInst);
2567
2568 return true;
2569}
2570
2571char AMDGPUCodeGenPrepare::ID = 0;
2572
2574 return new AMDGPUCodeGenPrepare();
2575}
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
static Value * insertValues(IRBuilder<> &Builder, Type *Ty, SmallVectorImpl< Value * > &Values)
static void extractValues(IRBuilder<> &Builder, SmallVectorImpl< Value * > &Values, Value *V)
static Value * getMulHu(IRBuilder<> &Builder, Value *LHS, Value *RHS)
static bool isInterestingPHIIncomingValue(const Value *V)
static SelectInst * findSelectThroughCast(Value *V, CastInst *&Cast)
static bool matchDot4Pattern(Value *MulOp, Value *&A, Value *&B, bool IsSigned)
Helper to match the dot4 pattern: mul(zext/sext <4 x i8>, zext/sext <4 x i8>) Returns true if pattern...
static bool isV4I8(Type *Ty)
Check if type is <4 x i8>.
static std::pair< Value *, Value * > getMul64(IRBuilder<> &Builder, Value *LHS, Value *RHS)
static Value * emitRsqIEEE1ULP(IRBuilder<> &Builder, Value *Src, bool IsNegative)
Emit an expansion of 1.0 / sqrt(Src) good for 1ulp that supports denormals.
static Value * getSign32(Value *V, IRBuilder<> &Builder, const DataLayout DL)
static void collectPHINodes(const PHINode &I, SmallPtrSet< const PHINode *, 8 > &SeenPHIs)
static bool isPtrKnownNeverNull(const Value *V, const DataLayout &DL, const AMDGPUTargetMachine &TM, unsigned AS)
static bool areInSameBB(const Value *A, const Value *B)
static cl::opt< bool > WidenLoads("amdgpu-late-codegenprepare-widen-constant-loads", cl::desc("Widen sub-dword constant address space loads in " "AMDGPULateCodeGenPrepare"), cl::ReallyHidden, cl::init(true))
The AMDGPU TargetMachine interface definition for hw codegen targets.
@ Scaled
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
#define X(NUM, ENUM, NAME)
Definition ELF.h:853
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
dxil translate DXIL Translate Metadata
static bool runOnFunction(Function &F, bool PostInlining)
#define DEBUG_TYPE
static Value * getOpcode(Value &V, Type &Ty, InstrumentationConfig &IConf, InstrumentorIRBuilderTy &IIRB)
#define F(x, y, z)
Definition MD5.cpp:54
#define I(x, y, z)
Definition MD5.cpp:57
#define T
ConstantRange Range(APInt(BitWidth, Low), APInt(BitWidth, High))
FunctionAnalysisManager FAM
#define INITIALIZE_PASS_DEPENDENCY(depName)
Definition PassSupport.h:42
#define INITIALIZE_PASS_END(passName, arg, name, cfg, analysis)
Definition PassSupport.h:44
#define INITIALIZE_PASS_BEGIN(passName, arg, name, cfg, analysis)
Definition PassSupport.h:39
const SmallVectorImpl< MachineOperand > & Cond
static void visit(BasicBlock &Start, std::function< bool(BasicBlock *)> op)
This file implements a set that has insertion order iteration characteristics.
static TableGen::Emitter::Opt Y("gen-skeleton-entry", EmitSkeleton, "Generate example skeleton entry")
static cl::opt< cl::boolOrDefault > EnableGlobalISelOption("global-isel", cl::Hidden, cl::desc("Enable the \"global\" instruction selector"))
Target-Independent Code Generator Pass Configuration Options pass.
This pass exposes codegen information to IR-level passes.
LLVM IR instance of the generic uniformity analysis.
Value * RHS
Value * LHS
BinaryOperator * Mul
VectorSlice(Type *Ty, unsigned Idx, unsigned NumElts)
Value * getSlicedVal(BasicBlock *BB, Value *Inc, StringRef NewValName)
Slice Inc according to the information contained within this slice.
PreservedAnalyses run(Function &, FunctionAnalysisManager &)
std::optional< unsigned > getReqdWorkGroupSize(const Function &F, unsigned Dim) const
bool hasWavefrontsEvenlySplittingXDim(const Function &F, bool REquiresUniformYZ=false) const
unsigned getWavefrontSize() const
static APFloat getOne(const fltSemantics &Sem, bool Negative=false)
Factory for Positive and Negative One.
Definition APFloat.h:1147
static APFloat getSmallestNormalized(const fltSemantics &Sem, bool Negative=false)
Returns the smallest (by magnitude) normalized finite number in the given semantics.
Definition APFloat.h:1217
opStatus next(bool nextDown)
Definition APFloat.h:1313
This class represents a conversion between pointers from one address space to another.
Represent the analysis usage information of a pass.
AnalysisUsage & addRequired()
void setPreservesAll()
Set by analyses that do not transform their input at all.
A function analysis which provides an AssumptionCache.
An immutable pass that tracks lazily created AssumptionCache objects.
A cache of @llvm.assume calls within a function.
LLVM Basic Block Representation.
Definition BasicBlock.h:62
InstListType::iterator iterator
Instruction iterators...
Definition BasicBlock.h:170
const Instruction * getTerminator() const LLVM_READONLY
Returns the terminator instruction; assumes that the block is well-formed.
Definition BasicBlock.h:237
BinaryOps getOpcode() const
Definition InstrTypes.h:409
BitVector & set()
Set all bits in the bitvector.
Definition BitVector.h:366
bool all() const
Returns true if all bits are set.
Definition BitVector.h:194
Represents analyses that only rely on functions' control flow.
Definition Analysis.h:73
This class represents a function call, abstracting a target machine's calling convention.
This is the base class for all instructions that perform data casts.
Definition InstrTypes.h:512
Instruction::CastOps getOpcode() const
Return the opcode of this CastInst.
Definition InstrTypes.h:674
TargetTransformInfo getTargetTransformInfo(const Function &F) const override
Get a TargetTransformInfo implementation for the target.
static ConstantAsMetadata * get(Constant *C)
Definition Metadata.h:537
static LLVM_ABI ConstantFP * getZero(Type *Ty, bool Negative=false)
LLVM_ABI bool isExactlyValue(const APFloat &V) const
We don't rely on operator== working on double values, as it returns true for things that are clearly ...
static LLVM_ABI ConstantFP * getInfinity(Type *Ty, bool Negative=false)
static LLVM_ABI ConstantInt * getTrue(LLVMContext &Context)
static LLVM_ABI ConstantInt * getFalse(LLVMContext &Context)
This is an important base class in LLVM.
Definition Constant.h:43
static LLVM_ABI Constant * getAllOnesValue(Type *Ty)
static LLVM_ABI Constant * getNullValue(Type *Ty)
Constructor to create a '0' constant of arbitrary type.
A parsed version of the target data layout string in and methods for querying it.
Definition DataLayout.h:64
Analysis pass which computes a DominatorTree.
Definition Dominators.h:270
Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree.
Definition Dominators.h:151
Utility class for floating point operations which can have information about relaxed accuracy require...
Definition Operator.h:202
FastMathFlags getFastMathFlags() const
Convenience function for getting all the fast-math flags.
Definition Operator.h:291
LLVM_ABI float getFPAccuracy() const
Get the maximum error permitted by this operation in ULPs.
Convenience struct for specifying and reasoning about fast-math flags.
Definition FMF.h:23
void setFast(bool B=true)
Definition FMF.h:96
bool noInfs() const
Definition FMF.h:66
bool allowReciprocal() const
Definition FMF.h:68
bool approxFunc() const
Definition FMF.h:70
void setNoNaNs(bool B=true)
Definition FMF.h:78
bool noNaNs() const
Definition FMF.h:65
bool allowContract() const
Definition FMF.h:69
Class to represent fixed width SIMD vectors.
unsigned getNumElements() const
static LLVM_ABI FixedVectorType * get(Type *ElementType, unsigned NumElts)
Definition Type.cpp:869
FunctionPass class - This class is used to implement most global optimizations.
Definition Pass.h:314
bool isWave32() const
bool isWaveSizeKnown() const
Returns whether the wave size of this subtarget is reliably known.
bool hasFractBug() const
bool isUniformAtDef(ConstValueRefT V) const
Whether V is uniform/non-divergent at its definition.
Value * CreateInsertElement(Type *VecTy, Value *NewElt, Value *Idx, const Twine &Name="")
Definition IRBuilder.h:2637
Value * CreateFDiv(Value *L, Value *R, const Twine &Name="", MDNode *FPMD=nullptr)
Definition IRBuilder.h:1715
Value * CreateExtractElement(Value *Vec, Value *Idx, const Twine &Name="")
Definition IRBuilder.h:2625
IntegerType * getIntNTy(unsigned N)
Fetch the type representing an N-bit integer.
Definition IRBuilder.h:599
Value * CreateZExtOrTrunc(Value *V, Type *DestTy, const Twine &Name="")
Create a ZExt or Trunc from the integer value V to DestTy.
Definition IRBuilder.h:2148
Value * CreateExtractValue(Value *Agg, ArrayRef< unsigned > Idxs, const Twine &Name="")
Definition IRBuilder.h:2684
LLVM_ABI CallInst * CreateIntrinsic(Intrinsic::ID ID, ArrayRef< Type * > OverloadTypes, ArrayRef< Value * > Args, FMFSource FMFSource={}, const Twine &Name="", ArrayRef< OperandBundleDef > OpBundles={})
Create a call to intrinsic ID with Args, mangled using OverloadTypes.
LLVM_ABI Value * CreateSelect(Value *C, Value *True, Value *False, const Twine &Name="", Instruction *MDFrom=nullptr)
Value * CreateFPToUI(Value *V, Type *DestTy, const Twine &Name="")
Definition IRBuilder.h:2176
Value * CreateSExt(Value *V, Type *DestTy, const Twine &Name="")
Definition IRBuilder.h:2142
void SetCurrentDebugLocation(const DebugLoc &L)
Set location information used by debugging information.
Definition IRBuilder.h:247
IntegerType * getInt32Ty()
Fetch the type representing a 32-bit integer.
Definition IRBuilder.h:586
Value * CreateUIToFP(Value *V, Type *DestTy, const Twine &Name="", bool IsNonNeg=false, MDNode *FPMathTag=nullptr)
Definition IRBuilder.h:2190
void setFastMathFlags(FastMathFlags NewFMF)
Set the fast-math flags to be used with generated fp-math operators.
Definition IRBuilder.h:352
Value * CreateFCmpOLT(Value *LHS, Value *RHS, const Twine &Name="", MDNode *FPMathTag=nullptr)
Definition IRBuilder.h:2439
Value * CreateFAbs(Value *V, FMFSource FMFSource={}, const Twine &Name="")
Create call to the fabs intrinsic.
Definition IRBuilder.h:1048
Value * CreateNeg(Value *V, const Twine &Name="", bool HasNSW=false)
Definition IRBuilder.h:1852
LLVM_ABI Value * createIsFPClass(Value *FPNum, unsigned Test)
ConstantInt * getInt32(uint32_t C)
Get a constant 32-bit value.
Definition IRBuilder.h:529
Value * CreateSub(Value *LHS, Value *RHS, const Twine &Name="", bool HasNUW=false, bool HasNSW=false)
Definition IRBuilder.h:1461
Value * CreateFMA(Value *Factor1, Value *Factor2, Value *Summand, FMFSource FMFSource={}, const Twine &Name="")
Create call to the fma intrinsic.
Definition IRBuilder.h:1115
Value * CreateBitCast(Value *V, Type *DestTy, const Twine &Name="")
Definition IRBuilder.h:2252
LoadInst * CreateLoad(Type *Ty, Value *Ptr, const char *Name)
Provided to resolve 'CreateLoad(Ty, Ptr, "...")' correctly, instead of converting the string to 'bool...
Definition IRBuilder.h:1928
Value * CreateShl(Value *LHS, Value *RHS, const Twine &Name="", bool HasNUW=false, bool HasNSW=false)
Definition IRBuilder.h:1533
FastMathFlags getFastMathFlags() const
Get the flags to be applied to created floating point ops.
Definition IRBuilder.h:341
Value * CreateZExt(Value *V, Type *DestTy, const Twine &Name="", bool IsNonNeg=false)
Definition IRBuilder.h:2130
Value * CreateFCmpOEQ(Value *LHS, Value *RHS, const Twine &Name="", MDNode *FPMathTag=nullptr)
Definition IRBuilder.h:2424
Value * CreateAnd(Value *LHS, Value *RHS, const Twine &Name="")
Definition IRBuilder.h:1592
Value * CreateAdd(Value *LHS, Value *RHS, const Twine &Name="", bool HasNUW=false, bool HasNSW=false)
Definition IRBuilder.h:1444
Type * getFloatTy()
Fetch the type representing a 32-bit floating point value.
Definition IRBuilder.h:614
CallInst * CreateCall(FunctionType *FTy, Value *Callee, ArrayRef< Value * > Args={}, const Twine &Name="", MDNode *FPMathTag=nullptr)
Definition IRBuilder.h:2563
Value * CreateTrunc(Value *V, Type *DestTy, const Twine &Name="", bool IsNUW=false, bool IsNSW=false)
Definition IRBuilder.h:2116
Value * CreateBinOp(Instruction::BinaryOps Opc, Value *LHS, Value *RHS, const Twine &Name="", MDNode *FPMathTag=nullptr)
Definition IRBuilder.h:1753
Value * CreateICmpUGE(Value *LHS, Value *RHS, const Twine &Name="")
Definition IRBuilder.h:2396
void SetInsertPoint(BasicBlock *TheBB)
This specifies that created instructions should be appended to the end of the specified block.
Definition IRBuilder.h:207
Value * CreateAShr(Value *LHS, Value *RHS, const Twine &Name="", bool isExact=false)
Definition IRBuilder.h:1573
Value * CreateXor(Value *LHS, Value *RHS, const Twine &Name="")
Definition IRBuilder.h:1644
Value * CreateSIToFP(Value *V, Type *DestTy, const Twine &Name="", MDNode *FPMathTag=nullptr)
Definition IRBuilder.h:2202
Value * CreateFMul(Value *L, Value *R, const Twine &Name="", MDNode *FPMD=nullptr)
Definition IRBuilder.h:1696
Value * CreateFNeg(Value *V, const Twine &Name="", MDNode *FPMathTag=nullptr)
Definition IRBuilder.h:1861
Value * CreateOr(Value *LHS, Value *RHS, const Twine &Name="", bool IsDisjoint=false)
Definition IRBuilder.h:1614
Value * CreateSExtOrTrunc(Value *V, Type *DestTy, const Twine &Name="")
Create a SExt or Trunc from the integer value V to DestTy.
Definition IRBuilder.h:2163
Value * CreateFMulFMF(Value *L, Value *R, FMFSource FMFSource, const Twine &Name="", MDNode *FPMD=nullptr)
Definition IRBuilder.h:1701
Value * CreateMul(Value *LHS, Value *RHS, const Twine &Name="", bool HasNUW=false, bool HasNSW=false)
Definition IRBuilder.h:1478
LLVM_ABI Value * CreateUnaryIntrinsic(Intrinsic::ID ID, Value *Op, FMFSource FMFSource={}, const Twine &Name="")
Create a call to intrinsic ID with 1 operand which is mangled on its type.
Value * CreateFCmpOGE(Value *LHS, Value *RHS, const Twine &Name="", MDNode *FPMathTag=nullptr)
Definition IRBuilder.h:2434
Value * CreateFPToSI(Value *V, Type *DestTy, const Twine &Name="")
Definition IRBuilder.h:2183
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
Definition IRBuilder.h:2868
Base class for instruction visitors.
Definition InstVisitor.h:78
const DebugLoc & getDebugLoc() const
Return the debug location for this node as a DebugLoc.
A wrapper class for inspecting calls to intrinsic functions.
This is an important class for using LLVM in a threaded context.
Definition LLVMContext.h:68
An instruction for reading from memory.
static MDTuple * get(LLVMContext &Context, ArrayRef< Metadata * > MDs)
Definition Metadata.h:1561
static LLVM_ABI PoisonValue * get(Type *T)
Static factory method - returns a 'poison' object of the specified type.
A set of analyses that are preserved following a run of a transformation pass.
Definition Analysis.h:112
static PreservedAnalyses none()
Convenience factory function for the empty preserved set.
Definition Analysis.h:115
static PreservedAnalyses all()
Construct a special preserved set that preserves all passes.
Definition Analysis.h:118
PreservedAnalyses & preserveSet()
Mark an analysis set as preserved.
Definition Analysis.h:151
This class represents the LLVM 'select' instruction.
const Value * getFalseValue() const
const Value * getCondition() const
const Value * getTrueValue() const
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or fewer elements.
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Represent a constant reference to a string, i.e.
Definition StringRef.h:56
Analysis pass providing the TargetLibraryInfo.
Provides information about what library functions are available for the current target.
const STC & getSubtarget(const Function &F) const
This method returns a pointer to the specified type of TargetSubtargetInfo.
static LLVM_ABI CastContextHint getCastContextHint(const Instruction *I)
Calculates a CastContextHint from I.
LLVM_ABI InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, TTI::CastContextHint CCH, TTI::TargetCostKind CostKind=TTI::TCK_SizeAndLatency, const Instruction *I=nullptr) const
@ TCK_RecipThroughput
Reciprocal throughput.
LLVM_ABI InstructionCost getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, TTI::OperandValueInfo Opd1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Opd2Info={TTI::OK_AnyValue, TTI::OP_None}, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr, const TargetLibraryInfo *TLibInfo=nullptr) const
This is an approximation of reciprocal throughput of a math/logic op.
The instances of the Type class are immutable: once they are created, they are never changed.
Definition Type.h:46
static LLVM_ABI IntegerType * getInt64Ty(LLVMContext &C)
Definition Type.cpp:310
LLVM_ABI unsigned getIntegerBitWidth() const
static LLVM_ABI IntegerType * getInt32Ty(LLVMContext &C)
Definition Type.cpp:309
bool isFloatTy() const
Return true if this is 'float', a 32-bit IEEE fp type.
Definition Type.h:155
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
Definition Type.h:368
LLVM_ABI Type * getWithNewBitWidth(unsigned NewBitWidth) const
Given an integer or vector type, change the lane bitwidth to NewBitwidth, whilst keeping the old numb...
bool isHalfTy() const
Return true if this is 'half', a 16-bit IEEE fp type.
Definition Type.h:144
LLVM_ABI unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
Definition Type.cpp:232
bool isDoubleTy() const
Return true if this is 'double', a 64-bit IEEE fp type.
Definition Type.h:158
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition Type.h:257
LLVM_ABI const fltSemantics & getFltSemantics() const
Definition Type.cpp:106
Analysis pass which computes UniformityInfo.
Legacy analysis pass which computes a CycleInfo.
void setOperand(unsigned i, Value *Val)
Definition User.h:212
Value * getOperand(unsigned i) const
Definition User.h:207
LLVM Value Representation.
Definition Value.h:75
Type * getType() const
All values are typed, get the type of this value.
Definition Value.h:255
bool hasOneUse() const
Return true if there is exactly one use of this value.
Definition Value.h:439
LLVM_ABI void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
Definition Value.cpp:553
bool use_empty() const
Definition Value.h:346
LLVM_ABI void takeName(Value *V)
Transfer the name from V to this value.
Definition Value.cpp:400
Type * getElementType() const
const ParentTy * getParent() const
Definition ilist_node.h:34
self_iterator getIterator()
Definition ilist_node.h:123
Changed
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
@ CONSTANT_ADDRESS_32BIT
Address space for 32-bit constant memory.
@ LOCAL_ADDRESS
Address space for local memory.
@ CONSTANT_ADDRESS
Address space for constant memory (VTX2).
@ FLAT_ADDRESS
Address space for flat memory.
@ PRIVATE_ADDRESS
Address space for private memory.
constexpr char Align[]
Key for Kernel::Arg::Metadata::mAlign.
constexpr int64_t getNullPointerValue(unsigned AS)
Get the null pointer value for the given address space.
void copyMetadataForWidenedLoad(LoadInst &Dest, const LoadInst &Source)
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
unsigned ID
LLVM IR allows arbitrary numbers to be used as calling convention identifiers.
Definition CallingConv.h:24
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
@ FMAD
FMAD - Perform a * b + c, while getting the same result as the separately rounded operations.
Definition ISDOpcodes.h:522
LLVM_ABI Function * getOrInsertDeclaration(Module *M, ID id, ArrayRef< Type * > OverloadTys={})
Look up the Function declaration of the intrinsic id in the Module M.
match_combine_or< Ty... > m_CombineOr(const Ty &...Ps)
Combine pattern matchers matching any of Ps patterns.
cst_pred_ty< is_all_ones > m_AllOnes()
Match an integer or vector with all bits set.
MaxMin_match< FCmpInst, LHS, RHS, ufmin_pred_ty > m_UnordFMin(const LHS &L, const RHS &R)
Match an 'unordered' floating point minimum function.
CmpClass_match< LHS, RHS, FCmpInst > m_FCmp(CmpPredicate &Pred, const LHS &L, const RHS &R)
match_combine_or< typename m_Intrinsic_Ty< Opnd0, Opnd1 >::Ty, typename m_Intrinsic_Ty< Opnd0, Opnd1 >::Ty > m_FMinNum_or_FMinimumNum(const Opnd0 &Op0, const Opnd1 &Op1)
BinaryOp_match< LHS, RHS, Instruction::FSub > m_FSub(const LHS &L, const RHS &R)
bool match(Val *V, const Pattern &P)
match_deferred< Value > m_Deferred(Value *const &V)
Like m_Specific(), but works if the specific value to match is determined as part of the same match()...
specificval_ty m_Specific(const Value *V)
Match if we have a specific specified value.
ap_match< APFloat > m_APFloatAllowPoison(const APFloat *&Res)
Match APFloat while allowing poison in splat vector constants.
IntrinsicID_match m_Intrinsic()
Match intrinsic calls like this: m_Intrinsic<Intrinsic::fabs>(m_Value(X))
ThreeOps_match< Cond, LHS, RHS, Instruction::Select > m_Select(const Cond &C, const LHS &L, const RHS &R)
Matches SelectInst.
m_Intrinsic_Ty< Opnd0, Opnd1 >::Ty m_FMinimum(const Opnd0 &Op0, const Opnd1 &Op1)
auto m_Value()
Match an arbitrary value and ignore it.
BinaryOp_match< LHS, RHS, Instruction::Mul > m_Mul(const LHS &L, const RHS &R)
cstfp_pred_ty< is_nonnan > m_NonNaN()
Match a non-NaN FP constant.
CastInst_match< OpTy, ZExtInst > m_ZExt(const OpTy &Op)
Matches ZExt.
cstfp_pred_ty< is_signed_inf< false > > m_PosInf()
Match a positive infinity FP constant.
cstfp_pred_ty< is_pos_zero_fp > m_PosZeroFP()
Match a floating-point positive zero.
CastInst_match< OpTy, SExtInst > m_SExt(const OpTy &Op)
Matches SExt.
is_zero m_Zero()
Match any null constant or a vector with all elements equal to 0.
m_Intrinsic_Ty< Opnd0 >::Ty m_FAbs(const Opnd0 &Op0)
initializer< Ty > init(const Ty &Val)
std::enable_if_t< detail::IsValidPointer< X, Y >::value, X * > extract(Y &&MD)
Extract a Value from Metadata.
Definition Metadata.h:668
constexpr double ln2
constexpr double ln10
This is an optimization pass for GlobalISel generic memory operations.
GenericUniformityInfo< SSAContext > UniformityInfo
FunctionAddr VTableAddr Value
Definition InstrProf.h:137
LLVM_ABI KnownFPClass computeKnownFPClass(const Value *V, const APInt &DemandedElts, FPClassTest InterestedClasses, const SimplifyQuery &SQ, unsigned Depth=0)
Determine which floating-point classes are valid for V, and return them in KnownFPClass bit sets.
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1738
LLVM_ABI bool RecursivelyDeleteTriviallyDeadInstructions(Value *V, const TargetLibraryInfo *TLI=nullptr, MemorySSAUpdater *MSSAU=nullptr, std::function< void(Value *)> AboutToDeleteCallback=std::function< void(Value *)>())
If the specified value is a trivially dead instruction, delete it.
Definition Local.cpp:535
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B,...
Definition STLExtras.h:2553
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:643
LLVM_ABI bool expandRemainderUpTo64Bits(BinaryOperator *Rem)
Generate code to calculate the remainder of two integers, replacing Rem with the generated code.
iterator_range< early_inc_iterator_impl< detail::IterOfRange< RangeT > > > make_early_inc_range(RangeT &&Range)
Make a range that does early increment to allow mutation of the underlying range without disrupting i...
Definition STLExtras.h:633
constexpr T alignDown(U Value, V Align, W Skew=0)
Returns the largest unsigned integer that is less than or equal to Value and congruent to Skew modulo Align.
Definition MathExtras.h:546
LLVM_ABI void ReplaceInstWithValue(BasicBlock::iterator &BI, Value *V)
Replace all uses of an instruction (specified by BI) with a value, then remove and delete the origina...
T bit_ceil(T Value)
Returns the smallest integral power of two no smaller than Value if Value is nonzero.
Definition bit.h:362
auto dyn_cast_or_null(const Y &Val)
Definition Casting.h:753
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1745
LLVM_ABI bool isInstructionTriviallyDead(Instruction *I, const TargetLibraryInfo *TLI=nullptr)
Return true if the result produced by the instruction is not used, and the instruction will return.
Definition Local.cpp:403
auto reverse(ContainerTy &&C)
Definition STLExtras.h:407
LLVM_ABI bool expandDivisionUpTo64Bits(BinaryOperator *Div)
Generate code to divide two integers, replacing Div with the generated code.
FPClassTest
Floating-point class tests, supported by 'is_fpclass' intrinsic.
LLVM_ABI void computeKnownBits(const Value *V, KnownBits &Known, const DataLayout &DL, AssumptionCache *AC=nullptr, const Instruction *CxtI=nullptr, const DominatorTree *DT=nullptr, bool UseInstrInfo=true, unsigned Depth=0)
Determine which bits of V are known to be either zero or one and return them in the KnownZero/KnownOn...
constexpr uint64_t alignTo(uint64_t Size, Align A)
Returns a multiple of A needed to store Size bytes.
Definition Alignment.h:144
LLVM_ABI Constant * ConstantFoldCastOperand(unsigned Opcode, Constant *C, Type *DestTy, const DataLayout &DL)
Attempt to constant fold a cast with the specified operand.
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference s...
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition Casting.h:547
LLVM_ABI Constant * ConstantFoldBinaryOpOperands(unsigned Opcode, Constant *LHS, Constant *RHS, const DataLayout &DL)
Attempt to constant fold a binary operation with the specified operands.
TargetTransformInfo TTI
IRBuilder(LLVMContext &, FolderTy, InserterTy, MDNode *, ArrayRef< OperandBundleDef >) -> IRBuilder< FolderTy, InserterTy >
FunctionPass * createAMDGPUCodeGenPreparePass()
To bit_cast(const From &from) noexcept
Definition bit.h:90
DWARFExpression::Operation Op
LLVM_ABI unsigned ComputeNumSignBits(const Value *Op, const DataLayout &DL, AssumptionCache *AC=nullptr, const Instruction *CxtI=nullptr, const DominatorTree *DT=nullptr, bool UseInstrInfo=true, unsigned Depth=0)
Return the number of times the sign bit of the register is replicated into the other bits.
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:559
LLVM_ABI bool isKnownNeverNaN(const Value *V, const SimplifyQuery &SQ, unsigned Depth=0)
Return true if the floating-point scalar value is not a NaN or if the floating-point vector value has...
LLVM_ABI unsigned ComputeMaxSignificantBits(const Value *Op, const DataLayout &DL, AssumptionCache *AC=nullptr, const Instruction *CxtI=nullptr, const DominatorTree *DT=nullptr, unsigned Depth=0)
Get the upper bound on bit size for this Value Op as a signed integer.
unsigned Log2(Align A)
Returns the log2 of the alignment.
Definition Alignment.h:197
LLVM_ABI bool isKnownToBeAPowerOfTwo(const Value *V, const DataLayout &DL, bool OrZero=false, AssumptionCache *AC=nullptr, const Instruction *CxtI=nullptr, const DominatorTree *DT=nullptr, bool UseInstrInfo=true, unsigned Depth=0)
Return true if the given value is known to have exactly one bit set when defined.
AnalysisManager< Function > FunctionAnalysisManager
Convenience typedef for the Function analysis manager.
LLVM_ABI void getUnderlyingObjects(const Value *V, SmallVectorImpl< const Value * > &Objects, const LoopInfo *LI=nullptr, unsigned MaxLookup=MaxLookupSearchDepth)
This method is similar to getUnderlyingObject except that it can look through phi and select instruct...
LLVM_ABI CGPassBuilderOption getCGPassBuilderOption()
#define N
DenormalModeKind Input
Denormal treatment kind for floating point instruction inputs in the default floating-point environme...
constexpr bool inputsAreZero() const
Return true if input denormals must be implicitly treated as 0.
static constexpr DenormalMode getPreserveSign()
bool isNonNegative() const
Returns true if this value is known to be non-negative.
Definition KnownBits.h:106
unsigned countMaxActiveBits() const
Returns the maximum number of bits needed to represent all possible unsigned values with these known ...
Definition KnownBits.h:310
bool isNegative() const
Returns true if this value is known to be negative.
Definition KnownBits.h:103
bool isKnownNeverSubnormal() const
Return true if it's known this can never be a subnormal.
LLVM_ABI bool isKnownNeverLogicalZero(DenormalMode Mode) const
Return true if it's known this can never be interpreted as a zero.
bool isKnownNeverPosInfinity() const
Return true if it's known this can never be +infinity.
const DataLayout & DL
const DominatorTree * DT
SimplifyQuery getWithInstruction(const Instruction *I) const
AssumptionCache * AC