AMDGPUCodeGenPrepare.cpp
1//===-- AMDGPUCodeGenPrepare.cpp ------------------------------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file
10/// This pass does misc. AMDGPU optimizations on IR before instruction
11/// selection.
12//
13//===----------------------------------------------------------------------===//
14
15#include "AMDGPU.h"
16#include "AMDGPUTargetMachine.h"
18#include "llvm/ADT/SetVector.h"
26#include "llvm/IR/Dominators.h"
27#include "llvm/IR/IRBuilder.h"
28#include "llvm/IR/InstVisitor.h"
29#include "llvm/IR/IntrinsicsAMDGPU.h"
31#include "llvm/IR/ValueHandle.h"
33#include "llvm/Pass.h"
39
40#define DEBUG_TYPE "amdgpu-codegenprepare"
41
42using namespace llvm;
43using namespace llvm::PatternMatch;
44
45namespace {
46
47static cl::opt<bool> WidenLoads(
48    "amdgpu-codegenprepare-widen-constant-loads",
49 cl::desc("Widen sub-dword constant address space loads in AMDGPUCodeGenPrepare"),
51 cl::init(false));
52
53static cl::opt<bool>
54 BreakLargePHIs("amdgpu-codegenprepare-break-large-phis",
55 cl::desc("Break large PHI nodes for DAGISel"),
56                   cl::ReallyHidden, cl::init(true));
57
58static cl::opt<bool>
59 ForceBreakLargePHIs("amdgpu-codegenprepare-force-break-large-phis",
60 cl::desc("For testing purposes, always break large "
61 "PHIs even if it isn't profitable."),
62                        cl::ReallyHidden, cl::init(false));
63
64static cl::opt<unsigned> BreakLargePHIsThreshold(
65 "amdgpu-codegenprepare-break-large-phis-threshold",
66 cl::desc("Minimum type size in bits for breaking large PHI nodes"),
67    cl::ReallyHidden, cl::init(32));
68
69static cl::opt<bool> UseMul24Intrin(
70 "amdgpu-codegenprepare-mul24",
71 cl::desc("Introduce mul24 intrinsics in AMDGPUCodeGenPrepare"),
73 cl::init(true));
74
75// Legalize 64-bit division by using the generic IR expansion.
76static cl::opt<bool> ExpandDiv64InIR(
77 "amdgpu-codegenprepare-expand-div64",
78 cl::desc("Expand 64-bit division in AMDGPUCodeGenPrepare"),
80 cl::init(false));
81
82// Leave all division operations as they are. This supersedes ExpandDiv64InIR
83// and is used for testing the legalizer.
84static cl::opt<bool> DisableIDivExpand(
85 "amdgpu-codegenprepare-disable-idiv-expansion",
86 cl::desc("Prevent expanding integer division in AMDGPUCodeGenPrepare"),
88 cl::init(false));
89
90// Disable processing of fdiv so we can better test the backend implementations.
91static cl::opt<bool> DisableFDivExpand(
92 "amdgpu-codegenprepare-disable-fdiv-expansion",
93 cl::desc("Prevent expanding floating point division in AMDGPUCodeGenPrepare"),
95 cl::init(false));
96
97class AMDGPUCodeGenPrepareImpl
98 : public InstVisitor<AMDGPUCodeGenPrepareImpl, bool> {
99public:
100 Function &F;
101 const GCNSubtarget &ST;
102 const AMDGPUTargetMachine &TM;
103 const TargetLibraryInfo *TLI;
104 AssumptionCache *AC;
105 const DominatorTree *DT;
106 const UniformityInfo &UA;
107 const DataLayout &DL;
108 const bool HasFP32DenormalFlush;
109 bool FlowChanged = false;
110 mutable Function *SqrtF32 = nullptr;
111 mutable Function *LdexpF32 = nullptr;
112 mutable SmallVector<WeakVH> DeadVals;
113
114 DenseMap<const PHINode *, bool> BreakPhiNodesCache;
115
116 AMDGPUCodeGenPrepareImpl(Function &F, const AMDGPUTargetMachine &TM,
117 const TargetLibraryInfo *TLI, AssumptionCache *AC,
118 const DominatorTree *DT, const UniformityInfo &UA)
119 : F(F), ST(TM.getSubtarget<GCNSubtarget>(F)), TM(TM), TLI(TLI), AC(AC),
120 DT(DT), UA(UA), DL(F.getDataLayout()),
121 HasFP32DenormalFlush(SIModeRegisterDefaults(F, ST).FP32Denormals ==
122                              DenormalMode::getPreserveSign()) {}
123
124 Function *getSqrtF32() const {
125 if (SqrtF32)
126 return SqrtF32;
127
128 LLVMContext &Ctx = F.getContext();
129    SqrtF32 = Intrinsic::getOrInsertDeclaration(
130        F.getParent(), Intrinsic::amdgcn_sqrt, {Type::getFloatTy(Ctx)});
131 return SqrtF32;
132 }
133
134 Function *getLdexpF32() const {
135 if (LdexpF32)
136 return LdexpF32;
137
138 LLVMContext &Ctx = F.getContext();
139    LdexpF32 = Intrinsic::getOrInsertDeclaration(
140        F.getParent(), Intrinsic::ldexp,
141 {Type::getFloatTy(Ctx), Type::getInt32Ty(Ctx)});
142 return LdexpF32;
143 }
144
145 bool canBreakPHINode(const PHINode &I);
146
147 /// Return true if \p T is a legal scalar floating point type.
148 bool isLegalFloatingTy(const Type *T) const;
149
150 /// Wrapper to pass all the arguments to computeKnownFPClass
151  KnownFPClass computeKnownFPClass(const Value *V, FPClassTest Interested,
152                                   const Instruction *CtxI) const {
153 return llvm::computeKnownFPClass(V, DL, Interested, TLI, AC, CtxI, DT);
154 }
155
156 bool canIgnoreDenormalInput(const Value *V, const Instruction *CtxI) const {
157 return HasFP32DenormalFlush ||
158           computeKnownFPClass(V, fcSubnormal, CtxI).isKnownNeverSubnormal();
159  }
160
161 /// \returns The minimum number of bits needed to store the value of \Op as an
162 /// unsigned integer. Truncating to this size and then zero-extending to
163 /// the original will not change the value.
164 unsigned numBitsUnsigned(Value *Op) const;
165
166 /// \returns The minimum number of bits needed to store the value of \Op as a
167 /// signed integer. Truncating to this size and then sign-extending to
168 /// the original size will not change the value.
169 unsigned numBitsSigned(Value *Op) const;
170
171 /// Replace mul instructions with llvm.amdgcn.mul.u24 or llvm.amdgcn.mul.s24.
172 /// SelectionDAG has an issue where an and asserting the bits are known
173 bool replaceMulWithMul24(BinaryOperator &I) const;
174
175 /// Perform same function as equivalently named function in DAGCombiner. Since
176 /// we expand some divisions here, we need to perform this before obscuring.
177 bool foldBinOpIntoSelect(BinaryOperator &I) const;
178
179 bool divHasSpecialOptimization(BinaryOperator &I,
180 Value *Num, Value *Den) const;
181 unsigned getDivNumBits(BinaryOperator &I, Value *Num, Value *Den,
182 unsigned MaxDivBits, bool Signed) const;
183
184 /// Expands 24 bit div or rem.
185 Value* expandDivRem24(IRBuilder<> &Builder, BinaryOperator &I,
186 Value *Num, Value *Den,
187 bool IsDiv, bool IsSigned) const;
188
189 Value *expandDivRem24Impl(IRBuilder<> &Builder, BinaryOperator &I,
190 Value *Num, Value *Den, unsigned NumBits,
191 bool IsDiv, bool IsSigned) const;
192
193 /// Expands 32 bit div or rem.
194 Value* expandDivRem32(IRBuilder<> &Builder, BinaryOperator &I,
195 Value *Num, Value *Den) const;
196
197 Value *shrinkDivRem64(IRBuilder<> &Builder, BinaryOperator &I,
198 Value *Num, Value *Den) const;
199 void expandDivRem64(BinaryOperator &I) const;
200
201 /// Widen a scalar load.
202 ///
203  /// \details Widen a uniform, small-type load from constant memory to a full
204  /// 32 bits, then truncate the result, so that a scalar load can be used
205  /// instead of a vector load.
206  ///
207  /// \returns True.
208
209 bool canWidenScalarExtLoad(LoadInst &I) const;
210
211 Value *matchFractPat(IntrinsicInst &I);
212 Value *applyFractPat(IRBuilder<> &Builder, Value *FractArg);
213
214 bool canOptimizeWithRsq(FastMathFlags DivFMF, FastMathFlags SqrtFMF) const;
215
216 Value *optimizeWithRsq(IRBuilder<> &Builder, Value *Num, Value *Den,
217 FastMathFlags DivFMF, FastMathFlags SqrtFMF,
218 const Instruction *CtxI) const;
219
220 Value *optimizeWithRcp(IRBuilder<> &Builder, Value *Num, Value *Den,
221 FastMathFlags FMF, const Instruction *CtxI) const;
222 Value *optimizeWithFDivFast(IRBuilder<> &Builder, Value *Num, Value *Den,
223 float ReqdAccuracy) const;
224
225 Value *visitFDivElement(IRBuilder<> &Builder, Value *Num, Value *Den,
226 FastMathFlags DivFMF, FastMathFlags SqrtFMF,
227 Value *RsqOp, const Instruction *FDiv,
228 float ReqdAccuracy) const;
229
230 std::pair<Value *, Value *> getFrexpResults(IRBuilder<> &Builder,
231 Value *Src) const;
232
233 Value *emitRcpIEEE1ULP(IRBuilder<> &Builder, Value *Src,
234 bool IsNegative) const;
235 Value *emitFrexpDiv(IRBuilder<> &Builder, Value *LHS, Value *RHS,
236 FastMathFlags FMF) const;
237 Value *emitSqrtIEEE2ULP(IRBuilder<> &Builder, Value *Src,
238 FastMathFlags FMF) const;
239 Value *emitRsqF64(IRBuilder<> &Builder, Value *X, FastMathFlags SqrtFMF,
240 FastMathFlags DivFMF, const Instruction *CtxI,
241 bool IsNegative) const;
242
243 CallInst *createWorkitemIdX(IRBuilder<> &B) const;
244 void replaceWithWorkitemIdX(Instruction &I) const;
245 void replaceWithMaskedWorkitemIdX(Instruction &I, unsigned WaveSize) const;
246 bool tryReplaceWithWorkitemId(Instruction &I, unsigned Wave) const;
247
248 bool tryNarrowMathIfNoOverflow(Instruction *I);
249
250public:
251 bool visitFDiv(BinaryOperator &I);
252
253 bool visitInstruction(Instruction &I) { return false; }
254 bool visitBinaryOperator(BinaryOperator &I);
255 bool visitLoadInst(LoadInst &I);
256 bool visitSelectInst(SelectInst &I);
257 bool visitPHINode(PHINode &I);
258 bool visitAddrSpaceCastInst(AddrSpaceCastInst &I);
259
260 bool visitIntrinsicInst(IntrinsicInst &I);
261 bool visitFMinLike(IntrinsicInst &I);
262 bool visitSqrt(IntrinsicInst &I);
263 bool visitMbcntLo(IntrinsicInst &I) const;
264 bool visitMbcntHi(IntrinsicInst &I) const;
265 bool run();
266};
267
268class AMDGPUCodeGenPrepare : public FunctionPass {
269public:
270 static char ID;
271 AMDGPUCodeGenPrepare() : FunctionPass(ID) {}
272 void getAnalysisUsage(AnalysisUsage &AU) const override {
273    AU.addRequired<AssumptionCacheTracker>();
274    AU.addRequired<UniformityInfoWrapperPass>();
275    AU.addRequired<TargetLibraryInfoWrapperPass>();
276
277 // FIXME: Division expansion needs to preserve the dominator tree.
278 if (!ExpandDiv64InIR)
279 AU.setPreservesAll();
280 }
281 bool runOnFunction(Function &F) override;
282 StringRef getPassName() const override { return "AMDGPU IR optimizations"; }
283};
284
285} // end anonymous namespace
286
287bool AMDGPUCodeGenPrepareImpl::run() {
288 BreakPhiNodesCache.clear();
289 bool MadeChange = false;
290
291 // Need to use make_early_inc_range because integer division expansion is
292 // handled by Transform/Utils, and it can delete instructions such as the
293 // terminator of the BB.
294 for (BasicBlock &BB : reverse(F)) {
295 for (Instruction &I : make_early_inc_range(reverse(BB))) {
296 if (!isInstructionTriviallyDead(&I, TLI))
297 MadeChange |= visit(I);
298 }
299 }
300
301 while (!DeadVals.empty()) {
302 if (auto *I = dyn_cast_or_null<Instruction>(DeadVals.pop_back_val()))
303      RecursivelyDeleteTriviallyDeadInstructions(I, TLI);
304  }
305
306 return MadeChange;
307}
308
309bool AMDGPUCodeGenPrepareImpl::isLegalFloatingTy(const Type *Ty) const {
310 return Ty->isFloatTy() || Ty->isDoubleTy() ||
311 (Ty->isHalfTy() && ST.has16BitInsts());
312}
313
314bool AMDGPUCodeGenPrepareImpl::canWidenScalarExtLoad(LoadInst &I) const {
315 Type *Ty = I.getType();
316 int TySize = DL.getTypeSizeInBits(Ty);
317 Align Alignment = DL.getValueOrABITypeAlignment(I.getAlign(), Ty);
318
319 return I.isSimple() && TySize < 32 && Alignment >= 4 && UA.isUniform(&I);
320}
321
322unsigned AMDGPUCodeGenPrepareImpl::numBitsUnsigned(Value *Op) const {
323 return computeKnownBits(Op, DL, AC).countMaxActiveBits();
324}
325
326unsigned AMDGPUCodeGenPrepareImpl::numBitsSigned(Value *Op) const {
327 return ComputeMaxSignificantBits(Op, DL, AC);
328}
329
330static void extractValues(IRBuilder<> &Builder,
331 SmallVectorImpl<Value *> &Values, Value *V) {
332 auto *VT = dyn_cast<FixedVectorType>(V->getType());
333 if (!VT) {
334 Values.push_back(V);
335 return;
336 }
337
338 for (int I = 0, E = VT->getNumElements(); I != E; ++I)
339 Values.push_back(Builder.CreateExtractElement(V, I));
340}
341
342static Value *insertValues(IRBuilder<> &Builder,
343                           Type *Ty,
344 SmallVectorImpl<Value *> &Values) {
345 if (!Ty->isVectorTy()) {
346 assert(Values.size() == 1);
347 return Values[0];
348 }
349
350 Value *NewVal = PoisonValue::get(Ty);
351 for (int I = 0, E = Values.size(); I != E; ++I)
352 NewVal = Builder.CreateInsertElement(NewVal, Values[I], I);
353
354 return NewVal;
355}
356
357bool AMDGPUCodeGenPrepareImpl::replaceMulWithMul24(BinaryOperator &I) const {
358 if (I.getOpcode() != Instruction::Mul)
359 return false;
360
361 Type *Ty = I.getType();
362 unsigned Size = Ty->getScalarSizeInBits();
363 if (Size <= 16 && ST.has16BitInsts())
364 return false;
365
366 // Prefer scalar if this could be s_mul_i32
367 if (UA.isUniform(&I))
368 return false;
369
370 Value *LHS = I.getOperand(0);
371 Value *RHS = I.getOperand(1);
372 IRBuilder<> Builder(&I);
373 Builder.SetCurrentDebugLocation(I.getDebugLoc());
374
375 unsigned LHSBits = 0, RHSBits = 0;
376 bool IsSigned = false;
377
378 if (ST.hasMulU24() && (LHSBits = numBitsUnsigned(LHS)) <= 24 &&
379 (RHSBits = numBitsUnsigned(RHS)) <= 24) {
380 IsSigned = false;
381
382 } else if (ST.hasMulI24() && (LHSBits = numBitsSigned(LHS)) <= 24 &&
383 (RHSBits = numBitsSigned(RHS)) <= 24) {
384 IsSigned = true;
385
386 } else
387 return false;
388
389 SmallVector<Value *, 4> LHSVals;
390 SmallVector<Value *, 4> RHSVals;
391 SmallVector<Value *, 4> ResultVals;
392 extractValues(Builder, LHSVals, LHS);
393 extractValues(Builder, RHSVals, RHS);
394
395 IntegerType *I32Ty = Builder.getInt32Ty();
396 IntegerType *IntrinTy = Size > 32 ? Builder.getInt64Ty() : I32Ty;
397 Type *DstTy = LHSVals[0]->getType();
398
399 for (int I = 0, E = LHSVals.size(); I != E; ++I) {
400 Value *LHS = IsSigned ? Builder.CreateSExtOrTrunc(LHSVals[I], I32Ty)
401 : Builder.CreateZExtOrTrunc(LHSVals[I], I32Ty);
402 Value *RHS = IsSigned ? Builder.CreateSExtOrTrunc(RHSVals[I], I32Ty)
403 : Builder.CreateZExtOrTrunc(RHSVals[I], I32Ty);
404    Intrinsic::ID ID =
405        IsSigned ? Intrinsic::amdgcn_mul_i24 : Intrinsic::amdgcn_mul_u24;
406 Value *Result = Builder.CreateIntrinsic(ID, {IntrinTy}, {LHS, RHS});
407 Result = IsSigned ? Builder.CreateSExtOrTrunc(Result, DstTy)
408 : Builder.CreateZExtOrTrunc(Result, DstTy);
409 ResultVals.push_back(Result);
410 }
411
412 Value *NewVal = insertValues(Builder, Ty, ResultVals);
413 NewVal->takeName(&I);
414 I.replaceAllUsesWith(NewVal);
415 DeadVals.push_back(&I);
416
417 return true;
418}
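// Illustrative IR sketch (assumed shapes, not taken from a test): for a
// divergent i32 multiply whose operands are known to fit in 24 bits, e.g.
//   %mul = mul i32 %a, %b
// the rewrite above produces
//   %mul24 = call i32 @llvm.amdgcn.mul.u24.i32(i32 %a, i32 %b)
// (or @llvm.amdgcn.mul.i24.* for the signed case) and replaces the uses of
// %mul. Vector multiplies are scalarized first and the per-element results
// are reassembled with insertelement.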
419
420// Find a select instruction, which may have been cast. This is mostly to deal
421// with cases where i16 selects were promoted here to i32.
422static SelectInst *findSelectThroughCast(Value *V, CastInst *&Cast) {
423  Cast = nullptr;
424 if (SelectInst *Sel = dyn_cast<SelectInst>(V))
425 return Sel;
426
427 if ((Cast = dyn_cast<CastInst>(V))) {
428 if (SelectInst *Sel = dyn_cast<SelectInst>(Cast->getOperand(0)))
429 return Sel;
430 }
431
432 return nullptr;
433}
434
435bool AMDGPUCodeGenPrepareImpl::foldBinOpIntoSelect(BinaryOperator &BO) const {
436 // Don't do this unless the old select is going away. We want to eliminate the
437 // binary operator, not replace a binop with a select.
438 int SelOpNo = 0;
439
440 CastInst *CastOp;
441
442 // TODO: Should probably try to handle some cases with multiple
443 // users. Duplicating the select may be profitable for division.
444 SelectInst *Sel = findSelectThroughCast(BO.getOperand(0), CastOp);
445 if (!Sel || !Sel->hasOneUse()) {
446 SelOpNo = 1;
447 Sel = findSelectThroughCast(BO.getOperand(1), CastOp);
448 }
449
450 if (!Sel || !Sel->hasOneUse())
451 return false;
452
453  Constant *CT = dyn_cast<Constant>(Sel->getTrueValue());
454  Constant *CF = dyn_cast<Constant>(Sel->getFalseValue());
455  Constant *CBO = dyn_cast<Constant>(BO.getOperand(SelOpNo ^ 1));
456 if (!CBO || !CT || !CF)
457 return false;
458
459 if (CastOp) {
460 if (!CastOp->hasOneUse())
461 return false;
462 CT = ConstantFoldCastOperand(CastOp->getOpcode(), CT, BO.getType(), DL);
463 CF = ConstantFoldCastOperand(CastOp->getOpcode(), CF, BO.getType(), DL);
464 }
465
466 // TODO: Handle special 0/-1 cases DAG combine does, although we only really
467 // need to handle divisions here.
468 Constant *FoldedT =
469 SelOpNo ? ConstantFoldBinaryOpOperands(BO.getOpcode(), CBO, CT, DL)
470 : ConstantFoldBinaryOpOperands(BO.getOpcode(), CT, CBO, DL);
471 if (!FoldedT || isa<ConstantExpr>(FoldedT))
472 return false;
473
474 Constant *FoldedF =
475 SelOpNo ? ConstantFoldBinaryOpOperands(BO.getOpcode(), CBO, CF, DL)
476 : ConstantFoldBinaryOpOperands(BO.getOpcode(), CF, CBO, DL);
477 if (!FoldedF || isa<ConstantExpr>(FoldedF))
478 return false;
479
480 IRBuilder<> Builder(&BO);
481 Builder.SetCurrentDebugLocation(BO.getDebugLoc());
482 if (const FPMathOperator *FPOp = dyn_cast<const FPMathOperator>(&BO))
483 Builder.setFastMathFlags(FPOp->getFastMathFlags());
484
485 Value *NewSelect = Builder.CreateSelect(Sel->getCondition(),
486 FoldedT, FoldedF);
487 NewSelect->takeName(&BO);
488 BO.replaceAllUsesWith(NewSelect);
489 DeadVals.push_back(&BO);
490 if (CastOp)
491 DeadVals.push_back(CastOp);
492 DeadVals.push_back(Sel);
493 return true;
494}
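// Illustrative sketch (hypothetical constants): given a one-use select feeding
// a binop whose other operand is also constant, e.g.
//   %sel = select i1 %c, i32 4, i32 8
//   %div = udiv i32 %sel, 2
// the fold above constant-folds each arm and emits
//   %sel.folded = select i1 %c, i32 2, i32 4
// replacing the udiv entirely, so no division has to be expanded.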
495
496std::pair<Value *, Value *>
497AMDGPUCodeGenPrepareImpl::getFrexpResults(IRBuilder<> &Builder,
498 Value *Src) const {
499 Type *Ty = Src->getType();
500 Value *Frexp = Builder.CreateIntrinsic(Intrinsic::frexp,
501 {Ty, Builder.getInt32Ty()}, Src);
502 Value *FrexpMant = Builder.CreateExtractValue(Frexp, {0});
503
504 // Bypass the bug workaround for the exponent result since it doesn't matter.
505 // TODO: Does the bug workaround even really need to consider the exponent
506 // result? It's unspecified by the spec.
507
508 Value *FrexpExp =
509 ST.hasFractBug()
510 ? Builder.CreateIntrinsic(Intrinsic::amdgcn_frexp_exp,
511 {Builder.getInt32Ty(), Ty}, Src)
512 : Builder.CreateExtractValue(Frexp, {1});
513 return {FrexpMant, FrexpExp};
514}
515
516/// Emit an expansion of 1.0 / Src good for 1ulp that supports denormals.
517Value *AMDGPUCodeGenPrepareImpl::emitRcpIEEE1ULP(IRBuilder<> &Builder,
518 Value *Src,
519 bool IsNegative) const {
520 // Same as for 1.0, but expand the sign out of the constant.
521 // -1.0 / x -> rcp (fneg x)
522 if (IsNegative)
523 Src = Builder.CreateFNeg(Src);
524
525 // The rcp instruction doesn't support denormals, so scale the input
526 // out of the denormal range and convert at the end.
527 //
528 // Expand as 2^-n * (1.0 / (x * 2^n))
529
530 // TODO: Skip scaling if input is known never denormal and the input
531 // range won't underflow to denormal. The hard part is knowing the
532 // result. We need a range check, the result could be denormal for
533 // 0x1p+126 < den <= 0x1p+127.
534 auto [FrexpMant, FrexpExp] = getFrexpResults(Builder, Src);
535 Value *ScaleFactor = Builder.CreateNeg(FrexpExp);
536 Value *Rcp = Builder.CreateUnaryIntrinsic(Intrinsic::amdgcn_rcp, FrexpMant);
537 return Builder.CreateCall(getLdexpF32(), {Rcp, ScaleFactor});
538}
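// Conceptual shape of the expansion above, shown as IR for illustration only
// (the exponent may instead come from @llvm.amdgcn.frexp.exp on fract-bug
// targets):
//   %frexp = call { float, i32 } @llvm.frexp.f32.i32(float %x)
//   %mant  = extractvalue { float, i32 } %frexp, 0
//   %exp   = extractvalue { float, i32 } %frexp, 1
//   %neg   = sub i32 0, %exp
//   %rcp   = call float @llvm.amdgcn.rcp.f32(float %mant)
//   %res   = call float @llvm.ldexp.f32.i32(float %rcp, i32 %neg)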
539
540/// Emit a 2ulp expansion for fdiv by using frexp for input scaling.
541Value *AMDGPUCodeGenPrepareImpl::emitFrexpDiv(IRBuilder<> &Builder, Value *LHS,
542 Value *RHS,
543 FastMathFlags FMF) const {
544  // If we have to work around the fract/frexp bug, we're worse off than
545 // using the fdiv.fast expansion. The full safe expansion is faster if we have
546 // fast FMA.
547 if (HasFP32DenormalFlush && ST.hasFractBug() && !ST.hasFastFMAF32() &&
548 (!FMF.noNaNs() || !FMF.noInfs()))
549 return nullptr;
550
551 // We're scaling the LHS to avoid a denormal input, and scale the denominator
552 // to avoid large values underflowing the result.
553 auto [FrexpMantRHS, FrexpExpRHS] = getFrexpResults(Builder, RHS);
554
555 Value *Rcp =
556 Builder.CreateUnaryIntrinsic(Intrinsic::amdgcn_rcp, FrexpMantRHS);
557
558 auto [FrexpMantLHS, FrexpExpLHS] = getFrexpResults(Builder, LHS);
559 Value *Mul = Builder.CreateFMul(FrexpMantLHS, Rcp);
560
561 // We multiplied by 2^N/2^M, so we need to multiply by 2^(N-M) to scale the
562 // result.
563 Value *ExpDiff = Builder.CreateSub(FrexpExpLHS, FrexpExpRHS);
564 return Builder.CreateCall(getLdexpF32(), {Mul, ExpDiff});
565}
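// In short, the sequence above computes
//   a / b = ldexp(mant(a) * rcp(mant(b)), exp(a) - exp(b))
// where frexp splits each value as mant * 2^exp with mant in [0.5, 1.0), so
// rcp never sees a denormal input and the intermediate product stays well
// inside the normal range.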
566
567/// Emit a sqrt that handles denormals and is accurate to 2ulp.
568Value *AMDGPUCodeGenPrepareImpl::emitSqrtIEEE2ULP(IRBuilder<> &Builder,
569 Value *Src,
570 FastMathFlags FMF) const {
571 Type *Ty = Src->getType();
572  APFloat SmallestNormal =
573      APFloat::getSmallestNormalized(Ty->getFltSemantics());
574 Value *NeedScale =
575 Builder.CreateFCmpOLT(Src, ConstantFP::get(Ty, SmallestNormal));
576
577 ConstantInt *Zero = Builder.getInt32(0);
578 Value *InputScaleFactor =
579 Builder.CreateSelect(NeedScale, Builder.getInt32(32), Zero);
580
581 Value *Scaled = Builder.CreateCall(getLdexpF32(), {Src, InputScaleFactor});
582
583 Value *Sqrt = Builder.CreateCall(getSqrtF32(), Scaled);
584
585 Value *OutputScaleFactor =
586 Builder.CreateSelect(NeedScale, Builder.getInt32(-16), Zero);
587 return Builder.CreateCall(getLdexpF32(), {Sqrt, OutputScaleFactor});
588}
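// The two scale factors are consistent because sqrt(x * 2^32) = sqrt(x) * 2^16,
// so the ldexp by -16 at the end exactly undoes the ldexp by +32 applied to a
// denormal (or near-denormal) input.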
589
590/// Emit an expansion of 1.0 / sqrt(Src) good for 1ulp that supports denormals.
591static Value *emitRsqIEEE1ULP(IRBuilder<> &Builder, Value *Src,
592 bool IsNegative) {
593 // bool need_scale = x < 0x1p-126f;
594 // float input_scale = need_scale ? 0x1.0p+24f : 1.0f;
595 // float output_scale = need_scale ? 0x1.0p+12f : 1.0f;
596 // rsq(x * input_scale) * output_scale;
597
598 Type *Ty = Src->getType();
599 APFloat SmallestNormal =
600 APFloat::getSmallestNormalized(Ty->getFltSemantics());
601 Value *NeedScale =
602 Builder.CreateFCmpOLT(Src, ConstantFP::get(Ty, SmallestNormal));
603 Constant *One = ConstantFP::get(Ty, 1.0);
604 Constant *InputScale = ConstantFP::get(Ty, 0x1.0p+24);
605 Constant *OutputScale =
606 ConstantFP::get(Ty, IsNegative ? -0x1.0p+12 : 0x1.0p+12);
607
608 Value *InputScaleFactor = Builder.CreateSelect(NeedScale, InputScale, One);
609
610 Value *ScaledInput = Builder.CreateFMul(Src, InputScaleFactor);
611 Value *Rsq = Builder.CreateUnaryIntrinsic(Intrinsic::amdgcn_rsq, ScaledInput);
612 Value *OutputScaleFactor = Builder.CreateSelect(
613 NeedScale, OutputScale, IsNegative ? ConstantFP::get(Ty, -1.0) : One);
614
615 return Builder.CreateFMul(Rsq, OutputScaleFactor);
616}
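// The constants above are consistent because rsq(x * 2^24) = rsq(x) * 2^-12,
// so multiplying the scaled result by 2^+12 (negated for the -1.0/sqrt(x)
// case) recovers rsq(x).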
617
618/// Emit inverse sqrt expansion for f64 with a correction sequence on top of
619/// v_rsq_f64. This should give a 1ulp result.
620Value *AMDGPUCodeGenPrepareImpl::emitRsqF64(IRBuilder<> &Builder, Value *X,
621 FastMathFlags SqrtFMF,
622 FastMathFlags DivFMF,
623 const Instruction *CtxI,
624 bool IsNegative) const {
625 // rsq(x):
626 // double y0 = BUILTIN_AMDGPU_RSQRT_F64(x);
627 // double e = MATH_MAD(-y0 * (x == PINF_F64 || x == 0.0 ? y0 : x), y0, 1.0);
628 // return MATH_MAD(y0*e, MATH_MAD(e, 0.375, 0.5), y0);
629 //
630 // -rsq(x):
631 // double y0 = BUILTIN_AMDGPU_RSQRT_F64(x);
632 // double e = MATH_MAD(-y0 * (x == PINF_F64 || x == 0.0 ? y0 : x), y0, 1.0);
633 // return MATH_MAD(-y0*e, MATH_MAD(e, 0.375, 0.5), -y0);
634 //
635 // The rsq instruction handles the special cases correctly. We need to check
636 // for the edge case conditions to ensure the special case propagates through
637 // the later instructions.
638
639 Value *Y0 = Builder.CreateUnaryIntrinsic(Intrinsic::amdgcn_rsq, X);
640
641 // Try to elide the edge case check.
642 //
643 // Fast math flags imply:
644 // sqrt ninf => !isinf(x)
645 // fdiv ninf => x != 0, !isinf(x)
646 bool MaybePosInf = !SqrtFMF.noInfs() && !DivFMF.noInfs();
647 bool MaybeZero = !DivFMF.noInfs();
648
649 DenormalMode DenormMode;
650 FPClassTest Interested = fcNone;
651 if (MaybePosInf)
652 Interested = fcPosInf;
653 if (MaybeZero)
654 Interested |= fcZero;
655
656 if (Interested != fcNone) {
657 KnownFPClass KnownSrc = computeKnownFPClass(X, Interested, CtxI);
658 if (KnownSrc.isKnownNeverPosInfinity())
659 MaybePosInf = false;
660
661 DenormMode = F.getDenormalMode(X->getType()->getFltSemantics());
662 if (KnownSrc.isKnownNeverLogicalZero(DenormMode))
663 MaybeZero = false;
664 }
665
666 Value *SpecialOrRsq = X;
667 if (MaybeZero || MaybePosInf) {
668 Value *Cond;
669 if (MaybePosInf && MaybeZero) {
670 if (DenormMode.Input != DenormalMode::DenormalModeKind::Dynamic) {
671 FPClassTest TestMask = fcPosInf | fcZero;
672 if (DenormMode.inputsAreZero())
673 TestMask |= fcSubnormal;
674
675 Cond = Builder.createIsFPClass(X, TestMask);
676 } else {
677 // Avoid using llvm.is.fpclass for dynamic denormal mode, since it
678 // doesn't respect the floating-point environment.
679 Value *IsZero =
680 Builder.CreateFCmpOEQ(X, ConstantFP::getZero(X->getType()));
681 Value *IsInf =
682 Builder.CreateFCmpOEQ(X, ConstantFP::getInfinity(X->getType()));
683 Cond = Builder.CreateOr(IsZero, IsInf);
684 }
685 } else if (MaybeZero) {
686 Cond = Builder.CreateFCmpOEQ(X, ConstantFP::getZero(X->getType()));
687 } else {
688 Cond = Builder.CreateFCmpOEQ(X, ConstantFP::getInfinity(X->getType()));
689 }
690
691 SpecialOrRsq = Builder.CreateSelect(Cond, Y0, X);
692 }
693
694 Value *NegY0 = Builder.CreateFNeg(Y0);
695 Value *NegXY0 = Builder.CreateFMul(SpecialOrRsq, NegY0);
696
697 // Could be fmuladd, but isFMAFasterThanFMulAndFAdd is always true for f64.
698 Value *E = Builder.CreateFMA(NegXY0, Y0, ConstantFP::get(X->getType(), 1.0));
699
700 Value *Y0E = Builder.CreateFMul(E, IsNegative ? NegY0 : Y0);
701
702 Value *EFMA = Builder.CreateFMA(E, ConstantFP::get(X->getType(), 0.375),
703 ConstantFP::get(X->getType(), 0.5));
704
705 return Builder.CreateFMA(Y0E, EFMA, IsNegative ? NegY0 : Y0);
706}
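// Algebraically, with y0 ~= 1/sqrt(x) and e = 1 - x*y0*y0, the sequence above
// evaluates y0 + y0*e*(0.5 + 0.375*e), i.e. a polynomial refinement of the
// initial estimate. The select on x == 0 or x == +inf only substitutes y0 for
// x in the error term so the special-case results of v_rsq_f64 propagate
// cleanly instead of turning into NaN.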
707
708bool AMDGPUCodeGenPrepareImpl::canOptimizeWithRsq(FastMathFlags DivFMF,
709 FastMathFlags SqrtFMF) const {
710 // The rsqrt contraction increases accuracy from ~2ulp to ~1ulp for f32 and
711 // f64.
712 return DivFMF.allowContract() && SqrtFMF.allowContract();
713}
714
715Value *AMDGPUCodeGenPrepareImpl::optimizeWithRsq(
716 IRBuilder<> &Builder, Value *Num, Value *Den, const FastMathFlags DivFMF,
717 const FastMathFlags SqrtFMF, const Instruction *CtxI) const {
718 // The rsqrt contraction increases accuracy from ~2ulp to ~1ulp.
719 assert(DivFMF.allowContract() && SqrtFMF.allowContract());
720
721 // rsq_f16 is accurate to 0.51 ulp.
722 // rsq_f32 is accurate for !fpmath >= 1.0ulp and denormals are flushed.
723 // rsq_f64 is never accurate.
724 const ConstantFP *CLHS = dyn_cast<ConstantFP>(Num);
725 if (!CLHS)
726 return nullptr;
727
728 bool IsNegative = false;
729
730 // TODO: Handle other numerator values with arcp.
731 if (CLHS->isExactlyValue(1.0) || (IsNegative = CLHS->isExactlyValue(-1.0))) {
732 // Add in the sqrt flags.
733 IRBuilder<>::FastMathFlagGuard Guard(Builder);
734 Builder.setFastMathFlags(DivFMF | SqrtFMF);
735
736 if (Den->getType()->isFloatTy()) {
737 if ((DivFMF.approxFunc() && SqrtFMF.approxFunc()) ||
738 canIgnoreDenormalInput(Den, CtxI)) {
739 Value *Result =
740 Builder.CreateUnaryIntrinsic(Intrinsic::amdgcn_rsq, Den);
741 // -1.0 / sqrt(x) -> fneg(rsq(x))
742 return IsNegative ? Builder.CreateFNeg(Result) : Result;
743 }
744
745 return emitRsqIEEE1ULP(Builder, Den, IsNegative);
746 }
747
748 if (Den->getType()->isDoubleTy())
749 return emitRsqF64(Builder, Den, SqrtFMF, DivFMF, CtxI, IsNegative);
750 }
751
752 return nullptr;
753}
754
755// Optimize fdiv with rcp:
756//
757// 1/x -> rcp(x) when rcp is sufficiently accurate or inaccurate rcp is
758// allowed with afn.
759//
760// a/b -> a*rcp(b) when arcp is allowed, and we only need provide ULP 1.0
761Value *
762AMDGPUCodeGenPrepareImpl::optimizeWithRcp(IRBuilder<> &Builder, Value *Num,
763 Value *Den, FastMathFlags FMF,
764 const Instruction *CtxI) const {
765 // rcp_f16 is accurate to 0.51 ulp.
766 // rcp_f32 is accurate for !fpmath >= 1.0ulp and denormals are flushed.
767 // rcp_f64 is never accurate.
768 assert(Den->getType()->isFloatTy());
769
770 if (const ConstantFP *CLHS = dyn_cast<ConstantFP>(Num)) {
771 bool IsNegative = false;
772 if (CLHS->isExactlyValue(1.0) ||
773 (IsNegative = CLHS->isExactlyValue(-1.0))) {
774 Value *Src = Den;
775
776 if (HasFP32DenormalFlush || FMF.approxFunc()) {
777 // -1.0 / x -> 1.0 / fneg(x)
778 if (IsNegative)
779 Src = Builder.CreateFNeg(Src);
780
781 // v_rcp_f32 and v_rsq_f32 do not support denormals, and according to
782        // the CI documentation they have a worst case error of 1 ulp.
783 // OpenCL requires <= 2.5 ulp for 1.0 / x, so it should always be OK
784 // to use it as long as we aren't trying to use denormals.
785 //
786 // v_rcp_f16 and v_rsq_f16 DO support denormals.
787
788 // NOTE: v_sqrt and v_rcp will be combined to v_rsq later. So we don't
789 // insert rsq intrinsic here.
790
791 // 1.0 / x -> rcp(x)
792 return Builder.CreateUnaryIntrinsic(Intrinsic::amdgcn_rcp, Src);
793 }
794
795 // TODO: If the input isn't denormal, and we know the input exponent isn't
796 // big enough to introduce a denormal we can avoid the scaling.
797 return emitRcpIEEE1ULP(Builder, Src, IsNegative);
798 }
799 }
800
801 if (FMF.allowReciprocal()) {
802 // x / y -> x * (1.0 / y)
803
804 // TODO: Could avoid denormal scaling and use raw rcp if we knew the output
805 // will never underflow.
806 if (HasFP32DenormalFlush || FMF.approxFunc()) {
807 Value *Recip = Builder.CreateUnaryIntrinsic(Intrinsic::amdgcn_rcp, Den);
808 return Builder.CreateFMul(Num, Recip);
809 }
810
811 Value *Recip = emitRcpIEEE1ULP(Builder, Den, false);
812 return Builder.CreateFMul(Num, Recip);
813 }
814
815 return nullptr;
816}
817
818// optimize with fdiv.fast:
819//
820// a/b -> fdiv.fast(a, b) when !fpmath >= 2.5ulp with denormals flushed.
821//
822// 1/x -> fdiv.fast(1,x) when !fpmath >= 2.5ulp.
823//
824// NOTE: optimizeWithRcp should be tried first because rcp is preferred.
825Value *AMDGPUCodeGenPrepareImpl::optimizeWithFDivFast(
826 IRBuilder<> &Builder, Value *Num, Value *Den, float ReqdAccuracy) const {
827 // fdiv.fast can achieve 2.5 ULP accuracy.
828 if (ReqdAccuracy < 2.5f)
829 return nullptr;
830
831 // Only have fdiv.fast for f32.
832 assert(Den->getType()->isFloatTy());
833
834 bool NumIsOne = false;
835 if (const ConstantFP *CNum = dyn_cast<ConstantFP>(Num)) {
836 if (CNum->isExactlyValue(+1.0) || CNum->isExactlyValue(-1.0))
837 NumIsOne = true;
838 }
839
840 // fdiv does not support denormals. But 1.0/x is always fine to use it.
841 //
842 // TODO: This works for any value with a specific known exponent range, don't
843 // just limit to constant 1.
844 if (!HasFP32DenormalFlush && !NumIsOne)
845 return nullptr;
846
847 return Builder.CreateIntrinsic(Intrinsic::amdgcn_fdiv_fast, {Num, Den});
848}
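// Illustrative sketch (hypothetical values): a divide tagged as tolerating
// 2.5 ulp, e.g.
//   %div = fdiv float %a, %b, !fpmath !0
//   !0 = !{float 2.500000e+00}
// is rewritten by the helper above into
//   %div = call float @llvm.amdgcn.fdiv.fast(float %a, float %b)
// provided f32 denormals are flushed or the numerator is exactly +/-1.0.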
849
850Value *AMDGPUCodeGenPrepareImpl::visitFDivElement(
851 IRBuilder<> &Builder, Value *Num, Value *Den, FastMathFlags DivFMF,
852 FastMathFlags SqrtFMF, Value *RsqOp, const Instruction *FDivInst,
853 float ReqdDivAccuracy) const {
854 if (RsqOp) {
855 Value *Rsq =
856 optimizeWithRsq(Builder, Num, RsqOp, DivFMF, SqrtFMF, FDivInst);
857 if (Rsq)
858 return Rsq;
859 }
860
861 if (!Num->getType()->isFloatTy())
862 return nullptr;
863
864 Value *Rcp = optimizeWithRcp(Builder, Num, Den, DivFMF, FDivInst);
865 if (Rcp)
866 return Rcp;
867
868 // In the basic case fdiv_fast has the same instruction count as the frexp div
869 // expansion. Slightly prefer fdiv_fast since it ends in an fmul that can
870 // potentially be fused into a user. Also, materialization of the constants
871 // can be reused for multiple instances.
872 Value *FDivFast = optimizeWithFDivFast(Builder, Num, Den, ReqdDivAccuracy);
873 if (FDivFast)
874 return FDivFast;
875
876 return emitFrexpDiv(Builder, Num, Den, DivFMF);
877}
878
879// Optimization is performed based on fpmath, fast math flags, and the denormal
880// mode to lower fdiv with either rcp or fdiv.fast.
881//
882// With rcp:
883// 1/x -> rcp(x) when rcp is sufficiently accurate or inaccurate rcp is
884// allowed with afn.
885//
886// a/b -> a*rcp(b) when inaccurate rcp is allowed with afn.
887//
888// With fdiv.fast:
889// a/b -> fdiv.fast(a, b) when !fpmath >= 2.5ulp with denormals flushed.
890//
891// 1/x -> fdiv.fast(1,x) when !fpmath >= 2.5ulp.
892//
893// NOTE: rcp is preferred in cases where both are legal.
894bool AMDGPUCodeGenPrepareImpl::visitFDiv(BinaryOperator &FDiv) {
895 if (DisableFDivExpand)
896 return false;
897
898 Type *Ty = FDiv.getType()->getScalarType();
899 const bool IsFloat = Ty->isFloatTy();
900 if (!IsFloat && !Ty->isDoubleTy())
901 return false;
902
903 // The f64 rcp/rsq approximations are pretty inaccurate. We can do an
904 // expansion around them in codegen. f16 is good enough to always use.
905
906 const FPMathOperator *FPOp = cast<const FPMathOperator>(&FDiv);
907 const FastMathFlags DivFMF = FPOp->getFastMathFlags();
908 const float ReqdAccuracy = FPOp->getFPAccuracy();
909
910 FastMathFlags SqrtFMF;
911
912 Value *Num = FDiv.getOperand(0);
913 Value *Den = FDiv.getOperand(1);
914
915 Value *RsqOp = nullptr;
916 auto *DenII = dyn_cast<IntrinsicInst>(Den);
917 if (DenII && DenII->getIntrinsicID() == Intrinsic::sqrt &&
918 DenII->hasOneUse()) {
919 const auto *SqrtOp = cast<FPMathOperator>(DenII);
920 SqrtFMF = SqrtOp->getFastMathFlags();
921 if (canOptimizeWithRsq(DivFMF, SqrtFMF))
922 RsqOp = SqrtOp->getOperand(0);
923 }
924
925 // rcp path not yet implemented for f64.
926 if (!IsFloat && !RsqOp)
927 return false;
928
929 // Inaccurate rcp is allowed with afn.
930 //
931 // Defer to codegen to handle this.
932 //
933 // TODO: Decide on an interpretation for interactions between afn + arcp +
934 // !fpmath, and make it consistent between here and codegen. For now, defer
935 // expansion of afn to codegen. The current interpretation is so aggressive we
936 // don't need any pre-consideration here when we have better information. A
937 // more conservative interpretation could use handling here.
938 const bool AllowInaccurateRcp = DivFMF.approxFunc();
939 if (!RsqOp && AllowInaccurateRcp)
940 return false;
941
942 // Defer the correct implementations to codegen.
943 if (IsFloat && ReqdAccuracy < 1.0f)
944 return false;
945
946 IRBuilder<> Builder(FDiv.getParent(), std::next(FDiv.getIterator()));
947 Builder.setFastMathFlags(DivFMF);
948 Builder.SetCurrentDebugLocation(FDiv.getDebugLoc());
949
950 SmallVector<Value *, 4> NumVals;
951 SmallVector<Value *, 4> DenVals;
952 SmallVector<Value *, 4> RsqDenVals;
953 extractValues(Builder, NumVals, Num);
954 extractValues(Builder, DenVals, Den);
955
956 if (RsqOp)
957 extractValues(Builder, RsqDenVals, RsqOp);
958
959 SmallVector<Value *, 4> ResultVals(NumVals.size());
960 for (int I = 0, E = NumVals.size(); I != E; ++I) {
961 Value *NumElt = NumVals[I];
962 Value *DenElt = DenVals[I];
963 Value *RsqDenElt = RsqOp ? RsqDenVals[I] : nullptr;
964
965 Value *NewElt =
966 visitFDivElement(Builder, NumElt, DenElt, DivFMF, SqrtFMF, RsqDenElt,
967 cast<Instruction>(FPOp), ReqdAccuracy);
968 if (!NewElt) {
969 // Keep the original, but scalarized.
970
971 // This has the unfortunate side effect of sometimes scalarizing when
972 // we're not going to do anything.
973 NewElt = Builder.CreateFDiv(NumElt, DenElt);
974 if (auto *NewEltInst = dyn_cast<Instruction>(NewElt))
975 NewEltInst->copyMetadata(FDiv);
976 }
977
978 ResultVals[I] = NewElt;
979 }
980
981 Value *NewVal = insertValues(Builder, FDiv.getType(), ResultVals);
982
983 if (NewVal) {
984 FDiv.replaceAllUsesWith(NewVal);
985 NewVal->takeName(&FDiv);
986 DeadVals.push_back(&FDiv);
987 }
988
989 return true;
990}
991
992static std::pair<Value*, Value*> getMul64(IRBuilder<> &Builder,
993 Value *LHS, Value *RHS) {
994 Type *I32Ty = Builder.getInt32Ty();
995 Type *I64Ty = Builder.getInt64Ty();
996
997 Value *LHS_EXT64 = Builder.CreateZExt(LHS, I64Ty);
998 Value *RHS_EXT64 = Builder.CreateZExt(RHS, I64Ty);
999 Value *MUL64 = Builder.CreateMul(LHS_EXT64, RHS_EXT64);
1000 Value *Lo = Builder.CreateTrunc(MUL64, I32Ty);
1001 Value *Hi = Builder.CreateLShr(MUL64, Builder.getInt64(32));
1002 Hi = Builder.CreateTrunc(Hi, I32Ty);
1003 return std::pair(Lo, Hi);
1004}
1005
1006static Value* getMulHu(IRBuilder<> &Builder, Value *LHS, Value *RHS) {
1007 return getMul64(Builder, LHS, RHS).second;
1008}
1009
1010/// Figure out how many bits are really needed for this division.
1011/// \p MaxDivBits is an optimization hint to bypass the second
1012/// ComputeNumSignBits/computeKnownBits call if the first one is
1013/// insufficient.
1014unsigned AMDGPUCodeGenPrepareImpl::getDivNumBits(BinaryOperator &I, Value *Num,
1015 Value *Den,
1016 unsigned MaxDivBits,
1017 bool IsSigned) const {
1018  assert(Num->getType()->getScalarSizeInBits() ==
1019         Den->getType()->getScalarSizeInBits());
1020 unsigned SSBits = Num->getType()->getScalarSizeInBits();
1021 if (IsSigned) {
1022 unsigned RHSSignBits = ComputeNumSignBits(Den, DL, AC, &I);
1023 // A sign bit needs to be reserved for shrinking.
1024 unsigned DivBits = SSBits - RHSSignBits + 1;
1025 if (DivBits > MaxDivBits)
1026 return SSBits;
1027
1028 unsigned LHSSignBits = ComputeNumSignBits(Num, DL, AC, &I);
1029
1030 unsigned SignBits = std::min(LHSSignBits, RHSSignBits);
1031 DivBits = SSBits - SignBits + 1;
1032 return DivBits;
1033 }
1034
1035 // All bits are used for unsigned division for Num or Den in range
1036 // (SignedMax, UnsignedMax].
1037 KnownBits Known = computeKnownBits(Den, DL, AC, &I);
1038 if (Known.isNegative() || !Known.isNonNegative())
1039 return SSBits;
1040 unsigned RHSSignBits = Known.countMinLeadingZeros();
1041 unsigned DivBits = SSBits - RHSSignBits;
1042 if (DivBits > MaxDivBits)
1043 return SSBits;
1044
1045 Known = computeKnownBits(Num, DL, AC, &I);
1046 if (Known.isNegative() || !Known.isNonNegative())
1047 return SSBits;
1048 unsigned LHSSignBits = Known.countMinLeadingZeros();
1049
1050 unsigned SignBits = std::min(LHSSignBits, RHSSignBits);
1051 DivBits = SSBits - SignBits;
1052 return DivBits;
1053}
1054
1055// The fractional part of a float is enough to accurately represent up to
1056// a 24-bit signed integer.
1057Value *AMDGPUCodeGenPrepareImpl::expandDivRem24(IRBuilder<> &Builder,
1058 BinaryOperator &I, Value *Num,
1059 Value *Den, bool IsDiv,
1060 bool IsSigned) const {
1061 unsigned DivBits = getDivNumBits(I, Num, Den, 24, IsSigned);
1062 if (DivBits > 24)
1063 return nullptr;
1064 return expandDivRem24Impl(Builder, I, Num, Den, DivBits, IsDiv, IsSigned);
1065}
1066
1067Value *AMDGPUCodeGenPrepareImpl::expandDivRem24Impl(
1068 IRBuilder<> &Builder, BinaryOperator &I, Value *Num, Value *Den,
1069 unsigned DivBits, bool IsDiv, bool IsSigned) const {
1070 Type *I32Ty = Builder.getInt32Ty();
1071 Num = Builder.CreateTrunc(Num, I32Ty);
1072 Den = Builder.CreateTrunc(Den, I32Ty);
1073
1074 Type *F32Ty = Builder.getFloatTy();
1075 ConstantInt *One = Builder.getInt32(1);
1076 Value *JQ = One;
1077
1078 if (IsSigned) {
1079 // char|short jq = ia ^ ib;
1080 JQ = Builder.CreateXor(Num, Den);
1081
1082 // jq = jq >> (bitsize - 2)
1083 JQ = Builder.CreateAShr(JQ, Builder.getInt32(30));
1084
1085 // jq = jq | 0x1
1086 JQ = Builder.CreateOr(JQ, One);
1087 }
1088
1089 // int ia = (int)LHS;
1090 Value *IA = Num;
1091
1092 // int ib, (int)RHS;
1093 Value *IB = Den;
1094
1095 // float fa = (float)ia;
1096 Value *FA = IsSigned ? Builder.CreateSIToFP(IA, F32Ty)
1097 : Builder.CreateUIToFP(IA, F32Ty);
1098
1099 // float fb = (float)ib;
1100 Value *FB = IsSigned ? Builder.CreateSIToFP(IB,F32Ty)
1101 : Builder.CreateUIToFP(IB,F32Ty);
1102
1103 Value *RCP = Builder.CreateIntrinsic(Intrinsic::amdgcn_rcp,
1104 Builder.getFloatTy(), {FB});
1105 Value *FQM = Builder.CreateFMul(FA, RCP);
1106
1107 // fq = trunc(fqm);
1108 CallInst *FQ = Builder.CreateUnaryIntrinsic(Intrinsic::trunc, FQM);
1109 FQ->copyFastMathFlags(Builder.getFastMathFlags());
1110
1111 // float fqneg = -fq;
1112 Value *FQNeg = Builder.CreateFNeg(FQ);
1113
1114 // float fr = mad(fqneg, fb, fa);
1115 auto FMAD = !ST.hasMadMacF32Insts()
1116 ? Intrinsic::fma
1117 : (Intrinsic::ID)Intrinsic::amdgcn_fmad_ftz;
1118 Value *FR = Builder.CreateIntrinsic(FMAD,
1119 {FQNeg->getType()}, {FQNeg, FB, FA}, FQ);
1120
1121 // int iq = (int)fq;
1122 Value *IQ = IsSigned ? Builder.CreateFPToSI(FQ, I32Ty)
1123 : Builder.CreateFPToUI(FQ, I32Ty);
1124
1125 // fr = fabs(fr);
1126 FR = Builder.CreateUnaryIntrinsic(Intrinsic::fabs, FR, FQ);
1127
1128 // fb = fabs(fb);
1129 FB = Builder.CreateUnaryIntrinsic(Intrinsic::fabs, FB, FQ);
1130
1131 // int cv = fr >= fb;
1132 Value *CV = Builder.CreateFCmpOGE(FR, FB);
1133
1134 // jq = (cv ? jq : 0);
1135 JQ = Builder.CreateSelect(CV, JQ, Builder.getInt32(0));
1136
1137 // dst = iq + jq;
1138 Value *Div = Builder.CreateAdd(IQ, JQ);
1139
1140 Value *Res = Div;
1141 if (!IsDiv) {
1142    // Rem needs compensation; it's easier to recompute it.
1143 Value *Rem = Builder.CreateMul(Div, Den);
1144 Res = Builder.CreateSub(Num, Rem);
1145 }
1146
1147 if (DivBits != 0 && DivBits < 32) {
1148 // Extend in register from the number of bits this divide really is.
1149 if (IsSigned) {
1150 int InRegBits = 32 - DivBits;
1151
1152 Res = Builder.CreateShl(Res, InRegBits);
1153 Res = Builder.CreateAShr(Res, InRegBits);
1154 } else {
1155 ConstantInt *TruncMask
1156 = Builder.getInt32((UINT64_C(1) << DivBits) - 1);
1157 Res = Builder.CreateAnd(Res, TruncMask);
1158 }
1159 }
1160
1161 return Res;
1162}
1163
1164// Try to recognize special cases for which the DAG will emit better expansions
1165// than the general expansion we do here.
1166
1167// TODO: It would be better to just directly handle those optimizations here.
1168bool AMDGPUCodeGenPrepareImpl::divHasSpecialOptimization(BinaryOperator &I,
1169 Value *Num,
1170 Value *Den) const {
1171 if (Constant *C = dyn_cast<Constant>(Den)) {
1172 // Arbitrary constants get a better expansion as long as a wider mulhi is
1173 // legal.
1174 if (C->getType()->getScalarSizeInBits() <= 32)
1175 return true;
1176
1177 // TODO: Sdiv check for not exact for some reason.
1178
1179 // If there's no wider mulhi, there's only a better expansion for powers of
1180 // two.
1181 // TODO: Should really know for each vector element.
1182 if (isKnownToBeAPowerOfTwo(C, DL, true, AC, &I, DT))
1183 return true;
1184
1185 return false;
1186 }
1187
1188 if (BinaryOperator *BinOpDen = dyn_cast<BinaryOperator>(Den)) {
1189 // fold (udiv x, (shl c, y)) -> x >>u (log2(c)+y) iff c is power of 2
1190 if (BinOpDen->getOpcode() == Instruction::Shl &&
1191 isa<Constant>(BinOpDen->getOperand(0)) &&
1192 isKnownToBeAPowerOfTwo(BinOpDen->getOperand(0), DL, true, AC, &I, DT)) {
1193 return true;
1194 }
1195 }
1196
1197 return false;
1198}
1199
1200static Value *getSign32(Value *V, IRBuilder<> &Builder, const DataLayout DL) {
1201 // Check whether the sign can be determined statically.
1202 KnownBits Known = computeKnownBits(V, DL);
1203 if (Known.isNegative())
1204 return Constant::getAllOnesValue(V->getType());
1205 if (Known.isNonNegative())
1206 return Constant::getNullValue(V->getType());
1207 return Builder.CreateAShr(V, Builder.getInt32(31));
1208}
1209
1210Value *AMDGPUCodeGenPrepareImpl::expandDivRem32(IRBuilder<> &Builder,
1211 BinaryOperator &I, Value *X,
1212 Value *Y) const {
1213 Instruction::BinaryOps Opc = I.getOpcode();
1214 assert(Opc == Instruction::URem || Opc == Instruction::UDiv ||
1215 Opc == Instruction::SRem || Opc == Instruction::SDiv);
1216
1217 FastMathFlags FMF;
1218 FMF.setFast();
1219 Builder.setFastMathFlags(FMF);
1220
1221 if (divHasSpecialOptimization(I, X, Y))
1222 return nullptr; // Keep it for later optimization.
1223
1224 bool IsDiv = Opc == Instruction::UDiv || Opc == Instruction::SDiv;
1225 bool IsSigned = Opc == Instruction::SRem || Opc == Instruction::SDiv;
1226
1227 Type *Ty = X->getType();
1228 Type *I32Ty = Builder.getInt32Ty();
1229 Type *F32Ty = Builder.getFloatTy();
1230
1231 if (Ty->getScalarSizeInBits() != 32) {
1232 if (IsSigned) {
1233 X = Builder.CreateSExtOrTrunc(X, I32Ty);
1234 Y = Builder.CreateSExtOrTrunc(Y, I32Ty);
1235 } else {
1236 X = Builder.CreateZExtOrTrunc(X, I32Ty);
1237 Y = Builder.CreateZExtOrTrunc(Y, I32Ty);
1238 }
1239 }
1240
1241 if (Value *Res = expandDivRem24(Builder, I, X, Y, IsDiv, IsSigned)) {
1242 return IsSigned ? Builder.CreateSExtOrTrunc(Res, Ty) :
1243 Builder.CreateZExtOrTrunc(Res, Ty);
1244 }
1245
1246 ConstantInt *Zero = Builder.getInt32(0);
1247 ConstantInt *One = Builder.getInt32(1);
1248
1249 Value *Sign = nullptr;
1250 if (IsSigned) {
1251 Value *SignX = getSign32(X, Builder, DL);
1252 Value *SignY = getSign32(Y, Builder, DL);
1253 // Remainder sign is the same as LHS
1254 Sign = IsDiv ? Builder.CreateXor(SignX, SignY) : SignX;
1255
1256 X = Builder.CreateAdd(X, SignX);
1257 Y = Builder.CreateAdd(Y, SignY);
1258
1259 X = Builder.CreateXor(X, SignX);
1260 Y = Builder.CreateXor(Y, SignY);
1261 }
1262
1263 // The algorithm here is based on ideas from "Software Integer Division", Tom
1264 // Rodeheffer, August 2008.
1265 //
1266 // unsigned udiv(unsigned x, unsigned y) {
1267 // // Initial estimate of inv(y). The constant is less than 2^32 to ensure
1268 // // that this is a lower bound on inv(y), even if some of the calculations
1269 // // round up.
1270 // unsigned z = (unsigned)((4294967296.0 - 512.0) * v_rcp_f32((float)y));
1271 //
1272 // // One round of UNR (Unsigned integer Newton-Raphson) to improve z.
1273 // // Empirically this is guaranteed to give a "two-y" lower bound on
1274 // // inv(y).
1275 // z += umulh(z, -y * z);
1276 //
1277 // // Quotient/remainder estimate.
1278 // unsigned q = umulh(x, z);
1279 // unsigned r = x - q * y;
1280 //
1281 // // Two rounds of quotient/remainder refinement.
1282 // if (r >= y) {
1283 // ++q;
1284 // r -= y;
1285 // }
1286 // if (r >= y) {
1287 // ++q;
1288 // r -= y;
1289 // }
1290 //
1291 // return q;
1292 // }
1293
1294 // Initial estimate of inv(y).
1295 Value *FloatY = Builder.CreateUIToFP(Y, F32Ty);
1296 Value *RcpY = Builder.CreateIntrinsic(Intrinsic::amdgcn_rcp, F32Ty, {FloatY});
1297 Constant *Scale = ConstantFP::get(F32Ty, llvm::bit_cast<float>(0x4F7FFFFE));
1298 Value *ScaledY = Builder.CreateFMul(RcpY, Scale);
1299 Value *Z = Builder.CreateFPToUI(ScaledY, I32Ty);
1300
1301 // One round of UNR.
1302 Value *NegY = Builder.CreateSub(Zero, Y);
1303 Value *NegYZ = Builder.CreateMul(NegY, Z);
1304 Z = Builder.CreateAdd(Z, getMulHu(Builder, Z, NegYZ));
1305
1306 // Quotient/remainder estimate.
1307 Value *Q = getMulHu(Builder, X, Z);
1308 Value *R = Builder.CreateSub(X, Builder.CreateMul(Q, Y));
1309
1310 // First quotient/remainder refinement.
1311 Value *Cond = Builder.CreateICmpUGE(R, Y);
1312 if (IsDiv)
1313 Q = Builder.CreateSelect(Cond, Builder.CreateAdd(Q, One), Q);
1314 R = Builder.CreateSelect(Cond, Builder.CreateSub(R, Y), R);
1315
1316 // Second quotient/remainder refinement.
1317 Cond = Builder.CreateICmpUGE(R, Y);
1318 Value *Res;
1319 if (IsDiv)
1320 Res = Builder.CreateSelect(Cond, Builder.CreateAdd(Q, One), Q);
1321 else
1322 Res = Builder.CreateSelect(Cond, Builder.CreateSub(R, Y), R);
1323
1324 if (IsSigned) {
1325 Res = Builder.CreateXor(Res, Sign);
1326 Res = Builder.CreateSub(Res, Sign);
1327 Res = Builder.CreateSExtOrTrunc(Res, Ty);
1328 } else {
1329 Res = Builder.CreateZExtOrTrunc(Res, Ty);
1330 }
1331 return Res;
1332}
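// Note on the magic constant above: 0x4F7FFFFE is the float 4294966784.0,
// i.e. 2^32 - 512, matching the "(4294967296.0 - 512.0)" factor in the
// pseudocode so that the initial integer estimate of inv(y) is always a
// lower bound.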
1333
1334Value *AMDGPUCodeGenPrepareImpl::shrinkDivRem64(IRBuilder<> &Builder,
1335 BinaryOperator &I, Value *Num,
1336 Value *Den) const {
1337 if (!ExpandDiv64InIR && divHasSpecialOptimization(I, Num, Den))
1338 return nullptr; // Keep it for later optimization.
1339
1340 Instruction::BinaryOps Opc = I.getOpcode();
1341
1342 bool IsDiv = Opc == Instruction::SDiv || Opc == Instruction::UDiv;
1343 bool IsSigned = Opc == Instruction::SDiv || Opc == Instruction::SRem;
1344
1345 unsigned NumDivBits = getDivNumBits(I, Num, Den, 32, IsSigned);
1346 if (NumDivBits > 32)
1347 return nullptr;
1348
1349 Value *Narrowed = nullptr;
1350 if (NumDivBits <= 24) {
1351 Narrowed = expandDivRem24Impl(Builder, I, Num, Den, NumDivBits,
1352 IsDiv, IsSigned);
1353 } else if (NumDivBits <= 32) {
1354 Narrowed = expandDivRem32(Builder, I, Num, Den);
1355 }
1356
1357 if (Narrowed) {
1358 return IsSigned ? Builder.CreateSExt(Narrowed, Num->getType()) :
1359 Builder.CreateZExt(Narrowed, Num->getType());
1360 }
1361
1362 return nullptr;
1363}
1364
1365void AMDGPUCodeGenPrepareImpl::expandDivRem64(BinaryOperator &I) const {
1366 Instruction::BinaryOps Opc = I.getOpcode();
1367 // Do the general expansion.
1368 if (Opc == Instruction::UDiv || Opc == Instruction::SDiv) {
1369    expandDivisionUpTo64Bits(&I);
1370    return;
1371 }
1372
1373 if (Opc == Instruction::URem || Opc == Instruction::SRem) {
1374    expandRemainderUpTo64Bits(&I);
1375    return;
1376 }
1377
1378 llvm_unreachable("not a division");
1379}
1380
1381/*
1382This can cause an inconsistency for non-byte-sized loads, for example:
1383```
1384 %load = load i1, ptr addrspace(4) %arg, align 4
1385 %zext = zext i1 %load to i64
1386 %add = add i64 %zext, ...
1387```
1388Instead of creating `s_and_b32 s0, s0, 1`,
1389it will create `s_and_b32 s0, s0, 0xff`.
1390We accept this change since the non-byte load assumes the upper bits
1391within the byte are all 0.
1392*/
1393bool AMDGPUCodeGenPrepareImpl::tryNarrowMathIfNoOverflow(Instruction *I) {
1394 unsigned Opc = I->getOpcode();
1395 Type *OldType = I->getType();
1396
1397 if (Opc != Instruction::Add && Opc != Instruction::Mul)
1398 return false;
1399
1400 unsigned OrigBit = OldType->getScalarSizeInBits();
1401
1402 if (Opc != Instruction::Add && Opc != Instruction::Mul)
1403 llvm_unreachable("Unexpected opcode, only valid for Instruction::Add and "
1404 "Instruction::Mul.");
1405
1406 unsigned MaxBitsNeeded = computeKnownBits(I, DL).countMaxActiveBits();
1407
1408 MaxBitsNeeded = std::max<unsigned>(bit_ceil(MaxBitsNeeded), 8);
1409 Type *NewType = DL.getSmallestLegalIntType(I->getContext(), MaxBitsNeeded);
1410 if (!NewType)
1411 return false;
1412 unsigned NewBit = NewType->getIntegerBitWidth();
1413 if (NewBit >= OrigBit)
1414 return false;
1415 NewType = I->getType()->getWithNewBitWidth(NewBit);
1416
1417 // Old cost
1418 const TargetTransformInfo &TTI = TM.getTargetTransformInfo(F);
1419  InstructionCost OldCost =
1420      TTI.getArithmeticInstrCost(Opc, OldType, TargetTransformInfo::TCK_RecipThroughput);
1421  // New cost of new op
1422  InstructionCost NewCost =
1423      TTI.getArithmeticInstrCost(Opc, NewType, TargetTransformInfo::TCK_RecipThroughput);
1424  // New cost of narrowing 2 operands (use trunc)
1425 int NumOfNonConstOps = 2;
1426 if (isa<Constant>(I->getOperand(0)) || isa<Constant>(I->getOperand(1))) {
1427 // Cannot be both constant, should be propagated
1428 NumOfNonConstOps = 1;
1429 }
1430  NewCost += NumOfNonConstOps * TTI.getCastInstrCost(Instruction::Trunc,
1431                                                     NewType, OldType,
1432                                                     TargetTransformInfo::CastContextHint::None,
1433                                                     TargetTransformInfo::TCK_RecipThroughput);
1434  // New cost of zext narrowed result to original type
1435 NewCost +=
1436      TTI.getCastInstrCost(Instruction::ZExt, OldType, NewType,
1437                           TargetTransformInfo::CastContextHint::None, TargetTransformInfo::TCK_RecipThroughput);
1438  if (NewCost >= OldCost)
1439 return false;
1440
1441 IRBuilder<> Builder(I);
1442 Value *Trunc0 = Builder.CreateTrunc(I->getOperand(0), NewType);
1443 Value *Trunc1 = Builder.CreateTrunc(I->getOperand(1), NewType);
1444 Value *Arith =
1445 Builder.CreateBinOp((Instruction::BinaryOps)Opc, Trunc0, Trunc1);
1446
1447 Value *Zext = Builder.CreateZExt(Arith, OldType);
1448 I->replaceAllUsesWith(Zext);
1449 DeadVals.push_back(I);
1450 return true;
1451}
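// Illustrative sketch (hypothetical known-bits): if a 64-bit add can be proven
// to need at most 16 result bits, the rewrite above turns
//   %add = add i64 %a, %b
// into
//   %a32  = trunc i64 %a to i32
//   %b32  = trunc i64 %b to i32
//   %n32  = add i32 %a32, %b32
//   %wide = zext i32 %n32 to i64
// and replaces all uses of %add with %wide, provided the narrower op plus the
// trunc/zext is cheaper according to the target cost model.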
1452
1453bool AMDGPUCodeGenPrepareImpl::visitBinaryOperator(BinaryOperator &I) {
1454 if (foldBinOpIntoSelect(I))
1455 return true;
1456
1457 if (UseMul24Intrin && replaceMulWithMul24(I))
1458 return true;
1459 if (tryNarrowMathIfNoOverflow(&I))
1460 return true;
1461
1462 bool Changed = false;
1463 Instruction::BinaryOps Opc = I.getOpcode();
1464 Type *Ty = I.getType();
1465 Value *NewDiv = nullptr;
1466 unsigned ScalarSize = Ty->getScalarSizeInBits();
1467
1468  SmallVector<BinaryOperator *, 8> Div64ToExpand;
1469
1470 if ((Opc == Instruction::URem || Opc == Instruction::UDiv ||
1471 Opc == Instruction::SRem || Opc == Instruction::SDiv) &&
1472 ScalarSize <= 64 &&
1473 !DisableIDivExpand) {
1474 Value *Num = I.getOperand(0);
1475 Value *Den = I.getOperand(1);
1476 IRBuilder<> Builder(&I);
1477 Builder.SetCurrentDebugLocation(I.getDebugLoc());
1478
1479 if (auto *VT = dyn_cast<FixedVectorType>(Ty)) {
1480 NewDiv = PoisonValue::get(VT);
1481
1482 for (unsigned N = 0, E = VT->getNumElements(); N != E; ++N) {
1483 Value *NumEltN = Builder.CreateExtractElement(Num, N);
1484 Value *DenEltN = Builder.CreateExtractElement(Den, N);
1485
1486 Value *NewElt;
1487 if (ScalarSize <= 32) {
1488 NewElt = expandDivRem32(Builder, I, NumEltN, DenEltN);
1489 if (!NewElt)
1490 NewElt = Builder.CreateBinOp(Opc, NumEltN, DenEltN);
1491 } else {
1492 // See if this 64-bit division can be shrunk to 32/24-bits before
1493 // producing the general expansion.
1494 NewElt = shrinkDivRem64(Builder, I, NumEltN, DenEltN);
1495 if (!NewElt) {
1496 // The general 64-bit expansion introduces control flow and doesn't
1497 // return the new value. Just insert a scalar copy and defer
1498 // expanding it.
1499 NewElt = Builder.CreateBinOp(Opc, NumEltN, DenEltN);
1500 // CreateBinOp does constant folding. If the operands are constant,
1501 // it will return a Constant instead of a BinaryOperator.
1502 if (auto *NewEltBO = dyn_cast<BinaryOperator>(NewElt))
1503 Div64ToExpand.push_back(NewEltBO);
1504 }
1505 }
1506
1507 if (auto *NewEltI = dyn_cast<Instruction>(NewElt))
1508 NewEltI->copyIRFlags(&I);
1509
1510 NewDiv = Builder.CreateInsertElement(NewDiv, NewElt, N);
1511 }
1512 } else {
1513 if (ScalarSize <= 32)
1514 NewDiv = expandDivRem32(Builder, I, Num, Den);
1515 else {
1516 NewDiv = shrinkDivRem64(Builder, I, Num, Den);
1517 if (!NewDiv)
1518 Div64ToExpand.push_back(&I);
1519 }
1520 }
1521
1522 if (NewDiv) {
1523 I.replaceAllUsesWith(NewDiv);
1524 DeadVals.push_back(&I);
1525 Changed = true;
1526 }
1527 }
1528
1529 if (ExpandDiv64InIR) {
1530 // TODO: We get much worse code in specially handled constant cases.
1531 for (BinaryOperator *Div : Div64ToExpand) {
1532 expandDivRem64(*Div);
1533 FlowChanged = true;
1534 Changed = true;
1535 }
1536 }
1537
1538 return Changed;
1539}
1540
1541bool AMDGPUCodeGenPrepareImpl::visitLoadInst(LoadInst &I) {
1542 if (!WidenLoads)
1543 return false;
1544
1545 if ((I.getPointerAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS ||
1546 I.getPointerAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT) &&
1547 canWidenScalarExtLoad(I)) {
1548 IRBuilder<> Builder(&I);
1549 Builder.SetCurrentDebugLocation(I.getDebugLoc());
1550
1551 Type *I32Ty = Builder.getInt32Ty();
1552 LoadInst *WidenLoad = Builder.CreateLoad(I32Ty, I.getPointerOperand());
1553 WidenLoad->copyMetadata(I);
1554
1555 // If we have range metadata, we need to convert the type, and not make
1556 // assumptions about the high bits.
1557 if (auto *Range = WidenLoad->getMetadata(LLVMContext::MD_range)) {
1558 ConstantInt *Lower =
1559 mdconst::extract<ConstantInt>(Range->getOperand(0));
1560
1561 if (Lower->isNullValue()) {
1562 WidenLoad->setMetadata(LLVMContext::MD_range, nullptr);
1563 } else {
1564 Metadata *LowAndHigh[] = {
1565 ConstantAsMetadata::get(ConstantInt::get(I32Ty, Lower->getValue().zext(32))),
1566 // Don't make assumptions about the high bits.
1567 ConstantAsMetadata::get(ConstantInt::get(I32Ty, 0))
1568 };
1569
1570 WidenLoad->setMetadata(LLVMContext::MD_range,
1571 MDNode::get(F.getContext(), LowAndHigh));
1572 }
1573 }
1574
1575 int TySize = DL.getTypeSizeInBits(I.getType());
1576 Type *IntNTy = Builder.getIntNTy(TySize);
1577 Value *ValTrunc = Builder.CreateTrunc(WidenLoad, IntNTy);
1578 Value *ValOrig = Builder.CreateBitCast(ValTrunc, I.getType());
1579 I.replaceAllUsesWith(ValOrig);
1580 DeadVals.push_back(&I);
1581 return true;
1582 }
1583
1584 return false;
1585}
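// Illustrative sketch: a uniform, 4-byte-aligned sub-dword load from the
// constant address space, e.g.
//   %v = load i8, ptr addrspace(4) %p, align 4
// is widened by the code above to
//   %w = load i32, ptr addrspace(4) %p, align 4
//   %t = trunc i32 %w to i8
// and all uses of %v are replaced with %t, so the access can be selected as a
// scalar (SMEM) load instead of a vector load.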
1586
1587bool AMDGPUCodeGenPrepareImpl::visitSelectInst(SelectInst &I) {
1588 Value *Cond = I.getCondition();
1589 Value *TrueVal = I.getTrueValue();
1590 Value *FalseVal = I.getFalseValue();
1591 Value *CmpVal;
1592 CmpPredicate Pred;
1593
1594 // Match fract pattern with nan check.
1595 if (!match(Cond, m_FCmp(Pred, m_Value(CmpVal), m_NonNaN())))
1596 return false;
1597
1598 FPMathOperator *FPOp = dyn_cast<FPMathOperator>(&I);
1599 if (!FPOp)
1600 return false;
1601
1602 IRBuilder<> Builder(&I);
1603 Builder.setFastMathFlags(FPOp->getFastMathFlags());
1604
1605 auto *IITrue = dyn_cast<IntrinsicInst>(TrueVal);
1606 auto *IIFalse = dyn_cast<IntrinsicInst>(FalseVal);
1607
1608 Value *Fract = nullptr;
1609 if (Pred == FCmpInst::FCMP_UNO && TrueVal == CmpVal && IIFalse &&
1610 CmpVal == matchFractPat(*IIFalse)) {
1611 // isnan(x) ? x : fract(x)
1612 Fract = applyFractPat(Builder, CmpVal);
1613 } else if (Pred == FCmpInst::FCMP_ORD && FalseVal == CmpVal && IITrue &&
1614 CmpVal == matchFractPat(*IITrue)) {
1615 // !isnan(x) ? fract(x) : x
1616 Fract = applyFractPat(Builder, CmpVal);
1617 } else
1618 return false;
1619
1620 Fract->takeName(&I);
1621 I.replaceAllUsesWith(Fract);
1622 DeadVals.push_back(&I);
1623 return true;
1624}
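// Illustrative sketch of the kind of pattern handled here (value names are
// made up; the exact matching is done by matchFractPat):
//   %floor = call float @llvm.floor.f32(float %x)
//   %sub   = fsub float %x, %floor
//   %min   = call float @llvm.minnum.f32(float %sub, float 0x3FEFFFFFE0000000)
//   %isnan = fcmp uno float %x, 0.000000e+00
//   %sel   = select i1 %isnan, float %x, float %min
// which, with the NaN check folded in, can collapse to a single call to
// @llvm.amdgcn.fract.f32(float %x).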
1625
1626static bool areInSameBB(const Value *A, const Value *B) {
1627 const auto *IA = dyn_cast<Instruction>(A);
1628 const auto *IB = dyn_cast<Instruction>(B);
1629 return IA && IB && IA->getParent() == IB->getParent();
1630}
1631
1632// Helper for breaking large PHIs that returns true when an extractelement on V
1633// is likely to be folded away by the DAG combiner.
1634static bool isInterestingPHIIncomingValue(const Value *V) {
1635  const auto *FVT = dyn_cast<FixedVectorType>(V->getType());
1636 if (!FVT)
1637 return false;
1638
1639 const Value *CurVal = V;
1640
1641 // Check for insertelements, keeping track of the elements covered.
1642 BitVector EltsCovered(FVT->getNumElements());
1643 while (const auto *IE = dyn_cast<InsertElementInst>(CurVal)) {
1644 const auto *Idx = dyn_cast<ConstantInt>(IE->getOperand(2));
1645
1646 // Non constant index/out of bounds index -> folding is unlikely.
1647 // The latter is more of a sanity check because canonical IR should just
1648 // have replaced those with poison.
1649 if (!Idx || Idx->getZExtValue() >= FVT->getNumElements())
1650 return false;
1651
1652 const auto *VecSrc = IE->getOperand(0);
1653
1654 // If the vector source is another instruction, it must be in the same basic
1655 // block. Otherwise, the DAGCombiner won't see the whole thing and is
1656 // unlikely to be able to do anything interesting here.
1657 if (isa<Instruction>(VecSrc) && !areInSameBB(VecSrc, IE))
1658 return false;
1659
1660 CurVal = VecSrc;
1661 EltsCovered.set(Idx->getZExtValue());
1662
1663 // All elements covered.
1664 if (EltsCovered.all())
1665 return true;
1666 }
1667
1668 // We either didn't find a single insertelement, or the insertelement chain
1669 // ended before all elements were covered. Check for other interesting values.
1670
1671 // Constants are always interesting because we can just constant fold the
1672 // extractelements.
1673 if (isa<Constant>(CurVal))
1674 return true;
1675
1676 // shufflevector is likely to be profitable if either operand is a constant,
1677 // or if either source is in the same block as the shuffle.
1678 // This is because shufflevector is most often lowered as a series of
1679 // insert/extract elements anyway.
1680 if (const auto *SV = dyn_cast<ShuffleVectorInst>(CurVal)) {
1681 return isa<Constant>(SV->getOperand(1)) ||
1682 areInSameBB(SV, SV->getOperand(0)) ||
1683 areInSameBB(SV, SV->getOperand(1));
1684 }
1685
1686 return false;
1687}
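// Illustrative example (hypothetical IR) of an "interesting" incoming value:
// a same-block insertelement chain that covers every lane, so any
// extractelement we emit can be folded straight back to the scalar sources:
//
//   %v0 = insertelement <2 x i16> poison, i16 %a, i64 0
//   %v1 = insertelement <2 x i16> %v0, i16 %b, i64 1
//   ; extractelement <2 x i16> %v1, i64 1  -->  %b
//
// A constant incoming value is interesting for the same reason: the extracts
// constant-fold away entirely.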
1688
1689static void collectPHINodes(const PHINode &I,
1690 SmallPtrSet<const PHINode *, 8> &SeenPHIs) {
1691 const auto [It, Inserted] = SeenPHIs.insert(&I);
1692 if (!Inserted)
1693 return;
1694
1695 for (const Value *Inc : I.incoming_values()) {
1696 if (const auto *PhiInc = dyn_cast<PHINode>(Inc))
1697 collectPHINodes(*PhiInc, SeenPHIs);
1698 }
1699
1700 for (const User *U : I.users()) {
1701 if (const auto *PhiU = dyn_cast<PHINode>(U))
1702 collectPHINodes(*PhiU, SeenPHIs);
1703 }
1704}
1705
1706bool AMDGPUCodeGenPrepareImpl::canBreakPHINode(const PHINode &I) {
1707 // Check in the cache first.
1708 if (const auto It = BreakPhiNodesCache.find(&I);
1709 It != BreakPhiNodesCache.end())
1710 return It->second;
1711
1712 // We consider PHI nodes as part of "chains", so given a PHI node I, we
1713 // recursively consider all its users and incoming values that are also PHI
1714 // nodes. We then make a decision about all of those PHIs at once. Either they
1715 // all get broken up, or none of them do. That way, we avoid cases where a
1716 // single PHI is/is not broken and we end up reforming/exploding a vector
1717 // multiple times, or even worse, doing it in a loop.
1718 SmallPtrSet<const PHINode *, 8> WorkList;
1719 collectPHINodes(I, WorkList);
1720
1721#ifndef NDEBUG
1722 // Check that none of the PHI nodes in the worklist are in the map. If some of
1723 // them are, it means we're not good enough at collecting related PHIs.
1724 for (const PHINode *WLP : WorkList) {
1725 assert(BreakPhiNodesCache.count(WLP) == 0);
1726 }
1727#endif
1728
1729 // To consider a PHI profitable to break, we need to see some interesting
1730 // incoming values. At least 2/3rd (rounded up) of all PHIs in the worklist
1731 // must have one to consider all PHIs breakable.
1732 //
1733 // This threshold has been determined through performance testing.
1734 //
1735 // Note that the computation below is equivalent to
1736 //
1737 // (unsigned)ceil((K / 3.0) * 2)
1738 //
1739 // It's simply written this way to avoid mixing integral/FP arithmetic.
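 // As a worked example (assumed values): with K = 5 PHIs in the worklist,
 // alignTo(5 * 2, 3) / 3 = 12 / 3 = 4, i.e. at least 4 of the 5 PHIs must
 // have an interesting incoming value before any of them are broken.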
1740 const auto Threshold = (alignTo(WorkList.size() * 2, 3) / 3);
1741 unsigned NumBreakablePHIs = 0;
1742 bool CanBreak = false;
1743 for (const PHINode *Cur : WorkList) {
1744 // Don't break PHIs that have no interesting incoming values. That is, where
1745 // there is no clear opportunity to fold the "extractelement" instructions
1746 // we would add.
1747 //
1748 // Note: IC does not run after this pass, so we're only interested in the
1749 // foldings that the DAG combiner can do.
1750 if (any_of(Cur->incoming_values(), isInterestingPHIIncomingValue)) {
1751 if (++NumBreakablePHIs >= Threshold) {
1752 CanBreak = true;
1753 break;
1754 }
1755 }
1756 }
1757
1758 for (const PHINode *Cur : WorkList)
1759 BreakPhiNodesCache[Cur] = CanBreak;
1760
1761 return CanBreak;
1762}
1763
1764/// Helper class for "break large PHIs" (visitPHINode).
1765///
1766/// This represents a slice of a PHI's incoming value, which is made up of:
1767/// - The type of the slice (Ty)
1768/// - The index in the incoming value's vector where the slice starts (Idx)
1769/// - The number of elements in the slice (NumElts).
1770/// It also keeps track of the NewPHI node inserted for this particular slice.
1771///
1772/// Slice examples:
1773/// <4 x i64> -> Split into four i64 slices.
1774/// -> [i64, 0, 1], [i64, 1, 1], [i64, 2, 1], [i64, 3, 1]
1775/// <5 x i16> -> Split into 2 <2 x i16> slices + a i16 tail.
1776/// -> [<2 x i16>, 0, 2], [<2 x i16>, 2, 2], [i16, 4, 1]
1777class VectorSlice {
1778public:
1779 VectorSlice(Type *Ty, unsigned Idx, unsigned NumElts)
1780 : Ty(Ty), Idx(Idx), NumElts(NumElts) {}
1781
1782 Type *Ty = nullptr;
1783 unsigned Idx = 0;
1784 unsigned NumElts = 0;
1785 PHINode *NewPHI = nullptr;
1786
1787 /// Slice \p Inc according to the information contained within this slice.
1788 /// This is cached, so if called multiple times for the same \p BB & \p Inc
1789 /// pair, it returns the same Sliced value as well.
1790 ///
1791 /// Note this *intentionally* does not return the same value for, say,
1792 /// [%bb.0, %0] & [%bb.1, %0] as:
1793 /// - It could cause issues with dominance (e.g. if bb.1 is seen first, then
1794 /// the value in bb.1 may not be reachable from bb.0 if it's its
1795 /// predecessor.)
1796 /// - We also want to make our extract instructions as local as possible so
1797 /// the DAG has better chances of folding them out. Duplicating them like
1798 /// that is beneficial in that regard.
1799 ///
1800 /// This is both a minor optimization to avoid creating duplicate
1801 /// instructions and a requirement for correctness. It is not forbidden
1802 /// for a PHI node to have the same [BB, Val] pair multiple times. If we
1803 /// returned a new value each time, those previously identical pairs would all
1804 /// have different incoming values (from the same block) and it'd cause a "PHI
1805 /// node has multiple entries for the same basic block with different incoming
1806 /// values!" verifier error.
1807 Value *getSlicedVal(BasicBlock *BB, Value *Inc, StringRef NewValName) {
1808 Value *&Res = SlicedVals[{BB, Inc}];
1809 if (Res)
1810 return Res;
1811
1812 IRBuilder<> B(BB->getTerminator());
1813 if (Instruction *IncInst = dyn_cast<Instruction>(Inc))
1814 B.SetCurrentDebugLocation(IncInst->getDebugLoc());
1815
1816 if (NumElts > 1) {
1817 SmallVector<int, 4> Mask;
1818 for (unsigned K = Idx; K < (Idx + NumElts); ++K)
1819 Mask.push_back(K);
1820 Res = B.CreateShuffleVector(Inc, Mask, NewValName);
1821 } else
1822 Res = B.CreateExtractElement(Inc, Idx, NewValName);
1823
1824 return Res;
1825 }
1826
1827private:
1828 SmallDenseMap<std::pair<BasicBlock *, Value *>, Value *> SlicedVals;
1829};
1830
1831bool AMDGPUCodeGenPrepareImpl::visitPHINode(PHINode &I) {
1832 // Break-up fixed-vector PHIs into smaller pieces.
1833 // Default threshold is 32, so it breaks up any vector that's >32 bits into
1834 // its elements, or into 32-bit pieces (for 8/16 bit elts).
1835 //
1836 // This is only helpful for DAGISel because it doesn't handle large PHIs as
1837 // well as GlobalISel. DAGISel lowers PHIs by using CopyToReg/CopyFromReg.
1838 // With large, odd-sized PHIs we may end up needing many `build_vector`
1839 // operations with most elements being "undef". This inhibits a lot of
1840 // optimization opportunities and can result in unreasonably high register
1841 // pressure and the inevitable stack spilling.
1842 if (!BreakLargePHIs || getCGPassBuilderOption().EnableGlobalISelOption)
1843 return false;
1844
1845 FixedVectorType *FVT = dyn_cast<FixedVectorType>(I.getType());
1846 if (!FVT || FVT->getNumElements() == 1 ||
1847 DL.getTypeSizeInBits(FVT) <= BreakLargePHIsThreshold)
1848 return false;
1849
1850 if (!ForceBreakLargePHIs && !canBreakPHINode(I))
1851 return false;
1852
1853 std::vector<VectorSlice> Slices;
1854
1855 Type *EltTy = FVT->getElementType();
1856 {
1857 unsigned Idx = 0;
1858 // For 8/16-bit element types, don't scalarize fully but break it up into as many
1859 // 32-bit slices as we can, and scalarize the tail.
1860 const unsigned EltSize = DL.getTypeSizeInBits(EltTy);
1861 const unsigned NumElts = FVT->getNumElements();
1862 if (EltSize == 8 || EltSize == 16) {
1863 const unsigned SubVecSize = (32 / EltSize);
1864 Type *SubVecTy = FixedVectorType::get(EltTy, SubVecSize);
1865 for (unsigned End = alignDown(NumElts, SubVecSize); Idx < End;
1866 Idx += SubVecSize)
1867 Slices.emplace_back(SubVecTy, Idx, SubVecSize);
1868 }
1869
1870 // Scalarize all remaining elements.
1871 for (; Idx < NumElts; ++Idx)
1872 Slices.emplace_back(EltTy, Idx, 1);
1873 }
1874
1875 assert(Slices.size() > 1);
1876
1877 // Create one PHI per vector piece. The "VectorSlice" class takes care of
1878 // creating the necessary instruction to extract the relevant slices of each
1879 // incoming value.
1880 IRBuilder<> B(I.getParent());
1881 B.SetCurrentDebugLocation(I.getDebugLoc());
1882
1883 unsigned IncNameSuffix = 0;
1884 for (VectorSlice &S : Slices) {
1885 // We need to reset the builder on each iteration, because getSlicedVal may
1886 // have inserted something into I's BB.
1887 B.SetInsertPoint(I.getParent()->getFirstNonPHIIt());
1888 S.NewPHI = B.CreatePHI(S.Ty, I.getNumIncomingValues());
1889
1890 for (const auto &[Idx, BB] : enumerate(I.blocks())) {
1891 S.NewPHI->addIncoming(S.getSlicedVal(BB, I.getIncomingValue(Idx),
1892 "largephi.extractslice" +
1893 std::to_string(IncNameSuffix++)),
1894 BB);
1895 }
1896 }
1897
1898 // And replace this PHI with a vector of all the previous PHI values.
1899 Value *Vec = PoisonValue::get(FVT);
1900 unsigned NameSuffix = 0;
1901 for (VectorSlice &S : Slices) {
1902 const auto ValName = "largephi.insertslice" + std::to_string(NameSuffix++);
1903 if (S.NumElts > 1)
1904 Vec = B.CreateInsertVector(FVT, Vec, S.NewPHI, S.Idx, ValName);
1905 else
1906 Vec = B.CreateInsertElement(Vec, S.NewPHI, S.Idx, ValName);
1907 }
1908
1909 I.replaceAllUsesWith(Vec);
1910 DeadVals.push_back(&I);
1911 return true;
1912}
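// Illustrative sketch (hypothetical IR) of the break-up for a <4 x i16> PHI
// with the default 32-bit threshold: the PHI is split into two <2 x i16>
// slices, each slice gets its own PHI fed by a shufflevector/extractelement
// of the original incoming value, and the result is reassembled with
// insertvector/insertelement. Roughly:
//
//   %phi = phi <4 x i16> [ %a, %bb0 ], [ %b, %bb1 ]
//
// becomes
//
//   %slice0 = phi <2 x i16> [ %a.lo, %bb0 ], [ %b.lo, %bb1 ]
//   %slice1 = phi <2 x i16> [ %a.hi, %bb0 ], [ %b.hi, %bb1 ]
//   ; ... followed by rebuilding the <4 x i16> value from the two slices.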
1913
1914/// \param V Value to check
1915/// \param DL DataLayout
1916/// \param TM TargetMachine (TODO: remove once DL contains nullptr values)
1917/// \param AS Target Address Space
1918/// \return true if \p V cannot be the null value of \p AS, false otherwise.
1919static bool isPtrKnownNeverNull(const Value *V, const DataLayout &DL,
1920 const AMDGPUTargetMachine &TM, unsigned AS) {
1921 // Pointer cannot be null if it's a block address, GV or alloca.
1922 // NOTE: We don't support extern_weak, but if we did, we'd need to check for
1923 // it as the symbol could be null in such cases.
1924 if (isa<BlockAddress>(V) || isa<GlobalValue>(V) || isa<AllocaInst>(V))
1925 return true;
1926
1927 // Check nonnull arguments.
1928 if (const auto *Arg = dyn_cast<Argument>(V); Arg && Arg->hasNonNullAttr())
1929 return true;
1930
1931 // Check nonnull loads.
1932 if (const auto *Load = dyn_cast<LoadInst>(V);
1933 Load && Load->hasMetadata(LLVMContext::MD_nonnull))
1934 return true;
1935
1936 // getUnderlyingObject may have looked through another addrspacecast, although
1937 // the optimizable situations have most likely been folded out by now.
1938 if (AS != cast<PointerType>(V->getType())->getAddressSpace())
1939 return false;
1940
1941 // TODO: Calls that return nonnull?
1942
1943 // For all other things, use KnownBits.
1944 // We either use 0 or all bits set to indicate null, so check whether the
1945 // value can be zero or all ones.
1946 //
1947 // TODO: Use ValueTracking's isKnownNeverNull if it becomes aware that some
1948 // address spaces have non-zero null values.
1949 auto SrcPtrKB = computeKnownBits(V, DL);
1950 const auto NullVal = TM.getNullPointerValue(AS);
1951
1952 assert(SrcPtrKB.getBitWidth() == DL.getPointerSizeInBits(AS));
1953 assert((NullVal == 0 || NullVal == -1) &&
1954 "don't know how to check for this null value!");
1955 return NullVal ? !SrcPtrKB.getMaxValue().isAllOnes() : SrcPtrKB.isNonZero();
1956}
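// For example, on AMDGPU the local and private address spaces use -1 (all
// bits set) as the null value, so the KnownBits query above must rule out an
// all-ones pointer rather than a zero pointer.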
1957
1958bool AMDGPUCodeGenPrepareImpl::visitAddrSpaceCastInst(AddrSpaceCastInst &I) {
1959 // The intrinsic doesn't support vectors. It also seems difficult to prove
1960 // that a vector cannot contain any null elements, so it's unclear whether
1961 // supporting vectors would be worthwhile.
1962 if (I.getType()->isVectorTy())
1963 return false;
1964
1965 // Check if this can be lowered to an amdgcn.addrspacecast.nonnull.
1966 // This is only worthwhile for casts from/to priv/local to flat.
1967 const unsigned SrcAS = I.getSrcAddressSpace();
1968 const unsigned DstAS = I.getDestAddressSpace();
1969
1970 bool CanLower = false;
1971 if (SrcAS == AMDGPUAS::FLAT_ADDRESS)
1972 CanLower = (DstAS == AMDGPUAS::LOCAL_ADDRESS ||
1973 DstAS == AMDGPUAS::PRIVATE_ADDRESS);
1974 else if (DstAS == AMDGPUAS::FLAT_ADDRESS)
1975 CanLower = (SrcAS == AMDGPUAS::LOCAL_ADDRESS ||
1976 SrcAS == AMDGPUAS::PRIVATE_ADDRESS);
1977 if (!CanLower)
1978 return false;
1979
1980 SmallVector<const Value *, 4> WorkList;
1981 getUnderlyingObjects(I.getOperand(0), WorkList);
1982 if (!all_of(WorkList, [&](const Value *V) {
1983 return isPtrKnownNeverNull(V, DL, TM, SrcAS);
1984 }))
1985 return false;
1986
1987 IRBuilder<> B(&I);
1988 auto *Intrin = B.CreateIntrinsic(
1989 I.getType(), Intrinsic::amdgcn_addrspacecast_nonnull, {I.getOperand(0)});
1990 I.replaceAllUsesWith(Intrin);
1991 DeadVals.push_back(&I);
1992 return true;
1993}
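// Illustrative sketch (hypothetical IR): a cast from private to flat where
// the source is an alloca, and therefore known non-null, e.g.
//
//   %p = alloca i32, addrspace(5)
//   %f = addrspacecast ptr addrspace(5) %p to ptr
//
// is rewritten to something like
//
//   %f = call ptr @llvm.amdgcn.addrspacecast.nonnull.p0.p5(ptr addrspace(5) %p)
//
// which lets codegen skip the null check in the cast expansion. (The exact
// mangled intrinsic name shown here is assumed.)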
1994
1995bool AMDGPUCodeGenPrepareImpl::visitIntrinsicInst(IntrinsicInst &I) {
1996 switch (I.getIntrinsicID()) {
1997 case Intrinsic::minnum:
1998 case Intrinsic::minimumnum:
1999 case Intrinsic::minimum:
2000 return visitFMinLike(I);
2001 case Intrinsic::sqrt:
2002 return visitSqrt(I);
2003 case Intrinsic::amdgcn_mbcnt_lo:
2004 return visitMbcntLo(I);
2005 case Intrinsic::amdgcn_mbcnt_hi:
2006 return visitMbcntHi(I);
2007 default:
2008 return false;
2009 }
2010}
2011
2012/// Match non-nan fract pattern.
2013/// minnum(fsub(x, floor(x)), nextafter(1.0, -1.0))
2014/// minimumnum(fsub(x, floor(x)), nextafter(1.0, -1.0))
2015/// minimum(fsub(x, floor(x)), nextafter(1.0, -1.0))
2016///
2017/// Only matches when fract is useful for the subtarget. Does not account for
2018/// nan handling; the hardware instruction has a nan check on the input value.
2019Value *AMDGPUCodeGenPrepareImpl::matchFractPat(IntrinsicInst &I) {
2020 if (ST.hasFractBug())
2021 return nullptr;
2022
2023 Intrinsic::ID IID = I.getIntrinsicID();
2024
2025 // The value is only used in contexts where we know the input isn't a nan, so
2026 // any of the fmin variants are fine.
2027 if (IID != Intrinsic::minnum && IID != Intrinsic::minimum &&
2028 IID != Intrinsic::minimumnum)
2029 return nullptr;
2030
2031 Type *Ty = I.getType();
2032 if (!isLegalFloatingTy(Ty->getScalarType()))
2033 return nullptr;
2034
2035 Value *Arg0 = I.getArgOperand(0);
2036 Value *Arg1 = I.getArgOperand(1);
2037
2038 const APFloat *C;
2039 if (!match(Arg1, m_APFloat(C)))
2040 return nullptr;
2041
2042 APFloat One(1.0);
2043 bool LosesInfo;
2044 One.convert(C->getSemantics(), APFloat::rmNearestTiesToEven, &LosesInfo);
2045
2046 // Match nextafter(1.0, -1)
2047 One.next(true);
2048 if (One != *C)
2049 return nullptr;
2050
2051 Value *FloorSrc;
2052 if (match(Arg0, m_FSub(m_Value(FloorSrc),
2053 m_Intrinsic<Intrinsic::floor>(m_Deferred(FloorSrc)))))
2054 return FloorSrc;
2055 return nullptr;
2056}
2057
2058Value *AMDGPUCodeGenPrepareImpl::applyFractPat(IRBuilder<> &Builder,
2059 Value *FractArg) {
2060 SmallVector<Value *, 4> FractVals;
2061 extractValues(Builder, FractVals, FractArg);
2062
2063 SmallVector<Value *, 4> ResultVals(FractVals.size());
2064
2065 Type *Ty = FractArg->getType()->getScalarType();
2066 for (unsigned I = 0, E = FractVals.size(); I != E; ++I) {
2067 ResultVals[I] =
2068 Builder.CreateIntrinsic(Intrinsic::amdgcn_fract, {Ty}, {FractVals[I]});
2069 }
2070
2071 return insertValues(Builder, FractArg->getType(), ResultVals);
2072}
2073
2074bool AMDGPUCodeGenPrepareImpl::visitFMinLike(IntrinsicInst &I) {
2075 Value *FractArg = matchFractPat(I);
2076 if (!FractArg)
2077 return false;
2078
2079 // Match pattern for fract intrinsic in contexts where the nan check has been
2080 // optimized out (and hope the knowledge the source can't be nan wasn't lost).
2081 if (!I.hasNoNaNs() && !isKnownNeverNaN(FractArg, SimplifyQuery(DL, TLI)))
2082 return false;
2083
2084 IRBuilder<> Builder(&I);
2085 FastMathFlags FMF = I.getFastMathFlags();
2086 FMF.setNoNaNs();
2087 Builder.setFastMathFlags(FMF);
2088
2089 Value *Fract = applyFractPat(Builder, FractArg);
2090 Fract->takeName(&I);
2091 I.replaceAllUsesWith(Fract);
2092 DeadVals.push_back(&I);
2093 return true;
2094}
2095
2096// Expand llvm.sqrt.f32 calls with !fpmath metadata in a semi-fast way.
2097bool AMDGPUCodeGenPrepareImpl::visitSqrt(IntrinsicInst &Sqrt) {
2098 Type *Ty = Sqrt.getType()->getScalarType();
2099 if (!Ty->isFloatTy() && (!Ty->isHalfTy() || ST.has16BitInsts()))
2100 return false;
2101
2102 const FPMathOperator *FPOp = cast<const FPMathOperator>(&Sqrt);
2103 FastMathFlags SqrtFMF = FPOp->getFastMathFlags();
2104
2105 // We're trying to handle the fast-but-not-that-fast case only. The lowering
2106 // of fast llvm.sqrt will give the raw instruction anyway.
2107 if (SqrtFMF.approxFunc())
2108 return false;
2109
2110 const float ReqdAccuracy = FPOp->getFPAccuracy();
2111
2112 // Defer correctly rounded expansion to codegen.
2113 if (ReqdAccuracy < 1.0f)
2114 return false;
2115
2116 Value *SrcVal = Sqrt.getOperand(0);
2117 bool CanTreatAsDAZ = canIgnoreDenormalInput(SrcVal, &Sqrt);
2118
2119 // The raw instruction is 1 ulp, but the correction for denormal handling
2120 // brings it to 2.
2121 if (!CanTreatAsDAZ && ReqdAccuracy < 2.0f)
2122 return false;
2123
2124 IRBuilder<> Builder(&Sqrt);
2125 SmallVector<Value *, 4> SrcVals;
2126 extractValues(Builder, SrcVals, SrcVal);
2127
2128 SmallVector<Value *, 4> ResultVals(SrcVals.size());
2129 for (int I = 0, E = SrcVals.size(); I != E; ++I) {
2130 if (CanTreatAsDAZ)
2131 ResultVals[I] = Builder.CreateCall(getSqrtF32(), SrcVals[I]);
2132 else
2133 ResultVals[I] = emitSqrtIEEE2ULP(Builder, SrcVals[I], SqrtFMF);
2134 }
2135
2136 Value *NewSqrt = insertValues(Builder, Sqrt.getType(), ResultVals);
2137 NewSqrt->takeName(&Sqrt);
2138 Sqrt.replaceAllUsesWith(NewSqrt);
2139 DeadVals.push_back(&Sqrt);
2140 return true;
2141}
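// Illustrative example (hypothetical IR) of a sqrt this handles: a call that
// is not afn/fast but allows 2.5 ulp of error via !fpmath,
//
//   %r = call float @llvm.sqrt.f32(float %x), !fpmath !0
//   !0 = !{float 2.500000e+00}
//
// gets expanded here (either to a plain amdgcn sqrt call when denormal inputs
// can be ignored, or to the IEEE-style 2 ulp sequence otherwise) instead of
// being left for the default, more expensive lowering.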
2142
2143bool AMDGPUCodeGenPrepare::runOnFunction(Function &F) {
2144 if (skipFunction(F))
2145 return false;
2146
2147 auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();
2148 if (!TPC)
2149 return false;
2150
2151 const AMDGPUTargetMachine &TM = TPC->getTM<AMDGPUTargetMachine>();
2152 const TargetLibraryInfo *TLI =
2153 &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F);
2154 AssumptionCache *AC =
2155 &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
2156 auto *DTWP = getAnalysisIfAvailable<DominatorTreeWrapperPass>();
2157 const DominatorTree *DT = DTWP ? &DTWP->getDomTree() : nullptr;
2158 const UniformityInfo &UA =
2159 getAnalysis<UniformityInfoWrapperPass>().getUniformityInfo();
2160 return AMDGPUCodeGenPrepareImpl(F, TM, TLI, AC, DT, UA).run();
2161}
2162
2163PreservedAnalyses AMDGPUCodeGenPreparePass::run(Function &F,
2164 FunctionAnalysisManager &FAM) {
2165 const AMDGPUTargetMachine &ATM = static_cast<const AMDGPUTargetMachine &>(TM);
2166 const TargetLibraryInfo *TLI = &FAM.getResult<TargetLibraryAnalysis>(F);
2167 AssumptionCache *AC = &FAM.getResult<AssumptionAnalysis>(F);
2168 const DominatorTree *DT = FAM.getCachedResult<DominatorTreeAnalysis>(F);
2169 const UniformityInfo &UA = FAM.getResult<UniformityInfoAnalysis>(F);
2170 AMDGPUCodeGenPrepareImpl Impl(F, ATM, TLI, AC, DT, UA);
2171 if (!Impl.run())
2172 return PreservedAnalyses::all();
2173 PreservedAnalyses PA = PreservedAnalyses::none();
2174 if (!Impl.FlowChanged)
2175 PA.preserveSet<CFGAnalyses>();
2176 return PA;
2177}
2178
2179INITIALIZE_PASS_BEGIN(AMDGPUCodeGenPrepare, DEBUG_TYPE,
2180 "AMDGPU IR optimizations", false, false)
2184INITIALIZE_PASS_END(AMDGPUCodeGenPrepare, DEBUG_TYPE, "AMDGPU IR optimizations",
2186
2187/// Create a workitem.id.x intrinsic call with range metadata.
2188CallInst *AMDGPUCodeGenPrepareImpl::createWorkitemIdX(IRBuilder<> &B) const {
2189 CallInst *Tid = B.CreateIntrinsic(Intrinsic::amdgcn_workitem_id_x, {});
2190 ST.makeLIDRangeMetadata(Tid);
2191 return Tid;
2192}
2193
2194/// Replace the instruction with a direct workitem.id.x call.
2195void AMDGPUCodeGenPrepareImpl::replaceWithWorkitemIdX(Instruction &I) const {
2196 IRBuilder<> B(&I);
2197 CallInst *Tid = createWorkitemIdX(B);
2198 BasicBlock::iterator BI(&I);
2199 ReplaceInstWithValue(BI, Tid);
2200}
2201
2202/// Replace the instruction with (workitem.id.x & mask).
2203void AMDGPUCodeGenPrepareImpl::replaceWithMaskedWorkitemIdX(
2204 Instruction &I, unsigned WaveSize) const {
2205 IRBuilder<> B(&I);
2206 CallInst *Tid = createWorkitemIdX(B);
2207 Constant *Mask = ConstantInt::get(Tid->getType(), WaveSize - 1);
2208 Value *AndInst = B.CreateAnd(Tid, Mask);
2209 BasicBlock::iterator BI(&I);
2210 ReplaceInstWithValue(BI, AndInst);
2211}
2212
2213/// Try to optimize mbcnt instruction by replacing with workitem.id.x when
2214/// work group size allows direct computation of lane ID.
2215/// Returns true if optimization was applied, false otherwise.
2216bool AMDGPUCodeGenPrepareImpl::tryReplaceWithWorkitemId(Instruction &I,
2217 unsigned Wave) const {
2218 std::optional<unsigned> MaybeX = ST.getReqdWorkGroupSize(F, 0);
2219 if (!MaybeX)
2220 return false;
2221
2222 // When work group size == wave_size, each work group contains exactly one
2223 // wave, so the instruction can be replaced with workitem.id.x directly.
2224 if (*MaybeX == Wave) {
2225 replaceWithWorkitemIdX(I);
2226 return true;
2227 }
2228
2229 // When work group evenly splits into waves, compute lane ID within wave
2230 // using bit masking: lane_id = workitem.id.x & (wave_size - 1).
2231 if (ST.hasWavefrontsEvenlySplittingXDim(F, /*RequiresUniformYZ=*/true)) {
2232 replaceWithMaskedWorkitemIdX(I, Wave);
2233 return true;
2234 }
2235
2236 return false;
2237}
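// Worked example (assumed launch configuration): with a wave size of 64 and
// a required work group size of <256, 1, 1>, the work group splits evenly
// into four waves, so the lane id is workitem.id.x & 63; with <64, 1, 1> it
// is workitem.id.x itself.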
2238
2239/// Optimize mbcnt.lo calls on wave32 architectures for lane ID computation.
2240bool AMDGPUCodeGenPrepareImpl::visitMbcntLo(IntrinsicInst &I) const {
2241 // This optimization only applies to wave32 targets where mbcnt.lo operates on
2242 // the full execution mask.
2243 if (!ST.isWave32())
2244 return false;
2245
2246 // Only optimize the pattern mbcnt.lo(~0, 0) which counts active lanes with
2247 // lower IDs.
2248 if (!match(&I,
2249 m_Intrinsic<Intrinsic::amdgcn_mbcnt_lo>(m_AllOnes(), m_Zero())))
2250 return false;
2251
2252 return tryReplaceWithWorkitemId(I, ST.getWavefrontSize());
2253}
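// Illustrative sketch (hypothetical IR; wave32 with a required work group
// size of 32):
//
//   %lane = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
//
// becomes approximately
//
//   %lane = call i32 @llvm.amdgcn.workitem.id.x()
//
// with range metadata attached so later passes know the bound.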
2254
2255/// Optimize mbcnt.hi calls for lane ID computation.
2256bool AMDGPUCodeGenPrepareImpl::visitMbcntHi(IntrinsicInst &I) const {
2257 // Abort if wave size is not known at compile time.
2258 if (!ST.isWaveSizeKnown())
2259 return false;
2260
2261 unsigned Wave = ST.getWavefrontSize();
2262
2263 // On wave32, the upper 32 bits of execution mask are always 0, so
2264 // mbcnt.hi(mask, val) always returns val unchanged.
2265 if (ST.isWave32()) {
2266 if (auto MaybeX = ST.getReqdWorkGroupSize(F, 0)) {
2267 // Replace mbcnt.hi(mask, val) with val only when work group size matches
2268 // wave size (single wave per work group).
2269 if (*MaybeX == Wave) {
2270 BasicBlock::iterator BI(&I);
2271 ReplaceInstWithValue(BI, I.getArgOperand(1));
2272 return true;
2273 }
2274 }
2275 }
2276
2277 // Optimize the complete lane ID computation pattern:
2278 // mbcnt.hi(~0, mbcnt.lo(~0, 0)) which counts all active lanes with lower IDs
2279 // across the full execution mask.
2280 using namespace PatternMatch;
2281
2282 // Check for pattern: mbcnt.hi(~0, mbcnt.lo(~0, 0))
2283 if (!match(&I, m_Intrinsic<Intrinsic::amdgcn_mbcnt_hi>(
2284 m_AllOnes(), m_Intrinsic<Intrinsic::amdgcn_mbcnt_lo>(
2285 m_AllOnes(), m_Zero()))))
2286 return false;
2287
2288 return tryReplaceWithWorkitemId(I, Wave);
2289}
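// Illustrative sketch (hypothetical IR) of the full lane-id idiom on wave64:
//
//   %lo   = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
//   %lane = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %lo)
//
// With a known work group size that is a single wave (or that splits evenly
// into waves), this collapses to workitem.id.x (optionally masked with
// wave_size - 1), as handled by tryReplaceWithWorkitemId above.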
2290
2291char AMDGPUCodeGenPrepare::ID = 0;
2292
2293FunctionPass *llvm::createAMDGPUCodeGenPreparePass() {
2294 return new AMDGPUCodeGenPrepare();
2295}
Return true if it's known this can never be +infinity.