AMDGPUCodeGenPrepare.cpp (LLVM 20.0.0git)
1//===-- AMDGPUCodeGenPrepare.cpp ------------------------------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file
10/// This pass does misc. AMDGPU optimizations on IR before instruction
11/// selection.
12//
13//===----------------------------------------------------------------------===//
14
15#include "AMDGPU.h"
16#include "AMDGPUTargetMachine.h"
24#include "llvm/IR/Dominators.h"
25#include "llvm/IR/IRBuilder.h"
26#include "llvm/IR/InstVisitor.h"
27#include "llvm/IR/IntrinsicsAMDGPU.h"
30#include "llvm/Pass.h"
34
35#define DEBUG_TYPE "amdgpu-codegenprepare"
36
37using namespace llvm;
38using namespace llvm::PatternMatch;
39
40namespace {
41
43 "amdgpu-codegenprepare-widen-constant-loads",
44 cl::desc("Widen sub-dword constant address space loads in AMDGPUCodeGenPrepare"),
46 cl::init(false));
47
48static cl::opt<bool> Widen16BitOps(
49 "amdgpu-codegenprepare-widen-16-bit-ops",
50 cl::desc("Widen uniform 16-bit instructions to 32-bit in AMDGPUCodeGenPrepare"),
51 cl::ReallyHidden,
52 cl::init(true));
53
54static cl::opt<bool>
55 BreakLargePHIs("amdgpu-codegenprepare-break-large-phis",
56 cl::desc("Break large PHI nodes for DAGISel"),
57 cl::ReallyHidden, cl::init(true));
58
59static cl::opt<bool>
60 ForceBreakLargePHIs("amdgpu-codegenprepare-force-break-large-phis",
61 cl::desc("For testing purposes, always break large "
62 "PHIs even if it isn't profitable."),
63 cl::ReallyHidden, cl::init(false));
64
65static cl::opt<unsigned> BreakLargePHIsThreshold(
66 "amdgpu-codegenprepare-break-large-phis-threshold",
67 cl::desc("Minimum type size in bits for breaking large PHI nodes"),
68 cl::ReallyHidden, cl::init(32));
69
70static cl::opt<bool> UseMul24Intrin(
71 "amdgpu-codegenprepare-mul24",
72 cl::desc("Introduce mul24 intrinsics in AMDGPUCodeGenPrepare"),
73 cl::ReallyHidden,
74 cl::init(true));
75
76// Legalize 64-bit division by using the generic IR expansion.
77static cl::opt<bool> ExpandDiv64InIR(
78 "amdgpu-codegenprepare-expand-div64",
79 cl::desc("Expand 64-bit division in AMDGPUCodeGenPrepare"),
80 cl::ReallyHidden,
81 cl::init(false));
82
83// Leave all division operations as they are. This supersedes ExpandDiv64InIR
84// and is used for testing the legalizer.
85static cl::opt<bool> DisableIDivExpand(
86 "amdgpu-codegenprepare-disable-idiv-expansion",
87 cl::desc("Prevent expanding integer division in AMDGPUCodeGenPrepare"),
88 cl::ReallyHidden,
89 cl::init(false));
90
91// Disable processing of fdiv so we can better test the backend implementations.
92static cl::opt<bool> DisableFDivExpand(
93 "amdgpu-codegenprepare-disable-fdiv-expansion",
94 cl::desc("Prevent expanding floating point division in AMDGPUCodeGenPrepare"),
95 cl::ReallyHidden,
96 cl::init(false));
97
98static bool hasUnsafeFPMath(const Function &F) {
99 return F.getFnAttribute("unsafe-fp-math").getValueAsBool();
100}
101
102class AMDGPUCodeGenPrepareImpl
103 : public InstVisitor<AMDGPUCodeGenPrepareImpl, bool> {
104public:
105 Function &F;
106 const GCNSubtarget &ST;
107 const AMDGPUTargetMachine &TM;
108 const TargetLibraryInfo *TLI;
109 AssumptionCache *AC;
110 const DominatorTree *DT;
111 const UniformityInfo &UA;
112 const DataLayout &DL;
113 const bool HasUnsafeFPMath;
114 const bool HasFP32DenormalFlush;
115 bool FlowChanged = false;
116 mutable Function *SqrtF32 = nullptr;
117 mutable Function *LdexpF32 = nullptr;
118
119 DenseMap<const PHINode *, bool> BreakPhiNodesCache;
120
121 AMDGPUCodeGenPrepareImpl(Function &F, const AMDGPUTargetMachine &TM,
122 const TargetLibraryInfo *TLI, AssumptionCache *AC,
123 const DominatorTree *DT, const UniformityInfo &UA)
124 : F(F), ST(TM.getSubtarget<GCNSubtarget>(F)), TM(TM), TLI(TLI), AC(AC),
125 DT(DT), UA(UA), DL(F.getDataLayout()),
126 HasUnsafeFPMath(hasUnsafeFPMath(F)),
127 HasFP32DenormalFlush(SIModeRegisterDefaults(F, ST).FP32Denormals ==
128 DenormalMode::getPreserveSign()) {}
129
130 Function *getSqrtF32() const {
131 if (SqrtF32)
132 return SqrtF32;
133
134 LLVMContext &Ctx = F.getContext();
135 SqrtF32 = Intrinsic::getOrInsertDeclaration(
136 F.getParent(), Intrinsic::amdgcn_sqrt, {Type::getFloatTy(Ctx)});
137 return SqrtF32;
138 }
139
140 Function *getLdexpF32() const {
141 if (LdexpF32)
142 return LdexpF32;
143
144 LLVMContext &Ctx = F.getContext();
145 LdexpF32 = Intrinsic::getOrInsertDeclaration(
146 F.getParent(), Intrinsic::ldexp,
147 {Type::getFloatTy(Ctx), Type::getInt32Ty(Ctx)});
148 return LdexpF32;
149 }
150
151 bool canBreakPHINode(const PHINode &I);
152
153 /// \returns \p T's base element bit width.
158 unsigned getBaseElementBitWidth(const Type *T) const;
159
160 /// \returns Equivalent 32 bit integer type for given type \p T. For example,
161 /// if \p T is i7, then i32 is returned; if \p T is <3 x i12>, then <3 x i32>
162 /// is returned.
163 Type *getI32Ty(IRBuilder<> &B, const Type *T) const;
164
165 /// \returns True if binary operation \p I is a signed binary operation, false
166 /// otherwise.
167 bool isSigned(const BinaryOperator &I) const;
168
169 /// \returns True if the condition of 'select' operation \p I comes from a
170 /// signed 'icmp' operation, false otherwise.
171 bool isSigned(const SelectInst &I) const;
172
173 /// \returns True if type \p T needs to be promoted to 32 bit integer type,
174 /// false otherwise.
175 bool needsPromotionToI32(const Type *T) const;
176
177 /// Return true if \p T is a legal scalar floating point type.
178 bool isLegalFloatingTy(const Type *T) const;
179
180 /// Wrapper to pass all the arguments to computeKnownFPClass
181 KnownFPClass computeKnownFPClass(const Value *V, FPClassTest Interested,
182 const Instruction *CtxI) const {
183 return llvm::computeKnownFPClass(V, DL, Interested, 0, TLI, AC, CtxI, DT);
184 }
185
186 bool canIgnoreDenormalInput(const Value *V, const Instruction *CtxI) const {
187 return HasFP32DenormalFlush ||
188 computeKnownFPClass(V, fcSubnormal, CtxI).isKnownNeverSubnormal();
189 }
190
191 /// Promotes uniform binary operation \p I to equivalent 32 bit binary
192 /// operation.
193 ///
194 /// \details \p I's base element bit width must be greater than 1 and less
195 /// than or equal to 16. Promotion is done by sign or zero extending operands to
196 /// 32 bits, replacing \p I with equivalent 32 bit binary operation, and
197 /// truncating the result of 32 bit binary operation back to \p I's original
198 /// type. Division operation is not promoted.
199 ///
200 /// \returns True if \p I is promoted to equivalent 32 bit binary operation,
201 /// false otherwise.
202 bool promoteUniformOpToI32(BinaryOperator &I) const;
203
204 /// Promotes uniform 'icmp' operation \p I to 32 bit 'icmp' operation.
205 ///
206 /// \details \p I's base element bit width must be greater than 1 and less
207 /// than or equal to 16. Promotion is done by sign or zero extending operands to
208 /// 32 bits, and replacing \p I with 32 bit 'icmp' operation.
209 ///
210 /// \returns True.
211 bool promoteUniformOpToI32(ICmpInst &I) const;
212
213 /// Promotes uniform 'select' operation \p I to 32 bit 'select'
214 /// operation.
215 ///
216 /// \details \p I's base element bit width must be greater than 1 and less
217 /// than or equal to 16. Promotion is done by sign or zero extending operands to
218 /// 32 bits, replacing \p I with 32 bit 'select' operation, and truncating the
219 /// result of 32 bit 'select' operation back to \p I's original type.
220 ///
221 /// \returns True.
222 bool promoteUniformOpToI32(SelectInst &I) const;
223
224 /// Promotes uniform 'bitreverse' intrinsic \p I to 32 bit 'bitreverse'
225 /// intrinsic.
226 ///
227 /// \details \p I's base element bit width must be greater than 1 and less
228 /// than or equal to 16. Promotion is done by zero extending the operand to 32
229 /// bits, replacing \p I with 32 bit 'bitreverse' intrinsic, shifting the
230 /// result of 32 bit 'bitreverse' intrinsic to the right with zero fill (the
231 /// shift amount is 32 minus \p I's base element bit width), and truncating
232 /// the result of the shift operation back to \p I's original type.
233 ///
234 /// \returns True.
235 bool promoteUniformBitreverseToI32(IntrinsicInst &I) const;
236
237 /// \returns The minimum number of bits needed to store the value of \p Op as an
238 /// unsigned integer. Truncating to this size and then zero-extending to
239 /// the original will not change the value.
240 unsigned numBitsUnsigned(Value *Op) const;
241
242 /// \returns The minimum number of bits needed to store the value of \p Op as a
243 /// signed integer. Truncating to this size and then sign-extending to
244 /// the original size will not change the value.
245 unsigned numBitsSigned(Value *Op) const;
246
247 /// Replace mul instructions with llvm.amdgcn.mul.u24 or llvm.amdgcn.mul.i24.
248 /// SelectionDAG has an issue where an 'and' asserting the bits are known is not always exploited, so do the narrowing here on IR.
249 bool replaceMulWithMul24(BinaryOperator &I) const;
250
251 /// Perform the same function as the equivalently named function in DAGCombiner. Since
252 /// some divisions are expanded here, this needs to run before the expansion obscures the select.
253 bool foldBinOpIntoSelect(BinaryOperator &I) const;
254
255 bool divHasSpecialOptimization(BinaryOperator &I,
256 Value *Num, Value *Den) const;
257 int getDivNumBits(BinaryOperator &I,
258 Value *Num, Value *Den,
259 unsigned AtLeast, bool Signed) const;
260
261 /// Expands 24 bit div or rem.
262 Value* expandDivRem24(IRBuilder<> &Builder, BinaryOperator &I,
263 Value *Num, Value *Den,
264 bool IsDiv, bool IsSigned) const;
265
266 Value *expandDivRem24Impl(IRBuilder<> &Builder, BinaryOperator &I,
267 Value *Num, Value *Den, unsigned NumBits,
268 bool IsDiv, bool IsSigned) const;
269
270 /// Expands 32 bit div or rem.
271 Value* expandDivRem32(IRBuilder<> &Builder, BinaryOperator &I,
272 Value *Num, Value *Den) const;
273
274 Value *shrinkDivRem64(IRBuilder<> &Builder, BinaryOperator &I,
275 Value *Num, Value *Den) const;
276 void expandDivRem64(BinaryOperator &I) const;
277
278 /// Widen a scalar load.
279 ///
280 /// \details Widen a uniform, small-type load from constant memory to a full
281 /// 32 bits and then truncate the result, so that a scalar load can be used
282 /// instead of a vector load.
283 ///
284 /// \returns True.
285
286 bool canWidenScalarExtLoad(LoadInst &I) const;
287
288 Value *matchFractPat(IntrinsicInst &I);
289 Value *applyFractPat(IRBuilder<> &Builder, Value *FractArg);
290
291 bool canOptimizeWithRsq(const FPMathOperator *SqrtOp, FastMathFlags DivFMF,
292 FastMathFlags SqrtFMF) const;
293
294 Value *optimizeWithRsq(IRBuilder<> &Builder, Value *Num, Value *Den,
295 FastMathFlags DivFMF, FastMathFlags SqrtFMF,
296 const Instruction *CtxI) const;
297
298 Value *optimizeWithRcp(IRBuilder<> &Builder, Value *Num, Value *Den,
299 FastMathFlags FMF, const Instruction *CtxI) const;
300 Value *optimizeWithFDivFast(IRBuilder<> &Builder, Value *Num, Value *Den,
301 float ReqdAccuracy) const;
302
303 Value *visitFDivElement(IRBuilder<> &Builder, Value *Num, Value *Den,
304 FastMathFlags DivFMF, FastMathFlags SqrtFMF,
305 Value *RsqOp, const Instruction *FDiv,
306 float ReqdAccuracy) const;
307
308 std::pair<Value *, Value *> getFrexpResults(IRBuilder<> &Builder,
309 Value *Src) const;
310
311 Value *emitRcpIEEE1ULP(IRBuilder<> &Builder, Value *Src,
312 bool IsNegative) const;
313 Value *emitFrexpDiv(IRBuilder<> &Builder, Value *LHS, Value *RHS,
314 FastMathFlags FMF) const;
315 Value *emitSqrtIEEE2ULP(IRBuilder<> &Builder, Value *Src,
316 FastMathFlags FMF) const;
317
318public:
319 bool visitFDiv(BinaryOperator &I);
320
321 bool visitInstruction(Instruction &I) { return false; }
322 bool visitBinaryOperator(BinaryOperator &I);
323 bool visitLoadInst(LoadInst &I);
324 bool visitICmpInst(ICmpInst &I);
325 bool visitSelectInst(SelectInst &I);
326 bool visitPHINode(PHINode &I);
328
329 bool visitIntrinsicInst(IntrinsicInst &I);
330 bool visitBitreverseIntrinsicInst(IntrinsicInst &I);
331 bool visitMinNum(IntrinsicInst &I);
332 bool visitSqrt(IntrinsicInst &I);
333 bool run();
334};
335
336class AMDGPUCodeGenPrepare : public FunctionPass {
337public:
338 static char ID;
339 AMDGPUCodeGenPrepare() : FunctionPass(ID) {
340 initializeAMDGPUCodeGenPreparePass(*PassRegistry::getPassRegistry());
341 }
342 void getAnalysisUsage(AnalysisUsage &AU) const override {
343 AU.addRequired<AssumptionCacheTracker>();
344 AU.addRequired<UniformityInfoWrapperPass>();
345 AU.addRequired<TargetLibraryInfoWrapperPass>();
346
347 // FIXME: Division expansion needs to preserve the dominator tree.
348 if (!ExpandDiv64InIR)
349 AU.setPreservesAll();
350 }
351 bool runOnFunction(Function &F) override;
352 StringRef getPassName() const override { return "AMDGPU IR optimizations"; }
353};
354
355} // end anonymous namespace
356
357bool AMDGPUCodeGenPrepareImpl::run() {
358 BreakPhiNodesCache.clear();
359 bool MadeChange = false;
360
361 Function::iterator NextBB;
362 for (Function::iterator FI = F.begin(), FE = F.end(); FI != FE; FI = NextBB) {
363 BasicBlock *BB = &*FI;
364 NextBB = std::next(FI);
365
366 BasicBlock::iterator Next;
367 for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;
368 I = Next) {
369 Next = std::next(I);
370
371 MadeChange |= visit(*I);
372
373 if (Next != E) { // Control flow changed
374 BasicBlock *NextInstBB = Next->getParent();
375 if (NextInstBB != BB) {
376 BB = NextInstBB;
377 E = BB->end();
378 FE = F.end();
379 }
380 }
381 }
382 }
383 return MadeChange;
384}
385
386unsigned AMDGPUCodeGenPrepareImpl::getBaseElementBitWidth(const Type *T) const {
387 assert(needsPromotionToI32(T) && "T does not need promotion to i32");
388
389 if (T->isIntegerTy())
390 return T->getIntegerBitWidth();
391 return cast<VectorType>(T)->getElementType()->getIntegerBitWidth();
392}
393
394Type *AMDGPUCodeGenPrepareImpl::getI32Ty(IRBuilder<> &B, const Type *T) const {
395 assert(needsPromotionToI32(T) && "T does not need promotion to i32");
396
397 if (T->isIntegerTy())
398 return B.getInt32Ty();
399 return FixedVectorType::get(B.getInt32Ty(), cast<FixedVectorType>(T));
400}
401
402bool AMDGPUCodeGenPrepareImpl::isSigned(const BinaryOperator &I) const {
403 return I.getOpcode() == Instruction::AShr ||
404 I.getOpcode() == Instruction::SDiv || I.getOpcode() == Instruction::SRem;
405}
406
407bool AMDGPUCodeGenPrepareImpl::isSigned(const SelectInst &I) const {
408 return isa<ICmpInst>(I.getOperand(0)) ?
409 cast<ICmpInst>(I.getOperand(0))->isSigned() : false;
410}
411
412bool AMDGPUCodeGenPrepareImpl::needsPromotionToI32(const Type *T) const {
413 if (!Widen16BitOps)
414 return false;
415
416 const IntegerType *IntTy = dyn_cast<IntegerType>(T);
417 if (IntTy && IntTy->getBitWidth() > 1 && IntTy->getBitWidth() <= 16)
418 return true;
419
420 if (const VectorType *VT = dyn_cast<VectorType>(T)) {
421 // TODO: The set of packed operations is more limited, so may want to
422 // promote some anyway.
423 if (ST.hasVOP3PInsts())
424 return false;
425
426 return needsPromotionToI32(VT->getElementType());
427 }
428
429 return false;
430}
431
432bool AMDGPUCodeGenPrepareImpl::isLegalFloatingTy(const Type *Ty) const {
433 return Ty->isFloatTy() || Ty->isDoubleTy() ||
434 (Ty->isHalfTy() && ST.has16BitInsts());
435}
436
437// Return true if the op promoted to i32 should have nsw set.
438static bool promotedOpIsNSW(const Instruction &I) {
439 switch (I.getOpcode()) {
440 case Instruction::Shl:
441 case Instruction::Add:
442 case Instruction::Sub:
443 return true;
444 case Instruction::Mul:
445 return I.hasNoUnsignedWrap();
446 default:
447 return false;
448 }
449}
450
451// Return true if the op promoted to i32 should have nuw set.
452static bool promotedOpIsNUW(const Instruction &I) {
453 switch (I.getOpcode()) {
454 case Instruction::Shl:
455 case Instruction::Add:
456 case Instruction::Mul:
457 return true;
458 case Instruction::Sub:
459 return I.hasNoUnsignedWrap();
460 default:
461 return false;
462 }
463}
464
465bool AMDGPUCodeGenPrepareImpl::canWidenScalarExtLoad(LoadInst &I) const {
466 Type *Ty = I.getType();
467 int TySize = DL.getTypeSizeInBits(Ty);
468 Align Alignment = DL.getValueOrABITypeAlignment(I.getAlign(), Ty);
469
470 return I.isSimple() && TySize < 32 && Alignment >= 4 && UA.isUniform(&I);
471}
472
473bool AMDGPUCodeGenPrepareImpl::promoteUniformOpToI32(BinaryOperator &I) const {
474 assert(needsPromotionToI32(I.getType()) &&
475 "I does not need promotion to i32");
476
477 if (I.getOpcode() == Instruction::SDiv ||
478 I.getOpcode() == Instruction::UDiv ||
479 I.getOpcode() == Instruction::SRem ||
480 I.getOpcode() == Instruction::URem)
481 return false;
482
483 IRBuilder<> Builder(&I);
484 Builder.SetCurrentDebugLocation(I.getDebugLoc());
485
486 Type *I32Ty = getI32Ty(Builder, I.getType());
487 Value *ExtOp0 = nullptr;
488 Value *ExtOp1 = nullptr;
489 Value *ExtRes = nullptr;
490 Value *TruncRes = nullptr;
491
492 if (isSigned(I)) {
493 ExtOp0 = Builder.CreateSExt(I.getOperand(0), I32Ty);
494 ExtOp1 = Builder.CreateSExt(I.getOperand(1), I32Ty);
495 } else {
496 ExtOp0 = Builder.CreateZExt(I.getOperand(0), I32Ty);
497 ExtOp1 = Builder.CreateZExt(I.getOperand(1), I32Ty);
498 }
499
500 ExtRes = Builder.CreateBinOp(I.getOpcode(), ExtOp0, ExtOp1);
501 if (Instruction *Inst = dyn_cast<Instruction>(ExtRes)) {
502 if (promotedOpIsNSW(cast<Instruction>(I)))
503 Inst->setHasNoSignedWrap();
504
505 if (promotedOpIsNUW(cast<Instruction>(I)))
506 Inst->setHasNoUnsignedWrap();
507
508 if (const auto *ExactOp = dyn_cast<PossiblyExactOperator>(&I))
509 Inst->setIsExact(ExactOp->isExact());
510 }
511
512 TruncRes = Builder.CreateTrunc(ExtRes, I.getType());
513
514 I.replaceAllUsesWith(TruncRes);
515 I.eraseFromParent();
516
517 return true;
518}
519
520bool AMDGPUCodeGenPrepareImpl::promoteUniformOpToI32(ICmpInst &I) const {
521 assert(needsPromotionToI32(I.getOperand(0)->getType()) &&
522 "I does not need promotion to i32");
523
524 IRBuilder<> Builder(&I);
525 Builder.SetCurrentDebugLocation(I.getDebugLoc());
526
527 Type *I32Ty = getI32Ty(Builder, I.getOperand(0)->getType());
528 Value *ExtOp0 = nullptr;
529 Value *ExtOp1 = nullptr;
530 Value *NewICmp = nullptr;
531
532 if (I.isSigned()) {
533 ExtOp0 = Builder.CreateSExt(I.getOperand(0), I32Ty);
534 ExtOp1 = Builder.CreateSExt(I.getOperand(1), I32Ty);
535 } else {
536 ExtOp0 = Builder.CreateZExt(I.getOperand(0), I32Ty);
537 ExtOp1 = Builder.CreateZExt(I.getOperand(1), I32Ty);
538 }
539 NewICmp = Builder.CreateICmp(I.getPredicate(), ExtOp0, ExtOp1);
540
541 I.replaceAllUsesWith(NewICmp);
542 I.eraseFromParent();
543
544 return true;
545}
546
547bool AMDGPUCodeGenPrepareImpl::promoteUniformOpToI32(SelectInst &I) const {
548 assert(needsPromotionToI32(I.getType()) &&
549 "I does not need promotion to i32");
550
551 IRBuilder<> Builder(&I);
552 Builder.SetCurrentDebugLocation(I.getDebugLoc());
553
554 Type *I32Ty = getI32Ty(Builder, I.getType());
555 Value *ExtOp1 = nullptr;
556 Value *ExtOp2 = nullptr;
557 Value *ExtRes = nullptr;
558 Value *TruncRes = nullptr;
559
560 if (isSigned(I)) {
561 ExtOp1 = Builder.CreateSExt(I.getOperand(1), I32Ty);
562 ExtOp2 = Builder.CreateSExt(I.getOperand(2), I32Ty);
563 } else {
564 ExtOp1 = Builder.CreateZExt(I.getOperand(1), I32Ty);
565 ExtOp2 = Builder.CreateZExt(I.getOperand(2), I32Ty);
566 }
567 ExtRes = Builder.CreateSelect(I.getOperand(0), ExtOp1, ExtOp2);
568 TruncRes = Builder.CreateTrunc(ExtRes, I.getType());
569
570 I.replaceAllUsesWith(TruncRes);
571 I.eraseFromParent();
572
573 return true;
574}
575
576bool AMDGPUCodeGenPrepareImpl::promoteUniformBitreverseToI32(
577 IntrinsicInst &I) const {
578 assert(I.getIntrinsicID() == Intrinsic::bitreverse &&
579 "I must be bitreverse intrinsic");
580 assert(needsPromotionToI32(I.getType()) &&
581 "I does not need promotion to i32");
582
583 IRBuilder<> Builder(&I);
584 Builder.SetCurrentDebugLocation(I.getDebugLoc());
585
586 Type *I32Ty = getI32Ty(Builder, I.getType());
587 Value *ExtOp = Builder.CreateZExt(I.getOperand(0), I32Ty);
588 Value *ExtRes =
589 Builder.CreateIntrinsic(Intrinsic::bitreverse, {I32Ty}, {ExtOp});
590 Value *LShrOp =
591 Builder.CreateLShr(ExtRes, 32 - getBaseElementBitWidth(I.getType()));
592 Value *TruncRes =
593 Builder.CreateTrunc(LShrOp, I.getType());
594
595 I.replaceAllUsesWith(TruncRes);
596 I.eraseFromParent();
597
598 return true;
599}
600
601unsigned AMDGPUCodeGenPrepareImpl::numBitsUnsigned(Value *Op) const {
602 return computeKnownBits(Op, DL, 0, AC).countMaxActiveBits();
603}
604
605unsigned AMDGPUCodeGenPrepareImpl::numBitsSigned(Value *Op) const {
606 return ComputeMaxSignificantBits(Op, DL, 0, AC);
607}
608
609static void extractValues(IRBuilder<> &Builder,
610 SmallVectorImpl<Value *> &Values, Value *V) {
611 auto *VT = dyn_cast<FixedVectorType>(V->getType());
612 if (!VT) {
613 Values.push_back(V);
614 return;
615 }
616
617 for (int I = 0, E = VT->getNumElements(); I != E; ++I)
618 Values.push_back(Builder.CreateExtractElement(V, I));
619}
620
621 static Value *insertValues(IRBuilder<> &Builder,
622 Type *Ty,
623 SmallVectorImpl<Value *> &Values) {
624 if (!Ty->isVectorTy()) {
625 assert(Values.size() == 1);
626 return Values[0];
627 }
628
629 Value *NewVal = PoisonValue::get(Ty);
630 for (int I = 0, E = Values.size(); I != E; ++I)
631 NewVal = Builder.CreateInsertElement(NewVal, Values[I], I);
632
633 return NewVal;
634}
635
636bool AMDGPUCodeGenPrepareImpl::replaceMulWithMul24(BinaryOperator &I) const {
637 if (I.getOpcode() != Instruction::Mul)
638 return false;
639
640 Type *Ty = I.getType();
641 unsigned Size = Ty->getScalarSizeInBits();
642 if (Size <= 16 && ST.has16BitInsts())
643 return false;
644
645 // Prefer scalar if this could be s_mul_i32
646 if (UA.isUniform(&I))
647 return false;
648
649 Value *LHS = I.getOperand(0);
650 Value *RHS = I.getOperand(1);
651 IRBuilder<> Builder(&I);
652 Builder.SetCurrentDebugLocation(I.getDebugLoc());
653
654 unsigned LHSBits = 0, RHSBits = 0;
655 bool IsSigned = false;
656
657 if (ST.hasMulU24() && (LHSBits = numBitsUnsigned(LHS)) <= 24 &&
658 (RHSBits = numBitsUnsigned(RHS)) <= 24) {
659 IsSigned = false;
660
661 } else if (ST.hasMulI24() && (LHSBits = numBitsSigned(LHS)) <= 24 &&
662 (RHSBits = numBitsSigned(RHS)) <= 24) {
663 IsSigned = true;
664
665 } else
666 return false;
667
668 SmallVector<Value *, 4> LHSVals;
669 SmallVector<Value *, 4> RHSVals;
670 SmallVector<Value *, 4> ResultVals;
671 extractValues(Builder, LHSVals, LHS);
672 extractValues(Builder, RHSVals, RHS);
673
674 IntegerType *I32Ty = Builder.getInt32Ty();
675 IntegerType *IntrinTy = Size > 32 ? Builder.getInt64Ty() : I32Ty;
676 Type *DstTy = LHSVals[0]->getType();
677
678 for (int I = 0, E = LHSVals.size(); I != E; ++I) {
679 Value *LHS = IsSigned ? Builder.CreateSExtOrTrunc(LHSVals[I], I32Ty)
680 : Builder.CreateZExtOrTrunc(LHSVals[I], I32Ty);
681 Value *RHS = IsSigned ? Builder.CreateSExtOrTrunc(RHSVals[I], I32Ty)
682 : Builder.CreateZExtOrTrunc(RHSVals[I], I32Ty);
683 Intrinsic::ID ID =
684 IsSigned ? Intrinsic::amdgcn_mul_i24 : Intrinsic::amdgcn_mul_u24;
685 Value *Result = Builder.CreateIntrinsic(ID, {IntrinTy}, {LHS, RHS});
686 Result = IsSigned ? Builder.CreateSExtOrTrunc(Result, DstTy)
687 : Builder.CreateZExtOrTrunc(Result, DstTy);
688 ResultVals.push_back(Result);
689 }
690
691 Value *NewVal = insertValues(Builder, Ty, ResultVals);
692 NewVal->takeName(&I);
693 I.replaceAllUsesWith(NewVal);
694 I.eraseFromParent();
695
696 return true;
697}
698
699 // Find a select instruction, which may have been cast. This is mostly to deal
700 // with cases where i16 selects were promoted here to i32.
701 static SelectInst *findSelectThroughCast(Value *V, CastInst *&Cast) {
702 Cast = nullptr;
703 if (SelectInst *Sel = dyn_cast<SelectInst>(V))
704 return Sel;
705
706 if ((Cast = dyn_cast<CastInst>(V))) {
707 if (SelectInst *Sel = dyn_cast<SelectInst>(Cast->getOperand(0)))
708 return Sel;
709 }
710
711 return nullptr;
712}
713
714bool AMDGPUCodeGenPrepareImpl::foldBinOpIntoSelect(BinaryOperator &BO) const {
715 // Don't do this unless the old select is going away. We want to eliminate the
716 // binary operator, not replace a binop with a select.
717 int SelOpNo = 0;
718
719 CastInst *CastOp;
720
721 // TODO: Should probably try to handle some cases with multiple
722 // users. Duplicating the select may be profitable for division.
723 SelectInst *Sel = findSelectThroughCast(BO.getOperand(0), CastOp);
724 if (!Sel || !Sel->hasOneUse()) {
725 SelOpNo = 1;
726 Sel = findSelectThroughCast(BO.getOperand(1), CastOp);
727 }
728
729 if (!Sel || !Sel->hasOneUse())
730 return false;
731
732 Constant *CT = dyn_cast<Constant>(Sel->getTrueValue());
733 Constant *CF = dyn_cast<Constant>(Sel->getFalseValue());
734 Constant *CBO = dyn_cast<Constant>(BO.getOperand(SelOpNo ^ 1));
735 if (!CBO || !CT || !CF)
736 return false;
737
738 if (CastOp) {
739 if (!CastOp->hasOneUse())
740 return false;
741 CT = ConstantFoldCastOperand(CastOp->getOpcode(), CT, BO.getType(), DL);
742 CF = ConstantFoldCastOperand(CastOp->getOpcode(), CF, BO.getType(), DL);
743 }
744
745 // TODO: Handle special 0/-1 cases DAG combine does, although we only really
746 // need to handle divisions here.
747 Constant *FoldedT =
748 SelOpNo ? ConstantFoldBinaryOpOperands(BO.getOpcode(), CBO, CT, DL)
749 : ConstantFoldBinaryOpOperands(BO.getOpcode(), CT, CBO, DL);
750 if (!FoldedT || isa<ConstantExpr>(FoldedT))
751 return false;
752
753 Constant *FoldedF =
754 SelOpNo ? ConstantFoldBinaryOpOperands(BO.getOpcode(), CBO, CF, DL)
755 : ConstantFoldBinaryOpOperands(BO.getOpcode(), CF, CBO, DL);
756 if (!FoldedF || isa<ConstantExpr>(FoldedF))
757 return false;
758
759 IRBuilder<> Builder(&BO);
760 Builder.SetCurrentDebugLocation(BO.getDebugLoc());
761 if (const FPMathOperator *FPOp = dyn_cast<const FPMathOperator>(&BO))
762 Builder.setFastMathFlags(FPOp->getFastMathFlags());
763
764 Value *NewSelect = Builder.CreateSelect(Sel->getCondition(),
765 FoldedT, FoldedF);
766 NewSelect->takeName(&BO);
767 BO.replaceAllUsesWith(NewSelect);
768 BO.eraseFromParent();
769 if (CastOp)
770 CastOp->eraseFromParent();
771 Sel->eraseFromParent();
772 return true;
773}
774
775std::pair<Value *, Value *>
776AMDGPUCodeGenPrepareImpl::getFrexpResults(IRBuilder<> &Builder,
777 Value *Src) const {
778 Type *Ty = Src->getType();
779 Value *Frexp = Builder.CreateIntrinsic(Intrinsic::frexp,
780 {Ty, Builder.getInt32Ty()}, Src);
781 Value *FrexpMant = Builder.CreateExtractValue(Frexp, {0});
782
783 // Bypass the bug workaround for the exponent result since it doesn't matter.
784 // TODO: Does the bug workaround even really need to consider the exponent
785 // result? It's unspecified by the spec.
786
787 Value *FrexpExp =
788 ST.hasFractBug()
789 ? Builder.CreateIntrinsic(Intrinsic::amdgcn_frexp_exp,
790 {Builder.getInt32Ty(), Ty}, Src)
791 : Builder.CreateExtractValue(Frexp, {1});
792 return {FrexpMant, FrexpExp};
793}
794
795/// Emit an expansion of 1.0 / Src good for 1ulp that supports denormals.
796Value *AMDGPUCodeGenPrepareImpl::emitRcpIEEE1ULP(IRBuilder<> &Builder,
797 Value *Src,
798 bool IsNegative) const {
799 // Same as for 1.0, but expand the sign out of the constant.
800 // -1.0 / x -> rcp (fneg x)
801 if (IsNegative)
802 Src = Builder.CreateFNeg(Src);
803
804 // The rcp instruction doesn't support denormals, so scale the input
805 // out of the denormal range and convert at the end.
806 //
807 // Expand as 2^-n * (1.0 / (x * 2^n))
808
809 // TODO: Skip scaling if input is known never denormal and the input
810 // range won't underflow to denormal. The hard part is knowing the
811 // result. We need a range check, the result could be denormal for
812 // 0x1p+126 < den <= 0x1p+127.
813 auto [FrexpMant, FrexpExp] = getFrexpResults(Builder, Src);
814 Value *ScaleFactor = Builder.CreateNeg(FrexpExp);
815 Value *Rcp = Builder.CreateUnaryIntrinsic(Intrinsic::amdgcn_rcp, FrexpMant);
816 return Builder.CreateCall(getLdexpF32(), {Rcp, ScaleFactor});
817}
818
819/// Emit a 2ulp expansion for fdiv by using frexp for input scaling.
820Value *AMDGPUCodeGenPrepareImpl::emitFrexpDiv(IRBuilder<> &Builder, Value *LHS,
821 Value *RHS,
822 FastMathFlags FMF) const {
823 // If we have to work around the fract/frexp bug, we're worse off than
824 // using the fdiv.fast expansion. The full safe expansion is faster if we have
825 // fast FMA.
826 if (HasFP32DenormalFlush && ST.hasFractBug() && !ST.hasFastFMAF32() &&
827 (!FMF.noNaNs() || !FMF.noInfs()))
828 return nullptr;
829
830 // We're scaling the LHS to avoid a denormal input, and scale the denominator
831 // to avoid large values underflowing the result.
832 auto [FrexpMantRHS, FrexpExpRHS] = getFrexpResults(Builder, RHS);
833
834 Value *Rcp =
835 Builder.CreateUnaryIntrinsic(Intrinsic::amdgcn_rcp, FrexpMantRHS);
836
837 auto [FrexpMantLHS, FrexpExpLHS] = getFrexpResults(Builder, LHS);
838 Value *Mul = Builder.CreateFMul(FrexpMantLHS, Rcp);
839
840 // We multiplied by 2^N/2^M, so we need to multiply by 2^(N-M) to scale the
841 // result.
842 Value *ExpDiff = Builder.CreateSub(FrexpExpLHS, FrexpExpRHS);
843 return Builder.CreateCall(getLdexpF32(), {Mul, ExpDiff});
844}
845
846/// Emit a sqrt that handles denormals and is accurate to 2ulp.
847Value *AMDGPUCodeGenPrepareImpl::emitSqrtIEEE2ULP(IRBuilder<> &Builder,
848 Value *Src,
849 FastMathFlags FMF) const {
850 Type *Ty = Src->getType();
851 APFloat SmallestNormal =
852 APFloat::getSmallestNormalized(Ty->getFltSemantics());
853 Value *NeedScale =
854 Builder.CreateFCmpOLT(Src, ConstantFP::get(Ty, SmallestNormal));
855
856 ConstantInt *Zero = Builder.getInt32(0);
857 Value *InputScaleFactor =
858 Builder.CreateSelect(NeedScale, Builder.getInt32(32), Zero);
859
860 Value *Scaled = Builder.CreateCall(getLdexpF32(), {Src, InputScaleFactor});
861
862 Value *Sqrt = Builder.CreateCall(getSqrtF32(), Scaled);
863
864 Value *OutputScaleFactor =
865 Builder.CreateSelect(NeedScale, Builder.getInt32(-16), Zero);
866 return Builder.CreateCall(getLdexpF32(), {Sqrt, OutputScaleFactor});
867}
868
869/// Emit an expansion of 1.0 / sqrt(Src) good for 1ulp that supports denormals.
870static Value *emitRsqIEEE1ULP(IRBuilder<> &Builder, Value *Src,
871 bool IsNegative) {
872 // bool need_scale = x < 0x1p-126f;
873 // float input_scale = need_scale ? 0x1.0p+24f : 1.0f;
874 // float output_scale = need_scale ? 0x1.0p+12f : 1.0f;
875 // rsq(x * input_scale) * output_scale;
876
877 Type *Ty = Src->getType();
878 APFloat SmallestNormal =
879 APFloat::getSmallestNormalized(Ty->getFltSemantics());
880 Value *NeedScale =
881 Builder.CreateFCmpOLT(Src, ConstantFP::get(Ty, SmallestNormal));
882 Constant *One = ConstantFP::get(Ty, 1.0);
883 Constant *InputScale = ConstantFP::get(Ty, 0x1.0p+24);
884 Constant *OutputScale =
885 ConstantFP::get(Ty, IsNegative ? -0x1.0p+12 : 0x1.0p+12);
886
887 Value *InputScaleFactor = Builder.CreateSelect(NeedScale, InputScale, One);
888
889 Value *ScaledInput = Builder.CreateFMul(Src, InputScaleFactor);
890 Value *Rsq = Builder.CreateUnaryIntrinsic(Intrinsic::amdgcn_rsq, ScaledInput);
891 Value *OutputScaleFactor = Builder.CreateSelect(
892 NeedScale, OutputScale, IsNegative ? ConstantFP::get(Ty, -1.0) : One);
893
894 return Builder.CreateFMul(Rsq, OutputScaleFactor);
895}
896
897bool AMDGPUCodeGenPrepareImpl::canOptimizeWithRsq(const FPMathOperator *SqrtOp,
898 FastMathFlags DivFMF,
899 FastMathFlags SqrtFMF) const {
900 // The rsqrt contraction increases accuracy from ~2ulp to ~1ulp.
901 if (!DivFMF.allowContract() || !SqrtFMF.allowContract())
902 return false;
903
904 // v_rsq_f32 gives 1ulp
905 return SqrtFMF.approxFunc() || HasUnsafeFPMath ||
906 SqrtOp->getFPAccuracy() >= 1.0f;
907}
908
909Value *AMDGPUCodeGenPrepareImpl::optimizeWithRsq(
910 IRBuilder<> &Builder, Value *Num, Value *Den, const FastMathFlags DivFMF,
911 const FastMathFlags SqrtFMF, const Instruction *CtxI) const {
912 // The rsqrt contraction increases accuracy from ~2ulp to ~1ulp.
913 assert(DivFMF.allowContract() && SqrtFMF.allowContract());
914
915 // rsq_f16 is accurate to 0.51 ulp.
916 // rsq_f32 is accurate for !fpmath >= 1.0ulp and denormals are flushed.
917 // rsq_f64 is never accurate.
918 const ConstantFP *CLHS = dyn_cast<ConstantFP>(Num);
919 if (!CLHS)
920 return nullptr;
921
922 assert(Den->getType()->isFloatTy());
923
924 bool IsNegative = false;
925
926 // TODO: Handle other numerator values with arcp.
927 if (CLHS->isExactlyValue(1.0) || (IsNegative = CLHS->isExactlyValue(-1.0))) {
928 // Add in the sqrt flags.
929 IRBuilder<>::FastMathFlagGuard Guard(Builder);
930 Builder.setFastMathFlags(DivFMF | SqrtFMF);
931
932 if ((DivFMF.approxFunc() && SqrtFMF.approxFunc()) || HasUnsafeFPMath ||
933 canIgnoreDenormalInput(Den, CtxI)) {
934 Value *Result = Builder.CreateUnaryIntrinsic(Intrinsic::amdgcn_rsq, Den);
935 // -1.0 / sqrt(x) -> fneg(rsq(x))
936 return IsNegative ? Builder.CreateFNeg(Result) : Result;
937 }
938
939 return emitRsqIEEE1ULP(Builder, Den, IsNegative);
940 }
941
942 return nullptr;
943}
944
945// Optimize fdiv with rcp:
946//
947// 1/x -> rcp(x) when rcp is sufficiently accurate or inaccurate rcp is
948// allowed with unsafe-fp-math or afn.
949//
950 // a/b -> a*rcp(b) when arcp is allowed, and we only need to provide 1.0 ULP accuracy.
951Value *
952AMDGPUCodeGenPrepareImpl::optimizeWithRcp(IRBuilder<> &Builder, Value *Num,
953 Value *Den, FastMathFlags FMF,
954 const Instruction *CtxI) const {
955 // rcp_f16 is accurate to 0.51 ulp.
956 // rcp_f32 is accurate for !fpmath >= 1.0ulp and denormals are flushed.
957 // rcp_f64 is never accurate.
958 assert(Den->getType()->isFloatTy());
959
960 if (const ConstantFP *CLHS = dyn_cast<ConstantFP>(Num)) {
961 bool IsNegative = false;
962 if (CLHS->isExactlyValue(1.0) ||
963 (IsNegative = CLHS->isExactlyValue(-1.0))) {
964 Value *Src = Den;
965
966 if (HasFP32DenormalFlush || FMF.approxFunc()) {
967 // -1.0 / x -> 1.0 / fneg(x)
968 if (IsNegative)
969 Src = Builder.CreateFNeg(Src);
970
971 // v_rcp_f32 and v_rsq_f32 do not support denormals, and according to
972 // the CI documentation have a worst case error of 1 ulp.
973 // OpenCL requires <= 2.5 ulp for 1.0 / x, so it should always be OK
974 // to use it as long as we aren't trying to use denormals.
975 //
976 // v_rcp_f16 and v_rsq_f16 DO support denormals.
977
978 // NOTE: v_sqrt and v_rcp will be combined to v_rsq later. So we don't
979 // insert rsq intrinsic here.
980
981 // 1.0 / x -> rcp(x)
982 return Builder.CreateUnaryIntrinsic(Intrinsic::amdgcn_rcp, Src);
983 }
984
985 // TODO: If the input isn't denormal, and we know the input exponent isn't
986 // big enough to introduce a denormal we can avoid the scaling.
987 return emitRcpIEEE1ULP(Builder, Src, IsNegative);
988 }
989 }
990
991 if (FMF.allowReciprocal()) {
992 // x / y -> x * (1.0 / y)
993
994 // TODO: Could avoid denormal scaling and use raw rcp if we knew the output
995 // will never underflow.
996 if (HasFP32DenormalFlush || FMF.approxFunc()) {
997 Value *Recip = Builder.CreateUnaryIntrinsic(Intrinsic::amdgcn_rcp, Den);
998 return Builder.CreateFMul(Num, Recip);
999 }
1000
1001 Value *Recip = emitRcpIEEE1ULP(Builder, Den, false);
1002 return Builder.CreateFMul(Num, Recip);
1003 }
1004
1005 return nullptr;
1006}
1007
1008// optimize with fdiv.fast:
1009//
1010// a/b -> fdiv.fast(a, b) when !fpmath >= 2.5ulp with denormals flushed.
1011//
1012// 1/x -> fdiv.fast(1,x) when !fpmath >= 2.5ulp.
1013//
1014// NOTE: optimizeWithRcp should be tried first because rcp is the preference.
1015Value *AMDGPUCodeGenPrepareImpl::optimizeWithFDivFast(
1016 IRBuilder<> &Builder, Value *Num, Value *Den, float ReqdAccuracy) const {
1017 // fdiv.fast can achieve 2.5 ULP accuracy.
1018 if (ReqdAccuracy < 2.5f)
1019 return nullptr;
1020
1021 // Only have fdiv.fast for f32.
1022 assert(Den->getType()->isFloatTy());
1023
1024 bool NumIsOne = false;
1025 if (const ConstantFP *CNum = dyn_cast<ConstantFP>(Num)) {
1026 if (CNum->isExactlyValue(+1.0) || CNum->isExactlyValue(-1.0))
1027 NumIsOne = true;
1028 }
1029
1030 // fdiv.fast does not support denormals, but it is always fine to use for 1.0/x.
1031 //
1032 // TODO: This works for any value with a specific known exponent range, don't
1033 // just limit to constant 1.
1034 if (!HasFP32DenormalFlush && !NumIsOne)
1035 return nullptr;
1036
1037 return Builder.CreateIntrinsic(Intrinsic::amdgcn_fdiv_fast, {}, {Num, Den});
1038}
1039
1040Value *AMDGPUCodeGenPrepareImpl::visitFDivElement(
1041 IRBuilder<> &Builder, Value *Num, Value *Den, FastMathFlags DivFMF,
1042 FastMathFlags SqrtFMF, Value *RsqOp, const Instruction *FDivInst,
1043 float ReqdDivAccuracy) const {
1044 if (RsqOp) {
1045 Value *Rsq =
1046 optimizeWithRsq(Builder, Num, RsqOp, DivFMF, SqrtFMF, FDivInst);
1047 if (Rsq)
1048 return Rsq;
1049 }
1050
1051 Value *Rcp = optimizeWithRcp(Builder, Num, Den, DivFMF, FDivInst);
1052 if (Rcp)
1053 return Rcp;
1054
1055 // In the basic case fdiv_fast has the same instruction count as the frexp div
1056 // expansion. Slightly prefer fdiv_fast since it ends in an fmul that can
1057 // potentially be fused into a user. Also, materialization of the constants
1058 // can be reused for multiple instances.
1059 Value *FDivFast = optimizeWithFDivFast(Builder, Num, Den, ReqdDivAccuracy);
1060 if (FDivFast)
1061 return FDivFast;
1062
1063 return emitFrexpDiv(Builder, Num, Den, DivFMF);
1064}
1065
1066 // Optimization is performed based on fpmath, fast math flags, as well as
1067 // the denormal mode, to lower fdiv with either rcp or fdiv.fast.
1068//
1069// With rcp:
1070// 1/x -> rcp(x) when rcp is sufficiently accurate or inaccurate rcp is
1071// allowed with unsafe-fp-math or afn.
1072//
1073// a/b -> a*rcp(b) when inaccurate rcp is allowed with unsafe-fp-math or afn.
1074//
1075// With fdiv.fast:
1076// a/b -> fdiv.fast(a, b) when !fpmath >= 2.5ulp with denormals flushed.
1077//
1078// 1/x -> fdiv.fast(1,x) when !fpmath >= 2.5ulp.
1079//
1080// NOTE: rcp is the preference in cases that both are legal.
1081bool AMDGPUCodeGenPrepareImpl::visitFDiv(BinaryOperator &FDiv) {
1082 if (DisableFDivExpand)
1083 return false;
1084
1085 Type *Ty = FDiv.getType()->getScalarType();
1086 if (!Ty->isFloatTy())
1087 return false;
1088
1089 // The f64 rcp/rsq approximations are pretty inaccurate. We can do an
1090 // expansion around them in codegen. f16 is good enough to always use.
1091
1092 const FPMathOperator *FPOp = cast<const FPMathOperator>(&FDiv);
1093 const FastMathFlags DivFMF = FPOp->getFastMathFlags();
1094 const float ReqdAccuracy = FPOp->getFPAccuracy();
1095
1096 FastMathFlags SqrtFMF;
1097
1098 Value *Num = FDiv.getOperand(0);
1099 Value *Den = FDiv.getOperand(1);
1100
1101 Value *RsqOp = nullptr;
1102 auto *DenII = dyn_cast<IntrinsicInst>(Den);
1103 if (DenII && DenII->getIntrinsicID() == Intrinsic::sqrt &&
1104 DenII->hasOneUse()) {
1105 const auto *SqrtOp = cast<FPMathOperator>(DenII);
1106 SqrtFMF = SqrtOp->getFastMathFlags();
1107 if (canOptimizeWithRsq(SqrtOp, DivFMF, SqrtFMF))
1108 RsqOp = SqrtOp->getOperand(0);
1109 }
1110
1111 // Inaccurate rcp is allowed with unsafe-fp-math or afn.
1112 //
1113 // Defer to codegen to handle this.
1114 //
1115 // TODO: Decide on an interpretation for interactions between afn + arcp +
1116 // !fpmath, and make it consistent between here and codegen. For now, defer
1117 // expansion of afn to codegen. The current interpretation is so aggressive we
1118 // don't need any pre-consideration here when we have better information. A
1119 // more conservative interpretation could use handling here.
1120 const bool AllowInaccurateRcp = HasUnsafeFPMath || DivFMF.approxFunc();
1121 if (!RsqOp && AllowInaccurateRcp)
1122 return false;
1123
1124 // Defer the correct implementations to codegen.
1125 if (ReqdAccuracy < 1.0f)
1126 return false;
1127
1128 IRBuilder<> Builder(FDiv.getParent(), std::next(FDiv.getIterator()));
1129 Builder.setFastMathFlags(DivFMF);
1130 Builder.SetCurrentDebugLocation(FDiv.getDebugLoc());
1131
1132 SmallVector<Value *, 4> NumVals;
1133 SmallVector<Value *, 4> DenVals;
1134 SmallVector<Value *, 4> RsqDenVals;
1135 extractValues(Builder, NumVals, Num);
1136 extractValues(Builder, DenVals, Den);
1137
1138 if (RsqOp)
1139 extractValues(Builder, RsqDenVals, RsqOp);
1140
1141 SmallVector<Value *, 4> ResultVals(NumVals.size());
1142 for (int I = 0, E = NumVals.size(); I != E; ++I) {
1143 Value *NumElt = NumVals[I];
1144 Value *DenElt = DenVals[I];
1145 Value *RsqDenElt = RsqOp ? RsqDenVals[I] : nullptr;
1146
1147 Value *NewElt =
1148 visitFDivElement(Builder, NumElt, DenElt, DivFMF, SqrtFMF, RsqDenElt,
1149 cast<Instruction>(FPOp), ReqdAccuracy);
1150 if (!NewElt) {
1151 // Keep the original, but scalarized.
1152
1153 // This has the unfortunate side effect of sometimes scalarizing when
1154 // we're not going to do anything.
1155 NewElt = Builder.CreateFDiv(NumElt, DenElt);
1156 if (auto *NewEltInst = dyn_cast<Instruction>(NewElt))
1157 NewEltInst->copyMetadata(FDiv);
1158 }
1159
1160 ResultVals[I] = NewElt;
1161 }
1162
1163 Value *NewVal = insertValues(Builder, FDiv.getType(), ResultVals);
1164
1165 if (NewVal) {
1166 FDiv.replaceAllUsesWith(NewVal);
1167 NewVal->takeName(&FDiv);
1168 RecursivelyDeleteTriviallyDeadInstructions(&FDiv, TLI);
1169 }
1170
1171 return true;
1172}
1173
1174static std::pair<Value*, Value*> getMul64(IRBuilder<> &Builder,
1175 Value *LHS, Value *RHS) {
1176 Type *I32Ty = Builder.getInt32Ty();
1177 Type *I64Ty = Builder.getInt64Ty();
1178
1179 Value *LHS_EXT64 = Builder.CreateZExt(LHS, I64Ty);
1180 Value *RHS_EXT64 = Builder.CreateZExt(RHS, I64Ty);
1181 Value *MUL64 = Builder.CreateMul(LHS_EXT64, RHS_EXT64);
1182 Value *Lo = Builder.CreateTrunc(MUL64, I32Ty);
1183 Value *Hi = Builder.CreateLShr(MUL64, Builder.getInt64(32));
1184 Hi = Builder.CreateTrunc(Hi, I32Ty);
1185 return std::pair(Lo, Hi);
1186}
1187
1188static Value* getMulHu(IRBuilder<> &Builder, Value *LHS, Value *RHS) {
1189 return getMul64(Builder, LHS, RHS).second;
1190}
1191
1192/// Figure out how many bits are really needed for this division. \p AtLeast is
1193 /// an optimization hint to bypass the second ComputeNumSignBits call if the
1194/// first one is insufficient. Returns -1 on failure.
1195int AMDGPUCodeGenPrepareImpl::getDivNumBits(BinaryOperator &I, Value *Num,
1196 Value *Den, unsigned AtLeast,
1197 bool IsSigned) const {
1198 assert(Num->getType()->getScalarSizeInBits() ==
1199 Den->getType()->getScalarSizeInBits());
1200 unsigned SSBits = Num->getType()->getScalarSizeInBits();
1201 if (IsSigned) {
1202 unsigned RHSSignBits = ComputeNumSignBits(Den, DL, 0, AC, &I);
1203 if (RHSSignBits < AtLeast)
1204 return -1;
1205
1206 unsigned LHSSignBits = ComputeNumSignBits(Num, DL, 0, AC, &I);
1207 if (LHSSignBits < AtLeast)
1208 return -1;
1209
1210 unsigned SignBits = std::min(LHSSignBits, RHSSignBits);
1211 unsigned DivBits = SSBits - SignBits + 1;
1212 return DivBits; // a SignBit needs to be reserved for shrinking
1213 }
1214
1215 // All bits are used for unsigned division for Num or Den in range
1216 // (SignedMax, UnsignedMax].
1217 KnownBits Known = computeKnownBits(Den, DL, 0, AC, &I);
1218 if (Known.isNegative() || !Known.isNonNegative())
1219 return SSBits;
1220 unsigned RHSSignBits = Known.countMinLeadingZeros();
1221
1222 Known = computeKnownBits(Num, DL, 0, AC, &I);
1223 if (Known.isNegative() || !Known.isNonNegative())
1224 return SSBits;
1225 unsigned LHSSignBits = Known.countMinLeadingZeros();
1226
1227 unsigned SignBits = std::min(LHSSignBits, RHSSignBits);
1228 unsigned DivBits = SSBits - SignBits;
1229 return DivBits;
1230}
1231
1232 // The significand of a float is enough to accurately represent up to
1233 // a 24-bit signed integer.
1234Value *AMDGPUCodeGenPrepareImpl::expandDivRem24(IRBuilder<> &Builder,
1235 BinaryOperator &I, Value *Num,
1236 Value *Den, bool IsDiv,
1237 bool IsSigned) const {
1238 unsigned SSBits = Num->getType()->getScalarSizeInBits();
1239 // If Num bits <= 24, assume 0 signbits.
1240 unsigned AtLeast = (SSBits <= 24) ? 0 : (SSBits - 24 + IsSigned);
1241 int DivBits = getDivNumBits(I, Num, Den, AtLeast, IsSigned);
1242 if (DivBits == -1 || DivBits > 24)
1243 return nullptr;
1244 return expandDivRem24Impl(Builder, I, Num, Den, DivBits, IsDiv, IsSigned);
1245}
1246
1247Value *AMDGPUCodeGenPrepareImpl::expandDivRem24Impl(
1248 IRBuilder<> &Builder, BinaryOperator &I, Value *Num, Value *Den,
1249 unsigned DivBits, bool IsDiv, bool IsSigned) const {
1250 Type *I32Ty = Builder.getInt32Ty();
1251 Num = Builder.CreateTrunc(Num, I32Ty);
1252 Den = Builder.CreateTrunc(Den, I32Ty);
1253
1254 Type *F32Ty = Builder.getFloatTy();
1255 ConstantInt *One = Builder.getInt32(1);
1256 Value *JQ = One;
1257
1258 if (IsSigned) {
1259 // char|short jq = ia ^ ib;
1260 JQ = Builder.CreateXor(Num, Den);
1261
1262 // jq = jq >> (bitsize - 2)
1263 JQ = Builder.CreateAShr(JQ, Builder.getInt32(30));
1264
1265 // jq = jq | 0x1
1266 JQ = Builder.CreateOr(JQ, One);
1267 }
1268
1269 // int ia = (int)LHS;
1270 Value *IA = Num;
1271
1272 // int ib = (int)RHS;
1273 Value *IB = Den;
1274
1275 // float fa = (float)ia;
1276 Value *FA = IsSigned ? Builder.CreateSIToFP(IA, F32Ty)
1277 : Builder.CreateUIToFP(IA, F32Ty);
1278
1279 // float fb = (float)ib;
1280 Value *FB = IsSigned ? Builder.CreateSIToFP(IB,F32Ty)
1281 : Builder.CreateUIToFP(IB,F32Ty);
1282
1283 Value *RCP = Builder.CreateIntrinsic(Intrinsic::amdgcn_rcp,
1284 Builder.getFloatTy(), {FB});
1285 Value *FQM = Builder.CreateFMul(FA, RCP);
1286
1287 // fq = trunc(fqm);
1288 CallInst *FQ = Builder.CreateUnaryIntrinsic(Intrinsic::trunc, FQM);
1289 FQ->copyFastMathFlags(Builder.getFastMathFlags());
1290
1291 // float fqneg = -fq;
1292 Value *FQNeg = Builder.CreateFNeg(FQ);
1293
1294 // float fr = mad(fqneg, fb, fa);
1295 auto FMAD = !ST.hasMadMacF32Insts()
1296 ? Intrinsic::fma
1297 : (Intrinsic::ID)Intrinsic::amdgcn_fmad_ftz;
1298 Value *FR = Builder.CreateIntrinsic(FMAD,
1299 {FQNeg->getType()}, {FQNeg, FB, FA}, FQ);
1300
1301 // int iq = (int)fq;
1302 Value *IQ = IsSigned ? Builder.CreateFPToSI(FQ, I32Ty)
1303 : Builder.CreateFPToUI(FQ, I32Ty);
1304
1305 // fr = fabs(fr);
1306 FR = Builder.CreateUnaryIntrinsic(Intrinsic::fabs, FR, FQ);
1307
1308 // fb = fabs(fb);
1309 FB = Builder.CreateUnaryIntrinsic(Intrinsic::fabs, FB, FQ);
1310
1311 // int cv = fr >= fb;
1312 Value *CV = Builder.CreateFCmpOGE(FR, FB);
1313
1314 // jq = (cv ? jq : 0);
1315 JQ = Builder.CreateSelect(CV, JQ, Builder.getInt32(0));
1316
1317 // dst = iq + jq;
1318 Value *Div = Builder.CreateAdd(IQ, JQ);
1319
1320 Value *Res = Div;
1321 if (!IsDiv) {
1322 // Rem needs compensation, it's easier to recompute it
1323 Value *Rem = Builder.CreateMul(Div, Den);
1324 Res = Builder.CreateSub(Num, Rem);
1325 }
1326
1327 if (DivBits != 0 && DivBits < 32) {
1328 // Extend in register from the number of bits this divide really is.
1329 if (IsSigned) {
1330 int InRegBits = 32 - DivBits;
1331
1332 Res = Builder.CreateShl(Res, InRegBits);
1333 Res = Builder.CreateAShr(Res, InRegBits);
1334 } else {
1335 ConstantInt *TruncMask
1336 = Builder.getInt32((UINT64_C(1) << DivBits) - 1);
1337 Res = Builder.CreateAnd(Res, TruncMask);
1338 }
1339 }
1340
1341 return Res;
1342}
1343
1344 // Try to recognize special cases for which the DAG will emit better expansions
1345// than the general expansion we do here.
1346
1347// TODO: It would be better to just directly handle those optimizations here.
1348bool AMDGPUCodeGenPrepareImpl::divHasSpecialOptimization(BinaryOperator &I,
1349 Value *Num,
1350 Value *Den) const {
1351 if (Constant *C = dyn_cast<Constant>(Den)) {
1352 // Arbitrary constants get a better expansion as long as a wider mulhi is
1353 // legal.
1354 if (C->getType()->getScalarSizeInBits() <= 32)
1355 return true;
1356
1357 // TODO: Sdiv check for not exact for some reason.
1358
1359 // If there's no wider mulhi, there's only a better expansion for powers of
1360 // two.
1361 // TODO: Should really know for each vector element.
1362 if (isKnownToBeAPowerOfTwo(C, DL, true, 0, AC, &I, DT))
1363 return true;
1364
1365 return false;
1366 }
1367
1368 if (BinaryOperator *BinOpDen = dyn_cast<BinaryOperator>(Den)) {
1369 // fold (udiv x, (shl c, y)) -> x >>u (log2(c)+y) iff c is power of 2
1370 if (BinOpDen->getOpcode() == Instruction::Shl &&
1371 isa<Constant>(BinOpDen->getOperand(0)) &&
1372 isKnownToBeAPowerOfTwo(BinOpDen->getOperand(0), DL, true, 0, AC, &I,
1373 DT)) {
1374 return true;
1375 }
1376 }
1377
1378 return false;
1379}
1380
1381static Value *getSign32(Value *V, IRBuilder<> &Builder, const DataLayout DL) {
1382 // Check whether the sign can be determined statically.
1383 KnownBits Known = computeKnownBits(V, DL);
1384 if (Known.isNegative())
1385 return Constant::getAllOnesValue(V->getType());
1386 if (Known.isNonNegative())
1387 return Constant::getNullValue(V->getType());
1388 return Builder.CreateAShr(V, Builder.getInt32(31));
1389}
1390
1391Value *AMDGPUCodeGenPrepareImpl::expandDivRem32(IRBuilder<> &Builder,
1392 BinaryOperator &I, Value *X,
1393 Value *Y) const {
1394 Instruction::BinaryOps Opc = I.getOpcode();
1395 assert(Opc == Instruction::URem || Opc == Instruction::UDiv ||
1396 Opc == Instruction::SRem || Opc == Instruction::SDiv);
1397
1398 FastMathFlags FMF;
1399 FMF.setFast();
1400 Builder.setFastMathFlags(FMF);
1401
1402 if (divHasSpecialOptimization(I, X, Y))
1403 return nullptr; // Keep it for later optimization.
1404
1405 bool IsDiv = Opc == Instruction::UDiv || Opc == Instruction::SDiv;
1406 bool IsSigned = Opc == Instruction::SRem || Opc == Instruction::SDiv;
1407
1408 Type *Ty = X->getType();
1409 Type *I32Ty = Builder.getInt32Ty();
1410 Type *F32Ty = Builder.getFloatTy();
1411
1412 if (Ty->getScalarSizeInBits() != 32) {
1413 if (IsSigned) {
1414 X = Builder.CreateSExtOrTrunc(X, I32Ty);
1415 Y = Builder.CreateSExtOrTrunc(Y, I32Ty);
1416 } else {
1417 X = Builder.CreateZExtOrTrunc(X, I32Ty);
1418 Y = Builder.CreateZExtOrTrunc(Y, I32Ty);
1419 }
1420 }
1421
1422 if (Value *Res = expandDivRem24(Builder, I, X, Y, IsDiv, IsSigned)) {
1423 return IsSigned ? Builder.CreateSExtOrTrunc(Res, Ty) :
1424 Builder.CreateZExtOrTrunc(Res, Ty);
1425 }
1426
1427 ConstantInt *Zero = Builder.getInt32(0);
1428 ConstantInt *One = Builder.getInt32(1);
1429
1430 Value *Sign = nullptr;
1431 if (IsSigned) {
1432 Value *SignX = getSign32(X, Builder, DL);
1433 Value *SignY = getSign32(Y, Builder, DL);
1434 // Remainder sign is the same as LHS
1435 Sign = IsDiv ? Builder.CreateXor(SignX, SignY) : SignX;
1436
1437 X = Builder.CreateAdd(X, SignX);
1438 Y = Builder.CreateAdd(Y, SignY);
1439
1440 X = Builder.CreateXor(X, SignX);
1441 Y = Builder.CreateXor(Y, SignY);
1442 }
1443
1444 // The algorithm here is based on ideas from "Software Integer Division", Tom
1445 // Rodeheffer, August 2008.
1446 //
1447 // unsigned udiv(unsigned x, unsigned y) {
1448 // // Initial estimate of inv(y). The constant is less than 2^32 to ensure
1449 // // that this is a lower bound on inv(y), even if some of the calculations
1450 // // round up.
1451 // unsigned z = (unsigned)((4294967296.0 - 512.0) * v_rcp_f32((float)y));
1452 //
1453 // // One round of UNR (Unsigned integer Newton-Raphson) to improve z.
1454 // // Empirically this is guaranteed to give a "two-y" lower bound on
1455 // // inv(y).
1456 // z += umulh(z, -y * z);
1457 //
1458 // // Quotient/remainder estimate.
1459 // unsigned q = umulh(x, z);
1460 // unsigned r = x - q * y;
1461 //
1462 // // Two rounds of quotient/remainder refinement.
1463 // if (r >= y) {
1464 // ++q;
1465 // r -= y;
1466 // }
1467 // if (r >= y) {
1468 // ++q;
1469 // r -= y;
1470 // }
1471 //
1472 // return q;
1473 // }
1474
1475 // Initial estimate of inv(y).
1476 Value *FloatY = Builder.CreateUIToFP(Y, F32Ty);
1477 Value *RcpY = Builder.CreateIntrinsic(Intrinsic::amdgcn_rcp, F32Ty, {FloatY});
1478 Constant *Scale = ConstantFP::get(F32Ty, llvm::bit_cast<float>(0x4F7FFFFE));
1479 Value *ScaledY = Builder.CreateFMul(RcpY, Scale);
1480 Value *Z = Builder.CreateFPToUI(ScaledY, I32Ty);
1481
1482 // One round of UNR.
1483 Value *NegY = Builder.CreateSub(Zero, Y);
1484 Value *NegYZ = Builder.CreateMul(NegY, Z);
1485 Z = Builder.CreateAdd(Z, getMulHu(Builder, Z, NegYZ));
1486
1487 // Quotient/remainder estimate.
1488 Value *Q = getMulHu(Builder, X, Z);
1489 Value *R = Builder.CreateSub(X, Builder.CreateMul(Q, Y));
1490
1491 // First quotient/remainder refinement.
1492 Value *Cond = Builder.CreateICmpUGE(R, Y);
1493 if (IsDiv)
1494 Q = Builder.CreateSelect(Cond, Builder.CreateAdd(Q, One), Q);
1495 R = Builder.CreateSelect(Cond, Builder.CreateSub(R, Y), R);
1496
1497 // Second quotient/remainder refinement.
1498 Cond = Builder.CreateICmpUGE(R, Y);
1499 Value *Res;
1500 if (IsDiv)
1501 Res = Builder.CreateSelect(Cond, Builder.CreateAdd(Q, One), Q);
1502 else
1503 Res = Builder.CreateSelect(Cond, Builder.CreateSub(R, Y), R);
1504
1505 if (IsSigned) {
1506 Res = Builder.CreateXor(Res, Sign);
1507 Res = Builder.CreateSub(Res, Sign);
1508 Res = Builder.CreateSExtOrTrunc(Res, Ty);
1509 } else {
1510 Res = Builder.CreateZExtOrTrunc(Res, Ty);
1511 }
1512 return Res;
1513}
1514
1515Value *AMDGPUCodeGenPrepareImpl::shrinkDivRem64(IRBuilder<> &Builder,
1516 BinaryOperator &I, Value *Num,
1517 Value *Den) const {
1518 if (!ExpandDiv64InIR && divHasSpecialOptimization(I, Num, Den))
1519 return nullptr; // Keep it for later optimization.
1520
1521 Instruction::BinaryOps Opc = I.getOpcode();
1522
1523 bool IsDiv = Opc == Instruction::SDiv || Opc == Instruction::UDiv;
1524 bool IsSigned = Opc == Instruction::SDiv || Opc == Instruction::SRem;
1525
1526 int NumDivBits = getDivNumBits(I, Num, Den, 32, IsSigned);
1527 if (NumDivBits == -1)
1528 return nullptr;
1529
1530 Value *Narrowed = nullptr;
1531 if (NumDivBits <= 24) {
1532 Narrowed = expandDivRem24Impl(Builder, I, Num, Den, NumDivBits,
1533 IsDiv, IsSigned);
1534 } else if (NumDivBits <= 32) {
1535 Narrowed = expandDivRem32(Builder, I, Num, Den);
1536 }
1537
1538 if (Narrowed) {
1539 return IsSigned ? Builder.CreateSExt(Narrowed, Num->getType()) :
1540 Builder.CreateZExt(Narrowed, Num->getType());
1541 }
1542
1543 return nullptr;
1544}
1545
1546void AMDGPUCodeGenPrepareImpl::expandDivRem64(BinaryOperator &I) const {
1547 Instruction::BinaryOps Opc = I.getOpcode();
1548 // Do the general expansion.
1549 if (Opc == Instruction::UDiv || Opc == Instruction::SDiv) {
1550 expandDivisionUpTo64Bits(&I);
1551 return;
1552 }
1553
1554 if (Opc == Instruction::URem || Opc == Instruction::SRem) {
1555 expandRemainderUpTo64Bits(&I);
1556 return;
1557 }
1558
1559 llvm_unreachable("not a division");
1560}
1561
1562bool AMDGPUCodeGenPrepareImpl::visitBinaryOperator(BinaryOperator &I) {
1563 if (foldBinOpIntoSelect(I))
1564 return true;
1565
1566 if (ST.has16BitInsts() && needsPromotionToI32(I.getType()) &&
1567 UA.isUniform(&I) && promoteUniformOpToI32(I))
1568 return true;
1569
1570 if (UseMul24Intrin && replaceMulWithMul24(I))
1571 return true;
1572
1573 bool Changed = false;
1574 Instruction::BinaryOps Opc = I.getOpcode();
1575 Type *Ty = I.getType();
1576 Value *NewDiv = nullptr;
1577 unsigned ScalarSize = Ty->getScalarSizeInBits();
1578
1579 SmallVector<BinaryOperator *, 8> Div64ToExpand;
1580
1581 if ((Opc == Instruction::URem || Opc == Instruction::UDiv ||
1582 Opc == Instruction::SRem || Opc == Instruction::SDiv) &&
1583 ScalarSize <= 64 &&
1584 !DisableIDivExpand) {
1585 Value *Num = I.getOperand(0);
1586 Value *Den = I.getOperand(1);
1587 IRBuilder<> Builder(&I);
1588 Builder.SetCurrentDebugLocation(I.getDebugLoc());
1589
1590 if (auto *VT = dyn_cast<FixedVectorType>(Ty)) {
1591 NewDiv = PoisonValue::get(VT);
1592
1593 for (unsigned N = 0, E = VT->getNumElements(); N != E; ++N) {
1594 Value *NumEltN = Builder.CreateExtractElement(Num, N);
1595 Value *DenEltN = Builder.CreateExtractElement(Den, N);
1596
1597 Value *NewElt;
1598 if (ScalarSize <= 32) {
1599 NewElt = expandDivRem32(Builder, I, NumEltN, DenEltN);
1600 if (!NewElt)
1601 NewElt = Builder.CreateBinOp(Opc, NumEltN, DenEltN);
1602 } else {
1603 // See if this 64-bit division can be shrunk to 32/24-bits before
1604 // producing the general expansion.
1605 NewElt = shrinkDivRem64(Builder, I, NumEltN, DenEltN);
1606 if (!NewElt) {
1607 // The general 64-bit expansion introduces control flow and doesn't
1608 // return the new value. Just insert a scalar copy and defer
1609 // expanding it.
1610 NewElt = Builder.CreateBinOp(Opc, NumEltN, DenEltN);
1611 Div64ToExpand.push_back(cast<BinaryOperator>(NewElt));
1612 }
1613 }
1614
1615 if (auto *NewEltI = dyn_cast<Instruction>(NewElt))
1616 NewEltI->copyIRFlags(&I);
1617
1618 NewDiv = Builder.CreateInsertElement(NewDiv, NewElt, N);
1619 }
1620 } else {
1621 if (ScalarSize <= 32)
1622 NewDiv = expandDivRem32(Builder, I, Num, Den);
1623 else {
1624 NewDiv = shrinkDivRem64(Builder, I, Num, Den);
1625 if (!NewDiv)
1626 Div64ToExpand.push_back(&I);
1627 }
1628 }
1629
1630 if (NewDiv) {
1631 I.replaceAllUsesWith(NewDiv);
1632 I.eraseFromParent();
1633 Changed = true;
1634 }
1635 }
1636
1637 if (ExpandDiv64InIR) {
1638 // TODO: We get much worse code in specially handled constant cases.
1639 for (BinaryOperator *Div : Div64ToExpand) {
1640 expandDivRem64(*Div);
1641 FlowChanged = true;
1642 Changed = true;
1643 }
1644 }
1645
1646 return Changed;
1647}
1648
1649bool AMDGPUCodeGenPrepareImpl::visitLoadInst(LoadInst &I) {
1650 if (!WidenLoads)
1651 return false;
1652
1653 if ((I.getPointerAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS ||
1654 I.getPointerAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT) &&
1655 canWidenScalarExtLoad(I)) {
1656 IRBuilder<> Builder(&I);
1657 Builder.SetCurrentDebugLocation(I.getDebugLoc());
1658
1659 Type *I32Ty = Builder.getInt32Ty();
1660 LoadInst *WidenLoad = Builder.CreateLoad(I32Ty, I.getPointerOperand());
1661 WidenLoad->copyMetadata(I);
1662
1663 // If we have range metadata, we need to convert the type, and not make
1664 // assumptions about the high bits.
1665 if (auto *Range = WidenLoad->getMetadata(LLVMContext::MD_range)) {
1666 ConstantInt *Lower =
1667 mdconst::extract<ConstantInt>(Range->getOperand(0));
1668
1669 if (Lower->isNullValue()) {
1670 WidenLoad->setMetadata(LLVMContext::MD_range, nullptr);
1671 } else {
1672 Metadata *LowAndHigh[] = {
1673 ConstantAsMetadata::get(ConstantInt::get(I32Ty, Lower->getValue().zext(32))),
1674 // Don't make assumptions about the high bits.
1675 ConstantAsMetadata::get(ConstantInt::get(I32Ty, 0))
1676 };
1677
1678 WidenLoad->setMetadata(LLVMContext::MD_range,
1679 MDNode::get(F.getContext(), LowAndHigh));
1680 }
1681 }
1682
1683 int TySize = DL.getTypeSizeInBits(I.getType());
1684 Type *IntNTy = Builder.getIntNTy(TySize);
1685 Value *ValTrunc = Builder.CreateTrunc(WidenLoad, IntNTy);
1686 Value *ValOrig = Builder.CreateBitCast(ValTrunc, I.getType());
1687 I.replaceAllUsesWith(ValOrig);
1688 I.eraseFromParent();
1689 return true;
1690 }
1691
1692 return false;
1693}
1694
1695bool AMDGPUCodeGenPrepareImpl::visitICmpInst(ICmpInst &I) {
1696 bool Changed = false;
1697
1698 if (ST.has16BitInsts() && needsPromotionToI32(I.getOperand(0)->getType()) &&
1699 UA.isUniform(&I))
1700 Changed |= promoteUniformOpToI32(I);
1701
1702 return Changed;
1703}
1704
1705bool AMDGPUCodeGenPrepareImpl::visitSelectInst(SelectInst &I) {
1706 Value *Cond = I.getCondition();
1707 Value *TrueVal = I.getTrueValue();
1708 Value *FalseVal = I.getFalseValue();
1709 Value *CmpVal;
1710 CmpPredicate Pred;
1711
1712 if (ST.has16BitInsts() && needsPromotionToI32(I.getType())) {
1713 if (UA.isUniform(&I))
1714 return promoteUniformOpToI32(I);
1715 return false;
1716 }
1717
1718 // Match fract pattern with nan check.
1719 if (!match(Cond, m_FCmp(Pred, m_Value(CmpVal), m_NonNaN())))
1720 return false;
1721
1722 FPMathOperator *FPOp = dyn_cast<FPMathOperator>(&I);
1723 if (!FPOp)
1724 return false;
1725
1726 IRBuilder<> Builder(&I);
1727 Builder.setFastMathFlags(FPOp->getFastMathFlags());
1728
1729 auto *IITrue = dyn_cast<IntrinsicInst>(TrueVal);
1730 auto *IIFalse = dyn_cast<IntrinsicInst>(FalseVal);
1731
1732 Value *Fract = nullptr;
1733 if (Pred == FCmpInst::FCMP_UNO && TrueVal == CmpVal && IIFalse &&
1734 CmpVal == matchFractPat(*IIFalse)) {
1735 // isnan(x) ? x : fract(x)
1736 Fract = applyFractPat(Builder, CmpVal);
1737 } else if (Pred == FCmpInst::FCMP_ORD && FalseVal == CmpVal && IITrue &&
1738 CmpVal == matchFractPat(*IITrue)) {
1739 // !isnan(x) ? fract(x) : x
1740 Fract = applyFractPat(Builder, CmpVal);
1741 } else
1742 return false;
1743
1744 Fract->takeName(&I);
1745 I.replaceAllUsesWith(Fract);
1746 RecursivelyDeleteTriviallyDeadInstructions(&I, TLI);
1747 return true;
1748}
1749
1750static bool areInSameBB(const Value *A, const Value *B) {
1751 const auto *IA = dyn_cast<Instruction>(A);
1752 const auto *IB = dyn_cast<Instruction>(B);
1753 return IA && IB && IA->getParent() == IB->getParent();
1754}
1755
1756// Helper for breaking large PHIs that returns true when an extractelement on V
1757// is likely to be folded away by the DAG combiner.
1758 static bool isInterestingPHIIncomingValue(const Value *V) {
1759 const auto *FVT = dyn_cast<FixedVectorType>(V->getType());
1760 if (!FVT)
1761 return false;
1762
1763 const Value *CurVal = V;
1764
1765 // Check for insertelements, keeping track of the elements covered.
1766 BitVector EltsCovered(FVT->getNumElements());
1767 while (const auto *IE = dyn_cast<InsertElementInst>(CurVal)) {
1768 const auto *Idx = dyn_cast<ConstantInt>(IE->getOperand(2));
1769
1770 // Non-constant index/out-of-bounds index -> folding is unlikely.
1771 // The latter is more of a sanity check because canonical IR should just
1772 // have replaced those with poison.
1773 if (!Idx || Idx->getZExtValue() >= FVT->getNumElements())
1774 return false;
1775
1776 const auto *VecSrc = IE->getOperand(0);
1777
1778 // If the vector source is another instruction, it must be in the same basic
1779 // block. Otherwise, the DAGCombiner won't see the whole thing and is
1780 // unlikely to be able to do anything interesting here.
1781 if (isa<Instruction>(VecSrc) && !areInSameBB(VecSrc, IE))
1782 return false;
1783
1784 CurVal = VecSrc;
1785 EltsCovered.set(Idx->getZExtValue());
1786
1787 // All elements covered.
1788 if (EltsCovered.all())
1789 return true;
1790 }
1791
1792 // We either didn't find a single insertelement, or the insertelement chain
1793 // ended before all elements were covered. Check for other interesting values.
1794
1795 // Constants are always interesting because we can just constant fold the
1796 // extractelements.
1797 if (isa<Constant>(CurVal))
1798 return true;
1799
1800 // shufflevector is likely to be profitable if either operand is a constant,
1801 // or if either source is in the same block.
1802 // This is because shufflevector is most often lowered as a series of
1803 // insert/extract elements anyway.
1804 if (const auto *SV = dyn_cast<ShuffleVectorInst>(CurVal)) {
1805 return isa<Constant>(SV->getOperand(1)) ||
1806 areInSameBB(SV, SV->getOperand(0)) ||
1807 areInSameBB(SV, SV->getOperand(1));
1808 }
1809
1810 return false;
1811}
1812
1813static void collectPHINodes(const PHINode &I,
1814 SmallPtrSet<const PHINode *, 8> &SeenPHIs) {
1815 const auto [It, Inserted] = SeenPHIs.insert(&I);
1816 if (!Inserted)
1817 return;
1818
1819 for (const Value *Inc : I.incoming_values()) {
1820 if (const auto *PhiInc = dyn_cast<PHINode>(Inc))
1821 collectPHINodes(*PhiInc, SeenPHIs);
1822 }
1823
1824 for (const User *U : I.users()) {
1825 if (const auto *PhiU = dyn_cast<PHINode>(U))
1826 collectPHINodes(*PhiU, SeenPHIs);
1827 }
1828}
1829
1830bool AMDGPUCodeGenPrepareImpl::canBreakPHINode(const PHINode &I) {
1831 // Check in the cache first.
1832 if (const auto It = BreakPhiNodesCache.find(&I);
1833 It != BreakPhiNodesCache.end())
1834 return It->second;
1835
1836 // We consider PHI nodes as part of "chains", so given a PHI node I, we
1837 // recursively consider all its users and incoming values that are also PHI
1838 // nodes. We then make a decision about all of those PHIs at once. Either they
1839 // all get broken up, or none of them do. That way, we avoid cases where a
1840 // single PHI is/is not broken and we end up reforming/exploding a vector
1841 // multiple times, or even worse, doing it in a loop.
1842 SmallPtrSet<const PHINode *, 8> WorkList;
1843 collectPHINodes(I, WorkList);
1844
1845#ifndef NDEBUG
1846 // Check that none of the PHI nodes in the worklist are in the map. If some of
1847 // them are, it means we're not good enough at collecting related PHIs.
1848 for (const PHINode *WLP : WorkList) {
1849 assert(BreakPhiNodesCache.count(WLP) == 0);
1850 }
1851#endif
1852
1853 // To consider a PHI profitable to break, we need to see some interesting
1854 // incoming values. At least 2/3rd (rounded up) of all PHIs in the worklist
1855 // must have one to consider all PHIs breakable.
1856 //
1857 // This threshold has been determined through performance testing.
1858 //
1859 // Note that the computation below is equivalent to
1860 //
1861 // (unsigned)ceil((K / 3.0) * 2)
1862 //
1863 // It's simply written this way to avoid mixing integral/FP arithmetic.
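 //
 // For instance, with 5 PHIs in the worklist: alignTo(5 * 2, 3) / 3 ==
 // alignTo(10, 3) / 3 == 12 / 3 == 4, which matches ceil((5 / 3.0) * 2) == 4.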
1864 const auto Threshold = (alignTo(WorkList.size() * 2, 3) / 3);
1865 unsigned NumBreakablePHIs = 0;
1866 bool CanBreak = false;
1867 for (const PHINode *Cur : WorkList) {
1868 // Don't break PHIs that have no interesting incoming values. That is, where
1869 // there is no clear opportunity to fold the "extractelement" instructions
1870 // we would add.
1871 //
1872 // Note: IC does not run after this pass, so we're only interested in the
1873 // foldings that the DAG combiner can do.
1874 if (any_of(Cur->incoming_values(), isInterestingPHIIncomingValue)) {
1875 if (++NumBreakablePHIs >= Threshold) {
1876 CanBreak = true;
1877 break;
1878 }
1879 }
1880 }
1881
1882 for (const PHINode *Cur : WorkList)
1883 BreakPhiNodesCache[Cur] = CanBreak;
1884
1885 return CanBreak;
1886}
1887
1888/// Helper class for "break large PHIs" (visitPHINode).
1889///
1890/// This represents a slice of a PHI's incoming value, which is made up of:
1891/// - The type of the slice (Ty)
1892/// - The index in the incoming value's vector where the slice starts (Idx)
1893/// - The number of elements in the slice (NumElts).
1894/// It also keeps track of the NewPHI node inserted for this particular slice.
1895///
1896/// Slice examples:
1897/// <4 x i64> -> Split into four i64 slices.
1898/// -> [i64, 0, 1], [i64, 1, 1], [i64, 2, 1], [i64, 3, 1]
1899/// <5 x i16> -> Split into 2 <2 x i16> slices + a i16 tail.
1900/// -> [<2 x i16>, 0, 2], [<2 x i16>, 2, 2], [i16, 4, 1]
1901 class VectorSlice {
1902 public:
1903 VectorSlice(Type *Ty, unsigned Idx, unsigned NumElts)
1904 : Ty(Ty), Idx(Idx), NumElts(NumElts) {}
1905
1906 Type *Ty = nullptr;
1907 unsigned Idx = 0;
1908 unsigned NumElts = 0;
1909 PHINode *NewPHI = nullptr;
1910
1911 /// Slice \p Inc according to the information contained within this slice.
1912 /// This is cached, so if called multiple times for the same \p BB & \p Inc
1913 /// pair, it returns the same Sliced value as well.
1914 ///
1915 /// Note this *intentionally* does not return the same value for, say,
1916 /// [%bb.0, %0] & [%bb.1, %0] as:
1917 /// - It could cause issues with dominance (e.g. if bb.1 is seen first, then
1918 /// the value in bb.1 may not be reachable from bb.0 if bb.0 is its
1919 /// predecessor.)
1920 /// - We also want to make our extract instructions as local as possible so
1921 /// the DAG has better chances of folding them out. Duplicating them like
1922 /// that is beneficial in that regard.
1923 ///
1924 /// This is not only a minor optimization to avoid creating duplicate
1925 /// instructions, but also a requirement for correctness. It is not forbidden
1926 /// for a PHI node to have the same [BB, Val] pair multiple times. If we
1927 /// returned a new value each time, those previously identical pairs would all
1928 /// have different incoming values (from the same block) and it'd cause a "PHI
1929 /// node has multiple entries for the same basic block with different incoming
1930 /// values!" verifier error.
1931 Value *getSlicedVal(BasicBlock *BB, Value *Inc, StringRef NewValName) {
1932 Value *&Res = SlicedVals[{BB, Inc}];
1933 if (Res)
1934 return Res;
1935
1936 IRBuilder<> B(BB->getTerminator());
1937 if (Instruction *IncInst = dyn_cast<Instruction>(Inc))
1938 B.SetCurrentDebugLocation(IncInst->getDebugLoc());
1939
1940 if (NumElts > 1) {
1941 SmallVector<int, 4> Mask;
1942 for (unsigned K = Idx; K < (Idx + NumElts); ++K)
1943 Mask.push_back(K);
1944 Res = B.CreateShuffleVector(Inc, Mask, NewValName);
1945 } else
1946 Res = B.CreateExtractElement(Inc, Idx, NewValName);
1947
1948 return Res;
1949 }
1950
1951private:
1952 SmallDenseMap<std::pair<BasicBlock *, Value *>, Value *> SlicedVals;
1953};
1954
1955bool AMDGPUCodeGenPrepareImpl::visitPHINode(PHINode &I) {
1956 // Break-up fixed-vector PHIs into smaller pieces.
1957 // Default threshold is 32, so it breaks up any vector that's >32 bits into
1958 // its elements, or into 32-bit pieces (for 8/16 bit elts).
1959 //
1960 // This is only helpful for DAGISel because it doesn't handle large PHIs as
1961 // well as GlobalISel. DAGISel lowers PHIs by using CopyToReg/CopyFromReg.
1962 // With large, odd-sized PHIs we may end up needing many `build_vector`
1963 // operations with most elements being "undef". This inhibits a lot of
1964 // optimization opportunities and can result in unreasonably high register
1965 // pressure and the inevitable stack spilling.
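 // For example, with the default 32-bit threshold a <16 x i8> PHI (128 bits)
 // is rebuilt from four <4 x i8> slice PHIs instead of being kept whole.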
1966 if (!BreakLargePHIs || getCGPassBuilderOption().EnableGlobalISelOption)
1967 return false;
1968
1969 FixedVectorType *FVT = dyn_cast<FixedVectorType>(I.getType());
1970 if (!FVT || FVT->getNumElements() == 1 ||
1971 DL.getTypeSizeInBits(FVT) <= BreakLargePHIsThreshold)
1972 return false;
1973
1974 if (!ForceBreakLargePHIs && !canBreakPHINode(I))
1975 return false;
1976
1977 std::vector<VectorSlice> Slices;
1978
1979 Type *EltTy = FVT->getElementType();
1980 {
1981 unsigned Idx = 0;
1982 // For 8/16-bit element types, don't scalarize fully; break the vector up
1983 // into as many 32-bit slices as we can, and scalarize the tail.
1984 const unsigned EltSize = DL.getTypeSizeInBits(EltTy);
1985 const unsigned NumElts = FVT->getNumElements();
1986 if (EltSize == 8 || EltSize == 16) {
1987 const unsigned SubVecSize = (32 / EltSize);
1988 Type *SubVecTy = FixedVectorType::get(EltTy, SubVecSize);
1989 for (unsigned End = alignDown(NumElts, SubVecSize); Idx < End;
1990 Idx += SubVecSize)
1991 Slices.emplace_back(SubVecTy, Idx, SubVecSize);
1992 }
1993
1994 // Scalarize all remaining elements.
1995 for (; Idx < NumElts; ++Idx)
1996 Slices.emplace_back(EltTy, Idx, 1);
1997 }
1998
1999 assert(Slices.size() > 1);
2000
2001 // Create one PHI per vector piece. The "VectorSlice" class takes care of
2002 // creating the necessary instruction to extract the relevant slices of each
2003 // incoming value.
2004 IRBuilder<> B(I.getParent());
2005 B.SetCurrentDebugLocation(I.getDebugLoc());
2006
2007 unsigned IncNameSuffix = 0;
2008 for (VectorSlice &S : Slices) {
2009 // We need to reset the builder on each iteration, because getSlicedVal may
2010 // have inserted something into I's BB.
2011 B.SetInsertPoint(I.getParent()->getFirstNonPHIIt());
2012 S.NewPHI = B.CreatePHI(S.Ty, I.getNumIncomingValues());
2013
2014 for (const auto &[Idx, BB] : enumerate(I.blocks())) {
2015 S.NewPHI->addIncoming(S.getSlicedVal(BB, I.getIncomingValue(Idx),
2016 "largephi.extractslice" +
2017 std::to_string(IncNameSuffix++)),
2018 BB);
2019 }
2020 }
2021
2022 // And replace this PHI with a vector of all the previous PHI values.
2023 Value *Vec = PoisonValue::get(FVT);
2024 unsigned NameSuffix = 0;
2025 for (VectorSlice &S : Slices) {
2026 const auto ValName = "largephi.insertslice" + std::to_string(NameSuffix++);
2027 if (S.NumElts > 1)
2028 Vec =
2029 B.CreateInsertVector(FVT, Vec, S.NewPHI, B.getInt64(S.Idx), ValName);
2030 else
2031 Vec = B.CreateInsertElement(Vec, S.NewPHI, S.Idx, ValName);
2032 }
2033
2034 I.replaceAllUsesWith(Vec);
2035 I.eraseFromParent();
2036 return true;
2037}
2038
2039/// \param V Value to check
2040/// \param DL DataLayout
2041/// \param TM TargetMachine (TODO: remove once DL contains nullptr values)
2042/// \param AS Target Address Space
2043/// \return true if \p V cannot be the null value of \p AS, false otherwise.
2044static bool isPtrKnownNeverNull(const Value *V, const DataLayout &DL,
2045 const AMDGPUTargetMachine &TM, unsigned AS) {
2046 // Pointer cannot be null if it's a block address, GV or alloca.
2047 // NOTE: We don't support extern_weak, but if we did, we'd need to check for
2048 // it as the symbol could be null in such cases.
2049 if (isa<BlockAddress>(V) || isa<GlobalValue>(V) || isa<AllocaInst>(V))
2050 return true;
2051
2052 // Check nonnull arguments.
2053 if (const auto *Arg = dyn_cast<Argument>(V); Arg && Arg->hasNonNullAttr())
2054 return true;
2055
2056 // getUnderlyingObject may have looked through another addrspacecast, although
2057 // such optimizable situations have most likely been folded out by now.
2058 if (AS != cast<PointerType>(V->getType())->getAddressSpace())
2059 return false;
2060
2061 // TODO: Calls that return nonnull?
2062
2063 // For all other things, use KnownBits.
2064 // We either use 0 or all bits set to indicate null, so check whether the
2065 // value can be zero or all ones.
2066 //
2067 // TODO: Use ValueTracking's isKnownNeverNull if it becomes aware that some
2068 // address spaces have non-zero null values.
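 // (On AMDGPU, for example, the private and local address spaces use an
 // all-ones null value, while other address spaces use zero.)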
2069 auto SrcPtrKB = computeKnownBits(V, DL);
2070 const auto NullVal = TM.getNullPointerValue(AS);
2071
2072 assert(SrcPtrKB.getBitWidth() == DL.getPointerSizeInBits(AS));
2073 assert((NullVal == 0 || NullVal == -1) &&
2074 "don't know how to check for this null value!");
2075 return NullVal ? !SrcPtrKB.getMaxValue().isAllOnes() : SrcPtrKB.isNonZero();
2076}
2077
2078bool AMDGPUCodeGenPrepareImpl::visitAddrSpaceCastInst(AddrSpaceCastInst &I) {
2079 // The intrinsic doesn't support vectors. It also seems difficult in practice
2080 // to prove that a vector cannot contain any null elements, so it's unclear
2081 // whether supporting vectors would be worthwhile.
2082 if (I.getType()->isVectorTy())
2083 return false;
2084
2085 // Check if this can be lowered to an amdgcn.addrspacecast.nonnull.
2086 // This is only worthwhile for casts between flat and private/local.
2087 const unsigned SrcAS = I.getSrcAddressSpace();
2088 const unsigned DstAS = I.getDestAddressSpace();
2089
2090 bool CanLower = false;
2091 if (SrcAS == AMDGPUAS::FLAT_ADDRESS)
2092 CanLower = (DstAS == AMDGPUAS::LOCAL_ADDRESS ||
2093 DstAS == AMDGPUAS::PRIVATE_ADDRESS);
2094 else if (DstAS == AMDGPUAS::FLAT_ADDRESS)
2095 CanLower = (SrcAS == AMDGPUAS::LOCAL_ADDRESS ||
2096 SrcAS == AMDGPUAS::PRIVATE_ADDRESS);
2097 if (!CanLower)
2098 return false;
2099
2100 SmallVector<const Value *, 4> WorkList;
2101 getUnderlyingObjects(I.getOperand(0), WorkList);
2102 if (!all_of(WorkList, [&](const Value *V) {
2103 return isPtrKnownNeverNull(V, DL, TM, SrcAS);
2104 }))
2105 return false;
2106
2107 IRBuilder<> B(&I);
2108 auto *Intrin = B.CreateIntrinsic(
2109 I.getType(), Intrinsic::amdgcn_addrspacecast_nonnull, {I.getOperand(0)});
2110 I.replaceAllUsesWith(Intrin);
2111 I.eraseFromParent();
2112 return true;
2113}
2114
2115bool AMDGPUCodeGenPrepareImpl::visitIntrinsicInst(IntrinsicInst &I) {
2116 switch (I.getIntrinsicID()) {
2117 case Intrinsic::bitreverse:
2118 return visitBitreverseIntrinsicInst(I);
2119 case Intrinsic::minnum:
2120 return visitMinNum(I);
2121 case Intrinsic::sqrt:
2122 return visitSqrt(I);
2123 default:
2124 return false;
2125 }
2126}
2127
2128bool AMDGPUCodeGenPrepareImpl::visitBitreverseIntrinsicInst(IntrinsicInst &I) {
2129 bool Changed = false;
2130
2131 if (ST.has16BitInsts() && needsPromotionToI32(I.getType()) &&
2132 UA.isUniform(&I))
2133 Changed |= promoteUniformBitreverseToI32(I);
2134
2135 return Changed;
2136}
2137
2138/// Match the non-nan fract pattern:
2139/// minnum(fsub(x, floor(x)), nextafter(1.0, -1.0))
2140///
2141/// Only matched if fract is a useful instruction for the subtarget. Does not
2142/// account for nan handling; the fract instruction has a nan check on the input value.
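/// For f32, for example, this sentinel is nextafter(1.0f, -1.0f) ==
/// 0x1.fffffep-1 (about 0.99999994), the largest float strictly below 1.0.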
2143Value *AMDGPUCodeGenPrepareImpl::matchFractPat(IntrinsicInst &I) {
2144 if (ST.hasFractBug())
2145 return nullptr;
2146
2147 if (I.getIntrinsicID() != Intrinsic::minnum)
2148 return nullptr;
2149
2150 Type *Ty = I.getType();
2151 if (!isLegalFloatingTy(Ty->getScalarType()))
2152 return nullptr;
2153
2154 Value *Arg0 = I.getArgOperand(0);
2155 Value *Arg1 = I.getArgOperand(1);
2156
2157 const APFloat *C;
2158 if (!match(Arg1, m_APFloat(C)))
2159 return nullptr;
2160
2161 APFloat One(1.0);
2162 bool LosesInfo;
2163 One.convert(C->getSemantics(), APFloat::rmNearestTiesToEven, &LosesInfo);
2164
2165 // Match nextafter(1.0, -1)
2166 One.next(true);
2167 if (One != *C)
2168 return nullptr;
2169
2170 Value *FloorSrc;
2171 if (match(Arg0, m_FSub(m_Value(FloorSrc),
2172 m_Intrinsic<Intrinsic::floor>(m_Deferred(FloorSrc)))))
2173 return FloorSrc;
2174 return nullptr;
2175}
2176
2177Value *AMDGPUCodeGenPrepareImpl::applyFractPat(IRBuilder<> &Builder,
2178 Value *FractArg) {
2179 SmallVector<Value *, 4> FractVals;
2180 extractValues(Builder, FractVals, FractArg);
2181
2182 SmallVector<Value *, 4> ResultVals(FractVals.size());
2183
2184 Type *Ty = FractArg->getType()->getScalarType();
2185 for (unsigned I = 0, E = FractVals.size(); I != E; ++I) {
2186 ResultVals[I] =
2187 Builder.CreateIntrinsic(Intrinsic::amdgcn_fract, {Ty}, {FractVals[I]});
2188 }
2189
2190 return insertValues(Builder, FractArg->getType(), ResultVals);
2191}
2192
2193bool AMDGPUCodeGenPrepareImpl::visitMinNum(IntrinsicInst &I) {
2194 Value *FractArg = matchFractPat(I);
2195 if (!FractArg)
2196 return false;
2197
2198 // Match the fract intrinsic pattern in contexts where the nan check has been
2199 // optimized out (and hope the knowledge that the source can't be nan wasn't lost).
2200 if (!I.hasNoNaNs() &&
2201 !isKnownNeverNaN(FractArg, /*Depth=*/0, SimplifyQuery(DL, TLI)))
2202 return false;
2203
2204 IRBuilder<> Builder(&I);
2205 FastMathFlags FMF = I.getFastMathFlags();
2206 FMF.setNoNaNs();
2207 Builder.setFastMathFlags(FMF);
2208
2209 Value *Fract = applyFractPat(Builder, FractArg);
2210 Fract->takeName(&I);
2211 I.replaceAllUsesWith(Fract);
2212
2213 RecursivelyDeleteTriviallyDeadInstructions(&I, TLI);
2214 return true;
2215}
2216
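// getExactLog2Abs() == 0 holds exactly when |C| == 2^0 == 1.0, i.e. the
// constant is +1.0 or -1.0.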
2217static bool isOneOrNegOne(const Value *Val) {
2218 const APFloat *C;
2219 return match(Val, m_APFloat(C)) && C->getExactLog2Abs() == 0;
2220}
2221
2222// Expand llvm.sqrt.f32 calls with !fpmath metadata in a semi-fast way.
2223bool AMDGPUCodeGenPrepareImpl::visitSqrt(IntrinsicInst &Sqrt) {
2224 Type *Ty = Sqrt.getType()->getScalarType();
2225 if (!Ty->isFloatTy() && (!Ty->isHalfTy() || ST.has16BitInsts()))
2226 return false;
2227
2228 const FPMathOperator *FPOp = cast<const FPMathOperator>(&Sqrt);
2229 FastMathFlags SqrtFMF = FPOp->getFastMathFlags();
2230
2231 // We're trying to handle the fast-but-not-that-fast case only. The lowering
2232 // of fast llvm.sqrt will give the raw instruction anyway.
2233 if (SqrtFMF.approxFunc() || HasUnsafeFPMath)
2234 return false;
2235
2236 const float ReqdAccuracy = FPOp->getFPAccuracy();
2237
2238 // Defer correctly rounded expansion to codegen.
2239 if (ReqdAccuracy < 1.0f)
2240 return false;
2241
2242 // FIXME: This is an ugly hack for this pass using forward iteration instead
2243 // of reverse. If it worked like a normal combiner, the rsq would form before
2244 // we saw a sqrt call.
2245 auto *FDiv =
2246 dyn_cast_or_null<FPMathOperator>(Sqrt.getUniqueUndroppableUser());
2247 if (FDiv && FDiv->getOpcode() == Instruction::FDiv &&
2248 FDiv->getFPAccuracy() >= 1.0f &&
2249 canOptimizeWithRsq(FPOp, FDiv->getFastMathFlags(), SqrtFMF) &&
2250 // TODO: We should also handle the arcp case for the fdiv with non-1 value
2251 isOneOrNegOne(FDiv->getOperand(0)))
2252 return false;
2253
2254 Value *SrcVal = Sqrt.getOperand(0);
2255 bool CanTreatAsDAZ = canIgnoreDenormalInput(SrcVal, &Sqrt);
2256
2257 // The raw instruction is 1 ulp, but the correction for denormal handling
2258 // brings it to 2.
2259 if (!CanTreatAsDAZ && ReqdAccuracy < 2.0f)
2260 return false;
2261
2262 IRBuilder<> Builder(&Sqrt);
2263 SmallVector<Value *, 4> SrcVals;
2264 extractValues(Builder, SrcVals, SrcVal);
2265
2266 SmallVector<Value *, 4> ResultVals(SrcVals.size());
2267 for (int I = 0, E = SrcVals.size(); I != E; ++I) {
2268 if (CanTreatAsDAZ)
2269 ResultVals[I] = Builder.CreateCall(getSqrtF32(), SrcVals[I]);
2270 else
2271 ResultVals[I] = emitSqrtIEEE2ULP(Builder, SrcVals[I], SqrtFMF);
2272 }
2273
2274 Value *NewSqrt = insertValues(Builder, Sqrt.getType(), ResultVals);
2275 NewSqrt->takeName(&Sqrt);
2276 Sqrt.replaceAllUsesWith(NewSqrt);
2277 Sqrt.eraseFromParent();
2278 return true;
2279}
2280
2281bool AMDGPUCodeGenPrepare::runOnFunction(Function &F) {
2282 if (skipFunction(F))
2283 return false;
2284
2285 auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();
2286 if (!TPC)
2287 return false;
2288
2289 const AMDGPUTargetMachine &TM = TPC->getTM<AMDGPUTargetMachine>();
2290 const TargetLibraryInfo *TLI =
2291 &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F);
2292 AssumptionCache *AC =
2293 &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
2294 auto *DTWP = getAnalysisIfAvailable<DominatorTreeWrapperPass>();
2295 const DominatorTree *DT = DTWP ? &DTWP->getDomTree() : nullptr;
2296 const UniformityInfo &UA =
2297 getAnalysis<UniformityInfoWrapperPass>().getUniformityInfo();
2298 return AMDGPUCodeGenPrepareImpl(F, TM, TLI, AC, DT, UA).run();
2299}
2300
2301 PreservedAnalyses AMDGPUCodeGenPreparePass::run(Function &F,
2302 FunctionAnalysisManager &FAM) {
2303 const AMDGPUTargetMachine &ATM = static_cast<const AMDGPUTargetMachine &>(TM);
2304 const TargetLibraryInfo *TLI = &FAM.getResult<TargetLibraryAnalysis>(F);
2305 AssumptionCache *AC = &FAM.getResult<AssumptionAnalysis>(F);
2306 const DominatorTree *DT = FAM.getCachedResult<DominatorTreeAnalysis>(F);
2307 const UniformityInfo &UA = FAM.getResult<UniformityInfoAnalysis>(F);
2308 AMDGPUCodeGenPrepareImpl Impl(F, ATM, TLI, AC, DT, UA);
2309 if (!Impl.run())
2310 return PreservedAnalyses::all();
2311 PreservedAnalyses PA = PreservedAnalyses::none();
2312 if (!Impl.FlowChanged)
2313 PA.preserveSet<CFGAnalyses>();
2314 return PA;
2315}
2316
2317INITIALIZE_PASS_BEGIN(AMDGPUCodeGenPrepare, DEBUG_TYPE,
2318 "AMDGPU IR optimizations", false, false)
2319 INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
2320 INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
2321 INITIALIZE_PASS_DEPENDENCY(UniformityInfoWrapperPass)
2322 INITIALIZE_PASS_END(AMDGPUCodeGenPrepare, DEBUG_TYPE, "AMDGPU IR optimizations",
2323 false, false)
2324
2325char AMDGPUCodeGenPrepare::ID = 0;
2326
2327 FunctionPass *llvm::createAMDGPUCodeGenPreparePass() {
2328 return new AMDGPUCodeGenPrepare();
2329}
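// A minimal usage sketch (an assumption, not part of this file): the AMDGPU
// pass configuration is expected to schedule this pass while building the IR
// codegen pipeline, roughly:
//
//   // Inside the AMDGPU pass config (hypothetical placement):
//   addPass(createAMDGPUCodeGenPreparePass());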