LLVM 20.0.0git
AMDGPUCodeGenPrepare.cpp
1//===-- AMDGPUCodeGenPrepare.cpp ------------------------------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file
10/// This pass does misc. AMDGPU optimizations on IR before instruction
11/// selection.
12//
13//===----------------------------------------------------------------------===//
14
15#include "AMDGPU.h"
16#include "AMDGPUTargetMachine.h"
24#include "llvm/IR/Dominators.h"
25#include "llvm/IR/IRBuilder.h"
26#include "llvm/IR/InstVisitor.h"
27#include "llvm/IR/IntrinsicsAMDGPU.h"
30#include "llvm/Pass.h"
34
35#define DEBUG_TYPE "amdgpu-codegenprepare"
36
37using namespace llvm;
38using namespace llvm::PatternMatch;
39
40namespace {
41
42static cl::opt<bool> WidenLoads(
43 "amdgpu-codegenprepare-widen-constant-loads",
44 cl::desc("Widen sub-dword constant address space loads in AMDGPUCodeGenPrepare"),
46 cl::init(false));
47
48static cl::opt<bool> Widen16BitOps(
49 "amdgpu-codegenprepare-widen-16-bit-ops",
50 cl::desc("Widen uniform 16-bit instructions to 32-bit in AMDGPUCodeGenPrepare"),
52 cl::init(true));
53
54static cl::opt<bool>
55 BreakLargePHIs("amdgpu-codegenprepare-break-large-phis",
56 cl::desc("Break large PHI nodes for DAGISel"),
57 cl::ReallyHidden, cl::init(true));
58
59static cl::opt<bool>
60 ForceBreakLargePHIs("amdgpu-codegenprepare-force-break-large-phis",
61 cl::desc("For testing purposes, always break large "
62 "PHIs even if it isn't profitable."),
63 cl::ReallyHidden, cl::init(false));
64
65static cl::opt<unsigned> BreakLargePHIsThreshold(
66 "amdgpu-codegenprepare-break-large-phis-threshold",
67 cl::desc("Minimum type size in bits for breaking large PHI nodes"),
68 cl::ReallyHidden, cl::init(32));
69
70static cl::opt<bool> UseMul24Intrin(
71 "amdgpu-codegenprepare-mul24",
72 cl::desc("Introduce mul24 intrinsics in AMDGPUCodeGenPrepare"),
74 cl::init(true));
75
76// Legalize 64-bit division by using the generic IR expansion.
77static cl::opt<bool> ExpandDiv64InIR(
78 "amdgpu-codegenprepare-expand-div64",
79 cl::desc("Expand 64-bit division in AMDGPUCodeGenPrepare"),
81 cl::init(false));
82
83// Leave all division operations as they are. This supersedes ExpandDiv64InIR
84// and is used for testing the legalizer.
85static cl::opt<bool> DisableIDivExpand(
86 "amdgpu-codegenprepare-disable-idiv-expansion",
87 cl::desc("Prevent expanding integer division in AMDGPUCodeGenPrepare"),
89 cl::init(false));
90
91// Disable processing of fdiv so we can better test the backend implementations.
92static cl::opt<bool> DisableFDivExpand(
93 "amdgpu-codegenprepare-disable-fdiv-expansion",
94 cl::desc("Prevent expanding floating point division in AMDGPUCodeGenPrepare"),
96 cl::init(false));
97
98static bool hasUnsafeFPMath(const Function &F) {
99 return F.getFnAttribute("unsafe-fp-math").getValueAsBool();
100}
101
102class AMDGPUCodeGenPrepareImpl
103 : public InstVisitor<AMDGPUCodeGenPrepareImpl, bool> {
104public:
105 Function &F;
106 const GCNSubtarget &ST;
107 const AMDGPUTargetMachine &TM;
108 const TargetLibraryInfo *TLI;
109 AssumptionCache *AC;
110 const DominatorTree *DT;
111 const UniformityInfo &UA;
112 const DataLayout &DL;
113 const bool HasUnsafeFPMath;
114 const bool HasFP32DenormalFlush;
115 bool FlowChanged = false;
116 mutable Function *SqrtF32 = nullptr;
117 mutable Function *LdexpF32 = nullptr;
118
119 DenseMap<const PHINode *, bool> BreakPhiNodesCache;
120
121 AMDGPUCodeGenPrepareImpl(Function &F, const AMDGPUTargetMachine &TM,
122 const TargetLibraryInfo *TLI, AssumptionCache *AC,
123 const DominatorTree *DT, const UniformityInfo &UA)
124 : F(F), ST(TM.getSubtarget<GCNSubtarget>(F)), TM(TM), TLI(TLI), AC(AC),
125 DT(DT), UA(UA), DL(F.getDataLayout()),
126 HasUnsafeFPMath(hasUnsafeFPMath(F)),
127 HasFP32DenormalFlush(SIModeRegisterDefaults(F, ST).FP32Denormals ==
128 DenormalMode::getPreserveSign()) {}
129
130 Function *getSqrtF32() const {
131 if (SqrtF32)
132 return SqrtF32;
133
134 LLVMContext &Ctx = F.getContext();
135 SqrtF32 = Intrinsic::getOrInsertDeclaration(
136 F.getParent(), Intrinsic::amdgcn_sqrt, {Type::getFloatTy(Ctx)});
137 return SqrtF32;
138 }
139
140 Function *getLdexpF32() const {
141 if (LdexpF32)
142 return LdexpF32;
143
144 LLVMContext &Ctx = F.getContext();
145 LdexpF32 = Intrinsic::getOrInsertDeclaration(
146 F.getParent(), Intrinsic::ldexp,
147 {Type::getFloatTy(Ctx), Type::getInt32Ty(Ctx)});
148 return LdexpF32;
149 }
150
151 bool canBreakPHINode(const PHINode &I);
152
157 /// \returns \p T's base element bit width.
158 unsigned getBaseElementBitWidth(const Type *T) const;
159
160 /// \returns Equivalent 32 bit integer type for given type \p T. For example,
161 /// if \p T is i7, then i32 is returned; if \p T is <3 x i12>, then <3 x i32>
162 /// is returned.
163 Type *getI32Ty(IRBuilder<> &B, const Type *T) const;
164
165 /// \returns True if binary operation \p I is a signed binary operation, false
166 /// otherwise.
167 bool isSigned(const BinaryOperator &I) const;
168
169 /// \returns True if the condition of 'select' operation \p I comes from a
170 /// signed 'icmp' operation, false otherwise.
171 bool isSigned(const SelectInst &I) const;
172
173 /// \returns True if type \p T needs to be promoted to 32 bit integer type,
174 /// false otherwise.
175 bool needsPromotionToI32(const Type *T) const;
176
177 /// Return true if \p T is a legal scalar floating point type.
178 bool isLegalFloatingTy(const Type *T) const;
179
180 /// Wrapper to pass all the arguments to computeKnownFPClass
181 KnownFPClass computeKnownFPClass(const Value *V, FPClassTest Interested,
182 const Instruction *CtxI) const {
183 return llvm::computeKnownFPClass(V, DL, Interested, 0, TLI, AC, CtxI, DT);
184 }
185
186 bool canIgnoreDenormalInput(const Value *V, const Instruction *CtxI) const {
187 return HasFP32DenormalFlush ||
188 computeKnownFPClass(V, fcSubnormal, CtxI).isKnownNeverSubnormal();
189 }
190
191 /// Promotes uniform binary operation \p I to equivalent 32 bit binary
192 /// operation.
193 ///
194 /// \details \p I's base element bit width must be greater than 1 and less
195 /// than or equal to 16. Promotion is done by sign or zero extending operands to
196 /// 32 bits, replacing \p I with equivalent 32 bit binary operation, and
197 /// truncating the result of 32 bit binary operation back to \p I's original
198 /// type. Division operation is not promoted.
199 ///
200 /// \returns True if \p I is promoted to equivalent 32 bit binary operation,
201 /// false otherwise.
202 bool promoteUniformOpToI32(BinaryOperator &I) const;
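 // Illustrative sketch (not from the original source): for a uniform i16 add,
 // assuming 16-bit ops are being widened, the rewrite produces roughly:
 //
 //   %a.ext = zext i16 %a to i32
 //   %b.ext = zext i16 %b to i32
 //   %r.ext = add nuw nsw i32 %a.ext, %b.ext
 //   %r     = trunc i32 %r.ext to i16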
203
204 /// Promotes uniform 'icmp' operation \p I to 32 bit 'icmp' operation.
205 ///
206 /// \details \p I's base element bit width must be greater than 1 and less
207 /// than or equal to 16. Promotion is done by sign or zero extending operands to
208 /// 32 bits, and replacing \p I with 32 bit 'icmp' operation.
209 ///
210 /// \returns True.
211 bool promoteUniformOpToI32(ICmpInst &I) const;
212
213 /// Promotes uniform 'select' operation \p I to 32 bit 'select'
214 /// operation.
215 ///
216 /// \details \p I's base element bit width must be greater than 1 and less
217 /// than or equal to 16. Promotion is done by sign or zero extending operands to
218 /// 32 bits, replacing \p I with 32 bit 'select' operation, and truncating the
219 /// result of 32 bit 'select' operation back to \p I's original type.
220 ///
221 /// \returns True.
222 bool promoteUniformOpToI32(SelectInst &I) const;
223
224 /// Promotes uniform 'bitreverse' intrinsic \p I to 32 bit 'bitreverse'
225 /// intrinsic.
226 ///
227 /// \details \p I's base element bit width must be greater than 1 and less
228 /// than or equal to 16. Promotion is done by zero extending the operand to 32
229 /// bits, replacing \p I with 32 bit 'bitreverse' intrinsic, shifting the
230 /// result of 32 bit 'bitreverse' intrinsic to the right with zero fill (the
231 /// shift amount is 32 minus \p I's base element bit width), and truncating
232 /// the result of the shift operation back to \p I's original type.
233 ///
234 /// \returns True.
235 bool promoteUniformBitreverseToI32(IntrinsicInst &I) const;
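 // Illustrative sketch (not from the original source): for a uniform i16
 // bitreverse, the promotion described above yields roughly:
 //
 //   %x.ext = zext i16 %x to i32
 //   %rev32 = call i32 @llvm.bitreverse.i32(i32 %x.ext)
 //   %shr   = lshr i32 %rev32, 16        ; 32 - bit width of i16
 //   %rev   = trunc i32 %shr to i16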
236
237 /// \returns The minimum number of bits needed to store the value of \p Op as an
238 /// unsigned integer. Truncating to this size and then zero-extending to
239 /// the original will not change the value.
240 unsigned numBitsUnsigned(Value *Op) const;
241
242 /// \returns The minimum number of bits needed to store the value of \p Op as a
243 /// signed integer. Truncating to this size and then sign-extending to
244 /// the original size will not change the value.
245 unsigned numBitsSigned(Value *Op) const;
246
247 /// Replace mul instructions with llvm.amdgcn.mul.u24 or llvm.amdgcn.mul.s24.
248 /// SelectionDAG has an issue where an 'and' asserting the bits are known may not be enough to recover the narrow multiply, so do the replacement here on IR.
249 bool replaceMulWithMul24(BinaryOperator &I) const;
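 // Illustrative sketch (not from the original source): a divergent i32 mul
 // whose operands are known to fit in 24 bits (e.g. after masking) becomes a
 // call to the 24-bit multiply intrinsic (result-type mangling omitted):
 //
 //   %a24 = and i32 %a, 16777215
 //   %b24 = and i32 %b, 16777215
 //   %mul = call i32 @llvm.amdgcn.mul.u24(i32 %a24, i32 %b24)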
250
251 /// Perform the same fold as the equivalently named function in DAGCombiner. Since
252 /// we expand some divisions here, this must run before the expansion obscures the select.
253 bool foldBinOpIntoSelect(BinaryOperator &I) const;
254
255 bool divHasSpecialOptimization(BinaryOperator &I,
256 Value *Num, Value *Den) const;
257 unsigned getDivNumBits(BinaryOperator &I, Value *Num, Value *Den,
258 unsigned MaxDivBits, bool Signed) const;
259
260 /// Expands 24 bit div or rem.
261 Value* expandDivRem24(IRBuilder<> &Builder, BinaryOperator &I,
262 Value *Num, Value *Den,
263 bool IsDiv, bool IsSigned) const;
264
265 Value *expandDivRem24Impl(IRBuilder<> &Builder, BinaryOperator &I,
266 Value *Num, Value *Den, unsigned NumBits,
267 bool IsDiv, bool IsSigned) const;
268
269 /// Expands 32 bit div or rem.
270 Value* expandDivRem32(IRBuilder<> &Builder, BinaryOperator &I,
271 Value *Num, Value *Den) const;
272
273 Value *shrinkDivRem64(IRBuilder<> &Builder, BinaryOperator &I,
274 Value *Num, Value *Den) const;
275 void expandDivRem64(BinaryOperator &I) const;
276
277 /// Check whether a scalar load can be widened.
278 ///
279 /// \details Widen a uniform, sub-dword load from constant memory to a full
280 /// 32 bits and then truncate the result, so that a scalar load can be used
281 /// instead of a vector load.
282 ///
283 /// \returns True if the load can be widened.
284
285 bool canWidenScalarExtLoad(LoadInst &I) const;
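 // Illustrative sketch (not from the original source): a uniform, 4-byte
 // aligned sub-dword load from constant memory such as
 //
 //   %v = load i8, ptr addrspace(4) %p, align 4
 //
 // is widened by visitLoadInst into roughly
 //
 //   %w = load i32, ptr addrspace(4) %p, align 4
 //   %v = trunc i32 %w to i8
 //
 // so it can be selected as a scalar load.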
286
287 Value *matchFractPat(IntrinsicInst &I);
288 Value *applyFractPat(IRBuilder<> &Builder, Value *FractArg);
289
290 bool canOptimizeWithRsq(const FPMathOperator *SqrtOp, FastMathFlags DivFMF,
291 FastMathFlags SqrtFMF) const;
292
293 Value *optimizeWithRsq(IRBuilder<> &Builder, Value *Num, Value *Den,
294 FastMathFlags DivFMF, FastMathFlags SqrtFMF,
295 const Instruction *CtxI) const;
296
297 Value *optimizeWithRcp(IRBuilder<> &Builder, Value *Num, Value *Den,
298 FastMathFlags FMF, const Instruction *CtxI) const;
299 Value *optimizeWithFDivFast(IRBuilder<> &Builder, Value *Num, Value *Den,
300 float ReqdAccuracy) const;
301
302 Value *visitFDivElement(IRBuilder<> &Builder, Value *Num, Value *Den,
303 FastMathFlags DivFMF, FastMathFlags SqrtFMF,
304 Value *RsqOp, const Instruction *FDiv,
305 float ReqdAccuracy) const;
306
307 std::pair<Value *, Value *> getFrexpResults(IRBuilder<> &Builder,
308 Value *Src) const;
309
310 Value *emitRcpIEEE1ULP(IRBuilder<> &Builder, Value *Src,
311 bool IsNegative) const;
312 Value *emitFrexpDiv(IRBuilder<> &Builder, Value *LHS, Value *RHS,
313 FastMathFlags FMF) const;
314 Value *emitSqrtIEEE2ULP(IRBuilder<> &Builder, Value *Src,
315 FastMathFlags FMF) const;
316
317public:
318 bool visitFDiv(BinaryOperator &I);
319
320 bool visitInstruction(Instruction &I) { return false; }
321 bool visitBinaryOperator(BinaryOperator &I);
322 bool visitLoadInst(LoadInst &I);
323 bool visitICmpInst(ICmpInst &I);
324 bool visitSelectInst(SelectInst &I);
325 bool visitPHINode(PHINode &I);
327
328 bool visitIntrinsicInst(IntrinsicInst &I);
329 bool visitBitreverseIntrinsicInst(IntrinsicInst &I);
330 bool visitMinNum(IntrinsicInst &I);
331 bool visitSqrt(IntrinsicInst &I);
332 bool run();
333};
334
335class AMDGPUCodeGenPrepare : public FunctionPass {
336public:
337 static char ID;
338 AMDGPUCodeGenPrepare() : FunctionPass(ID) {
339 initializeAMDGPUCodeGenPreparePass(*PassRegistry::getPassRegistry());
340 }
341 void getAnalysisUsage(AnalysisUsage &AU) const override {
342 AU.addRequired<AssumptionCacheTracker>();
343 AU.addRequired<UniformityInfoWrapperPass>();
344 AU.addRequired<TargetLibraryInfoWrapperPass>();
345
346 // FIXME: Division expansion needs to preserve the dominator tree.
347 if (!ExpandDiv64InIR)
348 AU.setPreservesAll();
349 }
350 bool runOnFunction(Function &F) override;
351 StringRef getPassName() const override { return "AMDGPU IR optimizations"; }
352};
353
354} // end anonymous namespace
355
356bool AMDGPUCodeGenPrepareImpl::run() {
357 BreakPhiNodesCache.clear();
358 bool MadeChange = false;
359
360 Function::iterator NextBB;
361 for (Function::iterator FI = F.begin(), FE = F.end(); FI != FE; FI = NextBB) {
362 BasicBlock *BB = &*FI;
363 NextBB = std::next(FI);
364
365 BasicBlock::iterator Next;
366 for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;
367 I = Next) {
368 Next = std::next(I);
369
370 MadeChange |= visit(*I);
371
372 if (Next != E) { // Control flow changed
373 BasicBlock *NextInstBB = Next->getParent();
374 if (NextInstBB != BB) {
375 BB = NextInstBB;
376 E = BB->end();
377 FE = F.end();
378 }
379 }
380 }
381 }
382 return MadeChange;
383}
384
385unsigned AMDGPUCodeGenPrepareImpl::getBaseElementBitWidth(const Type *T) const {
386 assert(needsPromotionToI32(T) && "T does not need promotion to i32");
387
388 if (T->isIntegerTy())
389 return T->getIntegerBitWidth();
390 return cast<VectorType>(T)->getElementType()->getIntegerBitWidth();
391}
392
393Type *AMDGPUCodeGenPrepareImpl::getI32Ty(IRBuilder<> &B, const Type *T) const {
394 assert(needsPromotionToI32(T) && "T does not need promotion to i32");
395
396 if (T->isIntegerTy())
397 return B.getInt32Ty();
398 return FixedVectorType::get(B.getInt32Ty(), cast<FixedVectorType>(T));
399}
400
401bool AMDGPUCodeGenPrepareImpl::isSigned(const BinaryOperator &I) const {
402 return I.getOpcode() == Instruction::AShr ||
403 I.getOpcode() == Instruction::SDiv || I.getOpcode() == Instruction::SRem;
404}
405
406bool AMDGPUCodeGenPrepareImpl::isSigned(const SelectInst &I) const {
407 return isa<ICmpInst>(I.getOperand(0)) ?
408 cast<ICmpInst>(I.getOperand(0))->isSigned() : false;
409}
410
411bool AMDGPUCodeGenPrepareImpl::needsPromotionToI32(const Type *T) const {
412 if (!Widen16BitOps)
413 return false;
414
415 const IntegerType *IntTy = dyn_cast<IntegerType>(T);
416 if (IntTy && IntTy->getBitWidth() > 1 && IntTy->getBitWidth() <= 16)
417 return true;
418
419 if (const VectorType *VT = dyn_cast<VectorType>(T)) {
420 // TODO: The set of packed operations is more limited, so may want to
421 // promote some anyway.
422 if (ST.hasVOP3PInsts())
423 return false;
424
425 return needsPromotionToI32(VT->getElementType());
426 }
427
428 return false;
429}
430
431bool AMDGPUCodeGenPrepareImpl::isLegalFloatingTy(const Type *Ty) const {
432 return Ty->isFloatTy() || Ty->isDoubleTy() ||
433 (Ty->isHalfTy() && ST.has16BitInsts());
434}
435
436// Return true if the op promoted to i32 should have nsw set.
437static bool promotedOpIsNSW(const Instruction &I) {
438 switch (I.getOpcode()) {
439 case Instruction::Shl:
440 case Instruction::Add:
441 case Instruction::Sub:
442 return true;
443 case Instruction::Mul:
444 return I.hasNoUnsignedWrap();
445 default:
446 return false;
447 }
448}
449
450// Return true if the op promoted to i32 should have nuw set.
451static bool promotedOpIsNUW(const Instruction &I) {
452 switch (I.getOpcode()) {
453 case Instruction::Shl:
454 case Instruction::Add:
455 case Instruction::Mul:
456 return true;
457 case Instruction::Sub:
458 return I.hasNoUnsignedWrap();
459 default:
460 return false;
461 }
462}
463
464bool AMDGPUCodeGenPrepareImpl::canWidenScalarExtLoad(LoadInst &I) const {
465 Type *Ty = I.getType();
466 int TySize = DL.getTypeSizeInBits(Ty);
467 Align Alignment = DL.getValueOrABITypeAlignment(I.getAlign(), Ty);
468
469 return I.isSimple() && TySize < 32 && Alignment >= 4 && UA.isUniform(&I);
470}
471
472bool AMDGPUCodeGenPrepareImpl::promoteUniformOpToI32(BinaryOperator &I) const {
473 assert(needsPromotionToI32(I.getType()) &&
474 "I does not need promotion to i32");
475
476 if (I.getOpcode() == Instruction::SDiv ||
477 I.getOpcode() == Instruction::UDiv ||
478 I.getOpcode() == Instruction::SRem ||
479 I.getOpcode() == Instruction::URem)
480 return false;
481
482 IRBuilder<> Builder(&I);
483 Builder.SetCurrentDebugLocation(I.getDebugLoc());
484
485 Type *I32Ty = getI32Ty(Builder, I.getType());
486 Value *ExtOp0 = nullptr;
487 Value *ExtOp1 = nullptr;
488 Value *ExtRes = nullptr;
489 Value *TruncRes = nullptr;
490
491 if (isSigned(I)) {
492 ExtOp0 = Builder.CreateSExt(I.getOperand(0), I32Ty);
493 ExtOp1 = Builder.CreateSExt(I.getOperand(1), I32Ty);
494 } else {
495 ExtOp0 = Builder.CreateZExt(I.getOperand(0), I32Ty);
496 ExtOp1 = Builder.CreateZExt(I.getOperand(1), I32Ty);
497 }
498
499 ExtRes = Builder.CreateBinOp(I.getOpcode(), ExtOp0, ExtOp1);
500 if (Instruction *Inst = dyn_cast<Instruction>(ExtRes)) {
501 if (promotedOpIsNSW(cast<Instruction>(I)))
502 Inst->setHasNoSignedWrap();
503
504 if (promotedOpIsNUW(cast<Instruction>(I)))
505 Inst->setHasNoUnsignedWrap();
506
507 if (const auto *ExactOp = dyn_cast<PossiblyExactOperator>(&I))
508 Inst->setIsExact(ExactOp->isExact());
509 }
510
511 TruncRes = Builder.CreateTrunc(ExtRes, I.getType());
512
513 I.replaceAllUsesWith(TruncRes);
514 I.eraseFromParent();
515
516 return true;
517}
518
519bool AMDGPUCodeGenPrepareImpl::promoteUniformOpToI32(ICmpInst &I) const {
520 assert(needsPromotionToI32(I.getOperand(0)->getType()) &&
521 "I does not need promotion to i32");
522
523 IRBuilder<> Builder(&I);
524 Builder.SetCurrentDebugLocation(I.getDebugLoc());
525
526 Type *I32Ty = getI32Ty(Builder, I.getOperand(0)->getType());
527 Value *ExtOp0 = nullptr;
528 Value *ExtOp1 = nullptr;
529 Value *NewICmp = nullptr;
530
531 if (I.isSigned()) {
532 ExtOp0 = Builder.CreateSExt(I.getOperand(0), I32Ty);
533 ExtOp1 = Builder.CreateSExt(I.getOperand(1), I32Ty);
534 } else {
535 ExtOp0 = Builder.CreateZExt(I.getOperand(0), I32Ty);
536 ExtOp1 = Builder.CreateZExt(I.getOperand(1), I32Ty);
537 }
538 NewICmp = Builder.CreateICmp(I.getPredicate(), ExtOp0, ExtOp1);
539
540 I.replaceAllUsesWith(NewICmp);
541 I.eraseFromParent();
542
543 return true;
544}
545
546bool AMDGPUCodeGenPrepareImpl::promoteUniformOpToI32(SelectInst &I) const {
547 assert(needsPromotionToI32(I.getType()) &&
548 "I does not need promotion to i32");
549
550 IRBuilder<> Builder(&I);
551 Builder.SetCurrentDebugLocation(I.getDebugLoc());
552
553 Type *I32Ty = getI32Ty(Builder, I.getType());
554 Value *ExtOp1 = nullptr;
555 Value *ExtOp2 = nullptr;
556 Value *ExtRes = nullptr;
557 Value *TruncRes = nullptr;
558
559 if (isSigned(I)) {
560 ExtOp1 = Builder.CreateSExt(I.getOperand(1), I32Ty);
561 ExtOp2 = Builder.CreateSExt(I.getOperand(2), I32Ty);
562 } else {
563 ExtOp1 = Builder.CreateZExt(I.getOperand(1), I32Ty);
564 ExtOp2 = Builder.CreateZExt(I.getOperand(2), I32Ty);
565 }
566 ExtRes = Builder.CreateSelect(I.getOperand(0), ExtOp1, ExtOp2);
567 TruncRes = Builder.CreateTrunc(ExtRes, I.getType());
568
569 I.replaceAllUsesWith(TruncRes);
570 I.eraseFromParent();
571
572 return true;
573}
574
575bool AMDGPUCodeGenPrepareImpl::promoteUniformBitreverseToI32(
576 IntrinsicInst &I) const {
577 assert(I.getIntrinsicID() == Intrinsic::bitreverse &&
578 "I must be bitreverse intrinsic");
579 assert(needsPromotionToI32(I.getType()) &&
580 "I does not need promotion to i32");
581
582 IRBuilder<> Builder(&I);
583 Builder.SetCurrentDebugLocation(I.getDebugLoc());
584
585 Type *I32Ty = getI32Ty(Builder, I.getType());
586 Value *ExtOp = Builder.CreateZExt(I.getOperand(0), I32Ty);
587 Value *ExtRes =
588 Builder.CreateIntrinsic(Intrinsic::bitreverse, {I32Ty}, {ExtOp});
589 Value *LShrOp =
590 Builder.CreateLShr(ExtRes, 32 - getBaseElementBitWidth(I.getType()));
591 Value *TruncRes =
592 Builder.CreateTrunc(LShrOp, I.getType());
593
594 I.replaceAllUsesWith(TruncRes);
595 I.eraseFromParent();
596
597 return true;
598}
599
600unsigned AMDGPUCodeGenPrepareImpl::numBitsUnsigned(Value *Op) const {
601 return computeKnownBits(Op, DL, 0, AC).countMaxActiveBits();
602}
603
604unsigned AMDGPUCodeGenPrepareImpl::numBitsSigned(Value *Op) const {
605 return ComputeMaxSignificantBits(Op, DL, 0, AC);
606}
607
608static void extractValues(IRBuilder<> &Builder,
609 SmallVectorImpl<Value *> &Values, Value *V) {
610 auto *VT = dyn_cast<FixedVectorType>(V->getType());
611 if (!VT) {
612 Values.push_back(V);
613 return;
614 }
615
616 for (int I = 0, E = VT->getNumElements(); I != E; ++I)
617 Values.push_back(Builder.CreateExtractElement(V, I));
618}
619
620static Value *insertValues(IRBuilder<> &Builder,
621 Type *Ty,
622 SmallVectorImpl<Value *> &Values) {
623 if (!Ty->isVectorTy()) {
624 assert(Values.size() == 1);
625 return Values[0];
626 }
627
628 Value *NewVal = PoisonValue::get(Ty);
629 for (int I = 0, E = Values.size(); I != E; ++I)
630 NewVal = Builder.CreateInsertElement(NewVal, Values[I], I);
631
632 return NewVal;
633}
634
635bool AMDGPUCodeGenPrepareImpl::replaceMulWithMul24(BinaryOperator &I) const {
636 if (I.getOpcode() != Instruction::Mul)
637 return false;
638
639 Type *Ty = I.getType();
640 unsigned Size = Ty->getScalarSizeInBits();
641 if (Size <= 16 && ST.has16BitInsts())
642 return false;
643
644 // Prefer scalar if this could be s_mul_i32
645 if (UA.isUniform(&I))
646 return false;
647
648 Value *LHS = I.getOperand(0);
649 Value *RHS = I.getOperand(1);
650 IRBuilder<> Builder(&I);
651 Builder.SetCurrentDebugLocation(I.getDebugLoc());
652
653 unsigned LHSBits = 0, RHSBits = 0;
654 bool IsSigned = false;
655
656 if (ST.hasMulU24() && (LHSBits = numBitsUnsigned(LHS)) <= 24 &&
657 (RHSBits = numBitsUnsigned(RHS)) <= 24) {
658 IsSigned = false;
659
660 } else if (ST.hasMulI24() && (LHSBits = numBitsSigned(LHS)) <= 24 &&
661 (RHSBits = numBitsSigned(RHS)) <= 24) {
662 IsSigned = true;
663
664 } else
665 return false;
666
667 SmallVector<Value *, 4> LHSVals;
668 SmallVector<Value *, 4> RHSVals;
669 SmallVector<Value *, 4> ResultVals;
670 extractValues(Builder, LHSVals, LHS);
671 extractValues(Builder, RHSVals, RHS);
672
673 IntegerType *I32Ty = Builder.getInt32Ty();
674 IntegerType *IntrinTy = Size > 32 ? Builder.getInt64Ty() : I32Ty;
675 Type *DstTy = LHSVals[0]->getType();
676
677 for (int I = 0, E = LHSVals.size(); I != E; ++I) {
678 Value *LHS = IsSigned ? Builder.CreateSExtOrTrunc(LHSVals[I], I32Ty)
679 : Builder.CreateZExtOrTrunc(LHSVals[I], I32Ty);
680 Value *RHS = IsSigned ? Builder.CreateSExtOrTrunc(RHSVals[I], I32Ty)
681 : Builder.CreateZExtOrTrunc(RHSVals[I], I32Ty);
682 Intrinsic::ID ID =
683 IsSigned ? Intrinsic::amdgcn_mul_i24 : Intrinsic::amdgcn_mul_u24;
684 Value *Result = Builder.CreateIntrinsic(ID, {IntrinTy}, {LHS, RHS});
685 Result = IsSigned ? Builder.CreateSExtOrTrunc(Result, DstTy)
686 : Builder.CreateZExtOrTrunc(Result, DstTy);
687 ResultVals.push_back(Result);
688 }
689
690 Value *NewVal = insertValues(Builder, Ty, ResultVals);
691 NewVal->takeName(&I);
692 I.replaceAllUsesWith(NewVal);
693 I.eraseFromParent();
694
695 return true;
696}
697
698// Find a select instruction, which may have been cast. This is mostly to deal
699// with cases where i16 selects were promoted here to i32.
700static SelectInst *findSelectThroughCast(Value *V, CastInst *&Cast) {
701 Cast = nullptr;
702 if (SelectInst *Sel = dyn_cast<SelectInst>(V))
703 return Sel;
704
705 if ((Cast = dyn_cast<CastInst>(V))) {
706 if (SelectInst *Sel = dyn_cast<SelectInst>(Cast->getOperand(0)))
707 return Sel;
708 }
709
710 return nullptr;
711}
712
713bool AMDGPUCodeGenPrepareImpl::foldBinOpIntoSelect(BinaryOperator &BO) const {
714 // Don't do this unless the old select is going away. We want to eliminate the
715 // binary operator, not replace a binop with a select.
716 int SelOpNo = 0;
717
718 CastInst *CastOp;
719
720 // TODO: Should probably try to handle some cases with multiple
721 // users. Duplicating the select may be profitable for division.
722 SelectInst *Sel = findSelectThroughCast(BO.getOperand(0), CastOp);
723 if (!Sel || !Sel->hasOneUse()) {
724 SelOpNo = 1;
725 Sel = findSelectThroughCast(BO.getOperand(1), CastOp);
726 }
727
728 if (!Sel || !Sel->hasOneUse())
729 return false;
730
731 Constant *CT = dyn_cast<Constant>(Sel->getTrueValue());
732 Constant *CF = dyn_cast<Constant>(Sel->getFalseValue());
733 Constant *CBO = dyn_cast<Constant>(BO.getOperand(SelOpNo ^ 1));
734 if (!CBO || !CT || !CF)
735 return false;
736
737 if (CastOp) {
738 if (!CastOp->hasOneUse())
739 return false;
740 CT = ConstantFoldCastOperand(CastOp->getOpcode(), CT, BO.getType(), DL);
741 CF = ConstantFoldCastOperand(CastOp->getOpcode(), CF, BO.getType(), DL);
742 }
743
744 // TODO: Handle special 0/-1 cases DAG combine does, although we only really
745 // need to handle divisions here.
746 Constant *FoldedT =
747 SelOpNo ? ConstantFoldBinaryOpOperands(BO.getOpcode(), CBO, CT, DL)
748 : ConstantFoldBinaryOpOperands(BO.getOpcode(), CT, CBO, DL);
749 if (!FoldedT || isa<ConstantExpr>(FoldedT))
750 return false;
751
752 Constant *FoldedF =
753 SelOpNo ? ConstantFoldBinaryOpOperands(BO.getOpcode(), CBO, CF, DL)
754 : ConstantFoldBinaryOpOperands(BO.getOpcode(), CF, CBO, DL);
755 if (!FoldedF || isa<ConstantExpr>(FoldedF))
756 return false;
757
758 IRBuilder<> Builder(&BO);
759 Builder.SetCurrentDebugLocation(BO.getDebugLoc());
760 if (const FPMathOperator *FPOp = dyn_cast<const FPMathOperator>(&BO))
761 Builder.setFastMathFlags(FPOp->getFastMathFlags());
762
763 Value *NewSelect = Builder.CreateSelect(Sel->getCondition(),
764 FoldedT, FoldedF);
765 NewSelect->takeName(&BO);
766 BO.replaceAllUsesWith(NewSelect);
767 BO.eraseFromParent();
768 if (CastOp)
769 CastOp->eraseFromParent();
770 Sel->eraseFromParent();
771 return true;
772}
773
774std::pair<Value *, Value *>
775AMDGPUCodeGenPrepareImpl::getFrexpResults(IRBuilder<> &Builder,
776 Value *Src) const {
777 Type *Ty = Src->getType();
778 Value *Frexp = Builder.CreateIntrinsic(Intrinsic::frexp,
779 {Ty, Builder.getInt32Ty()}, Src);
780 Value *FrexpMant = Builder.CreateExtractValue(Frexp, {0});
781
782 // Bypass the bug workaround for the exponent result since it doesn't matter.
783 // TODO: Does the bug workaround even really need to consider the exponent
784 // result? It's unspecified by the spec.
785
786 Value *FrexpExp =
787 ST.hasFractBug()
788 ? Builder.CreateIntrinsic(Intrinsic::amdgcn_frexp_exp,
789 {Builder.getInt32Ty(), Ty}, Src)
790 : Builder.CreateExtractValue(Frexp, {1});
791 return {FrexpMant, FrexpExp};
792}
793
794/// Emit an expansion of 1.0 / Src good for 1ulp that supports denormals.
795Value *AMDGPUCodeGenPrepareImpl::emitRcpIEEE1ULP(IRBuilder<> &Builder,
796 Value *Src,
797 bool IsNegative) const {
798 // Same as for 1.0, but expand the sign out of the constant.
799 // -1.0 / x -> rcp (fneg x)
800 if (IsNegative)
801 Src = Builder.CreateFNeg(Src);
802
803 // The rcp instruction doesn't support denormals, so scale the input
804 // out of the denormal range and convert at the end.
805 //
806 // Expand as 2^-n * (1.0 / (x * 2^n))
807
808 // TODO: Skip scaling if input is known never denormal and the input
809 // range won't underflow to denormal. The hard part is knowing the
810 // result. We need a range check, the result could be denormal for
811 // 0x1p+126 < den <= 0x1p+127.
812 auto [FrexpMant, FrexpExp] = getFrexpResults(Builder, Src);
813 Value *ScaleFactor = Builder.CreateNeg(FrexpExp);
814 Value *Rcp = Builder.CreateUnaryIntrinsic(Intrinsic::amdgcn_rcp, FrexpMant);
815 return Builder.CreateCall(getLdexpF32(), {Rcp, ScaleFactor});
816}
817
818/// Emit a 2ulp expansion for fdiv by using frexp for input scaling.
819Value *AMDGPUCodeGenPrepareImpl::emitFrexpDiv(IRBuilder<> &Builder, Value *LHS,
820 Value *RHS,
821 FastMathFlags FMF) const {
822 // If we have to work around the fract/frexp bug, we're worse off than
823 // using the fdiv.fast expansion. The full safe expansion is faster if we have
824 // fast FMA.
825 if (HasFP32DenormalFlush && ST.hasFractBug() && !ST.hasFastFMAF32() &&
826 (!FMF.noNaNs() || !FMF.noInfs()))
827 return nullptr;
828
829 // We're scaling the LHS to avoid a denormal input, and scale the denominator
830 // to avoid large values underflowing the result.
831 auto [FrexpMantRHS, FrexpExpRHS] = getFrexpResults(Builder, RHS);
832
833 Value *Rcp =
834 Builder.CreateUnaryIntrinsic(Intrinsic::amdgcn_rcp, FrexpMantRHS);
835
836 auto [FrexpMantLHS, FrexpExpLHS] = getFrexpResults(Builder, LHS);
837 Value *Mul = Builder.CreateFMul(FrexpMantLHS, Rcp);
838
839 // We multiplied by 2^N/2^M, so we need to multiply by 2^(N-M) to scale the
840 // result.
841 Value *ExpDiff = Builder.CreateSub(FrexpExpLHS, FrexpExpRHS);
842 return Builder.CreateCall(getLdexpF32(), {Mul, ExpDiff});
843}
844
845/// Emit a sqrt that handles denormals and is accurate to 2ulp.
846Value *AMDGPUCodeGenPrepareImpl::emitSqrtIEEE2ULP(IRBuilder<> &Builder,
847 Value *Src,
848 FastMathFlags FMF) const {
849 Type *Ty = Src->getType();
850 APFloat SmallestNormal =
851 APFloat::getSmallestNormalized(Ty->getFltSemantics());
852 Value *NeedScale =
853 Builder.CreateFCmpOLT(Src, ConstantFP::get(Ty, SmallestNormal));
854
855 ConstantInt *Zero = Builder.getInt32(0);
856 Value *InputScaleFactor =
857 Builder.CreateSelect(NeedScale, Builder.getInt32(32), Zero);
858
859 Value *Scaled = Builder.CreateCall(getLdexpF32(), {Src, InputScaleFactor});
860
861 Value *Sqrt = Builder.CreateCall(getSqrtF32(), Scaled);
862
863 Value *OutputScaleFactor =
864 Builder.CreateSelect(NeedScale, Builder.getInt32(-16), Zero);
865 return Builder.CreateCall(getLdexpF32(), {Sqrt, OutputScaleFactor});
866}
867
868/// Emit an expansion of 1.0 / sqrt(Src) good for 1ulp that supports denormals.
869static Value *emitRsqIEEE1ULP(IRBuilder<> &Builder, Value *Src,
870 bool IsNegative) {
871 // bool need_scale = x < 0x1p-126f;
872 // float input_scale = need_scale ? 0x1.0p+24f : 1.0f;
873 // float output_scale = need_scale ? 0x1.0p+12f : 1.0f;
874 // rsq(x * input_scale) * output_scale;
875
876 Type *Ty = Src->getType();
877 APFloat SmallestNormal =
878 APFloat::getSmallestNormalized(Ty->getFltSemantics());
879 Value *NeedScale =
880 Builder.CreateFCmpOLT(Src, ConstantFP::get(Ty, SmallestNormal));
881 Constant *One = ConstantFP::get(Ty, 1.0);
882 Constant *InputScale = ConstantFP::get(Ty, 0x1.0p+24);
883 Constant *OutputScale =
884 ConstantFP::get(Ty, IsNegative ? -0x1.0p+12 : 0x1.0p+12);
885
886 Value *InputScaleFactor = Builder.CreateSelect(NeedScale, InputScale, One);
887
888 Value *ScaledInput = Builder.CreateFMul(Src, InputScaleFactor);
889 Value *Rsq = Builder.CreateUnaryIntrinsic(Intrinsic::amdgcn_rsq, ScaledInput);
890 Value *OutputScaleFactor = Builder.CreateSelect(
891 NeedScale, OutputScale, IsNegative ? ConstantFP::get(Ty, -1.0) : One);
892
893 return Builder.CreateFMul(Rsq, OutputScaleFactor);
894}
895
896bool AMDGPUCodeGenPrepareImpl::canOptimizeWithRsq(const FPMathOperator *SqrtOp,
897 FastMathFlags DivFMF,
898 FastMathFlags SqrtFMF) const {
899 // The rsqrt contraction increases accuracy from ~2ulp to ~1ulp.
900 if (!DivFMF.allowContract() || !SqrtFMF.allowContract())
901 return false;
902
903 // v_rsq_f32 gives 1ulp
904 return SqrtFMF.approxFunc() || HasUnsafeFPMath ||
905 SqrtOp->getFPAccuracy() >= 1.0f;
906}
907
908Value *AMDGPUCodeGenPrepareImpl::optimizeWithRsq(
909 IRBuilder<> &Builder, Value *Num, Value *Den, const FastMathFlags DivFMF,
910 const FastMathFlags SqrtFMF, const Instruction *CtxI) const {
911 // The rsqrt contraction increases accuracy from ~2ulp to ~1ulp.
912 assert(DivFMF.allowContract() && SqrtFMF.allowContract());
913
914 // rsq_f16 is accurate to 0.51 ulp.
915 // rsq_f32 is accurate for !fpmath >= 1.0ulp and denormals are flushed.
916 // rsq_f64 is never accurate.
917 const ConstantFP *CLHS = dyn_cast<ConstantFP>(Num);
918 if (!CLHS)
919 return nullptr;
920
921 assert(Den->getType()->isFloatTy());
922
923 bool IsNegative = false;
924
925 // TODO: Handle other numerator values with arcp.
926 if (CLHS->isExactlyValue(1.0) || (IsNegative = CLHS->isExactlyValue(-1.0))) {
927 // Add in the sqrt flags.
928 IRBuilder<>::FastMathFlagGuard Guard(Builder);
929 Builder.setFastMathFlags(DivFMF | SqrtFMF);
930
931 if ((DivFMF.approxFunc() && SqrtFMF.approxFunc()) || HasUnsafeFPMath ||
932 canIgnoreDenormalInput(Den, CtxI)) {
933 Value *Result = Builder.CreateUnaryIntrinsic(Intrinsic::amdgcn_rsq, Den);
934 // -1.0 / sqrt(x) -> fneg(rsq(x))
935 return IsNegative ? Builder.CreateFNeg(Result) : Result;
936 }
937
938 return emitRsqIEEE1ULP(Builder, Den, IsNegative);
939 }
940
941 return nullptr;
942}
943
944// Optimize fdiv with rcp:
945//
946// 1/x -> rcp(x) when rcp is sufficiently accurate or inaccurate rcp is
947// allowed with unsafe-fp-math or afn.
948//
949// a/b -> a*rcp(b) when arcp is allowed, and we only need to provide 1.0 ULP of accuracy.
950Value *
951AMDGPUCodeGenPrepareImpl::optimizeWithRcp(IRBuilder<> &Builder, Value *Num,
952 Value *Den, FastMathFlags FMF,
953 const Instruction *CtxI) const {
954 // rcp_f16 is accurate to 0.51 ulp.
955 // rcp_f32 is accurate for !fpmath >= 1.0ulp and denormals are flushed.
956 // rcp_f64 is never accurate.
957 assert(Den->getType()->isFloatTy());
958
959 if (const ConstantFP *CLHS = dyn_cast<ConstantFP>(Num)) {
960 bool IsNegative = false;
961 if (CLHS->isExactlyValue(1.0) ||
962 (IsNegative = CLHS->isExactlyValue(-1.0))) {
963 Value *Src = Den;
964
965 if (HasFP32DenormalFlush || FMF.approxFunc()) {
966 // -1.0 / x -> 1.0 / fneg(x)
967 if (IsNegative)
968 Src = Builder.CreateFNeg(Src);
969
970 // v_rcp_f32 and v_rsq_f32 do not support denormals, and according to
971 // the CI documentation have a worst case error of 1 ulp.
972 // OpenCL requires <= 2.5 ulp for 1.0 / x, so it should always be OK
973 // to use it as long as we aren't trying to use denormals.
974 //
975 // v_rcp_f16 and v_rsq_f16 DO support denormals.
976
977 // NOTE: v_sqrt and v_rcp will be combined to v_rsq later. So we don't
978 // insert rsq intrinsic here.
979
980 // 1.0 / x -> rcp(x)
981 return Builder.CreateUnaryIntrinsic(Intrinsic::amdgcn_rcp, Src);
982 }
983
984 // TODO: If the input isn't denormal, and we know the input exponent isn't
985 // big enough to introduce a denormal we can avoid the scaling.
986 return emitRcpIEEE1ULP(Builder, Src, IsNegative);
987 }
988 }
989
990 if (FMF.allowReciprocal()) {
991 // x / y -> x * (1.0 / y)
992
993 // TODO: Could avoid denormal scaling and use raw rcp if we knew the output
994 // will never underflow.
995 if (HasFP32DenormalFlush || FMF.approxFunc()) {
996 Value *Recip = Builder.CreateUnaryIntrinsic(Intrinsic::amdgcn_rcp, Den);
997 return Builder.CreateFMul(Num, Recip);
998 }
999
1000 Value *Recip = emitRcpIEEE1ULP(Builder, Den, false);
1001 return Builder.CreateFMul(Num, Recip);
1002 }
1003
1004 return nullptr;
1005}
1006
1007// optimize with fdiv.fast:
1008//
1009// a/b -> fdiv.fast(a, b) when !fpmath >= 2.5ulp with denormals flushed.
1010//
1011// 1/x -> fdiv.fast(1,x) when !fpmath >= 2.5ulp.
1012//
1013// NOTE: optimizeWithRcp should be tried first because rcp is the preference.
1014Value *AMDGPUCodeGenPrepareImpl::optimizeWithFDivFast(
1015 IRBuilder<> &Builder, Value *Num, Value *Den, float ReqdAccuracy) const {
1016 // fdiv.fast can achieve 2.5 ULP accuracy.
1017 if (ReqdAccuracy < 2.5f)
1018 return nullptr;
1019
1020 // Only have fdiv.fast for f32.
1021 assert(Den->getType()->isFloatTy());
1022
1023 bool NumIsOne = false;
1024 if (const ConstantFP *CNum = dyn_cast<ConstantFP>(Num)) {
1025 if (CNum->isExactlyValue(+1.0) || CNum->isExactlyValue(-1.0))
1026 NumIsOne = true;
1027 }
1028
1029 // fdiv.fast does not support denormals, but for 1.0/x it is always fine to use.
1030 //
1031 // TODO: This works for any value with a specific known exponent range, don't
1032 // just limit to constant 1.
1033 if (!HasFP32DenormalFlush && !NumIsOne)
1034 return nullptr;
1035
1036 return Builder.CreateIntrinsic(Intrinsic::amdgcn_fdiv_fast, {}, {Num, Den});
1037}
1038
1039Value *AMDGPUCodeGenPrepareImpl::visitFDivElement(
1040 IRBuilder<> &Builder, Value *Num, Value *Den, FastMathFlags DivFMF,
1041 FastMathFlags SqrtFMF, Value *RsqOp, const Instruction *FDivInst,
1042 float ReqdDivAccuracy) const {
1043 if (RsqOp) {
1044 Value *Rsq =
1045 optimizeWithRsq(Builder, Num, RsqOp, DivFMF, SqrtFMF, FDivInst);
1046 if (Rsq)
1047 return Rsq;
1048 }
1049
1050 Value *Rcp = optimizeWithRcp(Builder, Num, Den, DivFMF, FDivInst);
1051 if (Rcp)
1052 return Rcp;
1053
1054 // In the basic case fdiv_fast has the same instruction count as the frexp div
1055 // expansion. Slightly prefer fdiv_fast since it ends in an fmul that can
1056 // potentially be fused into a user. Also, materialization of the constants
1057 // can be reused for multiple instances.
1058 Value *FDivFast = optimizeWithFDivFast(Builder, Num, Den, ReqdDivAccuracy);
1059 if (FDivFast)
1060 return FDivFast;
1061
1062 return emitFrexpDiv(Builder, Num, Den, DivFMF);
1063}
1064
1065// Optimization is performed based on fpmath, fast math flags, and the denormal
1066// mode; fdiv is rewritten with either rcp or fdiv.fast.
1067//
1068// With rcp:
1069// 1/x -> rcp(x) when rcp is sufficiently accurate or inaccurate rcp is
1070// allowed with unsafe-fp-math or afn.
1071//
1072// a/b -> a*rcp(b) when inaccurate rcp is allowed with unsafe-fp-math or afn.
1073//
1074// With fdiv.fast:
1075// a/b -> fdiv.fast(a, b) when !fpmath >= 2.5ulp with denormals flushed.
1076//
1077// 1/x -> fdiv.fast(1,x) when !fpmath >= 2.5ulp.
1078//
1079// NOTE: rcp is the preference in cases that both are legal.
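// Illustrative sketch (not from the original source): an f32 division tagged
// with a 2.5 ulp tolerance, e.g.
//
//   %d = fdiv float %a, %b, !fpmath !0    ; !0 = !{float 2.500000e+00}
//
// can be rewritten here as
//
//   %d = call float @llvm.amdgcn.fdiv.fast(float %a, float %b)
//
// while an afn/unsafe-fp-math "1.0 / %b" is left untouched for codegen to
// select as v_rcp_f32.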
1080bool AMDGPUCodeGenPrepareImpl::visitFDiv(BinaryOperator &FDiv) {
1081 if (DisableFDivExpand)
1082 return false;
1083
1084 Type *Ty = FDiv.getType()->getScalarType();
1085 if (!Ty->isFloatTy())
1086 return false;
1087
1088 // The f64 rcp/rsq approximations are pretty inaccurate. We can do an
1089 // expansion around them in codegen. f16 is good enough to always use.
1090
1091 const FPMathOperator *FPOp = cast<const FPMathOperator>(&FDiv);
1092 const FastMathFlags DivFMF = FPOp->getFastMathFlags();
1093 const float ReqdAccuracy = FPOp->getFPAccuracy();
1094
1095 FastMathFlags SqrtFMF;
1096
1097 Value *Num = FDiv.getOperand(0);
1098 Value *Den = FDiv.getOperand(1);
1099
1100 Value *RsqOp = nullptr;
1101 auto *DenII = dyn_cast<IntrinsicInst>(Den);
1102 if (DenII && DenII->getIntrinsicID() == Intrinsic::sqrt &&
1103 DenII->hasOneUse()) {
1104 const auto *SqrtOp = cast<FPMathOperator>(DenII);
1105 SqrtFMF = SqrtOp->getFastMathFlags();
1106 if (canOptimizeWithRsq(SqrtOp, DivFMF, SqrtFMF))
1107 RsqOp = SqrtOp->getOperand(0);
1108 }
1109
1110 // Inaccurate rcp is allowed with unsafe-fp-math or afn.
1111 //
1112 // Defer to codegen to handle this.
1113 //
1114 // TODO: Decide on an interpretation for interactions between afn + arcp +
1115 // !fpmath, and make it consistent between here and codegen. For now, defer
1116 // expansion of afn to codegen. The current interpretation is so aggressive we
1117 // don't need any pre-consideration here when we have better information. A
1118 // more conservative interpretation could use handling here.
1119 const bool AllowInaccurateRcp = HasUnsafeFPMath || DivFMF.approxFunc();
1120 if (!RsqOp && AllowInaccurateRcp)
1121 return false;
1122
1123 // Defer the correct implementations to codegen.
1124 if (ReqdAccuracy < 1.0f)
1125 return false;
1126
1127 IRBuilder<> Builder(FDiv.getParent(), std::next(FDiv.getIterator()));
1128 Builder.setFastMathFlags(DivFMF);
1129 Builder.SetCurrentDebugLocation(FDiv.getDebugLoc());
1130
1131 SmallVector<Value *, 4> NumVals;
1132 SmallVector<Value *, 4> DenVals;
1133 SmallVector<Value *, 4> RsqDenVals;
1134 extractValues(Builder, NumVals, Num);
1135 extractValues(Builder, DenVals, Den);
1136
1137 if (RsqOp)
1138 extractValues(Builder, RsqDenVals, RsqOp);
1139
1140 SmallVector<Value *, 4> ResultVals(NumVals.size());
1141 for (int I = 0, E = NumVals.size(); I != E; ++I) {
1142 Value *NumElt = NumVals[I];
1143 Value *DenElt = DenVals[I];
1144 Value *RsqDenElt = RsqOp ? RsqDenVals[I] : nullptr;
1145
1146 Value *NewElt =
1147 visitFDivElement(Builder, NumElt, DenElt, DivFMF, SqrtFMF, RsqDenElt,
1148 cast<Instruction>(FPOp), ReqdAccuracy);
1149 if (!NewElt) {
1150 // Keep the original, but scalarized.
1151
1152 // This has the unfortunate side effect of sometimes scalarizing when
1153 // we're not going to do anything.
1154 NewElt = Builder.CreateFDiv(NumElt, DenElt);
1155 if (auto *NewEltInst = dyn_cast<Instruction>(NewElt))
1156 NewEltInst->copyMetadata(FDiv);
1157 }
1158
1159 ResultVals[I] = NewElt;
1160 }
1161
1162 Value *NewVal = insertValues(Builder, FDiv.getType(), ResultVals);
1163
1164 if (NewVal) {
1165 FDiv.replaceAllUsesWith(NewVal);
1166 NewVal->takeName(&FDiv);
1167 RecursivelyDeleteTriviallyDeadInstructions(&FDiv, TLI);
1168 }
1169
1170 return true;
1171}
1172
1173static std::pair<Value*, Value*> getMul64(IRBuilder<> &Builder,
1174 Value *LHS, Value *RHS) {
1175 Type *I32Ty = Builder.getInt32Ty();
1176 Type *I64Ty = Builder.getInt64Ty();
1177
1178 Value *LHS_EXT64 = Builder.CreateZExt(LHS, I64Ty);
1179 Value *RHS_EXT64 = Builder.CreateZExt(RHS, I64Ty);
1180 Value *MUL64 = Builder.CreateMul(LHS_EXT64, RHS_EXT64);
1181 Value *Lo = Builder.CreateTrunc(MUL64, I32Ty);
1182 Value *Hi = Builder.CreateLShr(MUL64, Builder.getInt64(32));
1183 Hi = Builder.CreateTrunc(Hi, I32Ty);
1184 return std::pair(Lo, Hi);
1185}
1186
1187static Value* getMulHu(IRBuilder<> &Builder, Value *LHS, Value *RHS) {
1188 return getMul64(Builder, LHS, RHS).second;
1189}
1190
1191/// Figure out how many bits are really needed for this division.
1192/// \p MaxDivBits is an optimization hint to bypass the second
1193/// ComputeNumSignBits/computeKnownBits call if the first one is
1194/// insufficient.
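///
/// Worked example (not from the original source): for a signed i32 divide
/// where both operands are known to have at least 17 sign bits (they fit in
/// 16 bits), DivBits = 32 - min(17, 17) + 1 = 16, so the narrower 24-bit or
/// 32-bit expansions below can be used.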
1195unsigned AMDGPUCodeGenPrepareImpl::getDivNumBits(BinaryOperator &I, Value *Num,
1196 Value *Den,
1197 unsigned MaxDivBits,
1198 bool IsSigned) const {
1199 assert(Num->getType()->getScalarSizeInBits() ==
1200 Den->getType()->getScalarSizeInBits());
1201 unsigned SSBits = Num->getType()->getScalarSizeInBits();
1202 if (IsSigned) {
1203 unsigned RHSSignBits = ComputeNumSignBits(Den, DL, 0, AC, &I);
1204 // A sign bit needs to be reserved for shrinking.
1205 unsigned DivBits = SSBits - RHSSignBits + 1;
1206 if (DivBits > MaxDivBits)
1207 return SSBits;
1208
1209 unsigned LHSSignBits = ComputeNumSignBits(Num, DL, 0, AC, &I);
1210
1211 unsigned SignBits = std::min(LHSSignBits, RHSSignBits);
1212 DivBits = SSBits - SignBits + 1;
1213 return DivBits;
1214 }
1215
1216 // All bits are used for unsigned division for Num or Den in range
1217 // (SignedMax, UnsignedMax].
1218 KnownBits Known = computeKnownBits(Den, DL, 0, AC, &I);
1219 if (Known.isNegative() || !Known.isNonNegative())
1220 return SSBits;
1221 unsigned RHSSignBits = Known.countMinLeadingZeros();
1222 unsigned DivBits = SSBits - RHSSignBits;
1223 if (DivBits > MaxDivBits)
1224 return SSBits;
1225
1226 Known = computeKnownBits(Num, DL, 0, AC, &I);
1227 if (Known.isNegative() || !Known.isNonNegative())
1228 return SSBits;
1229 unsigned LHSSignBits = Known.countMinLeadingZeros();
1230
1231 unsigned SignBits = std::min(LHSSignBits, RHSSignBits);
1232 DivBits = SSBits - SignBits;
1233 return DivBits;
1234}
1235
1236// The fractional part of a float is enough to accurately represent up to
1237// a 24-bit signed integer.
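// For illustration (not from the original source): a float's 24-bit
// significand represents every integer with magnitude up to 2^24 exactly, so
// for operands that fit in 24 bits the quotient can be formed in floating
// point via v_rcp_f32 and then corrected by the single adjustment step below.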
1238Value *AMDGPUCodeGenPrepareImpl::expandDivRem24(IRBuilder<> &Builder,
1239 BinaryOperator &I, Value *Num,
1240 Value *Den, bool IsDiv,
1241 bool IsSigned) const {
1242 unsigned DivBits = getDivNumBits(I, Num, Den, 24, IsSigned);
1243 if (DivBits > 24)
1244 return nullptr;
1245 return expandDivRem24Impl(Builder, I, Num, Den, DivBits, IsDiv, IsSigned);
1246}
1247
1248Value *AMDGPUCodeGenPrepareImpl::expandDivRem24Impl(
1249 IRBuilder<> &Builder, BinaryOperator &I, Value *Num, Value *Den,
1250 unsigned DivBits, bool IsDiv, bool IsSigned) const {
1251 Type *I32Ty = Builder.getInt32Ty();
1252 Num = Builder.CreateTrunc(Num, I32Ty);
1253 Den = Builder.CreateTrunc(Den, I32Ty);
1254
1255 Type *F32Ty = Builder.getFloatTy();
1256 ConstantInt *One = Builder.getInt32(1);
1257 Value *JQ = One;
1258
1259 if (IsSigned) {
1260 // char|short jq = ia ^ ib;
1261 JQ = Builder.CreateXor(Num, Den);
1262
1263 // jq = jq >> (bitsize - 2)
1264 JQ = Builder.CreateAShr(JQ, Builder.getInt32(30));
1265
1266 // jq = jq | 0x1
1267 JQ = Builder.CreateOr(JQ, One);
1268 }
1269
1270 // int ia = (int)LHS;
1271 Value *IA = Num;
1272
1273 // int ib = (int)RHS;
1274 Value *IB = Den;
1275
1276 // float fa = (float)ia;
1277 Value *FA = IsSigned ? Builder.CreateSIToFP(IA, F32Ty)
1278 : Builder.CreateUIToFP(IA, F32Ty);
1279
1280 // float fb = (float)ib;
1281 Value *FB = IsSigned ? Builder.CreateSIToFP(IB,F32Ty)
1282 : Builder.CreateUIToFP(IB,F32Ty);
1283
1284 Value *RCP = Builder.CreateIntrinsic(Intrinsic::amdgcn_rcp,
1285 Builder.getFloatTy(), {FB});
1286 Value *FQM = Builder.CreateFMul(FA, RCP);
1287
1288 // fq = trunc(fqm);
1289 CallInst *FQ = Builder.CreateUnaryIntrinsic(Intrinsic::trunc, FQM);
1290 FQ->copyFastMathFlags(Builder.getFastMathFlags());
1291
1292 // float fqneg = -fq;
1293 Value *FQNeg = Builder.CreateFNeg(FQ);
1294
1295 // float fr = mad(fqneg, fb, fa);
1296 auto FMAD = !ST.hasMadMacF32Insts()
1297 ? Intrinsic::fma
1298 : (Intrinsic::ID)Intrinsic::amdgcn_fmad_ftz;
1299 Value *FR = Builder.CreateIntrinsic(FMAD,
1300 {FQNeg->getType()}, {FQNeg, FB, FA}, FQ);
1301
1302 // int iq = (int)fq;
1303 Value *IQ = IsSigned ? Builder.CreateFPToSI(FQ, I32Ty)
1304 : Builder.CreateFPToUI(FQ, I32Ty);
1305
1306 // fr = fabs(fr);
1307 FR = Builder.CreateUnaryIntrinsic(Intrinsic::fabs, FR, FQ);
1308
1309 // fb = fabs(fb);
1310 FB = Builder.CreateUnaryIntrinsic(Intrinsic::fabs, FB, FQ);
1311
1312 // int cv = fr >= fb;
1313 Value *CV = Builder.CreateFCmpOGE(FR, FB);
1314
1315 // jq = (cv ? jq : 0);
1316 JQ = Builder.CreateSelect(CV, JQ, Builder.getInt32(0));
1317
1318 // dst = iq + jq;
1319 Value *Div = Builder.CreateAdd(IQ, JQ);
1320
1321 Value *Res = Div;
1322 if (!IsDiv) {
1323 // Rem needs compensation; it's easier to recompute it.
1324 Value *Rem = Builder.CreateMul(Div, Den);
1325 Res = Builder.CreateSub(Num, Rem);
1326 }
1327
1328 if (DivBits != 0 && DivBits < 32) {
1329 // Extend in register from the number of bits this divide really is.
1330 if (IsSigned) {
1331 int InRegBits = 32 - DivBits;
1332
1333 Res = Builder.CreateShl(Res, InRegBits);
1334 Res = Builder.CreateAShr(Res, InRegBits);
1335 } else {
1336 ConstantInt *TruncMask
1337 = Builder.getInt32((UINT64_C(1) << DivBits) - 1);
1338 Res = Builder.CreateAnd(Res, TruncMask);
1339 }
1340 }
1341
1342 return Res;
1343}
1344
1345// Try to recognize special cases for which the DAG will emit better expansions
1346// than the general expansion we do here.
1347
1348// TODO: It would be better to just directly handle those optimizations here.
1349bool AMDGPUCodeGenPrepareImpl::divHasSpecialOptimization(BinaryOperator &I,
1350 Value *Num,
1351 Value *Den) const {
1352 if (Constant *C = dyn_cast<Constant>(Den)) {
1353 // Arbitrary constants get a better expansion as long as a wider mulhi is
1354 // legal.
1355 if (C->getType()->getScalarSizeInBits() <= 32)
1356 return true;
1357
1358 // TODO: Sdiv check for not exact for some reason.
1359
1360 // If there's no wider mulhi, there's only a better expansion for powers of
1361 // two.
1362 // TODO: Should really know for each vector element.
1363 if (isKnownToBeAPowerOfTwo(C, DL, true, 0, AC, &I, DT))
1364 return true;
1365
1366 return false;
1367 }
1368
1369 if (BinaryOperator *BinOpDen = dyn_cast<BinaryOperator>(Den)) {
1370 // fold (udiv x, (shl c, y)) -> x >>u (log2(c)+y) iff c is power of 2
1371 if (BinOpDen->getOpcode() == Instruction::Shl &&
1372 isa<Constant>(BinOpDen->getOperand(0)) &&
1373 isKnownToBeAPowerOfTwo(BinOpDen->getOperand(0), DL, true, 0, AC, &I,
1374 DT)) {
1375 return true;
1376 }
1377 }
1378
1379 return false;
1380}
1381
1382static Value *getSign32(Value *V, IRBuilder<> &Builder, const DataLayout &DL) {
1383 // Check whether the sign can be determined statically.
1384 KnownBits Known = computeKnownBits(V, DL);
1385 if (Known.isNegative())
1386 return Constant::getAllOnesValue(V->getType());
1387 if (Known.isNonNegative())
1388 return Constant::getNullValue(V->getType());
1389 return Builder.CreateAShr(V, Builder.getInt32(31));
1390}
1391
1392Value *AMDGPUCodeGenPrepareImpl::expandDivRem32(IRBuilder<> &Builder,
1393 BinaryOperator &I, Value *X,
1394 Value *Y) const {
1395 Instruction::BinaryOps Opc = I.getOpcode();
1396 assert(Opc == Instruction::URem || Opc == Instruction::UDiv ||
1397 Opc == Instruction::SRem || Opc == Instruction::SDiv);
1398
1399 FastMathFlags FMF;
1400 FMF.setFast();
1401 Builder.setFastMathFlags(FMF);
1402
1403 if (divHasSpecialOptimization(I, X, Y))
1404 return nullptr; // Keep it for later optimization.
1405
1406 bool IsDiv = Opc == Instruction::UDiv || Opc == Instruction::SDiv;
1407 bool IsSigned = Opc == Instruction::SRem || Opc == Instruction::SDiv;
1408
1409 Type *Ty = X->getType();
1410 Type *I32Ty = Builder.getInt32Ty();
1411 Type *F32Ty = Builder.getFloatTy();
1412
1413 if (Ty->getScalarSizeInBits() != 32) {
1414 if (IsSigned) {
1415 X = Builder.CreateSExtOrTrunc(X, I32Ty);
1416 Y = Builder.CreateSExtOrTrunc(Y, I32Ty);
1417 } else {
1418 X = Builder.CreateZExtOrTrunc(X, I32Ty);
1419 Y = Builder.CreateZExtOrTrunc(Y, I32Ty);
1420 }
1421 }
1422
1423 if (Value *Res = expandDivRem24(Builder, I, X, Y, IsDiv, IsSigned)) {
1424 return IsSigned ? Builder.CreateSExtOrTrunc(Res, Ty) :
1425 Builder.CreateZExtOrTrunc(Res, Ty);
1426 }
1427
1428 ConstantInt *Zero = Builder.getInt32(0);
1429 ConstantInt *One = Builder.getInt32(1);
1430
1431 Value *Sign = nullptr;
1432 if (IsSigned) {
1433 Value *SignX = getSign32(X, Builder, DL);
1434 Value *SignY = getSign32(Y, Builder, DL);
1435 // Remainder sign is the same as LHS
1436 Sign = IsDiv ? Builder.CreateXor(SignX, SignY) : SignX;
1437
1438 X = Builder.CreateAdd(X, SignX);
1439 Y = Builder.CreateAdd(Y, SignY);
1440
1441 X = Builder.CreateXor(X, SignX);
1442 Y = Builder.CreateXor(Y, SignY);
1443 }
1444
1445 // The algorithm here is based on ideas from "Software Integer Division", Tom
1446 // Rodeheffer, August 2008.
1447 //
1448 // unsigned udiv(unsigned x, unsigned y) {
1449 // // Initial estimate of inv(y). The constant is less than 2^32 to ensure
1450 // // that this is a lower bound on inv(y), even if some of the calculations
1451 // // round up.
1452 // unsigned z = (unsigned)((4294967296.0 - 512.0) * v_rcp_f32((float)y));
1453 //
1454 // // One round of UNR (Unsigned integer Newton-Raphson) to improve z.
1455 // // Empirically this is guaranteed to give a "two-y" lower bound on
1456 // // inv(y).
1457 // z += umulh(z, -y * z);
1458 //
1459 // // Quotient/remainder estimate.
1460 // unsigned q = umulh(x, z);
1461 // unsigned r = x - q * y;
1462 //
1463 // // Two rounds of quotient/remainder refinement.
1464 // if (r >= y) {
1465 // ++q;
1466 // r -= y;
1467 // }
1468 // if (r >= y) {
1469 // ++q;
1470 // r -= y;
1471 // }
1472 //
1473 // return q;
1474 // }
1475
1476 // Initial estimate of inv(y).
1477 Value *FloatY = Builder.CreateUIToFP(Y, F32Ty);
1478 Value *RcpY = Builder.CreateIntrinsic(Intrinsic::amdgcn_rcp, F32Ty, {FloatY});
1479 Constant *Scale = ConstantFP::get(F32Ty, llvm::bit_cast<float>(0x4F7FFFFE));
1480 Value *ScaledY = Builder.CreateFMul(RcpY, Scale);
1481 Value *Z = Builder.CreateFPToUI(ScaledY, I32Ty);
1482
1483 // One round of UNR.
1484 Value *NegY = Builder.CreateSub(Zero, Y);
1485 Value *NegYZ = Builder.CreateMul(NegY, Z);
1486 Z = Builder.CreateAdd(Z, getMulHu(Builder, Z, NegYZ));
1487
1488 // Quotient/remainder estimate.
1489 Value *Q = getMulHu(Builder, X, Z);
1490 Value *R = Builder.CreateSub(X, Builder.CreateMul(Q, Y));
1491
1492 // First quotient/remainder refinement.
1493 Value *Cond = Builder.CreateICmpUGE(R, Y);
1494 if (IsDiv)
1495 Q = Builder.CreateSelect(Cond, Builder.CreateAdd(Q, One), Q);
1496 R = Builder.CreateSelect(Cond, Builder.CreateSub(R, Y), R);
1497
1498 // Second quotient/remainder refinement.
1499 Cond = Builder.CreateICmpUGE(R, Y);
1500 Value *Res;
1501 if (IsDiv)
1502 Res = Builder.CreateSelect(Cond, Builder.CreateAdd(Q, One), Q);
1503 else
1504 Res = Builder.CreateSelect(Cond, Builder.CreateSub(R, Y), R);
1505
1506 if (IsSigned) {
1507 Res = Builder.CreateXor(Res, Sign);
1508 Res = Builder.CreateSub(Res, Sign);
1509 Res = Builder.CreateSExtOrTrunc(Res, Ty);
1510 } else {
1511 Res = Builder.CreateZExtOrTrunc(Res, Ty);
1512 }
1513 return Res;
1514}
1515
1516Value *AMDGPUCodeGenPrepareImpl::shrinkDivRem64(IRBuilder<> &Builder,
1517 BinaryOperator &I, Value *Num,
1518 Value *Den) const {
1519 if (!ExpandDiv64InIR && divHasSpecialOptimization(I, Num, Den))
1520 return nullptr; // Keep it for later optimization.
1521
1522 Instruction::BinaryOps Opc = I.getOpcode();
1523
1524 bool IsDiv = Opc == Instruction::SDiv || Opc == Instruction::UDiv;
1525 bool IsSigned = Opc == Instruction::SDiv || Opc == Instruction::SRem;
1526
1527 unsigned NumDivBits = getDivNumBits(I, Num, Den, 32, IsSigned);
1528 if (NumDivBits > 32)
1529 return nullptr;
1530
1531 Value *Narrowed = nullptr;
1532 if (NumDivBits <= 24) {
1533 Narrowed = expandDivRem24Impl(Builder, I, Num, Den, NumDivBits,
1534 IsDiv, IsSigned);
1535 } else if (NumDivBits <= 32) {
1536 Narrowed = expandDivRem32(Builder, I, Num, Den);
1537 }
1538
1539 if (Narrowed) {
1540 return IsSigned ? Builder.CreateSExt(Narrowed, Num->getType()) :
1541 Builder.CreateZExt(Narrowed, Num->getType());
1542 }
1543
1544 return nullptr;
1545}
1546
1547void AMDGPUCodeGenPrepareImpl::expandDivRem64(BinaryOperator &I) const {
1548 Instruction::BinaryOps Opc = I.getOpcode();
1549 // Do the general expansion.
1550 if (Opc == Instruction::UDiv || Opc == Instruction::SDiv) {
1551 expandDivisionUpTo64Bits(&I);
1552 return;
1553 }
1554
1555 if (Opc == Instruction::URem || Opc == Instruction::SRem) {
1556 expandRemainderUpTo64Bits(&I);
1557 return;
1558 }
1559
1560 llvm_unreachable("not a division");
1561}
1562
1563bool AMDGPUCodeGenPrepareImpl::visitBinaryOperator(BinaryOperator &I) {
1564 if (foldBinOpIntoSelect(I))
1565 return true;
1566
1567 if (ST.has16BitInsts() && needsPromotionToI32(I.getType()) &&
1568 UA.isUniform(&I) && promoteUniformOpToI32(I))
1569 return true;
1570
1571 if (UseMul24Intrin && replaceMulWithMul24(I))
1572 return true;
1573
1574 bool Changed = false;
1575 Instruction::BinaryOps Opc = I.getOpcode();
1576 Type *Ty = I.getType();
1577 Value *NewDiv = nullptr;
1578 unsigned ScalarSize = Ty->getScalarSizeInBits();
1579
1580 SmallVector<BinaryOperator *, 8> Div64ToExpand;
1581
1582 if ((Opc == Instruction::URem || Opc == Instruction::UDiv ||
1583 Opc == Instruction::SRem || Opc == Instruction::SDiv) &&
1584 ScalarSize <= 64 &&
1585 !DisableIDivExpand) {
1586 Value *Num = I.getOperand(0);
1587 Value *Den = I.getOperand(1);
1588 IRBuilder<> Builder(&I);
1589 Builder.SetCurrentDebugLocation(I.getDebugLoc());
1590
1591 if (auto *VT = dyn_cast<FixedVectorType>(Ty)) {
1592 NewDiv = PoisonValue::get(VT);
1593
1594 for (unsigned N = 0, E = VT->getNumElements(); N != E; ++N) {
1595 Value *NumEltN = Builder.CreateExtractElement(Num, N);
1596 Value *DenEltN = Builder.CreateExtractElement(Den, N);
1597
1598 Value *NewElt;
1599 if (ScalarSize <= 32) {
1600 NewElt = expandDivRem32(Builder, I, NumEltN, DenEltN);
1601 if (!NewElt)
1602 NewElt = Builder.CreateBinOp(Opc, NumEltN, DenEltN);
1603 } else {
1604 // See if this 64-bit division can be shrunk to 32/24-bits before
1605 // producing the general expansion.
1606 NewElt = shrinkDivRem64(Builder, I, NumEltN, DenEltN);
1607 if (!NewElt) {
1608 // The general 64-bit expansion introduces control flow and doesn't
1609 // return the new value. Just insert a scalar copy and defer
1610 // expanding it.
1611 NewElt = Builder.CreateBinOp(Opc, NumEltN, DenEltN);
1612 Div64ToExpand.push_back(cast<BinaryOperator>(NewElt));
1613 }
1614 }
1615
1616 if (auto *NewEltI = dyn_cast<Instruction>(NewElt))
1617 NewEltI->copyIRFlags(&I);
1618
1619 NewDiv = Builder.CreateInsertElement(NewDiv, NewElt, N);
1620 }
1621 } else {
1622 if (ScalarSize <= 32)
1623 NewDiv = expandDivRem32(Builder, I, Num, Den);
1624 else {
1625 NewDiv = shrinkDivRem64(Builder, I, Num, Den);
1626 if (!NewDiv)
1627 Div64ToExpand.push_back(&I);
1628 }
1629 }
1630
1631 if (NewDiv) {
1632 I.replaceAllUsesWith(NewDiv);
1633 I.eraseFromParent();
1634 Changed = true;
1635 }
1636 }
1637
1638 if (ExpandDiv64InIR) {
1639 // TODO: We get much worse code in specially handled constant cases.
1640 for (BinaryOperator *Div : Div64ToExpand) {
1641 expandDivRem64(*Div);
1642 FlowChanged = true;
1643 Changed = true;
1644 }
1645 }
1646
1647 return Changed;
1648}
1649
1650bool AMDGPUCodeGenPrepareImpl::visitLoadInst(LoadInst &I) {
1651 if (!WidenLoads)
1652 return false;
1653
1654 if ((I.getPointerAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS ||
1655 I.getPointerAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT) &&
1656 canWidenScalarExtLoad(I)) {
1657 IRBuilder<> Builder(&I);
1658 Builder.SetCurrentDebugLocation(I.getDebugLoc());
1659
1660 Type *I32Ty = Builder.getInt32Ty();
1661 LoadInst *WidenLoad = Builder.CreateLoad(I32Ty, I.getPointerOperand());
1662 WidenLoad->copyMetadata(I);
1663
1664 // If we have range metadata, we need to convert the type, and not make
1665 // assumptions about the high bits.
1666 if (auto *Range = WidenLoad->getMetadata(LLVMContext::MD_range)) {
1667 ConstantInt *Lower =
1668 mdconst::extract<ConstantInt>(Range->getOperand(0));
1669
1670 if (Lower->isNullValue()) {
1671 WidenLoad->setMetadata(LLVMContext::MD_range, nullptr);
1672 } else {
1673 Metadata *LowAndHigh[] = {
1674 ConstantAsMetadata::get(ConstantInt::get(I32Ty, Lower->getValue().zext(32))),
1675 // Don't make assumptions about the high bits.
1676 ConstantAsMetadata::get(ConstantInt::get(I32Ty, 0))
1677 };
1678
1679 WidenLoad->setMetadata(LLVMContext::MD_range,
1680 MDNode::get(F.getContext(), LowAndHigh));
1681 }
1682 }
1683
1684 int TySize = DL.getTypeSizeInBits(I.getType());
1685 Type *IntNTy = Builder.getIntNTy(TySize);
1686 Value *ValTrunc = Builder.CreateTrunc(WidenLoad, IntNTy);
1687 Value *ValOrig = Builder.CreateBitCast(ValTrunc, I.getType());
1688 I.replaceAllUsesWith(ValOrig);
1689 I.eraseFromParent();
1690 return true;
1691 }
1692
1693 return false;
1694}
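// Illustrative example (assumed IR): a sufficiently aligned, uniform sub-dword
// load from the constant address space, e.g.
//   %v = load i8, ptr addrspace(4) %p, align 4
// is widened to a full dword load plus a truncation back to the original type:
//   %w = load i32, ptr addrspace(4) %p, align 4
//   %v = trunc i32 %w to i8
// so it can be selected as a scalar (SMEM) load.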
1695
1696bool AMDGPUCodeGenPrepareImpl::visitICmpInst(ICmpInst &I) {
1697 bool Changed = false;
1698
1699 if (ST.has16BitInsts() && needsPromotionToI32(I.getOperand(0)->getType()) &&
1700 UA.isUniform(&I))
1701 Changed |= promoteUniformOpToI32(I);
1702
1703 return Changed;
1704}
1705
1706bool AMDGPUCodeGenPrepareImpl::visitSelectInst(SelectInst &I) {
1707 Value *Cond = I.getCondition();
1708 Value *TrueVal = I.getTrueValue();
1709 Value *FalseVal = I.getFalseValue();
1710 Value *CmpVal;
1711 CmpPredicate Pred;
1712
1713 if (ST.has16BitInsts() && needsPromotionToI32(I.getType())) {
1714 if (UA.isUniform(&I))
1715 return promoteUniformOpToI32(I);
1716 return false;
1717 }
1718
1719 // Match fract pattern with nan check.
1720 if (!match(Cond, m_FCmp(Pred, m_Value(CmpVal), m_NonNaN())))
1721 return false;
1722
1723 FPMathOperator *FPOp = dyn_cast<FPMathOperator>(&I);
1724 if (!FPOp)
1725 return false;
1726
1727 IRBuilder<> Builder(&I);
1728 Builder.setFastMathFlags(FPOp->getFastMathFlags());
1729
1730 auto *IITrue = dyn_cast<IntrinsicInst>(TrueVal);
1731 auto *IIFalse = dyn_cast<IntrinsicInst>(FalseVal);
1732
1733 Value *Fract = nullptr;
1734 if (Pred == FCmpInst::FCMP_UNO && TrueVal == CmpVal && IIFalse &&
1735 CmpVal == matchFractPat(*IIFalse)) {
1736 // isnan(x) ? x : fract(x)
1737 Fract = applyFractPat(Builder, CmpVal);
1738 } else if (Pred == FCmpInst::FCMP_ORD && FalseVal == CmpVal && IITrue &&
1739 CmpVal == matchFractPat(*IITrue)) {
1740 // !isnan(x) ? fract(x) : x
1741 Fract = applyFractPat(Builder, CmpVal);
1742 } else
1743 return false;
1744
1745 Fract->takeName(&I);
1746 I.replaceAllUsesWith(Fract);
1747 RecursivelyDeleteTriviallyDeadInstructions(&I, TLI);
1748 return true;
1749}
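// Illustrative example (assumed IR): the nan-guarded fract select
//   %sub = fsub float %x, %floor                        ; %floor = floor(%x)
//   %min = call float @llvm.minnum.f32(float %sub, float 0x3FEFFFFFE0000000)
//   %cmp = fcmp uno float %x, 0.000000e+00
//   %sel = select i1 %cmp, float %x, float %min
// is replaced with a single call to llvm.amdgcn.fract.f32(%x).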
1750
1751static bool areInSameBB(const Value *A, const Value *B) {
1752 const auto *IA = dyn_cast<Instruction>(A);
1753 const auto *IB = dyn_cast<Instruction>(B);
1754 return IA && IB && IA->getParent() == IB->getParent();
1755}
1756
1757// Helper for breaking large PHIs that returns true when an extractelement on V
1758// is likely to be folded away by the DAG combiner.
1759static bool isInterestingPHIIncomingValue(const Value *V) {
1760 const auto *FVT = dyn_cast<FixedVectorType>(V->getType());
1761 if (!FVT)
1762 return false;
1763
1764 const Value *CurVal = V;
1765
1766 // Check for insertelements, keeping track of the elements covered.
1767 BitVector EltsCovered(FVT->getNumElements());
1768 while (const auto *IE = dyn_cast<InsertElementInst>(CurVal)) {
1769 const auto *Idx = dyn_cast<ConstantInt>(IE->getOperand(2));
1770
1771 // Non constant index/out of bounds index -> folding is unlikely.
1772 // The latter is more of a sanity check because canonical IR should just
1773 // have replaced those with poison.
1774 if (!Idx || Idx->getZExtValue() >= FVT->getNumElements())
1775 return false;
1776
1777 const auto *VecSrc = IE->getOperand(0);
1778
1779 // If the vector source is another instruction, it must be in the same basic
1780 // block. Otherwise, the DAGCombiner won't see the whole thing and is
1781 // unlikely to be able to do anything interesting here.
1782 if (isa<Instruction>(VecSrc) && !areInSameBB(VecSrc, IE))
1783 return false;
1784
1785 CurVal = VecSrc;
1786 EltsCovered.set(Idx->getZExtValue());
1787
1788 // All elements covered.
1789 if (EltsCovered.all())
1790 return true;
1791 }
1792
1793 // We either didn't find a single insertelement, or the insertelement chain
1794 // ended before all elements were covered. Check for other interesting values.
1795
1796 // Constants are always interesting because we can just constant fold the
1797 // extractelements.
1798 if (isa<Constant>(CurVal))
1799 return true;
1800
1801 // shufflevector is likely to be profitable if either operand is a constant,
1802 // or if either source is in the same block.
1803 // This is because shufflevector is most often lowered as a series of
1804 // insert/extract elements anyway.
1805 if (const auto *SV = dyn_cast<ShuffleVectorInst>(CurVal)) {
1806 return isa<Constant>(SV->getOperand(1)) ||
1807 areInSameBB(SV, SV->getOperand(0)) ||
1808 areInSameBB(SV, SV->getOperand(1));
1809 }
1810
1811 return false;
1812}
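// Illustrative example (assumed IR): an incoming value assembled from scalars
// in the same block,
//   %v0 = insertelement <2 x i32> poison, i32 %a, i64 0
//   %v1 = insertelement <2 x i32> %v0,    i32 %b, i64 1
// covers every element, so the extractelements created when the PHI is broken
// are expected to fold back to %a and %b in the DAG.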
1813
1814static void collectPHINodes(const PHINode &I,
1815 SmallPtrSet<const PHINode *, 8> &SeenPHIs) {
1816 const auto [It, Inserted] = SeenPHIs.insert(&I);
1817 if (!Inserted)
1818 return;
1819
1820 for (const Value *Inc : I.incoming_values()) {
1821 if (const auto *PhiInc = dyn_cast<PHINode>(Inc))
1822 collectPHINodes(*PhiInc, SeenPHIs);
1823 }
1824
1825 for (const User *U : I.users()) {
1826 if (const auto *PhiU = dyn_cast<PHINode>(U))
1827 collectPHINodes(*PhiU, SeenPHIs);
1828 }
1829}
1830
1831bool AMDGPUCodeGenPrepareImpl::canBreakPHINode(const PHINode &I) {
1832 // Check in the cache first.
1833 if (const auto It = BreakPhiNodesCache.find(&I);
1834 It != BreakPhiNodesCache.end())
1835 return It->second;
1836
1837 // We consider PHI nodes as part of "chains", so given a PHI node I, we
1838 // recursively consider all its users and incoming values that are also PHI
1839 // nodes. We then make a decision about all of those PHIs at once. Either they
1840 // all get broken up, or none of them do. That way, we avoid cases where a
1841 // single PHI is/is not broken and we end up reforming/exploding a vector
1842 // multiple times, or even worse, doing it in a loop.
1843 SmallPtrSet<const PHINode *, 8> WorkList;
1844 collectPHINodes(I, WorkList);
1845
1846#ifndef NDEBUG
1847 // Check that none of the PHI nodes in the worklist are in the map. If some of
1848 // them are, it means we're not good enough at collecting related PHIs.
1849 for (const PHINode *WLP : WorkList) {
1850 assert(BreakPhiNodesCache.count(WLP) == 0);
1851 }
1852#endif
1853
1854 // To consider a PHI profitable to break, we need to see some interesting
1855 // incoming values. At least 2/3rd (rounded up) of all PHIs in the worklist
1856 // must have one to consider all PHIs breakable.
1857 //
1858 // This threshold has been determined through performance testing.
1859 //
1860 // Note that the computation below is equivalent to
1861 //
1862 // (unsigned)ceil((K / 3.0) * 2)
1863 //
1864 // It's simply written this way to avoid mixing integral/FP arithmetic.
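 // Worked example (illustrative): with 5 PHIs in the worklist,
 // alignTo(5 * 2, 3) / 3 = alignTo(10, 3) / 3 = 12 / 3 = 4, which matches
 // ceil((5 / 3.0) * 2) = 4.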
1865 const auto Threshold = (alignTo(WorkList.size() * 2, 3) / 3);
1866 unsigned NumBreakablePHIs = 0;
1867 bool CanBreak = false;
1868 for (const PHINode *Cur : WorkList) {
1869 // Don't break PHIs that have no interesting incoming values. That is, where
1870 // there is no clear opportunity to fold the "extractelement" instructions
1871 // we would add.
1872 //
1873 // Note: InstCombine does not run after this pass, so we're only interested
1874 // in the foldings that the DAG combiner can do.
1875 if (any_of(Cur->incoming_values(), isInterestingPHIIncomingValue)) {
1876 if (++NumBreakablePHIs >= Threshold) {
1877 CanBreak = true;
1878 break;
1879 }
1880 }
1881 }
1882
1883 for (const PHINode *Cur : WorkList)
1884 BreakPhiNodesCache[Cur] = CanBreak;
1885
1886 return CanBreak;
1887}
1888
1889/// Helper class for "break large PHIs" (visitPHINode).
1890///
1891/// This represents a slice of a PHI's incoming value, which is made up of:
1892/// - The type of the slice (Ty)
1893/// - The index in the incoming value's vector where the slice starts (Idx)
1894/// - The number of elements in the slice (NumElts).
1895/// It also keeps track of the NewPHI node inserted for this particular slice.
1896///
1897/// Slice examples:
1898/// <4 x i64> -> Split into four i64 slices.
1899/// -> [i64, 0, 1], [i64, 1, 1], [i64, 2, 1], [i64, 3, 1]
1900/// <5 x i16> -> Split into 2 <2 x i16> slices + an i16 tail.
1901/// -> [<2 x i16>, 0, 2], [<2 x i16>, 2, 2], [i16, 4, 1]
1902class VectorSlice {
1903public:
1904 VectorSlice(Type *Ty, unsigned Idx, unsigned NumElts)
1905 : Ty(Ty), Idx(Idx), NumElts(NumElts) {}
1906
1907 Type *Ty = nullptr;
1908 unsigned Idx = 0;
1909 unsigned NumElts = 0;
1910 PHINode *NewPHI = nullptr;
1911
1912 /// Slice \p Inc according to the information contained within this slice.
1913 /// This is cached, so if called multiple times for the same \p BB & \p Inc
1914 /// pair, it returns the same Sliced value as well.
1915 ///
1916 /// Note this *intentionally* does not return the same value for, say,
1917 /// [%bb.0, %0] & [%bb.1, %0] as:
1918 /// - It could cause issues with dominance (e.g. if bb.1 is seen first, the
1919 /// value created in bb.1 may not be reachable from bb.0 if bb.0 is its
1920 /// predecessor.)
1921 /// - We also want to make our extract instructions as local as possible so
1922 /// the DAG has better chances of folding them out. Duplicating them like
1923 /// that is beneficial in that regard.
1924 ///
1925 /// This is not only a minor optimization to avoid creating duplicate
1926 /// instructions, but also a requirement for correctness. It is not forbidden
1927 /// for a PHI node to have the same [BB, Val] pair multiple times. If we
1928 /// returned a new value each time, those previously identical pairs would all
1929 /// have different incoming values (from the same block) and it'd cause a "PHI
1930 /// node has multiple entries for the same basic block with different incoming
1931 /// values!" verifier error.
1932 Value *getSlicedVal(BasicBlock *BB, Value *Inc, StringRef NewValName) {
1933 Value *&Res = SlicedVals[{BB, Inc}];
1934 if (Res)
1935 return Res;
1936
1937 IRBuilder<> B(BB->getTerminator());
1938 if (Instruction *IncInst = dyn_cast<Instruction>(Inc))
1939 B.SetCurrentDebugLocation(IncInst->getDebugLoc());
1940
1941 if (NumElts > 1) {
1942 SmallVector<int, 4> Mask;
1943 for (unsigned K = Idx; K < (Idx + NumElts); ++K)
1944 Mask.push_back(K);
1945 Res = B.CreateShuffleVector(Inc, Mask, NewValName);
1946 } else
1947 Res = B.CreateExtractElement(Inc, Idx, NewValName);
1948
1949 return Res;
1950 }
1951
1952private:
1953 SmallDenseMap<std::pair<BasicBlock *, Value *>, Value *> SlicedVals;
1954};
1955
1956bool AMDGPUCodeGenPrepareImpl::visitPHINode(PHINode &I) {
1957 // Break-up fixed-vector PHIs into smaller pieces.
1958 // Default threshold is 32, so it breaks up any vector that's >32 bits into
1959 // its elements, or into 32-bit pieces (for 8/16 bit elts).
1960 //
1961 // This is only helpful for DAGISel because it doesn't handle large PHIs as
1962 // well as GlobalISel. DAGISel lowers PHIs by using CopyToReg/CopyFromReg.
1963 // With large, odd-sized PHIs we may end up needing many `build_vector`
1964 // operations with most elements being "undef". This inhibits a lot of
1965 // optimization opportunities and can result in unreasonably high register
1966 // pressure and the inevitable stack spilling.
1967 if (!BreakLargePHIs || getCGPassBuilderOption().EnableGlobalISelOption)
1968 return false;
1969
1970 FixedVectorType *FVT = dyn_cast<FixedVectorType>(I.getType());
1971 if (!FVT || FVT->getNumElements() == 1 ||
1972 DL.getTypeSizeInBits(FVT) <= BreakLargePHIsThreshold)
1973 return false;
1974
1975 if (!ForceBreakLargePHIs && !canBreakPHINode(I))
1976 return false;
1977
1978 std::vector<VectorSlice> Slices;
1979
1980 Type *EltTy = FVT->getElementType();
1981 {
1982 unsigned Idx = 0;
1983 // For 8/16 bits type, don't scalarize fully but break it up into as many
1984 // 32-bit slices as we can, and scalarize the tail.
1985 const unsigned EltSize = DL.getTypeSizeInBits(EltTy);
1986 const unsigned NumElts = FVT->getNumElements();
1987 if (EltSize == 8 || EltSize == 16) {
1988 const unsigned SubVecSize = (32 / EltSize);
1989 Type *SubVecTy = FixedVectorType::get(EltTy, SubVecSize);
1990 for (unsigned End = alignDown(NumElts, SubVecSize); Idx < End;
1991 Idx += SubVecSize)
1992 Slices.emplace_back(SubVecTy, Idx, SubVecSize);
1993 }
1994
1995 // Scalarize all remaining elements.
1996 for (; Idx < NumElts; ++Idx)
1997 Slices.emplace_back(EltTy, Idx, 1);
1998 }
1999
2000 assert(Slices.size() > 1);
2001
2002 // Create one PHI per vector piece. The "VectorSlice" class takes care of
2003 // creating the necessary instruction to extract the relevant slices of each
2004 // incoming value.
2005 IRBuilder<> B(I.getParent());
2006 B.SetCurrentDebugLocation(I.getDebugLoc());
2007
2008 unsigned IncNameSuffix = 0;
2009 for (VectorSlice &S : Slices) {
2010 // We need to reset the builder's insert point on each iteration, because
2011 // getSlicedVal may have inserted something into I's BB.
2012 B.SetInsertPoint(I.getParent()->getFirstNonPHIIt());
2013 S.NewPHI = B.CreatePHI(S.Ty, I.getNumIncomingValues());
2014
2015 for (const auto &[Idx, BB] : enumerate(I.blocks())) {
2016 S.NewPHI->addIncoming(S.getSlicedVal(BB, I.getIncomingValue(Idx),
2017 "largephi.extractslice" +
2018 std::to_string(IncNameSuffix++)),
2019 BB);
2020 }
2021 }
2022
2023 // And replace this PHI with a vector of all the previous PHI values.
2024 Value *Vec = PoisonValue::get(FVT);
2025 unsigned NameSuffix = 0;
2026 for (VectorSlice &S : Slices) {
2027 const auto ValName = "largephi.insertslice" + std::to_string(NameSuffix++);
2028 if (S.NumElts > 1)
2029 Vec =
2030 B.CreateInsertVector(FVT, Vec, S.NewPHI, B.getInt64(S.Idx), ValName);
2031 else
2032 Vec = B.CreateInsertElement(Vec, S.NewPHI, S.Idx, ValName);
2033 }
2034
2035 I.replaceAllUsesWith(Vec);
2036 I.eraseFromParent();
2037 return true;
2038}
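// Illustrative example (assumed shapes): an <8 x i16> PHI (128 bits) is split
// into four <2 x i16> PHIs; each predecessor contributes a two-element
// shufflevector slice of its incoming value, and the four results are
// recombined with llvm.vector.insert right after the new PHIs.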
2039
2040/// \param V Value to check
2041/// \param DL DataLayout
2042/// \param TM TargetMachine (TODO: remove once DL contains nullptr values)
2043/// \param AS Target Address Space
2044/// \return true if \p V cannot be the null value of \p AS, false otherwise.
2045static bool isPtrKnownNeverNull(const Value *V, const DataLayout &DL,
2046 const AMDGPUTargetMachine &TM, unsigned AS) {
2047 // Pointer cannot be null if it's a block address, GV or alloca.
2048 // NOTE: We don't support extern_weak, but if we did, we'd need to check for
2049 // it as the symbol could be null in such cases.
2050 if (isa<BlockAddress>(V) || isa<GlobalValue>(V) || isa<AllocaInst>(V))
2051 return true;
2052
2053 // Check nonnull arguments.
2054 if (const auto *Arg = dyn_cast<Argument>(V); Arg && Arg->hasNonNullAttr())
2055 return true;
2056
2057 // getUnderlyingObject may have looked through another addrspacecast, although
2058 // such optimizable situations have most likely been folded out by now.
2059 if (AS != cast<PointerType>(V->getType())->getAddressSpace())
2060 return false;
2061
2062 // TODO: Calls that return nonnull?
2063
2064 // For all other things, use KnownBits.
2065 // We either use 0 or all bits set to indicate null, so check whether the
2066 // value can be zero or all ones.
2067 //
2068 // TODO: Use ValueTracking's isKnownNeverNull if it becomes aware that some
2069 // address spaces have non-zero null values.
2070 auto SrcPtrKB = computeKnownBits(V, DL);
2071 const auto NullVal = TM.getNullPointerValue(AS);
2072
2073 assert(SrcPtrKB.getBitWidth() == DL.getPointerSizeInBits(AS));
2074 assert((NullVal == 0 || NullVal == -1) &&
2075 "don't know how to check for this null value!");
2076 return NullVal ? !SrcPtrKB.getMaxValue().isAllOnes() : SrcPtrKB.isNonZero();
2077}
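// Note (summarising the check above): on AMDGPU the local and private address
// spaces use an all-ones null value rather than zero, so a pointer is proven
// non-null either by KnownBits showing it cannot be zero or by showing it
// cannot be all ones, depending on TM.getNullPointerValue(AS).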
2078
2079bool AMDGPUCodeGenPrepareImpl::visitAddrSpaceCastInst(AddrSpaceCastInst &I) {
2080 // The intrinsic doesn't support vectors. It also seems difficult to prove
2081 // that a vector cannot contain any null elements, so it's unclear whether
2082 // supporting vectors would be worthwhile.
2083 if (I.getType()->isVectorTy())
2084 return false;
2085
2086 // Check if this can be lowered to an amdgcn.addrspacecast.nonnull.
2087 // This is only worthwhile for casts between flat and private/local.
2088 const unsigned SrcAS = I.getSrcAddressSpace();
2089 const unsigned DstAS = I.getDestAddressSpace();
2090
2091 bool CanLower = false;
2092 if (SrcAS == AMDGPUAS::FLAT_ADDRESS)
2093 CanLower = (DstAS == AMDGPUAS::LOCAL_ADDRESS ||
2094 DstAS == AMDGPUAS::PRIVATE_ADDRESS);
2095 else if (DstAS == AMDGPUAS::FLAT_ADDRESS)
2096 CanLower = (SrcAS == AMDGPUAS::LOCAL_ADDRESS ||
2097 SrcAS == AMDGPUAS::PRIVATE_ADDRESS);
2098 if (!CanLower)
2099 return false;
2100
2101 SmallVector<const Value *, 4> WorkList;
2102 getUnderlyingObjects(I.getOperand(0), WorkList);
2103 if (!all_of(WorkList, [&](const Value *V) {
2104 return isPtrKnownNeverNull(V, DL, TM, SrcAS);
2105 }))
2106 return false;
2107
2108 IRBuilder<> B(&I);
2109 auto *Intrin = B.CreateIntrinsic(
2110 I.getType(), Intrinsic::amdgcn_addrspacecast_nonnull, {I.getOperand(0)});
2111 I.replaceAllUsesWith(Intrin);
2112 I.eraseFromParent();
2113 return true;
2114}
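// Illustrative example (assumed IR): a flat-to-private cast whose source is
// traced back to an alloca (and therefore known non-null)
//   %q = addrspacecast ptr %flat to ptr addrspace(5)
// roughly becomes
//   %q = call ptr addrspace(5) @llvm.amdgcn.addrspacecast.nonnull.p5.p0(ptr %flat)
// so codegen can omit the null check in the cast expansion.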
2115
2116bool AMDGPUCodeGenPrepareImpl::visitIntrinsicInst(IntrinsicInst &I) {
2117 switch (I.getIntrinsicID()) {
2118 case Intrinsic::bitreverse:
2119 return visitBitreverseIntrinsicInst(I);
2120 case Intrinsic::minnum:
2121 return visitMinNum(I);
2122 case Intrinsic::sqrt:
2123 return visitSqrt(I);
2124 default:
2125 return false;
2126 }
2127}
2128
2129bool AMDGPUCodeGenPrepareImpl::visitBitreverseIntrinsicInst(IntrinsicInst &I) {
2130 bool Changed = false;
2131
2132 if (ST.has16BitInsts() && needsPromotionToI32(I.getType()) &&
2133 UA.isUniform(&I))
2134 Changed |= promoteUniformBitreverseToI32(I);
2135
2136 return Changed;
2137}
2138
2139/// Match non-nan fract pattern.
2140/// minnum(fsub(x, floor(x)), nextafter(1.0, -1.0))
2141///
2142/// Matched only when fract is a useful instruction for the subtarget. Does not
2143/// account for nan handling; the instruction has a nan check on the input value.
2144Value *AMDGPUCodeGenPrepareImpl::matchFractPat(IntrinsicInst &I) {
2145 if (ST.hasFractBug())
2146 return nullptr;
2147
2148 if (I.getIntrinsicID() != Intrinsic::minnum)
2149 return nullptr;
2150
2151 Type *Ty = I.getType();
2152 if (!isLegalFloatingTy(Ty->getScalarType()))
2153 return nullptr;
2154
2155 Value *Arg0 = I.getArgOperand(0);
2156 Value *Arg1 = I.getArgOperand(1);
2157
2158 const APFloat *C;
2159 if (!match(Arg1, m_APFloat(C)))
2160 return nullptr;
2161
2162 APFloat One(1.0);
2163 bool LosesInfo;
2164 One.convert(C->getSemantics(), APFloat::rmNearestTiesToEven, &LosesInfo);
2165
2166 // Match nextafter(1.0, -1)
2167 One.next(true);
2168 if (One != *C)
2169 return nullptr;
2170
2171 Value *FloorSrc;
2172 if (match(Arg0, m_FSub(m_Value(FloorSrc),
2173 m_Intrinsic<Intrinsic::floor>(m_Deferred(FloorSrc)))))
2174 return FloorSrc;
2175 return nullptr;
2176}
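// Note (illustrative): nextafter(1.0, -1.0) is the largest value strictly less
// than 1.0 in the operand's format (0x3F7FFFFF, ~0.99999994 for f32). Clamping
// x - floor(x) to it matches the hardware fract result, which never reaches
// 1.0 even when the subtraction rounds up.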
2177
2178Value *AMDGPUCodeGenPrepareImpl::applyFractPat(IRBuilder<> &Builder,
2179 Value *FractArg) {
2180 SmallVector<Value *, 4> FractVals;
2181 extractValues(Builder, FractVals, FractArg);
2182
2183 SmallVector<Value *, 4> ResultVals(FractVals.size());
2184
2185 Type *Ty = FractArg->getType()->getScalarType();
2186 for (unsigned I = 0, E = FractVals.size(); I != E; ++I) {
2187 ResultVals[I] =
2188 Builder.CreateIntrinsic(Intrinsic::amdgcn_fract, {Ty}, {FractVals[I]});
2189 }
2190
2191 return insertValues(Builder, FractArg->getType(), ResultVals);
2192}
2193
2194bool AMDGPUCodeGenPrepareImpl::visitMinNum(IntrinsicInst &I) {
2195 Value *FractArg = matchFractPat(I);
2196 if (!FractArg)
2197 return false;
2198
2199 // Match pattern for fract intrinsic in contexts where the nan check has been
2200 // optimized out (and hope the knowledge the source can't be nan wasn't lost).
2201 if (!I.hasNoNaNs() &&
2202 !isKnownNeverNaN(FractArg, /*Depth=*/0, SimplifyQuery(DL, TLI)))
2203 return false;
2204
2205 IRBuilder<> Builder(&I);
2206 FastMathFlags FMF = I.getFastMathFlags();
2207 FMF.setNoNaNs();
2208 Builder.setFastMathFlags(FMF);
2209
2210 Value *Fract = applyFractPat(Builder, FractArg);
2211 Fract->takeName(&I);
2212 I.replaceAllUsesWith(Fract);
2213
2214 RecursivelyDeleteTriviallyDeadInstructions(&I, TLI);
2215 return true;
2216}
2217
2218static bool isOneOrNegOne(const Value *Val) {
2219 const APFloat *C;
2220 return match(Val, m_APFloat(C)) && C->getExactLog2Abs() == 0;
2221}
2222
2223// Expand llvm.sqrt.f32 calls with !fpmath metadata in a semi-fast way.
2224bool AMDGPUCodeGenPrepareImpl::visitSqrt(IntrinsicInst &Sqrt) {
2225 Type *Ty = Sqrt.getType()->getScalarType();
2226 if (!Ty->isFloatTy() && (!Ty->isHalfTy() || ST.has16BitInsts()))
2227 return false;
2228
2229 const FPMathOperator *FPOp = cast<const FPMathOperator>(&Sqrt);
2230 FastMathFlags SqrtFMF = FPOp->getFastMathFlags();
2231
2232 // We're trying to handle the fast-but-not-that-fast case only. The lowering
2233 // of fast llvm.sqrt will give the raw instruction anyway.
2234 if (SqrtFMF.approxFunc() || HasUnsafeFPMath)
2235 return false;
2236
2237 const float ReqdAccuracy = FPOp->getFPAccuracy();
2238
2239 // Defer correctly rounded expansion to codegen.
2240 if (ReqdAccuracy < 1.0f)
2241 return false;
2242
2243 // FIXME: This is an ugly hack for this pass using forward iteration instead
2244 // of reverse. If it worked like a normal combiner, the rsq would form before
2245 // we saw a sqrt call.
2246 auto *FDiv =
2247 dyn_cast_or_null<FPMathOperator>(Sqrt.getUniqueUndroppableUser());
2248 if (FDiv && FDiv->getOpcode() == Instruction::FDiv &&
2249 FDiv->getFPAccuracy() >= 1.0f &&
2250 canOptimizeWithRsq(FPOp, FDiv->getFastMathFlags(), SqrtFMF) &&
2251 // TODO: We should also handle the arcp case for the fdiv with non-1 value
2252 isOneOrNegOne(FDiv->getOperand(0)))
2253 return false;
2254
2255 Value *SrcVal = Sqrt.getOperand(0);
2256 bool CanTreatAsDAZ = canIgnoreDenormalInput(SrcVal, &Sqrt);
2257
2258 // The raw instruction is 1 ulp, but the correction for denormal handling
2259 // brings it to 2.
2260 if (!CanTreatAsDAZ && ReqdAccuracy < 2.0f)
2261 return false;
2262
2263 IRBuilder<> Builder(&Sqrt);
2264 SmallVector<Value *, 4> SrcVals;
2265 extractValues(Builder, SrcVals, SrcVal);
2266
2267 SmallVector<Value *, 4> ResultVals(SrcVals.size());
2268 for (int I = 0, E = SrcVals.size(); I != E; ++I) {
2269 if (CanTreatAsDAZ)
2270 ResultVals[I] = Builder.CreateCall(getSqrtF32(), SrcVals[I]);
2271 else
2272 ResultVals[I] = emitSqrtIEEE2ULP(Builder, SrcVals[I], SqrtFMF);
2273 }
2274
2275 Value *NewSqrt = insertValues(Builder, Sqrt.getType(), ResultVals);
2276 NewSqrt->takeName(&Sqrt);
2277 Sqrt.replaceAllUsesWith(NewSqrt);
2278 Sqrt.eraseFromParent();
2279 return true;
2280}
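// Illustrative example (assumed IR): a call such as
//   %r = call float @llvm.sqrt.f32(float %x), !fpmath !0   ; !0 = !{float 2.5}
// is rewritten either to llvm.amdgcn.sqrt.f32 directly (when denormal inputs
// can be ignored) or to the roughly 2 ulp IEEE expansion from emitSqrtIEEE2ULP.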
2281
2282bool AMDGPUCodeGenPrepare::runOnFunction(Function &F) {
2283 if (skipFunction(F))
2284 return false;
2285
2286 auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();
2287 if (!TPC)
2288 return false;
2289
2290 const AMDGPUTargetMachine &TM = TPC->getTM<AMDGPUTargetMachine>();
2291 const TargetLibraryInfo *TLI =
2292 &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F);
2293 AssumptionCache *AC =
2294 &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
2295 auto *DTWP = getAnalysisIfAvailable<DominatorTreeWrapperPass>();
2296 const DominatorTree *DT = DTWP ? &DTWP->getDomTree() : nullptr;
2297 const UniformityInfo &UA =
2298 getAnalysis<UniformityInfoWrapperPass>().getUniformityInfo();
2299 return AMDGPUCodeGenPrepareImpl(F, TM, TLI, AC, DT, UA).run();
2300}
2301
2302PreservedAnalyses AMDGPUCodeGenPreparePass::run(Function &F,
2303 FunctionAnalysisManager &FAM) {
2304 const AMDGPUTargetMachine &ATM = static_cast<const AMDGPUTargetMachine &>(TM);
2305 const TargetLibraryInfo *TLI = &FAM.getResult<TargetLibraryAnalysis>(F);
2306 AssumptionCache *AC = &FAM.getResult<AssumptionAnalysis>(F);
2307 const DominatorTree *DT = FAM.getCachedResult<DominatorTreeAnalysis>(F);
2308 const UniformityInfo &UA = FAM.getResult<UniformityInfoAnalysis>(F);
2309 AMDGPUCodeGenPrepareImpl Impl(F, ATM, TLI, AC, DT, UA);
2310 if (!Impl.run())
2311 return PreservedAnalyses::all();
2312 PreservedAnalyses PA = PreservedAnalyses::none();
2313 if (!Impl.FlowChanged)
2314 PA.preserveSet<CFGAnalyses>();
2315 return PA;
2316}
2317
2318INITIALIZE_PASS_BEGIN(AMDGPUCodeGenPrepare, DEBUG_TYPE,
2319 "AMDGPU IR optimizations", false, false)
2320INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
2321INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
2322INITIALIZE_PASS_DEPENDENCY(UniformityInfoWrapperPass)
2323INITIALIZE_PASS_END(AMDGPUCodeGenPrepare, DEBUG_TYPE, "AMDGPU IR optimizations",
2324 false, false)
2325
2326char AMDGPUCodeGenPrepare::ID = 0;
2327
2328FunctionPass *llvm::createAMDGPUCodeGenPreparePass() {
2329 return new AMDGPUCodeGenPrepare();
2330}