AMDGPUCodeGenPrepare.cpp
1//===-- AMDGPUCodeGenPrepare.cpp ------------------------------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file
10/// This pass does misc. AMDGPU optimizations on IR before instruction
11/// selection.
12//
13//===----------------------------------------------------------------------===//
14
15#include "AMDGPU.h"
16#include "AMDGPUTargetMachine.h"
17#include "SIModeRegisterDefaults.h"
18#include "llvm/Analysis/AssumptionCache.h"
19#include "llvm/Analysis/ConstantFolding.h"
20#include "llvm/Analysis/TargetLibraryInfo.h"
21#include "llvm/Analysis/UniformityAnalysis.h"
22#include "llvm/Analysis/ValueTracking.h"
23#include "llvm/CodeGen/TargetPassConfig.h"
24#include "llvm/IR/Dominators.h"
25#include "llvm/IR/IRBuilder.h"
26#include "llvm/IR/InstVisitor.h"
27#include "llvm/IR/IntrinsicsAMDGPU.h"
28#include "llvm/IR/PatternMatch.h"
29#include "llvm/InitializePasses.h"
30#include "llvm/Pass.h"
31#include "llvm/Support/KnownBits.h"
32#include "llvm/Transforms/Utils/IntegerDivision.h"
33#include "llvm/Transforms/Utils/Local.h"
34
35#define DEBUG_TYPE "amdgpu-codegenprepare"
36
37using namespace llvm;
38using namespace llvm::PatternMatch;
39
40namespace {
41
42static cl::opt<bool> WidenLoads(
43 "amdgpu-codegenprepare-widen-constant-loads",
44 cl::desc("Widen sub-dword constant address space loads in AMDGPUCodeGenPrepare"),
45 cl::ReallyHidden,
46 cl::init(false));
47
48static cl::opt<bool> Widen16BitOps(
49 "amdgpu-codegenprepare-widen-16-bit-ops",
50 cl::desc("Widen uniform 16-bit instructions to 32-bit in AMDGPUCodeGenPrepare"),
51 cl::ReallyHidden,
52 cl::init(true));
53
54static cl::opt<bool>
55 BreakLargePHIs("amdgpu-codegenprepare-break-large-phis",
56 cl::desc("Break large PHI nodes for DAGISel"),
57 cl::ReallyHidden, cl::init(true));
58
59static cl::opt<bool>
60 ForceBreakLargePHIs("amdgpu-codegenprepare-force-break-large-phis",
61 cl::desc("For testing purposes, always break large "
62 "PHIs even if it isn't profitable."),
63 cl::ReallyHidden, cl::init(false));
64
65static cl::opt<unsigned> BreakLargePHIsThreshold(
66 "amdgpu-codegenprepare-break-large-phis-threshold",
67 cl::desc("Minimum type size in bits for breaking large PHI nodes"),
68 cl::init(32), cl::ReallyHidden);
69
70static cl::opt<bool> UseMul24Intrin(
71 "amdgpu-codegenprepare-mul24",
72 cl::desc("Introduce mul24 intrinsics in AMDGPUCodeGenPrepare"),
73 cl::ReallyHidden,
74 cl::init(true));
75
76// Legalize 64-bit division by using the generic IR expansion.
77static cl::opt<bool> ExpandDiv64InIR(
78 "amdgpu-codegenprepare-expand-div64",
79 cl::desc("Expand 64-bit division in AMDGPUCodeGenPrepare"),
80 cl::ReallyHidden,
81 cl::init(false));
82
83// Leave all division operations as they are. This supersedes ExpandDiv64InIR
84// and is used for testing the legalizer.
85static cl::opt<bool> DisableIDivExpand(
86 "amdgpu-codegenprepare-disable-idiv-expansion",
87 cl::desc("Prevent expanding integer division in AMDGPUCodeGenPrepare"),
88 cl::ReallyHidden,
89 cl::init(false));
90
91// Disable processing of fdiv so we can better test the backend implementations.
92static cl::opt<bool> DisableFDivExpand(
93 "amdgpu-codegenprepare-disable-fdiv-expansion",
94 cl::desc("Prevent expanding floating point division in AMDGPUCodeGenPrepare"),
95 cl::ReallyHidden,
96 cl::init(false));
97
98class AMDGPUCodeGenPrepareImpl
99 : public InstVisitor<AMDGPUCodeGenPrepareImpl, bool> {
100public:
101 const GCNSubtarget *ST = nullptr;
102 const AMDGPUTargetMachine *TM = nullptr;
103 const TargetLibraryInfo *TLInfo = nullptr;
104 AssumptionCache *AC = nullptr;
105 DominatorTree *DT = nullptr;
106 UniformityInfo *UA = nullptr;
107 Module *Mod = nullptr;
108 const DataLayout *DL = nullptr;
109 bool HasUnsafeFPMath = false;
110 bool HasFP32DenormalFlush = false;
111 bool FlowChanged = false;
112 mutable Function *SqrtF32 = nullptr;
113 mutable Function *LdexpF32 = nullptr;
114
115 DenseMap<const PHINode *, bool> BreakPhiNodesCache;
116
117 Function *getSqrtF32() const {
118 if (SqrtF32)
119 return SqrtF32;
120
121 LLVMContext &Ctx = Mod->getContext();
122 SqrtF32 = Intrinsic::getDeclaration(Mod, Intrinsic::amdgcn_sqrt,
123 {Type::getFloatTy(Ctx)});
124 return SqrtF32;
125 }
126
127 Function *getLdexpF32() const {
128 if (LdexpF32)
129 return LdexpF32;
130
131 LLVMContext &Ctx = Mod->getContext();
132 LdexpF32 = Intrinsic::getDeclaration(
133 Mod, Intrinsic::ldexp, {Type::getFloatTy(Ctx), Type::getInt32Ty(Ctx)});
134 return LdexpF32;
135 }
136
137 bool canBreakPHINode(const PHINode &I);
138
143 /// \returns \p T's base element bit width.
144 unsigned getBaseElementBitWidth(const Type *T) const;
145
146 /// \returns Equivalent 32 bit integer type for given type \p T. For example,
147 /// if \p T is i7, then i32 is returned; if \p T is <3 x i12>, then <3 x i32>
148 /// is returned.
149 Type *getI32Ty(IRBuilder<> &B, const Type *T) const;
150
151 /// \returns True if binary operation \p I is a signed binary operation, false
152 /// otherwise.
153 bool isSigned(const BinaryOperator &I) const;
154
155 /// \returns True if the condition of 'select' operation \p I comes from a
156 /// signed 'icmp' operation, false otherwise.
157 bool isSigned(const SelectInst &I) const;
158
159 /// \returns True if type \p T needs to be promoted to 32 bit integer type,
160 /// false otherwise.
161 bool needsPromotionToI32(const Type *T) const;
162
163 /// Return true if \p T is a legal scalar floating point type.
164 bool isLegalFloatingTy(const Type *T) const;
165
166 /// Wrapper to pass all the arguments to computeKnownFPClass
167 KnownFPClass computeKnownFPClass(const Value *V, FPClassTest Interested,
168 const Instruction *CtxI) const {
169 return llvm::computeKnownFPClass(V, *DL, Interested, 0, TLInfo, AC, CtxI,
170 DT);
171 }
172
173 bool canIgnoreDenormalInput(const Value *V, const Instruction *CtxI) const {
174 return HasFP32DenormalFlush ||
175 computeKnownFPClass(V, fcSubnormal, CtxI).isKnownNeverSubnormal();
176 }
177
178 /// Promotes uniform binary operation \p I to equivalent 32 bit binary
179 /// operation.
180 ///
181 /// \details \p I's base element bit width must be greater than 1 and less
182 /// than or equal 16. Promotion is done by sign or zero extending operands to
183 /// 32 bits, replacing \p I with equivalent 32 bit binary operation, and
184 /// truncating the result of 32 bit binary operation back to \p I's original
185 /// type. Division operation is not promoted.
186 ///
187 /// \returns True if \p I is promoted to equivalent 32 bit binary operation,
188 /// false otherwise.
189 bool promoteUniformOpToI32(BinaryOperator &I) const;
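 // Illustrative sketch (hypothetical IR, assuming a uniform i16 add): the
 // promotion described above produces roughly
 //   %a32 = zext i16 %a to i32
 //   %b32 = zext i16 %b to i32
 //   %r32 = add nuw nsw i32 %a32, %b32
 //   %r   = trunc i32 %r32 to i16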
190
191 /// Promotes uniform 'icmp' operation \p I to 32 bit 'icmp' operation.
192 ///
193 /// \details \p I's base element bit width must be greater than 1 and less
194 /// than or equal 16. Promotion is done by sign or zero extending operands to
195 /// 32 bits, and replacing \p I with 32 bit 'icmp' operation.
196 ///
197 /// \returns True.
198 bool promoteUniformOpToI32(ICmpInst &I) const;
199
200 /// Promotes uniform 'select' operation \p I to 32 bit 'select'
201 /// operation.
202 ///
203 /// \details \p I's base element bit width must be greater than 1 and less
204 /// than or equal 16. Promotion is done by sign or zero extending operands to
205 /// 32 bits, replacing \p I with 32 bit 'select' operation, and truncating the
206 /// result of 32 bit 'select' operation back to \p I's original type.
207 ///
208 /// \returns True.
209 bool promoteUniformOpToI32(SelectInst &I) const;
210
211 /// Promotes uniform 'bitreverse' intrinsic \p I to 32 bit 'bitreverse'
212 /// intrinsic.
213 ///
214 /// \details \p I's base element bit width must be greater than 1 and less
215 /// than or equal 16. Promotion is done by zero extending the operand to 32
216 /// bits, replacing \p I with 32 bit 'bitreverse' intrinsic, shifting the
217 /// result of 32 bit 'bitreverse' intrinsic to the right with zero fill (the
218 /// shift amount is 32 minus \p I's base element bit width), and truncating
219 /// the result of the shift operation back to \p I's original type.
220 ///
221 /// \returns True.
222 bool promoteUniformBitreverseToI32(IntrinsicInst &I) const;
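 // Illustrative sketch (hypothetical IR, i16 case): the bitreverse promotion
 // described above turns
 //   %r = call i16 @llvm.bitreverse.i16(i16 %x)
 // into roughly
 //   %x32 = zext i16 %x to i32
 //   %r32 = call i32 @llvm.bitreverse.i32(i32 %x32)
 //   %s   = lshr i32 %r32, 16
 //   %r   = trunc i32 %s to i16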
223
224 /// \returns The minimum number of bits needed to store the value of \p Op as an
225 /// unsigned integer. Truncating to this size and then zero-extending to
226 /// the original will not change the value.
227 unsigned numBitsUnsigned(Value *Op) const;
228
229 /// \returns The minimum number of bits needed to store the value of \p Op as a
230 /// signed integer. Truncating to this size and then sign-extending to
231 /// the original size will not change the value.
232 unsigned numBitsSigned(Value *Op) const;
233
234 /// Replace mul instructions with llvm.amdgcn.mul.u24 or llvm.amdgcn.mul.s24.
235 /// SelectionDAG has an issue where an and asserting the bits are known
236 bool replaceMulWithMul24(BinaryOperator &I) const;
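 // Illustrative sketch (hypothetical IR): when both operands of a divergent
 // i32 mul are known to fit in 24 bits unsigned, the rewrite is roughly
 //   %r = mul i32 %a, %b
 //     -->
 //   %r = call i32 @llvm.amdgcn.mul.u24.i32(i32 %a, i32 %b)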
237
238 /// Perform same function as equivalently named function in DAGCombiner. Since
239 /// we expand some divisions here, we need to perform this before obscuring.
240 bool foldBinOpIntoSelect(BinaryOperator &I) const;
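 // Illustrative sketch (hypothetical IR, assuming a single-use select):
 //   %s = select i1 %c, i32 8, i32 16
 //   %d = udiv i32 %s, 4
 // is folded into the select arms as
 //   %d = select i1 %c, i32 2, i32 4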
241
242 bool divHasSpecialOptimization(BinaryOperator &I,
243 Value *Num, Value *Den) const;
244 int getDivNumBits(BinaryOperator &I,
245 Value *Num, Value *Den,
246 unsigned AtLeast, bool Signed) const;
247
248 /// Expands 24 bit div or rem.
249 Value* expandDivRem24(IRBuilder<> &Builder, BinaryOperator &I,
250 Value *Num, Value *Den,
251 bool IsDiv, bool IsSigned) const;
252
253 Value *expandDivRem24Impl(IRBuilder<> &Builder, BinaryOperator &I,
254 Value *Num, Value *Den, unsigned NumBits,
255 bool IsDiv, bool IsSigned) const;
256
257 /// Expands 32 bit div or rem.
258 Value* expandDivRem32(IRBuilder<> &Builder, BinaryOperator &I,
259 Value *Num, Value *Den) const;
260
261 Value *shrinkDivRem64(IRBuilder<> &Builder, BinaryOperator &I,
262 Value *Num, Value *Den) const;
263 void expandDivRem64(BinaryOperator &I) const;
264
265 /// Widen a scalar load.
266 ///
267 /// \details Widen a scalar load for uniform, small type loads from constant
268 /// memory to a full 32 bits and then truncate the result to allow a scalar
269 /// load instead of a vector load.
270 ///
271 /// \returns True.
272
273 bool canWidenScalarExtLoad(LoadInst &I) const;
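 // Illustrative sketch (hypothetical IR, assuming a uniform addrspace(4)
 // load): a naturally aligned sub-dword load such as
 //   %v = load i16, ptr addrspace(4) %p, align 4
 // is widened to
 //   %w = load i32, ptr addrspace(4) %p, align 4
 //   %v = trunc i32 %w to i16
 // so it can be selected as a scalar load.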
274
275 Value *matchFractPat(IntrinsicInst &I);
276 Value *applyFractPat(IRBuilder<> &Builder, Value *FractArg);
277
278 bool canOptimizeWithRsq(const FPMathOperator *SqrtOp, FastMathFlags DivFMF,
279 FastMathFlags SqrtFMF) const;
280
281 Value *optimizeWithRsq(IRBuilder<> &Builder, Value *Num, Value *Den,
282 FastMathFlags DivFMF, FastMathFlags SqrtFMF,
283 const Instruction *CtxI) const;
284
285 Value *optimizeWithRcp(IRBuilder<> &Builder, Value *Num, Value *Den,
286 FastMathFlags FMF, const Instruction *CtxI) const;
287 Value *optimizeWithFDivFast(IRBuilder<> &Builder, Value *Num, Value *Den,
288 float ReqdAccuracy) const;
289
290 Value *visitFDivElement(IRBuilder<> &Builder, Value *Num, Value *Den,
291 FastMathFlags DivFMF, FastMathFlags SqrtFMF,
292 Value *RsqOp, const Instruction *FDiv,
293 float ReqdAccuracy) const;
294
295 std::pair<Value *, Value *> getFrexpResults(IRBuilder<> &Builder,
296 Value *Src) const;
297
298 Value *emitRcpIEEE1ULP(IRBuilder<> &Builder, Value *Src,
299 bool IsNegative) const;
300 Value *emitFrexpDiv(IRBuilder<> &Builder, Value *LHS, Value *RHS,
301 FastMathFlags FMF) const;
302 Value *emitSqrtIEEE2ULP(IRBuilder<> &Builder, Value *Src,
303 FastMathFlags FMF) const;
304
305public:
306 bool visitFDiv(BinaryOperator &I);
307
308 bool visitInstruction(Instruction &I) { return false; }
309 bool visitBinaryOperator(BinaryOperator &I);
310 bool visitLoadInst(LoadInst &I);
311 bool visitICmpInst(ICmpInst &I);
312 bool visitSelectInst(SelectInst &I);
313 bool visitPHINode(PHINode &I);
315
316 bool visitIntrinsicInst(IntrinsicInst &I);
317 bool visitBitreverseIntrinsicInst(IntrinsicInst &I);
318 bool visitMinNum(IntrinsicInst &I);
319 bool visitSqrt(IntrinsicInst &I);
320 bool run(Function &F);
321};
322
323class AMDGPUCodeGenPrepare : public FunctionPass {
324private:
325 AMDGPUCodeGenPrepareImpl Impl;
326
327public:
328 static char ID;
329 AMDGPUCodeGenPrepare() : FunctionPass(ID) {
330 initializeAMDGPUCodeGenPreparePass(*PassRegistry::getPassRegistry());
331 }
332 void getAnalysisUsage(AnalysisUsage &AU) const override {
333 AU.addRequired<AssumptionCacheTracker>();
334 AU.addRequired<UniformityInfoWrapperPass>();
335 AU.addRequired<TargetLibraryInfoWrapperPass>();
336
337 // FIXME: Division expansion needs to preserve the dominator tree.
338 if (!ExpandDiv64InIR)
339 AU.setPreservesAll();
340 }
341 bool runOnFunction(Function &F) override;
342 bool doInitialization(Module &M) override;
343 StringRef getPassName() const override { return "AMDGPU IR optimizations"; }
344};
345
346} // end anonymous namespace
347
348bool AMDGPUCodeGenPrepareImpl::run(Function &F) {
349 BreakPhiNodesCache.clear();
350 bool MadeChange = false;
351
352 Function::iterator NextBB;
353 for (Function::iterator FI = F.begin(), FE = F.end(); FI != FE; FI = NextBB) {
354 BasicBlock *BB = &*FI;
355 NextBB = std::next(FI);
356
357 BasicBlock::iterator Next;
358 for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;
359 I = Next) {
360 Next = std::next(I);
361
362 MadeChange |= visit(*I);
363
364 if (Next != E) { // Control flow changed
365 BasicBlock *NextInstBB = Next->getParent();
366 if (NextInstBB != BB) {
367 BB = NextInstBB;
368 E = BB->end();
369 FE = F.end();
370 }
371 }
372 }
373 }
374 return MadeChange;
375}
376
377unsigned AMDGPUCodeGenPrepareImpl::getBaseElementBitWidth(const Type *T) const {
378 assert(needsPromotionToI32(T) && "T does not need promotion to i32");
379
380 if (T->isIntegerTy())
381 return T->getIntegerBitWidth();
382 return cast<VectorType>(T)->getElementType()->getIntegerBitWidth();
383}
384
385Type *AMDGPUCodeGenPrepareImpl::getI32Ty(IRBuilder<> &B, const Type *T) const {
386 assert(needsPromotionToI32(T) && "T does not need promotion to i32");
387
388 if (T->isIntegerTy())
389 return B.getInt32Ty();
390 return FixedVectorType::get(B.getInt32Ty(), cast<FixedVectorType>(T));
391}
392
393bool AMDGPUCodeGenPrepareImpl::isSigned(const BinaryOperator &I) const {
394 return I.getOpcode() == Instruction::AShr ||
395 I.getOpcode() == Instruction::SDiv || I.getOpcode() == Instruction::SRem;
396}
397
398bool AMDGPUCodeGenPrepareImpl::isSigned(const SelectInst &I) const {
399 return isa<ICmpInst>(I.getOperand(0)) ?
400 cast<ICmpInst>(I.getOperand(0))->isSigned() : false;
401}
402
403bool AMDGPUCodeGenPrepareImpl::needsPromotionToI32(const Type *T) const {
404 if (!Widen16BitOps)
405 return false;
406
407 const IntegerType *IntTy = dyn_cast<IntegerType>(T);
408 if (IntTy && IntTy->getBitWidth() > 1 && IntTy->getBitWidth() <= 16)
409 return true;
410
411 if (const VectorType *VT = dyn_cast<VectorType>(T)) {
412 // TODO: The set of packed operations is more limited, so may want to
413 // promote some anyway.
414 if (ST->hasVOP3PInsts())
415 return false;
416
417 return needsPromotionToI32(VT->getElementType());
418 }
419
420 return false;
421}
422
423bool AMDGPUCodeGenPrepareImpl::isLegalFloatingTy(const Type *Ty) const {
424 return Ty->isFloatTy() || Ty->isDoubleTy() ||
425 (Ty->isHalfTy() && ST->has16BitInsts());
426}
427
428// Return true if the op promoted to i32 should have nsw set.
429static bool promotedOpIsNSW(const Instruction &I) {
430 switch (I.getOpcode()) {
431 case Instruction::Shl:
432 case Instruction::Add:
433 case Instruction::Sub:
434 return true;
435 case Instruction::Mul:
436 return I.hasNoUnsignedWrap();
437 default:
438 return false;
439 }
440}
441
442// Return true if the op promoted to i32 should have nuw set.
443static bool promotedOpIsNUW(const Instruction &I) {
444 switch (I.getOpcode()) {
445 case Instruction::Shl:
446 case Instruction::Add:
447 case Instruction::Mul:
448 return true;
449 case Instruction::Sub:
450 return I.hasNoUnsignedWrap();
451 default:
452 return false;
453 }
454}
455
456bool AMDGPUCodeGenPrepareImpl::canWidenScalarExtLoad(LoadInst &I) const {
457 Type *Ty = I.getType();
458 const DataLayout &DL = Mod->getDataLayout();
459 int TySize = DL.getTypeSizeInBits(Ty);
460 Align Alignment = DL.getValueOrABITypeAlignment(I.getAlign(), Ty);
461
462 return I.isSimple() && TySize < 32 && Alignment >= 4 && UA->isUniform(&I);
463}
464
465bool AMDGPUCodeGenPrepareImpl::promoteUniformOpToI32(BinaryOperator &I) const {
466 assert(needsPromotionToI32(I.getType()) &&
467 "I does not need promotion to i32");
468
469 if (I.getOpcode() == Instruction::SDiv ||
470 I.getOpcode() == Instruction::UDiv ||
471 I.getOpcode() == Instruction::SRem ||
472 I.getOpcode() == Instruction::URem)
473 return false;
474
475 IRBuilder<> Builder(&I);
476 Builder.SetCurrentDebugLocation(I.getDebugLoc());
477
478 Type *I32Ty = getI32Ty(Builder, I.getType());
479 Value *ExtOp0 = nullptr;
480 Value *ExtOp1 = nullptr;
481 Value *ExtRes = nullptr;
482 Value *TruncRes = nullptr;
483
484 if (isSigned(I)) {
485 ExtOp0 = Builder.CreateSExt(I.getOperand(0), I32Ty);
486 ExtOp1 = Builder.CreateSExt(I.getOperand(1), I32Ty);
487 } else {
488 ExtOp0 = Builder.CreateZExt(I.getOperand(0), I32Ty);
489 ExtOp1 = Builder.CreateZExt(I.getOperand(1), I32Ty);
490 }
491
492 ExtRes = Builder.CreateBinOp(I.getOpcode(), ExtOp0, ExtOp1);
493 if (Instruction *Inst = dyn_cast<Instruction>(ExtRes)) {
494 if (promotedOpIsNSW(cast<Instruction>(I)))
495 Inst->setHasNoSignedWrap();
496
497 if (promotedOpIsNUW(cast<Instruction>(I)))
498 Inst->setHasNoUnsignedWrap();
499
500 if (const auto *ExactOp = dyn_cast<PossiblyExactOperator>(&I))
501 Inst->setIsExact(ExactOp->isExact());
502 }
503
504 TruncRes = Builder.CreateTrunc(ExtRes, I.getType());
505
506 I.replaceAllUsesWith(TruncRes);
507 I.eraseFromParent();
508
509 return true;
510}
511
512bool AMDGPUCodeGenPrepareImpl::promoteUniformOpToI32(ICmpInst &I) const {
513 assert(needsPromotionToI32(I.getOperand(0)->getType()) &&
514 "I does not need promotion to i32");
515
516 IRBuilder<> Builder(&I);
517 Builder.SetCurrentDebugLocation(I.getDebugLoc());
518
519 Type *I32Ty = getI32Ty(Builder, I.getOperand(0)->getType());
520 Value *ExtOp0 = nullptr;
521 Value *ExtOp1 = nullptr;
522 Value *NewICmp = nullptr;
523
524 if (I.isSigned()) {
525 ExtOp0 = Builder.CreateSExt(I.getOperand(0), I32Ty);
526 ExtOp1 = Builder.CreateSExt(I.getOperand(1), I32Ty);
527 } else {
528 ExtOp0 = Builder.CreateZExt(I.getOperand(0), I32Ty);
529 ExtOp1 = Builder.CreateZExt(I.getOperand(1), I32Ty);
530 }
531 NewICmp = Builder.CreateICmp(I.getPredicate(), ExtOp0, ExtOp1);
532
533 I.replaceAllUsesWith(NewICmp);
534 I.eraseFromParent();
535
536 return true;
537}
538
539bool AMDGPUCodeGenPrepareImpl::promoteUniformOpToI32(SelectInst &I) const {
540 assert(needsPromotionToI32(I.getType()) &&
541 "I does not need promotion to i32");
542
543 IRBuilder<> Builder(&I);
544 Builder.SetCurrentDebugLocation(I.getDebugLoc());
545
546 Type *I32Ty = getI32Ty(Builder, I.getType());
547 Value *ExtOp1 = nullptr;
548 Value *ExtOp2 = nullptr;
549 Value *ExtRes = nullptr;
550 Value *TruncRes = nullptr;
551
552 if (isSigned(I)) {
553 ExtOp1 = Builder.CreateSExt(I.getOperand(1), I32Ty);
554 ExtOp2 = Builder.CreateSExt(I.getOperand(2), I32Ty);
555 } else {
556 ExtOp1 = Builder.CreateZExt(I.getOperand(1), I32Ty);
557 ExtOp2 = Builder.CreateZExt(I.getOperand(2), I32Ty);
558 }
559 ExtRes = Builder.CreateSelect(I.getOperand(0), ExtOp1, ExtOp2);
560 TruncRes = Builder.CreateTrunc(ExtRes, I.getType());
561
562 I.replaceAllUsesWith(TruncRes);
563 I.eraseFromParent();
564
565 return true;
566}
567
568bool AMDGPUCodeGenPrepareImpl::promoteUniformBitreverseToI32(
569 IntrinsicInst &I) const {
570 assert(I.getIntrinsicID() == Intrinsic::bitreverse &&
571 "I must be bitreverse intrinsic");
572 assert(needsPromotionToI32(I.getType()) &&
573 "I does not need promotion to i32");
574
575 IRBuilder<> Builder(&I);
576 Builder.SetCurrentDebugLocation(I.getDebugLoc());
577
578 Type *I32Ty = getI32Ty(Builder, I.getType());
579 Function *I32 =
580 Intrinsic::getDeclaration(Mod, Intrinsic::bitreverse, { I32Ty });
581 Value *ExtOp = Builder.CreateZExt(I.getOperand(0), I32Ty);
582 Value *ExtRes = Builder.CreateCall(I32, { ExtOp });
583 Value *LShrOp =
584 Builder.CreateLShr(ExtRes, 32 - getBaseElementBitWidth(I.getType()));
585 Value *TruncRes =
586 Builder.CreateTrunc(LShrOp, I.getType());
587
588 I.replaceAllUsesWith(TruncRes);
589 I.eraseFromParent();
590
591 return true;
592}
593
594unsigned AMDGPUCodeGenPrepareImpl::numBitsUnsigned(Value *Op) const {
595 return computeKnownBits(Op, *DL, 0, AC).countMaxActiveBits();
596}
597
598unsigned AMDGPUCodeGenPrepareImpl::numBitsSigned(Value *Op) const {
599 return ComputeMaxSignificantBits(Op, *DL, 0, AC);
600}
601
602static void extractValues(IRBuilder<> &Builder,
603 SmallVectorImpl<Value *> &Values, Value *V) {
604 auto *VT = dyn_cast<FixedVectorType>(V->getType());
605 if (!VT) {
606 Values.push_back(V);
607 return;
608 }
609
610 for (int I = 0, E = VT->getNumElements(); I != E; ++I)
611 Values.push_back(Builder.CreateExtractElement(V, I));
612}
613
614static Value *insertValues(IRBuilder<> &Builder,
615 Type *Ty,
616 SmallVectorImpl<Value *> &Values) {
617 if (!Ty->isVectorTy()) {
618 assert(Values.size() == 1);
619 return Values[0];
620 }
621
622 Value *NewVal = PoisonValue::get(Ty);
623 for (int I = 0, E = Values.size(); I != E; ++I)
624 NewVal = Builder.CreateInsertElement(NewVal, Values[I], I);
625
626 return NewVal;
627}
628
629bool AMDGPUCodeGenPrepareImpl::replaceMulWithMul24(BinaryOperator &I) const {
630 if (I.getOpcode() != Instruction::Mul)
631 return false;
632
633 Type *Ty = I.getType();
634 unsigned Size = Ty->getScalarSizeInBits();
635 if (Size <= 16 && ST->has16BitInsts())
636 return false;
637
638 // Prefer scalar if this could be s_mul_i32
639 if (UA->isUniform(&I))
640 return false;
641
642 Value *LHS = I.getOperand(0);
643 Value *RHS = I.getOperand(1);
644 IRBuilder<> Builder(&I);
645 Builder.SetCurrentDebugLocation(I.getDebugLoc());
646
647 unsigned LHSBits = 0, RHSBits = 0;
648 bool IsSigned = false;
649
650 if (ST->hasMulU24() && (LHSBits = numBitsUnsigned(LHS)) <= 24 &&
651 (RHSBits = numBitsUnsigned(RHS)) <= 24) {
652 IsSigned = false;
653
654 } else if (ST->hasMulI24() && (LHSBits = numBitsSigned(LHS)) <= 24 &&
655 (RHSBits = numBitsSigned(RHS)) <= 24) {
656 IsSigned = true;
657
658 } else
659 return false;
660
661 SmallVector<Value *, 4> LHSVals;
662 SmallVector<Value *, 4> RHSVals;
663 SmallVector<Value *, 4> ResultVals;
664 extractValues(Builder, LHSVals, LHS);
665 extractValues(Builder, RHSVals, RHS);
666
667 IntegerType *I32Ty = Builder.getInt32Ty();
668 IntegerType *IntrinTy = Size > 32 ? Builder.getInt64Ty() : I32Ty;
669 Type *DstTy = LHSVals[0]->getType();
670
671 for (int I = 0, E = LHSVals.size(); I != E; ++I) {
672 Value *LHS = IsSigned ? Builder.CreateSExtOrTrunc(LHSVals[I], I32Ty)
673 : Builder.CreateZExtOrTrunc(LHSVals[I], I32Ty);
674 Value *RHS = IsSigned ? Builder.CreateSExtOrTrunc(RHSVals[I], I32Ty)
675 : Builder.CreateZExtOrTrunc(RHSVals[I], I32Ty);
676 Intrinsic::ID ID =
677 IsSigned ? Intrinsic::amdgcn_mul_i24 : Intrinsic::amdgcn_mul_u24;
678 Value *Result = Builder.CreateIntrinsic(ID, {IntrinTy}, {LHS, RHS});
679 Result = IsSigned ? Builder.CreateSExtOrTrunc(Result, DstTy)
680 : Builder.CreateZExtOrTrunc(Result, DstTy);
681 ResultVals.push_back(Result);
682 }
683
684 Value *NewVal = insertValues(Builder, Ty, ResultVals);
685 NewVal->takeName(&I);
686 I.replaceAllUsesWith(NewVal);
687 I.eraseFromParent();
688
689 return true;
690}
691
692// Find a select instruction, which may have been cast. This is mostly to deal
693// with cases where i16 selects were promoted here to i32.
694static SelectInst *findSelectThroughCast(Value *V, CastInst *&Cast) {
695 Cast = nullptr;
696 if (SelectInst *Sel = dyn_cast<SelectInst>(V))
697 return Sel;
698
699 if ((Cast = dyn_cast<CastInst>(V))) {
700 if (SelectInst *Sel = dyn_cast<SelectInst>(Cast->getOperand(0)))
701 return Sel;
702 }
703
704 return nullptr;
705}
706
707bool AMDGPUCodeGenPrepareImpl::foldBinOpIntoSelect(BinaryOperator &BO) const {
708 // Don't do this unless the old select is going away. We want to eliminate the
709 // binary operator, not replace a binop with a select.
710 int SelOpNo = 0;
711
712 CastInst *CastOp;
713
714 // TODO: Should probably try to handle some cases with multiple
715 // users. Duplicating the select may be profitable for division.
716 SelectInst *Sel = findSelectThroughCast(BO.getOperand(0), CastOp);
717 if (!Sel || !Sel->hasOneUse()) {
718 SelOpNo = 1;
719 Sel = findSelectThroughCast(BO.getOperand(1), CastOp);
720 }
721
722 if (!Sel || !Sel->hasOneUse())
723 return false;
724
725 Constant *CT = dyn_cast<Constant>(Sel->getTrueValue());
726 Constant *CF = dyn_cast<Constant>(Sel->getFalseValue());
727 Constant *CBO = dyn_cast<Constant>(BO.getOperand(SelOpNo ^ 1));
728 if (!CBO || !CT || !CF)
729 return false;
730
731 if (CastOp) {
732 if (!CastOp->hasOneUse())
733 return false;
734 CT = ConstantFoldCastOperand(CastOp->getOpcode(), CT, BO.getType(), *DL);
735 CF = ConstantFoldCastOperand(CastOp->getOpcode(), CF, BO.getType(), *DL);
736 }
737
738 // TODO: Handle special 0/-1 cases DAG combine does, although we only really
739 // need to handle divisions here.
740 Constant *FoldedT = SelOpNo ?
741 ConstantFoldBinaryOpOperands(BO.getOpcode(), CBO, CT, *DL) :
742 ConstantFoldBinaryOpOperands(BO.getOpcode(), CT, CBO, *DL);
743 if (!FoldedT || isa<ConstantExpr>(FoldedT))
744 return false;
745
746 Constant *FoldedF = SelOpNo ?
747 ConstantFoldBinaryOpOperands(BO.getOpcode(), CBO, CF, *DL) :
748 ConstantFoldBinaryOpOperands(BO.getOpcode(), CF, CBO, *DL);
749 if (!FoldedF || isa<ConstantExpr>(FoldedF))
750 return false;
751
752 IRBuilder<> Builder(&BO);
753 Builder.SetCurrentDebugLocation(BO.getDebugLoc());
754 if (const FPMathOperator *FPOp = dyn_cast<const FPMathOperator>(&BO))
755 Builder.setFastMathFlags(FPOp->getFastMathFlags());
756
757 Value *NewSelect = Builder.CreateSelect(Sel->getCondition(),
758 FoldedT, FoldedF);
759 NewSelect->takeName(&BO);
760 BO.replaceAllUsesWith(NewSelect);
761 BO.eraseFromParent();
762 if (CastOp)
763 CastOp->eraseFromParent();
764 Sel->eraseFromParent();
765 return true;
766}
767
768std::pair<Value *, Value *>
769AMDGPUCodeGenPrepareImpl::getFrexpResults(IRBuilder<> &Builder,
770 Value *Src) const {
771 Type *Ty = Src->getType();
772 Value *Frexp = Builder.CreateIntrinsic(Intrinsic::frexp,
773 {Ty, Builder.getInt32Ty()}, Src);
774 Value *FrexpMant = Builder.CreateExtractValue(Frexp, {0});
775
776 // Bypass the bug workaround for the exponent result since it doesn't matter.
777 // TODO: Does the bug workaround even really need to consider the exponent
778 // result? It's unspecified by the spec.
779
780 Value *FrexpExp =
781 ST->hasFractBug()
782 ? Builder.CreateIntrinsic(Intrinsic::amdgcn_frexp_exp,
783 {Builder.getInt32Ty(), Ty}, Src)
784 : Builder.CreateExtractValue(Frexp, {1});
785 return {FrexpMant, FrexpExp};
786}
787
788/// Emit an expansion of 1.0 / Src good for 1ulp that supports denormals.
789Value *AMDGPUCodeGenPrepareImpl::emitRcpIEEE1ULP(IRBuilder<> &Builder,
790 Value *Src,
791 bool IsNegative) const {
792 // Same as for 1.0, but expand the sign out of the constant.
793 // -1.0 / x -> rcp (fneg x)
794 if (IsNegative)
795 Src = Builder.CreateFNeg(Src);
796
797 // The rcp instruction doesn't support denormals, so scale the input
798 // out of the denormal range and convert at the end.
799 //
800 // Expand as 2^-n * (1.0 / (x * 2^n))
801
802 // TODO: Skip scaling if input is known never denormal and the input
803 // range won't underflow to denormal. The hard part is knowing the
804 // result. We need a range check, the result could be denormal for
805 // 0x1p+126 < den <= 0x1p+127.
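 // Illustrative worked example (assumed value): for x = m * 2^e from frexp,
 // with m in [0.5, 1.0), the expansion computes ldexp(rcp(m), -e) = (1/m) * 2^-e
 // = 1/x. E.g. x = 0x1p-130 gives m = 0.5, e = -129, so the result is
 // ldexp(rcp(0.5), 129) = 2.0 * 2^129 = 0x1p+130.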
806 auto [FrexpMant, FrexpExp] = getFrexpResults(Builder, Src);
807 Value *ScaleFactor = Builder.CreateNeg(FrexpExp);
808 Value *Rcp = Builder.CreateUnaryIntrinsic(Intrinsic::amdgcn_rcp, FrexpMant);
809 return Builder.CreateCall(getLdexpF32(), {Rcp, ScaleFactor});
810}
811
812/// Emit a 2ulp expansion for fdiv by using frexp for input scaling.
813Value *AMDGPUCodeGenPrepareImpl::emitFrexpDiv(IRBuilder<> &Builder, Value *LHS,
814 Value *RHS,
815 FastMathFlags FMF) const {
816 // If we have to work around the fract/frexp bug, we're worse off than
817 // using the fdiv.fast expansion. The full safe expansion is faster if we have
818 // fast FMA.
819 if (HasFP32DenormalFlush && ST->hasFractBug() && !ST->hasFastFMAF32() &&
820 (!FMF.noNaNs() || !FMF.noInfs()))
821 return nullptr;
822
823 // We're scaling the LHS to avoid a denormal input, and scale the denominator
824 // to avoid large values underflowing the result.
825 auto [FrexpMantRHS, FrexpExpRHS] = getFrexpResults(Builder, RHS);
826
827 Value *Rcp =
828 Builder.CreateUnaryIntrinsic(Intrinsic::amdgcn_rcp, FrexpMantRHS);
829
830 auto [FrexpMantLHS, FrexpExpLHS] = getFrexpResults(Builder, LHS);
831 Value *Mul = Builder.CreateFMul(FrexpMantLHS, Rcp);
832
833 // We multiplied by 2^N/2^M, so we need to multiply by 2^(N-M) to scale the
834 // result.
835 Value *ExpDiff = Builder.CreateSub(FrexpExpLHS, FrexpExpRHS);
836 return Builder.CreateCall(getLdexpF32(), {Mul, ExpDiff});
837}
838
839/// Emit a sqrt that handles denormals and is accurate to 2ulp.
840Value *AMDGPUCodeGenPrepareImpl::emitSqrtIEEE2ULP(IRBuilder<> &Builder,
841 Value *Src,
842 FastMathFlags FMF) const {
843 Type *Ty = Src->getType();
844 APFloat SmallestNormal =
845 APFloat::getSmallestNormalized(Ty->getFltSemantics());
846 Value *NeedScale =
847 Builder.CreateFCmpOLT(Src, ConstantFP::get(Ty, SmallestNormal));
848
849 ConstantInt *Zero = Builder.getInt32(0);
850 Value *InputScaleFactor =
851 Builder.CreateSelect(NeedScale, Builder.getInt32(32), Zero);
852
853 Value *Scaled = Builder.CreateCall(getLdexpF32(), {Src, InputScaleFactor});
854
855 Value *Sqrt = Builder.CreateCall(getSqrtF32(), Scaled);
856
857 Value *OutputScaleFactor =
858 Builder.CreateSelect(NeedScale, Builder.getInt32(-16), Zero);
859 return Builder.CreateCall(getLdexpF32(), {Sqrt, OutputScaleFactor});
860}
861
862/// Emit an expansion of 1.0 / sqrt(Src) good for 1ulp that supports denormals.
863static Value *emitRsqIEEE1ULP(IRBuilder<> &Builder, Value *Src,
864 bool IsNegative) {
865 // bool need_scale = x < 0x1p-126f;
866 // float input_scale = need_scale ? 0x1.0p+24f : 1.0f;
867 // float output_scale = need_scale ? 0x1.0p+12f : 1.0f;
868 // rsq(x * input_scale) * output_scale;
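 // Illustrative check of the scale factors above: if x is multiplied by 2^24,
 // then rsq(x * 2^24) = rsq(x) / 2^12, so multiplying the result by 2^12
 // (0x1.0p+12f) restores the unscaled value.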
869
870 Type *Ty = Src->getType();
871 APFloat SmallestNormal =
872 APFloat::getSmallestNormalized(Ty->getFltSemantics());
873 Value *NeedScale =
874 Builder.CreateFCmpOLT(Src, ConstantFP::get(Ty, SmallestNormal));
875 Constant *One = ConstantFP::get(Ty, 1.0);
876 Constant *InputScale = ConstantFP::get(Ty, 0x1.0p+24);
877 Constant *OutputScale =
878 ConstantFP::get(Ty, IsNegative ? -0x1.0p+12 : 0x1.0p+12);
879
880 Value *InputScaleFactor = Builder.CreateSelect(NeedScale, InputScale, One);
881
882 Value *ScaledInput = Builder.CreateFMul(Src, InputScaleFactor);
883 Value *Rsq = Builder.CreateUnaryIntrinsic(Intrinsic::amdgcn_rsq, ScaledInput);
884 Value *OutputScaleFactor = Builder.CreateSelect(
885 NeedScale, OutputScale, IsNegative ? ConstantFP::get(Ty, -1.0) : One);
886
887 return Builder.CreateFMul(Rsq, OutputScaleFactor);
888}
889
890bool AMDGPUCodeGenPrepareImpl::canOptimizeWithRsq(const FPMathOperator *SqrtOp,
891 FastMathFlags DivFMF,
892 FastMathFlags SqrtFMF) const {
893 // The rsqrt contraction increases accuracy from ~2ulp to ~1ulp.
894 if (!DivFMF.allowContract() || !SqrtFMF.allowContract())
895 return false;
896
897 // v_rsq_f32 gives 1ulp
898 return SqrtFMF.approxFunc() || HasUnsafeFPMath ||
899 SqrtOp->getFPAccuracy() >= 1.0f;
900}
901
902Value *AMDGPUCodeGenPrepareImpl::optimizeWithRsq(
903 IRBuilder<> &Builder, Value *Num, Value *Den, const FastMathFlags DivFMF,
904 const FastMathFlags SqrtFMF, const Instruction *CtxI) const {
905 // The rsqrt contraction increases accuracy from ~2ulp to ~1ulp.
906 assert(DivFMF.allowContract() && SqrtFMF.allowContract());
907
908 // rsq_f16 is accurate to 0.51 ulp.
909 // rsq_f32 is accurate for !fpmath >= 1.0ulp and denormals are flushed.
910 // rsq_f64 is never accurate.
911 const ConstantFP *CLHS = dyn_cast<ConstantFP>(Num);
912 if (!CLHS)
913 return nullptr;
914
915 assert(Den->getType()->isFloatTy());
916
917 bool IsNegative = false;
918
919 // TODO: Handle other numerator values with arcp.
920 if (CLHS->isExactlyValue(1.0) || (IsNegative = CLHS->isExactlyValue(-1.0))) {
921 // Add in the sqrt flags.
922 IRBuilder<>::FastMathFlagGuard Guard(Builder);
923 Builder.setFastMathFlags(DivFMF | SqrtFMF);
924
925 if ((DivFMF.approxFunc() && SqrtFMF.approxFunc()) || HasUnsafeFPMath ||
926 canIgnoreDenormalInput(Den, CtxI)) {
927 Value *Result = Builder.CreateUnaryIntrinsic(Intrinsic::amdgcn_rsq, Den);
928 // -1.0 / sqrt(x) -> fneg(rsq(x))
929 return IsNegative ? Builder.CreateFNeg(Result) : Result;
930 }
931
932 return emitRsqIEEE1ULP(Builder, Den, IsNegative);
933 }
934
935 return nullptr;
936}
937
938// Optimize fdiv with rcp:
939//
940// 1/x -> rcp(x) when rcp is sufficiently accurate or inaccurate rcp is
941// allowed with unsafe-fp-math or afn.
942//
943 // a/b -> a*rcp(b) when arcp is allowed, and we only need to provide 1.0 ULP
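// Illustrative sketch (hypothetical IR, assuming arcp and flushed f32
// denormals): the a/b case below becomes roughly
//   %r = fdiv arcp float %a, %b
//   -->
//   %rcp = call float @llvm.amdgcn.rcp.f32(float %b)
//   %r   = fmul arcp float %a, %rcp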
944Value *
945AMDGPUCodeGenPrepareImpl::optimizeWithRcp(IRBuilder<> &Builder, Value *Num,
946 Value *Den, FastMathFlags FMF,
947 const Instruction *CtxI) const {
948 // rcp_f16 is accurate to 0.51 ulp.
949 // rcp_f32 is accurate for !fpmath >= 1.0ulp and denormals are flushed.
950 // rcp_f64 is never accurate.
951 assert(Den->getType()->isFloatTy());
952
953 if (const ConstantFP *CLHS = dyn_cast<ConstantFP>(Num)) {
954 bool IsNegative = false;
955 if (CLHS->isExactlyValue(1.0) ||
956 (IsNegative = CLHS->isExactlyValue(-1.0))) {
957 Value *Src = Den;
958
959 if (HasFP32DenormalFlush || FMF.approxFunc()) {
960 // -1.0 / x -> 1.0 / fneg(x)
961 if (IsNegative)
962 Src = Builder.CreateFNeg(Src);
963
964 // v_rcp_f32 and v_rsq_f32 do not support denormals, and according to
965 // the CI documentation has a worst case error of 1 ulp.
966 // OpenCL requires <= 2.5 ulp for 1.0 / x, so it should always be OK
967 // to use it as long as we aren't trying to use denormals.
968 //
969 // v_rcp_f16 and v_rsq_f16 DO support denormals.
970
971 // NOTE: v_sqrt and v_rcp will be combined to v_rsq later. So we don't
972 // insert rsq intrinsic here.
973
974 // 1.0 / x -> rcp(x)
975 return Builder.CreateUnaryIntrinsic(Intrinsic::amdgcn_rcp, Src);
976 }
977
978 // TODO: If the input isn't denormal, and we know the input exponent isn't
979 // big enough to introduce a denormal we can avoid the scaling.
980 return emitRcpIEEE1ULP(Builder, Src, IsNegative);
981 }
982 }
983
984 if (FMF.allowReciprocal()) {
985 // x / y -> x * (1.0 / y)
986
987 // TODO: Could avoid denormal scaling and use raw rcp if we knew the output
988 // will never underflow.
989 if (HasFP32DenormalFlush || FMF.approxFunc()) {
990 Value *Recip = Builder.CreateUnaryIntrinsic(Intrinsic::amdgcn_rcp, Den);
991 return Builder.CreateFMul(Num, Recip);
992 }
993
994 Value *Recip = emitRcpIEEE1ULP(Builder, Den, false);
995 return Builder.CreateFMul(Num, Recip);
996 }
997
998 return nullptr;
999}
1000
1001// optimize with fdiv.fast:
1002//
1003// a/b -> fdiv.fast(a, b) when !fpmath >= 2.5ulp with denormals flushed.
1004//
1005// 1/x -> fdiv.fast(1,x) when !fpmath >= 2.5ulp.
1006//
1007// NOTE: optimizeWithRcp should be tried first because rcp is the preference.
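// Illustrative sketch (hypothetical IR, assuming !fpmath of 2.5 and flushed
// f32 denormals):
//   %r = fdiv float %a, %b, !fpmath !0   ; !0 = !{float 2.5}
//   -->
//   %r = call float @llvm.amdgcn.fdiv.fast(float %a, float %b)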
1008Value *AMDGPUCodeGenPrepareImpl::optimizeWithFDivFast(
1009 IRBuilder<> &Builder, Value *Num, Value *Den, float ReqdAccuracy) const {
1010 // fdiv.fast can achieve 2.5 ULP accuracy.
1011 if (ReqdAccuracy < 2.5f)
1012 return nullptr;
1013
1014 // Only have fdiv.fast for f32.
1015 assert(Den->getType()->isFloatTy());
1016
1017 bool NumIsOne = false;
1018 if (const ConstantFP *CNum = dyn_cast<ConstantFP>(Num)) {
1019 if (CNum->isExactlyValue(+1.0) || CNum->isExactlyValue(-1.0))
1020 NumIsOne = true;
1021 }
1022
1023 // fdiv does not support denormals. But 1.0/x is always fine to use it.
1024 //
1025 // TODO: This works for any value with a specific known exponent range, don't
1026 // just limit to constant 1.
1027 if (!HasFP32DenormalFlush && !NumIsOne)
1028 return nullptr;
1029
1030 return Builder.CreateIntrinsic(Intrinsic::amdgcn_fdiv_fast, {}, {Num, Den});
1031}
1032
1033Value *AMDGPUCodeGenPrepareImpl::visitFDivElement(
1034 IRBuilder<> &Builder, Value *Num, Value *Den, FastMathFlags DivFMF,
1035 FastMathFlags SqrtFMF, Value *RsqOp, const Instruction *FDivInst,
1036 float ReqdDivAccuracy) const {
1037 if (RsqOp) {
1038 Value *Rsq =
1039 optimizeWithRsq(Builder, Num, RsqOp, DivFMF, SqrtFMF, FDivInst);
1040 if (Rsq)
1041 return Rsq;
1042 }
1043
1044 Value *Rcp = optimizeWithRcp(Builder, Num, Den, DivFMF, FDivInst);
1045 if (Rcp)
1046 return Rcp;
1047
1048 // In the basic case fdiv_fast has the same instruction count as the frexp div
1049 // expansion. Slightly prefer fdiv_fast since it ends in an fmul that can
1050 // potentially be fused into a user. Also, materialization of the constants
1051 // can be reused for multiple instances.
1052 Value *FDivFast = optimizeWithFDivFast(Builder, Num, Den, ReqdDivAccuracy);
1053 if (FDivFast)
1054 return FDivFast;
1055
1056 return emitFrexpDiv(Builder, Num, Den, DivFMF);
1057}
1058
1059 // Optimization is performed based on fpmath, fast math flags as well as
1060// denormals to optimize fdiv with either rcp or fdiv.fast.
1061//
1062// With rcp:
1063// 1/x -> rcp(x) when rcp is sufficiently accurate or inaccurate rcp is
1064// allowed with unsafe-fp-math or afn.
1065//
1066// a/b -> a*rcp(b) when inaccurate rcp is allowed with unsafe-fp-math or afn.
1067//
1068// With fdiv.fast:
1069// a/b -> fdiv.fast(a, b) when !fpmath >= 2.5ulp with denormals flushed.
1070//
1071// 1/x -> fdiv.fast(1,x) when !fpmath >= 2.5ulp.
1072//
1073// NOTE: rcp is the preference in cases that both are legal.
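// Illustrative sketch (hypothetical IR, assuming contract flags on both
// instructions): a reciprocal square root such as
//   %s = call contract float @llvm.sqrt.f32(float %x)
//   %r = fdiv contract float 1.0, %s
// can be handled as a single @llvm.amdgcn.rsq.f32(%x) when the accuracy and
// denormal rules checked below allow it.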
1074bool AMDGPUCodeGenPrepareImpl::visitFDiv(BinaryOperator &FDiv) {
1075 if (DisableFDivExpand)
1076 return false;
1077
1078 Type *Ty = FDiv.getType()->getScalarType();
1079 if (!Ty->isFloatTy())
1080 return false;
1081
1082 // The f64 rcp/rsq approximations are pretty inaccurate. We can do an
1083 // expansion around them in codegen. f16 is good enough to always use.
1084
1085 const FPMathOperator *FPOp = cast<const FPMathOperator>(&FDiv);
1086 const FastMathFlags DivFMF = FPOp->getFastMathFlags();
1087 const float ReqdAccuracy = FPOp->getFPAccuracy();
1088
1089 FastMathFlags SqrtFMF;
1090
1091 Value *Num = FDiv.getOperand(0);
1092 Value *Den = FDiv.getOperand(1);
1093
1094 Value *RsqOp = nullptr;
1095 auto *DenII = dyn_cast<IntrinsicInst>(Den);
1096 if (DenII && DenII->getIntrinsicID() == Intrinsic::sqrt &&
1097 DenII->hasOneUse()) {
1098 const auto *SqrtOp = cast<FPMathOperator>(DenII);
1099 SqrtFMF = SqrtOp->getFastMathFlags();
1100 if (canOptimizeWithRsq(SqrtOp, DivFMF, SqrtFMF))
1101 RsqOp = SqrtOp->getOperand(0);
1102 }
1103
1104 // Inaccurate rcp is allowed with unsafe-fp-math or afn.
1105 //
1106 // Defer to codegen to handle this.
1107 //
1108 // TODO: Decide on an interpretation for interactions between afn + arcp +
1109 // !fpmath, and make it consistent between here and codegen. For now, defer
1110 // expansion of afn to codegen. The current interpretation is so aggressive we
1111 // don't need any pre-consideration here when we have better information. A
1112 // more conservative interpretation could use handling here.
1113 const bool AllowInaccurateRcp = HasUnsafeFPMath || DivFMF.approxFunc();
1114 if (!RsqOp && AllowInaccurateRcp)
1115 return false;
1116
1117 // Defer the correct implementations to codegen.
1118 if (ReqdAccuracy < 1.0f)
1119 return false;
1120
1121 IRBuilder<> Builder(FDiv.getParent(), std::next(FDiv.getIterator()));
1122 Builder.setFastMathFlags(DivFMF);
1123 Builder.SetCurrentDebugLocation(FDiv.getDebugLoc());
1124
1125 SmallVector<Value *, 4> NumVals;
1126 SmallVector<Value *, 4> DenVals;
1127 SmallVector<Value *, 4> RsqDenVals;
1128 extractValues(Builder, NumVals, Num);
1129 extractValues(Builder, DenVals, Den);
1130
1131 if (RsqOp)
1132 extractValues(Builder, RsqDenVals, RsqOp);
1133
1134 SmallVector<Value *, 4> ResultVals(NumVals.size());
1135 for (int I = 0, E = NumVals.size(); I != E; ++I) {
1136 Value *NumElt = NumVals[I];
1137 Value *DenElt = DenVals[I];
1138 Value *RsqDenElt = RsqOp ? RsqDenVals[I] : nullptr;
1139
1140 Value *NewElt =
1141 visitFDivElement(Builder, NumElt, DenElt, DivFMF, SqrtFMF, RsqDenElt,
1142 cast<Instruction>(FPOp), ReqdAccuracy);
1143 if (!NewElt) {
1144 // Keep the original, but scalarized.
1145
1146 // This has the unfortunate side effect of sometimes scalarizing when
1147 // we're not going to do anything.
1148 NewElt = Builder.CreateFDiv(NumElt, DenElt);
1149 if (auto *NewEltInst = dyn_cast<Instruction>(NewElt))
1150 NewEltInst->copyMetadata(FDiv);
1151 }
1152
1153 ResultVals[I] = NewElt;
1154 }
1155
1156 Value *NewVal = insertValues(Builder, FDiv.getType(), ResultVals);
1157
1158 if (NewVal) {
1159 FDiv.replaceAllUsesWith(NewVal);
1160 NewVal->takeName(&FDiv);
1161 RecursivelyDeleteTriviallyDeadInstructions(&FDiv, TLInfo);
1162 }
1163
1164 return true;
1165}
1166
1167static bool hasUnsafeFPMath(const Function &F) {
1168 Attribute Attr = F.getFnAttribute("unsafe-fp-math");
1169 return Attr.getValueAsBool();
1170}
1171
1172static std::pair<Value*, Value*> getMul64(IRBuilder<> &Builder,
1173 Value *LHS, Value *RHS) {
1174 Type *I32Ty = Builder.getInt32Ty();
1175 Type *I64Ty = Builder.getInt64Ty();
1176
1177 Value *LHS_EXT64 = Builder.CreateZExt(LHS, I64Ty);
1178 Value *RHS_EXT64 = Builder.CreateZExt(RHS, I64Ty);
1179 Value *MUL64 = Builder.CreateMul(LHS_EXT64, RHS_EXT64);
1180 Value *Lo = Builder.CreateTrunc(MUL64, I32Ty);
1181 Value *Hi = Builder.CreateLShr(MUL64, Builder.getInt64(32));
1182 Hi = Builder.CreateTrunc(Hi, I32Ty);
1183 return std::pair(Lo, Hi);
1184}
1185
1186static Value* getMulHu(IRBuilder<> &Builder, Value *LHS, Value *RHS) {
1187 return getMul64(Builder, LHS, RHS).second;
1188}
1189
1190/// Figure out how many bits are really needed for this division. \p AtLeast is
1191 /// an optimization hint to bypass the second ComputeNumSignBits call if the
1192 /// first one is insufficient. Returns -1 on failure.
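// Illustrative worked example (assumed operand widths): for a 64-bit signed
// division whose operands both have at least 41 known sign bits,
// SignBits = 41 and DivBits = 64 - 41 + 1 = 24, so the 24-bit path applies.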
1193int AMDGPUCodeGenPrepareImpl::getDivNumBits(BinaryOperator &I, Value *Num,
1194 Value *Den, unsigned AtLeast,
1195 bool IsSigned) const {
1196 const DataLayout &DL = Mod->getDataLayout();
1197 unsigned LHSSignBits = ComputeNumSignBits(Num, DL, 0, AC, &I);
1198 if (LHSSignBits < AtLeast)
1199 return -1;
1200
1201 unsigned RHSSignBits = ComputeNumSignBits(Den, DL, 0, AC, &I);
1202 if (RHSSignBits < AtLeast)
1203 return -1;
1204
1205 unsigned SignBits = std::min(LHSSignBits, RHSSignBits);
1206 unsigned DivBits = Num->getType()->getScalarSizeInBits() - SignBits;
1207 if (IsSigned)
1208 ++DivBits;
1209 return DivBits;
1210}
1211
1212// The fractional part of a float is enough to accurately represent up to
1213// a 24-bit signed integer.
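// Illustrative note: an f32 significand holds 24 bits (1 implicit + 23
// stored), so every integer of magnitude less than 2^24 = 16777216 is exactly
// representable, which is what makes the float-based expansion below exact.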
1214Value *AMDGPUCodeGenPrepareImpl::expandDivRem24(IRBuilder<> &Builder,
1215 BinaryOperator &I, Value *Num,
1216 Value *Den, bool IsDiv,
1217 bool IsSigned) const {
1218 unsigned SSBits = Num->getType()->getScalarSizeInBits();
1219 // If Num bits <= 24, assume 0 signbits.
1220 unsigned AtLeast = (SSBits <= 24) ? 0 : (SSBits - 24 + IsSigned);
1221 int DivBits = getDivNumBits(I, Num, Den, AtLeast, IsSigned);
1222 if (DivBits == -1)
1223 return nullptr;
1224 return expandDivRem24Impl(Builder, I, Num, Den, DivBits, IsDiv, IsSigned);
1225}
1226
1227Value *AMDGPUCodeGenPrepareImpl::expandDivRem24Impl(
1228 IRBuilder<> &Builder, BinaryOperator &I, Value *Num, Value *Den,
1229 unsigned DivBits, bool IsDiv, bool IsSigned) const {
1230 Type *I32Ty = Builder.getInt32Ty();
1231 Num = Builder.CreateTrunc(Num, I32Ty);
1232 Den = Builder.CreateTrunc(Den, I32Ty);
1233
1234 Type *F32Ty = Builder.getFloatTy();
1235 ConstantInt *One = Builder.getInt32(1);
1236 Value *JQ = One;
1237
1238 if (IsSigned) {
1239 // char|short jq = ia ^ ib;
1240 JQ = Builder.CreateXor(Num, Den);
1241
1242 // jq = jq >> (bitsize - 2)
1243 JQ = Builder.CreateAShr(JQ, Builder.getInt32(30));
1244
1245 // jq = jq | 0x1
1246 JQ = Builder.CreateOr(JQ, One);
1247 }
1248
1249 // int ia = (int)LHS;
1250 Value *IA = Num;
1251
1252 // int ib = (int)RHS;
1253 Value *IB = Den;
1254
1255 // float fa = (float)ia;
1256 Value *FA = IsSigned ? Builder.CreateSIToFP(IA, F32Ty)
1257 : Builder.CreateUIToFP(IA, F32Ty);
1258
1259 // float fb = (float)ib;
1260 Value *FB = IsSigned ? Builder.CreateSIToFP(IB,F32Ty)
1261 : Builder.CreateUIToFP(IB,F32Ty);
1262
1263 Function *RcpDecl = Intrinsic::getDeclaration(Mod, Intrinsic::amdgcn_rcp,
1264 Builder.getFloatTy());
1265 Value *RCP = Builder.CreateCall(RcpDecl, { FB });
1266 Value *FQM = Builder.CreateFMul(FA, RCP);
1267
1268 // fq = trunc(fqm);
1269 CallInst *FQ = Builder.CreateUnaryIntrinsic(Intrinsic::trunc, FQM);
1270 FQ->copyFastMathFlags(Builder.getFastMathFlags());
1271
1272 // float fqneg = -fq;
1273 Value *FQNeg = Builder.CreateFNeg(FQ);
1274
1275 // float fr = mad(fqneg, fb, fa);
1276 auto FMAD = !ST->hasMadMacF32Insts()
1277 ? Intrinsic::fma
1278 : (Intrinsic::ID)Intrinsic::amdgcn_fmad_ftz;
1279 Value *FR = Builder.CreateIntrinsic(FMAD,
1280 {FQNeg->getType()}, {FQNeg, FB, FA}, FQ);
1281
1282 // int iq = (int)fq;
1283 Value *IQ = IsSigned ? Builder.CreateFPToSI(FQ, I32Ty)
1284 : Builder.CreateFPToUI(FQ, I32Ty);
1285
1286 // fr = fabs(fr);
1287 FR = Builder.CreateUnaryIntrinsic(Intrinsic::fabs, FR, FQ);
1288
1289 // fb = fabs(fb);
1290 FB = Builder.CreateUnaryIntrinsic(Intrinsic::fabs, FB, FQ);
1291
1292 // int cv = fr >= fb;
1293 Value *CV = Builder.CreateFCmpOGE(FR, FB);
1294
1295 // jq = (cv ? jq : 0);
1296 JQ = Builder.CreateSelect(CV, JQ, Builder.getInt32(0));
1297
1298 // dst = iq + jq;
1299 Value *Div = Builder.CreateAdd(IQ, JQ);
1300
1301 Value *Res = Div;
1302 if (!IsDiv) {
1303 // Rem needs compensation, it's easier to recompute it
1304 Value *Rem = Builder.CreateMul(Div, Den);
1305 Res = Builder.CreateSub(Num, Rem);
1306 }
1307
1308 if (DivBits != 0 && DivBits < 32) {
1309 // Extend in register from the number of bits this divide really is.
1310 if (IsSigned) {
1311 int InRegBits = 32 - DivBits;
1312
1313 Res = Builder.CreateShl(Res, InRegBits);
1314 Res = Builder.CreateAShr(Res, InRegBits);
1315 } else {
1316 ConstantInt *TruncMask
1317 = Builder.getInt32((UINT64_C(1) << DivBits) - 1);
1318 Res = Builder.CreateAnd(Res, TruncMask);
1319 }
1320 }
1321
1322 return Res;
1323}
1324
1325 // Try to recognize special cases where the DAG will emit special, better expansions
1326// than the general expansion we do here.
1327
1328// TODO: It would be better to just directly handle those optimizations here.
1329bool AMDGPUCodeGenPrepareImpl::divHasSpecialOptimization(BinaryOperator &I,
1330 Value *Num,
1331 Value *Den) const {
1332 if (Constant *C = dyn_cast<Constant>(Den)) {
1333 // Arbitrary constants get a better expansion as long as a wider mulhi is
1334 // legal.
1335 if (C->getType()->getScalarSizeInBits() <= 32)
1336 return true;
1337
1338 // TODO: Sdiv check for not exact for some reason.
1339
1340 // If there's no wider mulhi, there's only a better expansion for powers of
1341 // two.
1342 // TODO: Should really know for each vector element.
1343 if (isKnownToBeAPowerOfTwo(C, *DL, true, 0, AC, &I, DT))
1344 return true;
1345
1346 return false;
1347 }
1348
1349 if (BinaryOperator *BinOpDen = dyn_cast<BinaryOperator>(Den)) {
1350 // fold (udiv x, (shl c, y)) -> x >>u (log2(c)+y) iff c is power of 2
1351 if (BinOpDen->getOpcode() == Instruction::Shl &&
1352 isa<Constant>(BinOpDen->getOperand(0)) &&
1353 isKnownToBeAPowerOfTwo(BinOpDen->getOperand(0), *DL, true,
1354 0, AC, &I, DT)) {
1355 return true;
1356 }
1357 }
1358
1359 return false;
1360}
1361
1362static Value *getSign32(Value *V, IRBuilder<> &Builder, const DataLayout *DL) {
1363 // Check whether the sign can be determined statically.
1364 KnownBits Known = computeKnownBits(V, *DL);
1365 if (Known.isNegative())
1366 return Constant::getAllOnesValue(V->getType());
1367 if (Known.isNonNegative())
1368 return Constant::getNullValue(V->getType());
1369 return Builder.CreateAShr(V, Builder.getInt32(31));
1370}
1371
1372Value *AMDGPUCodeGenPrepareImpl::expandDivRem32(IRBuilder<> &Builder,
1373 BinaryOperator &I, Value *X,
1374 Value *Y) const {
1375 Instruction::BinaryOps Opc = I.getOpcode();
1376 assert(Opc == Instruction::URem || Opc == Instruction::UDiv ||
1377 Opc == Instruction::SRem || Opc == Instruction::SDiv);
1378
1379 FastMathFlags FMF;
1380 FMF.setFast();
1381 Builder.setFastMathFlags(FMF);
1382
1383 if (divHasSpecialOptimization(I, X, Y))
1384 return nullptr; // Keep it for later optimization.
1385
1386 bool IsDiv = Opc == Instruction::UDiv || Opc == Instruction::SDiv;
1387 bool IsSigned = Opc == Instruction::SRem || Opc == Instruction::SDiv;
1388
1389 Type *Ty = X->getType();
1390 Type *I32Ty = Builder.getInt32Ty();
1391 Type *F32Ty = Builder.getFloatTy();
1392
1393 if (Ty->getScalarSizeInBits() != 32) {
1394 if (IsSigned) {
1395 X = Builder.CreateSExtOrTrunc(X, I32Ty);
1396 Y = Builder.CreateSExtOrTrunc(Y, I32Ty);
1397 } else {
1398 X = Builder.CreateZExtOrTrunc(X, I32Ty);
1399 Y = Builder.CreateZExtOrTrunc(Y, I32Ty);
1400 }
1401 }
1402
1403 if (Value *Res = expandDivRem24(Builder, I, X, Y, IsDiv, IsSigned)) {
1404 return IsSigned ? Builder.CreateSExtOrTrunc(Res, Ty) :
1405 Builder.CreateZExtOrTrunc(Res, Ty);
1406 }
1407
1408 ConstantInt *Zero = Builder.getInt32(0);
1409 ConstantInt *One = Builder.getInt32(1);
1410
1411 Value *Sign = nullptr;
1412 if (IsSigned) {
1413 Value *SignX = getSign32(X, Builder, DL);
1414 Value *SignY = getSign32(Y, Builder, DL);
1415 // Remainder sign is the same as LHS
1416 Sign = IsDiv ? Builder.CreateXor(SignX, SignY) : SignX;
1417
1418 X = Builder.CreateAdd(X, SignX);
1419 Y = Builder.CreateAdd(Y, SignY);
1420
1421 X = Builder.CreateXor(X, SignX);
1422 Y = Builder.CreateXor(Y, SignY);
1423 }
1424
1425 // The algorithm here is based on ideas from "Software Integer Division", Tom
1426 // Rodeheffer, August 2008.
1427 //
1428 // unsigned udiv(unsigned x, unsigned y) {
1429 // // Initial estimate of inv(y). The constant is less than 2^32 to ensure
1430 // // that this is a lower bound on inv(y), even if some of the calculations
1431 // // round up.
1432 // unsigned z = (unsigned)((4294967296.0 - 512.0) * v_rcp_f32((float)y));
1433 //
1434 // // One round of UNR (Unsigned integer Newton-Raphson) to improve z.
1435 // // Empirically this is guaranteed to give a "two-y" lower bound on
1436 // // inv(y).
1437 // z += umulh(z, -y * z);
1438 //
1439 // // Quotient/remainder estimate.
1440 // unsigned q = umulh(x, z);
1441 // unsigned r = x - q * y;
1442 //
1443 // // Two rounds of quotient/remainder refinement.
1444 // if (r >= y) {
1445 // ++q;
1446 // r -= y;
1447 // }
1448 // if (r >= y) {
1449 // ++q;
1450 // r -= y;
1451 // }
1452 //
1453 // return q;
1454 // }
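 // Illustrative worked example (assumed small values): x = 10, y = 3. If the
 // initial estimate gives q = 2, then r = 10 - 2*3 = 4 >= 3, so the first
 // refinement yields q = 3, r = 1; the second refinement sees r = 1 < 3 and
 // keeps the exact result q = 3 (remainder 1).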
1455
1456 // Initial estimate of inv(y).
1457 Value *FloatY = Builder.CreateUIToFP(Y, F32Ty);
1458 Function *Rcp = Intrinsic::getDeclaration(Mod, Intrinsic::amdgcn_rcp, F32Ty);
1459 Value *RcpY = Builder.CreateCall(Rcp, {FloatY});
1460 Constant *Scale = ConstantFP::get(F32Ty, llvm::bit_cast<float>(0x4F7FFFFE));
1461 Value *ScaledY = Builder.CreateFMul(RcpY, Scale);
1462 Value *Z = Builder.CreateFPToUI(ScaledY, I32Ty);
1463
1464 // One round of UNR.
1465 Value *NegY = Builder.CreateSub(Zero, Y);
1466 Value *NegYZ = Builder.CreateMul(NegY, Z);
1467 Z = Builder.CreateAdd(Z, getMulHu(Builder, Z, NegYZ));
1468
1469 // Quotient/remainder estimate.
1470 Value *Q = getMulHu(Builder, X, Z);
1471 Value *R = Builder.CreateSub(X, Builder.CreateMul(Q, Y));
1472
1473 // First quotient/remainder refinement.
1474 Value *Cond = Builder.CreateICmpUGE(R, Y);
1475 if (IsDiv)
1476 Q = Builder.CreateSelect(Cond, Builder.CreateAdd(Q, One), Q);
1477 R = Builder.CreateSelect(Cond, Builder.CreateSub(R, Y), R);
1478
1479 // Second quotient/remainder refinement.
1480 Cond = Builder.CreateICmpUGE(R, Y);
1481 Value *Res;
1482 if (IsDiv)
1483 Res = Builder.CreateSelect(Cond, Builder.CreateAdd(Q, One), Q);
1484 else
1485 Res = Builder.CreateSelect(Cond, Builder.CreateSub(R, Y), R);
1486
1487 if (IsSigned) {
1488 Res = Builder.CreateXor(Res, Sign);
1489 Res = Builder.CreateSub(Res, Sign);
1490 Res = Builder.CreateSExtOrTrunc(Res, Ty);
1491 } else {
1492 Res = Builder.CreateZExtOrTrunc(Res, Ty);
1493 }
1494 return Res;
1495}
1496
1497Value *AMDGPUCodeGenPrepareImpl::shrinkDivRem64(IRBuilder<> &Builder,
1498 BinaryOperator &I, Value *Num,
1499 Value *Den) const {
1500 if (!ExpandDiv64InIR && divHasSpecialOptimization(I, Num, Den))
1501 return nullptr; // Keep it for later optimization.
1502
1503 Instruction::BinaryOps Opc = I.getOpcode();
1504
1505 bool IsDiv = Opc == Instruction::SDiv || Opc == Instruction::UDiv;
1506 bool IsSigned = Opc == Instruction::SDiv || Opc == Instruction::SRem;
1507
1508 int NumDivBits = getDivNumBits(I, Num, Den, 32, IsSigned);
1509 if (NumDivBits == -1)
1510 return nullptr;
1511
1512 Value *Narrowed = nullptr;
1513 if (NumDivBits <= 24) {
1514 Narrowed = expandDivRem24Impl(Builder, I, Num, Den, NumDivBits,
1515 IsDiv, IsSigned);
1516 } else if (NumDivBits <= 32) {
1517 Narrowed = expandDivRem32(Builder, I, Num, Den);
1518 }
1519
1520 if (Narrowed) {
1521 return IsSigned ? Builder.CreateSExt(Narrowed, Num->getType()) :
1522 Builder.CreateZExt(Narrowed, Num->getType());
1523 }
1524
1525 return nullptr;
1526}
1527
1528void AMDGPUCodeGenPrepareImpl::expandDivRem64(BinaryOperator &I) const {
1529 Instruction::BinaryOps Opc = I.getOpcode();
1530 // Do the general expansion.
1531 if (Opc == Instruction::UDiv || Opc == Instruction::SDiv) {
1532 expandDivisionUpTo64Bits(&I);
1533 return;
1534 }
1535
1536 if (Opc == Instruction::URem || Opc == Instruction::SRem) {
1537 expandRemainderUpTo64Bits(&I);
1538 return;
1539 }
1540
1541 llvm_unreachable("not a division");
1542}
1543
1544bool AMDGPUCodeGenPrepareImpl::visitBinaryOperator(BinaryOperator &I) {
1545 if (foldBinOpIntoSelect(I))
1546 return true;
1547
1548 if (ST->has16BitInsts() && needsPromotionToI32(I.getType()) &&
1549 UA->isUniform(&I) && promoteUniformOpToI32(I))
1550 return true;
1551
1552 if (UseMul24Intrin && replaceMulWithMul24(I))
1553 return true;
1554
1555 bool Changed = false;
1556 Instruction::BinaryOps Opc = I.getOpcode();
1557 Type *Ty = I.getType();
1558 Value *NewDiv = nullptr;
1559 unsigned ScalarSize = Ty->getScalarSizeInBits();
1560
1561 SmallVector<BinaryOperator *, 8> Div64ToExpand;
1562
1563 if ((Opc == Instruction::URem || Opc == Instruction::UDiv ||
1564 Opc == Instruction::SRem || Opc == Instruction::SDiv) &&
1565 ScalarSize <= 64 &&
1566 !DisableIDivExpand) {
1567 Value *Num = I.getOperand(0);
1568 Value *Den = I.getOperand(1);
1569 IRBuilder<> Builder(&I);
1570 Builder.SetCurrentDebugLocation(I.getDebugLoc());
1571
1572 if (auto *VT = dyn_cast<FixedVectorType>(Ty)) {
1573 NewDiv = PoisonValue::get(VT);
1574
1575 for (unsigned N = 0, E = VT->getNumElements(); N != E; ++N) {
1576 Value *NumEltN = Builder.CreateExtractElement(Num, N);
1577 Value *DenEltN = Builder.CreateExtractElement(Den, N);
1578
1579 Value *NewElt;
1580 if (ScalarSize <= 32) {
1581 NewElt = expandDivRem32(Builder, I, NumEltN, DenEltN);
1582 if (!NewElt)
1583 NewElt = Builder.CreateBinOp(Opc, NumEltN, DenEltN);
1584 } else {
1585 // See if this 64-bit division can be shrunk to 32/24-bits before
1586 // producing the general expansion.
1587 NewElt = shrinkDivRem64(Builder, I, NumEltN, DenEltN);
1588 if (!NewElt) {
1589 // The general 64-bit expansion introduces control flow and doesn't
1590 // return the new value. Just insert a scalar copy and defer
1591 // expanding it.
1592 NewElt = Builder.CreateBinOp(Opc, NumEltN, DenEltN);
1593 Div64ToExpand.push_back(cast<BinaryOperator>(NewElt));
1594 }
1595 }
1596
1597 if (auto *NewEltI = dyn_cast<Instruction>(NewElt))
1598 NewEltI->copyIRFlags(&I);
1599
1600 NewDiv = Builder.CreateInsertElement(NewDiv, NewElt, N);
1601 }
1602 } else {
1603 if (ScalarSize <= 32)
1604 NewDiv = expandDivRem32(Builder, I, Num, Den);
1605 else {
1606 NewDiv = shrinkDivRem64(Builder, I, Num, Den);
1607 if (!NewDiv)
1608 Div64ToExpand.push_back(&I);
1609 }
1610 }
1611
1612 if (NewDiv) {
1613 I.replaceAllUsesWith(NewDiv);
1614 I.eraseFromParent();
1615 Changed = true;
1616 }
1617 }
1618
1619 if (ExpandDiv64InIR) {
1620 // TODO: We get much worse code in specially handled constant cases.
1621 for (BinaryOperator *Div : Div64ToExpand) {
1622 expandDivRem64(*Div);
1623 FlowChanged = true;
1624 Changed = true;
1625 }
1626 }
1627
1628 return Changed;
1629}
1630
1631bool AMDGPUCodeGenPrepareImpl::visitLoadInst(LoadInst &I) {
1632 if (!WidenLoads)
1633 return false;
1634
1635 if ((I.getPointerAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS ||
1636 I.getPointerAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT) &&
1637 canWidenScalarExtLoad(I)) {
1638 IRBuilder<> Builder(&I);
1639 Builder.SetCurrentDebugLocation(I.getDebugLoc());
1640
1641 Type *I32Ty = Builder.getInt32Ty();
1642 LoadInst *WidenLoad = Builder.CreateLoad(I32Ty, I.getPointerOperand());
1643 WidenLoad->copyMetadata(I);
1644
1645 // If we have range metadata, we need to convert the type, and not make
1646 // assumptions about the high bits.
1647 if (auto *Range = WidenLoad->getMetadata(LLVMContext::MD_range)) {
1648 ConstantInt *Lower =
1649 mdconst::extract<ConstantInt>(Range->getOperand(0));
1650
1651 if (Lower->isNullValue()) {
1652 WidenLoad->setMetadata(LLVMContext::MD_range, nullptr);
1653 } else {
1654 Metadata *LowAndHigh[] = {
1655 ConstantAsMetadata::get(ConstantInt::get(I32Ty, Lower->getValue().zext(32))),
1656 // Don't make assumptions about the high bits.
1657 ConstantAsMetadata::get(ConstantInt::get(I32Ty, 0))
1658 };
1659
1660 WidenLoad->setMetadata(LLVMContext::MD_range,
1661 MDNode::get(Mod->getContext(), LowAndHigh));
1662 }
1663 }
1664
1665 int TySize = Mod->getDataLayout().getTypeSizeInBits(I.getType());
1666 Type *IntNTy = Builder.getIntNTy(TySize);
1667 Value *ValTrunc = Builder.CreateTrunc(WidenLoad, IntNTy);
1668 Value *ValOrig = Builder.CreateBitCast(ValTrunc, I.getType());
1669 I.replaceAllUsesWith(ValOrig);
1670 I.eraseFromParent();
1671 return true;
1672 }
1673
1674 return false;
1675}
1676
1677bool AMDGPUCodeGenPrepareImpl::visitICmpInst(ICmpInst &I) {
1678 bool Changed = false;
1679
1680 if (ST->has16BitInsts() && needsPromotionToI32(I.getOperand(0)->getType()) &&
1681 UA->isUniform(&I))
1682 Changed |= promoteUniformOpToI32(I);
1683
1684 return Changed;
1685}
1686
1687bool AMDGPUCodeGenPrepareImpl::visitSelectInst(SelectInst &I) {
1688 Value *Cond = I.getCondition();
1689 Value *TrueVal = I.getTrueValue();
1690 Value *FalseVal = I.getFalseValue();
1691 Value *CmpVal;
1692 FCmpInst::Predicate Pred;
1693
1694 if (ST->has16BitInsts() && needsPromotionToI32(I.getType())) {
1695 if (UA->isUniform(&I))
1696 return promoteUniformOpToI32(I);
1697 return false;
1698 }
1699
1700 // Match fract pattern with nan check.
1701 if (!match(Cond, m_FCmp(Pred, m_Value(CmpVal), m_NonNaN())))
1702 return false;
1703
1704 FPMathOperator *FPOp = dyn_cast<FPMathOperator>(&I);
1705 if (!FPOp)
1706 return false;
1707
1708 IRBuilder<> Builder(&I);
1709 Builder.setFastMathFlags(FPOp->getFastMathFlags());
1710
1711 auto *IITrue = dyn_cast<IntrinsicInst>(TrueVal);
1712 auto *IIFalse = dyn_cast<IntrinsicInst>(FalseVal);
1713
1714 Value *Fract = nullptr;
1715 if (Pred == FCmpInst::FCMP_UNO && TrueVal == CmpVal && IIFalse &&
1716 CmpVal == matchFractPat(*IIFalse)) {
1717 // isnan(x) ? x : fract(x)
1718 Fract = applyFractPat(Builder, CmpVal);
1719 } else if (Pred == FCmpInst::FCMP_ORD && FalseVal == CmpVal && IITrue &&
1720 CmpVal == matchFractPat(*IITrue)) {
1721 // !isnan(x) ? fract(x) : x
1722 Fract = applyFractPat(Builder, CmpVal);
1723 } else
1724 return false;
1725
1726 Fract->takeName(&I);
1727 I.replaceAllUsesWith(Fract);
1728 RecursivelyDeleteTriviallyDeadInstructions(&I, TLInfo);
1729 return true;
1730}
1731
1732static bool areInSameBB(const Value *A, const Value *B) {
1733 const auto *IA = dyn_cast<Instruction>(A);
1734 const auto *IB = dyn_cast<Instruction>(B);
1735 return IA && IB && IA->getParent() == IB->getParent();
1736}
1737
1738// Helper for breaking large PHIs that returns true when an extractelement on V
1739// is likely to be folded away by the DAG combiner.
1740static bool isInterestingPHIIncomingValue(const Value *V) {
1741 const auto *FVT = dyn_cast<FixedVectorType>(V->getType());
1742 if (!FVT)
1743 return false;
1744
1745 const Value *CurVal = V;
1746
1747 // Check for insertelements, keeping track of the elements covered.
1748 BitVector EltsCovered(FVT->getNumElements());
1749 while (const auto *IE = dyn_cast<InsertElementInst>(CurVal)) {
1750 const auto *Idx = dyn_cast<ConstantInt>(IE->getOperand(2));
1751
1752 // Non-constant or out-of-bounds index -> folding is unlikely.
1753 // The latter is more of a sanity check because canonical IR should just
1754 // have replaced those with poison.
1755 if (!Idx || Idx->getZExtValue() >= FVT->getNumElements())
1756 return false;
1757
1758 const auto *VecSrc = IE->getOperand(0);
1759
1760 // If the vector source is another instruction, it must be in the same basic
1761 // block. Otherwise, the DAGCombiner won't see the whole thing and is
1762 // unlikely to be able to do anything interesting here.
1763 if (isa<Instruction>(VecSrc) && !areInSameBB(VecSrc, IE))
1764 return false;
1765
1766 CurVal = VecSrc;
1767 EltsCovered.set(Idx->getZExtValue());
1768
1769 // All elements covered.
1770 if (EltsCovered.all())
1771 return true;
1772 }
1773
1774 // We either didn't find a single insertelement, or the insertelement chain
1775 // ended before all elements were covered. Check for other interesting values.
1776
1777 // Constants are always interesting because we can just constant fold the
1778 // extractelements.
1779 if (isa<Constant>(CurVal))
1780 return true;
1781
1782 // shufflevector is likely to be profitable if either operand is a constant,
1783 // or if either source is in the same block.
1784 // This is because shufflevector is most often lowered as a series of
1785 // insert/extract elements anyway.
1786 if (const auto *SV = dyn_cast<ShuffleVectorInst>(CurVal)) {
1787 return isa<Constant>(SV->getOperand(1)) ||
1788 areInSameBB(SV, SV->getOperand(0)) ||
1789 areInSameBB(SV, SV->getOperand(1));
1790 }
1791
1792 return false;
1793}
1794
1795static void collectPHINodes(const PHINode &I,
1796 SmallPtrSet<const PHINode *, 8> &SeenPHIs) {
1797 const auto [It, Inserted] = SeenPHIs.insert(&I);
1798 if (!Inserted)
1799 return;
1800
1801 for (const Value *Inc : I.incoming_values()) {
1802 if (const auto *PhiInc = dyn_cast<PHINode>(Inc))
1803 collectPHINodes(*PhiInc, SeenPHIs);
1804 }
1805
1806 for (const User *U : I.users()) {
1807 if (const auto *PhiU = dyn_cast<PHINode>(U))
1808 collectPHINodes(*PhiU, SeenPHIs);
1809 }
1810}
1811
1812bool AMDGPUCodeGenPrepareImpl::canBreakPHINode(const PHINode &I) {
1813 // Check in the cache first.
1814 if (const auto It = BreakPhiNodesCache.find(&I);
1815 It != BreakPhiNodesCache.end())
1816 return It->second;
1817
1818 // We consider PHI nodes as part of "chains", so given a PHI node I, we
1819 // recursively consider all its users and incoming values that are also PHI
1820 // nodes. We then make a decision about all of those PHIs at once. Either they
1821 // all get broken up, or none of them do. That way, we avoid cases where a
1822 // single PHI is/is not broken and we end up reforming/exploding a vector
1823 // multiple times, or even worse, doing it in a loop.
1824 SmallPtrSet<const PHINode *, 8> WorkList;
1825 collectPHINodes(I, WorkList);
1826
1827#ifndef NDEBUG
1828 // Check that none of the PHI nodes in the worklist are in the map. If some of
1829 // them are, it means we're not good enough at collecting related PHIs.
1830 for (const PHINode *WLP : WorkList) {
1831 assert(BreakPhiNodesCache.count(WLP) == 0);
1832 }
1833#endif
1834
1835 // To consider a PHI profitable to break, we need to see some interesting
1836 // incoming values. At least 2/3rd (rounded up) of all PHIs in the worklist
1837 // must have one to consider all PHIs breakable.
1838 //
1839 // This threshold has been determined through performance testing.
1840 //
1841 // Note that the computation below is equivalent to
1842 //
1843 // (unsigned)ceil((K / 3.0) * 2)
1844 //
1845 // It's simply written this way to avoid mixing integral/FP arithmetic.
1846 const auto Threshold = (alignTo(WorkList.size() * 2, 3) / 3);
1847 unsigned NumBreakablePHIs = 0;
1848 bool CanBreak = false;
1849 for (const PHINode *Cur : WorkList) {
1850 // Don't break PHIs that have no interesting incoming values. That is, where
1851 // there is no clear opportunity to fold the "extractelement" instructions
1852 // we would add.
1853 //
1854 // Note: InstCombine does not run after this pass, so we're only interested
1855 // in the foldings that the DAG combiner can do.
1856 if (any_of(Cur->incoming_values(), isInterestingPHIIncomingValue)) {
1857 if (++NumBreakablePHIs >= Threshold) {
1858 CanBreak = true;
1859 break;
1860 }
1861 }
1862 }
1863
1864 for (const PHINode *Cur : WorkList)
1865 BreakPhiNodesCache[Cur] = CanBreak;
1866
1867 return CanBreak;
1868}
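The threshold computation above, alignTo(WorkList.size() * 2, 3) / 3, is the integer form of ceil((K / 3.0) * 2). A minimal standalone sketch, assuming plain C++ with a local re-implementation of alignTo for illustration (not the llvm::alignTo helper itself), checking that the two forms agree:

  #include <cassert>
  #include <cmath>

  // Round Value up to the nearest multiple of Align (illustrative stand-in for
  // llvm::alignTo).
  static unsigned alignTo(unsigned Value, unsigned Align) {
    return (Value + Align - 1) / Align * Align;
  }

  int main() {
    for (unsigned K = 1; K <= 64; ++K) {
      unsigned Integral = alignTo(K * 2, 3) / 3;             // integral form used above
      unsigned Mixed = (unsigned)std::ceil((K / 3.0) * 2.0); // FP form from the comment
      assert(Integral == Mixed && "threshold forms should agree");
    }
    return 0;
  }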
1869
1870/// Helper class for "break large PHIs" (visitPHINode).
1871///
1872/// This represents a slice of a PHI's incoming value, which is made up of:
1873/// - The type of the slice (Ty)
1874/// - The index in the incoming value's vector where the slice starts (Idx)
1875/// - The number of elements in the slice (NumElts).
1876/// It also keeps track of the NewPHI node inserted for this particular slice.
1877///
1878/// Slice examples:
1879/// <4 x i64> -> Split into four i64 slices.
1880/// -> [i64, 0, 1], [i64, 1, 1], [i64, 2, 1], [i64, 3, 1]
1881/// <5 x i16> -> Split into 2 <2 x i16> slices + a i16 tail.
1882/// -> [<2 x i16>, 0, 2], [<2 x i16>, 2, 2], [i16, 4, 1]
1883class VectorSlice {
1884public:
1885 VectorSlice(Type *Ty, unsigned Idx, unsigned NumElts)
1886 : Ty(Ty), Idx(Idx), NumElts(NumElts) {}
1887
1888 Type *Ty = nullptr;
1889 unsigned Idx = 0;
1890 unsigned NumElts = 0;
1891 PHINode *NewPHI = nullptr;
1892
1893 /// Slice \p Inc according to the information contained within this slice.
1894 /// This is cached, so if called multiple times for the same \p BB & \p Inc
1895 /// pair, it returns the same Sliced value as well.
1896 ///
1897 /// Note this *intentionally* does not return the same value for, say,
1898 /// [%bb.0, %0] & [%bb.1, %0] as:
1899 /// - It could cause issues with dominance (e.g. if bb.1 is seen first, then
1900 /// the value in bb.1 may not be reachable from bb.0 if bb.0 is its
1901 /// predecessor.)
1902 /// - We also want to make our extract instructions as local as possible so
1903 /// the DAG has better chances of folding them out. Duplicating them like
1904 /// that is beneficial in that regard.
1905 ///
1906 /// This is both a minor optimization to avoid creating duplicate
1907 /// instructions and a requirement for correctness. It is not forbidden
1908 /// for a PHI node to have the same [BB, Val] pair multiple times. If we
1909 /// returned a new value each time, those previously identical pairs would all
1910 /// have different incoming values (from the same block) and it'd cause a "PHI
1911 /// node has multiple entries for the same basic block with different incoming
1912 /// values!" verifier error.
1913 Value *getSlicedVal(BasicBlock *BB, Value *Inc, StringRef NewValName) {
1914 Value *&Res = SlicedVals[{BB, Inc}];
1915 if (Res)
1916 return Res;
1917
1918 IRBuilder<> B(BB->getTerminator());
1919 if (Instruction *IncInst = dyn_cast<Instruction>(Inc))
1920 B.SetCurrentDebugLocation(IncInst->getDebugLoc());
1921
1922 if (NumElts > 1) {
1923 SmallVector<int, 4> Mask;
1924 for (unsigned K = Idx; K < (Idx + NumElts); ++K)
1925 Mask.push_back(K);
1926 Res = B.CreateShuffleVector(Inc, Mask, NewValName);
1927 } else
1928 Res = B.CreateExtractElement(Inc, Idx, NewValName);
1929
1930 return Res;
1931 }
1932
1933private:
1934 SmallDenseMap<std::pair<BasicBlock *, Value *>, Value *> SlicedVals;
1935};
1936
1937bool AMDGPUCodeGenPrepareImpl::visitPHINode(PHINode &I) {
1938 // Break up fixed-vector PHIs into smaller pieces.
1939 // Default threshold is 32, so it breaks up any vector that's >32 bits into
1940 // its elements, or into 32-bit pieces (for 8/16 bit elts).
1941 //
1942 // This is only helpful for DAGISel because it doesn't handle large PHIs as
1943 // well as GlobalISel. DAGISel lowers PHIs by using CopyToReg/CopyFromReg.
1944 // With large, odd-sized PHIs we may end up needing many `build_vector`
1945 // operations with most elements being "undef". This inhibits a lot of
1946 // optimization opportunities and can result in unreasonably high register
1947 // pressure and the inevitable stack spilling.
1948 if (!BreakLargePHIs || getCGPassBuilderOption().EnableGlobalISelOption)
1949 return false;
1950
1951 FixedVectorType *FVT = dyn_cast<FixedVectorType>(I.getType());
1952 if (!FVT || FVT->getNumElements() == 1 ||
1953 DL->getTypeSizeInBits(FVT) <= BreakLargePHIsThreshold)
1954 return false;
1955
1956 if (!ForceBreakLargePHIs && !canBreakPHINode(I))
1957 return false;
1958
1959 std::vector<VectorSlice> Slices;
1960
1961 Type *EltTy = FVT->getElementType();
1962 {
1963 unsigned Idx = 0;
1964 // For 8/16-bit element types, don't scalarize fully but break the vector
1965 // into as many 32-bit slices as we can, and scalarize the tail.
1966 const unsigned EltSize = DL->getTypeSizeInBits(EltTy);
1967 const unsigned NumElts = FVT->getNumElements();
1968 if (EltSize == 8 || EltSize == 16) {
1969 const unsigned SubVecSize = (32 / EltSize);
1970 Type *SubVecTy = FixedVectorType::get(EltTy, SubVecSize);
1971 for (unsigned End = alignDown(NumElts, SubVecSize); Idx < End;
1972 Idx += SubVecSize)
1973 Slices.emplace_back(SubVecTy, Idx, SubVecSize);
1974 }
1975
1976 // Scalarize all remaining elements.
1977 for (; Idx < NumElts; ++Idx)
1978 Slices.emplace_back(EltTy, Idx, 1);
1979 }
1980
1981 assert(Slices.size() > 1);
1982
1983 // Create one PHI per vector piece. The "VectorSlice" class takes care of
1984 // creating the necessary instruction to extract the relevant slices of each
1985 // incoming value.
1986 IRBuilder<> B(I.getParent());
1987 B.SetCurrentDebugLocation(I.getDebugLoc());
1988
1989 unsigned IncNameSuffix = 0;
1990 for (VectorSlice &S : Slices) {
1991 // We need to reset the builder's insert point on each iteration, because
1992 // getSlicedVal may have inserted something into I's BB.
1993 B.SetInsertPoint(I.getParent()->getFirstNonPHIIt());
1994 S.NewPHI = B.CreatePHI(S.Ty, I.getNumIncomingValues());
1995
1996 for (const auto &[Idx, BB] : enumerate(I.blocks())) {
1997 S.NewPHI->addIncoming(S.getSlicedVal(BB, I.getIncomingValue(Idx),
1998 "largephi.extractslice" +
1999 std::to_string(IncNameSuffix++)),
2000 BB);
2001 }
2002 }
2003
2004 // And replace this PHI with a vector of all the previous PHI values.
2005 Value *Vec = PoisonValue::get(FVT);
2006 unsigned NameSuffix = 0;
2007 for (VectorSlice &S : Slices) {
2008 const auto ValName = "largephi.insertslice" + std::to_string(NameSuffix++);
2009 if (S.NumElts > 1)
2010 Vec =
2011 B.CreateInsertVector(FVT, Vec, S.NewPHI, B.getInt64(S.Idx), ValName);
2012 else
2013 Vec = B.CreateInsertElement(Vec, S.NewPHI, S.Idx, ValName);
2014 }
2015
2016 I.replaceAllUsesWith(Vec);
2017 I.eraseFromParent();
2018 return true;
2019}
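As a rough, self-contained illustration of the slicing policy above (a sketch in plain C++ with no LLVM types; the Slice struct and sliceVector function are made up for this example), the following program reproduces how an element size and count are partitioned into 32-bit sub-vector slices plus a scalarized tail:

  #include <cstdio>
  #include <vector>

  struct Slice {
    unsigned Idx;     // starting element index
    unsigned NumElts; // number of elements in this slice
  };

  // Mirror of the policy in visitPHINode: 8/16-bit elements are grouped into
  // 32-bit sub-vectors, everything else (and the tail) is scalarized.
  static std::vector<Slice> sliceVector(unsigned EltSizeInBits, unsigned NumElts) {
    std::vector<Slice> Slices;
    unsigned Idx = 0;
    if (EltSizeInBits == 8 || EltSizeInBits == 16) {
      const unsigned SubVecSize = 32 / EltSizeInBits;
      const unsigned End = NumElts / SubVecSize * SubVecSize; // alignDown
      for (; Idx < End; Idx += SubVecSize)
        Slices.push_back({Idx, SubVecSize});
    }
    for (; Idx < NumElts; ++Idx)
      Slices.push_back({Idx, 1});
    return Slices;
  }

  int main() {
    // <5 x i16> -> [<2 x i16>, 0, 2], [<2 x i16>, 2, 2], [i16, 4, 1]
    for (const Slice &S : sliceVector(16, 5))
      std::printf("slice: idx=%u elts=%u\n", S.Idx, S.NumElts);
    return 0;
  }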
2020
2021/// \param V Value to check
2022/// \param DL DataLayout
2023/// \param TM TargetMachine (TODO: remove once DL contains nullptr values)
2024/// \param AS Target Address Space
2025/// \return true if \p V cannot be the null value of \p AS, false otherwise.
2026static bool isPtrKnownNeverNull(const Value *V, const DataLayout &DL,
2027 const AMDGPUTargetMachine &TM, unsigned AS) {
2028 // Pointer cannot be null if it's a block address, GV or alloca.
2029 // NOTE: We don't support extern_weak, but if we did, we'd need to check for
2030 // it as the symbol could be null in such cases.
2031 if (isa<BlockAddress>(V) || isa<GlobalValue>(V) || isa<AllocaInst>(V))
2032 return true;
2033
2034 // Check nonnull arguments.
2035 if (const auto *Arg = dyn_cast<Argument>(V); Arg && Arg->hasNonNullAttr())
2036 return true;
2037
2038 // getUnderlyingObject may have looked through another addrspacecast, although
2039 // the optimizable situations have most likely been folded out by now.
2040 if (AS != cast<PointerType>(V->getType())->getAddressSpace())
2041 return false;
2042
2043 // TODO: Calls that return nonnull?
2044
2045 // For all other things, use KnownBits.
2046 // We either use 0 or all bits set to indicate null, so check whether the
2047 // value can be zero or all ones.
2048 //
2049 // TODO: Use ValueTracking's isKnownNeverNull if it becomes aware that some
2050 // address spaces have non-zero null values.
2051 auto SrcPtrKB = computeKnownBits(V, DL);
2052 const auto NullVal = TM.getNullPointerValue(AS);
2053
2054 assert(SrcPtrKB.getBitWidth() == DL.getPointerSizeInBits(AS));
2055 assert((NullVal == 0 || NullVal == -1) &&
2056 "don't know how to check for this null value!");
2057 return NullVal ? !SrcPtrKB.getMaxValue().isAllOnes() : SrcPtrKB.isNonZero();
2058}
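The KnownBits test at the end reduces to a bit-mask question: can the value still be 0 (for address spaces whose null value is 0) or all-ones (for address spaces whose null value is -1, e.g. AMDGPU private/local). A minimal sketch in plain C++, using hypothetical KnownZero/KnownOne masks in place of llvm::KnownBits:

  #include <cassert>
  #include <cstdint>

  // Returns true if a 32-bit value with the given known-zero/known-one masks
  // can never equal NullVal, where NullVal is either 0 or all-ones.
  static bool knownNeverNull(uint32_t KnownZero, uint32_t KnownOne,
                             uint32_t NullVal) {
    if (NullVal == 0)
      return KnownOne != 0; // some bit is known set, so the value cannot be 0
    // NullVal is all-ones: the value cannot be all-ones if any bit is known clear.
    return KnownZero != 0;
  }

  int main() {
    // Null == 0: a pointer whose low bit is known set is never null.
    assert(knownNeverNull(/*KnownZero=*/0, /*KnownOne=*/1, /*NullVal=*/0));
    // Null == all-ones: a pointer with its top bit known clear is never null.
    assert(knownNeverNull(/*KnownZero=*/1u << 31, /*KnownOne=*/0, /*NullVal=*/~0u));
    return 0;
  }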
2059
2060bool AMDGPUCodeGenPrepareImpl::visitAddrSpaceCastInst(AddrSpaceCastInst &I) {
2061 // The intrinsic doesn't support vectors. It also seems difficult to prove
2062 // that a vector cannot contain any null pointers, so it's unclear whether
2063 // it's worth supporting.
2064 if (I.getType()->isVectorTy())
2065 return false;
2066
2067 // Check if this can be lowered to an amdgcn.addrspacecast.nonnull.
2068 // This is only worthwhile for casts between flat and private/local.
2069 const unsigned SrcAS = I.getSrcAddressSpace();
2070 const unsigned DstAS = I.getDestAddressSpace();
2071
2072 bool CanLower = false;
2073 if (SrcAS == AMDGPUAS::FLAT_ADDRESS)
2074 CanLower = (DstAS == AMDGPUAS::LOCAL_ADDRESS ||
2075 DstAS == AMDGPUAS::PRIVATE_ADDRESS);
2076 else if (DstAS == AMDGPUAS::FLAT_ADDRESS)
2077 CanLower = (SrcAS == AMDGPUAS::LOCAL_ADDRESS ||
2078 SrcAS == AMDGPUAS::PRIVATE_ADDRESS);
2079 if (!CanLower)
2080 return false;
2081
2083 getUnderlyingObjects(I.getOperand(0), WorkList);
2084 if (!all_of(WorkList, [&](const Value *V) {
2085 return isPtrKnownNeverNull(V, *DL, *TM, SrcAS);
2086 }))
2087 return false;
2088
2089 IRBuilder<> B(&I);
2090 auto *Intrin = B.CreateIntrinsic(
2091 I.getType(), Intrinsic::amdgcn_addrspacecast_nonnull, {I.getOperand(0)});
2092 I.replaceAllUsesWith(Intrin);
2093 I.eraseFromParent();
2094 return true;
2095}
2096
2097bool AMDGPUCodeGenPrepareImpl::visitIntrinsicInst(IntrinsicInst &I) {
2098 switch (I.getIntrinsicID()) {
2099 case Intrinsic::bitreverse:
2100 return visitBitreverseIntrinsicInst(I);
2101 case Intrinsic::minnum:
2102 return visitMinNum(I);
2103 case Intrinsic::sqrt:
2104 return visitSqrt(I);
2105 default:
2106 return false;
2107 }
2108}
2109
2110bool AMDGPUCodeGenPrepareImpl::visitBitreverseIntrinsicInst(IntrinsicInst &I) {
2111 bool Changed = false;
2112
2113 if (ST->has16BitInsts() && needsPromotionToI32(I.getType()) &&
2114 UA->isUniform(&I))
2115 Changed |= promoteUniformBitreverseToI32(I);
2116
2117 return Changed;
2118}
2119
2120/// Match non-nan fract pattern.
2121/// minnum(fsub(x, floor(x)), nextafter(1.0, -1.0))
2122///
2123/// Only applies if fract is a useful instruction for the subtarget. Does not
2124/// account for nan handling; the instruction has a nan check on the input value.
2125Value *AMDGPUCodeGenPrepareImpl::matchFractPat(IntrinsicInst &I) {
2126 if (ST->hasFractBug())
2127 return nullptr;
2128
2129 if (I.getIntrinsicID() != Intrinsic::minnum)
2130 return nullptr;
2131
2132 Type *Ty = I.getType();
2133 if (!isLegalFloatingTy(Ty->getScalarType()))
2134 return nullptr;
2135
2136 Value *Arg0 = I.getArgOperand(0);
2137 Value *Arg1 = I.getArgOperand(1);
2138
2139 const APFloat *C;
2140 if (!match(Arg1, m_APFloat(C)))
2141 return nullptr;
2142
2143 APFloat One(1.0);
2144 bool LosesInfo;
2145 One.convert(C->getSemantics(), APFloat::rmNearestTiesToEven, &LosesInfo);
2146
2147 // Match nextafter(1.0, -1)
2148 One.next(true);
2149 if (One != *C)
2150 return nullptr;
2151
2152 Value *FloorSrc;
2153 if (match(Arg0, m_FSub(m_Value(FloorSrc),
2154 m_Intrinsic<Intrinsic::floor>(m_Deferred(FloorSrc)))))
2155 return FloorSrc;
2156 return nullptr;
2157}
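The constant the matcher looks for is nextafter(1.0, -1.0): the largest value strictly below 1.0 in the source type, which is the most fract() can return. A small standalone sketch (plain C++ <cmath>, independent of the APFloat logic above) printing the values involved:

  #include <cmath>
  #include <cstdio>

  int main() {
    // Largest representable value strictly less than 1.0 in each type.
    float BoundF = std::nextafterf(1.0f, -1.0f); // 0x1.fffffep-1
    double BoundD = std::nextafter(1.0, -1.0);   // 0x1.fffffffffffffp-1
    std::printf("float bound:  %a\ndouble bound: %a\n", BoundF, BoundD);
    return 0;
  }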
2158
2159Value *AMDGPUCodeGenPrepareImpl::applyFractPat(IRBuilder<> &Builder,
2160 Value *FractArg) {
2161 SmallVector<Value *, 4> FractVals;
2162 extractValues(Builder, FractVals, FractArg);
2163
2164 SmallVector<Value *, 4> ResultVals(FractVals.size());
2165
2166 Type *Ty = FractArg->getType()->getScalarType();
2167 for (unsigned I = 0, E = FractVals.size(); I != E; ++I) {
2168 ResultVals[I] =
2169 Builder.CreateIntrinsic(Intrinsic::amdgcn_fract, {Ty}, {FractVals[I]});
2170 }
2171
2172 return insertValues(Builder, FractArg->getType(), ResultVals);
2173}
2174
2175bool AMDGPUCodeGenPrepareImpl::visitMinNum(IntrinsicInst &I) {
2176 Value *FractArg = matchFractPat(I);
2177 if (!FractArg)
2178 return false;
2179
2180 // Match pattern for fract intrinsic in contexts where the nan check has been
2181 // optimized out (and hope the knowledge that the source can't be nan wasn't lost).
2182 if (!I.hasNoNaNs() &&
2183 !isKnownNeverNaN(FractArg, /*Depth=*/0, SimplifyQuery(*DL, TLInfo)))
2184 return false;
2185
2186 IRBuilder<> Builder(&I);
2187 FastMathFlags FMF = I.getFastMathFlags();
2188 FMF.setNoNaNs();
2189 Builder.setFastMathFlags(FMF);
2190
2191 Value *Fract = applyFractPat(Builder, FractArg);
2192 Fract->takeName(&I);
2193 I.replaceAllUsesWith(Fract);
2194
2195 RecursivelyDeleteTriviallyDeadInstructions(&I, TLInfo);
2196 return true;
2197}
2198
2199static bool isOneOrNegOne(const Value *Val) {
2200 const APFloat *C;
2201 return match(Val, m_APFloat(C)) && C->getExactLog2Abs() == 0;
2202}
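getExactLog2Abs() == 0 holds exactly when the constant's magnitude is 2^0, i.e. the value is +1.0 or -1.0. A trivial plain-C++ analogue of the same check (an illustration, not the APFloat implementation):

  #include <cassert>
  #include <cmath>

  // Same predicate on doubles: |V| is exactly 1.0.
  static bool isOneOrNegOneDouble(double V) { return std::fabs(V) == 1.0; }

  int main() {
    assert(isOneOrNegOneDouble(1.0) && isOneOrNegOneDouble(-1.0));
    assert(!isOneOrNegOneDouble(0.5) && !isOneOrNegOneDouble(2.0) &&
           !isOneOrNegOneDouble(0.0));
    return 0;
  }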
2203
2204// Expand llvm.sqrt.f32 calls with !fpmath metadata in a semi-fast way.
2205bool AMDGPUCodeGenPrepareImpl::visitSqrt(IntrinsicInst &Sqrt) {
2206 Type *Ty = Sqrt.getType()->getScalarType();
2207 if (!Ty->isFloatTy() && (!Ty->isHalfTy() || ST->has16BitInsts()))
2208 return false;
2209
2210 const FPMathOperator *FPOp = cast<const FPMathOperator>(&Sqrt);
2211 FastMathFlags SqrtFMF = FPOp->getFastMathFlags();
2212
2213 // We're trying to handle the fast-but-not-that-fast case only. The lowering
2214 // of fast llvm.sqrt will give the raw instruction anyway.
2215 if (SqrtFMF.approxFunc() || HasUnsafeFPMath)
2216 return false;
2217
2218 const float ReqdAccuracy = FPOp->getFPAccuracy();
2219
2220 // Defer correctly rounded expansion to codegen.
2221 if (ReqdAccuracy < 1.0f)
2222 return false;
2223
2224 // FIXME: This is an ugly hack caused by this pass using forward iteration
2225 // instead of reverse. If it worked like a normal combiner, the rsq would form
2226 // before we saw a sqrt call.
2227 auto *FDiv =
2228 dyn_cast_or_null<FPMathOperator>(Sqrt.getUniqueUndroppableUser());
2229 if (FDiv && FDiv->getOpcode() == Instruction::FDiv &&
2230 FDiv->getFPAccuracy() >= 1.0f &&
2231 canOptimizeWithRsq(FPOp, FDiv->getFastMathFlags(), SqrtFMF) &&
2232 // TODO: We should also handle the arcp case for the fdiv with non-1 value
2233 isOneOrNegOne(FDiv->getOperand(0)))
2234 return false;
2235
2236 Value *SrcVal = Sqrt.getOperand(0);
2237 bool CanTreatAsDAZ = canIgnoreDenormalInput(SrcVal, &Sqrt);
2238
2239 // The raw instruction is 1 ulp, but the correction for denormal handling
2240 // brings it to 2.
2241 if (!CanTreatAsDAZ && ReqdAccuracy < 2.0f)
2242 return false;
2243
2244 IRBuilder<> Builder(&Sqrt);
2245 SmallVector<Value *, 4> SrcVals;
2246 extractValues(Builder, SrcVals, SrcVal);
2247
2248 SmallVector<Value *, 4> ResultVals(SrcVals.size());
2249 for (int I = 0, E = SrcVals.size(); I != E; ++I) {
2250 if (CanTreatAsDAZ)
2251 ResultVals[I] = Builder.CreateCall(getSqrtF32(), SrcVals[I]);
2252 else
2253 ResultVals[I] = emitSqrtIEEE2ULP(Builder, SrcVals[I], SqrtFMF);
2254 }
2255
2256 Value *NewSqrt = insertValues(Builder, Sqrt.getType(), ResultVals);
2257 NewSqrt->takeName(&Sqrt);
2258 Sqrt.replaceAllUsesWith(NewSqrt);
2259 Sqrt.eraseFromParent();
2260 return true;
2261}
2262
2263bool AMDGPUCodeGenPrepare::doInitialization(Module &M) {
2264 Impl.Mod = &M;
2265 Impl.DL = &Impl.Mod->getDataLayout();
2266 Impl.SqrtF32 = nullptr;
2267 Impl.LdexpF32 = nullptr;
2268 return false;
2269}
2270
2271bool AMDGPUCodeGenPrepare::runOnFunction(Function &F) {
2272 if (skipFunction(F))
2273 return false;
2274
2275 auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();
2276 if (!TPC)
2277 return false;
2278
2279 const AMDGPUTargetMachine &TM = TPC->getTM<AMDGPUTargetMachine>();
2280 Impl.TM = &TM;
2281 Impl.TLInfo = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F);
2282 Impl.ST = &TM.getSubtarget<GCNSubtarget>(F);
2283 Impl.AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
2284 Impl.UA = &getAnalysis<UniformityInfoWrapperPass>().getUniformityInfo();
2285 auto *DTWP = getAnalysisIfAvailable<DominatorTreeWrapperPass>();
2286 Impl.DT = DTWP ? &DTWP->getDomTree() : nullptr;
2287 Impl.HasUnsafeFPMath = hasUnsafeFPMath(F);
2288 SIModeRegisterDefaults Mode(F, *Impl.ST);
2289 Impl.HasFP32DenormalFlush =
2290 Mode.FP32Denormals == DenormalMode::getPreserveSign();
2291 return Impl.run(F);
2292}
2293
2294PreservedAnalyses AMDGPUCodeGenPreparePass::run(Function &F,
2295 FunctionAnalysisManager &FAM) {
2296 AMDGPUCodeGenPrepareImpl Impl;
2297 Impl.Mod = F.getParent();
2298 Impl.DL = &Impl.Mod->getDataLayout();
2299 Impl.TM = static_cast<const AMDGPUTargetMachine *>(&TM);
2300 Impl.TLInfo = &FAM.getResult<TargetLibraryAnalysis>(F);
2301 Impl.ST = &TM.getSubtarget<GCNSubtarget>(F);
2302 Impl.AC = &FAM.getResult<AssumptionAnalysis>(F);
2303 Impl.UA = &FAM.getResult<UniformityInfoAnalysis>(F);
2304 Impl.DT = FAM.getCachedResult<DominatorTreeAnalysis>(F);
2305 Impl.HasUnsafeFPMath = hasUnsafeFPMath(F);
2306 SIModeRegisterDefaults Mode(F, *Impl.ST);
2307 Impl.HasFP32DenormalFlush =
2308 Mode.FP32Denormals == DenormalMode::getPreserveSign();
2309 PreservedAnalyses PA = PreservedAnalyses::none();
2310 if (!Impl.FlowChanged)
2311 PA.preserveSet<CFGAnalyses>();
2312 return Impl.run(F) ? PA : PreservedAnalyses::all();
2313}
2314
2315INITIALIZE_PASS_BEGIN(AMDGPUCodeGenPrepare, DEBUG_TYPE,
2316 "AMDGPU IR optimizations", false, false)
2317INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
2318INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
2319INITIALIZE_PASS_DEPENDENCY(UniformityInfoWrapperPass)
2320INITIALIZE_PASS_END(AMDGPUCodeGenPrepare, DEBUG_TYPE, "AMDGPU IR optimizations",
2321 false, false)
2322
2323char AMDGPUCodeGenPrepare::ID = 0;
2324
2325FunctionPass *llvm::createAMDGPUCodeGenPreparePass() {
2326 return new AMDGPUCodeGenPrepare();
2327}