LLVM 23.0.0git
AMDGPUInstCombineIntrinsic.cpp
Go to the documentation of this file.
1//===- AMDGPUInstCombineIntrinsic.cpp - AMDGPU specific InstCombine pass --===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// \file
10// This file implements a TargetTransformInfo analysis pass specific to the
11// AMDGPU target machine. It uses the target's detailed information to provide
12// more precise answers to certain TTI queries, while letting the target
13// independent and default TTI implementations handle the rest.
14//
15//===----------------------------------------------------------------------===//
16
17#include "AMDGPUInstrInfo.h"
19#include "GCNSubtarget.h"
20#include "SIDefines.h"
22#include "llvm/ADT/STLExtras.h"
23#include "llvm/ADT/Sequence.h"
26#include "llvm/IR/Constants.h"
27#include "llvm/IR/Dominators.h"
28#include "llvm/IR/IntrinsicsAMDGPU.h"
31#include <optional>
32
33using namespace llvm;
34using namespace llvm::PatternMatch;
35
36#define DEBUG_TYPE "AMDGPUtti"
37
38namespace {
39
// Record type for the searchable table that the
// GET_AMDGPUImageDMaskIntrinsicTable_IMPL define and the generated
// AMDGPUGenSearchableTables.inc include (both right after this struct) bring
// into this translation unit.
40struct AMDGPUImageDMaskIntrinsic {
  // Presumably an intrinsic ID used as the table key -- confirm against the
  // TableGen definition of the table.
41 unsigned Intr;
42};
43
44#define GET_AMDGPUImageDMaskIntrinsicTable_IMPL
45#include "AMDGPUGenSearchableTables.inc"
46
47} // end anonymous namespace
48
49// Constant fold llvm.amdgcn.fmed3 intrinsics for standard inputs.
50//
51// A single NaN input is folded to minnum, so we rely on that folding for
52// handling NaNs.
53static APFloat fmed3AMDGCN(const APFloat &Src0, const APFloat &Src1,
54 const APFloat &Src2) {
55 APFloat Max3 = maxnum(maxnum(Src0, Src1), Src2);
56
57 APFloat::cmpResult Cmp0 = Max3.compare(Src0);
58 assert(Cmp0 != APFloat::cmpUnordered && "nans handled separately");
59 if (Cmp0 == APFloat::cmpEqual)
60 return maxnum(Src1, Src2);
61
62 APFloat::cmpResult Cmp1 = Max3.compare(Src1);
63 assert(Cmp1 != APFloat::cmpUnordered && "nans handled separately");
64 if (Cmp1 == APFloat::cmpEqual)
65 return maxnum(Src0, Src2);
66
67 return maxnum(Src0, Src1);
68}
69
70// Check if a value can be converted to a 16-bit value without losing
71// precision.
72// The value is expected to be either a float (IsFloat = true) or an unsigned
73// integer (IsFloat = false).
//
// Values that are already half or i16 yield false. A constant of the kind
// selected by IsFloat is judged on its own value and the result is returned
// immediately. Any other value qualifies only when it is an fpext (IsFloat) or
// a zext (!IsFloat) whose source type is half or i16.
//
// NOTE(review): this listing skips source line 86, so the call that fills in
// LosesInfo is not visible; presumably it converts FloatValue to IEEE half --
// confirm against the upstream file.
74static bool canSafelyConvertTo16Bit(Value &V, bool IsFloat) {
75 Type *VTy = V.getType();
76 if (VTy->isHalfTy() || VTy->isIntegerTy(16)) {
77 // The value is already 16-bit, so we don't want to convert to 16-bit again!
78 return false;
79 }
80 if (IsFloat) {
81 if (ConstantFP *ConstFloat = dyn_cast<ConstantFP>(&V)) {
82 // We need to check that if we cast the index down to a half, we do not
83 // lose precision.
84 APFloat FloatValue(ConstFloat->getValueAPF());
85 bool LosesInfo = true;
87 &LosesInfo);
88 return !LosesInfo;
89 }
90 } else {
91 if (ConstantInt *ConstInt = dyn_cast<ConstantInt>(&V)) {
92 // We need to check that if we cast the index down to an i16, we do not
93 // lose precision.
94 APInt IntValue(ConstInt->getValue());
      // The value is treated as unsigned: it fits iff it needs at most 16 bits.
95 return IntValue.getActiveBits() <= 16;
96 }
97 }
98
    // Non-constant case: accept only an extension from a 16-bit source, which
    // can be undone without losing information.
99 Value *CastSrc;
100 bool IsExt = IsFloat ? match(&V, m_FPExt(PatternMatch::m_Value(CastSrc)))
101 : match(&V, m_ZExt(PatternMatch::m_Value(CastSrc)));
102 if (IsExt) {
103 Type *CastSrcTy = CastSrc->getType();
104 if (CastSrcTy->isHalfTy() || CastSrcTy->isIntegerTy(16))
105 return true;
106 }
107
108 return false;
109}
110
111// Convert a value to 16-bit.
//
// The visible body handles three cases: returning operand 0 of an existing
// instruction (guarded by a condition that is not shown), an unsigned integer
// cast to i16, and a floating-point cast to half. Any other type is
// unreachable.
//
// NOTE(review): this listing skips source lines 112 and 114 -- the function
// signature and the condition guarding the first return. Presumably that
// condition recognises the fpext/zext accepted by canSafelyConvertTo16Bit;
// confirm against the upstream file.
113 Type *VTy = V.getType();
115 return cast<Instruction>(&V)->getOperand(0);
116 if (VTy->isIntegerTy())
    // isSigned = false: the integer is narrowed as an unsigned value.
117 return Builder.CreateIntCast(&V, Type::getInt16Ty(V.getContext()), false);
118 if (VTy->isFloatingPointTy())
119 return Builder.CreateFPCast(&V, Type::getHalfTy(V.getContext()));
120
121 llvm_unreachable("Should never be called!");
122}
123
124/// Applies Func(OldIntr.Args, OldIntr.ArgTys), creates intrinsic call with
125/// modified arguments (based on OldIntr) and replaces InstToReplace with
126/// this newly created intrinsic call.
127static std::optional<Instruction *> modifyIntrinsicCall(
128 IntrinsicInst &OldIntr, Instruction &InstToReplace, unsigned NewIntr,
129 InstCombiner &IC,
130 std::function<void(SmallVectorImpl<Value *> &, SmallVectorImpl<Type *> &)>
131 Func) {
132 SmallVector<Type *, 4> OverloadTys;
133 if (!Intrinsic::isSignatureValid(OldIntr.getCalledFunction(), OverloadTys))
134 return std::nullopt;
135
136 SmallVector<Value *, 8> Args(OldIntr.args());
137
138 // Modify arguments and types
139 Func(Args, OverloadTys);
140
141 CallInst *NewCall = IC.Builder.CreateIntrinsic(NewIntr, OverloadTys, Args);
142 NewCall->takeName(&OldIntr);
143 NewCall->copyMetadata(OldIntr);
144 if (isa<FPMathOperator>(NewCall))
145 NewCall->copyFastMathFlags(&OldIntr);
146
147 // Erase and replace uses
148 if (!InstToReplace.getType()->isVoidTy())
149 IC.replaceInstUsesWith(InstToReplace, NewCall);
150
151 bool RemoveOldIntr = &OldIntr != &InstToReplace;
152
153 auto *RetValue = IC.eraseInstFromFunction(InstToReplace);
154 if (RemoveOldIntr)
155 IC.eraseInstFromFunction(OldIntr);
156
157 return RetValue;
158}
159
/// Try to simplify the image-dimension intrinsic call II described by
/// ImageDimIntr.
///
/// The visible code attempts the following, returning at the first one that
/// applies:
///  - drop a constant LOD operand that is zero or negative (_L -> _LZ);
///  - drop a constant zero MIP operand;
///  - drop a constant zero bias operand together with its overload type;
///  - drop a constant zero offset operand;
///  - when the subtarget has D16 images and the base opcode supports D16, fold
///    fptrunc users into an intrinsic that returns half directly;
///  - when the subtarget has A16 and/or G16, narrow gradient, coordinate and
///    bias operands to 16 bits.
/// std::nullopt is returned when none of them applies.
///
/// NOTE(review): this listing drops a number of source lines (161, 163, 166,
/// 171, 183, 200, 218, 223, 237, 248, 261, 294, 329), including the rest of
/// the parameter list and several table-lookup calls. Code lines below are
/// left exactly as found; consult the upstream file for the missing pieces.
160static std::optional<Instruction *>
162 const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr,
164 // Optimize _L to _LZ when the LOD is a constant zero or negative value
165 if (const auto *LZMappingInfo =
167 if (auto *ConstantLod =
168 dyn_cast<ConstantFP>(II.getOperand(ImageDimIntr->LodIndex))) {
169 if (ConstantLod->isZero() || ConstantLod->isNegative()) {
170 const AMDGPU::ImageDimIntrinsicInfo *NewImageDimIntr =
172 ImageDimIntr->Dim);
173 return modifyIntrinsicCall(
174 II, II, NewImageDimIntr->Intr, IC, [&](auto &Args, auto &ArgTys) {
175 Args.erase(Args.begin() + ImageDimIntr->LodIndex);
176 });
177 }
178 }
179 }
180
181 // Optimize _mip away, when the mip operand is a constant zero
182 if (const auto *MIPMappingInfo =
184 if (auto *ConstantMip =
185 dyn_cast<ConstantInt>(II.getOperand(ImageDimIntr->MipIndex))) {
186 if (ConstantMip->isZero()) {
187 const AMDGPU::ImageDimIntrinsicInfo *NewImageDimIntr =
188 AMDGPU::getImageDimIntrinsicByBaseOpcode(MIPMappingInfo->NONMIP,
189 ImageDimIntr->Dim);
190 return modifyIntrinsicCall(
191 II, II, NewImageDimIntr->Intr, IC, [&](auto &Args, auto &ArgTys) {
192 Args.erase(Args.begin() + ImageDimIntr->MipIndex);
193 });
194 }
195 }
196 }
197
198 // Optimize _bias away when 'bias' is zero
199 if (const auto *BiasMappingInfo =
201 if (auto *ConstantBias =
202 dyn_cast<ConstantFP>(II.getOperand(ImageDimIntr->BiasIndex))) {
203 if (ConstantBias->isZero()) {
204 const AMDGPU::ImageDimIntrinsicInfo *NewImageDimIntr =
205 AMDGPU::getImageDimIntrinsicByBaseOpcode(BiasMappingInfo->NoBias,
206 ImageDimIntr->Dim);
207 return modifyIntrinsicCall(
208 II, II, NewImageDimIntr->Intr, IC, [&](auto &Args, auto &ArgTys) {
            // Unlike the other operands removed above, the bias also has an
            // overload type entry that must be removed.
209 Args.erase(Args.begin() + ImageDimIntr->BiasIndex);
210 ArgTys.erase(ArgTys.begin() + ImageDimIntr->BiasTyArg);
211 });
212 }
213 }
214 }
215
216 // Optimize _offset away when 'offset' is zero
217 if (const auto *OffsetMappingInfo =
219 if (auto *ConstantOffset =
220 dyn_cast<ConstantInt>(II.getOperand(ImageDimIntr->OffsetIndex))) {
221 if (ConstantOffset->isZero()) {
222 const AMDGPU::ImageDimIntrinsicInfo *NewImageDimIntr =
224 OffsetMappingInfo->NoOffset, ImageDimIntr->Dim);
225 return modifyIntrinsicCall(
226 II, II, NewImageDimIntr->Intr, IC, [&](auto &Args, auto &ArgTys) {
227 Args.erase(Args.begin() + ImageDimIntr->OffsetIndex);
228 });
229 }
230 }
231 }
232
233 // Try to use D16
234 if (ST->hasD16Images()) {
235
236 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
238
239 if (BaseOpcode->HasD16) {
240
241 // If the only use of image intrinsic is a fptrunc (with conversion to
242 // half) then both fptrunc and image intrinsic will be replaced with image
243 // intrinsic with D16 flag.
244 if (II.hasOneUse()) {
245 Instruction *User = II.user_back();
246
247 if (User->getOpcode() == Instruction::FPTrunc &&
249
250 return modifyIntrinsicCall(II, *User, ImageDimIntr->Intr, IC,
251 [&](auto &Args, auto &ArgTys) {
252 // Change return type of image intrinsic.
253 // Set it to return type of fptrunc.
254 ArgTys[0] = User->getType();
255 });
256 }
257 }
258
259 // Only perform D16 folding if every user of the image sample is
260 // an ExtractElementInst immediately followed by an FPTrunc to half.
262 ExtractTruncPairs;
263 bool AllHalfExtracts = true;
264
265 for (User *U : II.users()) {
266 auto *Ext = dyn_cast<ExtractElementInst>(U);
267 if (!Ext || !Ext->hasOneUse()) {
268 AllHalfExtracts = false;
269 break;
270 }
271
272 auto *Tr = dyn_cast<FPTruncInst>(*Ext->user_begin());
273 if (!Tr || !Tr->getType()->isHalfTy()) {
274 AllHalfExtracts = false;
275 break;
276 }
277
278 ExtractTruncPairs.emplace_back(Ext, Tr);
279 }
280
281 if (!ExtractTruncPairs.empty() && AllHalfExtracts) {
282 auto *VecTy = cast<VectorType>(II.getType());
283 Type *HalfVecTy =
284 VecTy->getWithNewType(Type::getHalfTy(II.getContext()));
285
286 // Obtain the original image sample intrinsic's signature
287 // and replace its return type with the half-vector for D16 folding
288 SmallVector<Type *, 8> OverloadTys;
289 if (!Intrinsic::isSignatureValid(II.getCalledFunction(), OverloadTys))
290 return std::nullopt;
291
292 OverloadTys[0] = HalfVecTy;
293 Module *M = II.getModule();
295 M, ImageDimIntr->Intr, OverloadTys);
296
        // The call is retyped in place rather than recreated; its users are
        // rewritten in the loops that follow.
297 II.mutateType(HalfVecTy);
298 II.setCalledFunction(HalfDecl);
299
300 IRBuilder<> Builder(II.getContext());
301 for (auto &[Ext, Tr] : ExtractTruncPairs) {
302 Value *Idx = Ext->getIndexOperand();
303
304 Builder.SetInsertPoint(Tr);
305
306 Value *HalfExtract = Builder.CreateExtractElement(&II, Idx);
307 HalfExtract->takeName(Tr);
308
309 Tr->replaceAllUsesWith(HalfExtract);
310 }
311
        // Erase each fptrunc before the extract that feeds it.
312 for (auto &[Ext, Tr] : ExtractTruncPairs) {
313 IC.eraseInstFromFunction(*Tr);
314 IC.eraseInstFromFunction(*Ext);
315 }
316
317 return &II;
318 }
319 }
320 }
321
322 // Try to use A16 or G16
323 if (!ST->hasA16() && !ST->hasG16())
324 return std::nullopt;
325
326 // Address is interpreted as float if the instruction has a sampler or as
327 // unsigned int if there is no sampler.
328 bool HasSampler =
330 bool FloatCoord = false;
331 // true means derivatives can be converted to 16 bit, coordinates not
332 bool OnlyDerivatives = false;
333
334 for (unsigned OperandIndex = ImageDimIntr->GradientStart;
335 OperandIndex < ImageDimIntr->VAddrEnd; OperandIndex++) {
336 Value *Coord = II.getOperand(OperandIndex);
337 // If the values are not derived from 16-bit values, we cannot optimize.
338 if (!canSafelyConvertTo16Bit(*Coord, HasSampler)) {
        // Give up if the failing operand is itself a gradient, or if the
        // intrinsic has no gradient operands at all.
339 if (OperandIndex < ImageDimIntr->CoordStart ||
340 ImageDimIntr->GradientStart == ImageDimIntr->CoordStart) {
341 return std::nullopt;
342 }
343 // All gradients can be converted, so convert only them
344 OnlyDerivatives = true;
345 break;
346 }
347
348 assert(OperandIndex == ImageDimIntr->GradientStart ||
349 FloatCoord == Coord->getType()->isFloatingPointTy());
350 FloatCoord = Coord->getType()->isFloatingPointTy();
351 }
352
353 if (!OnlyDerivatives && !ST->hasA16())
354 OnlyDerivatives = true; // Only supports G16
355
356 // Check if there is a bias parameter and if it can be converted to f16
357 if (!OnlyDerivatives && ImageDimIntr->NumBiasArgs != 0) {
358 Value *Bias = II.getOperand(ImageDimIntr->BiasIndex);
359 assert(HasSampler &&
360 "Only image instructions with a sampler can have a bias");
361 if (!canSafelyConvertTo16Bit(*Bias, HasSampler))
362 OnlyDerivatives = true;
363 }
364
    // Converting only the derivatives needs G16 and at least one gradient
    // operand.
365 if (OnlyDerivatives && (!ST->hasG16() || ImageDimIntr->GradientStart ==
366 ImageDimIntr->CoordStart))
367 return std::nullopt;
368
369 Type *CoordType = FloatCoord ? Type::getHalfTy(II.getContext())
370 : Type::getInt16Ty(II.getContext());
371
372 return modifyIntrinsicCall(
373 II, II, II.getIntrinsicID(), IC, [&](auto &Args, auto &ArgTys) {
374 ArgTys[ImageDimIntr->GradientTyArg] = CoordType;
375 if (!OnlyDerivatives) {
376 ArgTys[ImageDimIntr->CoordTyArg] = CoordType;
377
378 // Change the bias type
379 if (ImageDimIntr->NumBiasArgs != 0)
380 ArgTys[ImageDimIntr->BiasTyArg] = Type::getHalfTy(II.getContext());
381 }
382
383 unsigned EndIndex =
384 OnlyDerivatives ? ImageDimIntr->CoordStart : ImageDimIntr->VAddrEnd;
385 for (unsigned OperandIndex = ImageDimIntr->GradientStart;
386 OperandIndex < EndIndex; OperandIndex++) {
387 Args[OperandIndex] =
388 convertTo16Bit(*II.getOperand(OperandIndex), IC.Builder);
389 }
390
391 // Convert the bias
392 if (!OnlyDerivatives && ImageDimIntr->NumBiasArgs != 0) {
393 Value *Bias = II.getOperand(ImageDimIntr->BiasIndex);
394 Args[ImageDimIntr->BiasIndex] = convertTo16Bit(*Bias, IC.Builder);
395 }
396 });
397}
398
// Decide whether a "legacy" multiply of Op0 and Op1 can be replaced by a
// normal multiply. Returns true when either visible proof succeeds and false
// otherwise.
//
// NOTE(review): this listing skips source lines 399, 407-408 and 413: the
// start of the signature, the condition of the first `if`, and the definition
// of SQ. Confirm those against the upstream file.
400 const Value *Op0, const Value *Op1,
401 InstCombiner &IC) const {
402 // The legacy behaviour is that multiplying +/-0.0 by anything, even NaN or
403 // infinity, gives +0.0. If we can prove we don't have one of the special
404 // cases then we can use a normal multiply instead.
405 // TODO: Create and use isKnownFiniteNonZero instead of just matching
406 // constants here.
409 // One operand is not zero or infinity or NaN.
410 return true;
411 }
412
414 if (isKnownNeverInfOrNaN(Op0, SQ) && isKnownNeverInfOrNaN(Op1, SQ)) {
415 // Neither operand is infinity or NaN.
416 return true;
417 }
418 return false;
419}
420
421/// Match an fpext from half to float, or a constant we can convert.
///
/// Returns the half-typed source of a one-use fpext, or a half constant when
/// Arg is a floating-point constant for which LosesInfo ends up false;
/// nullptr otherwise. An fpext whose source is not half also yields nullptr,
/// because the constant branch is only tried when the fpext match fails.
///
/// NOTE(review): this listing skips source lines 422 and 431 -- the signature
/// and the call that sets LosesInfo (presumably a conversion of Val to IEEE
/// half). Confirm against the upstream file.
423 Value *Src = nullptr;
424 ConstantFP *CFP = nullptr;
425 if (match(Arg, m_OneUse(m_FPExt(m_Value(Src))))) {
426 if (Src->getType()->isHalfTy())
427 return Src;
428 } else if (match(Arg, m_ConstantFP(CFP))) {
429 bool LosesInfo;
430 APFloat Val(CFP->getValueAPF());
432 if (!LosesInfo)
433 return ConstantFP::get(Type::getHalfTy(Arg->getContext()), Val);
434 }
435 return nullptr;
436}
437
438// Trim all zero components from the end of the vector \p UseV and return
439// an appropriate bitset with known elements.
//
// The result starts with every element demanded. Walking from the last
// element towards element 1, the bit of each element that is a null or undef
// constant is cleared; the walk stops at the first element that cannot be
// found, is not a constant, or is a non-null, non-undef constant. Element 0 is
// never cleared.
//
// NOTE(review): this listing skips source line 440, the start of the
// signature; the parameter I is not referenced in the visible body.
441 Instruction *I) {
442 auto *VTy = cast<FixedVectorType>(UseV->getType());
443 unsigned VWidth = VTy->getNumElements();
444 APInt DemandedElts = APInt::getAllOnes(VWidth);
445
446 for (int i = VWidth - 1; i > 0; --i) {
447 auto *Elt = findScalarElement(UseV, i);
448 if (!Elt)
449 break;
450
451 if (auto *ConstElt = dyn_cast<Constant>(Elt)) {
452 if (!ConstElt->isNullValue() && !isa<UndefValue>(Elt))
453 break;
454 } else {
455 break;
456 }
457
458 DemandedElts.clearBit(i);
459 }
460
461 return DemandedElts;
462}
463
464// Trim elements off the end of the vector \p V, if they are
465// equal to the first element of the vector.
//
// Returns the demanded-elements mask: all bits start set, and the bit of each
// trailing element that repeats element 0 (or is undef / a poison mask entry)
// is cleared. The walk stops at the first element that differs, and element 0
// is never cleared.
//
// NOTE(review): this listing skips source line 466, the function signature.
467 auto *VTy = cast<FixedVectorType>(V->getType());
468 unsigned VWidth = VTy->getNumElements();
469 APInt DemandedElts = APInt::getAllOnes(VWidth);
470 Value *FirstComponent = findScalarElement(V, 0);
471
    // ShuffleMask stays empty unless V is a shufflevector; the loop below uses
    // that to pick between the two comparison strategies.
472 SmallVector<int> ShuffleMask;
473 if (auto *SVI = dyn_cast<ShuffleVectorInst>(V))
474 SVI->getShuffleMask(ShuffleMask);
475
476 for (int I = VWidth - 1; I > 0; --I) {
477 if (ShuffleMask.empty()) {
478 auto *Elt = findScalarElement(V, I);
479 if (!Elt || (Elt != FirstComponent && !isa<UndefValue>(Elt)))
480 break;
481 } else {
482 // Detect identical elements in the shufflevector result, even though
483 // findScalarElement cannot tell us what that element is.
484 if (ShuffleMask[I] != ShuffleMask[0] && ShuffleMask[I] != PoisonMaskElem)
485 break;
486 }
487 DemandedElts.clearBit(I);
488 }
489
490 return DemandedElts;
491}
492
// NOTE(review): tail of a forward declaration whose first lines (source lines
// 493-494: return type, name and leading parameters) are missing from this
// listing. Only the defaults are visible here: DMaskIdx defaults to -1 and
// IsLoad to true.
495 APInt DemandedElts,
496 int DMaskIdx = -1,
497 bool IsLoad = true);
498
499/// Return true if it's legal to contract llvm.amdgcn.rcp(llvm.sqrt)
500static bool canContractSqrtToRsq(const FPMathOperator *SqrtOp) {
501 return (SqrtOp->getType()->isFloatTy() &&
502 (SqrtOp->hasApproxFunc() || SqrtOp->getFPAccuracy() >= 1.0f)) ||
503 SqrtOp->getType()->isHalfTy();
504}
505
506/// Return true if we can easily prove that use U is uniform.
///
/// Constants are uniform. A call to an always-uniform intrinsic counts only
/// when it lives in the same block as the user of U. Anything else is treated
/// as not provably uniform.
///
/// NOTE(review): this listing skips source line 512, the statement executed
/// for Argument values, so the handling of arguments is not visible here.
507static bool isTriviallyUniform(const Use &U) {
508 Value *V = U.get();
509 if (isa<Constant>(V))
510 return true;
511 if (const auto *A = dyn_cast<Argument>(V))
513 if (const auto *II = dyn_cast<IntrinsicInst>(V)) {
514 if (!AMDGPU::isIntrinsicAlwaysUniform(II->getIntrinsicID()))
515 return false;
516 // If II and U are in different blocks then there is a possibility of
517 // temporal divergence.
518 return II->getParent() == cast<Instruction>(U.getUser())->getParent();
519 }
520 return false;
521}
522
523/// Simplify a lane index operand (e.g. llvm.amdgcn.readlane src1).
524///
525/// The instruction only reads the low 5 bits for wave32, and 6 bits for wave64.
///
/// Returns true if the operand at LaneArgIdx was changed, either by
/// SimplifyDemandedBits or by replacing a constant with its masked value.
///
/// NOTE(review): this listing skips source lines 526-527, the start of the
/// signature.
528 unsigned LaneArgIdx) const {
    // Only the low log2(wavefront size) bits of the 32-bit lane index matter.
529 unsigned MaskBits = ST->getWavefrontSizeLog2();
530 APInt DemandedMask(32, maskTrailingOnes<unsigned>(MaskBits));
531
532 KnownBits Known(32);
533 if (IC.SimplifyDemandedBits(&II, LaneArgIdx, DemandedMask, Known))
534 return true;
535
536 if (!Known.isConstant())
537 return false;
538
539 // Out of bounds indexes may appear in wave64 code compiled for wave32.
540 // Unlike the DAG version, SimplifyDemandedBits does not change constants, so
541 // manually fix it up.
542
543 Value *LaneArg = II.getArgOperand(LaneArgIdx);
544 Constant *MaskedConst =
545 ConstantInt::get(LaneArg->getType(), Known.getConstant() & DemandedMask);
    // Only report a change when masking produced a different constant.
546 if (MaskedConst != LaneArg) {
547 II.getOperandUse(LaneArgIdx).set(MaskedConst);
548 return true;
549 }
550
551 return false;
552}
553
// Create a call to NewCallee with operands Ops that carries over the operand
// bundles of Old and takes over Old's name. Old itself is not erased or
// otherwise modified beyond losing its name.
//
// NOTE(review): this listing skips source lines 554 and 556 -- the start of
// the signature and the declaration of OpBundles.
555 Function &NewCallee, ArrayRef<Value *> Ops) {
557 Old.getOperandBundlesAsDefs(OpBundles);
558
559 CallInst *NewCall = B.CreateCall(&NewCallee, Ops, OpBundles);
560 NewCall->takeName(&Old);
561 return NewCall;
562}
563
564// Return true for sequences of instructions that effectively assign
565// each lane to its thread ID
//
// The matcher that is applied depends on the wave size reported by ST: W32Pred
// on wave32 and W64Pred on wave64.
//
// NOTE(review): this listing skips source lines 570-574, where W32Pred and
// W64Pred are defined (presumably pattern matchers for the mbcnt sequences
// listed under "Case 1"); any further cases are not visible here.
566static bool isThreadID(const GCNSubtarget &ST, Value *V) {
567 // Case 1:
568 // wave32: mbcnt_lo(-1, 0)
569 // wave64: mbcnt_hi(-1, mbcnt_lo(-1, 0))
575 if (ST.isWave32() && match(V, W32Pred))
576 return true;
577 if (ST.isWave64() && match(V, W64Pred))
578 return true;
579
580 return false;
581}
582
// Try to hoist the lane intrinsic II (readlane, readfirstlane or permlane64)
// through the instruction that defines its first operand:
//   (II (OpInst x)) -> (OpInst (II x))
// Returns the rewritten clone of OpInst, or nullptr when the transform does
// not apply. The clone is created with clone() and is not inserted anywhere in
// the visible code.
//
// NOTE(review): this listing skips source lines 583-584 (the start of the
// signature) and 621 (the declaration of Ops inside DoIt).
585 IntrinsicInst &II) const {
586 const auto IID = II.getIntrinsicID();
587 assert(IID == Intrinsic::amdgcn_readlane ||
588 IID == Intrinsic::amdgcn_readfirstlane ||
589 IID == Intrinsic::amdgcn_permlane64);
590
591 Instruction *OpInst = dyn_cast<Instruction>(II.getOperand(0));
592
593 // Only do this if both instructions are in the same block
594 // (so the exec mask won't change) and the readlane is the only user of its
595 // operand.
596 if (!OpInst || !OpInst->hasOneUser() || OpInst->getParent() != II.getParent())
597 return nullptr;
598
599 const bool IsReadLane = (IID == Intrinsic::amdgcn_readlane);
600
601 // If this is a readlane, check that the second operand is a constant, or is
602 // defined before OpInst so we know it's safe to move this intrinsic higher.
603 Value *LaneID = nullptr;
604 if (IsReadLane) {
605 LaneID = II.getOperand(1);
606
607 // readlane take an extra operand for the lane ID, so we must check if that
608 // LaneID value can be used at the point where we want to move the
609 // intrinsic.
610 if (auto *LaneIDInst = dyn_cast<Instruction>(LaneID)) {
611 if (!IC.getDominatorTree().dominates(LaneIDInst, OpInst))
612 return nullptr;
613 }
614 }
615
616 // Hoist the intrinsic (II) through OpInst.
617 //
618 // (II (OpInst x)) -> (OpInst (II x))
    // OpIdx selects which operand of OpInst receives the new intrinsic call;
    // NewIntrinsic is the declaration to call.
619 const auto DoIt = [&](unsigned OpIdx,
620 Function *NewIntrinsic) -> Instruction * {
622 if (IsReadLane)
623 Ops.push_back(LaneID);
624
625 // Rewrite the intrinsic call.
626 CallInst *NewII = rewriteCall(IC.Builder, II, *NewIntrinsic, Ops);
627
628 // Rewrite OpInst so it takes the result of the intrinsic now.
629 Instruction &NewOp = *OpInst->clone();
630 NewOp.setOperand(OpIdx, NewII);
631 return &NewOp;
632 };
633
634 // TODO(?): Should we do more with permlane64?
635 if (IID == Intrinsic::amdgcn_permlane64 && !isa<BitCastInst>(OpInst))
636 return nullptr;
637
638 if (isa<UnaryOperator>(OpInst))
639 return DoIt(0, II.getCalledFunction());
640
641 if (isa<CastInst>(OpInst)) {
      // The intrinsic now operates on the cast's source type, so a
      // declaration overloaded on that type is needed.
642 Value *Src = OpInst->getOperand(0);
643 Type *SrcTy = Src->getType();
644 if (!isTypeLegal(SrcTy))
645 return nullptr;
646
647 Function *Remangled =
648 Intrinsic::getOrInsertDeclaration(II.getModule(), IID, {SrcTy});
649 return DoIt(0, Remangled);
650 }
651
652 // We can also hoist through binary operators if the other operand is uniform.
653 if (isa<BinaryOperator>(OpInst)) {
654 // FIXME: If we had access to UniformityInfo here we could just check
655 // if the operand is uniform.
      // The intrinsic is applied to the operand that is NOT known uniform.
656 if (isTriviallyUniform(OpInst->getOperandUse(0)))
657 return DoIt(1, II.getCalledFunction());
658 if (isTriviallyUniform(OpInst->getOperandUse(1)))
659 return DoIt(0, II.getCalledFunction());
660 }
661
662 return nullptr;
663}
664
665/// Evaluate V as a function of the lane ID and return its value on Lane, or
666/// std::nullopt if V is not a closed-form expression of the lane ID.
///
/// Visible handling: undef/poison is rejected, integer constants evaluate to
/// their zero-extended value, a thread-ID expression (see isThreadID)
/// evaluates to Lane, and a two-operand operation BO is evaluated recursively
/// on both operands and then folded over constants of BO's type.
///
/// NOTE(review): this listing skips source lines 671, 685 and 701: the
/// condition guarding the first return (presumably a recursion-depth limit on
/// Depth), the definition of BO, and the folding call that produces CI.
/// Confirm against the upstream file.
667static std::optional<unsigned> evalLaneExpr(Value *V, unsigned Lane,
668 const GCNSubtarget &ST,
669 const DataLayout &DL,
670 unsigned Depth = 0) {
672 return std::nullopt;
673
674 // Poison/undef in the index expression: bail and let InstCombine fold the
675 // intrinsic the usual way.
676 if (isa<UndefValue>(V))
677 return std::nullopt;
678
679 if (const ConstantInt *CI = dyn_cast<ConstantInt>(V))
680 return CI->getZExtValue();
681
682 if (isThreadID(ST, V))
683 return Lane;
684
686 if (!BO)
687 return std::nullopt;
688
689 std::optional<unsigned> LHS =
690 evalLaneExpr(BO->getOperand(0), Lane, ST, DL, Depth + 1);
691 if (!LHS)
692 return std::nullopt;
693 std::optional<unsigned> RHS =
694 evalLaneExpr(BO->getOperand(1), Lane, ST, DL, Depth + 1);
695 if (!RHS)
696 return std::nullopt;
697
    // Re-materialise both operand values as constants of BO's type so the
    // operation can be folded.
698 Type *Ty = BO->getType();
699 Constant *Ops[] = {ConstantInt::get(Ty, *LHS), ConstantInt::get(Ty, *RHS)};
700 auto *CI =
702 return CI ? std::optional<unsigned>(CI->getZExtValue()) : std::nullopt;
703}
704
705/// Build the per-lane shuffle map by evaluating Index for every lane in the
706/// wave. Returns false if any lane index is non-constant or out of range.
///
/// Ids is resized to the wavefront size up front; on failure it may be left
/// partially filled.
///
/// NOTE(review): this listing skips source line 708, the declaration of the
/// Ids output parameter.
707static bool tryBuildShuffleMap(Value *Index, const GCNSubtarget &ST,
709 const DataLayout &DL) {
710 unsigned WaveSize = ST.getWavefrontSize();
711 Ids.resize(WaveSize);
712 for (unsigned Lane : seq(WaveSize)) {
713 std::optional<unsigned> Val = evalLaneExpr(Index, Lane, ST, DL);
714 if (!Val || *Val >= WaveSize)
715 return false;
716 Ids[Lane] = *Val;
717 }
718 return true;
719}
720
721/// Lanes are partitioned into groups of Period; each group is a translated
722/// copy of the first: Ids[I] = Ids[I % Period] + (I & ~(Period - 1)).
///
/// Returns true when every entry from index Period onwards satisfies that
/// relation. The first Period entries are not constrained here.
///
/// NOTE(review): this listing skips source line 724, the function signature.
723template <unsigned Period>
725 static_assert(isPowerOf2_32(Period), "Period must be a power of two");
726 for (unsigned I = Period, E = Ids.size(); I < E; ++I)
727 if (Ids[I] != Ids[I % Period] + (I & ~(Period - 1)))
728 return false;
729 return true;
730}
731
732/// Match an N-lane row pattern: each lane in [0, N) reads from a source lane
733/// in the same N-lane row, and the pattern repeats periodically across rows.
734template <unsigned N> static bool isRowPattern(ArrayRef<uint8_t> Ids) {
735 for (unsigned I = 0; I < N; ++I)
736 if (Ids[I] >= N)
737 return false;
738 return hasPeriodicLayout<N>(Ids);
739}
740
// Named instantiations of isRowPattern for row widths of 4, 8 and 16 lanes.
741static constexpr auto isQuadPattern = isRowPattern<4>;
742static constexpr auto isHalfRowPattern = isRowPattern<8>;
743static constexpr auto isFullRowPattern = isRowPattern<16>;
744
745/// Match a 4-lane (quad) permutation, encoded as the v_mov_b32_dpp
746/// QUAD_PERM control word: bits[1:0]=Ids[0], [3:2]=Ids[1], [5:4]=Ids[2],
747/// [7:6]=Ids[3].
748static std::optional<unsigned> matchQuadPermPattern(ArrayRef<uint8_t> Ids) {
749 if (!isQuadPattern(Ids))
750 return std::nullopt;
751 return Ids[3] << 6 | Ids[2] << 4 | Ids[1] << 2 | Ids[0];
752}
753
754/// Match an N-lane reversal (mirror) pattern.
755template <unsigned N> static bool matchMirrorPattern(ArrayRef<uint8_t> Ids) {
756 if (!isRowPattern<N>(Ids))
757 return false;
758 for (unsigned J = 0; J < N; ++J)
759 if (Ids[J] != (N - 1) - J)
760 return false;
761 return true;
762}
763
766
767/// Match a 16-lane cyclic rotation; returns the rotation amount in [1, 15].
768static std::optional<unsigned> matchRowRotatePattern(ArrayRef<uint8_t> Ids) {
769 if (Ids[0] == 0 || !isFullRowPattern(Ids))
770 return std::nullopt;
771 for (unsigned J = 1; J < 16; ++J)
772 if (Ids[J] != (Ids[0] + J) % 16)
773 return std::nullopt;
774 return 16u - Ids[0];
775}
776
777/// Match a row-share pattern: all 16 lanes of each row read the same source
778/// lane. Returns the shared source lane index in [0, 16).
779static std::optional<unsigned> matchRowSharePattern(ArrayRef<uint8_t> Ids) {
780 if (!isFullRowPattern(Ids))
781 return std::nullopt;
782 if (!all_equal(Ids.take_front(16)))
783 return std::nullopt;
784 return Ids[0];
785}
786
787/// Match an XOR mask pattern within each 16-lane row: Ids[J] == Mask ^ J,
788/// with Mask in [1, 15].
789static std::optional<unsigned> matchRowXMaskPattern(ArrayRef<uint8_t> Ids) {
790 unsigned Mask = Ids[0];
791 if (Mask == 0 || !isFullRowPattern(Ids))
792 return std::nullopt;
793 for (unsigned J = 0; J < 16; ++J)
794 if (Ids[J] != (Mask ^ J))
795 return std::nullopt;
796 return Mask;
797}
798
799/// Match an 8-lane arbitrary permutation, encoded as the v_mov_b32_dpp8
800/// 24-bit selector (three bits per output lane).
801static std::optional<unsigned> matchHalfRowPermPattern(ArrayRef<uint8_t> Ids) {
802 if (!isHalfRowPattern(Ids))
803 return std::nullopt;
804 unsigned Selector = 0;
805 for (unsigned J = 0; J < 8; ++J)
806 Selector |= Ids[J] << (J * 3);
807 return Selector;
808}
809
810/// Pack a 16-lane permutation into a single 64-bit value: four bits per output
811/// lane, lane J in bits [J*4 + 3 : J*4]. The caller splits it into the low and
812/// high 32-bit selector operands of v_permlane16 / v_permlanex16.
///
/// Only the low four bits of each of the first 16 entries of Ids are used.
///
/// NOTE(review): this listing skips source line 813, the function signature.
814 uint64_t Sel = 0;
815 for (unsigned J = 0; J < 16; ++J)
    // Widen before shifting so that lanes 8..15 land in the upper 32 bits.
816 Sel |= static_cast<uint64_t>(Ids[J] & 0xF) << (J * 4);
817 return Sel;
818}
819
820/// Match a half-wave swap: lane J reads from lane J ^ 32. Only meaningful on
821/// wave64 targets.
///
/// Returns false for any map that does not have exactly 64 entries.
///
/// NOTE(review): this listing skips source line 822, the function signature.
823 if (Ids.size() != 64)
824 return false;
825 for (unsigned J = 0; J < 64; ++J)
826 if (Ids[J] != (J ^ 32))
827 return false;
828 return true;
829}
830
831/// Match a cross-row permutation suitable for v_permlanex16: every lane in
832/// the low 16-lane half reads from the high half of its own row, and vice
833/// versa.
///
/// Concretely, the map must repeat with a period of 32 lanes, entries 0..15
/// must lie in [16, 32), and entry J + 16 must equal entry J minus 16, so both
/// halves use the same relative lane selection.
///
/// NOTE(review): this listing skips source line 834, the function signature.
835 if (!hasPeriodicLayout<32>(Ids))
836 return false;
837 for (unsigned J = 0; J < 16; ++J) {
838 if (Ids[J] < 16 || Ids[J] >= 32)
839 return false;
840 if (Ids[J + 16] != Ids[J] - 16)
841 return false;
842 }
843 return true;
844}
845
846/// Match a DS_SWIZZLE bitmask-mode permutation:
847/// dst_lane = ((src_lane & AND) | OR) ^ XOR
848/// with each mask being five bits. Returns the encoded swizzle immediate.
849/// The hardware applies the formula independently within each 32-lane group,
850/// so on wave64 the high group must replicate the low one (translated by 32).
///
/// std::nullopt is returned when the map is not periodic over 32 lanes or when
/// the derived masks fail to reproduce the first 32 entries.
///
/// NOTE(review): this listing skips source lines 852 (the rest of the
/// signature) and 880-883 (the final return that encodes the three masks into
/// the immediate). Confirm the encoding against the upstream file.
851static std::optional<unsigned>
853 if (!hasPeriodicLayout<32>(Ids))
854 return std::nullopt;
855
856 // The formula is per-bit: output bit B depends only on input bit B. Probe
857 // each bit with src=0 and src=(1<<B); if the output bit flipped, AND[B]=1
858 // and XOR[B] carries the constant offset; otherwise it is a constant bit
859 // encoded in OR (with AND[B]=0, XOR[B]=0).
860 unsigned AndMask = 0, OrMask = 0, XorMask = 0;
861 for (unsigned B = 0; B < 5; ++B) {
862 unsigned Bit0 = (Ids[0] >> B) & 1;
863 unsigned Bit1 = (Ids[1u << B] >> B) & 1;
864 if (Bit0 != Bit1) {
865 AndMask |= 1u << B;
866 XorMask |= Bit0 << B;
867 } else {
868 OrMask |= Bit0 << B;
869 }
870 }
871
872 // The per-bit derivation assumes bit independence; verify the masks
873 // actually reproduce every lane in the 32-lane group.
874 for (unsigned I : seq(32u)) {
875 unsigned Expected = ((I & AndMask) | OrMask) ^ XorMask;
876 if (Ids[I] != Expected)
877 return std::nullopt;
878 }
879
884}
885
886/// Emit v_mov_b32_dpp with the given control word, row/bank masks 0xF, and
887/// bound_ctrl=1 so out-of-bounds lanes are well-defined and the DPP mov can
888/// be folded into a consuming VALU op by GCNDPPCombine.
889static Value *createUpdateDpp(IRBuilderBase &B, Value *Val, unsigned Ctrl) {
890 Type *Ty = Val->getType();
891 return B.CreateIntrinsic(Intrinsic::amdgcn_update_dpp, {Ty},
892 {PoisonValue::get(Ty), Val, B.getInt32(Ctrl),
893 B.getInt32(0xF), B.getInt32(0xF), B.getTrue()});
894}
895
896/// Emit v_mov_b32_dpp8 with the given 24-bit lane selector.
897static Value *createMovDpp8(IRBuilderBase &B, Value *Val, unsigned Selector) {
898 return B.CreateIntrinsic(Intrinsic::amdgcn_mov_dpp8, {Val->getType()},
899 {Val, B.getInt32(Selector)});
900}
901
902/// Emit v_permlane16 with the precomputed lane-select halves.
///
/// The call is given poison of Val's type as its first operand, then Val, the
/// two 32-bit selector halves Lo and Hi, and two false flags.
///
/// NOTE(review): this listing skips source line 903, the start of the
/// signature. The meaning of the two trailing false operands is not visible
/// here -- confirm against the intrinsic definition.
904 uint32_t Hi) {
905 Type *Ty = Val->getType();
906 return B.CreateIntrinsic(Intrinsic::amdgcn_permlane16, {Ty},
907 {PoisonValue::get(Ty), Val, B.getInt32(Lo),
908 B.getInt32(Hi), B.getFalse(), B.getFalse()});
909}
910
911/// Emit v_permlanex16 with the precomputed lane-select halves. Each output
912/// lane reads from the other 16-lane half of the same row.
///
/// The call is given poison of Val's type as its first operand, then Val, the
/// two 32-bit selector halves Lo and Hi, and two false flags.
///
/// NOTE(review): this listing skips source line 913, the start of the
/// signature. The meaning of the two trailing false operands is not visible
/// here -- confirm against the intrinsic definition.
914 uint32_t Hi) {
915 Type *Ty = Val->getType();
916 return B.CreateIntrinsic(Intrinsic::amdgcn_permlanex16, {Ty},
917 {PoisonValue::get(Ty), Val, B.getInt32(Lo),
918 B.getInt32(Hi), B.getFalse(), B.getFalse()});
919}
920
921/// Emit ds_swizzle with the given immediate, bitcasting/converting between
922/// pointer/float types and i32 as required by the intrinsic signature.
///
/// Val must be 32 bits wide (asserted). Pointers go through ptrtoint/inttoptr,
/// other non-i32 types through bitcasts, and i32 is passed straight through.
/// The returned value has Val's original type.
///
/// NOTE(review): this listing skips source line 923, the start of the
/// signature.
924 const DataLayout &DL) {
925 Type *OrigTy = Val->getType();
926 assert(DL.getTypeSizeInBits(OrigTy) == 32 &&
927 "ds_swizzle only supports 32-bit operands");
928 IntegerType *I32Ty = B.getInt32Ty();
    // Bring the operand to i32.
929 Value *Src = Val;
930 if (OrigTy->isPointerTy())
931 Src = B.CreatePtrToInt(Src, I32Ty);
932 else if (OrigTy != I32Ty)
933 Src = B.CreateBitCast(Src, I32Ty);
934 Value *Result = B.CreateIntrinsic(Intrinsic::amdgcn_ds_swizzle, {},
935 {Src, B.getInt32(Offset)});
    // Convert the i32 result back to the original type.
936 if (OrigTy->isPointerTy())
937 return B.CreateIntToPtr(Result, OrigTy);
938 if (OrigTy != I32Ty)
939 return B.CreateBitCast(Result, OrigTy);
940 return Result;
941}
942
943/// Emit v_permlane64 (swap of the two 32-lane halves of a wave64).
///
/// The intrinsic is overloaded on Val's type and takes Val as its only
/// operand.
///
/// NOTE(review): this listing skips source line 944, the function signature.
945 return B.CreateIntrinsic(Intrinsic::amdgcn_permlane64, {Val->getType()},
946 {Val});
947}
948
949/// Given a shuffle map, try to emit the best hardware intrinsic.
///
/// The matchers are tried in the order written below, each guarded by the
/// subtarget feature it needs, and the first hit wins. Returns the value
/// produced by the emitted intrinsic, or nullptr when no pattern matches or
/// when every lane reads the same source lane.
///
/// NOTE(review): this listing skips source lines 950-951 (the start of the
/// signature), 962, 966-969 (further code in the first two DPP sections), and
/// 992 and 997 (the definitions of Sel). Confirm against the upstream file.
952 const GCNSubtarget &ST,
953 const DataLayout &DL) {
954 // Uniform shuffle (all lanes read the same value) is handled by cheaper
955 // broadcast/readlane intrinsics.
956 if (all_equal(Ids))
957 return nullptr;
958
959 if (std::optional<unsigned> QP = matchQuadPermPattern(Ids)) {
960 if (ST.hasDPP())
961 return createUpdateDpp(B, Src, *QP);
963 }
964
965 if (ST.hasDPP()) {
970 if (std::optional<unsigned> Amt = matchRowRotatePattern(Ids))
        // Amt is in [1, 15], hence the "- 1" relative to ROW_ROR_FIRST.
971 return createUpdateDpp(B, Src, AMDGPU::DPP::ROW_ROR_FIRST + *Amt - 1);
972 }
973
974 // row_share is supported on GFX90A and GFX10+; row_xmask is GFX10+ only.
975 if (ST.hasDPPRowShare()) {
976 if (std::optional<unsigned> Lane = matchRowSharePattern(Ids))
977 return createUpdateDpp(B, Src, AMDGPU::DPP::ROW_SHARE_FIRST + *Lane);
978 }
979
980 if (ST.hasDPP() && ST.hasGFX10Insts()) {
981 if (std::optional<unsigned> Mask = matchRowXMaskPattern(Ids))
982 return createUpdateDpp(B, Src, AMDGPU::DPP::ROW_XMASK_FIRST + *Mask);
983 }
984
985 if (ST.hasDPP8()) {
986 if (std::optional<unsigned> Sel = matchHalfRowPermPattern(Ids))
987 return createMovDpp8(B, Src, *Sel);
988 }
989
990 if (ST.hasPermLaneX16()) {
991 if (isFullRowPattern(Ids)) {
993 return createPermlane16(B, Src, Lo_32(Sel), Hi_32(Sel));
994 }
995 // Cross-row shuffles (e.g. XOR 16..31) — covered by permlanex16.
996 if (isCrossRowPattern(Ids)) {
998 return createPermlaneX16(B, Src, Lo_32(Sel), Hi_32(Sel));
999 }
1000 }
1001
1002 // Generic DS_SWIZZLE bitmask-mode fallback: handles any 32-lane shuffle that
1003 // can be expressed as dst = ((src & AND) | OR) ^ XOR with 5-bit masks. This
1004 // is available on every target that has ds_swizzle.
1005 if (std::optional<unsigned> Imm = matchDsSwizzleBitmaskPattern(Ids))
1006 return createDsSwizzle(B, Src, *Imm, DL);
1007
1008 if (ST.hasPermLane64() && matchHalfWaveSwapPattern(Ids))
1009 return createPermlane64(B, Src);
1010
1011 return nullptr;
1012}
1013
1014/// Try to fold a wave_shuffle/ds_bpermute whose lane index is a constant
1015/// function of the lane ID into a hardware-specific lane permutation intrinsic.
///
/// The fold is only attempted when the result type of \c II is exactly 32 bits
/// wide and the subtarget \c ST has a known wave size. A per-lane map of source
/// lanes is built from the index operand and handed to
/// matchShuffleToHWIntrinsic.
///
/// \returns std::nullopt when the intrinsic is left untouched; otherwise the
/// result of replacing all uses of \c II with the new value.
///
/// NOTE(review): the declaration line of this function and the declaration of
/// \c Ids are not visible in this listing.
1016static std::optional<Instruction *>
1018 const GCNSubtarget &ST) {
1019 const DataLayout &DL = IC.getDataLayout();
1020 if (DL.getTypeSizeInBits(II.getType()) != 32)
1021 return std::nullopt;
1022
1023 if (!ST.isWaveSizeKnown())
1024 return std::nullopt;
1025
1026 unsigned WaveSize = ST.getWavefrontSize();
1027 bool IsBpermute = II.getIntrinsicID() == Intrinsic::amdgcn_ds_bpermute;
 // The two intrinsics take their operands in opposite order: ds_bpermute is
 // read as (index, source), anything else as (source, index).
1028 Value *Src = II.getArgOperand(IsBpermute ? 1 : 0);
1029 Value *Index = II.getArgOperand(IsBpermute ? 0 : 1);
1030
1032 if (IsBpermute) {
1033 Ids.resize(WaveSize);
 // Evaluate the index expression separately for every lane of the wave.
1034 for (unsigned Lane : seq(WaveSize)) {
1035 std::optional<unsigned> Val = evalLaneExpr(Index, Lane, ST, DL);
 // Give up unless the index is known, is a multiple of 4, and addresses a
 // lane inside the wave once divided by 4.
1036 if (!Val || (*Val & 3) || (*Val >> 2) >= WaveSize)
1037 return std::nullopt;
1038 Ids[Lane] = *Val >> 2;
1039 }
1040 } else {
 // For the non-bpermute form the map is built directly from the index
 // operand by tryBuildShuffleMap.
1041 if (!tryBuildShuffleMap(Index, ST, Ids, DL))
1042 return std::nullopt;
1043 }
1044
1045 Value *Result = matchShuffleToHWIntrinsic(IC.Builder, Src, Ids, ST, DL);
 // nullptr means no cheaper hardware intrinsic matched this map.
1046 if (!Result)
1047 return std::nullopt;
1048
1049 return IC.replaceInstUsesWith(II, Result);
1050}
1051
1052std::optional<Instruction *>
1054 Intrinsic::ID IID = II.getIntrinsicID();
1055 switch (IID) {
1056 case Intrinsic::amdgcn_implicitarg_ptr: {
1057 if (II.getFunction()->hasFnAttribute("amdgpu-no-implicitarg-ptr"))
1058 return IC.replaceInstUsesWith(II, PoisonValue::get(II.getType()));
1059 uint64_t ImplicitArgBytes = ST->getImplicitArgNumBytes(*II.getFunction());
1060
1061 uint64_t CurrentOrNullBytes =
1062 II.getAttributes().getRetDereferenceableOrNullBytes();
1063 if (CurrentOrNullBytes != 0) {
1064 // Refine "dereferenceable (A) meets dereferenceable_or_null(B)"
1065 // into dereferenceable(max(A, B))
1066 uint64_t NewBytes = std::max(CurrentOrNullBytes, ImplicitArgBytes);
1067 II.addRetAttr(
1068 Attribute::getWithDereferenceableBytes(II.getContext(), NewBytes));
1069 II.removeRetAttr(Attribute::DereferenceableOrNull);
1070 return &II;
1071 }
1072
1073 uint64_t CurrentBytes = II.getAttributes().getRetDereferenceableBytes();
1074 uint64_t NewBytes = std::max(CurrentBytes, ImplicitArgBytes);
1075 if (NewBytes != CurrentBytes) {
1076 II.addRetAttr(
1077 Attribute::getWithDereferenceableBytes(II.getContext(), NewBytes));
1078 return &II;
1079 }
1080
1081 return std::nullopt;
1082 }
1083 case Intrinsic::amdgcn_rcp: {
1084 Value *Src = II.getArgOperand(0);
1085 if (isa<PoisonValue>(Src))
1086 return IC.replaceInstUsesWith(II, Src);
1087
1088 // TODO: Move to ConstantFolding/InstSimplify?
1089 if (isa<UndefValue>(Src)) {
1090 Type *Ty = II.getType();
1091 auto *QNaN = ConstantFP::get(Ty, APFloat::getQNaN(Ty->getFltSemantics()));
1092 return IC.replaceInstUsesWith(II, QNaN);
1093 }
1094
1095 if (II.isStrictFP())
1096 break;
1097
1098 if (const ConstantFP *C = dyn_cast<ConstantFP>(Src)) {
1099 const APFloat &ArgVal = C->getValueAPF();
1100 APFloat Val(ArgVal.getSemantics(), 1);
1102
1103 // This is more precise than the instruction may give.
1104 //
1105 // TODO: The instruction always flushes denormal results (except for f16),
1106 // should this also?
1107 return IC.replaceInstUsesWith(II, ConstantFP::get(II.getContext(), Val));
1108 }
1109
1110 FastMathFlags FMF = cast<FPMathOperator>(II).getFastMathFlags();
1111 if (!FMF.allowContract())
1112 break;
1113 auto *SrcCI = dyn_cast<IntrinsicInst>(Src);
1114 if (!SrcCI)
1115 break;
1116
1117 auto IID = SrcCI->getIntrinsicID();
1118 // llvm.amdgcn.rcp(llvm.amdgcn.sqrt(x)) -> llvm.amdgcn.rsq(x) if contractable
1119 //
1120 // llvm.amdgcn.rcp(llvm.sqrt(x)) -> llvm.amdgcn.rsq(x) if contractable and
1121 // relaxed.
1122 if (IID == Intrinsic::amdgcn_sqrt || IID == Intrinsic::sqrt) {
1123 const FPMathOperator *SqrtOp = cast<FPMathOperator>(SrcCI);
1124 FastMathFlags InnerFMF = SqrtOp->getFastMathFlags();
1125 if (!InnerFMF.allowContract() || !SrcCI->hasOneUse())
1126 break;
1127
1128 if (IID == Intrinsic::sqrt && !canContractSqrtToRsq(SqrtOp))
1129 break;
1130
1132 SrcCI->getModule(), Intrinsic::amdgcn_rsq, {SrcCI->getType()});
1133
1134 InnerFMF |= FMF;
1135 II.setFastMathFlags(InnerFMF);
1136
1137 II.setCalledFunction(NewDecl);
1138 return IC.replaceOperand(II, 0, SrcCI->getArgOperand(0));
1139 }
1140
1141 break;
1142 }
1143 case Intrinsic::amdgcn_sqrt:
1144 case Intrinsic::amdgcn_rsq:
1145 case Intrinsic::amdgcn_tanh: {
1146 Value *Src = II.getArgOperand(0);
1147 if (isa<PoisonValue>(Src))
1148 return IC.replaceInstUsesWith(II, Src);
1149
1150 // TODO: Move to ConstantFolding/InstSimplify?
1151 if (isa<UndefValue>(Src)) {
1152 Type *Ty = II.getType();
1153 auto *QNaN = ConstantFP::get(Ty, APFloat::getQNaN(Ty->getFltSemantics()));
1154 return IC.replaceInstUsesWith(II, QNaN);
1155 }
1156
1157 // f16 amdgcn.sqrt is identical to regular sqrt.
1158 if (IID == Intrinsic::amdgcn_sqrt && Src->getType()->isHalfTy()) {
1160 II.getModule(), Intrinsic::sqrt, {II.getType()});
1161 II.setCalledFunction(NewDecl);
1162 return &II;
1163 }
1164
1165 break;
1166 }
1167 case Intrinsic::amdgcn_log:
1168 case Intrinsic::amdgcn_exp2: {
1169 const bool IsLog = IID == Intrinsic::amdgcn_log;
1170 const bool IsExp = IID == Intrinsic::amdgcn_exp2;
1171 Value *Src = II.getArgOperand(0);
1172 Type *Ty = II.getType();
1173
1174 if (isa<PoisonValue>(Src))
1175 return IC.replaceInstUsesWith(II, Src);
1176
1177 if (IC.getSimplifyQuery().isUndefValue(Src))
1179
1180 if (ConstantFP *C = dyn_cast<ConstantFP>(Src)) {
1181 if (C->isInfinity()) {
1182 // exp2(+inf) -> +inf
1183 // log2(+inf) -> +inf
1184 if (!C->isNegative())
1185 return IC.replaceInstUsesWith(II, C);
1186
1187 // exp2(-inf) -> 0
1188 if (IsExp && C->isNegative())
1190 }
1191
1192 if (II.isStrictFP())
1193 break;
1194
1195 if (C->isNaN()) {
1196 Constant *Quieted = ConstantFP::get(Ty, C->getValue().makeQuiet());
1197 return IC.replaceInstUsesWith(II, Quieted);
1198 }
1199
1200 // f32 instruction doesn't handle denormals, f16 does.
1201 if (C->isZero() || (C->getValue().isDenormal() && Ty->isFloatTy())) {
1202 Constant *FoldedValue = IsLog ? ConstantFP::getInfinity(Ty, true)
1203 : ConstantFP::get(Ty, 1.0);
1204 return IC.replaceInstUsesWith(II, FoldedValue);
1205 }
1206
1207 if (IsLog && C->isNegative())
1209
1210 // TODO: Full constant folding matching hardware behavior.
1211 }
1212
1213 break;
1214 }
1215 case Intrinsic::amdgcn_frexp_mant:
1216 case Intrinsic::amdgcn_frexp_exp: {
1217 Value *Src = II.getArgOperand(0);
1218 if (const ConstantFP *C = dyn_cast<ConstantFP>(Src)) {
1219 int Exp;
1220 APFloat Significand =
1221 frexp(C->getValueAPF(), Exp, APFloat::rmNearestTiesToEven);
1222
1223 if (IID == Intrinsic::amdgcn_frexp_mant) {
1224 return IC.replaceInstUsesWith(
1225 II, ConstantFP::get(II.getContext(), Significand));
1226 }
1227
1228 // Match instruction special case behavior.
1229 if (Exp == APFloat::IEK_NaN || Exp == APFloat::IEK_Inf)
1230 Exp = 0;
1231
1232 return IC.replaceInstUsesWith(II,
1233 ConstantInt::getSigned(II.getType(), Exp));
1234 }
1235
1236 if (isa<PoisonValue>(Src))
1237 return IC.replaceInstUsesWith(II, PoisonValue::get(II.getType()));
1238
1239 if (isa<UndefValue>(Src)) {
1240 return IC.replaceInstUsesWith(II, UndefValue::get(II.getType()));
1241 }
1242
1243 break;
1244 }
1245 case Intrinsic::amdgcn_class: {
1246 Value *Src0 = II.getArgOperand(0);
1247 Value *Src1 = II.getArgOperand(1);
1248 const ConstantInt *CMask = dyn_cast<ConstantInt>(Src1);
1249 if (CMask) {
1250 II.setCalledOperand(Intrinsic::getOrInsertDeclaration(
1251 II.getModule(), Intrinsic::is_fpclass, Src0->getType()));
1252
1253 // Clamp any excess bits, as they're illegal for the generic intrinsic.
1254 II.setArgOperand(1, ConstantInt::get(Src1->getType(),
1255 CMask->getZExtValue() & fcAllFlags));
1256 return &II;
1257 }
1258
1259 // Propagate poison.
1260 if (isa<PoisonValue>(Src0) || isa<PoisonValue>(Src1))
1261 return IC.replaceInstUsesWith(II, PoisonValue::get(II.getType()));
1262
1263 // llvm.amdgcn.class(_, undef) -> false
1264 if (IC.getSimplifyQuery().isUndefValue(Src1))
1265 return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), false));
1266
1267 // llvm.amdgcn.class(undef, mask) -> mask != 0
1268 if (IC.getSimplifyQuery().isUndefValue(Src0)) {
1269 Value *CmpMask = IC.Builder.CreateICmpNE(
1270 Src1, ConstantInt::getNullValue(Src1->getType()));
1271 return IC.replaceInstUsesWith(II, CmpMask);
1272 }
1273 break;
1274 }
1275 case Intrinsic::amdgcn_cvt_pkrtz: {
1276 auto foldFPTruncToF16RTZ = [](Value *Arg) -> Value * {
1277 Type *HalfTy = Type::getHalfTy(Arg->getContext());
1278
1279 if (isa<PoisonValue>(Arg))
1280 return PoisonValue::get(HalfTy);
1281 if (isa<UndefValue>(Arg))
1282 return UndefValue::get(HalfTy);
1283
1284 ConstantFP *CFP = nullptr;
1285 if (match(Arg, m_ConstantFP(CFP))) {
1286 bool LosesInfo;
1287 APFloat Val(CFP->getValueAPF());
1289 return ConstantFP::get(HalfTy, Val);
1290 }
1291
1292 Value *Src = nullptr;
1293 if (match(Arg, m_FPExt(m_Value(Src)))) {
1294 if (Src->getType()->isHalfTy())
1295 return Src;
1296 }
1297
1298 return nullptr;
1299 };
1300
1301 if (Value *Src0 = foldFPTruncToF16RTZ(II.getArgOperand(0))) {
1302 if (Value *Src1 = foldFPTruncToF16RTZ(II.getArgOperand(1))) {
1303 Value *V = PoisonValue::get(II.getType());
1304 V = IC.Builder.CreateInsertElement(V, Src0, (uint64_t)0);
1305 V = IC.Builder.CreateInsertElement(V, Src1, (uint64_t)1);
1306 return IC.replaceInstUsesWith(II, V);
1307 }
1308 }
1309
1310 break;
1311 }
1312 case Intrinsic::amdgcn_cvt_pknorm_i16:
1313 case Intrinsic::amdgcn_cvt_pknorm_u16:
1314 case Intrinsic::amdgcn_cvt_pk_i16:
1315 case Intrinsic::amdgcn_cvt_pk_u16: {
1316 Value *Src0 = II.getArgOperand(0);
1317 Value *Src1 = II.getArgOperand(1);
1318
1319 // TODO: Replace call with scalar operation if only one element is poison.
1320 if (isa<PoisonValue>(Src0) && isa<PoisonValue>(Src1))
1321 return IC.replaceInstUsesWith(II, PoisonValue::get(II.getType()));
1322
1323 if (isa<UndefValue>(Src0) && isa<UndefValue>(Src1)) {
1324 return IC.replaceInstUsesWith(II, UndefValue::get(II.getType()));
1325 }
1326
1327 break;
1328 }
1329 case Intrinsic::amdgcn_cvt_off_f32_i4: {
1330 Value* Arg = II.getArgOperand(0);
1331 Type *Ty = II.getType();
1332
1333 if (isa<PoisonValue>(Arg))
1334 return IC.replaceInstUsesWith(II, PoisonValue::get(Ty));
1335
1336 if(IC.getSimplifyQuery().isUndefValue(Arg))
1338
1339 ConstantInt *CArg = dyn_cast<ConstantInt>(II.getArgOperand(0));
1340 if (!CArg)
1341 break;
1342
1343 // Tabulated 0.0625 * (sext (CArg & 0xf)).
1344 constexpr size_t ResValsSize = 16;
1345 static constexpr float ResVals[ResValsSize] = {
1346 0.0, 0.0625, 0.125, 0.1875, 0.25, 0.3125, 0.375, 0.4375,
1347 -0.5, -0.4375, -0.375, -0.3125, -0.25, -0.1875, -0.125, -0.0625};
1348 Constant *Res =
1349 ConstantFP::get(Ty, ResVals[CArg->getZExtValue() & (ResValsSize - 1)]);
1350 return IC.replaceInstUsesWith(II, Res);
1351 }
1352 case Intrinsic::amdgcn_ubfe:
1353 case Intrinsic::amdgcn_sbfe: {
1354 // Decompose simple cases into standard shifts.
1355 Value *Src = II.getArgOperand(0);
1356 if (isa<UndefValue>(Src)) {
1357 return IC.replaceInstUsesWith(II, Src);
1358 }
1359
1360 unsigned Width;
1361 Type *Ty = II.getType();
1362 unsigned IntSize = Ty->getIntegerBitWidth();
1363
1364 ConstantInt *CWidth = dyn_cast<ConstantInt>(II.getArgOperand(2));
1365 if (CWidth) {
1366 Width = CWidth->getZExtValue();
1367 if ((Width & (IntSize - 1)) == 0) {
1369 }
1370
1371 // Hardware ignores high bits, so remove those.
1372 if (Width >= IntSize) {
1373 return IC.replaceOperand(
1374 II, 2, ConstantInt::get(CWidth->getType(), Width & (IntSize - 1)));
1375 }
1376 }
1377
1378 unsigned Offset;
1379 ConstantInt *COffset = dyn_cast<ConstantInt>(II.getArgOperand(1));
1380 if (COffset) {
1381 Offset = COffset->getZExtValue();
1382 if (Offset >= IntSize) {
1383 return IC.replaceOperand(
1384 II, 1,
1385 ConstantInt::get(COffset->getType(), Offset & (IntSize - 1)));
1386 }
1387 }
1388
1389 bool Signed = IID == Intrinsic::amdgcn_sbfe;
1390
1391 if (!CWidth || !COffset)
1392 break;
1393
1394 // The case of Width == 0 is handled above, which makes this transformation
1395 // safe. If Width == 0, then the ashr and lshr instructions become poison
1396 // value since the shift amount would be equal to the bit size.
1397 assert(Width != 0);
1398
1399 // TODO: This allows folding to undef when the hardware has specific
1400 // behavior?
1401 if (Offset + Width < IntSize) {
1402 Value *Shl = IC.Builder.CreateShl(Src, IntSize - Offset - Width);
1403 Value *RightShift = Signed ? IC.Builder.CreateAShr(Shl, IntSize - Width)
1404 : IC.Builder.CreateLShr(Shl, IntSize - Width);
1405 RightShift->takeName(&II);
1406 return IC.replaceInstUsesWith(II, RightShift);
1407 }
1408
1409 Value *RightShift = Signed ? IC.Builder.CreateAShr(Src, Offset)
1410 : IC.Builder.CreateLShr(Src, Offset);
1411
1412 RightShift->takeName(&II);
1413 return IC.replaceInstUsesWith(II, RightShift);
1414 }
1415 case Intrinsic::amdgcn_exp:
1416 case Intrinsic::amdgcn_exp_row:
1417 case Intrinsic::amdgcn_exp_compr: {
1418 ConstantInt *En = cast<ConstantInt>(II.getArgOperand(1));
1419 unsigned EnBits = En->getZExtValue();
1420 if (EnBits == 0xf)
1421 break; // All inputs enabled.
1422
1423 bool IsCompr = IID == Intrinsic::amdgcn_exp_compr;
1424 bool Changed = false;
1425 for (int I = 0; I < (IsCompr ? 2 : 4); ++I) {
1426 if ((!IsCompr && (EnBits & (1 << I)) == 0) ||
1427 (IsCompr && ((EnBits & (0x3 << (2 * I))) == 0))) {
1428 Value *Src = II.getArgOperand(I + 2);
1429 if (!isa<PoisonValue>(Src)) {
1430 IC.replaceOperand(II, I + 2, PoisonValue::get(Src->getType()));
1431 Changed = true;
1432 }
1433 }
1434 }
1435
1436 if (Changed) {
1437 return &II;
1438 }
1439
1440 break;
1441 }
1442 case Intrinsic::amdgcn_fmed3: {
1443 Value *Src0 = II.getArgOperand(0);
1444 Value *Src1 = II.getArgOperand(1);
1445 Value *Src2 = II.getArgOperand(2);
1446
1447 for (Value *Src : {Src0, Src1, Src2}) {
1448 if (isa<PoisonValue>(Src))
1449 return IC.replaceInstUsesWith(II, Src);
1450 }
1451
1452 if (II.isStrictFP())
1453 break;
1454
1455 // med3 with a nan input acts like
1456 // v_min_f32(v_min_f32(s0, s1), s2)
1457 //
1458 // Signalingness is ignored with ieee=0, so we fold to
1459 // minimumnum/maximumnum. With ieee=1, the v_min_f32 acts like llvm.minnum
1460 // with signaling nan handling. With ieee=0, like llvm.minimumnum except a
1461 // returned signaling nan will not be quieted.
1462
1463 // ieee=1
1464 // s0 snan: s2
1465 // s1 snan: s2
1466 // s2 snan: qnan
1467
1468 // s0 qnan: min(s1, s2)
1469 // s1 qnan: min(s0, s2)
1470 // s2 qnan: min(s0, s1)
1471
1472 // ieee=0
1473 // s0 _nan: min(s1, s2)
1474 // s1 _nan: min(s0, s2)
1475 // s2 _nan: min(s0, s1)
1476
1477 // med3 behavior with infinity
1478 // s0 +inf: max(s1, s2)
1479 // s1 +inf: max(s0, s2)
1480 // s2 +inf: max(s0, s1)
1481 // s0 -inf: min(s1, s2)
1482 // s1 -inf: min(s0, s2)
1483 // s2 -inf: min(s0, s1)
1484
1485 // Checking for NaN before canonicalization provides better fidelity when
1486 // mapping other operations onto fmed3 since the order of operands is
1487 // unchanged.
1488 Value *V = nullptr;
1489 const APFloat *ConstSrc0 = nullptr;
1490 const APFloat *ConstSrc1 = nullptr;
1491 const APFloat *ConstSrc2 = nullptr;
1492
1493 if ((match(Src0, m_APFloat(ConstSrc0)) &&
1494 (ConstSrc0->isNaN() || ConstSrc0->isInfinity())) ||
1495 isa<UndefValue>(Src0)) {
1496 const bool IsPosInfinity = ConstSrc0 && ConstSrc0->isPosInfinity();
1497 switch (fpenvIEEEMode(II)) {
1498 case KnownIEEEMode::On:
1499 // TODO: If Src2 is snan, does it need quieting?
1500 if (ConstSrc0 && ConstSrc0->isNaN() && ConstSrc0->isSignaling())
1501 return IC.replaceInstUsesWith(II, Src2);
1502
1503 V = IsPosInfinity ? IC.Builder.CreateMaxNum(Src1, Src2)
1504 : IC.Builder.CreateMinNum(Src1, Src2);
1505 break;
1506 case KnownIEEEMode::Off:
1507 V = IsPosInfinity ? IC.Builder.CreateMaximumNum(Src1, Src2)
1508 : IC.Builder.CreateMinimumNum(Src1, Src2);
1509 break;
1511 break;
1512 }
1513 } else if ((match(Src1, m_APFloat(ConstSrc1)) &&
1514 (ConstSrc1->isNaN() || ConstSrc1->isInfinity())) ||
1515 isa<UndefValue>(Src1)) {
1516 const bool IsPosInfinity = ConstSrc1 && ConstSrc1->isPosInfinity();
1517 switch (fpenvIEEEMode(II)) {
1518 case KnownIEEEMode::On:
1519 // TODO: If Src2 is snan, does it need quieting?
1520 if (ConstSrc1 && ConstSrc1->isNaN() && ConstSrc1->isSignaling())
1521 return IC.replaceInstUsesWith(II, Src2);
1522
1523 V = IsPosInfinity ? IC.Builder.CreateMaxNum(Src0, Src2)
1524 : IC.Builder.CreateMinNum(Src0, Src2);
1525 break;
1526 case KnownIEEEMode::Off:
1527 V = IsPosInfinity ? IC.Builder.CreateMaximumNum(Src0, Src2)
1528 : IC.Builder.CreateMinimumNum(Src0, Src2);
1529 break;
1531 break;
1532 }
1533 } else if ((match(Src2, m_APFloat(ConstSrc2)) &&
1534 (ConstSrc2->isNaN() || ConstSrc2->isInfinity())) ||
1535 isa<UndefValue>(Src2)) {
1536 switch (fpenvIEEEMode(II)) {
1537 case KnownIEEEMode::On:
1538 if (ConstSrc2 && ConstSrc2->isNaN() && ConstSrc2->isSignaling()) {
1539 auto *Quieted = ConstantFP::get(II.getType(), ConstSrc2->makeQuiet());
1540 return IC.replaceInstUsesWith(II, Quieted);
1541 }
1542
1543 V = (ConstSrc2 && ConstSrc2->isPosInfinity())
1544 ? IC.Builder.CreateMaxNum(Src0, Src1)
1545 : IC.Builder.CreateMinNum(Src0, Src1);
1546 break;
1547 case KnownIEEEMode::Off:
1548 V = (ConstSrc2 && ConstSrc2->isNegInfinity())
1549 ? IC.Builder.CreateMinimumNum(Src0, Src1)
1550 : IC.Builder.CreateMaximumNum(Src0, Src1);
1551 break;
1553 break;
1554 }
1555 }
1556
1557 if (V) {
1558 if (auto *CI = dyn_cast<CallInst>(V)) {
1559 CI->copyFastMathFlags(&II);
1560 CI->takeName(&II);
1561 }
1562 return IC.replaceInstUsesWith(II, V);
1563 }
1564
1565 bool Swap = false;
1566 // Canonicalize constants to RHS operands.
1567 //
1568 // fmed3(c0, x, c1) -> fmed3(x, c0, c1)
1569 if (isa<Constant>(Src0) && !isa<Constant>(Src1)) {
1570 std::swap(Src0, Src1);
1571 Swap = true;
1572 }
1573
1574 if (isa<Constant>(Src1) && !isa<Constant>(Src2)) {
1575 std::swap(Src1, Src2);
1576 Swap = true;
1577 }
1578
1579 if (isa<Constant>(Src0) && !isa<Constant>(Src1)) {
1580 std::swap(Src0, Src1);
1581 Swap = true;
1582 }
1583
1584 if (Swap) {
1585 II.setArgOperand(0, Src0);
1586 II.setArgOperand(1, Src1);
1587 II.setArgOperand(2, Src2);
1588 return &II;
1589 }
1590
1591 if (const ConstantFP *C0 = dyn_cast<ConstantFP>(Src0)) {
1592 if (const ConstantFP *C1 = dyn_cast<ConstantFP>(Src1)) {
1593 if (const ConstantFP *C2 = dyn_cast<ConstantFP>(Src2)) {
1594 APFloat Result = fmed3AMDGCN(C0->getValueAPF(), C1->getValueAPF(),
1595 C2->getValueAPF());
1596 return IC.replaceInstUsesWith(II,
1597 ConstantFP::get(II.getType(), Result));
1598 }
1599 }
1600 }
1601
1602 if (!ST->hasMed3_16())
1603 break;
1604
1605 // Repeat floating-point width reduction done for minnum/maxnum.
1606 // fmed3((fpext X), (fpext Y), (fpext Z)) -> fpext (fmed3(X, Y, Z))
1607 if (Value *X = matchFPExtFromF16(Src0)) {
1608 if (Value *Y = matchFPExtFromF16(Src1)) {
1609 if (Value *Z = matchFPExtFromF16(Src2)) {
1610 Value *NewCall = IC.Builder.CreateIntrinsic(
1611 IID, {X->getType()}, {X, Y, Z}, &II, II.getName());
1612 return new FPExtInst(NewCall, II.getType());
1613 }
1614 }
1615 }
1616
1617 break;
1618 }
1619 case Intrinsic::amdgcn_icmp:
1620 case Intrinsic::amdgcn_fcmp: {
1621 const ConstantInt *CC = cast<ConstantInt>(II.getArgOperand(2));
1622 // Guard against invalid arguments.
1623 int64_t CCVal = CC->getZExtValue();
1624 bool IsInteger = IID == Intrinsic::amdgcn_icmp;
1625 if ((IsInteger && (CCVal < CmpInst::FIRST_ICMP_PREDICATE ||
1626 CCVal > CmpInst::LAST_ICMP_PREDICATE)) ||
1627 (!IsInteger && (CCVal < CmpInst::FIRST_FCMP_PREDICATE ||
1629 break;
1630
1631 Value *Src0 = II.getArgOperand(0);
1632 Value *Src1 = II.getArgOperand(1);
1633
1634 if (auto *CSrc0 = dyn_cast<Constant>(Src0)) {
1635 if (auto *CSrc1 = dyn_cast<Constant>(Src1)) {
1637 (ICmpInst::Predicate)CCVal, CSrc0, CSrc1, DL);
1638 if (CCmp && CCmp->isNullValue()) {
1639 return IC.replaceInstUsesWith(
1640 II, IC.Builder.CreateSExt(CCmp, II.getType()));
1641 }
1642
1643 // The result of V_ICMP/V_FCMP assembly instructions (which this
1644 // intrinsic exposes) is one bit per thread, masked with the EXEC
1645 // register (which contains the bitmask of live threads). So a
1646 // comparison that always returns true is the same as a read of the
1647 // EXEC register.
1648 Metadata *MDArgs[] = {MDString::get(II.getContext(), "exec")};
1649 MDNode *MD = MDNode::get(II.getContext(), MDArgs);
1650 Value *Args[] = {MetadataAsValue::get(II.getContext(), MD)};
1651 CallInst *NewCall = IC.Builder.CreateIntrinsic(Intrinsic::read_register,
1652 II.getType(), Args);
1653 NewCall->addFnAttr(Attribute::Convergent);
1654 NewCall->takeName(&II);
1655 return IC.replaceInstUsesWith(II, NewCall);
1656 }
1657
1658 // Canonicalize constants to RHS.
1659 CmpInst::Predicate SwapPred =
1661 II.setArgOperand(0, Src1);
1662 II.setArgOperand(1, Src0);
1663 II.setArgOperand(
1664 2, ConstantInt::get(CC->getType(), static_cast<int>(SwapPred)));
1665 return &II;
1666 }
1667
1668 if (CCVal != CmpInst::ICMP_EQ && CCVal != CmpInst::ICMP_NE)
1669 break;
1670
1671 // Canonicalize compare eq with true value to compare != 0
1672 // llvm.amdgcn.icmp(zext (i1 x), 1, eq)
1673 // -> llvm.amdgcn.icmp(zext (i1 x), 0, ne)
1674 // llvm.amdgcn.icmp(sext (i1 x), -1, eq)
1675 // -> llvm.amdgcn.icmp(sext (i1 x), 0, ne)
1676 Value *ExtSrc;
1677 if (CCVal == CmpInst::ICMP_EQ &&
1678 ((match(Src1, PatternMatch::m_One()) &&
1679 match(Src0, m_ZExt(PatternMatch::m_Value(ExtSrc)))) ||
1680 (match(Src1, PatternMatch::m_AllOnes()) &&
1681 match(Src0, m_SExt(PatternMatch::m_Value(ExtSrc))))) &&
1682 ExtSrc->getType()->isIntegerTy(1)) {
1684 IC.replaceOperand(II, 2,
1685 ConstantInt::get(CC->getType(), CmpInst::ICMP_NE));
1686 return &II;
1687 }
1688
1689 CmpPredicate SrcPred;
1690 Value *SrcLHS;
1691 Value *SrcRHS;
1692
1693 // Fold compare eq/ne with 0 from a compare result as the predicate to the
1694 // intrinsic. The typical use is a wave vote function in the library, which
1695 // will be fed from a user code condition compared with 0. Fold in the
1696 // redundant compare.
1697
1698 // llvm.amdgcn.icmp([sz]ext ([if]cmp pred a, b), 0, ne)
1699 // -> llvm.amdgcn.[if]cmp(a, b, pred)
1700 //
1701 // llvm.amdgcn.icmp([sz]ext ([if]cmp pred a, b), 0, eq)
1702 // -> llvm.amdgcn.[if]cmp(a, b, inv pred)
1703 if (match(Src1, PatternMatch::m_Zero()) &&
1705 m_Cmp(SrcPred, PatternMatch::m_Value(SrcLHS),
1706 PatternMatch::m_Value(SrcRHS))))) {
1707 if (CCVal == CmpInst::ICMP_EQ)
1708 SrcPred = CmpInst::getInversePredicate(SrcPred);
1709
1710 Intrinsic::ID NewIID = CmpInst::isFPPredicate(SrcPred)
1711 ? Intrinsic::amdgcn_fcmp
1712 : Intrinsic::amdgcn_icmp;
1713
1714 Type *Ty = SrcLHS->getType();
1715 if (auto *CmpType = dyn_cast<IntegerType>(Ty)) {
1716 // Promote to next legal integer type.
1717 unsigned Width = CmpType->getBitWidth();
1718 unsigned NewWidth = Width;
1719
1720 // Don't do anything for i1 comparisons.
1721 if (Width == 1)
1722 break;
1723
1724 if (Width <= 16)
1725 NewWidth = 16;
1726 else if (Width <= 32)
1727 NewWidth = 32;
1728 else if (Width <= 64)
1729 NewWidth = 64;
1730 else
1731 break; // Can't handle this.
1732
1733 if (Width != NewWidth) {
1734 IntegerType *CmpTy = IC.Builder.getIntNTy(NewWidth);
1735 if (CmpInst::isSigned(SrcPred)) {
1736 SrcLHS = IC.Builder.CreateSExt(SrcLHS, CmpTy);
1737 SrcRHS = IC.Builder.CreateSExt(SrcRHS, CmpTy);
1738 } else {
1739 SrcLHS = IC.Builder.CreateZExt(SrcLHS, CmpTy);
1740 SrcRHS = IC.Builder.CreateZExt(SrcRHS, CmpTy);
1741 }
1742 }
1743 } else if (!Ty->isFloatTy() && !Ty->isDoubleTy() && !Ty->isHalfTy())
1744 break;
1745
1746 Value *Args[] = {SrcLHS, SrcRHS,
1747 ConstantInt::get(CC->getType(), SrcPred)};
1748 CallInst *NewCall = IC.Builder.CreateIntrinsic(
1749 NewIID, {II.getType(), SrcLHS->getType()}, Args);
1750 NewCall->takeName(&II);
1751 return IC.replaceInstUsesWith(II, NewCall);
1752 }
1753
1754 break;
1755 }
1756 case Intrinsic::amdgcn_mbcnt_hi:
1757 // exec_hi is all 0, so this is just a copy.
1758 if (ST->isWave32())
1759 return IC.replaceInstUsesWith(II, II.getArgOperand(1));
1760 [[fallthrough]];
1761 case Intrinsic::amdgcn_mbcnt_lo: {
1762 ConstantRange AccRange =
1763 computeConstantRange(II.getArgOperand(1),
1764 /*ForSigned=*/false, IC.getSimplifyQuery());
1765 if (AccRange.isFullSet())
1766 return nullptr;
1767
1768 // TODO: Can raise lower bound by inspecting first argument.
1769 ConstantRange MbcntRange(APInt(32, 0), APInt(32, 32 + 1));
1770 ConstantRange ComputedRange = AccRange.add(MbcntRange);
1771 if (ComputedRange.isFullSet())
1772 return nullptr;
1773
1774 if (std::optional<ConstantRange> ExistingRange = II.getRange()) {
1775 ComputedRange = ComputedRange.intersectWith(*ExistingRange);
1776 if (ComputedRange == *ExistingRange)
1777 return nullptr;
1778 }
1779
1780 II.addRangeRetAttr(ComputedRange);
1781 return nullptr;
1782 }
1783 case Intrinsic::amdgcn_ballot: {
1784 Value *Arg = II.getArgOperand(0);
1785 if (isa<PoisonValue>(Arg))
1786 return IC.replaceInstUsesWith(II, PoisonValue::get(II.getType()));
1787
1788 if (auto *Src = dyn_cast<ConstantInt>(Arg)) {
1789 if (Src->isZero()) {
1790 // amdgcn.ballot(i1 0) is zero.
1791 return IC.replaceInstUsesWith(II, Constant::getNullValue(II.getType()));
1792 }
1793 }
1794 if (ST->isWave32() && II.getType()->getIntegerBitWidth() == 64) {
1795 // %b64 = call i64 ballot.i64(...)
1796 // =>
1797 // %b32 = call i32 ballot.i32(...)
1798 // %b64 = zext i32 %b32 to i64
1800 IC.Builder.CreateIntrinsic(Intrinsic::amdgcn_ballot,
1801 {IC.Builder.getInt32Ty()},
1802 {II.getArgOperand(0)}),
1803 II.getType());
1804 Call->takeName(&II);
1805 return IC.replaceInstUsesWith(II, Call);
1806 }
1807 break;
1808 }
1809 case Intrinsic::amdgcn_wavefrontsize: {
1810 if (ST->isWaveSizeKnown())
1811 return IC.replaceInstUsesWith(
1812 II, ConstantInt::get(II.getType(), ST->getWavefrontSize()));
1813 break;
1814 }
1815 case Intrinsic::amdgcn_wqm_vote: {
1816 // wqm_vote is identity when the argument is constant.
1817 if (!isa<Constant>(II.getArgOperand(0)))
1818 break;
1819
1820 return IC.replaceInstUsesWith(II, II.getArgOperand(0));
1821 }
1822 case Intrinsic::amdgcn_kill: {
1823 const ConstantInt *C = dyn_cast<ConstantInt>(II.getArgOperand(0));
1824 if (!C || !C->getZExtValue())
1825 break;
1826
1827 // amdgcn.kill(i1 1) is a no-op
1828 return IC.eraseInstFromFunction(II);
1829 }
1830 case Intrinsic::amdgcn_s_sendmsg:
1831 case Intrinsic::amdgcn_s_sendmsghalt: {
1832 // The second operand is copied to m0, but is only actually used for
1833 // certain message types. For message types that are known to not use m0,
1834 // fold it to poison.
1835 using namespace AMDGPU::SendMsg;
1836
1837 Value *M0Val = II.getArgOperand(1);
1838 if (isa<PoisonValue>(M0Val))
1839 break;
1840
1841 auto *MsgImm = cast<ConstantInt>(II.getArgOperand(0));
1842 uint16_t MsgId, OpId, StreamId;
1843 decodeMsg(MsgImm->getZExtValue(), MsgId, OpId, StreamId, *ST);
1844
1845 if (!msgDoesNotUseM0(MsgId, *ST))
1846 break;
1847
1848 // Drop UB-implying attributes since we're replacing with poison.
1849 II.dropUBImplyingAttrsAndMetadata();
1850 IC.replaceOperand(II, 1, PoisonValue::get(M0Val->getType()));
1851 return nullptr;
1852 }
1853 case Intrinsic::amdgcn_update_dpp: {
1854 Value *Old = II.getArgOperand(0);
1855
1856 auto *BC = cast<ConstantInt>(II.getArgOperand(5));
1857 auto *RM = cast<ConstantInt>(II.getArgOperand(3));
1858 auto *BM = cast<ConstantInt>(II.getArgOperand(4));
1859 if (BC->isNullValue() || RM->getZExtValue() != 0xF ||
1860 BM->getZExtValue() != 0xF || isa<PoisonValue>(Old))
1861 break;
1862
1863 // If bound_ctrl = 1, row mask = bank mask = 0xf we can omit old value.
1864 return IC.replaceOperand(II, 0, PoisonValue::get(Old->getType()));
1865 }
1866 case Intrinsic::amdgcn_permlane16:
1867 case Intrinsic::amdgcn_permlane16_var:
1868 case Intrinsic::amdgcn_permlanex16:
1869 case Intrinsic::amdgcn_permlanex16_var: {
1870 // Discard vdst_in if it's not going to be read.
1871 Value *VDstIn = II.getArgOperand(0);
1872 if (isa<PoisonValue>(VDstIn))
1873 break;
1874
1875 // FetchInvalid operand idx.
1876 unsigned int FiIdx = (IID == Intrinsic::amdgcn_permlane16 ||
1877 IID == Intrinsic::amdgcn_permlanex16)
1878 ? 4 /* for permlane16 and permlanex16 */
1879 : 3; /* for permlane16_var and permlanex16_var */
1880
1881 // BoundCtrl operand idx.
1882 // For permlane16 and permlanex16 it should be 5
1883 // For Permlane16_var and permlanex16_var it should be 4
1884 unsigned int BcIdx = FiIdx + 1;
1885
1886 ConstantInt *FetchInvalid = cast<ConstantInt>(II.getArgOperand(FiIdx));
1887 ConstantInt *BoundCtrl = cast<ConstantInt>(II.getArgOperand(BcIdx));
1888 if (!FetchInvalid->getZExtValue() && !BoundCtrl->getZExtValue())
1889 break;
1890
1891 return IC.replaceOperand(II, 0, PoisonValue::get(VDstIn->getType()));
1892 }
1893 case Intrinsic::amdgcn_wave_shuffle:
1894 return tryOptimizeShufflePattern(IC, II, *ST);
1895 case Intrinsic::amdgcn_permlane64:
1896 case Intrinsic::amdgcn_readfirstlane:
1897 case Intrinsic::amdgcn_readlane:
1898 case Intrinsic::amdgcn_ds_bpermute: {
1899 // If the data argument is uniform these intrinsics return it unchanged.
1900 unsigned SrcIdx = IID == Intrinsic::amdgcn_ds_bpermute ? 1 : 0;
1901 const Use &Src = II.getArgOperandUse(SrcIdx);
1902 if (isTriviallyUniform(Src))
1903 return IC.replaceInstUsesWith(II, Src.get());
1904
1905 if (IID == Intrinsic::amdgcn_readlane &&
1907 return &II;
1908
1909 // If the lane argument of bpermute is uniform, change it to readlane. This
1910 // generates better code and can enable further optimizations because
1911 // readlane is AlwaysUniform.
1912 if (IID == Intrinsic::amdgcn_ds_bpermute) {
1913 const Use &Lane = II.getArgOperandUse(0);
1914 if (isTriviallyUniform(Lane)) {
1915 Value *NewLane = IC.Builder.CreateLShr(Lane, 2);
1917 II.getModule(), Intrinsic::amdgcn_readlane, II.getType());
1918 II.setCalledFunction(NewDecl);
1919 II.setOperand(0, Src);
1920 II.setOperand(1, NewLane);
1921 return &II;
1922 }
1923 }
1924
1925 if (IID == Intrinsic::amdgcn_ds_bpermute)
1926 return tryOptimizeShufflePattern(IC, II, *ST);
1927
1929 return Res;
1930
1931 return std::nullopt;
1932 }
1933 case Intrinsic::amdgcn_writelane: {
1934 // TODO: Fold bitcast like readlane.
1935 if (simplifyDemandedLaneMaskArg(IC, II, 1))
1936 return &II;
1937 return std::nullopt;
1938 }
1939 case Intrinsic::amdgcn_trig_preop: {
1940 // The intrinsic is declared with name mangling, but currently the
1941 // instruction only exists for f64
1942 if (!II.getType()->isDoubleTy())
1943 break;
1944
1945 Value *Src = II.getArgOperand(0);
1946 Value *Segment = II.getArgOperand(1);
1947 if (isa<PoisonValue>(Src) || isa<PoisonValue>(Segment))
1948 return IC.replaceInstUsesWith(II, PoisonValue::get(II.getType()));
1949
1950 if (isa<UndefValue>(Segment))
1951 return IC.replaceInstUsesWith(II, ConstantFP::getZero(II.getType()));
1952
1953 // Sign bit is not used.
1954 Value *StrippedSign = InstCombiner::stripSignOnlyFPOps(Src);
1955 if (StrippedSign != Src)
1956 return IC.replaceOperand(II, 0, StrippedSign);
1957
1958 if (II.isStrictFP())
1959 break;
1960
1961 const ConstantFP *CSrc = dyn_cast<ConstantFP>(Src);
1962 if (!CSrc && !isa<UndefValue>(Src))
1963 break;
1964
1965 // The instruction ignores special cases, and literally just extracts the
1966 // exponents. Fold undef to nan, and index the table as normal.
1967 APInt FSrcInt = CSrc ? CSrc->getValueAPF().bitcastToAPInt()
1968 : APFloat::getQNaN(II.getType()->getFltSemantics())
1969 .bitcastToAPInt();
1970
1971 const ConstantInt *Cseg = dyn_cast<ConstantInt>(Segment);
1972 if (!Cseg) {
1973 if (isa<UndefValue>(Src))
1974 return IC.replaceInstUsesWith(II, ConstantFP::getZero(II.getType()));
1975 break;
1976 }
1977
1978 unsigned Exponent = FSrcInt.extractBitsAsZExtValue(11, 52);
1979 unsigned SegmentVal = Cseg->getValue().trunc(5).getZExtValue();
1980 unsigned Shift = SegmentVal * 53;
1981 if (Exponent > 1077)
1982 Shift += Exponent - 1077;
1983
1984 // 2.0/PI table.
1985 static const uint32_t TwoByPi[] = {
1986 0xa2f9836e, 0x4e441529, 0xfc2757d1, 0xf534ddc0, 0xdb629599, 0x3c439041,
1987 0xfe5163ab, 0xdebbc561, 0xb7246e3a, 0x424dd2e0, 0x06492eea, 0x09d1921c,
1988 0xfe1deb1c, 0xb129a73e, 0xe88235f5, 0x2ebb4484, 0xe99c7026, 0xb45f7e41,
1989 0x3991d639, 0x835339f4, 0x9c845f8b, 0xbdf9283b, 0x1ff897ff, 0xde05980f,
1990 0xef2f118b, 0x5a0a6d1f, 0x6d367ecf, 0x27cb09b7, 0x4f463f66, 0x9e5fea2d,
1991 0x7527bac7, 0xebe5f17b, 0x3d0739f7, 0x8a5292ea, 0x6bfb5fb1, 0x1f8d5d08,
1992 0x56033046};
1993
1994 // Return 0 for outbound segment (hardware behavior).
1995 unsigned Idx = Shift >> 5;
1996 if (Idx + 2 >= std::size(TwoByPi)) {
1997 APFloat Zero = APFloat::getZero(II.getType()->getFltSemantics());
1998 return IC.replaceInstUsesWith(II, ConstantFP::get(II.getType(), Zero));
1999 }
2000
2001 unsigned BShift = Shift & 0x1f;
2002 uint64_t Thi = Make_64(TwoByPi[Idx], TwoByPi[Idx + 1]);
2003 uint64_t Tlo = Make_64(TwoByPi[Idx + 2], 0);
2004 if (BShift)
2005 Thi = (Thi << BShift) | (Tlo >> (64 - BShift));
2006 Thi = Thi >> 11;
2007 APFloat Result = APFloat((double)Thi);
2008
2009 int Scale = -53 - Shift;
2010 if (Exponent >= 1968)
2011 Scale += 128;
2012
2013 Result = scalbn(Result, Scale, RoundingMode::NearestTiesToEven);
2014 return IC.replaceInstUsesWith(II, ConstantFP::get(Src->getType(), Result));
2015 }
2016 case Intrinsic::amdgcn_fmul_legacy: {
2017 Value *Op0 = II.getArgOperand(0);
2018 Value *Op1 = II.getArgOperand(1);
2019
2020 for (Value *Src : {Op0, Op1}) {
2021 if (isa<PoisonValue>(Src))
2022 return IC.replaceInstUsesWith(II, Src);
2023 }
2024
2025 // The legacy behaviour is that multiplying +/-0.0 by anything, even NaN or
2026 // infinity, gives +0.0.
2027 // TODO: Move to InstSimplify?
2028 if (match(Op0, PatternMatch::m_AnyZeroFP()) ||
2030 return IC.replaceInstUsesWith(II, ConstantFP::getZero(II.getType()));
2031
2032 // If we can prove we don't have one of the special cases then we can use a
2033 // normal fmul instruction instead.
2034 if (canSimplifyLegacyMulToMul(II, Op0, Op1, IC)) {
2035 auto *FMul = IC.Builder.CreateFMulFMF(Op0, Op1, &II);
2036 FMul->takeName(&II);
2037 return IC.replaceInstUsesWith(II, FMul);
2038 }
2039 break;
2040 }
2041 case Intrinsic::amdgcn_fma_legacy: {
2042 Value *Op0 = II.getArgOperand(0);
2043 Value *Op1 = II.getArgOperand(1);
2044 Value *Op2 = II.getArgOperand(2);
2045
2046 for (Value *Src : {Op0, Op1, Op2}) {
2047 if (isa<PoisonValue>(Src))
2048 return IC.replaceInstUsesWith(II, Src);
2049 }
2050
2051 // The legacy behaviour is that multiplying +/-0.0 by anything, even NaN or
2052 // infinity, gives +0.0.
2053 // TODO: Move to InstSimplify?
2054 if (match(Op0, PatternMatch::m_AnyZeroFP()) ||
2056 // It's tempting to just return Op2 here, but that would give the wrong
2057 // result if Op2 was -0.0.
2058 auto *Zero = ConstantFP::getZero(II.getType());
2059 auto *FAdd = IC.Builder.CreateFAddFMF(Zero, Op2, &II);
2060 FAdd->takeName(&II);
2061 return IC.replaceInstUsesWith(II, FAdd);
2062 }
2063
2064 // If we can prove we don't have one of the special cases then we can use a
2065 // normal fma instead.
2066 if (canSimplifyLegacyMulToMul(II, Op0, Op1, IC)) {
2067 II.setCalledOperand(Intrinsic::getOrInsertDeclaration(
2068 II.getModule(), Intrinsic::fma, II.getType()));
2069 return &II;
2070 }
2071 break;
2072 }
2073 case Intrinsic::amdgcn_is_shared:
2074 case Intrinsic::amdgcn_is_private: {
2075 Value *Src = II.getArgOperand(0);
2076 if (isa<PoisonValue>(Src))
2077 return IC.replaceInstUsesWith(II, PoisonValue::get(II.getType()));
2078 if (isa<UndefValue>(Src))
2079 return IC.replaceInstUsesWith(II, UndefValue::get(II.getType()));
2080
2081 if (isa<ConstantPointerNull>(II.getArgOperand(0)))
2082 return IC.replaceInstUsesWith(II, ConstantInt::getFalse(II.getType()));
2083 break;
2084 }
2085 case Intrinsic::amdgcn_make_buffer_rsrc: {
2086 Value *Src = II.getArgOperand(0);
2087 if (isa<PoisonValue>(Src))
2088 return IC.replaceInstUsesWith(II, PoisonValue::get(II.getType()));
2089 return std::nullopt;
2090 }
2091 case Intrinsic::amdgcn_raw_buffer_store_format:
2092 case Intrinsic::amdgcn_struct_buffer_store_format:
2093 case Intrinsic::amdgcn_raw_tbuffer_store:
2094 case Intrinsic::amdgcn_struct_tbuffer_store:
2095 case Intrinsic::amdgcn_image_store_1d:
2096 case Intrinsic::amdgcn_image_store_1darray:
2097 case Intrinsic::amdgcn_image_store_2d:
2098 case Intrinsic::amdgcn_image_store_2darray:
2099 case Intrinsic::amdgcn_image_store_2darraymsaa:
2100 case Intrinsic::amdgcn_image_store_2dmsaa:
2101 case Intrinsic::amdgcn_image_store_3d:
2102 case Intrinsic::amdgcn_image_store_cube:
2103 case Intrinsic::amdgcn_image_store_mip_1d:
2104 case Intrinsic::amdgcn_image_store_mip_1darray:
2105 case Intrinsic::amdgcn_image_store_mip_2d:
2106 case Intrinsic::amdgcn_image_store_mip_2darray:
2107 case Intrinsic::amdgcn_image_store_mip_3d:
2108 case Intrinsic::amdgcn_image_store_mip_cube: {
2109 if (!isa<FixedVectorType>(II.getArgOperand(0)->getType()))
2110 break;
2111
2112 APInt DemandedElts;
2113 if (ST->hasDefaultComponentBroadcast())
2114 DemandedElts = defaultComponentBroadcast(II.getArgOperand(0));
2115 else if (ST->hasDefaultComponentZero())
2116 DemandedElts = trimTrailingZerosInVector(IC, II.getArgOperand(0), &II);
2117 else
2118 break;
2119
2120 int DMaskIdx = getAMDGPUImageDMaskIntrinsic(II.getIntrinsicID()) ? 1 : -1;
2121 if (simplifyAMDGCNMemoryIntrinsicDemanded(IC, II, DemandedElts, DMaskIdx,
2122 false)) {
2123 return IC.eraseInstFromFunction(II);
2124 }
2125
2126 break;
2127 }
2128 case Intrinsic::amdgcn_prng_b32: {
2129 auto *Src = II.getArgOperand(0);
2130 if (isa<UndefValue>(Src)) {
2131 return IC.replaceInstUsesWith(II, Src);
2132 }
2133 return std::nullopt;
2134 }
2135 case Intrinsic::amdgcn_mfma_scale_f32_16x16x128_f8f6f4:
2136 case Intrinsic::amdgcn_mfma_scale_f32_32x32x64_f8f6f4: {
2137 Value *Src0 = II.getArgOperand(0);
2138 Value *Src1 = II.getArgOperand(1);
2139 uint64_t CBSZ = cast<ConstantInt>(II.getArgOperand(3))->getZExtValue();
2140 uint64_t BLGP = cast<ConstantInt>(II.getArgOperand(4))->getZExtValue();
2141 auto *Src0Ty = cast<FixedVectorType>(Src0->getType());
2142 auto *Src1Ty = cast<FixedVectorType>(Src1->getType());
2143
2144 auto getFormatNumRegs = [](unsigned FormatVal) {
2145 switch (FormatVal) {
2148 return 6u;
2150 return 4u;
2153 return 8u;
2154 default:
2155 llvm_unreachable("invalid format value");
2156 }
2157 };
2158
2159 bool MadeChange = false;
2160 unsigned Src0NumElts = getFormatNumRegs(CBSZ);
2161 unsigned Src1NumElts = getFormatNumRegs(BLGP);
2162
2163 // Depending on the used format, fewer registers are required so shrink the
2164 // vector type.
2165 if (Src0Ty->getNumElements() > Src0NumElts) {
2166 Src0 = IC.Builder.CreateExtractVector(
2167 FixedVectorType::get(Src0Ty->getElementType(), Src0NumElts), Src0,
2168 uint64_t(0));
2169 MadeChange = true;
2170 }
2171
2172 if (Src1Ty->getNumElements() > Src1NumElts) {
2173 Src1 = IC.Builder.CreateExtractVector(
2174 FixedVectorType::get(Src1Ty->getElementType(), Src1NumElts), Src1,
2175 uint64_t(0));
2176 MadeChange = true;
2177 }
2178
2179 if (!MadeChange)
2180 return std::nullopt;
2181
2182 SmallVector<Value *, 10> Args(II.args());
2183 Args[0] = Src0;
2184 Args[1] = Src1;
2185
2186 CallInst *NewII = IC.Builder.CreateIntrinsic(
2187 IID, {Src0->getType(), Src1->getType()}, Args, &II);
2188 NewII->takeName(&II);
2189 return IC.replaceInstUsesWith(II, NewII);
2190 }
2191 case Intrinsic::amdgcn_wmma_f32_16x16x128_f8f6f4:
2192 case Intrinsic::amdgcn_wmma_scale_f32_16x16x128_f8f6f4:
2193 case Intrinsic::amdgcn_wmma_scale16_f32_16x16x128_f8f6f4: {
2194 Value *Src0 = II.getArgOperand(1);
2195 Value *Src1 = II.getArgOperand(3);
2196 unsigned FmtA = cast<ConstantInt>(II.getArgOperand(0))->getZExtValue();
2197 uint64_t FmtB = cast<ConstantInt>(II.getArgOperand(2))->getZExtValue();
2198 auto *Src0Ty = cast<FixedVectorType>(Src0->getType());
2199 auto *Src1Ty = cast<FixedVectorType>(Src1->getType());
2200
2201 bool MadeChange = false;
2202 unsigned Src0NumElts = AMDGPU::wmmaScaleF8F6F4FormatToNumRegs(FmtA);
2203 unsigned Src1NumElts = AMDGPU::wmmaScaleF8F6F4FormatToNumRegs(FmtB);
2204
2205 // Depending on the used format, fewer registers are required so shrink the
2206 // vector type.
2207 if (Src0Ty->getNumElements() > Src0NumElts) {
2208 Src0 = IC.Builder.CreateExtractVector(
2209 FixedVectorType::get(Src0Ty->getElementType(), Src0NumElts), Src0,
2210 IC.Builder.getInt64(0));
2211 MadeChange = true;
2212 }
2213
2214 if (Src1Ty->getNumElements() > Src1NumElts) {
2215 Src1 = IC.Builder.CreateExtractVector(
2216 FixedVectorType::get(Src1Ty->getElementType(), Src1NumElts), Src1,
2217 IC.Builder.getInt64(0));
2218 MadeChange = true;
2219 }
2220
2221 if (!MadeChange)
2222 return std::nullopt;
2223
2224 SmallVector<Value *, 13> Args(II.args());
2225 Args[1] = Src0;
2226 Args[3] = Src1;
2227
2228 CallInst *NewII = IC.Builder.CreateIntrinsic(
2229 IID, {II.getArgOperand(5)->getType(), Src0->getType(), Src1->getType()},
2230 Args, &II);
2231 NewII->takeName(&II);
2232 return IC.replaceInstUsesWith(II, NewII);
2233 }
2234 }
2235 if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
2236 AMDGPU::getImageDimIntrinsicInfo(II.getIntrinsicID())) {
2237 return simplifyAMDGCNImageIntrinsic(ST, ImageDimIntr, II, IC);
2238 }
2239 return std::nullopt;
2240}
2241
2242/// Implement SimplifyDemandedVectorElts for amdgcn buffer and image intrinsics.
2243///
2244/// For image and buffer stores, simplification narrows the intrinsic's vector
2245/// data argument (operand 0); for image and buffer loads it narrows the result
2246/// type and rebuilds a value of the original vector type.
2247/// Note: This only supports non-TFE/LWE image intrinsic calls; TFE/LWE calls
2248/// have struct returns.
///
/// DemandedElts: bit I set means element I of the load/store vector is needed.
/// DMaskIdx: index of the constant dmask argument for image intrinsics; a
///   negative value selects the buffer path.
/// IsLoad: true to narrow the call's result type, false to narrow operand 0.
///
/// Returns nullptr when no replacement value is produced (vector of width 1,
/// dmask of 0, nothing to narrow, or an invalid intrinsic signature); on the
/// "nothing to narrow" path the dmask operand of II may still be updated in
/// place. Returns a poison vector when no element is demanded. Otherwise
/// returns the replacement: for loads an insertelement/shufflevector of the
/// original vector type, for stores the newly created call.
// NOTE(review): listing lines 2249-2250 (function name and leading parameters)
// are not present in this extract.
2251 APInt DemandedElts,
2252 int DMaskIdx, bool IsLoad) {
2253
// The vector being narrowed: the result for loads, operand 0 for stores.
2254 auto *IIVTy = cast<FixedVectorType>(IsLoad ? II.getType()
2255 : II.getOperand(0)->getType());
2256 unsigned VWidth = IIVTy->getNumElements();
2257 if (VWidth == 1)
2258 return nullptr;
2259 Type *EltTy = IIVTy->getElementType();
2260
// NOTE(review): listing lines 2261-2262 are not present in this extract.
2263
2264 // Assume the arguments are unchanged and later override them, if needed.
2265 SmallVector<Value *, 16> Args(II.args());
2266
2267 if (DMaskIdx < 0) {
2268 // Buffer case.
2269
2270 const unsigned ActiveBits = DemandedElts.getActiveBits();
2271 const unsigned UnusedComponentsAtFront = DemandedElts.countr_zero();
2272
2273 // Start assuming the prefix of elements is demanded, but possibly clear
2274 // some other bits if there are trailing zeros (unused components at front)
2275 // and update offset.
// NOTE(review): the shift is evaluated in int; assumes ActiveBits is small
// enough for that (fewer than 31 elements) - confirm.
2276 DemandedElts = (1 << ActiveBits) - 1;
2277
2278 if (UnusedComponentsAtFront > 0) {
// Sentinel meaning "do not skip leading components for this intrinsic".
2279 static const unsigned InvalidOffsetIdx = 0xf;
2280
// Pick the argument index that is adjusted as the offset for each supported
// load intrinsic.
2281 unsigned OffsetIdx;
2282 switch (II.getIntrinsicID()) {
2283 case Intrinsic::amdgcn_raw_buffer_load:
2284 case Intrinsic::amdgcn_raw_ptr_buffer_load:
2285 OffsetIdx = 1;
2286 break;
2287 case Intrinsic::amdgcn_s_buffer_load:
2288 // If resulting type is vec3, there is no point in trimming the
2289 // load with updated offset, as the vec3 would most likely be widened to
2290 // vec4 anyway during lowering.
2291 if (ActiveBits == 4 && UnusedComponentsAtFront == 1)
2292 OffsetIdx = InvalidOffsetIdx;
2293 else
2294 OffsetIdx = 1;
2295 break;
2296 case Intrinsic::amdgcn_struct_buffer_load:
2297 case Intrinsic::amdgcn_struct_ptr_buffer_load:
2298 OffsetIdx = 2;
2299 break;
2300 default:
2301 // TODO: handle tbuffer* intrinsics.
2302 OffsetIdx = InvalidOffsetIdx;
2303 break;
2304 }
2305
2306 if (OffsetIdx != InvalidOffsetIdx) {
2307 // Clear demanded bits and update the offset.
2308 DemandedElts &= ~((1 << UnusedComponentsAtFront) - 1);
2309 auto *Offset = Args[OffsetIdx];
2310 unsigned SingleComponentSizeInBits =
2311 IC.getDataLayout().getTypeSizeInBits(EltTy);
// Size in bytes of the leading components that are skipped.
2312 unsigned OffsetAdd =
2313 UnusedComponentsAtFront * SingleComponentSizeInBits / 8;
2314 auto *OffsetAddVal = ConstantInt::get(Offset->getType(), OffsetAdd);
2315 Args[OffsetIdx] = IC.Builder.CreateAdd(Offset, OffsetAddVal);
2316 }
2317 }
2318 } else {
2319 // Image case.
2320
// Only the low four bits of the dmask constant are considered.
2321 ConstantInt *DMask = cast<ConstantInt>(Args[DMaskIdx]);
2322 unsigned DMaskVal = DMask->getZExtValue() & 0xf;
2323
2324 // dmask 0 has special semantics, do not simplify.
2325 if (DMaskVal == 0)
2326 return nullptr;
2327
2328 // Mask off values that are undefined because the dmask doesn't cover them
2329 DemandedElts &= (1 << llvm::popcount(DMaskVal)) - 1;
2330
// Walk the four dmask bits. Each set bit corresponds to the next element of
// the load/store vector (OrigLdStIdx); the bit is kept in the new dmask only
// if that element is demanded.
2331 unsigned NewDMaskVal = 0;
2332 unsigned OrigLdStIdx = 0;
2333 for (unsigned SrcIdx = 0; SrcIdx < 4; ++SrcIdx) {
2334 const unsigned Bit = 1 << SrcIdx;
2335 if (!!(DMaskVal & Bit)) {
2336 if (!!DemandedElts[OrigLdStIdx])
2337 NewDMaskVal |= Bit;
2338 OrigLdStIdx++;
2339 }
2340 }
2341
2342 if (DMaskVal != NewDMaskVal)
2343 Args[DMaskIdx] = ConstantInt::get(DMask->getType(), NewDMaskVal);
2344 }
2345
2346 unsigned NewNumElts = DemandedElts.popcount();
2347 if (!NewNumElts)
2348 return PoisonValue::get(IIVTy);
2349
// Every element is still demanded as a contiguous low mask: the vector cannot
// shrink, so only commit a possibly changed dmask to the existing call.
2350 if (NewNumElts >= VWidth && DemandedElts.isMask()) {
2351 if (DMaskIdx >= 0)
2352 II.setArgOperand(DMaskIdx, Args[DMaskIdx]);
2353 return nullptr;
2354 }
2355
2356 // Validate function argument and return types, extracting overloaded types
2357 // along the way.
2358 SmallVector<Type *, 6> OverloadTys;
2359 if (!Intrinsic::isSignatureValid(II.getCalledFunction(), OverloadTys))
2360 return nullptr;
2361
// A single remaining element is represented as a scalar, not a 1-vector.
2362 Type *NewTy =
2363 (NewNumElts == 1) ? EltTy : FixedVectorType::get(EltTy, NewNumElts);
// Presumably overloaded type 0 is the data/result vector type - confirm.
2364 OverloadTys[0] = NewTy;
2365
2366 if (!IsLoad) {
// Stores: gather the demanded elements of operand 0, in their original order,
// into the narrowed data argument.
2367 SmallVector<int, 8> EltMask;
2368 for (unsigned OrigStoreIdx = 0; OrigStoreIdx < VWidth; ++OrigStoreIdx)
2369 if (DemandedElts[OrigStoreIdx])
2370 EltMask.push_back(OrigStoreIdx);
2371
2372 if (NewNumElts == 1)
2373 Args[0] = IC.Builder.CreateExtractElement(II.getOperand(0), EltMask[0]);
2374 else
2375 Args[0] = IC.Builder.CreateShuffleVector(II.getOperand(0), EltMask);
2376 }
2377
// Create the narrowed call, carrying over II's name and metadata.
2378 CallInst *NewCall =
2379 IC.Builder.CreateIntrinsic(II.getIntrinsicID(), OverloadTys, Args);
2380 NewCall->takeName(&II);
2381 NewCall->copyMetadata(II);
2382
2383 if (IsLoad) {
// Loads: widen the narrowed result back to the original vector type.
2384 if (NewNumElts == 1) {
2385 return IC.Builder.CreateInsertElement(PoisonValue::get(IIVTy), NewCall,
2386 DemandedElts.countr_zero());
2387 }
2388
// Demanded positions take consecutive lanes of NewCall. Undemanded positions
// use index NewNumElts, which is past NewCall's lanes - presumably a lane of
// the implicit second shuffle operand; confirm against IRBuilder.
2389 SmallVector<int, 8> EltMask;
2390 unsigned NewLoadIdx = 0;
2391 for (unsigned OrigLoadIdx = 0; OrigLoadIdx < VWidth; ++OrigLoadIdx) {
2392 if (!!DemandedElts[OrigLoadIdx])
2393 EltMask.push_back(NewLoadIdx++);
2394 else
2395 EltMask.push_back(NewNumElts);
2396 }
2397
2398 auto *Shuffle = IC.Builder.CreateShuffleVector(NewCall, EltMask);
2399
2400 return Shuffle;
2401 }
2402
// Stores: the narrowed call itself is the result.
2403 return NewCall;
2404}
2405
// Narrows a lane intrinsic call on a vector to the contiguous element range
// [FirstElt, LastElt] that covers every demanded element.
//
// Returns nullptr when II's type is not a fixed vector, when that range
// already spans the whole vector (unless the range is a single element), or
// when the narrowed type is not legal. Otherwise returns a value of the
// original vector type built from a new call on the narrowed operand 0.
// NOTE(review): UndefElts is not referenced in the visible body.
// NOTE(review): listing lines 2406, 2433 and 2436 (the declarator and the
// declarations of OpBundles and M) are not present in this extract.
2407 InstCombiner &IC, IntrinsicInst &II, const APInt &DemandedElts,
2408 APInt &UndefElts) const {
2409 auto *VT = dyn_cast<FixedVectorType>(II.getType());
2410 if (!VT)
2411 return nullptr;
2412
// NOTE(review): getActiveBits() - 1 wraps around if DemandedElts is zero;
// presumably callers never pass an empty demand set - confirm.
2413 const unsigned FirstElt = DemandedElts.countr_zero();
2414 const unsigned LastElt = DemandedElts.getActiveBits() - 1;
2415 const unsigned MaskLen = LastElt - FirstElt + 1;
2416
2417 unsigned OldNumElts = VT->getNumElements();
2418 if (MaskLen == OldNumElts && MaskLen != 1)
2419 return nullptr;
2420
// A one-element range is handled as a scalar rather than a 1-vector.
2421 Type *EltTy = VT->getElementType();
2422 Type *NewVT = MaskLen == 1 ? EltTy : FixedVectorType::get(EltTy, MaskLen);
2423
2424 // Theoretically we should support these intrinsics for any legal type. Avoid
2425 // introducing cases that aren't direct register types like v3i16.
2426 if (!isTypeLegal(NewVT))
2427 return nullptr;
2428
2429 Value *Src = II.getArgOperand(0);
2430
2431 // Make sure convergence tokens are preserved.
2432 // TODO: CreateIntrinsic should allow directly copying bundles
2434 II.getOperandBundlesAsDefs(OpBundles);
2435
// Declaration of the same intrinsic overloaded on the narrowed type.
2437 Function *Remangled =
2438 Intrinsic::getOrInsertDeclaration(M, II.getIntrinsicID(), {NewVT});
2439
2440 if (MaskLen == 1) {
// Scalar path: extract the one demanded element, call the intrinsic on it,
// and insert the result at the same position of a poison vector.
2441 Value *Extract = IC.Builder.CreateExtractElement(Src, FirstElt);
2442
2443 // TODO: Preserve callsite attributes?
2444 CallInst *NewCall = IC.Builder.CreateCall(Remangled, {Extract}, OpBundles);
2445
2446 return IC.Builder.CreateInsertElement(PoisonValue::get(II.getType()),
2447 NewCall, FirstElt);
2448 }
2449
// Vector path: select the sub-range from Src; lanes inside the range that are
// not demanded keep mask value -1.
2450 SmallVector<int> ExtractMask(MaskLen, -1);
2451 for (unsigned I = 0; I != MaskLen; ++I) {
2452 if (DemandedElts[FirstElt + I])
2453 ExtractMask[I] = FirstElt + I;
2454 }
2455
2456 Value *Extract = IC.Builder.CreateShuffleVector(Src, ExtractMask);
2457
2458 // TODO: Preserve callsite attributes?
2459 CallInst *NewCall = IC.Builder.CreateCall(Remangled, {Extract}, OpBundles);
2460
// Place each demanded lane of the narrowed result back at its original
// position; every other lane of the full-width result keeps mask value -1.
2461 SmallVector<int> InsertMask(OldNumElts, -1);
2462 for (unsigned I = 0; I != MaskLen; ++I) {
2463 if (DemandedElts[FirstElt + I])
2464 InsertMask[FirstElt + I] = I;
2465 }
2466
2467 // FIXME: If the call has a convergence bundle, we end up leaving the dead
2468 // call behind.
2469 return IC.Builder.CreateShuffleVector(NewCall, InsertMask);
2470}
2471
// Target hook that dispatches demanded-vector-element simplification by
// intrinsic ID:
//  - readfirstlane: simplify operand 0 through SimplifyAndSetOp, then try to
//    narrow the call with simplifyAMDGCNLaneIntrinsicDemanded;
//  - the listed buffer loads: simplifyAMDGCNMemoryIntrinsicDemanded with its
//    default dmask index and load mode;
//  - any other intrinsic found in the image dmask table: the same helper with
//    dmask argument index 0.
// Returns std::nullopt for every other intrinsic. The helpers return a Value*
// (possibly null), which becomes the optional's contained value.
// NOTE(review): UndefElts2 and UndefElts3 are not referenced in the visible
// body.
// NOTE(review): listing line 2472 (the declarator) is not present in this
// extract.
2473 InstCombiner &IC, IntrinsicInst &II, APInt DemandedElts, APInt &UndefElts,
2474 APInt &UndefElts2, APInt &UndefElts3,
2475 std::function<void(Instruction *, unsigned, APInt, APInt &)>
2476 SimplifyAndSetOp) const {
2477 switch (II.getIntrinsicID()) {
2478 case Intrinsic::amdgcn_readfirstlane:
2479 SimplifyAndSetOp(&II, 0, DemandedElts, UndefElts);
2480 return simplifyAMDGCNLaneIntrinsicDemanded(IC, II, DemandedElts, UndefElts);
2481 case Intrinsic::amdgcn_raw_buffer_load:
2482 case Intrinsic::amdgcn_raw_ptr_buffer_load:
2483 case Intrinsic::amdgcn_raw_buffer_load_format:
2484 case Intrinsic::amdgcn_raw_ptr_buffer_load_format:
2485 case Intrinsic::amdgcn_raw_tbuffer_load:
2486 case Intrinsic::amdgcn_raw_ptr_tbuffer_load:
2487 case Intrinsic::amdgcn_s_buffer_load:
2488 case Intrinsic::amdgcn_struct_buffer_load:
2489 case Intrinsic::amdgcn_struct_ptr_buffer_load:
2490 case Intrinsic::amdgcn_struct_buffer_load_format:
2491 case Intrinsic::amdgcn_struct_ptr_buffer_load_format:
2492 case Intrinsic::amdgcn_struct_tbuffer_load:
2493 case Intrinsic::amdgcn_struct_ptr_tbuffer_load:
2494 return simplifyAMDGCNMemoryIntrinsicDemanded(IC, II, DemandedElts);
2495 default: {
// Image intrinsics are recognised through the generated dmask table.
2496 if (getAMDGPUImageDMaskIntrinsic(II.getIntrinsicID())) {
2497 return simplifyAMDGCNMemoryIntrinsicDemanded(IC, II, DemandedElts, 0);
2498 }
2499 break;
2500 }
2501 }
2502 return std::nullopt;
2503}
for(const MachineOperand &MO :llvm::drop_begin(OldMI.operands(), Desc.getNumOperands()))
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
static Value * createPermlane16(IRBuilderBase &B, Value *Val, uint32_t Lo, uint32_t Hi)
Emit v_permlane16 with the precomputed lane-select halves.
static std::optional< unsigned > matchRowSharePattern(ArrayRef< uint8_t > Ids)
Match a row-share pattern: all 16 lanes of each row read the same source lane.
static bool matchMirrorPattern(ArrayRef< uint8_t > Ids)
Match an N-lane reversal (mirror) pattern.
static bool tryBuildShuffleMap(Value *Index, const GCNSubtarget &ST, SmallVectorImpl< uint8_t > &Ids, const DataLayout &DL)
Build the per-lane shuffle map by evaluating Index for every lane in the wave.
static std::optional< unsigned > matchQuadPermPattern(ArrayRef< uint8_t > Ids)
Match a 4-lane (quad) permutation, encoded as the v_mov_b32_dpp QUAD_PERM control word: bits[1:0]=Ids...
static std::optional< unsigned > matchHalfRowPermPattern(ArrayRef< uint8_t > Ids)
Match an 8-lane arbitrary permutation, encoded as the v_mov_b32_dpp8 24-bit selector (three bits per ...
static std::optional< unsigned > matchRowXMaskPattern(ArrayRef< uint8_t > Ids)
Match an XOR mask pattern within each 16-lane row: Ids[J] == Mask ^ J, with Mask in [1,...
static constexpr auto matchHalfRowMirrorPattern
static Value * createPermlaneX16(IRBuilderBase &B, Value *Val, uint32_t Lo, uint32_t Hi)
Emit v_permlanex16 with the precomputed lane-select halves.
static bool isRowPattern(ArrayRef< uint8_t > Ids)
Match an N-lane row pattern: each lane in [0, N) reads from a source lane in the same N-lane row,...
static bool canContractSqrtToRsq(const FPMathOperator *SqrtOp)
Return true if it's legal to contract llvm.amdgcn.rcp(llvm.sqrt)
static bool isTriviallyUniform(const Use &U)
Return true if we can easily prove that use U is uniform.
static CallInst * rewriteCall(IRBuilderBase &B, CallInst &Old, Function &NewCallee, ArrayRef< Value * > Ops)
static Value * convertTo16Bit(Value &V, InstCombiner::BuilderTy &Builder)
static constexpr auto isFullRowPattern
static constexpr auto isQuadPattern
static APInt trimTrailingZerosInVector(InstCombiner &IC, Value *UseV, Instruction *I)
static uint64_t computePermlane16Masks(ArrayRef< uint8_t > Ids)
Pack a 16-lane permutation into a single 64-bit value: four bits per output lane, lane J in bits [J*4...
static bool matchHalfWaveSwapPattern(ArrayRef< uint8_t > Ids)
Match a half-wave swap: lane J reads from lane J ^ 32.
static bool hasPeriodicLayout(ArrayRef< uint8_t > Ids)
Lanes are partitioned into groups of Period; each group is a translated copy of the first: Ids[I] = I...
static std::optional< Instruction * > tryOptimizeShufflePattern(InstCombiner &IC, IntrinsicInst &II, const GCNSubtarget &ST)
Try to fold a wave_shuffle/ds_bpermute whose lane index is a constant function of the lane ID into a ...
static constexpr auto isHalfRowPattern
static APInt defaultComponentBroadcast(Value *V)
static std::optional< unsigned > matchDsSwizzleBitmaskPattern(ArrayRef< uint8_t > Ids)
Match a DS_SWIZZLE bitmask-mode permutation: dst_lane = ((src_lane & AND) | OR) ^ XOR with each mask ...
static Value * createDsSwizzle(IRBuilderBase &B, Value *Val, unsigned Offset, const DataLayout &DL)
Emit ds_swizzle with the given immediate, bitcasting/converting between pointer/float types and i32 a...
static std::optional< Instruction * > modifyIntrinsicCall(IntrinsicInst &OldIntr, Instruction &InstToReplace, unsigned NewIntr, InstCombiner &IC, std::function< void(SmallVectorImpl< Value * > &, SmallVectorImpl< Type * > &)> Func)
Applies Func(OldIntr.Args, OldIntr.ArgTys), creates intrinsic call with modified arguments (based on ...
static Value * matchShuffleToHWIntrinsic(IRBuilderBase &B, Value *Src, ArrayRef< uint8_t > Ids, const GCNSubtarget &ST, const DataLayout &DL)
Given a shuffle map, try to emit the best hardware intrinsic.
static std::optional< unsigned > matchRowRotatePattern(ArrayRef< uint8_t > Ids)
Match a 16-lane cyclic rotation; returns the rotation amount in [1, 15].
static bool isCrossRowPattern(ArrayRef< uint8_t > Ids)
Match a cross-row permutation suitable for v_permlanex16: every lane in the low 16-lane half reads fr...
static bool isThreadID(const GCNSubtarget &ST, Value *V)
static Value * createUpdateDpp(IRBuilderBase &B, Value *Val, unsigned Ctrl)
Emit v_mov_b32_dpp with the given control word, row/bank masks 0xF, and bound_ctrl=1 so out-of-bounds...
static APFloat fmed3AMDGCN(const APFloat &Src0, const APFloat &Src1, const APFloat &Src2)
static Value * simplifyAMDGCNMemoryIntrinsicDemanded(InstCombiner &IC, IntrinsicInst &II, APInt DemandedElts, int DMaskIdx=-1, bool IsLoad=true)
Implement SimplifyDemandedVectorElts for amdgcn buffer and image intrinsics.
static std::optional< Instruction * > simplifyAMDGCNImageIntrinsic(const GCNSubtarget *ST, const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr, IntrinsicInst &II, InstCombiner &IC)
static bool canSafelyConvertTo16Bit(Value &V, bool IsFloat)
static Value * createMovDpp8(IRBuilderBase &B, Value *Val, unsigned Selector)
Emit v_mov_b32_dpp8 with the given 24-bit lane selector.
static Value * matchFPExtFromF16(Value *Arg)
Match an fpext from half to float, or a constant we can convert.
static constexpr auto matchFullRowMirrorPattern
static std::optional< unsigned > evalLaneExpr(Value *V, unsigned Lane, const GCNSubtarget &ST, const DataLayout &DL, unsigned Depth=0)
Evaluate V as a function of the lane ID and return its value on Lane, or std::nullopt if V is not a c...
static Value * createPermlane64(IRBuilderBase &B, Value *Val)
Emit v_permlane64 (swap of the two 32-lane halves of a wave64).
Contains the definition of a TargetInstrInfo class that is common to all AMD GPUs.
This file defines a TargetTransformInfoImplBase conforming object specific to the AMDGPU target machine.
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
#define X(NUM, ENUM, NAME)
Definition ELF.h:853
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
This file contains the declarations for the subclasses of Constant, which represent the different fla...
Utilities for dealing with flags related to floating point properties and mode controls.
AMD GCN specific subclass of TargetSubtarget.
This file provides the interface for the instcombine pass implementation.
const AbstractManglingParser< Derived, Alloc >::OperatorInfo AbstractManglingParser< Derived, Alloc >::Ops[]
#define I(x, y, z)
Definition MD5.cpp:57
MachineInstr unsigned OpIdx
uint64_t IntrinsicInst * II
if(PassOpts->AAPipeline)
This file contains some templates that are useful if you are working with the STL at all.
Provides some synthesis utilities to produce sequences of values.
static TableGen::Emitter::Opt Y("gen-skeleton-entry", EmitSkeleton, "Generate example skeleton entry")
Value * RHS
Value * LHS
cmpResult
IEEE-754R 5.11: Floating Point Comparison Relations.
Definition APFloat.h:334
static constexpr roundingMode rmTowardZero
Definition APFloat.h:348
static constexpr roundingMode rmNearestTiesToEven
Definition APFloat.h:344
static const fltSemantics & IEEEhalf()
Definition APFloat.h:294
static APFloat getQNaN(const fltSemantics &Sem, bool Negative=false, const APInt *payload=nullptr)
Factory for QNaN values.
Definition APFloat.h:1179
opStatus divide(const APFloat &RHS, roundingMode RM)
Definition APFloat.h:1267
LLVM_ABI opStatus convert(const fltSemantics &ToSemantics, roundingMode RM, bool *losesInfo)
Definition APFloat.cpp:5899
bool isPosInfinity() const
Definition APFloat.h:1551
const fltSemantics & getSemantics() const
Definition APFloat.h:1546
APFloat makeQuiet() const
Assuming this is an IEEE-754 NaN value, quiet its signaling bit.
Definition APFloat.h:1375
bool isNaN() const
Definition APFloat.h:1536
bool isSignaling() const
Definition APFloat.h:1540
APInt bitcastToAPInt() const
Definition APFloat.h:1430
bool isNegInfinity() const
Definition APFloat.h:1552
static APFloat getZero(const fltSemantics &Sem, bool Negative=false)
Factory for Positive and Negative Zero.
Definition APFloat.h:1138
cmpResult compare(const APFloat &RHS) const
Definition APFloat.h:1481
bool isInfinity() const
Definition APFloat.h:1535
Class for arbitrary precision integers.
Definition APInt.h:78
static APInt getAllOnes(unsigned numBits)
Return an APInt of a specified width with all bits set.
Definition APInt.h:235
void clearBit(unsigned BitPosition)
Set a given bit to 0.
Definition APInt.h:1429
uint64_t getZExtValue() const
Get zero extended value.
Definition APInt.h:1563
unsigned popcount() const
Count the number of bits set.
Definition APInt.h:1693
LLVM_ABI uint64_t extractBitsAsZExtValue(unsigned numBits, unsigned bitPosition) const
Definition APInt.cpp:521
unsigned getActiveBits() const
Compute the number of active bits in the value.
Definition APInt.h:1535
LLVM_ABI APInt trunc(unsigned width) const
Truncate to new width.
Definition APInt.cpp:968
unsigned countr_zero() const
Count the number of trailing zero bits.
Definition APInt.h:1662
bool isMask(unsigned numBits) const
Definition APInt.h:489
Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:40
ArrayRef< T > take_front(size_t N=1) const
Return a copy of *this with only the first N elements.
Definition ArrayRef.h:218
size_t size() const
Get the array size.
Definition ArrayRef.h:141
static LLVM_ABI Attribute getWithDereferenceableBytes(LLVMContext &Context, uint64_t Bytes)
LLVM_ABI const Module * getModule() const
Return the module owning the function this basic block belongs to, or nullptr if the function does no...
bool isTypeLegal(Type *Ty) const override
void addFnAttr(Attribute::AttrKind Kind)
Adds the attribute to the function.
LLVM_ABI void getOperandBundlesAsDefs(SmallVectorImpl< OperandBundleDef > &Defs) const
Return the list of operand bundles attached to this instruction as a vector of OperandBundleDefs.
Function * getCalledFunction() const
Returns the function called, or null if this is an indirect function invocation or the function signa...
iterator_range< User::op_iterator > args()
Iteration adapter for range-for loops.
This class represents a function call, abstracting a target machine's calling convention.
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
Definition InstrTypes.h:740
@ ICMP_NE
not equal
Definition InstrTypes.h:762
bool isSigned() const
Definition InstrTypes.h:993
Predicate getSwappedPredicate() const
For example, EQ->EQ, SLE->SGE, ULT->UGT, OEQ->OEQ, ULE->UGE, OLT->OGT, etc.
Definition InstrTypes.h:890
bool isFPPredicate() const
Definition InstrTypes.h:845
Predicate getInversePredicate() const
For example, EQ -> NE, UGT -> ULE, SLT -> SGE, OEQ -> UNE, UGT -> OLE, OLT -> UGE,...
Definition InstrTypes.h:852
An abstraction over a floating-point predicate, and a pack of an integer predicate with samesign info...
ConstantFP - Floating Point Values [float, double].
Definition Constants.h:420
const APFloat & getValueAPF() const
Definition Constants.h:463
static LLVM_ABI ConstantFP * getZero(Type *Ty, bool Negative=false)
static LLVM_ABI ConstantFP * getNaN(Type *Ty, bool Negative=false, uint64_t Payload=0)
static LLVM_ABI ConstantFP * getInfinity(Type *Ty, bool Negative=false)
This is the shared class of boolean and integer constants.
Definition Constants.h:87
static ConstantInt * getSigned(IntegerType *Ty, int64_t V, bool ImplicitTrunc=false)
Return a ConstantInt with the specified value for the specified type.
Definition Constants.h:135
static LLVM_ABI ConstantInt * getFalse(LLVMContext &Context)
uint64_t getZExtValue() const
Return the constant as a 64-bit unsigned integer value after it has been zero extended as appropriate...
Definition Constants.h:168
const APInt & getValue() const
Return the constant as an APInt value reference.
Definition Constants.h:159
This class represents a range of values.
LLVM_ABI ConstantRange add(const ConstantRange &Other) const
Return a new range representing the possible values resulting from an addition of a value in this ran...
LLVM_ABI bool isFullSet() const
Return true if this set contains all of the elements possible for this data-type.
LLVM_ABI ConstantRange intersectWith(const ConstantRange &CR, PreferredRangeType Type=Smallest) const
Return the range that results from the intersection of this range with another range.
This is an important base class in LLVM.
Definition Constant.h:43
static LLVM_ABI Constant * getNullValue(Type *Ty)
Constructor to create a '0' constant of arbitrary type.
LLVM_ABI bool isNullValue() const
Return true if this is the value that would be returned by getNullValue.
Definition Constants.cpp:84
A parsed version of the target data layout string in and methods for querying it.
Definition DataLayout.h:64
TypeSize getTypeSizeInBits(Type *Ty) const
Size examples:
Definition DataLayout.h:791
LLVM_ABI bool dominates(const BasicBlock *BB, const Use &U) const
Return true if the (end of the) basic block BB dominates the use U.
Tagged union holding either a T or a Error.
Definition Error.h:485
This class represents an extension of floating point types.
Utility class for floating point operations which can have information about relaxed accuracy require...
Definition Operator.h:200
FastMathFlags getFastMathFlags() const
Convenience function for getting all the fast-math flags.
Definition Operator.h:289
bool hasApproxFunc() const
Test if this operation allows approximations of math library functions or intrinsics.
Definition Operator.h:286
LLVM_ABI float getFPAccuracy() const
Get the maximum error permitted by this operation in ULPs.
Convenience struct for specifying and reasoning about fast-math flags.
Definition FMF.h:23
bool allowContract() const
Definition FMF.h:72
static LLVM_ABI FixedVectorType * get(Type *ElementType, unsigned NumElts)
Definition Type.cpp:873
bool simplifyDemandedLaneMaskArg(InstCombiner &IC, IntrinsicInst &II, unsigned LaneAgIdx) const
Simplify a lane index operand (e.g.
std::optional< Instruction * > instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const override
Instruction * hoistLaneIntrinsicThroughOperand(InstCombiner &IC, IntrinsicInst &II) const
std::optional< Value * > simplifyDemandedVectorEltsIntrinsic(InstCombiner &IC, IntrinsicInst &II, APInt DemandedElts, APInt &UndefElts, APInt &UndefElts2, APInt &UndefElts3, std::function< void(Instruction *, unsigned, APInt, APInt &)> SimplifyAndSetOp) const override
KnownIEEEMode fpenvIEEEMode(const Instruction &I) const
Return KnownIEEEMode::On if we know if the use context can assume "amdgpu-ieee"="true" and KnownIEEEM...
Value * simplifyAMDGCNLaneIntrinsicDemanded(InstCombiner &IC, IntrinsicInst &II, const APInt &DemandedElts, APInt &UndefElts) const
bool canSimplifyLegacyMulToMul(const Instruction &I, const Value *Op0, const Value *Op1, InstCombiner &IC) const
Common base class shared among various IRBuilders.
Definition IRBuilder.h:114
CallInst * CreateExtractVector(Type *DstType, Value *SrcVec, Value *Idx, const Twine &Name="")
Create a call to the vector.extract intrinsic.
Definition IRBuilder.h:1135
Value * CreateInsertElement(Type *VecTy, Value *NewElt, Value *Idx, const Twine &Name="")
Definition IRBuilder.h:2627
Value * CreateExtractElement(Value *Vec, Value *Idx, const Twine &Name="")
Definition IRBuilder.h:2615
IntegerType * getIntNTy(unsigned N)
Fetch the type representing an N-bit integer.
Definition IRBuilder.h:599
LLVM_ABI CallInst * CreateIntrinsic(Intrinsic::ID ID, ArrayRef< Type * > OverloadTypes, ArrayRef< Value * > Args, FMFSource FMFSource={}, const Twine &Name="", ArrayRef< OperandBundleDef > OpBundles={})
Create a call to intrinsic ID with Args, mangled using OverloadTypes.
Value * CreateSExt(Value *V, Type *DestTy, const Twine &Name="")
Definition IRBuilder.h:2132
Value * CreateLShr(Value *LHS, Value *RHS, const Twine &Name="", bool isExact=false)
Definition IRBuilder.h:1554
BasicBlock * GetInsertBlock() const
Definition IRBuilder.h:201
Value * CreateICmpNE(Value *LHS, Value *RHS, const Twine &Name="")
Definition IRBuilder.h:2378
ConstantInt * getInt64(uint64_t C)
Get a constant 64-bit value.
Definition IRBuilder.h:534
Value * CreateMaxNum(Value *LHS, Value *RHS, FMFSource FMFSource={}, const Twine &Name="")
Create call to the maxnum intrinsic.
Definition IRBuilder.h:1066
Value * CreateShl(Value *LHS, Value *RHS, const Twine &Name="", bool HasNUW=false, bool HasNSW=false)
Definition IRBuilder.h:1533
Value * CreateZExt(Value *V, Type *DestTy, const Twine &Name="", bool IsNonNeg=false)
Definition IRBuilder.h:2120
Value * CreateShuffleVector(Value *V1, Value *V2, Value *Mask, const Twine &Name="")
Definition IRBuilder.h:2649
Value * CreateMaximumNum(Value *LHS, Value *RHS, const Twine &Name="")
Create call to the maximumnum intrinsic.
Definition IRBuilder.h:1094
Value * CreateMinNum(Value *LHS, Value *RHS, FMFSource FMFSource={}, const Twine &Name="")
Create call to the minnum intrinsic.
Definition IRBuilder.h:1054
Value * CreateAdd(Value *LHS, Value *RHS, const Twine &Name="", bool HasNUW=false, bool HasNSW=false)
Definition IRBuilder.h:1444
CallInst * CreateCall(FunctionType *FTy, Value *Callee, ArrayRef< Value * > Args={}, const Twine &Name="", MDNode *FPMathTag=nullptr)
Definition IRBuilder.h:2553
void SetInsertPoint(BasicBlock *TheBB)
This specifies that created instructions should be appended to the end of the specified block.
Definition IRBuilder.h:207
Value * CreateFAddFMF(Value *L, Value *R, FMFSource FMFSource, const Twine &Name="", MDNode *FPMD=nullptr)
Definition IRBuilder.h:1663
Value * CreateMinimumNum(Value *LHS, Value *RHS, const Twine &Name="")
Create call to the minimumnum intrinsic.
Definition IRBuilder.h:1088
Value * CreateAShr(Value *LHS, Value *RHS, const Twine &Name="", bool isExact=false)
Definition IRBuilder.h:1573
Value * CreateFMulFMF(Value *L, Value *R, FMFSource FMFSource, const Twine &Name="", MDNode *FPMD=nullptr)
Definition IRBuilder.h:1701
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
Definition IRBuilder.h:2858
The core instruction combiner logic.
const DataLayout & getDataLayout() const
virtual Instruction * eraseInstFromFunction(Instruction &I)=0
Combiner aware instruction erasure.
IRBuilder< TargetFolder, IRBuilderCallbackInserter > BuilderTy
An IRBuilder that automatically inserts new instructions into the worklist.
DominatorTree & getDominatorTree() const
Instruction * replaceInstUsesWith(Instruction &I, Value *V)
A combiner-aware RAUW-like routine.
virtual bool SimplifyDemandedBits(Instruction *I, unsigned OpNo, const APInt &DemandedMask, KnownBits &Known, const SimplifyQuery &Q, unsigned Depth=0)=0
static Value * stripSignOnlyFPOps(Value *Val)
Ignore all operations which only change the sign of a value, returning the underlying magnitude value...
Instruction * replaceOperand(Instruction &I, unsigned OpNum, Value *V)
Replace operand of instruction and add old operand to the worklist.
BuilderTy & Builder
const SimplifyQuery & getSimplifyQuery() const
LLVM_ABI Instruction * clone() const
Create a copy of 'this' instruction that is identical in all ways except the following:
LLVM_ABI void copyFastMathFlags(FastMathFlags FMF)
Convenience function for transferring all fast-math flag values to this instruction,...
LLVM_ABI void copyMetadata(const Instruction &SrcInst, ArrayRef< unsigned > WL=ArrayRef< unsigned >())
Copy metadata from SrcInst to this instruction.
Class to represent integer types.
A wrapper class for inspecting calls to intrinsic functions.
Metadata node.
Definition Metadata.h:1080
static MDTuple * get(LLVMContext &Context, ArrayRef< Metadata * > MDs)
Definition Metadata.h:1572
static LLVM_ABI MDString * get(LLVMContext &Context, StringRef Str)
Definition Metadata.cpp:614
static LLVM_ABI MetadataAsValue * get(LLVMContext &Context, Metadata *MD)
Definition Metadata.cpp:110
Root of the metadata hierarchy.
Definition Metadata.h:64
A Module instance is used to store all the information related to an LLVM module.
Definition Module.h:67
static LLVM_ABI PoisonValue * get(Type *T)
Static factory methods - Return a 'poison' object of the specified type.
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
reference emplace_back(ArgTypes &&... Args)
void resize(size_type N)
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
The instances of the Type class are immutable: once they are created, they are never changed.
Definition Type.h:46
bool isPointerTy() const
True if this is an instance of PointerType.
Definition Type.h:284
bool isFloatTy() const
Return true if this is 'float', a 32-bit IEEE fp type.
Definition Type.h:155
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
Definition Type.h:370
static LLVM_ABI IntegerType * getInt16Ty(LLVMContext &C)
Definition Type.cpp:312
bool isHalfTy() const
Return true if this is 'half', a 16-bit IEEE fp type.
Definition Type.h:144
LLVM_ABI Type * getWithNewType(Type *EltTy) const
Given vector type, change the element type, whilst keeping the old number of elements.
bool isFloatingPointTy() const
Return true if this is one of the floating-point types.
Definition Type.h:186
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition Type.h:257
static LLVM_ABI Type * getHalfTy(LLVMContext &C)
Definition Type.cpp:288
bool isVoidTy() const
Return true if this is 'void'.
Definition Type.h:141
static LLVM_ABI UndefValue * get(Type *T)
Static factory methods - Return an 'undef' object of the specified type.
A Use represents the edge between a Value definition and its users.
Definition Use.h:35
const Use & getOperandUse(unsigned i) const
Definition User.h:220
void setOperand(unsigned i, Value *Val)
Definition User.h:212
Value * getOperand(unsigned i) const
Definition User.h:207
LLVM Value Representation.
Definition Value.h:75
Type * getType() const
All values are typed, get the type of this value.
Definition Value.h:255
LLVM_ABI bool hasOneUser() const
Return true if there is exactly one user of this value.
Definition Value.cpp:162
LLVMContext & getContext() const
All values hold a context through their type.
Definition Value.h:258
LLVM_ABI void takeName(Value *V)
Transfer the name from V to this value.
Definition Value.cpp:399
const ParentTy * getParent() const
Definition ilist_node.h:34
CallInst * Call
Changed
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
LLVM_READONLY const MIMGOffsetMappingInfo * getMIMGOffsetMappingInfo(unsigned Offset)
uint8_t wmmaScaleF8F6F4FormatToNumRegs(unsigned Fmt)
const ImageDimIntrinsicInfo * getImageDimIntrinsicByBaseOpcode(unsigned BaseOpcode, unsigned Dim)
LLVM_READONLY const MIMGMIPMappingInfo * getMIMGMIPMappingInfo(unsigned MIP)
bool isArgPassedInSGPR(const Argument *A)
bool isIntrinsicAlwaysUniform(unsigned IntrID)
LLVM_READONLY const MIMGBiasMappingInfo * getMIMGBiasMappingInfo(unsigned Bias)
LLVM_READONLY const MIMGLZMappingInfo * getMIMGLZMappingInfo(unsigned L)
LLVM_READONLY const MIMGBaseOpcodeInfo * getMIMGBaseOpcodeInfo(unsigned BaseOpcode)
const ImageDimIntrinsicInfo * getImageDimIntrinsicInfo(unsigned Intr)
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
LLVM_ABI Function * getOrInsertDeclaration(Module *M, ID id, ArrayRef< Type * > OverloadTys={})
Look up the Function declaration of the intrinsic id in the Module M.
LLVM_ABI bool isSignatureValid(Intrinsic::ID ID, FunctionType *FT, SmallVectorImpl< Type * > &OverloadTys, raw_ostream &OS=nulls())
Returns true if FT is a valid function type for intrinsic ID.
OneUse_match< SubPat > m_OneUse(const SubPat &SP)
cst_pred_ty< is_all_ones > m_AllOnes()
Match an integer or vector with all bits set.
auto m_Cmp()
Match any compare instruction and ignore it.
bool match(Val *V, const Pattern &P)
cstfp_pred_ty< is_any_zero_fp > m_AnyZeroFP()
Match a floating-point negative zero or positive zero.
ap_match< APFloat > m_APFloat(const APFloat *&Res)
Match a ConstantFP or splatted ConstantVector, binding the specified pointer to the contained APFloat...
cst_pred_ty< is_one > m_One()
Match an integer 1 or a vector with all elements equal to 1.
IntrinsicID_match m_Intrinsic()
Match intrinsic calls like this: m_Intrinsic<Intrinsic::fabs>(m_Value(X))
auto m_Value()
Match an arbitrary value and ignore it.
CastInst_match< OpTy, FPExtInst > m_FPExt(const OpTy &Op)
CastInst_match< OpTy, ZExtInst > m_ZExt(const OpTy &Op)
Matches ZExt.
cstfp_pred_ty< is_finitenonzero > m_FiniteNonZero()
Match a finite non-zero FP constant.
match_combine_or< CastInst_match< OpTy, ZExtInst >, CastInst_match< OpTy, SExtInst > > m_ZExtOrSExt(const OpTy &Op)
auto m_ConstantFP()
Match an arbitrary ConstantFP and ignore it.
CastInst_match< OpTy, SExtInst > m_SExt(const OpTy &Op)
Matches SExt.
is_zero m_Zero()
Match any null constant or a vector with all elements equal to 0.
auto m_ConstantInt()
Match an arbitrary ConstantInt and ignore it.
This is an optimization pass for GlobalISel generic memory operations.
@ Offset
Definition DWP.cpp:558
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:643
LLVM_ABI Constant * ConstantFoldCompareInstOperands(unsigned Predicate, Constant *LHS, Constant *RHS, const DataLayout &DL, const TargetLibraryInfo *TLI=nullptr, const Instruction *I=nullptr)
Attempt to constant fold a compare instruction (icmp/fcmp) with the specified operands.
constexpr int popcount(T Value) noexcept
Count the number of set bits in a value.
Definition bit.h:156
APFloat frexp(const APFloat &X, int &Exp, APFloat::roundingMode RM)
Equivalent of C standard library function.
Definition APFloat.h:1652
auto dyn_cast_or_null(const Y &Val)
Definition Casting.h:753
LLVM_READONLY APFloat maxnum(const APFloat &A, const APFloat &B)
Implements IEEE-754 2008 maxNum semantics.
Definition APFloat.h:1695
constexpr unsigned MaxAnalysisRecursionDepth
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
Definition MathExtras.h:279
APFloat scalbn(APFloat X, int Exp, APFloat::roundingMode RM)
Returns: X * 2^Exp for integral exponents.
Definition APFloat.h:1640
constexpr uint32_t Hi_32(uint64_t Value)
Return the high 32 bits of a 64 bit value.
Definition MathExtras.h:150
constexpr uint32_t Lo_32(uint64_t Value)
Return the low 32 bits of a 64 bit value.
Definition MathExtras.h:155
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition Casting.h:547
constexpr int PoisonMaskElem
@ FMul
Product of floats.
@ FAdd
Sum of floats.
LLVM_ABI Value * findScalarElement(Value *V, unsigned EltNo)
Given a vector and an element number, see if the scalar value is already around as a register,...
@ NearestTiesToEven
roundTiesToEven.
LLVM_ABI bool isKnownNeverInfOrNaN(const Value *V, const SimplifyQuery &SQ, unsigned Depth=0)
Return true if the floating-point value can never contain a NaN or infinity.
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:559
bool all_equal(std::initializer_list< T > Values)
Returns true if all Values in the initializer lists are equal or the list is empty.
Definition STLExtras.h:2165
auto seq(T Begin, T End)
Iterate over an integral type from Begin up to - but not including - End.
Definition Sequence.h:305
constexpr T maskTrailingOnes(unsigned N)
Create a bitmask with the N right-most bits set to 1, and all other bits set to 0.
Definition MathExtras.h:77
LLVM_ABI Constant * ConstantFoldInstOperands(const Instruction *I, ArrayRef< Constant * > Ops, const DataLayout &DL, const TargetLibraryInfo *TLI=nullptr, bool AllowNonDeterministic=true)
ConstantFoldInstOperands - Attempt to constant fold an instruction with the specified operands.
constexpr uint64_t Make_64(uint32_t High, uint32_t Low)
Make a 64-bit integer from a high / low pair of 32-bit integers.
Definition MathExtras.h:160
LLVM_ABI ConstantRange computeConstantRange(const Value *V, bool ForSigned, const SimplifyQuery &SQ, unsigned Depth=0)
Determine the possible constant range of an integer or vector of integer value.
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition BitVector.h:876
#define N
bool isConstant() const
Returns true if we know the value of all bits.
Definition KnownBits.h:54
const APInt & getConstant() const
Returns the value when all bits have a known value.
Definition KnownBits.h:58
SimplifyQuery getWithInstruction(const Instruction *I) const
LLVM_ABI bool isUndefValue(Value *V) const
If CanUseUndef is true, returns whether V is undef.