LLVM 23.0.0git
AMDGPUInstCombineIntrinsic.cpp
Go to the documentation of this file.
1//===- AMDGPUInstCombineIntrinsic.cpp - AMDGPU specific InstCombine pass --===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// \file
10// This file implements a TargetTransformInfo analysis pass specific to the
11// AMDGPU target machine. It uses the target's detailed information to provide
12// more precise answers to certain TTI queries, while letting the target
13// independent and default TTI implementations handle the rest.
14//
15//===----------------------------------------------------------------------===//
16
17#include "AMDGPUInstrInfo.h"
19#include "GCNSubtarget.h"
20#include "SIDefines.h"
22#include "llvm/ADT/STLExtras.h"
23#include "llvm/ADT/Sequence.h"
26#include "llvm/IR/Constants.h"
27#include "llvm/IR/Dominators.h"
28#include "llvm/IR/IntrinsicsAMDGPU.h"
31#include <optional>
32
33using namespace llvm;
34using namespace llvm::PatternMatch;
35
36#define DEBUG_TYPE "AMDGPUtti"
37
38namespace {
39
// Row type of the AMDGPUImageDMaskIntrinsic searchable table whose
// implementation is pulled in by the include below.
// NOTE(review): Intr is presumably an intrinsic ID used as the lookup key --
// confirm against the generated table.
40struct AMDGPUImageDMaskIntrinsic {
41 unsigned Intr;
42};
43
// Selects the implementation part of the table inside the included .inc file.
44#define GET_AMDGPUImageDMaskIntrinsicTable_IMPL
45#include "AMDGPUGenSearchableTables.inc"
46
47} // end anonymous namespace
48
49// Constant fold llvm.amdgcn.fmed3 intrinsics for standard inputs.
50//
51// A single NaN input is folded to minnum, so we rely on that folding for
52// handling NaNs.
53static APFloat fmed3AMDGCN(const APFloat &Src0, const APFloat &Src1,
54 const APFloat &Src2) {
55 assert(!Src0.isNaN() && !Src1.isNaN() && !Src2.isNaN() &&
56 "nans handled separately");
57 APFloat Max3 = maxnum(maxnum(Src0, Src1), Src2);
58
59 if (Max3.bitwiseIsEqual(Src0))
60 return maxnum(Src1, Src2);
61
62 if (Max3.bitwiseIsEqual(Src1))
63 return maxnum(Src0, Src2);
64
65 return maxnum(Src0, Src1);
66}
67
68// Check if a value can be converted to a 16-bit value without losing precision.
69// The value is expected to be either a float (IsFloat = true) or an unsigned
70// integer (IsFloat = false). When AllowI16SExt is set, a sext from i16 is also
71// accepted: for unsigned addresses sext and zext only differ for a negative
72// i16, which is out of bounds anyway (see caller).
//
// Returns true for a constant that fits in 16 bits (half / 16 active bits) or
// for an extension whose source is half or i16; returns false otherwise,
// including when V already has a 16-bit type.
73static bool canSafelyConvertTo16Bit(Value &V, bool IsFloat,
74 bool AllowI16SExt = false) {
75 Type *VTy = V.getType();
76 if (VTy->isHalfTy() || VTy->isIntegerTy(16)) {
77 // The value is already 16-bit, so we don't want to convert to 16-bit again!
78 return false;
79 }
80 if (IsFloat) {
81 if (ConstantFP *ConstFloat = dyn_cast<ConstantFP>(&V)) {
82 // We need to check that if we cast the index down to a half, we do not
83 // lose precision.
84 APFloat FloatValue(ConstFloat->getValueAPF());
85 bool LosesInfo = true;
 // NOTE(review): the start of the conversion call (listing line 86) is missing
 // from this listing; only its trailing &LosesInfo argument survives below.
87 &LosesInfo);
88 return !LosesInfo;
89 }
90 } else {
91 if (ConstantInt *ConstInt = dyn_cast<ConstantInt>(&V)) {
92 // We need to check that if we cast the index down to an i16, we do not
93 // lose precision.
94 APInt IntValue(ConstInt->getValue());
95 return IntValue.getActiveBits() <= 16;
96 }
97 }
98
 // Non-constant case: accept fpext (float) or zext (integer) and, when
 // AllowI16SExt is set, sext for integers -- but only from a 16-bit source.
99 Value *CastSrc;
100 bool IsExt = IsFloat ? match(&V, m_FPExt(PatternMatch::m_Value(CastSrc)))
101 : match(&V, m_ZExt(PatternMatch::m_Value(CastSrc)));
102 if (!IsExt && !IsFloat && AllowI16SExt)
103 IsExt = match(&V, m_SExt(PatternMatch::m_Value(CastSrc)));
104 if (IsExt) {
105 Type *CastSrcTy = CastSrc->getType();
106 if (CastSrcTy->isHalfTy() || CastSrcTy->isIntegerTy(16))
107 return true;
108 }
109
110 return false;
111}
112
113// Convert a value to 16-bit.
// Integers are cast to i16 with CreateIntCast (isSigned = false) and
// floating-point values to half with CreateFPCast; any other type is
// unreachable.
// NOTE(review): the signature (listing line 114) and the condition guarding
// the first return (listing line 116) are missing from this listing.
115 Type *VTy = V.getType();
 // Reuses operand 0 of V directly when the (missing) guard above holds.
117 return cast<Instruction>(&V)->getOperand(0);
118 if (VTy->isIntegerTy())
119 return Builder.CreateIntCast(&V, Type::getInt16Ty(V.getContext()), false);
120 if (VTy->isFloatingPointTy())
121 return Builder.CreateFPCast(&V, Type::getHalfTy(V.getContext()));
122
123 llvm_unreachable("Should never be called!");
124}
125
126/// Applies Func(OldIntr.Args, OldIntr.ArgTys), creates intrinsic call with
127/// modified arguments (based on OldIntr) and replaces InstToReplace with
128/// this newly created intrinsic call.
129static std::optional<Instruction *> modifyIntrinsicCall(
130 IntrinsicInst &OldIntr, Instruction &InstToReplace, unsigned NewIntr,
131 InstCombiner &IC,
132 std::function<void(SmallVectorImpl<Value *> &, SmallVectorImpl<Type *> &)>
133 Func) {
134 SmallVector<Type *, 4> OverloadTys;
135 if (!Intrinsic::isSignatureValid(OldIntr.getCalledFunction(), OverloadTys))
136 return std::nullopt;
137
138 SmallVector<Value *, 8> Args(OldIntr.args());
139
140 // Modify arguments and types
141 Func(Args, OverloadTys);
142
143 CallInst *NewCall =
144 IC.Builder.CreateIntrinsicWithoutFolding(NewIntr, OverloadTys, Args);
145 NewCall->takeName(&OldIntr);
146 NewCall->copyMetadata(OldIntr);
147 if (isa<FPMathOperator>(NewCall))
148 NewCall->copyFastMathFlags(&OldIntr);
149 // Copy attributes
150 AttributeList OldAttrList = OldIntr.getAttributes();
151 NewCall->setAttributes(OldAttrList);
152
153 // Erase and replace uses
154 if (!InstToReplace.getType()->isVoidTy())
155 IC.replaceInstUsesWith(InstToReplace, NewCall);
156
157 bool RemoveOldIntr = &OldIntr != &InstToReplace;
158
159 auto *RetValue = IC.eraseInstFromFunction(InstToReplace);
160 if (RemoveOldIntr)
161 IC.eraseInstFromFunction(OldIntr);
162
163 return RetValue;
164}
165
// Simplifies an AMDGPU image-dimension intrinsic call II. The rewrites are
// tried in this order, returning on the first that applies:
//   1. drop a constant zero/negative lod operand (_L -> _LZ),
//   2. drop a constant-zero mip operand,
//   3. drop a constant-zero bias operand (and its overload type),
//   4. drop a constant-zero offset operand,
//   5. fold fptrunc-to-half users into a D16 form of the intrinsic,
//   6. narrow gradient/coordinate (and bias) operands to 16 bit (A16/G16).
// Returns std::nullopt when nothing applies.
// NOTE(review): several lines of this function, including part of its
// signature and the table-lookup calls, are missing from this listing.
166static std::optional<Instruction *>
168 const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr,
170 // Optimize _L to _LZ when the constant lod is zero or negative
171 if (const auto *LZMappingInfo =
173 if (auto *ConstantLod =
174 dyn_cast<ConstantFP>(II.getOperand(ImageDimIntr->LodIndex))) {
175 if (ConstantLod->isZero() || ConstantLod->isNegative()) {
176 const AMDGPU::ImageDimIntrinsicInfo *NewImageDimIntr =
178 ImageDimIntr->Dim);
179 return modifyIntrinsicCall(
180 II, II, NewImageDimIntr->Intr, IC, [&](auto &Args, auto &ArgTys) {
181 Args.erase(Args.begin() + ImageDimIntr->LodIndex);
182 });
183 }
184 }
185 }
186
187 // Optimize _mip away, when 'lod' is zero
188 if (const auto *MIPMappingInfo =
190 if (auto *ConstantMip =
191 dyn_cast<ConstantInt>(II.getOperand(ImageDimIntr->MipIndex))) {
192 if (ConstantMip->isZero()) {
193 const AMDGPU::ImageDimIntrinsicInfo *NewImageDimIntr =
194 AMDGPU::getImageDimIntrinsicByBaseOpcode(MIPMappingInfo->NONMIP,
195 ImageDimIntr->Dim);
196 return modifyIntrinsicCall(
197 II, II, NewImageDimIntr->Intr, IC, [&](auto &Args, auto &ArgTys) {
198 Args.erase(Args.begin() + ImageDimIntr->MipIndex);
199 });
200 }
201 }
202 }
203
204 // Optimize _bias away when 'bias' is zero
205 if (const auto *BiasMappingInfo =
207 if (auto *ConstantBias =
208 dyn_cast<ConstantFP>(II.getOperand(ImageDimIntr->BiasIndex))) {
209 if (ConstantBias->isZero()) {
210 const AMDGPU::ImageDimIntrinsicInfo *NewImageDimIntr =
211 AMDGPU::getImageDimIntrinsicByBaseOpcode(BiasMappingInfo->NoBias,
212 ImageDimIntr->Dim);
213 return modifyIntrinsicCall(
214 II, II, NewImageDimIntr->Intr, IC, [&](auto &Args, auto &ArgTys) {
 // The bias operand has its own overload type, which must go as well.
215 Args.erase(Args.begin() + ImageDimIntr->BiasIndex);
216 ArgTys.erase(ArgTys.begin() + ImageDimIntr->BiasTyArg);
217 });
218 }
219 }
220 }
221
222 // Optimize _offset away when 'offset' is zero
223 if (const auto *OffsetMappingInfo =
225 if (auto *ConstantOffset =
226 dyn_cast<ConstantInt>(II.getOperand(ImageDimIntr->OffsetIndex))) {
227 if (ConstantOffset->isZero()) {
228 const AMDGPU::ImageDimIntrinsicInfo *NewImageDimIntr =
230 OffsetMappingInfo->NoOffset, ImageDimIntr->Dim);
231 return modifyIntrinsicCall(
232 II, II, NewImageDimIntr->Intr, IC, [&](auto &Args, auto &ArgTys) {
233 Args.erase(Args.begin() + ImageDimIntr->OffsetIndex);
234 });
235 }
236 }
237 }
238
239 // Try to use D16
240 if (ST->hasD16Images()) {
241
242 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
244
245 if (BaseOpcode->HasD16) {
246
247 // If the only use of image intrinsic is a fptrunc (with conversion to
248 // half) then both fptrunc and image intrinsic will be replaced with image
249 // intrinsic with D16 flag.
250 if (II.hasOneUse()) {
251 Instruction *User = II.user_back();
252
 // NOTE(review): the second half of this condition (listing line 254) is
 // missing from this listing.
253 if (User->getOpcode() == Instruction::FPTrunc &&
255
256 return modifyIntrinsicCall(II, *User, ImageDimIntr->Intr, IC,
257 [&](auto &Args, auto &ArgTys) {
258 // Change return type of image intrinsic.
259 // Set it to return type of fptrunc.
260 ArgTys[0] = User->getType();
261 });
262 }
263 }
264
265 // Only perform D16 folding if every user of the image sample is
266 // an ExtractElementInst immediately followed by an FPTrunc to half.
268 ExtractTruncPairs;
269 bool AllHalfExtracts = true;
270
271 for (User *U : II.users()) {
272 auto *Ext = dyn_cast<ExtractElementInst>(U);
273 if (!Ext || !Ext->hasOneUse()) {
274 AllHalfExtracts = false;
275 break;
276 }
277
278 auto *Tr = dyn_cast<FPTruncInst>(*Ext->user_begin());
279 if (!Tr || !Tr->getType()->isHalfTy()) {
280 AllHalfExtracts = false;
281 break;
282 }
283
284 ExtractTruncPairs.emplace_back(Ext, Tr);
285 }
286
287 if (!ExtractTruncPairs.empty() && AllHalfExtracts) {
288 auto *VecTy = cast<VectorType>(II.getType());
289 Type *HalfVecTy =
290 VecTy->getWithNewType(Type::getHalfTy(II.getContext()));
291
292 // Obtain the original image sample intrinsic's signature
293 // and replace its return type with the half-vector for D16 folding
294 SmallVector<Type *, 8> OverloadTys;
295 if (!Intrinsic::isSignatureValid(II.getCalledFunction(), OverloadTys))
296 return std::nullopt;
297
298 OverloadTys[0] = HalfVecTy;
299 Module *M = II.getModule();
301 M, ImageDimIntr->Intr, OverloadTys);
302
 // The call is retyped in place rather than recreated.
303 II.mutateType(HalfVecTy);
304 II.setCalledFunction(HalfDecl);
305
306 IRBuilder<> Builder(II.getContext());
 // Each fptrunc is replaced by a half extractelement emitted at its position.
307 for (auto &[Ext, Tr] : ExtractTruncPairs) {
308 Value *Idx = Ext->getIndexOperand();
309
310 Builder.SetInsertPoint(Tr);
311
312 Value *HalfExtract = Builder.CreateExtractElement(&II, Idx);
313 HalfExtract->takeName(Tr);
314
315 Tr->replaceAllUsesWith(HalfExtract);
316 }
317
 // The old pairs are erased only after all replacements exist.
318 for (auto &[Ext, Tr] : ExtractTruncPairs) {
319 IC.eraseInstFromFunction(*Tr);
320 IC.eraseInstFromFunction(*Ext);
321 }
322
323 return &II;
324 }
325 }
326 }
327
328 // Try to use A16 or G16
329 if (!ST->hasA16() && !ST->hasG16())
330 return std::nullopt;
331
332 // Address is interpreted as float if the instruction has a sampler or as
333 // unsigned int if there is no sampler.
 // NOTE(review): the initializer of HasSampler (listing line 335) is missing
 // from this listing.
334 bool HasSampler =
336 bool FloatCoord = false;
337 // true means derivatives can be converted to 16 bit, coordinates not
338 bool OnlyDerivatives = false;
339
340 // Sampler-less addresses are unsigned, so a sext from i16 folds to a16 like a
341 // zext: they only disagree for a negative i16 (>= 0x8000), which is out of
342 // bounds while the max image dimension is <= 0x8000.
343 bool AllowI16SExt = !HasSampler;
344
345 for (unsigned OperandIndex = ImageDimIntr->GradientStart;
346 OperandIndex < ImageDimIntr->VAddrEnd; OperandIndex++) {
347 Value *Coord = II.getOperand(OperandIndex);
348 // If the values are not derived from 16-bit values, we cannot optimize.
349 if (!canSafelyConvertTo16Bit(*Coord, HasSampler, AllowI16SExt)) {
 // Bail out if the failing operand is a gradient or there are no gradients.
350 if (OperandIndex < ImageDimIntr->CoordStart ||
351 ImageDimIntr->GradientStart == ImageDimIntr->CoordStart) {
352 return std::nullopt;
353 }
354 // All gradients can be converted, so convert only them
355 OnlyDerivatives = true;
356 break;
357 }
358
359 assert(OperandIndex == ImageDimIntr->GradientStart ||
360 FloatCoord == Coord->getType()->isFloatingPointTy());
361 FloatCoord = Coord->getType()->isFloatingPointTy();
362 }
363
364 if (!OnlyDerivatives && !ST->hasA16())
365 OnlyDerivatives = true; // Only supports G16
366
367 // Check if there is a bias parameter and if it can be converted to f16
368 if (!OnlyDerivatives && ImageDimIntr->NumBiasArgs != 0) {
369 Value *Bias = II.getOperand(ImageDimIntr->BiasIndex);
370 assert(HasSampler &&
371 "Only image instructions with a sampler can have a bias");
372 if (!canSafelyConvertTo16Bit(*Bias, HasSampler))
373 OnlyDerivatives = true;
374 }
375
 // Gradient-only narrowing needs G16 and at least one gradient operand.
376 if (OnlyDerivatives && (!ST->hasG16() || ImageDimIntr->GradientStart ==
377 ImageDimIntr->CoordStart))
378 return std::nullopt;
379
380 Type *CoordType = FloatCoord ? Type::getHalfTy(II.getContext())
381 : Type::getInt16Ty(II.getContext());
382
383 return modifyIntrinsicCall(
384 II, II, II.getIntrinsicID(), IC, [&](auto &Args, auto &ArgTys) {
385 ArgTys[ImageDimIntr->GradientTyArg] = CoordType;
386 if (!OnlyDerivatives) {
387 ArgTys[ImageDimIntr->CoordTyArg] = CoordType;
388
389 // Change the bias type
390 if (ImageDimIntr->NumBiasArgs != 0)
391 ArgTys[ImageDimIntr->BiasTyArg] = Type::getHalfTy(II.getContext());
392 }
393
 // Convert gradients only, or gradients and coordinates up to VAddrEnd.
394 unsigned EndIndex =
395 OnlyDerivatives ? ImageDimIntr->CoordStart : ImageDimIntr->VAddrEnd;
396 for (unsigned OperandIndex = ImageDimIntr->GradientStart;
397 OperandIndex < EndIndex; OperandIndex++) {
398 Args[OperandIndex] =
399 convertTo16Bit(*II.getOperand(OperandIndex), IC.Builder);
400 }
401
402 // Convert the bias
403 if (!OnlyDerivatives && ImageDimIntr->NumBiasArgs != 0) {
404 Value *Bias = II.getOperand(ImageDimIntr->BiasIndex);
405 Args[ImageDimIntr->BiasIndex] = convertTo16Bit(*Bias, IC.Builder);
406 }
407 });
408}
409
// Decides whether a "legacy" multiply of Op0 and Op1 may be replaced by a
// normal multiply (see the comment in the body for the legacy semantics).
// NOTE(review): the first signature line (listing line 410), the condition of
// the first check (lines 418-419) and line 424 are missing from this listing.
411 const Value *Op0, const Value *Op1,
412 InstCombiner &IC) const {
413 // The legacy behaviour is that multiplying +/-0.0 by anything, even NaN or
414 // infinity, gives +0.0. If we can prove we don't have one of the special
415 // cases then we can use a normal multiply instead.
416 // TODO: Create and use isKnownFiniteNonZero instead of just matching
417 // constants here.
420 // One operand is not zero or infinity or NaN.
421 return true;
422 }
423
425 if (isKnownNeverInfOrNaN(Op0, SQ) && isKnownNeverInfOrNaN(Op1, SQ)) {
426 // Neither operand is infinity or NaN.
427 return true;
428 }
429 return false;
430}
431
432/// Match an fpext from half to float, or a constant we can convert.
/// Returns the half-typed source of a single-use fpext, or a half constant
/// when the FP constant converts without losing information; nullptr otherwise.
/// NOTE(review): the signature (listing line 433) and the conversion call
/// (listing line 442) are missing from this listing.
434 Value *Src = nullptr;
435 ConstantFP *CFP = nullptr;
436 if (match(Arg, m_OneUse(m_FPExt(m_Value(Src))))) {
437 if (Src->getType()->isHalfTy())
438 return Src;
439 } else if (match(Arg, m_ConstantFP(CFP))) {
440 bool LosesInfo;
441 APFloat Val(CFP->getValueAPF());
443 if (!LosesInfo)
444 return ConstantFP::get(Type::getHalfTy(Arg->getContext()), Val);
445 }
446 return nullptr;
447}
448
449// Trim all zero components from the end of the vector \p UseV and return
450// an appropriate bitset with known elements.
// The returned mask starts with all bits set; bits are cleared from the last
// element downwards while the element is a null or undef constant. Element 0
// is never cleared.
// NOTE(review): the first signature line (listing line 451) is missing from
// this listing.
452 Instruction *I) {
453 auto *VTy = cast<FixedVectorType>(UseV->getType());
454 unsigned VWidth = VTy->getNumElements();
455 APInt DemandedElts = APInt::getAllOnes(VWidth);
456
457 for (int i = VWidth - 1; i > 0; --i) {
458 auto *Elt = findScalarElement(UseV, i);
 // Stop at the first element that cannot be identified.
459 if (!Elt)
460 break;
461
462 if (auto *ConstElt = dyn_cast<Constant>(Elt)) {
463 if (!ConstElt->isNullValue() && !isa<UndefValue>(Elt))
464 break;
465 } else {
466 break;
467 }
468
469 DemandedElts.clearBit(i);
470 }
471
472 return DemandedElts;
473}
474
475// Trim elements from the end of the vector \p V, if they are
476// equal to the first element of the vector.
// Returns a mask with the trimmed trailing elements cleared; element 0 always
// stays set.
// NOTE(review): the signature (listing line 477) is missing from this listing.
478 auto *VTy = cast<FixedVectorType>(V->getType());
479 unsigned VWidth = VTy->getNumElements();
480 APInt DemandedElts = APInt::getAllOnes(VWidth);
481 Value *FirstComponent = findScalarElement(V, 0);
482
 // ShuffleMask stays empty unless V is a shufflevector.
483 SmallVector<int> ShuffleMask;
484 if (auto *SVI = dyn_cast<ShuffleVectorInst>(V))
485 SVI->getShuffleMask(ShuffleMask);
486
487 for (int I = VWidth - 1; I > 0; --I) {
488 if (ShuffleMask.empty()) {
489 auto *Elt = findScalarElement(V, I);
490 if (!Elt || (Elt != FirstComponent && !isa<UndefValue>(Elt)))
491 break;
492 } else {
493 // Detect identical elements in the shufflevector result, even though
494 // findScalarElement cannot tell us what that element is.
495 if (ShuffleMask[I] != ShuffleMask[0] && ShuffleMask[I] != PoisonMaskElem)
496 break;
497 }
498 DemandedElts.clearBit(I);
499 }
500
501 return DemandedElts;
502}
503
// Tail of a forward declaration; DMaskIdx defaults to -1 and IsLoad to true.
// NOTE(review): the leading lines of this declaration (listing lines 504-505)
// are missing from this listing.
506 APInt DemandedElts,
507 int DMaskIdx = -1,
508 bool IsLoad = true);
509
510/// Return true if it's legal to contract llvm.amdgcn.rcp(llvm.sqrt)
511static bool canContractSqrtToRsq(const FPMathOperator *SqrtOp) {
512 return (SqrtOp->getType()->isFloatTy() &&
513 (SqrtOp->hasApproxFunc() || SqrtOp->getFPAccuracy() >= 1.0f)) ||
514 SqrtOp->getType()->isHalfTy();
515}
516
517/// Return true if we can easily prove that use U is uniform.
/// Constants are uniform; an always-uniform intrinsic result counts only when
/// it is in the same block as the user of U.
518static bool isTriviallyUniform(const Use &U) {
519 Value *V = U.get();
520 if (isa<Constant>(V))
521 return true;
 // NOTE(review): the statement guarded by this Argument check (listing line
 // 523) is missing from this listing.
522 if (const auto *A = dyn_cast<Argument>(V))
524 if (const auto *II = dyn_cast<IntrinsicInst>(V)) {
525 if (!AMDGPU::isIntrinsicAlwaysUniform(II->getIntrinsicID()))
526 return false;
527 // If II and U are in different blocks then there is a possibility of
528 // temporal divergence.
529 return II->getParent() == cast<Instruction>(U.getUser())->getParent();
530 }
531 return false;
532}
533
534/// Simplify a lane index operand (e.g. llvm.amdgcn.readlane src1).
535///
536/// The instruction only reads the low 5 bits for wave32, and 6 bits for wave64.
/// Returns true if the operand was simplified or replaced by a masked constant.
/// NOTE(review): the first signature lines (listing lines 537-538) are missing
/// from this listing.
539 unsigned LaneArgIdx) const {
 // Only the low log2(wavefront size) bits of the 32-bit index are demanded.
540 unsigned MaskBits = ST->getWavefrontSizeLog2();
541 APInt DemandedMask(32, maskTrailingOnes<unsigned>(MaskBits));
542
543 KnownBits Known(32);
544 if (IC.SimplifyDemandedBits(&II, LaneArgIdx, DemandedMask, Known))
545 return true;
546
547 if (!Known.isConstant())
548 return false;
549
550 // Out of bounds indexes may appear in wave64 code compiled for wave32.
551 // Unlike the DAG version, SimplifyDemandedBits does not change constants, so
552 // manually fix it up.
553
554 Value *LaneArg = II.getArgOperand(LaneArgIdx);
555 Constant *MaskedConst =
556 ConstantInt::get(LaneArg->getType(), Known.getConstant() & DemandedMask);
 // Report a change only if masking actually altered the operand.
557 if (MaskedConst != LaneArg) {
558 II.getOperandUse(LaneArgIdx).set(MaskedConst);
559 return true;
560 }
561
562 return false;
563}
564
// Creates a call to NewCallee with operands Ops, carrying over Old's operand
// bundles and taking over Old's name. Old itself is not erased here.
// NOTE(review): the first signature line (listing line 565) and the
// declaration of OpBundles (listing line 567) are missing from this listing.
566 Function &NewCallee, ArrayRef<Value *> Ops) {
568 Old.getOperandBundlesAsDefs(OpBundles);
569
570 CallInst *NewCall = B.CreateCall(&NewCallee, Ops, OpBundles);
571 NewCall->takeName(&Old);
572 return NewCall;
573}
574
575// Return true for sequences of instructions that effectively assign
576// each lane to its thread ID
// The wave32 pattern is accepted only on wave32 subtargets and the wave64
// pattern only on wave64 subtargets.
577static bool isThreadID(const GCNSubtarget &ST, Value *V) {
578 // Case 1:
579 // wave32: mbcnt_lo(-1, 0)
580 // wave64: mbcnt_hi(-1, mbcnt_lo(-1, 0))
 // NOTE(review): the definitions of W32Pred and W64Pred (listing lines
 // 581-585) are missing from this listing.
586 if (ST.isWave32() && match(V, W32Pred))
587 return true;
588 if (ST.isWave64() && match(V, W64Pred))
589 return true;
590
591 return false;
592}
593
// Tries to hoist a readlane / readfirstlane / permlane64 call II above the
// instruction defining its first operand: (II (OpInst x)) -> (OpInst (II x)).
// Returns the new, not-yet-inserted clone of OpInst, or nullptr when the
// transform does not apply.
// NOTE(review): the first signature lines (listing lines 594-595) and the
// declaration of Ops in the lambda (listing line 632) are missing from this
// listing.
596 IntrinsicInst &II) const {
597 const auto IID = II.getIntrinsicID();
598 assert(IID == Intrinsic::amdgcn_readlane ||
599 IID == Intrinsic::amdgcn_readfirstlane ||
600 IID == Intrinsic::amdgcn_permlane64);
601
602 Instruction *OpInst = dyn_cast<Instruction>(II.getOperand(0));
603
604 // Only do this if both instructions are in the same block
605 // (so the exec mask won't change) and the readlane is the only user of its
606 // operand.
607 if (!OpInst || !OpInst->hasOneUser() || OpInst->getParent() != II.getParent())
608 return nullptr;
609
610 const bool IsReadLane = (IID == Intrinsic::amdgcn_readlane);
611
612 // If this is a readlane, check that the second operand is a constant, or is
613 // defined before OpInst so we know it's safe to move this intrinsic higher.
614 Value *LaneID = nullptr;
615 if (IsReadLane) {
616 LaneID = II.getOperand(1);
617
618 // readlane take an extra operand for the lane ID, so we must check if that
619 // LaneID value can be used at the point where we want to move the
620 // intrinsic.
621 if (auto *LaneIDInst = dyn_cast<Instruction>(LaneID)) {
622 if (!IC.getDominatorTree().dominates(LaneIDInst, OpInst))
623 return nullptr;
624 }
625 }
626
627 // Hoist the intrinsic (II) through OpInst.
628 //
629 // (II (OpInst x)) -> (OpInst (II x))
 // OpIdx is the operand of OpInst that receives the hoisted intrinsic result.
630 const auto DoIt = [&](unsigned OpIdx,
631 Function *NewIntrinsic) -> Instruction * {
633 if (IsReadLane)
634 Ops.push_back(LaneID);
635
636 // Rewrite the intrinsic call.
637 CallInst *NewII = rewriteCall(IC.Builder, II, *NewIntrinsic, Ops);
638
639 // Rewrite OpInst so it takes the result of the intrinsic now.
640 Instruction &NewOp = *OpInst->clone();
641 NewOp.setOperand(OpIdx, NewII);
642 return &NewOp;
643 };
644
645 // TODO(?): Should we do more with permlane64?
646 if (IID == Intrinsic::amdgcn_permlane64 && !isa<BitCastInst>(OpInst))
647 return nullptr;
648
649 if (isa<UnaryOperator>(OpInst))
650 return DoIt(0, II.getCalledFunction());
651
 // A cast changes the value type, so the intrinsic is re-declared for the
 // cast's source type.
652 if (isa<CastInst>(OpInst)) {
653 Value *Src = OpInst->getOperand(0);
654 Type *SrcTy = Src->getType();
655 if (!isTypeLegal(SrcTy))
656 return nullptr;
657
658 Function *Remangled =
659 Intrinsic::getOrInsertDeclaration(II.getModule(), IID, {SrcTy});
660 return DoIt(0, Remangled);
661 }
662
663 // We can also hoist through binary operators if the other operand is uniform.
664 if (isa<BinaryOperator>(OpInst)) {
665 // FIXME: If we had access to UniformityInfo here we could just check
666 // if the operand is uniform.
667 if (isTriviallyUniform(OpInst->getOperandUse(0)))
668 return DoIt(1, II.getCalledFunction());
669 if (isTriviallyUniform(OpInst->getOperandUse(1)))
670 return DoIt(0, II.getCalledFunction());
671 }
672
673 return nullptr;
674}
675
676/// Evaluate V as a function of the lane ID and return its value on Lane, or
677/// std::nullopt if V is not a closed-form expression of the lane ID.
/// Constant integers evaluate to their zero-extended value and thread-ID
/// expressions to Lane; otherwise both operands of BO are evaluated
/// recursively with Depth + 1.
/// NOTE(review): the depth-limit condition (listing line 682), the definition
/// of BO (line 696) and the folding call (line 712) are missing from this
/// listing.
678static std::optional<unsigned> evalLaneExpr(Value *V, unsigned Lane,
679 const GCNSubtarget &ST,
680 const DataLayout &DL,
681 unsigned Depth = 0) {
683 return std::nullopt;
684
685 // Poison/undef in the index expression: bail and let InstCombine fold the
686 // intrinsic the usual way.
687 if (isa<UndefValue>(V))
688 return std::nullopt;
689
690 if (const ConstantInt *CI = dyn_cast<ConstantInt>(V))
691 return CI->getZExtValue();
692
693 if (isThreadID(ST, V))
694 return Lane;
695
697 if (!BO)
698 return std::nullopt;
699
700 std::optional<unsigned> LHS =
701 evalLaneExpr(BO->getOperand(0), Lane, ST, DL, Depth + 1);
702 if (!LHS)
703 return std::nullopt;
704 std::optional<unsigned> RHS =
705 evalLaneExpr(BO->getOperand(1), Lane, ST, DL, Depth + 1);
706 if (!RHS)
707 return std::nullopt;
708
 // Re-materialize both operand values as constants of BO's type.
709 Type *Ty = BO->getType();
710 Constant *Ops[] = {ConstantInt::get(Ty, *LHS), ConstantInt::get(Ty, *RHS)};
711 auto *CI =
713 return CI ? std::optional<unsigned>(CI->getZExtValue()) : std::nullopt;
714}
715
716/// Build the per-lane shuffle map by evaluating Index for every lane in the
717/// wave. Returns false if any lane index is non-constant or out of range.
/// On success Ids holds one entry per lane of the wave (Ids[Lane] is the
/// evaluated index). On failure Ids has been resized and may be partly filled.
/// NOTE(review): the signature line declaring Ids (listing line 719) is
/// missing from this listing.
718static bool tryBuildShuffleMap(Value *Index, const GCNSubtarget &ST,
720 const DataLayout &DL) {
721 unsigned WaveSize = ST.getWavefrontSize();
722 Ids.resize(WaveSize);
723 for (unsigned Lane : seq(WaveSize)) {
724 std::optional<unsigned> Val = evalLaneExpr(Index, Lane, ST, DL);
725 if (!Val || *Val >= WaveSize)
726 return false;
727 Ids[Lane] = *Val;
728 }
729 return true;
730}
731
732/// Lanes are partitioned into groups of Period; each group is a translated
733/// copy of the first: Ids[I] = Ids[I % Period] + (I & ~(Period - 1)).
/// Returns true trivially when Ids has at most Period entries, because the
/// loop starts at index Period.
/// NOTE(review): the signature line (listing line 735) is missing from this
/// listing.
734template <unsigned Period>
736 static_assert(isPowerOf2_32(Period), "Period must be a power of two");
737 for (unsigned I = Period, E = Ids.size(); I < E; ++I)
738 if (Ids[I] != Ids[I % Period] + (I & ~(Period - 1)))
739 return false;
740 return true;
741}
742
743/// Match an N-lane row pattern: each lane in [0, N) reads from a source lane
744/// in the same N-lane row, and the pattern repeats periodically across rows.
745template <unsigned N> static bool isRowPattern(ArrayRef<uint8_t> Ids) {
746 for (unsigned I = 0; I < N; ++I)
747 if (Ids[I] >= N)
748 return false;
749 return hasPeriodicLayout<N>(Ids);
750}
751
// Named instantiations of isRowPattern for 4-, 8- and 16-lane rows.
752static constexpr auto isQuadPattern = isRowPattern<4>;
753static constexpr auto isHalfRowPattern = isRowPattern<8>;
754static constexpr auto isFullRowPattern = isRowPattern<16>;
755
756/// Match a 4-lane (quad) permutation, encoded as the v_mov_b32_dpp
757/// QUAD_PERM control word: bits[1:0]=Ids[0], [3:2]=Ids[1], [5:4]=Ids[2],
758/// [7:6]=Ids[3].
759static std::optional<unsigned> matchQuadPermPattern(ArrayRef<uint8_t> Ids) {
760 if (!isQuadPattern(Ids))
761 return std::nullopt;
762 return Ids[3] << 6 | Ids[2] << 4 | Ids[1] << 2 | Ids[0];
763}
764
765/// Match an N-lane reversal (mirror) pattern.
766template <unsigned N> static bool matchMirrorPattern(ArrayRef<uint8_t> Ids) {
767 if (!isRowPattern<N>(Ids))
768 return false;
769 for (unsigned J = 0; J < N; ++J)
770 if (Ids[J] != (N - 1) - J)
771 return false;
772 return true;
773}
774
777
778/// Match a 16-lane cyclic rotation; returns the rotation amount in [1, 15].
779static std::optional<unsigned> matchRowRotatePattern(ArrayRef<uint8_t> Ids) {
780 if (Ids[0] == 0 || !isFullRowPattern(Ids))
781 return std::nullopt;
782 for (unsigned J = 1; J < 16; ++J)
783 if (Ids[J] != (Ids[0] + J) % 16)
784 return std::nullopt;
785 return 16u - Ids[0];
786}
787
788/// Match a row-share pattern: all 16 lanes of each row read the same source
789/// lane. Returns the shared source lane index in [0, 16).
790static std::optional<unsigned> matchRowSharePattern(ArrayRef<uint8_t> Ids) {
791 if (!isFullRowPattern(Ids))
792 return std::nullopt;
793 if (!all_equal(Ids.take_front(16)))
794 return std::nullopt;
795 return Ids[0];
796}
797
798/// Match an XOR mask pattern within each 16-lane row: Ids[J] == Mask ^ J,
799/// with Mask in [1, 15].
800static std::optional<unsigned> matchRowXMaskPattern(ArrayRef<uint8_t> Ids) {
801 unsigned Mask = Ids[0];
802 if (Mask == 0 || !isFullRowPattern(Ids))
803 return std::nullopt;
804 for (unsigned J = 0; J < 16; ++J)
805 if (Ids[J] != (Mask ^ J))
806 return std::nullopt;
807 return Mask;
808}
809
810/// Match an 8-lane arbitrary permutation, encoded as the v_mov_b32_dpp8
811/// 24-bit selector (three bits per output lane).
812static std::optional<unsigned> matchHalfRowPermPattern(ArrayRef<uint8_t> Ids) {
813 if (!isHalfRowPattern(Ids))
814 return std::nullopt;
815 unsigned Selector = 0;
816 for (unsigned J = 0; J < 8; ++J)
817 Selector |= Ids[J] << (J * 3);
818 return Selector;
819}
820
821/// Pack a 16-lane permutation into a single 64-bit value: four bits per output
822/// lane, lane J in bits [J*4 + 3 : J*4]. The caller splits it into the low and
823/// high 32-bit selector operands of v_permlane16 / v_permlanex16.
/// Only the low four bits of each Ids entry are used.
/// NOTE(review): the signature (listing line 824) is missing from this listing.
825 uint64_t Sel = 0;
826 for (unsigned J = 0; J < 16; ++J)
827 Sel |= static_cast<uint64_t>(Ids[J] & 0xF) << (J * 4);
828 return Sel;
829}
830
831/// Match a half-wave swap: lane J reads from lane J ^ 32. Only meaningful on
832/// wave64 targets.
/// Any map that does not have exactly 64 entries is rejected.
/// NOTE(review): the signature (listing line 833) is missing from this listing.
834 if (Ids.size() != 64)
835 return false;
836 for (unsigned J = 0; J < 64; ++J)
837 if (Ids[J] != (J ^ 32))
838 return false;
839 return true;
840}
841
842/// Match a cross-row permutation suitable for v_permlanex16: every lane in
843/// the low 16-lane half reads from the high half of its own row, and vice
844/// versa.
/// Concretely: the map must repeat with period 32, lanes 0..15 must read from
/// lanes 16..31, and lane J + 16 must read from lane Ids[J] - 16.
/// NOTE(review): the signature (listing line 845) is missing from this listing.
846 if (!hasPeriodicLayout<32>(Ids))
847 return false;
848 for (unsigned J = 0; J < 16; ++J) {
849 if (Ids[J] < 16 || Ids[J] >= 32)
850 return false;
851 if (Ids[J + 16] != Ids[J] - 16)
852 return false;
853 }
854 return true;
855}
856
857/// Match a DS_SWIZZLE bitmask-mode permutation:
858/// dst_lane = ((src_lane & AND) | OR) ^ XOR
859/// with each mask being five bits. Returns the encoded swizzle immediate.
860/// The hardware applies the formula independently within each 32-lane group,
861/// so on wave64 the high group must replicate the low one (translated by 32).
/// Returns std::nullopt when the map is not periodic over 32 lanes or the
/// derived masks do not reproduce all 32 entries.
/// NOTE(review): the signature line naming the function (listing line 863) and
/// the final encoding/return (listing lines 891-894) are missing from this
/// listing.
862static std::optional<unsigned>
864 if (!hasPeriodicLayout<32>(Ids))
865 return std::nullopt;
866
867 // The formula is per-bit: output bit B depends only on input bit B. Probe
868 // each bit with src=0 and src=(1<<B); if the output bit flipped, AND[B]=1
869 // and XOR[B] carries the constant offset; otherwise it is a constant bit
870 // encoded in OR (with AND[B]=0, XOR[B]=0).
871 unsigned AndMask = 0, OrMask = 0, XorMask = 0;
872 for (unsigned B = 0; B < 5; ++B) {
873 unsigned Bit0 = (Ids[0] >> B) & 1;
874 unsigned Bit1 = (Ids[1u << B] >> B) & 1;
875 if (Bit0 != Bit1) {
876 AndMask |= 1u << B;
877 XorMask |= Bit0 << B;
878 } else {
879 OrMask |= Bit0 << B;
880 }
881 }
882
883 // The per-bit derivation assumes bit independence; verify the masks
884 // actually reproduce every lane in the 32-lane group.
885 for (unsigned I : seq(32u)) {
886 unsigned Expected = ((I & AndMask) | OrMask) ^ XorMask;
887 if (Ids[I] != Expected)
888 return std::nullopt;
889 }
890
895}
896
897/// Match a GFX9+ DS_SWIZZLE rotate-mode permutation: a cyclic left-rotation
898/// of all 32 lanes within each 32-lane group by a constant N in [0, 31],
899/// i.e. dst_lane = (src_lane + N) % 32. On wave64, hasPeriodicLayout<32>
900/// ensures both 32-lane groups rotate by the same amount.
/// Returns std::nullopt when the map is not such a rotation.
/// NOTE(review): the signature line naming the function (listing line 902) and
/// the final encoding/return (listing lines 916-917) are missing from this
/// listing.
901static std::optional<unsigned>
903 if (!hasPeriodicLayout<32>(Ids))
904 return std::nullopt;
905
906 // Determine the rotation amount from lane 0: every lane must read from
907 // lane (I + N) % 32 where N = Ids[0] and 0 <= N <= 31.
908 unsigned N = Ids[0];
909 if (N >= 32)
910 return std::nullopt;
911
912 for (unsigned I = 0; I < 32; ++I)
913 if (Ids[I] != (I + N) % 32)
914 return std::nullopt;
915
918}
919
920/// Emit v_mov_b32_dpp with the given control word, row/bank masks 0xF, and
921/// bound_ctrl=1 so out-of-bounds lanes are well-defined and the DPP mov can
922/// be folded into a consuming VALU op by GCNDPPCombine.
923static Value *createUpdateDpp(IRBuilderBase &B, Value *Val, unsigned Ctrl) {
924 Type *Ty = Val->getType();
925 return B.CreateIntrinsic(Intrinsic::amdgcn_update_dpp, {Ty},
926 {PoisonValue::get(Ty), Val, B.getInt32(Ctrl),
927 B.getInt32(0xF), B.getInt32(0xF), B.getTrue()});
928}
929
930/// Emit v_mov_b32_dpp8 with the given 24-bit lane selector.
931static Value *createMovDpp8(IRBuilderBase &B, Value *Val, unsigned Selector) {
932 return B.CreateIntrinsic(Intrinsic::amdgcn_mov_dpp8, {Val->getType()},
933 {Val, B.getInt32(Selector)});
934}
935
936/// Emit v_permlane16 with the precomputed lane-select halves.
/// Operands passed: poison, Val, Lo, Hi, and two false flags.
/// NOTE(review): the first signature line (listing line 937) is missing from
/// this listing.
938 uint32_t Hi) {
939 Type *Ty = Val->getType();
940 return B.CreateIntrinsic(Intrinsic::amdgcn_permlane16, {Ty},
941 {PoisonValue::get(Ty), Val, B.getInt32(Lo),
942 B.getInt32(Hi), B.getFalse(), B.getFalse()});
943}
944
945/// Emit v_permlanex16 with the precomputed lane-select halves. Each output
946/// lane reads from the other 16-lane half of the same row.
/// Operands passed: poison, Val, Lo, Hi, and two false flags.
/// NOTE(review): the first signature line (listing line 947) is missing from
/// this listing.
948 uint32_t Hi) {
949 Type *Ty = Val->getType();
950 return B.CreateIntrinsic(Intrinsic::amdgcn_permlanex16, {Ty},
951 {PoisonValue::get(Ty), Val, B.getInt32(Lo),
952 B.getInt32(Hi), B.getFalse(), B.getFalse()});
953}
954
955/// Emit ds_swizzle with the given immediate, bitcasting/converting between
956/// pointer/float types and i32 as required by the intrinsic signature.
/// Val must be 32 bits wide (asserted). The result is converted back to Val's
/// original type.
/// NOTE(review): the first signature line (listing line 957) is missing from
/// this listing.
958 const DataLayout &DL) {
959 Type *OrigTy = Val->getType();
960 assert(DL.getTypeSizeInBits(OrigTy) == 32 &&
961 "ds_swizzle only supports 32-bit operands");
962 IntegerType *I32Ty = B.getInt32Ty();
 // Bring the operand to i32: ptrtoint for pointers, bitcast for other types.
963 Value *Src = Val;
964 if (OrigTy->isPointerTy())
965 Src = B.CreatePtrToInt(Src, I32Ty);
966 else if (OrigTy != I32Ty)
967 Src = B.CreateBitCast(Src, I32Ty);
968 Value *Result = B.CreateIntrinsic(Intrinsic::amdgcn_ds_swizzle, {},
969 {Src, B.getInt32(Offset)});
 // Undo the conversion on the result.
970 if (OrigTy->isPointerTy())
971 return B.CreateIntToPtr(Result, OrigTy);
972 if (OrigTy != I32Ty)
973 return B.CreateBitCast(Result, OrigTy);
974 return Result;
975}
976
977/// Emit v_permlane64 (swap of the two 32-lane halves of a wave64).
/// NOTE(review): the signature (listing line 978) is missing from this listing.
979 return B.CreateIntrinsic(Intrinsic::amdgcn_permlane64, {Val->getType()},
980 {Val});
981}
982
983/// Given a shuffle map, try to emit the best hardware intrinsic.
// Ids[i] is compared against the index i below, i.e. it is the lane that
// output lane i reads from. The function returns:
//   - Src itself when the map is the identity,
//   - the value built by the first matching pattern below, or
//   - nullptr when nothing matches (this includes the uniform map, which is
//     deliberately left to other folds).
// Patterns are tried in the order written and the first match wins, so the
// order of the checks is part of the behavior.
// NOTE(review): listing lines 984-985 (the opening of this signature) are
// missing from this extract. The call site passes (IC.Builder, Src, Ids, ST,
// DL), so B, Src and Ids used below are presumably parameters declared on the
// dropped lines -- confirm against the upstream file.
986 const GCNSubtarget &ST,
987 const DataLayout &DL) {
988 // Identity shuffle (every lane reads itself) folds to the source value.
989 if (all_of(enumerate(Ids),
990 [](const auto &E) { return E.value() == E.index(); }))
991 return Src;
992
993 // Uniform shuffle (all lanes read the same value) is handled by cheaper
994 // broadcast/readlane intrinsics.
995 if (all_equal(Ids))
996 return nullptr;
997
// Quad-perm pattern: with DPP available the matched value is passed straight
// to createUpdateDpp as the DPP control.
998 if (std::optional<unsigned> QP = matchQuadPermPattern(Ids)) {
999 if (ST.hasDPP())
1000 return createUpdateDpp(B, Src, *QP);
// NOTE(review): listing line 1001 is missing from this extract; it is
// presumably the fallback used for a quad-perm match on targets without DPP --
// confirm against the upstream file.
1002 }
1003
1004 if (ST.hasDPP()) {
// NOTE(review): listing lines 1005-1008 are missing from this extract, so
// whatever precedes the row-rotate check inside this block is not visible here.
// The DPP control for a matched rotate amount is ROW_ROR_FIRST + Amt - 1.
1009 if (std::optional<unsigned> Amt = matchRowRotatePattern(Ids))
1010 return createUpdateDpp(B, Src, AMDGPU::DPP::ROW_ROR_FIRST + *Amt - 1);
1011 }
1012
1013 // row_share is supported on GFX90A and GFX10+; row_xmask is GFX10+ only.
1014 if (ST.hasDPPRowShare()) {
1015 if (std::optional<unsigned> Lane = matchRowSharePattern(Ids))
1016 return createUpdateDpp(B, Src, AMDGPU::DPP::ROW_SHARE_FIRST + *Lane);
1017 }
1018
1019 if (ST.hasDPP() && ST.hasGFX10Insts()) {
1020 if (std::optional<unsigned> Mask = matchRowXMaskPattern(Ids))
1021 return createUpdateDpp(B, Src, AMDGPU::DPP::ROW_XMASK_FIRST + *Mask);
1022 }
1023
// DPP8: the matched selector is handed to createMovDpp8 unchanged.
1024 if (ST.hasDPP8()) {
1025 if (std::optional<unsigned> Sel = matchHalfRowPermPattern(Ids))
1026 return createMovDpp8(B, Src, *Sel);
1027 }
1028
// permlane16 / permlanex16: Sel is split into its low and high 32-bit halves
// with Lo_32/Hi_32 before being passed on.
1029 if (ST.hasPermlane16Insts()) {
1030 if (isFullRowPattern(Ids)) {
// NOTE(review): listing line 1031 is missing from this extract; it presumably
// computes the selector `Sel` used on the next line -- confirm.
1032 return createPermlane16(B, Src, Lo_32(Sel), Hi_32(Sel));
1033 }
1034 // Cross-row shuffles (e.g. XOR 16..31) — covered by permlanex16.
1035 if (isCrossRowPattern(Ids)) {
// NOTE(review): listing line 1036 is missing from this extract; it presumably
// computes the selector `Sel` used on the next line -- confirm.
1037 return createPermlaneX16(B, Src, Lo_32(Sel), Hi_32(Sel));
1038 }
1039 }
1040
1041 // Generic DS_SWIZZLE bitmask-mode fallback: handles any 32-lane shuffle that
1042 // can be expressed as dst = ((src & AND) | OR) ^ XOR with 5-bit masks. This
1043 // is available on every target that has ds_swizzle.
1044 if (std::optional<unsigned> Imm = matchDsSwizzleBitmaskPattern(Ids))
1045 return createDsSwizzle(B, Src, *Imm, DL);
1046
1047 // DS_SWIZZLE rotate mode (GFX9+): handles cyclic 32-lane rotations that
1048 // bitmask mode cannot express (e.g. +1 mod 32 requires inter-bit carry).
1049 if (ST.hasDsSwizzleRotateMode()) {
1050 if (std::optional<unsigned> Imm = matchDsSwizzleRotatePattern(Ids))
1051 return createDsSwizzle(B, Src, *Imm, DL);
1052 }
1053
// Last resort: a half-wave swap on targets that have permlane64.
1054 if (ST.hasPermLane64() && matchHalfWaveSwapPattern(Ids))
1055 return createPermlane64(B, Src);
1056
// No hardware pattern matched; the caller leaves the original call untouched.
1057 return nullptr;
1058}
1059
1060/// Try to fold a wave_shuffle/ds_bpermute whose lane index is a constant
1061/// function of the lane ID into a hardware-specific lane permutation intrinsic.
// Returns std::nullopt when the call is left unchanged (result type is not 32
// bits wide, wave size unknown, the per-lane index cannot be evaluated, or no
// hardware pattern matches); otherwise returns the result of replacing all
// uses of II with the newly built value.
1062static std::optional<Instruction *>
// NOTE(review): listing line 1063 (function name and leading parameters) is
// missing from this extract. Callers invoke tryOptimizeShufflePattern(IC, II,
// *ST), so IC and II used below are presumably declared on the dropped line --
// confirm against the upstream file.
1064 const GCNSubtarget &ST) {
1065 const DataLayout &DL = IC.getDataLayout();
// Only values that are exactly 32 bits wide are handled.
1066 if (DL.getTypeSizeInBits(II.getType()) != 32)
1067 return std::nullopt;
1068
// The lane map built below has one entry per lane, so the wave size must be
// known.
1069 if (!ST.isWaveSizeKnown())
1070 return std::nullopt;
1071
1072 unsigned WaveSize = ST.getWavefrontSize();
1073 bool IsBpermute = II.getIntrinsicID() == Intrinsic::amdgcn_ds_bpermute;
// Operand order differs between the two intrinsics: ds_bpermute is
// (index, data) while the other handled intrinsic is (data, index).
1074 Value *Src = II.getArgOperand(IsBpermute ? 1 : 0);
1075 Value *Index = II.getArgOperand(IsBpermute ? 0 : 1);
1076
// NOTE(review): listing line 1077 is missing from this extract; it presumably
// declares the per-lane source map `Ids` filled in below -- confirm.
1078 if (IsBpermute) {
1079 Ids.resize(WaveSize);
// Evaluate the index expression once per lane. The evaluated value is treated
// as lane * 4: a failed evaluation, nonzero low two bits, or a lane number
// outside the wave abandons the fold.
1080 for (unsigned Lane : seq(WaveSize)) {
1081 std::optional<unsigned> Val = evalLaneExpr(Index, Lane, ST, DL);
1082 if (!Val || (*Val & 3) || (*Val >> 2) >= WaveSize)
1083 return std::nullopt;
1084 Ids[Lane] = *Val >> 2;
1085 }
1086 } else {
1087 if (!tryBuildShuffleMap(Index, ST, Ids, DL))
1088 return std::nullopt;
1089 }
1090
// A null result means no hardware pattern matched the lane map.
1091 Value *Result = matchShuffleToHWIntrinsic(IC.Builder, Src, Ids, ST, DL);
1092 if (!Result)
1093 return std::nullopt;
1094
1095 return IC.replaceInstUsesWith(II, Result);
1096}
1097std::optional<Instruction *>
1099 Intrinsic::ID IID = II.getIntrinsicID();
1100 switch (IID) {
1101 case Intrinsic::amdgcn_implicitarg_ptr: {
1102 if (II.getFunction()->hasFnAttribute("amdgpu-no-implicitarg-ptr"))
1103 return IC.replaceInstUsesWith(II, PoisonValue::get(II.getType()));
1104 uint64_t ImplicitArgBytes = ST->getImplicitArgNumBytes(*II.getFunction());
1105
1106 uint64_t CurrentOrNullBytes =
1107 II.getAttributes().getRetDereferenceableOrNullBytes();
1108 if (CurrentOrNullBytes != 0) {
1109 // Refine "dereferenceable (A) meets dereferenceable_or_null(B)"
1110 // into dereferenceable(max(A, B))
1111 uint64_t NewBytes = std::max(CurrentOrNullBytes, ImplicitArgBytes);
1112 II.addRetAttr(
1113 Attribute::getWithDereferenceableBytes(II.getContext(), NewBytes));
1114 II.removeRetAttr(Attribute::DereferenceableOrNull);
1115 return &II;
1116 }
1117
1118 uint64_t CurrentBytes = II.getAttributes().getRetDereferenceableBytes();
1119 uint64_t NewBytes = std::max(CurrentBytes, ImplicitArgBytes);
1120 if (NewBytes != CurrentBytes) {
1121 II.addRetAttr(
1122 Attribute::getWithDereferenceableBytes(II.getContext(), NewBytes));
1123 return &II;
1124 }
1125
1126 return std::nullopt;
1127 }
1128 case Intrinsic::amdgcn_rcp: {
1129 Value *Src = II.getArgOperand(0);
1130 if (isa<PoisonValue>(Src))
1131 return IC.replaceInstUsesWith(II, Src);
1132
1133 // TODO: Move to ConstantFolding/InstSimplify?
1134 if (isa<UndefValue>(Src)) {
1135 Type *Ty = II.getType();
1136 auto *QNaN = ConstantFP::get(Ty, APFloat::getQNaN(Ty->getFltSemantics()));
1137 return IC.replaceInstUsesWith(II, QNaN);
1138 }
1139
1140 if (II.isStrictFP())
1141 break;
1142
1143 if (const ConstantFP *C = dyn_cast<ConstantFP>(Src)) {
1144 const APFloat &ArgVal = C->getValueAPF();
1145 APFloat Val(ArgVal.getSemantics(), 1);
1147
1148 // This is more precise than the instruction may give.
1149 //
1150 // TODO: The instruction always flushes denormal results (except for f16),
1151 // should this also?
1152 return IC.replaceInstUsesWith(II, ConstantFP::get(II.getContext(), Val));
1153 }
1154
1155 FastMathFlags FMF = cast<FPMathOperator>(II).getFastMathFlags();
1156 if (!FMF.allowContract())
1157 break;
1158 auto *SrcCI = dyn_cast<IntrinsicInst>(Src);
1159 if (!SrcCI)
1160 break;
1161
1162 auto IID = SrcCI->getIntrinsicID();
1163 // llvm.amdgcn.rcp(llvm.amdgcn.sqrt(x)) -> llvm.amdgcn.rsq(x) if contractable
1164 //
1165 // llvm.amdgcn.rcp(llvm.sqrt(x)) -> llvm.amdgcn.rsq(x) if contractable and
1166 // relaxed.
1167 if (IID == Intrinsic::amdgcn_sqrt || IID == Intrinsic::sqrt) {
1168 const FPMathOperator *SqrtOp = cast<FPMathOperator>(SrcCI);
1169 FastMathFlags InnerFMF = SqrtOp->getFastMathFlags();
1170 if (!InnerFMF.allowContract() || !SrcCI->hasOneUse())
1171 break;
1172
1173 if (IID == Intrinsic::sqrt && !canContractSqrtToRsq(SqrtOp))
1174 break;
1175
1177 SrcCI->getModule(), Intrinsic::amdgcn_rsq, {SrcCI->getType()});
1178
1179 InnerFMF |= FMF;
1180 II.setFastMathFlags(InnerFMF);
1181
1182 II.setCalledFunction(NewDecl);
1183 return IC.replaceOperand(II, 0, SrcCI->getArgOperand(0));
1184 }
1185
1186 break;
1187 }
1188 case Intrinsic::amdgcn_sqrt:
1189 case Intrinsic::amdgcn_rsq:
1190 case Intrinsic::amdgcn_tanh: {
1191 Value *Src = II.getArgOperand(0);
1192 if (isa<PoisonValue>(Src))
1193 return IC.replaceInstUsesWith(II, Src);
1194
1195 // TODO: Move to ConstantFolding/InstSimplify?
1196 if (isa<UndefValue>(Src)) {
1197 Type *Ty = II.getType();
1198 auto *QNaN = ConstantFP::get(Ty, APFloat::getQNaN(Ty->getFltSemantics()));
1199 return IC.replaceInstUsesWith(II, QNaN);
1200 }
1201
1202 // f16 amdgcn.sqrt is identical to regular sqrt.
1203 if (IID == Intrinsic::amdgcn_sqrt && Src->getType()->isHalfTy()) {
1205 II.getModule(), Intrinsic::sqrt, {II.getType()});
1206 II.setCalledFunction(NewDecl);
1207 return &II;
1208 }
1209
1210 break;
1211 }
1212 case Intrinsic::amdgcn_log:
1213 case Intrinsic::amdgcn_exp2: {
1214 const bool IsLog = IID == Intrinsic::amdgcn_log;
1215 const bool IsExp = IID == Intrinsic::amdgcn_exp2;
1216 Value *Src = II.getArgOperand(0);
1217 Type *Ty = II.getType();
1218
1219 if (isa<PoisonValue>(Src))
1220 return IC.replaceInstUsesWith(II, Src);
1221
1222 if (IC.getSimplifyQuery().isUndefValue(Src))
1224
1225 if (ConstantFP *C = dyn_cast<ConstantFP>(Src)) {
1226 if (C->isInfinity()) {
1227 // exp2(+inf) -> +inf
1228 // log2(+inf) -> +inf
1229 if (!C->isNegative())
1230 return IC.replaceInstUsesWith(II, C);
1231
1232 // exp2(-inf) -> 0
1233 if (IsExp && C->isNegative())
1235 }
1236
1237 if (II.isStrictFP())
1238 break;
1239
1240 if (C->isNaN()) {
1241 Constant *Quieted = ConstantFP::get(Ty, C->getValue().makeQuiet());
1242 return IC.replaceInstUsesWith(II, Quieted);
1243 }
1244
1245 // f32 instruction doesn't handle denormals, f16 does.
1246 if (C->isZero() || (C->getValue().isDenormal() && Ty->isFloatTy())) {
1247 Constant *FoldedValue = IsLog ? ConstantFP::getInfinity(Ty, true)
1248 : ConstantFP::get(Ty, 1.0);
1249 return IC.replaceInstUsesWith(II, FoldedValue);
1250 }
1251
1252 if (IsLog && C->isNegative())
1254
1255 // TODO: Full constant folding matching hardware behavior.
1256 }
1257
1258 break;
1259 }
1260 case Intrinsic::amdgcn_frexp_mant:
1261 case Intrinsic::amdgcn_frexp_exp: {
1262 Value *Src = II.getArgOperand(0);
1263 if (const ConstantFP *C = dyn_cast<ConstantFP>(Src)) {
1264 int Exp;
1265 APFloat Significand =
1266 frexp(C->getValueAPF(), Exp, APFloat::rmNearestTiesToEven);
1267
1268 if (IID == Intrinsic::amdgcn_frexp_mant) {
1269 return IC.replaceInstUsesWith(
1270 II, ConstantFP::get(II.getContext(), Significand));
1271 }
1272
1273 // Match instruction special case behavior.
1274 if (Exp == APFloat::IEK_NaN || Exp == APFloat::IEK_Inf)
1275 Exp = 0;
1276
1277 return IC.replaceInstUsesWith(II,
1278 ConstantInt::getSigned(II.getType(), Exp));
1279 }
1280
1281 if (isa<PoisonValue>(Src))
1282 return IC.replaceInstUsesWith(II, PoisonValue::get(II.getType()));
1283
1284 if (isa<UndefValue>(Src)) {
1285 return IC.replaceInstUsesWith(II, UndefValue::get(II.getType()));
1286 }
1287
1288 break;
1289 }
1290 case Intrinsic::amdgcn_class: {
1291 Value *Src0 = II.getArgOperand(0);
1292 Value *Src1 = II.getArgOperand(1);
1293 const ConstantInt *CMask = dyn_cast<ConstantInt>(Src1);
1294 if (CMask) {
1295 II.setCalledOperand(Intrinsic::getOrInsertDeclaration(
1296 II.getModule(), Intrinsic::is_fpclass, Src0->getType()));
1297
1298 // Clamp any excess bits, as they're illegal for the generic intrinsic.
1299 II.setArgOperand(1, ConstantInt::get(Src1->getType(),
1300 CMask->getZExtValue() & fcAllFlags));
1301 return &II;
1302 }
1303
1304 // Propagate poison.
1305 if (isa<PoisonValue>(Src0) || isa<PoisonValue>(Src1))
1306 return IC.replaceInstUsesWith(II, PoisonValue::get(II.getType()));
1307
1308 // llvm.amdgcn.class(_, undef) -> false
1309 if (IC.getSimplifyQuery().isUndefValue(Src1))
1310 return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), false));
1311
1312 // llvm.amdgcn.class(undef, mask) -> mask != 0
1313 if (IC.getSimplifyQuery().isUndefValue(Src0)) {
1314 Value *CmpMask = IC.Builder.CreateICmpNE(
1315 Src1, ConstantInt::getNullValue(Src1->getType()));
1316 return IC.replaceInstUsesWith(II, CmpMask);
1317 }
1318 break;
1319 }
1320 case Intrinsic::amdgcn_cvt_pkrtz: {
1321 auto foldFPTruncToF16RTZ = [](Value *Arg) -> Value * {
1322 Type *HalfTy = Type::getHalfTy(Arg->getContext());
1323
1324 if (isa<PoisonValue>(Arg))
1325 return PoisonValue::get(HalfTy);
1326 if (isa<UndefValue>(Arg))
1327 return UndefValue::get(HalfTy);
1328
1329 ConstantFP *CFP = nullptr;
1330 if (match(Arg, m_ConstantFP(CFP))) {
1331 bool LosesInfo;
1332 APFloat Val(CFP->getValueAPF());
1334 return ConstantFP::get(HalfTy, Val);
1335 }
1336
1337 Value *Src = nullptr;
1338 if (match(Arg, m_FPExt(m_Value(Src)))) {
1339 if (Src->getType()->isHalfTy())
1340 return Src;
1341 }
1342
1343 return nullptr;
1344 };
1345
1346 if (Value *Src0 = foldFPTruncToF16RTZ(II.getArgOperand(0))) {
1347 if (Value *Src1 = foldFPTruncToF16RTZ(II.getArgOperand(1))) {
1348 Value *V = PoisonValue::get(II.getType());
1349 V = IC.Builder.CreateInsertElement(V, Src0, (uint64_t)0);
1350 V = IC.Builder.CreateInsertElement(V, Src1, (uint64_t)1);
1351 return IC.replaceInstUsesWith(II, V);
1352 }
1353 }
1354
1355 break;
1356 }
1357 case Intrinsic::amdgcn_cvt_pknorm_i16:
1358 case Intrinsic::amdgcn_cvt_pknorm_u16:
1359 case Intrinsic::amdgcn_cvt_pk_i16:
1360 case Intrinsic::amdgcn_cvt_pk_u16: {
1361 Value *Src0 = II.getArgOperand(0);
1362 Value *Src1 = II.getArgOperand(1);
1363
1364 // TODO: Replace call with scalar operation if only one element is poison.
1365 if (isa<PoisonValue>(Src0) && isa<PoisonValue>(Src1))
1366 return IC.replaceInstUsesWith(II, PoisonValue::get(II.getType()));
1367
1368 if (isa<UndefValue>(Src0) && isa<UndefValue>(Src1)) {
1369 return IC.replaceInstUsesWith(II, UndefValue::get(II.getType()));
1370 }
1371
1372 break;
1373 }
1374 case Intrinsic::amdgcn_cvt_off_f32_i4: {
1375 Value* Arg = II.getArgOperand(0);
1376 Type *Ty = II.getType();
1377
1378 if (isa<PoisonValue>(Arg))
1379 return IC.replaceInstUsesWith(II, PoisonValue::get(Ty));
1380
1381 if(IC.getSimplifyQuery().isUndefValue(Arg))
1383
1384 ConstantInt *CArg = dyn_cast<ConstantInt>(II.getArgOperand(0));
1385 if (!CArg)
1386 break;
1387
1388 // Tabulated 0.0625 * (sext (CArg & 0xf)).
1389 constexpr size_t ResValsSize = 16;
1390 static constexpr float ResVals[ResValsSize] = {
1391 0.0, 0.0625, 0.125, 0.1875, 0.25, 0.3125, 0.375, 0.4375,
1392 -0.5, -0.4375, -0.375, -0.3125, -0.25, -0.1875, -0.125, -0.0625};
1393 Constant *Res =
1394 ConstantFP::get(Ty, ResVals[CArg->getZExtValue() & (ResValsSize - 1)]);
1395 return IC.replaceInstUsesWith(II, Res);
1396 }
1397 case Intrinsic::amdgcn_ubfe:
1398 case Intrinsic::amdgcn_sbfe: {
1399 // Decompose simple cases into standard shifts.
1400 Value *Src = II.getArgOperand(0);
1401 if (isa<UndefValue>(Src)) {
1402 return IC.replaceInstUsesWith(II, Src);
1403 }
1404
1405 unsigned Width;
1406 Type *Ty = II.getType();
1407 unsigned IntSize = Ty->getIntegerBitWidth();
1408
1409 ConstantInt *CWidth = dyn_cast<ConstantInt>(II.getArgOperand(2));
1410 if (CWidth) {
1411 Width = CWidth->getZExtValue();
1412 if ((Width & (IntSize - 1)) == 0) {
1414 }
1415
1416 // Hardware ignores high bits, so remove those.
1417 if (Width >= IntSize) {
1418 return IC.replaceOperand(
1419 II, 2, ConstantInt::get(CWidth->getType(), Width & (IntSize - 1)));
1420 }
1421 }
1422
1423 unsigned Offset;
1424 ConstantInt *COffset = dyn_cast<ConstantInt>(II.getArgOperand(1));
1425 if (COffset) {
1426 Offset = COffset->getZExtValue();
1427 if (Offset >= IntSize) {
1428 return IC.replaceOperand(
1429 II, 1,
1430 ConstantInt::get(COffset->getType(), Offset & (IntSize - 1)));
1431 }
1432 }
1433
1434 bool Signed = IID == Intrinsic::amdgcn_sbfe;
1435
1436 if (!CWidth || !COffset)
1437 break;
1438
1439 // The case of Width == 0 is handled above, which makes this transformation
1440 // safe. If Width == 0, then the ashr and lshr instructions become poison
1441 // value since the shift amount would be equal to the bit size.
1442 assert(Width != 0);
1443
1444 // TODO: This allows folding to undef when the hardware has specific
1445 // behavior?
1446 if (Offset + Width < IntSize) {
1447 Value *Shl = IC.Builder.CreateShl(Src, IntSize - Offset - Width);
1448 Value *RightShift = Signed ? IC.Builder.CreateAShr(Shl, IntSize - Width)
1449 : IC.Builder.CreateLShr(Shl, IntSize - Width);
1450 RightShift->takeName(&II);
1451 return IC.replaceInstUsesWith(II, RightShift);
1452 }
1453
1454 Value *RightShift = Signed ? IC.Builder.CreateAShr(Src, Offset)
1455 : IC.Builder.CreateLShr(Src, Offset);
1456
1457 RightShift->takeName(&II);
1458 return IC.replaceInstUsesWith(II, RightShift);
1459 }
1460 case Intrinsic::amdgcn_exp:
1461 case Intrinsic::amdgcn_exp_row:
1462 case Intrinsic::amdgcn_exp_compr: {
1463 ConstantInt *En = cast<ConstantInt>(II.getArgOperand(1));
1464 unsigned EnBits = En->getZExtValue();
1465 if (EnBits == 0xf)
1466 break; // All inputs enabled.
1467
1468 bool IsCompr = IID == Intrinsic::amdgcn_exp_compr;
1469 bool Changed = false;
1470 for (int I = 0; I < (IsCompr ? 2 : 4); ++I) {
1471 if ((!IsCompr && (EnBits & (1 << I)) == 0) ||
1472 (IsCompr && ((EnBits & (0x3 << (2 * I))) == 0))) {
1473 Value *Src = II.getArgOperand(I + 2);
1474 if (!isa<PoisonValue>(Src)) {
1475 IC.replaceOperand(II, I + 2, PoisonValue::get(Src->getType()));
1476 Changed = true;
1477 }
1478 }
1479 }
1480
1481 if (Changed) {
1482 return &II;
1483 }
1484
1485 break;
1486 }
1487 case Intrinsic::amdgcn_fmed3: {
1488 Value *Src0 = II.getArgOperand(0);
1489 Value *Src1 = II.getArgOperand(1);
1490 Value *Src2 = II.getArgOperand(2);
1491
1492 for (Value *Src : {Src0, Src1, Src2}) {
1493 if (isa<PoisonValue>(Src))
1494 return IC.replaceInstUsesWith(II, Src);
1495 }
1496
1497 if (II.isStrictFP())
1498 break;
1499
1500 // med3 with a nan input acts like
1501 // v_min_f32(v_min_f32(s0, s1), s2)
1502 //
1503 // Signalingness is ignored with ieee=0, so we fold to
1504 // minimumnum/maximumnum. With ieee=1, the v_min_f32 acts like llvm.minnum
1505 // with signaling nan handling. With ieee=0, like llvm.minimumnum except a
1506 // returned signaling nan will not be quieted.
1507
1508 // ieee=1
1509 // s0 snan: s2
1510 // s1 snan: s2
1511 // s2 snan: qnan
1512
1513 // s0 qnan: min(s1, s2)
1514 // s1 qnan: min(s0, s2)
1515 // s2 qnan: min(s0, s1)
1516
1517 // ieee=0
1518 // s0 _nan: min(s1, s2)
1519 // s1 _nan: min(s0, s2)
1520 // s2 _nan: min(s0, s1)
1521
1522 // med3 behavior with infinity
1523 // s0 +inf: max(s1, s2)
1524 // s1 +inf: max(s0, s2)
1525 // s2 +inf: max(s0, s1)
1526 // s0 -inf: min(s1, s2)
1527 // s1 -inf: min(s0, s2)
1528 // s2 -inf: min(s0, s1)
1529
1530 // Checking for NaN before canonicalization provides better fidelity when
1531 // mapping other operations onto fmed3 since the order of operands is
1532 // unchanged.
1533 Value *V = nullptr;
1534 const APFloat *ConstSrc0 = nullptr;
1535 const APFloat *ConstSrc1 = nullptr;
1536 const APFloat *ConstSrc2 = nullptr;
1537
1538 if ((match(Src0, m_APFloat(ConstSrc0)) &&
1539 (ConstSrc0->isNaN() || ConstSrc0->isInfinity())) ||
1540 isa<UndefValue>(Src0)) {
1541 const bool IsPosInfinity = ConstSrc0 && ConstSrc0->isPosInfinity();
1542 switch (fpenvIEEEMode(II)) {
1543 case KnownIEEEMode::On:
1544 // TODO: If Src2 is snan, does it need quieting?
1545 if (ConstSrc0 && ConstSrc0->isNaN() && ConstSrc0->isSignaling())
1546 return IC.replaceInstUsesWith(II, Src2);
1547
1548 V = IsPosInfinity ? IC.Builder.CreateMaxNum(Src1, Src2)
1549 : IC.Builder.CreateMinNum(Src1, Src2);
1550 break;
1551 case KnownIEEEMode::Off:
1552 V = IsPosInfinity ? IC.Builder.CreateMaximumNum(Src1, Src2)
1553 : IC.Builder.CreateMinimumNum(Src1, Src2);
1554 break;
1556 break;
1557 }
1558 } else if ((match(Src1, m_APFloat(ConstSrc1)) &&
1559 (ConstSrc1->isNaN() || ConstSrc1->isInfinity())) ||
1560 isa<UndefValue>(Src1)) {
1561 const bool IsPosInfinity = ConstSrc1 && ConstSrc1->isPosInfinity();
1562 switch (fpenvIEEEMode(II)) {
1563 case KnownIEEEMode::On:
1564 // TODO: If Src2 is snan, does it need quieting?
1565 if (ConstSrc1 && ConstSrc1->isNaN() && ConstSrc1->isSignaling())
1566 return IC.replaceInstUsesWith(II, Src2);
1567
1568 V = IsPosInfinity ? IC.Builder.CreateMaxNum(Src0, Src2)
1569 : IC.Builder.CreateMinNum(Src0, Src2);
1570 break;
1571 case KnownIEEEMode::Off:
1572 V = IsPosInfinity ? IC.Builder.CreateMaximumNum(Src0, Src2)
1573 : IC.Builder.CreateMinimumNum(Src0, Src2);
1574 break;
1576 break;
1577 }
1578 } else if ((match(Src2, m_APFloat(ConstSrc2)) &&
1579 (ConstSrc2->isNaN() || ConstSrc2->isInfinity())) ||
1580 isa<UndefValue>(Src2)) {
1581 switch (fpenvIEEEMode(II)) {
1582 case KnownIEEEMode::On:
1583 if (ConstSrc2 && ConstSrc2->isNaN() && ConstSrc2->isSignaling()) {
1584 auto *Quieted = ConstantFP::get(II.getType(), ConstSrc2->makeQuiet());
1585 return IC.replaceInstUsesWith(II, Quieted);
1586 }
1587
1588 V = (ConstSrc2 && ConstSrc2->isPosInfinity())
1589 ? IC.Builder.CreateMaxNum(Src0, Src1)
1590 : IC.Builder.CreateMinNum(Src0, Src1);
1591 break;
1592 case KnownIEEEMode::Off:
1593 V = (ConstSrc2 && ConstSrc2->isNegInfinity())
1594 ? IC.Builder.CreateMinimumNum(Src0, Src1)
1595 : IC.Builder.CreateMaximumNum(Src0, Src1);
1596 break;
1598 break;
1599 }
1600 }
1601
1602 if (V) {
1603 if (auto *CI = dyn_cast<CallInst>(V)) {
1604 CI->copyFastMathFlags(&II);
1605 CI->takeName(&II);
1606 }
1607 return IC.replaceInstUsesWith(II, V);
1608 }
1609
1610 bool Swap = false;
1611 // Canonicalize constants to RHS operands.
1612 //
1613 // fmed3(c0, x, c1) -> fmed3(x, c0, c1)
1614 if (isa<Constant>(Src0) && !isa<Constant>(Src1)) {
1615 std::swap(Src0, Src1);
1616 Swap = true;
1617 }
1618
1619 if (isa<Constant>(Src1) && !isa<Constant>(Src2)) {
1620 std::swap(Src1, Src2);
1621 Swap = true;
1622 }
1623
1624 if (isa<Constant>(Src0) && !isa<Constant>(Src1)) {
1625 std::swap(Src0, Src1);
1626 Swap = true;
1627 }
1628
1629 if (Swap) {
1630 II.setArgOperand(0, Src0);
1631 II.setArgOperand(1, Src1);
1632 II.setArgOperand(2, Src2);
1633 return &II;
1634 }
1635
1636 if (const ConstantFP *C0 = dyn_cast<ConstantFP>(Src0)) {
1637 if (const ConstantFP *C1 = dyn_cast<ConstantFP>(Src1)) {
1638 if (const ConstantFP *C2 = dyn_cast<ConstantFP>(Src2)) {
1639 APFloat Result = fmed3AMDGCN(C0->getValueAPF(), C1->getValueAPF(),
1640 C2->getValueAPF());
1641 return IC.replaceInstUsesWith(II,
1642 ConstantFP::get(II.getType(), Result));
1643 }
1644 }
1645 }
1646
1647 if (!ST->hasMed3_16())
1648 break;
1649
1650 // Repeat floating-point width reduction done for minnum/maxnum.
1651 // fmed3((fpext X), (fpext Y), (fpext Z)) -> fpext (fmed3(X, Y, Z))
1652 if (Value *X = matchFPExtFromF16(Src0)) {
1653 if (Value *Y = matchFPExtFromF16(Src1)) {
1654 if (Value *Z = matchFPExtFromF16(Src2)) {
1655 Value *NewCall = IC.Builder.CreateIntrinsic(
1656 IID, {X->getType()}, {X, Y, Z}, &II, II.getName());
1657 return new FPExtInst(NewCall, II.getType());
1658 }
1659 }
1660 }
1661
1662 break;
1663 }
1664 case Intrinsic::amdgcn_icmp:
1665 case Intrinsic::amdgcn_fcmp: {
1666 const ConstantInt *CC = cast<ConstantInt>(II.getArgOperand(2));
1667 // Guard against invalid arguments.
1668 int64_t CCVal = CC->getZExtValue();
1669 bool IsInteger = IID == Intrinsic::amdgcn_icmp;
1670 if ((IsInteger && (CCVal < CmpInst::FIRST_ICMP_PREDICATE ||
1671 CCVal > CmpInst::LAST_ICMP_PREDICATE)) ||
1672 (!IsInteger && (CCVal < CmpInst::FIRST_FCMP_PREDICATE ||
1674 break;
1675
1676 Value *Src0 = II.getArgOperand(0);
1677 Value *Src1 = II.getArgOperand(1);
1678
1679 if (auto *CSrc0 = dyn_cast<Constant>(Src0)) {
1680 if (auto *CSrc1 = dyn_cast<Constant>(Src1)) {
1682 (ICmpInst::Predicate)CCVal, CSrc0, CSrc1, DL);
1683 if (CCmp && CCmp->isNullValue()) {
1684 return IC.replaceInstUsesWith(
1685 II, IC.Builder.CreateSExt(CCmp, II.getType()));
1686 }
1687
1688 // The result of V_ICMP/V_FCMP assembly instructions (which this
1689 // intrinsic exposes) is one bit per thread, masked with the EXEC
1690 // register (which contains the bitmask of live threads). So a
1691 // comparison that always returns true is the same as a read of the
1692 // EXEC register. ballot(true) reads EXEC at the wave-size width, so
1693 // zext/trunc the result to the intrinsic's return type.
1694 Type *WaveTy = IC.Builder.getIntNTy(ST->getWavefrontSize());
1695 Value *Ballot = IC.Builder.CreateIntrinsic(
1696 Intrinsic::amdgcn_ballot, WaveTy, IC.Builder.getTrue());
1697 Value *Result = IC.Builder.CreateZExtOrTrunc(Ballot, II.getType());
1698 return IC.replaceInstUsesWith(II, Result);
1699 }
1700
1701 // Canonicalize constants to RHS.
1702 CmpInst::Predicate SwapPred =
1704 II.setArgOperand(0, Src1);
1705 II.setArgOperand(1, Src0);
1706 II.setArgOperand(
1707 2, ConstantInt::get(CC->getType(), static_cast<int>(SwapPred)));
1708 return &II;
1709 }
1710
1711 if (CCVal != CmpInst::ICMP_EQ && CCVal != CmpInst::ICMP_NE)
1712 break;
1713
1714 // Canonicalize compare eq with true value to compare != 0
1715 // llvm.amdgcn.icmp(zext (i1 x), 1, eq)
1716 // -> llvm.amdgcn.icmp(zext (i1 x), 0, ne)
1717 // llvm.amdgcn.icmp(sext (i1 x), -1, eq)
1718 // -> llvm.amdgcn.icmp(sext (i1 x), 0, ne)
1719 Value *ExtSrc;
1720 if (CCVal == CmpInst::ICMP_EQ &&
1721 ((match(Src1, PatternMatch::m_One()) &&
1722 match(Src0, m_ZExt(PatternMatch::m_Value(ExtSrc)))) ||
1723 (match(Src1, PatternMatch::m_AllOnes()) &&
1724 match(Src0, m_SExt(PatternMatch::m_Value(ExtSrc))))) &&
1725 ExtSrc->getType()->isIntegerTy(1)) {
1727 IC.replaceOperand(II, 2,
1728 ConstantInt::get(CC->getType(), CmpInst::ICMP_NE));
1729 return &II;
1730 }
1731
1732 CmpPredicate SrcPred;
1733 Value *SrcLHS;
1734 Value *SrcRHS;
1735
1736 // Fold compare eq/ne with 0 from a compare result as the predicate to the
1737 // intrinsic. The typical use is a wave vote function in the library, which
1738 // will be fed from a user code condition compared with 0. Fold in the
1739 // redundant compare.
1740
1741 // llvm.amdgcn.icmp([sz]ext ([if]cmp pred a, b), 0, ne)
1742 // -> llvm.amdgcn.[if]cmp(a, b, pred)
1743 //
1744 // llvm.amdgcn.icmp([sz]ext ([if]cmp pred a, b), 0, eq)
1745 // -> llvm.amdgcn.[if]cmp(a, b, inv pred)
1746 if (match(Src1, PatternMatch::m_Zero()) &&
1748 m_Cmp(SrcPred, PatternMatch::m_Value(SrcLHS),
1749 PatternMatch::m_Value(SrcRHS))))) {
1750 if (CCVal == CmpInst::ICMP_EQ)
1751 SrcPred = CmpInst::getInversePredicate(SrcPred);
1752
1753 Intrinsic::ID NewIID = CmpInst::isFPPredicate(SrcPred)
1754 ? Intrinsic::amdgcn_fcmp
1755 : Intrinsic::amdgcn_icmp;
1756
1757 Type *Ty = SrcLHS->getType();
1758 if (auto *CmpType = dyn_cast<IntegerType>(Ty)) {
1759 // Promote to next legal integer type.
1760 unsigned Width = CmpType->getBitWidth();
1761 unsigned NewWidth = Width;
1762
1763 // Don't do anything for i1 comparisons.
1764 if (Width == 1)
1765 break;
1766
1767 if (Width <= 16)
1768 NewWidth = 16;
1769 else if (Width <= 32)
1770 NewWidth = 32;
1771 else if (Width <= 64)
1772 NewWidth = 64;
1773 else
1774 break; // Can't handle this.
1775
1776 if (Width != NewWidth) {
1777 IntegerType *CmpTy = IC.Builder.getIntNTy(NewWidth);
1778 if (CmpInst::isSigned(SrcPred)) {
1779 SrcLHS = IC.Builder.CreateSExt(SrcLHS, CmpTy);
1780 SrcRHS = IC.Builder.CreateSExt(SrcRHS, CmpTy);
1781 } else {
1782 SrcLHS = IC.Builder.CreateZExt(SrcLHS, CmpTy);
1783 SrcRHS = IC.Builder.CreateZExt(SrcRHS, CmpTy);
1784 }
1785 }
1786 } else if (!Ty->isFloatTy() && !Ty->isDoubleTy() && !Ty->isHalfTy())
1787 break;
1788
1789 Value *Args[] = {SrcLHS, SrcRHS,
1790 ConstantInt::get(CC->getType(), SrcPred)};
1791 Value *NewCall = IC.Builder.CreateIntrinsic(
1792 NewIID, {II.getType(), SrcLHS->getType()}, Args);
1793 NewCall->takeName(&II);
1794 return IC.replaceInstUsesWith(II, NewCall);
1795 }
1796
1797 break;
1798 }
1799 case Intrinsic::amdgcn_mbcnt_hi:
1800 // exec_hi is all 0, so this is just a copy.
1801 if (ST->isWave32())
1802 return IC.replaceInstUsesWith(II, II.getArgOperand(1));
1803 [[fallthrough]];
1804 case Intrinsic::amdgcn_mbcnt_lo: {
1805 ConstantRange AccRange =
1806 computeConstantRange(II.getArgOperand(1),
1807 /*ForSigned=*/false, IC.getSimplifyQuery());
1808 if (AccRange.isFullSet())
1809 return nullptr;
1810
1811 // TODO: Can raise lower bound by inspecting first argument.
1812 ConstantRange MbcntRange(APInt(32, 0), APInt(32, 32 + 1));
1813 ConstantRange ComputedRange = AccRange.add(MbcntRange);
1814 if (ComputedRange.isFullSet())
1815 return nullptr;
1816
1817 if (std::optional<ConstantRange> ExistingRange = II.getRange()) {
1818 ComputedRange = ComputedRange.intersectWith(*ExistingRange);
1819 if (ComputedRange == *ExistingRange)
1820 return nullptr;
1821 }
1822
1823 II.addRangeRetAttr(ComputedRange);
1824 return nullptr;
1825 }
1826 case Intrinsic::amdgcn_ballot: {
1827 Value *Arg = II.getArgOperand(0);
1828 if (isa<PoisonValue>(Arg))
1829 return IC.replaceInstUsesWith(II, PoisonValue::get(II.getType()));
1830
1831 if (auto *Src = dyn_cast<ConstantInt>(Arg)) {
1832 if (Src->isZero()) {
1833 // amdgcn.ballot(i1 0) is zero.
1834 return IC.replaceInstUsesWith(II, Constant::getNullValue(II.getType()));
1835 }
1836 }
1837 if (ST->isWave32() && II.getType()->getIntegerBitWidth() == 64) {
1838 // %b64 = call i64 ballot.i64(...)
1839 // =>
1840 // %b32 = call i32 ballot.i32(...)
1841 // %b64 = zext i32 %b32 to i64
1843 IC.Builder.CreateIntrinsic(Intrinsic::amdgcn_ballot,
1844 {IC.Builder.getInt32Ty()},
1845 {II.getArgOperand(0)}),
1846 II.getType());
1847 Call->takeName(&II);
1848 return IC.replaceInstUsesWith(II, Call);
1849 }
1850 break;
1851 }
1852 case Intrinsic::amdgcn_wavefrontsize: {
1853 if (ST->isWaveSizeKnown())
1854 return IC.replaceInstUsesWith(
1855 II, ConstantInt::get(II.getType(), ST->getWavefrontSize()));
1856 break;
1857 }
1858 case Intrinsic::amdgcn_wqm_vote: {
1859 // wqm_vote is identity when the argument is constant.
1860 if (!isa<Constant>(II.getArgOperand(0)))
1861 break;
1862
1863 return IC.replaceInstUsesWith(II, II.getArgOperand(0));
1864 }
1865 case Intrinsic::amdgcn_kill: {
1866 const ConstantInt *C = dyn_cast<ConstantInt>(II.getArgOperand(0));
1867 if (!C || !C->getZExtValue())
1868 break;
1869
1870 // amdgcn.kill(i1 1) is a no-op
1871 return IC.eraseInstFromFunction(II);
1872 }
1873 case Intrinsic::amdgcn_s_sendmsg:
1874 case Intrinsic::amdgcn_s_sendmsghalt: {
1875 // The second operand is copied to m0, but is only actually used for
1876 // certain message types. For message types that are known to not use m0,
1877 // fold it to poison.
1878 using namespace AMDGPU::SendMsg;
1879
1880 Value *M0Val = II.getArgOperand(1);
1881 if (isa<PoisonValue>(M0Val))
1882 break;
1883
1884 auto *MsgImm = cast<ConstantInt>(II.getArgOperand(0));
1885 uint16_t MsgId, OpId, StreamId;
1886 decodeMsg(MsgImm->getZExtValue(), MsgId, OpId, StreamId, *ST);
1887
1888 if (!msgDoesNotUseM0(MsgId, *ST))
1889 break;
1890
1891 // Drop UB-implying attributes since we're replacing with poison.
1892 II.dropUBImplyingAttrsAndMetadata();
1893 IC.replaceOperand(II, 1, PoisonValue::get(M0Val->getType()));
1894 return nullptr;
1895 }
1896 case Intrinsic::amdgcn_update_dpp: {
1897 Value *Old = II.getArgOperand(0);
1898
1899 auto *BC = cast<ConstantInt>(II.getArgOperand(5));
1900 auto *RM = cast<ConstantInt>(II.getArgOperand(3));
1901 auto *BM = cast<ConstantInt>(II.getArgOperand(4));
1902 if (BC->isNullValue() || RM->getZExtValue() != 0xF ||
1903 BM->getZExtValue() != 0xF || isa<PoisonValue>(Old))
1904 break;
1905
1906 // If bound_ctrl = 1, row mask = bank mask = 0xf we can omit old value.
1907 return IC.replaceOperand(II, 0, PoisonValue::get(Old->getType()));
1908 }
1909 case Intrinsic::amdgcn_permlane16:
1910 case Intrinsic::amdgcn_permlane16_var:
1911 case Intrinsic::amdgcn_permlanex16:
1912 case Intrinsic::amdgcn_permlanex16_var: {
1913 // Discard vdst_in if it's not going to be read.
1914 Value *VDstIn = II.getArgOperand(0);
1915 if (isa<PoisonValue>(VDstIn))
1916 break;
1917
1918 // FetchInvalid operand idx.
1919 unsigned int FiIdx = (IID == Intrinsic::amdgcn_permlane16 ||
1920 IID == Intrinsic::amdgcn_permlanex16)
1921 ? 4 /* for permlane16 and permlanex16 */
1922 : 3; /* for permlane16_var and permlanex16_var */
1923
1924 // BoundCtrl operand idx.
1925 // For permlane16 and permlanex16 it should be 5
1926 // For Permlane16_var and permlanex16_var it should be 4
1927 unsigned int BcIdx = FiIdx + 1;
1928
1929 ConstantInt *FetchInvalid = cast<ConstantInt>(II.getArgOperand(FiIdx));
1930 ConstantInt *BoundCtrl = cast<ConstantInt>(II.getArgOperand(BcIdx));
1931 if (!FetchInvalid->getZExtValue() && !BoundCtrl->getZExtValue())
1932 break;
1933
1934 return IC.replaceOperand(II, 0, PoisonValue::get(VDstIn->getType()));
1935 }
1936 case Intrinsic::amdgcn_wave_shuffle:
1937 return tryOptimizeShufflePattern(IC, II, *ST);
1938 case Intrinsic::amdgcn_permlane64:
1939 case Intrinsic::amdgcn_readfirstlane:
1940 case Intrinsic::amdgcn_readlane:
1941 case Intrinsic::amdgcn_ds_bpermute: {
1942 // If the data argument is uniform these intrinsics return it unchanged.
1943 unsigned SrcIdx = IID == Intrinsic::amdgcn_ds_bpermute ? 1 : 0;
1944 const Use &Src = II.getArgOperandUse(SrcIdx);
1945 if (isTriviallyUniform(Src))
1946 return IC.replaceInstUsesWith(II, Src.get());
1947
1948 if (IID == Intrinsic::amdgcn_readlane &&
1950 return &II;
1951
1952 // If the lane argument of bpermute is uniform, change it to readlane. This
1953 // generates better code and can enable further optimizations because
1954 // readlane is AlwaysUniform.
1955 if (IID == Intrinsic::amdgcn_ds_bpermute) {
1956 const Use &Lane = II.getArgOperandUse(0);
1957 if (isTriviallyUniform(Lane)) {
1958 Value *NewLane = IC.Builder.CreateLShr(Lane, 2);
1960 II.getModule(), Intrinsic::amdgcn_readlane, II.getType());
1961 II.setCalledFunction(NewDecl);
1962 II.setOperand(0, Src);
1963 II.setOperand(1, NewLane);
1964 return &II;
1965 }
1966 }
1967
1968 if (IID == Intrinsic::amdgcn_ds_bpermute)
1969 return tryOptimizeShufflePattern(IC, II, *ST);
1970
1972 return Res;
1973
1974 return std::nullopt;
1975 }
1976 case Intrinsic::amdgcn_writelane: {
1977 // TODO: Fold bitcast like readlane.
1978 if (simplifyDemandedLaneMaskArg(IC, II, 1))
1979 return &II;
1980 return std::nullopt;
1981 }
1982 case Intrinsic::amdgcn_trig_preop: {
1983 // The intrinsic is declared with name mangling, but currently the
1984 // instruction only exists for f64
1985 if (!II.getType()->isDoubleTy())
1986 break;
1987
1988 Value *Src = II.getArgOperand(0);
1989 Value *Segment = II.getArgOperand(1);
1990 if (isa<PoisonValue>(Src) || isa<PoisonValue>(Segment))
1991 return IC.replaceInstUsesWith(II, PoisonValue::get(II.getType()));
1992
1993 if (isa<UndefValue>(Segment))
1994 return IC.replaceInstUsesWith(II, ConstantFP::getZero(II.getType()));
1995
1996 // Sign bit is not used.
1997 Value *StrippedSign = InstCombiner::stripSignOnlyFPOps(Src);
1998 if (StrippedSign != Src)
1999 return IC.replaceOperand(II, 0, StrippedSign);
2000
2001 if (II.isStrictFP())
2002 break;
2003
2004 const ConstantFP *CSrc = dyn_cast<ConstantFP>(Src);
2005 if (!CSrc && !isa<UndefValue>(Src))
2006 break;
2007
2008 // The instruction ignores special cases, and literally just extracts the
2009 // exponents. Fold undef to nan, and index the table as normal.
2010 APInt FSrcInt = CSrc ? CSrc->getValueAPF().bitcastToAPInt()
2011 : APFloat::getQNaN(II.getType()->getFltSemantics())
2012 .bitcastToAPInt();
2013
2014 const ConstantInt *Cseg = dyn_cast<ConstantInt>(Segment);
2015 if (!Cseg) {
2016 if (isa<UndefValue>(Src))
2017 return IC.replaceInstUsesWith(II, ConstantFP::getZero(II.getType()));
2018 break;
2019 }
2020
2021 unsigned Exponent = FSrcInt.extractBitsAsZExtValue(11, 52);
2022 unsigned SegmentVal = Cseg->getValue().trunc(5).getZExtValue();
2023 unsigned Shift = SegmentVal * 53;
2024 if (Exponent > 1077)
2025 Shift += Exponent - 1077;
2026
2027 // 2.0/PI table.
2028 static const uint32_t TwoByPi[] = {
2029 0xa2f9836e, 0x4e441529, 0xfc2757d1, 0xf534ddc0, 0xdb629599, 0x3c439041,
2030 0xfe5163ab, 0xdebbc561, 0xb7246e3a, 0x424dd2e0, 0x06492eea, 0x09d1921c,
2031 0xfe1deb1c, 0xb129a73e, 0xe88235f5, 0x2ebb4484, 0xe99c7026, 0xb45f7e41,
2032 0x3991d639, 0x835339f4, 0x9c845f8b, 0xbdf9283b, 0x1ff897ff, 0xde05980f,
2033 0xef2f118b, 0x5a0a6d1f, 0x6d367ecf, 0x27cb09b7, 0x4f463f66, 0x9e5fea2d,
2034 0x7527bac7, 0xebe5f17b, 0x3d0739f7, 0x8a5292ea, 0x6bfb5fb1, 0x1f8d5d08,
2035 0x56033046};
2036
2037 // Return 0 for an out-of-bounds segment (hardware behavior).
2038 unsigned Idx = Shift >> 5;
2039 if (Idx + 2 >= std::size(TwoByPi)) {
2040 APFloat Zero = APFloat::getZero(II.getType()->getFltSemantics());
2041 return IC.replaceInstUsesWith(II, ConstantFP::get(II.getType(), Zero));
2042 }
2043
2044 unsigned BShift = Shift & 0x1f;
2045 uint64_t Thi = Make_64(TwoByPi[Idx], TwoByPi[Idx + 1]);
2046 uint64_t Tlo = Make_64(TwoByPi[Idx + 2], 0);
2047 if (BShift)
2048 Thi = (Thi << BShift) | (Tlo >> (64 - BShift));
2049 Thi = Thi >> 11;
2050 APFloat Result = APFloat((double)Thi);
2051
2052 int Scale = -53 - Shift;
2053 if (Exponent >= 1968)
2054 Scale += 128;
2055
2056 Result = scalbn(Result, Scale, RoundingMode::NearestTiesToEven);
2057 return IC.replaceInstUsesWith(II, ConstantFP::get(Src->getType(), Result));
2058 }
2059 case Intrinsic::amdgcn_fmul_legacy: {
2060 Value *Op0 = II.getArgOperand(0);
2061 Value *Op1 = II.getArgOperand(1);
2062
2063 for (Value *Src : {Op0, Op1}) {
2064 if (isa<PoisonValue>(Src))
2065 return IC.replaceInstUsesWith(II, Src);
2066 }
2067
2068 // The legacy behaviour is that multiplying +/-0.0 by anything, even NaN or
2069 // infinity, gives +0.0.
2070 // TODO: Move to InstSimplify?
2071 if (match(Op0, PatternMatch::m_AnyZeroFP()) ||
2073 return IC.replaceInstUsesWith(II, ConstantFP::getZero(II.getType()));
2074
2075 // If we can prove we don't have one of the special cases then we can use a
2076 // normal fmul instruction instead.
2077 if (canSimplifyLegacyMulToMul(II, Op0, Op1, IC)) {
2078 auto *FMul = IC.Builder.CreateFMulFMF(Op0, Op1, &II);
2079 FMul->takeName(&II);
2080 return IC.replaceInstUsesWith(II, FMul);
2081 }
2082 break;
2083 }
2084 case Intrinsic::amdgcn_fma_legacy: {
2085 Value *Op0 = II.getArgOperand(0);
2086 Value *Op1 = II.getArgOperand(1);
2087 Value *Op2 = II.getArgOperand(2);
2088
2089 for (Value *Src : {Op0, Op1, Op2}) {
2090 if (isa<PoisonValue>(Src))
2091 return IC.replaceInstUsesWith(II, Src);
2092 }
2093
2094 // The legacy behaviour is that multiplying +/-0.0 by anything, even NaN or
2095 // infinity, gives +0.0.
2096 // TODO: Move to InstSimplify?
2097 if (match(Op0, PatternMatch::m_AnyZeroFP()) ||
2099 // It's tempting to just return Op2 here, but that would give the wrong
2100 // result if Op2 was -0.0.
2101 auto *Zero = ConstantFP::getZero(II.getType());
2102 auto *FAdd = IC.Builder.CreateFAddFMF(Zero, Op2, &II);
2103 FAdd->takeName(&II);
2104 return IC.replaceInstUsesWith(II, FAdd);
2105 }
2106
2107 // If we can prove we don't have one of the special cases then we can use a
2108 // normal fma instead.
2109 if (canSimplifyLegacyMulToMul(II, Op0, Op1, IC)) {
2110 II.setCalledOperand(Intrinsic::getOrInsertDeclaration(
2111 II.getModule(), Intrinsic::fma, II.getType()));
2112 return &II;
2113 }
2114 break;
2115 }
2116 case Intrinsic::amdgcn_is_shared:
2117 case Intrinsic::amdgcn_is_private: {
2118 Value *Src = II.getArgOperand(0);
2119 if (isa<PoisonValue>(Src))
2120 return IC.replaceInstUsesWith(II, PoisonValue::get(II.getType()));
2121 if (isa<UndefValue>(Src))
2122 return IC.replaceInstUsesWith(II, UndefValue::get(II.getType()));
2123
2124 if (isa<ConstantPointerNull>(II.getArgOperand(0)))
2125 return IC.replaceInstUsesWith(II, ConstantInt::getFalse(II.getType()));
2126 break;
2127 }
2128 case Intrinsic::amdgcn_make_buffer_rsrc: {
2129 Value *Src = II.getArgOperand(0);
2130 if (isa<PoisonValue>(Src))
2131 return IC.replaceInstUsesWith(II, PoisonValue::get(II.getType()));
2132 return std::nullopt;
2133 }
2134 case Intrinsic::amdgcn_raw_buffer_store_format:
2135 case Intrinsic::amdgcn_struct_buffer_store_format:
2136 case Intrinsic::amdgcn_raw_tbuffer_store:
2137 case Intrinsic::amdgcn_struct_tbuffer_store:
2138 case Intrinsic::amdgcn_image_store_1d:
2139 case Intrinsic::amdgcn_image_store_1darray:
2140 case Intrinsic::amdgcn_image_store_2d:
2141 case Intrinsic::amdgcn_image_store_2darray:
2142 case Intrinsic::amdgcn_image_store_2darraymsaa:
2143 case Intrinsic::amdgcn_image_store_2dmsaa:
2144 case Intrinsic::amdgcn_image_store_3d:
2145 case Intrinsic::amdgcn_image_store_cube:
2146 case Intrinsic::amdgcn_image_store_mip_1d:
2147 case Intrinsic::amdgcn_image_store_mip_1darray:
2148 case Intrinsic::amdgcn_image_store_mip_2d:
2149 case Intrinsic::amdgcn_image_store_mip_2darray:
2150 case Intrinsic::amdgcn_image_store_mip_3d:
2151 case Intrinsic::amdgcn_image_store_mip_cube: {
2152 if (!isa<FixedVectorType>(II.getArgOperand(0)->getType()))
2153 break;
2154
2155 APInt DemandedElts;
2156 if (ST->hasDefaultComponentBroadcast())
2157 DemandedElts = defaultComponentBroadcast(II.getArgOperand(0));
2158 else if (ST->hasDefaultComponentZero())
2159 DemandedElts = trimTrailingZerosInVector(IC, II.getArgOperand(0), &II);
2160 else
2161 break;
2162
2163 int DMaskIdx = getAMDGPUImageDMaskIntrinsic(II.getIntrinsicID()) ? 1 : -1;
2164 if (simplifyAMDGCNMemoryIntrinsicDemanded(IC, II, DemandedElts, DMaskIdx,
2165 false)) {
2166 return IC.eraseInstFromFunction(II);
2167 }
2168
2169 break;
2170 }
2171 case Intrinsic::amdgcn_prng_b32: {
2172 auto *Src = II.getArgOperand(0);
2173 if (isa<UndefValue>(Src)) {
2174 return IC.replaceInstUsesWith(II, Src);
2175 }
2176 return std::nullopt;
2177 }
2178 case Intrinsic::amdgcn_mfma_scale_f32_16x16x128_f8f6f4:
2179 case Intrinsic::amdgcn_mfma_scale_f32_32x32x64_f8f6f4: {
2180 Value *Src0 = II.getArgOperand(0);
2181 Value *Src1 = II.getArgOperand(1);
2182 uint64_t CBSZ = cast<ConstantInt>(II.getArgOperand(3))->getZExtValue();
2183 uint64_t BLGP = cast<ConstantInt>(II.getArgOperand(4))->getZExtValue();
2184 auto *Src0Ty = cast<FixedVectorType>(Src0->getType());
2185 auto *Src1Ty = cast<FixedVectorType>(Src1->getType());
2186
2187 auto getFormatNumRegs = [](unsigned FormatVal) {
2188 switch (FormatVal) {
2191 return 6u;
2193 return 4u;
2196 return 8u;
2197 default:
2198 llvm_unreachable("invalid format value");
2199 }
2200 };
2201
2202 bool MadeChange = false;
2203 unsigned Src0NumElts = getFormatNumRegs(CBSZ);
2204 unsigned Src1NumElts = getFormatNumRegs(BLGP);
2205
2206 // Depending on the used format, fewer registers are required so shrink the
2207 // vector type.
2208 if (Src0Ty->getNumElements() > Src0NumElts) {
2209 Src0 = IC.Builder.CreateExtractVector(
2210 FixedVectorType::get(Src0Ty->getElementType(), Src0NumElts), Src0,
2211 uint64_t(0));
2212 MadeChange = true;
2213 }
2214
2215 if (Src1Ty->getNumElements() > Src1NumElts) {
2216 Src1 = IC.Builder.CreateExtractVector(
2217 FixedVectorType::get(Src1Ty->getElementType(), Src1NumElts), Src1,
2218 uint64_t(0));
2219 MadeChange = true;
2220 }
2221
2222 if (!MadeChange)
2223 return std::nullopt;
2224
2225 SmallVector<Value *, 10> Args(II.args());
2226 Args[0] = Src0;
2227 Args[1] = Src1;
2228
2229 Value *NewII = IC.Builder.CreateIntrinsic(
2230 IID, {Src0->getType(), Src1->getType()}, Args, &II);
2231 NewII->takeName(&II);
2232 return IC.replaceInstUsesWith(II, NewII);
2233 }
2234 case Intrinsic::amdgcn_wmma_f32_16x16x128_f8f6f4:
2235 case Intrinsic::amdgcn_wmma_scale_f32_16x16x128_f8f6f4:
2236 case Intrinsic::amdgcn_wmma_scale16_f32_16x16x128_f8f6f4: {
2237 Value *Src0 = II.getArgOperand(1);
2238 Value *Src1 = II.getArgOperand(3);
2239 unsigned FmtA = cast<ConstantInt>(II.getArgOperand(0))->getZExtValue();
2240 uint64_t FmtB = cast<ConstantInt>(II.getArgOperand(2))->getZExtValue();
2241 auto *Src0Ty = cast<FixedVectorType>(Src0->getType());
2242 auto *Src1Ty = cast<FixedVectorType>(Src1->getType());
2243
2244 bool MadeChange = false;
2245 unsigned Src0NumElts = AMDGPU::wmmaScaleF8F6F4FormatToNumRegs(FmtA);
2246 unsigned Src1NumElts = AMDGPU::wmmaScaleF8F6F4FormatToNumRegs(FmtB);
2247
2248 // Depending on the used format, fewer registers are required so shrink the
2249 // vector type.
2250 if (Src0Ty->getNumElements() > Src0NumElts) {
2251 Src0 = IC.Builder.CreateExtractVector(
2252 FixedVectorType::get(Src0Ty->getElementType(), Src0NumElts), Src0,
2253 IC.Builder.getInt64(0));
2254 MadeChange = true;
2255 }
2256
2257 if (Src1Ty->getNumElements() > Src1NumElts) {
2258 Src1 = IC.Builder.CreateExtractVector(
2259 FixedVectorType::get(Src1Ty->getElementType(), Src1NumElts), Src1,
2260 IC.Builder.getInt64(0));
2261 MadeChange = true;
2262 }
2263
2264 if (!MadeChange)
2265 return std::nullopt;
2266
2267 SmallVector<Value *, 13> Args(II.args());
2268 Args[1] = Src0;
2269 Args[3] = Src1;
2270
2271 Value *NewII = IC.Builder.CreateIntrinsic(
2272 IID, {II.getArgOperand(5)->getType(), Src0->getType(), Src1->getType()},
2273 Args, &II);
2274 NewII->takeName(&II);
2275 return IC.replaceInstUsesWith(II, NewII);
2276 }
2277 }
2278 if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
2279 AMDGPU::getImageDimIntrinsicInfo(II.getIntrinsicID())) {
2280 return simplifyAMDGCNImageIntrinsic(ST, ImageDimIntr, II, IC);
2281 }
2282 return std::nullopt;
2283}
2284
2285/// Implement SimplifyDemandedVectorElts for amdgcn buffer and image intrinsics.
2286///
2287/// The result of simplifying amdgcn image and buffer store intrinsics is updating
2288/// definitions of the intrinsic's vector argument, not Uses of the result like
2289/// image and buffer loads.
2290/// Note: This only supports non-TFE/LWE image intrinsic calls; those have
2291/// struct returns.
    ///
    /// \param DemandedElts One bit per element of the data vector; a set bit
    ///        means the element is needed.
    /// \param DMaskIdx Argument index of the dmask operand for image intrinsics.
    ///        A negative value selects the buffer path, which has no dmask.
    /// \param IsLoad True when the data vector is the call's result; false when
    ///        it is operand 0 (a store).
    /// \returns nullptr when no new call is created (at most the dmask operand
    ///          is updated in place), a poison vector when no element is
    ///          demanded, the narrowed call itself for stores, or for loads a
    ///          value of the original vector type rebuilt from the narrowed
    ///          call.
2294 APInt DemandedElts,
2295 int DMaskIdx, bool IsLoad) {
2296
     // The data vector is the result for loads and operand 0 for stores.
2297 auto *IIVTy = cast<FixedVectorType>(IsLoad ? II.getType()
2298 : II.getOperand(0)->getType());
2299 unsigned VWidth = IIVTy->getNumElements();
     // A single-element vector cannot be narrowed any further.
2300 if (VWidth == 1)
2301 return nullptr;
2302 Type *EltTy = IIVTy->getElementType();
2303
2306
2307 // Assume the arguments are unchanged and later override them, if needed.
2308 SmallVector<Value *, 16> Args(II.args());
2309
2310 if (DMaskIdx < 0) {
2311 // Buffer case.
2312
2313 const unsigned ActiveBits = DemandedElts.getActiveBits();
2314 const unsigned UnusedComponentsAtFront = DemandedElts.countr_zero();
2315
2316 // Start assuming the prefix of elements is demanded, but possibly clear
2317 // some other bits if there are trailing zeros (unused components at front)
2318 // and update offset.
     // NOTE(review): the shift is evaluated in plain int, so this relies on
     // ActiveBits being small (a vector element count) - confirm.
2319 DemandedElts = (1 << ActiveBits) - 1;
2320
2321 if (UnusedComponentsAtFront > 0) {
     // Sentinel meaning "do not skip leading components for this intrinsic".
2322 static const unsigned InvalidOffsetIdx = 0xf;
2323
     // Pick the argument index that is adjusted below as the offset.
2324 unsigned OffsetIdx;
2325 switch (II.getIntrinsicID()) {
2326 case Intrinsic::amdgcn_raw_buffer_load:
2327 case Intrinsic::amdgcn_raw_ptr_buffer_load:
2328 OffsetIdx = 1;
2329 break;
2330 case Intrinsic::amdgcn_s_buffer_load:
2331 // If resulting type is vec3, there is no point in trimming the
2332 // load with updated offset, as the vec3 would most likely be widened to
2333 // vec4 anyway during lowering.
2334 if (ActiveBits == 4 && UnusedComponentsAtFront == 1)
2335 OffsetIdx = InvalidOffsetIdx;
2336 else
2337 OffsetIdx = 1;
2338 break;
2339 case Intrinsic::amdgcn_struct_buffer_load:
2340 case Intrinsic::amdgcn_struct_ptr_buffer_load:
2341 OffsetIdx = 2;
2342 break;
2343 default:
2344 // TODO: handle tbuffer* intrinsics.
2345 OffsetIdx = InvalidOffsetIdx;
2346 break;
2347 }
2348
2349 if (OffsetIdx != InvalidOffsetIdx) {
2350 // Clear demanded bits and update the offset.
2351 DemandedElts &= ~((1 << UnusedComponentsAtFront) - 1);
2352 auto *Offset = Args[OffsetIdx];
2353 unsigned SingleComponentSizeInBits =
2354 IC.getDataLayout().getTypeSizeInBits(EltTy);
     // Byte distance covered by the skipped leading components.
2355 unsigned OffsetAdd =
2356 UnusedComponentsAtFront * SingleComponentSizeInBits / 8;
2357 auto *OffsetAddVal = ConstantInt::get(Offset->getType(), OffsetAdd);
2358 Args[OffsetIdx] = IC.Builder.CreateAdd(Offset, OffsetAddVal);
2359 }
2360 }
2361 } else {
2362 // Image case.
2363
     // Only the low four bits of the dmask constant are considered.
2364 ConstantInt *DMask = cast<ConstantInt>(Args[DMaskIdx]);
2365 unsigned DMaskVal = DMask->getZExtValue() & 0xf;
2366
2367 // dmask 0 has special semantics, do not simplify.
2368 if (DMaskVal == 0)
2369 return nullptr;
2370
2371 // Mask off values that are undefined because the dmask doesn't cover them
2372 DemandedElts &= (1 << llvm::popcount(DMaskVal)) - 1;
2373
     // Rebuild the dmask: OrigLdStIdx counts the enabled dmask bits seen so
     // far, which is the index of the matching element in the data vector. A
     // dmask bit is kept only if that element is still demanded.
2374 unsigned NewDMaskVal = 0;
2375 unsigned OrigLdStIdx = 0;
2376 for (unsigned SrcIdx = 0; SrcIdx < 4; ++SrcIdx) {
2377 const unsigned Bit = 1 << SrcIdx;
2378 if (!!(DMaskVal & Bit)) {
2379 if (!!DemandedElts[OrigLdStIdx])
2380 NewDMaskVal |= Bit;
2381 OrigLdStIdx++;
2382 }
2383 }
2384
2385 if (DMaskVal != NewDMaskVal)
2386 Args[DMaskIdx] = ConstantInt::get(DMask->getType(), NewDMaskVal);
2387 }
2388
     // Nothing is demanded at all: the whole data vector can be poison.
2389 unsigned NewNumElts = DemandedElts.popcount();
2390 if (!NewNumElts)
2391 return PoisonValue::get(IIVTy);
2392
     // Every element of the original vector is still needed, so the call keeps
     // its type; at most the dmask operand is rewritten in place.
2393 if (NewNumElts >= VWidth && DemandedElts.isMask()) {
2394 if (DMaskIdx >= 0)
2395 II.setArgOperand(DMaskIdx, Args[DMaskIdx]);
2396 return nullptr;
2397 }
2398
2399 // Validate function argument and return types, extracting overloaded types
2400 // along the way.
2401 SmallVector<Type *, 6> OverloadTys;
2402 if (!Intrinsic::isSignatureValid(II.getCalledFunction(), OverloadTys))
2403 return nullptr;
2404
     // A single remaining element is passed as a scalar, not a <1 x T> vector.
     // NOTE(review): assumes overload type 0 is the data type - confirm.
2405 Type *NewTy =
2406 (NewNumElts == 1) ? EltTy : FixedVectorType::get(EltTy, NewNumElts);
2407 OverloadTys[0] = NewTy;
2408
     // Stores: compact the demanded elements of operand 0 into the narrower
     // value that the new call will store.
2409 if (!IsLoad) {
2410 SmallVector<int, 8> EltMask;
2411 for (unsigned OrigStoreIdx = 0; OrigStoreIdx < VWidth; ++OrigStoreIdx)
2412 if (DemandedElts[OrigStoreIdx])
2413 EltMask.push_back(OrigStoreIdx);
2414
2415 if (NewNumElts == 1)
2416 Args[0] = IC.Builder.CreateExtractElement(II.getOperand(0), EltMask[0]);
2417 else
2418 Args[0] = IC.Builder.CreateShuffleVector(II.getOperand(0), EltMask);
2419 }
2420
2422 II.getIntrinsicID(), OverloadTys, Args);
     // Carry the name, metadata and attributes of the old call over to the new
     // one.
2423 NewCall->takeName(&II);
2424 NewCall->copyMetadata(II);
2425 AttributeList OldAttrList = II.getAttributes();
2426 NewCall->setAttributes(OldAttrList);
2427
     // Loads: widen the narrowed result back to the original vector type so
     // existing users keep seeing each element at its original index.
2428 if (IsLoad) {
2429 if (NewNumElts == 1) {
2430 return IC.Builder.CreateInsertElement(PoisonValue::get(IIVTy), NewCall,
2431 DemandedElts.countr_zero());
2432 }
2433
     // Demanded lanes take consecutive elements of NewCall. All other lanes
     // use index NewNumElts, one past NewCall's last element.
     // NOTE(review): presumably that index selects a poison lane from the
     // implicit second shuffle operand - confirm against IRBuilder.
2434 SmallVector<int, 8> EltMask;
2435 unsigned NewLoadIdx = 0;
2436 for (unsigned OrigLoadIdx = 0; OrigLoadIdx < VWidth; ++OrigLoadIdx) {
2437 if (!!DemandedElts[OrigLoadIdx])
2438 EltMask.push_back(NewLoadIdx++);
2439 else
2440 EltMask.push_back(NewNumElts);
2441 }
2442
2443 auto *Shuffle = IC.Builder.CreateShuffleVector(NewCall, EltMask);
2444
2445 return Shuffle;
2446 }
2447
2448 return NewCall;
2449}
2450
2452 InstCombiner &IC, IntrinsicInst &II, const APInt &DemandedElts,
2453 APInt &UndefElts) const {
     // Narrow a vector-typed lane intrinsic call to the contiguous element
     // range [FirstElt, LastElt] spanned by DemandedElts. A call on the
     // narrower (or scalar) type is emitted, and its result is placed back into
     // a value of the original vector type. Returns nullptr when nothing is
     // rewritten.
2454 auto *VT = dyn_cast<FixedVectorType>(II.getType());
2455 if (!VT)
2456 return nullptr;
2457
     // NOTE(review): LastElt underflows if DemandedElts has no bit set;
     // presumably callers never pass an all-zero mask - confirm.
2458 const unsigned FirstElt = DemandedElts.countr_zero();
2459 const unsigned LastElt = DemandedElts.getActiveBits() - 1;
2460 const unsigned MaskLen = LastElt - FirstElt + 1;
2461
     // The demanded range already spans the whole vector: nothing to shrink.
     // A one-element vector is still processed so it can become a scalar call.
2462 unsigned OldNumElts = VT->getNumElements();
2463 if (MaskLen == OldNumElts && MaskLen != 1)
2464 return nullptr;
2465
2466 Type *EltTy = VT->getElementType();
2467 Type *NewVT = MaskLen == 1 ? EltTy : FixedVectorType::get(EltTy, MaskLen);
2468
2469 // Theoretically we should support these intrinsics for any legal type. Avoid
2470 // introducing cases that aren't direct register types like v3i16.
2471 if (!isTypeLegal(NewVT))
2472 return nullptr;
2473
2474 Value *Src = II.getArgOperand(0);
2475
2476 // Make sure convergence tokens are preserved.
2477 // TODO: CreateIntrinsic should allow directly copying bundles
2479 II.getOperandBundlesAsDefs(OpBundles);
2480
     // Declaration of the same intrinsic, overloaded on the narrowed type.
2482 Function *Remangled =
2483 Intrinsic::getOrInsertDeclaration(M, II.getIntrinsicID(), {NewVT});
2484
     // Scalar case: extract the one demanded element, call the scalar form,
     // and insert the result back at the same index of a poison vector.
2485 if (MaskLen == 1) {
2486 Value *Extract = IC.Builder.CreateExtractElement(Src, FirstElt);
2487
2488 // TODO: Preserve callsite attributes?
2489 CallInst *NewCall = IC.Builder.CreateCall(Remangled, {Extract}, OpBundles);
2490
2491 return IC.Builder.CreateInsertElement(PoisonValue::get(II.getType()),
2492 NewCall, FirstElt);
2493 }
2494
     // Vector case: gather the demanded range from Src. Elements inside the
     // range that are not demanded keep the -1 mask index.
2495 SmallVector<int> ExtractMask(MaskLen, -1);
2496 for (unsigned I = 0; I != MaskLen; ++I) {
2497 if (DemandedElts[FirstElt + I])
2498 ExtractMask[I] = FirstElt + I;
2499 }
2500
2501 Value *Extract = IC.Builder.CreateShuffleVector(Src, ExtractMask);
2502
2503 // TODO: Preserve callsite attributes?
2504 CallInst *NewCall = IC.Builder.CreateCall(Remangled, {Extract}, OpBundles);
2505
     // Scatter the narrow result back to the original element positions. Every
     // lane that was not demanded keeps the -1 mask index.
2506 SmallVector<int> InsertMask(OldNumElts, -1);
2507 for (unsigned I = 0; I != MaskLen; ++I) {
2508 if (DemandedElts[FirstElt + I])
2509 InsertMask[FirstElt + I] = I;
2510 }
2511
2512 // FIXME: If the call has a convergence bundle, we end up leaving the dead
2513 // call behind.
2514 return IC.Builder.CreateShuffleVector(NewCall, InsertMask);
2515}
2516
2518 InstCombiner &IC, IntrinsicInst &II, APInt DemandedElts, APInt &UndefElts,
2519 APInt &UndefElts2, APInt &UndefElts3,
2520 std::function<void(Instruction *, unsigned, APInt, APInt &)>
2521 SimplifyAndSetOp) const {
     // Dispatch demanded-vector-element simplification by intrinsic ID.
     // Returns std::nullopt for intrinsics that are not handled here.
2522 switch (II.getIntrinsicID()) {
2523 case Intrinsic::amdgcn_readfirstlane:
     // Run the callback on operand 0 with the same demanded elements, then
     // try to narrow the call itself.
2524 SimplifyAndSetOp(&II, 0, DemandedElts, UndefElts);
2525 return simplifyAMDGCNLaneIntrinsicDemanded(IC, II, DemandedElts, UndefElts);
     // Buffer loads: the helper is called with its default DMaskIdx and IsLoad
     // arguments.
2526 case Intrinsic::amdgcn_raw_buffer_load:
2527 case Intrinsic::amdgcn_raw_ptr_buffer_load:
2528 case Intrinsic::amdgcn_raw_buffer_load_format:
2529 case Intrinsic::amdgcn_raw_ptr_buffer_load_format:
2530 case Intrinsic::amdgcn_raw_tbuffer_load:
2531 case Intrinsic::amdgcn_raw_ptr_tbuffer_load:
2532 case Intrinsic::amdgcn_s_buffer_load:
2533 case Intrinsic::amdgcn_struct_buffer_load:
2534 case Intrinsic::amdgcn_struct_ptr_buffer_load:
2535 case Intrinsic::amdgcn_struct_buffer_load_format:
2536 case Intrinsic::amdgcn_struct_ptr_buffer_load_format:
2537 case Intrinsic::amdgcn_struct_tbuffer_load:
2538 case Intrinsic::amdgcn_struct_ptr_tbuffer_load:
2539 return simplifyAMDGCNMemoryIntrinsicDemanded(IC, II, DemandedElts);
2540 default: {
     // Intrinsics found in the image dmask table pass 0 as the dmask argument
     // index.
2541 if (getAMDGPUImageDMaskIntrinsic(II.getIntrinsicID())) {
2542 return simplifyAMDGCNMemoryIntrinsicDemanded(IC, II, DemandedElts, 0);
2543 }
2544 break;
2545 }
2546 }
2547 return std::nullopt;
2548}
for(const MachineOperand &MO :llvm::drop_begin(OldMI.operands(), Desc.getNumOperands()))
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
static Value * createPermlane16(IRBuilderBase &B, Value *Val, uint32_t Lo, uint32_t Hi)
Emit v_permlane16 with the precomputed lane-select halves.
static std::optional< unsigned > matchRowSharePattern(ArrayRef< uint8_t > Ids)
Match a row-share pattern: all 16 lanes of each row read the same source lane.
static bool matchMirrorPattern(ArrayRef< uint8_t > Ids)
Match an N-lane reversal (mirror) pattern.
static bool canSafelyConvertTo16Bit(Value &V, bool IsFloat, bool AllowI16SExt=false)
static bool tryBuildShuffleMap(Value *Index, const GCNSubtarget &ST, SmallVectorImpl< uint8_t > &Ids, const DataLayout &DL)
Build the per-lane shuffle map by evaluating Index for every lane in the wave.
static std::optional< unsigned > matchQuadPermPattern(ArrayRef< uint8_t > Ids)
Match a 4-lane (quad) permutation, encoded as the v_mov_b32_dpp QUAD_PERM control word: bits[1:0]=Ids...
static std::optional< unsigned > matchDsSwizzleRotatePattern(ArrayRef< uint8_t > Ids)
Match a GFX9+ DS_SWIZZLE rotate-mode permutation: a cyclic left-rotation of all 32 lanes within each ...
static std::optional< unsigned > matchHalfRowPermPattern(ArrayRef< uint8_t > Ids)
Match an 8-lane arbitrary permutation, encoded as the v_mov_b32_dpp8 24-bit selector (three bits per ...
static std::optional< unsigned > matchRowXMaskPattern(ArrayRef< uint8_t > Ids)
Match an XOR mask pattern within each 16-lane row: Ids[J] == Mask ^ J, with Mask in [1,...
static constexpr auto matchHalfRowMirrorPattern
static Value * createPermlaneX16(IRBuilderBase &B, Value *Val, uint32_t Lo, uint32_t Hi)
Emit v_permlanex16 with the precomputed lane-select halves.
static bool isRowPattern(ArrayRef< uint8_t > Ids)
Match an N-lane row pattern: each lane in [0, N) reads from a source lane in the same N-lane row,...
static bool canContractSqrtToRsq(const FPMathOperator *SqrtOp)
Return true if it's legal to contract llvm.amdgcn.rcp(llvm.sqrt)
static bool isTriviallyUniform(const Use &U)
Return true if we can easily prove that use U is uniform.
static CallInst * rewriteCall(IRBuilderBase &B, CallInst &Old, Function &NewCallee, ArrayRef< Value * > Ops)
static Value * convertTo16Bit(Value &V, InstCombiner::BuilderTy &Builder)
static constexpr auto isFullRowPattern
static constexpr auto isQuadPattern
static APInt trimTrailingZerosInVector(InstCombiner &IC, Value *UseV, Instruction *I)
static uint64_t computePermlane16Masks(ArrayRef< uint8_t > Ids)
Pack a 16-lane permutation into a single 64-bit value: four bits per output lane, lane J in bits [J*4...
static bool matchHalfWaveSwapPattern(ArrayRef< uint8_t > Ids)
Match a half-wave swap: lane J reads from lane J ^ 32.
static bool hasPeriodicLayout(ArrayRef< uint8_t > Ids)
Lanes are partitioned into groups of Period; each group is a translated copy of the first: Ids[I] = I...
static std::optional< Instruction * > tryOptimizeShufflePattern(InstCombiner &IC, IntrinsicInst &II, const GCNSubtarget &ST)
Try to fold a wave_shuffle/ds_bpermute whose lane index is a constant function of the lane ID into a ...
static constexpr auto isHalfRowPattern
static APInt defaultComponentBroadcast(Value *V)
static std::optional< unsigned > matchDsSwizzleBitmaskPattern(ArrayRef< uint8_t > Ids)
Match a DS_SWIZZLE bitmask-mode permutation: dst_lane = ((src_lane & AND) | OR) ^ XOR with each mask ...
static Value * createDsSwizzle(IRBuilderBase &B, Value *Val, unsigned Offset, const DataLayout &DL)
Emit ds_swizzle with the given immediate, bitcasting/converting between pointer/float types and i32 a...
static std::optional< Instruction * > modifyIntrinsicCall(IntrinsicInst &OldIntr, Instruction &InstToReplace, unsigned NewIntr, InstCombiner &IC, std::function< void(SmallVectorImpl< Value * > &, SmallVectorImpl< Type * > &)> Func)
Applies Func(OldIntr.Args, OldIntr.ArgTys), creates intrinsic call with modified arguments (based on ...
static Value * matchShuffleToHWIntrinsic(IRBuilderBase &B, Value *Src, ArrayRef< uint8_t > Ids, const GCNSubtarget &ST, const DataLayout &DL)
Given a shuffle map, try to emit the best hardware intrinsic.
static std::optional< unsigned > matchRowRotatePattern(ArrayRef< uint8_t > Ids)
Match a 16-lane cyclic rotation; returns the rotation amount in [1, 15].
static bool isCrossRowPattern(ArrayRef< uint8_t > Ids)
Match a cross-row permutation suitable for v_permlanex16: every lane in the low 16-lane half reads fr...
static bool isThreadID(const GCNSubtarget &ST, Value *V)
static Value * createUpdateDpp(IRBuilderBase &B, Value *Val, unsigned Ctrl)
Emit v_mov_b32_dpp with the given control word, row/bank masks 0xF, and bound_ctrl=1 so out-of-bounds...
static APFloat fmed3AMDGCN(const APFloat &Src0, const APFloat &Src1, const APFloat &Src2)
static Value * simplifyAMDGCNMemoryIntrinsicDemanded(InstCombiner &IC, IntrinsicInst &II, APInt DemandedElts, int DMaskIdx=-1, bool IsLoad=true)
Implement SimplifyDemandedVectorElts for amdgcn buffer and image intrinsics.
static std::optional< Instruction * > simplifyAMDGCNImageIntrinsic(const GCNSubtarget *ST, const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr, IntrinsicInst &II, InstCombiner &IC)
static Value * createMovDpp8(IRBuilderBase &B, Value *Val, unsigned Selector)
Emit v_mov_b32_dpp8 with the given 24-bit lane selector.
static Value * matchFPExtFromF16(Value *Arg)
Match an fpext from half to float, or a constant we can convert.
static constexpr auto matchFullRowMirrorPattern
static std::optional< unsigned > evalLaneExpr(Value *V, unsigned Lane, const GCNSubtarget &ST, const DataLayout &DL, unsigned Depth=0)
Evaluate V as a function of the lane ID and return its value on Lane, or std::nullopt if V is not a c...
static Value * createPermlane64(IRBuilderBase &B, Value *Val)
Emit v_permlane64 (swap of the two 32-lane halves of a wave64).
Contains the definition of a TargetInstrInfo class that is common to all AMD GPUs.
This file defines a TargetTransformInfoImplBase conforming object specific to the AMDGPU target machine.
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
#define X(NUM, ENUM, NAME)
Definition ELF.h:856
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
This file contains the declarations for the subclasses of Constant, which represent the different fla...
Utilities for dealing with flags related to floating point properties and mode controls.
AMD GCN specific subclass of TargetSubtarget.
This file provides the interface for the instcombine pass implementation.
const AbstractManglingParser< Derived, Alloc >::OperatorInfo AbstractManglingParser< Derived, Alloc >::Ops[]
#define I(x, y, z)
Definition MD5.cpp:57
MachineInstr unsigned OpIdx
uint64_t IntrinsicInst * II
if(PassOpts->AAPipeline)
This file contains some templates that are useful if you are working with the STL at all.
Provides some synthesis utilities to produce sequences of values.
static TableGen::Emitter::Opt Y("gen-skeleton-entry", EmitSkeleton, "Generate example skeleton entry")
Value * RHS
Value * LHS
static constexpr roundingMode rmTowardZero
Definition APFloat.h:349
static constexpr roundingMode rmNearestTiesToEven
Definition APFloat.h:345
static const fltSemantics & IEEEhalf()
Definition APFloat.h:295
static APFloat getQNaN(const fltSemantics &Sem, bool Negative=false, const APInt *payload=nullptr)
Factory for QNaN values.
Definition APFloat.h:1200
opStatus divide(const APFloat &RHS, roundingMode RM)
Definition APFloat.h:1288
LLVM_ABI opStatus convert(const fltSemantics &ToSemantics, roundingMode RM, bool *losesInfo)
Definition APFloat.cpp:5920
bool bitwiseIsEqual(const APFloat &RHS) const
Definition APFloat.h:1524
bool isPosInfinity() const
Definition APFloat.h:1572
const fltSemantics & getSemantics() const
Definition APFloat.h:1567
APFloat makeQuiet() const
Assuming this is an IEEE-754 NaN value, quiet its signaling bit.
Definition APFloat.h:1396
bool isNaN() const
Definition APFloat.h:1557
bool isSignaling() const
Definition APFloat.h:1561
APInt bitcastToAPInt() const
Definition APFloat.h:1451
bool isNegInfinity() const
Definition APFloat.h:1573
static APFloat getZero(const fltSemantics &Sem, bool Negative=false)
Factory for Positive and Negative Zero.
Definition APFloat.h:1159
bool isInfinity() const
Definition APFloat.h:1556
Class for arbitrary precision integers.
Definition APInt.h:78
static APInt getAllOnes(unsigned numBits)
Return an APInt of a specified width with all bits set.
Definition APInt.h:235
void clearBit(unsigned BitPosition)
Set a given bit to 0.
Definition APInt.h:1429
uint64_t getZExtValue() const
Get zero extended value.
Definition APInt.h:1563
unsigned popcount() const
Count the number of bits set.
Definition APInt.h:1693
LLVM_ABI uint64_t extractBitsAsZExtValue(unsigned numBits, unsigned bitPosition) const
Definition APInt.cpp:521
unsigned getActiveBits() const
Compute the number of active bits in the value.
Definition APInt.h:1535
LLVM_ABI APInt trunc(unsigned width) const
Truncate to new width.
Definition APInt.cpp:968
unsigned countr_zero() const
Count the number of trailing zero bits.
Definition APInt.h:1662
bool isMask(unsigned numBits) const
Definition APInt.h:489
Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:40
ArrayRef< T > take_front(size_t N=1) const
Return a copy of *this with only the first N elements.
Definition ArrayRef.h:218
size_t size() const
Get the array size.
Definition ArrayRef.h:141
static LLVM_ABI Attribute getWithDereferenceableBytes(LLVMContext &Context, uint64_t Bytes)
LLVM_ABI const Module * getModule() const
Return the module owning the function this basic block belongs to, or nullptr if the function does no...
bool isTypeLegal(Type *Ty) const override
LLVM_ABI void getOperandBundlesAsDefs(SmallVectorImpl< OperandBundleDef > &Defs) const
Return the list of operand bundles attached to this instruction as a vector of OperandBundleDefs.
Function * getCalledFunction() const
Returns the function called, or null if this is an indirect function invocation or the function signa...
void setAttributes(AttributeList A)
Set the attributes for this call.
iterator_range< User::op_iterator > args()
Iteration adapter for range-for loops.
AttributeList getAttributes() const
Return the attributes for this call.
This class represents a function call, abstracting a target machine's calling convention.
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
Definition InstrTypes.h:740
@ ICMP_NE
not equal
Definition InstrTypes.h:762
bool isSigned() const
Definition InstrTypes.h:993
Predicate getSwappedPredicate() const
For example, EQ->EQ, SLE->SGE, ULT->UGT, OEQ->OEQ, ULE->UGE, OLT->OGT, etc.
Definition InstrTypes.h:890
bool isFPPredicate() const
Definition InstrTypes.h:845
Predicate getInversePredicate() const
For example, EQ -> NE, UGT -> ULE, SLT -> SGE, OEQ -> UNE, UGT -> OLE, OLT -> UGE,...
Definition InstrTypes.h:852
An abstraction over a floating-point predicate, and a pack of an integer predicate with samesign info...
ConstantFP - Floating Point Values [float, double].
Definition Constants.h:420
const APFloat & getValueAPF() const
Definition Constants.h:463
static LLVM_ABI ConstantFP * getZero(Type *Ty, bool Negative=false)
static LLVM_ABI ConstantFP * getNaN(Type *Ty, bool Negative=false, uint64_t Payload=0)
static LLVM_ABI ConstantFP * getInfinity(Type *Ty, bool Negative=false)
This is the shared class of boolean and integer constants.
Definition Constants.h:87
static ConstantInt * getSigned(IntegerType *Ty, int64_t V, bool ImplicitTrunc=false)
Return a ConstantInt with the specified value for the specified type.
Definition Constants.h:135
static LLVM_ABI ConstantInt * getFalse(LLVMContext &Context)
uint64_t getZExtValue() const
Return the constant as a 64-bit unsigned integer value after it has been zero extended as appropriate...
Definition Constants.h:168
const APInt & getValue() const
Return the constant as an APInt value reference.
Definition Constants.h:159
This class represents a range of values.
LLVM_ABI ConstantRange add(const ConstantRange &Other) const
Return a new range representing the possible values resulting from an addition of a value in this ran...
LLVM_ABI bool isFullSet() const
Return true if this set contains all of the elements possible for this data-type.
LLVM_ABI ConstantRange intersectWith(const ConstantRange &CR, PreferredRangeType Type=Smallest) const
Return the range that results from the intersection of this range with another range.
This is an important base class in LLVM.
Definition Constant.h:43
bool isNullValue() const
Return true if this is the value that would be returned by getNullValue.
Definition Constant.h:64
static LLVM_ABI Constant * getNullValue(Type *Ty)
Constructor to create a '0' constant of arbitrary type.
A parsed version of the target data layout string, and methods for querying it.
Definition DataLayout.h:64
TypeSize getTypeSizeInBits(Type *Ty) const
Size examples:
Definition DataLayout.h:791
LLVM_ABI bool dominates(const BasicBlock *BB, const Use &U) const
Return true if the (end of the) basic block BB dominates the use U.
Tagged union holding either a T or an Error.
Definition Error.h:485
This class represents an extension of floating point types.
Utility class for floating point operations which can have information about relaxed accuracy require...
Definition Operator.h:202
FastMathFlags getFastMathFlags() const
Convenience function for getting all the fast-math flags.
Definition Operator.h:291
bool hasApproxFunc() const
Test if this operation allows approximations of math library functions or intrinsics.
Definition Operator.h:288
LLVM_ABI float getFPAccuracy() const
Get the maximum error permitted by this operation in ULPs.
Convenience struct for specifying and reasoning about fast-math flags.
Definition FMF.h:23
bool allowContract() const
Definition FMF.h:69
static LLVM_ABI FixedVectorType * get(Type *ElementType, unsigned NumElts)
Definition Type.cpp:867
bool simplifyDemandedLaneMaskArg(InstCombiner &IC, IntrinsicInst &II, unsigned LaneAgIdx) const
Simplify a lane index operand (e.g.
std::optional< Instruction * > instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const override
Instruction * hoistLaneIntrinsicThroughOperand(InstCombiner &IC, IntrinsicInst &II) const
std::optional< Value * > simplifyDemandedVectorEltsIntrinsic(InstCombiner &IC, IntrinsicInst &II, APInt DemandedElts, APInt &UndefElts, APInt &UndefElts2, APInt &UndefElts3, std::function< void(Instruction *, unsigned, APInt, APInt &)> SimplifyAndSetOp) const override
KnownIEEEMode fpenvIEEEMode(const Instruction &I) const
Return KnownIEEEMode::On if we know if the use context can assume "amdgpu-ieee"="true" and KnownIEEEM...
Value * simplifyAMDGCNLaneIntrinsicDemanded(InstCombiner &IC, IntrinsicInst &II, const APInt &DemandedElts, APInt &UndefElts) const
bool canSimplifyLegacyMulToMul(const Instruction &I, const Value *Op0, const Value *Op1, InstCombiner &IC) const
Common base class shared among various IRBuilders.
Definition IRBuilder.h:114
LLVM_ABI CallInst * CreateIntrinsicWithoutFolding(Intrinsic::ID ID, ArrayRef< Type * > OverloadTypes, ArrayRef< Value * > Args, FMFSource FMFSource={}, const Twine &Name="", ArrayRef< OperandBundleDef > OpBundles={})
Create a call to intrinsic ID with Args, mangled using OverloadTypes.
Value * CreateInsertElement(Type *VecTy, Value *NewElt, Value *Idx, const Twine &Name="")
Definition IRBuilder.h:2617
Value * CreateExtractElement(Value *Vec, Value *Idx, const Twine &Name="")
Definition IRBuilder.h:2605
IntegerType * getIntNTy(unsigned N)
Fetch the type representing an N-bit integer.
Definition IRBuilder.h:547
Value * CreateZExtOrTrunc(Value *V, Type *DestTy, const Twine &Name="")
Create a ZExt or Trunc from the integer value V to DestTy.
Definition IRBuilder.h:2128
ConstantInt * getTrue()
Get the constant value for i1 true.
Definition IRBuilder.h:457
Value * CreateSExt(Value *V, Type *DestTy, const Twine &Name="")
Definition IRBuilder.h:2122
Value * CreateLShr(Value *LHS, Value *RHS, const Twine &Name="", bool isExact=false)
Definition IRBuilder.h:1532
Value * CreateExtractVector(Type *DstType, Value *SrcVec, Value *Idx, const Twine &Name="")
Create a call to the vector.extract intrinsic.
Definition IRBuilder.h:1112
BasicBlock * GetInsertBlock() const
Definition IRBuilder.h:175
Value * CreateICmpNE(Value *LHS, Value *RHS, const Twine &Name="")
Definition IRBuilder.h:2368
ConstantInt * getInt64(uint64_t C)
Get a constant 64-bit value.
Definition IRBuilder.h:482
Value * CreateMaxNum(Value *LHS, Value *RHS, FMFSource FMFSource={}, const Twine &Name="")
Create call to the maxnum intrinsic.
Definition IRBuilder.h:1043
Value * CreateShl(Value *LHS, Value *RHS, const Twine &Name="", bool HasNUW=false, bool HasNSW=false)
Definition IRBuilder.h:1511
Value * CreateZExt(Value *V, Type *DestTy, const Twine &Name="", bool IsNonNeg=false)
Definition IRBuilder.h:2110
Value * CreateShuffleVector(Value *V1, Value *V2, Value *Mask, const Twine &Name="")
Definition IRBuilder.h:2639
LLVM_ABI Value * CreateIntrinsic(Intrinsic::ID ID, ArrayRef< Type * > OverloadTypes, ArrayRef< Value * > Args, FMFSource FMFSource={}, const Twine &Name="", ArrayRef< OperandBundleDef > OpBundles={}, function_ref< void(CallInst *)> SetFn=[](CallInst *) {})
Variant to create a possibly constant-folded intrinsic.
Value * CreateMaximumNum(Value *LHS, Value *RHS, const Twine &Name="")
Create call to the maximumnum intrinsic.
Definition IRBuilder.h:1071
Value * CreateMinNum(Value *LHS, Value *RHS, FMFSource FMFSource={}, const Twine &Name="")
Create call to the minnum intrinsic.
Definition IRBuilder.h:1031
Value * CreateAdd(Value *LHS, Value *RHS, const Twine &Name="", bool HasNUW=false, bool HasNSW=false)
Definition IRBuilder.h:1422
CallInst * CreateCall(FunctionType *FTy, Value *Callee, ArrayRef< Value * > Args={}, const Twine &Name="", MDNode *FPMathTag=nullptr)
Definition IRBuilder.h:2543
void SetInsertPoint(BasicBlock *TheBB)
This specifies that created instructions should be appended to the end of the specified block.
Definition IRBuilder.h:181
Value * CreateFAddFMF(Value *L, Value *R, FMFSource FMFSource, const Twine &Name="", MDNode *FPMD=nullptr)
Definition IRBuilder.h:1641
Value * CreateMinimumNum(Value *LHS, Value *RHS, const Twine &Name="")
Create call to the minimumnum intrinsic.
Definition IRBuilder.h:1065
Value * CreateAShr(Value *LHS, Value *RHS, const Twine &Name="", bool isExact=false)
Definition IRBuilder.h:1551
Value * CreateFMulFMF(Value *L, Value *R, FMFSource FMFSource, const Twine &Name="", MDNode *FPMD=nullptr)
Definition IRBuilder.h:1679
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
Definition IRBuilder.h:2848
The core instruction combiner logic.
const DataLayout & getDataLayout() const
virtual Instruction * eraseInstFromFunction(Instruction &I)=0
Combiner aware instruction erasure.
DominatorTree & getDominatorTree() const
Instruction * replaceInstUsesWith(Instruction &I, Value *V)
A combiner-aware RAUW-like routine.
virtual bool SimplifyDemandedBits(Instruction *I, unsigned OpNo, const APInt &DemandedMask, KnownBits &Known, const SimplifyQuery &Q, unsigned Depth=0)=0
IRBuilder< TargetFolder, IRBuilderInstCombineInserter > BuilderTy
An IRBuilder that automatically inserts new instructions into the worklist.
static Value * stripSignOnlyFPOps(Value *Val)
Ignore all operations which only change the sign of a value, returning the underlying magnitude value...
Instruction * replaceOperand(Instruction &I, unsigned OpNum, Value *V)
Replace operand of instruction and add old operand to the worklist.
const SimplifyQuery & getSimplifyQuery() const
LLVM_ABI Instruction * clone() const
Create a copy of 'this' instruction that is identical in all ways except the following:
LLVM_ABI void copyFastMathFlags(FastMathFlags FMF)
Convenience function for transferring all fast-math flag values to this instruction,...
LLVM_ABI void copyMetadata(const Instruction &SrcInst, ArrayRef< unsigned > WL=ArrayRef< unsigned >())
Copy metadata from SrcInst to this instruction.
Class to represent integer types.
A wrapper class for inspecting calls to intrinsic functions.
A Module instance is used to store all the information related to an LLVM module.
Definition Module.h:67
static LLVM_ABI PoisonValue * get(Type *T)
Static factory methods - Return a 'poison' object of the specified type.
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
reference emplace_back(ArgTypes &&... Args)
void resize(size_type N)
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
The instances of the Type class are immutable: once they are created, they are never changed.
Definition Type.h:46
bool isPointerTy() const
True if this is an instance of PointerType.
Definition Type.h:282
bool isFloatTy() const
Return true if this is 'float', a 32-bit IEEE fp type.
Definition Type.h:155
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
Definition Type.h:368
static LLVM_ABI IntegerType * getInt16Ty(LLVMContext &C)
Definition Type.cpp:308
bool isHalfTy() const
Return true if this is 'half', a 16-bit IEEE fp type.
Definition Type.h:144
LLVM_ABI Type * getWithNewType(Type *EltTy) const
Given vector type, change the element type, whilst keeping the old number of elements.
bool isFloatingPointTy() const
Return true if this is one of the floating-point types.
Definition Type.h:186
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition Type.h:257
static LLVM_ABI Type * getHalfTy(LLVMContext &C)
Definition Type.cpp:284
bool isVoidTy() const
Return true if this is 'void'.
Definition Type.h:141
static LLVM_ABI UndefValue * get(Type *T)
Static factory methods - Return an 'undef' object of the specified type.
A Use represents the edge between a Value definition and its users.
Definition Use.h:35
const Use & getOperandUse(unsigned i) const
Definition User.h:220
void setOperand(unsigned i, Value *Val)
Definition User.h:212
Value * getOperand(unsigned i) const
Definition User.h:207
LLVM Value Representation.
Definition Value.h:75
Type * getType() const
All values are typed, get the type of this value.
Definition Value.h:255
LLVM_ABI bool hasOneUser() const
Return true if there is exactly one user of this value.
Definition Value.cpp:163
LLVMContext & getContext() const
All values hold a context through their type.
Definition Value.h:258
LLVM_ABI void takeName(Value *V)
Transfer the name from V to this value.
Definition Value.cpp:400
const ParentTy * getParent() const
Definition ilist_node.h:34
CallInst * Call
Changed
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
LLVM_READONLY const MIMGOffsetMappingInfo * getMIMGOffsetMappingInfo(unsigned Offset)
uint8_t wmmaScaleF8F6F4FormatToNumRegs(unsigned Fmt)
const ImageDimIntrinsicInfo * getImageDimIntrinsicByBaseOpcode(unsigned BaseOpcode, unsigned Dim)
LLVM_READONLY const MIMGMIPMappingInfo * getMIMGMIPMappingInfo(unsigned MIP)
bool isArgPassedInSGPR(const Argument *A)
bool isIntrinsicAlwaysUniform(unsigned IntrID)
LLVM_READONLY const MIMGBiasMappingInfo * getMIMGBiasMappingInfo(unsigned Bias)
LLVM_READONLY const MIMGLZMappingInfo * getMIMGLZMappingInfo(unsigned L)
LLVM_READONLY const MIMGBaseOpcodeInfo * getMIMGBaseOpcodeInfo(unsigned BaseOpcode)
const ImageDimIntrinsicInfo * getImageDimIntrinsicInfo(unsigned Intr)
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
LLVM_ABI Function * getOrInsertDeclaration(Module *M, ID id, ArrayRef< Type * > OverloadTys={})
Look up the Function declaration of the intrinsic id in the Module M.
LLVM_ABI bool isSignatureValid(Intrinsic::ID ID, FunctionType *FT, SmallVectorImpl< Type * > &OverloadTys, raw_ostream &OS=nulls())
Returns true if FT is a valid function type for intrinsic ID.
OneUse_match< SubPat > m_OneUse(const SubPat &SP)
cst_pred_ty< is_all_ones > m_AllOnes()
Match an integer or vector with all bits set.
auto m_Cmp()
Match any compare instruction and ignore it.
bool match(Val *V, const Pattern &P)
cstfp_pred_ty< is_any_zero_fp > m_AnyZeroFP()
Match a floating-point negative zero or positive zero.
ap_match< APFloat > m_APFloat(const APFloat *&Res)
Match a ConstantFP or splatted ConstantVector, binding the specified pointer to the contained APFloat...
cst_pred_ty< is_one > m_One()
Match an integer 1 or a vector with all elements equal to 1.
IntrinsicID_match m_Intrinsic()
Match intrinsic calls like this: m_Intrinsic<Intrinsic::fabs>(m_Value(X))
auto m_Value()
Match an arbitrary value and ignore it.
CastInst_match< OpTy, FPExtInst > m_FPExt(const OpTy &Op)
CastInst_match< OpTy, ZExtInst > m_ZExt(const OpTy &Op)
Matches ZExt.
cstfp_pred_ty< is_finitenonzero > m_FiniteNonZero()
Match a finite non-zero FP constant.
match_combine_or< CastInst_match< OpTy, ZExtInst >, CastInst_match< OpTy, SExtInst > > m_ZExtOrSExt(const OpTy &Op)
auto m_ConstantFP()
Match an arbitrary ConstantFP and ignore it.
CastInst_match< OpTy, SExtInst > m_SExt(const OpTy &Op)
Matches SExt.
is_zero m_Zero()
Match any null constant or a vector with all elements equal to 0.
auto m_ConstantInt()
Match an arbitrary ConstantInt and ignore it.
This is an optimization pass for GlobalISel generic memory operations.
@ Offset
Definition DWP.cpp:573
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1739
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B,...
Definition STLExtras.h:2554
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:643
LLVM_ABI Constant * ConstantFoldCompareInstOperands(unsigned Predicate, Constant *LHS, Constant *RHS, const DataLayout &DL, const TargetLibraryInfo *TLI=nullptr, const Instruction *I=nullptr)
Attempt to constant fold a compare instruction (icmp/fcmp) with the specified operands.
constexpr int popcount(T Value) noexcept
Count the number of set bits in a value.
Definition bit.h:156
APFloat frexp(const APFloat &X, int &Exp, APFloat::roundingMode RM)
Equivalent of C standard library function.
Definition APFloat.h:1689
auto dyn_cast_or_null(const Y &Val)
Definition Casting.h:753
LLVM_READONLY APFloat maxnum(const APFloat &A, const APFloat &B)
Implements IEEE-754 2008 maxNum semantics.
Definition APFloat.h:1732
constexpr unsigned MaxAnalysisRecursionDepth
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
Definition MathExtras.h:279
APFloat scalbn(APFloat X, int Exp, APFloat::roundingMode RM)
Returns: X * 2^Exp for integral exponents.
Definition APFloat.h:1677
constexpr uint32_t Hi_32(uint64_t Value)
Return the high 32 bits of a 64 bit value.
Definition MathExtras.h:150
constexpr uint32_t Lo_32(uint64_t Value)
Return the low 32 bits of a 64 bit value.
Definition MathExtras.h:155
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition Casting.h:547
constexpr int PoisonMaskElem
@ FMul
Product of floats.
@ FAdd
Sum of floats.
LLVM_ABI Value * findScalarElement(Value *V, unsigned EltNo)
Given a vector and an element number, see if the scalar value is already around as a register,...
@ NearestTiesToEven
roundTiesToEven.
LLVM_ABI bool isKnownNeverInfOrNaN(const Value *V, const SimplifyQuery &SQ, unsigned Depth=0)
Return true if the floating-point value can never contain a NaN or infinity.
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:559
bool all_equal(std::initializer_list< T > Values)
Returns true if all Values in the initializer list are equal or the list is empty.
Definition STLExtras.h:2166
auto seq(T Begin, T End)
Iterate over an integral type from Begin up to - but not including - End.
Definition Sequence.h:305
constexpr T maskTrailingOnes(unsigned N)
Create a bitmask with the N right-most bits set to 1, and all other bits set to 0.
Definition MathExtras.h:77
LLVM_ABI Constant * ConstantFoldInstOperands(const Instruction *I, ArrayRef< Constant * > Ops, const DataLayout &DL, const TargetLibraryInfo *TLI=nullptr, bool AllowNonDeterministic=true)
ConstantFoldInstOperands - Attempt to constant fold an instruction with the specified operands.
constexpr uint64_t Make_64(uint32_t High, uint32_t Low)
Make a 64-bit integer from a high / low pair of 32-bit integers.
Definition MathExtras.h:160
LLVM_ABI ConstantRange computeConstantRange(const Value *V, bool ForSigned, const SimplifyQuery &SQ, unsigned Depth=0)
Determine the possible constant range of an integer or vector of integer value.
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition BitVector.h:862
#define N
bool isConstant() const
Returns true if we know the value of all bits.
Definition KnownBits.h:54
const APInt & getConstant() const
Returns the value when all bits have a known value.
Definition KnownBits.h:58
SimplifyQuery getWithInstruction(const Instruction *I) const
LLVM_ABI bool isUndefValue(Value *V) const
If CanUseUndef is true, returns whether V is undef.