LLVM 23.0.0git
AMDGPUInstCombineIntrinsic.cpp
Go to the documentation of this file.
1//===- AMDGPUInstCombineIntrinsic.cpp - AMDGPU specific InstCombine pass --===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// \file
10// This file implements a TargetTransformInfo analysis pass specific to the
11// AMDGPU target machine. It uses the target's detailed information to provide
12// more precise answers to certain TTI queries, while letting the target
13// independent and default TTI implementations handle the rest.
14//
15//===----------------------------------------------------------------------===//
16
17#include "AMDGPUInstrInfo.h"
19#include "GCNSubtarget.h"
20#include "SIDefines.h"
22#include "llvm/ADT/STLExtras.h"
23#include "llvm/ADT/Sequence.h"
26#include "llvm/IR/Constants.h"
27#include "llvm/IR/Dominators.h"
28#include "llvm/IR/IntrinsicsAMDGPU.h"
31#include <optional>
32
33using namespace llvm;
34using namespace llvm::PatternMatch;
35
36#define DEBUG_TYPE "AMDGPUtti"
37
38namespace {
39
40struct AMDGPUImageDMaskIntrinsic {
41 unsigned Intr;
42};
43
44#define GET_AMDGPUImageDMaskIntrinsicTable_IMPL
45#include "AMDGPUGenSearchableTables.inc"
46
47} // end anonymous namespace
48
49// Constant fold llvm.amdgcn.fmed3 intrinsics for standard inputs.
50//
51// A single NaN input is folded to minnum, so we rely on that folding for
52// handling NaNs.
53static APFloat fmed3AMDGCN(const APFloat &Src0, const APFloat &Src1,
54 const APFloat &Src2) {
55 assert(!Src0.isNaN() && !Src1.isNaN() && !Src2.isNaN() &&
56 "nans handled separately");
57 APFloat Max3 = maxnum(maxnum(Src0, Src1), Src2);
58
59 if (Max3.bitwiseIsEqual(Src0))
60 return maxnum(Src1, Src2);
61
62 if (Max3.bitwiseIsEqual(Src1))
63 return maxnum(Src0, Src2);
64
65 return maxnum(Src0, Src1);
66}
67
68// Check if a value can be converted to a 16-bit value without losing
69// precision.
70// The value is expected to be either a float (IsFloat = true) or an unsigned
71// integer (IsFloat = false).
72static bool canSafelyConvertTo16Bit(Value &V, bool IsFloat) {
73 Type *VTy = V.getType();
74 if (VTy->isHalfTy() || VTy->isIntegerTy(16)) {
75 // The value is already 16-bit, so we don't want to convert to 16-bit again!
76 return false;
77 }
78 if (IsFloat) {
79 if (ConstantFP *ConstFloat = dyn_cast<ConstantFP>(&V)) {
80 // We need to check that if we cast the index down to a half, we do not
81 // lose precision.
82 APFloat FloatValue(ConstFloat->getValueAPF());
83 bool LosesInfo = true;
85 &LosesInfo);
86 return !LosesInfo;
87 }
88 } else {
89 if (ConstantInt *ConstInt = dyn_cast<ConstantInt>(&V)) {
90 // We need to check that if we cast the index down to an i16, we do not
91 // lose precision.
92 APInt IntValue(ConstInt->getValue());
93 return IntValue.getActiveBits() <= 16;
94 }
95 }
96
97 Value *CastSrc;
98 bool IsExt = IsFloat ? match(&V, m_FPExt(PatternMatch::m_Value(CastSrc)))
99 : match(&V, m_ZExt(PatternMatch::m_Value(CastSrc)));
100 if (IsExt) {
101 Type *CastSrcTy = CastSrc->getType();
102 if (CastSrcTy->isHalfTy() || CastSrcTy->isIntegerTy(16))
103 return true;
104 }
105
106 return false;
107}
108
109// Convert a value to 16-bit.
111 Type *VTy = V.getType();
113 return cast<Instruction>(&V)->getOperand(0);
114 if (VTy->isIntegerTy())
115 return Builder.CreateIntCast(&V, Type::getInt16Ty(V.getContext()), false);
116 if (VTy->isFloatingPointTy())
117 return Builder.CreateFPCast(&V, Type::getHalfTy(V.getContext()));
118
119 llvm_unreachable("Should never be called!");
120}
121
122/// Applies Func(OldIntr.Args, OldIntr.ArgTys), creates intrinsic call with
123/// modified arguments (based on OldIntr) and replaces InstToReplace with
124/// this newly created intrinsic call.
125static std::optional<Instruction *> modifyIntrinsicCall(
126 IntrinsicInst &OldIntr, Instruction &InstToReplace, unsigned NewIntr,
127 InstCombiner &IC,
128 std::function<void(SmallVectorImpl<Value *> &, SmallVectorImpl<Type *> &)>
129 Func) {
130 SmallVector<Type *, 4> OverloadTys;
131 if (!Intrinsic::isSignatureValid(OldIntr.getCalledFunction(), OverloadTys))
132 return std::nullopt;
133
134 SmallVector<Value *, 8> Args(OldIntr.args());
135
136 // Modify arguments and types
137 Func(Args, OverloadTys);
138
139 CallInst *NewCall = IC.Builder.CreateIntrinsic(NewIntr, OverloadTys, Args);
140 NewCall->takeName(&OldIntr);
141 NewCall->copyMetadata(OldIntr);
142 if (isa<FPMathOperator>(NewCall))
143 NewCall->copyFastMathFlags(&OldIntr);
144 // Copy attributes
145 AttributeList OldAttrList = OldIntr.getAttributes();
146 NewCall->setAttributes(OldAttrList);
147
148 // Erase and replace uses
149 if (!InstToReplace.getType()->isVoidTy())
150 IC.replaceInstUsesWith(InstToReplace, NewCall);
151
152 bool RemoveOldIntr = &OldIntr != &InstToReplace;
153
154 auto *RetValue = IC.eraseInstFromFunction(InstToReplace);
155 if (RemoveOldIntr)
156 IC.eraseInstFromFunction(OldIntr);
157
158 return RetValue;
159}
160
161static std::optional<Instruction *>
163 const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr,
165 // Optimize _L to _LZ when _L is zero
166 if (const auto *LZMappingInfo =
168 if (auto *ConstantLod =
169 dyn_cast<ConstantFP>(II.getOperand(ImageDimIntr->LodIndex))) {
170 if (ConstantLod->isZero() || ConstantLod->isNegative()) {
171 const AMDGPU::ImageDimIntrinsicInfo *NewImageDimIntr =
173 ImageDimIntr->Dim);
174 return modifyIntrinsicCall(
175 II, II, NewImageDimIntr->Intr, IC, [&](auto &Args, auto &ArgTys) {
176 Args.erase(Args.begin() + ImageDimIntr->LodIndex);
177 });
178 }
179 }
180 }
181
182 // Optimize _mip away, when 'lod' is zero
183 if (const auto *MIPMappingInfo =
185 if (auto *ConstantMip =
186 dyn_cast<ConstantInt>(II.getOperand(ImageDimIntr->MipIndex))) {
187 if (ConstantMip->isZero()) {
188 const AMDGPU::ImageDimIntrinsicInfo *NewImageDimIntr =
189 AMDGPU::getImageDimIntrinsicByBaseOpcode(MIPMappingInfo->NONMIP,
190 ImageDimIntr->Dim);
191 return modifyIntrinsicCall(
192 II, II, NewImageDimIntr->Intr, IC, [&](auto &Args, auto &ArgTys) {
193 Args.erase(Args.begin() + ImageDimIntr->MipIndex);
194 });
195 }
196 }
197 }
198
199 // Optimize _bias away when 'bias' is zero
200 if (const auto *BiasMappingInfo =
202 if (auto *ConstantBias =
203 dyn_cast<ConstantFP>(II.getOperand(ImageDimIntr->BiasIndex))) {
204 if (ConstantBias->isZero()) {
205 const AMDGPU::ImageDimIntrinsicInfo *NewImageDimIntr =
206 AMDGPU::getImageDimIntrinsicByBaseOpcode(BiasMappingInfo->NoBias,
207 ImageDimIntr->Dim);
208 return modifyIntrinsicCall(
209 II, II, NewImageDimIntr->Intr, IC, [&](auto &Args, auto &ArgTys) {
210 Args.erase(Args.begin() + ImageDimIntr->BiasIndex);
211 ArgTys.erase(ArgTys.begin() + ImageDimIntr->BiasTyArg);
212 });
213 }
214 }
215 }
216
217 // Optimize _offset away when 'offset' is zero
218 if (const auto *OffsetMappingInfo =
220 if (auto *ConstantOffset =
221 dyn_cast<ConstantInt>(II.getOperand(ImageDimIntr->OffsetIndex))) {
222 if (ConstantOffset->isZero()) {
223 const AMDGPU::ImageDimIntrinsicInfo *NewImageDimIntr =
225 OffsetMappingInfo->NoOffset, ImageDimIntr->Dim);
226 return modifyIntrinsicCall(
227 II, II, NewImageDimIntr->Intr, IC, [&](auto &Args, auto &ArgTys) {
228 Args.erase(Args.begin() + ImageDimIntr->OffsetIndex);
229 });
230 }
231 }
232 }
233
234 // Try to use D16
235 if (ST->hasD16Images()) {
236
237 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
239
240 if (BaseOpcode->HasD16) {
241
242 // If the only use of image intrinsic is a fptrunc (with conversion to
243 // half) then both fptrunc and image intrinsic will be replaced with image
244 // intrinsic with D16 flag.
245 if (II.hasOneUse()) {
246 Instruction *User = II.user_back();
247
248 if (User->getOpcode() == Instruction::FPTrunc &&
250
251 return modifyIntrinsicCall(II, *User, ImageDimIntr->Intr, IC,
252 [&](auto &Args, auto &ArgTys) {
253 // Change return type of image intrinsic.
254 // Set it to return type of fptrunc.
255 ArgTys[0] = User->getType();
256 });
257 }
258 }
259
260 // Only perform D16 folding if every user of the image sample is
261 // an ExtractElementInst immediately followed by an FPTrunc to half.
263 ExtractTruncPairs;
264 bool AllHalfExtracts = true;
265
266 for (User *U : II.users()) {
267 auto *Ext = dyn_cast<ExtractElementInst>(U);
268 if (!Ext || !Ext->hasOneUse()) {
269 AllHalfExtracts = false;
270 break;
271 }
272
273 auto *Tr = dyn_cast<FPTruncInst>(*Ext->user_begin());
274 if (!Tr || !Tr->getType()->isHalfTy()) {
275 AllHalfExtracts = false;
276 break;
277 }
278
279 ExtractTruncPairs.emplace_back(Ext, Tr);
280 }
281
282 if (!ExtractTruncPairs.empty() && AllHalfExtracts) {
283 auto *VecTy = cast<VectorType>(II.getType());
284 Type *HalfVecTy =
285 VecTy->getWithNewType(Type::getHalfTy(II.getContext()));
286
287 // Obtain the original image sample intrinsic's signature
288 // and replace its return type with the half-vector for D16 folding
289 SmallVector<Type *, 8> OverloadTys;
290 if (!Intrinsic::isSignatureValid(II.getCalledFunction(), OverloadTys))
291 return std::nullopt;
292
293 OverloadTys[0] = HalfVecTy;
294 Module *M = II.getModule();
296 M, ImageDimIntr->Intr, OverloadTys);
297
298 II.mutateType(HalfVecTy);
299 II.setCalledFunction(HalfDecl);
300
301 IRBuilder<> Builder(II.getContext());
302 for (auto &[Ext, Tr] : ExtractTruncPairs) {
303 Value *Idx = Ext->getIndexOperand();
304
305 Builder.SetInsertPoint(Tr);
306
307 Value *HalfExtract = Builder.CreateExtractElement(&II, Idx);
308 HalfExtract->takeName(Tr);
309
310 Tr->replaceAllUsesWith(HalfExtract);
311 }
312
313 for (auto &[Ext, Tr] : ExtractTruncPairs) {
314 IC.eraseInstFromFunction(*Tr);
315 IC.eraseInstFromFunction(*Ext);
316 }
317
318 return &II;
319 }
320 }
321 }
322
323 // Try to use A16 or G16
324 if (!ST->hasA16() && !ST->hasG16())
325 return std::nullopt;
326
327 // Address is interpreted as float if the instruction has a sampler or as
328 // unsigned int if there is no sampler.
329 bool HasSampler =
331 bool FloatCoord = false;
332 // true means derivatives can be converted to 16 bit, coordinates not
333 bool OnlyDerivatives = false;
334
335 for (unsigned OperandIndex = ImageDimIntr->GradientStart;
336 OperandIndex < ImageDimIntr->VAddrEnd; OperandIndex++) {
337 Value *Coord = II.getOperand(OperandIndex);
338 // If the values are not derived from 16-bit values, we cannot optimize.
339 if (!canSafelyConvertTo16Bit(*Coord, HasSampler)) {
340 if (OperandIndex < ImageDimIntr->CoordStart ||
341 ImageDimIntr->GradientStart == ImageDimIntr->CoordStart) {
342 return std::nullopt;
343 }
344 // All gradients can be converted, so convert only them
345 OnlyDerivatives = true;
346 break;
347 }
348
349 assert(OperandIndex == ImageDimIntr->GradientStart ||
350 FloatCoord == Coord->getType()->isFloatingPointTy());
351 FloatCoord = Coord->getType()->isFloatingPointTy();
352 }
353
354 if (!OnlyDerivatives && !ST->hasA16())
355 OnlyDerivatives = true; // Only supports G16
356
357 // Check if there is a bias parameter and if it can be converted to f16
358 if (!OnlyDerivatives && ImageDimIntr->NumBiasArgs != 0) {
359 Value *Bias = II.getOperand(ImageDimIntr->BiasIndex);
360 assert(HasSampler &&
361 "Only image instructions with a sampler can have a bias");
362 if (!canSafelyConvertTo16Bit(*Bias, HasSampler))
363 OnlyDerivatives = true;
364 }
365
366 if (OnlyDerivatives && (!ST->hasG16() || ImageDimIntr->GradientStart ==
367 ImageDimIntr->CoordStart))
368 return std::nullopt;
369
370 Type *CoordType = FloatCoord ? Type::getHalfTy(II.getContext())
371 : Type::getInt16Ty(II.getContext());
372
373 return modifyIntrinsicCall(
374 II, II, II.getIntrinsicID(), IC, [&](auto &Args, auto &ArgTys) {
375 ArgTys[ImageDimIntr->GradientTyArg] = CoordType;
376 if (!OnlyDerivatives) {
377 ArgTys[ImageDimIntr->CoordTyArg] = CoordType;
378
379 // Change the bias type
380 if (ImageDimIntr->NumBiasArgs != 0)
381 ArgTys[ImageDimIntr->BiasTyArg] = Type::getHalfTy(II.getContext());
382 }
383
384 unsigned EndIndex =
385 OnlyDerivatives ? ImageDimIntr->CoordStart : ImageDimIntr->VAddrEnd;
386 for (unsigned OperandIndex = ImageDimIntr->GradientStart;
387 OperandIndex < EndIndex; OperandIndex++) {
388 Args[OperandIndex] =
389 convertTo16Bit(*II.getOperand(OperandIndex), IC.Builder);
390 }
391
392 // Convert the bias
393 if (!OnlyDerivatives && ImageDimIntr->NumBiasArgs != 0) {
394 Value *Bias = II.getOperand(ImageDimIntr->BiasIndex);
395 Args[ImageDimIntr->BiasIndex] = convertTo16Bit(*Bias, IC.Builder);
396 }
397 });
398}
399
401 const Value *Op0, const Value *Op1,
402 InstCombiner &IC) const {
403 // The legacy behaviour is that multiplying +/-0.0 by anything, even NaN or
404 // infinity, gives +0.0. If we can prove we don't have one of the special
405 // cases then we can use a normal multiply instead.
406 // TODO: Create and use isKnownFiniteNonZero instead of just matching
407 // constants here.
410 // One operand is not zero or infinity or NaN.
411 return true;
412 }
413
415 if (isKnownNeverInfOrNaN(Op0, SQ) && isKnownNeverInfOrNaN(Op1, SQ)) {
416 // Neither operand is infinity or NaN.
417 return true;
418 }
419 return false;
420}
421
422/// Match an fpext from half to float, or a constant we can convert.
424 Value *Src = nullptr;
425 ConstantFP *CFP = nullptr;
426 if (match(Arg, m_OneUse(m_FPExt(m_Value(Src))))) {
427 if (Src->getType()->isHalfTy())
428 return Src;
429 } else if (match(Arg, m_ConstantFP(CFP))) {
430 bool LosesInfo;
431 APFloat Val(CFP->getValueAPF());
433 if (!LosesInfo)
434 return ConstantFP::get(Type::getHalfTy(Arg->getContext()), Val);
435 }
436 return nullptr;
437}
438
439// Trim all zero components from the end of the vector \p UseV and return
440// an appropriate bitset with known elements.
442 Instruction *I) {
443 auto *VTy = cast<FixedVectorType>(UseV->getType());
444 unsigned VWidth = VTy->getNumElements();
445 APInt DemandedElts = APInt::getAllOnes(VWidth);
446
447 for (int i = VWidth - 1; i > 0; --i) {
448 auto *Elt = findScalarElement(UseV, i);
449 if (!Elt)
450 break;
451
452 if (auto *ConstElt = dyn_cast<Constant>(Elt)) {
453 if (!ConstElt->isNullValue() && !isa<UndefValue>(Elt))
454 break;
455 } else {
456 break;
457 }
458
459 DemandedElts.clearBit(i);
460 }
461
462 return DemandedElts;
463}
464
465// Trim elements of the end of the vector \p V, if they are
466// equal to the first element of the vector.
468 auto *VTy = cast<FixedVectorType>(V->getType());
469 unsigned VWidth = VTy->getNumElements();
470 APInt DemandedElts = APInt::getAllOnes(VWidth);
471 Value *FirstComponent = findScalarElement(V, 0);
472
473 SmallVector<int> ShuffleMask;
474 if (auto *SVI = dyn_cast<ShuffleVectorInst>(V))
475 SVI->getShuffleMask(ShuffleMask);
476
477 for (int I = VWidth - 1; I > 0; --I) {
478 if (ShuffleMask.empty()) {
479 auto *Elt = findScalarElement(V, I);
480 if (!Elt || (Elt != FirstComponent && !isa<UndefValue>(Elt)))
481 break;
482 } else {
483 // Detect identical elements in the shufflevector result, even though
484 // findScalarElement cannot tell us what that element is.
485 if (ShuffleMask[I] != ShuffleMask[0] && ShuffleMask[I] != PoisonMaskElem)
486 break;
487 }
488 DemandedElts.clearBit(I);
489 }
490
491 return DemandedElts;
492}
493
496 APInt DemandedElts,
497 int DMaskIdx = -1,
498 bool IsLoad = true);
499
500/// Return true if it's legal to contract llvm.amdgcn.rcp(llvm.sqrt)
501static bool canContractSqrtToRsq(const FPMathOperator *SqrtOp) {
502 return (SqrtOp->getType()->isFloatTy() &&
503 (SqrtOp->hasApproxFunc() || SqrtOp->getFPAccuracy() >= 1.0f)) ||
504 SqrtOp->getType()->isHalfTy();
505}
506
507/// Return true if we can easily prove that use U is uniform.
508static bool isTriviallyUniform(const Use &U) {
509 Value *V = U.get();
510 if (isa<Constant>(V))
511 return true;
512 if (const auto *A = dyn_cast<Argument>(V))
514 if (const auto *II = dyn_cast<IntrinsicInst>(V)) {
515 if (!AMDGPU::isIntrinsicAlwaysUniform(II->getIntrinsicID()))
516 return false;
517 // If II and U are in different blocks then there is a possibility of
518 // temporal divergence.
519 return II->getParent() == cast<Instruction>(U.getUser())->getParent();
520 }
521 return false;
522}
523
524/// Simplify a lane index operand (e.g. llvm.amdgcn.readlane src1).
525///
526/// The instruction only reads the low 5 bits for wave32, and 6 bits for wave64.
529 unsigned LaneArgIdx) const {
530 unsigned MaskBits = ST->getWavefrontSizeLog2();
531 APInt DemandedMask(32, maskTrailingOnes<unsigned>(MaskBits));
532
533 KnownBits Known(32);
534 if (IC.SimplifyDemandedBits(&II, LaneArgIdx, DemandedMask, Known))
535 return true;
536
537 if (!Known.isConstant())
538 return false;
539
540 // Out of bounds indexes may appear in wave64 code compiled for wave32.
541 // Unlike the DAG version, SimplifyDemandedBits does not change constants, so
542 // manually fix it up.
543
544 Value *LaneArg = II.getArgOperand(LaneArgIdx);
545 Constant *MaskedConst =
546 ConstantInt::get(LaneArg->getType(), Known.getConstant() & DemandedMask);
547 if (MaskedConst != LaneArg) {
548 II.getOperandUse(LaneArgIdx).set(MaskedConst);
549 return true;
550 }
551
552 return false;
553}
554
556 Function &NewCallee, ArrayRef<Value *> Ops) {
558 Old.getOperandBundlesAsDefs(OpBundles);
559
560 CallInst *NewCall = B.CreateCall(&NewCallee, Ops, OpBundles);
561 NewCall->takeName(&Old);
562 return NewCall;
563}
564
565// Return true for sequences of instructions that effectively assign
566// each lane to its thread ID
567static bool isThreadID(const GCNSubtarget &ST, Value *V) {
568 // Case 1:
569 // wave32: mbcnt_lo(-1, 0)
570 // wave64: mbcnt_hi(-1, mbcnt_lo(-1, 0))
576 if (ST.isWave32() && match(V, W32Pred))
577 return true;
578 if (ST.isWave64() && match(V, W64Pred))
579 return true;
580
581 return false;
582}
583
586 IntrinsicInst &II) const {
587 const auto IID = II.getIntrinsicID();
588 assert(IID == Intrinsic::amdgcn_readlane ||
589 IID == Intrinsic::amdgcn_readfirstlane ||
590 IID == Intrinsic::amdgcn_permlane64);
591
592 Instruction *OpInst = dyn_cast<Instruction>(II.getOperand(0));
593
594 // Only do this if both instructions are in the same block
595 // (so the exec mask won't change) and the readlane is the only user of its
596 // operand.
597 if (!OpInst || !OpInst->hasOneUser() || OpInst->getParent() != II.getParent())
598 return nullptr;
599
600 const bool IsReadLane = (IID == Intrinsic::amdgcn_readlane);
601
602 // If this is a readlane, check that the second operand is a constant, or is
603 // defined before OpInst so we know it's safe to move this intrinsic higher.
604 Value *LaneID = nullptr;
605 if (IsReadLane) {
606 LaneID = II.getOperand(1);
607
608 // readlane take an extra operand for the lane ID, so we must check if that
609 // LaneID value can be used at the point where we want to move the
610 // intrinsic.
611 if (auto *LaneIDInst = dyn_cast<Instruction>(LaneID)) {
612 if (!IC.getDominatorTree().dominates(LaneIDInst, OpInst))
613 return nullptr;
614 }
615 }
616
617 // Hoist the intrinsic (II) through OpInst.
618 //
619 // (II (OpInst x)) -> (OpInst (II x))
620 const auto DoIt = [&](unsigned OpIdx,
621 Function *NewIntrinsic) -> Instruction * {
623 if (IsReadLane)
624 Ops.push_back(LaneID);
625
626 // Rewrite the intrinsic call.
627 CallInst *NewII = rewriteCall(IC.Builder, II, *NewIntrinsic, Ops);
628
629 // Rewrite OpInst so it takes the result of the intrinsic now.
630 Instruction &NewOp = *OpInst->clone();
631 NewOp.setOperand(OpIdx, NewII);
632 return &NewOp;
633 };
634
635 // TODO(?): Should we do more with permlane64?
636 if (IID == Intrinsic::amdgcn_permlane64 && !isa<BitCastInst>(OpInst))
637 return nullptr;
638
639 if (isa<UnaryOperator>(OpInst))
640 return DoIt(0, II.getCalledFunction());
641
642 if (isa<CastInst>(OpInst)) {
643 Value *Src = OpInst->getOperand(0);
644 Type *SrcTy = Src->getType();
645 if (!isTypeLegal(SrcTy))
646 return nullptr;
647
648 Function *Remangled =
649 Intrinsic::getOrInsertDeclaration(II.getModule(), IID, {SrcTy});
650 return DoIt(0, Remangled);
651 }
652
653 // We can also hoist through binary operators if the other operand is uniform.
654 if (isa<BinaryOperator>(OpInst)) {
655 // FIXME: If we had access to UniformityInfo here we could just check
656 // if the operand is uniform.
657 if (isTriviallyUniform(OpInst->getOperandUse(0)))
658 return DoIt(1, II.getCalledFunction());
659 if (isTriviallyUniform(OpInst->getOperandUse(1)))
660 return DoIt(0, II.getCalledFunction());
661 }
662
663 return nullptr;
664}
665
666/// Evaluate V as a function of the lane ID and return its value on Lane, or
667/// std::nullopt if V is not a closed-form expression of the lane ID.
668static std::optional<unsigned> evalLaneExpr(Value *V, unsigned Lane,
669 const GCNSubtarget &ST,
670 const DataLayout &DL,
671 unsigned Depth = 0) {
673 return std::nullopt;
674
675 // Poison/undef in the index expression: bail and let InstCombine fold the
676 // intrinsic the usual way.
677 if (isa<UndefValue>(V))
678 return std::nullopt;
679
680 if (const ConstantInt *CI = dyn_cast<ConstantInt>(V))
681 return CI->getZExtValue();
682
683 if (isThreadID(ST, V))
684 return Lane;
685
687 if (!BO)
688 return std::nullopt;
689
690 std::optional<unsigned> LHS =
691 evalLaneExpr(BO->getOperand(0), Lane, ST, DL, Depth + 1);
692 if (!LHS)
693 return std::nullopt;
694 std::optional<unsigned> RHS =
695 evalLaneExpr(BO->getOperand(1), Lane, ST, DL, Depth + 1);
696 if (!RHS)
697 return std::nullopt;
698
699 Type *Ty = BO->getType();
700 Constant *Ops[] = {ConstantInt::get(Ty, *LHS), ConstantInt::get(Ty, *RHS)};
701 auto *CI =
703 return CI ? std::optional<unsigned>(CI->getZExtValue()) : std::nullopt;
704}
705
706/// Build the per-lane shuffle map by evaluating Index for every lane in the
707/// wave. Returns false if any lane index is non-constant or out of range.
708static bool tryBuildShuffleMap(Value *Index, const GCNSubtarget &ST,
710 const DataLayout &DL) {
711 unsigned WaveSize = ST.getWavefrontSize();
712 Ids.resize(WaveSize);
713 for (unsigned Lane : seq(WaveSize)) {
714 std::optional<unsigned> Val = evalLaneExpr(Index, Lane, ST, DL);
715 if (!Val || *Val >= WaveSize)
716 return false;
717 Ids[Lane] = *Val;
718 }
719 return true;
720}
721
722/// Lanes are partitioned into groups of Period; each group is a translated
723/// copy of the first: Ids[I] = Ids[I % Period] + (I & ~(Period - 1)).
724template <unsigned Period>
726 static_assert(isPowerOf2_32(Period), "Period must be a power of two");
727 for (unsigned I = Period, E = Ids.size(); I < E; ++I)
728 if (Ids[I] != Ids[I % Period] + (I & ~(Period - 1)))
729 return false;
730 return true;
731}
732
733/// Match an N-lane row pattern: each lane in [0, N) reads from a source lane
734/// in the same N-lane row, and the pattern repeats periodically across rows.
735template <unsigned N> static bool isRowPattern(ArrayRef<uint8_t> Ids) {
736 for (unsigned I = 0; I < N; ++I)
737 if (Ids[I] >= N)
738 return false;
739 return hasPeriodicLayout<N>(Ids);
740}
741
742static constexpr auto isQuadPattern = isRowPattern<4>;
743static constexpr auto isHalfRowPattern = isRowPattern<8>;
744static constexpr auto isFullRowPattern = isRowPattern<16>;
745
746/// Match a 4-lane (quad) permutation, encoded as the v_mov_b32_dpp
747/// QUAD_PERM control word: bits[1:0]=Ids[0], [3:2]=Ids[1], [5:4]=Ids[2],
748/// [7:6]=Ids[3].
749static std::optional<unsigned> matchQuadPermPattern(ArrayRef<uint8_t> Ids) {
750 if (!isQuadPattern(Ids))
751 return std::nullopt;
752 return Ids[3] << 6 | Ids[2] << 4 | Ids[1] << 2 | Ids[0];
753}
754
755/// Match an N-lane reversal (mirror) pattern.
756template <unsigned N> static bool matchMirrorPattern(ArrayRef<uint8_t> Ids) {
757 if (!isRowPattern<N>(Ids))
758 return false;
759 for (unsigned J = 0; J < N; ++J)
760 if (Ids[J] != (N - 1) - J)
761 return false;
762 return true;
763}
764
767
768/// Match a 16-lane cyclic rotation; returns the rotation amount in [1, 15].
769static std::optional<unsigned> matchRowRotatePattern(ArrayRef<uint8_t> Ids) {
770 if (Ids[0] == 0 || !isFullRowPattern(Ids))
771 return std::nullopt;
772 for (unsigned J = 1; J < 16; ++J)
773 if (Ids[J] != (Ids[0] + J) % 16)
774 return std::nullopt;
775 return 16u - Ids[0];
776}
777
778/// Match a row-share pattern: all 16 lanes of each row read the same source
779/// lane. Returns the shared source lane index in [0, 16).
780static std::optional<unsigned> matchRowSharePattern(ArrayRef<uint8_t> Ids) {
781 if (!isFullRowPattern(Ids))
782 return std::nullopt;
783 if (!all_equal(Ids.take_front(16)))
784 return std::nullopt;
785 return Ids[0];
786}
787
788/// Match an XOR mask pattern within each 16-lane row: Ids[J] == Mask ^ J,
789/// with Mask in [1, 15].
790static std::optional<unsigned> matchRowXMaskPattern(ArrayRef<uint8_t> Ids) {
791 unsigned Mask = Ids[0];
792 if (Mask == 0 || !isFullRowPattern(Ids))
793 return std::nullopt;
794 for (unsigned J = 0; J < 16; ++J)
795 if (Ids[J] != (Mask ^ J))
796 return std::nullopt;
797 return Mask;
798}
799
800/// Match an 8-lane arbitrary permutation, encoded as the v_mov_b32_dpp8
801/// 24-bit selector (three bits per output lane).
802static std::optional<unsigned> matchHalfRowPermPattern(ArrayRef<uint8_t> Ids) {
803 if (!isHalfRowPattern(Ids))
804 return std::nullopt;
805 unsigned Selector = 0;
806 for (unsigned J = 0; J < 8; ++J)
807 Selector |= Ids[J] << (J * 3);
808 return Selector;
809}
810
811/// Pack a 16-lane permutation into a single 64-bit value: four bits per output
812/// lane, lane J in bits [J*4 + 3 : J*4]. The caller splits it into the low and
813/// high 32-bit selector operands of v_permlane16 / v_permlanex16.
815 uint64_t Sel = 0;
816 for (unsigned J = 0; J < 16; ++J)
817 Sel |= static_cast<uint64_t>(Ids[J] & 0xF) << (J * 4);
818 return Sel;
819}
820
821/// Match a half-wave swap: lane J reads from lane J ^ 32. Only meaningful on
822/// wave64 targets.
824 if (Ids.size() != 64)
825 return false;
826 for (unsigned J = 0; J < 64; ++J)
827 if (Ids[J] != (J ^ 32))
828 return false;
829 return true;
830}
831
832/// Match a cross-row permutation suitable for v_permlanex16: every lane in
833/// the low 16-lane half reads from the high half of its own row, and vice
834/// versa.
836 if (!hasPeriodicLayout<32>(Ids))
837 return false;
838 for (unsigned J = 0; J < 16; ++J) {
839 if (Ids[J] < 16 || Ids[J] >= 32)
840 return false;
841 if (Ids[J + 16] != Ids[J] - 16)
842 return false;
843 }
844 return true;
845}
846
847/// Match a DS_SWIZZLE bitmask-mode permutation:
848/// dst_lane = ((src_lane & AND) | OR) ^ XOR
849/// with each mask being five bits. Returns the encoded swizzle immediate.
850/// The hardware applies the formula independently within each 32-lane group,
851/// so on wave64 the high group must replicate the low one (translated by 32).
852static std::optional<unsigned>
854 if (!hasPeriodicLayout<32>(Ids))
855 return std::nullopt;
856
857 // The formula is per-bit: output bit B depends only on input bit B. Probe
858 // each bit with src=0 and src=(1<<B); if the output bit flipped, AND[B]=1
859 // and XOR[B] carries the constant offset; otherwise it is a constant bit
860 // encoded in OR (with AND[B]=0, XOR[B]=0).
861 unsigned AndMask = 0, OrMask = 0, XorMask = 0;
862 for (unsigned B = 0; B < 5; ++B) {
863 unsigned Bit0 = (Ids[0] >> B) & 1;
864 unsigned Bit1 = (Ids[1u << B] >> B) & 1;
865 if (Bit0 != Bit1) {
866 AndMask |= 1u << B;
867 XorMask |= Bit0 << B;
868 } else {
869 OrMask |= Bit0 << B;
870 }
871 }
872
873 // The per-bit derivation assumes bit independence; verify the masks
874 // actually reproduce every lane in the 32-lane group.
875 for (unsigned I : seq(32u)) {
876 unsigned Expected = ((I & AndMask) | OrMask) ^ XorMask;
877 if (Ids[I] != Expected)
878 return std::nullopt;
879 }
880
885}
886
887/// Match a GFX9+ DS_SWIZZLE rotate-mode permutation: a cyclic left-rotation
888/// of all 32 lanes within each 32-lane group by a constant N in [0, 31],
889/// i.e. dst_lane = (src_lane + N) % 32. On wave64, hasPeriodicLayout<32>
890/// ensures both 32-lane groups rotate by the same amount.
891static std::optional<unsigned>
893 if (!hasPeriodicLayout<32>(Ids))
894 return std::nullopt;
895
896 // Determine the rotation amount from lane 0: every lane must read from
897 // lane (I + N) % 32 where N = Ids[0] and 0 <= N <= 31.
898 unsigned N = Ids[0];
899 if (N >= 32)
900 return std::nullopt;
901
902 for (unsigned I = 0; I < 32; ++I)
903 if (Ids[I] != (I + N) % 32)
904 return std::nullopt;
905
908}
909
910/// Emit v_mov_b32_dpp with the given control word, row/bank masks 0xF, and
911/// bound_ctrl=1 so out-of-bounds lanes are well-defined and the DPP mov can
912/// be folded into a consuming VALU op by GCNDPPCombine.
913static Value *createUpdateDpp(IRBuilderBase &B, Value *Val, unsigned Ctrl) {
914 Type *Ty = Val->getType();
915 return B.CreateIntrinsic(Intrinsic::amdgcn_update_dpp, {Ty},
916 {PoisonValue::get(Ty), Val, B.getInt32(Ctrl),
917 B.getInt32(0xF), B.getInt32(0xF), B.getTrue()});
918}
919
920/// Emit v_mov_b32_dpp8 with the given 24-bit lane selector.
921static Value *createMovDpp8(IRBuilderBase &B, Value *Val, unsigned Selector) {
922 return B.CreateIntrinsic(Intrinsic::amdgcn_mov_dpp8, {Val->getType()},
923 {Val, B.getInt32(Selector)});
924}
925
926/// Emit v_permlane16 with the precomputed lane-select halves.
928 uint32_t Hi) {
929 Type *Ty = Val->getType();
930 return B.CreateIntrinsic(Intrinsic::amdgcn_permlane16, {Ty},
931 {PoisonValue::get(Ty), Val, B.getInt32(Lo),
932 B.getInt32(Hi), B.getFalse(), B.getFalse()});
933}
934
935/// Emit v_permlanex16 with the precomputed lane-select halves. Each output
936/// lane reads from the other 16-lane half of the same row.
938 uint32_t Hi) {
939 Type *Ty = Val->getType();
940 return B.CreateIntrinsic(Intrinsic::amdgcn_permlanex16, {Ty},
941 {PoisonValue::get(Ty), Val, B.getInt32(Lo),
942 B.getInt32(Hi), B.getFalse(), B.getFalse()});
943}
944
945/// Emit ds_swizzle with the given immediate, bitcasting/converting between
946/// pointer/float types and i32 as required by the intrinsic signature.
948 const DataLayout &DL) {
949 Type *OrigTy = Val->getType();
950 assert(DL.getTypeSizeInBits(OrigTy) == 32 &&
951 "ds_swizzle only supports 32-bit operands");
952 IntegerType *I32Ty = B.getInt32Ty();
953 Value *Src = Val;
954 if (OrigTy->isPointerTy())
955 Src = B.CreatePtrToInt(Src, I32Ty);
956 else if (OrigTy != I32Ty)
957 Src = B.CreateBitCast(Src, I32Ty);
958 Value *Result = B.CreateIntrinsic(Intrinsic::amdgcn_ds_swizzle, {},
959 {Src, B.getInt32(Offset)});
960 if (OrigTy->isPointerTy())
961 return B.CreateIntToPtr(Result, OrigTy);
962 if (OrigTy != I32Ty)
963 return B.CreateBitCast(Result, OrigTy);
964 return Result;
965}
966
967/// Emit v_permlane64 (swap of the two 32-lane halves of a wave64).
969 return B.CreateIntrinsic(Intrinsic::amdgcn_permlane64, {Val->getType()},
970 {Val});
971}
972
973/// Given a shuffle map, try to emit the best hardware intrinsic.
976 const GCNSubtarget &ST,
977 const DataLayout &DL) {
978 // Uniform shuffle (all lanes read the same value) is handled by cheaper
979 // broadcast/readlane intrinsics.
980 if (all_equal(Ids))
981 return nullptr;
982
983 if (std::optional<unsigned> QP = matchQuadPermPattern(Ids)) {
984 if (ST.hasDPP())
985 return createUpdateDpp(B, Src, *QP);
987 }
988
989 if (ST.hasDPP()) {
994 if (std::optional<unsigned> Amt = matchRowRotatePattern(Ids))
995 return createUpdateDpp(B, Src, AMDGPU::DPP::ROW_ROR_FIRST + *Amt - 1);
996 }
997
998 // row_share is supported on GFX90A and GFX10+; row_xmask is GFX10+ only.
999 if (ST.hasDPPRowShare()) {
1000 if (std::optional<unsigned> Lane = matchRowSharePattern(Ids))
1001 return createUpdateDpp(B, Src, AMDGPU::DPP::ROW_SHARE_FIRST + *Lane);
1002 }
1003
1004 if (ST.hasDPP() && ST.hasGFX10Insts()) {
1005 if (std::optional<unsigned> Mask = matchRowXMaskPattern(Ids))
1006 return createUpdateDpp(B, Src, AMDGPU::DPP::ROW_XMASK_FIRST + *Mask);
1007 }
1008
1009 if (ST.hasDPP8()) {
1010 if (std::optional<unsigned> Sel = matchHalfRowPermPattern(Ids))
1011 return createMovDpp8(B, Src, *Sel);
1012 }
1013
1014 if (ST.hasPermlane16Insts()) {
1015 if (isFullRowPattern(Ids)) {
1017 return createPermlane16(B, Src, Lo_32(Sel), Hi_32(Sel));
1018 }
1019 // Cross-row shuffles (e.g. XOR 16..31) — covered by permlanex16.
1020 if (isCrossRowPattern(Ids)) {
1022 return createPermlaneX16(B, Src, Lo_32(Sel), Hi_32(Sel));
1023 }
1024 }
1025
1026 // Generic DS_SWIZZLE bitmask-mode fallback: handles any 32-lane shuffle that
1027 // can be expressed as dst = ((src & AND) | OR) ^ XOR with 5-bit masks. This
1028 // is available on every target that has ds_swizzle.
1029 if (std::optional<unsigned> Imm = matchDsSwizzleBitmaskPattern(Ids))
1030 return createDsSwizzle(B, Src, *Imm, DL);
1031
1032 // DS_SWIZZLE rotate mode (GFX9+): handles cyclic 32-lane rotations that
1033 // bitmask mode cannot express (e.g. +1 mod 32 requires inter-bit carry).
1034 if (ST.hasDsSwizzleRotateMode()) {
1035 if (std::optional<unsigned> Imm = matchDsSwizzleRotatePattern(Ids))
1036 return createDsSwizzle(B, Src, *Imm, DL);
1037 }
1038
1039 if (ST.hasPermLane64() && matchHalfWaveSwapPattern(Ids))
1040 return createPermlane64(B, Src);
1041
1042 return nullptr;
1043}
1044
1045/// Try to fold a wave_shuffle/ds_bpermute whose lane index is a constant
1046/// function of the lane ID into a hardware-specific lane permutation intrinsic.
1047static std::optional<Instruction *>
1049 const GCNSubtarget &ST) {
1050 const DataLayout &DL = IC.getDataLayout();
1051 if (DL.getTypeSizeInBits(II.getType()) != 32)
1052 return std::nullopt;
1053
1054 if (!ST.isWaveSizeKnown())
1055 return std::nullopt;
1056
1057 unsigned WaveSize = ST.getWavefrontSize();
1058 bool IsBpermute = II.getIntrinsicID() == Intrinsic::amdgcn_ds_bpermute;
1059 Value *Src = II.getArgOperand(IsBpermute ? 1 : 0);
1060 Value *Index = II.getArgOperand(IsBpermute ? 0 : 1);
1061
1063 if (IsBpermute) {
1064 Ids.resize(WaveSize);
1065 for (unsigned Lane : seq(WaveSize)) {
1066 std::optional<unsigned> Val = evalLaneExpr(Index, Lane, ST, DL);
1067 if (!Val || (*Val & 3) || (*Val >> 2) >= WaveSize)
1068 return std::nullopt;
1069 Ids[Lane] = *Val >> 2;
1070 }
1071 } else {
1072 if (!tryBuildShuffleMap(Index, ST, Ids, DL))
1073 return std::nullopt;
1074 }
1075
1076 Value *Result = matchShuffleToHWIntrinsic(IC.Builder, Src, Ids, ST, DL);
1077 if (!Result)
1078 return std::nullopt;
1079
1080 return IC.replaceInstUsesWith(II, Result);
1081}
1082std::optional<Instruction *>
1084 Intrinsic::ID IID = II.getIntrinsicID();
1085 switch (IID) {
1086 case Intrinsic::amdgcn_implicitarg_ptr: {
1087 if (II.getFunction()->hasFnAttribute("amdgpu-no-implicitarg-ptr"))
1088 return IC.replaceInstUsesWith(II, PoisonValue::get(II.getType()));
1089 uint64_t ImplicitArgBytes = ST->getImplicitArgNumBytes(*II.getFunction());
1090
1091 uint64_t CurrentOrNullBytes =
1092 II.getAttributes().getRetDereferenceableOrNullBytes();
1093 if (CurrentOrNullBytes != 0) {
1094 // Refine "dereferenceable (A) meets dereferenceable_or_null(B)"
1095 // into dereferenceable(max(A, B))
1096 uint64_t NewBytes = std::max(CurrentOrNullBytes, ImplicitArgBytes);
1097 II.addRetAttr(
1098 Attribute::getWithDereferenceableBytes(II.getContext(), NewBytes));
1099 II.removeRetAttr(Attribute::DereferenceableOrNull);
1100 return &II;
1101 }
1102
1103 uint64_t CurrentBytes = II.getAttributes().getRetDereferenceableBytes();
1104 uint64_t NewBytes = std::max(CurrentBytes, ImplicitArgBytes);
1105 if (NewBytes != CurrentBytes) {
1106 II.addRetAttr(
1107 Attribute::getWithDereferenceableBytes(II.getContext(), NewBytes));
1108 return &II;
1109 }
1110
1111 return std::nullopt;
1112 }
1113 case Intrinsic::amdgcn_rcp: {
1114 Value *Src = II.getArgOperand(0);
1115 if (isa<PoisonValue>(Src))
1116 return IC.replaceInstUsesWith(II, Src);
1117
1118 // TODO: Move to ConstantFolding/InstSimplify?
1119 if (isa<UndefValue>(Src)) {
1120 Type *Ty = II.getType();
1121 auto *QNaN = ConstantFP::get(Ty, APFloat::getQNaN(Ty->getFltSemantics()));
1122 return IC.replaceInstUsesWith(II, QNaN);
1123 }
1124
1125 if (II.isStrictFP())
1126 break;
1127
1128 if (const ConstantFP *C = dyn_cast<ConstantFP>(Src)) {
1129 const APFloat &ArgVal = C->getValueAPF();
1130 APFloat Val(ArgVal.getSemantics(), 1);
1132
1133 // This is more precise than the instruction may give.
1134 //
1135 // TODO: The instruction always flushes denormal results (except for f16),
1136 // should this also?
1137 return IC.replaceInstUsesWith(II, ConstantFP::get(II.getContext(), Val));
1138 }
1139
1140 FastMathFlags FMF = cast<FPMathOperator>(II).getFastMathFlags();
1141 if (!FMF.allowContract())
1142 break;
1143 auto *SrcCI = dyn_cast<IntrinsicInst>(Src);
1144 if (!SrcCI)
1145 break;
1146
1147 auto IID = SrcCI->getIntrinsicID();
1148 // llvm.amdgcn.rcp(llvm.amdgcn.sqrt(x)) -> llvm.amdgcn.rsq(x) if contractable
1149 //
1150 // llvm.amdgcn.rcp(llvm.sqrt(x)) -> llvm.amdgcn.rsq(x) if contractable and
1151 // relaxed.
1152 if (IID == Intrinsic::amdgcn_sqrt || IID == Intrinsic::sqrt) {
1153 const FPMathOperator *SqrtOp = cast<FPMathOperator>(SrcCI);
1154 FastMathFlags InnerFMF = SqrtOp->getFastMathFlags();
1155 if (!InnerFMF.allowContract() || !SrcCI->hasOneUse())
1156 break;
1157
1158 if (IID == Intrinsic::sqrt && !canContractSqrtToRsq(SqrtOp))
1159 break;
1160
1162 SrcCI->getModule(), Intrinsic::amdgcn_rsq, {SrcCI->getType()});
1163
1164 InnerFMF |= FMF;
1165 II.setFastMathFlags(InnerFMF);
1166
1167 II.setCalledFunction(NewDecl);
1168 return IC.replaceOperand(II, 0, SrcCI->getArgOperand(0));
1169 }
1170
1171 break;
1172 }
1173 case Intrinsic::amdgcn_sqrt:
1174 case Intrinsic::amdgcn_rsq:
1175 case Intrinsic::amdgcn_tanh: {
1176 Value *Src = II.getArgOperand(0);
1177 if (isa<PoisonValue>(Src))
1178 return IC.replaceInstUsesWith(II, Src);
1179
1180 // TODO: Move to ConstantFolding/InstSimplify?
1181 if (isa<UndefValue>(Src)) {
1182 Type *Ty = II.getType();
1183 auto *QNaN = ConstantFP::get(Ty, APFloat::getQNaN(Ty->getFltSemantics()));
1184 return IC.replaceInstUsesWith(II, QNaN);
1185 }
1186
1187 // f16 amdgcn.sqrt is identical to regular sqrt.
1188 if (IID == Intrinsic::amdgcn_sqrt && Src->getType()->isHalfTy()) {
1190 II.getModule(), Intrinsic::sqrt, {II.getType()});
1191 II.setCalledFunction(NewDecl);
1192 return &II;
1193 }
1194
1195 break;
1196 }
1197 case Intrinsic::amdgcn_log:
1198 case Intrinsic::amdgcn_exp2: {
1199 const bool IsLog = IID == Intrinsic::amdgcn_log;
1200 const bool IsExp = IID == Intrinsic::amdgcn_exp2;
1201 Value *Src = II.getArgOperand(0);
1202 Type *Ty = II.getType();
1203
1204 if (isa<PoisonValue>(Src))
1205 return IC.replaceInstUsesWith(II, Src);
1206
1207 if (IC.getSimplifyQuery().isUndefValue(Src))
1209
1210 if (ConstantFP *C = dyn_cast<ConstantFP>(Src)) {
1211 if (C->isInfinity()) {
1212 // exp2(+inf) -> +inf
1213 // log2(+inf) -> +inf
1214 if (!C->isNegative())
1215 return IC.replaceInstUsesWith(II, C);
1216
1217 // exp2(-inf) -> 0
1218 if (IsExp && C->isNegative())
1220 }
1221
1222 if (II.isStrictFP())
1223 break;
1224
1225 if (C->isNaN()) {
1226 Constant *Quieted = ConstantFP::get(Ty, C->getValue().makeQuiet());
1227 return IC.replaceInstUsesWith(II, Quieted);
1228 }
1229
1230 // f32 instruction doesn't handle denormals, f16 does.
1231 if (C->isZero() || (C->getValue().isDenormal() && Ty->isFloatTy())) {
1232 Constant *FoldedValue = IsLog ? ConstantFP::getInfinity(Ty, true)
1233 : ConstantFP::get(Ty, 1.0);
1234 return IC.replaceInstUsesWith(II, FoldedValue);
1235 }
1236
1237 if (IsLog && C->isNegative())
1239
1240 // TODO: Full constant folding matching hardware behavior.
1241 }
1242
1243 break;
1244 }
1245 case Intrinsic::amdgcn_frexp_mant:
1246 case Intrinsic::amdgcn_frexp_exp: {
1247 Value *Src = II.getArgOperand(0);
1248 if (const ConstantFP *C = dyn_cast<ConstantFP>(Src)) {
1249 int Exp;
1250 APFloat Significand =
1251 frexp(C->getValueAPF(), Exp, APFloat::rmNearestTiesToEven);
1252
1253 if (IID == Intrinsic::amdgcn_frexp_mant) {
1254 return IC.replaceInstUsesWith(
1255 II, ConstantFP::get(II.getContext(), Significand));
1256 }
1257
1258 // Match instruction special case behavior.
1259 if (Exp == APFloat::IEK_NaN || Exp == APFloat::IEK_Inf)
1260 Exp = 0;
1261
1262 return IC.replaceInstUsesWith(II,
1263 ConstantInt::getSigned(II.getType(), Exp));
1264 }
1265
1266 if (isa<PoisonValue>(Src))
1267 return IC.replaceInstUsesWith(II, PoisonValue::get(II.getType()));
1268
1269 if (isa<UndefValue>(Src)) {
1270 return IC.replaceInstUsesWith(II, UndefValue::get(II.getType()));
1271 }
1272
1273 break;
1274 }
1275 case Intrinsic::amdgcn_class: {
1276 Value *Src0 = II.getArgOperand(0);
1277 Value *Src1 = II.getArgOperand(1);
1278 const ConstantInt *CMask = dyn_cast<ConstantInt>(Src1);
1279 if (CMask) {
1280 II.setCalledOperand(Intrinsic::getOrInsertDeclaration(
1281 II.getModule(), Intrinsic::is_fpclass, Src0->getType()));
1282
1283 // Clamp any excess bits, as they're illegal for the generic intrinsic.
1284 II.setArgOperand(1, ConstantInt::get(Src1->getType(),
1285 CMask->getZExtValue() & fcAllFlags));
1286 return &II;
1287 }
1288
1289 // Propagate poison.
1290 if (isa<PoisonValue>(Src0) || isa<PoisonValue>(Src1))
1291 return IC.replaceInstUsesWith(II, PoisonValue::get(II.getType()));
1292
1293 // llvm.amdgcn.class(_, undef) -> false
1294 if (IC.getSimplifyQuery().isUndefValue(Src1))
1295 return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), false));
1296
1297 // llvm.amdgcn.class(undef, mask) -> mask != 0
1298 if (IC.getSimplifyQuery().isUndefValue(Src0)) {
1299 Value *CmpMask = IC.Builder.CreateICmpNE(
1300 Src1, ConstantInt::getNullValue(Src1->getType()));
1301 return IC.replaceInstUsesWith(II, CmpMask);
1302 }
1303 break;
1304 }
1305 case Intrinsic::amdgcn_cvt_pkrtz: {
1306 auto foldFPTruncToF16RTZ = [](Value *Arg) -> Value * {
1307 Type *HalfTy = Type::getHalfTy(Arg->getContext());
1308
1309 if (isa<PoisonValue>(Arg))
1310 return PoisonValue::get(HalfTy);
1311 if (isa<UndefValue>(Arg))
1312 return UndefValue::get(HalfTy);
1313
1314 ConstantFP *CFP = nullptr;
1315 if (match(Arg, m_ConstantFP(CFP))) {
1316 bool LosesInfo;
1317 APFloat Val(CFP->getValueAPF());
1319 return ConstantFP::get(HalfTy, Val);
1320 }
1321
1322 Value *Src = nullptr;
1323 if (match(Arg, m_FPExt(m_Value(Src)))) {
1324 if (Src->getType()->isHalfTy())
1325 return Src;
1326 }
1327
1328 return nullptr;
1329 };
1330
1331 if (Value *Src0 = foldFPTruncToF16RTZ(II.getArgOperand(0))) {
1332 if (Value *Src1 = foldFPTruncToF16RTZ(II.getArgOperand(1))) {
1333 Value *V = PoisonValue::get(II.getType());
1334 V = IC.Builder.CreateInsertElement(V, Src0, (uint64_t)0);
1335 V = IC.Builder.CreateInsertElement(V, Src1, (uint64_t)1);
1336 return IC.replaceInstUsesWith(II, V);
1337 }
1338 }
1339
1340 break;
1341 }
1342 case Intrinsic::amdgcn_cvt_pknorm_i16:
1343 case Intrinsic::amdgcn_cvt_pknorm_u16:
1344 case Intrinsic::amdgcn_cvt_pk_i16:
1345 case Intrinsic::amdgcn_cvt_pk_u16: {
1346 Value *Src0 = II.getArgOperand(0);
1347 Value *Src1 = II.getArgOperand(1);
1348
1349 // TODO: Replace call with scalar operation if only one element is poison.
1350 if (isa<PoisonValue>(Src0) && isa<PoisonValue>(Src1))
1351 return IC.replaceInstUsesWith(II, PoisonValue::get(II.getType()));
1352
1353 if (isa<UndefValue>(Src0) && isa<UndefValue>(Src1)) {
1354 return IC.replaceInstUsesWith(II, UndefValue::get(II.getType()));
1355 }
1356
1357 break;
1358 }
1359 case Intrinsic::amdgcn_cvt_off_f32_i4: {
1360 Value* Arg = II.getArgOperand(0);
1361 Type *Ty = II.getType();
1362
1363 if (isa<PoisonValue>(Arg))
1364 return IC.replaceInstUsesWith(II, PoisonValue::get(Ty));
1365
1366 if(IC.getSimplifyQuery().isUndefValue(Arg))
1368
1369 ConstantInt *CArg = dyn_cast<ConstantInt>(II.getArgOperand(0));
1370 if (!CArg)
1371 break;
1372
1373 // Tabulated 0.0625 * (sext (CArg & 0xf)).
1374 constexpr size_t ResValsSize = 16;
1375 static constexpr float ResVals[ResValsSize] = {
1376 0.0, 0.0625, 0.125, 0.1875, 0.25, 0.3125, 0.375, 0.4375,
1377 -0.5, -0.4375, -0.375, -0.3125, -0.25, -0.1875, -0.125, -0.0625};
1378 Constant *Res =
1379 ConstantFP::get(Ty, ResVals[CArg->getZExtValue() & (ResValsSize - 1)]);
1380 return IC.replaceInstUsesWith(II, Res);
1381 }
1382 case Intrinsic::amdgcn_ubfe:
1383 case Intrinsic::amdgcn_sbfe: {
1384 // Decompose simple cases into standard shifts.
1385 Value *Src = II.getArgOperand(0);
1386 if (isa<UndefValue>(Src)) {
1387 return IC.replaceInstUsesWith(II, Src);
1388 }
1389
1390 unsigned Width;
1391 Type *Ty = II.getType();
1392 unsigned IntSize = Ty->getIntegerBitWidth();
1393
1394 ConstantInt *CWidth = dyn_cast<ConstantInt>(II.getArgOperand(2));
1395 if (CWidth) {
1396 Width = CWidth->getZExtValue();
1397 if ((Width & (IntSize - 1)) == 0) {
1399 }
1400
1401 // Hardware ignores high bits, so remove those.
1402 if (Width >= IntSize) {
1403 return IC.replaceOperand(
1404 II, 2, ConstantInt::get(CWidth->getType(), Width & (IntSize - 1)));
1405 }
1406 }
1407
1408 unsigned Offset;
1409 ConstantInt *COffset = dyn_cast<ConstantInt>(II.getArgOperand(1));
1410 if (COffset) {
1411 Offset = COffset->getZExtValue();
1412 if (Offset >= IntSize) {
1413 return IC.replaceOperand(
1414 II, 1,
1415 ConstantInt::get(COffset->getType(), Offset & (IntSize - 1)));
1416 }
1417 }
1418
1419 bool Signed = IID == Intrinsic::amdgcn_sbfe;
1420
1421 if (!CWidth || !COffset)
1422 break;
1423
1424 // The case of Width == 0 is handled above, which makes this transformation
1425 // safe. If Width == 0, then the ashr and lshr instructions become poison
1426 // value since the shift amount would be equal to the bit size.
1427 assert(Width != 0);
1428
1429 // TODO: This allows folding to undef when the hardware has specific
1430 // behavior?
1431 if (Offset + Width < IntSize) {
1432 Value *Shl = IC.Builder.CreateShl(Src, IntSize - Offset - Width);
1433 Value *RightShift = Signed ? IC.Builder.CreateAShr(Shl, IntSize - Width)
1434 : IC.Builder.CreateLShr(Shl, IntSize - Width);
1435 RightShift->takeName(&II);
1436 return IC.replaceInstUsesWith(II, RightShift);
1437 }
1438
1439 Value *RightShift = Signed ? IC.Builder.CreateAShr(Src, Offset)
1440 : IC.Builder.CreateLShr(Src, Offset);
1441
1442 RightShift->takeName(&II);
1443 return IC.replaceInstUsesWith(II, RightShift);
1444 }
1445 case Intrinsic::amdgcn_exp:
1446 case Intrinsic::amdgcn_exp_row:
1447 case Intrinsic::amdgcn_exp_compr: {
1448 ConstantInt *En = cast<ConstantInt>(II.getArgOperand(1));
1449 unsigned EnBits = En->getZExtValue();
1450 if (EnBits == 0xf)
1451 break; // All inputs enabled.
1452
1453 bool IsCompr = IID == Intrinsic::amdgcn_exp_compr;
1454 bool Changed = false;
1455 for (int I = 0; I < (IsCompr ? 2 : 4); ++I) {
1456 if ((!IsCompr && (EnBits & (1 << I)) == 0) ||
1457 (IsCompr && ((EnBits & (0x3 << (2 * I))) == 0))) {
1458 Value *Src = II.getArgOperand(I + 2);
1459 if (!isa<PoisonValue>(Src)) {
1460 IC.replaceOperand(II, I + 2, PoisonValue::get(Src->getType()));
1461 Changed = true;
1462 }
1463 }
1464 }
1465
1466 if (Changed) {
1467 return &II;
1468 }
1469
1470 break;
1471 }
1472 case Intrinsic::amdgcn_fmed3: {
1473 Value *Src0 = II.getArgOperand(0);
1474 Value *Src1 = II.getArgOperand(1);
1475 Value *Src2 = II.getArgOperand(2);
1476
1477 for (Value *Src : {Src0, Src1, Src2}) {
1478 if (isa<PoisonValue>(Src))
1479 return IC.replaceInstUsesWith(II, Src);
1480 }
1481
1482 if (II.isStrictFP())
1483 break;
1484
1485 // med3 with a nan input acts like
1486 // v_min_f32(v_min_f32(s0, s1), s2)
1487 //
1488 // Signalingness is ignored with ieee=0, so we fold to
1489 // minimumnum/maximumnum. With ieee=1, the v_min_f32 acts like llvm.minnum
1490 // with signaling nan handling. With ieee=0, like llvm.minimumnum except a
1491 // returned signaling nan will not be quieted.
1492
1493 // ieee=1
1494 // s0 snan: s2
1495 // s1 snan: s2
1496 // s2 snan: qnan
1497
1498 // s0 qnan: min(s1, s2)
1499 // s1 qnan: min(s0, s2)
1500 // s2 qnan: min(s0, s1)
1501
1502 // ieee=0
1503 // s0 _nan: min(s1, s2)
1504 // s1 _nan: min(s0, s2)
1505 // s2 _nan: min(s0, s1)
1506
1507 // med3 behavior with infinity
1508 // s0 +inf: max(s1, s2)
1509 // s1 +inf: max(s0, s2)
1510 // s2 +inf: max(s0, s1)
1511 // s0 -inf: min(s1, s2)
1512 // s1 -inf: min(s0, s2)
1513 // s2 -inf: min(s0, s1)
1514
1515 // Checking for NaN before canonicalization provides better fidelity when
1516 // mapping other operations onto fmed3 since the order of operands is
1517 // unchanged.
1518 Value *V = nullptr;
1519 const APFloat *ConstSrc0 = nullptr;
1520 const APFloat *ConstSrc1 = nullptr;
1521 const APFloat *ConstSrc2 = nullptr;
1522
1523 if ((match(Src0, m_APFloat(ConstSrc0)) &&
1524 (ConstSrc0->isNaN() || ConstSrc0->isInfinity())) ||
1525 isa<UndefValue>(Src0)) {
1526 const bool IsPosInfinity = ConstSrc0 && ConstSrc0->isPosInfinity();
1527 switch (fpenvIEEEMode(II)) {
1528 case KnownIEEEMode::On:
1529 // TODO: If Src2 is snan, does it need quieting?
1530 if (ConstSrc0 && ConstSrc0->isNaN() && ConstSrc0->isSignaling())
1531 return IC.replaceInstUsesWith(II, Src2);
1532
1533 V = IsPosInfinity ? IC.Builder.CreateMaxNum(Src1, Src2)
1534 : IC.Builder.CreateMinNum(Src1, Src2);
1535 break;
1536 case KnownIEEEMode::Off:
1537 V = IsPosInfinity ? IC.Builder.CreateMaximumNum(Src1, Src2)
1538 : IC.Builder.CreateMinimumNum(Src1, Src2);
1539 break;
1541 break;
1542 }
1543 } else if ((match(Src1, m_APFloat(ConstSrc1)) &&
1544 (ConstSrc1->isNaN() || ConstSrc1->isInfinity())) ||
1545 isa<UndefValue>(Src1)) {
1546 const bool IsPosInfinity = ConstSrc1 && ConstSrc1->isPosInfinity();
1547 switch (fpenvIEEEMode(II)) {
1548 case KnownIEEEMode::On:
1549 // TODO: If Src2 is snan, does it need quieting?
1550 if (ConstSrc1 && ConstSrc1->isNaN() && ConstSrc1->isSignaling())
1551 return IC.replaceInstUsesWith(II, Src2);
1552
1553 V = IsPosInfinity ? IC.Builder.CreateMaxNum(Src0, Src2)
1554 : IC.Builder.CreateMinNum(Src0, Src2);
1555 break;
1556 case KnownIEEEMode::Off:
1557 V = IsPosInfinity ? IC.Builder.CreateMaximumNum(Src0, Src2)
1558 : IC.Builder.CreateMinimumNum(Src0, Src2);
1559 break;
1561 break;
1562 }
1563 } else if ((match(Src2, m_APFloat(ConstSrc2)) &&
1564 (ConstSrc2->isNaN() || ConstSrc2->isInfinity())) ||
1565 isa<UndefValue>(Src2)) {
1566 switch (fpenvIEEEMode(II)) {
1567 case KnownIEEEMode::On:
1568 if (ConstSrc2 && ConstSrc2->isNaN() && ConstSrc2->isSignaling()) {
1569 auto *Quieted = ConstantFP::get(II.getType(), ConstSrc2->makeQuiet());
1570 return IC.replaceInstUsesWith(II, Quieted);
1571 }
1572
1573 V = (ConstSrc2 && ConstSrc2->isPosInfinity())
1574 ? IC.Builder.CreateMaxNum(Src0, Src1)
1575 : IC.Builder.CreateMinNum(Src0, Src1);
1576 break;
1577 case KnownIEEEMode::Off:
// With ieee=0, a NaN (or undef) s2 folds to min(s0, s1), and so does -inf;
// only +inf selects max(s0, s1). Keying on isPosInfinity() keeps this arm
// consistent with the Src0/Src1 arms and with the ieee=1 arm for Src2.
1578 V = (ConstSrc2 && ConstSrc2->isPosInfinity())
1579 ? IC.Builder.CreateMaximumNum(Src0, Src1)
1580 : IC.Builder.CreateMinimumNum(Src0, Src1);
1581 break;
1583 break;
1584 }
1585 }
1586
1587 if (V) {
1588 if (auto *CI = dyn_cast<CallInst>(V)) {
1589 CI->copyFastMathFlags(&II);
1590 CI->takeName(&II);
1591 }
1592 return IC.replaceInstUsesWith(II, V);
1593 }
1594
1595 bool Swap = false;
1596 // Canonicalize constants to RHS operands.
1597 //
1598 // fmed3(c0, x, c1) -> fmed3(x, c0, c1)
1599 if (isa<Constant>(Src0) && !isa<Constant>(Src1)) {
1600 std::swap(Src0, Src1);
1601 Swap = true;
1602 }
1603
1604 if (isa<Constant>(Src1) && !isa<Constant>(Src2)) {
1605 std::swap(Src1, Src2);
1606 Swap = true;
1607 }
1608
1609 if (isa<Constant>(Src0) && !isa<Constant>(Src1)) {
1610 std::swap(Src0, Src1);
1611 Swap = true;
1612 }
1613
1614 if (Swap) {
1615 II.setArgOperand(0, Src0);
1616 II.setArgOperand(1, Src1);
1617 II.setArgOperand(2, Src2);
1618 return &II;
1619 }
1620
1621 if (const ConstantFP *C0 = dyn_cast<ConstantFP>(Src0)) {
1622 if (const ConstantFP *C1 = dyn_cast<ConstantFP>(Src1)) {
1623 if (const ConstantFP *C2 = dyn_cast<ConstantFP>(Src2)) {
1624 APFloat Result = fmed3AMDGCN(C0->getValueAPF(), C1->getValueAPF(),
1625 C2->getValueAPF());
1626 return IC.replaceInstUsesWith(II,
1627 ConstantFP::get(II.getType(), Result));
1628 }
1629 }
1630 }
1631
1632 if (!ST->hasMed3_16())
1633 break;
1634
1635 // Repeat floating-point width reduction done for minnum/maxnum.
1636 // fmed3((fpext X), (fpext Y), (fpext Z)) -> fpext (fmed3(X, Y, Z))
1637 if (Value *X = matchFPExtFromF16(Src0)) {
1638 if (Value *Y = matchFPExtFromF16(Src1)) {
1639 if (Value *Z = matchFPExtFromF16(Src2)) {
1640 Value *NewCall = IC.Builder.CreateIntrinsic(
1641 IID, {X->getType()}, {X, Y, Z}, &II, II.getName());
1642 return new FPExtInst(NewCall, II.getType());
1643 }
1644 }
1645 }
1646
1647 break;
1648 }
1649 case Intrinsic::amdgcn_icmp:
1650 case Intrinsic::amdgcn_fcmp: {
1651 const ConstantInt *CC = cast<ConstantInt>(II.getArgOperand(2));
1652 // Guard against invalid arguments.
1653 int64_t CCVal = CC->getZExtValue();
1654 bool IsInteger = IID == Intrinsic::amdgcn_icmp;
1655 if ((IsInteger && (CCVal < CmpInst::FIRST_ICMP_PREDICATE ||
1656 CCVal > CmpInst::LAST_ICMP_PREDICATE)) ||
1657 (!IsInteger && (CCVal < CmpInst::FIRST_FCMP_PREDICATE ||
1659 break;
1660
1661 Value *Src0 = II.getArgOperand(0);
1662 Value *Src1 = II.getArgOperand(1);
1663
1664 if (auto *CSrc0 = dyn_cast<Constant>(Src0)) {
1665 if (auto *CSrc1 = dyn_cast<Constant>(Src1)) {
1667 (ICmpInst::Predicate)CCVal, CSrc0, CSrc1, DL);
1668 if (CCmp && CCmp->isNullValue()) {
1669 return IC.replaceInstUsesWith(
1670 II, IC.Builder.CreateSExt(CCmp, II.getType()));
1671 }
1672
1673 // The result of V_ICMP/V_FCMP assembly instructions (which this
1674 // intrinsic exposes) is one bit per thread, masked with the EXEC
1675 // register (which contains the bitmask of live threads). So a
1676 // comparison that always returns true is the same as a read of the
1677 // EXEC register. ballot(true) reads EXEC at the wave-size width, so
1678 // zext/trunc the result to the intrinsic's return type.
1679 Type *WaveTy = IC.Builder.getIntNTy(ST->getWavefrontSize());
1680 Value *Ballot = IC.Builder.CreateIntrinsic(
1681 Intrinsic::amdgcn_ballot, WaveTy, IC.Builder.getTrue());
1682 Value *Result = IC.Builder.CreateZExtOrTrunc(Ballot, II.getType());
1683 return IC.replaceInstUsesWith(II, Result);
1684 }
1685
1686 // Canonicalize constants to RHS.
1687 CmpInst::Predicate SwapPred =
1689 II.setArgOperand(0, Src1);
1690 II.setArgOperand(1, Src0);
1691 II.setArgOperand(
1692 2, ConstantInt::get(CC->getType(), static_cast<int>(SwapPred)));
1693 return &II;
1694 }
1695
1696 if (CCVal != CmpInst::ICMP_EQ && CCVal != CmpInst::ICMP_NE)
1697 break;
1698
1699 // Canonicalize compare eq with true value to compare != 0
1700 // llvm.amdgcn.icmp(zext (i1 x), 1, eq)
1701 // -> llvm.amdgcn.icmp(zext (i1 x), 0, ne)
1702 // llvm.amdgcn.icmp(sext (i1 x), -1, eq)
1703 // -> llvm.amdgcn.icmp(sext (i1 x), 0, ne)
1704 Value *ExtSrc;
1705 if (CCVal == CmpInst::ICMP_EQ &&
1706 ((match(Src1, PatternMatch::m_One()) &&
1707 match(Src0, m_ZExt(PatternMatch::m_Value(ExtSrc)))) ||
1708 (match(Src1, PatternMatch::m_AllOnes()) &&
1709 match(Src0, m_SExt(PatternMatch::m_Value(ExtSrc))))) &&
1710 ExtSrc->getType()->isIntegerTy(1)) {
1712 IC.replaceOperand(II, 2,
1713 ConstantInt::get(CC->getType(), CmpInst::ICMP_NE));
1714 return &II;
1715 }
1716
1717 CmpPredicate SrcPred;
1718 Value *SrcLHS;
1719 Value *SrcRHS;
1720
1721 // Fold compare eq/ne with 0 from a compare result as the predicate to the
1722 // intrinsic. The typical use is a wave vote function in the library, which
1723 // will be fed from a user code condition compared with 0. Fold in the
1724 // redundant compare.
1725
1726 // llvm.amdgcn.icmp([sz]ext ([if]cmp pred a, b), 0, ne)
1727 // -> llvm.amdgcn.[if]cmp(a, b, pred)
1728 //
1729 // llvm.amdgcn.icmp([sz]ext ([if]cmp pred a, b), 0, eq)
1730 // -> llvm.amdgcn.[if]cmp(a, b, inv pred)
1731 if (match(Src1, PatternMatch::m_Zero()) &&
1733 m_Cmp(SrcPred, PatternMatch::m_Value(SrcLHS),
1734 PatternMatch::m_Value(SrcRHS))))) {
1735 if (CCVal == CmpInst::ICMP_EQ)
1736 SrcPred = CmpInst::getInversePredicate(SrcPred);
1737
1738 Intrinsic::ID NewIID = CmpInst::isFPPredicate(SrcPred)
1739 ? Intrinsic::amdgcn_fcmp
1740 : Intrinsic::amdgcn_icmp;
1741
1742 Type *Ty = SrcLHS->getType();
1743 if (auto *CmpType = dyn_cast<IntegerType>(Ty)) {
1744 // Promote to next legal integer type.
1745 unsigned Width = CmpType->getBitWidth();
1746 unsigned NewWidth = Width;
1747
1748 // Don't do anything for i1 comparisons.
1749 if (Width == 1)
1750 break;
1751
1752 if (Width <= 16)
1753 NewWidth = 16;
1754 else if (Width <= 32)
1755 NewWidth = 32;
1756 else if (Width <= 64)
1757 NewWidth = 64;
1758 else
1759 break; // Can't handle this.
1760
1761 if (Width != NewWidth) {
1762 IntegerType *CmpTy = IC.Builder.getIntNTy(NewWidth);
1763 if (CmpInst::isSigned(SrcPred)) {
1764 SrcLHS = IC.Builder.CreateSExt(SrcLHS, CmpTy);
1765 SrcRHS = IC.Builder.CreateSExt(SrcRHS, CmpTy);
1766 } else {
1767 SrcLHS = IC.Builder.CreateZExt(SrcLHS, CmpTy);
1768 SrcRHS = IC.Builder.CreateZExt(SrcRHS, CmpTy);
1769 }
1770 }
1771 } else if (!Ty->isFloatTy() && !Ty->isDoubleTy() && !Ty->isHalfTy())
1772 break;
1773
1774 Value *Args[] = {SrcLHS, SrcRHS,
1775 ConstantInt::get(CC->getType(), SrcPred)};
1776 CallInst *NewCall = IC.Builder.CreateIntrinsic(
1777 NewIID, {II.getType(), SrcLHS->getType()}, Args);
1778 NewCall->takeName(&II);
1779 return IC.replaceInstUsesWith(II, NewCall);
1780 }
1781
1782 break;
1783 }
1784 case Intrinsic::amdgcn_mbcnt_hi:
1785 // exec_hi is all 0, so this is just a copy.
1786 if (ST->isWave32())
1787 return IC.replaceInstUsesWith(II, II.getArgOperand(1));
1788 [[fallthrough]];
1789 case Intrinsic::amdgcn_mbcnt_lo: {
1790 ConstantRange AccRange =
1791 computeConstantRange(II.getArgOperand(1),
1792 /*ForSigned=*/false, IC.getSimplifyQuery());
1793 if (AccRange.isFullSet())
1794 return nullptr;
1795
1796 // TODO: Can raise lower bound by inspecting first argument.
1797 ConstantRange MbcntRange(APInt(32, 0), APInt(32, 32 + 1));
1798 ConstantRange ComputedRange = AccRange.add(MbcntRange);
1799 if (ComputedRange.isFullSet())
1800 return nullptr;
1801
1802 if (std::optional<ConstantRange> ExistingRange = II.getRange()) {
1803 ComputedRange = ComputedRange.intersectWith(*ExistingRange);
1804 if (ComputedRange == *ExistingRange)
1805 return nullptr;
1806 }
1807
1808 II.addRangeRetAttr(ComputedRange);
1809 return nullptr;
1810 }
1811 case Intrinsic::amdgcn_ballot: {
1812 Value *Arg = II.getArgOperand(0);
1813 if (isa<PoisonValue>(Arg))
1814 return IC.replaceInstUsesWith(II, PoisonValue::get(II.getType()));
1815
1816 if (auto *Src = dyn_cast<ConstantInt>(Arg)) {
1817 if (Src->isZero()) {
1818 // amdgcn.ballot(i1 0) is zero.
1819 return IC.replaceInstUsesWith(II, Constant::getNullValue(II.getType()));
1820 }
1821 }
1822 if (ST->isWave32() && II.getType()->getIntegerBitWidth() == 64) {
1823 // %b64 = call i64 ballot.i64(...)
1824 // =>
1825 // %b32 = call i32 ballot.i32(...)
1826 // %b64 = zext i32 %b32 to i64
1828 IC.Builder.CreateIntrinsic(Intrinsic::amdgcn_ballot,
1829 {IC.Builder.getInt32Ty()},
1830 {II.getArgOperand(0)}),
1831 II.getType());
1832 Call->takeName(&II);
1833 return IC.replaceInstUsesWith(II, Call);
1834 }
1835 break;
1836 }
1837 case Intrinsic::amdgcn_wavefrontsize: {
1838 if (ST->isWaveSizeKnown())
1839 return IC.replaceInstUsesWith(
1840 II, ConstantInt::get(II.getType(), ST->getWavefrontSize()));
1841 break;
1842 }
1843 case Intrinsic::amdgcn_wqm_vote: {
1844 // wqm_vote is identity when the argument is constant.
1845 if (!isa<Constant>(II.getArgOperand(0)))
1846 break;
1847
1848 return IC.replaceInstUsesWith(II, II.getArgOperand(0));
1849 }
1850 case Intrinsic::amdgcn_kill: {
1851 const ConstantInt *C = dyn_cast<ConstantInt>(II.getArgOperand(0));
1852 if (!C || !C->getZExtValue())
1853 break;
1854
1855 // amdgcn.kill(i1 1) is a no-op
1856 return IC.eraseInstFromFunction(II);
1857 }
1858 case Intrinsic::amdgcn_s_sendmsg:
1859 case Intrinsic::amdgcn_s_sendmsghalt: {
1860 // The second operand is copied to m0, but is only actually used for
1861 // certain message types. For message types that are known to not use m0,
1862 // fold it to poison.
1863 using namespace AMDGPU::SendMsg;
1864
1865 Value *M0Val = II.getArgOperand(1);
1866 if (isa<PoisonValue>(M0Val))
1867 break;
1868
1869 auto *MsgImm = cast<ConstantInt>(II.getArgOperand(0));
1870 uint16_t MsgId, OpId, StreamId;
1871 decodeMsg(MsgImm->getZExtValue(), MsgId, OpId, StreamId, *ST);
1872
1873 if (!msgDoesNotUseM0(MsgId, *ST))
1874 break;
1875
1876 // Drop UB-implying attributes since we're replacing with poison.
1877 II.dropUBImplyingAttrsAndMetadata();
1878 IC.replaceOperand(II, 1, PoisonValue::get(M0Val->getType()));
1879 return nullptr;
1880 }
1881 case Intrinsic::amdgcn_update_dpp: {
1882 Value *Old = II.getArgOperand(0);
1883
1884 auto *BC = cast<ConstantInt>(II.getArgOperand(5));
1885 auto *RM = cast<ConstantInt>(II.getArgOperand(3));
1886 auto *BM = cast<ConstantInt>(II.getArgOperand(4));
1887 if (BC->isNullValue() || RM->getZExtValue() != 0xF ||
1888 BM->getZExtValue() != 0xF || isa<PoisonValue>(Old))
1889 break;
1890
1891 // If bound_ctrl = 1, row mask = bank mask = 0xf we can omit old value.
1892 return IC.replaceOperand(II, 0, PoisonValue::get(Old->getType()));
1893 }
1894 case Intrinsic::amdgcn_permlane16:
1895 case Intrinsic::amdgcn_permlane16_var:
1896 case Intrinsic::amdgcn_permlanex16:
1897 case Intrinsic::amdgcn_permlanex16_var: {
1898 // Discard vdst_in if it's not going to be read.
1899 Value *VDstIn = II.getArgOperand(0);
1900 if (isa<PoisonValue>(VDstIn))
1901 break;
1902
1903 // FetchInvalid operand idx.
1904 unsigned int FiIdx = (IID == Intrinsic::amdgcn_permlane16 ||
1905 IID == Intrinsic::amdgcn_permlanex16)
1906 ? 4 /* for permlane16 and permlanex16 */
1907 : 3; /* for permlane16_var and permlanex16_var */
1908
1909 // BoundCtrl operand idx.
1910 // For permlane16 and permlanex16 it should be 5
1911 // For Permlane16_var and permlanex16_var it should be 4
1912 unsigned int BcIdx = FiIdx + 1;
1913
1914 ConstantInt *FetchInvalid = cast<ConstantInt>(II.getArgOperand(FiIdx));
1915 ConstantInt *BoundCtrl = cast<ConstantInt>(II.getArgOperand(BcIdx));
1916 if (!FetchInvalid->getZExtValue() && !BoundCtrl->getZExtValue())
1917 break;
1918
1919 return IC.replaceOperand(II, 0, PoisonValue::get(VDstIn->getType()));
1920 }
1921 case Intrinsic::amdgcn_wave_shuffle:
1922 return tryOptimizeShufflePattern(IC, II, *ST);
1923 case Intrinsic::amdgcn_permlane64:
1924 case Intrinsic::amdgcn_readfirstlane:
1925 case Intrinsic::amdgcn_readlane:
1926 case Intrinsic::amdgcn_ds_bpermute: {
1927 // If the data argument is uniform these intrinsics return it unchanged.
1928 unsigned SrcIdx = IID == Intrinsic::amdgcn_ds_bpermute ? 1 : 0;
1929 const Use &Src = II.getArgOperandUse(SrcIdx);
1930 if (isTriviallyUniform(Src))
1931 return IC.replaceInstUsesWith(II, Src.get());
1932
1933 if (IID == Intrinsic::amdgcn_readlane &&
1935 return &II;
1936
1937 // If the lane argument of bpermute is uniform, change it to readlane. This
1938 // generates better code and can enable further optimizations because
1939 // readlane is AlwaysUniform.
1940 if (IID == Intrinsic::amdgcn_ds_bpermute) {
1941 const Use &Lane = II.getArgOperandUse(0);
1942 if (isTriviallyUniform(Lane)) {
1943 Value *NewLane = IC.Builder.CreateLShr(Lane, 2);
1945 II.getModule(), Intrinsic::amdgcn_readlane, II.getType());
1946 II.setCalledFunction(NewDecl);
1947 II.setOperand(0, Src);
1948 II.setOperand(1, NewLane);
1949 return &II;
1950 }
1951 }
1952
1953 if (IID == Intrinsic::amdgcn_ds_bpermute)
1954 return tryOptimizeShufflePattern(IC, II, *ST);
1955
1957 return Res;
1958
1959 return std::nullopt;
1960 }
1961 case Intrinsic::amdgcn_writelane: {
1962 // TODO: Fold bitcast like readlane.
1963 if (simplifyDemandedLaneMaskArg(IC, II, 1))
1964 return &II;
1965 return std::nullopt;
1966 }
1967 case Intrinsic::amdgcn_trig_preop: {
1968 // The intrinsic is declared with name mangling, but currently the
1969 // instruction only exists for f64
1970 if (!II.getType()->isDoubleTy())
1971 break;
1972
1973 Value *Src = II.getArgOperand(0);
1974 Value *Segment = II.getArgOperand(1);
1975 if (isa<PoisonValue>(Src) || isa<PoisonValue>(Segment))
1976 return IC.replaceInstUsesWith(II, PoisonValue::get(II.getType()));
1977
1978 if (isa<UndefValue>(Segment))
1979 return IC.replaceInstUsesWith(II, ConstantFP::getZero(II.getType()));
1980
1981 // Sign bit is not used.
1982 Value *StrippedSign = InstCombiner::stripSignOnlyFPOps(Src);
1983 if (StrippedSign != Src)
1984 return IC.replaceOperand(II, 0, StrippedSign);
1985
1986 if (II.isStrictFP())
1987 break;
1988
1989 const ConstantFP *CSrc = dyn_cast<ConstantFP>(Src);
1990 if (!CSrc && !isa<UndefValue>(Src))
1991 break;
1992
1993 // The instruction ignores special cases, and literally just extracts the
1994 // exponents. Fold undef to nan, and index the table as normal.
1995 APInt FSrcInt = CSrc ? CSrc->getValueAPF().bitcastToAPInt()
1996 : APFloat::getQNaN(II.getType()->getFltSemantics())
1997 .bitcastToAPInt();
1998
1999 const ConstantInt *Cseg = dyn_cast<ConstantInt>(Segment);
2000 if (!Cseg) {
2001 if (isa<UndefValue>(Src))
2002 return IC.replaceInstUsesWith(II, ConstantFP::getZero(II.getType()));
2003 break;
2004 }
2005
2006 unsigned Exponent = FSrcInt.extractBitsAsZExtValue(11, 52);
2007 unsigned SegmentVal = Cseg->getValue().trunc(5).getZExtValue();
2008 unsigned Shift = SegmentVal * 53;
2009 if (Exponent > 1077)
2010 Shift += Exponent - 1077;
2011
2012 // 2.0/PI table.
2013 static const uint32_t TwoByPi[] = {
2014 0xa2f9836e, 0x4e441529, 0xfc2757d1, 0xf534ddc0, 0xdb629599, 0x3c439041,
2015 0xfe5163ab, 0xdebbc561, 0xb7246e3a, 0x424dd2e0, 0x06492eea, 0x09d1921c,
2016 0xfe1deb1c, 0xb129a73e, 0xe88235f5, 0x2ebb4484, 0xe99c7026, 0xb45f7e41,
2017 0x3991d639, 0x835339f4, 0x9c845f8b, 0xbdf9283b, 0x1ff897ff, 0xde05980f,
2018 0xef2f118b, 0x5a0a6d1f, 0x6d367ecf, 0x27cb09b7, 0x4f463f66, 0x9e5fea2d,
2019 0x7527bac7, 0xebe5f17b, 0x3d0739f7, 0x8a5292ea, 0x6bfb5fb1, 0x1f8d5d08,
2020 0x56033046};
2021
2022 // Return 0 for outbound segment (hardware behavior).
2023 unsigned Idx = Shift >> 5;
2024 if (Idx + 2 >= std::size(TwoByPi)) {
2025 APFloat Zero = APFloat::getZero(II.getType()->getFltSemantics());
2026 return IC.replaceInstUsesWith(II, ConstantFP::get(II.getType(), Zero));
2027 }
2028
2029 unsigned BShift = Shift & 0x1f;
2030 uint64_t Thi = Make_64(TwoByPi[Idx], TwoByPi[Idx + 1]);
2031 uint64_t Tlo = Make_64(TwoByPi[Idx + 2], 0);
2032 if (BShift)
2033 Thi = (Thi << BShift) | (Tlo >> (64 - BShift));
2034 Thi = Thi >> 11;
2035 APFloat Result = APFloat((double)Thi);
2036
2037 int Scale = -53 - Shift;
2038 if (Exponent >= 1968)
2039 Scale += 128;
2040
2041 Result = scalbn(Result, Scale, RoundingMode::NearestTiesToEven);
2042 return IC.replaceInstUsesWith(II, ConstantFP::get(Src->getType(), Result));
2043 }
2044 case Intrinsic::amdgcn_fmul_legacy: {
2045 Value *Op0 = II.getArgOperand(0);
2046 Value *Op1 = II.getArgOperand(1);
2047
2048 for (Value *Src : {Op0, Op1}) {
2049 if (isa<PoisonValue>(Src))
2050 return IC.replaceInstUsesWith(II, Src);
2051 }
2052
2053 // The legacy behaviour is that multiplying +/-0.0 by anything, even NaN or
2054 // infinity, gives +0.0.
2055 // TODO: Move to InstSimplify?
2056 if (match(Op0, PatternMatch::m_AnyZeroFP()) ||
2058 return IC.replaceInstUsesWith(II, ConstantFP::getZero(II.getType()));
2059
2060 // If we can prove we don't have one of the special cases then we can use a
2061 // normal fmul instruction instead.
2062 if (canSimplifyLegacyMulToMul(II, Op0, Op1, IC)) {
2063 auto *FMul = IC.Builder.CreateFMulFMF(Op0, Op1, &II);
2064 FMul->takeName(&II);
2065 return IC.replaceInstUsesWith(II, FMul);
2066 }
2067 break;
2068 }
2069 case Intrinsic::amdgcn_fma_legacy: {
2070 Value *Op0 = II.getArgOperand(0);
2071 Value *Op1 = II.getArgOperand(1);
2072 Value *Op2 = II.getArgOperand(2);
2073
2074 for (Value *Src : {Op0, Op1, Op2}) {
2075 if (isa<PoisonValue>(Src))
2076 return IC.replaceInstUsesWith(II, Src);
2077 }
2078
2079 // The legacy behaviour is that multiplying +/-0.0 by anything, even NaN or
2080 // infinity, gives +0.0.
2081 // TODO: Move to InstSimplify?
2082 if (match(Op0, PatternMatch::m_AnyZeroFP()) ||
2084 // It's tempting to just return Op2 here, but that would give the wrong
2085 // result if Op2 was -0.0.
2086 auto *Zero = ConstantFP::getZero(II.getType());
2087 auto *FAdd = IC.Builder.CreateFAddFMF(Zero, Op2, &II);
2088 FAdd->takeName(&II);
2089 return IC.replaceInstUsesWith(II, FAdd);
2090 }
2091
2092 // If we can prove we don't have one of the special cases then we can use a
2093 // normal fma instead.
2094 if (canSimplifyLegacyMulToMul(II, Op0, Op1, IC)) {
2095 II.setCalledOperand(Intrinsic::getOrInsertDeclaration(
2096 II.getModule(), Intrinsic::fma, II.getType()));
2097 return &II;
2098 }
2099 break;
2100 }
2101 case Intrinsic::amdgcn_is_shared:
2102 case Intrinsic::amdgcn_is_private: {
2103 Value *Src = II.getArgOperand(0);
2104 if (isa<PoisonValue>(Src))
2105 return IC.replaceInstUsesWith(II, PoisonValue::get(II.getType()));
2106 if (isa<UndefValue>(Src))
2107 return IC.replaceInstUsesWith(II, UndefValue::get(II.getType()));
2108
2109 if (isa<ConstantPointerNull>(II.getArgOperand(0)))
2110 return IC.replaceInstUsesWith(II, ConstantInt::getFalse(II.getType()));
2111 break;
2112 }
2113 case Intrinsic::amdgcn_make_buffer_rsrc: {
2114 Value *Src = II.getArgOperand(0);
2115 if (isa<PoisonValue>(Src))
2116 return IC.replaceInstUsesWith(II, PoisonValue::get(II.getType()));
2117 return std::nullopt;
2118 }
2119 case Intrinsic::amdgcn_raw_buffer_store_format:
2120 case Intrinsic::amdgcn_struct_buffer_store_format:
2121 case Intrinsic::amdgcn_raw_tbuffer_store:
2122 case Intrinsic::amdgcn_struct_tbuffer_store:
2123 case Intrinsic::amdgcn_image_store_1d:
2124 case Intrinsic::amdgcn_image_store_1darray:
2125 case Intrinsic::amdgcn_image_store_2d:
2126 case Intrinsic::amdgcn_image_store_2darray:
2127 case Intrinsic::amdgcn_image_store_2darraymsaa:
2128 case Intrinsic::amdgcn_image_store_2dmsaa:
2129 case Intrinsic::amdgcn_image_store_3d:
2130 case Intrinsic::amdgcn_image_store_cube:
2131 case Intrinsic::amdgcn_image_store_mip_1d:
2132 case Intrinsic::amdgcn_image_store_mip_1darray:
2133 case Intrinsic::amdgcn_image_store_mip_2d:
2134 case Intrinsic::amdgcn_image_store_mip_2darray:
2135 case Intrinsic::amdgcn_image_store_mip_3d:
2136 case Intrinsic::amdgcn_image_store_mip_cube: {
2137 if (!isa<FixedVectorType>(II.getArgOperand(0)->getType()))
2138 break;
2139
2140 APInt DemandedElts;
2141 if (ST->hasDefaultComponentBroadcast())
2142 DemandedElts = defaultComponentBroadcast(II.getArgOperand(0));
2143 else if (ST->hasDefaultComponentZero())
2144 DemandedElts = trimTrailingZerosInVector(IC, II.getArgOperand(0), &II);
2145 else
2146 break;
2147
2148 int DMaskIdx = getAMDGPUImageDMaskIntrinsic(II.getIntrinsicID()) ? 1 : -1;
2149 if (simplifyAMDGCNMemoryIntrinsicDemanded(IC, II, DemandedElts, DMaskIdx,
2150 false)) {
2151 return IC.eraseInstFromFunction(II);
2152 }
2153
2154 break;
2155 }
2156 case Intrinsic::amdgcn_prng_b32: {
2157 auto *Src = II.getArgOperand(0);
2158 if (isa<UndefValue>(Src)) {
2159 return IC.replaceInstUsesWith(II, Src);
2160 }
2161 return std::nullopt;
2162 }
2163 case Intrinsic::amdgcn_mfma_scale_f32_16x16x128_f8f6f4:
2164 case Intrinsic::amdgcn_mfma_scale_f32_32x32x64_f8f6f4: {
2165 Value *Src0 = II.getArgOperand(0);
2166 Value *Src1 = II.getArgOperand(1);
2167 uint64_t CBSZ = cast<ConstantInt>(II.getArgOperand(3))->getZExtValue();
2168 uint64_t BLGP = cast<ConstantInt>(II.getArgOperand(4))->getZExtValue();
2169 auto *Src0Ty = cast<FixedVectorType>(Src0->getType());
2170 auto *Src1Ty = cast<FixedVectorType>(Src1->getType());
2171
2172 auto getFormatNumRegs = [](unsigned FormatVal) {
2173 switch (FormatVal) {
2176 return 6u;
2178 return 4u;
2181 return 8u;
2182 default:
2183 llvm_unreachable("invalid format value");
2184 }
2185 };
2186
2187 bool MadeChange = false;
2188 unsigned Src0NumElts = getFormatNumRegs(CBSZ);
2189 unsigned Src1NumElts = getFormatNumRegs(BLGP);
2190
2191 // Depending on the used format, fewer registers are required so shrink the
2192 // vector type.
2193 if (Src0Ty->getNumElements() > Src0NumElts) {
2194 Src0 = IC.Builder.CreateExtractVector(
2195 FixedVectorType::get(Src0Ty->getElementType(), Src0NumElts), Src0,
2196 uint64_t(0));
2197 MadeChange = true;
2198 }
2199
2200 if (Src1Ty->getNumElements() > Src1NumElts) {
2201 Src1 = IC.Builder.CreateExtractVector(
2202 FixedVectorType::get(Src1Ty->getElementType(), Src1NumElts), Src1,
2203 uint64_t(0));
2204 MadeChange = true;
2205 }
2206
2207 if (!MadeChange)
2208 return std::nullopt;
2209
2210 SmallVector<Value *, 10> Args(II.args());
2211 Args[0] = Src0;
2212 Args[1] = Src1;
2213
2214 CallInst *NewII = IC.Builder.CreateIntrinsic(
2215 IID, {Src0->getType(), Src1->getType()}, Args, &II);
2216 NewII->takeName(&II);
2217 return IC.replaceInstUsesWith(II, NewII);
2218 }
2219 case Intrinsic::amdgcn_wmma_f32_16x16x128_f8f6f4:
2220 case Intrinsic::amdgcn_wmma_scale_f32_16x16x128_f8f6f4:
2221 case Intrinsic::amdgcn_wmma_scale16_f32_16x16x128_f8f6f4: {
2222 Value *Src0 = II.getArgOperand(1);
2223 Value *Src1 = II.getArgOperand(3);
2224 unsigned FmtA = cast<ConstantInt>(II.getArgOperand(0))->getZExtValue();
2225 uint64_t FmtB = cast<ConstantInt>(II.getArgOperand(2))->getZExtValue();
2226 auto *Src0Ty = cast<FixedVectorType>(Src0->getType());
2227 auto *Src1Ty = cast<FixedVectorType>(Src1->getType());
2228
2229 bool MadeChange = false;
2230 unsigned Src0NumElts = AMDGPU::wmmaScaleF8F6F4FormatToNumRegs(FmtA);
2231 unsigned Src1NumElts = AMDGPU::wmmaScaleF8F6F4FormatToNumRegs(FmtB);
2232
2233 // Depending on the used format, fewer registers are required so shrink the
2234 // vector type.
2235 if (Src0Ty->getNumElements() > Src0NumElts) {
2236 Src0 = IC.Builder.CreateExtractVector(
2237 FixedVectorType::get(Src0Ty->getElementType(), Src0NumElts), Src0,
2238 IC.Builder.getInt64(0));
2239 MadeChange = true;
2240 }
2241
2242 if (Src1Ty->getNumElements() > Src1NumElts) {
2243 Src1 = IC.Builder.CreateExtractVector(
2244 FixedVectorType::get(Src1Ty->getElementType(), Src1NumElts), Src1,
2245 IC.Builder.getInt64(0));
2246 MadeChange = true;
2247 }
2248
2249 if (!MadeChange)
2250 return std::nullopt;
2251
2252 SmallVector<Value *, 13> Args(II.args());
2253 Args[1] = Src0;
2254 Args[3] = Src1;
2255
2256 CallInst *NewII = IC.Builder.CreateIntrinsic(
2257 IID, {II.getArgOperand(5)->getType(), Src0->getType(), Src1->getType()},
2258 Args, &II);
2259 NewII->takeName(&II);
2260 return IC.replaceInstUsesWith(II, NewII);
2261 }
2262 }
2263 if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
2264 AMDGPU::getImageDimIntrinsicInfo(II.getIntrinsicID())) {
2265 return simplifyAMDGCNImageIntrinsic(ST, ImageDimIntr, II, IC);
2266 }
2267 return std::nullopt;
2268}
2269
2270/// Implement SimplifyDemandedVectorElts for amdgcn buffer and image intrinsics.
2271///
2272/// The result of simplifying amdgcn image and buffer store intrinsics is updating
2273/// definitions of the intrinsic's vector argument, not Uses of the result like
2274/// image and buffer loads.
2275/// Note: This only supports non-TFE/LWE image intrinsic calls; the TFE/LWE
2276/// variants have struct returns.
///
/// \p DemandedElts is the set of vector elements that are actually needed.
/// \p DMaskIdx is the index of the dmask operand for image intrinsics; a
/// negative value selects the buffer path. \p IsLoad selects whether the
/// vector type is taken from the call's result or from operand 0.
///
/// Returns nullptr when no replacement value is produced (the dmask operand of
/// the original call may still have been updated in place), a poison value
/// when no element is demanded, and otherwise the replacement: the narrowed
/// call itself for stores, or the narrowed load widened back to the original
/// vector type for loads.
///
/// NOTE(review): the opening lines of the signature (numbered 2277-2278) and
/// body lines 2289-2290 are absent from this listing.
2279 APInt DemandedElts,
2280 int DMaskIdx, bool IsLoad) {
2281
2282 auto *IIVTy = cast<FixedVectorType>(IsLoad ? II.getType()
2283 : II.getOperand(0)->getType());
2284 unsigned VWidth = IIVTy->getNumElements();
     // A single-element vector cannot be narrowed any further.
2285 if (VWidth == 1)
2286 return nullptr;
2287 Type *EltTy = IIVTy->getElementType();
2288
2291
2292 // Assume the arguments are unchanged and later override them, if needed.
2293 SmallVector<Value *, 16> Args(II.args());
2294
2295 if (DMaskIdx < 0) {
2296 // Buffer case.
2297
2298 const unsigned ActiveBits = DemandedElts.getActiveBits();
2299 const unsigned UnusedComponentsAtFront = DemandedElts.countr_zero();
2300
2301 // Start assuming the prefix of elements is demanded, but possibly clear
2302 // some other bits if there are trailing zeros (unused components at front)
2303 // and update offset.
2304 DemandedElts = (1 << ActiveBits) - 1;
2305
2306 if (UnusedComponentsAtFront > 0) {
     // Sentinel meaning "this intrinsic has no offset operand we can adjust".
2307 static const unsigned InvalidOffsetIdx = 0xf;
2308
2309 unsigned OffsetIdx;
2310 switch (II.getIntrinsicID()) {
2311 case Intrinsic::amdgcn_raw_buffer_load:
2312 case Intrinsic::amdgcn_raw_ptr_buffer_load:
2313 OffsetIdx = 1;
2314 break;
2315 case Intrinsic::amdgcn_s_buffer_load:
2316 // If resulting type is vec3, there is no point in trimming the
2317 // load with updated offset, as the vec3 would most likely be widened to
2318 // vec4 anyway during lowering.
2319 if (ActiveBits == 4 && UnusedComponentsAtFront == 1)
2320 OffsetIdx = InvalidOffsetIdx;
2321 else
2322 OffsetIdx = 1;
2323 break;
2324 case Intrinsic::amdgcn_struct_buffer_load:
2325 case Intrinsic::amdgcn_struct_ptr_buffer_load:
2326 OffsetIdx = 2;
2327 break;
2328 default:
2329 // TODO: handle tbuffer* intrinsics.
2330 OffsetIdx = InvalidOffsetIdx;
2331 break;
2332 }
2333
2334 if (OffsetIdx != InvalidOffsetIdx) {
2335 // Clear demanded bits and update the offset.
     // The offset operand is advanced by the byte size of the skipped leading
     // components (component size in bits divided by 8).
2336 DemandedElts &= ~((1 << UnusedComponentsAtFront) - 1);
2337 auto *Offset = Args[OffsetIdx];
2338 unsigned SingleComponentSizeInBits =
2339 IC.getDataLayout().getTypeSizeInBits(EltTy);
2340 unsigned OffsetAdd =
2341 UnusedComponentsAtFront * SingleComponentSizeInBits / 8;
2342 auto *OffsetAddVal = ConstantInt::get(Offset->getType(), OffsetAdd);
2343 Args[OffsetIdx] = IC.Builder.CreateAdd(Offset, OffsetAddVal);
2344 }
2345 }
2346 } else {
2347 // Image case.
2348
2349 ConstantInt *DMask = cast<ConstantInt>(Args[DMaskIdx]);
2350 unsigned DMaskVal = DMask->getZExtValue() & 0xf;
2351
2352 // dmask 0 has special semantics, do not simplify.
2353 if (DMaskVal == 0)
2354 return nullptr;
2355
2356 // Mask off values that are undefined because the dmask doesn't cover them
2357 DemandedElts &= (1 << llvm::popcount(DMaskVal)) - 1;
2358
     // Rebuild the dmask: walk its four bits; each set bit corresponds to the
     // next vector element (OrigLdStIdx) and is kept only if that element is
     // still demanded.
2359 unsigned NewDMaskVal = 0;
2360 unsigned OrigLdStIdx = 0;
2361 for (unsigned SrcIdx = 0; SrcIdx < 4; ++SrcIdx) {
2362 const unsigned Bit = 1 << SrcIdx;
2363 if (!!(DMaskVal & Bit)) {
2364 if (!!DemandedElts[OrigLdStIdx])
2365 NewDMaskVal |= Bit;
2366 OrigLdStIdx++;
2367 }
2368 }
2369
2370 if (DMaskVal != NewDMaskVal)
2371 Args[DMaskIdx] = ConstantInt::get(DMask->getType(), NewDMaskVal);
2372 }
2373
2374 unsigned NewNumElts = DemandedElts.popcount();
2375 if (!NewNumElts)
2376 return PoisonValue::get(IIVTy);
2377
     // Nothing to shrink: the vector width stays the same. Only the (possibly
     // updated) dmask operand is written back to the existing call.
2378 if (NewNumElts >= VWidth && DemandedElts.isMask()) {
2379 if (DMaskIdx >= 0)
2380 II.setArgOperand(DMaskIdx, Args[DMaskIdx]);
2381 return nullptr;
2382 }
2383
2384 // Validate function argument and return types, extracting overloaded types
2385 // along the way.
2386 SmallVector<Type *, 6> OverloadTys;
2387 if (!Intrinsic::isSignatureValid(II.getCalledFunction(), OverloadTys))
2388 return nullptr;
2389
     // The narrowed type is a scalar when exactly one element remains;
     // it replaces overload type slot 0.
2390 Type *NewTy =
2391 (NewNumElts == 1) ? EltTy : FixedVectorType::get(EltTy, NewNumElts);
2392 OverloadTys[0] = NewTy;
2393
2394 if (!IsLoad) {
     // Stores: compact the demanded elements of the stored vector (operand 0)
     // into the narrowed data operand.
2395 SmallVector<int, 8> EltMask;
2396 for (unsigned OrigStoreIdx = 0; OrigStoreIdx < VWidth; ++OrigStoreIdx)
2397 if (DemandedElts[OrigStoreIdx])
2398 EltMask.push_back(OrigStoreIdx);
2399
2400 if (NewNumElts == 1)
2401 Args[0] = IC.Builder.CreateExtractElement(II.getOperand(0), EltMask[0]);
2402 else
2403 Args[0] = IC.Builder.CreateShuffleVector(II.getOperand(0), EltMask);
2404 }
2405
     // Create the narrowed call and carry over name, metadata and attributes
     // from the original call.
2406 CallInst *NewCall =
2407 IC.Builder.CreateIntrinsic(II.getIntrinsicID(), OverloadTys, Args);
2408 NewCall->takeName(&II);
2409 NewCall->copyMetadata(II);
2410 AttributeList OldAttrList = II.getAttributes();
2411 NewCall->setAttributes(OldAttrList);
2412
2413 if (IsLoad) {
     // Loads: widen the narrowed result back to the original vector type.
2414 if (NewNumElts == 1) {
2415 return IC.Builder.CreateInsertElement(PoisonValue::get(IIVTy), NewCall,
2416 DemandedElts.countr_zero());
2417 }
2418
     // Demanded lanes take consecutive elements of the new load; every other
     // lane gets index NewNumElts, one past the narrowed vector's last element.
2419 SmallVector<int, 8> EltMask;
2420 unsigned NewLoadIdx = 0;
2421 for (unsigned OrigLoadIdx = 0; OrigLoadIdx < VWidth; ++OrigLoadIdx) {
2422 if (!!DemandedElts[OrigLoadIdx])
2423 EltMask.push_back(NewLoadIdx++);
2424 else
2425 EltMask.push_back(NewNumElts);
2426 }
2427
2428 auto *Shuffle = IC.Builder.CreateShuffleVector(NewCall, EltMask);
2429
2430 return Shuffle;
2431 }
2432
2433 return NewCall;
2434}
2435
// Shrink a lane intrinsic call to the contiguous span of result elements
// between the first and the last demanded element.
//
// Argument 0 is narrowed to that span, the intrinsic is re-declared for the
// narrowed type and called with the original call's operand bundles, and the
// result is placed back at its original element positions in a value of the
// original vector type. Lanes that are not demanded are left as -1 in the
// shuffle masks (or come from the poison vector in the single-element case).
//
// Returns nullptr if the result type is not a fixed vector, if the span
// already covers the whole multi-element vector, or if the narrowed type is
// not legal. UndefElts is neither read nor written in the visible body.
//
// NOTE(review): the line opening this signature (numbered 2436) and lines
// 2463 and 2466 -- presumably the declarations of OpBundles and M -- are
// absent from this listing.
2437 InstCombiner &IC, IntrinsicInst &II, const APInt &DemandedElts,
2438 APInt &UndefElts) const {
2439 auto *VT = dyn_cast<FixedVectorType>(II.getType());
2440 if (!VT)
2441 return nullptr;
2442
     // [FirstElt, LastElt] is the smallest contiguous range of elements that
     // contains every demanded element; MaskLen is its length.
2443 const unsigned FirstElt = DemandedElts.countr_zero();
2444 const unsigned LastElt = DemandedElts.getActiveBits() - 1;
2445 const unsigned MaskLen = LastElt - FirstElt + 1;
2446
2447 unsigned OldNumElts = VT->getNumElements();
2448 if (MaskLen == OldNumElts && MaskLen != 1)
2449 return nullptr;
2450
2451 Type *EltTy = VT->getElementType();
2452 Type *NewVT = MaskLen == 1 ? EltTy : FixedVectorType::get(EltTy, MaskLen);
2453
2454 // Theoretically we should support these intrinsics for any legal type. Avoid
2455 // introducing cases that aren't direct register types like v3i16.
2456 if (!isTypeLegal(NewVT))
2457 return nullptr;
2458
2459 Value *Src = II.getArgOperand(0);
2460
2461 // Make sure convergence tokens are preserved.
2462 // TODO: CreateIntrinsic should allow directly copying bundles
2464 II.getOperandBundlesAsDefs(OpBundles);
2465
2467 Function *Remangled =
2468 Intrinsic::getOrInsertDeclaration(M, II.getIntrinsicID(), {NewVT});
2469
     // Single demanded element: operate on the scalar and insert the result
     // back at its original index in an otherwise poison vector.
2470 if (MaskLen == 1) {
2471 Value *Extract = IC.Builder.CreateExtractElement(Src, FirstElt);
2472
2473 // TODO: Preserve callsite attributes?
2474 CallInst *NewCall = IC.Builder.CreateCall(Remangled, {Extract}, OpBundles);
2475
2476 return IC.Builder.CreateInsertElement(PoisonValue::get(II.getType()),
2477 NewCall, FirstElt);
2478 }
2479
     // ExtractMask picks the demanded source elements of the span; elements
     // inside the span that are not demanded stay -1.
2480 SmallVector<int> ExtractMask(MaskLen, -1);
2481 for (unsigned I = 0; I != MaskLen; ++I) {
2482 if (DemandedElts[FirstElt + I])
2483 ExtractMask[I] = FirstElt + I;
2484 }
2485
2486 Value *Extract = IC.Builder.CreateShuffleVector(Src, ExtractMask);
2487
2488 // TODO: Preserve callsite attributes?
2489 CallInst *NewCall = IC.Builder.CreateCall(Remangled, {Extract}, OpBundles);
2490
     // InsertMask is the inverse mapping: it moves element I of the narrowed
     // result back to position FirstElt + I of the original-width vector.
2491 SmallVector<int> InsertMask(OldNumElts, -1);
2492 for (unsigned I = 0; I != MaskLen; ++I) {
2493 if (DemandedElts[FirstElt + I])
2494 InsertMask[FirstElt + I] = I;
2495 }
2496
2497 // FIXME: If the call has a convergence bundle, we end up leaving the dead
2498 // call behind.
2499 return IC.Builder.CreateShuffleVector(NewCall, InsertMask);
2500}
2501
// Dispatch demanded-vector-element simplification on the intrinsic ID of II:
//  - amdgcn_readfirstlane: simplify operand 0 through SimplifyAndSetOp, then
//    try to narrow the call with simplifyAMDGCNLaneIntrinsicDemanded.
//  - the listed raw/struct/s buffer and tbuffer loads: forward to
//    simplifyAMDGCNMemoryIntrinsicDemanded, leaving its dmask-index and
//    is-load arguments at their defaults.
//  - any intrinsic found by getAMDGPUImageDMaskIntrinsic: forward to the same
//    helper with 0 as the dmask operand index.
// Every other intrinsic yields std::nullopt. UndefElts2 and UndefElts3 are not
// used in the visible body.
//
// NOTE(review): the line opening this signature (numbered 2502) is absent from
// this listing.
2503 InstCombiner &IC, IntrinsicInst &II, APInt DemandedElts, APInt &UndefElts,
2504 APInt &UndefElts2, APInt &UndefElts3,
2505 std::function<void(Instruction *, unsigned, APInt, APInt &)>
2506 SimplifyAndSetOp) const {
2507 switch (II.getIntrinsicID()) {
2508 case Intrinsic::amdgcn_readfirstlane:
2509 SimplifyAndSetOp(&II, 0, DemandedElts, UndefElts);
2510 return simplifyAMDGCNLaneIntrinsicDemanded(IC, II, DemandedElts, UndefElts);
2511 case Intrinsic::amdgcn_raw_buffer_load:
2512 case Intrinsic::amdgcn_raw_ptr_buffer_load:
2513 case Intrinsic::amdgcn_raw_buffer_load_format:
2514 case Intrinsic::amdgcn_raw_ptr_buffer_load_format:
2515 case Intrinsic::amdgcn_raw_tbuffer_load:
2516 case Intrinsic::amdgcn_raw_ptr_tbuffer_load:
2517 case Intrinsic::amdgcn_s_buffer_load:
2518 case Intrinsic::amdgcn_struct_buffer_load:
2519 case Intrinsic::amdgcn_struct_ptr_buffer_load:
2520 case Intrinsic::amdgcn_struct_buffer_load_format:
2521 case Intrinsic::amdgcn_struct_ptr_buffer_load_format:
2522 case Intrinsic::amdgcn_struct_tbuffer_load:
2523 case Intrinsic::amdgcn_struct_ptr_tbuffer_load:
2524 return simplifyAMDGCNMemoryIntrinsicDemanded(IC, II, DemandedElts);
2525 default: {
     // Image intrinsics are recognised through the dmask lookup table rather
     // than being enumerated as individual cases.
2526 if (getAMDGPUImageDMaskIntrinsic(II.getIntrinsicID())) {
2527 return simplifyAMDGCNMemoryIntrinsicDemanded(IC, II, DemandedElts, 0);
2528 }
2529 break;
2530 }
2531 }
2532 return std::nullopt;
2533}
for(const MachineOperand &MO :llvm::drop_begin(OldMI.operands(), Desc.getNumOperands()))
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
static Value * createPermlane16(IRBuilderBase &B, Value *Val, uint32_t Lo, uint32_t Hi)
Emit v_permlane16 with the precomputed lane-select halves.
static std::optional< unsigned > matchRowSharePattern(ArrayRef< uint8_t > Ids)
Match a row-share pattern: all 16 lanes of each row read the same source lane.
static bool matchMirrorPattern(ArrayRef< uint8_t > Ids)
Match an N-lane reversal (mirror) pattern.
static bool tryBuildShuffleMap(Value *Index, const GCNSubtarget &ST, SmallVectorImpl< uint8_t > &Ids, const DataLayout &DL)
Build the per-lane shuffle map by evaluating Index for every lane in the wave.
static std::optional< unsigned > matchQuadPermPattern(ArrayRef< uint8_t > Ids)
Match a 4-lane (quad) permutation, encoded as the v_mov_b32_dpp QUAD_PERM control word: bits[1:0]=Ids...
static std::optional< unsigned > matchDsSwizzleRotatePattern(ArrayRef< uint8_t > Ids)
Match a GFX9+ DS_SWIZZLE rotate-mode permutation: a cyclic left-rotation of all 32 lanes within each ...
static std::optional< unsigned > matchHalfRowPermPattern(ArrayRef< uint8_t > Ids)
Match an 8-lane arbitrary permutation, encoded as the v_mov_b32_dpp8 24-bit selector (three bits per ...
static std::optional< unsigned > matchRowXMaskPattern(ArrayRef< uint8_t > Ids)
Match an XOR mask pattern within each 16-lane row: Ids[J] == Mask ^ J, with Mask in [1,...
static constexpr auto matchHalfRowMirrorPattern
static Value * createPermlaneX16(IRBuilderBase &B, Value *Val, uint32_t Lo, uint32_t Hi)
Emit v_permlanex16 with the precomputed lane-select halves.
static bool isRowPattern(ArrayRef< uint8_t > Ids)
Match an N-lane row pattern: each lane in [0, N) reads from a source lane in the same N-lane row,...
static bool canContractSqrtToRsq(const FPMathOperator *SqrtOp)
Return true if it's legal to contract llvm.amdgcn.rcp(llvm.sqrt)
static bool isTriviallyUniform(const Use &U)
Return true if we can easily prove that use U is uniform.
static CallInst * rewriteCall(IRBuilderBase &B, CallInst &Old, Function &NewCallee, ArrayRef< Value * > Ops)
static Value * convertTo16Bit(Value &V, InstCombiner::BuilderTy &Builder)
static constexpr auto isFullRowPattern
static constexpr auto isQuadPattern
static APInt trimTrailingZerosInVector(InstCombiner &IC, Value *UseV, Instruction *I)
static uint64_t computePermlane16Masks(ArrayRef< uint8_t > Ids)
Pack a 16-lane permutation into a single 64-bit value: four bits per output lane, lane J in bits [J*4...
static bool matchHalfWaveSwapPattern(ArrayRef< uint8_t > Ids)
Match a half-wave swap: lane J reads from lane J ^ 32.
static bool hasPeriodicLayout(ArrayRef< uint8_t > Ids)
Lanes are partitioned into groups of Period; each group is a translated copy of the first: Ids[I] = I...
static std::optional< Instruction * > tryOptimizeShufflePattern(InstCombiner &IC, IntrinsicInst &II, const GCNSubtarget &ST)
Try to fold a wave_shuffle/ds_bpermute whose lane index is a constant function of the lane ID into a ...
static constexpr auto isHalfRowPattern
static APInt defaultComponentBroadcast(Value *V)
static std::optional< unsigned > matchDsSwizzleBitmaskPattern(ArrayRef< uint8_t > Ids)
Match a DS_SWIZZLE bitmask-mode permutation: dst_lane = ((src_lane & AND) | OR) ^ XOR with each mask ...
static Value * createDsSwizzle(IRBuilderBase &B, Value *Val, unsigned Offset, const DataLayout &DL)
Emit ds_swizzle with the given immediate, bitcasting/converting between pointer/float types and i32 a...
static std::optional< Instruction * > modifyIntrinsicCall(IntrinsicInst &OldIntr, Instruction &InstToReplace, unsigned NewIntr, InstCombiner &IC, std::function< void(SmallVectorImpl< Value * > &, SmallVectorImpl< Type * > &)> Func)
Applies Func(OldIntr.Args, OldIntr.ArgTys), creates intrinsic call with modified arguments (based on ...
static Value * matchShuffleToHWIntrinsic(IRBuilderBase &B, Value *Src, ArrayRef< uint8_t > Ids, const GCNSubtarget &ST, const DataLayout &DL)
Given a shuffle map, try to emit the best hardware intrinsic.
static std::optional< unsigned > matchRowRotatePattern(ArrayRef< uint8_t > Ids)
Match a 16-lane cyclic rotation; returns the rotation amount in [1, 15].
static bool isCrossRowPattern(ArrayRef< uint8_t > Ids)
Match a cross-row permutation suitable for v_permlanex16: every lane in the low 16-lane half reads fr...
static bool isThreadID(const GCNSubtarget &ST, Value *V)
static Value * createUpdateDpp(IRBuilderBase &B, Value *Val, unsigned Ctrl)
Emit v_mov_b32_dpp with the given control word, row/bank masks 0xF, and bound_ctrl=1 so out-of-bounds...
static APFloat fmed3AMDGCN(const APFloat &Src0, const APFloat &Src1, const APFloat &Src2)
static Value * simplifyAMDGCNMemoryIntrinsicDemanded(InstCombiner &IC, IntrinsicInst &II, APInt DemandedElts, int DMaskIdx=-1, bool IsLoad=true)
Implement SimplifyDemandedVectorElts for amdgcn buffer and image intrinsics.
static std::optional< Instruction * > simplifyAMDGCNImageIntrinsic(const GCNSubtarget *ST, const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr, IntrinsicInst &II, InstCombiner &IC)
static bool canSafelyConvertTo16Bit(Value &V, bool IsFloat)
static Value * createMovDpp8(IRBuilderBase &B, Value *Val, unsigned Selector)
Emit v_mov_b32_dpp8 with the given 24-bit lane selector.
static Value * matchFPExtFromF16(Value *Arg)
Match an fpext from half to float, or a constant we can convert.
static constexpr auto matchFullRowMirrorPattern
static std::optional< unsigned > evalLaneExpr(Value *V, unsigned Lane, const GCNSubtarget &ST, const DataLayout &DL, unsigned Depth=0)
Evaluate V as a function of the lane ID and return its value on Lane, or std::nullopt if V is not a c...
static Value * createPermlane64(IRBuilderBase &B, Value *Val)
Emit v_permlane64 (swap of the two 32-lane halves of a wave64).
Contains the definition of a TargetInstrInfo class that is common to all AMD GPUs.
This file provides a TargetTransformInfoImplBase conforming object specific to the AMDGPU target machine.
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
#define X(NUM, ENUM, NAME)
Definition ELF.h:853
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
This file contains the declarations for the subclasses of Constant, which represent the different fla...
Utilities for dealing with flags related to floating point properties and mode controls.
AMD GCN specific subclass of TargetSubtarget.
This file provides the interface for the instcombine pass implementation.
const AbstractManglingParser< Derived, Alloc >::OperatorInfo AbstractManglingParser< Derived, Alloc >::Ops[]
#define I(x, y, z)
Definition MD5.cpp:57
MachineInstr unsigned OpIdx
uint64_t IntrinsicInst * II
if(PassOpts->AAPipeline)
This file contains some templates that are useful if you are working with the STL at all.
Provides some synthesis utilities to produce sequences of values.
static TableGen::Emitter::Opt Y("gen-skeleton-entry", EmitSkeleton, "Generate example skeleton entry")
Value * RHS
Value * LHS
static constexpr roundingMode rmTowardZero
Definition APFloat.h:348
static constexpr roundingMode rmNearestTiesToEven
Definition APFloat.h:344
static const fltSemantics & IEEEhalf()
Definition APFloat.h:294
static APFloat getQNaN(const fltSemantics &Sem, bool Negative=false, const APInt *payload=nullptr)
Factory for QNaN values.
Definition APFloat.h:1179
opStatus divide(const APFloat &RHS, roundingMode RM)
Definition APFloat.h:1267
LLVM_ABI opStatus convert(const fltSemantics &ToSemantics, roundingMode RM, bool *losesInfo)
Definition APFloat.cpp:5912
bool bitwiseIsEqual(const APFloat &RHS) const
Definition APFloat.h:1503
bool isPosInfinity() const
Definition APFloat.h:1551
const fltSemantics & getSemantics() const
Definition APFloat.h:1546
APFloat makeQuiet() const
Assuming this is an IEEE-754 NaN value, quiet its signaling bit.
Definition APFloat.h:1375
bool isNaN() const
Definition APFloat.h:1536
bool isSignaling() const
Definition APFloat.h:1540
APInt bitcastToAPInt() const
Definition APFloat.h:1430
bool isNegInfinity() const
Definition APFloat.h:1552
static APFloat getZero(const fltSemantics &Sem, bool Negative=false)
Factory for Positive and Negative Zero.
Definition APFloat.h:1138
bool isInfinity() const
Definition APFloat.h:1535
Class for arbitrary precision integers.
Definition APInt.h:78
static APInt getAllOnes(unsigned numBits)
Return an APInt of a specified width with all bits set.
Definition APInt.h:235
void clearBit(unsigned BitPosition)
Set a given bit to 0.
Definition APInt.h:1429
uint64_t getZExtValue() const
Get zero extended value.
Definition APInt.h:1563
unsigned popcount() const
Count the number of bits set.
Definition APInt.h:1693
LLVM_ABI uint64_t extractBitsAsZExtValue(unsigned numBits, unsigned bitPosition) const
Definition APInt.cpp:521
unsigned getActiveBits() const
Compute the number of active bits in the value.
Definition APInt.h:1535
LLVM_ABI APInt trunc(unsigned width) const
Truncate to new width.
Definition APInt.cpp:968
unsigned countr_zero() const
Count the number of trailing zero bits.
Definition APInt.h:1662
bool isMask(unsigned numBits) const
Definition APInt.h:489
Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:40
ArrayRef< T > take_front(size_t N=1) const
Return a copy of *this with only the first N elements.
Definition ArrayRef.h:218
size_t size() const
Get the array size.
Definition ArrayRef.h:141
static LLVM_ABI Attribute getWithDereferenceableBytes(LLVMContext &Context, uint64_t Bytes)
LLVM_ABI const Module * getModule() const
Return the module owning the function this basic block belongs to, or nullptr if the function does no...
bool isTypeLegal(Type *Ty) const override
LLVM_ABI void getOperandBundlesAsDefs(SmallVectorImpl< OperandBundleDef > &Defs) const
Return the list of operand bundles attached to this instruction as a vector of OperandBundleDefs.
Function * getCalledFunction() const
Returns the function called, or null if this is an indirect function invocation or the function signa...
void setAttributes(AttributeList A)
Set the attributes for this call.
iterator_range< User::op_iterator > args()
Iteration adapter for range-for loops.
AttributeList getAttributes() const
Return the attributes for this call.
This class represents a function call, abstracting a target machine's calling convention.
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
Definition InstrTypes.h:740
@ ICMP_NE
not equal
Definition InstrTypes.h:762
bool isSigned() const
Definition InstrTypes.h:993
Predicate getSwappedPredicate() const
For example, EQ->EQ, SLE->SGE, ULT->UGT, OEQ->OEQ, ULE->UGE, OLT->OGT, etc.
Definition InstrTypes.h:890
bool isFPPredicate() const
Definition InstrTypes.h:845
Predicate getInversePredicate() const
For example, EQ -> NE, UGT -> ULE, SLT -> SGE, OEQ -> UNE, UGT -> OLE, OLT -> UGE,...
Definition InstrTypes.h:852
An abstraction over a floating-point predicate, and a pack of an integer predicate with samesign info...
ConstantFP - Floating Point Values [float, double].
Definition Constants.h:420
const APFloat & getValueAPF() const
Definition Constants.h:463
static LLVM_ABI ConstantFP * getZero(Type *Ty, bool Negative=false)
static LLVM_ABI ConstantFP * getNaN(Type *Ty, bool Negative=false, uint64_t Payload=0)
static LLVM_ABI ConstantFP * getInfinity(Type *Ty, bool Negative=false)
This is the shared class of boolean and integer constants.
Definition Constants.h:87
static ConstantInt * getSigned(IntegerType *Ty, int64_t V, bool ImplicitTrunc=false)
Return a ConstantInt with the specified value for the specified type.
Definition Constants.h:135
static LLVM_ABI ConstantInt * getFalse(LLVMContext &Context)
uint64_t getZExtValue() const
Return the constant as a 64-bit unsigned integer value after it has been zero extended as appropriate...
Definition Constants.h:168
const APInt & getValue() const
Return the constant as an APInt value reference.
Definition Constants.h:159
This class represents a range of values.
LLVM_ABI ConstantRange add(const ConstantRange &Other) const
Return a new range representing the possible values resulting from an addition of a value in this ran...
LLVM_ABI bool isFullSet() const
Return true if this set contains all of the elements possible for this data-type.
LLVM_ABI ConstantRange intersectWith(const ConstantRange &CR, PreferredRangeType Type=Smallest) const
Return the range that results from the intersection of this range with another range.
This is an important base class in LLVM.
Definition Constant.h:43
bool isNullValue() const
Return true if this is the value that would be returned by getNullValue.
Definition Constant.h:64
static LLVM_ABI Constant * getNullValue(Type *Ty)
Constructor to create a '0' constant of arbitrary type.
A parsed version of the target data layout string, together with methods for querying it.
Definition DataLayout.h:64
TypeSize getTypeSizeInBits(Type *Ty) const
Size examples:
Definition DataLayout.h:791
LLVM_ABI bool dominates(const BasicBlock *BB, const Use &U) const
Return true if the (end of the) basic block BB dominates the use U.
Tagged union holding either a T or a Error.
Definition Error.h:485
This class represents an extension of floating point types.
Utility class for floating point operations which can have information about relaxed accuracy require...
Definition Operator.h:202
FastMathFlags getFastMathFlags() const
Convenience function for getting all the fast-math flags.
Definition Operator.h:291
bool hasApproxFunc() const
Test if this operation allows approximations of math library functions or intrinsics.
Definition Operator.h:288
LLVM_ABI float getFPAccuracy() const
Get the maximum error permitted by this operation in ULPs.
Convenience struct for specifying and reasoning about fast-math flags.
Definition FMF.h:23
bool allowContract() const
Definition FMF.h:69
static LLVM_ABI FixedVectorType * get(Type *ElementType, unsigned NumElts)
Definition Type.cpp:869
bool simplifyDemandedLaneMaskArg(InstCombiner &IC, IntrinsicInst &II, unsigned LaneAgIdx) const
Simplify a lane index operand (e.g. llvm.amdgcn.readlane src1).
std::optional< Instruction * > instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const override
Instruction * hoistLaneIntrinsicThroughOperand(InstCombiner &IC, IntrinsicInst &II) const
std::optional< Value * > simplifyDemandedVectorEltsIntrinsic(InstCombiner &IC, IntrinsicInst &II, APInt DemandedElts, APInt &UndefElts, APInt &UndefElts2, APInt &UndefElts3, std::function< void(Instruction *, unsigned, APInt, APInt &)> SimplifyAndSetOp) const override
KnownIEEEMode fpenvIEEEMode(const Instruction &I) const
Return KnownIEEEMode::On if we know that the use context can assume "amdgpu-ieee"="true" and KnownIEEEM...
Value * simplifyAMDGCNLaneIntrinsicDemanded(InstCombiner &IC, IntrinsicInst &II, const APInt &DemandedElts, APInt &UndefElts) const
bool canSimplifyLegacyMulToMul(const Instruction &I, const Value *Op0, const Value *Op1, InstCombiner &IC) const
Common base class shared among various IRBuilders.
Definition IRBuilder.h:114
CallInst * CreateExtractVector(Type *DstType, Value *SrcVec, Value *Idx, const Twine &Name="")
Create a call to the vector.extract intrinsic.
Definition IRBuilder.h:1135
Value * CreateInsertElement(Type *VecTy, Value *NewElt, Value *Idx, const Twine &Name="")
Definition IRBuilder.h:2637
Value * CreateExtractElement(Value *Vec, Value *Idx, const Twine &Name="")
Definition IRBuilder.h:2625
IntegerType * getIntNTy(unsigned N)
Fetch the type representing an N-bit integer.
Definition IRBuilder.h:599
Value * CreateZExtOrTrunc(Value *V, Type *DestTy, const Twine &Name="")
Create a ZExt or Trunc from the integer value V to DestTy.
Definition IRBuilder.h:2148
ConstantInt * getTrue()
Get the constant value for i1 true.
Definition IRBuilder.h:509
LLVM_ABI CallInst * CreateIntrinsic(Intrinsic::ID ID, ArrayRef< Type * > OverloadTypes, ArrayRef< Value * > Args, FMFSource FMFSource={}, const Twine &Name="", ArrayRef< OperandBundleDef > OpBundles={})
Create a call to intrinsic ID with Args, mangled using OverloadTypes.
Value * CreateSExt(Value *V, Type *DestTy, const Twine &Name="")
Definition IRBuilder.h:2142
Value * CreateLShr(Value *LHS, Value *RHS, const Twine &Name="", bool isExact=false)
Definition IRBuilder.h:1554
BasicBlock * GetInsertBlock() const
Definition IRBuilder.h:201
Value * CreateICmpNE(Value *LHS, Value *RHS, const Twine &Name="")
Definition IRBuilder.h:2388
ConstantInt * getInt64(uint64_t C)
Get a constant 64-bit value.
Definition IRBuilder.h:534
Value * CreateMaxNum(Value *LHS, Value *RHS, FMFSource FMFSource={}, const Twine &Name="")
Create call to the maxnum intrinsic.
Definition IRBuilder.h:1066
Value * CreateShl(Value *LHS, Value *RHS, const Twine &Name="", bool HasNUW=false, bool HasNSW=false)
Definition IRBuilder.h:1533
Value * CreateZExt(Value *V, Type *DestTy, const Twine &Name="", bool IsNonNeg=false)
Definition IRBuilder.h:2130
Value * CreateShuffleVector(Value *V1, Value *V2, Value *Mask, const Twine &Name="")
Definition IRBuilder.h:2659
Value * CreateMaximumNum(Value *LHS, Value *RHS, const Twine &Name="")
Create call to the maximumnum intrinsic.
Definition IRBuilder.h:1094
Value * CreateMinNum(Value *LHS, Value *RHS, FMFSource FMFSource={}, const Twine &Name="")
Create call to the minnum intrinsic.
Definition IRBuilder.h:1054
Value * CreateAdd(Value *LHS, Value *RHS, const Twine &Name="", bool HasNUW=false, bool HasNSW=false)
Definition IRBuilder.h:1444
CallInst * CreateCall(FunctionType *FTy, Value *Callee, ArrayRef< Value * > Args={}, const Twine &Name="", MDNode *FPMathTag=nullptr)
Definition IRBuilder.h:2563
void SetInsertPoint(BasicBlock *TheBB)
This specifies that created instructions should be appended to the end of the specified block.
Definition IRBuilder.h:207
Value * CreateFAddFMF(Value *L, Value *R, FMFSource FMFSource, const Twine &Name="", MDNode *FPMD=nullptr)
Definition IRBuilder.h:1663
Value * CreateMinimumNum(Value *LHS, Value *RHS, const Twine &Name="")
Create call to the minimumnum intrinsic.
Definition IRBuilder.h:1088
Value * CreateAShr(Value *LHS, Value *RHS, const Twine &Name="", bool isExact=false)
Definition IRBuilder.h:1573
Value * CreateFMulFMF(Value *L, Value *R, FMFSource FMFSource, const Twine &Name="", MDNode *FPMD=nullptr)
Definition IRBuilder.h:1701
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
Definition IRBuilder.h:2868
The core instruction combiner logic.
const DataLayout & getDataLayout() const
virtual Instruction * eraseInstFromFunction(Instruction &I)=0
Combiner aware instruction erasure.
DominatorTree & getDominatorTree() const
Instruction * replaceInstUsesWith(Instruction &I, Value *V)
A combiner-aware RAUW-like routine.
virtual bool SimplifyDemandedBits(Instruction *I, unsigned OpNo, const APInt &DemandedMask, KnownBits &Known, const SimplifyQuery &Q, unsigned Depth=0)=0
IRBuilder< TargetFolder, IRBuilderInstCombineInserter > BuilderTy
An IRBuilder that automatically inserts new instructions into the worklist.
static Value * stripSignOnlyFPOps(Value *Val)
Ignore all operations which only change the sign of a value, returning the underlying magnitude value...
Instruction * replaceOperand(Instruction &I, unsigned OpNum, Value *V)
Replace operand of instruction and add old operand to the worklist.
const SimplifyQuery & getSimplifyQuery() const
LLVM_ABI Instruction * clone() const
Create a copy of 'this' instruction that is identical in all ways except the following:
LLVM_ABI void copyFastMathFlags(FastMathFlags FMF)
Convenience function for transferring all fast-math flag values to this instruction,...
LLVM_ABI void copyMetadata(const Instruction &SrcInst, ArrayRef< unsigned > WL=ArrayRef< unsigned >())
Copy metadata from SrcInst to this instruction.
Class to represent integer types.
A wrapper class for inspecting calls to intrinsic functions.
A Module instance is used to store all the information related to an LLVM module.
Definition Module.h:67
static LLVM_ABI PoisonValue * get(Type *T)
Static factory methods - Return a 'poison' object of the specified type.
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
reference emplace_back(ArgTypes &&... Args)
void resize(size_type N)
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
The instances of the Type class are immutable: once they are created, they are never changed.
Definition Type.h:46
bool isPointerTy() const
True if this is an instance of PointerType.
Definition Type.h:282
bool isFloatTy() const
Return true if this is 'float', a 32-bit IEEE fp type.
Definition Type.h:155
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
Definition Type.h:368
static LLVM_ABI IntegerType * getInt16Ty(LLVMContext &C)
Definition Type.cpp:308
bool isHalfTy() const
Return true if this is 'half', a 16-bit IEEE fp type.
Definition Type.h:144
LLVM_ABI Type * getWithNewType(Type *EltTy) const
Given vector type, change the element type, whilst keeping the old number of elements.
bool isFloatingPointTy() const
Return true if this is one of the floating-point types.
Definition Type.h:186
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition Type.h:257
static LLVM_ABI Type * getHalfTy(LLVMContext &C)
Definition Type.cpp:284
bool isVoidTy() const
Return true if this is 'void'.
Definition Type.h:141
static LLVM_ABI UndefValue * get(Type *T)
Static factory methods - Return an 'undef' object of the specified type.
A Use represents the edge between a Value definition and its users.
Definition Use.h:35
const Use & getOperandUse(unsigned i) const
Definition User.h:220
void setOperand(unsigned i, Value *Val)
Definition User.h:212
Value * getOperand(unsigned i) const
Definition User.h:207
LLVM Value Representation.
Definition Value.h:75
Type * getType() const
All values are typed, get the type of this value.
Definition Value.h:255
LLVM_ABI bool hasOneUser() const
Return true if there is exactly one user of this value.
Definition Value.cpp:162
LLVMContext & getContext() const
All values hold a context through their type.
Definition Value.h:258
LLVM_ABI void takeName(Value *V)
Transfer the name from V to this value.
Definition Value.cpp:399
const ParentTy * getParent() const
Definition ilist_node.h:34
CallInst * Call
Changed
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
LLVM_READONLY const MIMGOffsetMappingInfo * getMIMGOffsetMappingInfo(unsigned Offset)
uint8_t wmmaScaleF8F6F4FormatToNumRegs(unsigned Fmt)
const ImageDimIntrinsicInfo * getImageDimIntrinsicByBaseOpcode(unsigned BaseOpcode, unsigned Dim)
LLVM_READONLY const MIMGMIPMappingInfo * getMIMGMIPMappingInfo(unsigned MIP)
bool isArgPassedInSGPR(const Argument *A)
bool isIntrinsicAlwaysUniform(unsigned IntrID)
LLVM_READONLY const MIMGBiasMappingInfo * getMIMGBiasMappingInfo(unsigned Bias)
LLVM_READONLY const MIMGLZMappingInfo * getMIMGLZMappingInfo(unsigned L)
LLVM_READONLY const MIMGBaseOpcodeInfo * getMIMGBaseOpcodeInfo(unsigned BaseOpcode)
const ImageDimIntrinsicInfo * getImageDimIntrinsicInfo(unsigned Intr)
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
LLVM_ABI Function * getOrInsertDeclaration(Module *M, ID id, ArrayRef< Type * > OverloadTys={})
Look up the Function declaration of the intrinsic id in the Module M.
LLVM_ABI bool isSignatureValid(Intrinsic::ID ID, FunctionType *FT, SmallVectorImpl< Type * > &OverloadTys, raw_ostream &OS=nulls())
Returns true if FT is a valid function type for intrinsic ID.
OneUse_match< SubPat > m_OneUse(const SubPat &SP)
cst_pred_ty< is_all_ones > m_AllOnes()
Match an integer or vector with all bits set.
auto m_Cmp()
Match any compare instruction and ignore it.
bool match(Val *V, const Pattern &P)
cstfp_pred_ty< is_any_zero_fp > m_AnyZeroFP()
Match a floating-point negative zero or positive zero.
ap_match< APFloat > m_APFloat(const APFloat *&Res)
Match a ConstantFP or splatted ConstantVector, binding the specified pointer to the contained APFloat...
cst_pred_ty< is_one > m_One()
Match an integer 1 or a vector with all elements equal to 1.
IntrinsicID_match m_Intrinsic()
Match intrinsic calls like this: m_Intrinsic<Intrinsic::fabs>(m_Value(X))
auto m_Value()
Match an arbitrary value and ignore it.
CastInst_match< OpTy, FPExtInst > m_FPExt(const OpTy &Op)
CastInst_match< OpTy, ZExtInst > m_ZExt(const OpTy &Op)
Matches ZExt.
cstfp_pred_ty< is_finitenonzero > m_FiniteNonZero()
Match a finite non-zero FP constant.
match_combine_or< CastInst_match< OpTy, ZExtInst >, CastInst_match< OpTy, SExtInst > > m_ZExtOrSExt(const OpTy &Op)
auto m_ConstantFP()
Match an arbitrary ConstantFP and ignore it.
CastInst_match< OpTy, SExtInst > m_SExt(const OpTy &Op)
Matches SExt.
is_zero m_Zero()
Match any null constant or a vector with all elements equal to 0.
auto m_ConstantInt()
Match an arbitrary ConstantInt and ignore it.
This is an optimization pass for GlobalISel generic memory operations.
@ Offset
Definition DWP.cpp:558
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:643
LLVM_ABI Constant * ConstantFoldCompareInstOperands(unsigned Predicate, Constant *LHS, Constant *RHS, const DataLayout &DL, const TargetLibraryInfo *TLI=nullptr, const Instruction *I=nullptr)
Attempt to constant fold a compare instruction (icmp/fcmp) with the specified operands.
constexpr int popcount(T Value) noexcept
Count the number of set bits in a value.
Definition bit.h:156
APFloat frexp(const APFloat &X, int &Exp, APFloat::roundingMode RM)
Equivalent of C standard library function.
Definition APFloat.h:1652
auto dyn_cast_or_null(const Y &Val)
Definition Casting.h:753
LLVM_READONLY APFloat maxnum(const APFloat &A, const APFloat &B)
Implements IEEE-754 2008 maxNum semantics.
Definition APFloat.h:1695
constexpr unsigned MaxAnalysisRecursionDepth
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
Definition MathExtras.h:279
APFloat scalbn(APFloat X, int Exp, APFloat::roundingMode RM)
Returns: X * 2^Exp for integral exponents.
Definition APFloat.h:1640
constexpr uint32_t Hi_32(uint64_t Value)
Return the high 32 bits of a 64 bit value.
Definition MathExtras.h:150
constexpr uint32_t Lo_32(uint64_t Value)
Return the low 32 bits of a 64 bit value.
Definition MathExtras.h:155
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition Casting.h:547
constexpr int PoisonMaskElem
@ FMul
Product of floats.
@ FAdd
Sum of floats.
LLVM_ABI Value * findScalarElement(Value *V, unsigned EltNo)
Given a vector and an element number, see if the scalar value is already around as a register,...
@ NearestTiesToEven
roundTiesToEven.
LLVM_ABI bool isKnownNeverInfOrNaN(const Value *V, const SimplifyQuery &SQ, unsigned Depth=0)
Return true if the floating-point value can never contain a NaN or infinity.
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:559
bool all_equal(std::initializer_list< T > Values)
Returns true if all Values in the initializer list are equal or the list is empty.
Definition STLExtras.h:2165
auto seq(T Begin, T End)
Iterate over an integral type from Begin up to - but not including - End.
Definition Sequence.h:305
constexpr T maskTrailingOnes(unsigned N)
Create a bitmask with the N right-most bits set to 1, and all other bits set to 0.
Definition MathExtras.h:77
LLVM_ABI Constant * ConstantFoldInstOperands(const Instruction *I, ArrayRef< Constant * > Ops, const DataLayout &DL, const TargetLibraryInfo *TLI=nullptr, bool AllowNonDeterministic=true)
ConstantFoldInstOperands - Attempt to constant fold an instruction with the specified operands.
constexpr uint64_t Make_64(uint32_t High, uint32_t Low)
Make a 64-bit integer from a high / low pair of 32-bit integers.
Definition MathExtras.h:160
LLVM_ABI ConstantRange computeConstantRange(const Value *V, bool ForSigned, const SimplifyQuery &SQ, unsigned Depth=0)
Determine the possible constant range of an integer or vector of integer value.
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition BitVector.h:862
#define N
bool isConstant() const
Returns true if we know the value of all bits.
Definition KnownBits.h:54
const APInt & getConstant() const
Returns the value when all bits have a known value.
Definition KnownBits.h:58
SimplifyQuery getWithInstruction(const Instruction *I) const
LLVM_ABI bool isUndefValue(Value *V) const
If CanUseUndef is true, returns whether V is undef.