LLVM 23.0.0git
AMDGPUInstCombineIntrinsic.cpp
Go to the documentation of this file.
1//===- AMDGPUInstCombineIntrinsic.cpp - AMDGPU specific InstCombine pass --===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// \file
10// This file implements a TargetTransformInfo analysis pass specific to the
11// AMDGPU target machine. It uses the target's detailed information to provide
12// more precise answers to certain TTI queries, while letting the target
13// independent and default TTI implementations handle the rest.
14//
15//===----------------------------------------------------------------------===//
16
17#include "AMDGPUInstrInfo.h"
19#include "GCNSubtarget.h"
21#include "llvm/IR/Dominators.h"
22#include "llvm/IR/IntrinsicsAMDGPU.h"
24#include <optional>
25
26using namespace llvm;
27using namespace llvm::PatternMatch;
28
29#define DEBUG_TYPE "AMDGPUtti"
30
31namespace {
32
// Entry type for the AMDGPUImageDMaskIntrinsic table whose implementation is
// pulled in right after this struct via GET_AMDGPUImageDMaskIntrinsicTable_IMPL
// and "AMDGPUGenSearchableTables.inc".
// NOTE(review): the table contents are generated elsewhere; presumably each
// entry names an image intrinsic that has a dmask operand -- confirm against
// the TableGen definition.
33struct AMDGPUImageDMaskIntrinsic {
// Intrinsic identifier, stored as a plain unsigned.
34 unsigned Intr;
35};
36
37#define GET_AMDGPUImageDMaskIntrinsicTable_IMPL
38#include "AMDGPUGenSearchableTables.inc"
39
40} // end anonymous namespace
41
42// Constant fold llvm.amdgcn.fmed3 intrinsics for standard inputs.
43//
44// A single NaN input is folded to minnum, so we rely on that folding for
45// handling NaNs.
46static APFloat fmed3AMDGCN(const APFloat &Src0, const APFloat &Src1,
47 const APFloat &Src2) {
48 APFloat Max3 = maxnum(maxnum(Src0, Src1), Src2);
49
50 APFloat::cmpResult Cmp0 = Max3.compare(Src0);
51 assert(Cmp0 != APFloat::cmpUnordered && "nans handled separately");
52 if (Cmp0 == APFloat::cmpEqual)
53 return maxnum(Src1, Src2);
54
55 APFloat::cmpResult Cmp1 = Max3.compare(Src1);
56 assert(Cmp1 != APFloat::cmpUnordered && "nans handled separately");
57 if (Cmp1 == APFloat::cmpEqual)
58 return maxnum(Src0, Src2);
59
60 return maxnum(Src0, Src1);
61}
62
63// Check if a value can be converted to a 16-bit value without losing
64// precision.
65// The value is expected to be either a float (IsFloat = true) or an unsigned
66// integer (IsFloat = false).
//
// Summary of the outcomes visible in the body:
//  - V is already half or i16                          -> false
//  - IsFloat and V is a ConstantFP                     -> !LosesInfo
//  - !IsFloat and V is a ConstantInt                   -> active bits <= 16
//  - V is an fpext (IsFloat) / zext (!IsFloat) whose
//    source type is half or i16                        -> true
//  - anything else                                     -> false
67static bool canSafelyConvertTo16Bit(Value &V, bool IsFloat) {
68 Type *VTy = V.getType();
69 if (VTy->isHalfTy() || VTy->isIntegerTy(16)) {
70 // The value is already 16-bit, so we don't want to convert to 16-bit again!
71 return false;
72 }
73 if (IsFloat) {
74 if (ConstantFP *ConstFloat = dyn_cast<ConstantFP>(&V)) {
75 // We need to check that if we cast the index down to a half, we do not
76 // lose precision.
77 APFloat FloatValue(ConstFloat->getValueAPF());
// Pessimistic default; the answer is the negation of this flag.
78 bool LosesInfo = true;
// NOTE(review): the start of the call that ends in '&LosesInfo);' is not
// visible in this listing. Presumably it converts FloatValue to half
// precision and records whether information was lost -- confirm.
80 &LosesInfo);
81 return !LosesInfo;
82 }
83 } else {
84 if (ConstantInt *ConstInt = dyn_cast<ConstantInt>(&V)) {
85 // We need to check that if we cast the index down to an i16, we do not
86 // lose precision.
87 APInt IntValue(ConstInt->getValue());
// Unsigned interpretation: the constant fits if it needs at most 16 bits.
88 return IntValue.getActiveBits() <= 16;
89 }
90 }
91
// Non-constant case: accept only an extension (fpext for floats, zext for
// integers) whose operand is already a 16-bit type.
92 Value *CastSrc;
93 bool IsExt = IsFloat ? match(&V, m_FPExt(PatternMatch::m_Value(CastSrc)))
94 : match(&V, m_ZExt(PatternMatch::m_Value(CastSrc)));
95 if (IsExt) {
96 Type *CastSrcTy = CastSrc->getType();
97 if (CastSrcTy->isHalfTy() || CastSrcTy->isIntegerTy(16))
98 return true;
99 }
100
101 return false;
102}
103
104// Convert a value to 16-bit.
106 Type *VTy = V.getType();
108 return cast<Instruction>(&V)->getOperand(0);
109 if (VTy->isIntegerTy())
110 return Builder.CreateIntCast(&V, Type::getInt16Ty(V.getContext()), false);
111 if (VTy->isFloatingPointTy())
112 return Builder.CreateFPCast(&V, Type::getHalfTy(V.getContext()));
113
114 llvm_unreachable("Should never be called!");
115}
116
117/// Applies Func(OldIntr.Args, OldIntr.ArgTys), creates intrinsic call with
118/// modified arguments (based on OldIntr) and replaces InstToReplace with
119/// this newly created intrinsic call.
///
/// \param OldIntr        Call whose arguments, name, metadata and (when the new
///                       call is an FPMathOperator) fast-math flags are reused.
/// \param InstToReplace  Instruction whose uses are redirected to the new call
///                       (unless it is void-typed) and which is then erased.
///                       May be OldIntr itself; if it is a different
///                       instruction, OldIntr is erased as well.
/// \param NewIntr        Intrinsic ID of the call to create.
/// \param IC             Combiner providing the builder and the
///                       replace/erase helpers.
/// \param Func           Callback that edits the argument list and the
///                       overloaded-type list in place before the new call is
///                       built.
/// \returns std::nullopt on the early-out path, otherwise the value returned
///          by IC.eraseInstFromFunction(InstToReplace).
120static std::optional<Instruction *> modifyIntrinsicCall(
121 IntrinsicInst &OldIntr, Instruction &InstToReplace, unsigned NewIntr,
122 InstCombiner &IC,
123 std::function<void(SmallVectorImpl<Value *> &, SmallVectorImpl<Type *> &)>
124 Func) {
// NOTE(review): the declaration of ArgTys and the condition guarding this
// early return are not visible in this listing. Presumably this is the failure
// path of querying OldIntr's overloaded-type signature -- confirm.
127 return std::nullopt;
128
129 SmallVector<Value *, 8> Args(OldIntr.args());
130
131 // Modify arguments and types
132 Func(Args, ArgTys);
133
134 CallInst *NewCall = IC.Builder.CreateIntrinsic(NewIntr, ArgTys, Args);
135 NewCall->takeName(&OldIntr);
136 NewCall->copyMetadata(OldIntr);
137 if (isa<FPMathOperator>(NewCall))
138 NewCall->copyFastMathFlags(&OldIntr);
139
140 // Erase and replace uses
141 if (!InstToReplace.getType()->isVoidTy())
142 IC.replaceInstUsesWith(InstToReplace, NewCall);
143
// Decide this before anything is erased, while both references are still
// valid to compare.
144 bool RemoveOldIntr = &OldIntr != &InstToReplace;
145
146 auto *RetValue = IC.eraseInstFromFunction(InstToReplace);
147 if (RemoveOldIntr)
148 IC.eraseInstFromFunction(OldIntr);
149
150 return RetValue;
151}
152
// Simplifies an image-dimension intrinsic call II described by ImageDimIntr.
// The rewrites are tried in this order, and the first that applies returns:
//   1. constant LOD that is zero or negative  -> variant without the LOD arg
//   2. constant mip level of zero             -> variant without the mip arg
//   3. constant bias of zero                  -> variant without the bias arg
//   4. constant offset of zero                -> variant without the offset arg
//   5. D16: fold fptrunc-to-half users into a half-returning call
//   6. A16/G16: narrow gradient/coordinate (and bias) operands to 16 bits
// Returns std::nullopt when none of them applies.
// NOTE(review): several lines of this definition (including the line carrying
// the function name, the remaining parameters, and the mapping-table lookups
// in each 'if') are not visible in this listing; comments below describe only
// the statements that are shown.
153static std::optional<Instruction *>
155 const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr,
157 // Optimize _L to _LZ when _L is zero
158 if (const auto *LZMappingInfo =
160 if (auto *ConstantLod =
161 dyn_cast<ConstantFP>(II.getOperand(ImageDimIntr->LodIndex))) {
// A negative constant LOD is accepted here as well as zero.
162 if (ConstantLod->isZero() || ConstantLod->isNegative()) {
163 const AMDGPU::ImageDimIntrinsicInfo *NewImageDimIntr =
165 ImageDimIntr->Dim);
166 return modifyIntrinsicCall(
167 II, II, NewImageDimIntr->Intr, IC, [&](auto &Args, auto &ArgTys) {
168 Args.erase(Args.begin() + ImageDimIntr->LodIndex);
169 });
170 }
171 }
172 }
173
174 // Optimize _mip away, when 'lod' is zero
175 if (const auto *MIPMappingInfo =
177 if (auto *ConstantMip =
178 dyn_cast<ConstantInt>(II.getOperand(ImageDimIntr->MipIndex))) {
179 if (ConstantMip->isZero()) {
180 const AMDGPU::ImageDimIntrinsicInfo *NewImageDimIntr =
181 AMDGPU::getImageDimIntrinsicByBaseOpcode(MIPMappingInfo->NONMIP,
182 ImageDimIntr->Dim);
183 return modifyIntrinsicCall(
184 II, II, NewImageDimIntr->Intr, IC, [&](auto &Args, auto &ArgTys) {
185 Args.erase(Args.begin() + ImageDimIntr->MipIndex);
186 });
187 }
188 }
189 }
190
191 // Optimize _bias away when 'bias' is zero
192 if (const auto *BiasMappingInfo =
194 if (auto *ConstantBias =
195 dyn_cast<ConstantFP>(II.getOperand(ImageDimIntr->BiasIndex))) {
196 if (ConstantBias->isZero()) {
197 const AMDGPU::ImageDimIntrinsicInfo *NewImageDimIntr =
198 AMDGPU::getImageDimIntrinsicByBaseOpcode(BiasMappingInfo->NoBias,
199 ImageDimIntr->Dim);
200 return modifyIntrinsicCall(
201 II, II, NewImageDimIntr->Intr, IC, [&](auto &Args, auto &ArgTys) {
// Unlike the other removals, bias also has an entry in the overloaded
// type list, so both the operand and its type are dropped.
202 Args.erase(Args.begin() + ImageDimIntr->BiasIndex);
203 ArgTys.erase(ArgTys.begin() + ImageDimIntr->BiasTyArg);
204 });
205 }
206 }
207 }
208
209 // Optimize _offset away when 'offset' is zero
210 if (const auto *OffsetMappingInfo =
212 if (auto *ConstantOffset =
213 dyn_cast<ConstantInt>(II.getOperand(ImageDimIntr->OffsetIndex))) {
214 if (ConstantOffset->isZero()) {
215 const AMDGPU::ImageDimIntrinsicInfo *NewImageDimIntr =
217 OffsetMappingInfo->NoOffset, ImageDimIntr->Dim);
218 return modifyIntrinsicCall(
219 II, II, NewImageDimIntr->Intr, IC, [&](auto &Args, auto &ArgTys) {
220 Args.erase(Args.begin() + ImageDimIntr->OffsetIndex);
221 });
222 }
223 }
224 }
225
226 // Try to use D16
227 if (ST->hasD16Images()) {
228
229 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
231
232 if (BaseOpcode->HasD16) {
233
234 // If the only use of image intrinsic is a fptrunc (with conversion to
235 // half) then both fptrunc and image intrinsic will be replaced with image
236 // intrinsic with D16 flag.
237 if (II.hasOneUse()) {
238 Instruction *User = II.user_back();
239
240 if (User->getOpcode() == Instruction::FPTrunc &&
242
// Here the fptrunc (User) is the instruction being replaced; II is erased
// afterwards by modifyIntrinsicCall because it differs from User.
243 return modifyIntrinsicCall(II, *User, ImageDimIntr->Intr, IC,
244 [&](auto &Args, auto &ArgTys) {
245 // Change return type of image intrinsic.
246 // Set it to return type of fptrunc.
247 ArgTys[0] = User->getType();
248 });
249 }
250 }
251
252 // Only perform D16 folding if every user of the image sample is
253 // an ExtractElementInst immediately followed by an FPTrunc to half.
255 ExtractTruncPairs;
256 bool AllHalfExtracts = true;
257
258 for (User *U : II.users()) {
259 auto *Ext = dyn_cast<ExtractElementInst>(U);
260 if (!Ext || !Ext->hasOneUse()) {
261 AllHalfExtracts = false;
262 break;
263 }
264
265 auto *Tr = dyn_cast<FPTruncInst>(*Ext->user_begin());
266 if (!Tr || !Tr->getType()->isHalfTy()) {
267 AllHalfExtracts = false;
268 break;
269 }
270
271 ExtractTruncPairs.emplace_back(Ext, Tr);
272 }
273
274 if (!ExtractTruncPairs.empty() && AllHalfExtracts) {
275 auto *VecTy = cast<VectorType>(II.getType());
276 Type *HalfVecTy =
277 VecTy->getWithNewType(Type::getHalfTy(II.getContext()));
278
279 // Obtain the original image sample intrinsic's signature
280 // and replace its return type with the half-vector for D16 folding
282 Intrinsic::getIntrinsicSignature(II.getCalledFunction(), SigTys);
283 SigTys[0] = HalfVecTy;
284
285 Module *M = II.getModule();
286 Function *HalfDecl =
287 Intrinsic::getOrInsertDeclaration(M, ImageDimIntr->Intr, SigTys);
288
// The call is updated in place: its result type becomes the half vector and
// it is retargeted at the half-returning declaration.
289 II.mutateType(HalfVecTy);
290 II.setCalledFunction(HalfDecl);
291
292 IRBuilder<> Builder(II.getContext());
293 for (auto &[Ext, Tr] : ExtractTruncPairs) {
294 Value *Idx = Ext->getIndexOperand();
295
296 Builder.SetInsertPoint(Tr);
297
298 Value *HalfExtract = Builder.CreateExtractElement(&II, Idx);
299 HalfExtract->takeName(Tr);
300
301 Tr->replaceAllUsesWith(HalfExtract);
302 }
303
// Erase each truncate before the extract that feeds it.
304 for (auto &[Ext, Tr] : ExtractTruncPairs) {
305 IC.eraseInstFromFunction(*Tr);
306 IC.eraseInstFromFunction(*Ext);
307 }
308
309 return &II;
310 }
311 }
312 }
313
314 // Try to use A16 or G16
315 if (!ST->hasA16() && !ST->hasG16())
316 return std::nullopt;
317
318 // Address is interpreted as float if the instruction has a sampler or as
319 // unsigned int if there is no sampler.
320 bool HasSampler =
322 bool FloatCoord = false;
323 // true means derivatives can be converted to 16 bit, coordinates not
324 bool OnlyDerivatives = false;
325
326 for (unsigned OperandIndex = ImageDimIntr->GradientStart;
327 OperandIndex < ImageDimIntr->VAddrEnd; OperandIndex++) {
328 Value *Coord = II.getOperand(OperandIndex);
329 // If the values are not derived from 16-bit values, we cannot optimize.
330 if (!canSafelyConvertTo16Bit(*Coord, HasSampler)) {
// Give up entirely if the failing operand is itself a gradient, or if the
// intrinsic has no gradient operands at all.
331 if (OperandIndex < ImageDimIntr->CoordStart ||
332 ImageDimIntr->GradientStart == ImageDimIntr->CoordStart) {
333 return std::nullopt;
334 }
335 // All gradients can be converted, so convert only them
336 OnlyDerivatives = true;
337 break;
338 }
339
340 assert(OperandIndex == ImageDimIntr->GradientStart ||
341 FloatCoord == Coord->getType()->isFloatingPointTy());
342 FloatCoord = Coord->getType()->isFloatingPointTy();
343 }
344
345 if (!OnlyDerivatives && !ST->hasA16())
346 OnlyDerivatives = true; // Only supports G16
347
348 // Check if there is a bias parameter and if it can be converted to f16
349 if (!OnlyDerivatives && ImageDimIntr->NumBiasArgs != 0) {
350 Value *Bias = II.getOperand(ImageDimIntr->BiasIndex);
351 assert(HasSampler &&
352 "Only image instructions with a sampler can have a bias");
353 if (!canSafelyConvertTo16Bit(*Bias, HasSampler))
354 OnlyDerivatives = true;
355 }
356
// Converting only derivatives needs G16 support and at least one gradient
// operand; otherwise there is nothing to do.
357 if (OnlyDerivatives && (!ST->hasG16() || ImageDimIntr->GradientStart ==
358 ImageDimIntr->CoordStart))
359 return std::nullopt;
360
361 Type *CoordType = FloatCoord ? Type::getHalfTy(II.getContext())
362 : Type::getInt16Ty(II.getContext());
363
364 return modifyIntrinsicCall(
365 II, II, II.getIntrinsicID(), IC, [&](auto &Args, auto &ArgTys) {
366 ArgTys[ImageDimIntr->GradientTyArg] = CoordType;
367 if (!OnlyDerivatives) {
368 ArgTys[ImageDimIntr->CoordTyArg] = CoordType;
369
370 // Change the bias type
371 if (ImageDimIntr->NumBiasArgs != 0)
372 ArgTys[ImageDimIntr->BiasTyArg] = Type::getHalfTy(II.getContext());
373 }
374
375 unsigned EndIndex =
376 OnlyDerivatives ? ImageDimIntr->CoordStart : ImageDimIntr->VAddrEnd;
377 for (unsigned OperandIndex = ImageDimIntr->GradientStart;
378 OperandIndex < EndIndex; OperandIndex++) {
379 Args[OperandIndex] =
380 convertTo16Bit(*II.getOperand(OperandIndex), IC.Builder);
381 }
382
383 // Convert the bias
384 if (!OnlyDerivatives && ImageDimIntr->NumBiasArgs != 0) {
385 Value *Bias = II.getOperand(ImageDimIntr->BiasIndex);
386 Args[ImageDimIntr->BiasIndex] = convertTo16Bit(*Bias, IC.Builder);
387 }
388 });
389}
390
392 const Value *Op0, const Value *Op1,
393 InstCombiner &IC) const {
394 // The legacy behaviour is that multiplying +/-0.0 by anything, even NaN or
395 // infinity, gives +0.0. If we can prove we don't have one of the special
396 // cases then we can use a normal multiply instead.
397 // TODO: Create and use isKnownFiniteNonZero instead of just matching
398 // constants here.
401 // One operand is not zero or infinity or NaN.
402 return true;
403 }
404
406 if (isKnownNeverInfOrNaN(Op0, SQ) && isKnownNeverInfOrNaN(Op1, SQ)) {
407 // Neither operand is infinity or NaN.
408 return true;
409 }
410 return false;
411}
412
413/// Match an fpext from half to float, or a constant we can convert.
415 Value *Src = nullptr;
416 ConstantFP *CFP = nullptr;
417 if (match(Arg, m_OneUse(m_FPExt(m_Value(Src))))) {
418 if (Src->getType()->isHalfTy())
419 return Src;
420 } else if (match(Arg, m_ConstantFP(CFP))) {
421 bool LosesInfo;
422 APFloat Val(CFP->getValueAPF());
424 if (!LosesInfo)
425 return ConstantFP::get(Type::getHalfTy(Arg->getContext()), Val);
426 }
427 return nullptr;
428}
429
430// Trim all zero components from the end of the vector \p UseV and return
431// an appropriate bitset with known elements.
433 Instruction *I) {
434 auto *VTy = cast<FixedVectorType>(UseV->getType());
435 unsigned VWidth = VTy->getNumElements();
436 APInt DemandedElts = APInt::getAllOnes(VWidth);
437
438 for (int i = VWidth - 1; i > 0; --i) {
439 auto *Elt = findScalarElement(UseV, i);
440 if (!Elt)
441 break;
442
443 if (auto *ConstElt = dyn_cast<Constant>(Elt)) {
444 if (!ConstElt->isNullValue() && !isa<UndefValue>(Elt))
445 break;
446 } else {
447 break;
448 }
449
450 DemandedElts.clearBit(i);
451 }
452
453 return DemandedElts;
454}
455
456// Trim elements of the end of the vector \p V, if they are
457// equal to the first element of the vector.
459 auto *VTy = cast<FixedVectorType>(V->getType());
460 unsigned VWidth = VTy->getNumElements();
461 APInt DemandedElts = APInt::getAllOnes(VWidth);
462 Value *FirstComponent = findScalarElement(V, 0);
463
464 SmallVector<int> ShuffleMask;
465 if (auto *SVI = dyn_cast<ShuffleVectorInst>(V))
466 SVI->getShuffleMask(ShuffleMask);
467
468 for (int I = VWidth - 1; I > 0; --I) {
469 if (ShuffleMask.empty()) {
470 auto *Elt = findScalarElement(V, I);
471 if (!Elt || (Elt != FirstComponent && !isa<UndefValue>(Elt)))
472 break;
473 } else {
474 // Detect identical elements in the shufflevector result, even though
475 // findScalarElement cannot tell us what that element is.
476 if (ShuffleMask[I] != ShuffleMask[0] && ShuffleMask[I] != PoisonMaskElem)
477 break;
478 }
479 DemandedElts.clearBit(I);
480 }
481
482 return DemandedElts;
483}
484
487 APInt DemandedElts,
488 int DMaskIdx = -1,
489 bool IsLoad = true);
490
491/// Return true if it's legal to contract llvm.amdgcn.rcp(llvm.sqrt)
492static bool canContractSqrtToRsq(const FPMathOperator *SqrtOp) {
493 return (SqrtOp->getType()->isFloatTy() &&
494 (SqrtOp->hasApproxFunc() || SqrtOp->getFPAccuracy() >= 1.0f)) ||
495 SqrtOp->getType()->isHalfTy();
496}
497
498/// Return true if we can easily prove that use U is uniform.
///
/// Cases handled by the visible code:
///  - the used value is a Constant: uniform;
///  - the used value is a call to an intrinsic for which
///    AMDGPU::isIntrinsicAlwaysUniform() holds AND that call is in the same
///    basic block as the user of U: uniform;
///  - everything else: not proven, returns false.
/// The user of U must be an Instruction (it is cast<> unconditionally in the
/// intrinsic case).
499static bool isTriviallyUniform(const Use &U) {
500 Value *V = U.get();
501 if (isa<Constant>(V))
502 return true;
503 if (const auto *A = dyn_cast<Argument>(V))
// NOTE(review): the statement controlled by the Argument check above is not
// visible in this listing; presumably it returns a per-argument uniformity
// answer -- confirm against the full source.
505 if (const auto *II = dyn_cast<IntrinsicInst>(V)) {
506 if (!AMDGPU::isIntrinsicAlwaysUniform(II->getIntrinsicID()))
507 return false;
508 // If II and U are in different blocks then there is a possibility of
509 // temporal divergence.
510 return II->getParent() == cast<Instruction>(U.getUser())->getParent();
511 }
512 return false;
513}
514
515/// Simplify a lane index operand (e.g. llvm.amdgcn.readlane src1).
516///
517/// The instruction only reads the low 5 bits for wave32, and 6 bits for wave64.
520 unsigned LaneArgIdx) const {
521 unsigned MaskBits = ST->getWavefrontSizeLog2();
522 APInt DemandedMask(32, maskTrailingOnes<unsigned>(MaskBits));
523
524 KnownBits Known(32);
525 if (IC.SimplifyDemandedBits(&II, LaneArgIdx, DemandedMask, Known))
526 return true;
527
528 if (!Known.isConstant())
529 return false;
530
531 // Out of bounds indexes may appear in wave64 code compiled for wave32.
532 // Unlike the DAG version, SimplifyDemandedBits does not change constants, so
533 // manually fix it up.
534
535 Value *LaneArg = II.getArgOperand(LaneArgIdx);
536 Constant *MaskedConst =
537 ConstantInt::get(LaneArg->getType(), Known.getConstant() & DemandedMask);
538 if (MaskedConst != LaneArg) {
539 II.getOperandUse(LaneArgIdx).set(MaskedConst);
540 return true;
541 }
542
543 return false;
544}
545
547 Function &NewCallee, ArrayRef<Value *> Ops) {
549 Old.getOperandBundlesAsDefs(OpBundles);
550
551 CallInst *NewCall = B.CreateCall(&NewCallee, Ops, OpBundles);
552 NewCall->takeName(&Old);
553 return NewCall;
554}
555
556// Return true for sequences of instructions that effectively assign
557// each lane to its thread ID
//
// The wave32 pattern is only accepted when ST.isWave32(), and the wave64
// pattern only when ST.isWave64(); any other value yields false.
558static bool isThreadID(const GCNSubtarget &ST, Value *V) {
559 // Case 1:
560 // wave32: mbcnt_lo(-1, 0)
561 // wave64: mbcnt_hi(-1, mbcnt_lo(-1, 0))
// NOTE(review): the definitions of W32Pred and W64Pred are not visible in
// this listing; presumably they are the PatternMatch matchers for the two
// call shapes described above -- confirm.
567 if (ST.isWave32() && match(V, W32Pred))
568 return true;
569 if (ST.isWave64() && match(V, W64Pred))
570 return true;
571
572 return false;
573}
574
575// Attempt to capture situations where the index argument matches
576// a DPP pattern, and convert to a DPP-based mov
//
// Operand 0 of II is the value being shuffled and operand 1 is the lane index.
// When the index has one of the row-share shapes below (built on a thread-ID
// value as recognised by isThreadID), II's uses are replaced by a call to
// llvm.amdgcn.update.dpp; otherwise std::nullopt is returned.
// NOTE(review): the line carrying the function name and parameter list is not
// visible in this listing; the body uses ST, IC and II.
577static std::optional<Instruction *>
579 Value *Val = II.getArgOperand(0);
580 Value *Idx = II.getArgOperand(1);
581 auto &B = IC.Builder;
582
583 // DPP16 Row Share requires known wave size, architecture support
584 if (!ST.isWaveSizeKnown() || !ST.hasDPPRowShare())
585 return std::nullopt;
586
// Tid, Mask and RowIdx are only given values by a successful match() below
// (RowIdx is also assigned directly in the row-share 0 and 15 cases).
587 Value *Tid;
588 uint64_t Mask;
589 uint64_t RowIdx;
590 bool CanDPP16RowShare = false;
591
592 // wave32 requires Mask & 0x1F == 0x10
593 // wave64 requires Mask & 0x3F == 0x30
// NOTE(review): '1UL' is only 32 bits wide on LLP64 targets. That is harmless
// for the shift amounts implied by the comment above (5 or 6), but
// 'uint64_t{1}' would state the intended width explicitly -- consider changing.
594 uint64_t MaskCheck = (1UL << ST.getWavefrontSizeLog2()) - 1;
595 uint64_t MaskTarget = MaskCheck & 0xF0;
596
597 // DPP16 Row Share 0: Idx = Tid & Mask
598 auto RowShare0Pred = m_And(m_Value(Tid), m_ConstantInt(Mask));
599
600 // DPP16 Row Share (0 < Row < 15): Idx = (Tid & Mask) | RowIdx
601 auto RowSharePred =
602 m_Or(m_And(m_Value(Tid), m_ConstantInt(Mask)), m_ConstantInt(RowIdx));
603
604 // DPP16 Row Share 15: Idx = Tid | 0xF
605 auto RowShare15Pred = m_Or(m_Value(Tid), m_ConstantInt<0xF>());
606
// Note: once a shape has matched, a failed mask check returns immediately
// instead of falling through to the remaining shapes.
607 if (match(Idx, RowShare0Pred) && isThreadID(ST, Tid)) {
608 if ((Mask & MaskCheck) != MaskTarget)
609 return std::nullopt;
610
611 RowIdx = 0;
612 CanDPP16RowShare = true;
613 } else if (match(Idx, RowSharePred) && isThreadID(ST, Tid) && RowIdx < 15 &&
614 RowIdx > 0) {
615 if ((Mask & MaskCheck) != MaskTarget)
616 return std::nullopt;
617
618 CanDPP16RowShare = true;
619 } else if (match(Idx, RowShare15Pred) && isThreadID(ST, Tid)) {
620 RowIdx = 15;
621 CanDPP16RowShare = true;
622 }
623
624 if (CanDPP16RowShare) {
// The DPP control value is ROW_SHARE0 combined with the selected row index.
// NOTE(review): the remaining operands (0xF, 0xF, false) are presumably the
// row mask, bank mask and bound-control flag -- confirm against the intrinsic
// definition.
625 CallInst *UpdateDPP =
626 B.CreateIntrinsic(Intrinsic::amdgcn_update_dpp, Val->getType(),
627 {PoisonValue::get(Val->getType()), Val,
628 B.getInt32(AMDGPU::DPP::ROW_SHARE0 | RowIdx),
629 B.getInt32(0xF), B.getInt32(0xF), B.getFalse()});
630 UpdateDPP->takeName(&II);
631 UpdateDPP->copyMetadata(II);
632 return IC.replaceInstUsesWith(II, UpdateDPP);
633 }
634
635 // No valid DPP detected
636 return std::nullopt;
637}
638
641 IntrinsicInst &II) const {
642 const auto IID = II.getIntrinsicID();
643 assert(IID == Intrinsic::amdgcn_readlane ||
644 IID == Intrinsic::amdgcn_readfirstlane ||
645 IID == Intrinsic::amdgcn_permlane64);
646
647 Instruction *OpInst = dyn_cast<Instruction>(II.getOperand(0));
648
649 // Only do this if both instructions are in the same block
650 // (so the exec mask won't change) and the readlane is the only user of its
651 // operand.
652 if (!OpInst || !OpInst->hasOneUser() || OpInst->getParent() != II.getParent())
653 return nullptr;
654
655 const bool IsReadLane = (IID == Intrinsic::amdgcn_readlane);
656
657 // If this is a readlane, check that the second operand is a constant, or is
658 // defined before OpInst so we know it's safe to move this intrinsic higher.
659 Value *LaneID = nullptr;
660 if (IsReadLane) {
661 LaneID = II.getOperand(1);
662
663 // readlane take an extra operand for the lane ID, so we must check if that
664 // LaneID value can be used at the point where we want to move the
665 // intrinsic.
666 if (auto *LaneIDInst = dyn_cast<Instruction>(LaneID)) {
667 if (!IC.getDominatorTree().dominates(LaneIDInst, OpInst))
668 return nullptr;
669 }
670 }
671
672 // Hoist the intrinsic (II) through OpInst.
673 //
674 // (II (OpInst x)) -> (OpInst (II x))
675 const auto DoIt = [&](unsigned OpIdx,
676 Function *NewIntrinsic) -> Instruction * {
678 if (IsReadLane)
679 Ops.push_back(LaneID);
680
681 // Rewrite the intrinsic call.
682 CallInst *NewII = rewriteCall(IC.Builder, II, *NewIntrinsic, Ops);
683
684 // Rewrite OpInst so it takes the result of the intrinsic now.
685 Instruction &NewOp = *OpInst->clone();
686 NewOp.setOperand(OpIdx, NewII);
687 return &NewOp;
688 };
689
690 // TODO(?): Should we do more with permlane64?
691 if (IID == Intrinsic::amdgcn_permlane64 && !isa<BitCastInst>(OpInst))
692 return nullptr;
693
694 if (isa<UnaryOperator>(OpInst))
695 return DoIt(0, II.getCalledFunction());
696
697 if (isa<CastInst>(OpInst)) {
698 Value *Src = OpInst->getOperand(0);
699 Type *SrcTy = Src->getType();
700 if (!isTypeLegal(SrcTy))
701 return nullptr;
702
703 Function *Remangled =
704 Intrinsic::getOrInsertDeclaration(II.getModule(), IID, {SrcTy});
705 return DoIt(0, Remangled);
706 }
707
708 // We can also hoist through binary operators if the other operand is uniform.
709 if (isa<BinaryOperator>(OpInst)) {
710 // FIXME: If we had access to UniformityInfo here we could just check
711 // if the operand is uniform.
712 if (isTriviallyUniform(OpInst->getOperandUse(0)))
713 return DoIt(1, II.getCalledFunction());
714 if (isTriviallyUniform(OpInst->getOperandUse(1)))
715 return DoIt(0, II.getCalledFunction());
716 }
717
718 return nullptr;
719}
720
721std::optional<Instruction *>
723 Intrinsic::ID IID = II.getIntrinsicID();
724 switch (IID) {
725 case Intrinsic::amdgcn_implicitarg_ptr: {
726 if (II.getFunction()->hasFnAttribute("amdgpu-no-implicitarg-ptr"))
727 return IC.replaceInstUsesWith(II, PoisonValue::get(II.getType()));
728 uint64_t ImplicitArgBytes = ST->getImplicitArgNumBytes(*II.getFunction());
729
730 uint64_t CurrentOrNullBytes =
731 II.getAttributes().getRetDereferenceableOrNullBytes();
732 if (CurrentOrNullBytes != 0) {
733 // Refine "dereferenceable (A) meets dereferenceable_or_null(B)"
734 // into dereferenceable(max(A, B))
735 uint64_t NewBytes = std::max(CurrentOrNullBytes, ImplicitArgBytes);
736 II.addRetAttr(
737 Attribute::getWithDereferenceableBytes(II.getContext(), NewBytes));
738 II.removeRetAttr(Attribute::DereferenceableOrNull);
739 return &II;
740 }
741
742 uint64_t CurrentBytes = II.getAttributes().getRetDereferenceableBytes();
743 uint64_t NewBytes = std::max(CurrentBytes, ImplicitArgBytes);
744 if (NewBytes != CurrentBytes) {
745 II.addRetAttr(
746 Attribute::getWithDereferenceableBytes(II.getContext(), NewBytes));
747 return &II;
748 }
749
750 return std::nullopt;
751 }
752 case Intrinsic::amdgcn_rcp: {
753 Value *Src = II.getArgOperand(0);
754 if (isa<PoisonValue>(Src))
755 return IC.replaceInstUsesWith(II, Src);
756
757 // TODO: Move to ConstantFolding/InstSimplify?
758 if (isa<UndefValue>(Src)) {
759 Type *Ty = II.getType();
760 auto *QNaN = ConstantFP::get(Ty, APFloat::getQNaN(Ty->getFltSemantics()));
761 return IC.replaceInstUsesWith(II, QNaN);
762 }
763
764 if (II.isStrictFP())
765 break;
766
767 if (const ConstantFP *C = dyn_cast<ConstantFP>(Src)) {
768 const APFloat &ArgVal = C->getValueAPF();
769 APFloat Val(ArgVal.getSemantics(), 1);
771
772 // This is more precise than the instruction may give.
773 //
774 // TODO: The instruction always flushes denormal results (except for f16),
775 // should this also?
776 return IC.replaceInstUsesWith(II, ConstantFP::get(II.getContext(), Val));
777 }
778
779 FastMathFlags FMF = cast<FPMathOperator>(II).getFastMathFlags();
780 if (!FMF.allowContract())
781 break;
782 auto *SrcCI = dyn_cast<IntrinsicInst>(Src);
783 if (!SrcCI)
784 break;
785
786 auto IID = SrcCI->getIntrinsicID();
787 // llvm.amdgcn.rcp(llvm.amdgcn.sqrt(x)) -> llvm.amdgcn.rsq(x) if contractable
788 //
789 // llvm.amdgcn.rcp(llvm.sqrt(x)) -> llvm.amdgcn.rsq(x) if contractable and
790 // relaxed.
791 if (IID == Intrinsic::amdgcn_sqrt || IID == Intrinsic::sqrt) {
792 const FPMathOperator *SqrtOp = cast<FPMathOperator>(SrcCI);
793 FastMathFlags InnerFMF = SqrtOp->getFastMathFlags();
794 if (!InnerFMF.allowContract() || !SrcCI->hasOneUse())
795 break;
796
797 if (IID == Intrinsic::sqrt && !canContractSqrtToRsq(SqrtOp))
798 break;
799
801 SrcCI->getModule(), Intrinsic::amdgcn_rsq, {SrcCI->getType()});
802
803 InnerFMF |= FMF;
804 II.setFastMathFlags(InnerFMF);
805
806 II.setCalledFunction(NewDecl);
807 return IC.replaceOperand(II, 0, SrcCI->getArgOperand(0));
808 }
809
810 break;
811 }
812 case Intrinsic::amdgcn_sqrt:
813 case Intrinsic::amdgcn_rsq:
814 case Intrinsic::amdgcn_tanh: {
815 Value *Src = II.getArgOperand(0);
816 if (isa<PoisonValue>(Src))
817 return IC.replaceInstUsesWith(II, Src);
818
819 // TODO: Move to ConstantFolding/InstSimplify?
820 if (isa<UndefValue>(Src)) {
821 Type *Ty = II.getType();
822 auto *QNaN = ConstantFP::get(Ty, APFloat::getQNaN(Ty->getFltSemantics()));
823 return IC.replaceInstUsesWith(II, QNaN);
824 }
825
826 // f16 amdgcn.sqrt is identical to regular sqrt.
827 if (IID == Intrinsic::amdgcn_sqrt && Src->getType()->isHalfTy()) {
829 II.getModule(), Intrinsic::sqrt, {II.getType()});
830 II.setCalledFunction(NewDecl);
831 return &II;
832 }
833
834 break;
835 }
836 case Intrinsic::amdgcn_log:
837 case Intrinsic::amdgcn_exp2: {
838 const bool IsLog = IID == Intrinsic::amdgcn_log;
839 const bool IsExp = IID == Intrinsic::amdgcn_exp2;
840 Value *Src = II.getArgOperand(0);
841 Type *Ty = II.getType();
842
843 if (isa<PoisonValue>(Src))
844 return IC.replaceInstUsesWith(II, Src);
845
846 if (IC.getSimplifyQuery().isUndefValue(Src))
848
849 if (ConstantFP *C = dyn_cast<ConstantFP>(Src)) {
850 if (C->isInfinity()) {
851 // exp2(+inf) -> +inf
852 // log2(+inf) -> +inf
853 if (!C->isNegative())
854 return IC.replaceInstUsesWith(II, C);
855
856 // exp2(-inf) -> 0
857 if (IsExp && C->isNegative())
859 }
860
861 if (II.isStrictFP())
862 break;
863
864 if (C->isNaN()) {
865 Constant *Quieted = ConstantFP::get(Ty, C->getValue().makeQuiet());
866 return IC.replaceInstUsesWith(II, Quieted);
867 }
868
869 // f32 instruction doesn't handle denormals, f16 does.
870 if (C->isZero() || (C->getValue().isDenormal() && Ty->isFloatTy())) {
871 Constant *FoldedValue = IsLog ? ConstantFP::getInfinity(Ty, true)
872 : ConstantFP::get(Ty, 1.0);
873 return IC.replaceInstUsesWith(II, FoldedValue);
874 }
875
876 if (IsLog && C->isNegative())
878
879 // TODO: Full constant folding matching hardware behavior.
880 }
881
882 break;
883 }
884 case Intrinsic::amdgcn_frexp_mant:
885 case Intrinsic::amdgcn_frexp_exp: {
886 Value *Src = II.getArgOperand(0);
887 if (const ConstantFP *C = dyn_cast<ConstantFP>(Src)) {
888 int Exp;
889 APFloat Significand =
890 frexp(C->getValueAPF(), Exp, APFloat::rmNearestTiesToEven);
891
892 if (IID == Intrinsic::amdgcn_frexp_mant) {
893 return IC.replaceInstUsesWith(
894 II, ConstantFP::get(II.getContext(), Significand));
895 }
896
897 // Match instruction special case behavior.
898 if (Exp == APFloat::IEK_NaN || Exp == APFloat::IEK_Inf)
899 Exp = 0;
900
901 return IC.replaceInstUsesWith(II,
902 ConstantInt::getSigned(II.getType(), Exp));
903 }
904
905 if (isa<PoisonValue>(Src))
906 return IC.replaceInstUsesWith(II, PoisonValue::get(II.getType()));
907
908 if (isa<UndefValue>(Src)) {
909 return IC.replaceInstUsesWith(II, UndefValue::get(II.getType()));
910 }
911
912 break;
913 }
914 case Intrinsic::amdgcn_class: {
915 Value *Src0 = II.getArgOperand(0);
916 Value *Src1 = II.getArgOperand(1);
917 const ConstantInt *CMask = dyn_cast<ConstantInt>(Src1);
918 if (CMask) {
919 II.setCalledOperand(Intrinsic::getOrInsertDeclaration(
920 II.getModule(), Intrinsic::is_fpclass, Src0->getType()));
921
922 // Clamp any excess bits, as they're illegal for the generic intrinsic.
923 II.setArgOperand(1, ConstantInt::get(Src1->getType(),
924 CMask->getZExtValue() & fcAllFlags));
925 return &II;
926 }
927
928 // Propagate poison.
929 if (isa<PoisonValue>(Src0) || isa<PoisonValue>(Src1))
930 return IC.replaceInstUsesWith(II, PoisonValue::get(II.getType()));
931
932 // llvm.amdgcn.class(_, undef) -> false
933 if (IC.getSimplifyQuery().isUndefValue(Src1))
934 return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), false));
935
936 // llvm.amdgcn.class(undef, mask) -> mask != 0
937 if (IC.getSimplifyQuery().isUndefValue(Src0)) {
938 Value *CmpMask = IC.Builder.CreateICmpNE(
939 Src1, ConstantInt::getNullValue(Src1->getType()));
940 return IC.replaceInstUsesWith(II, CmpMask);
941 }
942 break;
943 }
944 case Intrinsic::amdgcn_cvt_pkrtz: {
945 auto foldFPTruncToF16RTZ = [](Value *Arg) -> Value * {
946 Type *HalfTy = Type::getHalfTy(Arg->getContext());
947
948 if (isa<PoisonValue>(Arg))
949 return PoisonValue::get(HalfTy);
950 if (isa<UndefValue>(Arg))
951 return UndefValue::get(HalfTy);
952
953 ConstantFP *CFP = nullptr;
954 if (match(Arg, m_ConstantFP(CFP))) {
955 bool LosesInfo;
956 APFloat Val(CFP->getValueAPF());
958 return ConstantFP::get(HalfTy, Val);
959 }
960
961 Value *Src = nullptr;
962 if (match(Arg, m_FPExt(m_Value(Src)))) {
963 if (Src->getType()->isHalfTy())
964 return Src;
965 }
966
967 return nullptr;
968 };
969
970 if (Value *Src0 = foldFPTruncToF16RTZ(II.getArgOperand(0))) {
971 if (Value *Src1 = foldFPTruncToF16RTZ(II.getArgOperand(1))) {
972 Value *V = PoisonValue::get(II.getType());
973 V = IC.Builder.CreateInsertElement(V, Src0, (uint64_t)0);
974 V = IC.Builder.CreateInsertElement(V, Src1, (uint64_t)1);
975 return IC.replaceInstUsesWith(II, V);
976 }
977 }
978
979 break;
980 }
981 case Intrinsic::amdgcn_cvt_pknorm_i16:
982 case Intrinsic::amdgcn_cvt_pknorm_u16:
983 case Intrinsic::amdgcn_cvt_pk_i16:
984 case Intrinsic::amdgcn_cvt_pk_u16: {
985 Value *Src0 = II.getArgOperand(0);
986 Value *Src1 = II.getArgOperand(1);
987
988 // TODO: Replace call with scalar operation if only one element is poison.
989 if (isa<PoisonValue>(Src0) && isa<PoisonValue>(Src1))
990 return IC.replaceInstUsesWith(II, PoisonValue::get(II.getType()));
991
992 if (isa<UndefValue>(Src0) && isa<UndefValue>(Src1)) {
993 return IC.replaceInstUsesWith(II, UndefValue::get(II.getType()));
994 }
995
996 break;
997 }
998 case Intrinsic::amdgcn_cvt_off_f32_i4: {
999 Value* Arg = II.getArgOperand(0);
1000 Type *Ty = II.getType();
1001
1002 if (isa<PoisonValue>(Arg))
1003 return IC.replaceInstUsesWith(II, PoisonValue::get(Ty));
1004
1005 if(IC.getSimplifyQuery().isUndefValue(Arg))
1007
1008 ConstantInt *CArg = dyn_cast<ConstantInt>(II.getArgOperand(0));
1009 if (!CArg)
1010 break;
1011
1012 // Tabulated 0.0625 * (sext (CArg & 0xf)).
1013 constexpr size_t ResValsSize = 16;
1014 static constexpr float ResVals[ResValsSize] = {
1015 0.0, 0.0625, 0.125, 0.1875, 0.25, 0.3125, 0.375, 0.4375,
1016 -0.5, -0.4375, -0.375, -0.3125, -0.25, -0.1875, -0.125, -0.0625};
1017 Constant *Res =
1018 ConstantFP::get(Ty, ResVals[CArg->getZExtValue() & (ResValsSize - 1)]);
1019 return IC.replaceInstUsesWith(II, Res);
1020 }
1021 case Intrinsic::amdgcn_ubfe:
1022 case Intrinsic::amdgcn_sbfe: {
1023 // Decompose simple cases into standard shifts.
1024 Value *Src = II.getArgOperand(0);
1025 if (isa<UndefValue>(Src)) {
1026 return IC.replaceInstUsesWith(II, Src);
1027 }
1028
1029 unsigned Width;
1030 Type *Ty = II.getType();
1031 unsigned IntSize = Ty->getIntegerBitWidth();
1032
1033 ConstantInt *CWidth = dyn_cast<ConstantInt>(II.getArgOperand(2));
1034 if (CWidth) {
1035 Width = CWidth->getZExtValue();
1036 if ((Width & (IntSize - 1)) == 0) {
1038 }
1039
1040 // Hardware ignores high bits, so remove those.
1041 if (Width >= IntSize) {
1042 return IC.replaceOperand(
1043 II, 2, ConstantInt::get(CWidth->getType(), Width & (IntSize - 1)));
1044 }
1045 }
1046
1047 unsigned Offset;
1048 ConstantInt *COffset = dyn_cast<ConstantInt>(II.getArgOperand(1));
1049 if (COffset) {
1050 Offset = COffset->getZExtValue();
1051 if (Offset >= IntSize) {
1052 return IC.replaceOperand(
1053 II, 1,
1054 ConstantInt::get(COffset->getType(), Offset & (IntSize - 1)));
1055 }
1056 }
1057
1058 bool Signed = IID == Intrinsic::amdgcn_sbfe;
1059
1060 if (!CWidth || !COffset)
1061 break;
1062
1063 // The case of Width == 0 is handled above, which makes this transformation
1064 // safe. If Width == 0, then the ashr and lshr instructions become poison
1065 // value since the shift amount would be equal to the bit size.
1066 assert(Width != 0);
1067
1068 // TODO: This allows folding to undef when the hardware has specific
1069 // behavior?
1070 if (Offset + Width < IntSize) {
1071 Value *Shl = IC.Builder.CreateShl(Src, IntSize - Offset - Width);
1072 Value *RightShift = Signed ? IC.Builder.CreateAShr(Shl, IntSize - Width)
1073 : IC.Builder.CreateLShr(Shl, IntSize - Width);
1074 RightShift->takeName(&II);
1075 return IC.replaceInstUsesWith(II, RightShift);
1076 }
1077
1078 Value *RightShift = Signed ? IC.Builder.CreateAShr(Src, Offset)
1079 : IC.Builder.CreateLShr(Src, Offset);
1080
1081 RightShift->takeName(&II);
1082 return IC.replaceInstUsesWith(II, RightShift);
1083 }
1084 case Intrinsic::amdgcn_exp:
1085 case Intrinsic::amdgcn_exp_row:
1086 case Intrinsic::amdgcn_exp_compr: {
1087 ConstantInt *En = cast<ConstantInt>(II.getArgOperand(1));
1088 unsigned EnBits = En->getZExtValue();
1089 if (EnBits == 0xf)
1090 break; // All inputs enabled.
1091
1092 bool IsCompr = IID == Intrinsic::amdgcn_exp_compr;
1093 bool Changed = false;
1094 for (int I = 0; I < (IsCompr ? 2 : 4); ++I) {
1095 if ((!IsCompr && (EnBits & (1 << I)) == 0) ||
1096 (IsCompr && ((EnBits & (0x3 << (2 * I))) == 0))) {
1097 Value *Src = II.getArgOperand(I + 2);
1098 if (!isa<PoisonValue>(Src)) {
1099 IC.replaceOperand(II, I + 2, PoisonValue::get(Src->getType()));
1100 Changed = true;
1101 }
1102 }
1103 }
1104
1105 if (Changed) {
1106 return &II;
1107 }
1108
1109 break;
1110 }
1111 case Intrinsic::amdgcn_fmed3: {
1112 Value *Src0 = II.getArgOperand(0);
1113 Value *Src1 = II.getArgOperand(1);
1114 Value *Src2 = II.getArgOperand(2);
1115
1116 for (Value *Src : {Src0, Src1, Src2}) {
1117 if (isa<PoisonValue>(Src))
1118 return IC.replaceInstUsesWith(II, Src);
1119 }
1120
1121 if (II.isStrictFP())
1122 break;
1123
1124 // med3 with a nan input acts like
1125 // v_min_f32(v_min_f32(s0, s1), s2)
1126 //
1127 // Signalingness is ignored with ieee=0, so we fold to
1128 // minimumnum/maximumnum. With ieee=1, the v_min_f32 acts like llvm.minnum
1129 // with signaling nan handling. With ieee=0, like llvm.minimumnum except a
1130 // returned signaling nan will not be quieted.
1131
1132 // ieee=1
1133 // s0 snan: s2
1134 // s1 snan: s2
1135 // s2 snan: qnan
1136
1137 // s0 qnan: min(s1, s2)
1138 // s1 qnan: min(s0, s2)
1139 // s2 qnan: min(s0, s1)
1140
1141 // ieee=0
1142 // s0 _nan: min(s1, s2)
1143 // s1 _nan: min(s0, s2)
1144 // s2 _nan: min(s0, s1)
1145
1146 // med3 behavior with infinity
1147 // s0 +inf: max(s1, s2)
1148 // s1 +inf: max(s0, s2)
1149 // s2 +inf: max(s0, s1)
1150 // s0 -inf: min(s1, s2)
1151 // s1 -inf: min(s0, s2)
1152 // s2 -inf: min(s0, s1)
1153
1154 // Checking for NaN before canonicalization provides better fidelity when
1155 // mapping other operations onto fmed3 since the order of operands is
1156 // unchanged.
1157 Value *V = nullptr;
1158 const APFloat *ConstSrc0 = nullptr;
1159 const APFloat *ConstSrc1 = nullptr;
1160 const APFloat *ConstSrc2 = nullptr;
1161
1162 if ((match(Src0, m_APFloat(ConstSrc0)) &&
1163 (ConstSrc0->isNaN() || ConstSrc0->isInfinity())) ||
1164 isa<UndefValue>(Src0)) {
1165 const bool IsPosInfinity = ConstSrc0 && ConstSrc0->isPosInfinity();
1166 switch (fpenvIEEEMode(II)) {
1167 case KnownIEEEMode::On:
1168 // TODO: If Src2 is snan, does it need quieting?
1169 if (ConstSrc0 && ConstSrc0->isNaN() && ConstSrc0->isSignaling())
1170 return IC.replaceInstUsesWith(II, Src2);
1171
1172 V = IsPosInfinity ? IC.Builder.CreateMaxNum(Src1, Src2)
1173 : IC.Builder.CreateMinNum(Src1, Src2);
1174 break;
1175 case KnownIEEEMode::Off:
1176 V = IsPosInfinity ? IC.Builder.CreateMaximumNum(Src1, Src2)
1177 : IC.Builder.CreateMinimumNum(Src1, Src2);
1178 break;
1180 break;
1181 }
1182 } else if ((match(Src1, m_APFloat(ConstSrc1)) &&
1183 (ConstSrc1->isNaN() || ConstSrc1->isInfinity())) ||
1184 isa<UndefValue>(Src1)) {
1185 const bool IsPosInfinity = ConstSrc1 && ConstSrc1->isPosInfinity();
1186 switch (fpenvIEEEMode(II)) {
1187 case KnownIEEEMode::On:
1188 // TODO: If Src2 is snan, does it need quieting?
1189 if (ConstSrc1 && ConstSrc1->isNaN() && ConstSrc1->isSignaling())
1190 return IC.replaceInstUsesWith(II, Src2);
1191
1192 V = IsPosInfinity ? IC.Builder.CreateMaxNum(Src0, Src2)
1193 : IC.Builder.CreateMinNum(Src0, Src2);
1194 break;
1195 case KnownIEEEMode::Off:
1196 V = IsPosInfinity ? IC.Builder.CreateMaximumNum(Src0, Src2)
1197 : IC.Builder.CreateMinimumNum(Src0, Src2);
1198 break;
1200 break;
1201 }
1202 } else if ((match(Src2, m_APFloat(ConstSrc2)) &&
1203 (ConstSrc2->isNaN() || ConstSrc2->isInfinity())) ||
1204 isa<UndefValue>(Src2)) {
1205 switch (fpenvIEEEMode(II)) {
1206 case KnownIEEEMode::On:
1207 if (ConstSrc2 && ConstSrc2->isNaN() && ConstSrc2->isSignaling()) {
1208 auto *Quieted = ConstantFP::get(II.getType(), ConstSrc2->makeQuiet());
1209 return IC.replaceInstUsesWith(II, Quieted);
1210 }
1211
1212 V = (ConstSrc2 && ConstSrc2->isPosInfinity())
1213 ? IC.Builder.CreateMaxNum(Src0, Src1)
1214 : IC.Builder.CreateMinNum(Src0, Src1);
1215 break;
1216 case KnownIEEEMode::Off:
1217 V = (ConstSrc2 && ConstSrc2->isNegInfinity())
1218 ? IC.Builder.CreateMinimumNum(Src0, Src1)
1219 : IC.Builder.CreateMaximumNum(Src0, Src1);
1220 break;
1222 break;
1223 }
1224 }
1225
1226 if (V) {
1227 if (auto *CI = dyn_cast<CallInst>(V)) {
1228 CI->copyFastMathFlags(&II);
1229 CI->takeName(&II);
1230 }
1231 return IC.replaceInstUsesWith(II, V);
1232 }
1233
1234 bool Swap = false;
1235 // Canonicalize constants to RHS operands.
1236 //
1237 // fmed3(c0, x, c1) -> fmed3(x, c0, c1)
1238 if (isa<Constant>(Src0) && !isa<Constant>(Src1)) {
1239 std::swap(Src0, Src1);
1240 Swap = true;
1241 }
1242
1243 if (isa<Constant>(Src1) && !isa<Constant>(Src2)) {
1244 std::swap(Src1, Src2);
1245 Swap = true;
1246 }
1247
1248 if (isa<Constant>(Src0) && !isa<Constant>(Src1)) {
1249 std::swap(Src0, Src1);
1250 Swap = true;
1251 }
1252
1253 if (Swap) {
1254 II.setArgOperand(0, Src0);
1255 II.setArgOperand(1, Src1);
1256 II.setArgOperand(2, Src2);
1257 return &II;
1258 }
1259
1260 if (const ConstantFP *C0 = dyn_cast<ConstantFP>(Src0)) {
1261 if (const ConstantFP *C1 = dyn_cast<ConstantFP>(Src1)) {
1262 if (const ConstantFP *C2 = dyn_cast<ConstantFP>(Src2)) {
1263 APFloat Result = fmed3AMDGCN(C0->getValueAPF(), C1->getValueAPF(),
1264 C2->getValueAPF());
1265 return IC.replaceInstUsesWith(II,
1266 ConstantFP::get(II.getType(), Result));
1267 }
1268 }
1269 }
1270
1271 if (!ST->hasMed3_16())
1272 break;
1273
1274 // Repeat floating-point width reduction done for minnum/maxnum.
1275 // fmed3((fpext X), (fpext Y), (fpext Z)) -> fpext (fmed3(X, Y, Z))
1276 if (Value *X = matchFPExtFromF16(Src0)) {
1277 if (Value *Y = matchFPExtFromF16(Src1)) {
1278 if (Value *Z = matchFPExtFromF16(Src2)) {
1279 Value *NewCall = IC.Builder.CreateIntrinsic(
1280 IID, {X->getType()}, {X, Y, Z}, &II, II.getName());
1281 return new FPExtInst(NewCall, II.getType());
1282 }
1283 }
1284 }
1285
1286 break;
1287 }
1288 case Intrinsic::amdgcn_icmp:
1289 case Intrinsic::amdgcn_fcmp: {
1290 const ConstantInt *CC = cast<ConstantInt>(II.getArgOperand(2));
1291 // Guard against invalid arguments.
1292 int64_t CCVal = CC->getZExtValue();
1293 bool IsInteger = IID == Intrinsic::amdgcn_icmp;
1294 if ((IsInteger && (CCVal < CmpInst::FIRST_ICMP_PREDICATE ||
1295 CCVal > CmpInst::LAST_ICMP_PREDICATE)) ||
1296 (!IsInteger && (CCVal < CmpInst::FIRST_FCMP_PREDICATE ||
1298 break;
1299
1300 Value *Src0 = II.getArgOperand(0);
1301 Value *Src1 = II.getArgOperand(1);
1302
1303 if (auto *CSrc0 = dyn_cast<Constant>(Src0)) {
1304 if (auto *CSrc1 = dyn_cast<Constant>(Src1)) {
1306 (ICmpInst::Predicate)CCVal, CSrc0, CSrc1, DL);
1307 if (CCmp && CCmp->isNullValue()) {
1308 return IC.replaceInstUsesWith(
1309 II, IC.Builder.CreateSExt(CCmp, II.getType()));
1310 }
1311
1312 // The result of V_ICMP/V_FCMP assembly instructions (which this
1313 // intrinsic exposes) is one bit per thread, masked with the EXEC
1314 // register (which contains the bitmask of live threads). So a
1315 // comparison that always returns true is the same as a read of the
1316 // EXEC register.
1317 Metadata *MDArgs[] = {MDString::get(II.getContext(), "exec")};
1318 MDNode *MD = MDNode::get(II.getContext(), MDArgs);
1319 Value *Args[] = {MetadataAsValue::get(II.getContext(), MD)};
1320 CallInst *NewCall = IC.Builder.CreateIntrinsic(Intrinsic::read_register,
1321 II.getType(), Args);
1322 NewCall->addFnAttr(Attribute::Convergent);
1323 NewCall->takeName(&II);
1324 return IC.replaceInstUsesWith(II, NewCall);
1325 }
1326
1327 // Canonicalize constants to RHS.
1328 CmpInst::Predicate SwapPred =
1330 II.setArgOperand(0, Src1);
1331 II.setArgOperand(1, Src0);
1332 II.setArgOperand(
1333 2, ConstantInt::get(CC->getType(), static_cast<int>(SwapPred)));
1334 return &II;
1335 }
1336
1337 if (CCVal != CmpInst::ICMP_EQ && CCVal != CmpInst::ICMP_NE)
1338 break;
1339
1340 // Canonicalize compare eq with true value to compare != 0
1341 // llvm.amdgcn.icmp(zext (i1 x), 1, eq)
1342 // -> llvm.amdgcn.icmp(zext (i1 x), 0, ne)
1343 // llvm.amdgcn.icmp(sext (i1 x), -1, eq)
1344 // -> llvm.amdgcn.icmp(sext (i1 x), 0, ne)
1345 Value *ExtSrc;
1346 if (CCVal == CmpInst::ICMP_EQ &&
1347 ((match(Src1, PatternMatch::m_One()) &&
1348 match(Src0, m_ZExt(PatternMatch::m_Value(ExtSrc)))) ||
1349 (match(Src1, PatternMatch::m_AllOnes()) &&
1350 match(Src0, m_SExt(PatternMatch::m_Value(ExtSrc))))) &&
1351 ExtSrc->getType()->isIntegerTy(1)) {
1353 IC.replaceOperand(II, 2,
1354 ConstantInt::get(CC->getType(), CmpInst::ICMP_NE));
1355 return &II;
1356 }
1357
1358 CmpPredicate SrcPred;
1359 Value *SrcLHS;
1360 Value *SrcRHS;
1361
1362 // Fold compare eq/ne with 0 from a compare result as the predicate to the
1363 // intrinsic. The typical use is a wave vote function in the library, which
1364 // will be fed from a user code condition compared with 0. Fold in the
1365 // redundant compare.
1366
1367 // llvm.amdgcn.icmp([sz]ext ([if]cmp pred a, b), 0, ne)
1368 // -> llvm.amdgcn.[if]cmp(a, b, pred)
1369 //
1370 // llvm.amdgcn.icmp([sz]ext ([if]cmp pred a, b), 0, eq)
1371 // -> llvm.amdgcn.[if]cmp(a, b, inv pred)
1372 if (match(Src1, PatternMatch::m_Zero()) &&
1374 m_Cmp(SrcPred, PatternMatch::m_Value(SrcLHS),
1375 PatternMatch::m_Value(SrcRHS))))) {
1376 if (CCVal == CmpInst::ICMP_EQ)
1377 SrcPred = CmpInst::getInversePredicate(SrcPred);
1378
1379 Intrinsic::ID NewIID = CmpInst::isFPPredicate(SrcPred)
1380 ? Intrinsic::amdgcn_fcmp
1381 : Intrinsic::amdgcn_icmp;
1382
1383 Type *Ty = SrcLHS->getType();
1384 if (auto *CmpType = dyn_cast<IntegerType>(Ty)) {
1385 // Promote to next legal integer type.
1386 unsigned Width = CmpType->getBitWidth();
1387 unsigned NewWidth = Width;
1388
1389 // Don't do anything for i1 comparisons.
1390 if (Width == 1)
1391 break;
1392
1393 if (Width <= 16)
1394 NewWidth = 16;
1395 else if (Width <= 32)
1396 NewWidth = 32;
1397 else if (Width <= 64)
1398 NewWidth = 64;
1399 else
1400 break; // Can't handle this.
1401
1402 if (Width != NewWidth) {
1403 IntegerType *CmpTy = IC.Builder.getIntNTy(NewWidth);
1404 if (CmpInst::isSigned(SrcPred)) {
1405 SrcLHS = IC.Builder.CreateSExt(SrcLHS, CmpTy);
1406 SrcRHS = IC.Builder.CreateSExt(SrcRHS, CmpTy);
1407 } else {
1408 SrcLHS = IC.Builder.CreateZExt(SrcLHS, CmpTy);
1409 SrcRHS = IC.Builder.CreateZExt(SrcRHS, CmpTy);
1410 }
1411 }
1412 } else if (!Ty->isFloatTy() && !Ty->isDoubleTy() && !Ty->isHalfTy())
1413 break;
1414
1415 Value *Args[] = {SrcLHS, SrcRHS,
1416 ConstantInt::get(CC->getType(), SrcPred)};
1417 CallInst *NewCall = IC.Builder.CreateIntrinsic(
1418 NewIID, {II.getType(), SrcLHS->getType()}, Args);
1419 NewCall->takeName(&II);
1420 return IC.replaceInstUsesWith(II, NewCall);
1421 }
1422
1423 break;
1424 }
1425 case Intrinsic::amdgcn_mbcnt_hi:
1426 // exec_hi is all 0, so this is just a copy.
1427 if (ST->isWave32())
1428 return IC.replaceInstUsesWith(II, II.getArgOperand(1));
1429 [[fallthrough]];
1430 case Intrinsic::amdgcn_mbcnt_lo: {
1431 ConstantRange AccRange = computeConstantRange(II.getArgOperand(1),
1432 /*ForSigned=*/false);
1433 if (AccRange.isFullSet())
1434 return nullptr;
1435
1436 // TODO: Can raise lower bound by inspecting first argument.
1437 ConstantRange MbcntRange(APInt(32, 0), APInt(32, 32 + 1));
1438 ConstantRange ComputedRange = AccRange.add(MbcntRange);
1439 if (ComputedRange.isFullSet())
1440 return nullptr;
1441
1442 if (std::optional<ConstantRange> ExistingRange = II.getRange()) {
1443 ComputedRange = ComputedRange.intersectWith(*ExistingRange);
1444 if (ComputedRange == *ExistingRange)
1445 return nullptr;
1446 }
1447
1448 II.addRangeRetAttr(ComputedRange);
1449 return nullptr;
1450 }
1451 case Intrinsic::amdgcn_ballot: {
1452 Value *Arg = II.getArgOperand(0);
1453 if (isa<PoisonValue>(Arg))
1454 return IC.replaceInstUsesWith(II, PoisonValue::get(II.getType()));
1455
1456 if (auto *Src = dyn_cast<ConstantInt>(Arg)) {
1457 if (Src->isZero()) {
1458 // amdgcn.ballot(i1 0) is zero.
1459 return IC.replaceInstUsesWith(II, Constant::getNullValue(II.getType()));
1460 }
1461 }
1462 if (ST->isWave32() && II.getType()->getIntegerBitWidth() == 64) {
1463 // %b64 = call i64 ballot.i64(...)
1464 // =>
1465 // %b32 = call i32 ballot.i32(...)
1466 // %b64 = zext i32 %b32 to i64
1468 IC.Builder.CreateIntrinsic(Intrinsic::amdgcn_ballot,
1469 {IC.Builder.getInt32Ty()},
1470 {II.getArgOperand(0)}),
1471 II.getType());
1472 Call->takeName(&II);
1473 return IC.replaceInstUsesWith(II, Call);
1474 }
1475 break;
1476 }
1477 case Intrinsic::amdgcn_wavefrontsize: {
1478 if (ST->isWaveSizeKnown())
1479 return IC.replaceInstUsesWith(
1480 II, ConstantInt::get(II.getType(), ST->getWavefrontSize()));
1481 break;
1482 }
1483 case Intrinsic::amdgcn_wqm_vote: {
1484 // wqm_vote is identity when the argument is constant.
1485 if (!isa<Constant>(II.getArgOperand(0)))
1486 break;
1487
1488 return IC.replaceInstUsesWith(II, II.getArgOperand(0));
1489 }
1490 case Intrinsic::amdgcn_kill: {
1491 const ConstantInt *C = dyn_cast<ConstantInt>(II.getArgOperand(0));
1492 if (!C || !C->getZExtValue())
1493 break;
1494
1495 // amdgcn.kill(i1 1) is a no-op
1496 return IC.eraseInstFromFunction(II);
1497 }
1498 case Intrinsic::amdgcn_s_sendmsg:
1499 case Intrinsic::amdgcn_s_sendmsghalt: {
1500 // The second operand is copied to m0, but is only actually used for
1501 // certain message types. For message types that are known to not use m0,
1502 // fold it to poison.
1503 using namespace AMDGPU::SendMsg;
1504
1505 Value *M0Val = II.getArgOperand(1);
1506 if (isa<PoisonValue>(M0Val))
1507 break;
1508
1509 auto *MsgImm = cast<ConstantInt>(II.getArgOperand(0));
1510 uint16_t MsgId, OpId, StreamId;
1511 decodeMsg(MsgImm->getZExtValue(), MsgId, OpId, StreamId, *ST);
1512
1513 if (!msgDoesNotUseM0(MsgId, *ST))
1514 break;
1515
1516 // Drop UB-implying attributes since we're replacing with poison.
1517 II.dropUBImplyingAttrsAndMetadata();
1518 IC.replaceOperand(II, 1, PoisonValue::get(M0Val->getType()));
1519 return nullptr;
1520 }
1521 case Intrinsic::amdgcn_update_dpp: {
1522 Value *Old = II.getArgOperand(0);
1523
1524 auto *BC = cast<ConstantInt>(II.getArgOperand(5));
1525 auto *RM = cast<ConstantInt>(II.getArgOperand(3));
1526 auto *BM = cast<ConstantInt>(II.getArgOperand(4));
1527 if (BC->isNullValue() || RM->getZExtValue() != 0xF ||
1528 BM->getZExtValue() != 0xF || isa<PoisonValue>(Old))
1529 break;
1530
1531 // If bound_ctrl = 1, row mask = bank mask = 0xf we can omit old value.
1532 return IC.replaceOperand(II, 0, PoisonValue::get(Old->getType()));
1533 }
1534 case Intrinsic::amdgcn_permlane16:
1535 case Intrinsic::amdgcn_permlane16_var:
1536 case Intrinsic::amdgcn_permlanex16:
1537 case Intrinsic::amdgcn_permlanex16_var: {
1538 // Discard vdst_in if it's not going to be read.
1539 Value *VDstIn = II.getArgOperand(0);
1540 if (isa<PoisonValue>(VDstIn))
1541 break;
1542
1543 // FetchInvalid operand idx.
1544 unsigned int FiIdx = (IID == Intrinsic::amdgcn_permlane16 ||
1545 IID == Intrinsic::amdgcn_permlanex16)
1546 ? 4 /* for permlane16 and permlanex16 */
1547 : 3; /* for permlane16_var and permlanex16_var */
1548
1549 // BoundCtrl operand idx.
1550 // For permlane16 and permlanex16 it should be 5
1551 // For Permlane16_var and permlanex16_var it should be 4
1552 unsigned int BcIdx = FiIdx + 1;
1553
1554 ConstantInt *FetchInvalid = cast<ConstantInt>(II.getArgOperand(FiIdx));
1555 ConstantInt *BoundCtrl = cast<ConstantInt>(II.getArgOperand(BcIdx));
1556 if (!FetchInvalid->getZExtValue() && !BoundCtrl->getZExtValue())
1557 break;
1558
1559 return IC.replaceOperand(II, 0, PoisonValue::get(VDstIn->getType()));
1560 }
1561 case Intrinsic::amdgcn_permlane64:
1562 case Intrinsic::amdgcn_readfirstlane:
1563 case Intrinsic::amdgcn_readlane:
1564 case Intrinsic::amdgcn_ds_bpermute: {
1565 // If the data argument is uniform these intrinsics return it unchanged.
1566 unsigned SrcIdx = IID == Intrinsic::amdgcn_ds_bpermute ? 1 : 0;
1567 const Use &Src = II.getArgOperandUse(SrcIdx);
1568 if (isTriviallyUniform(Src))
1569 return IC.replaceInstUsesWith(II, Src.get());
1570
1571 if (IID == Intrinsic::amdgcn_readlane &&
1573 return &II;
1574
1575 // If the lane argument of bpermute is uniform, change it to readlane. This
1576 // generates better code and can enable further optimizations because
1577 // readlane is AlwaysUniform.
1578 if (IID == Intrinsic::amdgcn_ds_bpermute) {
1579 const Use &Lane = II.getArgOperandUse(0);
1580 if (isTriviallyUniform(Lane)) {
1581 Value *NewLane = IC.Builder.CreateLShr(Lane, 2);
1583 II.getModule(), Intrinsic::amdgcn_readlane, II.getType());
1584 II.setCalledFunction(NewDecl);
1585 II.setOperand(0, Src);
1586 II.setOperand(1, NewLane);
1587 return &II;
1588 }
1589 }
1590
1591 if (IID != Intrinsic::amdgcn_ds_bpermute) {
1593 return Res;
1594 }
1595
1596 return std::nullopt;
1597 }
1598 case Intrinsic::amdgcn_writelane: {
1599 // TODO: Fold bitcast like readlane.
1600 if (simplifyDemandedLaneMaskArg(IC, II, 1))
1601 return &II;
1602 return std::nullopt;
1603 }
1604 case Intrinsic::amdgcn_trig_preop: {
1605 // The intrinsic is declared with name mangling, but currently the
1606 // instruction only exists for f64
1607 if (!II.getType()->isDoubleTy())
1608 break;
1609
1610 Value *Src = II.getArgOperand(0);
1611 Value *Segment = II.getArgOperand(1);
1612 if (isa<PoisonValue>(Src) || isa<PoisonValue>(Segment))
1613 return IC.replaceInstUsesWith(II, PoisonValue::get(II.getType()));
1614
1615 if (isa<UndefValue>(Segment))
1616 return IC.replaceInstUsesWith(II, ConstantFP::getZero(II.getType()));
1617
1618 // Sign bit is not used.
1619 Value *StrippedSign = InstCombiner::stripSignOnlyFPOps(Src);
1620 if (StrippedSign != Src)
1621 return IC.replaceOperand(II, 0, StrippedSign);
1622
1623 if (II.isStrictFP())
1624 break;
1625
1626 const ConstantFP *CSrc = dyn_cast<ConstantFP>(Src);
1627 if (!CSrc && !isa<UndefValue>(Src))
1628 break;
1629
1630 // The instruction ignores special cases, and literally just extracts the
1631 // exponents. Fold undef to nan, and index the table as normal.
1632 APInt FSrcInt = CSrc ? CSrc->getValueAPF().bitcastToAPInt()
1633 : APFloat::getQNaN(II.getType()->getFltSemantics())
1634 .bitcastToAPInt();
1635
1636 const ConstantInt *Cseg = dyn_cast<ConstantInt>(Segment);
1637 if (!Cseg) {
1638 if (isa<UndefValue>(Src))
1639 return IC.replaceInstUsesWith(II, ConstantFP::getZero(II.getType()));
1640 break;
1641 }
1642
1643 unsigned Exponent = FSrcInt.extractBitsAsZExtValue(11, 52);
1644 unsigned SegmentVal = Cseg->getValue().trunc(5).getZExtValue();
1645 unsigned Shift = SegmentVal * 53;
1646 if (Exponent > 1077)
1647 Shift += Exponent - 1077;
1648
1649 // 2.0/PI table.
1650 static const uint32_t TwoByPi[] = {
1651 0xa2f9836e, 0x4e441529, 0xfc2757d1, 0xf534ddc0, 0xdb629599, 0x3c439041,
1652 0xfe5163ab, 0xdebbc561, 0xb7246e3a, 0x424dd2e0, 0x06492eea, 0x09d1921c,
1653 0xfe1deb1c, 0xb129a73e, 0xe88235f5, 0x2ebb4484, 0xe99c7026, 0xb45f7e41,
1654 0x3991d639, 0x835339f4, 0x9c845f8b, 0xbdf9283b, 0x1ff897ff, 0xde05980f,
1655 0xef2f118b, 0x5a0a6d1f, 0x6d367ecf, 0x27cb09b7, 0x4f463f66, 0x9e5fea2d,
1656 0x7527bac7, 0xebe5f17b, 0x3d0739f7, 0x8a5292ea, 0x6bfb5fb1, 0x1f8d5d08,
1657 0x56033046};
1658
1659 // Return 0 for outbound segment (hardware behavior).
1660 unsigned Idx = Shift >> 5;
1661 if (Idx + 2 >= std::size(TwoByPi)) {
1662 APFloat Zero = APFloat::getZero(II.getType()->getFltSemantics());
1663 return IC.replaceInstUsesWith(II, ConstantFP::get(II.getType(), Zero));
1664 }
1665
1666 unsigned BShift = Shift & 0x1f;
1667 uint64_t Thi = Make_64(TwoByPi[Idx], TwoByPi[Idx + 1]);
1668 uint64_t Tlo = Make_64(TwoByPi[Idx + 2], 0);
1669 if (BShift)
1670 Thi = (Thi << BShift) | (Tlo >> (64 - BShift));
1671 Thi = Thi >> 11;
1672 APFloat Result = APFloat((double)Thi);
1673
1674 int Scale = -53 - Shift;
1675 if (Exponent >= 1968)
1676 Scale += 128;
1677
1678 Result = scalbn(Result, Scale, RoundingMode::NearestTiesToEven);
1679 return IC.replaceInstUsesWith(II, ConstantFP::get(Src->getType(), Result));
1680 }
1681 case Intrinsic::amdgcn_fmul_legacy: {
1682 Value *Op0 = II.getArgOperand(0);
1683 Value *Op1 = II.getArgOperand(1);
1684
1685 for (Value *Src : {Op0, Op1}) {
1686 if (isa<PoisonValue>(Src))
1687 return IC.replaceInstUsesWith(II, Src);
1688 }
1689
1690 // The legacy behaviour is that multiplying +/-0.0 by anything, even NaN or
1691 // infinity, gives +0.0.
1692 // TODO: Move to InstSimplify?
1693 if (match(Op0, PatternMatch::m_AnyZeroFP()) ||
1695 return IC.replaceInstUsesWith(II, ConstantFP::getZero(II.getType()));
1696
1697 // If we can prove we don't have one of the special cases then we can use a
1698 // normal fmul instruction instead.
1699 if (canSimplifyLegacyMulToMul(II, Op0, Op1, IC)) {
1700 auto *FMul = IC.Builder.CreateFMulFMF(Op0, Op1, &II);
1701 FMul->takeName(&II);
1702 return IC.replaceInstUsesWith(II, FMul);
1703 }
1704 break;
1705 }
1706 case Intrinsic::amdgcn_fma_legacy: {
1707 Value *Op0 = II.getArgOperand(0);
1708 Value *Op1 = II.getArgOperand(1);
1709 Value *Op2 = II.getArgOperand(2);
1710
1711 for (Value *Src : {Op0, Op1, Op2}) {
1712 if (isa<PoisonValue>(Src))
1713 return IC.replaceInstUsesWith(II, Src);
1714 }
1715
1716 // The legacy behaviour is that multiplying +/-0.0 by anything, even NaN or
1717 // infinity, gives +0.0.
1718 // TODO: Move to InstSimplify?
1719 if (match(Op0, PatternMatch::m_AnyZeroFP()) ||
1721 // It's tempting to just return Op2 here, but that would give the wrong
1722 // result if Op2 was -0.0.
1723 auto *Zero = ConstantFP::getZero(II.getType());
1724 auto *FAdd = IC.Builder.CreateFAddFMF(Zero, Op2, &II);
1725 FAdd->takeName(&II);
1726 return IC.replaceInstUsesWith(II, FAdd);
1727 }
1728
1729 // If we can prove we don't have one of the special cases then we can use a
1730 // normal fma instead.
1731 if (canSimplifyLegacyMulToMul(II, Op0, Op1, IC)) {
1732 II.setCalledOperand(Intrinsic::getOrInsertDeclaration(
1733 II.getModule(), Intrinsic::fma, II.getType()));
1734 return &II;
1735 }
1736 break;
1737 }
1738 case Intrinsic::amdgcn_is_shared:
1739 case Intrinsic::amdgcn_is_private: {
1740 Value *Src = II.getArgOperand(0);
1741 if (isa<PoisonValue>(Src))
1742 return IC.replaceInstUsesWith(II, PoisonValue::get(II.getType()));
1743 if (isa<UndefValue>(Src))
1744 return IC.replaceInstUsesWith(II, UndefValue::get(II.getType()));
1745
1746 if (isa<ConstantPointerNull>(II.getArgOperand(0)))
1747 return IC.replaceInstUsesWith(II, ConstantInt::getFalse(II.getType()));
1748 break;
1749 }
1750 case Intrinsic::amdgcn_make_buffer_rsrc: {
1751 Value *Src = II.getArgOperand(0);
1752 if (isa<PoisonValue>(Src))
1753 return IC.replaceInstUsesWith(II, PoisonValue::get(II.getType()));
1754 return std::nullopt;
1755 }
1756 case Intrinsic::amdgcn_raw_buffer_store_format:
1757 case Intrinsic::amdgcn_struct_buffer_store_format:
1758 case Intrinsic::amdgcn_raw_tbuffer_store:
1759 case Intrinsic::amdgcn_struct_tbuffer_store:
1760 case Intrinsic::amdgcn_image_store_1d:
1761 case Intrinsic::amdgcn_image_store_1darray:
1762 case Intrinsic::amdgcn_image_store_2d:
1763 case Intrinsic::amdgcn_image_store_2darray:
1764 case Intrinsic::amdgcn_image_store_2darraymsaa:
1765 case Intrinsic::amdgcn_image_store_2dmsaa:
1766 case Intrinsic::amdgcn_image_store_3d:
1767 case Intrinsic::amdgcn_image_store_cube:
1768 case Intrinsic::amdgcn_image_store_mip_1d:
1769 case Intrinsic::amdgcn_image_store_mip_1darray:
1770 case Intrinsic::amdgcn_image_store_mip_2d:
1771 case Intrinsic::amdgcn_image_store_mip_2darray:
1772 case Intrinsic::amdgcn_image_store_mip_3d:
1773 case Intrinsic::amdgcn_image_store_mip_cube: {
1774 if (!isa<FixedVectorType>(II.getArgOperand(0)->getType()))
1775 break;
1776
1777 APInt DemandedElts;
1778 if (ST->hasDefaultComponentBroadcast())
1779 DemandedElts = defaultComponentBroadcast(II.getArgOperand(0));
1780 else if (ST->hasDefaultComponentZero())
1781 DemandedElts = trimTrailingZerosInVector(IC, II.getArgOperand(0), &II);
1782 else
1783 break;
1784
1785 int DMaskIdx = getAMDGPUImageDMaskIntrinsic(II.getIntrinsicID()) ? 1 : -1;
1786 if (simplifyAMDGCNMemoryIntrinsicDemanded(IC, II, DemandedElts, DMaskIdx,
1787 false)) {
1788 return IC.eraseInstFromFunction(II);
1789 }
1790
1791 break;
1792 }
1793 case Intrinsic::amdgcn_prng_b32: {
1794 auto *Src = II.getArgOperand(0);
1795 if (isa<UndefValue>(Src)) {
1796 return IC.replaceInstUsesWith(II, Src);
1797 }
1798 return std::nullopt;
1799 }
1800 case Intrinsic::amdgcn_mfma_scale_f32_16x16x128_f8f6f4:
1801 case Intrinsic::amdgcn_mfma_scale_f32_32x32x64_f8f6f4: {
1802 Value *Src0 = II.getArgOperand(0);
1803 Value *Src1 = II.getArgOperand(1);
1804 uint64_t CBSZ = cast<ConstantInt>(II.getArgOperand(3))->getZExtValue();
1805 uint64_t BLGP = cast<ConstantInt>(II.getArgOperand(4))->getZExtValue();
1806 auto *Src0Ty = cast<FixedVectorType>(Src0->getType());
1807 auto *Src1Ty = cast<FixedVectorType>(Src1->getType());
1808
1809 auto getFormatNumRegs = [](unsigned FormatVal) {
1810 switch (FormatVal) {
1813 return 6u;
1815 return 4u;
1818 return 8u;
1819 default:
1820 llvm_unreachable("invalid format value");
1821 }
1822 };
1823
1824 bool MadeChange = false;
1825 unsigned Src0NumElts = getFormatNumRegs(CBSZ);
1826 unsigned Src1NumElts = getFormatNumRegs(BLGP);
1827
1828 // Depending on the used format, fewer registers are required so shrink the
1829 // vector type.
1830 if (Src0Ty->getNumElements() > Src0NumElts) {
1831 Src0 = IC.Builder.CreateExtractVector(
1832 FixedVectorType::get(Src0Ty->getElementType(), Src0NumElts), Src0,
1833 uint64_t(0));
1834 MadeChange = true;
1835 }
1836
1837 if (Src1Ty->getNumElements() > Src1NumElts) {
1838 Src1 = IC.Builder.CreateExtractVector(
1839 FixedVectorType::get(Src1Ty->getElementType(), Src1NumElts), Src1,
1840 uint64_t(0));
1841 MadeChange = true;
1842 }
1843
1844 if (!MadeChange)
1845 return std::nullopt;
1846
1847 SmallVector<Value *, 10> Args(II.args());
1848 Args[0] = Src0;
1849 Args[1] = Src1;
1850
1851 CallInst *NewII = IC.Builder.CreateIntrinsic(
1852 IID, {Src0->getType(), Src1->getType()}, Args, &II);
1853 NewII->takeName(&II);
1854 return IC.replaceInstUsesWith(II, NewII);
1855 }
1856 case Intrinsic::amdgcn_wmma_f32_16x16x128_f8f6f4:
1857 case Intrinsic::amdgcn_wmma_scale_f32_16x16x128_f8f6f4:
1858 case Intrinsic::amdgcn_wmma_scale16_f32_16x16x128_f8f6f4: {
1859 Value *Src0 = II.getArgOperand(1);
1860 Value *Src1 = II.getArgOperand(3);
1861 unsigned FmtA = cast<ConstantInt>(II.getArgOperand(0))->getZExtValue();
1862 uint64_t FmtB = cast<ConstantInt>(II.getArgOperand(2))->getZExtValue();
1863 auto *Src0Ty = cast<FixedVectorType>(Src0->getType());
1864 auto *Src1Ty = cast<FixedVectorType>(Src1->getType());
1865
1866 bool MadeChange = false;
1867 unsigned Src0NumElts = AMDGPU::wmmaScaleF8F6F4FormatToNumRegs(FmtA);
1868 unsigned Src1NumElts = AMDGPU::wmmaScaleF8F6F4FormatToNumRegs(FmtB);
1869
1870 // Depending on the used format, fewer registers are required so shrink the
1871 // vector type.
1872 if (Src0Ty->getNumElements() > Src0NumElts) {
1873 Src0 = IC.Builder.CreateExtractVector(
1874 FixedVectorType::get(Src0Ty->getElementType(), Src0NumElts), Src0,
1875 IC.Builder.getInt64(0));
1876 MadeChange = true;
1877 }
1878
1879 if (Src1Ty->getNumElements() > Src1NumElts) {
1880 Src1 = IC.Builder.CreateExtractVector(
1881 FixedVectorType::get(Src1Ty->getElementType(), Src1NumElts), Src1,
1882 IC.Builder.getInt64(0));
1883 MadeChange = true;
1884 }
1885
1886 if (!MadeChange)
1887 return std::nullopt;
1888
1889 SmallVector<Value *, 13> Args(II.args());
1890 Args[1] = Src0;
1891 Args[3] = Src1;
1892
1893 CallInst *NewII = IC.Builder.CreateIntrinsic(
1894 IID, {II.getArgOperand(5)->getType(), Src0->getType(), Src1->getType()},
1895 Args, &II);
1896 NewII->takeName(&II);
1897 return IC.replaceInstUsesWith(II, NewII);
1898 }
1899 case Intrinsic::amdgcn_wave_shuffle: {
1900 if (!ST->hasDPP())
1901 return std::nullopt;
1902
1903 return tryWaveShuffleDPP(*ST, IC, II);
1904 }
1905 }
1906 if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
1907 AMDGPU::getImageDimIntrinsicInfo(II.getIntrinsicID())) {
1908 return simplifyAMDGCNImageIntrinsic(ST, ImageDimIntr, II, IC);
1909 }
1910 return std::nullopt;
1911}
1912
1913/// Implement SimplifyDemandedVectorElts for amdgcn buffer and image intrinsics.
1914///
1915/// For amdgcn image and buffer store intrinsics, simplification updates the
1916/// definition of the intrinsic's vector argument, rather than the uses of the
1917/// result as is done for image and buffer loads.
1918/// Note: This only supports non-TFE/LWE image intrinsic calls; those have
1919/// struct returns.
1922 APInt DemandedElts,
1923 int DMaskIdx, bool IsLoad) {
1924
1925 auto *IIVTy = cast<FixedVectorType>(IsLoad ? II.getType()
1926 : II.getOperand(0)->getType());
1927 unsigned VWidth = IIVTy->getNumElements();
1928 if (VWidth == 1)
1929 return nullptr;
1930 Type *EltTy = IIVTy->getElementType();
1931
1934
1935 // Assume the arguments are unchanged and later override them, if needed.
1936 SmallVector<Value *, 16> Args(II.args());
1937
1938 if (DMaskIdx < 0) {
1939 // Buffer case.
1940
1941 const unsigned ActiveBits = DemandedElts.getActiveBits();
1942 const unsigned UnusedComponentsAtFront = DemandedElts.countr_zero();
1943
1944 // Start assuming the prefix of elements is demanded, but possibly clear
1945 // some other bits if there are trailing zeros (unused components at front)
1946 // and update offset.
1947 DemandedElts = (1 << ActiveBits) - 1;
1948
1949 if (UnusedComponentsAtFront > 0) {
1950 static const unsigned InvalidOffsetIdx = 0xf;
1951
1952 unsigned OffsetIdx;
1953 switch (II.getIntrinsicID()) {
1954 case Intrinsic::amdgcn_raw_buffer_load:
1955 case Intrinsic::amdgcn_raw_ptr_buffer_load:
1956 OffsetIdx = 1;
1957 break;
1958 case Intrinsic::amdgcn_s_buffer_load:
1959 // If resulting type is vec3, there is no point in trimming the
1960 // load with updated offset, as the vec3 would most likely be widened to
1961 // vec4 anyway during lowering.
1962 if (ActiveBits == 4 && UnusedComponentsAtFront == 1)
1963 OffsetIdx = InvalidOffsetIdx;
1964 else
1965 OffsetIdx = 1;
1966 break;
1967 case Intrinsic::amdgcn_struct_buffer_load:
1968 case Intrinsic::amdgcn_struct_ptr_buffer_load:
1969 OffsetIdx = 2;
1970 break;
1971 default:
1972 // TODO: handle tbuffer* intrinsics.
1973 OffsetIdx = InvalidOffsetIdx;
1974 break;
1975 }
1976
1977 if (OffsetIdx != InvalidOffsetIdx) {
1978 // Clear demanded bits and update the offset.
1979 DemandedElts &= ~((1 << UnusedComponentsAtFront) - 1);
1980 auto *Offset = Args[OffsetIdx];
1981 unsigned SingleComponentSizeInBits =
1982 IC.getDataLayout().getTypeSizeInBits(EltTy);
1983 unsigned OffsetAdd =
1984 UnusedComponentsAtFront * SingleComponentSizeInBits / 8;
1985 auto *OffsetAddVal = ConstantInt::get(Offset->getType(), OffsetAdd);
1986 Args[OffsetIdx] = IC.Builder.CreateAdd(Offset, OffsetAddVal);
1987 }
1988 }
1989 } else {
1990 // Image case.
1991
1992 ConstantInt *DMask = cast<ConstantInt>(Args[DMaskIdx]);
1993 unsigned DMaskVal = DMask->getZExtValue() & 0xf;
1994
1995 // dmask 0 has special semantics, do not simplify.
1996 if (DMaskVal == 0)
1997 return nullptr;
1998
1999 // Mask off values that are undefined because the dmask doesn't cover them
2000 DemandedElts &= (1 << llvm::popcount(DMaskVal)) - 1;
2001
2002 unsigned NewDMaskVal = 0;
2003 unsigned OrigLdStIdx = 0;
2004 for (unsigned SrcIdx = 0; SrcIdx < 4; ++SrcIdx) {
2005 const unsigned Bit = 1 << SrcIdx;
2006 if (!!(DMaskVal & Bit)) {
2007 if (!!DemandedElts[OrigLdStIdx])
2008 NewDMaskVal |= Bit;
2009 OrigLdStIdx++;
2010 }
2011 }
2012
2013 if (DMaskVal != NewDMaskVal)
2014 Args[DMaskIdx] = ConstantInt::get(DMask->getType(), NewDMaskVal);
2015 }
2016
2017 unsigned NewNumElts = DemandedElts.popcount();
2018 if (!NewNumElts)
2019 return PoisonValue::get(IIVTy);
2020
2021 if (NewNumElts >= VWidth && DemandedElts.isMask()) {
2022 if (DMaskIdx >= 0)
2023 II.setArgOperand(DMaskIdx, Args[DMaskIdx]);
2024 return nullptr;
2025 }
2026
2027 // Validate function argument and return types, extracting overloaded types
2028 // along the way.
2029 SmallVector<Type *, 6> OverloadTys;
2030 if (!Intrinsic::getIntrinsicSignature(II.getCalledFunction(), OverloadTys))
2031 return nullptr;
2032
2033 Type *NewTy =
2034 (NewNumElts == 1) ? EltTy : FixedVectorType::get(EltTy, NewNumElts);
2035 OverloadTys[0] = NewTy;
2036
2037 if (!IsLoad) {
2038 SmallVector<int, 8> EltMask;
2039 for (unsigned OrigStoreIdx = 0; OrigStoreIdx < VWidth; ++OrigStoreIdx)
2040 if (DemandedElts[OrigStoreIdx])
2041 EltMask.push_back(OrigStoreIdx);
2042
2043 if (NewNumElts == 1)
2044 Args[0] = IC.Builder.CreateExtractElement(II.getOperand(0), EltMask[0]);
2045 else
2046 Args[0] = IC.Builder.CreateShuffleVector(II.getOperand(0), EltMask);
2047 }
2048
2049 CallInst *NewCall =
2050 IC.Builder.CreateIntrinsic(II.getIntrinsicID(), OverloadTys, Args);
2051 NewCall->takeName(&II);
2052 NewCall->copyMetadata(II);
2053
2054 if (IsLoad) {
2055 if (NewNumElts == 1) {
2056 return IC.Builder.CreateInsertElement(PoisonValue::get(IIVTy), NewCall,
2057 DemandedElts.countr_zero());
2058 }
2059
2060 SmallVector<int, 8> EltMask;
2061 unsigned NewLoadIdx = 0;
2062 for (unsigned OrigLoadIdx = 0; OrigLoadIdx < VWidth; ++OrigLoadIdx) {
2063 if (!!DemandedElts[OrigLoadIdx])
2064 EltMask.push_back(NewLoadIdx++);
2065 else
2066 EltMask.push_back(NewNumElts);
2067 }
2068
2069 auto *Shuffle = IC.Builder.CreateShuffleVector(NewCall, EltMask);
2070
2071 return Shuffle;
2072 }
2073
2074 return NewCall;
2075}
2076
2078 InstCombiner &IC, IntrinsicInst &II, const APInt &DemandedElts,
2079 APInt &UndefElts) const {
     // Narrow a lane intrinsic with a fixed-vector result to the contiguous
     // element range spanned by the demanded elements: extract that range from
     // operand 0, call the intrinsic remangled for the narrower type, and
     // re-expand the result to the original vector type.
     //
     // Returns nullptr when the result is not a fixed vector, when the demanded
     // span already covers the whole (multi-element) vector, or when the
     // narrowed type is not legal; otherwise returns the replacement value.
     // UndefElts is not written in the visible body.
2080 auto *VT = dyn_cast<FixedVectorType>(II.getType());
2081 if (!VT)
2082 return nullptr;
2083
     // [FirstElt, LastElt] is the smallest contiguous range containing every
     // demanded element; elements inside it may still be undemanded.
     // NOTE(review): getActiveBits() - 1 wraps if DemandedElts is zero --
     // confirm callers never pass an empty mask.
2084 const unsigned FirstElt = DemandedElts.countr_zero();
2085 const unsigned LastElt = DemandedElts.getActiveBits() - 1;
2086 const unsigned MaskLen = LastElt - FirstElt + 1;
2087
     // Nothing to narrow when the span covers the whole vector. A one-element
     // vector is the exception: it still falls through to the scalar form.
2088 unsigned OldNumElts = VT->getNumElements();
2089 if (MaskLen == OldNumElts && MaskLen != 1)
2090 return nullptr;
2091
2092 Type *EltTy = VT->getElementType();
2093 Type *NewVT = MaskLen == 1 ? EltTy : FixedVectorType::get(EltTy, MaskLen);
2094
2095 // Theoretically we should support these intrinsics for any legal type. Avoid
2096 // introducing cases that aren't direct register types like v3i16.
2097 if (!isTypeLegal(NewVT))
2098 return nullptr;
2099
2100 Value *Src = II.getArgOperand(0);
2101
2102 // Make sure convergence tokens are preserved.
2103 // TODO: CreateIntrinsic should allow directly copying bundles
     // NOTE(review): the declaration of OpBundles is on a line that is not
     // shown in this listing.
2105 II.getOperandBundlesAsDefs(OpBundles);
2106
     // Declaration of the same intrinsic overloaded on the narrowed type.
     // NOTE(review): M is declared on a line that is not shown in this listing;
     // presumably the module containing the call -- confirm.
2108 Function *Remangled =
2109 Intrinsic::getOrInsertDeclaration(M, II.getIntrinsicID(), {NewVT});
2110
2111 if (MaskLen == 1) {
     // Single demanded element: operate on the scalar and put the result back
     // into the same lane of an otherwise poison vector.
2112 Value *Extract = IC.Builder.CreateExtractElement(Src, FirstElt);
2113
2114 // TODO: Preserve callsite attributes?
2115 CallInst *NewCall = IC.Builder.CreateCall(Remangled, {Extract}, OpBundles);
2116
2117 return IC.Builder.CreateInsertElement(PoisonValue::get(II.getType()),
2118 NewCall, FirstElt);
2119 }
2120
     // Lane I of the narrowed input reads source lane FirstElt + I when that
     // lane is demanded; undemanded lanes inside the span keep the initial -1.
2121 SmallVector<int> ExtractMask(MaskLen, -1);
2122 for (unsigned I = 0; I != MaskLen; ++I) {
2123 if (DemandedElts[FirstElt + I])
2124 ExtractMask[I] = FirstElt + I;
2125 }
2126
2127 Value *Extract = IC.Builder.CreateShuffleVector(Src, ExtractMask);
2128
2129 // TODO: Preserve callsite attributes?
2130 CallInst *NewCall = IC.Builder.CreateCall(Remangled, {Extract}, OpBundles);
2131
     // Inverse mapping: demanded lane FirstElt + I of the original-width result
     // reads lane I of the narrowed call; every other lane keeps -1.
2132 SmallVector<int> InsertMask(OldNumElts, -1);
2133 for (unsigned I = 0; I != MaskLen; ++I) {
2134 if (DemandedElts[FirstElt + I])
2135 InsertMask[FirstElt + I] = I;
2136 }
2137
2138 // FIXME: If the call has a convergence bundle, we end up leaving the dead
2139 // call behind.
2140 return IC.Builder.CreateShuffleVector(NewCall, InsertMask);
2141}
2142
2144 InstCombiner &IC, IntrinsicInst &II, APInt DemandedElts, APInt &UndefElts,
2145 APInt &UndefElts2, APInt &UndefElts3,
2146 std::function<void(Instruction *, unsigned, APInt, APInt &)>
2147 SimplifyAndSetOp) const {
     // Dispatch demanded-vector-element simplification on the intrinsic ID.
     // Handled intrinsics return whatever the helper produced, wrapped in an
     // engaged optional (which may hold nullptr); everything else returns
     // std::nullopt. UndefElts2 and UndefElts3 are not used in the visible body.
2148 switch (II.getIntrinsicID()) {
2149 case Intrinsic::amdgcn_readfirstlane:
     // Simplify operand 0 with the same demanded elements through the
     // callback first, then narrow the call itself.
2150 SimplifyAndSetOp(&II, 0, DemandedElts, UndefElts);
2151 return simplifyAMDGCNLaneIntrinsicDemanded(IC, II, DemandedElts, UndefElts);
     // Buffer loads: called with the default DMaskIdx, i.e. the buffer path.
2152 case Intrinsic::amdgcn_raw_buffer_load:
2153 case Intrinsic::amdgcn_raw_ptr_buffer_load:
2154 case Intrinsic::amdgcn_raw_buffer_load_format:
2155 case Intrinsic::amdgcn_raw_ptr_buffer_load_format:
2156 case Intrinsic::amdgcn_raw_tbuffer_load:
2157 case Intrinsic::amdgcn_raw_ptr_tbuffer_load:
2158 case Intrinsic::amdgcn_s_buffer_load:
2159 case Intrinsic::amdgcn_struct_buffer_load:
2160 case Intrinsic::amdgcn_struct_ptr_buffer_load:
2161 case Intrinsic::amdgcn_struct_buffer_load_format:
2162 case Intrinsic::amdgcn_struct_ptr_buffer_load_format:
2163 case Intrinsic::amdgcn_struct_tbuffer_load:
2164 case Intrinsic::amdgcn_struct_ptr_tbuffer_load:
2165 return simplifyAMDGCNMemoryIntrinsicDemanded(IC, II, DemandedElts);
2166 default: {
     // Intrinsics found in the image dmask table take the image path, with
     // the dmask at argument index 0.
2167 if (getAMDGPUImageDMaskIntrinsic(II.getIntrinsicID())) {
2168 return simplifyAMDGCNMemoryIntrinsicDemanded(IC, II, DemandedElts, 0);
2169 }
2170 break;
2171 }
2172 }
2173 return std::nullopt;
2174}
for(const MachineOperand &MO :llvm::drop_begin(OldMI.operands(), Desc.getNumOperands()))
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
static bool canContractSqrtToRsq(const FPMathOperator *SqrtOp)
Return true if it's legal to contract llvm.amdgcn.rcp(llvm.sqrt)
static bool isTriviallyUniform(const Use &U)
Return true if we can easily prove that use U is uniform.
static CallInst * rewriteCall(IRBuilderBase &B, CallInst &Old, Function &NewCallee, ArrayRef< Value * > Ops)
static Value * convertTo16Bit(Value &V, InstCombiner::BuilderTy &Builder)
static APInt trimTrailingZerosInVector(InstCombiner &IC, Value *UseV, Instruction *I)
static APInt defaultComponentBroadcast(Value *V)
static std::optional< Instruction * > tryWaveShuffleDPP(const GCNSubtarget &ST, InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > modifyIntrinsicCall(IntrinsicInst &OldIntr, Instruction &InstToReplace, unsigned NewIntr, InstCombiner &IC, std::function< void(SmallVectorImpl< Value * > &, SmallVectorImpl< Type * > &)> Func)
Applies Func(OldIntr.Args, OldIntr.ArgTys), creates intrinsic call with modified arguments (based on ...
static bool isThreadID(const GCNSubtarget &ST, Value *V)
static APFloat fmed3AMDGCN(const APFloat &Src0, const APFloat &Src1, const APFloat &Src2)
static Value * simplifyAMDGCNMemoryIntrinsicDemanded(InstCombiner &IC, IntrinsicInst &II, APInt DemandedElts, int DMaskIdx=-1, bool IsLoad=true)
Implement SimplifyDemandedVectorElts for amdgcn buffer and image intrinsics.
static std::optional< Instruction * > simplifyAMDGCNImageIntrinsic(const GCNSubtarget *ST, const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr, IntrinsicInst &II, InstCombiner &IC)
static bool canSafelyConvertTo16Bit(Value &V, bool IsFloat)
static Value * matchFPExtFromF16(Value *Arg)
Match an fpext from half to float, or a constant we can convert.
Contains the definition of a TargetInstrInfo class that is common to all AMD GPUs.
This file provides a TargetTransformInfoImplBase conforming object specific to the AMDGPU target machine.
#define X(NUM, ENUM, NAME)
Definition ELF.h:851
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
Utilities for dealing with flags related to floating point properties and mode controls.
AMD GCN specific subclass of TargetSubtarget.
This file provides the interface for the instcombine pass implementation.
const AbstractManglingParser< Derived, Alloc >::OperatorInfo AbstractManglingParser< Derived, Alloc >::Ops[]
#define I(x, y, z)
Definition MD5.cpp:57
MachineInstr unsigned OpIdx
uint64_t IntrinsicInst * II
if(PassOpts->AAPipeline)
static TableGen::Emitter::Opt Y("gen-skeleton-entry", EmitSkeleton, "Generate example skeleton entry")
cmpResult
IEEE-754R 5.11: Floating Point Comparison Relations.
Definition APFloat.h:334
static constexpr roundingMode rmTowardZero
Definition APFloat.h:348
static constexpr roundingMode rmNearestTiesToEven
Definition APFloat.h:344
static const fltSemantics & IEEEhalf()
Definition APFloat.h:294
static APFloat getQNaN(const fltSemantics &Sem, bool Negative=false, const APInt *payload=nullptr)
Factory for QNaN values.
Definition APFloat.h:1175
opStatus divide(const APFloat &RHS, roundingMode RM)
Definition APFloat.h:1263
LLVM_ABI opStatus convert(const fltSemantics &ToSemantics, roundingMode RM, bool *losesInfo)
Definition APFloat.cpp:5890
bool isPosInfinity() const
Definition APFloat.h:1529
const fltSemantics & getSemantics() const
Definition APFloat.h:1524
APFloat makeQuiet() const
Assuming this is an IEEE-754 NaN value, quiet its signaling bit.
Definition APFloat.h:1371
bool isNaN() const
Definition APFloat.h:1514
bool isSignaling() const
Definition APFloat.h:1518
APInt bitcastToAPInt() const
Definition APFloat.h:1408
bool isNegInfinity() const
Definition APFloat.h:1530
static APFloat getZero(const fltSemantics &Sem, bool Negative=false)
Factory for Positive and Negative Zero.
Definition APFloat.h:1134
cmpResult compare(const APFloat &RHS) const
Definition APFloat.h:1459
bool isInfinity() const
Definition APFloat.h:1513
Class for arbitrary precision integers.
Definition APInt.h:78
static APInt getAllOnes(unsigned numBits)
Return an APInt of a specified width with all bits set.
Definition APInt.h:235
void clearBit(unsigned BitPosition)
Set a given bit to 0.
Definition APInt.h:1421
uint64_t getZExtValue() const
Get zero extended value.
Definition APInt.h:1555
unsigned popcount() const
Count the number of bits set.
Definition APInt.h:1685
LLVM_ABI uint64_t extractBitsAsZExtValue(unsigned numBits, unsigned bitPosition) const
Definition APInt.cpp:520
unsigned getActiveBits() const
Compute the number of active bits in the value.
Definition APInt.h:1527
LLVM_ABI APInt trunc(unsigned width) const
Truncate to new width.
Definition APInt.cpp:956
unsigned countr_zero() const
Count the number of trailing zero bits.
Definition APInt.h:1654
bool isMask(unsigned numBits) const
Definition APInt.h:489
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:40
static LLVM_ABI Attribute getWithDereferenceableBytes(LLVMContext &Context, uint64_t Bytes)
LLVM_ABI const Module * getModule() const
Return the module owning the function this basic block belongs to, or nullptr if the function does no...
bool isTypeLegal(Type *Ty) const override
void addFnAttr(Attribute::AttrKind Kind)
Adds the attribute to the function.
LLVM_ABI void getOperandBundlesAsDefs(SmallVectorImpl< OperandBundleDef > &Defs) const
Return the list of operand bundles attached to this instruction as a vector of OperandBundleDefs.
Function * getCalledFunction() const
Returns the function called, or null if this is an indirect function invocation or the function signa...
iterator_range< User::op_iterator > args()
Iteration adapter for range-for loops.
This class represents a function call, abstracting a target machine's calling convention.
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
Definition InstrTypes.h:676
@ ICMP_NE
not equal
Definition InstrTypes.h:698
bool isSigned() const
Definition InstrTypes.h:930
Predicate getSwappedPredicate() const
For example, EQ->EQ, SLE->SGE, ULT->UGT, OEQ->OEQ, ULE->UGE, OLT->OGT, etc.
Definition InstrTypes.h:827
bool isFPPredicate() const
Definition InstrTypes.h:782
Predicate getInversePredicate() const
For example, EQ -> NE, UGT -> ULE, SLT -> SGE, OEQ -> UNE, UGT -> OLE, OLT -> UGE,...
Definition InstrTypes.h:789
An abstraction over a floating-point predicate, and a pack of an integer predicate with samesign info...
ConstantFP - Floating Point Values [float, double].
Definition Constants.h:420
const APFloat & getValueAPF() const
Definition Constants.h:463
static LLVM_ABI Constant * getInfinity(Type *Ty, bool Negative=false)
static LLVM_ABI Constant * getZero(Type *Ty, bool Negative=false)
static LLVM_ABI Constant * getNaN(Type *Ty, bool Negative=false, uint64_t Payload=0)
This is the shared class of boolean and integer constants.
Definition Constants.h:87
static ConstantInt * getSigned(IntegerType *Ty, int64_t V, bool ImplicitTrunc=false)
Return a ConstantInt with the specified value for the specified type.
Definition Constants.h:135
static LLVM_ABI ConstantInt * getFalse(LLVMContext &Context)
uint64_t getZExtValue() const
Return the constant as a 64-bit unsigned integer value after it has been zero extended as appropriate...
Definition Constants.h:168
const APInt & getValue() const
Return the constant as an APInt value reference.
Definition Constants.h:159
This class represents a range of values.
LLVM_ABI ConstantRange add(const ConstantRange &Other) const
Return a new range representing the possible values resulting from an addition of a value in this ran...
LLVM_ABI bool isFullSet() const
Return true if this set contains all of the elements possible for this data-type.
LLVM_ABI ConstantRange intersectWith(const ConstantRange &CR, PreferredRangeType Type=Smallest) const
Return the range that results from the intersection of this range with another range.
This is an important base class in LLVM.
Definition Constant.h:43
static LLVM_ABI Constant * getNullValue(Type *Ty)
Constructor to create a '0' constant of arbitrary type.
LLVM_ABI bool isNullValue() const
Return true if this is the value that would be returned by getNullValue.
Definition Constants.cpp:74
TypeSize getTypeSizeInBits(Type *Ty) const
Size examples:
Definition DataLayout.h:784
LLVM_ABI bool dominates(const BasicBlock *BB, const Use &U) const
Return true if the (end of the) basic block BB dominates the use U.
This class represents an extension of floating point types.
Utility class for floating point operations which can have information about relaxed accuracy require...
Definition Operator.h:200
FastMathFlags getFastMathFlags() const
Convenience function for getting all the fast-math flags.
Definition Operator.h:333
bool hasApproxFunc() const
Test if this operation allows approximations of math library functions or intrinsics.
Definition Operator.h:328
LLVM_ABI float getFPAccuracy() const
Get the maximum error permitted by this operation in ULPs.
Convenience struct for specifying and reasoning about fast-math flags.
Definition FMF.h:23
bool allowContract() const
Definition FMF.h:72
static LLVM_ABI FixedVectorType * get(Type *ElementType, unsigned NumElts)
Definition Type.cpp:873
bool simplifyDemandedLaneMaskArg(InstCombiner &IC, IntrinsicInst &II, unsigned LaneAgIdx) const
Simplify a lane index operand (e.g.
std::optional< Instruction * > instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const override
Instruction * hoistLaneIntrinsicThroughOperand(InstCombiner &IC, IntrinsicInst &II) const
std::optional< Value * > simplifyDemandedVectorEltsIntrinsic(InstCombiner &IC, IntrinsicInst &II, APInt DemandedElts, APInt &UndefElts, APInt &UndefElts2, APInt &UndefElts3, std::function< void(Instruction *, unsigned, APInt, APInt &)> SimplifyAndSetOp) const override
KnownIEEEMode fpenvIEEEMode(const Instruction &I) const
Return KnownIEEEMode::On if we know if the use context can assume "amdgpu-ieee"="true" and KnownIEEEM...
Value * simplifyAMDGCNLaneIntrinsicDemanded(InstCombiner &IC, IntrinsicInst &II, const APInt &DemandedElts, APInt &UndefElts) const
bool canSimplifyLegacyMulToMul(const Instruction &I, const Value *Op0, const Value *Op1, InstCombiner &IC) const
Common base class shared among various IRBuilders.
Definition IRBuilder.h:114
CallInst * CreateExtractVector(Type *DstType, Value *SrcVec, Value *Idx, const Twine &Name="")
Create a call to the vector.extract intrinsic.
Definition IRBuilder.h:1120
Value * CreateInsertElement(Type *VecTy, Value *NewElt, Value *Idx, const Twine &Name="")
Definition IRBuilder.h:2584
Value * CreateExtractElement(Value *Vec, Value *Idx, const Twine &Name="")
Definition IRBuilder.h:2572
IntegerType * getIntNTy(unsigned N)
Fetch the type representing an N-bit integer.
Definition IRBuilder.h:592
Value * CreateSExt(Value *V, Type *DestTy, const Twine &Name="")
Definition IRBuilder.h:2089
Value * CreateLShr(Value *LHS, Value *RHS, const Twine &Name="", bool isExact=false)
Definition IRBuilder.h:1539
BasicBlock * GetInsertBlock() const
Definition IRBuilder.h:201
Value * CreateICmpNE(Value *LHS, Value *RHS, const Twine &Name="")
Definition IRBuilder.h:2335
ConstantInt * getInt64(uint64_t C)
Get a constant 64-bit value.
Definition IRBuilder.h:527
LLVM_ABI CallInst * CreateIntrinsic(Intrinsic::ID ID, ArrayRef< Type * > Types, ArrayRef< Value * > Args, FMFSource FMFSource={}, const Twine &Name="")
Create a call to intrinsic ID with Args, mangled using Types.
Value * CreateMaxNum(Value *LHS, Value *RHS, FMFSource FMFSource={}, const Twine &Name="")
Create call to the maxnum intrinsic.
Definition IRBuilder.h:1051
Value * CreateShl(Value *LHS, Value *RHS, const Twine &Name="", bool HasNUW=false, bool HasNSW=false)
Definition IRBuilder.h:1518
Value * CreateZExt(Value *V, Type *DestTy, const Twine &Name="", bool IsNonNeg=false)
Definition IRBuilder.h:2077
Value * CreateShuffleVector(Value *V1, Value *V2, Value *Mask, const Twine &Name="")
Definition IRBuilder.h:2606
Value * CreateMaximumNum(Value *LHS, Value *RHS, const Twine &Name="")
Create call to the maximumnum intrinsic.
Definition IRBuilder.h:1079
Value * CreateMinNum(Value *LHS, Value *RHS, FMFSource FMFSource={}, const Twine &Name="")
Create call to the minnum intrinsic.
Definition IRBuilder.h:1039
Value * CreateAdd(Value *LHS, Value *RHS, const Twine &Name="", bool HasNUW=false, bool HasNSW=false)
Definition IRBuilder.h:1429
CallInst * CreateCall(FunctionType *FTy, Value *Callee, ArrayRef< Value * > Args={}, const Twine &Name="", MDNode *FPMathTag=nullptr)
Definition IRBuilder.h:2510
void SetInsertPoint(BasicBlock *TheBB)
This specifies that created instructions should be appended to the end of the specified block.
Definition IRBuilder.h:207
Value * CreateFAddFMF(Value *L, Value *R, FMFSource FMFSource, const Twine &Name="", MDNode *FPMD=nullptr)
Definition IRBuilder.h:1644
Value * CreateMinimumNum(Value *LHS, Value *RHS, const Twine &Name="")
Create call to the minimumnum intrinsic.
Definition IRBuilder.h:1073
Value * CreateAShr(Value *LHS, Value *RHS, const Twine &Name="", bool isExact=false)
Definition IRBuilder.h:1558
Value * CreateFMulFMF(Value *L, Value *R, FMFSource FMFSource, const Twine &Name="", MDNode *FPMD=nullptr)
Definition IRBuilder.h:1682
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
Definition IRBuilder.h:2811
The core instruction combiner logic.
const DataLayout & getDataLayout() const
virtual Instruction * eraseInstFromFunction(Instruction &I)=0
Combiner aware instruction erasure.
IRBuilder< TargetFolder, IRBuilderCallbackInserter > BuilderTy
An IRBuilder that automatically inserts new instructions into the worklist.
DominatorTree & getDominatorTree() const
Instruction * replaceInstUsesWith(Instruction &I, Value *V)
A combiner-aware RAUW-like routine.
virtual bool SimplifyDemandedBits(Instruction *I, unsigned OpNo, const APInt &DemandedMask, KnownBits &Known, const SimplifyQuery &Q, unsigned Depth=0)=0
static Value * stripSignOnlyFPOps(Value *Val)
Ignore all operations which only change the sign of a value, returning the underlying magnitude value...
Instruction * replaceOperand(Instruction &I, unsigned OpNum, Value *V)
Replace operand of instruction and add old operand to the worklist.
BuilderTy & Builder
const SimplifyQuery & getSimplifyQuery() const
LLVM_ABI Instruction * clone() const
Create a copy of 'this' instruction that is identical in all ways except the following:
LLVM_ABI void copyFastMathFlags(FastMathFlags FMF)
Convenience function for transferring all fast-math flag values to this instruction,...
LLVM_ABI void copyMetadata(const Instruction &SrcInst, ArrayRef< unsigned > WL=ArrayRef< unsigned >())
Copy metadata from SrcInst to this instruction.
Class to represent integer types.
A wrapper class for inspecting calls to intrinsic functions.
Metadata node.
Definition Metadata.h:1080
static MDTuple * get(LLVMContext &Context, ArrayRef< Metadata * > MDs)
Definition Metadata.h:1572
static LLVM_ABI MDString * get(LLVMContext &Context, StringRef Str)
Definition Metadata.cpp:614
static LLVM_ABI MetadataAsValue * get(LLVMContext &Context, Metadata *MD)
Definition Metadata.cpp:110
Root of the metadata hierarchy.
Definition Metadata.h:64
A Module instance is used to store all the information related to an LLVM module.
Definition Module.h:67
static LLVM_ABI PoisonValue * get(Type *T)
Static factory methods - Return a 'poison' object of the specified type.
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
reference emplace_back(ArgTypes &&... Args)
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
The instances of the Type class are immutable: once they are created, they are never changed.
Definition Type.h:46
bool isFloatTy() const
Return true if this is 'float', a 32-bit IEEE fp type.
Definition Type.h:155
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
Definition Type.h:370
static LLVM_ABI IntegerType * getInt16Ty(LLVMContext &C)
Definition Type.cpp:312
bool isHalfTy() const
Return true if this is 'half', a 16-bit IEEE fp type.
Definition Type.h:144
LLVM_ABI Type * getWithNewType(Type *EltTy) const
Given vector type, change the element type, whilst keeping the old number of elements.
bool isFloatingPointTy() const
Return true if this is one of the floating-point types.
Definition Type.h:186
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition Type.h:257
static LLVM_ABI Type * getHalfTy(LLVMContext &C)
Definition Type.cpp:288
bool isVoidTy() const
Return true if this is 'void'.
Definition Type.h:141
static LLVM_ABI UndefValue * get(Type *T)
Static factory methods - Return an 'undef' object of the specified type.
A Use represents the edge between a Value definition and its users.
Definition Use.h:35
const Use & getOperandUse(unsigned i) const
Definition User.h:220
void setOperand(unsigned i, Value *Val)
Definition User.h:212
Value * getOperand(unsigned i) const
Definition User.h:207
LLVM Value Representation.
Definition Value.h:75
Type * getType() const
All values are typed, get the type of this value.
Definition Value.h:255
LLVM_ABI bool hasOneUser() const
Return true if there is exactly one user of this value.
Definition Value.cpp:162
LLVMContext & getContext() const
All values hold a context through their type.
Definition Value.h:258
LLVM_ABI void takeName(Value *V)
Transfer the name from V to this value.
Definition Value.cpp:399
const ParentTy * getParent() const
Definition ilist_node.h:34
CallInst * Call
Changed
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
LLVM_READONLY const MIMGOffsetMappingInfo * getMIMGOffsetMappingInfo(unsigned Offset)
uint8_t wmmaScaleF8F6F4FormatToNumRegs(unsigned Fmt)
const ImageDimIntrinsicInfo * getImageDimIntrinsicByBaseOpcode(unsigned BaseOpcode, unsigned Dim)
LLVM_READONLY const MIMGMIPMappingInfo * getMIMGMIPMappingInfo(unsigned MIP)
bool isArgPassedInSGPR(const Argument *A)
bool isIntrinsicAlwaysUniform(unsigned IntrID)
LLVM_READONLY const MIMGBiasMappingInfo * getMIMGBiasMappingInfo(unsigned Bias)
LLVM_READONLY const MIMGLZMappingInfo * getMIMGLZMappingInfo(unsigned L)
LLVM_READONLY const MIMGBaseOpcodeInfo * getMIMGBaseOpcodeInfo(unsigned BaseOpcode)
const ImageDimIntrinsicInfo * getImageDimIntrinsicInfo(unsigned Intr)
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
LLVM_ABI Function * getOrInsertDeclaration(Module *M, ID id, ArrayRef< Type * > OverloadTys={})
Look up the Function declaration of the intrinsic id in the Module M.
LLVM_ABI bool getIntrinsicSignature(Intrinsic::ID, FunctionType *FT, SmallVectorImpl< Type * > &OverloadTys)
Gets the type arguments of an intrinsic call by matching type constraints specified by the ....
OneUse_match< SubPat > m_OneUse(const SubPat &SP)
cst_pred_ty< is_all_ones > m_AllOnes()
Match an integer or vector with all bits set.
BinaryOp_match< LHS, RHS, Instruction::And > m_And(const LHS &L, const RHS &R)
auto m_Cmp()
Matches any compare instruction and ignore it.
bool match(Val *V, const Pattern &P)
cstfp_pred_ty< is_any_zero_fp > m_AnyZeroFP()
Match a floating-point negative zero or positive zero.
ap_match< APFloat > m_APFloat(const APFloat *&Res)
Match a ConstantFP or splatted ConstantVector, binding the specified pointer to the contained APFloat...
cst_pred_ty< is_one > m_One()
Match an integer 1 or a vector with all elements equal to 1.
IntrinsicID_match m_Intrinsic()
Match intrinsic calls like this: m_Intrinsic<Intrinsic::fabs>(m_Value(X))
auto m_Value()
Match an arbitrary value and ignore it.
CastInst_match< OpTy, FPExtInst > m_FPExt(const OpTy &Op)
CastInst_match< OpTy, ZExtInst > m_ZExt(const OpTy &Op)
Matches ZExt.
cstfp_pred_ty< is_finitenonzero > m_FiniteNonZero()
Match a finite non-zero FP constant.
match_combine_or< CastInst_match< OpTy, ZExtInst >, CastInst_match< OpTy, SExtInst > > m_ZExtOrSExt(const OpTy &Op)
auto m_ConstantFP()
Match an arbitrary ConstantFP and ignore it.
BinaryOp_match< LHS, RHS, Instruction::Or > m_Or(const LHS &L, const RHS &R)
CastInst_match< OpTy, SExtInst > m_SExt(const OpTy &Op)
Matches SExt.
is_zero m_Zero()
Match any null constant or a vector with all elements equal to 0.
auto m_ConstantInt()
Match an arbitrary ConstantInt and ignore it.
This is an optimization pass for GlobalISel generic memory operations.
@ Offset
Definition DWP.cpp:532
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:643
LLVM_ABI Constant * ConstantFoldCompareInstOperands(unsigned Predicate, Constant *LHS, Constant *RHS, const DataLayout &DL, const TargetLibraryInfo *TLI=nullptr, const Instruction *I=nullptr)
Attempt to constant fold a compare instruction (icmp/fcmp) with the specified operands.
constexpr int popcount(T Value) noexcept
Count the number of set bits in a value.
Definition bit.h:154
APFloat frexp(const APFloat &X, int &Exp, APFloat::roundingMode RM)
Equivalent of C standard library function.
Definition APFloat.h:1622
LLVM_ABI ConstantRange computeConstantRange(const Value *V, bool ForSigned, bool UseInstrInfo=true, AssumptionCache *AC=nullptr, const Instruction *CtxI=nullptr, const DominatorTree *DT=nullptr, unsigned Depth=0)
Determine the possible constant range of an integer or vector of integer value.
LLVM_READONLY APFloat maxnum(const APFloat &A, const APFloat &B)
Implements IEEE-754 2008 maxNum semantics.
Definition APFloat.h:1665
APFloat scalbn(APFloat X, int Exp, APFloat::roundingMode RM)
Returns: X * 2^Exp for integral exponents.
Definition APFloat.h:1610
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition Casting.h:547
constexpr int PoisonMaskElem
@ FMul
Product of floats.
@ FAdd
Sum of floats.
LLVM_ABI Value * findScalarElement(Value *V, unsigned EltNo)
Given a vector and an element number, see if the scalar value is already around as a register,...
@ NearestTiesToEven
roundTiesToEven.
LLVM_ABI bool isKnownNeverInfOrNaN(const Value *V, const SimplifyQuery &SQ, unsigned Depth=0)
Return true if the floating-point value can never contain a NaN or infinity.
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:559
constexpr T maskTrailingOnes(unsigned N)
Create a bitmask with the N right-most bits set to 1, and all other bits set to 0.
Definition MathExtras.h:77
constexpr uint64_t Make_64(uint32_t High, uint32_t Low)
Make a 64-bit integer from a high / low pair of 32-bit integers.
Definition MathExtras.h:160
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition BitVector.h:872
bool isConstant() const
Returns true if we know the value of all bits.
Definition KnownBits.h:54
const APInt & getConstant() const
Returns the value when all bits have a known value.
Definition KnownBits.h:60
SimplifyQuery getWithInstruction(const Instruction *I) const
LLVM_ABI bool isUndefValue(Value *V) const
If CanUseUndef is true, returns whether V is undef.