//===- AMDGPUInstCombineIntrinsic.cpp - AMDGPU specific InstCombine pass --===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// \file
10// This file implements a TargetTransformInfo analysis pass specific to the
11// AMDGPU target machine. It uses the target's detailed information to provide
12// more precise answers to certain TTI queries, while letting the target
13// independent and default TTI implementations handle the rest.
14//
15//===----------------------------------------------------------------------===//
16
#include "AMDGPUInstrInfo.h"
#include "AMDGPUTargetTransformInfo.h"
#include "GCNSubtarget.h"
#include "llvm/ADT/FloatingPointMode.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/Transforms/InstCombine/InstCombiner.h"
#include <optional>
24
25using namespace llvm;
26
27#define DEBUG_TYPE "AMDGPUtti"
28
29namespace {
30
31struct AMDGPUImageDMaskIntrinsic {
32 unsigned Intr;
33};
34
35#define GET_AMDGPUImageDMaskIntrinsicTable_IMPL
36#include "InstCombineTables.inc"
37
38} // end anonymous namespace
39
40// Constant fold llvm.amdgcn.fmed3 intrinsics for standard inputs.
41//
42// A single NaN input is folded to minnum, so we rely on that folding for
43// handling NaNs.
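// Illustrative example (editorial note, not part of the original source): for
// three ordinary constants, fmed3AMDGCN(1.0, 4.0, 2.0) returns 2.0, the median.
// A NaN input never reaches this helper, because the caller folds a single-NaN
// fmed3 to minnum of the other two operands first.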
44static APFloat fmed3AMDGCN(const APFloat &Src0, const APFloat &Src1,
45 const APFloat &Src2) {
46 APFloat Max3 = maxnum(maxnum(Src0, Src1), Src2);
47
48 APFloat::cmpResult Cmp0 = Max3.compare(Src0);
49 assert(Cmp0 != APFloat::cmpUnordered && "nans handled separately");
50 if (Cmp0 == APFloat::cmpEqual)
51 return maxnum(Src1, Src2);
52
53 APFloat::cmpResult Cmp1 = Max3.compare(Src1);
54 assert(Cmp1 != APFloat::cmpUnordered && "nans handled separately");
55 if (Cmp1 == APFloat::cmpEqual)
56 return maxnum(Src0, Src2);
57
58 return maxnum(Src0, Src1);
59}
60
61// Check if a value can be converted to a 16-bit value without losing
62// precision.
63// The value is expected to be either a float (IsFloat = true) or an unsigned
64// integer (IsFloat = false).
65static bool canSafelyConvertTo16Bit(Value &V, bool IsFloat) {
66 Type *VTy = V.getType();
67 if (VTy->isHalfTy() || VTy->isIntegerTy(16)) {
68 // The value is already 16-bit, so we don't want to convert to 16-bit again!
69 return false;
70 }
71 if (IsFloat) {
72 if (ConstantFP *ConstFloat = dyn_cast<ConstantFP>(&V)) {
73 // We need to check that if we cast the index down to a half, we do not
74 // lose precision.
75 APFloat FloatValue(ConstFloat->getValueAPF());
76 bool LosesInfo = true;
77 FloatValue.convert(APFloat::IEEEhalf(), APFloat::rmTowardZero,
78 &LosesInfo);
79 return !LosesInfo;
80 }
81 } else {
82 if (ConstantInt *ConstInt = dyn_cast<ConstantInt>(&V)) {
83 // We need to check that if we cast the index down to an i16, we do not
84 // lose precision.
85 APInt IntValue(ConstInt->getValue());
86 return IntValue.getActiveBits() <= 16;
87 }
88 }
89
90 Value *CastSrc;
91 bool IsExt = IsFloat ? match(&V, m_FPExt(PatternMatch::m_Value(CastSrc)))
92 : match(&V, m_ZExt(PatternMatch::m_Value(CastSrc)));
93 if (IsExt) {
94 Type *CastSrcTy = CastSrc->getType();
95 if (CastSrcTy->isHalfTy() || CastSrcTy->isIntegerTy(16))
96 return true;
97 }
98
99 return false;
100}
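// Illustrative example (editorial note): a ConstantFP 2.5 converts to half
// exactly and is accepted, while 1.0e10 would lose information and is
// rejected; an i32 produced by 'zext i16 %x to i32' is accepted, but an
// arbitrary i32 is not. Values that are already 16-bit are rejected so they
// are not converted twice.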
101
102// Convert a value to 16-bit.
static Value *convertTo16Bit(Value &V, InstCombiner::BuilderTy &Builder) {
  Type *VTy = V.getType();
105 if (isa<FPExtInst>(&V) || isa<SExtInst>(&V) || isa<ZExtInst>(&V))
106 return cast<Instruction>(&V)->getOperand(0);
107 if (VTy->isIntegerTy())
108 return Builder.CreateIntCast(&V, Type::getInt16Ty(V.getContext()), false);
109 if (VTy->isFloatingPointTy())
110 return Builder.CreateFPCast(&V, Type::getHalfTy(V.getContext()));
111
112 llvm_unreachable("Should never be called!");
113}
114
115/// Applies Func(OldIntr.Args, OldIntr.ArgTys), creates intrinsic call with
116/// modified arguments (based on OldIntr) and replaces InstToReplace with
117/// this newly created intrinsic call.
118static std::optional<Instruction *> modifyIntrinsicCall(
119 IntrinsicInst &OldIntr, Instruction &InstToReplace, unsigned NewIntr,
120 InstCombiner &IC,
121 std::function<void(SmallVectorImpl<Value *> &, SmallVectorImpl<Type *> &)>
122 Func) {
  SmallVector<Type *, 4> ArgTys;
  if (!Intrinsic::getIntrinsicSignature(OldIntr.getCalledFunction(), ArgTys))
    return std::nullopt;
126
127 SmallVector<Value *, 8> Args(OldIntr.args());
128
129 // Modify arguments and types
130 Func(Args, ArgTys);
131
132 Function *I = Intrinsic::getDeclaration(OldIntr.getModule(), NewIntr, ArgTys);
133
134 CallInst *NewCall = IC.Builder.CreateCall(I, Args);
135 NewCall->takeName(&OldIntr);
136 NewCall->copyMetadata(OldIntr);
137 if (isa<FPMathOperator>(NewCall))
138 NewCall->copyFastMathFlags(&OldIntr);
139
140 // Erase and replace uses
141 if (!InstToReplace.getType()->isVoidTy())
142 IC.replaceInstUsesWith(InstToReplace, NewCall);
143
144 bool RemoveOldIntr = &OldIntr != &InstToReplace;
145
146 auto RetValue = IC.eraseInstFromFunction(InstToReplace);
147 if (RemoveOldIntr)
148 IC.eraseInstFromFunction(OldIntr);
149
150 return RetValue;
151}
152
153static std::optional<Instruction *>
simplifyAMDGCNImageIntrinsic(const GCNSubtarget *ST,
                             const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr,
156 IntrinsicInst &II, InstCombiner &IC) {
157 // Optimize _L to _LZ when _L is zero
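  // Editorial sketch of the rewrite, assuming a 2D sampled load:
  //   call @llvm.amdgcn.image.sample.l.2d(..., float 0.0 /*lod*/, ...)
  // becomes
  //   call @llvm.amdgcn.image.sample.lz.2d(...)
  // with the now-implicit LOD operand removed from the argument list.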
158 if (const auto *LZMappingInfo =
159 AMDGPU::getMIMGLZMappingInfo(ImageDimIntr->BaseOpcode)) {
160 if (auto *ConstantLod =
161 dyn_cast<ConstantFP>(II.getOperand(ImageDimIntr->LodIndex))) {
162 if (ConstantLod->isZero() || ConstantLod->isNegative()) {
163 const AMDGPU::ImageDimIntrinsicInfo *NewImageDimIntr =
164 AMDGPU::getImageDimIntrinsicByBaseOpcode(LZMappingInfo->LZ,
165 ImageDimIntr->Dim);
166 return modifyIntrinsicCall(
167 II, II, NewImageDimIntr->Intr, IC, [&](auto &Args, auto &ArgTys) {
168 Args.erase(Args.begin() + ImageDimIntr->LodIndex);
169 });
170 }
171 }
172 }
173
  // Optimize _mip away when 'lod' is zero
175 if (const auto *MIPMappingInfo =
176 AMDGPU::getMIMGMIPMappingInfo(ImageDimIntr->BaseOpcode)) {
177 if (auto *ConstantMip =
178 dyn_cast<ConstantInt>(II.getOperand(ImageDimIntr->MipIndex))) {
179 if (ConstantMip->isZero()) {
180 const AMDGPU::ImageDimIntrinsicInfo *NewImageDimIntr =
181 AMDGPU::getImageDimIntrinsicByBaseOpcode(MIPMappingInfo->NONMIP,
182 ImageDimIntr->Dim);
183 return modifyIntrinsicCall(
184 II, II, NewImageDimIntr->Intr, IC, [&](auto &Args, auto &ArgTys) {
185 Args.erase(Args.begin() + ImageDimIntr->MipIndex);
186 });
187 }
188 }
189 }
190
191 // Optimize _bias away when 'bias' is zero
192 if (const auto *BiasMappingInfo =
193 AMDGPU::getMIMGBiasMappingInfo(ImageDimIntr->BaseOpcode)) {
194 if (auto *ConstantBias =
195 dyn_cast<ConstantFP>(II.getOperand(ImageDimIntr->BiasIndex))) {
196 if (ConstantBias->isZero()) {
197 const AMDGPU::ImageDimIntrinsicInfo *NewImageDimIntr =
198 AMDGPU::getImageDimIntrinsicByBaseOpcode(BiasMappingInfo->NoBias,
199 ImageDimIntr->Dim);
200 return modifyIntrinsicCall(
201 II, II, NewImageDimIntr->Intr, IC, [&](auto &Args, auto &ArgTys) {
202 Args.erase(Args.begin() + ImageDimIntr->BiasIndex);
203 ArgTys.erase(ArgTys.begin() + ImageDimIntr->BiasTyArg);
204 });
205 }
206 }
207 }
208
209 // Optimize _offset away when 'offset' is zero
210 if (const auto *OffsetMappingInfo =
211 AMDGPU::getMIMGOffsetMappingInfo(ImageDimIntr->BaseOpcode)) {
212 if (auto *ConstantOffset =
213 dyn_cast<ConstantInt>(II.getOperand(ImageDimIntr->OffsetIndex))) {
214 if (ConstantOffset->isZero()) {
215 const AMDGPU::ImageDimIntrinsicInfo *NewImageDimIntr =
216 AMDGPU::getImageDimIntrinsicByBaseOpcode(
217 OffsetMappingInfo->NoOffset, ImageDimIntr->Dim);
218 return modifyIntrinsicCall(
219 II, II, NewImageDimIntr->Intr, IC, [&](auto &Args, auto &ArgTys) {
220 Args.erase(Args.begin() + ImageDimIntr->OffsetIndex);
221 });
222 }
223 }
224 }
225
226 // Try to use D16
227 if (ST->hasD16Images()) {
228
229 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
230 AMDGPU::getMIMGBaseOpcodeInfo(ImageDimIntr->BaseOpcode);
231
232 if (BaseOpcode->HasD16) {
233
      // If the only use of the image intrinsic is an fptrunc to half, replace
      // both the fptrunc and the image intrinsic with a single image intrinsic
      // that carries the D16 flag.
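      // Editorial sketch of the rewrite:
      //   %v = call float @llvm.amdgcn.image.sample...(...)
      //   %h = fptrunc float %v to half
      // is replaced by one image intrinsic call that returns half directly,
      // which selects to the D16 form of the instruction.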
237 if (II.hasOneUse()) {
238 Instruction *User = II.user_back();
239
        if (User->getOpcode() == Instruction::FPTrunc &&
            User->getType()->getScalarType()->isHalfTy()) {
242
243 return modifyIntrinsicCall(II, *User, ImageDimIntr->Intr, IC,
244 [&](auto &Args, auto &ArgTys) {
245 // Change return type of image intrinsic.
246 // Set it to return type of fptrunc.
247 ArgTys[0] = User->getType();
248 });
249 }
250 }
251 }
252 }
253
254 // Try to use A16 or G16
255 if (!ST->hasA16() && !ST->hasG16())
256 return std::nullopt;
257
258 // Address is interpreted as float if the instruction has a sampler or as
259 // unsigned int if there is no sampler.
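  // Editorial example: if every coordinate is an 'fpext half %x to float', the
  // coordinates can be rewritten as half and the intrinsic selected in its A16
  // form; when only the derivatives qualify and the target has G16, just the
  // gradient operands are narrowed.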
260 bool HasSampler =
261 AMDGPU::getMIMGBaseOpcodeInfo(ImageDimIntr->BaseOpcode)->Sampler;
262 bool FloatCoord = false;
263 // true means derivatives can be converted to 16 bit, coordinates not
264 bool OnlyDerivatives = false;
265
266 for (unsigned OperandIndex = ImageDimIntr->GradientStart;
267 OperandIndex < ImageDimIntr->VAddrEnd; OperandIndex++) {
268 Value *Coord = II.getOperand(OperandIndex);
269 // If the values are not derived from 16-bit values, we cannot optimize.
270 if (!canSafelyConvertTo16Bit(*Coord, HasSampler)) {
271 if (OperandIndex < ImageDimIntr->CoordStart ||
272 ImageDimIntr->GradientStart == ImageDimIntr->CoordStart) {
273 return std::nullopt;
274 }
275 // All gradients can be converted, so convert only them
276 OnlyDerivatives = true;
277 break;
278 }
279
280 assert(OperandIndex == ImageDimIntr->GradientStart ||
281 FloatCoord == Coord->getType()->isFloatingPointTy());
282 FloatCoord = Coord->getType()->isFloatingPointTy();
283 }
284
285 if (!OnlyDerivatives && !ST->hasA16())
286 OnlyDerivatives = true; // Only supports G16
287
288 // Check if there is a bias parameter and if it can be converted to f16
289 if (!OnlyDerivatives && ImageDimIntr->NumBiasArgs != 0) {
290 Value *Bias = II.getOperand(ImageDimIntr->BiasIndex);
291 assert(HasSampler &&
292 "Only image instructions with a sampler can have a bias");
293 if (!canSafelyConvertTo16Bit(*Bias, HasSampler))
294 OnlyDerivatives = true;
295 }
296
297 if (OnlyDerivatives && (!ST->hasG16() || ImageDimIntr->GradientStart ==
298 ImageDimIntr->CoordStart))
299 return std::nullopt;
300
  Type *CoordType = FloatCoord ? Type::getHalfTy(II.getContext())
                               : Type::getInt16Ty(II.getContext());
303
304 return modifyIntrinsicCall(
305 II, II, II.getIntrinsicID(), IC, [&](auto &Args, auto &ArgTys) {
306 ArgTys[ImageDimIntr->GradientTyArg] = CoordType;
307 if (!OnlyDerivatives) {
308 ArgTys[ImageDimIntr->CoordTyArg] = CoordType;
309
310 // Change the bias type
311 if (ImageDimIntr->NumBiasArgs != 0)
312 ArgTys[ImageDimIntr->BiasTyArg] = Type::getHalfTy(II.getContext());
313 }
314
315 unsigned EndIndex =
316 OnlyDerivatives ? ImageDimIntr->CoordStart : ImageDimIntr->VAddrEnd;
317 for (unsigned OperandIndex = ImageDimIntr->GradientStart;
318 OperandIndex < EndIndex; OperandIndex++) {
319 Args[OperandIndex] =
320 convertTo16Bit(*II.getOperand(OperandIndex), IC.Builder);
321 }
322
323 // Convert the bias
324 if (!OnlyDerivatives && ImageDimIntr->NumBiasArgs != 0) {
325 Value *Bias = II.getOperand(ImageDimIntr->BiasIndex);
326 Args[ImageDimIntr->BiasIndex] = convertTo16Bit(*Bias, IC.Builder);
327 }
328 });
329}
330
bool GCNTTIImpl::canSimplifyLegacyMulToMul(const Value *Op0, const Value *Op1,
                                           InstCombiner &IC) const {
333 // The legacy behaviour is that multiplying +/-0.0 by anything, even NaN or
334 // infinity, gives +0.0. If we can prove we don't have one of the special
335 // cases then we can use a normal multiply instead.
336 // TODO: Create and use isKnownFiniteNonZero instead of just matching
  // constants here.
  if (match(Op0, PatternMatch::m_FiniteNonZero()) ||
      match(Op1, PatternMatch::m_FiniteNonZero())) {
    // One operand is not zero or infinity or NaN.
341 return true;
342 }
343 auto *TLI = &IC.getTargetLibraryInfo();
344 if (isKnownNeverInfinity(Op0, TLI) && isKnownNeverNaN(Op0, TLI) &&
345 isKnownNeverInfinity(Op1, TLI) && isKnownNeverNaN(Op1, TLI)) {
346 // Neither operand is infinity or NaN.
347 return true;
348 }
349 return false;
350}
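// Editorial example: fmul_legacy(2.0, %x) may become a plain fmul because 2.0
// is finite and non-zero, while fmul_legacy(%x, %y) with two unknown operands
// has to keep the legacy semantics unless both operands can be proven to be
// neither NaN nor infinity.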
351
352std::optional<Instruction *>
GCNTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
  Intrinsic::ID IID = II.getIntrinsicID();
355 switch (IID) {
356 case Intrinsic::amdgcn_rcp: {
357 Value *Src = II.getArgOperand(0);
358
359 // TODO: Move to ConstantFolding/InstSimplify?
360 if (isa<UndefValue>(Src)) {
361 Type *Ty = II.getType();
362 auto *QNaN = ConstantFP::get(Ty, APFloat::getQNaN(Ty->getFltSemantics()));
363 return IC.replaceInstUsesWith(II, QNaN);
364 }
365
366 if (II.isStrictFP())
367 break;
368
369 if (const ConstantFP *C = dyn_cast<ConstantFP>(Src)) {
370 const APFloat &ArgVal = C->getValueAPF();
      APFloat Val(ArgVal.getSemantics(), 1);
      Val.divide(ArgVal, APFloat::rmNearestTiesToEven);
373
374 // This is more precise than the instruction may give.
375 //
376 // TODO: The instruction always flushes denormal results (except for f16),
377 // should this also?
378 return IC.replaceInstUsesWith(II, ConstantFP::get(II.getContext(), Val));
379 }
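    // Editorial example: llvm.amdgcn.rcp(float 2.0) folds to 0.5 here; an
    // undef operand was already folded to a quiet NaN above.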
380
381 break;
382 }
383 case Intrinsic::amdgcn_sqrt:
384 case Intrinsic::amdgcn_rsq: {
385 Value *Src = II.getArgOperand(0);
386
387 // TODO: Move to ConstantFolding/InstSimplify?
388 if (isa<UndefValue>(Src)) {
389 Type *Ty = II.getType();
390 auto *QNaN = ConstantFP::get(Ty, APFloat::getQNaN(Ty->getFltSemantics()));
391 return IC.replaceInstUsesWith(II, QNaN);
392 }
393
394 break;
395 }
396 case Intrinsic::amdgcn_frexp_mant:
397 case Intrinsic::amdgcn_frexp_exp: {
398 Value *Src = II.getArgOperand(0);
399 if (const ConstantFP *C = dyn_cast<ConstantFP>(Src)) {
400 int Exp;
401 APFloat Significand =
402 frexp(C->getValueAPF(), Exp, APFloat::rmNearestTiesToEven);
403
404 if (IID == Intrinsic::amdgcn_frexp_mant) {
405 return IC.replaceInstUsesWith(
406 II, ConstantFP::get(II.getContext(), Significand));
407 }
408
409 // Match instruction special case behavior.
410 if (Exp == APFloat::IEK_NaN || Exp == APFloat::IEK_Inf)
411 Exp = 0;
412
413 return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), Exp));
414 }
415
416 if (isa<UndefValue>(Src)) {
417 return IC.replaceInstUsesWith(II, UndefValue::get(II.getType()));
418 }
419
420 break;
421 }
422 case Intrinsic::amdgcn_class: {
423 Value *Src0 = II.getArgOperand(0);
424 Value *Src1 = II.getArgOperand(1);
425 const ConstantInt *CMask = dyn_cast<ConstantInt>(Src1);
426 if (!CMask) {
427 if (isa<UndefValue>(Src0)) {
428 return IC.replaceInstUsesWith(II, UndefValue::get(II.getType()));
429 }
430
431 if (isa<UndefValue>(Src1)) {
432 return IC.replaceInstUsesWith(II,
433 ConstantInt::get(II.getType(), false));
434 }
435 break;
436 }
437
438 uint32_t Mask = CMask->getZExtValue();
439
    // If every class bit is tested, the result is true regardless of the value.
441 if ((Mask & fcAllFlags) == fcAllFlags) {
442 return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), true));
443 }
444
445 if ((Mask & fcAllFlags) == 0) {
446 return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), false));
447 }
448
449 if (Mask == fcNan && !II.isStrictFP()) {
450 // Equivalent of isnan. Replace with standard fcmp.
451 Value *FCmp = IC.Builder.CreateFCmpUNO(Src0, Src0);
452 FCmp->takeName(&II);
453 return IC.replaceInstUsesWith(II, FCmp);
454 }
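    // Editorial example: llvm.amdgcn.class(float %x, i32 3) tests only
    // fcSNan|fcQNan, so it is rewritten to 'fcmp uno float %x, %x'.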
455
456 if (Mask == fcZero && !II.isStrictFP()) {
457 // Equivalent of == 0.
458 Value *FCmp =
459 IC.Builder.CreateFCmpOEQ(Src0, ConstantFP::get(Src0->getType(), 0.0));
460
461 FCmp->takeName(&II);
462 return IC.replaceInstUsesWith(II, FCmp);
463 }
464
465 // fp_class (nnan x), qnan|snan|other -> fp_class (nnan x), other
466 if ((Mask & fcNan) && isKnownNeverNaN(Src0, &IC.getTargetLibraryInfo())) {
467 return IC.replaceOperand(
468 II, 1, ConstantInt::get(Src1->getType(), Mask & ~fcNan));
469 }
470
471 const ConstantFP *CVal = dyn_cast<ConstantFP>(Src0);
472 if (!CVal) {
473 if (isa<UndefValue>(Src0)) {
474 return IC.replaceInstUsesWith(II, UndefValue::get(II.getType()));
475 }
476
477 // Clamp mask to used bits
478 if ((Mask & fcAllFlags) != Mask) {
479 CallInst *NewCall = IC.Builder.CreateCall(
            II.getCalledFunction(),
            {Src0, ConstantInt::get(Src1->getType(), Mask & fcAllFlags)});
482
483 NewCall->takeName(&II);
484 return IC.replaceInstUsesWith(II, NewCall);
485 }
486
487 break;
488 }
489
490 const APFloat &Val = CVal->getValueAPF();
491
492 bool Result =
493 ((Mask & fcSNan) && Val.isNaN() && Val.isSignaling()) ||
494 ((Mask & fcQNan) && Val.isNaN() && !Val.isSignaling()) ||
495 ((Mask & fcNegInf) && Val.isInfinity() && Val.isNegative()) ||
496 ((Mask & fcNegNormal) && Val.isNormal() && Val.isNegative()) ||
497 ((Mask & fcNegSubnormal) && Val.isDenormal() && Val.isNegative()) ||
498 ((Mask & fcNegZero) && Val.isZero() && Val.isNegative()) ||
499 ((Mask & fcPosZero) && Val.isZero() && !Val.isNegative()) ||
500 ((Mask & fcPosSubnormal) && Val.isDenormal() && !Val.isNegative()) ||
501 ((Mask & fcPosNormal) && Val.isNormal() && !Val.isNegative()) ||
502 ((Mask & fcPosInf) && Val.isInfinity() && !Val.isNegative());
503
504 return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), Result));
505 }
506 case Intrinsic::amdgcn_cvt_pkrtz: {
507 Value *Src0 = II.getArgOperand(0);
508 Value *Src1 = II.getArgOperand(1);
509 if (const ConstantFP *C0 = dyn_cast<ConstantFP>(Src0)) {
510 if (const ConstantFP *C1 = dyn_cast<ConstantFP>(Src1)) {
        const fltSemantics &HalfSem =
            II.getType()->getScalarType()->getFltSemantics();
513 bool LosesInfo;
514 APFloat Val0 = C0->getValueAPF();
515 APFloat Val1 = C1->getValueAPF();
516 Val0.convert(HalfSem, APFloat::rmTowardZero, &LosesInfo);
517 Val1.convert(HalfSem, APFloat::rmTowardZero, &LosesInfo);
518
        Constant *Folded =
            ConstantVector::get({ConstantFP::get(II.getContext(), Val0),
521 ConstantFP::get(II.getContext(), Val1)});
522 return IC.replaceInstUsesWith(II, Folded);
523 }
524 }
525
526 if (isa<UndefValue>(Src0) && isa<UndefValue>(Src1)) {
527 return IC.replaceInstUsesWith(II, UndefValue::get(II.getType()));
528 }
529
530 break;
531 }
532 case Intrinsic::amdgcn_cvt_pknorm_i16:
533 case Intrinsic::amdgcn_cvt_pknorm_u16:
534 case Intrinsic::amdgcn_cvt_pk_i16:
535 case Intrinsic::amdgcn_cvt_pk_u16: {
536 Value *Src0 = II.getArgOperand(0);
537 Value *Src1 = II.getArgOperand(1);
538
539 if (isa<UndefValue>(Src0) && isa<UndefValue>(Src1)) {
540 return IC.replaceInstUsesWith(II, UndefValue::get(II.getType()));
541 }
542
543 break;
544 }
545 case Intrinsic::amdgcn_ubfe:
546 case Intrinsic::amdgcn_sbfe: {
547 // Decompose simple cases into standard shifts.
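    // Editorial example: llvm.amdgcn.ubfe.i32(%x, 8, 16) extracts bits [8, 24)
    // and becomes 'shl %x, 8' followed by 'lshr ..., 16' below.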
548 Value *Src = II.getArgOperand(0);
549 if (isa<UndefValue>(Src)) {
550 return IC.replaceInstUsesWith(II, Src);
551 }
552
553 unsigned Width;
554 Type *Ty = II.getType();
555 unsigned IntSize = Ty->getIntegerBitWidth();
556
557 ConstantInt *CWidth = dyn_cast<ConstantInt>(II.getArgOperand(2));
558 if (CWidth) {
559 Width = CWidth->getZExtValue();
      if ((Width & (IntSize - 1)) == 0) {
        return IC.replaceInstUsesWith(II, ConstantInt::getNullValue(Ty));
562 }
563
564 // Hardware ignores high bits, so remove those.
565 if (Width >= IntSize) {
566 return IC.replaceOperand(
567 II, 2, ConstantInt::get(CWidth->getType(), Width & (IntSize - 1)));
568 }
569 }
570
571 unsigned Offset;
572 ConstantInt *COffset = dyn_cast<ConstantInt>(II.getArgOperand(1));
573 if (COffset) {
574 Offset = COffset->getZExtValue();
575 if (Offset >= IntSize) {
576 return IC.replaceOperand(
577 II, 1,
578 ConstantInt::get(COffset->getType(), Offset & (IntSize - 1)));
579 }
580 }
581
582 bool Signed = IID == Intrinsic::amdgcn_sbfe;
583
584 if (!CWidth || !COffset)
585 break;
586
    // The case of Width == 0 is handled above, which makes this transformation
    // safe. If Width == 0, the ashr and lshr below would produce poison values,
    // since the shift amount would equal the bit width.
590 assert(Width != 0);
591
592 // TODO: This allows folding to undef when the hardware has specific
593 // behavior?
594 if (Offset + Width < IntSize) {
595 Value *Shl = IC.Builder.CreateShl(Src, IntSize - Offset - Width);
596 Value *RightShift = Signed ? IC.Builder.CreateAShr(Shl, IntSize - Width)
597 : IC.Builder.CreateLShr(Shl, IntSize - Width);
598 RightShift->takeName(&II);
599 return IC.replaceInstUsesWith(II, RightShift);
600 }
601
602 Value *RightShift = Signed ? IC.Builder.CreateAShr(Src, Offset)
603 : IC.Builder.CreateLShr(Src, Offset);
604
605 RightShift->takeName(&II);
606 return IC.replaceInstUsesWith(II, RightShift);
607 }
608 case Intrinsic::amdgcn_exp:
609 case Intrinsic::amdgcn_exp_row:
610 case Intrinsic::amdgcn_exp_compr: {
611 ConstantInt *En = cast<ConstantInt>(II.getArgOperand(1));
612 unsigned EnBits = En->getZExtValue();
613 if (EnBits == 0xf)
614 break; // All inputs enabled.
615
616 bool IsCompr = IID == Intrinsic::amdgcn_exp_compr;
617 bool Changed = false;
618 for (int I = 0; I < (IsCompr ? 2 : 4); ++I) {
619 if ((!IsCompr && (EnBits & (1 << I)) == 0) ||
620 (IsCompr && ((EnBits & (0x3 << (2 * I))) == 0))) {
621 Value *Src = II.getArgOperand(I + 2);
622 if (!isa<UndefValue>(Src)) {
623 IC.replaceOperand(II, I + 2, UndefValue::get(Src->getType()));
624 Changed = true;
625 }
626 }
627 }
628
629 if (Changed) {
630 return &II;
631 }
632
633 break;
634 }
635 case Intrinsic::amdgcn_fmed3: {
636 // Note this does not preserve proper sNaN behavior if IEEE-mode is enabled
637 // for the shader.
638
639 Value *Src0 = II.getArgOperand(0);
640 Value *Src1 = II.getArgOperand(1);
641 Value *Src2 = II.getArgOperand(2);
642
643 // Checking for NaN before canonicalization provides better fidelity when
644 // mapping other operations onto fmed3 since the order of operands is
645 // unchanged.
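    // Editorial example: fmed3(NaN, %a, %b) is folded to minnum(%a, %b) below,
    // matching the single-NaN rule described above fmed3AMDGCN.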
646 CallInst *NewCall = nullptr;
647 if (match(Src0, PatternMatch::m_NaN()) || isa<UndefValue>(Src0)) {
648 NewCall = IC.Builder.CreateMinNum(Src1, Src2);
649 } else if (match(Src1, PatternMatch::m_NaN()) || isa<UndefValue>(Src1)) {
650 NewCall = IC.Builder.CreateMinNum(Src0, Src2);
651 } else if (match(Src2, PatternMatch::m_NaN()) || isa<UndefValue>(Src2)) {
652 NewCall = IC.Builder.CreateMaxNum(Src0, Src1);
653 }
654
655 if (NewCall) {
656 NewCall->copyFastMathFlags(&II);
657 NewCall->takeName(&II);
658 return IC.replaceInstUsesWith(II, NewCall);
659 }
660
661 bool Swap = false;
662 // Canonicalize constants to RHS operands.
663 //
664 // fmed3(c0, x, c1) -> fmed3(x, c0, c1)
665 if (isa<Constant>(Src0) && !isa<Constant>(Src1)) {
666 std::swap(Src0, Src1);
667 Swap = true;
668 }
669
670 if (isa<Constant>(Src1) && !isa<Constant>(Src2)) {
671 std::swap(Src1, Src2);
672 Swap = true;
673 }
674
675 if (isa<Constant>(Src0) && !isa<Constant>(Src1)) {
676 std::swap(Src0, Src1);
677 Swap = true;
678 }
679
680 if (Swap) {
681 II.setArgOperand(0, Src0);
682 II.setArgOperand(1, Src1);
683 II.setArgOperand(2, Src2);
684 return &II;
685 }
686
687 if (const ConstantFP *C0 = dyn_cast<ConstantFP>(Src0)) {
688 if (const ConstantFP *C1 = dyn_cast<ConstantFP>(Src1)) {
689 if (const ConstantFP *C2 = dyn_cast<ConstantFP>(Src2)) {
690 APFloat Result = fmed3AMDGCN(C0->getValueAPF(), C1->getValueAPF(),
691 C2->getValueAPF());
692 return IC.replaceInstUsesWith(
693 II, ConstantFP::get(IC.Builder.getContext(), Result));
694 }
695 }
696 }
697
698 break;
699 }
700 case Intrinsic::amdgcn_icmp:
701 case Intrinsic::amdgcn_fcmp: {
702 const ConstantInt *CC = cast<ConstantInt>(II.getArgOperand(2));
703 // Guard against invalid arguments.
704 int64_t CCVal = CC->getZExtValue();
705 bool IsInteger = IID == Intrinsic::amdgcn_icmp;
    if ((IsInteger && (CCVal < CmpInst::FIRST_ICMP_PREDICATE ||
                       CCVal > CmpInst::LAST_ICMP_PREDICATE)) ||
        (!IsInteger && (CCVal < CmpInst::FIRST_FCMP_PREDICATE ||
                        CCVal > CmpInst::LAST_FCMP_PREDICATE)))
      break;
711
712 Value *Src0 = II.getArgOperand(0);
713 Value *Src1 = II.getArgOperand(1);
714
715 if (auto *CSrc0 = dyn_cast<Constant>(Src0)) {
716 if (auto *CSrc1 = dyn_cast<Constant>(Src1)) {
717 Constant *CCmp = ConstantExpr::getCompare(CCVal, CSrc0, CSrc1);
718 if (CCmp->isNullValue()) {
719 return IC.replaceInstUsesWith(
720 II, ConstantExpr::getSExt(CCmp, II.getType()));
721 }
722
723 // The result of V_ICMP/V_FCMP assembly instructions (which this
724 // intrinsic exposes) is one bit per thread, masked with the EXEC
725 // register (which contains the bitmask of live threads). So a
726 // comparison that always returns true is the same as a read of the
727 // EXEC register.
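        // Editorial example: an icmp of two constants that is true in every
        // lane (e.g. 1 != 0) becomes a call of llvm.read_register with
        // metadata !"exec", i.e. the mask of currently live lanes.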
        Function *NewF = Intrinsic::getDeclaration(
            II.getModule(), Intrinsic::read_register, II.getType());
730 Metadata *MDArgs[] = {MDString::get(II.getContext(), "exec")};
731 MDNode *MD = MDNode::get(II.getContext(), MDArgs);
732 Value *Args[] = {MetadataAsValue::get(II.getContext(), MD)};
733 CallInst *NewCall = IC.Builder.CreateCall(NewF, Args);
734 NewCall->addFnAttr(Attribute::Convergent);
735 NewCall->takeName(&II);
736 return IC.replaceInstUsesWith(II, NewCall);
737 }
738
739 // Canonicalize constants to RHS.
      CmpInst::Predicate SwapPred =
          CmpInst::getSwappedPredicate(static_cast<CmpInst::Predicate>(CCVal));
742 II.setArgOperand(0, Src1);
743 II.setArgOperand(1, Src0);
744 II.setArgOperand(
745 2, ConstantInt::get(CC->getType(), static_cast<int>(SwapPred)));
746 return &II;
747 }
748
749 if (CCVal != CmpInst::ICMP_EQ && CCVal != CmpInst::ICMP_NE)
750 break;
751
752 // Canonicalize compare eq with true value to compare != 0
753 // llvm.amdgcn.icmp(zext (i1 x), 1, eq)
754 // -> llvm.amdgcn.icmp(zext (i1 x), 0, ne)
755 // llvm.amdgcn.icmp(sext (i1 x), -1, eq)
756 // -> llvm.amdgcn.icmp(sext (i1 x), 0, ne)
757 Value *ExtSrc;
758 if (CCVal == CmpInst::ICMP_EQ &&
759 ((match(Src1, PatternMatch::m_One()) &&
760 match(Src0, m_ZExt(PatternMatch::m_Value(ExtSrc)))) ||
761 (match(Src1, PatternMatch::m_AllOnes()) &&
762 match(Src0, m_SExt(PatternMatch::m_Value(ExtSrc))))) &&
        ExtSrc->getType()->isIntegerTy(1)) {
      IC.replaceOperand(II, 1, ConstantInt::getNullValue(Src1->getType()));
      IC.replaceOperand(II, 2,
                        ConstantInt::get(CC->getType(), CmpInst::ICMP_NE));
      return &II;
768 }
769
770 CmpInst::Predicate SrcPred;
771 Value *SrcLHS;
772 Value *SrcRHS;
773
774 // Fold compare eq/ne with 0 from a compare result as the predicate to the
775 // intrinsic. The typical use is a wave vote function in the library, which
776 // will be fed from a user code condition compared with 0. Fold in the
777 // redundant compare.
778
779 // llvm.amdgcn.icmp([sz]ext ([if]cmp pred a, b), 0, ne)
780 // -> llvm.amdgcn.[if]cmp(a, b, pred)
781 //
782 // llvm.amdgcn.icmp([sz]ext ([if]cmp pred a, b), 0, eq)
783 // -> llvm.amdgcn.[if]cmp(a, b, inv pred)
    if (match(Src1, PatternMatch::m_Zero()) &&
        match(Src0, m_ZExtOrSExt(
786 m_Cmp(SrcPred, PatternMatch::m_Value(SrcLHS),
787 PatternMatch::m_Value(SrcRHS))))) {
788 if (CCVal == CmpInst::ICMP_EQ)
789 SrcPred = CmpInst::getInversePredicate(SrcPred);
790
791 Intrinsic::ID NewIID = CmpInst::isFPPredicate(SrcPred)
792 ? Intrinsic::amdgcn_fcmp
793 : Intrinsic::amdgcn_icmp;
794
795 Type *Ty = SrcLHS->getType();
796 if (auto *CmpType = dyn_cast<IntegerType>(Ty)) {
797 // Promote to next legal integer type.
798 unsigned Width = CmpType->getBitWidth();
799 unsigned NewWidth = Width;
800
801 // Don't do anything for i1 comparisons.
802 if (Width == 1)
803 break;
804
805 if (Width <= 16)
806 NewWidth = 16;
807 else if (Width <= 32)
808 NewWidth = 32;
809 else if (Width <= 64)
810 NewWidth = 64;
811 else if (Width > 64)
812 break; // Can't handle this.
813
814 if (Width != NewWidth) {
815 IntegerType *CmpTy = IC.Builder.getIntNTy(NewWidth);
816 if (CmpInst::isSigned(SrcPred)) {
817 SrcLHS = IC.Builder.CreateSExt(SrcLHS, CmpTy);
818 SrcRHS = IC.Builder.CreateSExt(SrcRHS, CmpTy);
819 } else {
820 SrcLHS = IC.Builder.CreateZExt(SrcLHS, CmpTy);
821 SrcRHS = IC.Builder.CreateZExt(SrcRHS, CmpTy);
822 }
823 }
824 } else if (!Ty->isFloatTy() && !Ty->isDoubleTy() && !Ty->isHalfTy())
825 break;
826
    Function *NewF = Intrinsic::getDeclaration(
        II.getModule(), NewIID, {II.getType(), SrcLHS->getType()});
829 Value *Args[] = {SrcLHS, SrcRHS,
830 ConstantInt::get(CC->getType(), SrcPred)};
831 CallInst *NewCall = IC.Builder.CreateCall(NewF, Args);
832 NewCall->takeName(&II);
833 return IC.replaceInstUsesWith(II, NewCall);
834 }
835
836 break;
837 }
838 case Intrinsic::amdgcn_ballot: {
839 if (auto *Src = dyn_cast<ConstantInt>(II.getArgOperand(0))) {
840 if (Src->isZero()) {
        // amdgcn.ballot(i1 0) is zero.
        return IC.replaceInstUsesWith(II, Constant::getNullValue(II.getType()));
843 }
844
845 if (Src->isOne()) {
846 // amdgcn.ballot(i1 1) is exec.
847 const char *RegName = "exec";
848 if (II.getType()->isIntegerTy(32))
849 RegName = "exec_lo";
850 else if (!II.getType()->isIntegerTy(64))
851 break;
852
854 II.getModule(), Intrinsic::read_register, II.getType());
855 Metadata *MDArgs[] = {MDString::get(II.getContext(), RegName)};
856 MDNode *MD = MDNode::get(II.getContext(), MDArgs);
857 Value *Args[] = {MetadataAsValue::get(II.getContext(), MD)};
858 CallInst *NewCall = IC.Builder.CreateCall(NewF, Args);
859 NewCall->addFnAttr(Attribute::Convergent);
860 NewCall->takeName(&II);
861 return IC.replaceInstUsesWith(II, NewCall);
862 }
863 }
864 break;
865 }
866 case Intrinsic::amdgcn_wqm_vote: {
867 // wqm_vote is identity when the argument is constant.
868 if (!isa<Constant>(II.getArgOperand(0)))
869 break;
870
871 return IC.replaceInstUsesWith(II, II.getArgOperand(0));
872 }
873 case Intrinsic::amdgcn_kill: {
874 const ConstantInt *C = dyn_cast<ConstantInt>(II.getArgOperand(0));
875 if (!C || !C->getZExtValue())
876 break;
877
878 // amdgcn.kill(i1 1) is a no-op
879 return IC.eraseInstFromFunction(II);
880 }
881 case Intrinsic::amdgcn_update_dpp: {
882 Value *Old = II.getArgOperand(0);
883
884 auto *BC = cast<ConstantInt>(II.getArgOperand(5));
885 auto *RM = cast<ConstantInt>(II.getArgOperand(3));
886 auto *BM = cast<ConstantInt>(II.getArgOperand(4));
887 if (BC->isZeroValue() || RM->getZExtValue() != 0xF ||
888 BM->getZExtValue() != 0xF || isa<UndefValue>(Old))
889 break;
890
891 // If bound_ctrl = 1, row mask = bank mask = 0xf we can omit old value.
892 return IC.replaceOperand(II, 0, UndefValue::get(Old->getType()));
893 }
894 case Intrinsic::amdgcn_permlane16:
895 case Intrinsic::amdgcn_permlanex16: {
896 // Discard vdst_in if it's not going to be read.
897 Value *VDstIn = II.getArgOperand(0);
898 if (isa<UndefValue>(VDstIn))
899 break;
900
901 ConstantInt *FetchInvalid = cast<ConstantInt>(II.getArgOperand(4));
902 ConstantInt *BoundCtrl = cast<ConstantInt>(II.getArgOperand(5));
903 if (!FetchInvalid->getZExtValue() && !BoundCtrl->getZExtValue())
904 break;
905
906 return IC.replaceOperand(II, 0, UndefValue::get(VDstIn->getType()));
907 }
908 case Intrinsic::amdgcn_permlane64:
909 // A constant value is trivially uniform.
910 if (Constant *C = dyn_cast<Constant>(II.getArgOperand(0))) {
911 return IC.replaceInstUsesWith(II, C);
912 }
913 break;
914 case Intrinsic::amdgcn_readfirstlane:
915 case Intrinsic::amdgcn_readlane: {
916 // A constant value is trivially uniform.
917 if (Constant *C = dyn_cast<Constant>(II.getArgOperand(0))) {
918 return IC.replaceInstUsesWith(II, C);
919 }
920
    // The remaining folds may not be safe if EXEC can differ between the def
    // and the use.
923 Value *Src = II.getArgOperand(0);
924 Instruction *SrcInst = dyn_cast<Instruction>(Src);
925 if (SrcInst && SrcInst->getParent() != II.getParent())
926 break;
927
928 // readfirstlane (readfirstlane x) -> readfirstlane x
929 // readlane (readfirstlane x), y -> readfirstlane x
930 if (match(Src,
931 PatternMatch::m_Intrinsic<Intrinsic::amdgcn_readfirstlane>())) {
932 return IC.replaceInstUsesWith(II, Src);
933 }
934
935 if (IID == Intrinsic::amdgcn_readfirstlane) {
936 // readfirstlane (readlane x, y) -> readlane x, y
937 if (match(Src, PatternMatch::m_Intrinsic<Intrinsic::amdgcn_readlane>())) {
938 return IC.replaceInstUsesWith(II, Src);
939 }
940 } else {
941 // readlane (readlane x, y), y -> readlane x, y
      if (match(Src, PatternMatch::m_Intrinsic<Intrinsic::amdgcn_readlane>(
                         PatternMatch::m_Value(),
                         PatternMatch::m_Specific(II.getArgOperand(1))))) {
945 return IC.replaceInstUsesWith(II, Src);
946 }
947 }
948
949 break;
950 }
951 case Intrinsic::amdgcn_ldexp: {
952 // FIXME: This doesn't introduce new instructions and belongs in
953 // InstructionSimplify.
954 Type *Ty = II.getType();
955 Value *Op0 = II.getArgOperand(0);
956 Value *Op1 = II.getArgOperand(1);
957
958 // Folding undef to qnan is safe regardless of the FP mode.
959 if (isa<UndefValue>(Op0)) {
960 auto *QNaN = ConstantFP::get(Ty, APFloat::getQNaN(Ty->getFltSemantics()));
961 return IC.replaceInstUsesWith(II, QNaN);
962 }
963
    const APFloat *C = nullptr;
    match(Op0, PatternMatch::m_APFloat(C));
966
967 // FIXME: Should flush denorms depending on FP mode, but that's ignored
968 // everywhere else.
969 //
970 // These cases should be safe, even with strictfp.
971 // ldexp(0.0, x) -> 0.0
972 // ldexp(-0.0, x) -> -0.0
973 // ldexp(inf, x) -> inf
974 // ldexp(-inf, x) -> -inf
975 if (C && (C->isZero() || C->isInfinity())) {
976 return IC.replaceInstUsesWith(II, Op0);
977 }
978
979 // With strictfp, be more careful about possibly needing to flush denormals
980 // or not, and snan behavior depends on ieee_mode.
981 if (II.isStrictFP())
982 break;
983
984 if (C && C->isNaN())
985 return IC.replaceInstUsesWith(II, ConstantFP::get(Ty, C->makeQuiet()));
986
987 // ldexp(x, 0) -> x
988 // ldexp(x, undef) -> x
989 if (isa<UndefValue>(Op1) || match(Op1, PatternMatch::m_ZeroInt())) {
990 return IC.replaceInstUsesWith(II, Op0);
991 }
992
993 break;
994 }
995 case Intrinsic::amdgcn_fmul_legacy: {
996 Value *Op0 = II.getArgOperand(0);
997 Value *Op1 = II.getArgOperand(1);
998
999 // The legacy behaviour is that multiplying +/-0.0 by anything, even NaN or
1000 // infinity, gives +0.0.
1001 // TODO: Move to InstSimplify?
    if (match(Op0, PatternMatch::m_AnyZeroFP()) ||
        match(Op1, PatternMatch::m_AnyZeroFP()))
      return IC.replaceInstUsesWith(II, ConstantFP::getNullValue(II.getType()));
1005
1006 // If we can prove we don't have one of the special cases then we can use a
1007 // normal fmul instruction instead.
1008 if (canSimplifyLegacyMulToMul(Op0, Op1, IC)) {
1009 auto *FMul = IC.Builder.CreateFMulFMF(Op0, Op1, &II);
1010 FMul->takeName(&II);
1011 return IC.replaceInstUsesWith(II, FMul);
1012 }
1013 break;
1014 }
1015 case Intrinsic::amdgcn_fma_legacy: {
1016 Value *Op0 = II.getArgOperand(0);
1017 Value *Op1 = II.getArgOperand(1);
1018 Value *Op2 = II.getArgOperand(2);
1019
1020 // The legacy behaviour is that multiplying +/-0.0 by anything, even NaN or
1021 // infinity, gives +0.0.
1022 // TODO: Move to InstSimplify?
    if (match(Op0, PatternMatch::m_AnyZeroFP()) ||
        match(Op1, PatternMatch::m_AnyZeroFP())) {
1025 // It's tempting to just return Op2 here, but that would give the wrong
1026 // result if Op2 was -0.0.
1027 auto *Zero = ConstantFP::getNullValue(II.getType());
1028 auto *FAdd = IC.Builder.CreateFAddFMF(Zero, Op2, &II);
1029 FAdd->takeName(&II);
1030 return IC.replaceInstUsesWith(II, FAdd);
1031 }
1032
1033 // If we can prove we don't have one of the special cases then we can use a
1034 // normal fma instead.
1035 if (canSimplifyLegacyMulToMul(Op0, Op1, IC)) {
      II.setCalledOperand(Intrinsic::getDeclaration(
          II.getModule(), Intrinsic::fma, II.getType()));
1038 return &II;
1039 }
1040 break;
1041 }
1042 case Intrinsic::amdgcn_is_shared:
1043 case Intrinsic::amdgcn_is_private: {
1044 if (isa<UndefValue>(II.getArgOperand(0)))
1045 return IC.replaceInstUsesWith(II, UndefValue::get(II.getType()));
1046
    if (isa<ConstantPointerNull>(II.getArgOperand(0)))
      return IC.replaceInstUsesWith(II, ConstantInt::getFalse(II.getType()));
1049 break;
1050 }
1051 default: {
    if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
            AMDGPU::getImageDimIntrinsicInfo(II.getIntrinsicID())) {
1054 return simplifyAMDGCNImageIntrinsic(ST, ImageDimIntr, II, IC);
1055 }
1056 }
1057 }
1058 return std::nullopt;
1059}
1060
1061/// Implement SimplifyDemandedVectorElts for amdgcn buffer and image intrinsics.
1062///
1063/// Note: This only supports non-TFE/LWE image intrinsic calls; those have
1064/// struct returns.
static Value *simplifyAMDGCNMemoryIntrinsicDemanded(InstCombiner &IC,
                                                     IntrinsicInst &II,
1067 APInt DemandedElts,
1068 int DMaskIdx = -1) {
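  // Editorial example: a <4 x float> buffer load whose users only extract the
  // first two elements is shrunk below to a <2 x float> load (image loads also
  // get a trimmed dmask), and the original vector shape is then rebuilt with a
  // shufflevector.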
1069
1070 auto *IIVTy = cast<FixedVectorType>(II.getType());
1071 unsigned VWidth = IIVTy->getNumElements();
1072 if (VWidth == 1)
1073 return nullptr;
1074 Type *EltTy = IIVTy->getElementType();
1075
1077 IC.Builder.SetInsertPoint(&II);
1078
1079 // Assume the arguments are unchanged and later override them, if needed.
1080 SmallVector<Value *, 16> Args(II.args());
1081
1082 if (DMaskIdx < 0) {
1083 // Buffer case.
1084
1085 const unsigned ActiveBits = DemandedElts.getActiveBits();
1086 const unsigned UnusedComponentsAtFront = DemandedElts.countr_zero();
1087
1088 // Start assuming the prefix of elements is demanded, but possibly clear
1089 // some other bits if there are trailing zeros (unused components at front)
1090 // and update offset.
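    // Editorial example: demanding only element 2 of a <4 x i32>
    // raw.buffer.load leaves two unused leading components, so the load is
    // narrowed and 8 bytes (two 32-bit components) are added to the offset
    // operand instead.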
1091 DemandedElts = (1 << ActiveBits) - 1;
1092
1093 if (UnusedComponentsAtFront > 0) {
1094 static const unsigned InvalidOffsetIdx = 0xf;
1095
1096 unsigned OffsetIdx;
1097 switch (II.getIntrinsicID()) {
1098 case Intrinsic::amdgcn_raw_buffer_load:
1099 OffsetIdx = 1;
1100 break;
1101 case Intrinsic::amdgcn_s_buffer_load:
1102 // If resulting type is vec3, there is no point in trimming the
1103 // load with updated offset, as the vec3 would most likely be widened to
1104 // vec4 anyway during lowering.
1105 if (ActiveBits == 4 && UnusedComponentsAtFront == 1)
1106 OffsetIdx = InvalidOffsetIdx;
1107 else
1108 OffsetIdx = 1;
1109 break;
1110 case Intrinsic::amdgcn_struct_buffer_load:
1111 OffsetIdx = 2;
1112 break;
1113 default:
1114 // TODO: handle tbuffer* intrinsics.
1115 OffsetIdx = InvalidOffsetIdx;
1116 break;
1117 }
1118
1119 if (OffsetIdx != InvalidOffsetIdx) {
1120 // Clear demanded bits and update the offset.
1121 DemandedElts &= ~((1 << UnusedComponentsAtFront) - 1);
1122 auto *Offset = Args[OffsetIdx];
1123 unsigned SingleComponentSizeInBits =
1124 IC.getDataLayout().getTypeSizeInBits(EltTy);
1125 unsigned OffsetAdd =
1126 UnusedComponentsAtFront * SingleComponentSizeInBits / 8;
1127 auto *OffsetAddVal = ConstantInt::get(Offset->getType(), OffsetAdd);
1128 Args[OffsetIdx] = IC.Builder.CreateAdd(Offset, OffsetAddVal);
1129 }
1130 }
1131 } else {
1132 // Image case.
1133
1134 ConstantInt *DMask = cast<ConstantInt>(Args[DMaskIdx]);
1135 unsigned DMaskVal = DMask->getZExtValue() & 0xf;
1136
1137 // Mask off values that are undefined because the dmask doesn't cover them
1138 DemandedElts &= (1 << llvm::popcount(DMaskVal)) - 1;
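    // Editorial example: with dmask 0b1011 the load returns three components
    // (x, y and w); if only the third returned element is used, the new dmask
    // becomes 0b1000 and a single-component load is emitted.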
1139
1140 unsigned NewDMaskVal = 0;
1141 unsigned OrigLoadIdx = 0;
1142 for (unsigned SrcIdx = 0; SrcIdx < 4; ++SrcIdx) {
1143 const unsigned Bit = 1 << SrcIdx;
1144 if (!!(DMaskVal & Bit)) {
1145 if (!!DemandedElts[OrigLoadIdx])
1146 NewDMaskVal |= Bit;
1147 OrigLoadIdx++;
1148 }
1149 }
1150
1151 if (DMaskVal != NewDMaskVal)
1152 Args[DMaskIdx] = ConstantInt::get(DMask->getType(), NewDMaskVal);
1153 }
1154
1155 unsigned NewNumElts = DemandedElts.popcount();
1156 if (!NewNumElts)
1157 return UndefValue::get(IIVTy);
1158
1159 if (NewNumElts >= VWidth && DemandedElts.isMask()) {
1160 if (DMaskIdx >= 0)
1161 II.setArgOperand(DMaskIdx, Args[DMaskIdx]);
1162 return nullptr;
1163 }
1164
1165 // Validate function argument and return types, extracting overloaded types
1166 // along the way.
  SmallVector<Type *, 6> OverloadTys;
  if (!Intrinsic::getIntrinsicSignature(II.getCalledFunction(), OverloadTys))
1169 return nullptr;
1170
1171 Type *NewTy =
1172 (NewNumElts == 1) ? EltTy : FixedVectorType::get(EltTy, NewNumElts);
1173 OverloadTys[0] = NewTy;
1174
  Function *NewIntrin = Intrinsic::getDeclaration(
      II.getModule(), II.getIntrinsicID(), OverloadTys);
1177 CallInst *NewCall = IC.Builder.CreateCall(NewIntrin, Args);
1178 NewCall->takeName(&II);
1179 NewCall->copyMetadata(II);
1180
1181 if (NewNumElts == 1) {
1182 return IC.Builder.CreateInsertElement(UndefValue::get(IIVTy), NewCall,
1183 DemandedElts.countr_zero());
1184 }
1185
1186 SmallVector<int, 8> EltMask;
1187 unsigned NewLoadIdx = 0;
1188 for (unsigned OrigLoadIdx = 0; OrigLoadIdx < VWidth; ++OrigLoadIdx) {
1189 if (!!DemandedElts[OrigLoadIdx])
1190 EltMask.push_back(NewLoadIdx++);
1191 else
1192 EltMask.push_back(NewNumElts);
1193 }
1194
1195 Value *Shuffle = IC.Builder.CreateShuffleVector(NewCall, EltMask);
1196
1197 return Shuffle;
1198}
1199
std::optional<Value *> GCNTTIImpl::simplifyDemandedVectorEltsIntrinsic(
    InstCombiner &IC, IntrinsicInst &II, APInt DemandedElts, APInt &UndefElts,
1202 APInt &UndefElts2, APInt &UndefElts3,
1203 std::function<void(Instruction *, unsigned, APInt, APInt &)>
1204 SimplifyAndSetOp) const {
1205 switch (II.getIntrinsicID()) {
1206 case Intrinsic::amdgcn_buffer_load:
1207 case Intrinsic::amdgcn_buffer_load_format:
1208 case Intrinsic::amdgcn_raw_buffer_load:
1209 case Intrinsic::amdgcn_raw_buffer_load_format:
1210 case Intrinsic::amdgcn_raw_tbuffer_load:
1211 case Intrinsic::amdgcn_s_buffer_load:
1212 case Intrinsic::amdgcn_struct_buffer_load:
1213 case Intrinsic::amdgcn_struct_buffer_load_format:
1214 case Intrinsic::amdgcn_struct_tbuffer_load:
1215 case Intrinsic::amdgcn_tbuffer_load:
1216 return simplifyAMDGCNMemoryIntrinsicDemanded(IC, II, DemandedElts);
1217 default: {
1218 if (getAMDGPUImageDMaskIntrinsic(II.getIntrinsicID())) {
1219 return simplifyAMDGCNMemoryIntrinsicDemanded(IC, II, DemandedElts, 0);
1220 }
1221 break;
1222 }
1223 }
1224 return std::nullopt;
1225}