1//===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8/// \file
9/// This file implements the targeting of the MachineLegalizer class for
10/// AMDGPU.
11/// \todo This should be generated by TableGen.
12//===----------------------------------------------------------------------===//
13
14#include "AMDGPULegalizerInfo.h"
15
16#include "AMDGPU.h"
18#include "AMDGPUInstrInfo.h"
19#include "AMDGPUTargetMachine.h"
21#include "SIInstrInfo.h"
23#include "SIRegisterInfo.h"
25#include "llvm/ADT/ScopeExit.h"
34#include "llvm/IR/IntrinsicsAMDGPU.h"
35#include "llvm/IR/IntrinsicsR600.h"
36
37#define DEBUG_TYPE "amdgpu-legalinfo"
38
39using namespace llvm;
40using namespace LegalizeActions;
41using namespace LegalizeMutations;
42using namespace LegalityPredicates;
43using namespace MIPatternMatch;
44
45// Hack until load/store selection patterns support any tuple of legal types.
46static cl::opt<bool> EnableNewLegality(
47  "amdgpu-global-isel-new-legality",
48  cl::desc("Use GlobalISel desired legality, rather than try to use "
49           "rules compatible with selection patterns"),
50  cl::init(false),
51  cl::ReallyHidden);
52
53static constexpr unsigned MaxRegisterSize = 1024;
54
55// Round the number of elements to the next power of two elements
56static LLT getPow2VectorType(LLT Ty) {
57  unsigned NElts = Ty.getNumElements();
58 unsigned Pow2NElts = 1 << Log2_32_Ceil(NElts);
59 return Ty.changeElementCount(ElementCount::getFixed(Pow2NElts));
60}
61
62// Round the number of bits to the next power of two bits
63static LLT getPow2ScalarType(LLT Ty) {
64  unsigned Bits = Ty.getSizeInBits();
65 unsigned Pow2Bits = 1 << Log2_32_Ceil(Bits);
66 return LLT::scalar(Pow2Bits);
67}
68
69/// \returns true if this is an odd sized vector which should be widened by adding an
70/// additional element. This is mostly to handle <3 x s16> -> <4 x s16>. This
71/// excludes s1 vectors, which should always be scalarized.
72static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
73 return [=](const LegalityQuery &Query) {
74 const LLT Ty = Query.Types[TypeIdx];
75 if (!Ty.isVector())
76 return false;
77
78 const LLT EltTy = Ty.getElementType();
79 const unsigned EltSize = EltTy.getSizeInBits();
80 return Ty.getNumElements() % 2 != 0 &&
81 EltSize > 1 && EltSize < 32 &&
82 Ty.getSizeInBits() % 32 != 0;
83 };
84}
85
86static LegalityPredicate sizeIsMultipleOf32(unsigned TypeIdx) {
87 return [=](const LegalityQuery &Query) {
88 const LLT Ty = Query.Types[TypeIdx];
89 return Ty.getSizeInBits() % 32 == 0;
90 };
91}
92
93static LegalityPredicate isWideVec16(unsigned TypeIdx) {
94 return [=](const LegalityQuery &Query) {
95 const LLT Ty = Query.Types[TypeIdx];
96 const LLT EltTy = Ty.getScalarType();
97 return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2;
98 };
99}
100
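// Pad the vector with one extra element of the same type; e.g. <3 x s16>
// becomes <4 x s16>. Typically used together with isSmallOddVector above.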
101static LegalizeMutation oneMoreElement(unsigned TypeIdx) {
102 return [=](const LegalityQuery &Query) {
103 const LLT Ty = Query.Types[TypeIdx];
104 const LLT EltTy = Ty.getElementType();
105 return std::pair(TypeIdx,
106 LLT::fixed_vector(Ty.getNumElements() + 1, EltTy));
107 };
108}
109
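// Break a wide vector into pieces of at most 64 bits each; e.g. <4 x s32>
// becomes <2 x s32> and <8 x s16> becomes <4 x s16>.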
110static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) {
111  return [=](const LegalityQuery &Query) {
112 const LLT Ty = Query.Types[TypeIdx];
113 const LLT EltTy = Ty.getElementType();
114 unsigned Size = Ty.getSizeInBits();
115 unsigned Pieces = (Size + 63) / 64;
116 unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
117 return std::pair(TypeIdx, LLT::scalarOrVector(
118 ElementCount::getFixed(NewNumElts), EltTy));
119 };
120}
121
122// Increase the number of vector elements to reach the next multiple of 32-bit
123// type.
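// For example, <3 x s16> (48 bits) is widened to <4 x s16> (64 bits).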
124static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) {
125 return [=](const LegalityQuery &Query) {
126 const LLT Ty = Query.Types[TypeIdx];
127
128 const LLT EltTy = Ty.getElementType();
129 const int Size = Ty.getSizeInBits();
130 const int EltSize = EltTy.getSizeInBits();
131 const int NextMul32 = (Size + 31) / 32;
132
133 assert(EltSize < 32);
134
135 const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize;
136 return std::pair(TypeIdx, LLT::fixed_vector(NewNumElts, EltTy));
137 };
138}
139
140// Increase the number of vector elements to reach the next legal RegClass.
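// For example, a <13 x s32> (416 bits) would be padded to <16 x s32> (512 bits),
// assuming 512 is the next width for which an SGPR register class exists.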
141static LegalizeMutation moreElementsToNextExistingRegClass(unsigned TypeIdx) {
142  return [=](const LegalityQuery &Query) {
143 const LLT Ty = Query.Types[TypeIdx];
144 const unsigned NumElts = Ty.getNumElements();
145 const unsigned EltSize = Ty.getElementType().getSizeInBits();
146 const unsigned MaxNumElts = MaxRegisterSize / EltSize;
147
148 assert(EltSize == 32 || EltSize == 64);
150
151 unsigned NewNumElts;
152 // Find the nearest legal RegClass that is larger than the current type.
153 for (NewNumElts = NumElts; NewNumElts < MaxNumElts; ++NewNumElts) {
154 if (SIRegisterInfo::getSGPRClassForBitWidth(NewNumElts * EltSize))
155 break;
156 }
157
158 return std::pair(TypeIdx, LLT::fixed_vector(NewNumElts, EltSize));
159 };
160}
161
162static LLT getBufferRsrcScalarType(const LLT Ty) {
163  if (!Ty.isVector())
164 return LLT::scalar(128);
165 const ElementCount NumElems = Ty.getElementCount();
166 return LLT::vector(NumElems, LLT::scalar(128));
167}
168
169static LLT getBufferRsrcRegisterType(const LLT Ty) {
170  if (!Ty.isVector())
171 return LLT::fixed_vector(4, LLT::scalar(32));
172 const unsigned NumElems = Ty.getElementCount().getFixedValue();
173 return LLT::fixed_vector(NumElems * 4, LLT::scalar(32));
174}
175
176static LLT getBitcastRegisterType(const LLT Ty) {
177  const unsigned Size = Ty.getSizeInBits();
178
179 if (Size <= 32) {
180 // <2 x s8> -> s16
181 // <4 x s8> -> s32
182 return LLT::scalar(Size);
183 }
184
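  // Otherwise pack the bits into 32-bit elements; e.g. <4 x s16> -> <2 x s32>.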
185  return LLT::scalarOrVector(ElementCount::getFixed(Size / 32), 32);
186}
187
188static LegalizeMutation bitcastToRegisterType(unsigned TypeIdx) {
189 return [=](const LegalityQuery &Query) {
190 const LLT Ty = Query.Types[TypeIdx];
191 return std::pair(TypeIdx, getBitcastRegisterType(Ty));
192 };
193}
194
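// Bitcast to a vector of 32-bit elements covering the same number of bits;
// e.g. s96 -> <3 x s32> and <6 x s16> -> <3 x s32>.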
195static LegalizeMutation bitcastToVectorElement32(unsigned TypeIdx) {
196  return [=](const LegalityQuery &Query) {
197    const LLT Ty = Query.Types[TypeIdx];
198    unsigned Size = Ty.getSizeInBits();
199    assert(Size % 32 == 0);
200    return std::pair(
201        TypeIdx, LLT::scalarOrVector(ElementCount::getFixed(Size / 32), 32));
202  };
203}
204
205static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) {
206 return [=](const LegalityQuery &Query) {
207 const LLT QueryTy = Query.Types[TypeIdx];
208 return QueryTy.isVector() && QueryTy.getSizeInBits() < Size;
209 };
210}
211
212static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) {
213 return [=](const LegalityQuery &Query) {
214 const LLT QueryTy = Query.Types[TypeIdx];
215 return QueryTy.isVector() && QueryTy.getSizeInBits() > Size;
216 };
217}
218
219static LegalityPredicate numElementsNotEven(unsigned TypeIdx) {
220 return [=](const LegalityQuery &Query) {
221 const LLT QueryTy = Query.Types[TypeIdx];
222 return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0;
223 };
224}
225
226static bool isRegisterSize(unsigned Size) {
227 return Size % 32 == 0 && Size <= MaxRegisterSize;
228}
229
230static bool isRegisterVectorElementType(LLT EltTy) {
231  const int EltSize = EltTy.getSizeInBits();
232 return EltSize == 16 || EltSize % 32 == 0;
233}
234
235static bool isRegisterVectorType(LLT Ty) {
236 const int EltSize = Ty.getElementType().getSizeInBits();
237 return EltSize == 32 || EltSize == 64 ||
238 (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
239 EltSize == 128 || EltSize == 256;
240}
241
242// TODO: replace all uses of isRegisterType with isRegisterClassType
243static bool isRegisterType(LLT Ty) {
244 if (!isRegisterSize(Ty.getSizeInBits()))
245 return false;
246
247 if (Ty.isVector())
248 return isRegisterVectorType(Ty);
249
250 return true;
251}
252
253// Any combination of 32 or 64-bit elements up to the maximum register size, and
254// multiples of v2s16.
255static LegalityPredicate isRegisterType(unsigned TypeIdx) {
256 return [=](const LegalityQuery &Query) {
257 return isRegisterType(Query.Types[TypeIdx]);
258 };
259}
260
261// RegisterType that doesn't have a corresponding RegClass.
262// TODO: Once `isRegisterType` is replaced with `isRegisterClassType` this
263// should be removed.
264static LegalityPredicate isIllegalRegisterType(unsigned TypeIdx) {
265 return [=](const LegalityQuery &Query) {
266 LLT Ty = Query.Types[TypeIdx];
267 return isRegisterType(Ty) &&
268           !SIRegisterInfo::getSGPRClassForBitWidth(Ty.getSizeInBits());
269  };
270}
271
272static LegalityPredicate elementTypeIsLegal(unsigned TypeIdx) {
273 return [=](const LegalityQuery &Query) {
274 const LLT QueryTy = Query.Types[TypeIdx];
275 if (!QueryTy.isVector())
276 return false;
277 const LLT EltTy = QueryTy.getElementType();
278 return EltTy == LLT::scalar(16) || EltTy.getSizeInBits() >= 32;
279 };
280}
281
282static const LLT S1 = LLT::scalar(1);
283static const LLT S8 = LLT::scalar(8);
284static const LLT S16 = LLT::scalar(16);
285static const LLT S32 = LLT::scalar(32);
286static const LLT F32 = LLT::float32();
287static const LLT S64 = LLT::scalar(64);
288static const LLT F64 = LLT::float64();
289static const LLT S96 = LLT::scalar(96);
290static const LLT S128 = LLT::scalar(128);
291static const LLT S160 = LLT::scalar(160);
292static const LLT S224 = LLT::scalar(224);
293static const LLT S256 = LLT::scalar(256);
294static const LLT S512 = LLT::scalar(512);
296
297static const LLT V2S8 = LLT::fixed_vector(2, 8);
298static const LLT V2S16 = LLT::fixed_vector(2, 16);
299static const LLT V4S16 = LLT::fixed_vector(4, 16);
300static const LLT V6S16 = LLT::fixed_vector(6, 16);
301static const LLT V8S16 = LLT::fixed_vector(8, 16);
302static const LLT V10S16 = LLT::fixed_vector(10, 16);
303static const LLT V12S16 = LLT::fixed_vector(12, 16);
304static const LLT V16S16 = LLT::fixed_vector(16, 16);
305
306static const LLT V2F16 = LLT::fixed_vector(2, LLT::float16());
307static const LLT V2BF16 = V2F16; // FIXME
308
309static const LLT V2S32 = LLT::fixed_vector(2, 32);
310static const LLT V3S32 = LLT::fixed_vector(3, 32);
311static const LLT V4S32 = LLT::fixed_vector(4, 32);
312static const LLT V5S32 = LLT::fixed_vector(5, 32);
313static const LLT V6S32 = LLT::fixed_vector(6, 32);
314static const LLT V7S32 = LLT::fixed_vector(7, 32);
315static const LLT V8S32 = LLT::fixed_vector(8, 32);
316static const LLT V9S32 = LLT::fixed_vector(9, 32);
317static const LLT V10S32 = LLT::fixed_vector(10, 32);
318static const LLT V11S32 = LLT::fixed_vector(11, 32);
319static const LLT V12S32 = LLT::fixed_vector(12, 32);
320static const LLT V16S32 = LLT::fixed_vector(16, 32);
321static const LLT V32S32 = LLT::fixed_vector(32, 32);
322
323static const LLT V2S64 = LLT::fixed_vector(2, 64);
324static const LLT V3S64 = LLT::fixed_vector(3, 64);
325static const LLT V4S64 = LLT::fixed_vector(4, 64);
326static const LLT V5S64 = LLT::fixed_vector(5, 64);
327static const LLT V6S64 = LLT::fixed_vector(6, 64);
328static const LLT V7S64 = LLT::fixed_vector(7, 64);
329static const LLT V8S64 = LLT::fixed_vector(8, 64);
330static const LLT V16S64 = LLT::fixed_vector(16, 64);
331
332static const LLT V2S128 = LLT::fixed_vector(2, 128);
333static const LLT V4S128 = LLT::fixed_vector(4, 128);
334
335static std::initializer_list<LLT> AllScalarTypes = {S32, S64, S96, S128,
336 S160, S224, S256, S512};
337
338static std::initializer_list<LLT> AllS16Vectors{
339    V2S16, V4S16, V6S16, V8S16, V10S16, V12S16, V16S16, V2S128, V4S128};
340
341static std::initializer_list<LLT> AllS32Vectors = {
342    V2S32, V3S32,  V4S32,  V5S32,  V6S32,  V7S32,  V8S32,
343    V9S32, V10S32, V11S32, V12S32, V16S32, V32S32};
344
345static std::initializer_list<LLT> AllS64Vectors = {V2S64, V3S64, V4S64, V5S64,
346                                                   V6S64, V7S64, V8S64, V16S64};
347
348// Checks whether a type is in the list of legal register types.
349static bool isRegisterClassType(LLT Ty) {
350  if (Ty.isPointerOrPointerVector())
351    Ty = Ty.changeElementType(LLT::scalar(Ty.getScalarSizeInBits()));
352
353  return is_contained(AllS32Vectors, Ty) || is_contained(AllS64Vectors, Ty) ||
354         is_contained(AllScalarTypes, Ty) || is_contained(AllS16Vectors, Ty);
355}
356
357static LegalityPredicate isRegisterClassType(unsigned TypeIdx) {
358 return [TypeIdx](const LegalityQuery &Query) {
359 return isRegisterClassType(Query.Types[TypeIdx]);
360 };
361}
362
363// If we have a truncating store or an extending load with a data size larger
364// than 32-bits, we need to reduce to a 32-bit type.
365static LegalityPredicate isWideScalarExtLoadTruncStore(unsigned TypeIdx) {
366  return [=](const LegalityQuery &Query) {
367 const LLT Ty = Query.Types[TypeIdx];
368 return !Ty.isVector() && Ty.getSizeInBits() > 32 &&
369 Query.MMODescrs[0].MemoryTy.getSizeInBits() < Ty.getSizeInBits();
370 };
371}
372
373// TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we
374// handle some operations by just promoting the register during
375// selection. There are also d16 loads on GFX9+ which preserve the high bits.
376static unsigned maxSizeForAddrSpace(const GCNSubtarget &ST, unsigned AS,
377 bool IsLoad, bool IsAtomic) {
378 switch (AS) {
379  case AMDGPUAS::PRIVATE_ADDRESS:
380    // FIXME: Private element size.
381 return ST.enableFlatScratch() ? 128 : 32;
382  case AMDGPUAS::LOCAL_ADDRESS:
383    return ST.useDS128() ? 128 : 64;
388 // Treat constant and global as identical. SMRD loads are sometimes usable for
389 // global loads (ideally constant address space should be eliminated)
390 // depending on the context. Legality cannot be context dependent, but
391 // RegBankSelect can split the load as necessary depending on the pointer
392 // register bank/uniformity and if the memory is invariant or not written in a
393 // kernel.
394 return IsLoad ? 512 : 128;
395 default:
396 // FIXME: Flat addresses may contextually need to be split to 32-bit parts
397 // if they may alias scratch depending on the subtarget. This needs to be
398 // moved to custom handling to use addressMayBeAccessedAsPrivate
399 return ST.hasMultiDwordFlatScratchAddressing() || IsAtomic ? 128 : 32;
400 }
401}
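// For example, with these limits a 256-bit global load can stay a single access
// (512-bit load limit), while a 256-bit global store exceeds the 128-bit store
// limit and must be broken up by the rules below.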
402
403static bool isLoadStoreSizeLegal(const GCNSubtarget &ST,
404 const LegalityQuery &Query) {
405 const LLT Ty = Query.Types[0];
406
407 // Handle G_LOAD, G_ZEXTLOAD, G_SEXTLOAD
408 const bool IsLoad = Query.Opcode != AMDGPU::G_STORE;
409
410 unsigned RegSize = Ty.getSizeInBits();
411 uint64_t MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
412 uint64_t AlignBits = Query.MMODescrs[0].AlignInBits;
413 unsigned AS = Query.Types[1].getAddressSpace();
414
415 // All of these need to be custom lowered to cast the pointer operand.
417 return false;
418
419 // Do not handle extending vector loads.
420 if (Ty.isVector() && MemSize != RegSize)
421 return false;
422
423 // TODO: We should be able to widen loads if the alignment is high enough, but
424 // we also need to modify the memory access size.
425#if 0
426 // Accept widening loads based on alignment.
427 if (IsLoad && MemSize < Size)
428 MemSize = std::max(MemSize, Align);
429#endif
430
431 // Only 1-byte and 2-byte to 32-bit extloads are valid.
432 if (MemSize != RegSize && RegSize != 32)
433 return false;
434
435 if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad,
436 Query.MMODescrs[0].Ordering !=
437 AtomicOrdering::NotAtomic))
438 return false;
439
440 switch (MemSize) {
441 case 8:
442 case 16:
443 case 32:
444 case 64:
445 case 128:
446 break;
447 case 96:
448 if (!ST.hasDwordx3LoadStores())
449 return false;
450 break;
451 case 256:
452 case 512:
453 // These may contextually need to be broken down.
454 break;
455 default:
456 return false;
457 }
458
459 assert(RegSize >= MemSize);
460
461 if (AlignBits < MemSize) {
462 const SITargetLowering *TLI = ST.getTargetLowering();
463 if (!TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS,
464 Align(AlignBits / 8)))
465 return false;
466 }
467
468 return true;
469}
470
471// The newer buffer intrinsic forms take their resource arguments as
472// pointers in address space 8, aka s128 values. However, in order to not break
473// SelectionDAG, the underlying operations have to continue to take v4i32
474// arguments. Therefore, we convert resource pointers (or vectors of them)
475// to integer values here.
476static bool hasBufferRsrcWorkaround(const LLT Ty) {
477  if (Ty.isPointer() && Ty.getAddressSpace() == AMDGPUAS::BUFFER_RESOURCE)
478    return true;
479 if (Ty.isVector()) {
480 const LLT ElemTy = Ty.getElementType();
481 return hasBufferRsrcWorkaround(ElemTy);
482 }
483 return false;
484}
485
486// The current selector can't handle <6 x s16>, <8 x s16>, s96, s128, etc., so
487// work around this. Eventually it should ignore the type for loads and only care
488// about the size. Return true in cases where we will work around this for now by
489// bitcasting.
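// For example, an <8 x s16> load or store is handled by bitcasting the value to
// <4 x s32>.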
490static bool loadStoreBitcastWorkaround(const LLT Ty) {
491  if (EnableNewLegality)
492    return false;
493
494 const unsigned Size = Ty.getSizeInBits();
495 if (Size <= 64)
496 return false;
497 // Address space 8 pointers get their own workaround.
498  if (hasBufferRsrcWorkaround(Ty))
499    return false;
500 if (!Ty.isVector())
501 return true;
502
503 if (Ty.isPointerVector())
504 return true;
505
506 unsigned EltSize = Ty.getScalarSizeInBits();
507 return EltSize != 32 && EltSize != 64;
508}
509
510static bool isLoadStoreLegal(const GCNSubtarget &ST, const LegalityQuery &Query) {
511 const LLT Ty = Query.Types[0];
512 return isRegisterType(Ty) && isLoadStoreSizeLegal(ST, Query) &&
513         !hasBufferRsrcWorkaround(Ty) && !loadStoreBitcastWorkaround(Ty);
514}
515
516/// Return true if a load or store of the type should be lowered with a bitcast
517/// to a different type.
518static bool shouldBitcastLoadStoreType(const GCNSubtarget &ST, const LLT Ty,
519 const LLT MemTy) {
520 const unsigned MemSizeInBits = MemTy.getSizeInBits();
521 const unsigned Size = Ty.getSizeInBits();
522 if (Size != MemSizeInBits)
523 return Size <= 32 && Ty.isVector();
524
525  if (loadStoreBitcastWorkaround(Ty) && isRegisterType(Ty))
526    return true;
527
528 // Don't try to handle bitcasting vector ext loads for now.
529 return Ty.isVector() && (!MemTy.isVector() || MemTy == Ty) &&
530 (Size <= 32 || isRegisterSize(Size)) &&
531         !isRegisterVectorElementType(Ty.getElementType());
532}
533
534/// Return true if we should legalize a load by widening an odd sized memory
535/// access up to the alignment. Note that in this case the memory access itself
536/// changes, not the size of the result register.
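/// For example, on subtargets without 96-bit (dwordx3) memory operations, a
/// 96-bit load that is known to be 128-bit aligned can typically be widened to a
/// full 128-bit load.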
537static bool shouldWidenLoad(const GCNSubtarget &ST, LLT MemoryTy,
538 uint64_t AlignInBits, unsigned AddrSpace,
539 unsigned Opcode) {
540 unsigned SizeInBits = MemoryTy.getSizeInBits();
541 // We don't want to widen cases that are naturally legal.
542 if (isPowerOf2_32(SizeInBits))
543 return false;
544
545 // If we have 96-bit memory operations, we shouldn't touch them. Note we may
546 // end up widening these for a scalar load during RegBankSelect, if we don't
547 // have 96-bit scalar loads.
548 if (SizeInBits == 96 && ST.hasDwordx3LoadStores())
549 return false;
550
551 if (SizeInBits >= maxSizeForAddrSpace(ST, AddrSpace, Opcode, false))
552 return false;
553
554 // A load is known dereferenceable up to the alignment, so it's legal to widen
555 // to it.
556 //
557 // TODO: Could check dereferenceable for less aligned cases.
558 unsigned RoundedSize = NextPowerOf2(SizeInBits);
559 if (AlignInBits < RoundedSize)
560 return false;
561
562 // Do not widen if it would introduce a slow unaligned load.
563 const SITargetLowering *TLI = ST.getTargetLowering();
564 unsigned Fast = 0;
565  return TLI->allowsMisalignedMemoryAccessesImpl(
566             RoundedSize, AddrSpace, Align(AlignInBits / 8),
567             MachineMemOperand::MOLoad, &Fast) &&
568         Fast;
569}
570
571static bool shouldWidenLoad(const GCNSubtarget &ST, const LegalityQuery &Query,
572 unsigned Opcode) {
573 if (Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic)
574 return false;
575
576 return shouldWidenLoad(ST, Query.MMODescrs[0].MemoryTy,
577 Query.MMODescrs[0].AlignInBits,
578 Query.Types[1].getAddressSpace(), Opcode);
579}
580
581/// Mutates IR (typically a load instruction) to use a <4 x s32> as the initial
582/// type of the operand `idx` and then to transform it to a `p8` via bitcasts
583/// and inttoptr. In addition, handle vectors of p8. Returns the new type.
584static LLT castBufferRsrcFromV4I32(MachineInstr &MI, MachineIRBuilder &B,
585                                   MachineRegisterInfo &MRI, unsigned Idx) {
586 MachineOperand &MO = MI.getOperand(Idx);
587
588 const LLT PointerTy = MRI.getType(MO.getReg());
589
590 // Paranoidly prevent us from doing this multiple times.
591  if (!PointerTy.isPointer())
592    return PointerTy;
593
594 const LLT ScalarTy = getBufferRsrcScalarType(PointerTy);
595 const LLT VectorTy = getBufferRsrcRegisterType(PointerTy);
596 if (!PointerTy.isVector()) {
597 // Happy path: (4 x s32) -> (s32, s32, s32, s32) -> (p8)
598 const unsigned NumParts = PointerTy.getSizeInBits() / 32;
599 const LLT S32 = LLT::scalar(32);
600
601 Register VectorReg = MRI.createGenericVirtualRegister(VectorTy);
602 std::array<Register, 4> VectorElems;
603 B.setInsertPt(B.getMBB(), ++B.getInsertPt());
604 for (unsigned I = 0; I < NumParts; ++I)
605 VectorElems[I] =
606 B.buildExtractVectorElementConstant(S32, VectorReg, I).getReg(0);
607 B.buildMergeValues(MO, VectorElems);
608 MO.setReg(VectorReg);
609 return VectorTy;
610 }
611 Register BitcastReg = MRI.createGenericVirtualRegister(VectorTy);
612 B.setInsertPt(B.getMBB(), ++B.getInsertPt());
613 auto Scalar = B.buildBitcast(ScalarTy, BitcastReg);
614 B.buildIntToPtr(MO, Scalar);
615 MO.setReg(BitcastReg);
616
617 return VectorTy;
618}
619
620/// Cast a buffer resource (an address space 8 pointer) into a 4xi32, which is
621/// the form in which the value must be in order to be passed to the low-level
622/// representations used for MUBUF/MTBUF intrinsics. This is a hack, which is
623/// needed in order to account for the fact that we can't define a register
624/// class for s128 without breaking SelectionDAG.
625static Register castBufferRsrcToV4I32(Register Pointer, MachineIRBuilder &B) {
626  MachineRegisterInfo &MRI = *B.getMRI();
627 const LLT PointerTy = MRI.getType(Pointer);
628 const LLT ScalarTy = getBufferRsrcScalarType(PointerTy);
629 const LLT VectorTy = getBufferRsrcRegisterType(PointerTy);
630
631 if (!PointerTy.isVector()) {
632 // Special case: p8 -> (s32, s32, s32, s32) -> (4xs32)
633 SmallVector<Register, 4> PointerParts;
634 const unsigned NumParts = PointerTy.getSizeInBits() / 32;
635 auto Unmerged = B.buildUnmerge(LLT::scalar(32), Pointer);
636 for (unsigned I = 0; I < NumParts; ++I)
637 PointerParts.push_back(Unmerged.getReg(I));
638 return B.buildBuildVector(VectorTy, PointerParts).getReg(0);
639 }
640 Register Scalar = B.buildPtrToInt(ScalarTy, Pointer).getReg(0);
641 return B.buildBitcast(VectorTy, Scalar).getReg(0);
642}
643
644static void castBufferRsrcArgToV4I32(MachineInstr &MI, MachineIRBuilder &B,
645                                     unsigned Idx) {
646 MachineOperand &MO = MI.getOperand(Idx);
647
648 const LLT PointerTy = B.getMRI()->getType(MO.getReg());
649 // Paranoidly prevent us from doing this multiple times.
650  if (!PointerTy.isPointer())
651    return;
652  MO.setReg(castBufferRsrcToV4I32(MO.getReg(), B));
653}
654
655AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
656                                         const GCNTargetMachine &TM)
657 : ST(ST_) {
658 using namespace TargetOpcode;
659
660 auto GetAddrSpacePtr = [&TM](unsigned AS) {
661 return LLT::pointer(AS, TM.getPointerSizeInBits(AS));
662 };
663
664 const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS);
665 const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS);
666 const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT);
667 const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS);
668 const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS);
669 const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS);
670 const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS);
671 const LLT BufferFatPtr = GetAddrSpacePtr(AMDGPUAS::BUFFER_FAT_POINTER);
672 const LLT RsrcPtr = GetAddrSpacePtr(AMDGPUAS::BUFFER_RESOURCE);
673 const LLT BufferStridedPtr =
674 GetAddrSpacePtr(AMDGPUAS::BUFFER_STRIDED_POINTER);
675
676 const LLT CodePtr = FlatPtr;
677
678 const std::initializer_list<LLT> AddrSpaces64 = {
679 GlobalPtr, ConstantPtr, FlatPtr
680 };
681
682 const std::initializer_list<LLT> AddrSpaces32 = {
683 LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr
684 };
685
686 const std::initializer_list<LLT> AddrSpaces128 = {RsrcPtr};
687
688 const std::initializer_list<LLT> FPTypesBase = {
689 S32, S64
690 };
691
692 const std::initializer_list<LLT> FPTypes16 = {
693 S32, S64, S16
694 };
695
696 const std::initializer_list<LLT> FPTypesPK16 = {
697 S32, S64, S16, V2S16
698 };
699
700 const LLT MinScalarFPTy = ST.has16BitInsts() ? S16 : S32;
701
702 // s1 for VCC branches, s32 for SCC branches.
703  getActionDefinitionsBuilder(G_BRCOND).legalFor({S1, S32});
704
705 // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more
706 // elements for v3s16
707  getActionDefinitionsBuilder(G_PHI)
709      .legalFor(AllS32Vectors)
710      .legalFor(AllS64Vectors)
711      .legalFor(AddrSpaces64)
712 .legalFor(AddrSpaces32)
713 .legalFor(AddrSpaces128)
714 .legalIf(isPointer(0))
715 .clampScalar(0, S16, S256)
717 .clampMaxNumElements(0, S32, 16)
719 .scalarize(0);
720
721 if (ST.hasVOP3PInsts() && ST.hasAddNoCarry() && ST.hasIntClamp()) {
722 // Full set of gfx9 features.
723 if (ST.hasScalarAddSub64()) {
724 getActionDefinitionsBuilder({G_ADD, G_SUB})
725 .legalFor({S64, S32, S16, V2S16})
726 .clampMaxNumElementsStrict(0, S16, 2)
727 .scalarize(0)
728 .minScalar(0, S16)
730 .maxScalar(0, S32);
731 } else {
732 getActionDefinitionsBuilder({G_ADD, G_SUB})
733 .legalFor({S32, S16, V2S16})
734 .clampMaxNumElementsStrict(0, S16, 2)
735 .scalarize(0)
736 .minScalar(0, S16)
738 .maxScalar(0, S32);
739 }
740
741 if (ST.hasScalarSMulU64()) {
743 .legalFor({S64, S32, S16, V2S16})
744 .clampMaxNumElementsStrict(0, S16, 2)
745 .scalarize(0)
746 .minScalar(0, S16)
748 .custom();
749 } else {
751 .legalFor({S32, S16, V2S16})
752 .clampMaxNumElementsStrict(0, S16, 2)
753 .scalarize(0)
754 .minScalar(0, S16)
756 .custom();
757 }
758 assert(ST.hasMad64_32());
759
760 getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT, G_SADDSAT, G_SSUBSAT})
761 .legalFor({S32, S16, V2S16}) // Clamp modifier
762 .minScalarOrElt(0, S16)
764 .scalarize(0)
766 .lower();
767 } else if (ST.has16BitInsts()) {
768 getActionDefinitionsBuilder({G_ADD, G_SUB})
769 .legalFor({S32, S16})
770 .minScalar(0, S16)
772 .maxScalar(0, S32)
773 .scalarize(0);
774
776 .legalFor({S32, S16})
777 .scalarize(0)
778 .minScalar(0, S16)
779 .widenScalarToNextMultipleOf(0, 32)
780 .custom();
781 assert(ST.hasMad64_32());
782
783 // Technically the saturating operations require clamp bit support, but this
784 // was introduced at the same time as 16-bit operations.
785 getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
786 .legalFor({S32, S16}) // Clamp modifier
787 .minScalar(0, S16)
788 .scalarize(0)
790 .lower();
791
792 // We're just lowering this, but it helps get a better result to try to
793 // coerce to the desired type first.
794 getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
795 .minScalar(0, S16)
796 .scalarize(0)
797 .lower();
798 } else {
799 getActionDefinitionsBuilder({G_ADD, G_SUB})
800 .legalFor({S32})
801 .widenScalarToNextMultipleOf(0, 32)
802 .clampScalar(0, S32, S32)
803 .scalarize(0);
804
805 auto &Mul = getActionDefinitionsBuilder(G_MUL)
806 .legalFor({S32})
807 .scalarize(0)
808 .minScalar(0, S32)
809 .widenScalarToNextMultipleOf(0, 32);
810
811 if (ST.hasMad64_32())
812 Mul.custom();
813 else
814 Mul.maxScalar(0, S32);
815
816 if (ST.hasIntClamp()) {
817 getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
818 .legalFor({S32}) // Clamp modifier.
819 .scalarize(0)
820 .minScalarOrElt(0, S32)
821 .lower();
822 } else {
823 // Clamp bit support was added in VI, along with 16-bit operations.
824 getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
825 .minScalar(0, S32)
826 .scalarize(0)
827 .lower();
828 }
829
830 // FIXME: DAG expansion gets better results. The widening uses the smaller
831 // range values and goes for the min/max lowering directly.
832 getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
833 .minScalar(0, S32)
834 .scalarize(0)
835 .lower();
836 }
837
838  getActionDefinitionsBuilder(
839      {G_SDIV, G_UDIV, G_SREM, G_UREM, G_SDIVREM, G_UDIVREM})
840 .customFor({S32, S64})
841 .clampScalar(0, S32, S64)
843 .scalarize(0);
844
845 auto &Mulh = getActionDefinitionsBuilder({G_UMULH, G_SMULH})
846 .legalFor({S32})
847 .maxScalar(0, S32);
848
849 if (ST.hasVOP3PInsts()) {
850 Mulh
851 .clampMaxNumElements(0, S8, 2)
852 .lowerFor({V2S8});
853 }
854
855 Mulh
856 .scalarize(0)
857 .lower();
858
859 // Report legal for any types we can handle anywhere. For the cases only legal
860 // on the SALU, RegBankSelect will be able to re-legalize.
861 getActionDefinitionsBuilder({G_AND, G_OR, G_XOR})
862 .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16})
863 .clampScalar(0, S32, S64)
867 .scalarize(0);
868
869  getActionDefinitionsBuilder(
870      {G_UADDO, G_USUBO, G_UADDE, G_SADDE, G_USUBE, G_SSUBE})
871 .legalFor({{S32, S1}, {S32, S32}})
872 .clampScalar(0, S32, S32)
873 .scalarize(0);
874
875  getActionDefinitionsBuilder(G_BITCAST)
876      // Don't worry about the size constraint.
877      .legalIf(all(isRegisterClassType(0), isRegisterClassType(1)))
878      .lower();
879
880  getActionDefinitionsBuilder(G_CONSTANT)
881      .legalFor({S1, S32, S64, S16, GlobalPtr,
882 LocalPtr, ConstantPtr, PrivatePtr, FlatPtr })
883 .legalIf(isPointer(0))
884 .clampScalar(0, S32, S64)
886
887 getActionDefinitionsBuilder(G_FCONSTANT)
888 .legalFor({S32, S64, S16})
889 .clampScalar(0, S16, S64);
890
891 getActionDefinitionsBuilder({G_IMPLICIT_DEF, G_FREEZE})
892 .legalIf(isRegisterType(0))
893 // s1 and s16 are special cases because they have legal operations on
894 // them, but don't really occupy registers in the normal way.
895 .legalFor({S1, S16})
896 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
899 .clampMaxNumElements(0, S32, 16);
900
901 getActionDefinitionsBuilder(G_FRAME_INDEX).legalFor({PrivatePtr});
902
903 // If the amount is divergent, we have to do a wave reduction to get the
904 // maximum value, so this is expanded during RegBankSelect.
905 getActionDefinitionsBuilder(G_DYN_STACKALLOC)
906 .legalFor({{PrivatePtr, S32}});
907
908 getActionDefinitionsBuilder(G_STACKSAVE)
909 .customFor({PrivatePtr});
910 getActionDefinitionsBuilder(G_STACKRESTORE)
911 .legalFor({PrivatePtr});
912
913 getActionDefinitionsBuilder({G_GET_FPENV, G_SET_FPENV}).customFor({S64});
914
915 getActionDefinitionsBuilder(G_GLOBAL_VALUE)
916 .customIf(typeIsNot(0, PrivatePtr));
917
918 getActionDefinitionsBuilder(G_BLOCK_ADDR).legalFor({CodePtr});
919
920 auto &FPOpActions = getActionDefinitionsBuilder(
921 { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE,
922 G_STRICT_FADD, G_STRICT_FMUL, G_STRICT_FMA})
923 .legalFor({S32, S64});
924 auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS})
925 .customFor({S32, S64});
926 auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV)
927 .customFor({S32, S64});
928
929 if (ST.has16BitInsts()) {
930 if (ST.hasVOP3PInsts())
931 FPOpActions.legalFor({S16, V2S16});
932 else
933 FPOpActions.legalFor({S16});
934
935 TrigActions.customFor({S16});
936 FDIVActions.customFor({S16});
937 }
938
939 if (ST.hasPackedFP32Ops()) {
940 FPOpActions.legalFor({V2S32});
941 FPOpActions.clampMaxNumElementsStrict(0, S32, 2);
942 }
943
944 auto &MinNumMaxNum = getActionDefinitionsBuilder({
945 G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE});
946
947 if (ST.hasVOP3PInsts()) {
948 MinNumMaxNum.customFor(FPTypesPK16)
951 .clampScalar(0, S16, S64)
952 .scalarize(0);
953 } else if (ST.has16BitInsts()) {
954 MinNumMaxNum.customFor(FPTypes16)
955 .clampScalar(0, S16, S64)
956 .scalarize(0);
957 } else {
958 MinNumMaxNum.customFor(FPTypesBase)
959 .clampScalar(0, S32, S64)
960 .scalarize(0);
961 }
962
963 if (ST.hasVOP3PInsts())
964 FPOpActions.clampMaxNumElementsStrict(0, S16, 2);
965
966 FPOpActions
967 .scalarize(0)
968 .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
969
970 TrigActions
971 .scalarize(0)
972 .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
973
974 FDIVActions
975 .scalarize(0)
976 .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
977
978 getActionDefinitionsBuilder({G_FNEG, G_FABS})
979 .legalFor(FPTypesPK16)
981 .scalarize(0)
982 .clampScalar(0, S16, S64);
983
984 if (ST.has16BitInsts()) {
985    getActionDefinitionsBuilder(G_FSQRT)
986        .legalFor({S16})
987 .customFor({S32, S64})
988 .scalarize(0)
989 .unsupported();
990    getActionDefinitionsBuilder(G_FFLOOR)
991        .legalFor({S32, S64, S16})
992 .scalarize(0)
993 .clampScalar(0, S16, S64);
994
995 getActionDefinitionsBuilder({G_FLDEXP, G_STRICT_FLDEXP})
996 .legalFor({{S32, S32}, {S64, S32}, {S16, S16}})
997 .scalarize(0)
998 .maxScalarIf(typeIs(0, S16), 1, S16)
999 .clampScalar(1, S32, S32)
1000 .lower();
1001
1002    getActionDefinitionsBuilder(G_FFREXP)
1003        .customFor({{S32, S32}, {S64, S32}, {S16, S16}, {S16, S32}})
1004 .scalarize(0)
1005 .lower();
1006 } else {
1007    getActionDefinitionsBuilder(G_FSQRT)
1008        .customFor({S32, S64, S16})
1009 .scalarize(0)
1010 .unsupported();
1011
1012
1013 if (ST.hasFractBug()) {
1014      getActionDefinitionsBuilder(G_FFLOOR)
1015          .customFor({S64})
1016 .legalFor({S32, S64})
1017 .scalarize(0)
1018 .clampScalar(0, S32, S64);
1019 } else {
1020      getActionDefinitionsBuilder(G_FFLOOR)
1021          .legalFor({S32, S64})
1022 .scalarize(0)
1023 .clampScalar(0, S32, S64);
1024 }
1025
1026 getActionDefinitionsBuilder({G_FLDEXP, G_STRICT_FLDEXP})
1027 .legalFor({{S32, S32}, {S64, S32}})
1028 .scalarize(0)
1029 .clampScalar(0, S32, S64)
1030 .clampScalar(1, S32, S32)
1031 .lower();
1032
1033    getActionDefinitionsBuilder(G_FFREXP)
1034        .customFor({{S32, S32}, {S64, S32}})
1035 .scalarize(0)
1036 .minScalar(0, S32)
1037 .clampScalar(1, S32, S32)
1038 .lower();
1039 }
1040
1041  getActionDefinitionsBuilder(G_FPTRUNC)
1042      .legalFor({{S32, S64}, {S16, S32}})
1043 .scalarize(0)
1044 .lower();
1045
1046  getActionDefinitionsBuilder(G_FPEXT)
1047      .legalFor({{S64, S32}, {S32, S16}})
1048 .narrowScalarFor({{S64, S16}}, changeTo(0, S32))
1049 .scalarize(0);
1050
1051 auto &FSubActions = getActionDefinitionsBuilder({G_FSUB, G_STRICT_FSUB});
1052 if (ST.has16BitInsts()) {
1053 FSubActions
1054 // Use actual fsub instruction
1055 .legalFor({S32, S16})
1056 // Must use fadd + fneg
1057 .lowerFor({S64, V2S16});
1058 } else {
1059 FSubActions
1060 // Use actual fsub instruction
1061 .legalFor({S32})
1062 // Must use fadd + fneg
1063 .lowerFor({S64, S16, V2S16});
1064 }
1065
1066 FSubActions
1067 .scalarize(0)
1068 .clampScalar(0, S32, S64);
1069
1070 // Whether this is legal depends on the floating point mode for the function.
1071 auto &FMad = getActionDefinitionsBuilder(G_FMAD);
1072 if (ST.hasMadF16() && ST.hasMadMacF32Insts())
1073 FMad.customFor({S32, S16});
1074 else if (ST.hasMadMacF32Insts())
1075 FMad.customFor({S32});
1076 else if (ST.hasMadF16())
1077 FMad.customFor({S16});
1078 FMad.scalarize(0)
1079 .lower();
1080
1081 auto &FRem = getActionDefinitionsBuilder(G_FREM);
1082 if (ST.has16BitInsts()) {
1083 FRem.customFor({S16, S32, S64});
1084 } else {
1085 FRem.minScalar(0, S32)
1086 .customFor({S32, S64});
1087 }
1088 FRem.scalarize(0);
1089
1090 // TODO: Do we need to clamp maximum bitwidth?
1091  getActionDefinitionsBuilder(G_TRUNC)
1092      .legalIf(isScalar(0))
1093 .legalFor({{V2S16, V2S32}})
1094 .clampMaxNumElements(0, S16, 2)
1095 // Avoid scalarizing in cases that should be truly illegal. In unresolvable
1096 // situations (like an invalid implicit use), we don't want to infinite loop
1097 // in the legalizer.
1099 .alwaysLegal();
1100
1101 getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT})
1102 .legalFor({{S64, S32}, {S32, S16}, {S64, S16},
1103 {S32, S1}, {S64, S1}, {S16, S1}})
1104 .scalarize(0)
1105 .clampScalar(0, S32, S64)
1106 .widenScalarToNextPow2(1, 32);
1107
1108 // TODO: Split s1->s64 during regbankselect for VALU.
1109 auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
1110 .legalFor({{S32, S32}, {S64, S32}, {S16, S32}})
1111 .lowerIf(typeIs(1, S1))
1112 .customFor({{S32, S64}, {S64, S64}});
1113 if (ST.has16BitInsts())
1114 IToFP.legalFor({{S16, S16}});
1115 IToFP.clampScalar(1, S32, S64)
1116 .minScalar(0, S32)
1117 .scalarize(0)
1119
1120 auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
1121 .legalFor({{S32, S32}, {S32, S64}, {S32, S16}})
1122 .customFor({{S64, S32}, {S64, S64}})
1123 .narrowScalarFor({{S64, S16}}, changeTo(0, S32));
1124 if (ST.has16BitInsts())
1125 FPToI.legalFor({{S16, S16}});
1126 else
1127 FPToI.minScalar(1, S32);
1128
1129 FPToI.minScalar(0, S32)
1130 .widenScalarToNextPow2(0, 32)
1131 .scalarize(0)
1132 .lower();
1133
1134 getActionDefinitionsBuilder({G_LROUND, G_LLROUND})
1135 .clampScalar(0, S16, S64)
1136 .scalarize(0)
1137 .lower();
1138
1139 getActionDefinitionsBuilder(G_INTRINSIC_FPTRUNC_ROUND)
1140 .customFor({S16, S32})
1141 .scalarize(0)
1142 .lower();
1143
1144 // Lower G_FNEARBYINT and G_FRINT into G_INTRINSIC_ROUNDEVEN
1145 getActionDefinitionsBuilder({G_INTRINSIC_ROUND, G_FRINT, G_FNEARBYINT})
1146 .scalarize(0)
1147 .lower();
1148
1149 getActionDefinitionsBuilder({G_INTRINSIC_LRINT, G_INTRINSIC_LLRINT})
1150 .clampScalar(0, S16, S64)
1151 .scalarize(0)
1152 .lower();
1153
1154 if (ST.has16BitInsts()) {
1155 getActionDefinitionsBuilder(
1156 {G_INTRINSIC_TRUNC, G_FCEIL, G_INTRINSIC_ROUNDEVEN})
1157 .legalFor({S16, S32, S64})
1158 .clampScalar(0, S16, S64)
1159 .scalarize(0);
1160 } else if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
1161 getActionDefinitionsBuilder(
1162 {G_INTRINSIC_TRUNC, G_FCEIL, G_INTRINSIC_ROUNDEVEN})
1163 .legalFor({S32, S64})
1164 .clampScalar(0, S32, S64)
1165 .scalarize(0);
1166 } else {
1167 getActionDefinitionsBuilder(
1168 {G_INTRINSIC_TRUNC, G_FCEIL, G_INTRINSIC_ROUNDEVEN})
1169 .legalFor({S32})
1170 .customFor({S64})
1171 .clampScalar(0, S32, S64)
1172 .scalarize(0);
1173 }
1174
1175 getActionDefinitionsBuilder(G_PTR_ADD)
1176 .unsupportedFor({BufferFatPtr, BufferStridedPtr, RsrcPtr})
1177 .legalIf(all(isPointer(0), sameSize(0, 1)))
1178 .scalarize(0)
1179 .scalarSameSizeAs(1, 0);
1180
1181 getActionDefinitionsBuilder(G_PTRMASK)
1182 .legalIf(all(sameSize(0, 1), typeInSet(1, {S64, S32})))
1183 .scalarSameSizeAs(1, 0)
1184 .scalarize(0);
1185
1186 auto &CmpBuilder =
1187 getActionDefinitionsBuilder(G_ICMP)
1188 // The compare output type differs based on the register bank of the output,
1189 // so make both s1 and s32 legal.
1190 //
1191 // Scalar compares producing output in scc will be promoted to s32, as that
1192 // is the allocatable register type that will be needed for the copy from
1193 // scc. This will be promoted during RegBankSelect, and we assume something
1194 // before that won't try to use s32 result types.
1195 //
1196 // Vector compares producing an output in vcc/SGPR will use s1 in VCC reg
1197 // bank.
1198 .legalForCartesianProduct(
1199 {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
1200 .legalForCartesianProduct(
1201 {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr});
1202 if (ST.has16BitInsts()) {
1203 CmpBuilder.legalFor({{S1, S16}});
1204 }
1205
1206 CmpBuilder
1207 .widenScalarToNextPow2(1)
1208 .clampScalar(1, S32, S64)
1209 .scalarize(0)
1210 .legalIf(all(typeInSet(0, {S1, S32}), isPointer(1)));
1211
1212 auto &FCmpBuilder =
1213 getActionDefinitionsBuilder(G_FCMP).legalForCartesianProduct(
1214 {S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase);
1215
1216 if (ST.hasSALUFloatInsts())
1217 FCmpBuilder.legalForCartesianProduct({S32}, {S16, S32});
1218
1219 FCmpBuilder
1220 .widenScalarToNextPow2(1)
1221 .clampScalar(1, S32, S64)
1222 .scalarize(0);
1223
1224 // FIXME: fpow has a selection pattern that should move to custom lowering.
1225 auto &ExpOps = getActionDefinitionsBuilder(G_FPOW);
1226 if (ST.has16BitInsts())
1227 ExpOps.customFor({{S32}, {S16}});
1228 else
1229 ExpOps.customFor({S32});
1230 ExpOps.clampScalar(0, MinScalarFPTy, S32)
1231 .scalarize(0);
1232
1233 getActionDefinitionsBuilder(G_FPOWI)
1234 .clampScalar(0, MinScalarFPTy, S32)
1235 .lower();
1236
1237 auto &Log2Ops = getActionDefinitionsBuilder({G_FLOG2, G_FEXP2});
1238 Log2Ops.customFor({S32});
1239 if (ST.has16BitInsts())
1240 Log2Ops.legalFor({S16});
1241 else
1242 Log2Ops.customFor({S16});
1243 Log2Ops.scalarize(0)
1244 .lower();
1245
1246 auto &LogOps =
1247 getActionDefinitionsBuilder({G_FLOG, G_FLOG10, G_FEXP, G_FEXP10});
1248 LogOps.customFor({S32, S16});
1249 LogOps.clampScalar(0, MinScalarFPTy, S32)
1250 .scalarize(0);
1251
1252 // The 64-bit versions produce 32-bit results, but only on the SALU.
1253 getActionDefinitionsBuilder(G_CTPOP)
1254 .legalFor({{S32, S32}, {S32, S64}})
1255 .clampScalar(0, S32, S32)
1256 .widenScalarToNextPow2(1, 32)
1257 .clampScalar(1, S32, S64)
1258 .scalarize(0)
1259 .widenScalarToNextPow2(0, 32);
1260
1261 // If no 16 bit instr is available, lower into different instructions.
1262 if (ST.has16BitInsts())
1263 getActionDefinitionsBuilder(G_IS_FPCLASS)
1264 .legalForCartesianProduct({S1}, FPTypes16)
1265 .widenScalarToNextPow2(1)
1266 .scalarize(0)
1267 .lower();
1268 else
1269 getActionDefinitionsBuilder(G_IS_FPCLASS)
1270 .legalForCartesianProduct({S1}, FPTypesBase)
1271 .lowerFor({S1, S16})
1272 .widenScalarToNextPow2(1)
1273 .scalarize(0)
1274 .lower();
1275
1276 // The hardware instructions return a different result on 0 than the generic
1277 // instructions expect. The hardware produces -1, but these produce the
1278 // bitwidth.
1279 getActionDefinitionsBuilder({G_CTLZ, G_CTTZ})
1280 .scalarize(0)
1281 .clampScalar(0, S32, S32)
1282 .clampScalar(1, S32, S64)
1283 .widenScalarToNextPow2(0, 32)
1284 .widenScalarToNextPow2(1, 32)
1285 .custom();
1286
1287 // The 64-bit versions produce 32-bit results, but only on the SALU.
1288 getActionDefinitionsBuilder(G_CTLZ_ZERO_UNDEF)
1289 .legalFor({{S32, S32}, {S32, S64}})
1290 .customIf(scalarNarrowerThan(1, 32))
1291 .clampScalar(0, S32, S32)
1292 .clampScalar(1, S32, S64)
1293 .scalarize(0)
1294 .widenScalarToNextPow2(0, 32)
1295 .widenScalarToNextPow2(1, 32);
1296
1297 getActionDefinitionsBuilder(G_CTTZ_ZERO_UNDEF)
1298 .legalFor({{S32, S32}, {S32, S64}})
1299 .clampScalar(0, S32, S32)
1300 .clampScalar(1, S32, S64)
1301 .scalarize(0)
1302 .widenScalarToNextPow2(0, 32)
1303 .widenScalarToNextPow2(1, 32);
1304
1305 // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
1306 // RegBankSelect.
1307 getActionDefinitionsBuilder(G_BITREVERSE)
1308 .legalFor({S32, S64})
1309 .clampScalar(0, S32, S64)
1310 .scalarize(0)
1311 .widenScalarToNextPow2(0);
1312
1313 if (ST.has16BitInsts()) {
1314 getActionDefinitionsBuilder(G_BSWAP)
1315 .legalFor({S16, S32, V2S16})
1316 .clampMaxNumElementsStrict(0, S16, 2)
1317 // FIXME: Fixing non-power-of-2 before clamp is workaround for
1318 // narrowScalar limitation.
1319 .widenScalarToNextPow2(0)
1320 .clampScalar(0, S16, S32)
1321 .scalarize(0);
1322
1323 if (ST.hasVOP3PInsts()) {
1324 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
1325 .legalFor({S32, S16, V2S16})
1326 .clampMaxNumElements(0, S16, 2)
1327 .minScalar(0, S16)
1328 .widenScalarToNextPow2(0)
1329 .scalarize(0)
1330 .lower();
1331 } else {
1332 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
1333 .legalFor({S32, S16})
1334 .widenScalarToNextPow2(0)
1335 .minScalar(0, S16)
1336 .scalarize(0)
1337 .lower();
1338 }
1339 } else {
1340 // TODO: Should have same legality without v_perm_b32
1341 getActionDefinitionsBuilder(G_BSWAP)
1342 .legalFor({S32})
1343 .lowerIf(scalarNarrowerThan(0, 32))
1344 // FIXME: Fixing non-power-of-2 before clamp is workaround for
1345 // narrowScalar limitation.
1346 .widenScalarToNextPow2(0)
1347 .maxScalar(0, S32)
1348 .scalarize(0)
1349 .lower();
1350
1351 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
1352 .legalFor({S32})
1353 .minScalar(0, S32)
1354 .widenScalarToNextPow2(0)
1355 .scalarize(0)
1356 .lower();
1357 }
1358
1359 getActionDefinitionsBuilder(G_INTTOPTR)
1360 // List the common cases
1361 .legalForCartesianProduct(AddrSpaces64, {S64})
1362 .legalForCartesianProduct(AddrSpaces32, {S32})
1363 .scalarize(0)
1364 // Accept any address space as long as the size matches
1365 .legalIf(sameSize(0, 1))
1366 .widenScalarIf(smallerThan(1, 0),
1367 [](const LegalityQuery &Query) {
1368 return std::pair(
1369 1, LLT::scalar(Query.Types[0].getSizeInBits()));
1370 })
1371 .narrowScalarIf(largerThan(1, 0), [](const LegalityQuery &Query) {
1372 return std::pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
1373 });
1374
1375 getActionDefinitionsBuilder(G_PTRTOINT)
1376 // List the common cases
1377 .legalForCartesianProduct(AddrSpaces64, {S64})
1378 .legalForCartesianProduct(AddrSpaces32, {S32})
1379 .scalarize(0)
1380 // Accept any address space as long as the size matches
1381 .legalIf(sameSize(0, 1))
1382 .widenScalarIf(smallerThan(0, 1),
1383 [](const LegalityQuery &Query) {
1384 return std::pair(
1385 0, LLT::scalar(Query.Types[1].getSizeInBits()));
1386 })
1387 .narrowScalarIf(largerThan(0, 1), [](const LegalityQuery &Query) {
1388 return std::pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
1389 });
1390
1391 getActionDefinitionsBuilder(G_ADDRSPACE_CAST)
1392 .scalarize(0)
1393 .custom();
1394
1395 const auto needToSplitMemOp = [=](const LegalityQuery &Query,
1396 bool IsLoad) -> bool {
1397 const LLT DstTy = Query.Types[0];
1398
1399 // Split vector extloads.
1400 unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
1401
1402 if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize)
1403 return true;
1404
1405 const LLT PtrTy = Query.Types[1];
1406 unsigned AS = PtrTy.getAddressSpace();
1407 if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad,
1408 Query.MMODescrs[0].Ordering !=
1409                                          AtomicOrdering::NotAtomic))
1410      return true;
1411
1412 // Catch weird sized loads that don't evenly divide into the access sizes
1413 // TODO: May be able to widen depending on alignment etc.
1414 unsigned NumRegs = (MemSize + 31) / 32;
1415 if (NumRegs == 3) {
1416 if (!ST.hasDwordx3LoadStores())
1417 return true;
1418 } else {
1419 // If the alignment allows, these should have been widened.
1420 if (!isPowerOf2_32(NumRegs))
1421 return true;
1422 }
1423
1424 return false;
1425 };
1426
1427 unsigned GlobalAlign32 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 32;
1428 unsigned GlobalAlign16 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 16;
1429 unsigned GlobalAlign8 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 8;
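  // With unaligned buffer access enabled these are 0, so the corresponding rules
  // below are intended to match regardless of the actual access alignment.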
1430
1431 // TODO: Refine based on subtargets which support unaligned access or 128-bit
1432 // LDS
1433 // TODO: Unsupported flat for SI.
1434
1435 for (unsigned Op : {G_LOAD, G_STORE}) {
1436 const bool IsStore = Op == G_STORE;
1437
1438 auto &Actions = getActionDefinitionsBuilder(Op);
1439 // Explicitly list some common cases.
1440 // TODO: Does this help compile time at all?
1441 Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, S32, GlobalAlign32},
1442 {V2S32, GlobalPtr, V2S32, GlobalAlign32},
1443 {V4S32, GlobalPtr, V4S32, GlobalAlign32},
1444 {S64, GlobalPtr, S64, GlobalAlign32},
1445 {V2S64, GlobalPtr, V2S64, GlobalAlign32},
1446 {V2S16, GlobalPtr, V2S16, GlobalAlign32},
1447 {S32, GlobalPtr, S8, GlobalAlign8},
1448 {S32, GlobalPtr, S16, GlobalAlign16},
1449
1450 {S32, LocalPtr, S32, 32},
1451 {S64, LocalPtr, S64, 32},
1452 {V2S32, LocalPtr, V2S32, 32},
1453 {S32, LocalPtr, S8, 8},
1454 {S32, LocalPtr, S16, 16},
1455 {V2S16, LocalPtr, S32, 32},
1456
1457 {S32, PrivatePtr, S32, 32},
1458 {S32, PrivatePtr, S8, 8},
1459 {S32, PrivatePtr, S16, 16},
1460 {V2S16, PrivatePtr, S32, 32},
1461
1462 {S32, ConstantPtr, S32, GlobalAlign32},
1463 {V2S32, ConstantPtr, V2S32, GlobalAlign32},
1464 {V4S32, ConstantPtr, V4S32, GlobalAlign32},
1465 {S64, ConstantPtr, S64, GlobalAlign32},
1466 {V2S32, ConstantPtr, V2S32, GlobalAlign32}});
1467 Actions.legalIf(
1468 [=](const LegalityQuery &Query) -> bool {
1469 return isLoadStoreLegal(ST, Query);
1470 });
1471
1472 // The custom pointers (fat pointers, buffer resources) don't work with load
1473 // and store at this level. Fat pointers should have been lowered to
1474 // intrinsics before the translation to MIR.
1475 Actions.unsupportedIf(
1476 typeInSet(1, {BufferFatPtr, BufferStridedPtr, RsrcPtr}));
1477
1478 // Address space 8 pointers are handled by a 4xs32 load, bitcast, and
1479 // ptrtoint. This is needed to account for the fact that we can't have i128
1480 // as a register class for SelectionDAG reasons.
1481 Actions.customIf([=](const LegalityQuery &Query) -> bool {
1482 return hasBufferRsrcWorkaround(Query.Types[0]);
1483 });
1484
1485 // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to
1486 // 64-bits.
1487 //
1488 // TODO: Should generalize bitcast action into coerce, which will also cover
1489 // inserting addrspacecasts.
1490 Actions.customIf(typeIs(1, Constant32Ptr));
1491
1492 // Turn any illegal element vectors into something easier to deal
1493 // with. These will ultimately produce 32-bit scalar shifts to extract the
1494 // parts anyway.
1495 //
1496 // For odd 16-bit element vectors, prefer to split those into pieces with
1497 // 16-bit vector parts.
1498 Actions.bitcastIf(
1499 [=](const LegalityQuery &Query) -> bool {
1500 return shouldBitcastLoadStoreType(ST, Query.Types[0],
1501 Query.MMODescrs[0].MemoryTy);
1502 }, bitcastToRegisterType(0));
1503
1504 if (!IsStore) {
1505 // Widen suitably aligned loads by loading extra bytes. The standard
1506 // legalization actions can't properly express widening memory operands.
1507 Actions.customIf([=](const LegalityQuery &Query) -> bool {
1508 return shouldWidenLoad(ST, Query, G_LOAD);
1509 });
1510 }
1511
1512 // FIXME: load/store narrowing should be moved to lower action
1513 Actions
1514 .narrowScalarIf(
1515 [=](const LegalityQuery &Query) -> bool {
1516 return !Query.Types[0].isVector() &&
1517 needToSplitMemOp(Query, Op == G_LOAD);
1518 },
1519 [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
1520 const LLT DstTy = Query.Types[0];
1521 const LLT PtrTy = Query.Types[1];
1522
1523 const unsigned DstSize = DstTy.getSizeInBits();
1524 unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
1525
1526 // Split extloads.
1527 if (DstSize > MemSize)
1528 return std::pair(0, LLT::scalar(MemSize));
1529
1530 unsigned MaxSize = maxSizeForAddrSpace(
1531 ST, PtrTy.getAddressSpace(), Op == G_LOAD,
1532 Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic);
1533 if (MemSize > MaxSize)
1534 return std::pair(0, LLT::scalar(MaxSize));
1535
1536 uint64_t Align = Query.MMODescrs[0].AlignInBits;
1537 return std::pair(0, LLT::scalar(Align));
1538 })
1539 .fewerElementsIf(
1540 [=](const LegalityQuery &Query) -> bool {
1541 return Query.Types[0].isVector() &&
1542 needToSplitMemOp(Query, Op == G_LOAD);
1543 },
1544 [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
1545 const LLT DstTy = Query.Types[0];
1546 const LLT PtrTy = Query.Types[1];
1547
1548 LLT EltTy = DstTy.getElementType();
1549 unsigned MaxSize = maxSizeForAddrSpace(
1550 ST, PtrTy.getAddressSpace(), Op == G_LOAD,
1551 Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic);
1552
1553 // FIXME: Handle widened to power of 2 results better. This ends
1554 // up scalarizing.
1555 // FIXME: 3 element stores scalarized on SI
1556
1557 // Split if it's too large for the address space.
1558 unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
1559 if (MemSize > MaxSize) {
1560 unsigned NumElts = DstTy.getNumElements();
1561 unsigned EltSize = EltTy.getSizeInBits();
1562
1563 if (MaxSize % EltSize == 0) {
1564 return std::pair(
1565                  0, LLT::scalarOrVector(
1566                         ElementCount::getFixed(MaxSize / EltSize), EltTy));
1567 }
1568
1569 unsigned NumPieces = MemSize / MaxSize;
1570
1571 // FIXME: Refine when odd breakdowns handled
1572 // The scalars will need to be re-legalized.
1573 if (NumPieces == 1 || NumPieces >= NumElts ||
1574 NumElts % NumPieces != 0)
1575 return std::pair(0, EltTy);
1576
1577 return std::pair(0,
1578 LLT::fixed_vector(NumElts / NumPieces, EltTy));
1579 }
1580
1581 // FIXME: We could probably handle weird extending loads better.
1582 if (DstTy.getSizeInBits() > MemSize)
1583 return std::pair(0, EltTy);
1584
1585 unsigned EltSize = EltTy.getSizeInBits();
1586 unsigned DstSize = DstTy.getSizeInBits();
1587 if (!isPowerOf2_32(DstSize)) {
1588 // We're probably decomposing an odd sized store. Try to split
1589 // to the widest type. TODO: Account for alignment. As-is it
1590 // should be OK, since the new parts will be further legalized.
1591 unsigned FloorSize = llvm::bit_floor(DstSize);
1592 return std::pair(
1593                  0, LLT::scalarOrVector(
1594                         ElementCount::getFixed(FloorSize / EltSize), EltTy));
1595 }
1596
1597 // May need relegalization for the scalars.
1598 return std::pair(0, EltTy);
1599 })
1600 .minScalar(0, S32)
1601 .narrowScalarIf(isWideScalarExtLoadTruncStore(0), changeTo(0, S32))
1602 .widenScalarToNextPow2(0)
1603 .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0))
1604 .lower();
1605 }
1606
1607 // FIXME: Unaligned accesses not lowered.
1608 auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD})
1609 .legalForTypesWithMemDesc({{S32, GlobalPtr, S8, 8},
1610 {S32, GlobalPtr, S16, 2 * 8},
1611 {S32, LocalPtr, S8, 8},
1612 {S32, LocalPtr, S16, 16},
1613 {S32, PrivatePtr, S8, 8},
1614 {S32, PrivatePtr, S16, 16},
1615 {S32, ConstantPtr, S8, 8},
1616 {S32, ConstantPtr, S16, 2 * 8}})
1617 .legalIf(
1618 [=](const LegalityQuery &Query) -> bool {
1619 return isLoadStoreLegal(ST, Query);
1620 });
1621
1622 if (ST.hasFlatAddressSpace()) {
1623 ExtLoads.legalForTypesWithMemDesc(
1624 {{S32, FlatPtr, S8, 8}, {S32, FlatPtr, S16, 16}});
1625 }
1626
1627 // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to
1628 // 64-bits.
1629 //
1630 // TODO: Should generalize bitcast action into coerce, which will also cover
1631 // inserting addrspacecasts.
1632 ExtLoads.customIf(typeIs(1, Constant32Ptr));
1633
1634 ExtLoads.clampScalar(0, S32, S32)
1635 .widenScalarToNextPow2(0)
1636 .lower();
1637
1638 auto &Atomics = getActionDefinitionsBuilder(
1639 {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB,
1640 G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
1641 G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
1642 G_ATOMICRMW_UMIN, G_ATOMICRMW_UINC_WRAP, G_ATOMICRMW_UDEC_WRAP})
1643 .legalFor({{S32, GlobalPtr}, {S32, LocalPtr},
1644 {S64, GlobalPtr}, {S64, LocalPtr},
1645 {S32, RegionPtr}, {S64, RegionPtr}});
1646 if (ST.hasFlatAddressSpace()) {
1647 Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}});
1648 }
1649
1650 // TODO: v2bf16 operations, and fat buffer pointer support.
1651 auto &Atomic = getActionDefinitionsBuilder(G_ATOMICRMW_FADD);
1652 if (ST.hasLDSFPAtomicAddF32()) {
1653 Atomic.legalFor({{S32, LocalPtr}, {S32, RegionPtr}});
1654 if (ST.hasLdsAtomicAddF64())
1655 Atomic.legalFor({{S64, LocalPtr}});
1656 if (ST.hasAtomicDsPkAdd16Insts())
1657 Atomic.legalFor({{V2F16, LocalPtr}, {V2BF16, LocalPtr}});
1658 }
1659 if (ST.hasAtomicFaddInsts())
1660 Atomic.legalFor({{S32, GlobalPtr}});
1661 if (ST.hasFlatAtomicFaddF32Inst())
1662 Atomic.legalFor({{S32, FlatPtr}});
1663
1664 if (ST.hasGFX90AInsts()) {
1665 // These are legal with some caveats, and should have undergone expansion in
1666 // the IR in most situations
1667 // TODO: Move atomic expansion into legalizer
1668 Atomic.legalFor({
1669 {S32, GlobalPtr},
1670 {S64, GlobalPtr},
1671 {S64, FlatPtr}
1672 });
1673 }
1674
1675 if (ST.hasAtomicBufferGlobalPkAddF16NoRtnInsts() ||
1676 ST.hasAtomicBufferGlobalPkAddF16Insts())
1677 Atomic.legalFor({{V2F16, GlobalPtr}, {V2F16, BufferFatPtr}});
1678 if (ST.hasAtomicGlobalPkAddBF16Inst())
1679 Atomic.legalFor({{V2BF16, GlobalPtr}});
1680 if (ST.hasAtomicFlatPkAdd16Insts())
1681 Atomic.legalFor({{V2F16, FlatPtr}, {V2BF16, FlatPtr}});
1682
1683
1684 // Most of the legalization work here is done by AtomicExpand. We could
1685 // probably use a simpler legality rule that just assumes anything is OK.
1686 auto &AtomicFMinFMax =
1687 getActionDefinitionsBuilder({G_ATOMICRMW_FMIN, G_ATOMICRMW_FMAX})
1688 .legalFor({{F32, LocalPtr}, {F64, LocalPtr}});
1689
1690 if (ST.hasAtomicFMinFMaxF32GlobalInsts())
1691 AtomicFMinFMax.legalFor({{F32, GlobalPtr},{F32, BufferFatPtr}});
1692 if (ST.hasAtomicFMinFMaxF64GlobalInsts())
1693 AtomicFMinFMax.legalFor({{F64, GlobalPtr}, {F64, BufferFatPtr}});
1694 if (ST.hasAtomicFMinFMaxF32FlatInsts())
1695 AtomicFMinFMax.legalFor({F32, FlatPtr});
1696 if (ST.hasAtomicFMinFMaxF64FlatInsts())
1697 AtomicFMinFMax.legalFor({F64, FlatPtr});
1698
1699 // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling, and output
1700 // demarshalling
1701 getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG)
1702 .customFor({{S32, GlobalPtr}, {S64, GlobalPtr},
1703 {S32, FlatPtr}, {S64, FlatPtr}})
1704 .legalFor({{S32, LocalPtr}, {S64, LocalPtr},
1705 {S32, RegionPtr}, {S64, RegionPtr}});
1706 // TODO: Pointer types, any 32-bit or 64-bit vector
1707
1708 // Condition should be s32 for scalar, s1 for vector.
1709 getActionDefinitionsBuilder(G_SELECT)
1710 .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16, GlobalPtr,
1711 LocalPtr, FlatPtr, PrivatePtr,
1712 LLT::fixed_vector(2, LocalPtr),
1713 LLT::fixed_vector(2, PrivatePtr)},
1714 {S1, S32})
1715 .clampScalar(0, S16, S64)
1716 .scalarize(1)
1717 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
1718 .fewerElementsIf(numElementsNotEven(0), scalarize(0))
1719 .clampMaxNumElements(0, S32, 2)
1720 .clampMaxNumElements(0, LocalPtr, 2)
1721 .clampMaxNumElements(0, PrivatePtr, 2)
1722 .scalarize(0)
1723 .widenScalarToNextPow2(0)
1724 .legalIf(all(isPointer(0), typeInSet(1, {S1, S32})));
1725
1726 // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can
1727 // be more flexible with the shift amount type.
1728 auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR})
1729 .legalFor({{S32, S32}, {S64, S32}});
1730 if (ST.has16BitInsts()) {
1731 if (ST.hasVOP3PInsts()) {
1732 Shifts.legalFor({{S16, S16}, {V2S16, V2S16}})
1733 .clampMaxNumElements(0, S16, 2);
1734 } else
1735 Shifts.legalFor({{S16, S16}});
1736
1737 // TODO: Support 16-bit shift amounts for all types
1738 Shifts.widenScalarIf(
1739 [=](const LegalityQuery &Query) {
1740 // Use 16-bit shift amounts for any 16-bit shift. Otherwise we want a
1741 // 32-bit amount.
1742 const LLT ValTy = Query.Types[0];
1743 const LLT AmountTy = Query.Types[1];
1744 return ValTy.getSizeInBits() <= 16 &&
1745 AmountTy.getSizeInBits() < 16;
1746 }, changeTo(1, S16));
1747 Shifts.maxScalarIf(typeIs(0, S16), 1, S16);
1748 Shifts.clampScalar(1, S32, S32);
1749 Shifts.widenScalarToNextPow2(0, 16);
1750 Shifts.clampScalar(0, S16, S64);
1751
1752 getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
1753 .minScalar(0, S16)
1754 .scalarize(0)
1755 .lower();
1756 } else {
1757 // Make sure we legalize the shift amount type first, as the general
1758 // expansion for the shifted type will produce much worse code if it hasn't
1759 // been truncated already.
1760 Shifts.clampScalar(1, S32, S32);
1761 Shifts.widenScalarToNextPow2(0, 32);
1762 Shifts.clampScalar(0, S32, S64);
1763
1764 getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
1765 .minScalar(0, S32)
1766 .scalarize(0)
1767 .lower();
1768 }
1769 Shifts.scalarize(0);
1770
1771 for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) {
1772 unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0;
1773 unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1;
1774 unsigned IdxTypeIdx = 2;
1775
1776 getActionDefinitionsBuilder(Op)
1777 .customIf([=](const LegalityQuery &Query) {
1778 const LLT EltTy = Query.Types[EltTypeIdx];
1779 const LLT VecTy = Query.Types[VecTypeIdx];
1780 const LLT IdxTy = Query.Types[IdxTypeIdx];
1781 const unsigned EltSize = EltTy.getSizeInBits();
 1782 const bool isLegalVecType =
 1783 !!SIRegisterInfo::getSGPRClassForBitWidth(VecTy.getSizeInBits());
 1784 // Address space 8 pointers are 128-bit wide values, but the logic
 1785 // below will try to bitcast them to 2N x s64, which will fail.
 1786 // Therefore, as an intermediate step, ptrtoint the vector and scalar
 1787 // arguments (or inttoptr the extraction result) and wrap the extract or
 1788 // insert in that conversion so it becomes a vector operation that can
 1789 // be handled by the logic below.
1790 if (EltTy.isPointer() && EltSize > 64)
1791 return true;
1792 return (EltSize == 32 || EltSize == 64) &&
1793 VecTy.getSizeInBits() % 32 == 0 &&
1794 VecTy.getSizeInBits() <= MaxRegisterSize &&
1795 IdxTy.getSizeInBits() == 32 &&
1796 isLegalVecType;
1797 })
1798 .bitcastIf(all(sizeIsMultipleOf32(VecTypeIdx), scalarOrEltNarrowerThan(VecTypeIdx, 32)),
1799 bitcastToVectorElement32(VecTypeIdx))
1800 //.bitcastIf(vectorSmallerThan(1, 32), bitcastToScalar(1))
1801 .bitcastIf(
1802 all(sizeIsMultipleOf32(VecTypeIdx), scalarOrEltWiderThan(VecTypeIdx, 64)),
1803 [=](const LegalityQuery &Query) {
1804 // For > 64-bit element types, try to turn this into a 64-bit
1805 // element vector since we may be able to do better indexing
1806 // if this is scalar. If not, fall back to 32.
1807 const LLT EltTy = Query.Types[EltTypeIdx];
1808 const LLT VecTy = Query.Types[VecTypeIdx];
1809 const unsigned DstEltSize = EltTy.getSizeInBits();
1810 const unsigned VecSize = VecTy.getSizeInBits();
1811
1812 const unsigned TargetEltSize = DstEltSize % 64 == 0 ? 64 : 32;
1813 return std::pair(
1814 VecTypeIdx,
1815 LLT::fixed_vector(VecSize / TargetEltSize, TargetEltSize));
1816 })
1817 .clampScalar(EltTypeIdx, S32, S64)
1818 .clampScalar(VecTypeIdx, S32, S64)
1819 .clampScalar(IdxTypeIdx, S32, S32)
1820 .clampMaxNumElements(VecTypeIdx, S32, 32)
1821 // TODO: Clamp elements for 64-bit vectors?
 1822 .moreElementsIf(
 1823 isIllegalRegisterType(VecTypeIdx),
 1824 moreElementsToNextExistingRegClass(VecTypeIdx))
 1825 // It should only be necessary with variable indexes.
1826 // As a last resort, lower to the stack
1827 .lower();
1828 }
1829
1830 getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
1831 .unsupportedIf([=](const LegalityQuery &Query) {
1832 const LLT &EltTy = Query.Types[1].getElementType();
1833 return Query.Types[0] != EltTy;
1834 });
1835
1836 for (unsigned Op : {G_EXTRACT, G_INSERT}) {
1837 unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0;
1838 unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1;
1839
1840 // FIXME: Doesn't handle extract of illegal sizes.
1841 getActionDefinitionsBuilder(Op)
1842 .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32)))
1843 .lowerIf([=](const LegalityQuery &Query) {
 1844 // Sub-vector (or single element) insert and extract.
1845 // TODO: verify immediate offset here since lower only works with
1846 // whole elements.
1847 const LLT BigTy = Query.Types[BigTyIdx];
1848 return BigTy.isVector();
1849 })
1850 // FIXME: Multiples of 16 should not be legal.
1851 .legalIf([=](const LegalityQuery &Query) {
1852 const LLT BigTy = Query.Types[BigTyIdx];
1853 const LLT LitTy = Query.Types[LitTyIdx];
1854 return (BigTy.getSizeInBits() % 32 == 0) &&
1855 (LitTy.getSizeInBits() % 16 == 0);
1856 })
1857 .widenScalarIf(
1858 [=](const LegalityQuery &Query) {
1859 const LLT BigTy = Query.Types[BigTyIdx];
1860 return (BigTy.getScalarSizeInBits() < 16);
 1861 },
 1862 widenScalarOrEltToNextPow2(BigTyIdx, 16))
 1863 .widenScalarIf(
1864 [=](const LegalityQuery &Query) {
1865 const LLT LitTy = Query.Types[LitTyIdx];
1866 return (LitTy.getScalarSizeInBits() < 16);
 1867 },
 1868 widenScalarOrEltToNextPow2(LitTyIdx, 16))
 1869 .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
1870 .widenScalarToNextPow2(BigTyIdx, 32);
1871
1872 }
1873
1874 auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR)
1875 .legalForCartesianProduct(AllS32Vectors, {S32})
1876 .legalForCartesianProduct(AllS64Vectors, {S64})
1877 .clampNumElements(0, V16S32, V32S32)
1878 .clampNumElements(0, V2S64, V16S64)
1879 .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16))
 1880 .moreElementsIf(
 1881 isIllegalRegisterType(0),
 1882 moreElementsToNextExistingRegClass(0));
 1883
1884 if (ST.hasScalarPackInsts()) {
1885 BuildVector
1886 // FIXME: Should probably widen s1 vectors straight to s32
1887 .minScalarOrElt(0, S16)
1888 .minScalar(1, S16);
1889
1890 getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1891 .legalFor({V2S16, S32})
1892 .lower();
1893 } else {
1894 BuildVector.customFor({V2S16, S16});
1895 BuildVector.minScalarOrElt(0, S32);
1896
1897 getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1898 .customFor({V2S16, S32})
1899 .lower();
1900 }
1901
1902 BuildVector.legalIf(isRegisterType(0));
1903
1904 // FIXME: Clamp maximum size
1905 getActionDefinitionsBuilder(G_CONCAT_VECTORS)
1906 .legalIf(all(isRegisterType(0), isRegisterType(1)))
1907 .clampMaxNumElements(0, S32, 32)
1908 .clampMaxNumElements(1, S16, 2) // TODO: Make 4?
1909 .clampMaxNumElements(0, S16, 64);
1910
1911 getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower();
1912
1913 // Merge/Unmerge
1914 for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
1915 unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
1916 unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;
1917
1918 auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
1919 const LLT Ty = Query.Types[TypeIdx];
1920 if (Ty.isVector()) {
1921 const LLT &EltTy = Ty.getElementType();
1922 if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 512)
1923 return true;
1924 if (!llvm::has_single_bit<uint32_t>(EltTy.getSizeInBits()))
1925 return true;
1926 }
1927 return false;
1928 };
1929
1930 auto &Builder = getActionDefinitionsBuilder(Op)
1931 .legalIf(all(isRegisterType(0), isRegisterType(1)))
1932 .lowerFor({{S16, V2S16}})
1933 .lowerIf([=](const LegalityQuery &Query) {
1934 const LLT BigTy = Query.Types[BigTyIdx];
1935 return BigTy.getSizeInBits() == 32;
1936 })
1937 // Try to widen to s16 first for small types.
1938 // TODO: Only do this on targets with legal s16 shifts
1939 .minScalarOrEltIf(scalarNarrowerThan(LitTyIdx, 16), LitTyIdx, S16)
1940 .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16)
1941 .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
1942 .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32),
1943 elementTypeIs(1, S16)),
1944 changeTo(1, V2S16))
 1945 // Clamp the little scalar to s32-s512 and make it a power of 2. It's not
1946 // worth considering the multiples of 64 since 2*192 and 2*384 are not
1947 // valid.
1948 .clampScalar(LitTyIdx, S32, S512)
1949 .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32)
1950 // Break up vectors with weird elements into scalars
1951 .fewerElementsIf(
1952 [=](const LegalityQuery &Query) { return notValidElt(Query, LitTyIdx); },
1953 scalarize(0))
1954 .fewerElementsIf(
1955 [=](const LegalityQuery &Query) { return notValidElt(Query, BigTyIdx); },
1956 scalarize(1))
1957 .clampScalar(BigTyIdx, S32, MaxScalar);
1958
1959 if (Op == G_MERGE_VALUES) {
1960 Builder.widenScalarIf(
1961 // TODO: Use 16-bit shifts if legal for 8-bit values?
1962 [=](const LegalityQuery &Query) {
1963 const LLT Ty = Query.Types[LitTyIdx];
1964 return Ty.getSizeInBits() < 32;
1965 },
1966 changeTo(LitTyIdx, S32));
1967 }
1968
1969 Builder.widenScalarIf(
1970 [=](const LegalityQuery &Query) {
1971 const LLT Ty = Query.Types[BigTyIdx];
1972 return Ty.getSizeInBits() % 16 != 0;
1973 },
1974 [=](const LegalityQuery &Query) {
 1975 // Pick the next power of 2, or a multiple of 64 if the size is over 128,
 1976 // whichever is smaller.
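 // For example, a 96-bit type widens to 128 (the next power of 2), while a
 // 160-bit type widens to 192 (alignTo<64>(161)) rather than all the way to 256.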
1977 const LLT &Ty = Query.Types[BigTyIdx];
1978 unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1);
1979 if (NewSizeInBits >= 256) {
1980 unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1);
1981 if (RoundedTo < NewSizeInBits)
1982 NewSizeInBits = RoundedTo;
1983 }
1984 return std::pair(BigTyIdx, LLT::scalar(NewSizeInBits));
1985 })
1986 // Any vectors left are the wrong size. Scalarize them.
1987 .scalarize(0)
1988 .scalarize(1);
1989 }
1990
1991 // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
1992 // RegBankSelect.
1993 auto &SextInReg = getActionDefinitionsBuilder(G_SEXT_INREG)
1994 .legalFor({{S32}, {S64}});
1995
1996 if (ST.hasVOP3PInsts()) {
1997 SextInReg.lowerFor({{V2S16}})
1998 // Prefer to reduce vector widths for 16-bit vectors before lowering, to
1999 // get more vector shift opportunities, since we'll get those when
2000 // expanded.
2001 .clampMaxNumElementsStrict(0, S16, 2);
2002 } else if (ST.has16BitInsts()) {
2003 SextInReg.lowerFor({{S32}, {S64}, {S16}});
2004 } else {
2005 // Prefer to promote to s32 before lowering if we don't have 16-bit
 2006 // shifts. This avoids a lot of intermediate truncate and extend operations.
2007 SextInReg.lowerFor({{S32}, {S64}});
2008 }
2009
2010 SextInReg
2011 .scalarize(0)
2012 .clampScalar(0, S32, S64)
2013 .lower();
2014
2015 getActionDefinitionsBuilder({G_ROTR, G_ROTL})
2016 .scalarize(0)
2017 .lower();
2018
 2019 // TODO: Only try to form v2s16 with legal packed instructions.
2020 getActionDefinitionsBuilder(G_FSHR)
2021 .legalFor({{S32, S32}})
2022 .lowerFor({{V2S16, V2S16}})
2023 .clampMaxNumElementsStrict(0, S16, 2)
2024 .scalarize(0)
2025 .lower();
2026
2027 if (ST.hasVOP3PInsts()) {
2028 getActionDefinitionsBuilder(G_FSHL)
2029 .lowerFor({{V2S16, V2S16}})
2030 .clampMaxNumElementsStrict(0, S16, 2)
2031 .scalarize(0)
2032 .lower();
2033 } else {
2034 getActionDefinitionsBuilder(G_FSHL)
2035 .scalarize(0)
2036 .lower();
2037 }
2038
2039 getActionDefinitionsBuilder(G_READCYCLECOUNTER)
2040 .legalFor({S64});
2041
2042 getActionDefinitionsBuilder(G_READSTEADYCOUNTER).legalFor({S64});
2043
2044 getActionDefinitionsBuilder(G_FENCE)
2045 .alwaysLegal();
2046
2047 getActionDefinitionsBuilder({G_SMULO, G_UMULO})
2048 .scalarize(0)
2049 .minScalar(0, S32)
2050 .lower();
2051
2052 getActionDefinitionsBuilder({G_SBFX, G_UBFX})
2053 .legalFor({{S32, S32}, {S64, S32}})
2054 .clampScalar(1, S32, S32)
2055 .clampScalar(0, S32, S64)
2056 .widenScalarToNextPow2(0)
2057 .scalarize(0);
2058
2059 getActionDefinitionsBuilder(
2060 {// TODO: Verify V_BFI_B32 is generated from expanded bit ops
2061 G_FCOPYSIGN,
2062
2063 G_ATOMIC_CMPXCHG_WITH_SUCCESS, G_ATOMICRMW_NAND, G_ATOMICRMW_FSUB,
2064 G_READ_REGISTER, G_WRITE_REGISTER,
2065
2066 G_SADDO, G_SSUBO})
2067 .lower();
2068
2069 if (ST.hasIEEEMinMax()) {
2070 getActionDefinitionsBuilder({G_FMINIMUM, G_FMAXIMUM})
2071 .legalFor(FPTypesPK16)
2072 .clampMaxNumElements(0, S16, 2)
2073 .scalarize(0);
2074 } else {
2075 // TODO: Implement
2076 getActionDefinitionsBuilder({G_FMINIMUM, G_FMAXIMUM}).lower();
2077 }
2078
2079 getActionDefinitionsBuilder({G_MEMCPY, G_MEMCPY_INLINE, G_MEMMOVE, G_MEMSET})
2080 .lower();
2081
2082 getActionDefinitionsBuilder({G_TRAP, G_DEBUGTRAP}).custom();
2083
2084 getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE,
2085 G_INDEXED_LOAD, G_INDEXED_SEXTLOAD,
2086 G_INDEXED_ZEXTLOAD, G_INDEXED_STORE})
2087 .unsupported();
2088
2089 getActionDefinitionsBuilder(G_PREFETCH).alwaysLegal();
2090
2091 getLegacyLegalizerInfo().computeTables();
2092 verify(*ST.getInstrInfo());
2093}
2094
2095bool AMDGPULegalizerInfo::legalizeCustom(
 2096 LegalizerHelper &Helper, MachineInstr &MI,
 2097 LostDebugLocObserver &LocObserver) const {
2098 MachineIRBuilder &B = Helper.MIRBuilder;
2099 MachineRegisterInfo &MRI = *B.getMRI();
2100
2101 switch (MI.getOpcode()) {
2102 case TargetOpcode::G_ADDRSPACE_CAST:
2103 return legalizeAddrSpaceCast(MI, MRI, B);
2104 case TargetOpcode::G_INTRINSIC_ROUNDEVEN:
2105 return legalizeFroundeven(MI, MRI, B);
2106 case TargetOpcode::G_FCEIL:
2107 return legalizeFceil(MI, MRI, B);
2108 case TargetOpcode::G_FREM:
2109 return legalizeFrem(MI, MRI, B);
2110 case TargetOpcode::G_INTRINSIC_TRUNC:
2111 return legalizeIntrinsicTrunc(MI, MRI, B);
2112 case TargetOpcode::G_SITOFP:
2113 return legalizeITOFP(MI, MRI, B, true);
2114 case TargetOpcode::G_UITOFP:
2115 return legalizeITOFP(MI, MRI, B, false);
2116 case TargetOpcode::G_FPTOSI:
2117 return legalizeFPTOI(MI, MRI, B, true);
2118 case TargetOpcode::G_FPTOUI:
2119 return legalizeFPTOI(MI, MRI, B, false);
2120 case TargetOpcode::G_FMINNUM:
2121 case TargetOpcode::G_FMAXNUM:
2122 case TargetOpcode::G_FMINNUM_IEEE:
2123 case TargetOpcode::G_FMAXNUM_IEEE:
2124 return legalizeMinNumMaxNum(Helper, MI);
2125 case TargetOpcode::G_EXTRACT_VECTOR_ELT:
2126 return legalizeExtractVectorElt(MI, MRI, B);
2127 case TargetOpcode::G_INSERT_VECTOR_ELT:
2128 return legalizeInsertVectorElt(MI, MRI, B);
2129 case TargetOpcode::G_FSIN:
2130 case TargetOpcode::G_FCOS:
2131 return legalizeSinCos(MI, MRI, B);
2132 case TargetOpcode::G_GLOBAL_VALUE:
2133 return legalizeGlobalValue(MI, MRI, B);
2134 case TargetOpcode::G_LOAD:
2135 case TargetOpcode::G_SEXTLOAD:
2136 case TargetOpcode::G_ZEXTLOAD:
2137 return legalizeLoad(Helper, MI);
2138 case TargetOpcode::G_STORE:
2139 return legalizeStore(Helper, MI);
2140 case TargetOpcode::G_FMAD:
2141 return legalizeFMad(MI, MRI, B);
2142 case TargetOpcode::G_FDIV:
2143 return legalizeFDIV(MI, MRI, B);
2144 case TargetOpcode::G_FFREXP:
2145 return legalizeFFREXP(MI, MRI, B);
2146 case TargetOpcode::G_FSQRT:
2147 return legalizeFSQRT(MI, MRI, B);
2148 case TargetOpcode::G_UDIV:
2149 case TargetOpcode::G_UREM:
2150 case TargetOpcode::G_UDIVREM:
2151 return legalizeUnsignedDIV_REM(MI, MRI, B);
2152 case TargetOpcode::G_SDIV:
2153 case TargetOpcode::G_SREM:
2154 case TargetOpcode::G_SDIVREM:
2155 return legalizeSignedDIV_REM(MI, MRI, B);
2156 case TargetOpcode::G_ATOMIC_CMPXCHG:
2157 return legalizeAtomicCmpXChg(MI, MRI, B);
2158 case TargetOpcode::G_FLOG2:
2159 return legalizeFlog2(MI, B);
2160 case TargetOpcode::G_FLOG:
2161 case TargetOpcode::G_FLOG10:
2162 return legalizeFlogCommon(MI, B);
2163 case TargetOpcode::G_FEXP2:
2164 return legalizeFExp2(MI, B);
2165 case TargetOpcode::G_FEXP:
2166 case TargetOpcode::G_FEXP10:
2167 return legalizeFExp(MI, B);
2168 case TargetOpcode::G_FPOW:
2169 return legalizeFPow(MI, B);
2170 case TargetOpcode::G_FFLOOR:
2171 return legalizeFFloor(MI, MRI, B);
2172 case TargetOpcode::G_BUILD_VECTOR:
2173 case TargetOpcode::G_BUILD_VECTOR_TRUNC:
2174 return legalizeBuildVector(MI, MRI, B);
2175 case TargetOpcode::G_MUL:
2176 return legalizeMul(Helper, MI);
2177 case TargetOpcode::G_CTLZ:
2178 case TargetOpcode::G_CTTZ:
2179 return legalizeCTLZ_CTTZ(MI, MRI, B);
2180 case TargetOpcode::G_CTLZ_ZERO_UNDEF:
2181 return legalizeCTLZ_ZERO_UNDEF(MI, MRI, B);
2182 case TargetOpcode::G_INTRINSIC_FPTRUNC_ROUND:
2183 return legalizeFPTruncRound(MI, B);
2184 case TargetOpcode::G_STACKSAVE:
2185 return legalizeStackSave(MI, B);
2186 case TargetOpcode::G_GET_FPENV:
2187 return legalizeGetFPEnv(MI, MRI, B);
2188 case TargetOpcode::G_SET_FPENV:
2189 return legalizeSetFPEnv(MI, MRI, B);
2190 case TargetOpcode::G_TRAP:
2191 return legalizeTrap(MI, MRI, B);
2192 case TargetOpcode::G_DEBUGTRAP:
2193 return legalizeDebugTrap(MI, MRI, B);
2194 default:
2195 return false;
2196 }
2197
2198 llvm_unreachable("expected switch to return");
2199}
2200
2201Register AMDGPULegalizerInfo::getSegmentAperture(
 2202 unsigned AS,
 2203 MachineRegisterInfo &MRI,
 2204 MachineIRBuilder &B) const {
2205 MachineFunction &MF = B.getMF();
2206 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
2207 const LLT S32 = LLT::scalar(32);
2208 const LLT S64 = LLT::scalar(64);
2209
2211
2212 if (ST.hasApertureRegs()) {
2213 // Note: this register is somewhat broken. When used as a 32-bit operand,
2214 // it only returns zeroes. The real value is in the upper 32 bits.
 2215 // Thus, we must emit an extract of the high 32 bits.
2216 const unsigned ApertureRegNo = (AS == AMDGPUAS::LOCAL_ADDRESS)
2217 ? AMDGPU::SRC_SHARED_BASE
2218 : AMDGPU::SRC_PRIVATE_BASE;
2219 // FIXME: It would be more natural to emit a COPY here, but then copy
2220 // coalescing would kick in and it would think it's okay to use the "HI"
2221 // subregister (instead of extracting the HI 32 bits) which is an artificial
2222 // (unusable) register.
2223 // Register TableGen definitions would need an overhaul to get rid of the
2224 // artificial "HI" aperture registers and prevent this kind of issue from
2225 // happening.
2226 Register Dst = MRI.createGenericVirtualRegister(S64);
2227 MRI.setRegClass(Dst, &AMDGPU::SReg_64RegClass);
2228 B.buildInstr(AMDGPU::S_MOV_B64, {Dst}, {Register(ApertureRegNo)});
2229 return B.buildUnmerge(S32, Dst).getReg(1);
2230 }
2231
2232 // TODO: can we be smarter about machine pointer info?
2234 Register LoadAddr = MRI.createGenericVirtualRegister(
2236 // For code object version 5, private_base and shared_base are passed through
2237 // implicit kernargs.
2244 ST.getTargetLowering()->getImplicitParameterOffset(B.getMF(), Param);
2245
2246 Register KernargPtrReg = MRI.createGenericVirtualRegister(
2248
2249 if (!loadInputValue(KernargPtrReg, B,
2251 return Register();
2252
2254 PtrInfo,
2258
2259 // Pointer address
2260 B.buildPtrAdd(LoadAddr, KernargPtrReg,
2261 B.buildConstant(LLT::scalar(64), Offset).getReg(0));
2262 // Load address
2263 return B.buildLoad(S32, LoadAddr, *MMO).getReg(0);
2264 }
2265
2266 Register QueuePtr = MRI.createGenericVirtualRegister(
2268
2270 return Register();
2271
2272 // Offset into amd_queue_t for group_segment_aperture_base_hi /
2273 // private_segment_aperture_base_hi.
2274 uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;
2275
2277 PtrInfo,
2280 LLT::scalar(32), commonAlignment(Align(64), StructOffset));
2281
2282 B.buildPtrAdd(LoadAddr, QueuePtr,
2283 B.buildConstant(LLT::scalar(64), StructOffset).getReg(0));
2284 return B.buildLoad(S32, LoadAddr, *MMO).getReg(0);
2285}
2286
2287/// Return true if the value is a known valid address, such that a null check is
2288/// not necessary.
2289static bool isKnownNonNull(Register Val, MachineRegisterInfo &MRI,
 2290 const AMDGPUTargetMachine &TM, unsigned AddrSpace) {
2291 MachineInstr *Def = MRI.getVRegDef(Val);
2292 switch (Def->getOpcode()) {
2293 case AMDGPU::G_FRAME_INDEX:
2294 case AMDGPU::G_GLOBAL_VALUE:
2295 case AMDGPU::G_BLOCK_ADDR:
2296 return true;
2297 case AMDGPU::G_CONSTANT: {
2298 const ConstantInt *CI = Def->getOperand(1).getCImm();
2299 return CI->getSExtValue() != TM.getNullPointerValue(AddrSpace);
2300 }
2301 default:
2302 return false;
2303 }
2304
2305 return false;
2306}
2307
2310 MachineIRBuilder &B) const {
2311 MachineFunction &MF = B.getMF();
2312
2313 // MI can either be a G_ADDRSPACE_CAST or a
2314 // G_INTRINSIC @llvm.amdgcn.addrspacecast.nonnull
2315 assert(MI.getOpcode() == TargetOpcode::G_ADDRSPACE_CAST ||
2316 (isa<GIntrinsic>(MI) && cast<GIntrinsic>(MI).getIntrinsicID() ==
2317 Intrinsic::amdgcn_addrspacecast_nonnull));
2318
2319 const LLT S32 = LLT::scalar(32);
2320 Register Dst = MI.getOperand(0).getReg();
2321 Register Src = isa<GIntrinsic>(MI) ? MI.getOperand(2).getReg()
2322 : MI.getOperand(1).getReg();
2323 LLT DstTy = MRI.getType(Dst);
2324 LLT SrcTy = MRI.getType(Src);
2325 unsigned DestAS = DstTy.getAddressSpace();
2326 unsigned SrcAS = SrcTy.getAddressSpace();
2327
2328 // TODO: Avoid reloading from the queue ptr for each cast, or at least each
2329 // vector element.
2330 assert(!DstTy.isVector());
2331
2332 const AMDGPUTargetMachine &TM
2333 = static_cast<const AMDGPUTargetMachine &>(MF.getTarget());
2334
2335 if (TM.isNoopAddrSpaceCast(SrcAS, DestAS)) {
2336 MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST));
2337 return true;
2338 }
2339
2340 if (SrcAS == AMDGPUAS::FLAT_ADDRESS &&
2341 (DestAS == AMDGPUAS::LOCAL_ADDRESS ||
2342 DestAS == AMDGPUAS::PRIVATE_ADDRESS)) {
2343 // For llvm.amdgcn.addrspacecast.nonnull we can always assume non-null, for
2344 // G_ADDRSPACE_CAST we need to guess.
2345 if (isa<GIntrinsic>(MI) || isKnownNonNull(Src, MRI, TM, SrcAS)) {
2346 // Extract low 32-bits of the pointer.
2347 B.buildExtract(Dst, Src, 0);
2348 MI.eraseFromParent();
2349 return true;
2350 }
2351
2352 unsigned NullVal = TM.getNullPointerValue(DestAS);
2353
2354 auto SegmentNull = B.buildConstant(DstTy, NullVal);
2355 auto FlatNull = B.buildConstant(SrcTy, 0);
2356
2357 // Extract low 32-bits of the pointer.
2358 auto PtrLo32 = B.buildExtract(DstTy, Src, 0);
2359
2360 auto CmpRes =
2361 B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, FlatNull.getReg(0));
2362 B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0));
2363
2364 MI.eraseFromParent();
2365 return true;
2366 }
2367
2368 if (DestAS == AMDGPUAS::FLAT_ADDRESS &&
2369 (SrcAS == AMDGPUAS::LOCAL_ADDRESS ||
2370 SrcAS == AMDGPUAS::PRIVATE_ADDRESS)) {
2371 Register ApertureReg = getSegmentAperture(SrcAS, MRI, B);
2372 if (!ApertureReg.isValid())
2373 return false;
2374
2375 // Coerce the type of the low half of the result so we can use merge_values.
2376 Register SrcAsInt = B.buildPtrToInt(S32, Src).getReg(0);
2377
2378 // TODO: Should we allow mismatched types but matching sizes in merges to
2379 // avoid the ptrtoint?
2380 auto BuildPtr = B.buildMergeLikeInstr(DstTy, {SrcAsInt, ApertureReg});
2381
2382 // For llvm.amdgcn.addrspacecast.nonnull we can always assume non-null, for
2383 // G_ADDRSPACE_CAST we need to guess.
2384 if (isa<GIntrinsic>(MI) || isKnownNonNull(Src, MRI, TM, SrcAS)) {
2385 B.buildCopy(Dst, BuildPtr);
2386 MI.eraseFromParent();
2387 return true;
2388 }
2389
2390 auto SegmentNull = B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS));
2391 auto FlatNull = B.buildConstant(DstTy, TM.getNullPointerValue(DestAS));
2392
2393 auto CmpRes = B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src,
2394 SegmentNull.getReg(0));
2395
2396 B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull);
2397
2398 MI.eraseFromParent();
2399 return true;
2400 }
2401
2402 if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
2403 SrcTy.getSizeInBits() == 64) {
2404 // Truncate.
2405 B.buildExtract(Dst, Src, 0);
2406 MI.eraseFromParent();
2407 return true;
2408 }
2409
2410 if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
2411 DstTy.getSizeInBits() == 64) {
 2412 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
 2413 uint32_t AddrHiVal = Info->get32BitAddressHighBits();
2414 auto PtrLo = B.buildPtrToInt(S32, Src);
2415 auto HighAddr = B.buildConstant(S32, AddrHiVal);
2416 B.buildMergeLikeInstr(Dst, {PtrLo, HighAddr});
2417 MI.eraseFromParent();
2418 return true;
2419 }
2420
2421 DiagnosticInfoUnsupported InvalidAddrSpaceCast(
2422 MF.getFunction(), "invalid addrspacecast", B.getDebugLoc());
2423
2424 LLVMContext &Ctx = MF.getFunction().getContext();
2425 Ctx.diagnose(InvalidAddrSpaceCast);
2426 B.buildUndef(Dst);
2427 MI.eraseFromParent();
2428 return true;
2429}
2430
2433 MachineIRBuilder &B) const {
2434 Register Src = MI.getOperand(1).getReg();
2435 LLT Ty = MRI.getType(Src);
2436 assert(Ty.isScalar() && Ty.getSizeInBits() == 64);
2437
2438 APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
2439 APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");
2440
2441 auto C1 = B.buildFConstant(Ty, C1Val);
2442 auto CopySign = B.buildFCopysign(Ty, C1, Src);
2443
2444 // TODO: Should this propagate fast-math-flags?
2445 auto Tmp1 = B.buildFAdd(Ty, Src, CopySign);
2446 auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign);
2447
2448 auto C2 = B.buildFConstant(Ty, C2Val);
2449 auto Fabs = B.buildFAbs(Ty, Src);
2450
2451 auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2);
2452 B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2);
2453 MI.eraseFromParent();
2454 return true;
2455}
2456
2459 MachineIRBuilder &B) const {
2460
2461 const LLT S1 = LLT::scalar(1);
2462 const LLT S64 = LLT::scalar(64);
2463
2464 Register Src = MI.getOperand(1).getReg();
2465 assert(MRI.getType(Src) == S64);
2466
2467 // result = trunc(src)
2468 // if (src > 0.0 && src != result)
2469 // result += 1.0
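 // For example, ceil(2.25): trunc gives 2.0, and since 2.25 > 0.0 and
 // 2.25 != 2.0 the select yields 1.0, producing 3.0. For ceil(-2.25) the
 // condition is false and the truncated value -2.0 is returned unchanged.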
2470
2471 auto Trunc = B.buildIntrinsicTrunc(S64, Src);
2472
2473 const auto Zero = B.buildFConstant(S64, 0.0);
2474 const auto One = B.buildFConstant(S64, 1.0);
2475 auto Lt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero);
2476 auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc);
2477 auto And = B.buildAnd(S1, Lt0, NeTrunc);
2478 auto Add = B.buildSelect(S64, And, One, Zero);
2479
2480 // TODO: Should this propagate fast-math-flags?
2481 B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add);
2482 MI.eraseFromParent();
2483 return true;
2484}
2485
2488 MachineIRBuilder &B) const {
2489 Register DstReg = MI.getOperand(0).getReg();
2490 Register Src0Reg = MI.getOperand(1).getReg();
2491 Register Src1Reg = MI.getOperand(2).getReg();
2492 auto Flags = MI.getFlags();
2493 LLT Ty = MRI.getType(DstReg);
2494
2495 auto Div = B.buildFDiv(Ty, Src0Reg, Src1Reg, Flags);
2496 auto Trunc = B.buildIntrinsicTrunc(Ty, Div, Flags);
2497 auto Neg = B.buildFNeg(Ty, Trunc, Flags);
2498 B.buildFMA(DstReg, Neg, Src1Reg, Src0Reg, Flags);
2499 MI.eraseFromParent();
2500 return true;
2501}
2502
2505 const unsigned FractBits = 52;
2506 const unsigned ExpBits = 11;
2507 LLT S32 = LLT::scalar(32);
2508
2509 auto Const0 = B.buildConstant(S32, FractBits - 32);
2510 auto Const1 = B.buildConstant(S32, ExpBits);
2511
2512 auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32})
2513 .addUse(Hi)
2514 .addUse(Const0.getReg(0))
2515 .addUse(Const1.getReg(0));
2516
2517 return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023));
2518}
2519
2522 MachineIRBuilder &B) const {
2523 const LLT S1 = LLT::scalar(1);
2524 const LLT S32 = LLT::scalar(32);
2525 const LLT S64 = LLT::scalar(64);
2526
2527 Register Src = MI.getOperand(1).getReg();
2528 assert(MRI.getType(Src) == S64);
2529
2530 // TODO: Should this use extract since the low half is unused?
2531 auto Unmerge = B.buildUnmerge({S32, S32}, Src);
2532 Register Hi = Unmerge.getReg(1);
2533
2534 // Extract the upper half, since this is where we will find the sign and
2535 // exponent.
2536 auto Exp = extractF64Exponent(Hi, B);
2537
2538 const unsigned FractBits = 52;
2539
2540 // Extract the sign bit.
2541 const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31);
2542 auto SignBit = B.buildAnd(S32, Hi, SignBitMask);
2543
2544 const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1);
2545
2546 const auto Zero32 = B.buildConstant(S32, 0);
2547
2548 // Extend back to 64-bits.
2549 auto SignBit64 = B.buildMergeLikeInstr(S64, {Zero32, SignBit});
2550
2551 auto Shr = B.buildAShr(S64, FractMask, Exp);
2552 auto Not = B.buildNot(S64, Shr);
2553 auto Tmp0 = B.buildAnd(S64, Src, Not);
2554 auto FiftyOne = B.buildConstant(S32, FractBits - 1);
2555
2556 auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32);
2557 auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne);
2558
2559 auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0);
2560 B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1);
2561 MI.eraseFromParent();
2562 return true;
2563}
2564
2567 MachineIRBuilder &B, bool Signed) const {
2568
2569 Register Dst = MI.getOperand(0).getReg();
2570 Register Src = MI.getOperand(1).getReg();
2571
2572 const LLT S64 = LLT::scalar(64);
2573 const LLT S32 = LLT::scalar(32);
2574
2575 assert(MRI.getType(Src) == S64);
2576
2577 auto Unmerge = B.buildUnmerge({S32, S32}, Src);
2578 auto ThirtyTwo = B.buildConstant(S32, 32);
2579
2580 if (MRI.getType(Dst) == S64) {
2581 auto CvtHi = Signed ? B.buildSITOFP(S64, Unmerge.getReg(1))
2582 : B.buildUITOFP(S64, Unmerge.getReg(1));
2583
2584 auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0));
2585 auto LdExp = B.buildFLdexp(S64, CvtHi, ThirtyTwo);
2586
2587 // TODO: Should this propagate fast-math-flags?
2588 B.buildFAdd(Dst, LdExp, CvtLo);
2589 MI.eraseFromParent();
2590 return true;
2591 }
2592
2593 assert(MRI.getType(Dst) == S32);
2594
2595 auto One = B.buildConstant(S32, 1);
2596
2597 MachineInstrBuilder ShAmt;
2598 if (Signed) {
2599 auto ThirtyOne = B.buildConstant(S32, 31);
2600 auto X = B.buildXor(S32, Unmerge.getReg(0), Unmerge.getReg(1));
2601 auto OppositeSign = B.buildAShr(S32, X, ThirtyOne);
2602 auto MaxShAmt = B.buildAdd(S32, ThirtyTwo, OppositeSign);
2603 auto LS = B.buildIntrinsic(Intrinsic::amdgcn_sffbh, {S32})
2604 .addUse(Unmerge.getReg(1));
2605 auto LS2 = B.buildSub(S32, LS, One);
2606 ShAmt = B.buildUMin(S32, LS2, MaxShAmt);
2607 } else
2608 ShAmt = B.buildCTLZ(S32, Unmerge.getReg(1));
2609 auto Norm = B.buildShl(S64, Src, ShAmt);
2610 auto Unmerge2 = B.buildUnmerge({S32, S32}, Norm);
2611 auto Adjust = B.buildUMin(S32, One, Unmerge2.getReg(0));
2612 auto Norm2 = B.buildOr(S32, Unmerge2.getReg(1), Adjust);
2613 auto FVal = Signed ? B.buildSITOFP(S32, Norm2) : B.buildUITOFP(S32, Norm2);
2614 auto Scale = B.buildSub(S32, ThirtyTwo, ShAmt);
2615 B.buildFLdexp(Dst, FVal, Scale);
2616 MI.eraseFromParent();
2617 return true;
2618}
2619
2620// TODO: Copied from DAG implementation. Verify logic and document how this
2621// actually works.
2625 bool Signed) const {
2626
2627 Register Dst = MI.getOperand(0).getReg();
2628 Register Src = MI.getOperand(1).getReg();
2629
2630 const LLT S64 = LLT::scalar(64);
2631 const LLT S32 = LLT::scalar(32);
2632
2633 const LLT SrcLT = MRI.getType(Src);
2634 assert((SrcLT == S32 || SrcLT == S64) && MRI.getType(Dst) == S64);
2635
2636 unsigned Flags = MI.getFlags();
2637
2638 // The basic idea of converting a floating point number into a pair of 32-bit
2639 // integers is illustrated as follows:
2640 //
2641 // tf := trunc(val);
2642 // hif := floor(tf * 2^-32);
2643 // lof := tf - hif * 2^32; // lof is always positive due to floor.
2644 // hi := fptoi(hif);
2645 // lo := fptoi(lof);
2646 //
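 // Worked example with val = 2^40 + 7 (exactly representable in f64):
 //   tf  = 2^40 + 7
 //   hif = floor(tf * 2^-32) = 256
 //   lof = tf - 256 * 2^32   = 7
 // so hi = 256 and lo = 7, and the merged result is (256 << 32) | 7 = val.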
2647 auto Trunc = B.buildIntrinsicTrunc(SrcLT, Src, Flags);
 2648 MachineInstrBuilder Sign;
 2649 if (Signed && SrcLT == S32) {
 2650 // However, a 32-bit floating point number has only a 23-bit mantissa, which
 2651 // is not enough to hold all the significant bits of `lof` if val is
 2652 // negative. To avoid the loss of precision, we need to take the absolute
2653 // value after truncating and flip the result back based on the original
2654 // signedness.
2655 Sign = B.buildAShr(S32, Src, B.buildConstant(S32, 31));
2656 Trunc = B.buildFAbs(S32, Trunc, Flags);
2657 }
2658 MachineInstrBuilder K0, K1;
2659 if (SrcLT == S64) {
2660 K0 = B.buildFConstant(
2661 S64, llvm::bit_cast<double>(UINT64_C(/*2^-32*/ 0x3df0000000000000)));
2662 K1 = B.buildFConstant(
2663 S64, llvm::bit_cast<double>(UINT64_C(/*-2^32*/ 0xc1f0000000000000)));
2664 } else {
2665 K0 = B.buildFConstant(
2666 S32, llvm::bit_cast<float>(UINT32_C(/*2^-32*/ 0x2f800000)));
2667 K1 = B.buildFConstant(
2668 S32, llvm::bit_cast<float>(UINT32_C(/*-2^32*/ 0xcf800000)));
2669 }
2670
2671 auto Mul = B.buildFMul(SrcLT, Trunc, K0, Flags);
2672 auto FloorMul = B.buildFFloor(SrcLT, Mul, Flags);
2673 auto Fma = B.buildFMA(SrcLT, FloorMul, K1, Trunc, Flags);
2674
2675 auto Hi = (Signed && SrcLT == S64) ? B.buildFPTOSI(S32, FloorMul)
2676 : B.buildFPTOUI(S32, FloorMul);
2677 auto Lo = B.buildFPTOUI(S32, Fma);
2678
2679 if (Signed && SrcLT == S32) {
2680 // Flip the result based on the signedness, which is either all 0s or 1s.
2681 Sign = B.buildMergeLikeInstr(S64, {Sign, Sign});
2682 // r := xor({lo, hi}, sign) - sign;
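 // When the input was negative, Sign is all ones, so (v ^ Sign) - Sign is the
 // two's complement negation of v; when it was non-negative, Sign is zero and
 // the value passes through unchanged.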
2683 B.buildSub(Dst, B.buildXor(S64, B.buildMergeLikeInstr(S64, {Lo, Hi}), Sign),
2684 Sign);
2685 } else
2686 B.buildMergeLikeInstr(Dst, {Lo, Hi});
2687 MI.eraseFromParent();
2688
2689 return true;
2690}
2691
2693 MachineInstr &MI) const {
2694 MachineFunction &MF = Helper.MIRBuilder.getMF();
 2695 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
 2696
2697 const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE ||
2698 MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE;
2699
2700 // With ieee_mode disabled, the instructions have the correct behavior
2701 // already for G_FMINNUM/G_FMAXNUM
2702 if (!MFI->getMode().IEEE)
2703 return !IsIEEEOp;
2704
2705 if (IsIEEEOp)
2706 return true;
2707
 2707
 2708 return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized;
 2709}
2710
2713 MachineIRBuilder &B) const {
2714 // TODO: Should move some of this into LegalizerHelper.
2715
2716 // TODO: Promote dynamic indexing of s16 to s32
2717
2718 Register Dst = MI.getOperand(0).getReg();
2719 Register Vec = MI.getOperand(1).getReg();
2720
2721 LLT VecTy = MRI.getType(Vec);
2722 LLT EltTy = VecTy.getElementType();
2723 assert(EltTy == MRI.getType(Dst));
2724
2725 // Other legalization maps vector<? x [type bigger than 64 bits]> via bitcasts
 2726 // but we can't go directly to that logic because you can't bitcast a vector
2727 // of pointers to a vector of integers. Therefore, introduce an intermediate
2728 // vector of integers using ptrtoint (and inttoptr on the output) in order to
2729 // drive the legalization forward.
2730 if (EltTy.isPointer() && EltTy.getSizeInBits() > 64) {
2731 LLT IntTy = LLT::scalar(EltTy.getSizeInBits());
2732 LLT IntVecTy = VecTy.changeElementType(IntTy);
2733
2734 auto IntVec = B.buildPtrToInt(IntVecTy, Vec);
2735 auto IntElt = B.buildExtractVectorElement(IntTy, IntVec, MI.getOperand(2));
2736 B.buildIntToPtr(Dst, IntElt);
2737
2738 MI.eraseFromParent();
2739 return true;
2740 }
2741
2742 // FIXME: Artifact combiner probably should have replaced the truncated
2743 // constant before this, so we shouldn't need
2744 // getIConstantVRegValWithLookThrough.
2745 std::optional<ValueAndVReg> MaybeIdxVal =
2746 getIConstantVRegValWithLookThrough(MI.getOperand(2).getReg(), MRI);
2747 if (!MaybeIdxVal) // Dynamic case will be selected to register indexing.
2748 return true;
2749 const uint64_t IdxVal = MaybeIdxVal->Value.getZExtValue();
2750
2751 if (IdxVal < VecTy.getNumElements()) {
2752 auto Unmerge = B.buildUnmerge(EltTy, Vec);
2753 B.buildCopy(Dst, Unmerge.getReg(IdxVal));
2754 } else {
2755 B.buildUndef(Dst);
2756 }
2757
2758 MI.eraseFromParent();
2759 return true;
2760}
2761
2764 MachineIRBuilder &B) const {
2765 // TODO: Should move some of this into LegalizerHelper.
2766
2767 // TODO: Promote dynamic indexing of s16 to s32
2768
2769 Register Dst = MI.getOperand(0).getReg();
2770 Register Vec = MI.getOperand(1).getReg();
2771 Register Ins = MI.getOperand(2).getReg();
2772
2773 LLT VecTy = MRI.getType(Vec);
2774 LLT EltTy = VecTy.getElementType();
2775 assert(EltTy == MRI.getType(Ins));
2776
2777 // Other legalization maps vector<? x [type bigger than 64 bits]> via bitcasts
 2778 // but we can't go directly to that logic because you can't bitcast a vector
2779 // of pointers to a vector of integers. Therefore, make the pointer vector
2780 // into an equivalent vector of integers with ptrtoint, insert the ptrtoint'd
2781 // new value, and then inttoptr the result vector back. This will then allow
2782 // the rest of legalization to take over.
2783 if (EltTy.isPointer() && EltTy.getSizeInBits() > 64) {
2784 LLT IntTy = LLT::scalar(EltTy.getSizeInBits());
2785 LLT IntVecTy = VecTy.changeElementType(IntTy);
2786
2787 auto IntVecSource = B.buildPtrToInt(IntVecTy, Vec);
2788 auto IntIns = B.buildPtrToInt(IntTy, Ins);
2789 auto IntVecDest = B.buildInsertVectorElement(IntVecTy, IntVecSource, IntIns,
2790 MI.getOperand(3));
2791 B.buildIntToPtr(Dst, IntVecDest);
2792 MI.eraseFromParent();
2793 return true;
2794 }
2795
2796 // FIXME: Artifact combiner probably should have replaced the truncated
2797 // constant before this, so we shouldn't need
2798 // getIConstantVRegValWithLookThrough.
2799 std::optional<ValueAndVReg> MaybeIdxVal =
2800 getIConstantVRegValWithLookThrough(MI.getOperand(3).getReg(), MRI);
2801 if (!MaybeIdxVal) // Dynamic case will be selected to register indexing.
2802 return true;
2803
2804 const uint64_t IdxVal = MaybeIdxVal->Value.getZExtValue();
2805
2806 unsigned NumElts = VecTy.getNumElements();
2807 if (IdxVal < NumElts) {
 2808 SmallVector<Register, 8> SrcRegs;
 2809 for (unsigned i = 0; i < NumElts; ++i)
2810 SrcRegs.push_back(MRI.createGenericVirtualRegister(EltTy));
2811 B.buildUnmerge(SrcRegs, Vec);
2812
2813 SrcRegs[IdxVal] = MI.getOperand(2).getReg();
2814 B.buildMergeLikeInstr(Dst, SrcRegs);
2815 } else {
2816 B.buildUndef(Dst);
2817 }
2818
2819 MI.eraseFromParent();
2820 return true;
2821}
2822
2825 MachineIRBuilder &B) const {
2826
2827 Register DstReg = MI.getOperand(0).getReg();
2828 Register SrcReg = MI.getOperand(1).getReg();
2829 LLT Ty = MRI.getType(DstReg);
2830 unsigned Flags = MI.getFlags();
2831
2832 Register TrigVal;
2833 auto OneOver2Pi = B.buildFConstant(Ty, 0.5 * numbers::inv_pi);
2834 if (ST.hasTrigReducedRange()) {
2835 auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags);
2836 TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty})
2837 .addUse(MulVal.getReg(0))
2838 .setMIFlags(Flags)
2839 .getReg(0);
2840 } else
2841 TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0);
2842
2843 Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ?
2844 Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos;
2845 B.buildIntrinsic(TrigIntrin, ArrayRef<Register>(DstReg))
2846 .addUse(TrigVal)
2847 .setMIFlags(Flags);
2848 MI.eraseFromParent();
2849 return true;
2850}
2851
2854 const GlobalValue *GV,
2855 int64_t Offset,
2856 unsigned GAFlags) const {
2857 assert(isInt<32>(Offset + 4) && "32-bit offset is expected!");
2858 // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered
2859 // to the following code sequence:
2860 //
2861 // For constant address space:
2862 // s_getpc_b64 s[0:1]
2863 // s_add_u32 s0, s0, $symbol
2864 // s_addc_u32 s1, s1, 0
2865 //
2866 // s_getpc_b64 returns the address of the s_add_u32 instruction and then
2867 // a fixup or relocation is emitted to replace $symbol with a literal
2868 // constant, which is a pc-relative offset from the encoding of the $symbol
2869 // operand to the global variable.
2870 //
2871 // For global address space:
2872 // s_getpc_b64 s[0:1]
2873 // s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo
2874 // s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi
2875 //
2876 // s_getpc_b64 returns the address of the s_add_u32 instruction and then
2877 // fixups or relocations are emitted to replace $symbol@*@lo and
2878 // $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant,
2879 // which is a 64-bit pc-relative offset from the encoding of the $symbol
2880 // operand to the global variable.
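 // In both sequences the s_add_u32/s_addc_u32 pair performs one 64-bit add of
 // the literal offset onto the PC returned by s_getpc_b64: s_add_u32 adds the
 // low half and writes the carry to SCC, which s_addc_u32 then consumes while
 // adding the high half.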
2881
 2882 LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
 2883
2884 Register PCReg = PtrTy.getSizeInBits() != 32 ? DstReg :
2885 B.getMRI()->createGenericVirtualRegister(ConstPtrTy);
2886
2887 MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET)
2888 .addDef(PCReg);
2889
2890 MIB.addGlobalAddress(GV, Offset, GAFlags);
2891 if (GAFlags == SIInstrInfo::MO_NONE)
2892 MIB.addImm(0);
2893 else
2894 MIB.addGlobalAddress(GV, Offset, GAFlags + 1);
2895
2896 if (!B.getMRI()->getRegClassOrNull(PCReg))
2897 B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass);
2898
2899 if (PtrTy.getSizeInBits() == 32)
2900 B.buildExtract(DstReg, PCReg, 0);
2901 return true;
2902}
2903
2904// Emit an ABS32_LO / ABS32_HI relocation stub.
2906 Register DstReg, LLT PtrTy, MachineIRBuilder &B, const GlobalValue *GV,
2907 MachineRegisterInfo &MRI) const {
2908 bool RequiresHighHalf = PtrTy.getSizeInBits() != 32;
2909
2910 LLT S32 = LLT::scalar(32);
2911
 2912 // Use the destination directly, if and only if we store only the lower address
 2913 // part and no register class has been set on it.
2914 Register AddrLo = !RequiresHighHalf && !MRI.getRegClassOrNull(DstReg)
2915 ? DstReg
2916 : MRI.createGenericVirtualRegister(S32);
2917
2918 if (!MRI.getRegClassOrNull(AddrLo))
2919 MRI.setRegClass(AddrLo, &AMDGPU::SReg_32RegClass);
2920
2921 // Write the lower half.
2922 B.buildInstr(AMDGPU::S_MOV_B32)
2923 .addDef(AddrLo)
2924 .addGlobalAddress(GV, 0, SIInstrInfo::MO_ABS32_LO);
2925
2926 // If required, write the upper half as well.
2927 if (RequiresHighHalf) {
2928 assert(PtrTy.getSizeInBits() == 64 &&
2929 "Must provide a 64-bit pointer type!");
2930
2931 Register AddrHi = MRI.createGenericVirtualRegister(S32);
2932 MRI.setRegClass(AddrHi, &AMDGPU::SReg_32RegClass);
2933
2934 B.buildInstr(AMDGPU::S_MOV_B32)
2935 .addDef(AddrHi)
2936 .addGlobalAddress(GV, 0, SIInstrInfo::MO_ABS32_HI);
2937
 2938 // Use the destination directly, if and only if no register class has been
 2939 // set on it.
2940 Register AddrDst = !MRI.getRegClassOrNull(DstReg)
2941 ? DstReg
2942 : MRI.createGenericVirtualRegister(LLT::scalar(64));
2943
2944 if (!MRI.getRegClassOrNull(AddrDst))
2945 MRI.setRegClass(AddrDst, &AMDGPU::SReg_64RegClass);
2946
2947 B.buildMergeValues(AddrDst, {AddrLo, AddrHi});
2948
2949 // If we created a new register for the destination, cast the result into
2950 // the final output.
2951 if (AddrDst != DstReg)
2952 B.buildCast(DstReg, AddrDst);
2953 } else if (AddrLo != DstReg) {
2954 // If we created a new register for the destination, cast the result into
2955 // the final output.
2956 B.buildCast(DstReg, AddrLo);
2957 }
2958}
2959
2962 MachineIRBuilder &B) const {
2963 Register DstReg = MI.getOperand(0).getReg();
2964 LLT Ty = MRI.getType(DstReg);
2965 unsigned AS = Ty.getAddressSpace();
2966
2967 const GlobalValue *GV = MI.getOperand(1).getGlobal();
2968 MachineFunction &MF = B.getMF();
 2969 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
 2970
 2971 if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
 2972 if (!MFI->isModuleEntryFunction() &&
 2973 GV->getName() != "llvm.amdgcn.module.lds") {
2974 const Function &Fn = MF.getFunction();
2975 DiagnosticInfoUnsupported BadLDSDecl(
2976 Fn, "local memory global used by non-kernel function", MI.getDebugLoc(),
2977 DS_Warning);
2978 Fn.getContext().diagnose(BadLDSDecl);
2979
2980 // We currently don't have a way to correctly allocate LDS objects that
2981 // aren't directly associated with a kernel. We do force inlining of
2982 // functions that use local objects. However, if these dead functions are
2983 // not eliminated, we don't want a compile time error. Just emit a warning
2984 // and a trap, since there should be no callable path here.
2985 B.buildTrap();
2986 B.buildUndef(DstReg);
2987 MI.eraseFromParent();
2988 return true;
2989 }
2990
2991 // TODO: We could emit code to handle the initialization somewhere.
2992 // We ignore the initializer for now and legalize it to allow selection.
 2993 // The initializer will be rejected with an error during assembly emission anyway.
2994 const SITargetLowering *TLI = ST.getTargetLowering();
2995 if (!TLI->shouldUseLDSConstAddress(GV)) {
2996 MI.getOperand(1).setTargetFlags(SIInstrInfo::MO_ABS32_LO);
2997 return true; // Leave in place;
2998 }
2999
3000 if (AS == AMDGPUAS::LOCAL_ADDRESS && GV->hasExternalLinkage()) {
3001 Type *Ty = GV->getValueType();
3002 // HIP uses an unsized array `extern __shared__ T s[]` or similar
3003 // zero-sized type in other languages to declare the dynamic shared
 3004 // memory whose size is not known at compile time. They will be
 3005 // allocated by the runtime and placed directly after the statically
 3006 // allocated ones. They all share the same offset.
3007 if (B.getDataLayout().getTypeAllocSize(Ty).isZero()) {
3008 // Adjust alignment for that dynamic shared memory array.
3009 MFI->setDynLDSAlign(MF.getFunction(), *cast<GlobalVariable>(GV));
3010 LLT S32 = LLT::scalar(32);
3011 auto Sz = B.buildIntrinsic(Intrinsic::amdgcn_groupstaticsize, {S32});
3012 B.buildIntToPtr(DstReg, Sz);
3013 MI.eraseFromParent();
3014 return true;
3015 }
3016 }
3017
3018 B.buildConstant(DstReg, MFI->allocateLDSGlobal(B.getDataLayout(),
3019 *cast<GlobalVariable>(GV)));
3020 MI.eraseFromParent();
3021 return true;
3022 }
3023
3024 if (ST.isAmdPalOS() || ST.isMesa3DOS()) {
3025 buildAbsGlobalAddress(DstReg, Ty, B, GV, MRI);
3026 MI.eraseFromParent();
3027 return true;
3028 }
3029
3030 const SITargetLowering *TLI = ST.getTargetLowering();
3031
3032 if (TLI->shouldEmitFixup(GV)) {
3033 buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0);
3034 MI.eraseFromParent();
3035 return true;
3036 }
3037
3038 if (TLI->shouldEmitPCReloc(GV)) {
3039 buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32);
3040 MI.eraseFromParent();
3041 return true;
3042 }
3043
3045 Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy);
3046
3047 LLT LoadTy = Ty.getSizeInBits() == 32 ? PtrTy : Ty;
3052 LoadTy, Align(8));
3053
3054 buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32);
3055
3056 if (Ty.getSizeInBits() == 32) {
3057 // Truncate if this is a 32-bit constant address.
3058 auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO);
3059 B.buildExtract(DstReg, Load, 0);
3060 } else
3061 B.buildLoad(DstReg, GOTAddr, *GOTMMO);
3062
3063 MI.eraseFromParent();
3064 return true;
3065}
3066
3067static LLT widenToNextPowerOf2(LLT Ty) {
 3068 if (Ty.isVector())
 3069 return Ty.changeElementCount(
 3070 ElementCount::getFixed(PowerOf2Ceil(Ty.getNumElements())));
 3071 return LLT::scalar(PowerOf2Ceil(Ty.getSizeInBits()));
 3072}
3073
3075 MachineInstr &MI) const {
3076 MachineIRBuilder &B = Helper.MIRBuilder;
3077 MachineRegisterInfo &MRI = *B.getMRI();
3078 GISelChangeObserver &Observer = Helper.Observer;
3079
3080 Register PtrReg = MI.getOperand(1).getReg();
3081 LLT PtrTy = MRI.getType(PtrReg);
3082 unsigned AddrSpace = PtrTy.getAddressSpace();
3083
3084 if (AddrSpace == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
3086 auto Cast = B.buildAddrSpaceCast(ConstPtr, PtrReg);
3087 Observer.changingInstr(MI);
3088 MI.getOperand(1).setReg(Cast.getReg(0));
3089 Observer.changedInstr(MI);
3090 return true;
3091 }
3092
3093 if (MI.getOpcode() != AMDGPU::G_LOAD)
3094 return false;
3095
3096 Register ValReg = MI.getOperand(0).getReg();
3097 LLT ValTy = MRI.getType(ValReg);
3098
3099 if (hasBufferRsrcWorkaround(ValTy)) {
3100 Observer.changingInstr(MI);
3102 Observer.changedInstr(MI);
3103 return true;
3104 }
3105
3106 MachineMemOperand *MMO = *MI.memoperands_begin();
3107 const unsigned ValSize = ValTy.getSizeInBits();
3108 const LLT MemTy = MMO->getMemoryType();
3109 const Align MemAlign = MMO->getAlign();
3110 const unsigned MemSize = MemTy.getSizeInBits();
3111 const uint64_t AlignInBits = 8 * MemAlign.value();
3112
3113 // Widen non-power-of-2 loads to the alignment if needed
3114 if (shouldWidenLoad(ST, MemTy, AlignInBits, AddrSpace, MI.getOpcode())) {
3115 const unsigned WideMemSize = PowerOf2Ceil(MemSize);
3116
3117 // This was already the correct extending load result type, so just adjust
3118 // the memory type.
3119 if (WideMemSize == ValSize) {
3120 MachineFunction &MF = B.getMF();
3121
3122 MachineMemOperand *WideMMO =
3123 MF.getMachineMemOperand(MMO, 0, WideMemSize / 8);
3124 Observer.changingInstr(MI);
3125 MI.setMemRefs(MF, {WideMMO});
3126 Observer.changedInstr(MI);
3127 return true;
3128 }
3129
 3130 // Don't bother handling an edge case that should probably never be produced.
3131 if (ValSize > WideMemSize)
3132 return false;
3133
3134 LLT WideTy = widenToNextPowerOf2(ValTy);
3135
3136 Register WideLoad;
3137 if (!WideTy.isVector()) {
3138 WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
3139 B.buildTrunc(ValReg, WideLoad).getReg(0);
3140 } else {
3141 // Extract the subvector.
3142
3143 if (isRegisterType(ValTy)) {
 3144 // If this is a case where G_EXTRACT is legal, use it.
3145 // (e.g. <3 x s32> -> <4 x s32>)
3146 WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
3147 B.buildExtract(ValReg, WideLoad, 0);
3148 } else {
3149 // For cases where the widened type isn't a nice register value, unmerge
3150 // from a widened register (e.g. <3 x s16> -> <4 x s16>)
3151 WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
3152 B.buildDeleteTrailingVectorElements(ValReg, WideLoad);
3153 }
3154 }
3155
3156 MI.eraseFromParent();
3157 return true;
3158 }
3159
3160 return false;
3161}
3162
3164 MachineInstr &MI) const {
3165 MachineIRBuilder &B = Helper.MIRBuilder;
3166 MachineRegisterInfo &MRI = *B.getMRI();
3167 GISelChangeObserver &Observer = Helper.Observer;
3168
3169 Register DataReg = MI.getOperand(0).getReg();
3170 LLT DataTy = MRI.getType(DataReg);
3171
3172 if (hasBufferRsrcWorkaround(DataTy)) {
3173 Observer.changingInstr(MI);
3175 Observer.changedInstr(MI);
3176 return true;
3177 }
3178 return false;
3179}
3180
3183 MachineIRBuilder &B) const {
3184 LLT Ty = MRI.getType(MI.getOperand(0).getReg());
3185 assert(Ty.isScalar());
3186
3187 MachineFunction &MF = B.getMF();
3189
3190 // TODO: Always legal with future ftz flag.
3191 // FIXME: Do we need just output?
3192 if (Ty == LLT::float32() &&
3194 return true;
3195 if (Ty == LLT::float16() &&
3197 return true;
3198
3199 MachineIRBuilder HelperBuilder(MI);
3200 GISelObserverWrapper DummyObserver;
3201 LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
3202 return Helper.lowerFMad(MI) == LegalizerHelper::Legalized;
3203}
3204
3207 Register DstReg = MI.getOperand(0).getReg();
3208 Register PtrReg = MI.getOperand(1).getReg();
3209 Register CmpVal = MI.getOperand(2).getReg();
3210 Register NewVal = MI.getOperand(3).getReg();
3211
3212 assert(AMDGPU::isFlatGlobalAddrSpace(MRI.getType(PtrReg).getAddressSpace()) &&
3213 "this should not have been custom lowered");
3214
3215 LLT ValTy = MRI.getType(CmpVal);
3216 LLT VecTy = LLT::fixed_vector(2, ValTy);
3217
3218 Register PackedVal = B.buildBuildVector(VecTy, { NewVal, CmpVal }).getReg(0);
3219
3220 B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG)
3221 .addDef(DstReg)
3222 .addUse(PtrReg)
3223 .addUse(PackedVal)
3224 .setMemRefs(MI.memoperands());
3225
3226 MI.eraseFromParent();
3227 return true;
3228}
3229
3230/// Return true if it's known that \p Src can never be an f32 denormal value.
3232 Register Src) {
3233 const MachineInstr *DefMI = MRI.getVRegDef(Src);
3234 switch (DefMI->getOpcode()) {
3235 case TargetOpcode::G_INTRINSIC: {
3236 switch (cast<GIntrinsic>(DefMI)->getIntrinsicID()) {
3237 case Intrinsic::amdgcn_frexp_mant:
3238 return true;
3239 default:
3240 break;
3241 }
3242
3243 break;
3244 }
3245 case TargetOpcode::G_FFREXP: {
3246 if (DefMI->getOperand(0).getReg() == Src)
3247 return true;
3248 break;
3249 }
3250 case TargetOpcode::G_FPEXT: {
3251 return MRI.getType(DefMI->getOperand(1).getReg()) == LLT::scalar(16);
3252 }
3253 default:
3254 return false;
3255 }
3256
3257 return false;
3258}
3259
3260static bool allowApproxFunc(const MachineFunction &MF, unsigned Flags) {
3261 if (Flags & MachineInstr::FmAfn)
3262 return true;
3263 const auto &Options = MF.getTarget().Options;
3264 return Options.UnsafeFPMath || Options.ApproxFuncFPMath;
3265}
3266
3268 unsigned Flags) {
3269 return !valueIsKnownNeverF32Denorm(MF.getRegInfo(), Src) &&
3272}
3273
3274std::pair<Register, Register>
3276 unsigned Flags) const {
3277 if (!needsDenormHandlingF32(B.getMF(), Src, Flags))
3278 return {};
3279
3280 const LLT F32 = LLT::scalar(32);
3281 auto SmallestNormal = B.buildFConstant(
3283 auto IsLtSmallestNormal =
3284 B.buildFCmp(CmpInst::FCMP_OLT, LLT::scalar(1), Src, SmallestNormal);
3285
3286 auto Scale32 = B.buildFConstant(F32, 0x1.0p+32);
3287 auto One = B.buildFConstant(F32, 1.0);
3288 auto ScaleFactor =
3289 B.buildSelect(F32, IsLtSmallestNormal, Scale32, One, Flags);
3290 auto ScaledInput = B.buildFMul(F32, Src, ScaleFactor, Flags);
3291
3292 return {ScaledInput.getReg(0), IsLtSmallestNormal.getReg(0)};
3293}
3294
3296 MachineIRBuilder &B) const {
3297 // v_log_f32 is good enough for OpenCL, except it doesn't handle denormals.
3298 // If we have to handle denormals, scale up the input and adjust the result.
3299
3300 // scaled = x * (is_denormal ? 0x1.0p+32 : 1.0)
3301 // log2 = amdgpu_log2 - (is_denormal ? 32.0 : 0.0)
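 // This works because log2(x * 2^32) == log2(x) + 32, so scaling a denormal
 // input into the normal range and subtracting 32 from the intrinsic's result
 // recovers the original log2(x).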
3302
3303 Register Dst = MI.getOperand(0).getReg();
3304 Register Src = MI.getOperand(1).getReg();
3305 LLT Ty = B.getMRI()->getType(Dst);
3306 unsigned Flags = MI.getFlags();
3307
3308 if (Ty == LLT::scalar(16)) {
3309 const LLT F32 = LLT::scalar(32);
3310 // Nothing in half is a denormal when promoted to f32.
3311 auto Ext = B.buildFPExt(F32, Src, Flags);
3312 auto Log2 = B.buildIntrinsic(Intrinsic::amdgcn_log, {F32})
3313 .addUse(Ext.getReg(0))
3314 .setMIFlags(Flags);
3315 B.buildFPTrunc(Dst, Log2, Flags);
3316 MI.eraseFromParent();
3317 return true;
3318 }
3319
3320 assert(Ty == LLT::scalar(32));
3321
3322 auto [ScaledInput, IsLtSmallestNormal] = getScaledLogInput(B, Src, Flags);
3323 if (!ScaledInput) {
3324 B.buildIntrinsic(Intrinsic::amdgcn_log, {MI.getOperand(0)})
3325 .addUse(Src)
3326 .setMIFlags(Flags);
3327 MI.eraseFromParent();
3328 return true;
3329 }
3330
3331 auto Log2 = B.buildIntrinsic(Intrinsic::amdgcn_log, {Ty})
3332 .addUse(ScaledInput)
3333 .setMIFlags(Flags);
3334
3335 auto ThirtyTwo = B.buildFConstant(Ty, 32.0);
3336 auto Zero = B.buildFConstant(Ty, 0.0);
3337 auto ResultOffset =
3338 B.buildSelect(Ty, IsLtSmallestNormal, ThirtyTwo, Zero, Flags);
3339 B.buildFSub(Dst, Log2, ResultOffset, Flags);
3340
3341 MI.eraseFromParent();
3342 return true;
3343}
3344
3346 Register Z, unsigned Flags) {
3347 auto FMul = B.buildFMul(Ty, X, Y, Flags);
3348 return B.buildFAdd(Ty, FMul, Z, Flags).getReg(0);
3349}
3350
3352 MachineIRBuilder &B) const {
3353 const bool IsLog10 = MI.getOpcode() == TargetOpcode::G_FLOG10;
3354 assert(IsLog10 || MI.getOpcode() == TargetOpcode::G_FLOG);
3355
3356 MachineRegisterInfo &MRI = *B.getMRI();
3357 Register Dst = MI.getOperand(0).getReg();
3358 Register X = MI.getOperand(1).getReg();
3359 unsigned Flags = MI.getFlags();
3360 const LLT Ty = MRI.getType(X);
3361 MachineFunction &MF = B.getMF();
3362
3363 const LLT F32 = LLT::scalar(32);
3364 const LLT F16 = LLT::scalar(16);
3365
3366 const AMDGPUTargetMachine &TM =
3367 static_cast<const AMDGPUTargetMachine &>(MF.getTarget());
3368
3369 if (Ty == F16 || MI.getFlag(MachineInstr::FmAfn) ||
3370 TM.Options.ApproxFuncFPMath || TM.Options.UnsafeFPMath) {
3371 if (Ty == F16 && !ST.has16BitInsts()) {
3372 Register LogVal = MRI.createGenericVirtualRegister(F32);
3373 auto PromoteSrc = B.buildFPExt(F32, X);
3374 legalizeFlogUnsafe(B, LogVal, PromoteSrc.getReg(0), IsLog10, Flags);
3375 B.buildFPTrunc(Dst, LogVal);
3376 } else {
3377 legalizeFlogUnsafe(B, Dst, X, IsLog10, Flags);
3378 }
3379
3380 MI.eraseFromParent();
3381 return true;
3382 }
3383
3384 auto [ScaledInput, IsScaled] = getScaledLogInput(B, X, Flags);
3385 if (ScaledInput)
3386 X = ScaledInput;
3387
3388 auto Y =
3389 B.buildIntrinsic(Intrinsic::amdgcn_log, {Ty}).addUse(X).setMIFlags(Flags);
3390
3391 Register R;
3392 if (ST.hasFastFMAF32()) {
 3393 // c + cc is ln(2)/ln(10) to more than 49 bits
3394 const float c_log10 = 0x1.344134p-2f;
3395 const float cc_log10 = 0x1.09f79ep-26f;
3396
3397 // c + cc is ln(2) to more than 49 bits
3398 const float c_log = 0x1.62e42ep-1f;
3399 const float cc_log = 0x1.efa39ep-25f;
3400
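 // The product Y * (c + cc) is evaluated in compensated form below: R holds
 // the rounded product Y*c, fma(Y, c, -R) recovers the rounding error of that
 // product, and the error plus Y*cc is then folded back into R to retain the
 // extra bits of the constant.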
3401 auto C = B.buildFConstant(Ty, IsLog10 ? c_log10 : c_log);
3402 auto CC = B.buildFConstant(Ty, IsLog10 ? cc_log10 : cc_log);
3403
3404 R = B.buildFMul(Ty, Y, C, Flags).getReg(0);
3405 auto NegR = B.buildFNeg(Ty, R, Flags);
3406 auto FMA0 = B.buildFMA(Ty, Y, C, NegR, Flags);
3407 auto FMA1 = B.buildFMA(Ty, Y, CC, FMA0, Flags);
3408 R = B.buildFAdd(Ty, R, FMA1, Flags).getReg(0);
3409 } else {
3410 // ch+ct is ln(2)/ln(10) to more than 36 bits
3411 const float ch_log10 = 0x1.344000p-2f;
3412 const float ct_log10 = 0x1.3509f6p-18f;
3413
3414 // ch + ct is ln(2) to more than 36 bits
3415 const float ch_log = 0x1.62e000p-1f;
3416 const float ct_log = 0x1.0bfbe8p-15f;
3417
3418 auto CH = B.buildFConstant(Ty, IsLog10 ? ch_log10 : ch_log);
3419 auto CT = B.buildFConstant(Ty, IsLog10 ? ct_log10 : ct_log);
3420
3421 auto MaskConst = B.buildConstant(Ty, 0xfffff000);
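 // Masking off the low 12 bits of Y's bit pattern leaves YH with a short
 // mantissa, so the product YH * CH (whose constant also has a short mantissa)
 // is exact; the remainder YT and the low-order constant CT are accumulated
 // smallest-first through the mads to approximate Y * (ch + ct) without a
 // fast FMA.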
3422 auto YH = B.buildAnd(Ty, Y, MaskConst);
3423 auto YT = B.buildFSub(Ty, Y, YH, Flags);
3424 auto YTCT = B.buildFMul(Ty, YT, CT, Flags);
3425
3426 Register Mad0 =
3427 getMad(B, Ty, YH.getReg(0), CT.getReg(0), YTCT.getReg(0), Flags);
3428 Register Mad1 = getMad(B, Ty, YT.getReg(0), CH.getReg(0), Mad0, Flags);
3429 R = getMad(B, Ty, YH.getReg(0), CH.getReg(0), Mad1, Flags);
3430 }
3431
3432 const bool IsFiniteOnly =
3433 (MI.getFlag(MachineInstr::FmNoNans) || TM.Options.NoNaNsFPMath) &&
3434 (MI.getFlag(MachineInstr::FmNoInfs) || TM.Options.NoInfsFPMath);
3435
3436 if (!IsFiniteOnly) {
3437 // Expand isfinite(x) => fabs(x) < inf
3438 auto Inf = B.buildFConstant(Ty, APFloat::getInf(APFloat::IEEEsingle()));
3439 auto Fabs = B.buildFAbs(Ty, Y);
3440 auto IsFinite =
3441 B.buildFCmp(CmpInst::FCMP_OLT, LLT::scalar(1), Fabs, Inf, Flags);
3442 R = B.buildSelect(Ty, IsFinite, R, Y, Flags).getReg(0);
3443 }
3444
3445 if (ScaledInput) {
3446 auto Zero = B.buildFConstant(Ty, 0.0);
3447 auto ShiftK =
3448 B.buildFConstant(Ty, IsLog10 ? 0x1.344136p+3f : 0x1.62e430p+4f);
3449 auto Shift = B.buildSelect(Ty, IsScaled, ShiftK, Zero, Flags);
3450 B.buildFSub(Dst, R, Shift, Flags);
3451 } else {
3452 B.buildCopy(Dst, R);
3453 }
3454
3455 MI.eraseFromParent();
3456 return true;
3457}
3458
3460 Register Src, bool IsLog10,
3461 unsigned Flags) const {
3462 const double Log2BaseInverted =
3464
3465 LLT Ty = B.getMRI()->getType(Dst);
3466
3467 if (Ty == LLT::scalar(32)) {
3468 auto [ScaledInput, IsScaled] = getScaledLogInput(B, Src, Flags);
3469 if (ScaledInput) {
3470 auto LogSrc = B.buildIntrinsic(Intrinsic::amdgcn_log, {Ty})
3471 .addUse(Src)
3472 .setMIFlags(Flags);
3473 auto ScaledResultOffset = B.buildFConstant(Ty, -32.0 * Log2BaseInverted);
3474 auto Zero = B.buildFConstant(Ty, 0.0);
3475 auto ResultOffset =
3476 B.buildSelect(Ty, IsScaled, ScaledResultOffset, Zero, Flags);
3477 auto Log2Inv = B.buildFConstant(Ty, Log2BaseInverted);
3478
3479 if (ST.hasFastFMAF32())
3480 B.buildFMA(Dst, LogSrc, Log2Inv, ResultOffset, Flags);
3481 else {
3482 auto Mul = B.buildFMul(Ty, LogSrc, Log2Inv, Flags);
3483 B.buildFAdd(Dst, Mul, ResultOffset, Flags);
3484 }
3485
3486 return true;
3487 }
3488 }
3489
3490 auto Log2Operand = Ty == LLT::scalar(16)
3491 ? B.buildFLog2(Ty, Src, Flags)
3492 : B.buildIntrinsic(Intrinsic::amdgcn_log, {Ty})
3493 .addUse(Src)
3494 .setMIFlags(Flags);
3495 auto Log2BaseInvertedOperand = B.buildFConstant(Ty, Log2BaseInverted);
3496 B.buildFMul(Dst, Log2Operand, Log2BaseInvertedOperand, Flags);
3497 return true;
3498}
3499
3500bool AMDGPULegalizerInfo::legalizeFExp2(MachineInstr &MI,
3501 MachineIRBuilder &B) const {
3502 // v_exp_f32 is good enough for OpenCL, except it doesn't handle denormals.
3503 // If we have to handle denormals, scale up the input and adjust the result.
3504
3505 Register Dst = MI.getOperand(0).getReg();
3506 Register Src = MI.getOperand(1).getReg();
3507 unsigned Flags = MI.getFlags();
3508 LLT Ty = B.getMRI()->getType(Dst);
3509 const LLT F16 = LLT::scalar(16);
3510 const LLT F32 = LLT::scalar(32);
3511
3512 if (Ty == F16) {
3513 // Nothing in half is a denormal when promoted to f32.
3514 auto Ext = B.buildFPExt(F32, Src, Flags);
3515 auto Log2 = B.buildIntrinsic(Intrinsic::amdgcn_exp2, {F32})
3516 .addUse(Ext.getReg(0))
3517 .setMIFlags(Flags);
3518 B.buildFPTrunc(Dst, Log2, Flags);
3519 MI.eraseFromParent();
3520 return true;
3521 }
3522
3523 assert(Ty == F32);
3524
3525 if (!needsDenormHandlingF32(B.getMF(), Src, Flags)) {
3526 B.buildIntrinsic(Intrinsic::amdgcn_exp2, ArrayRef<Register>{Dst})
3527 .addUse(Src)
3528 .setMIFlags(Flags);
3529 MI.eraseFromParent();
3530 return true;
3531 }
3532
3533 // bool needs_scaling = x < -0x1.f80000p+6f;
3534 // v_exp_f32(x + (s ? 0x1.0p+6f : 0.0f)) * (s ? 0x1.0p-64f : 1.0f);
3535
3536 // -nextafter(128.0, -1)
3537 auto RangeCheckConst = B.buildFConstant(Ty, -0x1.f80000p+6f);
3538 auto NeedsScaling = B.buildFCmp(CmpInst::FCMP_OLT, LLT::scalar(1), Src,
3539 RangeCheckConst, Flags);
3540
3541 auto SixtyFour = B.buildFConstant(Ty, 0x1.0p+6f);
3542 auto Zero = B.buildFConstant(Ty, 0.0);
3543 auto AddOffset = B.buildSelect(F32, NeedsScaling, SixtyFour, Zero, Flags);
3544 auto AddInput = B.buildFAdd(F32, Src, AddOffset, Flags);
3545
3546 auto Exp2 = B.buildIntrinsic(Intrinsic::amdgcn_exp2, {Ty})
3547 .addUse(AddInput.getReg(0))
3548 .setMIFlags(Flags);
3549
3550 auto TwoExpNeg64 = B.buildFConstant(Ty, 0x1.0p-64f);
3551 auto One = B.buildFConstant(Ty, 1.0);
3552 auto ResultScale = B.buildSelect(F32, NeedsScaling, TwoExpNeg64, One, Flags);
3553 B.buildFMul(Dst, Exp2, ResultScale, Flags);
3554 MI.eraseFromParent();
3555 return true;
3556}
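// A standalone sketch (plain C++, not part of this file) of the scaling trick
// above: when the input is below -0x1.f8p+6 (about -126) the hardware exp2
// result would be a denormal, so the input is shifted up by 64 and the result
// scaled back down by 2^-64. std::exp2 stands in for v_exp_f32; the function
// name is illustrative.
#include <cmath>
#include <cstdio>

static float exp2_scaled(float x) {
  bool needs_scaling = x < -0x1.f80000p+6f; // below this, exp2(x) is denormal
  float r = std::exp2(x + (needs_scaling ? 0x1.0p+6f : 0.0f));
  return r * (needs_scaling ? 0x1.0p-64f : 1.0f);
}

int main() {
  for (float x : {-1.0f, -100.0f, -130.0f, -140.0f})
    std::printf("x=%g  exp2=%g  scaled=%g\n", x, std::exp2(x), exp2_scaled(x));
}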
3557
3558bool AMDGPULegalizerInfo::legalizeFExpUnsafe(MachineIRBuilder &B, Register Dst,
3559 Register X, unsigned Flags) const {
3560 LLT Ty = B.getMRI()->getType(Dst);
3561 LLT F32 = LLT::scalar(32);
3562
3563 if (Ty != F32 || !needsDenormHandlingF32(B.getMF(), X, Flags)) {
3564 auto Log2E = B.buildFConstant(Ty, numbers::log2e);
3565 auto Mul = B.buildFMul(Ty, X, Log2E, Flags);
3566
3567 if (Ty == F32) {
3568 B.buildIntrinsic(Intrinsic::amdgcn_exp2, ArrayRef<Register>{Dst})
3569 .addUse(Mul.getReg(0))
3570 .setMIFlags(Flags);
3571 } else {
3572 B.buildFExp2(Dst, Mul.getReg(0), Flags);
3573 }
3574
3575 return true;
3576 }
3577
3578 auto Threshold = B.buildFConstant(Ty, -0x1.5d58a0p+6f);
3579 auto NeedsScaling =
3580 B.buildFCmp(CmpInst::FCMP_OLT, LLT::scalar(1), X, Threshold, Flags);
3581 auto ScaleOffset = B.buildFConstant(Ty, 0x1.0p+6f);
3582 auto ScaledX = B.buildFAdd(Ty, X, ScaleOffset, Flags);
3583 auto AdjustedX = B.buildSelect(Ty, NeedsScaling, ScaledX, X, Flags);
3584
3585 auto Log2E = B.buildFConstant(Ty, numbers::log2e);
3586 auto ExpInput = B.buildFMul(Ty, AdjustedX, Log2E, Flags);
3587
3588 auto Exp2 = B.buildIntrinsic(Intrinsic::amdgcn_exp2, {Ty})
3589 .addUse(ExpInput.getReg(0))
3590 .setMIFlags(Flags);
3591
3592 auto ResultScaleFactor = B.buildFConstant(Ty, 0x1.969d48p-93f);
3593 auto AdjustedResult = B.buildFMul(Ty, Exp2, ResultScaleFactor, Flags);
3594 B.buildSelect(Dst, NeedsScaling, AdjustedResult, Exp2, Flags);
3595 return true;
3596}
3597
3598bool AMDGPULegalizerInfo::legalizeFExp(MachineInstr &MI,
3599 MachineIRBuilder &B) const {
3600 Register Dst = MI.getOperand(0).getReg();
3601 Register X = MI.getOperand(1).getReg();
3602 const unsigned Flags = MI.getFlags();
3603 MachineFunction &MF = B.getMF();
3604 MachineRegisterInfo &MRI = *B.getMRI();
3605 LLT Ty = MRI.getType(Dst);
3606 const LLT F16 = LLT::scalar(16);
3607 const LLT F32 = LLT::scalar(32);
3608 const bool IsExp10 = MI.getOpcode() == TargetOpcode::G_FEXP10;
3609
3610 if (Ty == F16) {
3611 // v_exp_f16 (fmul x, log2e)
3612 if (allowApproxFunc(MF, Flags)) {
3613 // TODO: Does this really require fast?
3614 legalizeFExpUnsafe(B, Dst, X, Flags);
3615 MI.eraseFromParent();
3616 return true;
3617 }
3618
3619 // exp(f16 x) ->
3620 // fptrunc (v_exp_f32 (fmul (fpext x), log2e))
3621
3622 // Nothing in half is a denormal when promoted to f32.
3623 auto Ext = B.buildFPExt(F32, X, Flags);
3624 Register Lowered = MRI.createGenericVirtualRegister(F32);
3625 legalizeFExpUnsafe(B, Lowered, Ext.getReg(0), Flags);
3626 B.buildFPTrunc(Dst, Lowered, Flags);
3627 MI.eraseFromParent();
3628 return true;
3629 }
3630
3631 assert(Ty == F32);
3632
3633 // TODO: Interpret allowApproxFunc as ignoring DAZ. This is currently copying
3634 // library behavior. Also, is known-not-daz source sufficient?
3635 if (allowApproxFunc(MF, Flags)) {
3636 legalizeFExpUnsafe(B, Dst, X, Flags);
3637 MI.eraseFromParent();
3638 return true;
3639 }
3640
3641 // Algorithm:
3642 //
3643 // e^x = 2^(x/ln(2)) = 2^(x*(64/ln(2))/64)
3644 //
3645 // x*(64/ln(2)) = n + f, |f| <= 0.5, n is integer
3646 // n = 64*m + j, 0 <= j < 64
3647 //
3648 // e^x = 2^((64*m + j + f)/64)
3649 // = (2^m) * (2^(j/64)) * 2^(f/64)
3650 // = (2^m) * (2^(j/64)) * e^(f*(ln(2)/64))
3651 //
3652 // f = x*(64/ln(2)) - n
3653 // r = f*(ln(2)/64) = x - n*(ln(2)/64)
3654 //
3655 // e^x = (2^m) * (2^(j/64)) * e^r
3656 //
3657 // (2^(j/64)) is precomputed
3658 //
3659 // e^r = 1 + r + (r^2)/2! + (r^3)/3! + (r^4)/4! + (r^5)/5!
3660 // e^r = 1 + q
3661 //
3662 // q = r + (r^2)/2! + (r^3)/3! + (r^4)/4! + (r^5)/5!
3663 //
3664 // e^x = (2^m) * ( (2^(j/64)) + q*(2^(j/64)) )
3665 const unsigned FlagsNoContract = Flags & ~MachineInstr::FmContract;
3666 Register PH, PL;
3667
3668 if (ST.hasFastFMAF32()) {
3669 const float c_exp = numbers::log2ef;
3670 const float cc_exp = 0x1.4ae0bep-26f; // c+cc are 49 bits
3671 const float c_exp10 = 0x1.a934f0p+1f;
3672 const float cc_exp10 = 0x1.2f346ep-24f;
3673
3674 auto C = B.buildFConstant(Ty, IsExp10 ? c_exp10 : c_exp);
3675 PH = B.buildFMul(Ty, X, C, Flags).getReg(0);
3676 auto NegPH = B.buildFNeg(Ty, PH, Flags);
3677 auto FMA0 = B.buildFMA(Ty, X, C, NegPH, Flags);
3678
3679 auto CC = B.buildFConstant(Ty, IsExp10 ? cc_exp10 : cc_exp);
3680 PL = B.buildFMA(Ty, X, CC, FMA0, Flags).getReg(0);
3681 } else {
3682 const float ch_exp = 0x1.714000p+0f;
3683 const float cl_exp = 0x1.47652ap-12f; // ch + cl are 36 bits
3684
3685 const float ch_exp10 = 0x1.a92000p+1f;
3686 const float cl_exp10 = 0x1.4f0978p-11f;
3687
3688 auto MaskConst = B.buildConstant(Ty, 0xfffff000);
3689 auto XH = B.buildAnd(Ty, X, MaskConst);
3690 auto XL = B.buildFSub(Ty, X, XH, Flags);
3691
3692 auto CH = B.buildFConstant(Ty, IsExp10 ? ch_exp10 : ch_exp);
3693 PH = B.buildFMul(Ty, XH, CH, Flags).getReg(0);
3694
3695 auto CL = B.buildFConstant(Ty, IsExp10 ? cl_exp10 : cl_exp);
3696 auto XLCL = B.buildFMul(Ty, XL, CL, Flags);
3697
3698 Register Mad0 =
3699 getMad(B, Ty, XL.getReg(0), CH.getReg(0), XLCL.getReg(0), Flags);
3700 PL = getMad(B, Ty, XH.getReg(0), CL.getReg(0), Mad0, Flags);
3701 }
3702
3703 auto E = B.buildIntrinsicRoundeven(Ty, PH, Flags);
3704
3705 // It is unsafe to contract this fsub into the PH multiply.
3706 auto PHSubE = B.buildFSub(Ty, PH, E, FlagsNoContract);
3707 auto A = B.buildFAdd(Ty, PHSubE, PL, Flags);
3708 auto IntE = B.buildFPTOSI(LLT::scalar(32), E);
3709
3710 auto Exp2 = B.buildIntrinsic(Intrinsic::amdgcn_exp2, {Ty})
3711 .addUse(A.getReg(0))
3712 .setMIFlags(Flags);
3713 auto R = B.buildFLdexp(Ty, Exp2, IntE, Flags);
3714
3715 auto UnderflowCheckConst =
3716 B.buildFConstant(Ty, IsExp10 ? -0x1.66d3e8p+5f : -0x1.9d1da0p+6f);
3717 auto Zero = B.buildFConstant(Ty, 0.0);
3718 auto Underflow =
3719 B.buildFCmp(CmpInst::FCMP_OLT, LLT::scalar(1), X, UnderflowCheckConst);
3720
3721 R = B.buildSelect(Ty, Underflow, Zero, R);
3722
3723 const auto &Options = MF.getTarget().Options;
3724
3725 if (!(Flags & MachineInstr::FmNoInfs) && !Options.NoInfsFPMath) {
3726 auto OverflowCheckConst =
3727 B.buildFConstant(Ty, IsExp10 ? 0x1.344136p+5f : 0x1.62e430p+6f);
3728
3729 auto Overflow =
3730 B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), X, OverflowCheckConst);
3731 auto Inf = B.buildFConstant(Ty, APFloat::getInf(APFloat::IEEEsingle()));
3732 R = B.buildSelect(Ty, Overflow, Inf, R, Flags);
3733 }
3734
3735 B.buildCopy(Dst, R);
3736 MI.eraseFromParent();
3737 return true;
3738}
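// A standalone sketch (plain C++, not part of this file) of the range
// reduction described in the comment above, using the fast-FMA constants:
// split x*log2(e) into an integer n and a small remainder a, evaluate 2^a, and
// scale by 2^n. std::exp2 stands in for v_exp_f32; names are illustrative.
#include <cmath>
#include <cstdio>

static float exp_via_exp2(float x) {
  const float c = 0x1.715476p+0f;   // log2(e), leading bits
  const float cc = 0x1.4ae0bep-26f; // log2(e) - c, low-order correction
  float ph = x * c;
  float pl = std::fma(x, c, -ph) + x * cc; // tail of the extended product
  float n = std::nearbyint(ph);            // round-to-nearest-even by default
  float a = (ph - n) + pl;                 // the small remainder
  return std::ldexp(std::exp2(a), (int)n);
}

int main() {
  for (float x : {-10.0f, -1.0f, 0.5f, 10.0f, 80.0f})
    std::printf("x=%g  exp=%.8g  sketch=%.8g\n", x, std::exp(x), exp_via_exp2(x));
}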
3739
3740bool AMDGPULegalizerInfo::legalizeFPow(MachineInstr &MI,
3741 MachineIRBuilder &B) const {
3742 Register Dst = MI.getOperand(0).getReg();
3743 Register Src0 = MI.getOperand(1).getReg();
3744 Register Src1 = MI.getOperand(2).getReg();
3745 unsigned Flags = MI.getFlags();
3746 LLT Ty = B.getMRI()->getType(Dst);
3747 const LLT F16 = LLT::float16();
3748 const LLT F32 = LLT::float32();
3749
3750 if (Ty == F32) {
3751 auto Log = B.buildFLog2(F32, Src0, Flags);
3752 auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {F32})
3753 .addUse(Log.getReg(0))
3754 .addUse(Src1)
3755 .setMIFlags(Flags);
3756 B.buildFExp2(Dst, Mul, Flags);
3757 } else if (Ty == F16) {
3758 // There's no f16 fmul_legacy, so we need to convert for it.
3759 auto Log = B.buildFLog2(F16, Src0, Flags);
3760 auto Ext0 = B.buildFPExt(F32, Log, Flags);
3761 auto Ext1 = B.buildFPExt(F32, Src1, Flags);
3762 auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {F32})
3763 .addUse(Ext0.getReg(0))
3764 .addUse(Ext1.getReg(0))
3765 .setMIFlags(Flags);
3766 B.buildFExp2(Dst, B.buildFPTrunc(F16, Mul), Flags);
3767 } else
3768 return false;
3769
3770 MI.eraseFromParent();
3771 return true;
3772}
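// A standalone sketch (plain C++, not part of this file) of the identity used
// above for G_FPOW: pow(x, y) = exp2(y * log2(x)) for positive x. The
// fmul_legacy intrinsic additionally treats 0 * inf as 0; that edge case is
// ignored here and the helper name is illustrative.
#include <cmath>
#include <cstdio>

static float pow_via_exp2(float x, float y) {
  return std::exp2(y * std::log2(x));
}

int main() {
  std::printf("%g vs %g\n", pow_via_exp2(2.0f, 10.0f), std::pow(2.0f, 10.0f));
  std::printf("%g vs %g\n", pow_via_exp2(3.5f, 0.5f), std::pow(3.5f, 0.5f));
}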
3773
3774// Find a source register, ignoring any possible source modifiers.
3776 Register ModSrc = OrigSrc;
3777 if (MachineInstr *SrcFNeg = getOpcodeDef(AMDGPU::G_FNEG, ModSrc, MRI)) {
3778 ModSrc = SrcFNeg->getOperand(1).getReg();
3779 if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
3780 ModSrc = SrcFAbs->getOperand(1).getReg();
3781 } else if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
3782 ModSrc = SrcFAbs->getOperand(1).getReg();
3783 return ModSrc;
3784}
3785
3786bool AMDGPULegalizerInfo::legalizeFFloor(MachineInstr &MI,
3787 MachineRegisterInfo &MRI,
3788 MachineIRBuilder &B) const {
3789
3790 const LLT S1 = LLT::scalar(1);
3791 const LLT F64 = LLT::float64();
3792 Register Dst = MI.getOperand(0).getReg();
3793 Register OrigSrc = MI.getOperand(1).getReg();
3794 unsigned Flags = MI.getFlags();
3795 assert(ST.hasFractBug() && MRI.getType(Dst) == F64 &&
3796 "this should not have been custom lowered");
3797
3798 // V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x))
3799 // is used instead. However, SI doesn't have V_FLOOR_F64, so the most
3800 // efficient way to implement it is using V_FRACT_F64. The workaround for the
3801 // V_FRACT bug is:
3802 // fract(x) = isnan(x) ? x : min(V_FRACT(x), 0.99999999999999999)
3803 //
3804 // Convert floor(x) to (x - fract(x))
3805
3806 auto Fract = B.buildIntrinsic(Intrinsic::amdgcn_fract, {F64})
3807 .addUse(OrigSrc)
3808 .setMIFlags(Flags);
3809
3810 // Give source modifier matching some assistance before obscuring a foldable
3811 // pattern.
3812
3813 // TODO: We can avoid the neg on the fract? The input sign to fract
3814 // shouldn't matter?
3815 Register ModSrc = stripAnySourceMods(OrigSrc, MRI);
3816
3817 auto Const =
3818 B.buildFConstant(F64, llvm::bit_cast<double>(0x3fefffffffffffff));
3819
3820 Register Min = MRI.createGenericVirtualRegister(F64);
3821
3822 // We don't need to concern ourselves with the snan handling difference, so
3823 // use the one which will directly select.
3824 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
3825 if (MFI->getMode().IEEE)
3826 B.buildFMinNumIEEE(Min, Fract, Const, Flags);
3827 else
3828 B.buildFMinNum(Min, Fract, Const, Flags);
3829
3830 Register CorrectedFract = Min;
3831 if (!MI.getFlag(MachineInstr::FmNoNans)) {
3832 auto IsNan = B.buildFCmp(CmpInst::FCMP_ORD, S1, ModSrc, ModSrc, Flags);
3833 CorrectedFract = B.buildSelect(F64, IsNan, ModSrc, Min, Flags).getReg(0);
3834 }
3835
3836 auto NegFract = B.buildFNeg(F64, CorrectedFract, Flags);
3837 B.buildFAdd(Dst, OrigSrc, NegFract, Flags);
3838
3839 MI.eraseFromParent();
3840 return true;
3841}
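// A standalone sketch (plain C++, not part of this file) of the f64 floor
// expansion above: clamp fract(x) just below 1.0 (the V_FRACT bug workaround),
// pass NaN inputs through, and compute floor(x) = x - fract(x). std::floor is
// used only to model V_FRACT_F64; names are illustrative.
#include <cmath>
#include <cstdio>

static double fract_workaround(double x) {
  double f = x - std::floor(x);            // models V_FRACT_F64
  f = std::fmin(f, 0x1.fffffffffffffp-1);  // min(fract(x), 0.99999999999999999)
  return std::isnan(x) ? x : f;
}

static double floor_via_fract(double x) { return x - fract_workaround(x); }

int main() {
  for (double x : {1.5, -1.5, -0.25, 42.0})
    std::printf("x=%g  floor=%g  sketch=%g\n", x, std::floor(x),
                floor_via_fract(x));
}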
3842
3843// Turn an illegal packed v2s16 build vector into bit operations.
3844// TODO: This should probably be a bitcast action in LegalizerHelper.
3845bool AMDGPULegalizerInfo::legalizeBuildVector(
3846 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
3847 Register Dst = MI.getOperand(0).getReg();
3848 const LLT S32 = LLT::scalar(32);
3849 const LLT S16 = LLT::scalar(16);
3850 assert(MRI.getType(Dst) == LLT::fixed_vector(2, 16));
3851
3852 Register Src0 = MI.getOperand(1).getReg();
3853 Register Src1 = MI.getOperand(2).getReg();
3854
3855 if (MI.getOpcode() == AMDGPU::G_BUILD_VECTOR_TRUNC) {
3856 assert(MRI.getType(Src0) == S32);
3857 Src0 = B.buildTrunc(S16, MI.getOperand(1).getReg()).getReg(0);
3858 Src1 = B.buildTrunc(S16, MI.getOperand(2).getReg()).getReg(0);
3859 }
3860
3861 auto Merge = B.buildMergeLikeInstr(S32, {Src0, Src1});
3862 B.buildBitcast(Dst, Merge);
3863
3864 MI.eraseFromParent();
3865 return true;
3866}
3867
3868// Build a big integer multiply or multiply-add using MAD_64_32 instructions.
3869//
3870// Source and accumulation registers must all be 32-bits.
3871//
3872// TODO: When the multiply is uniform, we should produce a code sequence
3873// that is better suited to instruction selection on the SALU. Instead of
3874// the outer loop going over parts of the result, the outer loop should go
3875// over parts of one of the factors. This should result in instruction
3876// selection that makes full use of S_ADDC_U32 instructions.
3877void AMDGPULegalizerInfo::buildMultiply(LegalizerHelper &Helper,
3878 MutableArrayRef<Register> Accum,
3879 ArrayRef<Register> Src0,
3880 ArrayRef<Register> Src1,
3881 bool UsePartialMad64_32,
3882 bool SeparateOddAlignedProducts) const {
3883 // Use (possibly empty) vectors of S1 registers to represent the set of
3884 // carries from one pair of positions to the next.
3885 using Carry = SmallVector<Register, 2>;
3886
3887 MachineIRBuilder &B = Helper.MIRBuilder;
3888 GISelKnownBits &KB = *Helper.getKnownBits();
3889
3890 const LLT S1 = LLT::scalar(1);
3891 const LLT S32 = LLT::scalar(32);
3892 const LLT S64 = LLT::scalar(64);
3893
3894 Register Zero32;
3895 Register Zero64;
3896
3897 auto getZero32 = [&]() -> Register {
3898 if (!Zero32)
3899 Zero32 = B.buildConstant(S32, 0).getReg(0);
3900 return Zero32;
3901 };
3902 auto getZero64 = [&]() -> Register {
3903 if (!Zero64)
3904 Zero64 = B.buildConstant(S64, 0).getReg(0);
3905 return Zero64;
3906 };
3907
3908 SmallVector<bool, 2> Src0KnownZeros, Src1KnownZeros;
3909 for (unsigned i = 0; i < Src0.size(); ++i) {
3910 Src0KnownZeros.push_back(KB.getKnownBits(Src0[i]).isZero());
3911 Src1KnownZeros.push_back(KB.getKnownBits(Src1[i]).isZero());
3912 }
3913
3914 // Merge the given carries into the 32-bit LocalAccum, which is modified
3915 // in-place.
3916 //
3917 // Returns the carry-out, which is a single S1 register or null.
3918 auto mergeCarry =
3919 [&](Register &LocalAccum, const Carry &CarryIn) -> Register {
3920 if (CarryIn.empty())
3921 return Register();
3922
3923 bool HaveCarryOut = true;
3924 Register CarryAccum;
3925 if (CarryIn.size() == 1) {
3926 if (!LocalAccum) {
3927 LocalAccum = B.buildZExt(S32, CarryIn[0]).getReg(0);
3928 return Register();
3929 }
3930
3931 CarryAccum = getZero32();
3932 } else {
3933 CarryAccum = B.buildZExt(S32, CarryIn[0]).getReg(0);
3934 for (unsigned i = 1; i + 1 < CarryIn.size(); ++i) {
3935 CarryAccum =
3936 B.buildUAdde(S32, S1, CarryAccum, getZero32(), CarryIn[i])
3937 .getReg(0);
3938 }
3939
3940 if (!LocalAccum) {
3941 LocalAccum = getZero32();
3942 HaveCarryOut = false;
3943 }
3944 }
3945
3946 auto Add =
3947 B.buildUAdde(S32, S1, CarryAccum, LocalAccum, CarryIn.back());
3948 LocalAccum = Add.getReg(0);
3949 return HaveCarryOut ? Add.getReg(1) : Register();
3950 };
3951
3952 // Build a multiply-add chain to compute
3953 //
3954 // LocalAccum + (partial products at DstIndex)
3955 // + (opportunistic subset of CarryIn)
3956 //
3957 // LocalAccum is an array of one or two 32-bit registers that are updated
3958 // in-place. The incoming registers may be null.
3959 //
3960 // In some edge cases, carry-ins can be consumed "for free". In that case,
3961 // the consumed carry bits are removed from CarryIn in-place.
3962 auto buildMadChain =
3963 [&](MutableArrayRef<Register> LocalAccum, unsigned DstIndex, Carry &CarryIn)
3964 -> Carry {
3965 assert((DstIndex + 1 < Accum.size() && LocalAccum.size() == 2) ||
3966 (DstIndex + 1 >= Accum.size() && LocalAccum.size() == 1));
3967
3968 Carry CarryOut;
3969 unsigned j0 = 0;
3970
3971 // Use plain 32-bit multiplication for the most significant part of the
3972 // result by default.
3973 if (LocalAccum.size() == 1 &&
3974 (!UsePartialMad64_32 || !CarryIn.empty())) {
3975 do {
3976 // Skip multiplication if one of the operands is 0
3977 unsigned j1 = DstIndex - j0;
3978 if (Src0KnownZeros[j0] || Src1KnownZeros[j1]) {
3979 ++j0;
3980 continue;
3981 }
3982 auto Mul = B.buildMul(S32, Src0[j0], Src1[j1]);
3983 if (!LocalAccum[0] || KB.getKnownBits(LocalAccum[0]).isZero()) {
3984 LocalAccum[0] = Mul.getReg(0);
3985 } else {
3986 if (CarryIn.empty()) {
3987 LocalAccum[0] = B.buildAdd(S32, LocalAccum[0], Mul).getReg(0);
3988 } else {
3989 LocalAccum[0] =
3990 B.buildUAdde(S32, S1, LocalAccum[0], Mul, CarryIn.back())
3991 .getReg(0);
3992 CarryIn.pop_back();
3993 }
3994 }
3995 ++j0;
3996 } while (j0 <= DstIndex && (!UsePartialMad64_32 || !CarryIn.empty()));
3997 }
3998
3999 // Build full 64-bit multiplies.
4000 if (j0 <= DstIndex) {
4001 bool HaveSmallAccum = false;
4002 Register Tmp;
4003
4004 if (LocalAccum[0]) {
4005 if (LocalAccum.size() == 1) {
4006 Tmp = B.buildAnyExt(S64, LocalAccum[0]).getReg(0);
4007 HaveSmallAccum = true;
4008 } else if (LocalAccum[1]) {
4009 Tmp = B.buildMergeLikeInstr(S64, LocalAccum).getReg(0);
4010 HaveSmallAccum = false;
4011 } else {
4012 Tmp = B.buildZExt(S64, LocalAccum[0]).getReg(0);
4013 HaveSmallAccum = true;
4014 }
4015 } else {
4016 assert(LocalAccum.size() == 1 || !LocalAccum[1]);
4017 Tmp = getZero64();
4018 HaveSmallAccum = true;
4019 }
4020
4021 do {
4022 unsigned j1 = DstIndex - j0;
4023 if (Src0KnownZeros[j0] || Src1KnownZeros[j1]) {
4024 ++j0;
4025 continue;
4026 }
4027 auto Mad = B.buildInstr(AMDGPU::G_AMDGPU_MAD_U64_U32, {S64, S1},
4028 {Src0[j0], Src1[j1], Tmp});
4029 Tmp = Mad.getReg(0);
4030 if (!HaveSmallAccum)
4031 CarryOut.push_back(Mad.getReg(1));
4032 HaveSmallAccum = false;
4033
4034 ++j0;
4035 } while (j0 <= DstIndex);
4036
4037 auto Unmerge = B.buildUnmerge(S32, Tmp);
4038 LocalAccum[0] = Unmerge.getReg(0);
4039 if (LocalAccum.size() > 1)
4040 LocalAccum[1] = Unmerge.getReg(1);
4041 }
4042
4043 return CarryOut;
4044 };
4045
4046 // Outer multiply loop, iterating over destination parts from least
4047 // significant to most significant parts.
4048 //
4049 // The columns of the following diagram correspond to the destination parts
4050 // affected by one iteration of the outer loop (ignoring boundary
4051 // conditions).
4052 //
4053 // Dest index relative to 2 * i: 1 0 -1
4054 // ------
4055 // Carries from previous iteration: e o
4056 // Even-aligned partial product sum: E E .
4057 // Odd-aligned partial product sum: O O
4058 //
4059 // 'o' is OddCarry, 'e' is EvenCarry.
4060 // EE and OO are computed from partial products via buildMadChain and use
4061 // accumulation where possible and appropriate.
4062 //
4063 Register SeparateOddCarry;
4064 Carry EvenCarry;
4065 Carry OddCarry;
4066
4067 for (unsigned i = 0; i <= Accum.size() / 2; ++i) {
4068 Carry OddCarryIn = std::move(OddCarry);
4069 Carry EvenCarryIn = std::move(EvenCarry);
4070 OddCarry.clear();
4071 EvenCarry.clear();
4072
4073 // Partial products at offset 2 * i.
4074 if (2 * i < Accum.size()) {
4075 auto LocalAccum = Accum.drop_front(2 * i).take_front(2);
4076 EvenCarry = buildMadChain(LocalAccum, 2 * i, EvenCarryIn);
4077 }
4078
4079 // Partial products at offset 2 * i - 1.
4080 if (i > 0) {
4081 if (!SeparateOddAlignedProducts) {
4082 auto LocalAccum = Accum.drop_front(2 * i - 1).take_front(2);
4083 OddCarry = buildMadChain(LocalAccum, 2 * i - 1, OddCarryIn);
4084 } else {
4085 bool IsHighest = 2 * i >= Accum.size();
4086 Register SeparateOddOut[2];
4087 auto LocalAccum = MutableArrayRef(SeparateOddOut)
4088 .take_front(IsHighest ? 1 : 2);
4089 OddCarry = buildMadChain(LocalAccum, 2 * i - 1, OddCarryIn);
4090
4091 MachineInstr *Lo;
4092
4093 if (i == 1) {
4094 if (!IsHighest)
4095 Lo = B.buildUAddo(S32, S1, Accum[2 * i - 1], SeparateOddOut[0]);
4096 else
4097 Lo = B.buildAdd(S32, Accum[2 * i - 1], SeparateOddOut[0]);
4098 } else {
4099 Lo = B.buildUAdde(S32, S1, Accum[2 * i - 1], SeparateOddOut[0],
4100 SeparateOddCarry);
4101 }
4102 Accum[2 * i - 1] = Lo->getOperand(0).getReg();
4103
4104 if (!IsHighest) {
4105 auto Hi = B.buildUAdde(S32, S1, Accum[2 * i], SeparateOddOut[1],
4106 Lo->getOperand(1).getReg());
4107 Accum[2 * i] = Hi.getReg(0);
4108 SeparateOddCarry = Hi.getReg(1);
4109 }
4110 }
4111 }
4112
4113 // Add in the carries from the previous iteration
4114 if (i > 0) {
4115 if (Register CarryOut = mergeCarry(Accum[2 * i - 1], OddCarryIn))
4116 EvenCarryIn.push_back(CarryOut);
4117
4118 if (2 * i < Accum.size()) {
4119 if (Register CarryOut = mergeCarry(Accum[2 * i], EvenCarryIn))
4120 OddCarry.push_back(CarryOut);
4121 }
4122 }
4123 }
4124}
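// A standalone sketch (plain C++, not part of this file) of the limb
// arithmetic the MAD chains above implement: each step is a 32x32 multiply
// accumulated into a 64-bit value (the role of V_MAD_U64_U32); the low half
// feeds the current result limb and the high half carries upward.
#include <cstdint>
#include <cstdio>
#include <vector>

static std::vector<uint32_t> mulLimbs(const std::vector<uint32_t> &A,
                                      const std::vector<uint32_t> &B) {
  std::vector<uint32_t> R(A.size() + B.size(), 0);
  for (size_t i = 0; i < A.size(); ++i) {
    uint64_t Carry = 0;
    for (size_t j = 0; j < B.size(); ++j) {
      uint64_t T = (uint64_t)A[i] * B[j] + R[i + j] + Carry; // mad_u64_u32
      R[i + j] = (uint32_t)T;
      Carry = T >> 32;
    }
    R[i + B.size()] += (uint32_t)Carry;
  }
  return R;
}

int main() {
  auto R = mulLimbs({0xffffffffu, 0x1u}, {0xffffffffu, 0x0u});
  std::printf("%08x %08x %08x %08x\n", R[3], R[2], R[1], R[0]); // 0 1 fffffffd 1
}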
4125
4126// Custom narrowing of wide multiplies using wide multiply-add instructions.
4127//
4128// TODO: If the multiply is followed by an addition, we should attempt to
4129// integrate it to make better use of V_MAD_U64_U32's multiply-add capabilities.
4130bool AMDGPULegalizerInfo::legalizeMul(LegalizerHelper &Helper,
4131 MachineInstr &MI) const {
4132 assert(ST.hasMad64_32());
4133 assert(MI.getOpcode() == TargetOpcode::G_MUL);
4134
4135 MachineIRBuilder &B = Helper.MIRBuilder;
4136 MachineRegisterInfo &MRI = *B.getMRI();
4137
4138 Register DstReg = MI.getOperand(0).getReg();
4139 Register Src0 = MI.getOperand(1).getReg();
4140 Register Src1 = MI.getOperand(2).getReg();
4141
4142 LLT Ty = MRI.getType(DstReg);
4143 assert(Ty.isScalar());
4144
4145 unsigned Size = Ty.getSizeInBits();
4146 unsigned NumParts = Size / 32;
4147 assert((Size % 32) == 0);
4148 assert(NumParts >= 2);
4149
4150 // Whether to use MAD_64_32 for partial products whose high half is
4151 // discarded. This avoids some ADD instructions but risks false dependency
4152 // stalls on some subtargets in some cases.
4153 const bool UsePartialMad64_32 = ST.getGeneration() < AMDGPUSubtarget::GFX10;
4154
4155 // Whether to compute odd-aligned partial products separately. This is
4156 // advisable on subtargets where the accumulator of MAD_64_32 must be placed
4157 // in an even-aligned VGPR.
4158 const bool SeparateOddAlignedProducts = ST.hasFullRate64Ops();
4159
4160 LLT S32 = LLT::scalar(32);
4161 SmallVector<Register, 2> Src0Parts, Src1Parts;
4162 for (unsigned i = 0; i < NumParts; ++i) {
4163 Src0Parts.push_back(MRI.createGenericVirtualRegister(S32));
4164 Src1Parts.push_back(MRI.createGenericVirtualRegister(S32));
4165 }
4166 B.buildUnmerge(Src0Parts, Src0);
4167 B.buildUnmerge(Src1Parts, Src1);
4168
4169 SmallVector<Register, 2> AccumRegs(NumParts);
4170 buildMultiply(Helper, AccumRegs, Src0Parts, Src1Parts, UsePartialMad64_32,
4171 SeparateOddAlignedProducts);
4172
4173 B.buildMergeLikeInstr(DstReg, AccumRegs);
4174 MI.eraseFromParent();
4175 return true;
4176}
4177
4178// Legalize ctlz/cttz to ffbh/ffbl instead of the default legalization to
4179// ctlz/cttz_zero_undef. This allows us to fix up the result for the zero input
4180// case with a single min instruction instead of a compare+select.
4181bool AMDGPULegalizerInfo::legalizeCTLZ_CTTZ(MachineInstr &MI,
4182 MachineRegisterInfo &MRI,
4183 MachineIRBuilder &B) const {
4184 Register Dst = MI.getOperand(0).getReg();
4185 Register Src = MI.getOperand(1).getReg();
4186 LLT DstTy = MRI.getType(Dst);
4187 LLT SrcTy = MRI.getType(Src);
4188
4189 unsigned NewOpc = MI.getOpcode() == AMDGPU::G_CTLZ
4190 ? AMDGPU::G_AMDGPU_FFBH_U32
4191 : AMDGPU::G_AMDGPU_FFBL_B32;
4192 auto Tmp = B.buildInstr(NewOpc, {DstTy}, {Src});
4193 B.buildUMin(Dst, Tmp, B.buildConstant(DstTy, SrcTy.getSizeInBits()));
4194
4195 MI.eraseFromParent();
4196 return true;
4197}
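// A standalone sketch (plain C++, not part of this file) of the lowering
// above: ffbh/ffbl return ~0u for a zero input, so a single unsigned min
// against the bit width yields the generic ctlz/cttz result without a
// compare+select. __builtin_clz is a GCC/Clang builtin used here to model
// V_FFBH_U32.
#include <algorithm>
#include <cstdint>
#include <cstdio>

static uint32_t ffbh_u32(uint32_t x) {
  return x ? (uint32_t)__builtin_clz(x) : ~0u;
}

static uint32_t ctlz32(uint32_t x) { return std::min(ffbh_u32(x), 32u); }

int main() {
  std::printf("%u %u %u\n", ctlz32(0), ctlz32(1), ctlz32(0x80000000u)); // 32 31 0
}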
4198
4199bool AMDGPULegalizerInfo::legalizeCTLZ_ZERO_UNDEF(MachineInstr &MI,
4200 MachineRegisterInfo &MRI,
4201 MachineIRBuilder &B) const {
4202 Register Dst = MI.getOperand(0).getReg();
4203 Register Src = MI.getOperand(1).getReg();
4204 LLT SrcTy = MRI.getType(Src);
4205 TypeSize NumBits = SrcTy.getSizeInBits();
4206
4207 assert(NumBits < 32u);
4208
4209 auto ShiftAmt = B.buildConstant(S32, 32u - NumBits);
4210 auto Extend = B.buildAnyExt(S32, {Src}).getReg(0u);
4211 auto Shift = B.buildShl(S32, Extend, ShiftAmt);
4212 auto Ctlz = B.buildInstr(AMDGPU::G_AMDGPU_FFBH_U32, {S32}, {Shift});
4213 B.buildTrunc(Dst, Ctlz);
4214 MI.eraseFromParent();
4215 return true;
4216}
4217
4218// Check that this is a G_XOR x, -1
4219static bool isNot(const MachineRegisterInfo &MRI, const MachineInstr &MI) {
4220 if (MI.getOpcode() != TargetOpcode::G_XOR)
4221 return false;
4222 auto ConstVal = getIConstantVRegSExtVal(MI.getOperand(2).getReg(), MRI);
4223 return ConstVal && *ConstVal == -1;
4224}
4225
4226// Return the use branch instruction, otherwise null if the usage is invalid.
4227static MachineInstr *
4228verifyCFIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI, MachineInstr *&Br,
4229 MachineBasicBlock *&UncondBrTarget, bool &Negated) {
4230 Register CondDef = MI.getOperand(0).getReg();
4231 if (!MRI.hasOneNonDBGUse(CondDef))
4232 return nullptr;
4233
4234 MachineBasicBlock *Parent = MI.getParent();
4235 MachineInstr *UseMI = &*MRI.use_instr_nodbg_begin(CondDef);
4236
4237 if (isNot(MRI, *UseMI)) {
4238 Register NegatedCond = UseMI->getOperand(0).getReg();
4239 if (!MRI.hasOneNonDBGUse(NegatedCond))
4240 return nullptr;
4241
4242 // We're deleting the def of this value, so we need to remove it.
4243 eraseInstr(*UseMI, MRI);
4244
4245 UseMI = &*MRI.use_instr_nodbg_begin(NegatedCond);
4246 Negated = true;
4247 }
4248
4249 if (UseMI->getParent() != Parent || UseMI->getOpcode() != AMDGPU::G_BRCOND)
4250 return nullptr;
4251
4252 // Make sure the cond br is followed by a G_BR, or is the last instruction.
4253 MachineBasicBlock::iterator Next = std::next(UseMI->getIterator());
4254 if (Next == Parent->end()) {
4255 MachineFunction::iterator NextMBB = std::next(Parent->getIterator());
4256 if (NextMBB == Parent->getParent()->end()) // Illegal intrinsic use.
4257 return nullptr;
4258 UncondBrTarget = &*NextMBB;
4259 } else {
4260 if (Next->getOpcode() != AMDGPU::G_BR)
4261 return nullptr;
4262 Br = &*Next;
4263 UncondBrTarget = Br->getOperand(0).getMBB();
4264 }
4265
4266 return UseMI;
4267}
4268
4269bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B,
4270 const ArgDescriptor *Arg,
4271 const TargetRegisterClass *ArgRC,
4272 LLT ArgTy) const {
4273 MCRegister SrcReg = Arg->getRegister();
4274 assert(Register::isPhysicalRegister(SrcReg) && "Physical register expected");
4275 assert(DstReg.isVirtual() && "Virtual register expected");
4276
4277 Register LiveIn = getFunctionLiveInPhysReg(B.getMF(), B.getTII(), SrcReg,
4278 *ArgRC, B.getDebugLoc(), ArgTy);
4279 if (Arg->isMasked()) {
4280 // TODO: Should we try to emit this once in the entry block?
4281 const LLT S32 = LLT::scalar(32);
4282 const unsigned Mask = Arg->getMask();
4283 const unsigned Shift = llvm::countr_zero<unsigned>(Mask);
4284
4285 Register AndMaskSrc = LiveIn;
4286
4287 // TODO: Avoid clearing the high bits if we know workitem id y/z are always
4288 // 0.
4289 if (Shift != 0) {
4290 auto ShiftAmt = B.buildConstant(S32, Shift);
4291 AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0);
4292 }
4293
4294 B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift));
4295 } else {
4296 B.buildCopy(DstReg, LiveIn);
4297 }
4298
4299 return true;
4300}
4301
4302bool AMDGPULegalizerInfo::loadInputValue(
4303 Register DstReg, MachineIRBuilder &B,
4304 AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
4305 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
4306 const ArgDescriptor *Arg = nullptr;
4307 const TargetRegisterClass *ArgRC;
4308 LLT ArgTy;
4309
4310 CallingConv::ID CC = B.getMF().getFunction().getCallingConv();
4311 const ArgDescriptor WorkGroupIDX =
4312 ArgDescriptor::createRegister(AMDGPU::TTMP9);
4313 // If GridZ is not programmed in an entry function then the hardware will set
4314 // it to all zeros, so there is no need to mask the GridY value in the low
4315 // order bits.
4316 const ArgDescriptor WorkGroupIDY = ArgDescriptor::createRegister(
4317 AMDGPU::TTMP7,
4318 AMDGPU::isEntryFunctionCC(CC) && !MFI->hasWorkGroupIDZ() ? ~0u : 0xFFFFu);
4319 const ArgDescriptor WorkGroupIDZ =
4320 ArgDescriptor::createRegister(AMDGPU::TTMP7, 0xFFFF0000u);
4321 if (ST.hasArchitectedSGPRs() &&
4322 (AMDGPU::isCompute(CC) || CC == CallingConv::AMDGPU_Gfx)) {
4323 switch (ArgType) {
4324 case AMDGPUFunctionArgInfo::WORKGROUP_ID_X:
4325 Arg = &WorkGroupIDX;
4326 ArgRC = &AMDGPU::SReg_32RegClass;
4327 ArgTy = LLT::scalar(32);
4328 break;
4329 case AMDGPUFunctionArgInfo::WORKGROUP_ID_Y:
4330 Arg = &WorkGroupIDY;
4331 ArgRC = &AMDGPU::SReg_32RegClass;
4332 ArgTy = LLT::scalar(32);
4333 break;
4334 case AMDGPUFunctionArgInfo::WORKGROUP_ID_Z:
4335 Arg = &WorkGroupIDZ;
4336 ArgRC = &AMDGPU::SReg_32RegClass;
4337 ArgTy = LLT::scalar(32);
4338 break;
4339 default:
4340 break;
4341 }
4342 }
4343
4344 if (!Arg)
4345 std::tie(Arg, ArgRC, ArgTy) = MFI->getPreloadedValue(ArgType);
4346
4347 if (!Arg) {
4348 if (ArgType == AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR) {
4349 // The intrinsic may appear when we have a 0 sized kernarg segment, in which
4350 // case the pointer argument may be missing and we use null.
4351 B.buildConstant(DstReg, 0);
4352 return true;
4353 }
4354
4355 // It's undefined behavior if a function marked with the amdgpu-no-*
4356 // attributes uses the corresponding intrinsic.
4357 B.buildUndef(DstReg);
4358 return true;
4359 }
4360
4361 if (!Arg->isRegister() || !Arg->getRegister().isValid())
4362 return false; // TODO: Handle these
4363 return loadInputValue(DstReg, B, Arg, ArgRC, ArgTy);
4364}
4365
4366bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin(
4367 MachineInstr &MI, MachineIRBuilder &B,
4368 AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
4369 if (!loadInputValue(MI.getOperand(0).getReg(), B, ArgType))
4370 return false;
4371
4372 MI.eraseFromParent();
4373 return true;
4374}
4375
4376static bool replaceWithConstant(MachineIRBuilder &B, MachineInstr &MI,
4377 int64_t C) {
4378 B.buildConstant(MI.getOperand(0).getReg(), C);
4379 MI.eraseFromParent();
4380 return true;
4381}
4382
4383bool AMDGPULegalizerInfo::legalizeWorkitemIDIntrinsic(
4384 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B,
4385 unsigned Dim, AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
4386 unsigned MaxID = ST.getMaxWorkitemID(B.getMF().getFunction(), Dim);
4387 if (MaxID == 0)
4388 return replaceWithConstant(B, MI, 0);
4389
4390 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
4391 const ArgDescriptor *Arg;
4392 const TargetRegisterClass *ArgRC;
4393 LLT ArgTy;
4394 std::tie(Arg, ArgRC, ArgTy) = MFI->getPreloadedValue(ArgType);
4395
4396 Register DstReg = MI.getOperand(0).getReg();
4397 if (!Arg) {
4398 // It's undefined behavior if a function marked with the amdgpu-no-*
4399 // attributes uses the corresponding intrinsic.
4400 B.buildUndef(DstReg);
4401 MI.eraseFromParent();
4402 return true;
4403 }
4404
4405 if (Arg->isMasked()) {
4406 // Don't bother inserting AssertZext for packed IDs since we're emitting the
4407 // masking operations anyway.
4408 //
4409 // TODO: We could assert the top bit is 0 for the source copy.
4410 if (!loadInputValue(DstReg, B, ArgType))
4411 return false;
4412 } else {
4413 Register TmpReg = MRI.createGenericVirtualRegister(LLT::scalar(32));
4414 if (!loadInputValue(TmpReg, B, ArgType))
4415 return false;
4416 B.buildAssertZExt(DstReg, TmpReg, llvm::bit_width(MaxID));
4417 }
4418
4419 MI.eraseFromParent();
4420 return true;
4421}
4422
4423Register AMDGPULegalizerInfo::getKernargParameterPtr(MachineIRBuilder &B,
4424 int64_t Offset) const {
4425 LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
4426 Register KernArgReg = B.getMRI()->createGenericVirtualRegister(PtrTy);
4427
4428 // TODO: If we passed in the base kernel offset we could have a better
4429 // alignment than 4, but we don't really need it.
4430 if (!loadInputValue(KernArgReg, B,
4431 AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR))
4432 llvm_unreachable("failed to find kernarg segment ptr");
4433
4434 auto COffset = B.buildConstant(LLT::scalar(64), Offset);
4435 // TODO: Should get nuw
4436 return B.buildPtrAdd(PtrTy, KernArgReg, COffset).getReg(0);
4437}
4438
4439/// Legalize a value that's loaded from kernel arguments. This is only used by
4440/// legacy intrinsics.
4441bool AMDGPULegalizerInfo::legalizeKernargMemParameter(MachineInstr &MI,
4442 MachineIRBuilder &B,
4443 uint64_t Offset,
4444 Align Alignment) const {
4445 Register DstReg = MI.getOperand(0).getReg();
4446
4447 assert(B.getMRI()->getType(DstReg) == LLT::scalar(32) &&
4448 "unexpected kernarg parameter type");
4449
4450 Register Ptr = getKernargParameterPtr(B, Offset);
4451 MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
4452 B.buildLoad(DstReg, Ptr, PtrInfo, Align(4),
4453 MachineMemOperand::MODereferenceable |
4454 MachineMemOperand::MOInvariant);
4455 MI.eraseFromParent();
4456 return true;
4457}
4458
4459bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI,
4460 MachineRegisterInfo &MRI,
4461 MachineIRBuilder &B) const {
4462 Register Dst = MI.getOperand(0).getReg();
4463 LLT DstTy = MRI.getType(Dst);
4464 LLT S16 = LLT::scalar(16);
4465 LLT S32 = LLT::scalar(32);
4466 LLT S64 = LLT::scalar(64);
4467
4468 if (DstTy == S16)
4469 return legalizeFDIV16(MI, MRI, B);
4470 if (DstTy == S32)
4471 return legalizeFDIV32(MI, MRI, B);
4472 if (DstTy == S64)
4473 return legalizeFDIV64(MI, MRI, B);
4474
4475 return false;
4476}
4477
4478void AMDGPULegalizerInfo::legalizeUnsignedDIV_REM32Impl(MachineIRBuilder &B,
4479 Register DstDivReg,
4480 Register DstRemReg,
4481 Register X,
4482 Register Y) const {
4483 const LLT S1 = LLT::scalar(1);
4484 const LLT S32 = LLT::scalar(32);
4485
4486 // See AMDGPUCodeGenPrepare::expandDivRem32 for a description of the
4487 // algorithm used here.
4488
4489 // Initial estimate of inv(y).
4490 auto FloatY = B.buildUITOFP(S32, Y);
4491 auto RcpIFlag = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {FloatY});
4492 auto Scale = B.buildFConstant(S32, llvm::bit_cast<float>(0x4f7ffffe));
4493 auto ScaledY = B.buildFMul(S32, RcpIFlag, Scale);
4494 auto Z = B.buildFPTOUI(S32, ScaledY);
4495
4496 // One round of UNR.
4497 auto NegY = B.buildSub(S32, B.buildConstant(S32, 0), Y);
4498 auto NegYZ = B.buildMul(S32, NegY, Z);
4499 Z = B.buildAdd(S32, Z, B.buildUMulH(S32, Z, NegYZ));
4500
4501 // Quotient/remainder estimate.
4502 auto Q = B.buildUMulH(S32, X, Z);
4503 auto R = B.buildSub(S32, X, B.buildMul(S32, Q, Y));
4504
4505 // First quotient/remainder refinement.
4506 auto One = B.buildConstant(S32, 1);
4507 auto Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y);
4508 if (DstDivReg)
4509 Q = B.buildSelect(S32, Cond, B.buildAdd(S32, Q, One), Q);
4510 R = B.buildSelect(S32, Cond, B.buildSub(S32, R, Y), R);
4511
4512 // Second quotient/remainder refinement.
4513 Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y);
4514 if (DstDivReg)
4515 B.buildSelect(DstDivReg, Cond, B.buildAdd(S32, Q, One), Q);
4516
4517 if (DstRemReg)
4518 B.buildSelect(DstRemReg, Cond, B.buildSub(S32, R, Y), R);
4519}
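// A standalone sketch (plain C++, not part of this file) of the 32-bit
// unsigned division sequence above: a float-reciprocal initial estimate of
// 2^32/y, one Newton-Raphson refinement, then two conditional corrections of
// the quotient/remainder pair. 1.0f/y models V_RCP_IFLAG_F32; names are
// illustrative.
#include <cstdint>
#include <cstdio>

static void udiv32(uint32_t x, uint32_t y, uint32_t &q, uint32_t &r) {
  float rcp = 1.0f / (float)y;
  uint32_t z = (uint32_t)(rcp * 0x1.fffffcp+31f); // scale by 0x4f7ffffe
  uint32_t negYZ = (0u - y) * z;                  // wraps mod 2^32
  z += (uint32_t)(((uint64_t)z * negYZ) >> 32);   // one Newton-Raphson round
  q = (uint32_t)(((uint64_t)x * z) >> 32);        // quotient estimate
  r = x - q * y;                                  // remainder estimate
  if (r >= y) { ++q; r -= y; }                    // first refinement
  if (r >= y) { ++q; r -= y; }                    // second refinement
}

int main() {
  uint32_t q, r;
  udiv32(1000000007u, 97u, q, r);
  std::printf("%u %u (reference %u %u)\n", q, r, 1000000007u / 97u,
              1000000007u % 97u);
}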
4520
4521// Build integer reciprocal sequence around V_RCP_IFLAG_F32
4522//
4523// Return lo, hi of result
4524//
4525// %cvt.lo = G_UITOFP Val.lo
4526// %cvt.hi = G_UITOFP Val.hi
4527// %mad = G_FMAD %cvt.hi, 2**32, %cvt.lo
4528// %rcp = G_AMDGPU_RCP_IFLAG %mad
4529// %mul1 = G_FMUL %rcp, 0x5f7ffffc
4530// %mul2 = G_FMUL %mul1, 2**(-32)
4531// %trunc = G_INTRINSIC_TRUNC %mul2
4532// %mad2 = G_FMAD %trunc, -(2**32), %mul1
4533// return {G_FPTOUI %mad2, G_FPTOUI %trunc}
4534static std::pair<Register, Register> emitReciprocalU64(MachineIRBuilder &B,
4535 Register Val) {
4536 const LLT S32 = LLT::scalar(32);
4537 auto Unmerge = B.buildUnmerge(S32, Val);
4538
4539 auto CvtLo = B.buildUITOFP(S32, Unmerge.getReg(0));
4540 auto CvtHi = B.buildUITOFP(S32, Unmerge.getReg(1));
4541
4542 auto Mad = B.buildFMAD(
4543 S32, CvtHi, // 2**32
4544 B.buildFConstant(S32, llvm::bit_cast<float>(0x4f800000)), CvtLo);
4545
4546 auto Rcp = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {Mad});
4547 auto Mul1 = B.buildFMul(
4548 S32, Rcp, B.buildFConstant(S32, llvm::bit_cast<float>(0x5f7ffffc)));
4549
4550 // 2**(-32)
4551 auto Mul2 = B.buildFMul(
4552 S32, Mul1, B.buildFConstant(S32, llvm::bit_cast<float>(0x2f800000)));
4553 auto Trunc = B.buildIntrinsicTrunc(S32, Mul2);
4554
4555 // -(2**32)
4556 auto Mad2 = B.buildFMAD(
4557 S32, Trunc, B.buildFConstant(S32, llvm::bit_cast<float>(0xcf800000)),
4558 Mul1);
4559
4560 auto ResultLo = B.buildFPTOUI(S32, Mad2);
4561 auto ResultHi = B.buildFPTOUI(S32, Trunc);
4562
4563 return {ResultLo.getReg(0), ResultHi.getReg(0)};
4564}
4565
4566void AMDGPULegalizerInfo::legalizeUnsignedDIV_REM64Impl(MachineIRBuilder &B,
4567 Register DstDivReg,
4568 Register DstRemReg,
4569 Register Numer,
4570 Register Denom) const {
4571 const LLT S32 = LLT::scalar(32);
4572 const LLT S64 = LLT::scalar(64);
4573 const LLT S1 = LLT::scalar(1);
4574 Register RcpLo, RcpHi;
4575
4576 std::tie(RcpLo, RcpHi) = emitReciprocalU64(B, Denom);
4577
4578 auto Rcp = B.buildMergeLikeInstr(S64, {RcpLo, RcpHi});
4579
4580 auto Zero64 = B.buildConstant(S64, 0);
4581 auto NegDenom = B.buildSub(S64, Zero64, Denom);
4582
4583 auto MulLo1 = B.buildMul(S64, NegDenom, Rcp);
4584 auto MulHi1 = B.buildUMulH(S64, Rcp, MulLo1);
4585
4586 auto UnmergeMulHi1 = B.buildUnmerge(S32, MulHi1);
4587 Register MulHi1_Lo = UnmergeMulHi1.getReg(0);
4588 Register MulHi1_Hi = UnmergeMulHi1.getReg(1);
4589
4590 auto Add1_Lo = B.buildUAddo(S32, S1, RcpLo, MulHi1_Lo);
4591 auto Add1_Hi = B.buildUAdde(S32, S1, RcpHi, MulHi1_Hi, Add1_Lo.getReg(1));
4592 auto Add1 = B.buildMergeLikeInstr(S64, {Add1_Lo, Add1_Hi});
4593
4594 auto MulLo2 = B.buildMul(S64, NegDenom, Add1);
4595 auto MulHi2 = B.buildUMulH(S64, Add1, MulLo2);
4596 auto UnmergeMulHi2 = B.buildUnmerge(S32, MulHi2);
4597 Register MulHi2_Lo = UnmergeMulHi2.getReg(0);
4598 Register MulHi2_Hi = UnmergeMulHi2.getReg(1);
4599
4600 auto Zero32 = B.buildConstant(S32, 0);
4601 auto Add2_Lo = B.buildUAddo(S32, S1, Add1_Lo, MulHi2_Lo);
4602 auto Add2_Hi = B.buildUAdde(S32, S1, Add1_Hi, MulHi2_Hi, Add2_Lo.getReg(1));
4603 auto Add2 = B.buildMergeLikeInstr(S64, {Add2_Lo, Add2_Hi});
4604
4605 auto UnmergeNumer = B.buildUnmerge(S32, Numer);
4606 Register NumerLo = UnmergeNumer.getReg(0);
4607 Register NumerHi = UnmergeNumer.getReg(1);
4608
4609 auto MulHi3 = B.buildUMulH(S64, Numer, Add2);
4610 auto Mul3 = B.buildMul(S64, Denom, MulHi3);
4611 auto UnmergeMul3 = B.buildUnmerge(S32, Mul3);
4612 Register Mul3_Lo = UnmergeMul3.getReg(0);
4613 Register Mul3_Hi = UnmergeMul3.getReg(1);
4614 auto Sub1_Lo = B.buildUSubo(S32, S1, NumerLo, Mul3_Lo);
4615 auto Sub1_Hi = B.buildUSube(S32, S1, NumerHi, Mul3_Hi, Sub1_Lo.getReg(1));
4616 auto Sub1_Mi = B.buildSub(S32, NumerHi, Mul3_Hi);
4617 auto Sub1 = B.buildMergeLikeInstr(S64, {Sub1_Lo, Sub1_Hi});
4618
4619 auto UnmergeDenom = B.buildUnmerge(S32, Denom);
4620 Register DenomLo = UnmergeDenom.getReg(0);
4621 Register DenomHi = UnmergeDenom.getReg(1);
4622
4623 auto CmpHi = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Hi, DenomHi);
4624 auto C1 = B.buildSExt(S32, CmpHi);
4625
4626 auto CmpLo = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Lo, DenomLo);
4627 auto C2 = B.buildSExt(S32, CmpLo);
4628
4629 auto CmpEq = B.buildICmp(CmpInst::ICMP_EQ, S1, Sub1_Hi, DenomHi);
4630 auto C3 = B.buildSelect(S32, CmpEq, C2, C1);
4631
4632 // TODO: Here and below portions of the code can be enclosed into if/endif.
4633 // Currently control flow is unconditional and we have 4 selects after
4634 // potential endif to substitute PHIs.
4635
4636 // if C3 != 0 ...
4637 auto Sub2_Lo = B.buildUSubo(S32, S1, Sub1_Lo, DenomLo);
4638 auto Sub2_Mi = B.buildUSube(S32, S1, Sub1_Mi, DenomHi, Sub1_Lo.getReg(1));
4639 auto Sub2_Hi = B.buildUSube(S32, S1, Sub2_Mi, Zero32, Sub2_Lo.getReg(1));
4640 auto Sub2 = B.buildMergeLikeInstr(S64, {Sub2_Lo, Sub2_Hi});
4641
4642 auto One64 = B.buildConstant(S64, 1);
4643 auto Add3 = B.buildAdd(S64, MulHi3, One64);
4644
4645 auto C4 =
4646 B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Hi, DenomHi));
4647 auto C5 =
4648 B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Lo, DenomLo));
4649 auto C6 = B.buildSelect(
4650 S32, B.buildICmp(CmpInst::ICMP_EQ, S1, Sub2_Hi, DenomHi), C5, C4);
4651
4652 // if (C6 != 0)
4653 auto Add4 = B.buildAdd(S64, Add3, One64);
4654 auto Sub3_Lo = B.buildUSubo(S32, S1, Sub2_Lo, DenomLo);
4655
4656 auto Sub3_Mi = B.buildUSube(S32, S1, Sub2_Mi, DenomHi, Sub2_Lo.getReg(1));
4657 auto Sub3_Hi = B.buildUSube(S32, S1, Sub3_Mi, Zero32, Sub3_Lo.getReg(1));
4658 auto Sub3 = B.buildMergeLikeInstr(S64, {Sub3_Lo, Sub3_Hi});
4659
4660 // endif C6
4661 // endif C3
4662
4663 if (DstDivReg) {
4664 auto Sel1 = B.buildSelect(
4665 S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Add4, Add3);
4666 B.buildSelect(DstDivReg, B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32),
4667 Sel1, MulHi3);
4668 }
4669
4670 if (DstRemReg) {
4671 auto Sel2 = B.buildSelect(
4672 S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Sub3, Sub2);
4673 B.buildSelect(DstRemReg, B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32),
4674 Sel2, Sub1);
4675 }
4676}
4677
4678bool AMDGPULegalizerInfo::legalizeUnsignedDIV_REM(MachineInstr &MI,
4679 MachineRegisterInfo &MRI,
4680 MachineIRBuilder &B) const {
4681 Register DstDivReg, DstRemReg;
4682 switch (MI.getOpcode()) {
4683 default:
4684 llvm_unreachable("Unexpected opcode!");
4685 case AMDGPU::G_UDIV: {
4686 DstDivReg = MI.getOperand(0).getReg();
4687 break;
4688 }
4689 case AMDGPU::G_UREM: {
4690 DstRemReg = MI.getOperand(0).getReg();
4691 break;
4692 }
4693 case AMDGPU::G_UDIVREM: {
4694 DstDivReg = MI.getOperand(0).getReg();
4695 DstRemReg = MI.getOperand(1).getReg();
4696 break;
4697 }
4698 }
4699
4700 const LLT S64 = LLT::scalar(64);
4701 const LLT S32 = LLT::scalar(32);
4702 const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs();
4703 Register Num = MI.getOperand(FirstSrcOpIdx).getReg();
4704 Register Den = MI.getOperand(FirstSrcOpIdx + 1).getReg();
4705 LLT Ty = MRI.getType(MI.getOperand(0).getReg());
4706
4707 if (Ty == S32)
4708 legalizeUnsignedDIV_REM32Impl(B, DstDivReg, DstRemReg, Num, Den);
4709 else if (Ty == S64)
4710 legalizeUnsignedDIV_REM64Impl(B, DstDivReg, DstRemReg, Num, Den);
4711 else
4712 return false;
4713
4714 MI.eraseFromParent();
4715 return true;
4716}
4717
4718bool AMDGPULegalizerInfo::legalizeSignedDIV_REM(MachineInstr &MI,
4719 MachineRegisterInfo &MRI,
4720 MachineIRBuilder &B) const {
4721 const LLT S64 = LLT::scalar(64);
4722 const LLT S32 = LLT::scalar(32);
4723
4724 LLT Ty = MRI.getType(MI.getOperand(0).getReg());
4725 if (Ty != S32 && Ty != S64)
4726 return false;
4727
4728 const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs();
4729 Register LHS = MI.getOperand(FirstSrcOpIdx).getReg();
4730 Register RHS = MI.getOperand(FirstSrcOpIdx + 1).getReg();
4731
4732 auto SignBitOffset = B.buildConstant(S32, Ty.getSizeInBits() - 1);
4733 auto LHSign = B.buildAShr(Ty, LHS, SignBitOffset);
4734 auto RHSign = B.buildAShr(Ty, RHS, SignBitOffset);
4735
4736 LHS = B.buildAdd(Ty, LHS, LHSign).getReg(0);
4737 RHS = B.buildAdd(Ty, RHS, RHSign).getReg(0);
4738
4739 LHS = B.buildXor(Ty, LHS, LHSign).getReg(0);
4740 RHS = B.buildXor(Ty, RHS, RHSign).getReg(0);
4741
4742 Register DstDivReg, DstRemReg, TmpDivReg, TmpRemReg;
4743 switch (MI.getOpcode()) {
4744 default:
4745 llvm_unreachable("Unexpected opcode!");
4746 case AMDGPU::G_SDIV: {
4747 DstDivReg = MI.getOperand(0).getReg();
4748 TmpDivReg = MRI.createGenericVirtualRegister(Ty);
4749 break;
4750 }
4751 case AMDGPU::G_SREM: {
4752 DstRemReg = MI.getOperand(0).getReg();
4753 TmpRemReg = MRI.createGenericVirtualRegister(Ty);
4754 break;
4755 }
4756 case AMDGPU::G_SDIVREM: {
4757 DstDivReg = MI.getOperand(0).getReg();
4758 DstRemReg = MI.getOperand(1).getReg();
4759 TmpDivReg = MRI.createGenericVirtualRegister(Ty);
4760 TmpRemReg = MRI.createGenericVirtualRegister(Ty);
4761 break;
4762 }
4763 }
4764
4765 if (Ty == S32)
4766 legalizeUnsignedDIV_REM32Impl(B, TmpDivReg, TmpRemReg, LHS, RHS);
4767 else
4768 legalizeUnsignedDIV_REM64Impl(B, TmpDivReg, TmpRemReg, LHS, RHS);
4769
4770 if (DstDivReg) {
4771 auto Sign = B.buildXor(Ty, LHSign, RHSign).getReg(0);
4772 auto SignXor = B.buildXor(Ty, TmpDivReg, Sign).getReg(0);
4773 B.buildSub(DstDivReg, SignXor, Sign);
4774 }
4775
4776 if (DstRemReg) {
4777 auto Sign = LHSign.getReg(0); // Remainder sign is the same as LHS
4778 auto SignXor = B.buildXor(Ty, TmpRemReg, Sign).getReg(0);
4779 B.buildSub(DstRemReg, SignXor, Sign);
4780 }
4781
4782 MI.eraseFromParent();
4783 return true;
4784}
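// A standalone sketch (plain C++, not part of this file) of the signed
// division lowering above: take absolute values with the shift/add/xor trick,
// do the unsigned division, then restore the quotient sign from
// sign(lhs) ^ sign(rhs) and the remainder sign from sign(lhs).
#include <cstdint>
#include <cstdio>

static void sdiv32(int32_t lhs, int32_t rhs, int32_t &q, int32_t &r) {
  uint32_t lsign = (uint32_t)(lhs >> 31); // arithmetic shift: 0 or ~0u
  uint32_t rsign = (uint32_t)(rhs >> 31);
  uint32_t ul = ((uint32_t)lhs + lsign) ^ lsign; // |lhs|
  uint32_t ur = ((uint32_t)rhs + rsign) ^ rsign; // |rhs|
  uint32_t uq = ul / ur, urem = ul % ur;         // the unsigned expansion above
  uint32_t qsign = lsign ^ rsign;
  q = (int32_t)((uq ^ qsign) - qsign);
  r = (int32_t)((urem ^ lsign) - lsign);
}

int main() {
  int32_t q, r;
  sdiv32(-7, 2, q, r);
  std::printf("%d %d (reference %d %d)\n", q, r, -7 / 2, -7 % 2);
}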
4785
4786bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI,
4787 MachineRegisterInfo &MRI,
4788 MachineIRBuilder &B) const {
4789 Register Res = MI.getOperand(0).getReg();
4790 Register LHS = MI.getOperand(1).getReg();
4791 Register RHS = MI.getOperand(2).getReg();
4792 uint16_t Flags = MI.getFlags();
4793 LLT ResTy = MRI.getType(Res);
4794
4795 const MachineFunction &MF = B.getMF();
4796 bool AllowInaccurateRcp = MI.getFlag(MachineInstr::FmAfn) ||
4797 MF.getTarget().Options.UnsafeFPMath;
4798
4799 if (auto CLHS = getConstantFPVRegVal(LHS, MRI)) {
4800 if (!AllowInaccurateRcp && ResTy != LLT::scalar(16))
4801 return false;
4802
4803 // v_rcp_f32 and v_rsq_f32 do not support denormals, and according to
4804 // the CI documentation they have a worst case error of 1 ulp.
4805 // OpenCL requires <= 2.5 ulp for 1.0 / x, so it should always be OK to
4806 // use it as long as we aren't trying to use denormals.
4807 //
4808 // v_rcp_f16 and v_rsq_f16 DO support denormals and 0.51ulp.
4809
4810 // 1 / x -> RCP(x)
4811 if (CLHS->isExactlyValue(1.0)) {
4812 B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res)
4813 .addUse(RHS)
4814 .setMIFlags(Flags);
4815
4816 MI.eraseFromParent();
4817 return true;
4818 }
4819
4820 // -1 / x -> RCP( FNEG(x) )
4821 if (CLHS->isExactlyValue(-1.0)) {
4822 auto FNeg = B.buildFNeg(ResTy, RHS, Flags);
4823 B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res)
4824 .addUse(FNeg.getReg(0))
4825 .setMIFlags(Flags);
4826
4827 MI.eraseFromParent();
4828 return true;
4829 }
4830 }
4831
4832 // For f16 require afn or arcp.
4833 // For f32 require afn.
4834 if (!AllowInaccurateRcp && (ResTy != LLT::scalar(16) ||
4835 !MI.getFlag(MachineInstr::FmArcp)))
4836 return false;
4837
4838 // x / y -> x * (1.0 / y)
4839 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy})
4840 .addUse(RHS)
4841 .setMIFlags(Flags);
4842 B.buildFMul(Res, LHS, RCP, Flags);
4843
4844 MI.eraseFromParent();
4845 return true;
4846}
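// A standalone sketch (plain C++, not part of this file) of the fast-math
// path above: x / y becomes x * rcp(y), and 1/x or -1/x collapse to a single
// (possibly negated) reciprocal. rcp() here is exact; the hardware v_rcp_f32
// it models is only approximate. Names are illustrative.
#include <cstdio>

static float rcp(float y) { return 1.0f / y; } // models v_rcp_f32

static float fastFDiv(float x, float y) {
  if (x == 1.0f)
    return rcp(y);
  if (x == -1.0f)
    return rcp(-y);
  return x * rcp(y);
}

int main() {
  std::printf("%g %g %g\n", fastFDiv(1.0f, 3.0f), fastFDiv(-1.0f, 3.0f),
              fastFDiv(7.0f, 3.0f));
}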
4847
4848bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV64(MachineInstr &MI,
4849 MachineRegisterInfo &MRI,
4850 MachineIRBuilder &B) const {
4851 Register Res = MI.getOperand(0).getReg();
4852 Register X = MI.getOperand(1).getReg();
4853 Register Y = MI.getOperand(2).getReg();
4854 uint16_t Flags = MI.getFlags();
4855 LLT ResTy = MRI.getType(Res);
4856
4857 const MachineFunction &MF = B.getMF();
4858 bool AllowInaccurateRcp = MF.getTarget().Options.UnsafeFPMath ||
4859 MI.getFlag(MachineInstr::FmAfn);
4860
4861 if (!AllowInaccurateRcp)
4862 return false;
4863
4864 auto NegY = B.buildFNeg(ResTy, Y);
4865 auto One = B.buildFConstant(ResTy, 1.0);
4866
4867 auto R = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy})
4868 .addUse(Y)
4869 .setMIFlags(Flags);
4870
4871 auto Tmp0 = B.buildFMA(ResTy, NegY, R, One);
4872 R = B.buildFMA(ResTy, Tmp0, R, R);
4873
4874 auto Tmp1 = B.buildFMA(ResTy, NegY, R, One);
4875 R = B.buildFMA(ResTy, Tmp1, R, R);
4876
4877 auto Ret = B.buildFMul(ResTy, X, R);
4878 auto Tmp2 = B.buildFMA(ResTy, NegY, Ret, X);
4879
4880 B.buildFMA(Res, Tmp2, R, Ret);
4881 MI.eraseFromParent();
4882 return true;
4883}
4884
4885bool AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI,
4886 MachineRegisterInfo &MRI,
4887 MachineIRBuilder &B) const {
4888 if (legalizeFastUnsafeFDIV(MI, MRI, B))
4889 return true;
4890
4891 Register Res = MI.getOperand(0).getReg();
4892 Register LHS = MI.getOperand(1).getReg();
4893 Register RHS = MI.getOperand(2).getReg();
4894
4895 uint16_t Flags = MI.getFlags();
4896
4897 LLT S16 = LLT::scalar(16);
4898 LLT S32 = LLT::scalar(32);
4899
4900 auto LHSExt = B.buildFPExt(S32, LHS, Flags);
4901 auto RHSExt = B.buildFPExt(S32, RHS, Flags);
4902
4903 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32})
4904 .addUse(RHSExt.getReg(0))
4905 .setMIFlags(Flags);
4906
4907 auto QUOT = B.buildFMul(S32, LHSExt, RCP, Flags);
4908 auto RDst = B.buildFPTrunc(S16, QUOT, Flags);
4909
4910 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res)
4911 .addUse(RDst.getReg(0))
4912 .addUse(RHS)
4913 .addUse(LHS)
4914 .setMIFlags(Flags);
4915
4916 MI.eraseFromParent();
4917 return true;
4918}
4919
4920static constexpr unsigned SPDenormModeBitField =
4922
4923// Enable or disable FP32 denorm mode. When 'Enable' is true, emit instructions
4924// to enable denorm mode. When 'Enable' is false, disable denorm mode.
4925static void toggleSPDenormMode(bool Enable, MachineIRBuilder &B,
4926 const GCNSubtarget &ST,
4927 SIModeRegisterDefaults Mode) {
4928 // Set SP denorm mode to this value.
4929 unsigned SPDenormMode =
4930 Enable ? FP_DENORM_FLUSH_NONE : Mode.fpDenormModeSPValue();
4931
4932 if (ST.hasDenormModeInst()) {
4933 // Preserve default FP64FP16 denorm mode while updating FP32 mode.
4934 uint32_t DPDenormModeDefault = Mode.fpDenormModeDPValue();
4935
4936 uint32_t NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2);
4937 B.buildInstr(AMDGPU::S_DENORM_MODE)
4938 .addImm(NewDenormModeValue);
4939
4940 } else {
4941 B.buildInstr(AMDGPU::S_SETREG_IMM32_B32)
4942 .addImm(SPDenormMode)
4943 .addImm(SPDenormModeBitField);
4944 }
4945}
4946
4947bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI,
4948 MachineRegisterInfo &MRI,
4949 MachineIRBuilder &B) const {
4950 if (legalizeFastUnsafeFDIV(MI, MRI, B))
4951 return true;
4952
4953 Register Res = MI.getOperand(0).getReg();
4954 Register LHS = MI.getOperand(1).getReg();
4955 Register RHS = MI.getOperand(2).getReg();
4956 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
4957 SIModeRegisterDefaults Mode = MFI->getMode();
4958
4959 uint16_t Flags = MI.getFlags();
4960
4961 LLT S32 = LLT::scalar(32);
4962 LLT S1 = LLT::scalar(1);
4963
4964 auto One = B.buildFConstant(S32, 1.0f);
4965
4966 auto DenominatorScaled =
4967 B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1})
4968 .addUse(LHS)
4969 .addUse(RHS)
4970 .addImm(0)
4971 .setMIFlags(Flags);
4972 auto NumeratorScaled =
4973 B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1})
4974 .addUse(LHS)
4975 .addUse(RHS)
4976 .addImm(1)
4977 .setMIFlags(Flags);
4978
4979 auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32})
4980 .addUse(DenominatorScaled.getReg(0))
4981 .setMIFlags(Flags);
4982 auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags);
4983
4984 const bool PreservesDenormals = Mode.FP32Denormals == DenormalMode::getIEEE();
4985 const bool HasDynamicDenormals =
4986 (Mode.FP32Denormals.Input == DenormalMode::Dynamic) ||
4987 (Mode.FP32Denormals.Output == DenormalMode::Dynamic);
4988
4989 Register SavedSPDenormMode;
4990 if (!PreservesDenormals) {
4991 if (HasDynamicDenormals) {
4992 SavedSPDenormMode = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
4993 B.buildInstr(AMDGPU::S_GETREG_B32)
4994 .addDef(SavedSPDenormMode)
4995 .addImm(SPDenormModeBitField);
4996 }
4997 toggleSPDenormMode(true, B, ST, Mode);
4998 }
4999
5000 auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags);
5001 auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags);
5002 auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags);
5003 auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags);
5004 auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags);
5005 auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags);
5006
5007 if (!PreservesDenormals) {
5008 if (HasDynamicDenormals) {
5009 assert(SavedSPDenormMode);
5010 B.buildInstr(AMDGPU::S_SETREG_B32)
5011 .addReg(SavedSPDenormMode)
5012 .addImm(SPDenormModeBitField);
5013 } else
5014 toggleSPDenormMode(false, B, ST, Mode);
5015 }
5016
5017 auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32})
5018 .addUse(Fma4.getReg(0))
5019 .addUse(Fma1.getReg(0))
5020 .addUse(Fma3.getReg(0))
5021 .addUse(NumeratorScaled.getReg(1))
5022 .setMIFlags(Flags);
5023
5024 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res)
5025 .addUse(Fmas.getReg(0))
5026 .addUse(RHS)
5027 .addUse(LHS)
5028 .setMIFlags(Flags);
5029
5030 MI.eraseFromParent();
5031 return true;
5032}
5033
5034bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI,
5035 MachineRegisterInfo &MRI,
5036 MachineIRBuilder &B) const {
5037 if (legalizeFastUnsafeFDIV64(MI, MRI, B))
5038 return true;
5039
5040 Register Res = MI.getOperand(0).getReg();
5041 Register LHS = MI.getOperand(1).getReg();
5042 Register RHS = MI.getOperand(2).getReg();
5043
5044 uint16_t Flags = MI.getFlags();
5045
5046 LLT S64 = LLT::scalar(64);
5047 LLT S1 = LLT::scalar(1);
5048
5049 auto One = B.buildFConstant(S64, 1.0);
5050
5051 auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1})
5052 .addUse(LHS)
5053 .addUse(RHS)
5054 .addImm(0)
5055 .setMIFlags(Flags);
5056
5057 auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags);
5058
5059 auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64})
5060 .addUse(DivScale0.getReg(0))
5061 .setMIFlags(Flags);
5062
5063 auto Fma0 = B.buildFMA(S64, NegDivScale0, Rcp, One, Flags);
5064 auto Fma1 = B.buildFMA(S64, Rcp, Fma0, Rcp, Flags);
5065 auto Fma2 = B.buildFMA(S64, NegDivScale0, Fma1, One, Flags);
5066
5067 auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1})
5068 .addUse(LHS)
5069 .addUse(RHS)
5070 .addImm(1)
5071 .setMIFlags(Flags);
5072
5073 auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags);
5074 auto Mul = B.buildFMul(S64, DivScale1.getReg(0), Fma3, Flags);
5075 auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags);
5076
5077 Register Scale;
5078 if (!ST.hasUsableDivScaleConditionOutput()) {
5079 // Workaround a hardware bug on SI where the condition output from div_scale
5080 // is not usable.
5081
5082 LLT S32 = LLT::scalar(32);
5083
5084 auto NumUnmerge = B.buildUnmerge(S32, LHS);
5085 auto DenUnmerge = B.buildUnmerge(S32, RHS);
5086 auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0);
5087 auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1);
5088
5089 auto CmpNum = B.buildICmp(ICmpInst::ICMP_EQ, S1, NumUnmerge.getReg(1),
5090 Scale1Unmerge.getReg(1));
5091 auto CmpDen = B.buildICmp(ICmpInst::ICMP_EQ, S1, DenUnmerge.getReg(1),
5092 Scale0Unmerge.getReg(1));
5093 Scale = B.buildXor(S1, CmpNum, CmpDen).getReg(0);
5094 } else {
5095 Scale = DivScale1.getReg(1);
5096 }
5097
5098 auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64})
5099 .addUse(Fma4.getReg(0))
5100 .addUse(Fma3.getReg(0))
5101 .addUse(Mul.getReg(0))
5102 .addUse(Scale)
5103 .setMIFlags(Flags);
5104
5105 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, ArrayRef(Res))
5106 .addUse(Fmas.getReg(0))
5107 .addUse(RHS)
5108 .addUse(LHS)
5109 .setMIFlags(Flags);
5110
5111 MI.eraseFromParent();
5112 return true;
5113}
5114
5115bool AMDGPULegalizerInfo::legalizeFFREXP(MachineInstr &MI,
5116 MachineRegisterInfo &MRI,
5117 MachineIRBuilder &B) const {
5118 Register Res0 = MI.getOperand(0).getReg();
5119 Register Res1 = MI.getOperand(1).getReg();
5120 Register Val = MI.getOperand(2).getReg();
5121 uint16_t Flags = MI.getFlags();
5122
5123 LLT Ty = MRI.getType(Res0);
5124 LLT InstrExpTy = Ty == LLT::scalar(16) ? LLT::scalar(16) : LLT::scalar(32);
5125
5126 auto Mant = B.buildIntrinsic(Intrinsic::amdgcn_frexp_mant, {Ty})
5127 .addUse(Val)
5128 .setMIFlags(Flags);
5129 auto Exp = B.buildIntrinsic(Intrinsic::amdgcn_frexp_exp, {InstrExpTy})
5130 .addUse(Val)
5131 .setMIFlags(Flags);
5132
5133 if (ST.hasFractBug()) {
5134 auto Fabs = B.buildFAbs(Ty, Val);
5135 auto Inf = B.buildFConstant(Ty, APFloat::getInf(getFltSemanticForLLT(Ty)));
5136 auto IsFinite =
5137 B.buildFCmp(CmpInst::FCMP_OLT, LLT::scalar(1), Fabs, Inf, Flags);
5138 auto Zero = B.buildConstant(InstrExpTy, 0);
5139 Exp = B.buildSelect(InstrExpTy, IsFinite, Exp, Zero);
5140 Mant = B.buildSelect(Ty, IsFinite, Mant, Val);
5141 }
5142
5143 B.buildCopy(Res0, Mant);
5144 B.buildSExtOrTrunc(Res1, Exp);
5145
5146 MI.eraseFromParent();
5147 return true;
5148}
5149
5150bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI,
5151 MachineRegisterInfo &MRI,
5152 MachineIRBuilder &B) const {
5153 Register Res = MI.getOperand(0).getReg();
5154 Register LHS = MI.getOperand(2).getReg();
5155 Register RHS = MI.getOperand(3).getReg();
5156 uint16_t Flags = MI.getFlags();
5157
5158 LLT S32 = LLT::scalar(32);
5159 LLT S1 = LLT::scalar(1);
5160
5161 auto Abs = B.buildFAbs(S32, RHS, Flags);
5162 const APFloat C0Val(1.0f);
5163
5164 auto C0 = B.buildFConstant(S32, 0x1p+96f);
5165 auto C1 = B.buildFConstant(S32, 0x1p-32f);
5166 auto C2 = B.buildFConstant(S32, 1.0f);
5167
5168 auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags);
5169 auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags);
5170
5171 auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags);
5172
5173 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32})
5174 .addUse(Mul0.getReg(0))
5175 .setMIFlags(Flags);
5176
5177 auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags);
5178
5179 B.buildFMul(Res, Sel, Mul1, Flags);
5180
5181 MI.eraseFromParent();
5182 return true;
5183}
5184
5185bool AMDGPULegalizerInfo::legalizeFSQRTF16(MachineInstr &MI,
5186 MachineRegisterInfo &MRI,
5187 MachineIRBuilder &B) const {
5188 // Bypass the correct expansion that a standard promotion through G_FSQRT
5189 // would get. The f32 op is accurate enough for the f16 case.
5190 unsigned Flags = MI.getFlags();
5191 assert(!ST.has16BitInsts());
5192 const LLT F32 = LLT::scalar(32);
5193 auto Ext = B.buildFPExt(F32, MI.getOperand(1), Flags);
5194 auto Log2 = B.buildIntrinsic(Intrinsic::amdgcn_sqrt, {F32})
5195 .addUse(Ext.getReg(0))
5196 .setMIFlags(Flags);
5197 B.buildFPTrunc(MI.getOperand(0), Log2, Flags);
5198 MI.eraseFromParent();
5199 return true;
5200}
5201
5202bool AMDGPULegalizerInfo::legalizeFSQRTF32(MachineInstr &MI,
5203 MachineRegisterInfo &MRI,
5204 MachineIRBuilder &B) const {
5205 MachineFunction &MF = B.getMF();
5206 Register Dst = MI.getOperand(0).getReg();
5207 Register X = MI.getOperand(1).getReg();
5208 const unsigned Flags = MI.getFlags();
5209 const LLT S1 = LLT::scalar(1);
5210 const LLT F32 = LLT::scalar(32);
5211 const LLT I32 = LLT::scalar(32);
5212
5213 if (allowApproxFunc(MF, Flags)) {
5214 B.buildIntrinsic(Intrinsic::amdgcn_sqrt, ArrayRef<Register>({Dst}))
5215 .addUse(X)
5216 .setMIFlags(Flags);
5217 MI.eraseFromParent();
5218 return true;
5219 }
5220
5221 auto ScaleThreshold = B.buildFConstant(F32, 0x1.0p-96f);
5222 auto NeedScale = B.buildFCmp(CmpInst::FCMP_OGT, S1, ScaleThreshold, X, Flags);
5223 auto ScaleUpFactor = B.buildFConstant(F32, 0x1.0p+32f);
5224 auto ScaledX = B.buildFMul(F32, X, ScaleUpFactor, Flags);
5225 auto SqrtX = B.buildSelect(F32, NeedScale, ScaledX, X, Flags);
5226
5227 Register SqrtS = MRI.createGenericVirtualRegister(F32);
5228 if (needsDenormHandlingF32(MF, X, Flags)) {
5229 B.buildIntrinsic(Intrinsic::amdgcn_sqrt, ArrayRef<Register>({SqrtS}))
5230 .addUse(SqrtX.getReg(0))
5231 .setMIFlags(Flags);
5232
5233 auto NegOne = B.buildConstant(I32, -1);
5234 auto SqrtSNextDown = B.buildAdd(I32, SqrtS, NegOne);
5235
5236 auto NegSqrtSNextDown = B.buildFNeg(F32, SqrtSNextDown, Flags);
5237 auto SqrtVP = B.buildFMA(F32, NegSqrtSNextDown, SqrtS, SqrtX, Flags);
5238
5239 auto PosOne = B.buildConstant(I32, 1);
5240 auto SqrtSNextUp = B.buildAdd(I32, SqrtS, PosOne);
5241
5242 auto NegSqrtSNextUp = B.buildFNeg(F32, SqrtSNextUp, Flags);
5243 auto SqrtVS = B.buildFMA(F32, NegSqrtSNextUp, SqrtS, SqrtX, Flags);
5244
5245 auto Zero = B.buildFConstant(F32, 0.0f);
5246 auto SqrtVPLE0 = B.buildFCmp(CmpInst::FCMP_OLE, S1, SqrtVP, Zero, Flags);
5247
5248 SqrtS =
5249 B.buildSelect(F32, SqrtVPLE0, SqrtSNextDown, SqrtS, Flags).getReg(0);
5250
5251 auto SqrtVPVSGT0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, SqrtVS, Zero, Flags);
5252 SqrtS =
5253 B.buildSelect(F32, SqrtVPVSGT0, SqrtSNextUp, SqrtS, Flags).getReg(0);
5254 } else {
5255 auto SqrtR =
5256 B.buildIntrinsic(Intrinsic::amdgcn_rsq, {F32}).addReg(SqrtX.getReg(0));
5257 B.buildFMul(SqrtS, SqrtX, SqrtR, Flags);
5258
5259 auto Half = B.buildFConstant(F32, 0.5f);
5260 auto SqrtH = B.buildFMul(F32, SqrtR, Half, Flags);
5261 auto NegSqrtH = B.buildFNeg(F32, SqrtH, Flags);
5262 auto SqrtE = B.buildFMA(F32, NegSqrtH, SqrtS, Half, Flags);
5263 SqrtH = B.buildFMA(F32, SqrtH, SqrtE, SqrtH, Flags);
5264 SqrtS = B.buildFMA(F32, SqrtS, SqrtE, SqrtS, Flags).getReg(0);
5265 auto NegSqrtS = B.buildFNeg(F32, SqrtS, Flags);
5266 auto SqrtD = B.buildFMA(F32, NegSqrtS, SqrtS, SqrtX, Flags);
5267 SqrtS = B.buildFMA(F32, SqrtD, SqrtH, SqrtS, Flags).getReg(0);
5268 }
5269
5270 auto ScaleDownFactor = B.buildFConstant(F32, 0x1.0p-16f);
5271
5272 auto ScaledDown = B.buildFMul(F32, SqrtS, ScaleDownFactor, Flags);
5273
5274 SqrtS = B.buildSelect(F32, NeedScale, ScaledDown, SqrtS, Flags).getReg(0);
5275
5276 auto IsZeroOrInf = B.buildIsFPClass(LLT::scalar(1), SqrtX, fcZero | fcPosInf);
5277 B.buildSelect(Dst, IsZeroOrInf, SqrtX, SqrtS, Flags);
5278
5279 MI.eraseFromParent();
5280 return true;
5281}
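
// A standalone scalar sketch of the one-ulp fixup emitted above when f32
// denormals are enabled: treating the float's bit pattern as an integer, the
// +/-1 adds step to the adjacent representable values, and the FMA residuals
// r = x - s_adjacent * s decide whether the estimate must move down or up.
#include <cmath>
#include <cstdint>
#include <cstring>

static float stepUlp(float V, int32_t N) {
  uint32_t Bits;
  std::memcpy(&Bits, &V, sizeof(Bits));
  Bits += static_cast<uint32_t>(N); // integer add walks adjacent floats
  std::memcpy(&V, &Bits, sizeof(Bits));
  return V;
}

static float sqrtUlpFixupModel(float X, float S) {
  const float Down = stepUlp(S, -1);
  const float Up = stepUlp(S, +1);
  const float ResidDown = std::fma(-Down, S, X); // x - (s - 1ulp) * s
  const float ResidUp = std::fma(-Up, S, X);     // x - (s + 1ulp) * s
  if (ResidDown <= 0.0f)
    S = Down;
  if (ResidUp > 0.0f)
    S = Up;
  return S;
}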
5282
5283bool AMDGPULegalizerInfo::legalizeFSQRTF64(MachineInstr &MI,
5284 MachineRegisterInfo &MRI,
5285 MachineIRBuilder &B) const {
5286 // For the double type, the SQRT and RSQ instructions don't have the required
5287 // precision, so we apply Goldschmidt's algorithm to improve the result:
5288 //
5289 // y0 = rsq(x)
5290 // g0 = x * y0
5291 // h0 = 0.5 * y0
5292 //
5293 // r0 = 0.5 - h0 * g0
5294 // g1 = g0 * r0 + g0
5295 // h1 = h0 * r0 + h0
5296 //
5297 // r1 = 0.5 - h1 * g1 => d0 = x - g1 * g1
5298 // g2 = g1 * r1 + g1 g2 = d0 * h1 + g1
5299 // h2 = h1 * r1 + h1
5300 //
5301 // r2 = 0.5 - h2 * g2 => d1 = x - g2 * g2
5302 // g3 = g2 * r2 + g2 g3 = d1 * h1 + g2
5303 //
5304 // sqrt(x) = g3
5305
5306 const LLT S1 = LLT::scalar(1);
5307 const LLT S32 = LLT::scalar(32);
5308 const LLT F64 = LLT::scalar(64);
5309
5310 Register Dst = MI.getOperand(0).getReg();
5311 assert(MRI.getType(Dst) == F64 && "only expect to lower f64 sqrt");
5312
5313 Register X = MI.getOperand(1).getReg();
5314 unsigned Flags = MI.getFlags();
5315
5316 auto ScaleConstant = B.buildFConstant(F64, 0x1.0p-767);
5317
5318 auto ZeroInt = B.buildConstant(S32, 0);
5319 auto Scaling = B.buildFCmp(FCmpInst::FCMP_OLT, S1, X, ScaleConstant);
5320
5321 // Scale up input if it is too small.
5322 auto ScaleUpFactor = B.buildConstant(S32, 256);
5323 auto ScaleUp = B.buildSelect(S32, Scaling, ScaleUpFactor, ZeroInt);
5324 auto SqrtX = B.buildFLdexp(F64, X, ScaleUp, Flags);
5325
5326 auto SqrtY =
5327 B.buildIntrinsic(Intrinsic::amdgcn_rsq, {F64}).addReg(SqrtX.getReg(0));
5328
5329 auto Half = B.buildFConstant(F64, 0.5);
5330 auto SqrtH0 = B.buildFMul(F64, SqrtY, Half);
5331 auto SqrtS0 = B.buildFMul(F64, SqrtX, SqrtY);
5332
5333 auto NegSqrtH0 = B.buildFNeg(F64, SqrtH0);
5334 auto SqrtR0 = B.buildFMA(F64, NegSqrtH0, SqrtS0, Half);
5335
5336 auto SqrtS1 = B.buildFMA(F64, SqrtS0, SqrtR0, SqrtS0);
5337 auto SqrtH1 = B.buildFMA(F64, SqrtH0, SqrtR0, SqrtH0);
5338
5339 auto NegSqrtS1 = B.buildFNeg(F64, SqrtS1);
5340 auto SqrtD0 = B.buildFMA(F64, NegSqrtS1, SqrtS1, SqrtX);
5341
5342 auto SqrtS2 = B.buildFMA(F64, SqrtD0, SqrtH1, SqrtS1);
5343
5344 auto NegSqrtS2 = B.buildFNeg(F64, SqrtS2);
5345 auto SqrtD1 = B.buildFMA(F64, NegSqrtS2, SqrtS2, SqrtX);
5346
5347 auto SqrtRet = B.buildFMA(F64, SqrtD1, SqrtH1, SqrtS2);
5348
5349 // Scale down the result.
5350 auto ScaleDownFactor = B.buildConstant(S32, -128);
5351 auto ScaleDown = B.buildSelect(S32, Scaling, ScaleDownFactor, ZeroInt);
5352 SqrtRet = B.buildFLdexp(F64, SqrtRet, ScaleDown, Flags);
5353
5354 // TODO: Switch to fcmp oeq 0 for finite only. Can't fully remove this check
5355 // with finite only or nsz because rsq(+/-0) = +/-inf
5356
5357 // TODO: Check for DAZ and expand to subnormals
5358 auto IsZeroOrInf = B.buildIsFPClass(LLT::scalar(1), SqrtX, fcZero | fcPosInf);
5359
5360 // If x is +INF, +0, or -0, use its original value
5361 B.buildSelect(Dst, IsZeroOrInf, SqrtX, SqrtRet, Flags);
5362
5363 MI.eraseFromParent();
5364 return true;
5365}
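
// A standalone scalar model (illustration only) of the Goldschmidt refinement
// documented above, with 1.0 / std::sqrt(x) standing in for the hardware rsq
// seed; the variable names follow the comment's g/h/r/d recurrence.
#include <cmath>

static double sqrtGoldschmidtModel(double X) {
  const double Y0 = 1.0 / std::sqrt(X); // rsq(x) seed
  double G = X * Y0;                    // g0 ~ sqrt(x)
  double H = 0.5 * Y0;                  // h0 ~ 0.5 / sqrt(x)

  const double R = std::fma(-H, G, 0.5); // r0 = 0.5 - h0 * g0
  G = std::fma(G, R, G);                 // g1
  H = std::fma(H, R, H);                 // h1

  double D = std::fma(-G, G, X); // d0 = x - g1 * g1
  G = std::fma(D, H, G);         // g2 = d0 * h1 + g1

  D = std::fma(-G, G, X);        // d1 = x - g2 * g2
  return std::fma(D, H, G);      // g3 = d1 * h1 + g2 ~ sqrt(x)
}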
5366
5367bool AMDGPULegalizerInfo::legalizeFSQRT(MachineInstr &MI,
5368 MachineRegisterInfo &MRI,
5369 MachineIRBuilder &B) const {
5370 LLT Ty = MRI.getType(MI.getOperand(0).getReg());
5371 if (Ty == LLT::scalar(32))
5372 return legalizeFSQRTF32(MI, MRI, B);
5373 if (Ty == LLT::scalar(64))
5374 return legalizeFSQRTF64(MI, MRI, B);
5375 if (Ty == LLT::scalar(16))
5376 return legalizeFSQRTF16(MI, MRI, B);
5377 return false;
5378}
5379
5380// Expand llvm.amdgcn.rsq.clamp on targets that don't support the instruction.
5381// FIXME: Why do we handle this one but not other removed instructions?
5382//
5383// Reciprocal square root. The clamp prevents infinite results, clamping
5384// infinities to max_float. D.f = 1.0 / sqrt(S0.f), result clamped to
5385// +-max_float.
5388 MachineIRBuilder &B) const {
5390 return true;
5391
5392 Register Dst = MI.getOperand(0).getReg();
5393 Register Src = MI.getOperand(2).getReg();
5394 auto Flags = MI.getFlags();
5395
5396 LLT Ty = MRI.getType(Dst);
5397
5398 const fltSemantics *FltSemantics;
5399 if (Ty == LLT::scalar(32))
5400 FltSemantics = &APFloat::IEEEsingle();
5401 else if (Ty == LLT::scalar(64))
5402 FltSemantics = &APFloat::IEEEdouble();
5403 else
5404 return false;
5405
5406 auto Rsq = B.buildIntrinsic(Intrinsic::amdgcn_rsq, {Ty})
5407 .addUse(Src)
5408 .setMIFlags(Flags);
5409
5410 // We don't need to concern ourselves with the snan handling difference, since
5411 // the rsq quiets (or not) either way, so use the variant which will directly select.
5412 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
5413 const bool UseIEEE = MFI->getMode().IEEE;
5414
5415 auto MaxFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics));
5416 auto ClampMax = UseIEEE ? B.buildFMinNumIEEE(Ty, Rsq, MaxFlt, Flags) :
5417 B.buildFMinNum(Ty, Rsq, MaxFlt, Flags);
5418
5419 auto MinFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics, true));
5420
5421 if (UseIEEE)
5422 B.buildFMaxNumIEEE(Dst, ClampMax, MinFlt, Flags);
5423 else
5424 B.buildFMaxNum(Dst, ClampMax, MinFlt, Flags);
5425 MI.eraseFromParent();
5426 return true;
5427}
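
// Standalone scalar sketch of the clamp being emitted above (NaN-propagation
// differences between the IEEE and non-IEEE min/max variants are ignored, and
// 1/sqrt stands in for amdgcn_rsq): infinities collapse to +/-max_float.
#include <algorithm>
#include <cmath>
#include <limits>

static float rsqClampModel(float X) {
  const float MaxFlt = std::numeric_limits<float>::max();
  float R = 1.0f / std::sqrt(X); // models rsq; rsq(+/-0) is +/-inf
  R = std::min(R, MaxFlt);       // models fminnum with +max_float
  R = std::max(R, -MaxFlt);      // models fmaxnum with -max_float
  return R;
}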
5428
5429// TODO: Fix pointer type handling
5430bool AMDGPULegalizerInfo::legalizeLaneOp(LegalizerHelper &Helper,
5431 MachineInstr &MI,
5432 Intrinsic::ID IID) const {
5433
5434 MachineIRBuilder &B = Helper.MIRBuilder;
5435 MachineRegisterInfo &MRI = *B.getMRI();
5436
5437 bool IsPermLane16 = IID == Intrinsic::amdgcn_permlane16 ||
5438 IID == Intrinsic::amdgcn_permlanex16;
5439
5440 auto createLaneOp = [&IID, &B, &MI](Register Src0, Register Src1,
5441 Register Src2, LLT VT) -> Register {
5442 auto LaneOp = B.buildIntrinsic(IID, {VT}).addUse(Src0);
5443 switch (IID) {
5444 case Intrinsic::amdgcn_readfirstlane:
5445 case Intrinsic::amdgcn_permlane64:
5446 return LaneOp.getReg(0);
5447 case Intrinsic::amdgcn_readlane:
5448 return LaneOp.addUse(Src1).getReg(0);
5449 case Intrinsic::amdgcn_writelane:
5450 return LaneOp.addUse(Src1).addUse(Src2).getReg(0);
5451 case Intrinsic::amdgcn_permlane16:
5452 case Intrinsic::amdgcn_permlanex16: {
5453 Register Src3 = MI.getOperand(5).getReg();
5454 int64_t Src4 = MI.getOperand(6).getImm();
5455 int64_t Src5 = MI.getOperand(7).getImm();
5456 return LaneOp.addUse(Src1)
5457 .addUse(Src2)
5458 .addUse(Src3)
5459 .addImm(Src4)
5460 .addImm(Src5)
5461 .getReg(0);
5462 }
5463 default:
5464 llvm_unreachable("unhandled lane op");
5465 }
5466 };
5467
5468 Register DstReg = MI.getOperand(0).getReg();
5469 Register Src0 = MI.getOperand(2).getReg();
5470 Register Src1, Src2;
5471 if (IID == Intrinsic::amdgcn_readlane || IID == Intrinsic::amdgcn_writelane ||
5472 IsPermLane16) {
5473 Src1 = MI.getOperand(3).getReg();
5474 if (IID == Intrinsic::amdgcn_writelane || IsPermLane16) {
5475 Src2 = MI.getOperand(4).getReg();
5476 }
5477 }
5478
5479 LLT Ty = MRI.getType(DstReg);
5480 unsigned Size = Ty.getSizeInBits();
5481
5482 if (Size == 32) {
5483 // Already legal
5484 return true;
5485 }
5486
5487 if (Size < 32) {
5488 Src0 = B.buildAnyExt(S32, Src0).getReg(0);
5489
5490 if (IsPermLane16)
5491 Src1 = B.buildAnyExt(LLT::scalar(32), Src1).getReg(0);
5492
5493 if (IID == Intrinsic::amdgcn_writelane)
5494 Src2 = B.buildAnyExt(LLT::scalar(32), Src2).getReg(0);
5495
5496 Register LaneOpDst = createLaneOp(Src0, Src1, Src2, S32);
5497 B.buildTrunc(DstReg, LaneOpDst);
5498 MI.eraseFromParent();
5499 return true;
5500 }
5501
5502 if (Size % 32 != 0)
5503 return false;
5504
5505 LLT PartialResTy = S32;
5506 if (Ty.isVector()) {
5507 LLT EltTy = Ty.getElementType();
5508 switch (EltTy.getSizeInBits()) {
5509 case 16:
5510 PartialResTy = Ty.changeElementCount(ElementCount::getFixed(2));
5511 break;
5512 case 32:
5513 PartialResTy = EltTy;
5514 break;
5515 default:
5516 // Handle all other cases via S32 pieces.
5517 break;
5518 }
5519 }
5520
5521 SmallVector<Register, 2> PartialRes;
5522 unsigned NumParts = Size / 32;
5523 MachineInstrBuilder Src0Parts = B.buildUnmerge(PartialResTy, Src0);
5524 MachineInstrBuilder Src1Parts, Src2Parts;
5525
5526 if (IsPermLane16)
5527 Src1Parts = B.buildUnmerge(PartialResTy, Src1);
5528
5529 if (IID == Intrinsic::amdgcn_writelane)
5530 Src2Parts = B.buildUnmerge(PartialResTy, Src2);
5531
5532 for (unsigned i = 0; i < NumParts; ++i) {
5533 Src0 = Src0Parts.getReg(i);
5534
5535 if (IsPermLane16)
5536 Src1 = Src1Parts.getReg(i);
5537
5538 if (IID == Intrinsic::amdgcn_writelane)
5539 Src2 = Src2Parts.getReg(i);
5540
5541 PartialRes.push_back(createLaneOp(Src0, Src1, Src2, PartialResTy));
5542 }
5543
5544 B.buildMergeLikeInstr(DstReg, PartialRes);
5545 MI.eraseFromParent();
5546 return true;
5547}
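
// Illustrative stand-in (plain C++, not builder code) for the wide-type path
// above: unmerge the value into 32-bit pieces, run the 32-bit lane operation
// on each piece, and merge the results back together.
#include <cstdint>

static uint64_t laneOp64Model(uint64_t Val, uint32_t (*LaneOp32)(uint32_t)) {
  uint32_t Lo = static_cast<uint32_t>(Val);       // piece 0 of G_UNMERGE_VALUES
  uint32_t Hi = static_cast<uint32_t>(Val >> 32); // piece 1
  Lo = LaneOp32(Lo);                              // e.g. a 32-bit readlane
  Hi = LaneOp32(Hi);
  return (static_cast<uint64_t>(Hi) << 32) | Lo;  // G_MERGE_VALUES
}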
5548
5551 MachineIRBuilder &B) const {
5555 LLT DstTy = MRI.getType(DstReg);
5556 LLT IdxTy = LLT::scalar(DstTy.getSizeInBits());
5557
5558 Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy);
5559 if (!loadInputValue(KernargPtrReg, B,
5561 return false;
5562
5563 // FIXME: This should be nuw
5564 B.buildPtrAdd(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0));
5565 return true;
5566}
5567
5568/// To create a buffer resource from a 64-bit pointer, mask off the upper 32
5569/// bits of the pointer and replace them with the stride argument, then
5570/// merge_values everything together. In the common case of a raw buffer (the
5571/// stride component is 0), we can just AND off the upper half.
5572bool AMDGPULegalizerInfo::legalizePointerAsRsrcIntrin(
5573 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
5574 Register Result = MI.getOperand(0).getReg();
5575 Register Pointer = MI.getOperand(2).getReg();
5576 Register Stride = MI.getOperand(3).getReg();
5577 Register NumRecords = MI.getOperand(4).getReg();
5578 Register Flags = MI.getOperand(5).getReg();
5579
5580 LLT S32 = LLT::scalar(32);
5581
5582 B.setInsertPt(B.getMBB(), ++B.getInsertPt());
5583 auto Unmerge = B.buildUnmerge(S32, Pointer);
5584 Register LowHalf = Unmerge.getReg(0);
5585 Register HighHalf = Unmerge.getReg(1);
5586
5587 auto AndMask = B.buildConstant(S32, 0x0000ffff);
5588 auto Masked = B.buildAnd(S32, HighHalf, AndMask);
5589
5590 MachineInstrBuilder NewHighHalf = Masked;
5591 std::optional<ValueAndVReg> StrideConst =
5593 if (!StrideConst || !StrideConst->Value.isZero()) {
5594 MachineInstrBuilder ShiftedStride;
5595 if (StrideConst) {
5596 uint32_t StrideVal = StrideConst->Value.getZExtValue();
5597 uint32_t ShiftedStrideVal = StrideVal << 16;
5598 ShiftedStride = B.buildConstant(S32, ShiftedStrideVal);
5599 } else {
5600 auto ExtStride = B.buildAnyExt(S32, Stride);
5601 auto ShiftConst = B.buildConstant(S32, 16);
5602 ShiftedStride = B.buildShl(S32, ExtStride, ShiftConst);
5603 }
5604 NewHighHalf = B.buildOr(S32, Masked, ShiftedStride);
5605 }
5606 Register NewHighHalfReg = NewHighHalf.getReg(0);
5607 B.buildMergeValues(Result, {LowHalf, NewHighHalfReg, NumRecords, Flags});
5608 MI.eraseFromParent();
5609 return true;
5610}
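
// Host-side sketch (illustration only) of the descriptor words assembled
// above: word1 keeps the low 16 bits of the pointer's high half and places
// the 16-bit stride in its upper half; numrecords and flags pass through.
#include <array>
#include <cstdint>

static std::array<uint32_t, 4> makeBufferRsrcModel(uint64_t Ptr, uint16_t Stride,
                                                   uint32_t NumRecords,
                                                   uint32_t Flags) {
  const uint32_t Word0 = static_cast<uint32_t>(Ptr);
  const uint32_t Word1 = (static_cast<uint32_t>(Ptr >> 32) & 0x0000ffffu) |
                         (static_cast<uint32_t>(Stride) << 16);
  return {Word0, Word1, NumRecords, Flags};
}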
5611
5614 MachineIRBuilder &B) const {
5615 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
5616 if (!MFI->isEntryFunction()) {
5619 }
5620
5621 Register DstReg = MI.getOperand(0).getReg();
5622 if (!getImplicitArgPtr(DstReg, MRI, B))
5623 return false;
5624
5625 MI.eraseFromParent();
5626 return true;
5627}
5628
5631 MachineIRBuilder &B) const {
5632 Function &F = B.getMF().getFunction();
5633 std::optional<uint32_t> KnownSize =
5635 if (KnownSize.has_value())
5636 B.buildConstant(DstReg, *KnownSize);
5637 return false;
5638}
5639
5642 MachineIRBuilder &B) const {
5643
5644 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
5645 if (!MFI->isEntryFunction()) {
5648 }
5649
5650 Register DstReg = MI.getOperand(0).getReg();
5651 if (!getLDSKernelId(DstReg, MRI, B))
5652 return false;
5653
5654 MI.eraseFromParent();
5655 return true;
5656}
5657
5661 unsigned AddrSpace) const {
5662 Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B);
5663 auto Unmerge = B.buildUnmerge(LLT::scalar(32), MI.getOperand(2).getReg());
5664 Register Hi32 = Unmerge.getReg(1);
5665
5666 B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg);
5667 MI.eraseFromParent();
5668 return true;
5669}
5670
5671// The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args:
5672// offset (the offset that is included in bounds checking and swizzling, to be
5673// split between the instruction's voffset and immoffset fields) and soffset
5674// (the offset that is excluded from bounds checking and swizzling, to go in
5675// the instruction's soffset field). This function takes the first kind of
5676// offset and figures out how to split it between voffset and immoffset.
5677std::pair<Register, unsigned>
5679 Register OrigOffset) const {
5680 const unsigned MaxImm = SIInstrInfo::getMaxMUBUFImmOffset(ST);
5681 Register BaseReg;
5682 unsigned ImmOffset;
5683 const LLT S32 = LLT::scalar(32);
5684 MachineRegisterInfo &MRI = *B.getMRI();
5685
5686 std::tie(BaseReg, ImmOffset) =
5688
5689 // If BaseReg is a pointer, convert it to int.
5690 if (MRI.getType(BaseReg).isPointer())
5691 BaseReg = B.buildPtrToInt(MRI.getType(OrigOffset), BaseReg).getReg(0);
5692
5693 // If the immediate value is too big for the immoffset field, put only bits
5694 // that would normally fit in the immoffset field. The remaining value that
5695 // is copied/added for the voffset field is a large power of 2, and it
5696 // stands more chance of being CSEd with the copy/add for another similar
5697 // load/store.
5698 // However, do not do that rounding down if that is a negative
5699 // number, as it appears to be illegal to have a negative offset in the
5700 // vgpr, even if adding the immediate offset makes it positive.
5701 unsigned Overflow = ImmOffset & ~MaxImm;
5702 ImmOffset -= Overflow;
5703 if ((int32_t)Overflow < 0) {
5704 Overflow += ImmOffset;
5705 ImmOffset = 0;
5706 }
5707
5708 if (Overflow != 0) {
5709 if (!BaseReg) {
5710 BaseReg = B.buildConstant(S32, Overflow).getReg(0);
5711 } else {
5712 auto OverflowVal = B.buildConstant(S32, Overflow);
5713 BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0);
5714 }
5715 }
5716
5717 if (!BaseReg)
5718 BaseReg = B.buildConstant(S32, 0).getReg(0);
5719
5720 return std::pair(BaseReg, ImmOffset);
5721}
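
// Scalar sketch (illustration only) of the constant-offset split performed
// above, assuming the immediate field holds offsets up to MaxImm = 2^k - 1
// (e.g. 4095 for MUBUF on many subtargets): keep the low bits in the
// immediate and move the large power-of-two remainder into the register
// operand so it can be CSE'd across accesses. For example, with MaxImm = 4095
// a combined constant offset of 8212 splits into {8192, 20}.
#include <cstdint>
#include <utility>

static std::pair<uint32_t, uint32_t> splitOffsetModel(uint32_t ImmOffset,
                                                      uint32_t MaxImm) {
  uint32_t Overflow = ImmOffset & ~MaxImm; // part that must go in the register
  ImmOffset -= Overflow;
  if (static_cast<int32_t>(Overflow) < 0) { // avoid a negative register offset
    Overflow += ImmOffset;
    ImmOffset = 0;
  }
  return {Overflow, ImmOffset}; // {register part, immediate part}
}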
5722
5723/// Handle register layout difference for f16 images for some subtargets.
5726 Register Reg,
5727 bool ImageStore) const {
5728 const LLT S16 = LLT::scalar(16);
5729 const LLT S32 = LLT::scalar(32);
5730 LLT StoreVT = MRI.getType(Reg);
5731 assert(StoreVT.isVector() && StoreVT.getElementType() == S16);
5732
5733 if (ST.hasUnpackedD16VMem()) {
5734 auto Unmerge = B.buildUnmerge(S16, Reg);
5735
5736 SmallVector<Register, 4> WideRegs;
5737 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
5738 WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0));
5739
5740 int NumElts = StoreVT.getNumElements();
5741
5742 return B.buildBuildVector(LLT::fixed_vector(NumElts, S32), WideRegs)
5743 .getReg(0);
5744 }
5745
5746 if (ImageStore && ST.hasImageStoreD16Bug()) {
5747 if (StoreVT.getNumElements() == 2) {
5748 SmallVector<Register, 4> PackedRegs;
5749 Reg = B.buildBitcast(S32, Reg).getReg(0);
5750 PackedRegs.push_back(Reg);
5751 PackedRegs.resize(2, B.buildUndef(S32).getReg(0));
5752 return B.buildBuildVector(LLT::fixed_vector(2, S32), PackedRegs)
5753 .getReg(0);
5754 }
5755
5756 if (StoreVT.getNumElements() == 3) {
5757 SmallVector<Register, 4> PackedRegs;
5758 auto Unmerge = B.buildUnmerge(S16, Reg);
5759 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
5760 PackedRegs.push_back(Unmerge.getReg(I));
5761 PackedRegs.resize(6, B.buildUndef(S16).getReg(0));
5762 Reg = B.buildBuildVector(LLT::fixed_vector(6, S16), PackedRegs).getReg(0);
5763 return B.buildBitcast(LLT::fixed_vector(3, S32), Reg).getReg(0);
5764 }
5765
5766 if (StoreVT.getNumElements() == 4) {
5767 SmallVector<Register, 4> PackedRegs;
5768 Reg = B.buildBitcast(LLT::fixed_vector(2, S32), Reg).getReg(0);
5769 auto Unmerge = B.buildUnmerge(S32, Reg);
5770 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
5771 PackedRegs.push_back(Unmerge.getReg(I));
5772 PackedRegs.resize(4, B.buildUndef(S32).getReg(0));
5773 return B.buildBuildVector(LLT::fixed_vector(4, S32), PackedRegs)
5774 .getReg(0);
5775 }
5776
5777 llvm_unreachable("invalid data type");
5778 }
5779
5780 if (StoreVT == LLT::fixed_vector(3, S16)) {
5781 Reg = B.buildPadVectorWithUndefElements(LLT::fixed_vector(4, S16), Reg)
5782 .getReg(0);
5783 }
5784 return Reg;
5785}
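
// Host-side sketch (illustration only, undef padding modeled as zero) of the
// two f16 register layouts handled above: unpacked subtargets want one dword
// per 16-bit element, while packed subtargets keep two elements per dword.
#include <cstdint>
#include <vector>

static std::vector<uint32_t> unpackedD16Layout(const std::vector<uint16_t> &Elts) {
  std::vector<uint32_t> Out;
  for (uint16_t E : Elts)
    Out.push_back(E); // any-extend each half into its own dword
  return Out;
}

static std::vector<uint32_t> packedD16Layout(const std::vector<uint16_t> &Elts) {
  std::vector<uint32_t> Out;
  for (size_t I = 0; I < Elts.size(); I += 2) {
    const uint32_t Lo = Elts[I];
    const uint32_t Hi = (I + 1 < Elts.size()) ? Elts[I + 1] : 0; // pad odd tail
    Out.push_back(Lo | (Hi << 16));
  }
  return Out;
}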
5786
5788 MachineIRBuilder &B, Register VData, bool IsFormat) const {
5789 MachineRegisterInfo *MRI = B.getMRI();
5790 LLT Ty = MRI->getType(VData);
5791
5792 const LLT S16 = LLT::scalar(16);
5793
5794 // Fixup buffer resources themselves needing to be v4i32.
5795 if (hasBufferRsrcWorkaround(Ty))
5796 return castBufferRsrcToV4I32(VData, B);
5797
5798 // Fixup illegal register types for i8 stores.
5799 if (Ty == LLT::scalar(8) || Ty == S16) {
5800 Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0);
5801 return AnyExt;
5802 }
5803
5804 if (Ty.isVector()) {
5805 if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) {
5806 if (IsFormat)
5807 return handleD16VData(B, *MRI, VData);
5808 }
5809 }
5810
5811 return VData;
5812}
5813
5817 bool IsTyped,
5818 bool IsFormat) const {
5819 Register VData = MI.getOperand(1).getReg();
5820 LLT Ty = MRI.getType(VData);
5821 LLT EltTy = Ty.getScalarType();
5822 const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
5823 const LLT S32 = LLT::scalar(32);
5824
5825 VData = fixStoreSourceType(B, VData, IsFormat);
5827 Register RSrc = MI.getOperand(2).getReg();
5828
5829 MachineMemOperand *MMO = *MI.memoperands_begin();
5830 const int MemSize = MMO->getSize().getValue();
5831
5832 unsigned ImmOffset;
5833
5834 // The typed intrinsics add an immediate after the registers.
5835 const unsigned NumVIndexOps = IsTyped ? 8 : 7;
5836
5837 // The struct intrinsic variants add one additional operand over raw.
5838 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
5839 Register VIndex;
5840 int OpOffset = 0;
5841 if (HasVIndex) {
5842 VIndex = MI.getOperand(3).getReg();
5843 OpOffset = 1;
5844 } else {
5845 VIndex = B.buildConstant(S32, 0).getReg(0);
5846 }
5847
5848 Register VOffset = MI.getOperand(3 + OpOffset).getReg();
5849 Register SOffset = MI.getOperand(4 + OpOffset).getReg();
5850
5851 unsigned Format = 0;
5852 if (IsTyped) {
5853 Format = MI.getOperand(5 + OpOffset).getImm();
5854 ++OpOffset;
5855 }
5856
5857 unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
5858
5859 std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset);
5860
5861 unsigned Opc;
5862 if (IsTyped) {
5863 Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16 :
5864 AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT;
5865 } else if (IsFormat) {
5866 Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16 :
5867 AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT;
5868 } else {
5869 switch (MemSize) {
5870 case 1:
5871 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE;
5872 break;
5873 case 2:
5874 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT;
5875 break;
5876 default:
5877 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE;
5878 break;
5879 }
5880 }
5881
5882 auto MIB = B.buildInstr(Opc)
5883 .addUse(VData) // vdata
5884 .addUse(RSrc) // rsrc
5885 .addUse(VIndex) // vindex
5886 .addUse(VOffset) // voffset
5887 .addUse(SOffset) // soffset
5888 .addImm(ImmOffset); // offset(imm)
5889
5890 if (IsTyped)
5891 MIB.addImm(Format);
5892
5893 MIB.addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm)
5894 .addImm(HasVIndex ? -1 : 0) // idxen(imm)
5895 .addMemOperand(MMO);
5896
5897 MI.eraseFromParent();
5898 return true;
5899}
5900
5901static void buildBufferLoad(unsigned Opc, Register LoadDstReg, Register RSrc,
5902 Register VIndex, Register VOffset, Register SOffset,
5903 unsigned ImmOffset, unsigned Format,
5904 unsigned AuxiliaryData, MachineMemOperand *MMO,
5905 bool IsTyped, bool HasVIndex, MachineIRBuilder &B) {
5906 auto MIB = B.buildInstr(Opc)
5907 .addDef(LoadDstReg) // vdata
5908 .addUse(RSrc) // rsrc
5909 .addUse(VIndex) // vindex
5910 .addUse(VOffset) // voffset
5911 .addUse(SOffset) // soffset
5912 .addImm(ImmOffset); // offset(imm)
5913
5914 if (IsTyped)
5915 MIB.addImm(Format);
5916
5917 MIB.addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm)
5918 .addImm(HasVIndex ? -1 : 0) // idxen(imm)
5919 .addMemOperand(MMO);
5920}
5921
5925 bool IsFormat,
5926 bool IsTyped) const {
5927 // FIXME: Verifier should enforce 1 MMO for these intrinsics.
5928 MachineMemOperand *MMO = *MI.memoperands_begin();
5929 const LLT MemTy = MMO->getMemoryType();
5930 const LLT S32 = LLT::scalar(32);
5931
5932 Register Dst = MI.getOperand(0).getReg();
5933
5934 Register StatusDst;
5935 int OpOffset = 0;
5936 assert(MI.getNumExplicitDefs() == 1 || MI.getNumExplicitDefs() == 2);
5937 bool IsTFE = MI.getNumExplicitDefs() == 2;
5938 if (IsTFE) {
5939 StatusDst = MI.getOperand(1).getReg();
5940 ++OpOffset;
5941 }
5942
5943 castBufferRsrcArgToV4I32(MI, B, 2 + OpOffset);
5944 Register RSrc = MI.getOperand(2 + OpOffset).getReg();
5945
5946 // The typed intrinsics add an immediate after the registers.
5947 const unsigned NumVIndexOps = IsTyped ? 8 : 7;
5948
5949 // The struct intrinsic variants add one additional operand over raw.
5950 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps + OpOffset;
5951 Register VIndex;
5952 if (HasVIndex) {
5953 VIndex = MI.getOperand(3 + OpOffset).getReg();
5954 ++OpOffset;
5955 } else {
5956 VIndex = B.buildConstant(S32, 0).getReg(0);
5957 }
5958
5959 Register VOffset = MI.getOperand(3 + OpOffset).getReg();
5960 Register SOffset = MI.getOperand(4 + OpOffset).getReg();
5961
5962 unsigned Format = 0;
5963 if (IsTyped) {
5964 Format = MI.getOperand(5 + OpOffset).getImm();
5965 ++OpOffset;
5966 }
5967
5968 unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
5969 unsigned ImmOffset;
5970
5971 LLT Ty = MRI.getType(Dst);
5972 // Turn loads of addrspace 8 pointers into 4xs32 loads here, so the rest of the
5973 // logic doesn't have to handle that case.
5974 if (hasBufferRsrcWorkaround(Ty)) {
5975 Ty = castBufferRsrcFromV4I32(MI, B, MRI, 0);
5976 Dst = MI.getOperand(0).getReg();
5977 }
5978 LLT EltTy = Ty.getScalarType();
5979 const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
5980 const bool Unpacked = ST.hasUnpackedD16VMem();
5981
5982 std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset);
5983
5984 unsigned Opc;
5985
5986 // TODO: Support TFE for typed and narrow loads.
5987 if (IsTyped) {
5988 if (IsTFE)
5989 return false;
5990 Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 :
5991 AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT;
5992 } else if (IsFormat) {
5993 if (IsD16) {
5994 if (IsTFE)
5995 return false;
5996 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16;
5997 } else {
5998 Opc = IsTFE ? AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_TFE
5999 : AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT;
6000 }
6001 } else {
6002 switch (MemTy.getSizeInBits()) {
6003 case 8:
6004 Opc = IsTFE ? AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE_TFE
6005 : AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE;
6006 break;
6007 case 16:
6008 Opc = IsTFE ? AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT_TFE
6009 : AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT;
6010 break;
6011 default:
6012 Opc = IsTFE ? AMDGPU::G_AMDGPU_BUFFER_LOAD_TFE
6013 : AMDGPU::G_AMDGPU_BUFFER_LOAD;
6014 break;
6015 }
6016 }
6017
6018 if (IsTFE) {
6019 unsigned NumValueDWords = divideCeil(Ty.getSizeInBits(), 32);
6020 unsigned NumLoadDWords = NumValueDWords + 1;
6021 LLT LoadTy = LLT::fixed_vector(NumLoadDWords, S32);
6022 Register LoadDstReg = B.getMRI()->createGenericVirtualRegister(LoadTy);
6023 buildBufferLoad(Opc, LoadDstReg, RSrc, VIndex, VOffset, SOffset, ImmOffset,
6024 Format, AuxiliaryData, MMO, IsTyped, HasVIndex, B);
6025 if (MemTy.getSizeInBits() < 32) {
6026 Register ExtDst = B.getMRI()->createGenericVirtualRegister(S32);
6027 B.buildUnmerge({ExtDst, StatusDst}, LoadDstReg);
6028 B.buildTrunc(Dst, ExtDst);
6029 } else if (NumValueDWords == 1) {
6030 B.buildUnmerge({Dst, StatusDst}, LoadDstReg);
6031 } else {
6032 SmallVector<Register, 5> LoadElts;
6033 for (unsigned I = 0; I != NumValueDWords; ++I)
6034 LoadElts.push_back(B.getMRI()->createGenericVirtualRegister(S32));
6035 LoadElts.push_back(StatusDst);
6036 B.buildUnmerge(LoadElts, LoadDstReg);
6037 LoadElts.truncate(NumValueDWords);
6038 B.buildMergeLikeInstr(Dst, LoadElts);
6039 }
6040 } else if ((!IsD16 && MemTy.getSizeInBits() < 32) ||
6041 (IsD16 && !Ty.isVector())) {
6042 Register LoadDstReg = B.getMRI()->createGenericVirtualRegister(S32);
6043 buildBufferLoad(Opc, LoadDstReg, RSrc, VIndex, VOffset, SOffset, ImmOffset,
6044 Format, AuxiliaryData, MMO, IsTyped, HasVIndex, B);
6045 B.setInsertPt(B.getMBB(), ++B.getInsertPt());
6046 B.buildTrunc(Dst, LoadDstReg);
6047 } else if (Unpacked && IsD16 && Ty.isVector()) {
6048 LLT UnpackedTy = Ty.changeElementSize(32);
6049 Register LoadDstReg = B.getMRI()->createGenericVirtualRegister(UnpackedTy);
6050 buildBufferLoad(Opc, LoadDstReg, RSrc, VIndex, VOffset, SOffset, ImmOffset,
6051 Format, AuxiliaryData, MMO, IsTyped, HasVIndex, B);
6052 B.setInsertPt(B.getMBB(), ++B.getInsertPt());
6053 // FIXME: G_TRUNC should work, but legalization currently fails
6054 auto Unmerge = B.buildUnmerge(S32, LoadDstReg);
6056 for (unsigned I = 0, N = Unmerge->getNumOperands() - 1; I != N; ++I)
6057 Repack.push_back(B.buildTrunc(EltTy, Unmerge.getReg(I)).getReg(0));
6058 B.buildMergeLikeInstr(Dst, Repack);
6059 } else {
6060 buildBufferLoad(Opc, Dst, RSrc, VIndex, VOffset, SOffset, ImmOffset, Format,
6061 AuxiliaryData, MMO, IsTyped, HasVIndex, B);
6062 }
6063
6064 MI.eraseFromParent();
6065 return true;
6066}
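
// Host-side model (illustration only) of the TFE result layout handled above:
// the instruction returns ceil(bits/32) data dwords followed by one status
// dword in a single contiguous result, which is then unmerged back into the
// {data, status} pair the IR-level intrinsic promised.
#include <cassert>
#include <cstdint>
#include <utility>
#include <vector>

static std::pair<std::vector<uint32_t>, uint32_t>
splitTfeResultModel(const std::vector<uint32_t> &LoadDwords) {
  assert(!LoadDwords.empty() && "need at least the status dword");
  std::vector<uint32_t> Data(LoadDwords.begin(), LoadDwords.end() - 1);
  const uint32_t Status = LoadDwords.back(); // the extra TFE dword
  return {Data, Status};
}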
6067
6068static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) {
6069 switch (IntrID) {
6070 case Intrinsic::amdgcn_raw_buffer_atomic_swap:
6071 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_swap:
6072 case Intrinsic::amdgcn_struct_buffer_atomic_swap:
6073 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_swap:
6074 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP;
6075 case Intrinsic::amdgcn_raw_buffer_atomic_add:
6076 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_add:
6077 case Intrinsic::amdgcn_struct_buffer_atomic_add:
6078 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_add:
6079 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD;
6080 case Intrinsic::amdgcn_raw_buffer_atomic_sub:
6081 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub:
6082 case Intrinsic::amdgcn_struct_buffer_atomic_sub:
6083 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub:
6084 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB;
6085 case Intrinsic::amdgcn_raw_buffer_atomic_smin:
6086 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smin:
6087 case Intrinsic::amdgcn_struct_buffer_atomic_smin:
6088 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smin:
6089 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN;
6090 case Intrinsic::amdgcn_raw_buffer_atomic_umin:
6091 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umin:
6092 case Intrinsic::amdgcn_struct_buffer_atomic_umin:
6093 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umin:
6094 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN;
6095 case Intrinsic::amdgcn_raw_buffer_atomic_smax:
6096 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smax:
6097 case Intrinsic::amdgcn_struct_buffer_atomic_smax:
6098 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smax:
6099 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX;
6100 case Intrinsic::amdgcn_raw_buffer_atomic_umax:
6101 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umax:
6102 case Intrinsic::amdgcn_struct_buffer_atomic_umax:
6103 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umax:
6104 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX;
6105 case Intrinsic::amdgcn_raw_buffer_atomic_and:
6106 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_and:
6107 case Intrinsic::amdgcn_struct_buffer_atomic_and:
6108 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_and:
6109 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND;
6110 case Intrinsic::amdgcn_raw_buffer_atomic_or:
6111 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_or:
6112 case Intrinsic::amdgcn_struct_buffer_atomic_or:
6113 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_or:
6114 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR;
6115 case Intrinsic::amdgcn_raw_buffer_atomic_xor:
6116 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_xor:
6117 case Intrinsic::amdgcn_struct_buffer_atomic_xor:
6118 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_xor:
6119 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR;
6120 case Intrinsic::amdgcn_raw_buffer_atomic_inc:
6121 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_inc:
6122 case Intrinsic::amdgcn_struct_buffer_atomic_inc:
6123 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_inc:
6124 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC;
6125 case Intrinsic::amdgcn_raw_buffer_atomic_dec:
6126 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_dec:
6127 case Intrinsic::amdgcn_struct_buffer_atomic_dec:
6128 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_dec:
6129 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC;
6130 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
6131 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap:
6132 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
6133 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cmpswap:
6134 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP;
6135 case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
6136 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fadd:
6137 case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
6138 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fadd:
6139 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD;
6140 case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
6141 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmin:
6142 case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
6143 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmin:
6144 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN;
6145 case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
6146 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmax:
6147 case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
6148 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmax:
6149 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX;
6150 case Intrinsic::amdgcn_raw_buffer_atomic_cond_sub_u32:
6151 case Intrinsic::amdgcn_struct_buffer_atomic_cond_sub_u32:
6152 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_COND_SUB_U32;
6153 default:
6154 llvm_unreachable("unhandled atomic opcode");
6155 }
6156}
6157
6160 Intrinsic::ID IID) const {
6161 const bool IsCmpSwap =
6162 IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap ||
6163 IID == Intrinsic::amdgcn_struct_buffer_atomic_cmpswap ||
6164 IID == Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap ||
6165 IID == Intrinsic::amdgcn_struct_ptr_buffer_atomic_cmpswap;
6166
6167 Register Dst = MI.getOperand(0).getReg();
6168 // Since we don't have 128-bit atomics, we don't need to handle the case of
6169 // p8 arguments to the atomic itself.
6170 Register VData = MI.getOperand(2).getReg();
6171
6172 Register CmpVal;
6173 int OpOffset = 0;
6174
6175 if (IsCmpSwap) {
6176 CmpVal = MI.getOperand(3).getReg();
6177 ++OpOffset;
6178 }
6179
6180 castBufferRsrcArgToV4I32(MI, B, 3 + OpOffset);
6181 Register RSrc = MI.getOperand(3 + OpOffset).getReg();
6182 const unsigned NumVIndexOps = IsCmpSwap ? 9 : 8;
6183
6184 // The struct intrinsic variants add one additional operand over raw.
6185 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
6186 Register VIndex;
6187 if (HasVIndex) {
6188 VIndex = MI.getOperand(4 + OpOffset).getReg();
6189 ++OpOffset;
6190 } else {
6191 VIndex = B.buildConstant(LLT::scalar(32), 0).getReg(0);
6192 }
6193
6194 Register VOffset = MI.getOperand(4 + OpOffset).getReg();
6195 Register SOffset = MI.getOperand(5 + OpOffset).getReg();
6196 unsigned AuxiliaryData = MI.getOperand(6 + OpOffset).getImm();
6197
6198 MachineMemOperand *MMO = *MI.memoperands_begin();
6199
6200 unsigned ImmOffset;
6201 std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset);
6202
6203 auto MIB = B.buildInstr(getBufferAtomicPseudo(IID))
6204 .addDef(Dst)
6205 .addUse(VData); // vdata
6206
6207 if (IsCmpSwap)
6208 MIB.addReg(CmpVal);
6209
6210 MIB.addUse(RSrc) // rsrc
6211 .addUse(VIndex) // vindex
6212 .addUse(VOffset) // voffset
6213 .addUse(SOffset) // soffset
6214 .addImm(ImmOffset) // offset(imm)
6215 .addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm)
6216 .addImm(HasVIndex ? -1 : 0) // idxen(imm)
6217 .addMemOperand(MMO);
6218
6219 MI.eraseFromParent();
6220 return true;
6221}
6222
6223/// Turn a set of s16 typed registers in \p AddrRegs into a dword sized
6224/// vector with s16 typed elements.
6226 SmallVectorImpl<Register> &PackedAddrs,
6227 unsigned ArgOffset,
6229 bool IsA16, bool IsG16) {
6230 const LLT S16 = LLT::scalar(16);
6231 const LLT V2S16 = LLT::fixed_vector(2, 16);
6232 auto EndIdx = Intr->VAddrEnd;
6233
6234 for (unsigned I = Intr->VAddrStart; I < EndIdx; I++) {
6235 MachineOperand &SrcOp = MI.getOperand(ArgOffset + I);
6236 if (!SrcOp.isReg())
6237 continue; // _L to _LZ may have eliminated this.
6238
6239 Register AddrReg = SrcOp.getReg();
6240
6241 if ((I < Intr->GradientStart) ||
6242 (I >= Intr->GradientStart && I < Intr->CoordStart && !IsG16) ||
6243 (I >= Intr->CoordStart && !IsA16)) {
6244 if ((I < Intr->GradientStart) && IsA16 &&
6245 (B.getMRI()->getType(AddrReg) == S16)) {
6246 assert(I == Intr->BiasIndex && "Got unexpected 16-bit extra argument");
6247 // Special handling of bias when A16 is on. Bias is of type half but
6248 // occupies full 32-bit.
6249 PackedAddrs.push_back(
6250 B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)})
6251 .getReg(0));
6252 } else {
6253 assert((!IsA16 || Intr->NumBiasArgs == 0 || I != Intr->BiasIndex) &&
6254 "Bias needs to be converted to 16 bit in A16 mode");
6255 // Handle any gradient or coordinate operands that should not be packed
6256 AddrReg = B.buildBitcast(V2S16, AddrReg).getReg(0);
6257 PackedAddrs.push_back(AddrReg);
6258 }
6259 } else {
6260 // Dz/dh, dz/dv and the last odd coord are packed with undef. Also, in 1D,
6261 // derivatives dx/dh and dx/dv are packed with undef.
6262 if (((I + 1) >= EndIdx) ||
6263 ((Intr->NumGradients / 2) % 2 == 1 &&
6264 (I == static_cast<unsigned>(Intr->GradientStart +
6265 (Intr->NumGradients / 2) - 1) ||
6266 I == static_cast<unsigned>(Intr->GradientStart +
6267 Intr->NumGradients - 1))) ||
6268 // Check for _L to _LZ optimization
6269 !MI.getOperand(ArgOffset + I + 1).isReg()) {
6270 PackedAddrs.push_back(
6271 B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)})
6272 .getReg(0));
6273 } else {
6274 PackedAddrs.push_back(
6275 B.buildBuildVector(
6276 V2S16, {AddrReg, MI.getOperand(ArgOffset + I + 1).getReg()})
6277 .getReg(0));
6278 ++I;
6279 }
6280 }
6281 }
6282}
6283
6284/// Convert from separate vaddr components to a single vector address register,
6285/// and replace the remaining operands with $noreg.
6287 int DimIdx, int NumVAddrs) {
6288 const LLT S32 = LLT::scalar(32);
6289 (void)S32;
6290 SmallVector<Register, 8> AddrRegs;
6291 for (int I = 0; I != NumVAddrs; ++I) {
6292 MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
6293 if (SrcOp.isReg()) {
6294 AddrRegs.push_back(SrcOp.getReg());
6295 assert(B.getMRI()->getType(SrcOp.getReg()) == S32);
6296 }
6297 }
6298
6299 int NumAddrRegs = AddrRegs.size();
6300 if (NumAddrRegs != 1) {
6301 auto VAddr =
6302 B.buildBuildVector(LLT::fixed_vector(NumAddrRegs, 32), AddrRegs);
6303 MI.getOperand(DimIdx).setReg(VAddr.getReg(0));
6304 }
6305
6306 for (int I = 1; I != NumVAddrs; ++I) {
6307 MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
6308 if (SrcOp.isReg())
6309 MI.getOperand(DimIdx + I).setReg(AMDGPU::NoRegister);
6310 }
6311}
6312
6313/// Rewrite image intrinsics to use register layouts expected by the subtarget.
6314///
6315/// Depending on the subtarget, load/store with 16-bit element data need to be
6316/// rewritten to use the low half of 32-bit registers, or directly use a packed
6317/// layout. 16-bit addresses should also sometimes be packed into 32-bit
6318/// registers.
6319///
6320/// We don't want to directly select image instructions just yet, but also want
6321 /// to expose all register repacking to the legalizer/combiners. We also don't
6322/// want a selected instruction entering RegBankSelect. In order to avoid
6323/// defining a multitude of intermediate image instructions, directly hack on
6324/// the intrinsic's arguments. In cases like a16 addresses, this requires
6325/// padding now unnecessary arguments with $noreg.
6328 const AMDGPU::ImageDimIntrinsicInfo *Intr) const {
6329
6330 const MachineFunction &MF = *MI.getMF();
6331 const unsigned NumDefs = MI.getNumExplicitDefs();
6332 const unsigned ArgOffset = NumDefs + 1;
6333 bool IsTFE = NumDefs == 2;
6334 // We are only processing the operands of d16 image operations on subtargets
6335 // that use the unpacked register layout, or need to repack the TFE result.
6336
6337 // TODO: Do we need to guard against already legalized intrinsics?
6338 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
6340
6341 MachineRegisterInfo *MRI = B.getMRI();
6342 const LLT S32 = LLT::scalar(32);
6343 const LLT S16 = LLT::scalar(16);
6344 const LLT V2S16 = LLT::fixed_vector(2, 16);
6345
6346 unsigned DMask = 0;
6347 Register VData;
6348 LLT Ty;
6349
6350 if (!BaseOpcode->NoReturn || BaseOpcode->Store) {
6351 VData = MI.getOperand(NumDefs == 0 ? 1 : 0).getReg();
6352 Ty = MRI->getType(VData);
6353 }
6354
6355 const bool IsAtomicPacked16Bit =
6356 (BaseOpcode->BaseOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_F16 ||
6357 BaseOpcode->BaseOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_BF16);
6358
6359 // Check for 16 bit addresses and pack if true.
6360 LLT GradTy =
6361 MRI->getType(MI.getOperand(ArgOffset + Intr->GradientStart).getReg());
6362 LLT AddrTy =
6363 MRI->getType(MI.getOperand(ArgOffset + Intr->CoordStart).getReg());
6364 const bool IsG16 =
6365 ST.hasG16() ? (BaseOpcode->Gradients && GradTy == S16) : GradTy == S16;
6366 const bool IsA16 = AddrTy == S16;
6367 const bool IsD16 = !IsAtomicPacked16Bit && Ty.getScalarType() == S16;
6368
6369 int DMaskLanes = 0;
6370 if (!BaseOpcode->Atomic) {
6371 DMask = MI.getOperand(ArgOffset + Intr->DMaskIndex).getImm();
6372 if (BaseOpcode->Gather4) {
6373 DMaskLanes = 4;
6374 } else if (DMask != 0) {
6375 DMaskLanes = llvm::popcount(DMask);
6376 } else if (!IsTFE && !BaseOpcode->Store) {
6377 // If dmask is 0, this is a no-op load. This can be eliminated.
6378 B.buildUndef(MI.getOperand(0));
6379 MI.eraseFromParent();
6380 return true;
6381 }
6382 }
6383
6384 Observer.changingInstr(MI);
6385 auto ChangedInstr = make_scope_exit([&] { Observer.changedInstr(MI); });
6386
6387 const unsigned StoreOpcode = IsD16 ? AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE_D16
6388 : AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE;
6389 const unsigned LoadOpcode = IsD16 ? AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_D16
6390 : AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD;
6391 unsigned NewOpcode = LoadOpcode;
6392 if (BaseOpcode->Store)
6393 NewOpcode = StoreOpcode;
6394 else if (BaseOpcode->NoReturn)
6395 NewOpcode = AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_NORET;
6396
6397 // Track that we legalized this
6398 MI.setDesc(B.getTII().get(NewOpcode));
6399
6400 // Expecting to get an error flag since TFC is on and dmask is 0. Force
6401 // dmask to be at least 1, otherwise the instruction will fail.
6402 if (IsTFE && DMask == 0) {
6403 DMask = 0x1;
6404 DMaskLanes = 1;
6405 MI.getOperand(ArgOffset + Intr->DMaskIndex).setImm(DMask);
6406 }
6407
6408 if (BaseOpcode->Atomic) {
6409 Register VData0 = MI.getOperand(2).getReg();
6410 LLT Ty = MRI->getType(VData0);
6411
6412 // TODO: Allow atomic swap and bit ops for v2s16/v4s16
6413 if (Ty.isVector() && !IsAtomicPacked16Bit)
6414 return false;
6415
6416 if (BaseOpcode->AtomicX2) {
6417 Register VData1 = MI.getOperand(3).getReg();
6418 // The two values are packed in one register.
6419 LLT PackedTy = LLT::fixed_vector(2, Ty);
6420 auto Concat = B.buildBuildVector(PackedTy, {VData0, VData1});
6421 MI.getOperand(2).setReg(Concat.getReg(0));
6422 MI.getOperand(3).setReg(AMDGPU::NoRegister);
6423 }
6424 }
6425
6426 unsigned CorrectedNumVAddrs = Intr->NumVAddrs;
6427
6428 // Rewrite the addressing register layout before doing anything else.
6429 if (BaseOpcode->Gradients && !ST.hasG16() && (IsA16 != IsG16)) {
6430 // 16 bit gradients are supported, but are tied to the A16 control
6431 // so both gradients and addresses must be 16 bit
6432 return false;
6433 }
6434
6435 if (IsA16 && !ST.hasA16()) {
6436 // A16 not supported
6437 return false;
6438 }
6439
6440 const unsigned NSAMaxSize = ST.getNSAMaxSize(BaseOpcode->Sampler);
6441 const unsigned HasPartialNSA = ST.hasPartialNSAEncoding();
6442
6443 if (IsA16 || IsG16) {
6444 // Even if NumVAddrs == 1 we should pack it into a 32-bit value, because the
6445 // instructions expect VGPR_32
6446 SmallVector<Register, 4> PackedRegs;
6447
6448 packImage16bitOpsToDwords(B, MI, PackedRegs, ArgOffset, Intr, IsA16, IsG16);
6449
6450 // See also below in the non-a16 branch
6451 const bool UseNSA = ST.hasNSAEncoding() &&
6452 PackedRegs.size() >= ST.getNSAThreshold(MF) &&
6453 (PackedRegs.size() <= NSAMaxSize || HasPartialNSA);
6454 const bool UsePartialNSA =
6455 UseNSA && HasPartialNSA && PackedRegs.size() > NSAMaxSize;
6456
6457 if (UsePartialNSA) {
6458 // Pack registers that would go over NSAMaxSize into last VAddr register
6459 LLT PackedAddrTy =
6460 LLT::fixed_vector(2 * (PackedRegs.size() - NSAMaxSize + 1), 16);
6461 auto Concat = B.buildConcatVectors(
6462 PackedAddrTy, ArrayRef(PackedRegs).slice(NSAMaxSize - 1));
6463 PackedRegs[NSAMaxSize - 1] = Concat.getReg(0);
6464 PackedRegs.resize(NSAMaxSize);
6465 } else if (!UseNSA && PackedRegs.size() > 1) {
6466 LLT PackedAddrTy = LLT::fixed_vector(2 * PackedRegs.size(), 16);
6467 auto Concat = B.buildConcatVectors(PackedAddrTy, PackedRegs);
6468 PackedRegs[0] = Concat.getReg(0);
6469 PackedRegs.resize(1);
6470 }
6471
6472 const unsigned NumPacked = PackedRegs.size();
6473 for (unsigned I = Intr->VAddrStart; I < Intr->VAddrEnd; I++) {
6474 MachineOperand &SrcOp = MI.getOperand(ArgOffset + I);
6475 if (!SrcOp.isReg()) {
6476 assert(SrcOp.isImm() && SrcOp.getImm() == 0);
6477 continue;
6478 }
6479
6480 assert(SrcOp.getReg() != AMDGPU::NoRegister);
6481
6482 if (I - Intr->VAddrStart < NumPacked)
6483 SrcOp.setReg(PackedRegs[I - Intr->VAddrStart]);
6484 else
6485 SrcOp.setReg(AMDGPU::NoRegister);
6486 }
6487 } else {
6488 // If the register allocator cannot place the address registers contiguously
6489 // without introducing moves, then using the non-sequential address encoding
6490 // is always preferable, since it saves VALU instructions and is usually a
6491 // wash in terms of code size or even better.
6492 //
6493 // However, we currently have no way of hinting to the register allocator
6494 // that MIMG addresses should be placed contiguously when it is possible to
6495 // do so, so force non-NSA for the common 2-address case as a heuristic.
6496 //
6497 // SIShrinkInstructions will convert NSA encodings to non-NSA after register
6498 // allocation when possible.
6499 //
6500 // Partial NSA is allowed on GFX11+ where the final register is a contiguous
6501 // set of the remaining addresses.
6502 const bool UseNSA = ST.hasNSAEncoding() &&
6503 CorrectedNumVAddrs >= ST.getNSAThreshold(MF) &&
6504 (CorrectedNumVAddrs <= NSAMaxSize || HasPartialNSA);
6505 const bool UsePartialNSA =
6506 UseNSA && HasPartialNSA && CorrectedNumVAddrs > NSAMaxSize;
6507
6508 if (UsePartialNSA) {
6510 ArgOffset + Intr->VAddrStart + NSAMaxSize - 1,
6511 Intr->NumVAddrs - NSAMaxSize + 1);
6512 } else if (!UseNSA && Intr->NumVAddrs > 1) {
6513 convertImageAddrToPacked(B, MI, ArgOffset + Intr->VAddrStart,
6514 Intr->NumVAddrs);
6515 }
6516 }
6517
6518 int Flags = 0;
6519 if (IsA16)
6520 Flags |= 1;
6521 if (IsG16)
6522 Flags |= 2;
6523 MI.addOperand(MachineOperand::CreateImm(Flags));
6524
6525 if (BaseOpcode->NoReturn) { // No TFE for stores?
6526 // TODO: Handle dmask trim
6527 if (!Ty.isVector() || !IsD16)
6528 return true;
6529
6530 Register RepackedReg = handleD16VData(B, *MRI, VData, true);
6531 if (RepackedReg != VData) {
6532 MI.getOperand(1).setReg(RepackedReg);
6533 }
6534
6535 return true;
6536 }
6537
6538 Register DstReg = MI.getOperand(0).getReg();
6539 const LLT EltTy = Ty.getScalarType();
6540 const int NumElts = Ty.isVector() ? Ty.getNumElements() : 1;
6541
6542 // Confirm that the return type is large enough for the dmask specified
6543 if (NumElts < DMaskLanes)
6544 return false;
6545
6546 if (NumElts > 4 || DMaskLanes > 4)
6547 return false;
6548
6549 // Image atomic instructions use DMask to specify how many bits the
6550 // input/output data will have: 32 bits (s32, v2s16) or 64 bits (s64, v4s16).
6551 // DMaskLanes for an image atomic has a default value of '0'.
6552 // We must be sure that atomic variants (especially packed) will not be
6553 // truncated from v2s16 or v4s16 to s16 type.
6554 //
6555 // ChangeElementCount will be needed for image load where Ty is always scalar.
6556 const unsigned AdjustedNumElts = DMaskLanes == 0 ? 1 : DMaskLanes;
6557 const LLT AdjustedTy =
6558 DMaskLanes == 0
6559 ? Ty
6560 : Ty.changeElementCount(ElementCount::getFixed(AdjustedNumElts));
6561
6562 // The raw dword aligned data component of the load. The only legal cases
6563 // where this matters should be when using the packed D16 format, for
6564 // s16 -> <2 x s16>, and <3 x s16> -> <4 x s16>.
6565 LLT RoundedTy;
6566
6567 // S32 vector to cover all data, plus TFE result element.
6568 LLT TFETy;
6569
6570 // Register type to use for each loaded component. Will be S32 or V2S16.
6571 LLT RegTy;
6572
6573 if (IsD16 && ST.hasUnpackedD16VMem()) {
6574 RoundedTy =
6575 LLT::scalarOrVector(ElementCount::getFixed(AdjustedNumElts), 32);
6576 TFETy = LLT::fixed_vector(AdjustedNumElts + 1, 32);
6577 RegTy = S32;
6578 } else {
6579 unsigned EltSize = EltTy.getSizeInBits();
6580 unsigned RoundedElts = (AdjustedTy.getSizeInBits() + 31) / 32;
6581 unsigned RoundedSize = 32 * RoundedElts;
6582 RoundedTy = LLT::scalarOrVector(
6583 ElementCount::getFixed(RoundedSize / EltSize), EltSize);
6584 TFETy = LLT::fixed_vector(RoundedSize / 32 + 1, S32);
6585 RegTy = !IsTFE && EltSize == 16 ? V2S16 : S32;
6586 }
6587
6588 // The return type does not need adjustment.
6589 // TODO: Should we change s16 case to s32 or <2 x s16>?
6590 if (!IsTFE && (RoundedTy == Ty || !Ty.isVector()))
6591 return true;
6592
6593 Register Dst1Reg;
6594
6595 // Insert after the instruction.
6596 B.setInsertPt(*MI.getParent(), ++MI.getIterator());
6597
6598 // TODO: For TFE with d16, if we used a TFE type that was a multiple of <2 x
6599 // s16> instead of s32, we would only need 1 bitcast instead of multiple.
6600 const LLT LoadResultTy = IsTFE ? TFETy : RoundedTy;
6601 const int ResultNumRegs = LoadResultTy.getSizeInBits() / 32;
6602
6603 Register NewResultReg = MRI->createGenericVirtualRegister(LoadResultTy);
6604
6605 MI.getOperand(0).setReg(NewResultReg);
6606
6607 // In the IR, TFE is supposed to be used with a 2 element struct return
6608 // type. The instruction really returns these two values in one contiguous
6609 // register, with one additional dword beyond the loaded data. Rewrite the
6610 // return type to use a single register result.
6611
6612 if (IsTFE) {
6613 Dst1Reg = MI.getOperand(1).getReg();
6614 if (MRI->getType(Dst1Reg) != S32)
6615 return false;
6616
6617 // TODO: Make sure the TFE operand bit is set.
6618 MI.removeOperand(1);
6619
6620 // Handle the easy case that requires no repack instructions.
6621 if (Ty == S32) {
6622 B.buildUnmerge({DstReg, Dst1Reg}, NewResultReg);
6623 return true;
6624 }
6625 }
6626
6627 // Now figure out how to copy the new result register back into the old
6628 // result.
6629 SmallVector<Register, 5> ResultRegs(ResultNumRegs, Dst1Reg);
6630
6631 const int NumDataRegs = IsTFE ? ResultNumRegs - 1 : ResultNumRegs;
6632
6633 if (ResultNumRegs == 1) {
6634 assert(!IsTFE);
6635 ResultRegs[0] = NewResultReg;
6636 } else {
6637 // We have to repack into a new vector of some kind.
6638 for (int I = 0; I != NumDataRegs; ++I)
6639 ResultRegs[I] = MRI->createGenericVirtualRegister(RegTy);
6640 B.buildUnmerge(ResultRegs, NewResultReg);
6641
6642 // Drop the final TFE element to get the data part. The TFE result is
6643 // directly written to the right place already.
6644 if (IsTFE)
6645 ResultRegs.resize(NumDataRegs);
6646 }
6647
6648 // For an s16 scalar result, the loaded value is an s32 that we truncate,
6649 // regardless of packed vs. unpacked.
6650 if (IsD16 && !Ty.isVector()) {
6651 B.buildTrunc(DstReg, ResultRegs[0]);
6652 return true;
6653 }
6654
6655 // Avoid a build/concat_vector of 1 entry.
6656 if (Ty == V2S16 && NumDataRegs == 1 && !ST.hasUnpackedD16VMem()) {
6657 B.buildBitcast(DstReg, ResultRegs[0]);
6658 return true;
6659 }
6660
6661 assert(Ty.isVector());
6662
6663 if (IsD16) {
6664 // For packed D16 results with TFE enabled, all the data components are
6665 // S32. Cast back to the expected type.
6666 //
6667 // TODO: We don't really need to load s32 elements. We would only need one
6668 // cast for the TFE result if a multiple of v2s16 was used.
6669 if (RegTy != V2S16 && !ST.hasUnpackedD16VMem()) {
6670 for (Register &Reg : ResultRegs)
6671 Reg = B.buildBitcast(V2S16, Reg).getReg(0);
6672 } else if (ST.hasUnpackedD16VMem()) {
6673 for (Register &Reg : ResultRegs)
6674 Reg = B.buildTrunc(S16, Reg).getReg(0);
6675 }
6676 }
6677
6678 auto padWithUndef = [&](LLT Ty, int NumElts) {
6679 if (NumElts == 0)
6680 return;
6681 Register Undef = B.buildUndef(Ty).getReg(0);
6682 for (int I = 0; I != NumElts; ++I)
6683 ResultRegs.push_back(Undef);
6684 };
6685
6686 // Pad out any elements eliminated due to the dmask.
6687 LLT ResTy = MRI->getType(ResultRegs[0]);
6688 if (!ResTy.isVector()) {
6689 padWithUndef(ResTy, NumElts - ResultRegs.size());
6690 B.buildBuildVector(DstReg, ResultRegs);
6691 return true;
6692 }
6693
6694 assert(!ST.hasUnpackedD16VMem() && ResTy == V2S16);
6695 const int RegsToCover = (Ty.getSizeInBits() + 31) / 32;
6696
6697 // Deal with the one annoying legal case.
6698 const LLT V3S16 = LLT::fixed_vector(3, 16);
6699 if (Ty == V3S16) {
6700 if (IsTFE) {
6701 if (ResultRegs.size() == 1) {
6702 NewResultReg = ResultRegs[0];
6703 } else if (ResultRegs.size() == 2) {
6704 LLT V4S16 = LLT::fixed_vector(4, 16);
6705 NewResultReg = B.buildConcatVectors(V4S16, ResultRegs).getReg(0);
6706 } else {
6707 return false;
6708 }
6709 }
6710
6711 if (MRI->getType(DstReg).getNumElements() <
6712 MRI->getType(NewResultReg).getNumElements()) {
6713 B.buildDeleteTrailingVectorElements(DstReg, NewResultReg);
6714 } else {
6715 B.buildPadVectorWithUndefElements(DstReg, NewResultReg);
6716 }
6717 return true;
6718 }
6719
6720 padWithUndef(ResTy, RegsToCover - ResultRegs.size());
6721 B.buildConcatVectors(DstReg, ResultRegs);
6722 return true;
6723}
6724
6726 MachineInstr &MI) const {
6727 MachineIRBuilder &B = Helper.MIRBuilder;
6728 GISelChangeObserver &Observer = Helper.Observer;
6729
6730 Register OrigDst = MI.getOperand(0).getReg();
6731 Register Dst;
6732 LLT Ty = B.getMRI()->getType(OrigDst);
6733 unsigned Size = Ty.getSizeInBits();
6734 MachineFunction &MF = B.getMF();
6735 unsigned Opc = 0;
6736 if (Size < 32 && ST.hasScalarSubwordLoads()) {
6737 assert(Size == 8 || Size == 16);
6738 Opc = Size == 8 ? AMDGPU::G_AMDGPU_S_BUFFER_LOAD_UBYTE
6739 : AMDGPU::G_AMDGPU_S_BUFFER_LOAD_USHORT;
6740 // The 8-bit and 16-bit scalar buffer load instructions have a 32-bit
6741 // destination register.
6742 Dst = B.getMRI()->createGenericVirtualRegister(LLT::scalar(32));
6743 } else {
6744 Opc = AMDGPU::G_AMDGPU_S_BUFFER_LOAD;
6745 Dst = OrigDst;
6746 }
6747
6748 Observer.changingInstr(MI);
6749
6750 // Handle needing to s.buffer.load() a p8 value.
6751 if (hasBufferRsrcWorkaround(Ty)) {
6752 Ty = castBufferRsrcFromV4I32(MI, B, *B.getMRI(), 0);
6753 B.setInsertPt(B.getMBB(), MI);
6754 }
6756 Ty = getBitcastRegisterType(Ty);
6757 Helper.bitcastDst(MI, Ty, 0);
6758 B.setInsertPt(B.getMBB(), MI);
6759 }
6760
6761 // FIXME: We don't really need this intermediate instruction. The intrinsic
6762 // should be fixed to have a memory operand. Since it's readnone, we're not
6763 // allowed to add one.
6764 MI.setDesc(B.getTII().get(Opc));
6765 MI.removeOperand(1); // Remove intrinsic ID
6766
6767 // FIXME: When intrinsic definition is fixed, this should have an MMO already.
6768 const unsigned MemSize = (Size + 7) / 8;
6769 const Align MemAlign = B.getDataLayout().getABITypeAlign(
6775 MemSize, MemAlign);
6776 MI.addMemOperand(MF, MMO);
6777 if (Dst != OrigDst) {
6778 MI.getOperand(0).setReg(Dst);
6779 B.setInsertPt(B.getMBB(), ++B.getInsertPt());
6780 B.buildTrunc(OrigDst, Dst);
6781 }
6782
6783 // If we don't have 96-bit result scalar loads, widening to 128-bit should
6784 // always be legal. We may need to restore this to a 96-bit result if it turns
6785 // out this needs to be converted to a vector load during RegBankSelect.
6786 if (!isPowerOf2_32(Size) && (Size != 96 || !ST.hasScalarDwordx3Loads())) {
6787 if (Ty.isVector())
6789 else
6790 Helper.widenScalarDst(MI, getPow2ScalarType(Ty), 0);
6791 }
6792
6793 Observer.changedInstr(MI);
6794 return true;
6795}
6796
6797// TODO: Move to selection
6800 MachineIRBuilder &B) const {
6801 if (!ST.isTrapHandlerEnabled() ||
6803 return legalizeTrapEndpgm(MI, MRI, B);
6804
6805 return ST.supportsGetDoorbellID() ?
6807}
6808
6811 const DebugLoc &DL = MI.getDebugLoc();
6812 MachineBasicBlock &BB = B.getMBB();
6813 MachineFunction *MF = BB.getParent();
6814
6815 if (BB.succ_empty() && std::next(MI.getIterator()) == BB.end()) {
6816 BuildMI(BB, BB.end(), DL, B.getTII().get(AMDGPU::S_ENDPGM))
6817 .addImm(0);
6818 MI.eraseFromParent();
6819 return true;
6820 }
6821
6822 // We need a block split to make the real endpgm a terminator. We also don't
6823 // want to break phis in successor blocks, so we can't just delete to the
6824 // end of the block.
6825 BB.splitAt(MI, false /*UpdateLiveIns*/);
6827 MF->push_back(TrapBB);
6828 BuildMI(*TrapBB, TrapBB->end(), DL, B.getTII().get(AMDGPU::S_ENDPGM))
6829 .addImm(0);
6830 BuildMI(BB, &MI, DL, B.getTII().get(AMDGPU::S_CBRANCH_EXECNZ))
6831 .addMBB(TrapBB);
6832
6833 BB.addSuccessor(TrapBB);
6834 MI.eraseFromParent();
6835 return true;
6836}
6837
6840 MachineFunction &MF = B.getMF();
6841 const LLT S64 = LLT::scalar(64);
6842
6843 Register SGPR01(AMDGPU::SGPR0_SGPR1);
6844 // For code object version 5, queue_ptr is passed through implicit kernarg.
6850 ST.getTargetLowering()->getImplicitParameterOffset(B.getMF(), Param);
6851
6852 Register KernargPtrReg = MRI.createGenericVirtualRegister(
6854
6855 if (!loadInputValue(KernargPtrReg, B,
6857 return false;
6858
6859 // TODO: can we be smarter about machine pointer info?
6862 PtrInfo,
6866
6867 // Pointer address
6868 Register LoadAddr = MRI.createGenericVirtualRegister(
6870 B.buildPtrAdd(LoadAddr, KernargPtrReg,
6871 B.buildConstant(LLT::scalar(64), Offset).getReg(0));
6872 // Load address
6873 Register Temp = B.buildLoad(S64, LoadAddr, *MMO).getReg(0);
6874 B.buildCopy(SGPR01, Temp);
6875 B.buildInstr(AMDGPU::S_TRAP)
6876 .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap))
6877 .addReg(SGPR01, RegState::Implicit);
6878 MI.eraseFromParent();
6879 return true;
6880 }
6881
6882 // Pass queue pointer to trap handler as input, and insert trap instruction
6883 // Reference: https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi
6884 Register LiveIn =
6885 MRI.createGenericVirtualRegister(LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
6886 if (!loadInputValue(LiveIn, B, AMDGPUFunctionArgInfo::QUEUE_PTR))
6887 return false;
6888
6889 B.buildCopy(SGPR01, LiveIn);
6890 B.buildInstr(AMDGPU::S_TRAP)
6891 .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap))
6892 .addReg(SGPR01, RegState::Implicit);
6893
6894 MI.eraseFromParent();
6895 return true;
6896}
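// Editorial summary (not part of the upstream source): both paths above end up
// with the queue pointer in SGPR0_SGPR1 plus an S_TRAP carrying the HSA trap
// ID, roughly:
//   s[0:1] = <queue_ptr>     ; loaded from the implicit kernarg (COV5+) or
//                            ; taken from the preloaded QUEUE_PTR input
//   s_trap LLVMAMDHSATrap    ; SGPR0_SGPR1 is attached as an implicit use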
6897
6898bool AMDGPULegalizerInfo::legalizeTrapHsa(MachineInstr &MI,
6899 MachineRegisterInfo &MRI,
6900 MachineIRBuilder &B) const {
6901 // We need to simulate the 's_trap 2' instruction on targets that run in
6902 // PRIV=1 (where it is treated as a nop).
6903 if (ST.hasPrivEnabledTrap2NopBug()) {
6904 ST.getInstrInfo()->insertSimulatedTrap(MRI, B.getMBB(), MI,
6905 MI.getDebugLoc());
6906 MI.eraseFromParent();
6907 return true;
6908 }
6909
6910 B.buildInstr(AMDGPU::S_TRAP)
6911 .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap));
6912 MI.eraseFromParent();
6913 return true;
6914}
6915
6916bool AMDGPULegalizerInfo::legalizeDebugTrap(MachineInstr &MI,
6917 MachineRegisterInfo &MRI,
6918 MachineIRBuilder &B) const {
6919 // Is non-HSA path or trap-handler disabled? Then, report a warning
6920 // accordingly
6921 if (!ST.isTrapHandlerEnabled() ||
6922 ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA) {
6923 DiagnosticInfoUnsupported NoTrap(B.getMF().getFunction(),
6924 "debugtrap handler not supported",
6925 MI.getDebugLoc(), DS_Warning);
6926 LLVMContext &Ctx = B.getMF().getFunction().getContext();
6927 Ctx.diagnose(NoTrap);
6928 } else {
6929 // Insert debug-trap instruction
6930 B.buildInstr(AMDGPU::S_TRAP)
6931 .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSADebugTrap));
6932 }
6933
6934 MI.eraseFromParent();
6935 return true;
6936}
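// Editorial note (not part of the upstream source): unlike llvm.trap,
// llvm.debugtrap never fails legalization here. Without an HSA trap handler it
// degrades to a DS_Warning diagnostic and the instruction is simply erased;
// otherwise a single S_TRAP with the debug-trap ID is emitted.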
6937
6938bool AMDGPULegalizerInfo::legalizeBVHIntrinsic(MachineInstr &MI,
6939 MachineIRBuilder &B) const {
6940 MachineRegisterInfo &MRI = *B.getMRI();
6941 const LLT S16 = LLT::scalar(16);
6942 const LLT S32 = LLT::scalar(32);
6943 const LLT V2S16 = LLT::fixed_vector(2, 16);
6944 const LLT V3S32 = LLT::fixed_vector(3, 32);
6945
6946 Register DstReg = MI.getOperand(0).getReg();
6947 Register NodePtr = MI.getOperand(2).getReg();
6948 Register RayExtent = MI.getOperand(3).getReg();
6949 Register RayOrigin = MI.getOperand(4).getReg();
6950 Register RayDir = MI.getOperand(5).getReg();
6951 Register RayInvDir = MI.getOperand(6).getReg();
6952 Register TDescr = MI.getOperand(7).getReg();
6953
6954 if (!ST.hasGFX10_AEncoding()) {
6955 DiagnosticInfoUnsupported BadIntrin(B.getMF().getFunction(),
6956 "intrinsic not supported on subtarget",
6957 MI.getDebugLoc());
6958 B.getMF().getFunction().getContext().diagnose(BadIntrin);
6959 return false;
6960 }
6961
6962 const bool IsGFX11 = AMDGPU::isGFX11(ST);
6963 const bool IsGFX11Plus = AMDGPU::isGFX11Plus(ST);
6964 const bool IsGFX12Plus = AMDGPU::isGFX12Plus(ST);
6965 const bool IsA16 = MRI.getType(RayDir).getElementType().getSizeInBits() == 16;
6966 const bool Is64 = MRI.getType(NodePtr).getSizeInBits() == 64;
6967 const unsigned NumVDataDwords = 4;
6968 const unsigned NumVAddrDwords = IsA16 ? (Is64 ? 9 : 8) : (Is64 ? 12 : 11);
6969 const unsigned NumVAddrs = IsGFX11Plus ? (IsA16 ? 4 : 5) : NumVAddrDwords;
6970 const bool UseNSA =
6971 IsGFX12Plus || (ST.hasNSAEncoding() && NumVAddrs <= ST.getNSAMaxSize());
6972
6973 const unsigned BaseOpcodes[2][2] = {
6974 {AMDGPU::IMAGE_BVH_INTERSECT_RAY, AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16},
6975 {AMDGPU::IMAGE_BVH64_INTERSECT_RAY,
6976 AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16}};
6977 int Opcode;
6978 if (UseNSA) {
6979 Opcode = AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16],
6980 IsGFX12Plus ? AMDGPU::MIMGEncGfx12
6981 : IsGFX11 ? AMDGPU::MIMGEncGfx11NSA
6982 : AMDGPU::MIMGEncGfx10NSA,
6983 NumVDataDwords, NumVAddrDwords);
6984 } else {
6985 assert(!IsGFX12Plus);
6986 Opcode = AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16],
6987 IsGFX11 ? AMDGPU::MIMGEncGfx11Default
6988 : AMDGPU::MIMGEncGfx10Default,
6989 NumVDataDwords, NumVAddrDwords);
6990 }
6991 assert(Opcode != -1);
6992
6993 SmallVector<Register, 12> Ops;
6994 if (UseNSA && IsGFX11Plus) {
6995 auto packLanes = [&Ops, &S32, &V3S32, &B](Register Src) {
6996 auto Unmerge = B.buildUnmerge({S32, S32, S32}, Src);
6997 auto Merged = B.buildMergeLikeInstr(
6998 V3S32, {Unmerge.getReg(0), Unmerge.getReg(1), Unmerge.getReg(2)});
6999 Ops.push_back(Merged.getReg(0));
7000 };
7001
7002 Ops.push_back(NodePtr);
7003 Ops.push_back(RayExtent);
7004 packLanes(RayOrigin);
7005
7006 if (IsA16) {
7007 auto UnmergeRayDir = B.buildUnmerge({S16, S16, S16}, RayDir);
7008 auto UnmergeRayInvDir = B.buildUnmerge({S16, S16, S16}, RayInvDir);
7009 auto MergedDir = B.buildMergeLikeInstr(
7010 V3S32,
7011 {B.buildBitcast(
7012 S32, B.buildMergeLikeInstr(V2S16, {UnmergeRayInvDir.getReg(0),
7013 UnmergeRayDir.getReg(0)}))
7014 .getReg(0),
7015 B.buildBitcast(
7016 S32, B.buildMergeLikeInstr(V2S16, {UnmergeRayInvDir.getReg(1),
7017 UnmergeRayDir.getReg(1)}))
7018 .getReg(0),
7019 B.buildBitcast(
7020 S32, B.buildMergeLikeInstr(V2S16, {UnmergeRayInvDir.getReg(2),
7021 UnmergeRayDir.getReg(2)}))
7022 .getReg(0)});
7023 Ops.push_back(MergedDir.getReg(0));
7024 } else {
7025 packLanes(RayDir);
7026 packLanes(RayInvDir);
7027 }
7028 } else {
7029 if (Is64) {
7030 auto Unmerge = B.buildUnmerge({S32, S32}, NodePtr);
7031 Ops.push_back(Unmerge.getReg(0));
7032 Ops.push_back(Unmerge.getReg(1));
7033 } else {
7034 Ops.push_back(NodePtr);
7035 }
7036 Ops.push_back(RayExtent);
7037
7038 auto packLanes = [&Ops, &S32, &B](Register Src) {
7039 auto Unmerge = B.buildUnmerge({S32, S32, S32}, Src);
7040 Ops.push_back(Unmerge.getReg(0));
7041 Ops.push_back(Unmerge.getReg(1));
7042 Ops.push_back(Unmerge.getReg(2));
7043 };
7044
7045 packLanes(RayOrigin);
7046 if (IsA16) {
7047 auto UnmergeRayDir = B.buildUnmerge({S16, S16, S16}, RayDir);
7048 auto UnmergeRayInvDir = B.buildUnmerge({S16, S16, S16}, RayInvDir);
7049 Register R1 = MRI.createGenericVirtualRegister(S32);
7050 Register R2 = MRI.createGenericVirtualRegister(S32);
7051 Register R3 = MRI.createGenericVirtualRegister(S32);
7052 B.buildMergeLikeInstr(R1,
7053 {UnmergeRayDir.getReg(0), UnmergeRayDir.getReg(1)});
7054 B.buildMergeLikeInstr(
7055 R2, {UnmergeRayDir.getReg(2), UnmergeRayInvDir.getReg(0)});
7056 B.buildMergeLikeInstr(
7057 R3, {UnmergeRayInvDir.getReg(1), UnmergeRayInvDir.getReg(2)});
7058 Ops.push_back(R1);
7059 Ops.push_back(R2);
7060 Ops.push_back(R3);
7061 } else {
7062 packLanes(RayDir);
7063 packLanes(RayInvDir);
7064 }
7065 }
7066
7067 if (!UseNSA) {
7068 // Build a single vector containing all the operands so far prepared.
7069 LLT OpTy = LLT::fixed_vector(Ops.size(), 32);
7070 Register MergedOps = B.buildMergeLikeInstr(OpTy, Ops).getReg(0);
7071 Ops.clear();
7072 Ops.push_back(MergedOps);
7073 }
7074
7075 auto MIB = B.buildInstr(AMDGPU::G_AMDGPU_INTRIN_BVH_INTERSECT_RAY)
7076 .addDef(DstReg)
7077 .addImm(Opcode);
7078
7079 for (Register R : Ops) {
7080 MIB.addUse(R);
7081 }
7082
7083 MIB.addUse(TDescr)
7084 .addImm(IsA16 ? 1 : 0)
7085 .cloneMemRefs(MI);
7086
7087 MI.eraseFromParent();
7088 return true;
7089}
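// Editorial example (not part of the upstream source) of the address sizing
// computed above: a 64-bit node pointer with 32-bit ray components needs
// NumVAddrDwords = 12, and 9 when the direction components are fp16 (IsA16).
// Whether those dwords are passed as separate NSA operands or packed into a
// single merged vector operand depends on UseNSA.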
7090
7091bool AMDGPULegalizerInfo::legalizeFPTruncRound(MachineInstr &MI,
7092 MachineIRBuilder &B) const {
7093 unsigned Opc;
7094 int RoundMode = MI.getOperand(2).getImm();
7095
7096 if (RoundMode == (int)RoundingMode::TowardPositive)
7097 Opc = AMDGPU::G_FPTRUNC_ROUND_UPWARD;
7098 else if (RoundMode == (int)RoundingMode::TowardNegative)
7099 Opc = AMDGPU::G_FPTRUNC_ROUND_DOWNWARD;
7100 else
7101 return false;
7102
7103 B.buildInstr(Opc)
7104 .addDef(MI.getOperand(0).getReg())
7105 .addUse(MI.getOperand(1).getReg());
7106
7107 MI.eraseFromParent();
7108
7109 return true;
7110}
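// Editorial note (not part of the upstream source): the immediate checked
// above follows llvm::RoundingMode, so (assuming the usual FLT_ROUNDS-style
// encoding) TowardPositive maps to G_FPTRUNC_ROUND_UPWARD and TowardNegative
// to G_FPTRUNC_ROUND_DOWNWARD; any other rounding mode makes this legalization
// fail (return false).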
7111
7112bool AMDGPULegalizerInfo::legalizeStackSave(MachineInstr &MI,
7113 MachineIRBuilder &B) const {
7114 const SITargetLowering *TLI = ST.getTargetLowering();
7115 Register StackPtr = TLI->getStackPointerRegisterToSaveRestore();
7116 Register DstReg = MI.getOperand(0).getReg();
7117 B.buildInstr(AMDGPU::G_AMDGPU_WAVE_ADDRESS, {DstReg}, {StackPtr});
7118 MI.eraseFromParent();
7119 return true;
7120}
7121
7122bool AMDGPULegalizerInfo::legalizeWaveID(MachineInstr &MI,
7123 MachineIRBuilder &B) const {
7124 // With architected SGPRs, waveIDinGroup is in TTMP8[29:25].
7125 if (!ST.hasArchitectedSGPRs())
7126 return false;
7127 LLT S32 = LLT::scalar(32);
7128 Register DstReg = MI.getOperand(0).getReg();
7129 auto TTMP8 = B.buildCopy(S32, Register(AMDGPU::TTMP8));
7130 auto LSB = B.buildConstant(S32, 25);
7131 auto Width = B.buildConstant(S32, 5);
7132 B.buildUbfx(DstReg, TTMP8, LSB, Width);
7133 MI.eraseFromParent();
7134 return true;
7135}
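// Editorial note (not part of the upstream source): the G_UBFX built above is
// the bitfield extract
//   wave_id = (TTMP8 >> 25) & 0x1f;
// i.e. the 5-bit wave-in-group field named in the comment at the top of this
// function.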
7136
7137static constexpr unsigned FPEnvModeBitField =
7138 AMDGPU::Hwreg::HwregEncoding::encode(AMDGPU::Hwreg::ID_MODE, 0, 23);
7139
7140static constexpr unsigned FPEnvTrapBitField =
7141 AMDGPU::Hwreg::HwregEncoding::encode(AMDGPU::Hwreg::ID_TRAPSTS, 0, 5);
7142
7143bool AMDGPULegalizerInfo::legalizeGetFPEnv(MachineInstr &MI,
7144 MachineRegisterInfo &MRI,
7145 MachineIRBuilder &B) const {
7146 Register Src = MI.getOperand(0).getReg();
7147 if (MRI.getType(Src) != S64)
7148 return false;
7149
7150 auto ModeReg =
7151 B.buildIntrinsic(Intrinsic::amdgcn_s_getreg, {S32},
7152 /*HasSideEffects=*/true, /*isConvergent=*/false)
7153 .addImm(FPEnvModeBitField);
7154 auto TrapReg =
7155 B.buildIntrinsic(Intrinsic::amdgcn_s_getreg, {S32},
7156 /*HasSideEffects=*/true, /*isConvergent=*/false)
7157 .addImm(FPEnvTrapBitField);
7158 B.buildMergeLikeInstr(Src, {ModeReg, TrapReg});
7159 MI.eraseFromParent();
7160 return true;
7161}
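// Editorial note (not part of the upstream source): G_MERGE_VALUES places its
// first source in the low bits, so the s64 result built above is laid out
// roughly as
//   fpenv = ((uint64_t)trap_bits << 32) | mode_bits;
// with the MODE hwreg read in the low half and the trap bits in the high half.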
7162
7163bool AMDGPULegalizerInfo::legalizeSetFPEnv(MachineInstr &MI,
7164 MachineRegisterInfo &MRI,
7165 MachineIRBuilder &B) const {
7166 Register Src = MI.getOperand(0).getReg();
7167 if (MRI.getType(Src) != S64)
7168 return false;
7169
7170 auto Unmerge = B.buildUnmerge({S32, S32}, MI.getOperand(0));
7171 B.buildIntrinsic(Intrinsic::amdgcn_s_setreg, ArrayRef<DstOp>(),
7172 /*HasSideEffects=*/true, /*isConvergent=*/false)
7173 .addImm(static_cast<int16_t>(FPEnvModeBitField))
7174 .addReg(Unmerge.getReg(0));
7175 B.buildIntrinsic(Intrinsic::amdgcn_s_setreg, ArrayRef<DstOp>(),
7176 /*HasSideEffects=*/true, /*isConvergent=*/false)
7177 .addImm(static_cast<int16_t>(FPEnvTrapBitField))
7178 .addReg(Unmerge.getReg(1));
7179 MI.eraseFromParent();
7180 return true;
7181}
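// Editorial note (not part of the upstream source): this is the inverse of
// legalizeGetFPEnv above. The unmerge yields the low 32 bits first, so the low
// half of the s64 value is written back to the mode bitfield and the high half
// to the trap bitfield via s_setreg.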
7182
7183bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
7184 MachineInstr &MI) const {
7185 MachineIRBuilder &B = Helper.MIRBuilder;
7186 MachineRegisterInfo &MRI = *B.getMRI();
7187
7188 // Replace the use G_BRCOND with the exec manipulate and branch pseudos.
7189 auto IntrID = cast<GIntrinsic>(MI).getIntrinsicID();
7190 switch (IntrID) {
7191 case Intrinsic::amdgcn_if:
7192 case Intrinsic::amdgcn_else: {
7193 MachineInstr *Br = nullptr;
7194 MachineBasicBlock *UncondBrTarget = nullptr;
7195 bool Negated = false;
7196 if (MachineInstr *BrCond =
7197 verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) {
7198 const SIRegisterInfo *TRI
7199 = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
7200
7201 Register Def = MI.getOperand(1).getReg();
7202 Register Use = MI.getOperand(3).getReg();
7203
7204 MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
7205
7206 if (Negated)
7207 std::swap(CondBrTarget, UncondBrTarget);
7208
7209 B.setInsertPt(B.getMBB(), BrCond->getIterator());
7210 if (IntrID == Intrinsic::amdgcn_if) {
7211 B.buildInstr(AMDGPU::SI_IF)
7212 .addDef(Def)
7213 .addUse(Use)
7214 .addMBB(UncondBrTarget);
7215 } else {
7216 B.buildInstr(AMDGPU::SI_ELSE)
7217 .addDef(Def)
7218 .addUse(Use)
7219 .addMBB(UncondBrTarget);
7220 }
7221
7222 if (Br) {
7223 Br->getOperand(0).setMBB(CondBrTarget);
7224 } else {
7225 // The IRTranslator skips inserting the G_BR for fallthrough cases, but
7226 // since we're swapping branch targets it needs to be reinserted.
7227 // FIXME: IRTranslator should probably not do this
7228 B.buildBr(*CondBrTarget);
7229 }
7230
7231 MRI.setRegClass(Def, TRI->getWaveMaskRegClass());
7232 MRI.setRegClass(Use, TRI->getWaveMaskRegClass());
7233 MI.eraseFromParent();
7234 BrCond->eraseFromParent();
7235 return true;
7236 }
7237
7238 return false;
7239 }
7240 case Intrinsic::amdgcn_loop: {
7241 MachineInstr *Br = nullptr;
7242 MachineBasicBlock *UncondBrTarget = nullptr;
7243 bool Negated = false;
7244 if (MachineInstr *BrCond =
7245 verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) {
7246 const SIRegisterInfo *TRI
7247 = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
7248
7249 MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
7250 Register Reg = MI.getOperand(2).getReg();
7251
7252 if (Negated)
7253 std::swap(CondBrTarget, UncondBrTarget);
7254
7255 B.setInsertPt(B.getMBB(), BrCond->getIterator());
7256 B.buildInstr(AMDGPU::SI_LOOP)
7257 .addUse(Reg)
7258 .addMBB(UncondBrTarget);
7259
7260 if (Br)
7261 Br->getOperand(0).setMBB(CondBrTarget);
7262 else
7263 B.buildBr(*CondBrTarget);
7264
7265 MI.eraseFromParent();
7266 BrCond->eraseFromParent();
7267 MRI.setRegClass(Reg, TRI->getWaveMaskRegClass());
7268 return true;
7269 }
7270
7271 return false;
7272 }
7273 case Intrinsic::amdgcn_addrspacecast_nonnull:
7274 return legalizeAddrSpaceCast(MI, MRI, B);
7275 case Intrinsic::amdgcn_make_buffer_rsrc:
7276 return legalizePointerAsRsrcIntrin(MI, MRI, B);
7277 case Intrinsic::amdgcn_kernarg_segment_ptr:
7278 if (!AMDGPU::isKernel(B.getMF().getFunction().getCallingConv())) {
7279 // This only makes sense to call in a kernel, so just lower to null.
7280 B.buildConstant(MI.getOperand(0).getReg(), 0);
7281 MI.eraseFromParent();
7282 return true;
7283 }
7284
7285 return legalizePreloadedArgIntrin(
7286 MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
7287 case Intrinsic::amdgcn_implicitarg_ptr:
7288 return legalizeImplicitArgPtr(MI, MRI, B);
7289 case Intrinsic::amdgcn_workitem_id_x:
7290 return legalizeWorkitemIDIntrinsic(MI, MRI, B, 0,
7291 AMDGPUFunctionArgInfo::WORKITEM_ID_X);
7292 case Intrinsic::amdgcn_workitem_id_y:
7293 return legalizeWorkitemIDIntrinsic(MI, MRI, B, 1,
7294 AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
7295 case Intrinsic::amdgcn_workitem_id_z:
7296 return legalizeWorkitemIDIntrinsic(MI, MRI, B, 2,
7297 AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
7298 case Intrinsic::amdgcn_workgroup_id_x:
7299 return legalizePreloadedArgIntrin(MI, MRI, B,
7300 AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
7301 case Intrinsic::amdgcn_workgroup_id_y:
7302 return legalizePreloadedArgIntrin(MI, MRI, B,
7303 AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
7304 case Intrinsic::amdgcn_workgroup_id_z:
7305 return legalizePreloadedArgIntrin(MI, MRI, B,
7306 AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
7307 case Intrinsic::amdgcn_wave_id:
7308 return legalizeWaveID(MI, B);
7309 case Intrinsic::amdgcn_lds_kernel_id:
7310 return legalizePreloadedArgIntrin(MI, MRI, B,
7311 AMDGPUFunctionArgInfo::LDS_KERNEL_ID);
7312 case Intrinsic::amdgcn_dispatch_ptr:
7313 return legalizePreloadedArgIntrin(MI, MRI, B,
7314 AMDGPUFunctionArgInfo::DISPATCH_PTR);
7315 case Intrinsic::amdgcn_queue_ptr:
7316 return legalizePreloadedArgIntrin(MI, MRI, B,
7317 AMDGPUFunctionArgInfo::QUEUE_PTR);
7318 case Intrinsic::amdgcn_implicit_buffer_ptr:
7319 return legalizePreloadedArgIntrin(
7320 MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
7321 case Intrinsic::amdgcn_dispatch_id:
7322 return legalizePreloadedArgIntrin(MI, MRI, B,
7323 AMDGPUFunctionArgInfo::DISPATCH_ID);
7324 case Intrinsic::r600_read_ngroups_x:
7325 // TODO: Emit error for hsa
7326 return legalizeKernargMemParameter(MI, B,
7327 SI::KernelInputOffsets::NGROUPS_X);
7328 case Intrinsic::r600_read_ngroups_y:
7329 return legalizeKernargMemParameter(MI, B,
7330 SI::KernelInputOffsets::NGROUPS_Y);
7331 case Intrinsic::r600_read_ngroups_z:
7332 return legalizeKernargMemParameter(MI, B,
7333 SI::KernelInputOffsets::NGROUPS_Z);
7334 case Intrinsic::r600_read_local_size_x:
7335 // TODO: Could insert G_ASSERT_ZEXT from s16
7336 return legalizeKernargMemParameter(MI, B, SI::KernelInputOffsets::LOCAL_SIZE_X);
7337 case Intrinsic::r600_read_local_size_y:
7338 // TODO: Could insert G_ASSERT_ZEXT from s16
7339 return legalizeKernargMemParameter(MI, B, SI::KernelInputOffsets::LOCAL_SIZE_Y);
7340 // TODO: Could insert G_ASSERT_ZEXT from s16
7341 case Intrinsic::r600_read_local_size_z:
7342 return legalizeKernargMemParameter(MI, B, SI::KernelInputOffsets::LOCAL_SIZE_Z);
7343 case Intrinsic::r600_read_global_size_x:
7344 return legalizeKernargMemParameter(MI, B, SI::KernelInputOffsets::GLOBAL_SIZE_X);
7345 case Intrinsic::r600_read_global_size_y:
7346 return legalizeKernargMemParameter(MI, B, SI::KernelInputOffsets::GLOBAL_SIZE_Y);
7347 case Intrinsic::r600_read_global_size_z:
7348 return legalizeKernargMemParameter(MI, B, SI::KernelInputOffsets::GLOBAL_SIZE_Z);
7349 case Intrinsic::amdgcn_fdiv_fast:
7350 return legalizeFDIVFastIntrin(MI, MRI, B);
7351 case Intrinsic::amdgcn_is_shared:
7352 return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS);
7353 case Intrinsic::amdgcn_is_private:
7354 return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS);
7355 case Intrinsic::amdgcn_wavefrontsize: {
7356 B.buildConstant(MI.getOperand(0), ST.getWavefrontSize());
7357 MI.eraseFromParent();
7358 return true;
7359 }
7360 case Intrinsic::amdgcn_s_buffer_load:
7361 return legalizeSBufferLoad(Helper, MI);
7362 case Intrinsic::amdgcn_raw_buffer_store:
7363 case Intrinsic::amdgcn_raw_ptr_buffer_store:
7364 case Intrinsic::amdgcn_struct_buffer_store:
7365 case Intrinsic::amdgcn_struct_ptr_buffer_store:
7366 return legalizeBufferStore(MI, MRI, B, false, false);
7367 case Intrinsic::amdgcn_raw_buffer_store_format:
7368 case Intrinsic::amdgcn_raw_ptr_buffer_store_format:
7369 case Intrinsic::amdgcn_struct_buffer_store_format:
7370 case Intrinsic::amdgcn_struct_ptr_buffer_store_format:
7371 return legalizeBufferStore(MI, MRI, B, false, true);
7372 case Intrinsic::amdgcn_raw_tbuffer_store:
7373 case Intrinsic::amdgcn_raw_ptr_tbuffer_store:
7374 case Intrinsic::amdgcn_struct_tbuffer_store:
7375 case Intrinsic::amdgcn_struct_ptr_tbuffer_store:
7376 return legalizeBufferStore(MI, MRI, B, true, true);
7377 case Intrinsic::amdgcn_raw_buffer_load:
7378 case Intrinsic::amdgcn_raw_ptr_buffer_load:
7379 case Intrinsic::amdgcn_raw_atomic_buffer_load:
7380 case Intrinsic::amdgcn_raw_ptr_atomic_buffer_load:
7381 case Intrinsic::amdgcn_struct_buffer_load:
7382 case Intrinsic::amdgcn_struct_ptr_buffer_load:
7383 case Intrinsic::amdgcn_struct_atomic_buffer_load:
7384 case Intrinsic::amdgcn_struct_ptr_atomic_buffer_load:
7385 return legalizeBufferLoad(MI, MRI, B, false, false);
7386 case Intrinsic::amdgcn_raw_buffer_load_format:
7387 case Intrinsic::amdgcn_raw_ptr_buffer_load_format:
7388 case Intrinsic::amdgcn_struct_buffer_load_format:
7389 case Intrinsic::amdgcn_struct_ptr_buffer_load_format:
7390 return legalizeBufferLoad(MI, MRI, B, true, false);
7391 case Intrinsic::amdgcn_raw_tbuffer_load:
7392 case Intrinsic::amdgcn_raw_ptr_tbuffer_load:
7393 case Intrinsic::amdgcn_struct_tbuffer_load:
7394 case Intrinsic::amdgcn_struct_ptr_tbuffer_load:
7395 return legalizeBufferLoad(MI, MRI, B, true, true);
7396 case Intrinsic::amdgcn_raw_buffer_atomic_swap:
7397 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_swap:
7398 case Intrinsic::amdgcn_struct_buffer_atomic_swap:
7399 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_swap:
7400 case Intrinsic::amdgcn_raw_buffer_atomic_add:
7401 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_add:
7402 case Intrinsic::amdgcn_struct_buffer_atomic_add:
7403 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_add:
7404 case Intrinsic::amdgcn_raw_buffer_atomic_sub:
7405 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub:
7406 case Intrinsic::amdgcn_struct_buffer_atomic_sub:
7407 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub:
7408 case Intrinsic::amdgcn_raw_buffer_atomic_smin:
7409 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smin:
7410 case Intrinsic::amdgcn_struct_buffer_atomic_smin:
7411 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smin:
7412 case Intrinsic::amdgcn_raw_buffer_atomic_umin:
7413 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umin:
7414 case Intrinsic::amdgcn_struct_buffer_atomic_umin:
7415 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umin:
7416 case Intrinsic::amdgcn_raw_buffer_atomic_smax:
7417 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smax:
7418 case Intrinsic::amdgcn_struct_buffer_atomic_smax:
7419 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smax:
7420 case Intrinsic::amdgcn_raw_buffer_atomic_umax:
7421 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umax:
7422 case Intrinsic::amdgcn_struct_buffer_atomic_umax:
7423 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umax:
7424 case Intrinsic::amdgcn_raw_buffer_atomic_and:
7425 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_and:
7426 case Intrinsic::amdgcn_struct_buffer_atomic_and:
7427 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_and:
7428 case Intrinsic::amdgcn_raw_buffer_atomic_or:
7429 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_or:
7430 case Intrinsic::amdgcn_struct_buffer_atomic_or:
7431 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_or:
7432 case Intrinsic::amdgcn_raw_buffer_atomic_xor:
7433 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_xor:
7434 case Intrinsic::amdgcn_struct_buffer_atomic_xor:
7435 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_xor:
7436 case Intrinsic::amdgcn_raw_buffer_atomic_inc:
7437 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_inc:
7438 case Intrinsic::amdgcn_struct_buffer_atomic_inc:
7439 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_inc:
7440 case Intrinsic::amdgcn_raw_buffer_atomic_dec:
7441 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_dec:
7442 case Intrinsic::amdgcn_struct_buffer_atomic_dec:
7443 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_dec:
7444 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
7445 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap:
7446 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
7447 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cmpswap:
7448 case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
7449 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmin:
7450 case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
7451 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmin:
7452 case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
7453 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmax:
7454 case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
7455 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmax:
7456 case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
7457 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fadd:
7458 case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
7459 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fadd:
7460 return legalizeBufferAtomic(MI, B, IntrID);
7461 case Intrinsic::amdgcn_rsq_clamp:
7462 return legalizeRsqClampIntrinsic(MI, MRI, B);
7463 case Intrinsic::amdgcn_image_bvh_intersect_ray:
7464 return legalizeBVHIntrinsic(MI, B);
7465 case Intrinsic::amdgcn_swmmac_f16_16x16x32_f16:
7466 case Intrinsic::amdgcn_swmmac_bf16_16x16x32_bf16:
7467 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf16:
7468 case Intrinsic::amdgcn_swmmac_f32_16x16x32_f16:
7469 case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_fp8:
7470 case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_bf8:
7471 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_fp8:
7472 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_bf8: {
7473 Register Index = MI.getOperand(5).getReg();
7474 LLT S32 = LLT::scalar(32);
7475 if (MRI.getType(Index) != S32)
7476 MI.getOperand(5).setReg(B.buildAnyExt(S32, Index).getReg(0));
7477 return true;
7478 }
7479 case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu4:
7480 case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu8:
7481 case Intrinsic::amdgcn_swmmac_i32_16x16x64_iu4: {
7482 Register Index = MI.getOperand(7).getReg();
7483 LLT S32 = LLT::scalar(32);
7484 if (MRI.getType(Index) != S32)
7485 MI.getOperand(7).setReg(B.buildAnyExt(S32, Index).getReg(0));
7486 return true;
7487 }
7488 case Intrinsic::amdgcn_fmed3: {
7489 GISelChangeObserver &Observer = Helper.Observer;
7490
7491 // FIXME: This is to workaround the inability of tablegen match combiners to
7492 // match intrinsics in patterns.
7493 Observer.changingInstr(MI);
7494 MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_FMED3));
7495 MI.removeOperand(1);
7496 Observer.changedInstr(MI);
7497 return true;
7498 }
7499 case Intrinsic::amdgcn_readlane:
7500 case Intrinsic::amdgcn_writelane:
7501 case Intrinsic::amdgcn_readfirstlane:
7502 case Intrinsic::amdgcn_permlane16:
7503 case Intrinsic::amdgcn_permlanex16:
7504 case Intrinsic::amdgcn_permlane64:
7505 return legalizeLaneOp(Helper, MI, IntrID);
7506 default: {
7507 if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
7508 AMDGPU::getImageDimIntrinsicInfo(IntrID))
7509 return legalizeImageIntrinsic(MI, B, Helper.Observer, ImageDimIntr);
7510 return true;
7511 }
7512 }
7513
7514 return true;
7515}
unsigned const MachineRegisterInfo * MRI
MachineInstrBuilder & UseMI
MachineInstrBuilder MachineInstrBuilder & DefMI
static unsigned getIntrinsicID(const SDNode *N)
unsigned RegSize
unsigned Intr
static SDValue extractF64Exponent(SDValue Hi, const SDLoc &SL, SelectionDAG &DAG)
static SDValue getMad(SelectionDAG &DAG, const SDLoc &SL, EVT VT, SDValue X, SDValue Y, SDValue C, SDNodeFlags Flags=SDNodeFlags())
static bool valueIsKnownNeverF32Denorm(SDValue Src)
Return true if it's known that Src can never be an f32 denormal value.
Contains the definition of a TargetInstrInfo class that is common to all AMD GPUs.
static void packImage16bitOpsToDwords(MachineIRBuilder &B, MachineInstr &MI, SmallVectorImpl< Register > &PackedAddrs, unsigned ArgOffset, const AMDGPU::ImageDimIntrinsicInfo *Intr, bool IsA16, bool IsG16)
Turn a set of s16 typed registers in AddrRegs into a dword sized vector with s16 typed elements.
static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID)
static LLT getBufferRsrcScalarType(const LLT Ty)
static cl::opt< bool > EnableNewLegality("amdgpu-global-isel-new-legality", cl::desc("Use GlobalISel desired legality, rather than try to use" "rules compatible with selection patterns"), cl::init(false), cl::ReallyHidden)
static const LLT V3S64
static bool needsDenormHandlingF32(const MachineFunction &MF, Register Src, unsigned Flags)
static const LLT V16S16
static const LLT S128
static LegalizeMutation bitcastToVectorElement32(unsigned TypeIdx)
static const LLT V4S32
static const LLT V2S32
static const LLT V8S64
static LegalityPredicate isSmallOddVector(unsigned TypeIdx)
static LegalizeMutation oneMoreElement(unsigned TypeIdx)
static const LLT V12S32
static const LLT V8S32
static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size)
static bool allowApproxFunc(const MachineFunction &MF, unsigned Flags)
static const LLT V2S16
static bool shouldBitcastLoadStoreType(const GCNSubtarget &ST, const LLT Ty, const LLT MemTy)
Return true if a load or store of the type should be lowered with a bitcast to a different type.
static constexpr unsigned FPEnvModeBitField
static const LLT V4S64
static const LLT S1
static const LLT V3S32
static const LLT S64
static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size)
static bool shouldWidenLoad(const GCNSubtarget &ST, LLT MemoryTy, uint64_t AlignInBits, unsigned AddrSpace, unsigned Opcode)
Return true if we should legalize a load by widening an odd sized memory access up to the alignment.
static bool isRegisterType(LLT Ty)
static bool isRegisterVectorElementType(LLT EltTy)
static const LLT S32
static bool isRegisterSize(unsigned Size)
static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx)
static LegalityPredicate isWideVec16(unsigned TypeIdx)
static const LLT V6S32
static LegalizeMutation moreElementsToNextExistingRegClass(unsigned TypeIdx)
static std::initializer_list< LLT > AllS32Vectors
static const LLT V7S32
static const LLT V5S32
static const LLT V4S16
static const LLT V11S32
static const LLT F64
static Register castBufferRsrcToV4I32(Register Pointer, MachineIRBuilder &B)
Cast a buffer resource (an address space 8 pointer) into a 4xi32, which is the form in which the valu...
static std::initializer_list< LLT > AllS16Vectors
static const LLT V32S32
static std::pair< Register, Register > emitReciprocalU64(MachineIRBuilder &B, Register Val)
static LLT getBitcastRegisterType(const LLT Ty)
static LLT getBufferRsrcRegisterType(const LLT Ty)
static LegalizeMutation bitcastToRegisterType(unsigned TypeIdx)
static Register stripAnySourceMods(Register OrigSrc, MachineRegisterInfo &MRI)
static const LLT V9S32
static const LLT V10S32
static LLT castBufferRsrcFromV4I32(MachineInstr &MI, MachineIRBuilder &B, MachineRegisterInfo &MRI, unsigned Idx)
Mutates IR (typicaly a load instruction) to use a <4 x s32> as the initial type of the operand idx an...
static bool replaceWithConstant(MachineIRBuilder &B, MachineInstr &MI, int64_t C)
static constexpr unsigned SPDenormModeBitField
static const LLT V12S16
static const LLT V16S64
static const LLT S512
static unsigned maxSizeForAddrSpace(const GCNSubtarget &ST, unsigned AS, bool IsLoad, bool IsAtomic)
static bool isLoadStoreSizeLegal(const GCNSubtarget &ST, const LegalityQuery &Query)
static const LLT V16S32
static const LLT V7S64
static MachineInstr * verifyCFIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI, MachineInstr *&Br, MachineBasicBlock *&UncondBrTarget, bool &Negated)
static bool isRegisterClassType(LLT Ty)
static const LLT V5S64
static const LLT S160
static LegalityPredicate numElementsNotEven(unsigned TypeIdx)
static void castBufferRsrcArgToV4I32(MachineInstr &MI, MachineIRBuilder &B, unsigned Idx)
static const LLT V4S128
static constexpr unsigned FPEnvTrapBitField
static LegalityPredicate isIllegalRegisterType(unsigned TypeIdx)
static const LLT V2BF16
static const LLT V6S64
static constexpr unsigned MaxRegisterSize
static const LLT V2S8
static LegalityPredicate isWideScalarExtLoadTruncStore(unsigned TypeIdx)
static const LLT MaxScalar
static bool hasBufferRsrcWorkaround(const LLT Ty)
static void toggleSPDenormMode(bool Enable, MachineIRBuilder &B, const GCNSubtarget &ST, SIModeRegisterDefaults Mode)
static std::initializer_list< LLT > AllS64Vectors
static bool loadStoreBitcastWorkaround(const LLT Ty)
static const LLT S96
static const LLT V2S64
static LLT widenToNextPowerOf2(LLT Ty)
static bool isNot(const MachineRegisterInfo &MRI, const MachineInstr &MI)
static const LLT S16
static const LLT V10S16
static void convertImageAddrToPacked(MachineIRBuilder &B, MachineInstr &MI, int DimIdx, int NumVAddrs)
Convert from separate vaddr components to a single vector address register, and replace the remaining...
static bool isLoadStoreLegal(const GCNSubtarget &ST, const LegalityQuery &Query)
static const LLT V2S128
static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx)
static std::initializer_list< LLT > AllScalarTypes
static const LLT V2F16
static const LLT S256
static LLT getPow2VectorType(LLT Ty)
static void buildBufferLoad(unsigned Opc, Register LoadDstReg, Register RSrc, Register VIndex, Register VOffset, Register SOffset, unsigned ImmOffset, unsigned Format, unsigned AuxiliaryData, MachineMemOperand *MMO, bool IsTyped, bool HasVIndex, MachineIRBuilder &B)
static LLT getPow2ScalarType(LLT Ty)
static LegalityPredicate elementTypeIsLegal(unsigned TypeIdx)
static const LLT S8
static const LLT V6S16
static bool isRegisterVectorType(LLT Ty)
static const LLT S224
static const LLT V8S16
static LegalityPredicate sizeIsMultipleOf32(unsigned TypeIdx)
static bool isKnownNonNull(Register Val, MachineRegisterInfo &MRI, const AMDGPUTargetMachine &TM, unsigned AddrSpace)
Return true if the value is a known valid address, such that a null check is not necessary.
static const LLT F32
This file declares the targeting of the Machinelegalizer class for AMDGPU.
Provides AMDGPU specific target descriptions.
The AMDGPU TargetMachine interface definition for hw codegen targets.
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
static Error unsupported(const char *Str, const Triple &T)
Definition: MachO.cpp:71
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
Analysis containing CSE Info
Definition: CSEInfo.cpp:27
Returns the sub type a function will return at a given Idx Should correspond to the result type of an ExtractValue instruction executed with just that one unsigned Idx
uint64_t Size
static GCMetadataPrinterRegistry::Add< ErlangGCPrinter > X("erlang", "erlang-compatible garbage collector")
Declares convenience wrapper classes for interpreting MachineInstr instances as specific generic oper...
IRTranslator LLVM IR MI
static LVOptions Options
Definition: LVOptions.cpp:25
#define F(x, y, z)
Definition: MD5.cpp:55
#define I(x, y, z)
Definition: MD5.cpp:58
Contains matchers for matching SSA Machine Instructions.
This file declares the MachineIRBuilder class.
unsigned const TargetRegisterInfo * TRI
#define R2(n)
static unsigned getReg(const MCDisassembler *D, unsigned RC, unsigned RegNo)
static GCMetadataPrinterRegistry::Add< OcamlGCMetadataPrinter > Y("ocaml", "ocaml 3.10-compatible collector")
ppc ctr loops verify
const char LLVMTargetMachineRef TM
R600 Clause Merge
const SmallVectorImpl< MachineOperand > & Cond
#define CH(x, y, z)
Definition: SHA256.cpp:34
#define FP_DENORM_FLUSH_NONE
Definition: SIDefines.h:1171
Interface definition for SIInstrInfo.
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
Interface definition for SIRegisterInfo.
This file defines the make_scope_exit function, which executes user-defined cleanup logic at scope ex...
static constexpr int Concat[]
Value * RHS
Value * LHS
void buildMultiply(LegalizerHelper &Helper, MutableArrayRef< Register > Accum, ArrayRef< Register > Src0, ArrayRef< Register > Src1, bool UsePartialMad64_32, bool SeparateOddAlignedProducts) const
bool legalizeGlobalValue(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFPTruncRound(MachineInstr &MI, MachineIRBuilder &B) const
bool legalizeFSQRTF16(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeIntrinsicTrunc(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
std::pair< Register, unsigned > splitBufferOffsets(MachineIRBuilder &B, Register OrigOffset) const
bool legalizeIsAddrSpace(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B, unsigned AddrSpace) const
bool legalizeUnsignedDIV_REM(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFSQRTF32(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeAtomicCmpXChg(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeTrapHsa(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeMul(LegalizerHelper &Helper, MachineInstr &MI) const
bool legalizeFFREXP(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
Register getSegmentAperture(unsigned AddrSpace, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFDIV64(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizePointerAsRsrcIntrin(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
To create a buffer resource from a 64-bit pointer, mask off the upper 32 bits of the pointer and repl...
bool legalizeFlogCommon(MachineInstr &MI, MachineIRBuilder &B) const
bool getLDSKernelId(Register DstReg, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFExp2(MachineInstr &MI, MachineIRBuilder &B) const
bool legalizeTrap(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeBufferAtomic(MachineInstr &MI, MachineIRBuilder &B, Intrinsic::ID IID) const
void legalizeUnsignedDIV_REM32Impl(MachineIRBuilder &B, Register DstDivReg, Register DstRemReg, Register Num, Register Den) const
Register handleD16VData(MachineIRBuilder &B, MachineRegisterInfo &MRI, Register Reg, bool ImageStore=false) const
Handle register layout difference for f16 images for some subtargets.
bool legalizeCTLZ_CTTZ(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeBuildVector(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFFloor(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
AMDGPULegalizerInfo(const GCNSubtarget &ST, const GCNTargetMachine &TM)
bool legalizeFDIV32(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFMad(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFDIV(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFExp(MachineInstr &MI, MachineIRBuilder &B) const
bool legalizeIntrinsic(LegalizerHelper &Helper, MachineInstr &MI) const override
bool legalizeFrem(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizePreloadedArgIntrin(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B, AMDGPUFunctionArgInfo::PreloadedValue ArgType) const
bool legalizeBufferStore(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B, bool IsTyped, bool IsFormat) const
bool legalizeStore(LegalizerHelper &Helper, MachineInstr &MI) const
bool legalizeCustom(LegalizerHelper &Helper, MachineInstr &MI, LostDebugLocObserver &LocObserver) const override
Called for instructions with the Custom LegalizationAction.
bool buildPCRelGlobalAddress(Register DstReg, LLT PtrTy, MachineIRBuilder &B, const GlobalValue *GV, int64_t Offset, unsigned GAFlags=SIInstrInfo::MO_NONE) const
bool legalizeFDIV16(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeRsqClampIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
std::pair< Register, Register > getScaledLogInput(MachineIRBuilder &B, Register Src, unsigned Flags) const
bool legalizeFDIVFastIntrin(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeInsertVectorElt(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFExpUnsafe(MachineIRBuilder &B, Register Dst, Register Src, unsigned Flags) const
bool legalizeAddrSpaceCast(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
Register fixStoreSourceType(MachineIRBuilder &B, Register VData, bool IsFormat) const
bool legalizeImplicitArgPtr(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeMinNumMaxNum(LegalizerHelper &Helper, MachineInstr &MI) const
void legalizeUnsignedDIV_REM64Impl(MachineIRBuilder &B, Register DstDivReg, Register DstRemReg, Register Num, Register Den) const
bool legalizeBVHIntrinsic(MachineInstr &MI, MachineIRBuilder &B) const
bool legalizeDebugTrap(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFastUnsafeFDIV(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeSinCos(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeWaveID(MachineInstr &MI, MachineIRBuilder &B) const
bool legalizeFroundeven(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeLDSKernelId(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeSignedDIV_REM(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeITOFP(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B, bool Signed) const
bool legalizeBufferLoad(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B, bool IsFormat, bool IsTyped) const
bool legalizeFPow(MachineInstr &MI, MachineIRBuilder &B) const
bool legalizeFastUnsafeFDIV64(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFPTOI(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B, bool Signed) const
bool legalizeStackSave(MachineInstr &MI, MachineIRBuilder &B) const
bool legalizeFlogUnsafe(MachineIRBuilder &B, Register Dst, Register Src, bool IsLog10, unsigned Flags) const
bool legalizeKernargMemParameter(MachineInstr &MI, MachineIRBuilder &B, uint64_t Offset, Align Alignment=Align(4)) const
Legalize a value that's loaded from kernel arguments.
bool legalizeImageIntrinsic(MachineInstr &MI, MachineIRBuilder &B, GISelChangeObserver &Observer, const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr) const
Rewrite image intrinsics to use register layouts expected by the subtarget.
void buildAbsGlobalAddress(Register DstReg, LLT PtrTy, MachineIRBuilder &B, const GlobalValue *GV, MachineRegisterInfo &MRI) const
bool legalizeGetFPEnv(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool getImplicitArgPtr(Register DstReg, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFSQRT(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
Register getKernargParameterPtr(MachineIRBuilder &B, int64_t Offset) const
bool legalizeSBufferLoad(LegalizerHelper &Helper, MachineInstr &MI) const
bool legalizeFceil(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool loadInputValue(Register DstReg, MachineIRBuilder &B, const ArgDescriptor *Arg, const TargetRegisterClass *ArgRC, LLT ArgTy) const
bool legalizeFSQRTF64(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeExtractVectorElt(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeLoad(LegalizerHelper &Helper, MachineInstr &MI) const
bool legalizeCTLZ_ZERO_UNDEF(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeLaneOp(LegalizerHelper &Helper, MachineInstr &MI, Intrinsic::ID IID) const
bool legalizeSetFPEnv(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeWorkitemIDIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B, unsigned Dim, AMDGPUFunctionArgInfo::PreloadedValue ArgType) const
bool legalizeTrapHsaQueuePtr(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFlog2(MachineInstr &MI, MachineIRBuilder &B) const
bool legalizeTrapEndpgm(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
static std::optional< uint32_t > getLDSKernelIdMetadata(const Function &F)
void setDynLDSAlign(const Function &F, const GlobalVariable &GV)
unsigned allocateLDSGlobal(const DataLayout &DL, const GlobalVariable &GV)
bool hasMadMacF32Insts() const
unsigned getMaxWorkitemID(const Function &Kernel, unsigned Dimension) const
Return the maximum workitem ID value in the function, for the given (0, 1, 2) dimension.
bool has16BitInsts() const
bool hasFastFMAF32() const
bool hasTrigReducedRange() const
unsigned getWavefrontSize() const
bool hasVOP3PInsts() const
uint32_t getImplicitParameterOffset(const MachineFunction &MF, const ImplicitParameter Param) const
Helper function that returns the byte offset of the given type of implicit parameter.
static APFloat getSmallestNormalized(const fltSemantics &Sem, bool Negative=false)
Returns the smallest (by magnitude) normalized finite number in the given semantics.
Definition: APFloat.h:1064
static APFloat getLargest(const fltSemantics &Sem, bool Negative=false)
Returns the largest finite number in the given semantics.
Definition: APFloat.h:1044
static APFloat getInf(const fltSemantics &Sem, bool Negative=false)
Factory for Positive and Negative Infinity.
Definition: APFloat.h:1004
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition: ArrayRef.h:41
size_t size() const
size - Get the array size.
Definition: ArrayRef.h:165
@ ICMP_SLT
signed less than
Definition: InstrTypes.h:786
@ FCMP_OLT
0 1 0 0 True if ordered and less than
Definition: InstrTypes.h:763
@ FCMP_OGT
0 0 1 0 True if ordered and greater than
Definition: InstrTypes.h:761
@ ICMP_UGE
unsigned greater or equal
Definition: InstrTypes.h:781
@ ICMP_SGT
signed greater than
Definition: InstrTypes.h:784
@ FCMP_ONE
0 1 1 0 True if ordered and operands are unequal
Definition: InstrTypes.h:765
@ FCMP_OLE
0 1 0 1 True if ordered and less than or equal
Definition: InstrTypes.h:764
@ FCMP_ORD
0 1 1 1 True if ordered (no nans)
Definition: InstrTypes.h:766
@ ICMP_EQ
equal
Definition: InstrTypes.h:778
@ ICMP_NE
not equal
Definition: InstrTypes.h:779
This is the shared class of boolean and integer constants.
Definition: Constants.h:81
int64_t getSExtValue() const
Return the constant as a 64-bit integer value after it has been sign extended as appropriate for the ...
Definition: Constants.h:161
This class represents an Operation in the Expression.
A debug info location.
Definition: DebugLoc.h:33
Diagnostic information for unsupported feature in backend.
static constexpr ElementCount getFixed(ScalarTy MinVal)
Definition: TypeSize.h:311
LLVMContext & getContext() const
getContext - Return a reference to the LLVMContext associated with this function.
Definition: Function.cpp:358
bool hasA16() const
bool hasImageStoreD16Bug() const
bool hasUsableDivScaleConditionOutput() const
Condition output from div_scale is usable.
Definition: GCNSubtarget.h:477
bool hasArchitectedSGPRs() const
bool hasPrivEnabledTrap2NopBug() const
const SIInstrInfo * getInstrInfo() const override
Definition: GCNSubtarget.h:266
bool hasScalarSubwordLoads() const
Definition: GCNSubtarget.h:455
bool supportsGetDoorbellID() const
Definition: GCNSubtarget.h:461
bool hasMad64_32() const
Definition: GCNSubtarget.h:741
TrapHandlerAbi getTrapHandlerAbi() const
Definition: GCNSubtarget.h:457
bool hasIntClamp() const
Definition: GCNSubtarget.h:357
bool hasGFX10_AEncoding() const
const SITargetLowering * getTargetLowering() const override
Definition: GCNSubtarget.h:274
bool hasPackedFP32Ops() const
bool hasFullRate64Ops() const
Definition: GCNSubtarget.h:377
bool isTrapHandlerEnabled() const
Definition: GCNSubtarget.h:601
unsigned getNSAThreshold(const MachineFunction &MF) const
bool hasScalarSMulU64() const
Definition: GCNSubtarget.h:730
bool hasNSAEncoding() const
bool hasG16() const
unsigned getNSAMaxSize(bool HasSampler=false) const
bool hasScalarDwordx3Loads() const
Generation getGeneration() const
Definition: GCNSubtarget.h:317
bool hasScalarAddSub64() const
Definition: GCNSubtarget.h:728
bool hasUnpackedD16VMem() const
Definition: GCNSubtarget.h:732
bool hasAddNoCarry() const
Definition: GCNSubtarget.h:724
bool hasFractBug() const
Definition: GCNSubtarget.h:395
bool hasPartialNSAEncoding() const
Abstract class that contains various methods for clients to notify about changes.
virtual void changingInstr(MachineInstr &MI)=0
This instruction is about to be mutated in some way.
virtual void changedInstr(MachineInstr &MI)=0
This instruction was mutated in some way.
KnownBits getKnownBits(Register R)
Simple wrapper observer that takes several observers, and calls each one for each event.
bool hasExternalLinkage() const
Definition: GlobalValue.h:511
Module * getParent()
Get the module that this global value is contained inside of...
Definition: GlobalValue.h:656
Type * getValueType() const
Definition: GlobalValue.h:296
static constexpr LLT float64()
Get a 64-bit IEEE double value.
Definition: LowLevelType.h:94
constexpr unsigned getScalarSizeInBits() const
Definition: LowLevelType.h:267
constexpr bool isScalar() const
Definition: LowLevelType.h:146
constexpr LLT changeElementType(LLT NewEltTy) const
If this type is a vector, return a vector with the same number of elements but the new element type.
Definition: LowLevelType.h:214
static constexpr LLT vector(ElementCount EC, unsigned ScalarSizeInBits)
Get a low-level vector of some number of elements and element width.
Definition: LowLevelType.h:64
constexpr bool isPointerVector() const
Definition: LowLevelType.h:152
static constexpr LLT scalar(unsigned SizeInBits)
Get a low-level scalar or aggregate "bag of bits".
Definition: LowLevelType.h:42
constexpr uint16_t getNumElements() const
Returns the number of elements in a vector LLT.
Definition: LowLevelType.h:159
constexpr bool isVector() const
Definition: LowLevelType.h:148
static constexpr LLT pointer(unsigned AddressSpace, unsigned SizeInBits)
Get a low-level pointer in the given address space.
Definition: LowLevelType.h:57
constexpr TypeSize getSizeInBits() const
Returns the total size of the type. Must only be called on sized types.
Definition: LowLevelType.h:193
constexpr bool isPointer() const
Definition: LowLevelType.h:149
constexpr LLT getElementType() const
Returns the vector's element type. Only valid for vector types.
Definition: LowLevelType.h:290
constexpr ElementCount getElementCount() const
Definition: LowLevelType.h:184
constexpr LLT changeElementSize(unsigned NewEltSize) const
If this type is a vector, return a vector with the same number of elements but the new element size.
Definition: LowLevelType.h:221
static constexpr LLT float16()
Get a 16-bit IEEE half value.
Definition: LowLevelType.h:84
constexpr unsigned getAddressSpace() const
Definition: LowLevelType.h:280
static constexpr LLT fixed_vector(unsigned NumElements, unsigned ScalarSizeInBits)
Get a low-level fixed-width vector of some number of elements and element width.
Definition: LowLevelType.h:100
constexpr bool isPointerOrPointerVector() const
Definition: LowLevelType.h:153
constexpr LLT changeElementCount(ElementCount EC) const
Return a vector or scalar with the same element type and the new element count.
Definition: LowLevelType.h:230
constexpr LLT getScalarType() const
Definition: LowLevelType.h:208
static constexpr LLT scalarOrVector(ElementCount EC, LLT ScalarTy)
Definition: LowLevelType.h:124
static constexpr LLT float32()
Get a 32-bit IEEE float value.
Definition: LowLevelType.h:89
This is an important class for using LLVM in a threaded context.
Definition: LLVMContext.h:67
void diagnose(const DiagnosticInfo &DI)
Report a message to the currently installed diagnostic handler.
LegalizeRuleSet & minScalar(unsigned TypeIdx, const LLT Ty)
Ensure the scalar is at least as wide as Ty.
LegalizeRuleSet & legalFor(std::initializer_list< LLT > Types)
The instruction is legal when type index 0 is any type in the given list.
LegalizeRuleSet & fewerElementsIf(LegalityPredicate Predicate, LegalizeMutation Mutation)
Remove elements to reach the type selected by the mutation if the predicate is true.
LegalizeRuleSet & clampScalarOrElt(unsigned TypeIdx, const LLT MinTy, const LLT MaxTy)
Limit the range of scalar sizes to MinTy and MaxTy.
LegalizeRuleSet & maxScalar(unsigned TypeIdx, const LLT Ty)
Ensure the scalar is at most as wide as Ty.
LegalizeRuleSet & clampMaxNumElements(unsigned TypeIdx, const LLT EltTy, unsigned MaxElements)
Limit the number of elements in EltTy vectors to at most MaxElements.
LegalizeRuleSet & lower()
The instruction is lowered.
LegalizeRuleSet & moreElementsIf(LegalityPredicate Predicate, LegalizeMutation Mutation)
Add more elements to reach the type selected by the mutation if the predicate is true.
LegalizeRuleSet & clampScalar(unsigned TypeIdx, const LLT MinTy, const LLT MaxTy)
Limit the range of scalar sizes to MinTy and MaxTy.
LegalizeRuleSet & custom()
Unconditionally custom lower.
LegalizeRuleSet & clampMaxNumElementsStrict(unsigned TypeIdx, const LLT EltTy, unsigned NumElts)
Express EltTy vectors strictly using vectors with NumElts elements (or scalars when NumElts equals 1)...
LegalizeRuleSet & alwaysLegal()
LegalizeRuleSet & customIf(LegalityPredicate Predicate)
LegalizeRuleSet & widenScalarToNextPow2(unsigned TypeIdx, unsigned MinSize=0)
Widen the scalar to the next power of two that is at least MinSize.
LegalizeRuleSet & scalarize(unsigned TypeIdx)
LegalizeRuleSet & legalIf(LegalityPredicate Predicate)
The instruction is legal if predicate is true.
LegalizeRuleSet & customFor(std::initializer_list< LLT > Types)
LegalizeRuleSet & widenScalarToNextMultipleOf(unsigned TypeIdx, unsigned Size)
Widen the scalar to the next multiple of Size.
LegalizeResult lowerFMinNumMaxNum(MachineInstr &MI)
void moreElementsVectorDst(MachineInstr &MI, LLT MoreTy, unsigned OpIdx)
Legalize a single operand OpIdx of the machine instruction MI as a Def by performing it with addition...
@ Legalized
Instruction has been legalized and the MachineFunction changed.
GISelChangeObserver & Observer
To keep track of changes made by the LegalizerHelper.
void bitcastDst(MachineInstr &MI, LLT CastTy, unsigned OpIdx)
Legalize a single operand OpIdx of the machine instruction MI as a def by inserting a G_BITCAST from ...
LegalizeResult lowerFMad(MachineInstr &MI)
GISelKnownBits * getKnownBits() const
MachineIRBuilder & MIRBuilder
Expose MIRBuilder so clients can set their own RecordInsertInstruction functions.
void widenScalarDst(MachineInstr &MI, LLT WideTy, unsigned OpIdx=0, unsigned TruncOpcode=TargetOpcode::G_TRUNC)
Legalize a single operand OpIdx of the machine instruction MI as a Def by extending the operand's typ...
LegalizeRuleSet & getActionDefinitionsBuilder(unsigned Opcode)
Get the action definition builder for the given opcode.
TypeSize getValue() const
Wrapper class representing physical registers. Should be passed by value.
Definition: MCRegister.h:33
void addSuccessor(MachineBasicBlock *Succ, BranchProbability Prob=BranchProbability::getUnknown())
Add Succ as a successor of this MachineBasicBlock.
MachineBasicBlock * splitAt(MachineInstr &SplitInst, bool UpdateLiveIns=true, LiveIntervals *LIS=nullptr)
Split a basic block into 2 pieces at SplitPoint.
const MachineFunction * getParent() const
Return the MachineFunction containing this basic block.
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
MachineMemOperand * getMachineMemOperand(MachinePointerInfo PtrInfo, MachineMemOperand::Flags f, LLT MemTy, Align base_alignment, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr, SyncScope::ID SSID=SyncScope::System, AtomicOrdering Ordering=AtomicOrdering::NotAtomic, AtomicOrdering FailureOrdering=AtomicOrdering::NotAtomic)
getMachineMemOperand - Allocate a new MachineMemOperand.
DenormalMode getDenormalMode(const fltSemantics &FPType) const
Returns the denormal handling type for the default rounding mode of the function.
void push_back(MachineBasicBlock *MBB)
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
Function & getFunction()
Return the LLVM function that this machine code represents.
const LLVMTargetMachine & getTarget() const
getTarget - Return the target machine this machine code is compiled with
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do...
MachineBasicBlock * CreateMachineBasicBlock(const BasicBlock *BB=nullptr, std::optional< UniqueBBID > BBID=std::nullopt)
CreateMachineBasicBlock - Allocate a new MachineBasicBlock.
Helper class to build MachineInstr.
MachineFunction & getMF()
Getter for the function we currently build.
Register getReg(unsigned Idx) const
Get the register for the operand index.
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
const MachineInstrBuilder & addGlobalAddress(const GlobalValue *GV, int64_t Offset=0, unsigned TargetFlags=0) const
const MachineInstrBuilder & addMBB(MachineBasicBlock *MBB, unsigned TargetFlags=0) const
Representation of each machine instruction.
Definition: MachineInstr.h:69
unsigned getOpcode() const
Returns the opcode of this MachineInstr.
Definition: MachineInstr.h:569
const MachineBasicBlock * getParent() const
Definition: MachineInstr.h:346
const MachineOperand & getOperand(unsigned i) const
Definition: MachineInstr.h:579
A description of a memory reference used in the backend.
LocationSize getSize() const
Return the size in bytes of the memory reference.
LLT getMemoryType() const
Return the memory type of the memory reference.
@ MODereferenceable
The memory access is dereferenceable (i.e., doesn't trap).
@ MOLoad
The memory access reads data.
@ MOInvariant
The memory access always returns the same value (or traps).
Align getAlign() const
Return the minimum known alignment in bytes of the actual memory reference.
MachineOperand class - Representation of each machine instruction operand.
MachineBasicBlock * getMBB() const
void setReg(Register Reg)
Change the register this operand corresponds to.
void setMBB(MachineBasicBlock *MBB)
static MachineOperand CreateImm(int64_t Val)
Register getReg() const
getReg - Returns the register number.
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
MutableArrayRef - Represent a mutable reference to an array (0 or more elements consecutively in memo...
Definition: ArrayRef.h:307
MutableArrayRef< T > drop_front(size_t N=1) const
Drop the first N elements of the array.
Definition: ArrayRef.h:387
MutableArrayRef< T > take_front(size_t N=1) const
Return a copy of *this with only the first N elements.
Definition: ArrayRef.h:412
Wrapper class representing virtual and physical registers.
Definition: Register.h:19
constexpr bool isValid() const
Definition: Register.h:116
constexpr bool isVirtual() const
Return true if the specified register number is in the virtual register namespace.
Definition: Register.h:91
static constexpr bool isPhysicalRegister(unsigned Reg)
Return true if the specified register number is in the physical register namespace.
Definition: Register.h:65
static unsigned getMaxMUBUFImmOffset(const GCNSubtarget &ST)
MachineBasicBlock * insertSimulatedTrap(MachineRegisterInfo &MRI, MachineBasicBlock &MBB, MachineInstr &MI, const DebugLoc &DL) const
Build instructions that simulate the behavior of a s_trap 2 instructions for hardware (namely,...
This class keeps track of the SPI_SP_INPUT_ADDR config register, which tells the hardware which inter...
SIModeRegisterDefaults getMode() const
std::tuple< const ArgDescriptor *, const TargetRegisterClass *, LLT > getPreloadedValue(AMDGPUFunctionArgInfo::PreloadedValue Value) const
static LLVM_READONLY const TargetRegisterClass * getSGPRClassForBitWidth(unsigned BitWidth)
bool allowsMisalignedMemoryAccessesImpl(unsigned Size, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *IsFast=nullptr) const
bool shouldEmitFixup(const GlobalValue *GV) const
bool shouldUseLDSConstAddress(const GlobalValue *GV) const
bool shouldEmitPCReloc(const GlobalValue *GV) const
size_t size() const
Definition: SmallVector.h:91
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
Definition: SmallVector.h:586
void truncate(size_type N)
Like resize, but requires that N is less than size().
Definition: SmallVector.h:657
void resize(size_type N)
Definition: SmallVector.h:651
void push_back(const T &Elt)
Definition: SmallVector.h:426
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1209
int64_t getImm() const
Register getReg() const
Register getStackPointerRegisterToSaveRestore() const
If a physical register, this specifies the register that llvm.savestack/llvm.restorestack should save...
TargetOptions Options
unsigned UnsafeFPMath
UnsafeFPMath - This flag is enabled when the -enable-unsafe-fp-math flag is specified on the command ...
The instances of the Type class are immutable: once they are created, they are never changed.
Definition: Type.h:45
A Use represents the edge between a Value definition and its users.
Definition: Use.h:43
StringRef getName() const
Return a constant reference to the value's name.
Definition: Value.cpp:309
constexpr ScalarTy getFixedValue() const
Definition: TypeSize.h:202
self_iterator getIterator()
Definition: ilist_node.h:132
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
@ CONSTANT_ADDRESS_32BIT
Address space for 32-bit constant memory.
@ BUFFER_STRIDED_POINTER
Address space for 192-bit fat buffer pointers with an additional index.
@ REGION_ADDRESS
Address space for region memory. (GDS)
@ LOCAL_ADDRESS
Address space for local memory.
@ CONSTANT_ADDRESS
Address space for constant memory (VTX2).
@ FLAT_ADDRESS
Address space for flat memory.
@ GLOBAL_ADDRESS
Address space for global memory (RAT0, VTX0).
@ BUFFER_FAT_POINTER
Address space for 160-bit buffer fat pointers.
@ PRIVATE_ADDRESS
Address space for private memory.
@ BUFFER_RESOURCE
Address space for 128-bit buffer resources.
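The address-space tags above are what pair with LLT pointer types when legality rules are written. The sketch below is not code from this file; the pointer widths follow the usual AMDGPU conventions (64-bit flat/global, 32-bit local/private, 160-bit buffer fat pointers), and the header paths assume a recent LLVM tree.

#include "llvm/CodeGenTypes/LowLevelType.h" // LLT (recent LLVM layout)
#include "llvm/Support/AMDGPUAddrSpace.h"   // AMDGPUAS enumerators

using namespace llvm;

static const LLT GlobalPtr  = LLT::pointer(AMDGPUAS::GLOBAL_ADDRESS, 64);
static const LLT FlatPtr    = LLT::pointer(AMDGPUAS::FLAT_ADDRESS, 64);
static const LLT LocalPtr   = LLT::pointer(AMDGPUAS::LOCAL_ADDRESS, 32);
static const LLT PrivatePtr = LLT::pointer(AMDGPUAS::PRIVATE_ADDRESS, 32);
static const LLT BufFatPtr  = LLT::pointer(AMDGPUAS::BUFFER_FAT_POINTER, 160);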
int getMIMGOpcode(unsigned BaseOpcode, unsigned MIMGEncoding, unsigned VDataDwords, unsigned VAddrDwords)
bool isFlatGlobalAddrSpace(unsigned AS)
Definition: AMDGPU.h:415
bool isGFX12Plus(const MCSubtargetInfo &STI)
bool isEntryFunctionCC(CallingConv::ID CC)
LLVM_READNONE bool isKernel(CallingConv::ID CC)
bool isGFX11(const MCSubtargetInfo &STI)
bool isCompute(CallingConv::ID cc)
unsigned getAMDHSACodeObjectVersion(const Module &M)
bool isGFX11Plus(const MCSubtargetInfo &STI)
LLVM_READONLY const MIMGBaseOpcodeInfo * getMIMGBaseOpcodeInfo(unsigned BaseOpcode)
const ImageDimIntrinsicInfo * getImageDimIntrinsicInfo(unsigned Intr)
std::pair< Register, unsigned > getBaseWithConstantOffset(MachineRegisterInfo &MRI, Register Reg, GISelKnownBits *KnownBits=nullptr, bool CheckNUW=false)
Returns base register and constant offset.
@ MaxID
The highest possible ID. Must be some 2^k - 1.
Definition: CallingConv.h:274
@ AMDGPU_Gfx
Used for AMD graphics targets.
Definition: CallingConv.h:232
@ Fast
Attempts to make calls as fast as possible (e.g.
Definition: CallingConv.h:41
@ C
The default llvm calling convention, compatible with C.
Definition: CallingConv.h:34
LegalityPredicate scalarOrEltWiderThan(unsigned TypeIdx, unsigned Size)
True iff the specified type index is a scalar or a vector with an element type that's wider than the ...
LegalityPredicate isScalar(unsigned TypeIdx)
True iff the specified type index is a scalar.
LegalityPredicate isPointer(unsigned TypeIdx)
True iff the specified type index is a pointer (with any address space).
LegalityPredicate typeInSet(unsigned TypeIdx, std::initializer_list< LLT > TypesInit)
True iff the given type index is one of the specified types.
LegalityPredicate smallerThan(unsigned TypeIdx0, unsigned TypeIdx1)
True iff the first type index has a smaller total bit size than the second type index.
LegalityPredicate largerThan(unsigned TypeIdx0, unsigned TypeIdx1)
True iff the first type index has a larger total bit size than the second type index.
LegalityPredicate elementTypeIs(unsigned TypeIdx, LLT EltTy)
True if the type index is a vector with element type EltTy.
LegalityPredicate sameSize(unsigned TypeIdx0, unsigned TypeIdx1)
True iff the specified type indices are both the same bit size.
LegalityPredicate scalarOrEltNarrowerThan(unsigned TypeIdx, unsigned Size)
True iff the specified type index is a scalar or vector with an element type that's narrower than the...
LegalityPredicate sizeIs(unsigned TypeIdx, unsigned Size)
True if the total bitwidth of the specified type index is Size bits.
LegalityPredicate typeIsNot(unsigned TypeIdx, LLT Type)
True iff the given type index is not the specified type.
Predicate all(Predicate P0, Predicate P1)
True iff P0 and P1 are true.
LegalityPredicate typeIs(unsigned TypeIdx, LLT TypesInit)
True iff the given type index is the specified type.
LegalityPredicate scalarNarrowerThan(unsigned TypeIdx, unsigned Size)
True iff the specified type index is a scalar that's narrower than the given size.
LegalizeMutation scalarize(unsigned TypeIdx)
Break up the vector type for the given type index into the element type.
LegalizeMutation widenScalarOrEltToNextPow2(unsigned TypeIdx, unsigned Min=0)
Widen the scalar type or vector element type for the given type index to the next power of 2.
LegalizeMutation changeTo(unsigned TypeIdx, LLT Ty)
Select this specific type for the given type index.
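The predicates and mutations above are meant to be composed inside a LegalizerInfo rule set. The sketch below is illustrative only (the opcode and bounds are placeholders, not the rules this file installs), but every helper it uses is one of the entries documented here.

#include "llvm/CodeGen/GlobalISel/LegalizerInfo.h"
#include "llvm/CodeGen/TargetOpcodes.h"

using namespace llvm;
using namespace LegalityPredicates;
using namespace LegalizeMutations;

static void buildExampleRules(LegalizerInfo &LI) {
  const LLT S32 = LLT::scalar(32);
  const LLT S64 = LLT::scalar(64);

  LI.getActionDefinitionsBuilder(TargetOpcode::G_AND)
      .legalFor({S32, S64})
      // Predicate + mutation pair: scalars narrower than 32 bits are widened,
      // and changeTo(0, S32) selects the replacement type for type index 0.
      .widenScalarIf(scalarNarrowerThan(0, 32), changeTo(0, S32))
      // all(...) requires both predicates to hold before the action fires.
      .narrowScalarIf(all(isScalar(0), scalarOrEltWiderThan(0, 64)),
                      changeTo(0, S64))
      // Any remaining vector type is broken up into its element type.
      .scalarize(0);
}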
@ Implicit
Not emitted register (e.g. carry, or temporary result).
@ ReallyHidden
Definition: CommandLine.h:138
initializer< Ty > init(const Ty &Val)
Definition: CommandLine.h:443
constexpr double inv_pi
Definition: MathExtras.h:54
constexpr double ln2
Definition: MathExtras.h:49
constexpr double ln10
Definition: MathExtras.h:50
constexpr float log2ef
Definition: MathExtras.h:66
constexpr double log2e
Definition: MathExtras.h:51
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:18
Register getFunctionLiveInPhysReg(MachineFunction &MF, const TargetInstrInfo &TII, MCRegister PhysReg, const TargetRegisterClass &RC, const DebugLoc &DL, LLT RegTy=LLT())
Return a virtual register corresponding to the incoming argument register PhysReg.
Definition: Utils.cpp:903
unsigned Log2_32_Ceil(uint32_t Value)
Return the ceil log base 2 of the specified value, 32 if the value is zero.
Definition: MathExtras.h:353
@ Offset
Definition: DWP.cpp:480
Type * getTypeForLLT(LLT Ty, LLVMContext &C)
Get the type back from LLT.
Definition: Utils.cpp:1974
MachineInstr * getOpcodeDef(unsigned Opcode, Register Reg, const MachineRegisterInfo &MRI)
See if Reg is defined by a single def instruction that is Opcode.
Definition: Utils.cpp:639
int popcount(T Value) noexcept
Count the number of set bits in a value.
Definition: bit.h:385
const ConstantFP * getConstantFPVRegVal(Register VReg, const MachineRegisterInfo &MRI)
Definition: Utils.cpp:452
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
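A hedged sketch of the BuildMI builder pattern named above. The helper is hypothetical and COPY is just a convenient target-independent opcode; each add* call appends one MachineOperand to the new instruction.

#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/CodeGen/TargetOpcodes.h"

using namespace llvm;

static MachineInstr *emitCopyLike(MachineBasicBlock &MBB,
                                  MachineBasicBlock::iterator InsertPt,
                                  const DebugLoc &DL,
                                  const TargetInstrInfo &TII,
                                  Register Dst, Register Src) {
  return BuildMI(MBB, InsertPt, DL, TII.get(TargetOpcode::COPY), Dst)
      .addReg(Src)
      .getInstr();
}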
detail::scope_exit< std::decay_t< Callable > > make_scope_exit(Callable &&F)
Definition: ScopeExit.h:59
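A small, self-contained example of make_scope_exit: the callable runs when the returned scope_exit object is destroyed, so it fires on every path out of the scope.

#include "llvm/ADT/ScopeExit.h"
#include <cstdio>

void scopeExitExample(bool Fail) {
  auto Cleanup = llvm::make_scope_exit([] { std::puts("cleanup ran"); });
  if (Fail)
    return;               // Cleanup still runs here...
  std::puts("work done");
}                         // ...and here.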
const llvm::fltSemantics & getFltSemanticForLLT(LLT Ty)
Get the appropriate floating point arithmetic semantic based on the bit size of the given scalar LLT.
void * PointerTy
Definition: GenericValue.h:21
int bit_width(T Value)
Returns the number of bits needed to represent Value if Value is nonzero.
Definition: bit.h:317
uint64_t PowerOf2Ceil(uint64_t A)
Returns the power of two which is greater than or equal to the given value.
Definition: MathExtras.h:394
std::optional< int64_t > getIConstantVRegSExtVal(Register VReg, const MachineRegisterInfo &MRI)
If VReg is defined by a G_CONSTANT that fits in int64_t, returns it.
Definition: Utils.cpp:307
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
Definition: MathExtras.h:291
std::function< std::pair< unsigned, LLT >(const LegalityQuery &)> LegalizeMutation
constexpr T divideCeil(U Numerator, V Denominator)
Returns the integer ceil(Numerator / Denominator).
Definition: MathExtras.h:403
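A worked example of the integer helpers above (Log2_32_Ceil, popcount, bit_width, PowerOf2Ceil, isPowerOf2_32, divideCeil), spelled out as runtime assertions so the expected values are explicit.

#include "llvm/ADT/bit.h"
#include "llvm/Support/MathExtras.h"
#include <cassert>

void mathHelperExamples() {
  assert(llvm::Log2_32_Ceil(24u) == 5);    // 2^5 = 32 is the next power of two.
  assert(llvm::PowerOf2Ceil(24) == 32);    // The same rounding, returned as a value.
  assert(llvm::isPowerOf2_32(32u));
  assert(!llvm::isPowerOf2_32(24u));
  assert(llvm::divideCeil(70u, 64u) == 2); // e.g. 70 bits need two 64-bit pieces.
  assert(llvm::bit_width(24u) == 5);       // 24 = 0b11000 needs 5 bits.
  assert(llvm::popcount(24u) == 2);        // Two bits are set in 0b11000.
}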
@ Mul
Product of integers.
@ FMul
Product of floats.
@ Add
Sum of integers.
@ TowardPositive
roundTowardPositive.
@ TowardNegative
roundTowardNegative.
void eraseInstr(MachineInstr &MI, MachineRegisterInfo &MRI, LostDebugLocObserver *LocObserver=nullptr)
Definition: Utils.cpp:1666
@ DS_Warning
std::optional< ValueAndVReg > getIConstantVRegValWithLookThrough(Register VReg, const MachineRegisterInfo &MRI, bool LookThroughInstrs=true)
If VReg is defined by a statically evaluable chain of instructions rooted on a G_CONSTANT, returns its...
Definition: Utils.cpp:426
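A hedged sketch of the constant-lookup helpers above: fold an operand down to its immediate value when it is ultimately defined by a G_CONSTANT. The helper name is hypothetical.

#include "llvm/CodeGen/GlobalISel/Utils.h"
#include <optional>

using namespace llvm;

static std::optional<int64_t> getKnownImm(Register Reg,
                                          const MachineRegisterInfo &MRI) {
  // Looks through copies and similar instructions down to a G_CONSTANT.
  if (auto ValAndVReg = getIConstantVRegValWithLookThrough(Reg, MRI))
    return ValAndVReg->Value.getSExtValue(); // Value is an APInt.
  return std::nullopt;
}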
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Definition: STLExtras.h:1879
Align commonAlignment(Align A, uint64_t Offset)
Returns the alignment known to hold at a byte offset of Offset from an address aligned to A.
Definition: Alignment.h:212
unsigned Log2(Align A)
Returns the log2 of the alignment.
Definition: Alignment.h:208
T bit_floor(T Value)
Returns the largest integral power of two no greater than Value if Value is nonzero.
Definition: bit.h:327
constexpr uint64_t NextPowerOf2(uint64_t A)
Returns the next power of two (in 64-bits) that is strictly greater than A.
Definition: MathExtras.h:382
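A worked example of the alignment and power-of-two helpers above, again as explicit assertions.

#include "llvm/ADT/bit.h"
#include "llvm/Support/Alignment.h"
#include "llvm/Support/MathExtras.h"
#include <cassert>

void alignmentExamples() {
  assert(llvm::Log2(llvm::Align(16)) == 4);
  // A 16-byte-aligned base plus a byte offset of 8 is only 8-byte aligned.
  assert(llvm::commonAlignment(llvm::Align(16), 8) == llvm::Align(8));
  assert(llvm::bit_floor(24u) == 16u);  // Largest power of two not above 24.
  assert(llvm::NextPowerOf2(16) == 32); // Strictly greater than 16.
}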
@ Enable
Enable colors.
std::function< bool(const LegalityQuery &)> LegalityPredicate
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition: BitVector.h:860
#define N
static constexpr uint64_t encode(Fields... Values)
static const fltSemantics & IEEEsingle() LLVM_READNONE
Definition: APFloat.cpp:276
static const fltSemantics & IEEEdouble() LLVM_READNONE
Definition: APFloat.cpp:277
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition: Alignment.h:39
uint64_t value() const
This is a hole in the type system and should not be abused.
Definition: Alignment.h:85
MCRegister getRegister() const
static ArgDescriptor createRegister(Register Reg, unsigned Mask=~0u)
DenormalModeKind Input
Denormal treatment kind for floating point instruction inputs in the default floating-point environme...
@ PreserveSign
The sign of a flushed-to-zero number is preserved in the sign of 0.
@ Dynamic
Denormals have unknown treatment.
static constexpr DenormalMode getPreserveSign()
static constexpr DenormalMode getIEEE()
bool isZero() const
Returns true if value is all zero.
Definition: KnownBits.h:76
The LegalityQuery object bundles together all the information that's needed to decide whether a given...
ArrayRef< MemDesc > MMODescrs
Operations which require memory can use this to place requirements on the memory type for each MMO.
ArrayRef< LLT > Types
Matching combinators.
This class contains a discriminated union of information about pointers in memory operands,...
static MachinePointerInfo getGOT(MachineFunction &MF)
Return a MachinePointerInfo record that refers to a GOT entry.
DenormalMode FP64FP16Denormals
If this is set, neither input nor output denormals are flushed for both f64 and f16/v2f16 instructions...
bool IEEE
Floating point opcodes that support exception flag gathering, and quiet and propagate signaling NaN inputs...
DenormalMode FP32Denormals
If this is set, neither input nor output denormals are flushed for most f32 instructions.