1//===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8/// \file
9/// This file implements the targeting of the MachineLegalizer class for
10/// AMDGPU.
11/// \todo This should be generated by TableGen.
12//===----------------------------------------------------------------------===//
13
14#include "AMDGPULegalizerInfo.h"
15
16#include "AMDGPU.h"
18#include "AMDGPUInstrInfo.h"
19#include "AMDGPUTargetMachine.h"
21#include "SIInstrInfo.h"
23#include "SIRegisterInfo.h"
25#include "llvm/ADT/ScopeExit.h"
34#include "llvm/IR/IntrinsicsAMDGPU.h"
35#include "llvm/IR/IntrinsicsR600.h"
36
37#define DEBUG_TYPE "amdgpu-legalinfo"
38
39using namespace llvm;
40using namespace LegalizeActions;
41using namespace LegalizeMutations;
42using namespace LegalityPredicates;
43using namespace MIPatternMatch;
44
45// Hack until load/store selection patterns support any tuple of legal types.
46static cl::opt<bool> EnableNewLegality(
47 "amdgpu-global-isel-new-legality",
48 cl::desc("Use GlobalISel desired legality, rather than try to use"
49 "rules compatible with selection patterns"),
50 cl::init(false),
51 cl::ReallyHidden);
52
53static constexpr unsigned MaxRegisterSize = 1024;
54
55// Round the number of elements to the next power of two elements
56static LLT getPow2VectorType(LLT Ty) {
57 unsigned NElts = Ty.getNumElements();
58 unsigned Pow2NElts = 1 << Log2_32_Ceil(NElts);
59 return Ty.changeElementCount(ElementCount::getFixed(Pow2NElts));
60}
61
62// Round the number of bits to the next power of two bits
63static LLT getPow2ScalarType(LLT Ty) {
64 unsigned Bits = Ty.getSizeInBits();
65 unsigned Pow2Bits = 1 << Log2_32_Ceil(Bits);
66 return LLT::scalar(Pow2Bits);
67}
68
69/// \returns true if this is an odd sized vector which should widen by adding an
70/// additional element. This is mostly to handle <3 x s16> -> <4 x s16>. This
71/// excludes s1 vectors, which should always be scalarized.
72static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
73 return [=](const LegalityQuery &Query) {
74 const LLT Ty = Query.Types[TypeIdx];
75 if (!Ty.isVector())
76 return false;
77
78 const LLT EltTy = Ty.getElementType();
79 const unsigned EltSize = EltTy.getSizeInBits();
80 return Ty.getNumElements() % 2 != 0 &&
81 EltSize > 1 && EltSize < 32 &&
82 Ty.getSizeInBits() % 32 != 0;
83 };
84}
85
86static LegalityPredicate sizeIsMultipleOf32(unsigned TypeIdx) {
87 return [=](const LegalityQuery &Query) {
88 const LLT Ty = Query.Types[TypeIdx];
89 return Ty.getSizeInBits() % 32 == 0;
90 };
91}
92
93static LegalityPredicate isWideVec16(unsigned TypeIdx) {
94 return [=](const LegalityQuery &Query) {
95 const LLT Ty = Query.Types[TypeIdx];
96 const LLT EltTy = Ty.getScalarType();
97 return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2;
98 };
99}
100
101static LegalizeMutation oneMoreElement(unsigned TypeIdx) {
102 return [=](const LegalityQuery &Query) {
103 const LLT Ty = Query.Types[TypeIdx];
104 const LLT EltTy = Ty.getElementType();
105 return std::pair(TypeIdx,
106 LLT::fixed_vector(Ty.getNumElements() + 1, EltTy));
107 };
108}
109
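// Split a wide vector into pieces of at most 64 bits each; e.g. <4 x s32>
// (128 bits) is broken into <2 x s32> halves, and <4 x s64> into s64 pieces.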
110static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) {
111 return [=](const LegalityQuery &Query) {
112 const LLT Ty = Query.Types[TypeIdx];
113 const LLT EltTy = Ty.getElementType();
114 unsigned Size = Ty.getSizeInBits();
115 unsigned Pieces = (Size + 63) / 64;
116 unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
117 return std::pair(TypeIdx, LLT::scalarOrVector(
118 ElementCount::getFixed(NewNumElts), EltTy));
119 };
120}
121
122// Increase the number of vector elements to reach the next multiple of 32-bit
123// type.
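// e.g. <3 x s8> (24 bits) is widened to <4 x s8>, and <3 x s16> (48 bits) to
// <4 x s16>; the element type itself is left unchanged.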
124static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) {
125 return [=](const LegalityQuery &Query) {
126 const LLT Ty = Query.Types[TypeIdx];
127
128 const LLT EltTy = Ty.getElementType();
129 const int Size = Ty.getSizeInBits();
130 const int EltSize = EltTy.getSizeInBits();
131 const int NextMul32 = (Size + 31) / 32;
132
133 assert(EltSize < 32);
134
135 const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize;
136 return std::pair(TypeIdx, LLT::fixed_vector(NewNumElts, EltTy));
137 };
138}
139
140// Increase the number of vector elements to reach the next legal RegClass.
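// e.g. a vector whose total size has no matching SGPR register class is padded
// with extra elements until a class exists (up to MaxRegisterSize bits).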
141static LegalizeMutation moreElementsToNextExistingRegClass(unsigned TypeIdx) {
142 return [=](const LegalityQuery &Query) {
143 const LLT Ty = Query.Types[TypeIdx];
144 const unsigned NumElts = Ty.getNumElements();
145 const unsigned EltSize = Ty.getElementType().getSizeInBits();
146 const unsigned MaxNumElts = MaxRegisterSize / EltSize;
147
148 assert(EltSize == 32 || EltSize == 64);
150
151 unsigned NewNumElts;
152 // Find the nearest legal RegClass that is larger than the current type.
153 for (NewNumElts = NumElts; NewNumElts < MaxNumElts; ++NewNumElts) {
154 if (SIRegisterInfo::getSGPRClassForBitWidth(NewNumElts * EltSize))
155 break;
156 }
157
158 return std::pair(TypeIdx, LLT::fixed_vector(NewNumElts, EltSize));
159 };
160}
161
162static LLT getBufferRsrcScalarType(const LLT Ty) {
163 if (!Ty.isVector())
164 return LLT::scalar(128);
165 const ElementCount NumElems = Ty.getElementCount();
166 return LLT::vector(NumElems, LLT::scalar(128));
167}
168
169static LLT getBufferRsrcRegisterType(const LLT Ty) {
170 if (!Ty.isVector())
171 return LLT::fixed_vector(4, LLT::scalar(32));
172 const unsigned NumElems = Ty.getElementCount().getFixedValue();
173 return LLT::fixed_vector(NumElems * 4, LLT::scalar(32));
174}
175
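// Map a type to a 32-bit-element register type of the same total size,
// e.g. <6 x s16> -> <3 x s32> and <8 x s16> -> <4 x s32>; anything of 32 bits
// or less becomes a plain scalar.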
176static LLT getBitcastRegisterType(const LLT Ty) {
177 const unsigned Size = Ty.getSizeInBits();
178
179 if (Size <= 32) {
180 // <2 x s8> -> s16
181 // <4 x s8> -> s32
182 return LLT::scalar(Size);
183 }
184
185 return LLT::scalarOrVector(ElementCount::getFixed(Size / 32), 32);
186}
187
188static LegalizeMutation bitcastToRegisterType(unsigned TypeIdx) {
189 return [=](const LegalityQuery &Query) {
190 const LLT Ty = Query.Types[TypeIdx];
191 return std::pair(TypeIdx, getBitcastRegisterType(Ty));
192 };
193}
194
195static LegalizeMutation bitcastToVectorElement32(unsigned TypeIdx) {
196 return [=](const LegalityQuery &Query) {
197 const LLT Ty = Query.Types[TypeIdx];
198 unsigned Size = Ty.getSizeInBits();
199 assert(Size % 32 == 0);
199 return std::pair(
200 TypeIdx,
201 LLT::scalarOrVector(ElementCount::getFixed(Size / 32), 32));
202 };
203}
204
205static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) {
206 return [=](const LegalityQuery &Query) {
207 const LLT QueryTy = Query.Types[TypeIdx];
208 return QueryTy.isVector() && QueryTy.getSizeInBits() < Size;
209 };
210}
211
212static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) {
213 return [=](const LegalityQuery &Query) {
214 const LLT QueryTy = Query.Types[TypeIdx];
215 return QueryTy.isVector() && QueryTy.getSizeInBits() > Size;
216 };
217}
218
219static LegalityPredicate numElementsNotEven(unsigned TypeIdx) {
220 return [=](const LegalityQuery &Query) {
221 const LLT QueryTy = Query.Types[TypeIdx];
222 return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0;
223 };
224}
225
226static bool isRegisterSize(unsigned Size) {
227 return Size % 32 == 0 && Size <= MaxRegisterSize;
228}
229
230static bool isRegisterVectorElementType(LLT EltTy) {
231 const int EltSize = EltTy.getSizeInBits();
232 return EltSize == 16 || EltSize % 32 == 0;
233}
234
235static bool isRegisterVectorType(LLT Ty) {
236 const int EltSize = Ty.getElementType().getSizeInBits();
237 return EltSize == 32 || EltSize == 64 ||
238 (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
239 EltSize == 128 || EltSize == 256;
240}
241
242// TODO: replace all uses of isRegisterType with isRegisterClassType
243static bool isRegisterType(LLT Ty) {
244 if (!isRegisterSize(Ty.getSizeInBits()))
245 return false;
246
247 if (Ty.isVector())
248 return isRegisterVectorType(Ty);
249
250 return true;
251}
252
253// Any combination of 32 or 64-bit elements up the maximum register size, and
254// multiples of v2s16.
255static LegalityPredicate isRegisterType(unsigned TypeIdx) {
256 return [=](const LegalityQuery &Query) {
257 return isRegisterType(Query.Types[TypeIdx]);
258 };
259}
260
261// RegisterType that doesn't have a corresponding RegClass.
262// TODO: Once `isRegisterType` is replaced with `isRegisterClassType` this
263// should be removed.
264static LegalityPredicate isIllegalRegisterType(unsigned TypeIdx) {
265 return [=](const LegalityQuery &Query) {
266 LLT Ty = Query.Types[TypeIdx];
267 return isRegisterType(Ty) &&
268 !SIRegisterInfo::getSGPRClassForBitWidth(Ty.getSizeInBits());
269 };
270}
271
272static LegalityPredicate elementTypeIsLegal(unsigned TypeIdx) {
273 return [=](const LegalityQuery &Query) {
274 const LLT QueryTy = Query.Types[TypeIdx];
275 if (!QueryTy.isVector())
276 return false;
277 const LLT EltTy = QueryTy.getElementType();
278 return EltTy == LLT::scalar(16) || EltTy.getSizeInBits() >= 32;
279 };
280}
281
282static const LLT S1 = LLT::scalar(1);
283static const LLT S8 = LLT::scalar(8);
284static const LLT S16 = LLT::scalar(16);
285static const LLT S32 = LLT::scalar(32);
286static const LLT F32 = LLT::float32();
287static const LLT S64 = LLT::scalar(64);
288static const LLT F64 = LLT::float64();
289static const LLT S96 = LLT::scalar(96);
290static const LLT S128 = LLT::scalar(128);
291static const LLT S160 = LLT::scalar(160);
292static const LLT S224 = LLT::scalar(224);
293static const LLT S256 = LLT::scalar(256);
294static const LLT S512 = LLT::scalar(512);
295static const LLT MaxScalar = LLT::scalar(MaxRegisterSize);
296
297static const LLT V2S8 = LLT::fixed_vector(2, 8);
298static const LLT V2S16 = LLT::fixed_vector(2, 16);
299static const LLT V4S16 = LLT::fixed_vector(4, 16);
300static const LLT V6S16 = LLT::fixed_vector(6, 16);
301static const LLT V8S16 = LLT::fixed_vector(8, 16);
302static const LLT V10S16 = LLT::fixed_vector(10, 16);
303static const LLT V12S16 = LLT::fixed_vector(12, 16);
304static const LLT V16S16 = LLT::fixed_vector(16, 16);
305
306static const LLT V2F16 = LLT::fixed_vector(2, LLT::float16());
307static const LLT V2BF16 = V2F16; // FIXME
308
309static const LLT V2S32 = LLT::fixed_vector(2, 32);
310static const LLT V3S32 = LLT::fixed_vector(3, 32);
311static const LLT V4S32 = LLT::fixed_vector(4, 32);
312static const LLT V5S32 = LLT::fixed_vector(5, 32);
313static const LLT V6S32 = LLT::fixed_vector(6, 32);
314static const LLT V7S32 = LLT::fixed_vector(7, 32);
315static const LLT V8S32 = LLT::fixed_vector(8, 32);
316static const LLT V9S32 = LLT::fixed_vector(9, 32);
317static const LLT V10S32 = LLT::fixed_vector(10, 32);
318static const LLT V11S32 = LLT::fixed_vector(11, 32);
319static const LLT V12S32 = LLT::fixed_vector(12, 32);
320static const LLT V16S32 = LLT::fixed_vector(16, 32);
321static const LLT V32S32 = LLT::fixed_vector(32, 32);
322
323static const LLT V2S64 = LLT::fixed_vector(2, 64);
324static const LLT V3S64 = LLT::fixed_vector(3, 64);
325static const LLT V4S64 = LLT::fixed_vector(4, 64);
326static const LLT V5S64 = LLT::fixed_vector(5, 64);
327static const LLT V6S64 = LLT::fixed_vector(6, 64);
328static const LLT V7S64 = LLT::fixed_vector(7, 64);
329static const LLT V8S64 = LLT::fixed_vector(8, 64);
330static const LLT V16S64 = LLT::fixed_vector(16, 64);
331
332static const LLT V2S128 = LLT::fixed_vector(2, 128);
333static const LLT V4S128 = LLT::fixed_vector(4, 128);
334
335static std::initializer_list<LLT> AllScalarTypes = {S32, S64, S96, S128,
336 S160, S224, S256, S512};
337
338static std::initializer_list<LLT> AllS16Vectors{
339 V2S16, V4S16, V6S16, V8S16, V10S16, V12S16, V16S16, V2S128, V4S128};
340
341static std::initializer_list<LLT> AllS32Vectors = {
342 V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32,
343 V9S32, V10S32, V11S32, V12S32, V16S32, V32S32};
344
345static std::initializer_list<LLT> AllS64Vectors = {V2S64, V3S64, V4S64, V5S64,
346 V6S64, V7S64, V8S64, V16S64};
347
348// Checks whether a type is in the list of legal register types.
349static bool isRegisterClassType(LLT Ty) {
350 if (Ty.isPointerOrPointerVector())
351 Ty = Ty.changeElementType(LLT::scalar(Ty.getScalarSizeInBits()));
352
353 return is_contained(AllS32Vectors, Ty) || is_contained(AllS64Vectors, Ty) ||
354 is_contained(AllScalarTypes, Ty) || is_contained(AllS16Vectors, Ty);
355}
356
357static LegalityPredicate isRegisterClassType(unsigned TypeIdx) {
358 return [TypeIdx](const LegalityQuery &Query) {
359 return isRegisterClassType(Query.Types[TypeIdx]);
360 };
361}
362
363// If we have a truncating store or an extending load with a data size larger
364// than 32-bits, we need to reduce to a 32-bit type.
365static LegalityPredicate isWideScalarExtLoadTruncStore(unsigned TypeIdx) {
366 return [=](const LegalityQuery &Query) {
367 const LLT Ty = Query.Types[TypeIdx];
368 return !Ty.isVector() && Ty.getSizeInBits() > 32 &&
369 Query.MMODescrs[0].MemoryTy.getSizeInBits() < Ty.getSizeInBits();
370 };
371}
372
373// TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we
374// handle some operations by just promoting the register during
375// selection. There are also d16 loads on GFX9+ which preserve the high bits.
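// Returns the widest access, in bits, that should be kept as a single memory
// operation for the given address space; e.g. LDS is limited to 64 bits unless
// 128-bit DS operations are available, and private is limited to 32 bits
// unless flat scratch is enabled.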
376static unsigned maxSizeForAddrSpace(const GCNSubtarget &ST, unsigned AS,
377 bool IsLoad, bool IsAtomic) {
378 switch (AS) {
379 case AMDGPUAS::PRIVATE_ADDRESS:
380 // FIXME: Private element size.
381 return ST.enableFlatScratch() ? 128 : 32;
382 case AMDGPUAS::LOCAL_ADDRESS:
383 return ST.useDS128() ? 128 : 64;
384 case AMDGPUAS::GLOBAL_ADDRESS:
385 case AMDGPUAS::CONSTANT_ADDRESS:
386 case AMDGPUAS::CONSTANT_ADDRESS_32BIT:
387 case AMDGPUAS::BUFFER_RESOURCE:
388 // Treat constant and global as identical. SMRD loads are sometimes usable for
389 // global loads (ideally constant address space should be eliminated)
390 // depending on the context. Legality cannot be context dependent, but
391 // RegBankSelect can split the load as necessary depending on the pointer
392 // register bank/uniformity and if the memory is invariant or not written in a
393 // kernel.
394 return IsLoad ? 512 : 128;
395 default:
396 // FIXME: Flat addresses may contextually need to be split to 32-bit parts
397 // if they may alias scratch depending on the subtarget. This needs to be
398 // moved to custom handling to use addressMayBeAccessedAsPrivate
399 return ST.hasMultiDwordFlatScratchAddressing() || IsAtomic ? 128 : 32;
400 }
401}
402
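// Checks whether the combination of result type, memory type, alignment and
// address space can be selected directly; e.g. an extending load from s16
// memory into an s64 register is rejected here and instead narrowed to s32
// via isWideScalarExtLoadTruncStore.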
403static bool isLoadStoreSizeLegal(const GCNSubtarget &ST,
404 const LegalityQuery &Query) {
405 const LLT Ty = Query.Types[0];
406
407 // Handle G_LOAD, G_ZEXTLOAD, G_SEXTLOAD
408 const bool IsLoad = Query.Opcode != AMDGPU::G_STORE;
409
410 unsigned RegSize = Ty.getSizeInBits();
411 uint64_t MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
412 uint64_t AlignBits = Query.MMODescrs[0].AlignInBits;
413 unsigned AS = Query.Types[1].getAddressSpace();
414
415 // All of these need to be custom lowered to cast the pointer operand.
416 if (AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT)
417 return false;
418
419 // Do not handle extending vector loads.
420 if (Ty.isVector() && MemSize != RegSize)
421 return false;
422
423 // TODO: We should be able to widen loads if the alignment is high enough, but
424 // we also need to modify the memory access size.
425#if 0
426 // Accept widening loads based on alignment.
427 if (IsLoad && MemSize < Size)
428 MemSize = std::max(MemSize, Align);
429#endif
430
431 // Only 1-byte and 2-byte to 32-bit extloads are valid.
432 if (MemSize != RegSize && RegSize != 32)
433 return false;
434
435 if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad,
436 Query.MMODescrs[0].Ordering !=
437 AtomicOrdering::NotAtomic))
438 return false;
439
440 switch (MemSize) {
441 case 8:
442 case 16:
443 case 32:
444 case 64:
445 case 128:
446 break;
447 case 96:
448 if (!ST.hasDwordx3LoadStores())
449 return false;
450 break;
451 case 256:
452 case 512:
453 // These may contextually need to be broken down.
454 break;
455 default:
456 return false;
457 }
458
459 assert(RegSize >= MemSize);
460
461 if (AlignBits < MemSize) {
462 const SITargetLowering *TLI = ST.getTargetLowering();
463 if (!TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS,
464 Align(AlignBits / 8)))
465 return false;
466 }
467
468 return true;
469}
470
471// The newer buffer intrinsic forms take their resource arguments as
472// pointers in address space 8, aka s128 values. However, in order to not break
473// SelectionDAG, the underlying operations have to continue to take v4i32
474// arguments. Therefore, we convert resource pointers - or vectors of them
475// to integer values here.
476static bool hasBufferRsrcWorkaround(const LLT Ty) {
477 if (Ty.isPointer() && Ty.getAddressSpace() == AMDGPUAS::BUFFER_RESOURCE)
478 return true;
479 if (Ty.isVector()) {
480 const LLT ElemTy = Ty.getElementType();
481 return hasBufferRsrcWorkaround(ElemTy);
482 }
483 return false;
484}
485
486// The current selector can't handle <6 x s16>, <8 x s16>, s96, s128 etc, so
487// workaround this. Eventually it should ignore the type for loads and only care
488// about the size. Return true in cases where we will workaround this for now by
489// bitcasting.
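// e.g. s96, s128 and <6 x s16> report true here, while <2 x s64> and <4 x s32>
// do not.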
490static bool loadStoreBitcastWorkaround(const LLT Ty) {
491 if (EnableNewLegality)
492 return false;
493
494 const unsigned Size = Ty.getSizeInBits();
495 if (Size <= 64)
496 return false;
497 // Address space 8 pointers get their own workaround.
498 if (hasBufferRsrcWorkaround(Ty))
499 return false;
500 if (!Ty.isVector())
501 return true;
502
503 if (Ty.isPointerVector())
504 return true;
505
506 unsigned EltSize = Ty.getScalarSizeInBits();
507 return EltSize != 32 && EltSize != 64;
508}
509
510static bool isLoadStoreLegal(const GCNSubtarget &ST, const LegalityQuery &Query) {
511 const LLT Ty = Query.Types[0];
512 return isRegisterType(Ty) && isLoadStoreSizeLegal(ST, Query) &&
513 !hasBufferRsrcWorkaround(Ty) && !loadStoreBitcastWorkaround(Ty);
514}
515
516/// Return true if a load or store of the type should be lowered with a bitcast
517/// to a different type.
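/// e.g. a <8 x s16> load is handled as a <4 x s32> load so that the selector
/// only has to deal with 32-bit element vectors.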
518static bool shouldBitcastLoadStoreType(const GCNSubtarget &ST, const LLT Ty,
519 const LLT MemTy) {
520 const unsigned MemSizeInBits = MemTy.getSizeInBits();
521 const unsigned Size = Ty.getSizeInBits();
522 if (Size != MemSizeInBits)
523 return Size <= 32 && Ty.isVector();
524
525 if (loadStoreBitcastWorkaround(Ty) && isRegisterType(Ty))
526 return true;
527
528 // Don't try to handle bitcasting vector ext loads for now.
529 return Ty.isVector() && (!MemTy.isVector() || MemTy == Ty) &&
530 (Size <= 32 || isRegisterSize(Size)) &&
531 !isRegisterVectorElementType(Ty.getElementType());
532}
533
534/// Return true if we should legalize a load by widening an odd sized memory
535/// access up to the alignment. Note this case when the memory access itself
536/// changes, not the size of the result register.
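/// e.g. a 24-bit load that is known to be 4-byte aligned may be widened to a
/// full 32-bit load.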
537static bool shouldWidenLoad(const GCNSubtarget &ST, LLT MemoryTy,
538 uint64_t AlignInBits, unsigned AddrSpace,
539 unsigned Opcode) {
540 unsigned SizeInBits = MemoryTy.getSizeInBits();
541 // We don't want to widen cases that are naturally legal.
542 if (isPowerOf2_32(SizeInBits))
543 return false;
544
545 // If we have 96-bit memory operations, we shouldn't touch them. Note we may
546 // end up widening these for a scalar load during RegBankSelect, if we don't
547 // have 96-bit scalar loads.
548 if (SizeInBits == 96 && ST.hasDwordx3LoadStores())
549 return false;
550
551 if (SizeInBits >= maxSizeForAddrSpace(ST, AddrSpace, Opcode, false))
552 return false;
553
554 // A load is known dereferenceable up to the alignment, so it's legal to widen
555 // to it.
556 //
557 // TODO: Could check dereferenceable for less aligned cases.
558 unsigned RoundedSize = NextPowerOf2(SizeInBits);
559 if (AlignInBits < RoundedSize)
560 return false;
561
562 // Do not widen if it would introduce a slow unaligned load.
563 const SITargetLowering *TLI = ST.getTargetLowering();
564 unsigned Fast = 0;
565 return TLI->allowsMisalignedMemoryAccessesImpl(
566 RoundedSize, AddrSpace, Align(AlignInBits / 8),
567 MachineMemOperand::MOLoad, &Fast) &&
568 Fast;
569}
570
571static bool shouldWidenLoad(const GCNSubtarget &ST, const LegalityQuery &Query,
572 unsigned Opcode) {
573 if (Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic)
574 return false;
575
576 return shouldWidenLoad(ST, Query.MMODescrs[0].MemoryTy,
577 Query.MMODescrs[0].AlignInBits,
578 Query.Types[1].getAddressSpace(), Opcode);
579}
580
581/// Mutates IR (typically a load instruction) to use a <4 x s32> as the initial
582/// type of the operand `idx` and then to transform it to a `p8` via bitcasts
583/// and inttoptr. In addition, handle vectors of p8. Returns the new type.
584static LLT castBufferRsrcFromV4I32(MachineInstr &MI, MachineIRBuilder &B,
585 MachineRegisterInfo &MRI, unsigned Idx) {
586 MachineOperand &MO = MI.getOperand(Idx);
587
588 const LLT PointerTy = MRI.getType(MO.getReg());
589
590 // Paranoidly prevent us from doing this multiple times.
592 return PointerTy;
593
594 const LLT ScalarTy = getBufferRsrcScalarType(PointerTy);
595 const LLT VectorTy = getBufferRsrcRegisterType(PointerTy);
596 if (!PointerTy.isVector()) {
597 // Happy path: (4 x s32) -> (s32, s32, s32, s32) -> (p8)
598 const unsigned NumParts = PointerTy.getSizeInBits() / 32;
599 const LLT S32 = LLT::scalar(32);
600
601 Register VectorReg = MRI.createGenericVirtualRegister(VectorTy);
602 std::array<Register, 4> VectorElems;
603 B.setInsertPt(B.getMBB(), ++B.getInsertPt());
604 for (unsigned I = 0; I < NumParts; ++I)
605 VectorElems[I] =
606 B.buildExtractVectorElementConstant(S32, VectorReg, I).getReg(0);
607 B.buildMergeValues(MO, VectorElems);
608 MO.setReg(VectorReg);
609 return VectorTy;
610 }
611 Register BitcastReg = MRI.createGenericVirtualRegister(VectorTy);
612 B.setInsertPt(B.getMBB(), ++B.getInsertPt());
613 auto Scalar = B.buildBitcast(ScalarTy, BitcastReg);
614 B.buildIntToPtr(MO, Scalar);
615 MO.setReg(BitcastReg);
616
617 return VectorTy;
618}
619
620/// Cast a buffer resource (an address space 8 pointer) into a 4xi32, which is
621/// the form in which the value must be in order to be passed to the low-level
622/// representations used for MUBUF/MTBUF intrinsics. This is a hack, which is
623/// needed in order to account for the fact that we can't define a register
624/// class for s128 without breaking SelectionDAG.
625static Register castBufferRsrcToV4I32(Register Pointer, MachineIRBuilder &B) {
626 MachineRegisterInfo &MRI = *B.getMRI();
627 const LLT PointerTy = MRI.getType(Pointer);
628 const LLT ScalarTy = getBufferRsrcScalarType(PointerTy);
629 const LLT VectorTy = getBufferRsrcRegisterType(PointerTy);
630
631 if (!PointerTy.isVector()) {
632 // Special case: p8 -> (s32, s32, s32, s32) -> (4xs32)
633 SmallVector<Register, 4> PointerParts;
634 const unsigned NumParts = PointerTy.getSizeInBits() / 32;
635 auto Unmerged = B.buildUnmerge(LLT::scalar(32), Pointer);
636 for (unsigned I = 0; I < NumParts; ++I)
637 PointerParts.push_back(Unmerged.getReg(I));
638 return B.buildBuildVector(VectorTy, PointerParts).getReg(0);
639 }
640 Register Scalar = B.buildPtrToInt(ScalarTy, Pointer).getReg(0);
641 return B.buildBitcast(VectorTy, Scalar).getReg(0);
642}
643
644static void castBufferRsrcArgToV4I32(MachineInstr &MI, MachineIRBuilder &B,
645 unsigned Idx) {
646 MachineOperand &MO = MI.getOperand(Idx);
647
648 const LLT PointerTy = B.getMRI()->getType(MO.getReg());
649 // Paranoidly prevent us from doing this multiple times.
650 if (!hasBufferRsrcWorkaround(PointerTy))
651 return;
652 MO.setReg(castBufferRsrcToV4I32(MO.getReg(), B));
653}
654
655AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
656 const GCNTargetMachine &TM)
657 : ST(ST_) {
658 using namespace TargetOpcode;
659
660 auto GetAddrSpacePtr = [&TM](unsigned AS) {
661 return LLT::pointer(AS, TM.getPointerSizeInBits(AS));
662 };
663
664 const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS);
665 const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS);
666 const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT);
667 const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS);
668 const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS);
669 const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS);
670 const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS);
671 const LLT BufferFatPtr = GetAddrSpacePtr(AMDGPUAS::BUFFER_FAT_POINTER);
672 const LLT RsrcPtr = GetAddrSpacePtr(AMDGPUAS::BUFFER_RESOURCE);
673 const LLT BufferStridedPtr =
674 GetAddrSpacePtr(AMDGPUAS::BUFFER_STRIDED_POINTER);
675
676 const LLT CodePtr = FlatPtr;
677
678 const std::initializer_list<LLT> AddrSpaces64 = {
679 GlobalPtr, ConstantPtr, FlatPtr
680 };
681
682 const std::initializer_list<LLT> AddrSpaces32 = {
683 LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr
684 };
685
686 const std::initializer_list<LLT> AddrSpaces128 = {RsrcPtr};
687
688 const std::initializer_list<LLT> FPTypesBase = {
689 S32, S64
690 };
691
692 const std::initializer_list<LLT> FPTypes16 = {
693 S32, S64, S16
694 };
695
696 const std::initializer_list<LLT> FPTypesPK16 = {
697 S32, S64, S16, V2S16
698 };
699
700 const LLT MinScalarFPTy = ST.has16BitInsts() ? S16 : S32;
701
702 // s1 for VCC branches, s32 for SCC branches.
703 getActionDefinitionsBuilder(G_BRCOND).legalFor({S1, S32});
704
705 // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more
706 // elements for v3s16
707 getActionDefinitionsBuilder(G_PHI)
708 .legalFor({S32, S64, V2S16, S16, V4S16, S1, S128, S256})
709 .legalFor(AllS32Vectors)
710 .legalFor(AllS64Vectors)
711 .legalFor(AddrSpaces64)
712 .legalFor(AddrSpaces32)
713 .legalFor(AddrSpaces128)
714 .legalIf(isPointer(0))
715 .clampScalar(0, S16, S256)
716 .widenScalarToNextPow2(0, 32)
717 .clampMaxNumElements(0, S32, 16)
718 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
719 .scalarize(0);
720
721 if (ST.hasVOP3PInsts() && ST.hasAddNoCarry() && ST.hasIntClamp()) {
722 // Full set of gfx9 features.
723 if (ST.hasScalarAddSub64()) {
724 getActionDefinitionsBuilder({G_ADD, G_SUB})
725 .legalFor({S64, S32, S16, V2S16})
726 .clampMaxNumElementsStrict(0, S16, 2)
727 .scalarize(0)
728 .minScalar(0, S16)
729 .widenScalarToNextMultipleOf(0, 32)
730 .maxScalar(0, S32);
731 } else {
732 getActionDefinitionsBuilder({G_ADD, G_SUB})
733 .legalFor({S32, S16, V2S16})
734 .clampMaxNumElementsStrict(0, S16, 2)
735 .scalarize(0)
736 .minScalar(0, S16)
737 .widenScalarToNextMultipleOf(0, 32)
738 .maxScalar(0, S32);
739 }
740
741 if (ST.hasScalarSMulU64()) {
742 getActionDefinitionsBuilder(G_MUL)
743 .legalFor({S64, S32, S16, V2S16})
744 .clampMaxNumElementsStrict(0, S16, 2)
745 .scalarize(0)
746 .minScalar(0, S16)
747 .widenScalarToNextMultipleOf(0, 32)
748 .custom();
749 } else {
750 getActionDefinitionsBuilder(G_MUL)
751 .legalFor({S32, S16, V2S16})
752 .clampMaxNumElementsStrict(0, S16, 2)
753 .scalarize(0)
754 .minScalar(0, S16)
755 .widenScalarToNextMultipleOf(0, 32)
756 .custom();
757 }
758 assert(ST.hasMad64_32());
759
760 getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT, G_SADDSAT, G_SSUBSAT})
761 .legalFor({S32, S16, V2S16}) // Clamp modifier
762 .minScalarOrElt(0, S16)
763 .clampMaxNumElementsStrict(0, S16, 2)
764 .scalarize(0)
765 .widenScalarToNextPow2(0, 32)
766 .lower();
767 } else if (ST.has16BitInsts()) {
768 getActionDefinitionsBuilder({G_ADD, G_SUB})
769 .legalFor({S32, S16})
770 .minScalar(0, S16)
771 .widenScalarToNextMultipleOf(0, 32)
772 .maxScalar(0, S32)
773 .scalarize(0);
774
775 getActionDefinitionsBuilder(G_MUL)
776 .legalFor({S32, S16})
777 .scalarize(0)
778 .minScalar(0, S16)
779 .widenScalarToNextMultipleOf(0, 32)
780 .custom();
781 assert(ST.hasMad64_32());
782
783 // Technically the saturating operations require clamp bit support, but this
784 // was introduced at the same time as 16-bit operations.
785 getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
786 .legalFor({S32, S16}) // Clamp modifier
787 .minScalar(0, S16)
788 .scalarize(0)
789 .widenScalarToNextPow2(0, 32)
790 .lower();
791
792 // We're just lowering this, but it helps get a better result to try to
793 // coerce to the desired type first.
794 getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
795 .minScalar(0, S16)
796 .scalarize(0)
797 .lower();
798 } else {
799 getActionDefinitionsBuilder({G_ADD, G_SUB})
800 .legalFor({S32})
801 .widenScalarToNextMultipleOf(0, 32)
802 .clampScalar(0, S32, S32)
803 .scalarize(0);
804
805 auto &Mul = getActionDefinitionsBuilder(G_MUL)
806 .legalFor({S32})
807 .scalarize(0)
808 .minScalar(0, S32)
809 .widenScalarToNextMultipleOf(0, 32);
810
811 if (ST.hasMad64_32())
812 Mul.custom();
813 else
814 Mul.maxScalar(0, S32);
815
816 if (ST.hasIntClamp()) {
817 getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
818 .legalFor({S32}) // Clamp modifier.
819 .scalarize(0)
820 .minScalarOrElt(0, S32)
821 .lower();
822 } else {
823 // Clamp bit support was added in VI, along with 16-bit operations.
824 getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
825 .minScalar(0, S32)
826 .scalarize(0)
827 .lower();
828 }
829
830 // FIXME: DAG expansion gets better results. The widening uses the smaller
831 // range values and goes for the min/max lowering directly.
832 getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
833 .minScalar(0, S32)
834 .scalarize(0)
835 .lower();
836 }
837
838 getActionDefinitionsBuilder(
839 {G_SDIV, G_UDIV, G_SREM, G_UREM, G_SDIVREM, G_UDIVREM})
840 .customFor({S32, S64})
841 .clampScalar(0, S32, S64)
842 .widenScalarToNextPow2(0, 32)
843 .scalarize(0);
844
845 auto &Mulh = getActionDefinitionsBuilder({G_UMULH, G_SMULH})
846 .legalFor({S32})
847 .maxScalar(0, S32);
848
849 if (ST.hasVOP3PInsts()) {
850 Mulh
851 .clampMaxNumElements(0, S8, 2)
852 .lowerFor({V2S8});
853 }
854
855 Mulh
856 .scalarize(0)
857 .lower();
858
859 // Report legal for any types we can handle anywhere. For the cases only legal
860 // on the SALU, RegBankSelect will be able to re-legalize.
861 getActionDefinitionsBuilder({G_AND, G_OR, G_XOR})
862 .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16})
863 .clampScalar(0, S32, S64)
864 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
865 .fewerElementsIf(vectorWiderThan(0, 64), fewerEltsToSize64Vector(0))
866 .widenScalarToNextPow2(0)
867 .scalarize(0);
868
869 getActionDefinitionsBuilder(
870 {G_UADDO, G_USUBO, G_UADDE, G_SADDE, G_USUBE, G_SSUBE})
871 .legalFor({{S32, S1}, {S32, S32}})
872 .clampScalar(0, S32, S32)
873 .scalarize(0);
874
875 getActionDefinitionsBuilder(G_BITCAST)
876 // Don't worry about the size constraint.
877 .legalIf(all(isRegisterClassType(0), isRegisterClassType(1)))
878 .lower();
879
880 getActionDefinitionsBuilder(G_CONSTANT)
881 .legalFor({S1, S32, S64, S16, GlobalPtr,
882 LocalPtr, ConstantPtr, PrivatePtr, FlatPtr })
883 .legalIf(isPointer(0))
884 .clampScalar(0, S32, S64)
885 .widenScalarToNextPow2(0);
886
887 getActionDefinitionsBuilder(G_FCONSTANT)
888 .legalFor({S32, S64, S16})
889 .clampScalar(0, S16, S64);
890
891 getActionDefinitionsBuilder({G_IMPLICIT_DEF, G_FREEZE})
892 .legalIf(isRegisterType(0))
893 // s1 and s16 are special cases because they have legal operations on
894 // them, but don't really occupy registers in the normal way.
895 .legalFor({S1, S16})
896 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
897 .clampScalarOrElt(0, S32, MaxScalar)
898 .widenScalarToNextPow2(0, 32)
899 .clampMaxNumElements(0, S32, 16);
900
901 getActionDefinitionsBuilder(G_FRAME_INDEX).legalFor({PrivatePtr});
902
903 // If the amount is divergent, we have to do a wave reduction to get the
904 // maximum value, so this is expanded during RegBankSelect.
905 getActionDefinitionsBuilder(G_DYN_STACKALLOC)
906 .legalFor({{PrivatePtr, S32}});
907
908 getActionDefinitionsBuilder(G_STACKSAVE)
909 .customFor({PrivatePtr});
910 getActionDefinitionsBuilder(G_STACKRESTORE)
911 .legalFor({PrivatePtr});
912
913 getActionDefinitionsBuilder({G_GET_FPENV, G_SET_FPENV}).customFor({S64});
914
915 getActionDefinitionsBuilder(G_GLOBAL_VALUE)
916 .customIf(typeIsNot(0, PrivatePtr));
917
918 getActionDefinitionsBuilder(G_BLOCK_ADDR).legalFor({CodePtr});
919
920 auto &FPOpActions = getActionDefinitionsBuilder(
921 { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE,
922 G_STRICT_FADD, G_STRICT_FMUL, G_STRICT_FMA})
923 .legalFor({S32, S64});
924 auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS})
925 .customFor({S32, S64});
926 auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV)
927 .customFor({S32, S64});
928
929 if (ST.has16BitInsts()) {
930 if (ST.hasVOP3PInsts())
931 FPOpActions.legalFor({S16, V2S16});
932 else
933 FPOpActions.legalFor({S16});
934
935 TrigActions.customFor({S16});
936 FDIVActions.customFor({S16});
937 }
938
939 if (ST.hasPackedFP32Ops()) {
940 FPOpActions.legalFor({V2S32});
941 FPOpActions.clampMaxNumElementsStrict(0, S32, 2);
942 }
943
944 auto &MinNumMaxNum = getActionDefinitionsBuilder({
945 G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE});
946
947 if (ST.hasVOP3PInsts()) {
948 MinNumMaxNum.customFor(FPTypesPK16)
949 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
950 .clampMaxNumElements(0, S16, 2)
951 .clampScalar(0, S16, S64)
952 .scalarize(0);
953 } else if (ST.has16BitInsts()) {
954 MinNumMaxNum.customFor(FPTypes16)
955 .clampScalar(0, S16, S64)
956 .scalarize(0);
957 } else {
958 MinNumMaxNum.customFor(FPTypesBase)
959 .clampScalar(0, S32, S64)
960 .scalarize(0);
961 }
962
963 if (ST.hasVOP3PInsts())
964 FPOpActions.clampMaxNumElementsStrict(0, S16, 2);
965
966 FPOpActions
967 .scalarize(0)
968 .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
969
970 TrigActions
971 .scalarize(0)
972 .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
973
974 FDIVActions
975 .scalarize(0)
976 .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
977
978 getActionDefinitionsBuilder({G_FNEG, G_FABS})
979 .legalFor(FPTypesPK16)
980 .clampMaxNumElementsStrict(0, S16, 2)
981 .scalarize(0)
982 .clampScalar(0, S16, S64);
983
984 if (ST.has16BitInsts()) {
985 getActionDefinitionsBuilder(G_FSQRT)
986 .legalFor({S16})
987 .customFor({S32, S64})
988 .scalarize(0)
989 .unsupported();
990 getActionDefinitionsBuilder(G_FFLOOR)
991 .legalFor({S32, S64, S16})
992 .scalarize(0)
993 .clampScalar(0, S16, S64);
994
995 getActionDefinitionsBuilder({G_FLDEXP, G_STRICT_FLDEXP})
996 .legalFor({{S32, S32}, {S64, S32}, {S16, S16}})
997 .scalarize(0)
998 .maxScalarIf(typeIs(0, S16), 1, S16)
999 .clampScalar(1, S32, S32)
1000 .lower();
1001
1002 getActionDefinitionsBuilder(G_FFREXP)
1003 .customFor({{S32, S32}, {S64, S32}, {S16, S16}, {S16, S32}})
1004 .scalarize(0)
1005 .lower();
1006 } else {
1007 getActionDefinitionsBuilder(G_FSQRT)
1008 .customFor({S32, S64, S16})
1009 .scalarize(0)
1010 .unsupported();
1011
1012
1013 if (ST.hasFractBug()) {
1014 getActionDefinitionsBuilder(G_FFLOOR)
1015 .customFor({S64})
1016 .legalFor({S32, S64})
1017 .scalarize(0)
1018 .clampScalar(0, S32, S64);
1019 } else {
1020 getActionDefinitionsBuilder(G_FFLOOR)
1021 .legalFor({S32, S64})
1022 .scalarize(0)
1023 .clampScalar(0, S32, S64);
1024 }
1025
1026 getActionDefinitionsBuilder({G_FLDEXP, G_STRICT_FLDEXP})
1027 .legalFor({{S32, S32}, {S64, S32}})
1028 .scalarize(0)
1029 .clampScalar(0, S32, S64)
1030 .clampScalar(1, S32, S32)
1031 .lower();
1032
1033 getActionDefinitionsBuilder(G_FFREXP)
1034 .customFor({{S32, S32}, {S64, S32}})
1035 .scalarize(0)
1036 .minScalar(0, S32)
1037 .clampScalar(1, S32, S32)
1038 .lower();
1039 }
1040
1041 getActionDefinitionsBuilder(G_FPTRUNC)
1042 .legalFor({{S32, S64}, {S16, S32}})
1043 .scalarize(0)
1044 .lower();
1045
1046 getActionDefinitionsBuilder(G_FPEXT)
1047 .legalFor({{S64, S32}, {S32, S16}})
1048 .narrowScalarFor({{S64, S16}}, changeTo(0, S32))
1049 .scalarize(0);
1050
1051 auto &FSubActions = getActionDefinitionsBuilder({G_FSUB, G_STRICT_FSUB});
1052 if (ST.has16BitInsts()) {
1053 FSubActions
1054 // Use actual fsub instruction
1055 .legalFor({S32, S16})
1056 // Must use fadd + fneg
1057 .lowerFor({S64, V2S16});
1058 } else {
1059 FSubActions
1060 // Use actual fsub instruction
1061 .legalFor({S32})
1062 // Must use fadd + fneg
1063 .lowerFor({S64, S16, V2S16});
1064 }
1065
1066 FSubActions
1067 .scalarize(0)
1068 .clampScalar(0, S32, S64);
1069
1070 // Whether this is legal depends on the floating point mode for the function.
1071 auto &FMad = getActionDefinitionsBuilder(G_FMAD);
1072 if (ST.hasMadF16() && ST.hasMadMacF32Insts())
1073 FMad.customFor({S32, S16});
1074 else if (ST.hasMadMacF32Insts())
1075 FMad.customFor({S32});
1076 else if (ST.hasMadF16())
1077 FMad.customFor({S16});
1078 FMad.scalarize(0)
1079 .lower();
1080
1081 auto &FRem = getActionDefinitionsBuilder(G_FREM);
1082 if (ST.has16BitInsts()) {
1083 FRem.customFor({S16, S32, S64});
1084 } else {
1085 FRem.minScalar(0, S32)
1086 .customFor({S32, S64});
1087 }
1088 FRem.scalarize(0);
1089
1090 // TODO: Do we need to clamp maximum bitwidth?
1091 getActionDefinitionsBuilder(G_TRUNC)
1092 .legalIf(isScalar(0))
1093 .legalFor({{V2S16, V2S32}})
1094 .clampMaxNumElements(0, S16, 2)
1095 // Avoid scalarizing in cases that should be truly illegal. In unresolvable
1096 // situations (like an invalid implicit use), we don't want to infinite loop
1097 // in the legalizer.
1099 .alwaysLegal();
1100
1101 getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT})
1102 .legalFor({{S64, S32}, {S32, S16}, {S64, S16},
1103 {S32, S1}, {S64, S1}, {S16, S1}})
1104 .scalarize(0)
1105 .clampScalar(0, S32, S64)
1106 .widenScalarToNextPow2(1, 32);
1107
1108 // TODO: Split s1->s64 during regbankselect for VALU.
1109 auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
1110 .legalFor({{S32, S32}, {S64, S32}, {S16, S32}})
1111 .lowerIf(typeIs(1, S1))
1112 .customFor({{S32, S64}, {S64, S64}});
1113 if (ST.has16BitInsts())
1114 IToFP.legalFor({{S16, S16}});
1115 IToFP.clampScalar(1, S32, S64)
1116 .minScalar(0, S32)
1117 .scalarize(0)
1118 .widenScalarToNextPow2(1);
1119
1120 auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
1121 .legalFor({{S32, S32}, {S32, S64}, {S32, S16}})
1122 .customFor({{S64, S32}, {S64, S64}})
1123 .narrowScalarFor({{S64, S16}}, changeTo(0, S32));
1124 if (ST.has16BitInsts())
1125 FPToI.legalFor({{S16, S16}});
1126 else
1127 FPToI.minScalar(1, S32);
1128
1129 FPToI.minScalar(0, S32)
1130 .widenScalarToNextPow2(0, 32)
1131 .scalarize(0)
1132 .lower();
1133
1134 getActionDefinitionsBuilder({G_LROUND, G_LLROUND})
1135 .clampScalar(0, S16, S64)
1136 .scalarize(0)
1137 .lower();
1138
1139 getActionDefinitionsBuilder(G_INTRINSIC_FPTRUNC_ROUND)
1140 .customFor({S16, S32})
1141 .scalarize(0)
1142 .lower();
1143
1144 // Lower G_FNEARBYINT and G_FRINT into G_INTRINSIC_ROUNDEVEN
1145 getActionDefinitionsBuilder({G_INTRINSIC_ROUND, G_FRINT, G_FNEARBYINT})
1146 .scalarize(0)
1147 .lower();
1148
1149 getActionDefinitionsBuilder({G_INTRINSIC_LRINT, G_INTRINSIC_LLRINT})
1150 .clampScalar(0, S16, S64)
1151 .scalarize(0)
1152 .lower();
1153
1154 if (ST.has16BitInsts()) {
1155 getActionDefinitionsBuilder(
1156 {G_INTRINSIC_TRUNC, G_FCEIL, G_INTRINSIC_ROUNDEVEN})
1157 .legalFor({S16, S32, S64})
1158 .clampScalar(0, S16, S64)
1159 .scalarize(0);
1160 } else if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
1161 getActionDefinitionsBuilder(
1162 {G_INTRINSIC_TRUNC, G_FCEIL, G_INTRINSIC_ROUNDEVEN})
1163 .legalFor({S32, S64})
1164 .clampScalar(0, S32, S64)
1165 .scalarize(0);
1166 } else {
1167 getActionDefinitionsBuilder(
1168 {G_INTRINSIC_TRUNC, G_FCEIL, G_INTRINSIC_ROUNDEVEN})
1169 .legalFor({S32})
1170 .customFor({S64})
1171 .clampScalar(0, S32, S64)
1172 .scalarize(0);
1173 }
1174
1175 getActionDefinitionsBuilder(G_PTR_ADD)
1176 .unsupportedFor({BufferFatPtr, BufferStridedPtr, RsrcPtr})
1177 .legalIf(all(isPointer(0), sameSize(0, 1)))
1178 .scalarize(0)
1179 .scalarSameSizeAs(1, 0);
1180
1181 getActionDefinitionsBuilder(G_PTRMASK)
1182 .legalIf(all(sameSize(0, 1), typeInSet(1, {S64, S32})))
1183 .scalarSameSizeAs(1, 0)
1184 .scalarize(0);
1185
1186 auto &CmpBuilder =
1187 getActionDefinitionsBuilder(G_ICMP)
1188 // The compare output type differs based on the register bank of the output,
1189 // so make both s1 and s32 legal.
1190 //
1191 // Scalar compares producing output in scc will be promoted to s32, as that
1192 // is the allocatable register type that will be needed for the copy from
1193 // scc. This will be promoted during RegBankSelect, and we assume something
1194 // before that won't try to use s32 result types.
1195 //
1196 // Vector compares producing an output in vcc/SGPR will use s1 in VCC reg
1197 // bank.
1198 .legalForCartesianProduct(
1199 {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
1200 .legalForCartesianProduct(
1201 {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr});
1202 if (ST.has16BitInsts()) {
1203 CmpBuilder.legalFor({{S1, S16}});
1204 }
1205
1206 CmpBuilder
1207 .widenScalarToNextPow2(1)
1208 .clampScalar(1, S32, S64)
1209 .scalarize(0)
1210 .legalIf(all(typeInSet(0, {S1, S32}), isPointer(1)));
1211
1212 auto &FCmpBuilder =
1213 getActionDefinitionsBuilder(G_FCMP).legalForCartesianProduct(
1214 {S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase);
1215
1216 if (ST.hasSALUFloatInsts())
1217 FCmpBuilder.legalForCartesianProduct({S32}, {S16, S32});
1218
1219 FCmpBuilder
1220 .widenScalarToNextPow2(1)
1221 .clampScalar(1, S32, S64)
1222 .scalarize(0);
1223
1224 // FIXME: fpow has a selection pattern that should move to custom lowering.
1225 auto &ExpOps = getActionDefinitionsBuilder(G_FPOW);
1226 if (ST.has16BitInsts())
1227 ExpOps.customFor({{S32}, {S16}});
1228 else
1229 ExpOps.customFor({S32});
1230 ExpOps.clampScalar(0, MinScalarFPTy, S32)
1231 .scalarize(0);
1232
1233 getActionDefinitionsBuilder(G_FPOWI)
1234 .clampScalar(0, MinScalarFPTy, S32)
1235 .lower();
1236
1237 auto &Log2Ops = getActionDefinitionsBuilder({G_FLOG2, G_FEXP2});
1238 Log2Ops.customFor({S32});
1239 if (ST.has16BitInsts())
1240 Log2Ops.legalFor({S16});
1241 else
1242 Log2Ops.customFor({S16});
1243 Log2Ops.scalarize(0)
1244 .lower();
1245
1246 auto &LogOps =
1247 getActionDefinitionsBuilder({G_FLOG, G_FLOG10, G_FEXP, G_FEXP10});
1248 LogOps.customFor({S32, S16});
1249 LogOps.clampScalar(0, MinScalarFPTy, S32)
1250 .scalarize(0);
1251
1252 // The 64-bit versions produce 32-bit results, but only on the SALU.
1253 getActionDefinitionsBuilder(G_CTPOP)
1254 .legalFor({{S32, S32}, {S32, S64}})
1255 .clampScalar(0, S32, S32)
1256 .widenScalarToNextPow2(1, 32)
1257 .clampScalar(1, S32, S64)
1258 .scalarize(0)
1259 .widenScalarToNextPow2(0, 32);
1260
1261 // If no 16 bit instr is available, lower into different instructions.
1262 if (ST.has16BitInsts())
1263 getActionDefinitionsBuilder(G_IS_FPCLASS)
1264 .legalForCartesianProduct({S1}, FPTypes16)
1265 .widenScalarToNextPow2(1)
1266 .scalarize(0)
1267 .lower();
1268 else
1269 getActionDefinitionsBuilder(G_IS_FPCLASS)
1270 .legalForCartesianProduct({S1}, FPTypesBase)
1271 .lowerFor({S1, S16})
1272 .widenScalarToNextPow2(1)
1273 .scalarize(0)
1274 .lower();
1275
1276 // The hardware instructions return a different result on 0 than the generic
1277 // instructions expect. The hardware produces -1, but these produce the
1278 // bitwidth.
1279 getActionDefinitionsBuilder({G_CTLZ, G_CTTZ})
1280 .scalarize(0)
1281 .clampScalar(0, S32, S32)
1282 .clampScalar(1, S32, S64)
1283 .widenScalarToNextPow2(0, 32)
1284 .widenScalarToNextPow2(1, 32)
1285 .custom();
1286
1287 // The 64-bit versions produce 32-bit results, but only on the SALU.
1288 getActionDefinitionsBuilder(G_CTLZ_ZERO_UNDEF)
1289 .legalFor({{S32, S32}, {S32, S64}})
1290 .customIf(scalarNarrowerThan(1, 32))
1291 .clampScalar(0, S32, S32)
1292 .clampScalar(1, S32, S64)
1293 .scalarize(0)
1294 .widenScalarToNextPow2(0, 32)
1295 .widenScalarToNextPow2(1, 32);
1296
1297 getActionDefinitionsBuilder(G_CTTZ_ZERO_UNDEF)
1298 .legalFor({{S32, S32}, {S32, S64}})
1299 .clampScalar(0, S32, S32)
1300 .clampScalar(1, S32, S64)
1301 .scalarize(0)
1302 .widenScalarToNextPow2(0, 32)
1303 .widenScalarToNextPow2(1, 32);
1304
1305 // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
1306 // RegBankSelect.
1307 getActionDefinitionsBuilder(G_BITREVERSE)
1308 .legalFor({S32, S64})
1309 .clampScalar(0, S32, S64)
1310 .scalarize(0)
1311 .widenScalarToNextPow2(0);
1312
1313 if (ST.has16BitInsts()) {
1314 getActionDefinitionsBuilder(G_BSWAP)
1315 .legalFor({S16, S32, V2S16})
1316 .clampMaxNumElementsStrict(0, S16, 2)
1317 // FIXME: Fixing non-power-of-2 before clamp is workaround for
1318 // narrowScalar limitation.
1319 .widenScalarToNextPow2(0)
1320 .clampScalar(0, S16, S32)
1321 .scalarize(0);
1322
1323 if (ST.hasVOP3PInsts()) {
1324 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
1325 .legalFor({S32, S16, V2S16})
1326 .clampMaxNumElements(0, S16, 2)
1327 .minScalar(0, S16)
1328 .widenScalarToNextPow2(0)
1329 .scalarize(0)
1330 .lower();
1331 } else {
1332 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
1333 .legalFor({S32, S16})
1334 .widenScalarToNextPow2(0)
1335 .minScalar(0, S16)
1336 .scalarize(0)
1337 .lower();
1338 }
1339 } else {
1340 // TODO: Should have same legality without v_perm_b32
1341 getActionDefinitionsBuilder(G_BSWAP)
1342 .legalFor({S32})
1343 .lowerIf(scalarNarrowerThan(0, 32))
1344 // FIXME: Fixing non-power-of-2 before clamp is workaround for
1345 // narrowScalar limitation.
1346 .widenScalarToNextPow2(0)
1347 .maxScalar(0, S32)
1348 .scalarize(0)
1349 .lower();
1350
1351 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
1352 .legalFor({S32})
1353 .minScalar(0, S32)
1354 .widenScalarToNextPow2(0)
1355 .scalarize(0)
1356 .lower();
1357 }
1358
1359 getActionDefinitionsBuilder(G_INTTOPTR)
1360 // List the common cases
1361 .legalForCartesianProduct(AddrSpaces64, {S64})
1362 .legalForCartesianProduct(AddrSpaces32, {S32})
1363 .scalarize(0)
1364 // Accept any address space as long as the size matches
1365 .legalIf(sameSize(0, 1))
1366 .widenScalarIf(smallerThan(1, 0),
1367 [](const LegalityQuery &Query) {
1368 return std::pair(
1369 1, LLT::scalar(Query.Types[0].getSizeInBits()));
1370 })
1371 .narrowScalarIf(largerThan(1, 0), [](const LegalityQuery &Query) {
1372 return std::pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
1373 });
1374
1375 getActionDefinitionsBuilder(G_PTRTOINT)
1376 // List the common cases
1377 .legalForCartesianProduct(AddrSpaces64, {S64})
1378 .legalForCartesianProduct(AddrSpaces32, {S32})
1379 .scalarize(0)
1380 // Accept any address space as long as the size matches
1381 .legalIf(sameSize(0, 1))
1382 .widenScalarIf(smallerThan(0, 1),
1383 [](const LegalityQuery &Query) {
1384 return std::pair(
1385 0, LLT::scalar(Query.Types[1].getSizeInBits()));
1386 })
1387 .narrowScalarIf(largerThan(0, 1), [](const LegalityQuery &Query) {
1388 return std::pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
1389 });
1390
1391 getActionDefinitionsBuilder(G_ADDRSPACE_CAST)
1392 .scalarize(0)
1393 .custom();
1394
1395 const auto needToSplitMemOp = [=](const LegalityQuery &Query,
1396 bool IsLoad) -> bool {
1397 const LLT DstTy = Query.Types[0];
1398
1399 // Split vector extloads.
1400 unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
1401
1402 if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize)
1403 return true;
1404
1405 const LLT PtrTy = Query.Types[1];
1406 unsigned AS = PtrTy.getAddressSpace();
1407 if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad,
1408 Query.MMODescrs[0].Ordering !=
1409 AtomicOrdering::NotAtomic))
1410 return true;
1411
1412 // Catch weird sized loads that don't evenly divide into the access sizes
1413 // TODO: May be able to widen depending on alignment etc.
1414 unsigned NumRegs = (MemSize + 31) / 32;
1415 if (NumRegs == 3) {
1416 if (!ST.hasDwordx3LoadStores())
1417 return true;
1418 } else {
1419 // If the alignment allows, these should have been widened.
1420 if (!isPowerOf2_32(NumRegs))
1421 return true;
1422 }
1423
1424 return false;
1425 };
1426
1427 unsigned GlobalAlign32 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 32;
1428 unsigned GlobalAlign16 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 16;
1429 unsigned GlobalAlign8 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 8;
1430
1431 // TODO: Refine based on subtargets which support unaligned access or 128-bit
1432 // LDS
1433 // TODO: Unsupported flat for SI.
1434
1435 for (unsigned Op : {G_LOAD, G_STORE}) {
1436 const bool IsStore = Op == G_STORE;
1437
1438 auto &Actions = getActionDefinitionsBuilder(Op);
1439 // Explicitly list some common cases.
1440 // TODO: Does this help compile time at all?
1441 Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, S32, GlobalAlign32},
1442 {V2S32, GlobalPtr, V2S32, GlobalAlign32},
1443 {V4S32, GlobalPtr, V4S32, GlobalAlign32},
1444 {S64, GlobalPtr, S64, GlobalAlign32},
1445 {V2S64, GlobalPtr, V2S64, GlobalAlign32},
1446 {V2S16, GlobalPtr, V2S16, GlobalAlign32},
1447 {S32, GlobalPtr, S8, GlobalAlign8},
1448 {S32, GlobalPtr, S16, GlobalAlign16},
1449
1450 {S32, LocalPtr, S32, 32},
1451 {S64, LocalPtr, S64, 32},
1452 {V2S32, LocalPtr, V2S32, 32},
1453 {S32, LocalPtr, S8, 8},
1454 {S32, LocalPtr, S16, 16},
1455 {V2S16, LocalPtr, S32, 32},
1456
1457 {S32, PrivatePtr, S32, 32},
1458 {S32, PrivatePtr, S8, 8},
1459 {S32, PrivatePtr, S16, 16},
1460 {V2S16, PrivatePtr, S32, 32},
1461
1462 {S32, ConstantPtr, S32, GlobalAlign32},
1463 {V2S32, ConstantPtr, V2S32, GlobalAlign32},
1464 {V4S32, ConstantPtr, V4S32, GlobalAlign32},
1465 {S64, ConstantPtr, S64, GlobalAlign32},
1466 {V2S32, ConstantPtr, V2S32, GlobalAlign32}});
1467 Actions.legalIf(
1468 [=](const LegalityQuery &Query) -> bool {
1469 return isLoadStoreLegal(ST, Query);
1470 });
1471
1472 // The custom pointers (fat pointers, buffer resources) don't work with load
1473 // and store at this level. Fat pointers should have been lowered to
1474 // intrinsics before the translation to MIR.
1475 Actions.unsupportedIf(
1476 typeInSet(1, {BufferFatPtr, BufferStridedPtr, RsrcPtr}));
1477
1478 // Address space 8 pointers are handled by a 4xs32 load, bitcast, and
1479 // ptrtoint. This is needed to account for the fact that we can't have i128
1480 // as a register class for SelectionDAG reasons.
1481 Actions.customIf([=](const LegalityQuery &Query) -> bool {
1482 return hasBufferRsrcWorkaround(Query.Types[0]);
1483 });
1484
1485 // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to
1486 // 64-bits.
1487 //
1488 // TODO: Should generalize bitcast action into coerce, which will also cover
1489 // inserting addrspacecasts.
1490 Actions.customIf(typeIs(1, Constant32Ptr));
1491
1492 // Turn any illegal element vectors into something easier to deal
1493 // with. These will ultimately produce 32-bit scalar shifts to extract the
1494 // parts anyway.
1495 //
1496 // For odd 16-bit element vectors, prefer to split those into pieces with
1497 // 16-bit vector parts.
1498 Actions.bitcastIf(
1499 [=](const LegalityQuery &Query) -> bool {
1500 return shouldBitcastLoadStoreType(ST, Query.Types[0],
1501 Query.MMODescrs[0].MemoryTy);
1502 }, bitcastToRegisterType(0));
1503
1504 if (!IsStore) {
1505 // Widen suitably aligned loads by loading extra bytes. The standard
1506 // legalization actions can't properly express widening memory operands.
1507 Actions.customIf([=](const LegalityQuery &Query) -> bool {
1508 return shouldWidenLoad(ST, Query, G_LOAD);
1509 });
1510 }
1511
1512 // FIXME: load/store narrowing should be moved to lower action
1513 Actions
1514 .narrowScalarIf(
1515 [=](const LegalityQuery &Query) -> bool {
1516 return !Query.Types[0].isVector() &&
1517 needToSplitMemOp(Query, Op == G_LOAD);
1518 },
1519 [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
1520 const LLT DstTy = Query.Types[0];
1521 const LLT PtrTy = Query.Types[1];
1522
1523 const unsigned DstSize = DstTy.getSizeInBits();
1524 unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
1525
1526 // Split extloads.
1527 if (DstSize > MemSize)
1528 return std::pair(0, LLT::scalar(MemSize));
1529
1530 unsigned MaxSize = maxSizeForAddrSpace(
1531 ST, PtrTy.getAddressSpace(), Op == G_LOAD,
1532 Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic);
1533 if (MemSize > MaxSize)
1534 return std::pair(0, LLT::scalar(MaxSize));
1535
1536 uint64_t Align = Query.MMODescrs[0].AlignInBits;
1537 return std::pair(0, LLT::scalar(Align));
1538 })
1539 .fewerElementsIf(
1540 [=](const LegalityQuery &Query) -> bool {
1541 return Query.Types[0].isVector() &&
1542 needToSplitMemOp(Query, Op == G_LOAD);
1543 },
1544 [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
1545 const LLT DstTy = Query.Types[0];
1546 const LLT PtrTy = Query.Types[1];
1547
1548 LLT EltTy = DstTy.getElementType();
1549 unsigned MaxSize = maxSizeForAddrSpace(
1550 ST, PtrTy.getAddressSpace(), Op == G_LOAD,
1551 Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic);
1552
1553 // FIXME: Handle widened to power of 2 results better. This ends
1554 // up scalarizing.
1555 // FIXME: 3 element stores scalarized on SI
1556
1557 // Split if it's too large for the address space.
1558 unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
1559 if (MemSize > MaxSize) {
1560 unsigned NumElts = DstTy.getNumElements();
1561 unsigned EltSize = EltTy.getSizeInBits();
1562
1563 if (MaxSize % EltSize == 0) {
1564 return std::pair(
1565 0, LLT::scalarOrVector(
1566 ElementCount::getFixed(MaxSize / EltSize), EltTy));
1567 }
1568
1569 unsigned NumPieces = MemSize / MaxSize;
1570
1571 // FIXME: Refine when odd breakdowns handled
1572 // The scalars will need to be re-legalized.
1573 if (NumPieces == 1 || NumPieces >= NumElts ||
1574 NumElts % NumPieces != 0)
1575 return std::pair(0, EltTy);
1576
1577 return std::pair(0,
1578 LLT::fixed_vector(NumElts / NumPieces, EltTy));
1579 }
1580
1581 // FIXME: We could probably handle weird extending loads better.
1582 if (DstTy.getSizeInBits() > MemSize)
1583 return std::pair(0, EltTy);
1584
1585 unsigned EltSize = EltTy.getSizeInBits();
1586 unsigned DstSize = DstTy.getSizeInBits();
1587 if (!isPowerOf2_32(DstSize)) {
1588 // We're probably decomposing an odd sized store. Try to split
1589 // to the widest type. TODO: Account for alignment. As-is it
1590 // should be OK, since the new parts will be further legalized.
1591 unsigned FloorSize = llvm::bit_floor(DstSize);
1592 return std::pair(
1593 0, LLT::scalarOrVector(
1594 ElementCount::getFixed(FloorSize / EltSize), EltTy));
1595 }
1596
1597 // May need relegalization for the scalars.
1598 return std::pair(0, EltTy);
1599 })
1600 .minScalar(0, S32)
1601 .narrowScalarIf(isWideScalarExtLoadTruncStore(0), changeTo(0, S32))
1602 .widenScalarToNextPow2(0)
1603 .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0))
1604 .lower();
1605 }
1606
1607 // FIXME: Unaligned accesses not lowered.
1608 auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD})
1609 .legalForTypesWithMemDesc({{S32, GlobalPtr, S8, 8},
1610 {S32, GlobalPtr, S16, 2 * 8},
1611 {S32, LocalPtr, S8, 8},
1612 {S32, LocalPtr, S16, 16},
1613 {S32, PrivatePtr, S8, 8},
1614 {S32, PrivatePtr, S16, 16},
1615 {S32, ConstantPtr, S8, 8},
1616 {S32, ConstantPtr, S16, 2 * 8}})
1617 .legalIf(
1618 [=](const LegalityQuery &Query) -> bool {
1619 return isLoadStoreLegal(ST, Query);
1620 });
1621
1622 if (ST.hasFlatAddressSpace()) {
1623 ExtLoads.legalForTypesWithMemDesc(
1624 {{S32, FlatPtr, S8, 8}, {S32, FlatPtr, S16, 16}});
1625 }
1626
1627 // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to
1628 // 64-bits.
1629 //
1630 // TODO: Should generalize bitcast action into coerce, which will also cover
1631 // inserting addrspacecasts.
1632 ExtLoads.customIf(typeIs(1, Constant32Ptr));
1633
1634 ExtLoads.clampScalar(0, S32, S32)
1635 .widenScalarToNextPow2(0)
1636 .lower();
1637
1638 auto &Atomics = getActionDefinitionsBuilder(
1639 {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB,
1640 G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
1641 G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
1642 G_ATOMICRMW_UMIN, G_ATOMICRMW_UINC_WRAP, G_ATOMICRMW_UDEC_WRAP})
1643 .legalFor({{S32, GlobalPtr}, {S32, LocalPtr},
1644 {S64, GlobalPtr}, {S64, LocalPtr},
1645 {S32, RegionPtr}, {S64, RegionPtr}});
1646 if (ST.hasFlatAddressSpace()) {
1647 Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}});
1648 }
1649
1650 // TODO: v2bf16 operations, and fat buffer pointer support.
1651 auto &Atomic = getActionDefinitionsBuilder(G_ATOMICRMW_FADD);
1652 if (ST.hasLDSFPAtomicAddF32()) {
1653 Atomic.legalFor({{S32, LocalPtr}, {S32, RegionPtr}});
1654 if (ST.hasLdsAtomicAddF64())
1655 Atomic.legalFor({{S64, LocalPtr}});
1656 if (ST.hasAtomicDsPkAdd16Insts())
1657 Atomic.legalFor({{V2F16, LocalPtr}, {V2BF16, LocalPtr}});
1658 }
1659 if (ST.hasAtomicFaddInsts())
1660 Atomic.legalFor({{S32, GlobalPtr}});
1661 if (ST.hasFlatAtomicFaddF32Inst())
1662 Atomic.legalFor({{S32, FlatPtr}});
1663
1664 if (ST.hasGFX90AInsts()) {
1665 // These are legal with some caveats, and should have undergone expansion in
1666 // the IR in most situations
1667 // TODO: Move atomic expansion into legalizer
1668 Atomic.legalFor({
1669 {S32, GlobalPtr},
1670 {S64, GlobalPtr},
1671 {S64, FlatPtr}
1672 });
1673 }
1674
1675 if (ST.hasAtomicBufferGlobalPkAddF16NoRtnInsts() ||
1676 ST.hasAtomicBufferGlobalPkAddF16Insts())
1677 Atomic.legalFor({{V2F16, GlobalPtr}, {V2F16, BufferFatPtr}});
1678 if (ST.hasAtomicGlobalPkAddBF16Inst())
1679 Atomic.legalFor({{V2BF16, GlobalPtr}});
1680 if (ST.hasAtomicFlatPkAdd16Insts())
1681 Atomic.legalFor({{V2F16, FlatPtr}, {V2BF16, FlatPtr}});
1682
1683
1684 // Most of the legalization work here is done by AtomicExpand. We could
1685 // probably use a simpler legality rule that just assumes anything is OK.
1686 auto &AtomicFMinFMax =
1687 getActionDefinitionsBuilder({G_ATOMICRMW_FMIN, G_ATOMICRMW_FMAX})
1688 .legalFor({{F32, LocalPtr}, {F64, LocalPtr}});
1689
1690 if (ST.hasAtomicFMinFMaxF32GlobalInsts())
1691 AtomicFMinFMax.legalFor({{F32, GlobalPtr},{F32, BufferFatPtr}});
1692 if (ST.hasAtomicFMinFMaxF64GlobalInsts())
1693 AtomicFMinFMax.legalFor({{F64, GlobalPtr}, {F64, BufferFatPtr}});
1694 if (ST.hasAtomicFMinFMaxF32FlatInsts())
1695 AtomicFMinFMax.legalFor({F32, FlatPtr});
1696 if (ST.hasAtomicFMinFMaxF64FlatInsts())
1697 AtomicFMinFMax.legalFor({F64, FlatPtr});
1698
1699 // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling, and output
1700 // demarshalling
1701 getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG)
1702 .customFor({{S32, GlobalPtr}, {S64, GlobalPtr},
1703 {S32, FlatPtr}, {S64, FlatPtr}})
1704 .legalFor({{S32, LocalPtr}, {S64, LocalPtr},
1705 {S32, RegionPtr}, {S64, RegionPtr}});
1706 // TODO: Pointer types, any 32-bit or 64-bit vector
1707
1708 // Condition should be s32 for scalar, s1 for vector.
1709 getActionDefinitionsBuilder(G_SELECT)
1710 .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16, GlobalPtr,
1711 LocalPtr, FlatPtr, PrivatePtr,
1712 LLT::fixed_vector(2, LocalPtr),
1713 LLT::fixed_vector(2, PrivatePtr)},
1714 {S1, S32})
1715 .clampScalar(0, S16, S64)
1716 .scalarize(1)
1717 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
1718 .fewerElementsIf(numElementsNotEven(0), scalarize(0))
1719 .clampMaxNumElements(0, S32, 2)
1720 .clampMaxNumElements(0, LocalPtr, 2)
1721 .clampMaxNumElements(0, PrivatePtr, 2)
1722 .scalarize(0)
1723 .widenScalarToNextPow2(0)
1724 .legalIf(all(isPointer(0), typeInSet(1, {S1, S32})));
1725
1726 // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can
1727 // be more flexible with the shift amount type.
1728 auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR})
1729 .legalFor({{S32, S32}, {S64, S32}});
1730 if (ST.has16BitInsts()) {
1731 if (ST.hasVOP3PInsts()) {
1732 Shifts.legalFor({{S16, S16}, {V2S16, V2S16}})
1733 .clampMaxNumElements(0, S16, 2);
1734 } else
1735 Shifts.legalFor({{S16, S16}});
1736
1737 // TODO: Support 16-bit shift amounts for all types
1738 Shifts.widenScalarIf(
1739 [=](const LegalityQuery &Query) {
1740 // Use 16-bit shift amounts for any 16-bit shift. Otherwise we want a
1741 // 32-bit amount.
1742 const LLT ValTy = Query.Types[0];
1743 const LLT AmountTy = Query.Types[1];
1744 return ValTy.getSizeInBits() <= 16 &&
1745 AmountTy.getSizeInBits() < 16;
1746 }, changeTo(1, S16));
1747 Shifts.maxScalarIf(typeIs(0, S16), 1, S16);
1748 Shifts.clampScalar(1, S32, S32);
1749 Shifts.widenScalarToNextPow2(0, 16);
1750 Shifts.clampScalar(0, S16, S64);
1751
1752 getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
1753 .minScalar(0, S16)
1754 .scalarize(0)
1755 .lower();
1756 } else {
1757 // Make sure we legalize the shift amount type first, as the general
1758 // expansion for the shifted type will produce much worse code if it hasn't
1759 // been truncated already.
1760 Shifts.clampScalar(1, S32, S32);
1761 Shifts.widenScalarToNextPow2(0, 32);
1762 Shifts.clampScalar(0, S32, S64);
1763
1764 getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
1765 .minScalar(0, S32)
1766 .scalarize(0)
1767 .lower();
1768 }
1769 Shifts.scalarize(0);
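The shift-amount TODO above stems from the hardware only observing the low bits of the amount operand (4 bits for 16-bit shifts, 5 for 32-bit, 6 for 64-bit). A minimal standalone sketch of that masking, with illustrative names that are not from this file:

  #include <cstdint>

  // Models a 32-bit hardware shift: only the low 5 bits of Amt are observed.
  static uint32_t hwShl32(uint32_t Val, uint32_t Amt) {
    return Val << (Amt & 31u);
  }
  // hwShl32(1, 37) == hwShl32(1, 5) == 32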
1770
1771 for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) {
1772 unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0;
1773 unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1;
1774 unsigned IdxTypeIdx = 2;
1775
1776 getActionDefinitionsBuilder(Op)
1777 .customIf([=](const LegalityQuery &Query) {
1778 const LLT EltTy = Query.Types[EltTypeIdx];
1779 const LLT VecTy = Query.Types[VecTypeIdx];
1780 const LLT IdxTy = Query.Types[IdxTypeIdx];
1781 const unsigned EltSize = EltTy.getSizeInBits();
1782 const bool isLegalVecType =
1784 // Address space 8 pointers are 128-bit wide values, but the logic
1785 // below will try to bitcast them to 2N x s64, which will fail.
1786 // Therefore, as an intermediate step, handle extracts/insertions by
1787 // ptrtoint-ing the vector and scalar arguments (or inttoptr-ing the
1788 // extraction result) in order to produce a vector operation that can
1789 // be handled by the logic below.
1790 if (EltTy.isPointer() && EltSize > 64)
1791 return true;
1792 return (EltSize == 32 || EltSize == 64) &&
1793 VecTy.getSizeInBits() % 32 == 0 &&
1794 VecTy.getSizeInBits() <= MaxRegisterSize &&
1795 IdxTy.getSizeInBits() == 32 &&
1796 isLegalVecType;
1797 })
1798 .bitcastIf(all(sizeIsMultipleOf32(VecTypeIdx), scalarOrEltNarrowerThan(VecTypeIdx, 32)),
1799 bitcastToVectorElement32(VecTypeIdx))
1800 //.bitcastIf(vectorSmallerThan(1, 32), bitcastToScalar(1))
1801 .bitcastIf(
1802 all(sizeIsMultipleOf32(VecTypeIdx), scalarOrEltWiderThan(VecTypeIdx, 64)),
1803 [=](const LegalityQuery &Query) {
1804 // For > 64-bit element types, try to turn this into a 64-bit
1805 // element vector since we may be able to do better indexing
1806 // if this is scalar. If not, fall back to 32.
1807 const LLT EltTy = Query.Types[EltTypeIdx];
1808 const LLT VecTy = Query.Types[VecTypeIdx];
1809 const unsigned DstEltSize = EltTy.getSizeInBits();
1810 const unsigned VecSize = VecTy.getSizeInBits();
1811
1812 const unsigned TargetEltSize = DstEltSize % 64 == 0 ? 64 : 32;
1813 return std::pair(
1814 VecTypeIdx,
1815 LLT::fixed_vector(VecSize / TargetEltSize, TargetEltSize));
1816 })
1817 .clampScalar(EltTypeIdx, S32, S64)
1818 .clampScalar(VecTypeIdx, S32, S64)
1819 .clampScalar(IdxTypeIdx, S32, S32)
1820 .clampMaxNumElements(VecTypeIdx, S32, 32)
1821 // TODO: Clamp elements for 64-bit vectors?
1822 .moreElementsIf(
1823 isIllegalRegisterType(VecTypeIdx),
1825 // It should only be necessary with variable indexes.
1826 // As a last resort, lower to the stack
1827 .lower();
1828 }
1829
1830 getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
1831 .unsupportedIf([=](const LegalityQuery &Query) {
1832 const LLT &EltTy = Query.Types[1].getElementType();
1833 return Query.Types[0] != EltTy;
1834 });
1835
1836 for (unsigned Op : {G_EXTRACT, G_INSERT}) {
1837 unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0;
1838 unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1;
1839
1840 // FIXME: Doesn't handle extract of illegal sizes.
1841 getActionDefinitionsBuilder(Op)
1842 .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32)))
1843 .lowerIf([=](const LegalityQuery &Query) {
1844 // Sub-vector (or single element) insert and extract.
1845 // TODO: verify immediate offset here since lower only works with
1846 // whole elements.
1847 const LLT BigTy = Query.Types[BigTyIdx];
1848 return BigTy.isVector();
1849 })
1850 // FIXME: Multiples of 16 should not be legal.
1851 .legalIf([=](const LegalityQuery &Query) {
1852 const LLT BigTy = Query.Types[BigTyIdx];
1853 const LLT LitTy = Query.Types[LitTyIdx];
1854 return (BigTy.getSizeInBits() % 32 == 0) &&
1855 (LitTy.getSizeInBits() % 16 == 0);
1856 })
1857 .widenScalarIf(
1858 [=](const LegalityQuery &Query) {
1859 const LLT BigTy = Query.Types[BigTyIdx];
1860 return (BigTy.getScalarSizeInBits() < 16);
1861 },
1863 .widenScalarIf(
1864 [=](const LegalityQuery &Query) {
1865 const LLT LitTy = Query.Types[LitTyIdx];
1866 return (LitTy.getScalarSizeInBits() < 16);
1867 },
1869 .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
1870 .widenScalarToNextPow2(BigTyIdx, 32);
1871
1872 }
1873
1874 auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR)
1875 .legalForCartesianProduct(AllS32Vectors, {S32})
1876 .legalForCartesianProduct(AllS64Vectors, {S64})
1877 .clampNumElements(0, V16S32, V32S32)
1878 .clampNumElements(0, V2S64, V16S64)
1879 .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16))
1880 .moreElementsIf(
1883
1884 if (ST.hasScalarPackInsts()) {
1885 BuildVector
1886 // FIXME: Should probably widen s1 vectors straight to s32
1887 .minScalarOrElt(0, S16)
1888 .minScalar(1, S16);
1889
1890 getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1891 .legalFor({V2S16, S32})
1892 .lower();
1893 } else {
1894 BuildVector.customFor({V2S16, S16});
1895 BuildVector.minScalarOrElt(0, S32);
1896
1897 getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1898 .customFor({V2S16, S32})
1899 .lower();
1900 }
1901
1902 BuildVector.legalIf(isRegisterType(0));
1903
1904 // FIXME: Clamp maximum size
1905 getActionDefinitionsBuilder(G_CONCAT_VECTORS)
1906 .legalIf(all(isRegisterType(0), isRegisterType(1)))
1907 .clampMaxNumElements(0, S32, 32)
1908 .clampMaxNumElements(1, S16, 2) // TODO: Make 4?
1909 .clampMaxNumElements(0, S16, 64);
1910
1911 getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower();
1912
1913 // Merge/Unmerge
1914 for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
1915 unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
1916 unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;
1917
1918 auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
1919 const LLT Ty = Query.Types[TypeIdx];
1920 if (Ty.isVector()) {
1921 const LLT &EltTy = Ty.getElementType();
1922 if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 512)
1923 return true;
1924 if (!llvm::has_single_bit<uint32_t>(EltTy.getSizeInBits()))
1925 return true;
1926 }
1927 return false;
1928 };
1929
1930 auto &Builder = getActionDefinitionsBuilder(Op)
1931 .legalIf(all(isRegisterType(0), isRegisterType(1)))
1932 .lowerFor({{S16, V2S16}})
1933 .lowerIf([=](const LegalityQuery &Query) {
1934 const LLT BigTy = Query.Types[BigTyIdx];
1935 return BigTy.getSizeInBits() == 32;
1936 })
1937 // Try to widen to s16 first for small types.
1938 // TODO: Only do this on targets with legal s16 shifts
1939 .minScalarOrEltIf(scalarNarrowerThan(LitTyIdx, 16), LitTyIdx, S16)
1940 .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16)
1941 .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
1942 .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32),
1943 elementTypeIs(1, S16)),
1944 changeTo(1, V2S16))
1945 // Clamp the little scalar to s8-s256 and make it a power of 2. It's not
1946 // worth considering the multiples of 64 since 2*192 and 2*384 are not
1947 // valid.
1948 .clampScalar(LitTyIdx, S32, S512)
1949 .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32)
1950 // Break up vectors with weird elements into scalars
1951 .fewerElementsIf(
1952 [=](const LegalityQuery &Query) { return notValidElt(Query, LitTyIdx); },
1953 scalarize(0))
1954 .fewerElementsIf(
1955 [=](const LegalityQuery &Query) { return notValidElt(Query, BigTyIdx); },
1956 scalarize(1))
1957 .clampScalar(BigTyIdx, S32, MaxScalar);
1958
1959 if (Op == G_MERGE_VALUES) {
1960 Builder.widenScalarIf(
1961 // TODO: Use 16-bit shifts if legal for 8-bit values?
1962 [=](const LegalityQuery &Query) {
1963 const LLT Ty = Query.Types[LitTyIdx];
1964 return Ty.getSizeInBits() < 32;
1965 },
1966 changeTo(LitTyIdx, S32));
1967 }
1968
1969 Builder.widenScalarIf(
1970 [=](const LegalityQuery &Query) {
1971 const LLT Ty = Query.Types[BigTyIdx];
1972 return Ty.getSizeInBits() % 16 != 0;
1973 },
1974 [=](const LegalityQuery &Query) {
1975 // Pick the next power of 2, or a multiple of 64 over 128,
1976 // whichever is smaller.
1977 const LLT &Ty = Query.Types[BigTyIdx];
1978 unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1);
1979 if (NewSizeInBits >= 256) {
1980 unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1);
1981 if (RoundedTo < NewSizeInBits)
1982 NewSizeInBits = RoundedTo;
1983 }
1984 return std::pair(BigTyIdx, LLT::scalar(NewSizeInBits));
1985 })
1986 // Any vectors left are the wrong size. Scalarize them.
1987 .scalarize(0)
1988 .scalarize(1);
1989 }
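The final widenScalarIf mutation above rounds an odd merge/unmerge width up to the next power of two, or to the next multiple of 64 once that power of two would reach 256 bits, whichever is smaller. A sketch of just that rounding rule, assuming the usual LLVM MathExtras helpers:

  #include "llvm/Support/MathExtras.h"

  // e.g. 65  -> 128 (next power of two, still below 256)
  //      161 -> 192 (alignTo<64>(162) beats the next power of two, 256)
  static unsigned roundMergeSize(unsigned SizeInBits) {
    unsigned NewSize = 1u << llvm::Log2_32_Ceil(SizeInBits + 1);
    if (NewSize >= 256) {
      unsigned RoundedTo = llvm::alignTo<64>(SizeInBits + 1);
      if (RoundedTo < NewSize)
        NewSize = RoundedTo;
    }
    return NewSize;
  }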
1990
1991 // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
1992 // RegBankSelect.
1993 auto &SextInReg = getActionDefinitionsBuilder(G_SEXT_INREG)
1994 .legalFor({{S32}, {S64}});
1995
1996 if (ST.hasVOP3PInsts()) {
1997 SextInReg.lowerFor({{V2S16}})
1998 // Prefer to reduce vector widths for 16-bit vectors before lowering, to
1999 // get more vector shift opportunities, since we'll get those when
2000 // expanded.
2001 .clampMaxNumElementsStrict(0, S16, 2);
2002 } else if (ST.has16BitInsts()) {
2003 SextInReg.lowerFor({{S32}, {S64}, {S16}});
2004 } else {
2005 // Prefer to promote to s32 before lowering if we don't have 16-bit
2006 // shifts. This avoids a lot of intermediate truncate and extend operations.
2007 SextInReg.lowerFor({{S32}, {S64}});
2008 }
2009
2010 SextInReg
2011 .scalarize(0)
2012 .clampScalar(0, S32, S64)
2013 .lower();
2014
2015 getActionDefinitionsBuilder({G_ROTR, G_ROTL})
2016 .scalarize(0)
2017 .lower();
2018
2019 // TODO: Only try to form v2s16 with legal packed instructions.
2020 getActionDefinitionsBuilder(G_FSHR)
2021 .legalFor({{S32, S32}})
2022 .lowerFor({{V2S16, V2S16}})
2023 .clampMaxNumElementsStrict(0, S16, 2)
2024 .scalarize(0)
2025 .lower();
2026
2027 if (ST.hasVOP3PInsts()) {
2028 getActionDefinitionsBuilder(G_FSHL)
2029 .lowerFor({{V2S16, V2S16}})
2030 .clampMaxNumElementsStrict(0, S16, 2)
2031 .scalarize(0)
2032 .lower();
2033 } else {
2034 getActionDefinitionsBuilder(G_FSHL)
2035 .scalarize(0)
2036 .lower();
2037 }
2038
2039 getActionDefinitionsBuilder(G_READCYCLECOUNTER)
2040 .legalFor({S64});
2041
2042 getActionDefinitionsBuilder(G_READSTEADYCOUNTER).legalFor({S64});
2043
2044 getActionDefinitionsBuilder(G_FENCE)
2045 .alwaysLegal();
2046
2047 getActionDefinitionsBuilder({G_SMULO, G_UMULO})
2048 .scalarize(0)
2049 .minScalar(0, S32)
2050 .lower();
2051
2052 getActionDefinitionsBuilder({G_SBFX, G_UBFX})
2053 .legalFor({{S32, S32}, {S64, S32}})
2054 .clampScalar(1, S32, S32)
2055 .clampScalar(0, S32, S64)
2056 .widenScalarToNextPow2(0)
2057 .scalarize(0);
2058
2059 getActionDefinitionsBuilder(
2060 {// TODO: Verify V_BFI_B32 is generated from expanded bit ops
2061 G_FCOPYSIGN,
2062
2063 G_ATOMIC_CMPXCHG_WITH_SUCCESS, G_ATOMICRMW_NAND, G_ATOMICRMW_FSUB,
2064 G_READ_REGISTER, G_WRITE_REGISTER,
2065
2066 G_SADDO, G_SSUBO})
2067 .lower();
2068
2069 if (ST.hasIEEEMinMax()) {
2070 getActionDefinitionsBuilder({G_FMINIMUM, G_FMAXIMUM})
2071 .legalFor(FPTypesPK16)
2072 .clampMaxNumElements(0, S16, 2)
2073 .scalarize(0);
2074 } else {
2075 // TODO: Implement
2076 getActionDefinitionsBuilder({G_FMINIMUM, G_FMAXIMUM}).lower();
2077 }
2078
2079 getActionDefinitionsBuilder({G_MEMCPY, G_MEMCPY_INLINE, G_MEMMOVE, G_MEMSET})
2080 .lower();
2081
2082 getActionDefinitionsBuilder({G_TRAP, G_DEBUGTRAP}).custom();
2083
2084 getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE,
2085 G_INDEXED_LOAD, G_INDEXED_SEXTLOAD,
2086 G_INDEXED_ZEXTLOAD, G_INDEXED_STORE})
2087 .unsupported();
2088
2089 getActionDefinitionsBuilder(G_PREFETCH).alwaysLegal();
2090
2091 getLegacyLegalizerInfo().computeTables();
2092 verify(*ST.getInstrInfo());
2093}
2094
2097 LostDebugLocObserver &LocObserver) const {
2098 MachineIRBuilder &B = Helper.MIRBuilder;
2099 MachineRegisterInfo &MRI = *B.getMRI();
2100
2101 switch (MI.getOpcode()) {
2102 case TargetOpcode::G_ADDRSPACE_CAST:
2103 return legalizeAddrSpaceCast(MI, MRI, B);
2104 case TargetOpcode::G_INTRINSIC_ROUNDEVEN:
2105 return legalizeFroundeven(MI, MRI, B);
2106 case TargetOpcode::G_FCEIL:
2107 return legalizeFceil(MI, MRI, B);
2108 case TargetOpcode::G_FREM:
2109 return legalizeFrem(MI, MRI, B);
2110 case TargetOpcode::G_INTRINSIC_TRUNC:
2111 return legalizeIntrinsicTrunc(MI, MRI, B);
2112 case TargetOpcode::G_SITOFP:
2113 return legalizeITOFP(MI, MRI, B, true);
2114 case TargetOpcode::G_UITOFP:
2115 return legalizeITOFP(MI, MRI, B, false);
2116 case TargetOpcode::G_FPTOSI:
2117 return legalizeFPTOI(MI, MRI, B, true);
2118 case TargetOpcode::G_FPTOUI:
2119 return legalizeFPTOI(MI, MRI, B, false);
2120 case TargetOpcode::G_FMINNUM:
2121 case TargetOpcode::G_FMAXNUM:
2122 case TargetOpcode::G_FMINNUM_IEEE:
2123 case TargetOpcode::G_FMAXNUM_IEEE:
2124 return legalizeMinNumMaxNum(Helper, MI);
2125 case TargetOpcode::G_EXTRACT_VECTOR_ELT:
2126 return legalizeExtractVectorElt(MI, MRI, B);
2127 case TargetOpcode::G_INSERT_VECTOR_ELT:
2128 return legalizeInsertVectorElt(MI, MRI, B);
2129 case TargetOpcode::G_FSIN:
2130 case TargetOpcode::G_FCOS:
2131 return legalizeSinCos(MI, MRI, B);
2132 case TargetOpcode::G_GLOBAL_VALUE:
2133 return legalizeGlobalValue(MI, MRI, B);
2134 case TargetOpcode::G_LOAD:
2135 case TargetOpcode::G_SEXTLOAD:
2136 case TargetOpcode::G_ZEXTLOAD:
2137 return legalizeLoad(Helper, MI);
2138 case TargetOpcode::G_STORE:
2139 return legalizeStore(Helper, MI);
2140 case TargetOpcode::G_FMAD:
2141 return legalizeFMad(MI, MRI, B);
2142 case TargetOpcode::G_FDIV:
2143 return legalizeFDIV(MI, MRI, B);
2144 case TargetOpcode::G_FFREXP:
2145 return legalizeFFREXP(MI, MRI, B);
2146 case TargetOpcode::G_FSQRT:
2147 return legalizeFSQRT(MI, MRI, B);
2148 case TargetOpcode::G_UDIV:
2149 case TargetOpcode::G_UREM:
2150 case TargetOpcode::G_UDIVREM:
2151 return legalizeUnsignedDIV_REM(MI, MRI, B);
2152 case TargetOpcode::G_SDIV:
2153 case TargetOpcode::G_SREM:
2154 case TargetOpcode::G_SDIVREM:
2155 return legalizeSignedDIV_REM(MI, MRI, B);
2156 case TargetOpcode::G_ATOMIC_CMPXCHG:
2157 return legalizeAtomicCmpXChg(MI, MRI, B);
2158 case TargetOpcode::G_FLOG2:
2159 return legalizeFlog2(MI, B);
2160 case TargetOpcode::G_FLOG:
2161 case TargetOpcode::G_FLOG10:
2162 return legalizeFlogCommon(MI, B);
2163 case TargetOpcode::G_FEXP2:
2164 return legalizeFExp2(MI, B);
2165 case TargetOpcode::G_FEXP:
2166 case TargetOpcode::G_FEXP10:
2167 return legalizeFExp(MI, B);
2168 case TargetOpcode::G_FPOW:
2169 return legalizeFPow(MI, B);
2170 case TargetOpcode::G_FFLOOR:
2171 return legalizeFFloor(MI, MRI, B);
2172 case TargetOpcode::G_BUILD_VECTOR:
2173 case TargetOpcode::G_BUILD_VECTOR_TRUNC:
2174 return legalizeBuildVector(MI, MRI, B);
2175 case TargetOpcode::G_MUL:
2176 return legalizeMul(Helper, MI);
2177 case TargetOpcode::G_CTLZ:
2178 case TargetOpcode::G_CTTZ:
2179 return legalizeCTLZ_CTTZ(MI, MRI, B);
2180 case TargetOpcode::G_CTLZ_ZERO_UNDEF:
2181 return legalizeCTLZ_ZERO_UNDEF(MI, MRI, B);
2182 case TargetOpcode::G_INTRINSIC_FPTRUNC_ROUND:
2183 return legalizeFPTruncRound(MI, B);
2184 case TargetOpcode::G_STACKSAVE:
2185 return legalizeStackSave(MI, B);
2186 case TargetOpcode::G_GET_FPENV:
2187 return legalizeGetFPEnv(MI, MRI, B);
2188 case TargetOpcode::G_SET_FPENV:
2189 return legalizeSetFPEnv(MI, MRI, B);
2190 case TargetOpcode::G_TRAP:
2191 return legalizeTrap(MI, MRI, B);
2192 case TargetOpcode::G_DEBUGTRAP:
2193 return legalizeDebugTrap(MI, MRI, B);
2194 default:
2195 return false;
2196 }
2197
2198 llvm_unreachable("expected switch to return");
2199}
2200
2202 unsigned AS,
2204 MachineIRBuilder &B) const {
2205 MachineFunction &MF = B.getMF();
2206 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
2207 const LLT S32 = LLT::scalar(32);
2208 const LLT S64 = LLT::scalar(64);
2209
2211
2212 if (ST.hasApertureRegs()) {
2213 // Note: this register is somewhat broken. When used as a 32-bit operand,
2214 // it only returns zeroes. The real value is in the upper 32 bits.
2215 // Thus, we must emit an extract of the high 32 bits.
2216 const unsigned ApertureRegNo = (AS == AMDGPUAS::LOCAL_ADDRESS)
2217 ? AMDGPU::SRC_SHARED_BASE
2218 : AMDGPU::SRC_PRIVATE_BASE;
2219 // FIXME: It would be more natural to emit a COPY here, but then copy
2220 // coalescing would kick in and it would think it's okay to use the "HI"
2221 // subregister (instead of extracting the HI 32 bits) which is an artificial
2222 // (unusable) register.
2223 // Register TableGen definitions would need an overhaul to get rid of the
2224 // artificial "HI" aperture registers and prevent this kind of issue from
2225 // happening.
2226 Register Dst = MRI.createGenericVirtualRegister(S64);
2227 MRI.setRegClass(Dst, &AMDGPU::SReg_64RegClass);
2228 B.buildInstr(AMDGPU::S_MOV_B64, {Dst}, {Register(ApertureRegNo)});
2229 return B.buildUnmerge(S32, Dst).getReg(1);
2230 }
2231
2232 // TODO: can we be smarter about machine pointer info?
2234 Register LoadAddr = MRI.createGenericVirtualRegister(
2236 // For code object version 5, private_base and shared_base are passed through
2237 // implicit kernargs.
2244 ST.getTargetLowering()->getImplicitParameterOffset(B.getMF(), Param);
2245
2246 Register KernargPtrReg = MRI.createGenericVirtualRegister(
2248
2249 if (!loadInputValue(KernargPtrReg, B,
2251 return Register();
2252
2254 PtrInfo,
2258
2259 // Pointer address
2260 B.buildPtrAdd(LoadAddr, KernargPtrReg,
2261 B.buildConstant(LLT::scalar(64), Offset).getReg(0));
2262 // Load address
2263 return B.buildLoad(S32, LoadAddr, *MMO).getReg(0);
2264 }
2265
2266 Register QueuePtr = MRI.createGenericVirtualRegister(
2268
2270 return Register();
2271
2272 // Offset into amd_queue_t for group_segment_aperture_base_hi /
2273 // private_segment_aperture_base_hi.
2274 uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;
2275
2277 PtrInfo,
2280 LLT::scalar(32), commonAlignment(Align(64), StructOffset));
2281
2282 B.buildPtrAdd(LoadAddr, QueuePtr,
2283 B.buildConstant(LLT::scalar(64), StructOffset).getReg(0));
2284 return B.buildLoad(S32, LoadAddr, *MMO).getReg(0);
2285}
2286
2287/// Return true if the value is a known valid address, such that a null check is
2288/// not necessary.
2290 const AMDGPUTargetMachine &TM, unsigned AddrSpace) {
2291 MachineInstr *Def = MRI.getVRegDef(Val);
2292 switch (Def->getOpcode()) {
2293 case AMDGPU::G_FRAME_INDEX:
2294 case AMDGPU::G_GLOBAL_VALUE:
2295 case AMDGPU::G_BLOCK_ADDR:
2296 return true;
2297 case AMDGPU::G_CONSTANT: {
2298 const ConstantInt *CI = Def->getOperand(1).getCImm();
2299 return CI->getSExtValue() != TM.getNullPointerValue(AddrSpace);
2300 }
2301 default:
2302 return false;
2303 }
2304
2305 return false;
2306}
2307
2310 MachineIRBuilder &B) const {
2311 MachineFunction &MF = B.getMF();
2312
2313 // MI can either be a G_ADDRSPACE_CAST or a
2314 // G_INTRINSIC @llvm.amdgcn.addrspacecast.nonnull
2315 assert(MI.getOpcode() == TargetOpcode::G_ADDRSPACE_CAST ||
2316 (isa<GIntrinsic>(MI) && cast<GIntrinsic>(MI).getIntrinsicID() ==
2317 Intrinsic::amdgcn_addrspacecast_nonnull));
2318
2319 const LLT S32 = LLT::scalar(32);
2320 Register Dst = MI.getOperand(0).getReg();
2321 Register Src = isa<GIntrinsic>(MI) ? MI.getOperand(2).getReg()
2322 : MI.getOperand(1).getReg();
2323 LLT DstTy = MRI.getType(Dst);
2324 LLT SrcTy = MRI.getType(Src);
2325 unsigned DestAS = DstTy.getAddressSpace();
2326 unsigned SrcAS = SrcTy.getAddressSpace();
2327
2328 // TODO: Avoid reloading from the queue ptr for each cast, or at least each
2329 // vector element.
2330 assert(!DstTy.isVector());
2331
2332 const AMDGPUTargetMachine &TM
2333 = static_cast<const AMDGPUTargetMachine &>(MF.getTarget());
2334
2335 if (TM.isNoopAddrSpaceCast(SrcAS, DestAS)) {
2336 MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST));
2337 return true;
2338 }
2339
2340 if (SrcAS == AMDGPUAS::FLAT_ADDRESS &&
2341 (DestAS == AMDGPUAS::LOCAL_ADDRESS ||
2342 DestAS == AMDGPUAS::PRIVATE_ADDRESS)) {
2343 // For llvm.amdgcn.addrspacecast.nonnull we can always assume non-null, for
2344 // G_ADDRSPACE_CAST we need to guess.
2345 if (isa<GIntrinsic>(MI) || isKnownNonNull(Src, MRI, TM, SrcAS)) {
2346 // Extract low 32-bits of the pointer.
2347 B.buildExtract(Dst, Src, 0);
2348 MI.eraseFromParent();
2349 return true;
2350 }
2351
2352 unsigned NullVal = TM.getNullPointerValue(DestAS);
2353
2354 auto SegmentNull = B.buildConstant(DstTy, NullVal);
2355 auto FlatNull = B.buildConstant(SrcTy, 0);
2356
2357 // Extract low 32-bits of the pointer.
2358 auto PtrLo32 = B.buildExtract(DstTy, Src, 0);
2359
2360 auto CmpRes =
2361 B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, FlatNull.getReg(0));
2362 B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0));
2363
2364 MI.eraseFromParent();
2365 return true;
2366 }
2367
2368 if (DestAS == AMDGPUAS::FLAT_ADDRESS &&
2369 (SrcAS == AMDGPUAS::LOCAL_ADDRESS ||
2370 SrcAS == AMDGPUAS::PRIVATE_ADDRESS)) {
2371 auto castLocalOrPrivateToFlat = [&](const DstOp &Dst) -> Register {
2372 Register ApertureReg = getSegmentAperture(SrcAS, MRI, B);
2373 if (!ApertureReg.isValid())
2374 return false;
2375
2376 // Coerce the type of the low half of the result so we can use
2377 // merge_values.
2378 Register SrcAsInt = B.buildPtrToInt(S32, Src).getReg(0);
2379
2380 // TODO: Should we allow mismatched types but matching sizes in merges to
2381 // avoid the ptrtoint?
2382 return B.buildMergeLikeInstr(Dst, {SrcAsInt, ApertureReg}).getReg(0);
2383 };
2384
2385 // For llvm.amdgcn.addrspacecast.nonnull we can always assume non-null, for
2386 // G_ADDRSPACE_CAST we need to guess.
2387 if (isa<GIntrinsic>(MI) || isKnownNonNull(Src, MRI, TM, SrcAS)) {
2388 castLocalOrPrivateToFlat(Dst);
2389 MI.eraseFromParent();
2390 return true;
2391 }
2392
2393 Register BuildPtr = castLocalOrPrivateToFlat(DstTy);
2394
2395 auto SegmentNull = B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS));
2396 auto FlatNull = B.buildConstant(DstTy, TM.getNullPointerValue(DestAS));
2397
2398 auto CmpRes = B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src,
2399 SegmentNull.getReg(0));
2400
2401 B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull);
2402
2403 MI.eraseFromParent();
2404 return true;
2405 }
2406
2407 if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
2408 SrcTy.getSizeInBits() == 64) {
2409 // Truncate.
2410 B.buildExtract(Dst, Src, 0);
2411 MI.eraseFromParent();
2412 return true;
2413 }
2414
2415 if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
2416 DstTy.getSizeInBits() == 64) {
2418 uint32_t AddrHiVal = Info->get32BitAddressHighBits();
2419 auto PtrLo = B.buildPtrToInt(S32, Src);
2420 auto HighAddr = B.buildConstant(S32, AddrHiVal);
2421 B.buildMergeLikeInstr(Dst, {PtrLo, HighAddr});
2422 MI.eraseFromParent();
2423 return true;
2424 }
2425
2426 DiagnosticInfoUnsupported InvalidAddrSpaceCast(
2427 MF.getFunction(), "invalid addrspacecast", B.getDebugLoc());
2428
2429 LLVMContext &Ctx = MF.getFunction().getContext();
2430 Ctx.diagnose(InvalidAddrSpaceCast);
2431 B.buildUndef(Dst);
2432 MI.eraseFromParent();
2433 return true;
2434}
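In the segment-to-flat direction above, the 64-bit flat pointer is simply the 32-bit segment offset in the low half and the aperture base in the high half; the flat-to-segment direction truncates to the low 32 bits, with a null compare and select when non-null cannot be assumed. A scalar sketch of that bit layout (illustrative only):

  #include <cstdint>

  // local/private -> flat: merge the 32-bit offset with the aperture base.
  static uint64_t segmentToFlat(uint32_t SegmentOffset, uint32_t ApertureHi) {
    return (uint64_t)SegmentOffset | ((uint64_t)ApertureHi << 32);
  }

  // flat -> local/private: keep only the low 32 bits.
  static uint32_t flatToSegment(uint64_t FlatPtr) {
    return (uint32_t)FlatPtr;
  }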
2435
2438 MachineIRBuilder &B) const {
2439 Register Src = MI.getOperand(1).getReg();
2440 LLT Ty = MRI.getType(Src);
2441 assert(Ty.isScalar() && Ty.getSizeInBits() == 64);
2442
2443 APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
2444 APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");
2445
2446 auto C1 = B.buildFConstant(Ty, C1Val);
2447 auto CopySign = B.buildFCopysign(Ty, C1, Src);
2448
2449 // TODO: Should this propagate fast-math-flags?
2450 auto Tmp1 = B.buildFAdd(Ty, Src, CopySign);
2451 auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign);
2452
2453 auto C2 = B.buildFConstant(Ty, C2Val);
2454 auto Fabs = B.buildFAbs(Ty, Src);
2455
2456 auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2);
2457 B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2);
2458 MI.eraseFromParent();
2459 return true;
2460}
2461
2464 MachineIRBuilder &B) const {
2465
2466 const LLT S1 = LLT::scalar(1);
2467 const LLT S64 = LLT::scalar(64);
2468
2469 Register Src = MI.getOperand(1).getReg();
2470 assert(MRI.getType(Src) == S64);
2471
2472 // result = trunc(src)
2473 // if (src > 0.0 && src != result)
2474 // result += 1.0
2475
2476 auto Trunc = B.buildIntrinsicTrunc(S64, Src);
2477
2478 const auto Zero = B.buildFConstant(S64, 0.0);
2479 const auto One = B.buildFConstant(S64, 1.0);
2480 auto Lt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero);
2481 auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc);
2482 auto And = B.buildAnd(S1, Lt0, NeTrunc);
2483 auto Add = B.buildSelect(S64, And, One, Zero);
2484
2485 // TODO: Should this propagate fast-math-flags?
2486 B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add);
2487 MI.eraseFromParent();
2488 return true;
2489}
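A scalar rendition of the select-based ceil expansion above, with a concrete value for orientation:

  #include <cmath>

  static double ceilViaTrunc(double Src) {
    double Result = std::trunc(Src);
    if (Src > 0.0 && Src != Result)
      Result += 1.0;            // e.g. 2.3 -> 3.0; -2.3 stays at -2.0
    return Result;
  }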
2490
2493 MachineIRBuilder &B) const {
2494 Register DstReg = MI.getOperand(0).getReg();
2495 Register Src0Reg = MI.getOperand(1).getReg();
2496 Register Src1Reg = MI.getOperand(2).getReg();
2497 auto Flags = MI.getFlags();
2498 LLT Ty = MRI.getType(DstReg);
2499
2500 auto Div = B.buildFDiv(Ty, Src0Reg, Src1Reg, Flags);
2501 auto Trunc = B.buildIntrinsicTrunc(Ty, Div, Flags);
2502 auto Neg = B.buildFNeg(Ty, Trunc, Flags);
2503 B.buildFMA(DstReg, Neg, Src1Reg, Src0Reg, Flags);
2504 MI.eraseFromParent();
2505 return true;
2506}
2507
2510 const unsigned FractBits = 52;
2511 const unsigned ExpBits = 11;
2512 LLT S32 = LLT::scalar(32);
2513
2514 auto Const0 = B.buildConstant(S32, FractBits - 32);
2515 auto Const1 = B.buildConstant(S32, ExpBits);
2516
2517 auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32})
2518 .addUse(Hi)
2519 .addUse(Const0.getReg(0))
2520 .addUse(Const1.getReg(0));
2521
2522 return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023));
2523}
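The helper above isolates the 11-bit biased exponent from the high dword of an f64 (the ubfe starts at bit FractBits - 32 = 20) and subtracts the IEEE-754 bias of 1023. The equivalent scalar computation, with a worked value:

  #include <cstdint>

  // For 1.5 the high dword is 0x3FF80000; bits [20..30] are 0x3FF = 1023,
  // so the unbiased exponent is 0 (1.5 == 1.5 * 2^0).
  static int32_t extractExpScalar(uint32_t Hi) {
    return (int32_t)((Hi >> 20) & 0x7ff) - 1023;
  }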
2524
2527 MachineIRBuilder &B) const {
2528 const LLT S1 = LLT::scalar(1);
2529 const LLT S32 = LLT::scalar(32);
2530 const LLT S64 = LLT::scalar(64);
2531
2532 Register Src = MI.getOperand(1).getReg();
2533 assert(MRI.getType(Src) == S64);
2534
2535 // TODO: Should this use extract since the low half is unused?
2536 auto Unmerge = B.buildUnmerge({S32, S32}, Src);
2537 Register Hi = Unmerge.getReg(1);
2538
2539 // Extract the upper half, since this is where we will find the sign and
2540 // exponent.
2541 auto Exp = extractF64Exponent(Hi, B);
2542
2543 const unsigned FractBits = 52;
2544
2545 // Extract the sign bit.
2546 const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31);
2547 auto SignBit = B.buildAnd(S32, Hi, SignBitMask);
2548
2549 const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1);
2550
2551 const auto Zero32 = B.buildConstant(S32, 0);
2552
2553 // Extend back to 64-bits.
2554 auto SignBit64 = B.buildMergeLikeInstr(S64, {Zero32, SignBit});
2555
2556 auto Shr = B.buildAShr(S64, FractMask, Exp);
2557 auto Not = B.buildNot(S64, Shr);
2558 auto Tmp0 = B.buildAnd(S64, Src, Not);
2559 auto FiftyOne = B.buildConstant(S32, FractBits - 1);
2560
2561 auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32);
2562 auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne);
2563
2564 auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0);
2565 B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1);
2566 MI.eraseFromParent();
2567 return true;
2568}
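The lowering above implements f64 trunc purely with integer masking: exponents below 0 collapse to a signed zero, exponents above 51 are already integral, and everything in between has its sub-integer mantissa bits cleared. A scalar sketch of the same case split (illustrative, not from this file):

  #include <cmath>
  #include <cstdint>
  #include <cstring>

  static double truncViaMask(double Src) {
    uint64_t Bits;
    std::memcpy(&Bits, &Src, sizeof(Bits));
    int32_t Exp = (int32_t)((Bits >> 52) & 0x7ff) - 1023;
    if (Exp < 0)
      return std::signbit(Src) ? -0.0 : 0.0;  // |Src| < 1: keep only the sign
    if (Exp > 51)
      return Src;                             // already integral (or inf/nan)
    uint64_t FractMask = (UINT64_C(1) << 52) - 1;
    Bits &= ~(FractMask >> Exp);              // clear sub-integer mantissa bits
    double Out;
    std::memcpy(&Out, &Bits, sizeof(Out));
    return Out;
  }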
2569
2572 MachineIRBuilder &B, bool Signed) const {
2573
2574 Register Dst = MI.getOperand(0).getReg();
2575 Register Src = MI.getOperand(1).getReg();
2576
2577 const LLT S64 = LLT::scalar(64);
2578 const LLT S32 = LLT::scalar(32);
2579
2580 assert(MRI.getType(Src) == S64);
2581
2582 auto Unmerge = B.buildUnmerge({S32, S32}, Src);
2583 auto ThirtyTwo = B.buildConstant(S32, 32);
2584
2585 if (MRI.getType(Dst) == S64) {
2586 auto CvtHi = Signed ? B.buildSITOFP(S64, Unmerge.getReg(1))
2587 : B.buildUITOFP(S64, Unmerge.getReg(1));
2588
2589 auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0));
2590 auto LdExp = B.buildFLdexp(S64, CvtHi, ThirtyTwo);
2591
2592 // TODO: Should this propagate fast-math-flags?
2593 B.buildFAdd(Dst, LdExp, CvtLo);
2594 MI.eraseFromParent();
2595 return true;
2596 }
2597
2598 assert(MRI.getType(Dst) == S32);
2599
2600 auto One = B.buildConstant(S32, 1);
2601
2602 MachineInstrBuilder ShAmt;
2603 if (Signed) {
2604 auto ThirtyOne = B.buildConstant(S32, 31);
2605 auto X = B.buildXor(S32, Unmerge.getReg(0), Unmerge.getReg(1));
2606 auto OppositeSign = B.buildAShr(S32, X, ThirtyOne);
2607 auto MaxShAmt = B.buildAdd(S32, ThirtyTwo, OppositeSign);
2608 auto LS = B.buildIntrinsic(Intrinsic::amdgcn_sffbh, {S32})
2609 .addUse(Unmerge.getReg(1));
2610 auto LS2 = B.buildSub(S32, LS, One);
2611 ShAmt = B.buildUMin(S32, LS2, MaxShAmt);
2612 } else
2613 ShAmt = B.buildCTLZ(S32, Unmerge.getReg(1));
2614 auto Norm = B.buildShl(S64, Src, ShAmt);
2615 auto Unmerge2 = B.buildUnmerge({S32, S32}, Norm);
2616 auto Adjust = B.buildUMin(S32, One, Unmerge2.getReg(0));
2617 auto Norm2 = B.buildOr(S32, Unmerge2.getReg(1), Adjust);
2618 auto FVal = Signed ? B.buildSITOFP(S32, Norm2) : B.buildUITOFP(S32, Norm2);
2619 auto Scale = B.buildSub(S32, ThirtyTwo, ShAmt);
2620 B.buildFLdexp(Dst, FVal, Scale);
2621 MI.eraseFromParent();
2622 return true;
2623}
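For the 64-bit integer source handled first above, the value is split into 32-bit halves, each half is converted on its own, and the results are recombined with an ldexp by 32. A scalar sketch of the unsigned flavor (the signed path converts the high half with a signed conversion instead):

  #include <cmath>
  #include <cstdint>

  static double u64ToF64(uint64_t Src) {
    uint32_t Lo = (uint32_t)Src;
    uint32_t Hi = (uint32_t)(Src >> 32);
    return std::ldexp((double)Hi, 32) + (double)Lo;  // CvtHi * 2^32 + CvtLo
  }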
2624
2625// TODO: Copied from DAG implementation. Verify logic and document how this
2626// actually works.
2630 bool Signed) const {
2631
2632 Register Dst = MI.getOperand(0).getReg();
2633 Register Src = MI.getOperand(1).getReg();
2634
2635 const LLT S64 = LLT::scalar(64);
2636 const LLT S32 = LLT::scalar(32);
2637
2638 const LLT SrcLT = MRI.getType(Src);
2639 assert((SrcLT == S32 || SrcLT == S64) && MRI.getType(Dst) == S64);
2640
2641 unsigned Flags = MI.getFlags();
2642
2643 // The basic idea of converting a floating point number into a pair of 32-bit
2644 // integers is illustrated as follows:
2645 //
2646 // tf := trunc(val);
2647 // hif := floor(tf * 2^-32);
2648 // lof := tf - hif * 2^32; // lof is always positive due to floor.
2649 // hi := fptoi(hif);
2650 // lo := fptoi(lof);
2651 //
2652 auto Trunc = B.buildIntrinsicTrunc(SrcLT, Src, Flags);
2654 if (Signed && SrcLT == S32) {
2655 // However, a 32-bit floating point number has only a 23-bit mantissa, which
2656 // is not enough to hold all the significant bits of `lof` if val is
2657 // negative. To avoid the loss of precision, we need to take the absolute
2658 // value after truncating and flip the result back based on the original
2659 // signedness.
2660 Sign = B.buildAShr(S32, Src, B.buildConstant(S32, 31));
2661 Trunc = B.buildFAbs(S32, Trunc, Flags);
2662 }
2663 MachineInstrBuilder K0, K1;
2664 if (SrcLT == S64) {
2665 K0 = B.buildFConstant(
2666 S64, llvm::bit_cast<double>(UINT64_C(/*2^-32*/ 0x3df0000000000000)));
2667 K1 = B.buildFConstant(
2668 S64, llvm::bit_cast<double>(UINT64_C(/*-2^32*/ 0xc1f0000000000000)));
2669 } else {
2670 K0 = B.buildFConstant(
2671 S32, llvm::bit_cast<float>(UINT32_C(/*2^-32*/ 0x2f800000)));
2672 K1 = B.buildFConstant(
2673 S32, llvm::bit_cast<float>(UINT32_C(/*-2^32*/ 0xcf800000)));
2674 }
2675
2676 auto Mul = B.buildFMul(SrcLT, Trunc, K0, Flags);
2677 auto FloorMul = B.buildFFloor(SrcLT, Mul, Flags);
2678 auto Fma = B.buildFMA(SrcLT, FloorMul, K1, Trunc, Flags);
2679
2680 auto Hi = (Signed && SrcLT == S64) ? B.buildFPTOSI(S32, FloorMul)
2681 : B.buildFPTOUI(S32, FloorMul);
2682 auto Lo = B.buildFPTOUI(S32, Fma);
2683
2684 if (Signed && SrcLT == S32) {
2685 // Flip the result based on the signedness, which is either all 0s or 1s.
2686 Sign = B.buildMergeLikeInstr(S64, {Sign, Sign});
2687 // r := xor({lo, hi}, sign) - sign;
2688 B.buildSub(Dst, B.buildXor(S64, B.buildMergeLikeInstr(S64, {Lo, Hi}), Sign),
2689 Sign);
2690 } else
2691 B.buildMergeLikeInstr(Dst, {Lo, Hi});
2692 MI.eraseFromParent();
2693
2694 return true;
2695}
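The comment at the top of this function describes the split into two 32-bit conversions; a scalar sketch of the unsigned, in-range case, using the same K0 = 2^-32 and K1 = -2^32 constants:

  #include <cmath>
  #include <cstdint>

  static uint64_t f64ToU64(double Val) {
    double Tf  = std::trunc(Val);
    double Hif = std::floor(Tf * 0x1.0p-32);
    double Lof = std::fma(Hif, -0x1.0p+32, Tf);   // tf - hif * 2^32, always >= 0
    return ((uint64_t)(uint32_t)Hif << 32) | (uint32_t)Lof;
  }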
2696
2698 MachineInstr &MI) const {
2699 MachineFunction &MF = Helper.MIRBuilder.getMF();
2701
2702 const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE ||
2703 MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE;
2704
2705 // With ieee_mode disabled, the instructions have the correct behavior
2706 // already for G_FMINNUM/G_FMAXNUM
2707 if (!MFI->getMode().IEEE)
2708 return !IsIEEEOp;
2709
2710 if (IsIEEEOp)
2711 return true;
2712
2714}
2715
2718 MachineIRBuilder &B) const {
2719 // TODO: Should move some of this into LegalizerHelper.
2720
2721 // TODO: Promote dynamic indexing of s16 to s32
2722
2723 Register Dst = MI.getOperand(0).getReg();
2724 Register Vec = MI.getOperand(1).getReg();
2725
2726 LLT VecTy = MRI.getType(Vec);
2727 LLT EltTy = VecTy.getElementType();
2728 assert(EltTy == MRI.getType(Dst));
2729
2730 // Other legalization maps vector<? x [type bigger than 64 bits]> via bitcasts
2731 // but we can't go directly to that logic because you can't bitcast a vector
2732 // of pointers to a vector of integers. Therefore, introduce an intermediate
2733 // vector of integers using ptrtoint (and inttoptr on the output) in order to
2734 // drive the legalization forward.
2735 if (EltTy.isPointer() && EltTy.getSizeInBits() > 64) {
2736 LLT IntTy = LLT::scalar(EltTy.getSizeInBits());
2737 LLT IntVecTy = VecTy.changeElementType(IntTy);
2738
2739 auto IntVec = B.buildPtrToInt(IntVecTy, Vec);
2740 auto IntElt = B.buildExtractVectorElement(IntTy, IntVec, MI.getOperand(2));
2741 B.buildIntToPtr(Dst, IntElt);
2742
2743 MI.eraseFromParent();
2744 return true;
2745 }
2746
2747 // FIXME: Artifact combiner probably should have replaced the truncated
2748 // constant before this, so we shouldn't need
2749 // getIConstantVRegValWithLookThrough.
2750 std::optional<ValueAndVReg> MaybeIdxVal =
2751 getIConstantVRegValWithLookThrough(MI.getOperand(2).getReg(), MRI);
2752 if (!MaybeIdxVal) // Dynamic case will be selected to register indexing.
2753 return true;
2754 const uint64_t IdxVal = MaybeIdxVal->Value.getZExtValue();
2755
2756 if (IdxVal < VecTy.getNumElements()) {
2757 auto Unmerge = B.buildUnmerge(EltTy, Vec);
2758 B.buildCopy(Dst, Unmerge.getReg(IdxVal));
2759 } else {
2760 B.buildUndef(Dst);
2761 }
2762
2763 MI.eraseFromParent();
2764 return true;
2765}
2766
2769 MachineIRBuilder &B) const {
2770 // TODO: Should move some of this into LegalizerHelper.
2771
2772 // TODO: Promote dynamic indexing of s16 to s32
2773
2774 Register Dst = MI.getOperand(0).getReg();
2775 Register Vec = MI.getOperand(1).getReg();
2776 Register Ins = MI.getOperand(2).getReg();
2777
2778 LLT VecTy = MRI.getType(Vec);
2779 LLT EltTy = VecTy.getElementType();
2780 assert(EltTy == MRI.getType(Ins));
2781
2782 // Other legalization maps vector<? x [type bigger than 64 bits]> via bitcasts
2783 // but we can't go directly to that logic because you can't bitcast a vector
2784 // of pointers to a vector of integers. Therefore, make the pointer vector
2785 // into an equivalent vector of integers with ptrtoint, insert the ptrtoint'd
2786 // new value, and then inttoptr the result vector back. This will then allow
2787 // the rest of legalization to take over.
2788 if (EltTy.isPointer() && EltTy.getSizeInBits() > 64) {
2789 LLT IntTy = LLT::scalar(EltTy.getSizeInBits());
2790 LLT IntVecTy = VecTy.changeElementType(IntTy);
2791
2792 auto IntVecSource = B.buildPtrToInt(IntVecTy, Vec);
2793 auto IntIns = B.buildPtrToInt(IntTy, Ins);
2794 auto IntVecDest = B.buildInsertVectorElement(IntVecTy, IntVecSource, IntIns,
2795 MI.getOperand(3));
2796 B.buildIntToPtr(Dst, IntVecDest);
2797 MI.eraseFromParent();
2798 return true;
2799 }
2800
2801 // FIXME: Artifact combiner probably should have replaced the truncated
2802 // constant before this, so we shouldn't need
2803 // getIConstantVRegValWithLookThrough.
2804 std::optional<ValueAndVReg> MaybeIdxVal =
2805 getIConstantVRegValWithLookThrough(MI.getOperand(3).getReg(), MRI);
2806 if (!MaybeIdxVal) // Dynamic case will be selected to register indexing.
2807 return true;
2808
2809 const uint64_t IdxVal = MaybeIdxVal->Value.getZExtValue();
2810
2811 unsigned NumElts = VecTy.getNumElements();
2812 if (IdxVal < NumElts) {
2814 for (unsigned i = 0; i < NumElts; ++i)
2815 SrcRegs.push_back(MRI.createGenericVirtualRegister(EltTy));
2816 B.buildUnmerge(SrcRegs, Vec);
2817
2818 SrcRegs[IdxVal] = MI.getOperand(2).getReg();
2819 B.buildMergeLikeInstr(Dst, SrcRegs);
2820 } else {
2821 B.buildUndef(Dst);
2822 }
2823
2824 MI.eraseFromParent();
2825 return true;
2826}
2827
2830 MachineIRBuilder &B) const {
2831
2832 Register DstReg = MI.getOperand(0).getReg();
2833 Register SrcReg = MI.getOperand(1).getReg();
2834 LLT Ty = MRI.getType(DstReg);
2835 unsigned Flags = MI.getFlags();
2836
2837 Register TrigVal;
2838 auto OneOver2Pi = B.buildFConstant(Ty, 0.5 * numbers::inv_pi);
2839 if (ST.hasTrigReducedRange()) {
2840 auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags);
2841 TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty})
2842 .addUse(MulVal.getReg(0))
2843 .setMIFlags(Flags)
2844 .getReg(0);
2845 } else
2846 TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0);
2847
2848 Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ?
2849 Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos;
2850 B.buildIntrinsic(TrigIntrin, ArrayRef<Register>(DstReg))
2851 .addUse(TrigVal)
2852 .setMIFlags(Flags);
2853 MI.eraseFromParent();
2854 return true;
2855}
2856
2859 const GlobalValue *GV,
2860 int64_t Offset,
2861 unsigned GAFlags) const {
2862 assert(isInt<32>(Offset + 4) && "32-bit offset is expected!");
2863 // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered
2864 // to the following code sequence:
2865 //
2866 // For constant address space:
2867 // s_getpc_b64 s[0:1]
2868 // s_add_u32 s0, s0, $symbol
2869 // s_addc_u32 s1, s1, 0
2870 //
2871 // s_getpc_b64 returns the address of the s_add_u32 instruction and then
2872 // a fixup or relocation is emitted to replace $symbol with a literal
2873 // constant, which is a pc-relative offset from the encoding of the $symbol
2874 // operand to the global variable.
2875 //
2876 // For global address space:
2877 // s_getpc_b64 s[0:1]
2878 // s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo
2879 // s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi
2880 //
2881 // s_getpc_b64 returns the address of the s_add_u32 instruction and then
2882 // fixups or relocations are emitted to replace $symbol@*@lo and
2883 // $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant,
2884 // which is a 64-bit pc-relative offset from the encoding of the $symbol
2885 // operand to the global variable.
2886
2888
2889 Register PCReg = PtrTy.getSizeInBits() != 32 ? DstReg :
2890 B.getMRI()->createGenericVirtualRegister(ConstPtrTy);
2891
2892 MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET)
2893 .addDef(PCReg);
2894
2895 MIB.addGlobalAddress(GV, Offset, GAFlags);
2896 if (GAFlags == SIInstrInfo::MO_NONE)
2897 MIB.addImm(0);
2898 else
2899 MIB.addGlobalAddress(GV, Offset, GAFlags + 1);
2900
2901 if (!B.getMRI()->getRegClassOrNull(PCReg))
2902 B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass);
2903
2904 if (PtrTy.getSizeInBits() == 32)
2905 B.buildExtract(DstReg, PCReg, 0);
2906 return true;
2907}
2908
2909// Emit an ABS32_LO / ABS32_HI relocation stub.
2911 Register DstReg, LLT PtrTy, MachineIRBuilder &B, const GlobalValue *GV,
2912 MachineRegisterInfo &MRI) const {
2913 bool RequiresHighHalf = PtrTy.getSizeInBits() != 32;
2914
2915 LLT S32 = LLT::scalar(32);
2916
2917 // Use the destination directly, if and only if we store the lower address
2918 // part only and we don't have a register class being set.
2919 Register AddrLo = !RequiresHighHalf && !MRI.getRegClassOrNull(DstReg)
2920 ? DstReg
2921 : MRI.createGenericVirtualRegister(S32);
2922
2923 if (!MRI.getRegClassOrNull(AddrLo))
2924 MRI.setRegClass(AddrLo, &AMDGPU::SReg_32RegClass);
2925
2926 // Write the lower half.
2927 B.buildInstr(AMDGPU::S_MOV_B32)
2928 .addDef(AddrLo)
2929 .addGlobalAddress(GV, 0, SIInstrInfo::MO_ABS32_LO);
2930
2931 // If required, write the upper half as well.
2932 if (RequiresHighHalf) {
2933 assert(PtrTy.getSizeInBits() == 64 &&
2934 "Must provide a 64-bit pointer type!");
2935
2936 Register AddrHi = MRI.createGenericVirtualRegister(S32);
2937 MRI.setRegClass(AddrHi, &AMDGPU::SReg_32RegClass);
2938
2939 B.buildInstr(AMDGPU::S_MOV_B32)
2940 .addDef(AddrHi)
2941 .addGlobalAddress(GV, 0, SIInstrInfo::MO_ABS32_HI);
2942
2943 // Use the destination directly, if and only if we don't have a register
2944 // class being set.
2945 Register AddrDst = !MRI.getRegClassOrNull(DstReg)
2946 ? DstReg
2947 : MRI.createGenericVirtualRegister(LLT::scalar(64));
2948
2949 if (!MRI.getRegClassOrNull(AddrDst))
2950 MRI.setRegClass(AddrDst, &AMDGPU::SReg_64RegClass);
2951
2952 B.buildMergeValues(AddrDst, {AddrLo, AddrHi});
2953
2954 // If we created a new register for the destination, cast the result into
2955 // the final output.
2956 if (AddrDst != DstReg)
2957 B.buildCast(DstReg, AddrDst);
2958 } else if (AddrLo != DstReg) {
2959 // If we created a new register for the destination, cast the result into
2960 // the final output.
2961 B.buildCast(DstReg, AddrLo);
2962 }
2963}
2964
2967 MachineIRBuilder &B) const {
2968 Register DstReg = MI.getOperand(0).getReg();
2969 LLT Ty = MRI.getType(DstReg);
2970 unsigned AS = Ty.getAddressSpace();
2971
2972 const GlobalValue *GV = MI.getOperand(1).getGlobal();
2973 MachineFunction &MF = B.getMF();
2975
2977 if (!MFI->isModuleEntryFunction() &&
2978 GV->getName() != "llvm.amdgcn.module.lds") {
2979 const Function &Fn = MF.getFunction();
2980 DiagnosticInfoUnsupported BadLDSDecl(
2981 Fn, "local memory global used by non-kernel function", MI.getDebugLoc(),
2982 DS_Warning);
2983 Fn.getContext().diagnose(BadLDSDecl);
2984
2985 // We currently don't have a way to correctly allocate LDS objects that
2986 // aren't directly associated with a kernel. We do force inlining of
2987 // functions that use local objects. However, if these dead functions are
2988 // not eliminated, we don't want a compile time error. Just emit a warning
2989 // and a trap, since there should be no callable path here.
2990 B.buildTrap();
2991 B.buildUndef(DstReg);
2992 MI.eraseFromParent();
2993 return true;
2994 }
2995
2996 // TODO: We could emit code to handle the initialization somewhere.
2997 // We ignore the initializer for now and legalize it to allow selection.
2998 // The initializer will be rejected during assembly emission anyway.
2999 const SITargetLowering *TLI = ST.getTargetLowering();
3000 if (!TLI->shouldUseLDSConstAddress(GV)) {
3001 MI.getOperand(1).setTargetFlags(SIInstrInfo::MO_ABS32_LO);
3002 return true; // Leave in place;
3003 }
3004
3005 if (AS == AMDGPUAS::LOCAL_ADDRESS && GV->hasExternalLinkage()) {
3006 Type *Ty = GV->getValueType();
3007 // HIP uses an unsized array `extern __shared__ T s[]` or similar
3008 // zero-sized type in other languages to declare the dynamic shared
3009 // memory whose size is not known at compile time. They will be
3010 // allocated by the runtime and placed directly after the statically
3011 // allocated ones. They all share the same offset.
3012 if (B.getDataLayout().getTypeAllocSize(Ty).isZero()) {
3013 // Adjust alignment for that dynamic shared memory array.
3014 MFI->setDynLDSAlign(MF.getFunction(), *cast<GlobalVariable>(GV));
3015 LLT S32 = LLT::scalar(32);
3016 auto Sz = B.buildIntrinsic(Intrinsic::amdgcn_groupstaticsize, {S32});
3017 B.buildIntToPtr(DstReg, Sz);
3018 MI.eraseFromParent();
3019 return true;
3020 }
3021 }
3022
3023 B.buildConstant(DstReg, MFI->allocateLDSGlobal(B.getDataLayout(),
3024 *cast<GlobalVariable>(GV)));
3025 MI.eraseFromParent();
3026 return true;
3027 }
3028
3029 if (ST.isAmdPalOS() || ST.isMesa3DOS()) {
3030 buildAbsGlobalAddress(DstReg, Ty, B, GV, MRI);
3031 MI.eraseFromParent();
3032 return true;
3033 }
3034
3035 const SITargetLowering *TLI = ST.getTargetLowering();
3036
3037 if (TLI->shouldEmitFixup(GV)) {
3038 buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0);
3039 MI.eraseFromParent();
3040 return true;
3041 }
3042
3043 if (TLI->shouldEmitPCReloc(GV)) {
3044 buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32);
3045 MI.eraseFromParent();
3046 return true;
3047 }
3048
3050 Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy);
3051
3052 LLT LoadTy = Ty.getSizeInBits() == 32 ? PtrTy : Ty;
3057 LoadTy, Align(8));
3058
3059 buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32);
3060
3061 if (Ty.getSizeInBits() == 32) {
3062 // Truncate if this is a 32-bit constant address.
3063 auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO);
3064 B.buildExtract(DstReg, Load, 0);
3065 } else
3066 B.buildLoad(DstReg, GOTAddr, *GOTMMO);
3067
3068 MI.eraseFromParent();
3069 return true;
3070}
3071
3073 if (Ty.isVector())
3074 return Ty.changeElementCount(
3077}
3078
3080 MachineInstr &MI) const {
3081 MachineIRBuilder &B = Helper.MIRBuilder;
3082 MachineRegisterInfo &MRI = *B.getMRI();
3083 GISelChangeObserver &Observer = Helper.Observer;
3084
3085 Register PtrReg = MI.getOperand(1).getReg();
3086 LLT PtrTy = MRI.getType(PtrReg);
3087 unsigned AddrSpace = PtrTy.getAddressSpace();
3088
3089 if (AddrSpace == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
3091 auto Cast = B.buildAddrSpaceCast(ConstPtr, PtrReg);
3092 Observer.changingInstr(MI);
3093 MI.getOperand(1).setReg(Cast.getReg(0));
3094 Observer.changedInstr(MI);
3095 return true;
3096 }
3097
3098 if (MI.getOpcode() != AMDGPU::G_LOAD)
3099 return false;
3100
3101 Register ValReg = MI.getOperand(0).getReg();
3102 LLT ValTy = MRI.getType(ValReg);
3103
3104 if (hasBufferRsrcWorkaround(ValTy)) {
3105 Observer.changingInstr(MI);
3107 Observer.changedInstr(MI);
3108 return true;
3109 }
3110
3111 MachineMemOperand *MMO = *MI.memoperands_begin();
3112 const unsigned ValSize = ValTy.getSizeInBits();
3113 const LLT MemTy = MMO->getMemoryType();
3114 const Align MemAlign = MMO->getAlign();
3115 const unsigned MemSize = MemTy.getSizeInBits();
3116 const uint64_t AlignInBits = 8 * MemAlign.value();
3117
3118 // Widen non-power-of-2 loads to the alignment if needed
3119 if (shouldWidenLoad(ST, MemTy, AlignInBits, AddrSpace, MI.getOpcode())) {
3120 const unsigned WideMemSize = PowerOf2Ceil(MemSize);
3121
3122 // This was already the correct extending load result type, so just adjust
3123 // the memory type.
3124 if (WideMemSize == ValSize) {
3125 MachineFunction &MF = B.getMF();
3126
3127 MachineMemOperand *WideMMO =
3128 MF.getMachineMemOperand(MMO, 0, WideMemSize / 8);
3129 Observer.changingInstr(MI);
3130 MI.setMemRefs(MF, {WideMMO});
3131 Observer.changedInstr(MI);
3132 return true;
3133 }
3134
3135 // Don't bother handling an edge case that should probably never be produced.
3136 if (ValSize > WideMemSize)
3137 return false;
3138
3139 LLT WideTy = widenToNextPowerOf2(ValTy);
3140
3141 Register WideLoad;
3142 if (!WideTy.isVector()) {
3143 WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
3144 B.buildTrunc(ValReg, WideLoad).getReg(0);
3145 } else {
3146 // Extract the subvector.
3147
3148 if (isRegisterType(ValTy)) {
3149 // If this a case where G_EXTRACT is legal, use it.
3150 // (e.g. <3 x s32> -> <4 x s32>)
3151 WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
3152 B.buildExtract(ValReg, WideLoad, 0);
3153 } else {
3154 // For cases where the widened type isn't a nice register value, unmerge
3155 // from a widened register (e.g. <3 x s16> -> <4 x s16>)
3156 WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
3157 B.buildDeleteTrailingVectorElements(ValReg, WideLoad);
3158 }
3159 }
3160
3161 MI.eraseFromParent();
3162 return true;
3163 }
3164
3165 return false;
3166}
3167
3169 MachineInstr &MI) const {
3170 MachineIRBuilder &B = Helper.MIRBuilder;
3171 MachineRegisterInfo &MRI = *B.getMRI();
3172 GISelChangeObserver &Observer = Helper.Observer;
3173
3174 Register DataReg = MI.getOperand(0).getReg();
3175 LLT DataTy = MRI.getType(DataReg);
3176
3177 if (hasBufferRsrcWorkaround(DataTy)) {
3178 Observer.changingInstr(MI);
3180 Observer.changedInstr(MI);
3181 return true;
3182 }
3183 return false;
3184}
3185
3188 MachineIRBuilder &B) const {
3189 LLT Ty = MRI.getType(MI.getOperand(0).getReg());
3190 assert(Ty.isScalar());
3191
3192 MachineFunction &MF = B.getMF();
3194
3195 // TODO: Always legal with future ftz flag.
3196 // FIXME: Do we need just output?
3197 if (Ty == LLT::float32() &&
3199 return true;
3200 if (Ty == LLT::float16() &&
3202 return true;
3203
3204 MachineIRBuilder HelperBuilder(MI);
3205 GISelObserverWrapper DummyObserver;
3206 LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
3207 return Helper.lowerFMad(MI) == LegalizerHelper::Legalized;
3208}
3209
3212 Register DstReg = MI.getOperand(0).getReg();
3213 Register PtrReg = MI.getOperand(1).getReg();
3214 Register CmpVal = MI.getOperand(2).getReg();
3215 Register NewVal = MI.getOperand(3).getReg();
3216
3217 assert(AMDGPU::isFlatGlobalAddrSpace(MRI.getType(PtrReg).getAddressSpace()) &&
3218 "this should not have been custom lowered");
3219
3220 LLT ValTy = MRI.getType(CmpVal);
3221 LLT VecTy = LLT::fixed_vector(2, ValTy);
3222
3223 Register PackedVal = B.buildBuildVector(VecTy, { NewVal, CmpVal }).getReg(0);
3224
3225 B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG)
3226 .addDef(DstReg)
3227 .addUse(PtrReg)
3228 .addUse(PackedVal)
3229 .setMemRefs(MI.memoperands());
3230
3231 MI.eraseFromParent();
3232 return true;
3233}
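The marshalling above packs the compare-and-swap operands into one wide value, lane 0 carrying the new value and lane 1 the compare value. Assuming the usual low-lane-first register layout, the 32-bit case amounts to:

  #include <cstdint>

  // lane 0 = new value (low half), lane 1 = compare value (high half).
  static uint64_t packCmpXchgOperands(uint32_t NewVal, uint32_t CmpVal) {
    return (uint64_t)NewVal | ((uint64_t)CmpVal << 32);
  }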
3234
3235/// Return true if it's known that \p Src can never be an f32 denormal value.
3237 Register Src) {
3238 const MachineInstr *DefMI = MRI.getVRegDef(Src);
3239 switch (DefMI->getOpcode()) {
3240 case TargetOpcode::G_INTRINSIC: {
3241 switch (cast<GIntrinsic>(DefMI)->getIntrinsicID()) {
3242 case Intrinsic::amdgcn_frexp_mant:
3243 return true;
3244 default:
3245 break;
3246 }
3247
3248 break;
3249 }
3250 case TargetOpcode::G_FFREXP: {
3251 if (DefMI->getOperand(0).getReg() == Src)
3252 return true;
3253 break;
3254 }
3255 case TargetOpcode::G_FPEXT: {
3256 return MRI.getType(DefMI->getOperand(1).getReg()) == LLT::scalar(16);
3257 }
3258 default:
3259 return false;
3260 }
3261
3262 return false;
3263}
3264
3265static bool allowApproxFunc(const MachineFunction &MF, unsigned Flags) {
3266 if (Flags & MachineInstr::FmAfn)
3267 return true;
3268 const auto &Options = MF.getTarget().Options;
3269 return Options.UnsafeFPMath || Options.ApproxFuncFPMath;
3270}
3271
3273 unsigned Flags) {
3274 return !valueIsKnownNeverF32Denorm(MF.getRegInfo(), Src) &&
3277}
3278
3279std::pair<Register, Register>
3281 unsigned Flags) const {
3282 if (!needsDenormHandlingF32(B.getMF(), Src, Flags))
3283 return {};
3284
3285 const LLT F32 = LLT::scalar(32);
3286 auto SmallestNormal = B.buildFConstant(
3288 auto IsLtSmallestNormal =
3289 B.buildFCmp(CmpInst::FCMP_OLT, LLT::scalar(1), Src, SmallestNormal);
3290
3291 auto Scale32 = B.buildFConstant(F32, 0x1.0p+32);
3292 auto One = B.buildFConstant(F32, 1.0);
3293 auto ScaleFactor =
3294 B.buildSelect(F32, IsLtSmallestNormal, Scale32, One, Flags);
3295 auto ScaledInput = B.buildFMul(F32, Src, ScaleFactor, Flags);
3296
3297 return {ScaledInput.getReg(0), IsLtSmallestNormal.getReg(0)};
3298}
3299
3301 MachineIRBuilder &B) const {
3302 // v_log_f32 is good enough for OpenCL, except it doesn't handle denormals.
3303 // If we have to handle denormals, scale up the input and adjust the result.
3304
3305 // scaled = x * (is_denormal ? 0x1.0p+32 : 1.0)
3306 // log2 = amdgpu_log2 - (is_denormal ? 32.0 : 0.0)
3307
3308 Register Dst = MI.getOperand(0).getReg();
3309 Register Src = MI.getOperand(1).getReg();
3310 LLT Ty = B.getMRI()->getType(Dst);
3311 unsigned Flags = MI.getFlags();
3312
3313 if (Ty == LLT::scalar(16)) {
3314 const LLT F32 = LLT::scalar(32);
3315 // Nothing in half is a denormal when promoted to f32.
3316 auto Ext = B.buildFPExt(F32, Src, Flags);
3317 auto Log2 = B.buildIntrinsic(Intrinsic::amdgcn_log, {F32})
3318 .addUse(Ext.getReg(0))
3319 .setMIFlags(Flags);
3320 B.buildFPTrunc(Dst, Log2, Flags);
3321 MI.eraseFromParent();
3322 return true;
3323 }
3324
3325 assert(Ty == LLT::scalar(32));
3326
3327 auto [ScaledInput, IsLtSmallestNormal] = getScaledLogInput(B, Src, Flags);
3328 if (!ScaledInput) {
3329 B.buildIntrinsic(Intrinsic::amdgcn_log, {MI.getOperand(0)})
3330 .addUse(Src)
3331 .setMIFlags(Flags);
3332 MI.eraseFromParent();
3333 return true;
3334 }
3335
3336 auto Log2 = B.buildIntrinsic(Intrinsic::amdgcn_log, {Ty})
3337 .addUse(ScaledInput)
3338 .setMIFlags(Flags);
3339
3340 auto ThirtyTwo = B.buildFConstant(Ty, 32.0);
3341 auto Zero = B.buildFConstant(Ty, 0.0);
3342 auto ResultOffset =
3343 B.buildSelect(Ty, IsLtSmallestNormal, ThirtyTwo, Zero, Flags);
3344 B.buildFSub(Dst, Log2, ResultOffset, Flags);
3345
3346 MI.eraseFromParent();
3347 return true;
3348}
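The denormal workaround above relies on log2(x * 2^32) == log2(x) + 32: inputs below the smallest normal are scaled up before the hardware log and 32 is subtracted afterwards. A scalar sketch with std::log2 standing in for the amdgcn log intrinsic:

  #include <cmath>

  static float log2HandlingDenormals(float X) {
    bool Scaled = X < 0x1.0p-126f;           // below the smallest normal f32
    float In = Scaled ? X * 0x1.0p+32f : X;
    float L  = std::log2(In);
    return Scaled ? L - 32.0f : L;
  }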
3349
3351 Register Z, unsigned Flags) {
3352 auto FMul = B.buildFMul(Ty, X, Y, Flags);
3353 return B.buildFAdd(Ty, FMul, Z, Flags).getReg(0);
3354}
3355
3357 MachineIRBuilder &B) const {
3358 const bool IsLog10 = MI.getOpcode() == TargetOpcode::G_FLOG10;
3359 assert(IsLog10 || MI.getOpcode() == TargetOpcode::G_FLOG);
3360
3361 MachineRegisterInfo &MRI = *B.getMRI();
3362 Register Dst = MI.getOperand(0).getReg();
3363 Register X = MI.getOperand(1).getReg();
3364 unsigned Flags = MI.getFlags();
3365 const LLT Ty = MRI.getType(X);
3366 MachineFunction &MF = B.getMF();
3367
3368 const LLT F32 = LLT::scalar(32);
3369 const LLT F16 = LLT::scalar(16);
3370
3371 const AMDGPUTargetMachine &TM =
3372 static_cast<const AMDGPUTargetMachine &>(MF.getTarget());
3373
3374 if (Ty == F16 || MI.getFlag(MachineInstr::FmAfn) ||
3375 TM.Options.ApproxFuncFPMath || TM.Options.UnsafeFPMath) {
3376 if (Ty == F16 && !ST.has16BitInsts()) {
3377 Register LogVal = MRI.createGenericVirtualRegister(F32);
3378 auto PromoteSrc = B.buildFPExt(F32, X);
3379 legalizeFlogUnsafe(B, LogVal, PromoteSrc.getReg(0), IsLog10, Flags);
3380 B.buildFPTrunc(Dst, LogVal);
3381 } else {
3382 legalizeFlogUnsafe(B, Dst, X, IsLog10, Flags);
3383 }
3384
3385 MI.eraseFromParent();
3386 return true;
3387 }
3388
3389 auto [ScaledInput, IsScaled] = getScaledLogInput(B, X, Flags);
3390 if (ScaledInput)
3391 X = ScaledInput;
3392
3393 auto Y =
3394 B.buildIntrinsic(Intrinsic::amdgcn_log, {Ty}).addUse(X).setMIFlags(Flags);
3395
3396 Register R;
3397 if (ST.hasFastFMAF32()) {
3398 // c+cc are ln(2)/ln(10) to more than 49 bits
3399 const float c_log10 = 0x1.344134p-2f;
3400 const float cc_log10 = 0x1.09f79ep-26f;
3401
3402 // c + cc is ln(2) to more than 49 bits
3403 const float c_log = 0x1.62e42ep-1f;
3404 const float cc_log = 0x1.efa39ep-25f;
3405
3406 auto C = B.buildFConstant(Ty, IsLog10 ? c_log10 : c_log);
3407 auto CC = B.buildFConstant(Ty, IsLog10 ? cc_log10 : cc_log);
3408
3409 R = B.buildFMul(Ty, Y, C, Flags).getReg(0);
3410 auto NegR = B.buildFNeg(Ty, R, Flags);
3411 auto FMA0 = B.buildFMA(Ty, Y, C, NegR, Flags);
3412 auto FMA1 = B.buildFMA(Ty, Y, CC, FMA0, Flags);
3413 R = B.buildFAdd(Ty, R, FMA1, Flags).getReg(0);
3414 } else {
3415 // ch+ct is ln(2)/ln(10) to more than 36 bits
3416 const float ch_log10 = 0x1.344000p-2f;
3417 const float ct_log10 = 0x1.3509f6p-18f;
3418
3419 // ch + ct is ln(2) to more than 36 bits
3420 const float ch_log = 0x1.62e000p-1f;
3421 const float ct_log = 0x1.0bfbe8p-15f;
3422
3423 auto CH = B.buildFConstant(Ty, IsLog10 ? ch_log10 : ch_log);
3424 auto CT = B.buildFConstant(Ty, IsLog10 ? ct_log10 : ct_log);
3425
3426 auto MaskConst = B.buildConstant(Ty, 0xfffff000);
3427 auto YH = B.buildAnd(Ty, Y, MaskConst);
3428 auto YT = B.buildFSub(Ty, Y, YH, Flags);
3429 auto YTCT = B.buildFMul(Ty, YT, CT, Flags);
3430
3431 Register Mad0 =
3432 getMad(B, Ty, YH.getReg(0), CT.getReg(0), YTCT.getReg(0), Flags);
3433 Register Mad1 = getMad(B, Ty, YT.getReg(0), CH.getReg(0), Mad0, Flags);
3434 R = getMad(B, Ty, YH.getReg(0), CH.getReg(0), Mad1, Flags);
3435 }
3436
3437 const bool IsFiniteOnly =
3438 (MI.getFlag(MachineInstr::FmNoNans) || TM.Options.NoNaNsFPMath) &&
3439 (MI.getFlag(MachineInstr::FmNoInfs) || TM.Options.NoInfsFPMath);
3440
3441 if (!IsFiniteOnly) {
3442 // Expand isfinite(x) => fabs(x) < inf
3443 auto Inf = B.buildFConstant(Ty, APFloat::getInf(APFloat::IEEEsingle()));
3444 auto Fabs = B.buildFAbs(Ty, Y);
3445 auto IsFinite =
3446 B.buildFCmp(CmpInst::FCMP_OLT, LLT::scalar(1), Fabs, Inf, Flags);
3447 R = B.buildSelect(Ty, IsFinite, R, Y, Flags).getReg(0);
3448 }
3449
3450 if (ScaledInput) {
3451 auto Zero = B.buildFConstant(Ty, 0.0);
3452 auto ShiftK =
3453 B.buildFConstant(Ty, IsLog10 ? 0x1.344136p+3f : 0x1.62e430p+4f);
3454 auto Shift = B.buildSelect(Ty, IsScaled, ShiftK, Zero, Flags);
3455 B.buildFSub(Dst, R, Shift, Flags);
3456 } else {
3457 B.buildCopy(Dst, R);
3458 }
3459
3460 MI.eraseFromParent();
3461 return true;
3462}
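On the fast-FMA path above, the multiply by ln(2) (or ln(2)/ln(10)) uses a head/tail split of the constant so that roughly 49 bits of it participate, with the FMAs recovering the rounding error of the first product. A scalar sketch for the natural-log case:

  #include <cmath>

  static float lnFromLog2(float Y /* == log2(x) */) {
    const float C  = 0x1.62e42ep-1f;    // head of ln(2)
    const float CC = 0x1.efa39ep-25f;   // tail of ln(2)
    float R    = Y * C;
    float Fma0 = std::fmaf(Y, C, -R);   // exact rounding error of Y * C
    float Fma1 = std::fmaf(Y, CC, Fma0);
    return R + Fma1;
  }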
3463
3465 Register Src, bool IsLog10,
3466 unsigned Flags) const {
3467 const double Log2BaseInverted =
3469
3470 LLT Ty = B.getMRI()->getType(Dst);
3471
3472 if (Ty == LLT::scalar(32)) {
3473 auto [ScaledInput, IsScaled] = getScaledLogInput(B, Src, Flags);
3474 if (ScaledInput) {
3475 auto LogSrc = B.buildIntrinsic(Intrinsic::amdgcn_log, {Ty})
3476 .addUse(Src)
3477 .setMIFlags(Flags);
3478 auto ScaledResultOffset = B.buildFConstant(Ty, -32.0 * Log2BaseInverted);
3479 auto Zero = B.buildFConstant(Ty, 0.0);
3480 auto ResultOffset =
3481 B.buildSelect(Ty, IsScaled, ScaledResultOffset, Zero, Flags);
3482 auto Log2Inv = B.buildFConstant(Ty, Log2BaseInverted);
3483
3484 if (ST.hasFastFMAF32())
3485 B.buildFMA(Dst, LogSrc, Log2Inv, ResultOffset, Flags);
3486 else {
3487 auto Mul = B.buildFMul(Ty, LogSrc, Log2Inv, Flags);
3488 B.buildFAdd(Dst, Mul, ResultOffset, Flags);
3489 }
3490
3491 return true;
3492 }
3493 }
3494
3495 auto Log2Operand = Ty == LLT::scalar(16)
3496 ? B.buildFLog2(Ty, Src, Flags)
3497 : B.buildIntrinsic(Intrinsic::amdgcn_log, {Ty})
3498 .addUse(Src)
3499 .setMIFlags(Flags);
3500 auto Log2BaseInvertedOperand = B.buildFConstant(Ty, Log2BaseInverted);
3501 B.buildFMul(Dst, Log2Operand, Log2BaseInvertedOperand, Flags);
3502 return true;
3503}
3504
3505bool AMDGPULegalizerInfo::legalizeFExp2(MachineInstr &MI,
3506 MachineIRBuilder &B) const {
3507 // v_exp_f32 is good enough for OpenCL, except it doesn't handle denormals.
3508 // If we have to handle denormals, scale up the input and adjust the result.
3509
3510 Register Dst = MI.getOperand(0).getReg();
3511 Register Src = MI.getOperand(1).getReg();
3512 unsigned Flags = MI.getFlags();
3513 LLT Ty = B.getMRI()->getType(Dst);
3514 const LLT F16 = LLT::scalar(16);
3515 const LLT F32 = LLT::scalar(32);
3516
3517 if (Ty == F16) {
3518 // Nothing in half is a denormal when promoted to f32.
3519 auto Ext = B.buildFPExt(F32, Src, Flags);
3520 auto Exp2 = B.buildIntrinsic(Intrinsic::amdgcn_exp2, {F32})
3521 .addUse(Ext.getReg(0))
3522 .setMIFlags(Flags);
3523 B.buildFPTrunc(Dst, Exp2, Flags);
3524 MI.eraseFromParent();
3525 return true;
3526 }
3527
3528 assert(Ty == F32);
3529
3530 if (!needsDenormHandlingF32(B.getMF(), Src, Flags)) {
3531 B.buildIntrinsic(Intrinsic::amdgcn_exp2, ArrayRef<Register>{Dst})
3532 .addUse(Src)
3533 .setMIFlags(Flags);
3534 MI.eraseFromParent();
3535 return true;
3536 }
3537
3538 // bool needs_scaling = x < -0x1.f80000p+6f;
3539 // v_exp_f32(x + (s ? 0x1.0p+6f : 0.0f)) * (s ? 0x1.0p-64f : 1.0f);
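// That is: if x is small enough that exp2(x) would be flushed as a denormal,
// evaluate exp2(x + 64) instead and multiply the result by 2^-64 afterwards.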
3540
3541 // -nextafter(128.0, -1)
3542 auto RangeCheckConst = B.buildFConstant(Ty, -0x1.f80000p+6f);
3543 auto NeedsScaling = B.buildFCmp(CmpInst::FCMP_OLT, LLT::scalar(1), Src,
3544 RangeCheckConst, Flags);
3545
3546 auto SixtyFour = B.buildFConstant(Ty, 0x1.0p+6f);
3547 auto Zero = B.buildFConstant(Ty, 0.0);
3548 auto AddOffset = B.buildSelect(F32, NeedsScaling, SixtyFour, Zero, Flags);
3549 auto AddInput = B.buildFAdd(F32, Src, AddOffset, Flags);
3550
3551 auto Exp2 = B.buildIntrinsic(Intrinsic::amdgcn_exp2, {Ty})
3552 .addUse(AddInput.getReg(0))
3553 .setMIFlags(Flags);
3554
3555 auto TwoExpNeg64 = B.buildFConstant(Ty, 0x1.0p-64f);
3556 auto One = B.buildFConstant(Ty, 1.0);
3557 auto ResultScale = B.buildSelect(F32, NeedsScaling, TwoExpNeg64, One, Flags);
3558 B.buildFMul(Dst, Exp2, ResultScale, Flags);
3559 MI.eraseFromParent();
3560 return true;
3561}
3562
3563bool AMDGPULegalizerInfo::legalizeFExpUnsafe(MachineIRBuilder &B, Register Dst,
3564 Register X, unsigned Flags) const {
3565 LLT Ty = B.getMRI()->getType(Dst);
3566 LLT F32 = LLT::scalar(32);
3567
3568 if (Ty != F32 || !needsDenormHandlingF32(B.getMF(), X, Flags)) {
3569 auto Log2E = B.buildFConstant(Ty, numbers::log2e);
3570 auto Mul = B.buildFMul(Ty, X, Log2E, Flags);
3571
3572 if (Ty == F32) {
3573 B.buildIntrinsic(Intrinsic::amdgcn_exp2, ArrayRef<Register>{Dst})
3574 .addUse(Mul.getReg(0))
3575 .setMIFlags(Flags);
3576 } else {
3577 B.buildFExp2(Dst, Mul.getReg(0), Flags);
3578 }
3579
3580 return true;
3581 }
3582
3583 auto Threshold = B.buildFConstant(Ty, -0x1.5d58a0p+6f);
3584 auto NeedsScaling =
3585 B.buildFCmp(CmpInst::FCMP_OLT, LLT::scalar(1), X, Threshold, Flags);
3586 auto ScaleOffset = B.buildFConstant(Ty, 0x1.0p+6f);
3587 auto ScaledX = B.buildFAdd(Ty, X, ScaleOffset, Flags);
3588 auto AdjustedX = B.buildSelect(Ty, NeedsScaling, ScaledX, X, Flags);
3589
3590 auto Log2E = B.buildFConstant(Ty, numbers::log2e);
3591 auto ExpInput = B.buildFMul(Ty, AdjustedX, Log2E, Flags);
3592
3593 auto Exp2 = B.buildIntrinsic(Intrinsic::amdgcn_exp2, {Ty})
3594 .addUse(ExpInput.getReg(0))
3595 .setMIFlags(Flags);
3596
3597 auto ResultScaleFactor = B.buildFConstant(Ty, 0x1.969d48p-93f);
3598 auto AdjustedResult = B.buildFMul(Ty, Exp2, ResultScaleFactor, Flags);
3599 B.buildSelect(Dst, NeedsScaling, AdjustedResult, Exp2, Flags);
3600 return true;
3601}
3602
3603bool AMDGPULegalizerInfo::legalizeFExp(MachineInstr &MI,
3604 MachineIRBuilder &B) const {
3605 Register Dst = MI.getOperand(0).getReg();
3606 Register X = MI.getOperand(1).getReg();
3607 const unsigned Flags = MI.getFlags();
3608 MachineFunction &MF = B.getMF();
3609 MachineRegisterInfo &MRI = *B.getMRI();
3610 LLT Ty = MRI.getType(Dst);
3611 const LLT F16 = LLT::scalar(16);
3612 const LLT F32 = LLT::scalar(32);
3613 const bool IsExp10 = MI.getOpcode() == TargetOpcode::G_FEXP10;
3614
3615 if (Ty == F16) {
3616 // v_exp_f16 (fmul x, log2e)
3617 if (allowApproxFunc(MF, Flags)) {
3618 // TODO: Does this really require fast?
3619 legalizeFExpUnsafe(B, Dst, X, Flags);
3620 MI.eraseFromParent();
3621 return true;
3622 }
3623
3624 // exp(f16 x) ->
3625 // fptrunc (v_exp_f32 (fmul (fpext x), log2e))
3626
3627 // Nothing in half is a denormal when promoted to f32.
3628 auto Ext = B.buildFPExt(F32, X, Flags);
3629 Register Lowered = MRI.createGenericVirtualRegister(F32);
3630 legalizeFExpUnsafe(B, Lowered, Ext.getReg(0), Flags);
3631 B.buildFPTrunc(Dst, Lowered, Flags);
3632 MI.eraseFromParent();
3633 return true;
3634 }
3635
3636 assert(Ty == F32);
3637
3638 // TODO: Interpret allowApproxFunc as ignoring DAZ. This is currently copying
3639 // library behavior. Also, is known-not-daz source sufficient?
3640 if (allowApproxFunc(MF, Flags)) {
3641 legalizeFExpUnsafe(B, Dst, X, Flags);
3642 MI.eraseFromParent();
3643 return true;
3644 }
3645
3646 // Algorithm:
3647 //
3648 // e^x = 2^(x/ln(2)) = 2^(x*(64/ln(2))/64)
3649 //
3650 // x*(64/ln(2)) = n + f, |f| <= 0.5, n is integer
3651 // n = 64*m + j, 0 <= j < 64
3652 //
3653 // e^x = 2^((64*m + j + f)/64)
3654 // = (2^m) * (2^(j/64)) * 2^(f/64)
3655 // = (2^m) * (2^(j/64)) * e^(f*(ln(2)/64))
3656 //
3657 // f = x*(64/ln(2)) - n
3658 // r = f*(ln(2)/64) = x - n*(ln(2)/64)
3659 //
3660 // e^x = (2^m) * (2^(j/64)) * e^r
3661 //
3662 // (2^(j/64)) is precomputed
3663 //
3664 // e^r = 1 + r + (r^2)/2! + (r^3)/3! + (r^4)/4! + (r^5)/5!
3665 // e^r = 1 + q
3666 //
3667 // q = r + (r^2)/2! + (r^3)/3! + (r^4)/4! + (r^5)/5!
3668 //
3669 // e^x = (2^m) * ( (2^(j/64)) + q*(2^(j/64)) )
3670 const unsigned FlagsNoContract = Flags & ~MachineInstr::FmContract;
3671 Register PH, PL;
3672
3673 if (ST.hasFastFMAF32()) {
3674 const float c_exp = numbers::log2ef;
3675 const float cc_exp = 0x1.4ae0bep-26f; // c+cc are 49 bits
3676 const float c_exp10 = 0x1.a934f0p+1f;
3677 const float cc_exp10 = 0x1.2f346ep-24f;
3678
3679 auto C = B.buildFConstant(Ty, IsExp10 ? c_exp10 : c_exp);
3680 PH = B.buildFMul(Ty, X, C, Flags).getReg(0);
3681 auto NegPH = B.buildFNeg(Ty, PH, Flags);
3682 auto FMA0 = B.buildFMA(Ty, X, C, NegPH, Flags);
3683
3684 auto CC = B.buildFConstant(Ty, IsExp10 ? cc_exp10 : cc_exp);
3685 PL = B.buildFMA(Ty, X, CC, FMA0, Flags).getReg(0);
3686 } else {
3687 const float ch_exp = 0x1.714000p+0f;
3688 const float cl_exp = 0x1.47652ap-12f; // ch + cl are 36 bits
3689
3690 const float ch_exp10 = 0x1.a92000p+1f;
3691 const float cl_exp10 = 0x1.4f0978p-11f;
3692
3693 auto MaskConst = B.buildConstant(Ty, 0xfffff000);
3694 auto XH = B.buildAnd(Ty, X, MaskConst);
3695 auto XL = B.buildFSub(Ty, X, XH, Flags);
3696
3697 auto CH = B.buildFConstant(Ty, IsExp10 ? ch_exp10 : ch_exp);
3698 PH = B.buildFMul(Ty, XH, CH, Flags).getReg(0);
3699
3700 auto CL = B.buildFConstant(Ty, IsExp10 ? cl_exp10 : cl_exp);
3701 auto XLCL = B.buildFMul(Ty, XL, CL, Flags);
3702
3703 Register Mad0 =
3704 getMad(B, Ty, XL.getReg(0), CH.getReg(0), XLCL.getReg(0), Flags);
3705 PL = getMad(B, Ty, XH.getReg(0), CL.getReg(0), Mad0, Flags);
3706 }
3707
3708 auto E = B.buildIntrinsicRoundeven(Ty, PH, Flags);
3709
3710 // It is unsafe to contract this fsub into the PH multiply.
3711 auto PHSubE = B.buildFSub(Ty, PH, E, FlagsNoContract);
3712 auto A = B.buildFAdd(Ty, PHSubE, PL, Flags);
3713 auto IntE = B.buildFPTOSI(LLT::scalar(32), E);
3714
3715 auto Exp2 = B.buildIntrinsic(Intrinsic::amdgcn_exp2, {Ty})
3716 .addUse(A.getReg(0))
3717 .setMIFlags(Flags);
3718 auto R = B.buildFLdexp(Ty, Exp2, IntE, Flags);
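// At this point E = roundeven(x*K) and A = (x*K - E) kept in extra precision
// via PH/PL, so R = ldexp(exp2(A), E) ~= 2^(x*K), with K = log2(e) for exp
// and log2(10) for exp10.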
3719
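// Below roughly ln(2^-149) (~ -103.28) for exp, or log10(2^-149) (~ -44.85)
// for exp10, not even a denormal result is representable, so clamp to +0.0.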
3720 auto UnderflowCheckConst =
3721 B.buildFConstant(Ty, IsExp10 ? -0x1.66d3e8p+5f : -0x1.9d1da0p+6f);
3722 auto Zero = B.buildFConstant(Ty, 0.0);
3723 auto Underflow =
3724 B.buildFCmp(CmpInst::FCMP_OLT, LLT::scalar(1), X, UnderflowCheckConst);
3725
3726 R = B.buildSelect(Ty, Underflow, Zero, R);
3727
3728 const auto &Options = MF.getTarget().Options;
3729
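// Above roughly ln(2^128) (~ 88.72) for exp, or log10(2^128) (~ 38.53) for
// exp10, the f32 result overflows, so return +inf unless infs are assumed
// absent.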
3730 if (!(Flags & MachineInstr::FmNoInfs) && !Options.NoInfsFPMath) {
3731 auto OverflowCheckConst =
3732 B.buildFConstant(Ty, IsExp10 ? 0x1.344136p+5f : 0x1.62e430p+6f);
3733
3734 auto Overflow =
3735 B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), X, OverflowCheckConst);
3736 auto Inf = B.buildFConstant(Ty, APFloat::getInf(APFloat::IEEEsingle()));
3737 R = B.buildSelect(Ty, Overflow, Inf, R, Flags);
3738 }
3739
3740 B.buildCopy(Dst, R);
3741 MI.eraseFromParent();
3742 return true;
3743}
3744
3745bool AMDGPULegalizerInfo::legalizeFPow(MachineInstr &MI,
3746 MachineIRBuilder &B) const {
3747 Register Dst = MI.getOperand(0).getReg();
3748 Register Src0 = MI.getOperand(1).getReg();
3749 Register Src1 = MI.getOperand(2).getReg();
3750 unsigned Flags = MI.getFlags();
3751 LLT Ty = B.getMRI()->getType(Dst);
3752 const LLT F16 = LLT::float16();
3753 const LLT F32 = LLT::float32();
3754
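// pow(x, y) is expanded as exp2(y * log2(x)). The legacy multiply is used so
// that a zero exponent still yields 1.0 when log2(x) is inf or nan, since
// fmul_legacy treats 0 * anything as 0.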
3755 if (Ty == F32) {
3756 auto Log = B.buildFLog2(F32, Src0, Flags);
3757 auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {F32})
3758 .addUse(Log.getReg(0))
3759 .addUse(Src1)
3760 .setMIFlags(Flags);
3761 B.buildFExp2(Dst, Mul, Flags);
3762 } else if (Ty == F16) {
3763 // There's no f16 fmul_legacy, so we need to convert for it.
3764 auto Log = B.buildFLog2(F16, Src0, Flags);
3765 auto Ext0 = B.buildFPExt(F32, Log, Flags);
3766 auto Ext1 = B.buildFPExt(F32, Src1, Flags);
3767 auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {F32})
3768 .addUse(Ext0.getReg(0))
3769 .addUse(Ext1.getReg(0))
3770 .setMIFlags(Flags);
3771 B.buildFExp2(Dst, B.buildFPTrunc(F16, Mul), Flags);
3772 } else
3773 return false;
3774
3775 MI.eraseFromParent();
3776 return true;
3777}
3778
3779// Find a source register, ignoring any possible source modifiers.
3780static Register stripAnySourceMods(Register OrigSrc, MachineRegisterInfo &MRI) {
3781 Register ModSrc = OrigSrc;
3782 if (MachineInstr *SrcFNeg = getOpcodeDef(AMDGPU::G_FNEG, ModSrc, MRI)) {
3783 ModSrc = SrcFNeg->getOperand(1).getReg();
3784 if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
3785 ModSrc = SrcFAbs->getOperand(1).getReg();
3786 } else if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
3787 ModSrc = SrcFAbs->getOperand(1).getReg();
3788 return ModSrc;
3789}
3790
3791bool AMDGPULegalizerInfo::legalizeFFloor(MachineInstr &MI,
3792 MachineRegisterInfo &MRI,
3793 MachineIRBuilder &B) const {
3794
3795 const LLT S1 = LLT::scalar(1);
3796 const LLT F64 = LLT::float64();
3797 Register Dst = MI.getOperand(0).getReg();
3798 Register OrigSrc = MI.getOperand(1).getReg();
3799 unsigned Flags = MI.getFlags();
3800 assert(ST.hasFractBug() && MRI.getType(Dst) == F64 &&
3801 "this should not have been custom lowered");
3802
3803 // V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x))
3804 // is used instead. However, SI doesn't have V_FLOOR_F64, so the most
3805 // efficient way to implement it is using V_FRACT_F64. The workaround for the
3806 // V_FRACT bug is:
3807 // fract(x) = isnan(x) ? x : min(V_FRACT(x), 0.99999999999999999)
3808 //
3809 // Convert floor(x) to (x - fract(x))
3810
3811 auto Fract = B.buildIntrinsic(Intrinsic::amdgcn_fract, {F64})
3812 .addUse(OrigSrc)
3813 .setMIFlags(Flags);
3814
3815 // Give source modifier matching some assistance before obscuring a foldable
3816 // pattern.
3817
3818 // TODO: We can avoid the neg on the fract? The input sign to fract
3819 // shouldn't matter?
3820 Register ModSrc = stripAnySourceMods(OrigSrc, MRI);
3821
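// 0x3fefffffffffffff is the largest f64 strictly below 1.0, i.e. the
// 0.99999999999999999 clamp value mentioned in the workaround above.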
3822 auto Const =
3823 B.buildFConstant(F64, llvm::bit_cast<double>(0x3fefffffffffffff));
3824
3825 Register Min = MRI.createGenericVirtualRegister(F64);
3826
3827 // We don't need to concern ourselves with the snan handling difference, so
3828 // use the one which will directly select.
3829 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
3830 if (MFI->getMode().IEEE)
3831 B.buildFMinNumIEEE(Min, Fract, Const, Flags);
3832 else
3833 B.buildFMinNum(Min, Fract, Const, Flags);
3834
3835 Register CorrectedFract = Min;
3836 if (!MI.getFlag(MachineInstr::FmNoNans)) {
3837 auto IsNan = B.buildFCmp(CmpInst::FCMP_ORD, S1, ModSrc, ModSrc, Flags);
3838 CorrectedFract = B.buildSelect(F64, IsNan, ModSrc, Min, Flags).getReg(0);
3839 }
3840
3841 auto NegFract = B.buildFNeg(F64, CorrectedFract, Flags);
3842 B.buildFAdd(Dst, OrigSrc, NegFract, Flags);
3843
3844 MI.eraseFromParent();
3845 return true;
3846}
3847
3848// Turn an illegal packed v2s16 build vector into bit operations.
3849// TODO: This should probably be a bitcast action in LegalizerHelper.
3850bool AMDGPULegalizerInfo::legalizeBuildVector(
3851 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
3852 Register Dst = MI.getOperand(0).getReg();
3853 const LLT S32 = LLT::scalar(32);
3854 const LLT S16 = LLT::scalar(16);
3855 assert(MRI.getType(Dst) == LLT::fixed_vector(2, 16));
3856
3857 Register Src0 = MI.getOperand(1).getReg();
3858 Register Src1 = MI.getOperand(2).getReg();
3859
3860 if (MI.getOpcode() == AMDGPU::G_BUILD_VECTOR_TRUNC) {
3861 assert(MRI.getType(Src0) == S32);
3862 Src0 = B.buildTrunc(S16, MI.getOperand(1).getReg()).getReg(0);
3863 Src1 = B.buildTrunc(S16, MI.getOperand(2).getReg()).getReg(0);
3864 }
3865
3866 auto Merge = B.buildMergeLikeInstr(S32, {Src0, Src1});
3867 B.buildBitcast(Dst, Merge);
3868
3869 MI.eraseFromParent();
3870 return true;
3871}
3872
3873// Build a big integer multiply or multiply-add using MAD_64_32 instructions.
3874//
3875// Source and accumulation registers must all be 32-bits.
3876//
3877// TODO: When the multiply is uniform, we should produce a code sequence
3878// that is better suited to instruction selection on the SALU. Instead of
3879// the outer loop going over parts of the result, the outer loop should go
3880// over parts of one of the factors. This should result in instruction
3881// selection that makes full use of S_ADDC_U32 instructions.
3882void AMDGPULegalizerInfo::buildMultiply(LegalizerHelper &Helper,
3883 MutableArrayRef<Register> Accum,
3884 ArrayRef<Register> Src0,
3885 ArrayRef<Register> Src1,
3886 bool UsePartialMad64_32,
3887 bool SeparateOddAlignedProducts) const {
3888 // Use (possibly empty) vectors of S1 registers to represent the set of
3889 // carries from one pair of positions to the next.
3890 using Carry = SmallVector<Register, 2>;
3891
3892 MachineIRBuilder &B = Helper.MIRBuilder;
3893 GISelKnownBits &KB = *Helper.getKnownBits();
3894
3895 const LLT S1 = LLT::scalar(1);
3896 const LLT S32 = LLT::scalar(32);
3897 const LLT S64 = LLT::scalar(64);
3898
3899 Register Zero32;
3900 Register Zero64;
3901
3902 auto getZero32 = [&]() -> Register {
3903 if (!Zero32)
3904 Zero32 = B.buildConstant(S32, 0).getReg(0);
3905 return Zero32;
3906 };
3907 auto getZero64 = [&]() -> Register {
3908 if (!Zero64)
3909 Zero64 = B.buildConstant(S64, 0).getReg(0);
3910 return Zero64;
3911 };
3912
3913 SmallVector<bool, 2> Src0KnownZeros, Src1KnownZeros;
3914 for (unsigned i = 0; i < Src0.size(); ++i) {
3915 Src0KnownZeros.push_back(KB.getKnownBits(Src0[i]).isZero());
3916 Src1KnownZeros.push_back(KB.getKnownBits(Src1[i]).isZero());
3917 }
3918
3919 // Merge the given carries into the 32-bit LocalAccum, which is modified
3920 // in-place.
3921 //
3922 // Returns the carry-out, which is a single S1 register or null.
3923 auto mergeCarry =
3924 [&](Register &LocalAccum, const Carry &CarryIn) -> Register {
3925 if (CarryIn.empty())
3926 return Register();
3927
3928 bool HaveCarryOut = true;
3929 Register CarryAccum;
3930 if (CarryIn.size() == 1) {
3931 if (!LocalAccum) {
3932 LocalAccum = B.buildZExt(S32, CarryIn[0]).getReg(0);
3933 return Register();
3934 }
3935
3936 CarryAccum = getZero32();
3937 } else {
3938 CarryAccum = B.buildZExt(S32, CarryIn[0]).getReg(0);
3939 for (unsigned i = 1; i + 1 < CarryIn.size(); ++i) {
3940 CarryAccum =
3941 B.buildUAdde(S32, S1, CarryAccum, getZero32(), CarryIn[i])
3942 .getReg(0);
3943 }
3944
3945 if (!LocalAccum) {
3946 LocalAccum = getZero32();
3947 HaveCarryOut = false;
3948 }
3949 }
3950
3951 auto Add =
3952 B.buildUAdde(S32, S1, CarryAccum, LocalAccum, CarryIn.back());
3953 LocalAccum = Add.getReg(0);
3954 return HaveCarryOut ? Add.getReg(1) : Register();
3955 };
3956
3957 // Build a multiply-add chain to compute
3958 //
3959 // LocalAccum + (partial products at DstIndex)
3960 // + (opportunistic subset of CarryIn)
3961 //
3962 // LocalAccum is an array of one or two 32-bit registers that are updated
3963 // in-place. The incoming registers may be null.
3964 //
3965 // In some edge cases, carry-ins can be consumed "for free". In that case,
3966 // the consumed carry bits are removed from CarryIn in-place.
3967 auto buildMadChain =
3968 [&](MutableArrayRef<Register> LocalAccum, unsigned DstIndex, Carry &CarryIn)
3969 -> Carry {
3970 assert((DstIndex + 1 < Accum.size() && LocalAccum.size() == 2) ||
3971 (DstIndex + 1 >= Accum.size() && LocalAccum.size() == 1));
3972
3973 Carry CarryOut;
3974 unsigned j0 = 0;
3975
3976 // Use plain 32-bit multiplication for the most significant part of the
3977 // result by default.
3978 if (LocalAccum.size() == 1 &&
3979 (!UsePartialMad64_32 || !CarryIn.empty())) {
3980 do {
3981 // Skip multiplication if one of the operands is 0
3982 unsigned j1 = DstIndex - j0;
3983 if (Src0KnownZeros[j0] || Src1KnownZeros[j1]) {
3984 ++j0;
3985 continue;
3986 }
3987 auto Mul = B.buildMul(S32, Src0[j0], Src1[j1]);
3988 if (!LocalAccum[0] || KB.getKnownBits(LocalAccum[0]).isZero()) {
3989 LocalAccum[0] = Mul.getReg(0);
3990 } else {
3991 if (CarryIn.empty()) {
3992 LocalAccum[0] = B.buildAdd(S32, LocalAccum[0], Mul).getReg(0);
3993 } else {
3994 LocalAccum[0] =
3995 B.buildUAdde(S32, S1, LocalAccum[0], Mul, CarryIn.back())
3996 .getReg(0);
3997 CarryIn.pop_back();
3998 }
3999 }
4000 ++j0;
4001 } while (j0 <= DstIndex && (!UsePartialMad64_32 || !CarryIn.empty()));
4002 }
4003
4004 // Build full 64-bit multiplies.
4005 if (j0 <= DstIndex) {
4006 bool HaveSmallAccum = false;
4007 Register Tmp;
4008
4009 if (LocalAccum[0]) {
4010 if (LocalAccum.size() == 1) {
4011 Tmp = B.buildAnyExt(S64, LocalAccum[0]).getReg(0);
4012 HaveSmallAccum = true;
4013 } else if (LocalAccum[1]) {
4014 Tmp = B.buildMergeLikeInstr(S64, LocalAccum).getReg(0);
4015 HaveSmallAccum = false;
4016 } else {
4017 Tmp = B.buildZExt(S64, LocalAccum[0]).getReg(0);
4018 HaveSmallAccum = true;
4019 }
4020 } else {
4021 assert(LocalAccum.size() == 1 || !LocalAccum[1]);
4022 Tmp = getZero64();
4023 HaveSmallAccum = true;
4024 }
4025
4026 do {
4027 unsigned j1 = DstIndex - j0;
4028 if (Src0KnownZeros[j0] || Src1KnownZeros[j1]) {
4029 ++j0;
4030 continue;
4031 }
4032 auto Mad = B.buildInstr(AMDGPU::G_AMDGPU_MAD_U64_U32, {S64, S1},
4033 {Src0[j0], Src1[j1], Tmp});
4034 Tmp = Mad.getReg(0);
4035 if (!HaveSmallAccum)
4036 CarryOut.push_back(Mad.getReg(1));
4037 HaveSmallAccum = false;
4038
4039 ++j0;
4040 } while (j0 <= DstIndex);
4041
4042 auto Unmerge = B.buildUnmerge(S32, Tmp);
4043 LocalAccum[0] = Unmerge.getReg(0);
4044 if (LocalAccum.size() > 1)
4045 LocalAccum[1] = Unmerge.getReg(1);
4046 }
4047
4048 return CarryOut;
4049 };
4050
4051 // Outer multiply loop, iterating over destination parts from least
4052 // significant to most significant parts.
4053 //
4054 // The columns of the following diagram correspond to the destination parts
4055 // affected by one iteration of the outer loop (ignoring boundary
4056 // conditions).
4057 //
4058 // Dest index relative to 2 * i: 1 0 -1
4059 // ------
4060 // Carries from previous iteration: e o
4061 // Even-aligned partial product sum: E E .
4062 // Odd-aligned partial product sum: O O
4063 //
4064 // 'o' is OddCarry, 'e' is EvenCarry.
4065 // EE and OO are computed from partial products via buildMadChain and use
4066 // accumulation where possible and appropriate.
4067 //
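// For example, a 64 x 64 -> 128 bit multiply (Accum.size() == 4) runs three
// iterations: i=0 builds the even chain for parts 0-1, i=1 the odd chain for
// parts 1-2 plus the even chain for parts 2-3, and i=2 the final odd chain
// for part 3.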
4068 Register SeparateOddCarry;
4069 Carry EvenCarry;
4070 Carry OddCarry;
4071
4072 for (unsigned i = 0; i <= Accum.size() / 2; ++i) {
4073 Carry OddCarryIn = std::move(OddCarry);
4074 Carry EvenCarryIn = std::move(EvenCarry);
4075 OddCarry.clear();
4076 EvenCarry.clear();
4077
4078 // Partial products at offset 2 * i.
4079 if (2 * i < Accum.size()) {
4080 auto LocalAccum = Accum.drop_front(2 * i).take_front(2);
4081 EvenCarry = buildMadChain(LocalAccum, 2 * i, EvenCarryIn);
4082 }
4083
4084 // Partial products at offset 2 * i - 1.
4085 if (i > 0) {
4086 if (!SeparateOddAlignedProducts) {
4087 auto LocalAccum = Accum.drop_front(2 * i - 1).take_front(2);
4088 OddCarry = buildMadChain(LocalAccum, 2 * i - 1, OddCarryIn);
4089 } else {
4090 bool IsHighest = 2 * i >= Accum.size();
4091 Register SeparateOddOut[2];
4092 auto LocalAccum = MutableArrayRef(SeparateOddOut)
4093 .take_front(IsHighest ? 1 : 2);
4094 OddCarry = buildMadChain(LocalAccum, 2 * i - 1, OddCarryIn);
4095
4097
4098 if (i == 1) {
4099 if (!IsHighest)
4100 Lo = B.buildUAddo(S32, S1, Accum[2 * i - 1], SeparateOddOut[0]);
4101 else
4102 Lo = B.buildAdd(S32, Accum[2 * i - 1], SeparateOddOut[0]);
4103 } else {
4104 Lo = B.buildUAdde(S32, S1, Accum[2 * i - 1], SeparateOddOut[0],
4105 SeparateOddCarry);
4106 }
4107 Accum[2 * i - 1] = Lo->getOperand(0).getReg();
4108
4109 if (!IsHighest) {
4110 auto Hi = B.buildUAdde(S32, S1, Accum[2 * i], SeparateOddOut[1],
4111 Lo->getOperand(1).getReg());
4112 Accum[2 * i] = Hi.getReg(0);
4113 SeparateOddCarry = Hi.getReg(1);
4114 }
4115 }
4116 }
4117
4118 // Add in the carries from the previous iteration
4119 if (i > 0) {
4120 if (Register CarryOut = mergeCarry(Accum[2 * i - 1], OddCarryIn))
4121 EvenCarryIn.push_back(CarryOut);
4122
4123 if (2 * i < Accum.size()) {
4124 if (Register CarryOut = mergeCarry(Accum[2 * i], EvenCarryIn))
4125 OddCarry.push_back(CarryOut);
4126 }
4127 }
4128 }
4129}
4130
4131// Custom narrowing of wide multiplies using wide multiply-add instructions.
4132//
4133// TODO: If the multiply is followed by an addition, we should attempt to
4134// integrate it to make better use of V_MAD_U64_U32's multiply-add capabilities.
4135bool AMDGPULegalizerInfo::legalizeMul(LegalizerHelper &Helper,
4136 MachineInstr &MI) const {
4137 assert(ST.hasMad64_32());
4138 assert(MI.getOpcode() == TargetOpcode::G_MUL);
4139
4140 MachineIRBuilder &B = Helper.MIRBuilder;
4141 MachineRegisterInfo &MRI = *B.getMRI();
4142
4143 Register DstReg = MI.getOperand(0).getReg();
4144 Register Src0 = MI.getOperand(1).getReg();
4145 Register Src1 = MI.getOperand(2).getReg();
4146
4147 LLT Ty = MRI.getType(DstReg);
4148 assert(Ty.isScalar());
4149
4150 unsigned Size = Ty.getSizeInBits();
4151 unsigned NumParts = Size / 32;
4152 assert((Size % 32) == 0);
4153 assert(NumParts >= 2);
4154
4155 // Whether to use MAD_64_32 for partial products whose high half is
4156 // discarded. This avoids some ADD instructions but risks false dependency
4157 // stalls on some subtargets in some cases.
4158 const bool UsePartialMad64_32 = ST.getGeneration() < AMDGPUSubtarget::GFX10;
4159
4160 // Whether to compute odd-aligned partial products separately. This is
4161 // advisable on subtargets where the accumulator of MAD_64_32 must be placed
4162 // in an even-aligned VGPR.
4163 const bool SeparateOddAlignedProducts = ST.hasFullRate64Ops();
4164
4165 LLT S32 = LLT::scalar(32);
4166 SmallVector<Register, 2> Src0Parts, Src1Parts;
4167 for (unsigned i = 0; i < NumParts; ++i) {
4168 Src0Parts.push_back(MRI.createGenericVirtualRegister(S32));
4169 Src1Parts.push_back(MRI.createGenericVirtualRegister(S32));
4170 }
4171 B.buildUnmerge(Src0Parts, Src0);
4172 B.buildUnmerge(Src1Parts, Src1);
4173
4174 SmallVector<Register, 2> AccumRegs(NumParts);
4175 buildMultiply(Helper, AccumRegs, Src0Parts, Src1Parts, UsePartialMad64_32,
4176 SeparateOddAlignedProducts);
4177
4178 B.buildMergeLikeInstr(DstReg, AccumRegs);
4179 MI.eraseFromParent();
4180 return true;
4181}
4182
4183// Legalize ctlz/cttz to ffbh/ffbl instead of the default legalization to
4184// ctlz/cttz_zero_undef. This allows us to fix up the result for the zero input
4185// case with a single min instruction instead of a compare+select.
4186bool AMDGPULegalizerInfo::legalizeCTLZ_CTTZ(MachineInstr &MI,
4187 MachineRegisterInfo &MRI,
4188 MachineIRBuilder &B) const {
4189 Register Dst = MI.getOperand(0).getReg();
4190 Register Src = MI.getOperand(1).getReg();
4191 LLT DstTy = MRI.getType(Dst);
4192 LLT SrcTy = MRI.getType(Src);
4193
4194 unsigned NewOpc = MI.getOpcode() == AMDGPU::G_CTLZ
4195 ? AMDGPU::G_AMDGPU_FFBH_U32
4196 : AMDGPU::G_AMDGPU_FFBL_B32;
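// FFBH/FFBL return -1 (all bits set) for a zero input, so a single unsigned
// min against the source bit width yields the defined ctlz/cttz result.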
4197 auto Tmp = B.buildInstr(NewOpc, {DstTy}, {Src});
4198 B.buildUMin(Dst, Tmp, B.buildConstant(DstTy, SrcTy.getSizeInBits()));
4199
4200 MI.eraseFromParent();
4201 return true;
4202}
4203
4204bool AMDGPULegalizerInfo::legalizeCTLZ_ZERO_UNDEF(MachineInstr &MI,
4205 MachineRegisterInfo &MRI,
4206 MachineIRBuilder &B) const {
4207 Register Dst = MI.getOperand(0).getReg();
4208 Register Src = MI.getOperand(1).getReg();
4209 LLT SrcTy = MRI.getType(Src);
4210 TypeSize NumBits = SrcTy.getSizeInBits();
4211
4212 assert(NumBits < 32u);
4213
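// Shift the narrow source into the top bits of a 32-bit value so the 32-bit
// FFBH counts only the original width; the zero-input case is undefined for
// this opcode, so no clamp is needed.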
4214 auto ShiftAmt = B.buildConstant(S32, 32u - NumBits);
4215 auto Extend = B.buildAnyExt(S32, {Src}).getReg(0u);
4216 auto Shift = B.buildShl(S32, Extend, ShiftAmt);
4217 auto Ctlz = B.buildInstr(AMDGPU::G_AMDGPU_FFBH_U32, {S32}, {Shift});
4218 B.buildTrunc(Dst, Ctlz);
4219 MI.eraseFromParent();
4220 return true;
4221}
4222
4223// Check that this is a G_XOR x, -1
4224static bool isNot(const MachineRegisterInfo &MRI, const MachineInstr &MI) {
4225 if (MI.getOpcode() != TargetOpcode::G_XOR)
4226 return false;
4227 auto ConstVal = getIConstantVRegSExtVal(MI.getOperand(2).getReg(), MRI);
4228 return ConstVal && *ConstVal == -1;
4229}
4230
4231// Return the use branch instruction, otherwise null if the usage is invalid.
4232static MachineInstr *
4233verifyCFIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI, MachineInstr *&Br,
4234 MachineBasicBlock *&UncondBrTarget, bool &Negated) {
4235 Register CondDef = MI.getOperand(0).getReg();
4236 if (!MRI.hasOneNonDBGUse(CondDef))
4237 return nullptr;
4238
4239 MachineBasicBlock *Parent = MI.getParent();
4240 MachineInstr *UseMI = &*MRI.use_instr_nodbg_begin(CondDef);
4241
4242 if (isNot(MRI, *UseMI)) {
4243 Register NegatedCond = UseMI->getOperand(0).getReg();
4244 if (!MRI.hasOneNonDBGUse(NegatedCond))
4245 return nullptr;
4246
4247 // We're deleting the def of this value, so we need to remove it.
4248 eraseInstr(*UseMI, MRI);
4249
4250 UseMI = &*MRI.use_instr_nodbg_begin(NegatedCond);
4251 Negated = true;
4252 }
4253
4254 if (UseMI->getParent() != Parent || UseMI->getOpcode() != AMDGPU::G_BRCOND)
4255 return nullptr;
4256
4257 // Make sure the cond br is followed by a G_BR, or is the last instruction.
4258 MachineBasicBlock::iterator Next = std::next(UseMI->getIterator());
4259 if (Next == Parent->end()) {
4260 MachineFunction::iterator NextMBB = std::next(Parent->getIterator());
4261 if (NextMBB == Parent->getParent()->end()) // Illegal intrinsic use.
4262 return nullptr;
4263 UncondBrTarget = &*NextMBB;
4264 } else {
4265 if (Next->getOpcode() != AMDGPU::G_BR)
4266 return nullptr;
4267 Br = &*Next;
4268 UncondBrTarget = Br->getOperand(0).getMBB();
4269 }
4270
4271 return UseMI;
4272}
4273
4274bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B,
4275 const ArgDescriptor *Arg,
4276 const TargetRegisterClass *ArgRC,
4277 LLT ArgTy) const {
4278 MCRegister SrcReg = Arg->getRegister();
4279 assert(Register::isPhysicalRegister(SrcReg) && "Physical register expected");
4280 assert(DstReg.isVirtual() && "Virtual register expected");
4281
4282 Register LiveIn = getFunctionLiveInPhysReg(B.getMF(), B.getTII(), SrcReg,
4283 *ArgRC, B.getDebugLoc(), ArgTy);
4284 if (Arg->isMasked()) {
4285 // TODO: Should we try to emit this once in the entry block?
4286 const LLT S32 = LLT::scalar(32);
4287 const unsigned Mask = Arg->getMask();
4288 const unsigned Shift = llvm::countr_zero<unsigned>(Mask);
4289
4290 Register AndMaskSrc = LiveIn;
4291
4292 // TODO: Avoid clearing the high bits if we know workitem id y/z are always
4293 // 0.
4294 if (Shift != 0) {
4295 auto ShiftAmt = B.buildConstant(S32, Shift);
4296 AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0);
4297 }
4298
4299 B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift));
4300 } else {
4301 B.buildCopy(DstReg, LiveIn);
4302 }
4303
4304 return true;
4305}
4306
4307bool AMDGPULegalizerInfo::loadInputValue(
4308 Register DstReg, MachineIRBuilder &B,
4309 AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
4310 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
4311 const ArgDescriptor *Arg = nullptr;
4312 const TargetRegisterClass *ArgRC;
4313 LLT ArgTy;
4314
4315 CallingConv::ID CC = B.getMF().getFunction().getCallingConv();
4316 const ArgDescriptor WorkGroupIDX =
4317 ArgDescriptor::createRegister(AMDGPU::TTMP9);
4318 // If GridZ is not programmed in an entry function then the hardware will set
4319 // it to all zeros, so there is no need to mask the GridY value in the low
4320 // order bits.
4321 const ArgDescriptor WorkGroupIDY = ArgDescriptor::createRegister(
4322 AMDGPU::TTMP7,
4323 AMDGPU::isEntryFunctionCC(CC) && !MFI->hasWorkGroupIDZ() ? ~0u : 0xFFFFu);
4324 const ArgDescriptor WorkGroupIDZ =
4325 ArgDescriptor::createRegister(AMDGPU::TTMP7, 0xFFFF0000u);
4326 if (ST.hasArchitectedSGPRs() &&
4327 (AMDGPU::isEntryFunctionCC(CC) || !AMDGPU::isGraphics(CC))) {
4328 switch (ArgType) {
4329 case AMDGPUFunctionArgInfo::WORKGROUP_ID_X:
4330 Arg = &WorkGroupIDX;
4331 ArgRC = &AMDGPU::SReg_32RegClass;
4332 ArgTy = LLT::scalar(32);
4333 break;
4334 case AMDGPUFunctionArgInfo::WORKGROUP_ID_Y:
4335 Arg = &WorkGroupIDY;
4336 ArgRC = &AMDGPU::SReg_32RegClass;
4337 ArgTy = LLT::scalar(32);
4338 break;
4339 case AMDGPUFunctionArgInfo::WORKGROUP_ID_Z:
4340 Arg = &WorkGroupIDZ;
4341 ArgRC = &AMDGPU::SReg_32RegClass;
4342 ArgTy = LLT::scalar(32);
4343 break;
4344 default:
4345 break;
4346 }
4347 }
4348
4349 if (!Arg)
4350 std::tie(Arg, ArgRC, ArgTy) = MFI->getPreloadedValue(ArgType);
4351
4352 if (!Arg) {
4353 if (ArgType == AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR) {
4354 // The intrinsic may appear when we have a 0 sized kernarg segment, in which
4355 // case the pointer argument may be missing and we use null.
4356 B.buildConstant(DstReg, 0);
4357 return true;
4358 }
4359
4360 // It's undefined behavior if a function marked with the amdgpu-no-*
4361 // attributes uses the corresponding intrinsic.
4362 B.buildUndef(DstReg);
4363 return true;
4364 }
4365
4366 if (!Arg->isRegister() || !Arg->getRegister().isValid())
4367 return false; // TODO: Handle these
4368 return loadInputValue(DstReg, B, Arg, ArgRC, ArgTy);
4369}
4370
4371bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin(
4372 MachineInstr &MI, MachineIRBuilder &B,
4373 AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
4374 if (!loadInputValue(MI.getOperand(0).getReg(), B, ArgType))
4375 return false;
4376
4377 MI.eraseFromParent();
4378 return true;
4379}
4380
4381static bool replaceWithConstant(MachineIRBuilder &B, MachineInstr &MI,
4382 int64_t C) {
4383 B.buildConstant(MI.getOperand(0).getReg(), C);
4384 MI.eraseFromParent();
4385 return true;
4386}
4387
4388bool AMDGPULegalizerInfo::legalizeWorkitemIDIntrinsic(
4389 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B,
4390 unsigned Dim, AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
4391 unsigned MaxID = ST.getMaxWorkitemID(B.getMF().getFunction(), Dim);
4392 if (MaxID == 0)
4393 return replaceWithConstant(B, MI, 0);
4394
4395 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
4396 const ArgDescriptor *Arg;
4397 const TargetRegisterClass *ArgRC;
4398 LLT ArgTy;
4399 std::tie(Arg, ArgRC, ArgTy) = MFI->getPreloadedValue(ArgType);
4400
4401 Register DstReg = MI.getOperand(0).getReg();
4402 if (!Arg) {
4403 // It's undefined behavior if a function marked with the amdgpu-no-*
4404 // attributes uses the corresponding intrinsic.
4405 B.buildUndef(DstReg);
4406 MI.eraseFromParent();
4407 return true;
4408 }
4409
4410 if (Arg->isMasked()) {
4411 // Don't bother inserting AssertZext for packed IDs since we're emitting the
4412 // masking operations anyway.
4413 //
4414 // TODO: We could assert the top bit is 0 for the source copy.
4415 if (!loadInputValue(DstReg, B, ArgType))
4416 return false;
4417 } else {
4418 Register TmpReg = MRI.createGenericVirtualRegister(LLT::scalar(32));
4419 if (!loadInputValue(TmpReg, B, ArgType))
4420 return false;
4421 B.buildAssertZExt(DstReg, TmpReg, llvm::bit_width(MaxID));
4422 }
4423
4424 MI.eraseFromParent();
4425 return true;
4426}
4427
4428Register AMDGPULegalizerInfo::getKernargParameterPtr(MachineIRBuilder &B,
4429 int64_t Offset) const {
4430 LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
4431 Register KernArgReg = B.getMRI()->createGenericVirtualRegister(PtrTy);
4432
4433 // TODO: If we passed in the base kernel offset we could have a better
4434 // alignment than 4, but we don't really need it.
4435 if (!loadInputValue(KernArgReg, B,
4436 AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR))
4437 llvm_unreachable("failed to find kernarg segment ptr");
4438
4439 auto COffset = B.buildConstant(LLT::scalar(64), Offset);
4440 // TODO: Should get nuw
4441 return B.buildPtrAdd(PtrTy, KernArgReg, COffset).getReg(0);
4442}
4443
4444/// Legalize a value that's loaded from kernel arguments. This is only used by
4445/// legacy intrinsics.
4446bool AMDGPULegalizerInfo::legalizeKernargMemParameter(MachineInstr &MI,
4447 MachineIRBuilder &B, uint64_t Offset,
4448 Align Alignment) const {
4450 Register DstReg = MI.getOperand(0).getReg();
4451
4452 assert(B.getMRI()->getType(DstReg) == LLT::scalar(32) &&
4453 "unexpected kernarg parameter type");
4454
4455 Register Ptr = getKernargParameterPtr(B, Offset);
4456 MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
4457 B.buildLoad(DstReg, Ptr, PtrInfo, Align(4),
4458 MachineMemOperand::MODereferenceable |
4459 MachineMemOperand::MOInvariant);
4460 MI.eraseFromParent();
4461 return true;
4462}
4463
4464bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI,
4465 MachineRegisterInfo &MRI,
4466 MachineIRBuilder &B) const {
4467 Register Dst = MI.getOperand(0).getReg();
4468 LLT DstTy = MRI.getType(Dst);
4469 LLT S16 = LLT::scalar(16);
4470 LLT S32 = LLT::scalar(32);
4471 LLT S64 = LLT::scalar(64);
4472
4473 if (DstTy == S16)
4474 return legalizeFDIV16(MI, MRI, B);
4475 if (DstTy == S32)
4476 return legalizeFDIV32(MI, MRI, B);
4477 if (DstTy == S64)
4478 return legalizeFDIV64(MI, MRI, B);
4479
4480 return false;
4481}
4482
4483void AMDGPULegalizerInfo::legalizeUnsignedDIV_REM32Impl(MachineIRBuilder &B,
4484 Register DstDivReg,
4485 Register DstRemReg,
4486 Register X,
4487 Register Y) const {
4488 const LLT S1 = LLT::scalar(1);
4489 const LLT S32 = LLT::scalar(32);
4490
4491 // See AMDGPUCodeGenPrepare::expandDivRem32 for a description of the
4492 // algorithm used here.
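// In outline: form z ~= 2^32 / y from the f32 reciprocal, refine it with one
// Newton-Raphson step, take q = umulh(x, z) and r = x - q * y, then correct q
// and r with two conditional adjustments.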
4493
4494 // Initial estimate of inv(y).
4495 auto FloatY = B.buildUITOFP(S32, Y);
4496 auto RcpIFlag = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {FloatY});
4497 auto Scale = B.buildFConstant(S32, llvm::bit_cast<float>(0x4f7ffffe));
4498 auto ScaledY = B.buildFMul(S32, RcpIFlag, Scale);
4499 auto Z = B.buildFPTOUI(S32, ScaledY);
4500
4501 // One round of UNR.
4502 auto NegY = B.buildSub(S32, B.buildConstant(S32, 0), Y);
4503 auto NegYZ = B.buildMul(S32, NegY, Z);
4504 Z = B.buildAdd(S32, Z, B.buildUMulH(S32, Z, NegYZ));
4505
4506 // Quotient/remainder estimate.
4507 auto Q = B.buildUMulH(S32, X, Z);
4508 auto R = B.buildSub(S32, X, B.buildMul(S32, Q, Y));
4509
4510 // First quotient/remainder refinement.
4511 auto One = B.buildConstant(S32, 1);
4512 auto Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y);
4513 if (DstDivReg)
4514 Q = B.buildSelect(S32, Cond, B.buildAdd(S32, Q, One), Q);
4515 R = B.buildSelect(S32, Cond, B.buildSub(S32, R, Y), R);
4516
4517 // Second quotient/remainder refinement.
4518 Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y);
4519 if (DstDivReg)
4520 B.buildSelect(DstDivReg, Cond, B.buildAdd(S32, Q, One), Q);
4521
4522 if (DstRemReg)
4523 B.buildSelect(DstRemReg, Cond, B.buildSub(S32, R, Y), R);
4524}
4525
4526// Build integer reciprocal sequence around V_RCP_IFLAG_F32
4527//
4528// Return lo, hi of result
4529//
4530// %cvt.lo = G_UITOFP Val.lo
4531// %cvt.hi = G_UITOFP Val.hi
4532// %mad = G_FMAD %cvt.hi, 2**32, %cvt.lo
4533// %rcp = G_AMDGPU_RCP_IFLAG %mad
4534// %mul1 = G_FMUL %rcp, 0x5f7ffffc
4535// %mul2 = G_FMUL %mul1, 2**(-32)
4536// %trunc = G_INTRINSIC_TRUNC %mul2
4537// %mad2 = G_FMAD %trunc, -(2**32), %mul1
4538// return {G_FPTOUI %mad2, G_FPTOUI %trunc}
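// The two halves together form, roughly, an estimate of 2^64 / Val that the
// 64-bit division below refines further.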
4539static std::pair<Register, Register> emitReciprocalU64(MachineIRBuilder &B,
4540 Register Val) {
4541 const LLT S32 = LLT::scalar(32);
4542 auto Unmerge = B.buildUnmerge(S32, Val);
4543
4544 auto CvtLo = B.buildUITOFP(S32, Unmerge.getReg(0));
4545 auto CvtHi = B.buildUITOFP(S32, Unmerge.getReg(1));
4546
4547 auto Mad = B.buildFMAD(
4548 S32, CvtHi, // 2**32
4549 B.buildFConstant(S32, llvm::bit_cast<float>(0x4f800000)), CvtLo);
4550
4551 auto Rcp = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {Mad});
4552 auto Mul1 = B.buildFMul(
4553 S32, Rcp, B.buildFConstant(S32, llvm::bit_cast<float>(0x5f7ffffc)));
4554
4555 // 2**(-32)
4556 auto Mul2 = B.buildFMul(
4557 S32, Mul1, B.buildFConstant(S32, llvm::bit_cast<float>(0x2f800000)));
4558 auto Trunc = B.buildIntrinsicTrunc(S32, Mul2);
4559
4560 // -(2**32)
4561 auto Mad2 = B.buildFMAD(
4562 S32, Trunc, B.buildFConstant(S32, llvm::bit_cast<float>(0xcf800000)),
4563 Mul1);
4564
4565 auto ResultLo = B.buildFPTOUI(S32, Mad2);
4566 auto ResultHi = B.buildFPTOUI(S32, Trunc);
4567
4568 return {ResultLo.getReg(0), ResultHi.getReg(0)};
4569}
4570
4571void AMDGPULegalizerInfo::legalizeUnsignedDIV_REM64Impl(MachineIRBuilder &B,
4572 Register DstDivReg,
4573 Register DstRemReg,
4574 Register Numer,
4575 Register Denom) const {
4576 const LLT S32 = LLT::scalar(32);
4577 const LLT S64 = LLT::scalar(64);
4578 const LLT S1 = LLT::scalar(1);
4579 Register RcpLo, RcpHi;
4580
4581 std::tie(RcpLo, RcpHi) = emitReciprocalU64(B, Denom);
4582
4583 auto Rcp = B.buildMergeLikeInstr(S64, {RcpLo, RcpHi});
4584
4585 auto Zero64 = B.buildConstant(S64, 0);
4586 auto NegDenom = B.buildSub(S64, Zero64, Denom);
4587
4588 auto MulLo1 = B.buildMul(S64, NegDenom, Rcp);
4589 auto MulHi1 = B.buildUMulH(S64, Rcp, MulLo1);
4590
4591 auto UnmergeMulHi1 = B.buildUnmerge(S32, MulHi1);
4592 Register MulHi1_Lo = UnmergeMulHi1.getReg(0);
4593 Register MulHi1_Hi = UnmergeMulHi1.getReg(1);
4594
4595 auto Add1_Lo = B.buildUAddo(S32, S1, RcpLo, MulHi1_Lo);
4596 auto Add1_Hi = B.buildUAdde(S32, S1, RcpHi, MulHi1_Hi, Add1_Lo.getReg(1));
4597 auto Add1 = B.buildMergeLikeInstr(S64, {Add1_Lo, Add1_Hi});
4598
4599 auto MulLo2 = B.buildMul(S64, NegDenom, Add1);
4600 auto MulHi2 = B.buildUMulH(S64, Add1, MulLo2);
4601 auto UnmergeMulHi2 = B.buildUnmerge(S32, MulHi2);
4602 Register MulHi2_Lo = UnmergeMulHi2.getReg(0);
4603 Register MulHi2_Hi = UnmergeMulHi2.getReg(1);
4604
4605 auto Zero32 = B.buildConstant(S32, 0);
4606 auto Add2_Lo = B.buildUAddo(S32, S1, Add1_Lo, MulHi2_Lo);
4607 auto Add2_Hi = B.buildUAdde(S32, S1, Add1_Hi, MulHi2_Hi, Add2_Lo.getReg(1));
4608 auto Add2 = B.buildMergeLikeInstr(S64, {Add2_Lo, Add2_Hi});
4609
4610 auto UnmergeNumer = B.buildUnmerge(S32, Numer);
4611 Register NumerLo = UnmergeNumer.getReg(0);
4612 Register NumerHi = UnmergeNumer.getReg(1);
4613
4614 auto MulHi3 = B.buildUMulH(S64, Numer, Add2);
4615 auto Mul3 = B.buildMul(S64, Denom, MulHi3);
4616 auto UnmergeMul3 = B.buildUnmerge(S32, Mul3);
4617 Register Mul3_Lo = UnmergeMul3.getReg(0);
4618 Register Mul3_Hi = UnmergeMul3.getReg(1);
4619 auto Sub1_Lo = B.buildUSubo(S32, S1, NumerLo, Mul3_Lo);
4620 auto Sub1_Hi = B.buildUSube(S32, S1, NumerHi, Mul3_Hi, Sub1_Lo.getReg(1));
4621 auto Sub1_Mi = B.buildSub(S32, NumerHi, Mul3_Hi);
4622 auto Sub1 = B.buildMergeLikeInstr(S64, {Sub1_Lo, Sub1_Hi});
4623
4624 auto UnmergeDenom = B.buildUnmerge(S32, Denom);
4625 Register DenomLo = UnmergeDenom.getReg(0);
4626 Register DenomHi = UnmergeDenom.getReg(1);
4627
4628 auto CmpHi = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Hi, DenomHi);
4629 auto C1 = B.buildSExt(S32, CmpHi);
4630
4631 auto CmpLo = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Lo, DenomLo);
4632 auto C2 = B.buildSExt(S32, CmpLo);
4633
4634 auto CmpEq = B.buildICmp(CmpInst::ICMP_EQ, S1, Sub1_Hi, DenomHi);
4635 auto C3 = B.buildSelect(S32, CmpEq, C2, C1);
4636
4637 // TODO: Here and below, portions of the code could be enclosed in if/endif
4638 // blocks. Currently the control flow is unconditional and we have 4 selects
4639 // after the potential endif to substitute for PHIs.
4640
4641 // if C3 != 0 ...
4642 auto Sub2_Lo = B.buildUSubo(S32, S1, Sub1_Lo, DenomLo);
4643 auto Sub2_Mi = B.buildUSube(S32, S1, Sub1_Mi, DenomHi, Sub1_Lo.getReg(1));
4644 auto Sub2_Hi = B.buildUSube(S32, S1, Sub2_Mi, Zero32, Sub2_Lo.getReg(1));
4645 auto Sub2 = B.buildMergeLikeInstr(S64, {Sub2_Lo, Sub2_Hi});
4646
4647 auto One64 = B.buildConstant(S64, 1);
4648 auto Add3 = B.buildAdd(S64, MulHi3, One64);
4649
4650 auto C4 =
4651 B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Hi, DenomHi));
4652 auto C5 =
4653 B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Lo, DenomLo));
4654 auto C6 = B.buildSelect(
4655 S32, B.buildICmp(CmpInst::ICMP_EQ, S1, Sub2_Hi, DenomHi), C5, C4);
4656
4657 // if (C6 != 0)
4658 auto Add4 = B.buildAdd(S64, Add3, One64);
4659 auto Sub3_Lo = B.buildUSubo(S32, S1, Sub2_Lo, DenomLo);
4660
4661 auto Sub3_Mi = B.buildUSube(S32, S1, Sub2_Mi, DenomHi, Sub2_Lo.getReg(1));
4662 auto Sub3_Hi = B.buildUSube(S32, S1, Sub3_Mi, Zero32, Sub3_Lo.getReg(1));
4663 auto Sub3 = B.buildMergeLikeInstr(S64, {Sub3_Lo, Sub3_Hi});
4664
4665 // endif C6
4666 // endif C3
4667
4668 if (DstDivReg) {
4669 auto Sel1 = B.buildSelect(
4670 S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Add4, Add3);
4671 B.buildSelect(DstDivReg, B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32),
4672 Sel1, MulHi3);
4673 }
4674
4675 if (DstRemReg) {
4676 auto Sel2 = B.buildSelect(
4677 S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Sub3, Sub2);
4678 B.buildSelect(DstRemReg, B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32),
4679 Sel2, Sub1);
4680 }
4681}
4682
4683bool AMDGPULegalizerInfo::legalizeUnsignedDIV_REM(MachineInstr &MI,
4684 MachineRegisterInfo &MRI,
4685 MachineIRBuilder &B) const {
4686 Register DstDivReg, DstRemReg;
4687 switch (MI.getOpcode()) {
4688 default:
4689 llvm_unreachable("Unexpected opcode!");
4690 case AMDGPU::G_UDIV: {
4691 DstDivReg = MI.getOperand(0).getReg();
4692 break;
4693 }
4694 case AMDGPU::G_UREM: {
4695 DstRemReg = MI.getOperand(0).getReg();
4696 break;
4697 }
4698 case AMDGPU::G_UDIVREM: {
4699 DstDivReg = MI.getOperand(0).getReg();
4700 DstRemReg = MI.getOperand(1).getReg();
4701 break;
4702 }
4703 }
4704
4705 const LLT S64 = LLT::scalar(64);
4706 const LLT S32 = LLT::scalar(32);
4707 const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs();
4708 Register Num = MI.getOperand(FirstSrcOpIdx).getReg();
4709 Register Den = MI.getOperand(FirstSrcOpIdx + 1).getReg();
4710 LLT Ty = MRI.getType(MI.getOperand(0).getReg());
4711
4712 if (Ty == S32)
4713 legalizeUnsignedDIV_REM32Impl(B, DstDivReg, DstRemReg, Num, Den);
4714 else if (Ty == S64)
4715 legalizeUnsignedDIV_REM64Impl(B, DstDivReg, DstRemReg, Num, Den);
4716 else
4717 return false;
4718
4719 MI.eraseFromParent();
4720 return true;
4721}
4722
4723bool AMDGPULegalizerInfo::legalizeSignedDIV_REM(MachineInstr &MI,
4724 MachineRegisterInfo &MRI,
4725 MachineIRBuilder &B) const {
4726 const LLT S64 = LLT::scalar(64);
4727 const LLT S32 = LLT::scalar(32);
4728
4729 LLT Ty = MRI.getType(MI.getOperand(0).getReg());
4730 if (Ty != S32 && Ty != S64)
4731 return false;
4732
4733 const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs();
4734 Register LHS = MI.getOperand(FirstSrcOpIdx).getReg();
4735 Register RHS = MI.getOperand(FirstSrcOpIdx + 1).getReg();
4736
4737 auto SignBitOffset = B.buildConstant(S32, Ty.getSizeInBits() - 1);
4738 auto LHSign = B.buildAShr(Ty, LHS, SignBitOffset);
4739 auto RHSign = B.buildAShr(Ty, RHS, SignBitOffset);
4740
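// LHSign/RHSign are 0 or -1; (v + sign) ^ sign is |v| in two's complement,
// so the unsigned expansion can be reused and the signs reapplied to the
// quotient and remainder afterwards.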
4741 LHS = B.buildAdd(Ty, LHS, LHSign).getReg(0);
4742 RHS = B.buildAdd(Ty, RHS, RHSign).getReg(0);
4743
4744 LHS = B.buildXor(Ty, LHS, LHSign).getReg(0);
4745 RHS = B.buildXor(Ty, RHS, RHSign).getReg(0);
4746
4747 Register DstDivReg, DstRemReg, TmpDivReg, TmpRemReg;
4748 switch (MI.getOpcode()) {
4749 default:
4750 llvm_unreachable("Unexpected opcode!");
4751 case AMDGPU::G_SDIV: {
4752 DstDivReg = MI.getOperand(0).getReg();
4753 TmpDivReg = MRI.createGenericVirtualRegister(Ty);
4754 break;
4755 }
4756 case AMDGPU::G_SREM: {
4757 DstRemReg = MI.getOperand(0).getReg();
4758 TmpRemReg = MRI.createGenericVirtualRegister(Ty);
4759 break;
4760 }
4761 case AMDGPU::G_SDIVREM: {
4762 DstDivReg = MI.getOperand(0).getReg();
4763 DstRemReg = MI.getOperand(1).getReg();
4764 TmpDivReg = MRI.createGenericVirtualRegister(Ty);
4765 TmpRemReg = MRI.createGenericVirtualRegister(Ty);
4766 break;
4767 }
4768 }
4769
4770 if (Ty == S32)
4771 legalizeUnsignedDIV_REM32Impl(B, TmpDivReg, TmpRemReg, LHS, RHS);
4772 else
4773 legalizeUnsignedDIV_REM64Impl(B, TmpDivReg, TmpRemReg, LHS, RHS);
4774
4775 if (DstDivReg) {
4776 auto Sign = B.buildXor(Ty, LHSign, RHSign).getReg(0);
4777 auto SignXor = B.buildXor(Ty, TmpDivReg, Sign).getReg(0);
4778 B.buildSub(DstDivReg, SignXor, Sign);
4779 }
4780
4781 if (DstRemReg) {
4782 auto Sign = LHSign.getReg(0); // Remainder sign is the same as LHS
4783 auto SignXor = B.buildXor(Ty, TmpRemReg, Sign).getReg(0);
4784 B.buildSub(DstRemReg, SignXor, Sign);
4785 }
4786
4787 MI.eraseFromParent();
4788 return true;
4789}
4790
4791bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI,
4792 MachineRegisterInfo &MRI,
4793 MachineIRBuilder &B) const {
4794 Register Res = MI.getOperand(0).getReg();
4795 Register LHS = MI.getOperand(1).getReg();
4796 Register RHS = MI.getOperand(2).getReg();
4797 uint16_t Flags = MI.getFlags();
4798 LLT ResTy = MRI.getType(Res);
4799
4800 const MachineFunction &MF = B.getMF();
4801 bool AllowInaccurateRcp = MI.getFlag(MachineInstr::FmAfn) ||
4802 MF.getTarget().Options.UnsafeFPMath;
4803 
4804 if (auto CLHS = getConstantFPVRegVal(LHS, MRI)) {
4805 if (!AllowInaccurateRcp && ResTy != LLT::scalar(16))
4806 return false;
4807
4808 // v_rcp_f32 and v_rsq_f32 do not support denormals and, according to
4809 // the CI documentation, have a worst case error of 1 ulp.
4810 // OpenCL requires <= 2.5 ulp for 1.0 / x, so it should always be OK to
4811 // use it as long as we aren't trying to use denormals.
4812 //
4813 // v_rcp_f16 and v_rsq_f16 DO support denormals, with a worst case error of 0.51 ulp.
4814
4815 // 1 / x -> RCP(x)
4816 if (CLHS->isExactlyValue(1.0)) {
4817 B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res)
4818 .addUse(RHS)
4819 .setMIFlags(Flags);
4820
4821 MI.eraseFromParent();
4822 return true;
4823 }
4824
4825 // -1 / x -> RCP( FNEG(x) )
4826 if (CLHS->isExactlyValue(-1.0)) {
4827 auto FNeg = B.buildFNeg(ResTy, RHS, Flags);
4828 B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res)
4829 .addUse(FNeg.getReg(0))
4830 .setMIFlags(Flags);
4831
4832 MI.eraseFromParent();
4833 return true;
4834 }
4835 }
4836
4837 // For f16 require afn or arcp.
4838 // For f32 require afn.
4839 if (!AllowInaccurateRcp && (ResTy != LLT::scalar(16) ||
4840 !MI.getFlag(MachineInstr::FmArcp)))
4841 return false;
4842
4843 // x / y -> x * (1.0 / y)
4844 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy})
4845 .addUse(RHS)
4846 .setMIFlags(Flags);
4847 B.buildFMul(Res, LHS, RCP, Flags);
4848
4849 MI.eraseFromParent();
4850 return true;
4851}
4852
4853bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV64(MachineInstr &MI,
4854 MachineRegisterInfo &MRI,
4855 MachineIRBuilder &B) const {
4856 Register Res = MI.getOperand(0).getReg();
4857 Register X = MI.getOperand(1).getReg();
4858 Register Y = MI.getOperand(2).getReg();
4859 uint16_t Flags = MI.getFlags();
4860 LLT ResTy = MRI.getType(Res);
4861
4862 const MachineFunction &MF = B.getMF();
4863 bool AllowInaccurateRcp = MF.getTarget().Options.UnsafeFPMath ||
4864 MI.getFlag(MachineInstr::FmAfn);
4865
4866 if (!AllowInaccurateRcp)
4867 return false;
4868
4869 auto NegY = B.buildFNeg(ResTy, Y);
4870 auto One = B.buildFConstant(ResTy, 1.0);
4871
4872 auto R = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy})
4873 .addUse(Y)
4874 .setMIFlags(Flags);
4875
4876 auto Tmp0 = B.buildFMA(ResTy, NegY, R, One);
4877 R = B.buildFMA(ResTy, Tmp0, R, R);
4878
4879 auto Tmp1 = B.buildFMA(ResTy, NegY, R, One);
4880 R = B.buildFMA(ResTy, Tmp1, R, R);
4881
4882 auto Ret = B.buildFMul(ResTy, X, R);
4883 auto Tmp2 = B.buildFMA(ResTy, NegY, Ret, X);
4884
4885 B.buildFMA(Res, Tmp2, R, Ret);
4886 MI.eraseFromParent();
4887 return true;
4888}
4889
4890bool AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI,
4891 MachineRegisterInfo &MRI,
4892 MachineIRBuilder &B) const {
4893 if (legalizeFastUnsafeFDIV(MI, MRI, B))
4894 return true;
4895
4896 Register Res = MI.getOperand(0).getReg();
4897 Register LHS = MI.getOperand(1).getReg();
4898 Register RHS = MI.getOperand(2).getReg();
4899
4900 uint16_t Flags = MI.getFlags();
4901
4902 LLT S16 = LLT::scalar(16);
4903 LLT S32 = LLT::scalar(32);
4904
4905 auto LHSExt = B.buildFPExt(S32, LHS, Flags);
4906 auto RHSExt = B.buildFPExt(S32, RHS, Flags);
4907
4908 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32})
4909 .addUse(RHSExt.getReg(0))
4910 .setMIFlags(Flags);
4911
4912 auto QUOT = B.buildFMul(S32, LHSExt, RCP, Flags);
4913 auto RDst = B.buildFPTrunc(S16, QUOT, Flags);
4914
4915 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res)
4916 .addUse(RDst.getReg(0))
4917 .addUse(RHS)
4918 .addUse(LHS)
4919 .setMIFlags(Flags);
4920
4921 MI.eraseFromParent();
4922 return true;
4923}
4924
4925static constexpr unsigned SPDenormModeBitField =
4926 AMDGPU::Hwreg::HwregEncoding::encode(AMDGPU::Hwreg::ID_MODE, 4, 2);
4927 
4928// Enable or disable FP32 denorm mode. When 'Enable' is true, emit instructions
4929// to enable denorm mode. When 'Enable' is false, disable denorm mode.
4930static void toggleSPDenormMode(bool Enable, MachineIRBuilder &B,
4931 const GCNSubtarget &ST,
4932 SIModeRegisterDefaults Mode) {
4933 // Set SP denorm mode to this value.
4934 unsigned SPDenormMode =
4935 Enable ? FP_DENORM_FLUSH_NONE : Mode.fpDenormModeSPValue();
4936
4937 if (ST.hasDenormModeInst()) {
4938 // Preserve default FP64FP16 denorm mode while updating FP32 mode.
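// S_DENORM_MODE writes the whole 4-bit FP_DENORM field: the f32 controls sit
// in the low two bits and the f64/f16 controls in the next two, hence the
// (DP << 2) merge below.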
4939 uint32_t DPDenormModeDefault = Mode.fpDenormModeDPValue();
4940
4941 uint32_t NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2);
4942 B.buildInstr(AMDGPU::S_DENORM_MODE)
4943 .addImm(NewDenormModeValue);
4944
4945 } else {
4946 B.buildInstr(AMDGPU::S_SETREG_IMM32_B32)
4947 .addImm(SPDenormMode)
4948 .addImm(SPDenormModeBitField);
4949 }
4950}
4951
4952bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI,
4953 MachineRegisterInfo &MRI,
4954 MachineIRBuilder &B) const {
4955 if (legalizeFastUnsafeFDIV(MI, MRI, B))
4956 return true;
4957
4958 Register Res = MI.getOperand(0).getReg();
4959 Register LHS = MI.getOperand(1).getReg();
4960 Register RHS = MI.getOperand(2).getReg();
4961 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
4962 SIModeRegisterDefaults Mode = MFI->getMode();
4963
4964 uint16_t Flags = MI.getFlags();
4965
4966 LLT S32 = LLT::scalar(32);
4967 LLT S1 = LLT::scalar(1);
4968
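// Full-precision expansion: scale the operands with div_scale, refine
// rcp(denominator) with FMA-based Newton-Raphson steps, and let div_fmas and
// div_fixup undo the scaling and patch up the special cases.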
4969 auto One = B.buildFConstant(S32, 1.0f);
4970
4971 auto DenominatorScaled =
4972 B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1})
4973 .addUse(LHS)
4974 .addUse(RHS)
4975 .addImm(0)
4976 .setMIFlags(Flags);
4977 auto NumeratorScaled =
4978 B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1})
4979 .addUse(LHS)
4980 .addUse(RHS)
4981 .addImm(1)
4982 .setMIFlags(Flags);
4983
4984 auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32})
4985 .addUse(DenominatorScaled.getReg(0))
4986 .setMIFlags(Flags);
4987 auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags);
4988
4989 const bool PreservesDenormals = Mode.FP32Denormals == DenormalMode::getIEEE();
4990 const bool HasDynamicDenormals =
4991 (Mode.FP32Denormals.Input == DenormalMode::Dynamic) ||
4992 (Mode.FP32Denormals.Output == DenormalMode::Dynamic);
4993
4994 Register SavedSPDenormMode;
4995 if (!PreservesDenormals) {
4996 if (HasDynamicDenormals) {
4997 SavedSPDenormMode = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
4998 B.buildInstr(AMDGPU::S_GETREG_B32)
4999 .addDef(SavedSPDenormMode)
5000 .addImm(SPDenormModeBitField);
5001 }
5002 toggleSPDenormMode(true, B, ST, Mode);
5003 }
5004
5005 auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags);
5006 auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags);
5007 auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags);
5008 auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags);
5009 auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags);
5010 auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags);
5011
5012 if (!PreservesDenormals) {
5013 if (HasDynamicDenormals) {
5014 assert(SavedSPDenormMode);
5015 B.buildInstr(AMDGPU::S_SETREG_B32)
5016 .addReg(SavedSPDenormMode)
5017 .addImm(SPDenormModeBitField);
5018 } else
5019 toggleSPDenormMode(false, B, ST, Mode);
5020 }
5021
5022 auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32})
5023 .addUse(Fma4.getReg(0))
5024 .addUse(Fma1.getReg(0))
5025 .addUse(Fma3.getReg(0))
5026 .addUse(NumeratorScaled.getReg(1))
5027 .setMIFlags(Flags);
5028
5029 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res)
5030 .addUse(Fmas.getReg(0))
5031 .addUse(RHS)
5032 .addUse(LHS)
5033 .setMIFlags(Flags);
5034
5035 MI.eraseFromParent();
5036 return true;
5037}
5038
5039bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI,
5040 MachineRegisterInfo &MRI,
5041 MachineIRBuilder &B) const {
5042 if (legalizeFastUnsafeFDIV64(MI, MRI, B))
5043 return true;
5044 
5044
5045 Register Res = MI.getOperand(0).getReg();
5046 Register LHS = MI.getOperand(1).getReg();
5047 Register RHS = MI.getOperand(2).getReg();
5048
5049 uint16_t Flags = MI.getFlags();
5050
5051 LLT S64 = LLT::scalar(64);
5052 LLT S1 = LLT::scalar(1);
5053
5054 auto One = B.buildFConstant(S64, 1.0);
5055
5056 auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1})
5057 .addUse(LHS)
5058 .addUse(RHS)
5059 .addImm(0)
5060 .setMIFlags(Flags);
5061
5062 auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags);
5063
5064 auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64})
5065 .addUse(DivScale0.getReg(0))
5066 .setMIFlags(Flags);
5067
5068 auto Fma0 = B.buildFMA(S64, NegDivScale0, Rcp, One, Flags);
5069 auto Fma1 = B.buildFMA(S64, Rcp, Fma0, Rcp, Flags);
5070 auto Fma2 = B.buildFMA(S64, NegDivScale0, Fma1, One, Flags);
5071
5072 auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1})
5073 .addUse(LHS)
5074 .addUse(RHS)
5075 .addImm(1)
5076 .setMIFlags(Flags);
5077
5078 auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags);
5079 auto Mul = B.buildFMul(S64, DivScale1.getReg(0), Fma3, Flags);
5080 auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags);
5081
5082 Register Scale;
5083 if (!ST.hasUsableDivScaleConditionOutput()) {
5084 // Workaround a hardware bug on SI where the condition output from div_scale
5085 // is not usable.
5086
5087 LLT S32 = LLT::scalar(32);
5088
5089 auto NumUnmerge = B.buildUnmerge(S32, LHS);
5090 auto DenUnmerge = B.buildUnmerge(S32, RHS);
5091 auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0);
5092 auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1);
5093
5094 auto CmpNum = B.buildICmp(ICmpInst::ICMP_EQ, S1, NumUnmerge.getReg(1),
5095 Scale1Unmerge.getReg(1));
5096 auto CmpDen = B.buildICmp(ICmpInst::ICMP_EQ, S1, DenUnmerge.getReg(1),
5097 Scale0Unmerge.getReg(1));
5098 Scale = B.buildXor(S1, CmpNum, CmpDen).getReg(0);
5099 } else {
5100 Scale = DivScale1.getReg(1);
5101 }
5102
5103 auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64})
5104 .addUse(Fma4.getReg(0))
5105 .addUse(Fma3.getReg(0))
5106 .addUse(Mul.getReg(0))
5107 .addUse(Scale)
5108 .setMIFlags(Flags);
5109
5110 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, ArrayRef(Res))
5111 .addUse(Fmas.getReg(0))
5112 .addUse(RHS)
5113 .addUse(LHS)
5114 .setMIFlags(Flags);
5115
5116 MI.eraseFromParent();
5117 return true;
5118}
5119
5120bool AMDGPULegalizerInfo::legalizeFFREXP(MachineInstr &MI,
5121 MachineRegisterInfo &MRI,
5122 MachineIRBuilder &B) const {
5123 Register Res0 = MI.getOperand(0).getReg();
5124 Register Res1 = MI.getOperand(1).getReg();
5125 Register Val = MI.getOperand(2).getReg();
5126 uint16_t Flags = MI.getFlags();
5127
5128 LLT Ty = MRI.getType(Res0);
5129 LLT InstrExpTy = Ty == LLT::scalar(16) ? LLT::scalar(16) : LLT::scalar(32);
5130
5131 auto Mant = B.buildIntrinsic(Intrinsic::amdgcn_frexp_mant, {Ty})
5132 .addUse(Val)
5133 .setMIFlags(Flags);
5134 auto Exp = B.buildIntrinsic(Intrinsic::amdgcn_frexp_exp, {InstrExpTy})
5135 .addUse(Val)
5136 .setMIFlags(Flags);
5137
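// On subtargets with the fract bug, handle non-finite inputs explicitly: pass
// the source through as the mantissa and force a zero exponent when the value
// is inf or nan.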
5138 if (ST.hasFractBug()) {
5139 auto Fabs = B.buildFAbs(Ty, Val);
5140 auto Inf = B.buildFConstant(Ty, APFloat::getInf(getFltSemanticForLLT(Ty)));
5141 auto IsFinite =
5142 B.buildFCmp(CmpInst::FCMP_OLT, LLT::scalar(1), Fabs, Inf, Flags);
5143 auto Zero = B.buildConstant(InstrExpTy, 0);
5144 Exp = B.buildSelect(InstrExpTy, IsFinite, Exp, Zero);
5145 Mant = B.buildSelect(Ty, IsFinite, Mant, Val);
5146 }
5147
5148 B.buildCopy(Res0, Mant);
5149 B.buildSExtOrTrunc(Res1, Exp);
5150
5151 MI.eraseFromParent();
5152 return true;
5153}
5154
5155bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI,
5156 MachineRegisterInfo &MRI,
5157 MachineIRBuilder &B) const {
5158 Register Res = MI.getOperand(0).getReg();
5159 Register LHS = MI.getOperand(2).getReg();
5160 Register RHS = MI.getOperand(3).getReg();
5161 uint16_t Flags = MI.getFlags();
5162
5163 LLT S32 = LLT::scalar(32);
5164 LLT S1 = LLT::scalar(1);
5165
5166 auto Abs = B.buildFAbs(S32, RHS, Flags);
5167 const APFloat C0Val(1.0f);
5168
5169 auto C0 = B.buildFConstant(S32, 0x1p+96f);
5170 auto C1 = B.buildFConstant(S32, 0x1p-32f);
5171 auto C2 = B.buildFConstant(S32, 1.0f);
5172
5173 auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags);
5174 auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags);
5175
5176 auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags);
5177
5178 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32})
5179 .addUse(Mul0.getReg(0))
5180 .setMIFlags(Flags);
5181
5182 auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags);
5183
5184 B.buildFMul(Res, Sel, Mul1, Flags);
5185
5186 MI.eraseFromParent();
5187 return true;
5188}
5189
5190bool AMDGPULegalizerInfo::legalizeFSQRTF16(MachineInstr &MI,
5191 MachineRegisterInfo &MRI,
5192 MachineIRBuilder &B) const {
5193 // Bypass the correct expansion that a standard promotion through G_FSQRT
5194 // would get. The f32 op is accurate enough for the f16 case.
5195 unsigned Flags = MI.getFlags();
5196 assert(!ST.has16BitInsts());
5197 const LLT F32 = LLT::scalar(32);
5198 auto Ext = B.buildFPExt(F32, MI.getOperand(1), Flags);
5199 auto Sqrt = B.buildIntrinsic(Intrinsic::amdgcn_sqrt, {F32})
5200 .addUse(Ext.getReg(0))
5201 .setMIFlags(Flags);
5202 B.buildFPTrunc(MI.getOperand(0), Sqrt, Flags);
5203 MI.eraseFromParent();
5204 return true;
5205}
5206
5207bool AMDGPULegalizerInfo::legalizeFSQRTF32(MachineInstr &MI,
5208 MachineRegisterInfo &MRI,
5209 MachineIRBuilder &B) const {
5210 MachineFunction &MF = B.getMF();
5211 Register Dst = MI.getOperand(0).getReg();
5212 Register X = MI.getOperand(1).getReg();
5213 const unsigned Flags = MI.getFlags();
5214 const LLT S1 = LLT::scalar(1);
5215 const LLT F32 = LLT::scalar(32);
5216 const LLT I32 = LLT::scalar(32);
5217
5218 if (allowApproxFunc(MF, Flags)) {
5219 B.buildIntrinsic(Intrinsic::amdgcn_sqrt, ArrayRef<Register>({Dst}))
5220 .addUse(X)
5221 .setMIFlags(Flags);
5222 MI.eraseFromParent();
5223 return true;
5224 }
5225
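// Inputs below 2^-96 are scaled up by 2^32 so the hardware sqrt sees a
// normal value; the 2^-16 factor applied near the end undoes that scaling on
// the result.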
5226 auto ScaleThreshold = B.buildFConstant(F32, 0x1.0p-96f);
5227 auto NeedScale = B.buildFCmp(CmpInst::FCMP_OGT, S1, ScaleThreshold, X, Flags);
5228 auto ScaleUpFactor = B.buildFConstant(F32, 0x1.0p+32f);
5229 auto ScaledX = B.buildFMul(F32, X, ScaleUpFactor, Flags);
5230 auto SqrtX = B.buildSelect(F32, NeedScale, ScaledX, X, Flags);
5231
5232 Register SqrtS = MRI.createGenericVirtualRegister(F32);
5233 if (needsDenormHandlingF32(MF, X, Flags)) {
5234 B.buildIntrinsic(Intrinsic::amdgcn_sqrt, ArrayRef<Register>({SqrtS}))
5235 .addUse(SqrtX.getReg(0))
5236 .setMIFlags(Flags);
5237
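    // Refine the hardware result by at most one ulp. Adding -1/+1 to the raw
    // f32 bit pattern gives the neighbouring representable values below and
    // above SqrtS; the FMAs compute the residuals SqrtX - neighbour * SqrtS,
    // and the selects nudge the estimate down or up when a residual indicates
    // the correctly rounded result lies on that side.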
5238 auto NegOne = B.buildConstant(I32, -1);
5239 auto SqrtSNextDown = B.buildAdd(I32, SqrtS, NegOne);
5240
5241 auto NegSqrtSNextDown = B.buildFNeg(F32, SqrtSNextDown, Flags);
5242 auto SqrtVP = B.buildFMA(F32, NegSqrtSNextDown, SqrtS, SqrtX, Flags);
5243
5244 auto PosOne = B.buildConstant(I32, 1);
5245 auto SqrtSNextUp = B.buildAdd(I32, SqrtS, PosOne);
5246
5247 auto NegSqrtSNextUp = B.buildFNeg(F32, SqrtSNextUp, Flags);
5248 auto SqrtVS = B.buildFMA(F32, NegSqrtSNextUp, SqrtS, SqrtX, Flags);
5249
5250 auto Zero = B.buildFConstant(F32, 0.0f);
5251 auto SqrtVPLE0 = B.buildFCmp(CmpInst::FCMP_OLE, S1, SqrtVP, Zero, Flags);
5252
5253 SqrtS =
5254 B.buildSelect(F32, SqrtVPLE0, SqrtSNextDown, SqrtS, Flags).getReg(0);
5255
5256 auto SqrtVPVSGT0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, SqrtVS, Zero, Flags);
5257 SqrtS =
5258 B.buildSelect(F32, SqrtVPVSGT0, SqrtSNextUp, SqrtS, Flags).getReg(0);
5259 } else {
5260 auto SqrtR =
5261 B.buildIntrinsic(Intrinsic::amdgcn_rsq, {F32}).addReg(SqrtX.getReg(0));
5262 B.buildFMul(SqrtS, SqrtX, SqrtR, Flags);
5263
5264 auto Half = B.buildFConstant(F32, 0.5f);
5265 auto SqrtH = B.buildFMul(F32, SqrtR, Half, Flags);
5266 auto NegSqrtH = B.buildFNeg(F32, SqrtH, Flags);
5267 auto SqrtE = B.buildFMA(F32, NegSqrtH, SqrtS, Half, Flags);
5268 SqrtH = B.buildFMA(F32, SqrtH, SqrtE, SqrtH, Flags);
5269 SqrtS = B.buildFMA(F32, SqrtS, SqrtE, SqrtS, Flags).getReg(0);
5270 auto NegSqrtS = B.buildFNeg(F32, SqrtS, Flags);
5271 auto SqrtD = B.buildFMA(F32, NegSqrtS, SqrtS, SqrtX, Flags);
5272 SqrtS = B.buildFMA(F32, SqrtD, SqrtH, SqrtS, Flags).getReg(0);
5273 }
5274
5275 auto ScaleDownFactor = B.buildFConstant(F32, 0x1.0p-16f);
5276
5277 auto ScaledDown = B.buildFMul(F32, SqrtS, ScaleDownFactor, Flags);
5278
5279 SqrtS = B.buildSelect(F32, NeedScale, ScaledDown, SqrtS, Flags).getReg(0);
5280
5281 auto IsZeroOrInf = B.buildIsFPClass(LLT::scalar(1), SqrtX, fcZero | fcPosInf);
5282 B.buildSelect(Dst, IsZeroOrInf, SqrtX, SqrtS, Flags);
5283
5284 MI.eraseFromParent();
5285 return true;
5286}
5287
5288bool AMDGPULegalizerInfo::legalizeFSQRTF64(MachineInstr &MI,
5289                                           MachineRegisterInfo &MRI,
5290                                           MachineIRBuilder &B) const {
5291  // For the double type, the SQRT and RSQ instructions don't have the
5292  // required precision, so we apply Goldschmidt's algorithm to improve the result:
5293 //
5294 // y0 = rsq(x)
5295 // g0 = x * y0
5296 // h0 = 0.5 * y0
5297 //
5298 // r0 = 0.5 - h0 * g0
5299 // g1 = g0 * r0 + g0
5300 // h1 = h0 * r0 + h0
5301 //
5302 // r1 = 0.5 - h1 * g1 => d0 = x - g1 * g1
5303 // g2 = g1 * r1 + g1 g2 = d0 * h1 + g1
5304 // h2 = h1 * r1 + h1
5305 //
5306 // r2 = 0.5 - h2 * g2 => d1 = x - g2 * g2
5307 // g3 = g2 * r2 + g2 g3 = d1 * h1 + g2
5308 //
5309 // sqrt(x) = g3
5310
5311 const LLT S1 = LLT::scalar(1);
5312 const LLT S32 = LLT::scalar(32);
5313 const LLT F64 = LLT::scalar(64);
5314
5315 Register Dst = MI.getOperand(0).getReg();
5316 assert(MRI.getType(Dst) == F64 && "only expect to lower f64 sqrt");
5317
5318 Register X = MI.getOperand(1).getReg();
5319 unsigned Flags = MI.getFlags();
5320
5321 auto ScaleConstant = B.buildFConstant(F64, 0x1.0p-767);
5322
5323 auto ZeroInt = B.buildConstant(S32, 0);
5324 auto Scaling = B.buildFCmp(FCmpInst::FCMP_OLT, S1, X, ScaleConstant);
5325
5326 // Scale up input if it is too small.
5327 auto ScaleUpFactor = B.buildConstant(S32, 256);
5328 auto ScaleUp = B.buildSelect(S32, Scaling, ScaleUpFactor, ZeroInt);
5329 auto SqrtX = B.buildFLdexp(F64, X, ScaleUp, Flags);
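  // sqrt(x * 2^256) == sqrt(x) * 2^128, so the matching downscale below uses
  // ldexp with -128 rather than -256.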
5330
5331 auto SqrtY =
5332 B.buildIntrinsic(Intrinsic::amdgcn_rsq, {F64}).addReg(SqrtX.getReg(0));
5333
5334 auto Half = B.buildFConstant(F64, 0.5);
5335 auto SqrtH0 = B.buildFMul(F64, SqrtY, Half);
5336 auto SqrtS0 = B.buildFMul(F64, SqrtX, SqrtY);
5337
5338 auto NegSqrtH0 = B.buildFNeg(F64, SqrtH0);
5339 auto SqrtR0 = B.buildFMA(F64, NegSqrtH0, SqrtS0, Half);
5340
5341 auto SqrtS1 = B.buildFMA(F64, SqrtS0, SqrtR0, SqrtS0);
5342 auto SqrtH1 = B.buildFMA(F64, SqrtH0, SqrtR0, SqrtH0);
5343
5344 auto NegSqrtS1 = B.buildFNeg(F64, SqrtS1);
5345 auto SqrtD0 = B.buildFMA(F64, NegSqrtS1, SqrtS1, SqrtX);
5346
5347 auto SqrtS2 = B.buildFMA(F64, SqrtD0, SqrtH1, SqrtS1);
5348
5349 auto NegSqrtS2 = B.buildFNeg(F64, SqrtS2);
5350 auto SqrtD1 = B.buildFMA(F64, NegSqrtS2, SqrtS2, SqrtX);
5351
5352 auto SqrtRet = B.buildFMA(F64, SqrtD1, SqrtH1, SqrtS2);
5353
5354 // Scale down the result.
5355 auto ScaleDownFactor = B.buildConstant(S32, -128);
5356 auto ScaleDown = B.buildSelect(S32, Scaling, ScaleDownFactor, ZeroInt);
5357 SqrtRet = B.buildFLdexp(F64, SqrtRet, ScaleDown, Flags);
5358
5359 // TODO: Switch to fcmp oeq 0 for finite only. Can't fully remove this check
5360 // with finite only or nsz because rsq(+/-0) = +/-inf
5361
5362 // TODO: Check for DAZ and expand to subnormals
5363 auto IsZeroOrInf = B.buildIsFPClass(LLT::scalar(1), SqrtX, fcZero | fcPosInf);
5364
5365 // If x is +INF, +0, or -0, use its original value
5366 B.buildSelect(Dst, IsZeroOrInf, SqrtX, SqrtRet, Flags);
5367
5368 MI.eraseFromParent();
5369 return true;
5370}
5371
5372bool AMDGPULegalizerInfo::legalizeFSQRT(MachineInstr &MI,
5373                                        MachineRegisterInfo &MRI,
5374                                        MachineIRBuilder &B) const {
5375 LLT Ty = MRI.getType(MI.getOperand(0).getReg());
5376 if (Ty == LLT::scalar(32))
5377 return legalizeFSQRTF32(MI, MRI, B);
5378 if (Ty == LLT::scalar(64))
5379 return legalizeFSQRTF64(MI, MRI, B);
5380 if (Ty == LLT::scalar(16))
5381 return legalizeFSQRTF16(MI, MRI, B);
5382 return false;
5383}
5384
5385// Expand llvm.amdgcn.rsq.clamp on targets that don't support the instruction.
5386// FIXME: Why do we handle this one but not other removed instructions?
5387//
5388// Reciprocal square root. The clamp prevents infinite results, clamping
5389// infinities to max_float. D.f = 1.0 / sqrt(S0.f), result clamped to
5390// +-max_float.
5393 MachineIRBuilder &B) const {
5395 return true;
5396
5397 Register Dst = MI.getOperand(0).getReg();
5398 Register Src = MI.getOperand(2).getReg();
5399 auto Flags = MI.getFlags();
5400
5401 LLT Ty = MRI.getType(Dst);
5402
5403 const fltSemantics *FltSemantics;
5404 if (Ty == LLT::scalar(32))
5405 FltSemantics = &APFloat::IEEEsingle();
5406 else if (Ty == LLT::scalar(64))
5407 FltSemantics = &APFloat::IEEEdouble();
5408 else
5409 return false;
5410
5411 auto Rsq = B.buildIntrinsic(Intrinsic::amdgcn_rsq, {Ty})
5412 .addUse(Src)
5413 .setMIFlags(Flags);
5414
5415  // We don't need to concern ourselves with the snan handling difference,
5416  // since the rsq already quieted the input (or not); use the variant that will directly select.
5417 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
5418 const bool UseIEEE = MFI->getMode().IEEE;
5419
5420 auto MaxFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics));
5421 auto ClampMax = UseIEEE ? B.buildFMinNumIEEE(Ty, Rsq, MaxFlt, Flags) :
5422 B.buildFMinNum(Ty, Rsq, MaxFlt, Flags);
5423
5424 auto MinFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics, true));
5425
5426 if (UseIEEE)
5427 B.buildFMaxNumIEEE(Dst, ClampMax, MinFlt, Flags);
5428 else
5429 B.buildFMaxNum(Dst, ClampMax, MinFlt, Flags);
5430 MI.eraseFromParent();
5431 return true;
5432}
5433
5434// TODO: Fix pointer type handling
5437 Intrinsic::ID IID) const {
5438
5439 MachineIRBuilder &B = Helper.MIRBuilder;
5440 MachineRegisterInfo &MRI = *B.getMRI();
5441
5442 bool IsPermLane16 = IID == Intrinsic::amdgcn_permlane16 ||
5443 IID == Intrinsic::amdgcn_permlanex16;
5444
5445 auto createLaneOp = [&IID, &B, &MI](Register Src0, Register Src1,
5446 Register Src2, LLT VT) -> Register {
5447 auto LaneOp = B.buildIntrinsic(IID, {VT}).addUse(Src0);
5448 switch (IID) {
5449 case Intrinsic::amdgcn_readfirstlane:
5450 case Intrinsic::amdgcn_permlane64:
5451 return LaneOp.getReg(0);
5452 case Intrinsic::amdgcn_readlane:
5453 return LaneOp.addUse(Src1).getReg(0);
5454 case Intrinsic::amdgcn_writelane:
5455 return LaneOp.addUse(Src1).addUse(Src2).getReg(0);
5456 case Intrinsic::amdgcn_permlane16:
5457 case Intrinsic::amdgcn_permlanex16: {
5458 Register Src3 = MI.getOperand(5).getReg();
5459 Register Src4 = MI.getOperand(6).getImm();
5460 Register Src5 = MI.getOperand(7).getImm();
5461 return LaneOp.addUse(Src1)
5462 .addUse(Src2)
5463 .addUse(Src3)
5464 .addImm(Src4)
5465 .addImm(Src5)
5466 .getReg(0);
5467 }
5468 default:
5469 llvm_unreachable("unhandled lane op");
5470 }
5471 };
5472
5473 Register DstReg = MI.getOperand(0).getReg();
5474 Register Src0 = MI.getOperand(2).getReg();
5475 Register Src1, Src2;
5476 if (IID == Intrinsic::amdgcn_readlane || IID == Intrinsic::amdgcn_writelane ||
5477 IsPermLane16) {
5478 Src1 = MI.getOperand(3).getReg();
5479 if (IID == Intrinsic::amdgcn_writelane || IsPermLane16) {
5480 Src2 = MI.getOperand(4).getReg();
5481 }
5482 }
5483
5484 LLT Ty = MRI.getType(DstReg);
5485 unsigned Size = Ty.getSizeInBits();
5486
5487 if (Size == 32) {
5488 // Already legal
5489 return true;
5490 }
5491
5492 if (Size < 32) {
5493 Src0 = B.buildAnyExt(S32, Src0).getReg(0);
5494
5495 if (IsPermLane16)
5496 Src1 = B.buildAnyExt(LLT::scalar(32), Src1).getReg(0);
5497
5498 if (IID == Intrinsic::amdgcn_writelane)
5499 Src2 = B.buildAnyExt(LLT::scalar(32), Src2).getReg(0);
5500
5501 Register LaneOpDst = createLaneOp(Src0, Src1, Src2, S32);
5502 B.buildTrunc(DstReg, LaneOpDst);
5503 MI.eraseFromParent();
5504 return true;
5505 }
5506
5507 if (Size % 32 != 0)
5508 return false;
5509
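  // Values wider than 32 bits (in 32-bit multiples) are split into 32-bit
  // pieces (or <2 x s16> pieces for vectors of 16-bit elements); the lane op
  // is applied to each piece and the results are merged back into the full type.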
5510 LLT PartialResTy = S32;
5511 if (Ty.isVector()) {
5512 LLT EltTy = Ty.getElementType();
5513 switch (EltTy.getSizeInBits()) {
5514 case 16:
5515 PartialResTy = Ty.changeElementCount(ElementCount::getFixed(2));
5516 break;
5517 case 32:
5518 PartialResTy = EltTy;
5519 break;
5520 default:
5521 // Handle all other cases via S32 pieces;
5522 break;
5523 }
5524 }
5525
5526 SmallVector<Register, 2> PartialRes;
5527 unsigned NumParts = Size / 32;
5528 MachineInstrBuilder Src0Parts = B.buildUnmerge(PartialResTy, Src0);
5529 MachineInstrBuilder Src1Parts, Src2Parts;
5530
5531 if (IsPermLane16)
5532 Src1Parts = B.buildUnmerge(PartialResTy, Src1);
5533
5534 if (IID == Intrinsic::amdgcn_writelane)
5535 Src2Parts = B.buildUnmerge(PartialResTy, Src2);
5536
5537 for (unsigned i = 0; i < NumParts; ++i) {
5538 Src0 = Src0Parts.getReg(i);
5539
5540 if (IsPermLane16)
5541 Src1 = Src1Parts.getReg(i);
5542
5543 if (IID == Intrinsic::amdgcn_writelane)
5544 Src2 = Src2Parts.getReg(i);
5545
5546 PartialRes.push_back(createLaneOp(Src0, Src1, Src2, PartialResTy));
5547 }
5548
5549 B.buildMergeLikeInstr(DstReg, PartialRes);
5550 MI.eraseFromParent();
5551 return true;
5552}
5553
5556 MachineIRBuilder &B) const {
5560 LLT DstTy = MRI.getType(DstReg);
5561 LLT IdxTy = LLT::scalar(DstTy.getSizeInBits());
5562
5563 Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy);
5564 if (!loadInputValue(KernargPtrReg, B,
5566 return false;
5567
5568 // FIXME: This should be nuw
5569 B.buildPtrAdd(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0));
5570 return true;
5571}
5572
5573/// To create a buffer resource from a 64-bit pointer, mask off the upper 32
5574/// bits of the pointer and replace them with the stride argument, then
5575/// merge_values everything together. In the common case of a raw buffer (the
5576/// stride component is 0), we can just AND off the upper half.
5579 Register Result = MI.getOperand(0).getReg();
5580 Register Pointer = MI.getOperand(2).getReg();
5581 Register Stride = MI.getOperand(3).getReg();
5582 Register NumRecords = MI.getOperand(4).getReg();
5583 Register Flags = MI.getOperand(5).getReg();
5584
5585 LLT S32 = LLT::scalar(32);
5586
5587 B.setInsertPt(B.getMBB(), ++B.getInsertPt());
5588 auto Unmerge = B.buildUnmerge(S32, Pointer);
5589 Register LowHalf = Unmerge.getReg(0);
5590 Register HighHalf = Unmerge.getReg(1);
5591
5592 auto AndMask = B.buildConstant(S32, 0x0000ffff);
5593 auto Masked = B.buildAnd(S32, HighHalf, AndMask);
5594
5595 MachineInstrBuilder NewHighHalf = Masked;
5596 std::optional<ValueAndVReg> StrideConst =
5598 if (!StrideConst || !StrideConst->Value.isZero()) {
5599 MachineInstrBuilder ShiftedStride;
5600 if (StrideConst) {
5601 uint32_t StrideVal = StrideConst->Value.getZExtValue();
5602 uint32_t ShiftedStrideVal = StrideVal << 16;
5603 ShiftedStride = B.buildConstant(S32, ShiftedStrideVal);
5604 } else {
5605 auto ExtStride = B.buildAnyExt(S32, Stride);
5606 auto ShiftConst = B.buildConstant(S32, 16);
5607 ShiftedStride = B.buildShl(S32, ExtStride, ShiftConst);
5608 }
5609 NewHighHalf = B.buildOr(S32, Masked, ShiftedStride);
5610 }
5611 Register NewHighHalfReg = NewHighHalf.getReg(0);
5612 B.buildMergeValues(Result, {LowHalf, NewHighHalfReg, NumRecords, Flags});
5613 MI.eraseFromParent();
5614 return true;
5615}
5616
5619 MachineIRBuilder &B) const {
5620 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
5621 if (!MFI->isEntryFunction()) {
5624 }
5625
5626 Register DstReg = MI.getOperand(0).getReg();
5627 if (!getImplicitArgPtr(DstReg, MRI, B))
5628 return false;
5629
5630 MI.eraseFromParent();
5631 return true;
5632}
5633
5636 MachineIRBuilder &B) const {
5637 Function &F = B.getMF().getFunction();
5638 std::optional<uint32_t> KnownSize =
5640 if (KnownSize.has_value())
5641 B.buildConstant(DstReg, *KnownSize);
5642 return false;
5643}
5644
5647 MachineIRBuilder &B) const {
5648
5649 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
5650 if (!MFI->isEntryFunction()) {
5653 }
5654
5655 Register DstReg = MI.getOperand(0).getReg();
5656 if (!getLDSKernelId(DstReg, MRI, B))
5657 return false;
5658
5659 MI.eraseFromParent();
5660 return true;
5661}
5662
5666 unsigned AddrSpace) const {
5667 Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B);
5668 auto Unmerge = B.buildUnmerge(LLT::scalar(32), MI.getOperand(2).getReg());
5669 Register Hi32 = Unmerge.getReg(1);
5670
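  // A flat pointer lies in the queried segment exactly when its high 32 bits
  // match that segment's aperture base, so it suffices to compare the high
  // dword against the aperture register.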
5671 B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg);
5672 MI.eraseFromParent();
5673 return true;
5674}
5675
5676// The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args:
5677// offset (the offset that is included in bounds checking and swizzling, to be
5678// split between the instruction's voffset and immoffset fields) and soffset
5679// (the offset that is excluded from bounds checking and swizzling, to go in
5680// the instruction's soffset field). This function takes the first kind of
5681// offset and figures out how to split it between voffset and immoffset.
5682std::pair<Register, unsigned>
5683AMDGPULegalizerInfo::splitBufferOffsets(MachineIRBuilder &B,
5684                                        Register OrigOffset) const {
5685 const unsigned MaxImm = SIInstrInfo::getMaxMUBUFImmOffset(ST);
5686 Register BaseReg;
5687 unsigned ImmOffset;
5688 const LLT S32 = LLT::scalar(32);
5689 MachineRegisterInfo &MRI = *B.getMRI();
5690
5691 std::tie(BaseReg, ImmOffset) =
5693
5694 // If BaseReg is a pointer, convert it to int.
5695 if (MRI.getType(BaseReg).isPointer())
5696 BaseReg = B.buildPtrToInt(MRI.getType(OrigOffset), BaseReg).getReg(0);
5697
5698 // If the immediate value is too big for the immoffset field, put only bits
5699 // that would normally fit in the immoffset field. The remaining value that
5700 // is copied/added for the voffset field is a large power of 2, and it
5701 // stands more chance of being CSEd with the copy/add for another similar
5702 // load/store.
5703 // However, do not do that rounding down if that is a negative
5704 // number, as it appears to be illegal to have a negative offset in the
5705 // vgpr, even if adding the immediate offset makes it positive.
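  // For example, with a 12-bit immediate field (MaxImm == 4095), an incoming
  // offset of 8195 is split into ImmOffset == 3 and an extra voffset add of
  // 8192, a power of two that has a good chance of being CSEd with nearby
  // accesses.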
5706 unsigned Overflow = ImmOffset & ~MaxImm;
5707 ImmOffset -= Overflow;
5708 if ((int32_t)Overflow < 0) {
5709 Overflow += ImmOffset;
5710 ImmOffset = 0;
5711 }
5712
5713 if (Overflow != 0) {
5714 if (!BaseReg) {
5715 BaseReg = B.buildConstant(S32, Overflow).getReg(0);
5716 } else {
5717 auto OverflowVal = B.buildConstant(S32, Overflow);
5718 BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0);
5719 }
5720 }
5721
5722 if (!BaseReg)
5723 BaseReg = B.buildConstant(S32, 0).getReg(0);
5724
5725 return std::pair(BaseReg, ImmOffset);
5726}
5727
5728/// Handle register layout difference for f16 images for some subtargets.
5729Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B,
5730                                             MachineRegisterInfo &MRI,
5731                                             Register Reg,
5732 bool ImageStore) const {
5733 const LLT S16 = LLT::scalar(16);
5734 const LLT S32 = LLT::scalar(32);
5735 LLT StoreVT = MRI.getType(Reg);
5736 assert(StoreVT.isVector() && StoreVT.getElementType() == S16);
5737
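  // On subtargets with unpacked D16 VMEM, each 16-bit element is widened into
  // the low half of its own 32-bit register; packed subtargets keep two
  // elements per dword and only need fixups for the image-store D16 bug or for
  // odd element counts.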
5738 if (ST.hasUnpackedD16VMem()) {
5739 auto Unmerge = B.buildUnmerge(S16, Reg);
5740
5741 SmallVector<Register, 4> WideRegs;
5742 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
5743 WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0));
5744
5745 int NumElts = StoreVT.getNumElements();
5746
5747 return B.buildBuildVector(LLT::fixed_vector(NumElts, S32), WideRegs)
5748 .getReg(0);
5749 }
5750
5751 if (ImageStore && ST.hasImageStoreD16Bug()) {
5752 if (StoreVT.getNumElements() == 2) {
5753 SmallVector<Register, 4> PackedRegs;
5754 Reg = B.buildBitcast(S32, Reg).getReg(0);
5755 PackedRegs.push_back(Reg);
5756 PackedRegs.resize(2, B.buildUndef(S32).getReg(0));
5757 return B.buildBuildVector(LLT::fixed_vector(2, S32), PackedRegs)
5758 .getReg(0);
5759 }
5760
5761 if (StoreVT.getNumElements() == 3) {
5762 SmallVector<Register, 4> PackedRegs;
5763 auto Unmerge = B.buildUnmerge(S16, Reg);
5764 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
5765 PackedRegs.push_back(Unmerge.getReg(I));
5766 PackedRegs.resize(6, B.buildUndef(S16).getReg(0));
5767 Reg = B.buildBuildVector(LLT::fixed_vector(6, S16), PackedRegs).getReg(0);
5768 return B.buildBitcast(LLT::fixed_vector(3, S32), Reg).getReg(0);
5769 }
5770
5771 if (StoreVT.getNumElements() == 4) {
5772 SmallVector<Register, 4> PackedRegs;
5773 Reg = B.buildBitcast(LLT::fixed_vector(2, S32), Reg).getReg(0);
5774 auto Unmerge = B.buildUnmerge(S32, Reg);
5775 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
5776 PackedRegs.push_back(Unmerge.getReg(I));
5777 PackedRegs.resize(4, B.buildUndef(S32).getReg(0));
5778 return B.buildBuildVector(LLT::fixed_vector(4, S32), PackedRegs)
5779 .getReg(0);
5780 }
5781
5782 llvm_unreachable("invalid data type");
5783 }
5784
5785 if (StoreVT == LLT::fixed_vector(3, S16)) {
5786 Reg = B.buildPadVectorWithUndefElements(LLT::fixed_vector(4, S16), Reg)
5787 .getReg(0);
5788 }
5789 return Reg;
5790}
5791
5793 MachineIRBuilder &B, Register VData, bool IsFormat) const {
5794 MachineRegisterInfo *MRI = B.getMRI();
5795 LLT Ty = MRI->getType(VData);
5796
5797 const LLT S16 = LLT::scalar(16);
5798
5799  // Fix up buffer resources that themselves need to be cast to v4i32.
5801 return castBufferRsrcToV4I32(VData, B);
5802
5803 // Fixup illegal register types for i8 stores.
5804 if (Ty == LLT::scalar(8) || Ty == S16) {
5805 Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0);
5806 return AnyExt;
5807 }
5808
5809 if (Ty.isVector()) {
5810 if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) {
5811 if (IsFormat)
5812 return handleD16VData(B, *MRI, VData);
5813 }
5814 }
5815
5816 return VData;
5817}
5818
5822 bool IsTyped,
5823 bool IsFormat) const {
5824 Register VData = MI.getOperand(1).getReg();
5825 LLT Ty = MRI.getType(VData);
5826 LLT EltTy = Ty.getScalarType();
5827 const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
5828 const LLT S32 = LLT::scalar(32);
5829
5830 VData = fixStoreSourceType(B, VData, IsFormat);
5832 Register RSrc = MI.getOperand(2).getReg();
5833
5834 MachineMemOperand *MMO = *MI.memoperands_begin();
5835 const int MemSize = MMO->getSize().getValue();
5836
5837 unsigned ImmOffset;
5838
5839 // The typed intrinsics add an immediate after the registers.
5840 const unsigned NumVIndexOps = IsTyped ? 8 : 7;
5841
5842 // The struct intrinsic variants add one additional operand over raw.
5843 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
5844 Register VIndex;
5845 int OpOffset = 0;
5846 if (HasVIndex) {
5847 VIndex = MI.getOperand(3).getReg();
5848 OpOffset = 1;
5849 } else {
5850 VIndex = B.buildConstant(S32, 0).getReg(0);
5851 }
5852
5853 Register VOffset = MI.getOperand(3 + OpOffset).getReg();
5854 Register SOffset = MI.getOperand(4 + OpOffset).getReg();
5855
5856 unsigned Format = 0;
5857 if (IsTyped) {
5858 Format = MI.getOperand(5 + OpOffset).getImm();
5859 ++OpOffset;
5860 }
5861
5862 unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
5863
5864 std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset);
5865
5866 unsigned Opc;
5867 if (IsTyped) {
5868 Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16 :
5869 AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT;
5870 } else if (IsFormat) {
5871 Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16 :
5872 AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT;
5873 } else {
5874 switch (MemSize) {
5875 case 1:
5876 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE;
5877 break;
5878 case 2:
5879 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT;
5880 break;
5881 default:
5882 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE;
5883 break;
5884 }
5885 }
5886
5887 auto MIB = B.buildInstr(Opc)
5888 .addUse(VData) // vdata
5889 .addUse(RSrc) // rsrc
5890 .addUse(VIndex) // vindex
5891 .addUse(VOffset) // voffset
5892 .addUse(SOffset) // soffset
5893 .addImm(ImmOffset); // offset(imm)
5894
5895 if (IsTyped)
5896 MIB.addImm(Format);
5897
5898 MIB.addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm)
5899 .addImm(HasVIndex ? -1 : 0) // idxen(imm)
5900 .addMemOperand(MMO);
5901
5902 MI.eraseFromParent();
5903 return true;
5904}
5905
5906static void buildBufferLoad(unsigned Opc, Register LoadDstReg, Register RSrc,
5907 Register VIndex, Register VOffset, Register SOffset,
5908 unsigned ImmOffset, unsigned Format,
5909 unsigned AuxiliaryData, MachineMemOperand *MMO,
5910 bool IsTyped, bool HasVIndex, MachineIRBuilder &B) {
5911 auto MIB = B.buildInstr(Opc)
5912 .addDef(LoadDstReg) // vdata
5913 .addUse(RSrc) // rsrc
5914 .addUse(VIndex) // vindex
5915 .addUse(VOffset) // voffset
5916 .addUse(SOffset) // soffset
5917 .addImm(ImmOffset); // offset(imm)
5918
5919 if (IsTyped)
5920 MIB.addImm(Format);
5921
5922 MIB.addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm)
5923 .addImm(HasVIndex ? -1 : 0) // idxen(imm)
5924 .addMemOperand(MMO);
5925}
5926
5930 bool IsFormat,
5931 bool IsTyped) const {
5932 // FIXME: Verifier should enforce 1 MMO for these intrinsics.
5933 MachineMemOperand *MMO = *MI.memoperands_begin();
5934 const LLT MemTy = MMO->getMemoryType();
5935 const LLT S32 = LLT::scalar(32);
5936
5937 Register Dst = MI.getOperand(0).getReg();
5938
5939 Register StatusDst;
5940 int OpOffset = 0;
5941 assert(MI.getNumExplicitDefs() == 1 || MI.getNumExplicitDefs() == 2);
5942 bool IsTFE = MI.getNumExplicitDefs() == 2;
5943 if (IsTFE) {
5944 StatusDst = MI.getOperand(1).getReg();
5945 ++OpOffset;
5946 }
5947
5948 castBufferRsrcArgToV4I32(MI, B, 2 + OpOffset);
5949 Register RSrc = MI.getOperand(2 + OpOffset).getReg();
5950
5951 // The typed intrinsics add an immediate after the registers.
5952 const unsigned NumVIndexOps = IsTyped ? 8 : 7;
5953
5954 // The struct intrinsic variants add one additional operand over raw.
5955 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps + OpOffset;
5956 Register VIndex;
5957 if (HasVIndex) {
5958 VIndex = MI.getOperand(3 + OpOffset).getReg();
5959 ++OpOffset;
5960 } else {
5961 VIndex = B.buildConstant(S32, 0).getReg(0);
5962 }
5963
5964 Register VOffset = MI.getOperand(3 + OpOffset).getReg();
5965 Register SOffset = MI.getOperand(4 + OpOffset).getReg();
5966
5967 unsigned Format = 0;
5968 if (IsTyped) {
5969 Format = MI.getOperand(5 + OpOffset).getImm();
5970 ++OpOffset;
5971 }
5972
5973 unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
5974 unsigned ImmOffset;
5975
5976 LLT Ty = MRI.getType(Dst);
5977 // Make addrspace 8 pointers loads into 4xs32 loads here, so the rest of the
5978 // logic doesn't have to handle that case.
5979 if (hasBufferRsrcWorkaround(Ty)) {
5980 Ty = castBufferRsrcFromV4I32(MI, B, MRI, 0);
5981 Dst = MI.getOperand(0).getReg();
5982 }
5983 LLT EltTy = Ty.getScalarType();
5984 const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
5985 const bool Unpacked = ST.hasUnpackedD16VMem();
5986
5987 std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset);
5988
5989 unsigned Opc;
5990
5991 // TODO: Support TFE for typed and narrow loads.
5992 if (IsTyped) {
5993 if (IsTFE)
5994 return false;
5995 Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 :
5996 AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT;
5997 } else if (IsFormat) {
5998 if (IsD16) {
5999 if (IsTFE)
6000 return false;
6001 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16;
6002 } else {
6003 Opc = IsTFE ? AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_TFE
6004 : AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT;
6005 }
6006 } else {
6007 switch (MemTy.getSizeInBits()) {
6008 case 8:
6009 Opc = IsTFE ? AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE_TFE
6010 : AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE;
6011 break;
6012 case 16:
6013 Opc = IsTFE ? AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT_TFE
6014 : AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT;
6015 break;
6016 default:
6017 Opc = IsTFE ? AMDGPU::G_AMDGPU_BUFFER_LOAD_TFE
6018 : AMDGPU::G_AMDGPU_BUFFER_LOAD;
6019 break;
6020 }
6021 }
6022
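  // With TFE the instruction produces one extra status dword after the data,
  // so load NumValueDWords + 1 dwords into a temporary register and then
  // unmerge the data part and the status destination separately.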
6023 if (IsTFE) {
6024 unsigned NumValueDWords = divideCeil(Ty.getSizeInBits(), 32);
6025 unsigned NumLoadDWords = NumValueDWords + 1;
6026 LLT LoadTy = LLT::fixed_vector(NumLoadDWords, S32);
6027 Register LoadDstReg = B.getMRI()->createGenericVirtualRegister(LoadTy);
6028 buildBufferLoad(Opc, LoadDstReg, RSrc, VIndex, VOffset, SOffset, ImmOffset,
6029 Format, AuxiliaryData, MMO, IsTyped, HasVIndex, B);
6030 if (MemTy.getSizeInBits() < 32) {
6031 Register ExtDst = B.getMRI()->createGenericVirtualRegister(S32);
6032 B.buildUnmerge({ExtDst, StatusDst}, LoadDstReg);
6033 B.buildTrunc(Dst, ExtDst);
6034 } else if (NumValueDWords == 1) {
6035 B.buildUnmerge({Dst, StatusDst}, LoadDstReg);
6036 } else {
6037 SmallVector<Register, 5> LoadElts;
6038 for (unsigned I = 0; I != NumValueDWords; ++I)
6039 LoadElts.push_back(B.getMRI()->createGenericVirtualRegister(S32));
6040 LoadElts.push_back(StatusDst);
6041 B.buildUnmerge(LoadElts, LoadDstReg);
6042 LoadElts.truncate(NumValueDWords);
6043 B.buildMergeLikeInstr(Dst, LoadElts);
6044 }
6045 } else if ((!IsD16 && MemTy.getSizeInBits() < 32) ||
6046 (IsD16 && !Ty.isVector())) {
6047 Register LoadDstReg = B.getMRI()->createGenericVirtualRegister(S32);
6048 buildBufferLoad(Opc, LoadDstReg, RSrc, VIndex, VOffset, SOffset, ImmOffset,
6049 Format, AuxiliaryData, MMO, IsTyped, HasVIndex, B);
6050 B.setInsertPt(B.getMBB(), ++B.getInsertPt());
6051 B.buildTrunc(Dst, LoadDstReg);
6052 } else if (Unpacked && IsD16 && Ty.isVector()) {
6053 LLT UnpackedTy = Ty.changeElementSize(32);
6054 Register LoadDstReg = B.getMRI()->createGenericVirtualRegister(UnpackedTy);
6055 buildBufferLoad(Opc, LoadDstReg, RSrc, VIndex, VOffset, SOffset, ImmOffset,
6056 Format, AuxiliaryData, MMO, IsTyped, HasVIndex, B);
6057 B.setInsertPt(B.getMBB(), ++B.getInsertPt());
6058 // FIXME: G_TRUNC should work, but legalization currently fails
6059 auto Unmerge = B.buildUnmerge(S32, LoadDstReg);
6061 for (unsigned I = 0, N = Unmerge->getNumOperands() - 1; I != N; ++I)
6062 Repack.push_back(B.buildTrunc(EltTy, Unmerge.getReg(I)).getReg(0));
6063 B.buildMergeLikeInstr(Dst, Repack);
6064 } else {
6065 buildBufferLoad(Opc, Dst, RSrc, VIndex, VOffset, SOffset, ImmOffset, Format,
6066 AuxiliaryData, MMO, IsTyped, HasVIndex, B);
6067 }
6068
6069 MI.eraseFromParent();
6070 return true;
6071}
6072
6073static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) {
6074 switch (IntrID) {
6075 case Intrinsic::amdgcn_raw_buffer_atomic_swap:
6076 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_swap:
6077 case Intrinsic::amdgcn_struct_buffer_atomic_swap:
6078 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_swap:
6079 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP;
6080 case Intrinsic::amdgcn_raw_buffer_atomic_add:
6081 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_add:
6082 case Intrinsic::amdgcn_struct_buffer_atomic_add:
6083 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_add:
6084 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD;
6085 case Intrinsic::amdgcn_raw_buffer_atomic_sub:
6086 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub:
6087 case Intrinsic::amdgcn_struct_buffer_atomic_sub:
6088 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub:
6089 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB;
6090 case Intrinsic::amdgcn_raw_buffer_atomic_smin:
6091 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smin:
6092 case Intrinsic::amdgcn_struct_buffer_atomic_smin:
6093 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smin:
6094 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN;
6095 case Intrinsic::amdgcn_raw_buffer_atomic_umin:
6096 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umin:
6097 case Intrinsic::amdgcn_struct_buffer_atomic_umin:
6098 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umin:
6099 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN;
6100 case Intrinsic::amdgcn_raw_buffer_atomic_smax:
6101 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smax:
6102 case Intrinsic::amdgcn_struct_buffer_atomic_smax:
6103 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smax:
6104 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX;
6105 case Intrinsic::amdgcn_raw_buffer_atomic_umax:
6106 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umax:
6107 case Intrinsic::amdgcn_struct_buffer_atomic_umax:
6108 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umax:
6109 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX;
6110 case Intrinsic::amdgcn_raw_buffer_atomic_and:
6111 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_and:
6112 case Intrinsic::amdgcn_struct_buffer_atomic_and:
6113 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_and:
6114 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND;
6115 case Intrinsic::amdgcn_raw_buffer_atomic_or:
6116 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_or:
6117 case Intrinsic::amdgcn_struct_buffer_atomic_or:
6118 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_or:
6119 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR;
6120 case Intrinsic::amdgcn_raw_buffer_atomic_xor:
6121 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_xor:
6122 case Intrinsic::amdgcn_struct_buffer_atomic_xor:
6123 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_xor:
6124 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR;
6125 case Intrinsic::amdgcn_raw_buffer_atomic_inc:
6126 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_inc:
6127 case Intrinsic::amdgcn_struct_buffer_atomic_inc:
6128 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_inc:
6129 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC;
6130 case Intrinsic::amdgcn_raw_buffer_atomic_dec:
6131 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_dec:
6132 case Intrinsic::amdgcn_struct_buffer_atomic_dec:
6133 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_dec:
6134 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC;
6135 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
6136 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap:
6137 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
6138 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cmpswap:
6139 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP;
6140 case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
6141 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fadd:
6142 case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
6143 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fadd:
6144 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD;
6145 case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
6146 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmin:
6147 case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
6148 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmin:
6149 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN;
6150 case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
6151 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmax:
6152 case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
6153 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmax:
6154 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX;
6155 case Intrinsic::amdgcn_raw_buffer_atomic_cond_sub_u32:
6156 case Intrinsic::amdgcn_struct_buffer_atomic_cond_sub_u32:
6157 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_COND_SUB_U32;
6158 default:
6159 llvm_unreachable("unhandled atomic opcode");
6160 }
6161}
6162
6165 Intrinsic::ID IID) const {
6166 const bool IsCmpSwap =
6167 IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap ||
6168 IID == Intrinsic::amdgcn_struct_buffer_atomic_cmpswap ||
6169 IID == Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap ||
6170 IID == Intrinsic::amdgcn_struct_ptr_buffer_atomic_cmpswap;
6171
6172 Register Dst = MI.getOperand(0).getReg();
6173  // Since we don't have 128-bit atomics, we don't need to handle the case of
6174  // p8 arguments to the atomic itself.
6175 Register VData = MI.getOperand(2).getReg();
6176
6177 Register CmpVal;
6178 int OpOffset = 0;
6179
6180 if (IsCmpSwap) {
6181 CmpVal = MI.getOperand(3).getReg();
6182 ++OpOffset;
6183 }
6184
6185 castBufferRsrcArgToV4I32(MI, B, 3 + OpOffset);
6186 Register RSrc = MI.getOperand(3 + OpOffset).getReg();
6187 const unsigned NumVIndexOps = IsCmpSwap ? 9 : 8;
6188
6189 // The struct intrinsic variants add one additional operand over raw.
6190 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
6191 Register VIndex;
6192 if (HasVIndex) {
6193 VIndex = MI.getOperand(4 + OpOffset).getReg();
6194 ++OpOffset;
6195 } else {
6196 VIndex = B.buildConstant(LLT::scalar(32), 0).getReg(0);
6197 }
6198
6199 Register VOffset = MI.getOperand(4 + OpOffset).getReg();
6200 Register SOffset = MI.getOperand(5 + OpOffset).getReg();
6201 unsigned AuxiliaryData = MI.getOperand(6 + OpOffset).getImm();
6202
6203 MachineMemOperand *MMO = *MI.memoperands_begin();
6204
6205 unsigned ImmOffset;
6206 std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset);
6207
6208 auto MIB = B.buildInstr(getBufferAtomicPseudo(IID))
6209 .addDef(Dst)
6210 .addUse(VData); // vdata
6211
6212 if (IsCmpSwap)
6213 MIB.addReg(CmpVal);
6214
6215 MIB.addUse(RSrc) // rsrc
6216 .addUse(VIndex) // vindex
6217 .addUse(VOffset) // voffset
6218 .addUse(SOffset) // soffset
6219 .addImm(ImmOffset) // offset(imm)
6220 .addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm)
6221 .addImm(HasVIndex ? -1 : 0) // idxen(imm)
6222 .addMemOperand(MMO);
6223
6224 MI.eraseFromParent();
6225 return true;
6226}
6227
6228/// Turn a set of s16 typed registers in \p AddrRegs into a dword sized
6229/// vector with s16 typed elements.
6231 SmallVectorImpl<Register> &PackedAddrs,
6232 unsigned ArgOffset,
6234 bool IsA16, bool IsG16) {
6235 const LLT S16 = LLT::scalar(16);
6236 const LLT V2S16 = LLT::fixed_vector(2, 16);
6237 auto EndIdx = Intr->VAddrEnd;
6238
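  // For example, a 2D sample with A16 packs the (u, v) coordinates into a
  // single <2 x s16> register, while a trailing unpaired 16-bit operand is
  // packed together with undef.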
6239 for (unsigned I = Intr->VAddrStart; I < EndIdx; I++) {
6240 MachineOperand &SrcOp = MI.getOperand(ArgOffset + I);
6241 if (!SrcOp.isReg())
6242 continue; // _L to _LZ may have eliminated this.
6243
6244 Register AddrReg = SrcOp.getReg();
6245
6246 if ((I < Intr->GradientStart) ||
6247 (I >= Intr->GradientStart && I < Intr->CoordStart && !IsG16) ||
6248 (I >= Intr->CoordStart && !IsA16)) {
6249 if ((I < Intr->GradientStart) && IsA16 &&
6250 (B.getMRI()->getType(AddrReg) == S16)) {
6251 assert(I == Intr->BiasIndex && "Got unexpected 16-bit extra argument");
6252 // Special handling of bias when A16 is on. Bias is of type half but
6253        // occupies a full 32 bits.
6254 PackedAddrs.push_back(
6255 B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)})
6256 .getReg(0));
6257 } else {
6258 assert((!IsA16 || Intr->NumBiasArgs == 0 || I != Intr->BiasIndex) &&
6259 "Bias needs to be converted to 16 bit in A16 mode");
6260 // Handle any gradient or coordinate operands that should not be packed
6261 AddrReg = B.buildBitcast(V2S16, AddrReg).getReg(0);
6262 PackedAddrs.push_back(AddrReg);
6263 }
6264 } else {
6265 // Dz/dh, dz/dv and the last odd coord are packed with undef. Also, in 1D,
6266 // derivatives dx/dh and dx/dv are packed with undef.
6267 if (((I + 1) >= EndIdx) ||
6268 ((Intr->NumGradients / 2) % 2 == 1 &&
6269 (I == static_cast<unsigned>(Intr->GradientStart +
6270 (Intr->NumGradients / 2) - 1) ||
6271 I == static_cast<unsigned>(Intr->GradientStart +
6272 Intr->NumGradients - 1))) ||
6273 // Check for _L to _LZ optimization
6274 !MI.getOperand(ArgOffset + I + 1).isReg()) {
6275 PackedAddrs.push_back(
6276 B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)})
6277 .getReg(0));
6278 } else {
6279 PackedAddrs.push_back(
6280 B.buildBuildVector(
6281 V2S16, {AddrReg, MI.getOperand(ArgOffset + I + 1).getReg()})
6282 .getReg(0));
6283 ++I;
6284 }
6285 }
6286 }
6287}
6288
6289/// Convert from separate vaddr components to a single vector address register,
6290/// and replace the remaining operands with $noreg.
6292 int DimIdx, int NumVAddrs) {
6293 const LLT S32 = LLT::scalar(32);
6294 (void)S32;
6295 SmallVector<Register, 8> AddrRegs;
6296 for (int I = 0; I != NumVAddrs; ++I) {
6297 MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
6298 if (SrcOp.isReg()) {
6299 AddrRegs.push_back(SrcOp.getReg());
6300 assert(B.getMRI()->getType(SrcOp.getReg()) == S32);
6301 }
6302 }
6303
6304 int NumAddrRegs = AddrRegs.size();
6305 if (NumAddrRegs != 1) {
6306 auto VAddr =
6307 B.buildBuildVector(LLT::fixed_vector(NumAddrRegs, 32), AddrRegs);
6308 MI.getOperand(DimIdx).setReg(VAddr.getReg(0));
6309 }
6310
6311 for (int I = 1; I != NumVAddrs; ++I) {
6312 MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
6313 if (SrcOp.isReg())
6314 MI.getOperand(DimIdx + I).setReg(AMDGPU::NoRegister);
6315 }
6316}
6317
6318/// Rewrite image intrinsics to use register layouts expected by the subtarget.
6319///
6320/// Depending on the subtarget, load/store with 16-bit element data need to be
6321/// rewritten to use the low half of 32-bit registers, or directly use a packed
6322/// layout. 16-bit addresses should also sometimes be packed into 32-bit
6323/// registers.
6324///
6325/// We don't want to directly select image instructions just yet, but also want
6326/// to expose all register repacking to the legalizer/combiners. We also don't
6327/// want a selected instruction entering RegBankSelect. In order to avoid
6328/// defining a multitude of intermediate image instructions, directly hack on
6329/// the intrinsic's arguments. In cases like a16 addresses, this requires
6330/// padding now-unnecessary arguments with $noreg.
6333 const AMDGPU::ImageDimIntrinsicInfo *Intr) const {
6334
6335 const MachineFunction &MF = *MI.getMF();
6336 const unsigned NumDefs = MI.getNumExplicitDefs();
6337 const unsigned ArgOffset = NumDefs + 1;
6338 bool IsTFE = NumDefs == 2;
6339 // We are only processing the operands of d16 image operations on subtargets
6340 // that use the unpacked register layout, or need to repack the TFE result.
6341
6342 // TODO: Do we need to guard against already legalized intrinsics?
6343 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
6345
6346 MachineRegisterInfo *MRI = B.getMRI();
6347 const LLT S32 = LLT::scalar(32);
6348 const LLT S16 = LLT::scalar(16);
6349 const LLT V2S16 = LLT::fixed_vector(2, 16);
6350
6351 unsigned DMask = 0;
6352 Register VData;
6353 LLT Ty;
6354
6355 if (!BaseOpcode->NoReturn || BaseOpcode->Store) {
6356 VData = MI.getOperand(NumDefs == 0 ? 1 : 0).getReg();
6357 Ty = MRI->getType(VData);
6358 }
6359
6360 const bool IsAtomicPacked16Bit =
6361 (BaseOpcode->BaseOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_F16 ||
6362 BaseOpcode->BaseOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_BF16);
6363
6364 // Check for 16 bit addresses and pack if true.
6365 LLT GradTy =
6366 MRI->getType(MI.getOperand(ArgOffset + Intr->GradientStart).getReg());
6367 LLT AddrTy =
6368 MRI->getType(MI.getOperand(ArgOffset + Intr->CoordStart).getReg());
6369 const bool IsG16 =
6370 ST.hasG16() ? (BaseOpcode->Gradients && GradTy == S16) : GradTy == S16;
6371 const bool IsA16 = AddrTy == S16;
6372 const bool IsD16 = !IsAtomicPacked16Bit && Ty.getScalarType() == S16;
6373
6374 int DMaskLanes = 0;
6375 if (!BaseOpcode->Atomic) {
6376 DMask = MI.getOperand(ArgOffset + Intr->DMaskIndex).getImm();
6377 if (BaseOpcode->Gather4) {
6378 DMaskLanes = 4;
6379 } else if (DMask != 0) {
6380 DMaskLanes = llvm::popcount(DMask);
6381 } else if (!IsTFE && !BaseOpcode->Store) {
6382 // If dmask is 0, this is a no-op load. This can be eliminated.
6383 B.buildUndef(MI.getOperand(0));
6384 MI.eraseFromParent();
6385 return true;
6386 }
6387 }
6388
6389 Observer.changingInstr(MI);
6390 auto ChangedInstr = make_scope_exit([&] { Observer.changedInstr(MI); });
6391
6392 const unsigned StoreOpcode = IsD16 ? AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE_D16
6393 : AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE;
6394 const unsigned LoadOpcode = IsD16 ? AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_D16
6395 : AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD;
6396 unsigned NewOpcode = LoadOpcode;
6397 if (BaseOpcode->Store)
6398 NewOpcode = StoreOpcode;
6399 else if (BaseOpcode->NoReturn)
6400 NewOpcode = AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_NORET;
6401
6402 // Track that we legalized this
6403 MI.setDesc(B.getTII().get(NewOpcode));
6404
6405  // Expecting to get an error flag since TFC is on and dmask is 0. Force
6406  // dmask to be at least 1, otherwise the instruction will fail.
6407 if (IsTFE && DMask == 0) {
6408 DMask = 0x1;
6409 DMaskLanes = 1;
6410 MI.getOperand(ArgOffset + Intr->DMaskIndex).setImm(DMask);
6411 }
6412
6413 if (BaseOpcode->Atomic) {
6414 Register VData0 = MI.getOperand(2).getReg();
6415 LLT Ty = MRI->getType(VData0);
6416
6417 // TODO: Allow atomic swap and bit ops for v2s16/v4s16
6418 if (Ty.isVector() && !IsAtomicPacked16Bit)
6419 return false;
6420
6421 if (BaseOpcode->AtomicX2) {
6422 Register VData1 = MI.getOperand(3).getReg();
6423 // The two values are packed in one register.
6424 LLT PackedTy = LLT::fixed_vector(2, Ty);
6425 auto Concat = B.buildBuildVector(PackedTy, {VData0, VData1});
6426 MI.getOperand(2).setReg(Concat.getReg(0));
6427 MI.getOperand(3).setReg(AMDGPU::NoRegister);
6428 }
6429 }
6430
6431 unsigned CorrectedNumVAddrs = Intr->NumVAddrs;
6432
6433 // Rewrite the addressing register layout before doing anything else.
6434 if (BaseOpcode->Gradients && !ST.hasG16() && (IsA16 != IsG16)) {
6435 // 16 bit gradients are supported, but are tied to the A16 control
6436 // so both gradients and addresses must be 16 bit
6437 return false;
6438 }
6439
6440 if (IsA16 && !ST.hasA16()) {
6441 // A16 not supported
6442 return false;
6443 }
6444
6445 const unsigned NSAMaxSize = ST.getNSAMaxSize(BaseOpcode->Sampler);
6446 const unsigned HasPartialNSA = ST.hasPartialNSAEncoding();
6447
6448 if (IsA16 || IsG16) {
6449 // Even if NumVAddrs == 1 we should pack it into a 32-bit value, because the
6450 // instructions expect VGPR_32
6451 SmallVector<Register, 4> PackedRegs;
6452
6453 packImage16bitOpsToDwords(B, MI, PackedRegs, ArgOffset, Intr, IsA16, IsG16);
6454
6455 // See also below in the non-a16 branch
6456 const bool UseNSA = ST.hasNSAEncoding() &&
6457 PackedRegs.size() >= ST.getNSAThreshold(MF) &&
6458 (PackedRegs.size() <= NSAMaxSize || HasPartialNSA);
6459 const bool UsePartialNSA =
6460 UseNSA && HasPartialNSA && PackedRegs.size() > NSAMaxSize;
6461
6462 if (UsePartialNSA) {
6463 // Pack registers that would go over NSAMaxSize into last VAddr register
6464 LLT PackedAddrTy =
6465 LLT::fixed_vector(2 * (PackedRegs.size() - NSAMaxSize + 1), 16);
6466 auto Concat = B.buildConcatVectors(
6467 PackedAddrTy, ArrayRef(PackedRegs).slice(NSAMaxSize - 1));
6468 PackedRegs[NSAMaxSize - 1] = Concat.getReg(0);
6469 PackedRegs.resize(NSAMaxSize);
6470 } else if (!UseNSA && PackedRegs.size() > 1) {
6471 LLT PackedAddrTy = LLT::fixed_vector(2 * PackedRegs.size(), 16);
6472 auto Concat = B.buildConcatVectors(PackedAddrTy, PackedRegs);
6473 PackedRegs[0] = Concat.getReg(0);
6474 PackedRegs.resize(1);
6475 }
6476
6477 const unsigned NumPacked = PackedRegs.size();
6478 for (unsigned I = Intr->VAddrStart; I < Intr->VAddrEnd; I++) {
6479 MachineOperand &SrcOp = MI.getOperand(ArgOffset + I);
6480 if (!SrcOp.isReg()) {
6481 assert(SrcOp.isImm() && SrcOp.getImm() == 0);
6482 continue;
6483 }
6484
6485 assert(SrcOp.getReg() != AMDGPU::NoRegister);
6486
6487 if (I - Intr->VAddrStart < NumPacked)
6488 SrcOp.setReg(PackedRegs[I - Intr->VAddrStart]);
6489 else
6490 SrcOp.setReg(AMDGPU::NoRegister);
6491 }
6492 } else {
6493 // If the register allocator cannot place the address registers contiguously
6494 // without introducing moves, then using the non-sequential address encoding
6495 // is always preferable, since it saves VALU instructions and is usually a
6496 // wash in terms of code size or even better.
6497 //
6498 // However, we currently have no way of hinting to the register allocator
6499 // that MIMG addresses should be placed contiguously when it is possible to
6500 // do so, so force non-NSA for the common 2-address case as a heuristic.
6501 //
6502 // SIShrinkInstructions will convert NSA encodings to non-NSA after register
6503 // allocation when possible.
6504 //
6505 // Partial NSA is allowed on GFX11+ where the final register is a contiguous
6506 // set of the remaining addresses.
6507 const bool UseNSA = ST.hasNSAEncoding() &&
6508 CorrectedNumVAddrs >= ST.getNSAThreshold(MF) &&
6509 (CorrectedNumVAddrs <= NSAMaxSize || HasPartialNSA);
6510 const bool UsePartialNSA =
6511 UseNSA && HasPartialNSA && CorrectedNumVAddrs > NSAMaxSize;
6512
6513 if (UsePartialNSA) {
6515 ArgOffset + Intr->VAddrStart + NSAMaxSize - 1,
6516 Intr->NumVAddrs - NSAMaxSize + 1);
6517 } else if (!UseNSA && Intr->NumVAddrs > 1) {
6518 convertImageAddrToPacked(B, MI, ArgOffset + Intr->VAddrStart,
6519 Intr->NumVAddrs);
6520 }
6521 }
6522
6523 int Flags = 0;
6524 if (IsA16)
6525 Flags |= 1;
6526 if (IsG16)
6527 Flags |= 2;
6528 MI.addOperand(MachineOperand::CreateImm(Flags));
6529
6530 if (BaseOpcode->NoReturn) { // No TFE for stores?
6531 // TODO: Handle dmask trim
6532 if (!Ty.isVector() || !IsD16)
6533 return true;
6534
6535 Register RepackedReg = handleD16VData(B, *MRI, VData, true);
6536 if (RepackedReg != VData) {
6537 MI.getOperand(1).setReg(RepackedReg);
6538 }
6539
6540 return true;
6541 }
6542
6543 Register DstReg = MI.getOperand(0).getReg();
6544 const LLT EltTy = Ty.getScalarType();
6545 const int NumElts = Ty.isVector() ? Ty.getNumElements() : 1;
6546
6547 // Confirm that the return type is large enough for the dmask specified
6548 if (NumElts < DMaskLanes)
6549 return false;
6550
6551 if (NumElts > 4 || DMaskLanes > 4)
6552 return false;
6553
6554  // Image atomic instructions use DMask to specify how many bits the
6555  // input/output data will have: 32 bits (s32, v2s16) or 64 bits (s64, v4s16).
6556 // DMaskLanes for image atomic has default value '0'.
6557 // We must be sure that atomic variants (especially packed) will not be
6558 // truncated from v2s16 or v4s16 to s16 type.
6559 //
6560 // ChangeElementCount will be needed for image load where Ty is always scalar.
6561 const unsigned AdjustedNumElts = DMaskLanes == 0 ? 1 : DMaskLanes;
6562 const LLT AdjustedTy =
6563 DMaskLanes == 0
6564 ? Ty
6565 : Ty.changeElementCount(ElementCount::getFixed(AdjustedNumElts));
6566
6567 // The raw dword aligned data component of the load. The only legal cases
6568 // where this matters should be when using the packed D16 format, for
6569 // s16 -> <2 x s16>, and <3 x s16> -> <4 x s16>,
6570 LLT RoundedTy;
6571
6572 // S32 vector to cover all data, plus TFE result element.
6573 LLT TFETy;
6574
6575 // Register type to use for each loaded component. Will be S32 or V2S16.
6576 LLT RegTy;
6577
6578 if (IsD16 && ST.hasUnpackedD16VMem()) {
6579 RoundedTy =
6580 LLT::scalarOrVector(ElementCount::getFixed(AdjustedNumElts), 32);
6581 TFETy = LLT::fixed_vector(AdjustedNumElts + 1, 32);
6582 RegTy = S32;
6583 } else {
6584 unsigned EltSize = EltTy.getSizeInBits();
6585 unsigned RoundedElts = (AdjustedTy.getSizeInBits() + 31) / 32;
6586 unsigned RoundedSize = 32 * RoundedElts;
6587 RoundedTy = LLT::scalarOrVector(
6588 ElementCount::getFixed(RoundedSize / EltSize), EltSize);
6589 TFETy = LLT::fixed_vector(RoundedSize / 32 + 1, S32);
6590 RegTy = !IsTFE && EltSize == 16 ? V2S16 : S32;
6591 }
6592
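  // For example, a packed-D16 <3 x s16> result rounds up to
  // RoundedTy == <4 x s16>, uses TFETy == <3 x s32> when TFE is requested, and
  // is repacked through RegTy == <2 x s16> (or s32 pieces when TFE is on).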
6593 // The return type does not need adjustment.
6594 // TODO: Should we change s16 case to s32 or <2 x s16>?
6595 if (!IsTFE && (RoundedTy == Ty || !Ty.isVector()))
6596 return true;
6597
6598 Register Dst1Reg;
6599
6600 // Insert after the instruction.
6601 B.setInsertPt(*MI.getParent(), ++MI.getIterator());
6602
6603 // TODO: For TFE with d16, if we used a TFE type that was a multiple of <2 x
6604 // s16> instead of s32, we would only need 1 bitcast instead of multiple.
6605 const LLT LoadResultTy = IsTFE ? TFETy : RoundedTy;
6606 const int ResultNumRegs = LoadResultTy.getSizeInBits() / 32;
6607
6608 Register NewResultReg = MRI->createGenericVirtualRegister(LoadResultTy);
6609
6610 MI.getOperand(0).setReg(NewResultReg);
6611
6612 // In the IR, TFE is supposed to be used with a 2 element struct return
6613 // type. The instruction really returns these two values in one contiguous
6614 // register, with one additional dword beyond the loaded data. Rewrite the
6615 // return type to use a single register result.
6616
6617 if (IsTFE) {
6618 Dst1Reg = MI.getOperand(1).getReg();
6619 if (MRI->getType(Dst1Reg) != S32)
6620 return false;
6621
6622 // TODO: Make sure the TFE operand bit is set.
6623 MI.removeOperand(1);
6624
6625 // Handle the easy case that requires no repack instructions.
6626 if (Ty == S32) {
6627 B.buildUnmerge({DstReg, Dst1Reg}, NewResultReg);
6628 return true;
6629 }
6630 }
6631
6632 // Now figure out how to copy the new result register back into the old
6633 // result.
6634 SmallVector<Register, 5> ResultRegs(ResultNumRegs, Dst1Reg);
6635
6636 const int NumDataRegs = IsTFE ? ResultNumRegs - 1 : ResultNumRegs;
6637
6638 if (ResultNumRegs == 1) {
6639 assert(!IsTFE);
6640 ResultRegs[0] = NewResultReg;
6641 } else {
6642 // We have to repack into a new vector of some kind.
6643 for (int I = 0; I != NumDataRegs; ++I)
6644 ResultRegs[I] = MRI->createGenericVirtualRegister(RegTy);
6645 B.buildUnmerge(ResultRegs, NewResultReg);
6646
6647 // Drop the final TFE element to get the data part. The TFE result is
6648 // directly written to the right place already.
6649 if (IsTFE)
6650 ResultRegs.resize(NumDataRegs);
6651 }
6652
6653 // For an s16 scalar result, we form an s32 result with a truncate regardless
6654 // of packed vs. unpacked.
6655 if (IsD16 && !Ty.isVector()) {
6656 B.buildTrunc(DstReg, ResultRegs[0]);
6657 return true;
6658 }
6659
6660 // Avoid a build/concat_vector of 1 entry.
6661 if (Ty == V2S16 && NumDataRegs == 1 && !ST.hasUnpackedD16VMem()) {
6662 B.buildBitcast(DstReg, ResultRegs[0]);
6663 return true;
6664 }
6665
6666 assert(Ty.isVector());
6667
6668 if (IsD16) {
6669 // For packed D16 results with TFE enabled, all the data components are
6670 // S32. Cast back to the expected type.
6671 //
6672  // TODO: We don't really need to load s32 elements. We would only need one
6673 // cast for the TFE result if a multiple of v2s16 was used.
6674 if (RegTy != V2S16 && !ST.hasUnpackedD16VMem()) {
6675 for (Register &Reg : ResultRegs)
6676 Reg = B.buildBitcast(V2S16, Reg).getReg(0);
6677 } else if (ST.hasUnpackedD16VMem()) {
6678 for (Register &Reg : ResultRegs)
6679 Reg = B.buildTrunc(S16, Reg).getReg(0);
6680 }
6681 }
6682
6683 auto padWithUndef = [&](LLT Ty, int NumElts) {
6684 if (NumElts == 0)
6685 return;
6686 Register Undef = B.buildUndef(Ty).getReg(0);
6687 for (int I = 0; I != NumElts; ++I)
6688 ResultRegs.push_back(Undef);
6689 };
6690
6691 // Pad out any elements eliminated due to the dmask.
6692 LLT ResTy = MRI->getType(ResultRegs[0]);
6693 if (!ResTy.isVector()) {
6694 padWithUndef(ResTy, NumElts - ResultRegs.size());
6695 B.buildBuildVector(DstReg, ResultRegs);
6696 return true;
6697 }
6698
6699 assert(!ST.hasUnpackedD16VMem() && ResTy == V2S16);
6700 const int RegsToCover = (Ty.getSizeInBits() + 31) / 32;
6701
6702 // Deal with the one annoying legal case.
6703 const LLT V3S16 = LLT::fixed_vector(3, 16);
6704 if (Ty == V3S16) {
6705 if (IsTFE) {
6706 if (ResultRegs.size() == 1) {
6707 NewResultReg = ResultRegs[0];
6708 } else if (ResultRegs.size() == 2) {
6709 LLT V4S16 = LLT::fixed_vector(4, 16);
6710 NewResultReg = B.buildConcatVectors(V4S16, ResultRegs).getReg(0);
6711 } else {
6712 return false;
6713 }
6714 }
6715
6716 if (MRI->getType(DstReg).getNumElements() <
6717 MRI->getType(NewResultReg).getNumElements()) {
6718 B.buildDeleteTrailingVectorElements(DstReg, NewResultReg);
6719 } else {
6720 B.buildPadVectorWithUndefElements(DstReg, NewResultReg);
6721 }
6722 return true;
6723 }
6724
6725 padWithUndef(ResTy, RegsToCover - ResultRegs.size());
6726 B.buildConcatVectors(DstReg, ResultRegs);
6727 return true;
6728}
6729
6731 MachineInstr &MI) const {
6732 MachineIRBuilder &B = Helper.MIRBuilder;
6733 GISelChangeObserver &Observer = Helper.Observer;
6734
6735 Register OrigDst = MI.getOperand(0).getReg();
6736 Register Dst;
6737 LLT Ty = B.getMRI()->getType(OrigDst);
6738 unsigned Size = Ty.getSizeInBits();
6739 MachineFunction &MF = B.getMF();
6740 unsigned Opc = 0;
6741 if (Size < 32 && ST.hasScalarSubwordLoads()) {
6742 assert(Size == 8 || Size == 16);
6743 Opc = Size == 8 ? AMDGPU::G_AMDGPU_S_BUFFER_LOAD_UBYTE
6744 : AMDGPU::G_AMDGPU_S_BUFFER_LOAD_USHORT;
6745 // The 8-bit and 16-bit scalar buffer load instructions have 32-bit
6746 // destination register.
6747 Dst = B.getMRI()->createGenericVirtualRegister(LLT::scalar(32));
6748 } else {
6749 Opc = AMDGPU::G_AMDGPU_S_BUFFER_LOAD;
6750 Dst = OrigDst;
6751 }
6752
6753 Observer.changingInstr(MI);
6754
6755 // Handle needing to s.buffer.load() a p8 value.
6756 if (hasBufferRsrcWorkaround(Ty)) {
6757 Ty = castBufferRsrcFromV4I32(MI, B, *B.getMRI(), 0);
6758 B.setInsertPt(B.getMBB(), MI);
6759 }
6761 Ty = getBitcastRegisterType(Ty);
6762 Helper.bitcastDst(MI, Ty, 0);
6763 B.setInsertPt(B.getMBB(), MI);
6764 }
6765
6766 // FIXME: We don't really need this intermediate instruction. The intrinsic
6767 // should be fixed to have a memory operand. Since it's readnone, we're not
6768 // allowed to add one.
6769 MI.setDesc(B.getTII().get(Opc));
6770 MI.removeOperand(1); // Remove intrinsic ID
6771
6772 // FIXME: When intrinsic definition is fixed, this should have an MMO already.
6773 const unsigned MemSize = (Size + 7) / 8;
6774 const Align MemAlign = B.getDataLayout().getABITypeAlign(
6780 MemSize, MemAlign);
6781 MI.addMemOperand(MF, MMO);
6782 if (Dst != OrigDst) {
6783 MI.getOperand(0).setReg(Dst);
6784 B.setInsertPt(B.getMBB(), ++B.getInsertPt());
6785 B.buildTrunc(OrigDst, Dst);
6786 }
6787
6788 // If we don't have 96-bit result scalar loads, widening to 128-bit should
6789 // always be legal. We may need to restore this to a 96-bit result if it turns
6790 // out this needs to be converted to a vector load during RegBankSelect.
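  // For example, an s96 scalar result is widened to s128 (and a <3 x s32>
  // result to a power-of-two element count) unless the subtarget has scalar
  // dwordx3 loads.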
6791 if (!isPowerOf2_32(Size) && (Size != 96 || !ST.hasScalarDwordx3Loads())) {
6792 if (Ty.isVector())
6794 else
6795 Helper.widenScalarDst(MI, getPow2ScalarType(Ty), 0);
6796 }
6797
6798 Observer.changedInstr(MI);
6799 return true;
6800}
6801
6802// TODO: Move to selection
6805 MachineIRBuilder &B) const {
6806 if (!ST.isTrapHandlerEnabled() ||
6808 return legalizeTrapEndpgm(MI, MRI, B);
6809
6810 return ST.supportsGetDoorbellID() ?
6812}
6813
6816 const DebugLoc &DL = MI.getDebugLoc();
6817 MachineBasicBlock &BB = B.getMBB();
6818 MachineFunction *MF = BB.getParent();
6819
6820 if (BB.succ_empty() && std::next(MI.getIterator()) == BB.end()) {
6821 BuildMI(BB, BB.end(), DL, B.getTII().get(AMDGPU::S_ENDPGM))
6822 .addImm(0);
6823 MI.eraseFromParent();
6824 return true;
6825 }
6826
6827 // We need a block split to make the real endpgm a terminator. We also don't
6828 // want to break phis in successor blocks, so we can't just delete to the
6829 // end of the block.
6830 BB.splitAt(MI, false /*UpdateLiveIns*/);
6832 MF->push_back(TrapBB);
6833 BuildMI(*TrapBB, TrapBB->end(), DL, B.getTII().get(AMDGPU::S_ENDPGM))
6834 .addImm(0);
6835 BuildMI(BB, &MI, DL, B.getTII().get(AMDGPU::S_CBRANCH_EXECNZ))
6836 .addMBB(TrapBB);
6837
6838 BB.addSuccessor(TrapBB);
6839 MI.eraseFromParent();
6840 return true;
6841}
6842
6845 MachineFunction &MF = B.getMF();
6846 const LLT S64 = LLT::scalar(64);
6847
6848 Register SGPR01(AMDGPU::SGPR0_SGPR1);
6849 // For code object version 5, queue_ptr is passed through implicit kernarg.
6855 ST.getTargetLowering()->getImplicitParameterOffset(B.getMF(), Param);
6856
6857 Register KernargPtrReg = MRI.createGenericVirtualRegister(
6859
6860 if (!loadInputValue(KernargPtrReg, B,
6862 return false;
6863
6864 // TODO: can we be smarter about machine pointer info?
6867 PtrInfo,
6871
6872 // Pointer address
6873 Register LoadAddr = MRI.createGenericVirtualRegister(
6875 B.buildPtrAdd(LoadAddr, KernargPtrReg,
6876 B.buildConstant(LLT::scalar(64), Offset).getReg(0));
6877 // Load address
6878 Register Temp = B.buildLoad(S64, LoadAddr, *MMO).getReg(0);
6879 B.buildCopy(SGPR01, Temp);
6880 B.buildInstr(AMDGPU::S_TRAP)
6881 .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap))
6882 .addReg(SGPR01, RegState::Implicit);
6883 MI.eraseFromParent();
6884 return true;
6885 }
6886
6887 // Pass queue pointer to trap handler as input, and insert trap instruction
6888 // Reference: https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi
6889 Register LiveIn =
6890 MRI.createGenericVirtualRegister(LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
6891 if (!loadInputValue(LiveIn, B, AMDGPUFunctionArgInfo::QUEUE_PTR))
6892 return false;
6893
6894 B.buildCopy(SGPR01, LiveIn);
6895 B.buildInstr(AMDGPU::S_TRAP)
6896 .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap))
6897 .addReg(SGPR01, RegState::Implicit);
6898
6899 MI.eraseFromParent();
6900 return true;
6901}
6902
6903bool AMDGPULegalizerInfo::legalizeTrapHsa(MachineInstr &MI,
6904 MachineRegisterInfo &MRI,
6905 MachineIRBuilder &B) const {
6906 // We need to simulate the 's_trap 2' instruction on targets that run in
6907 // PRIV=1 (where it is treated as a nop).
6908 if (ST.hasPrivEnabledTrap2NopBug()) {
6909 ST.getInstrInfo()->insertSimulatedTrap(MRI, B.getMBB(), MI,
6910 MI.getDebugLoc());
6911 MI.eraseFromParent();
6912 return true;
6913 }
6914
6915 B.buildInstr(AMDGPU::S_TRAP)
6916 .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap));
6917 MI.eraseFromParent();
6918 return true;
6919}
6920
6921bool AMDGPULegalizerInfo::legalizeDebugTrap(MachineInstr &MI,
6922 MachineRegisterInfo &MRI,
6923 MachineIRBuilder &B) const {
6924 // Is non-HSA path or trap-handler disabled? Then, report a warning
6925 // accordingly
6926 if (!ST.isTrapHandlerEnabled() ||
6927 ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA) {
6928 DiagnosticInfoUnsupported NoTrap(B.getMF().getFunction(),
6929 "debugtrap handler not supported",
6930 MI.getDebugLoc(), DS_Warning);
6931 LLVMContext &Ctx = B.getMF().getFunction().getContext();
6932 Ctx.diagnose(NoTrap);
6933 } else {
6934 // Insert debug-trap instruction
6935 B.buildInstr(AMDGPU::S_TRAP)
6936 .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSADebugTrap));
6937 }
6938
6939 MI.eraseFromParent();
6940 return true;
6941}
6942
6943bool AMDGPULegalizerInfo::legalizeBVHIntrinsic(MachineInstr &MI,
6944 MachineIRBuilder &B) const {
6945 MachineRegisterInfo &MRI = *B.getMRI();
6946 const LLT S16 = LLT::scalar(16);
6947 const LLT S32 = LLT::scalar(32);
6948 const LLT V2S16 = LLT::fixed_vector(2, 16);
6949 const LLT V3S32 = LLT::fixed_vector(3, 32);
6950
6951 Register DstReg = MI.getOperand(0).getReg();
6952 Register NodePtr = MI.getOperand(2).getReg();
6953 Register RayExtent = MI.getOperand(3).getReg();
6954 Register RayOrigin = MI.getOperand(4).getReg();
6955 Register RayDir = MI.getOperand(5).getReg();
6956 Register RayInvDir = MI.getOperand(6).getReg();
6957 Register TDescr = MI.getOperand(7).getReg();
6958
6959 if (!ST.hasGFX10_AEncoding()) {
6960 DiagnosticInfoUnsupported BadIntrin(B.getMF().getFunction(),
6961 "intrinsic not supported on subtarget",
6962 MI.getDebugLoc());
6963 B.getMF().getFunction().getContext().diagnose(BadIntrin);
6964 return false;
6965 }
6966
6967 const bool IsGFX11 = AMDGPU::isGFX11(ST);
6968 const bool IsGFX11Plus = AMDGPU::isGFX11Plus(ST);
6969 const bool IsGFX12Plus = AMDGPU::isGFX12Plus(ST);
6970 const bool IsA16 = MRI.getType(RayDir).getElementType().getSizeInBits() == 16;
6971 const bool Is64 = MRI.getType(NodePtr).getSizeInBits() == 64;
6972 const unsigned NumVDataDwords = 4;
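 // Address dword count: node_ptr (1 or 2) + ray_extent (1) + ray_origin (3)
 // + ray_dir (3) + ray_inv_dir (3); with A16 the dir and inv_dir components
 // are packed together into three dwords.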
6973 const unsigned NumVAddrDwords = IsA16 ? (Is64 ? 9 : 8) : (Is64 ? 12 : 11);
6974 const unsigned NumVAddrs = IsGFX11Plus ? (IsA16 ? 4 : 5) : NumVAddrDwords;
6975 const bool UseNSA =
6976 IsGFX12Plus || (ST.hasNSAEncoding() && NumVAddrs <= ST.getNSAMaxSize());
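 // With the NSA (non-sequential address) encoding the address operands may
 // stay in separate registers; otherwise they are merged into one contiguous
 // vector register before the pseudo is built below.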
6977
6978 const unsigned BaseOpcodes[2][2] = {
6979 {AMDGPU::IMAGE_BVH_INTERSECT_RAY, AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16},
6980 {AMDGPU::IMAGE_BVH64_INTERSECT_RAY,
6981 AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16}};
6982 int Opcode;
6983 if (UseNSA) {
6984 Opcode = AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16],
6985 IsGFX12Plus ? AMDGPU::MIMGEncGfx12
6986 : IsGFX11 ? AMDGPU::MIMGEncGfx11NSA
6987 : AMDGPU::MIMGEncGfx10NSA,
6988 NumVDataDwords, NumVAddrDwords);
6989 } else {
6990 assert(!IsGFX12Plus);
6991 Opcode = AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16],
6992 IsGFX11 ? AMDGPU::MIMGEncGfx11Default
6993 : AMDGPU::MIMGEncGfx10Default,
6994 NumVDataDwords, NumVAddrDwords);
6995 }
6996 assert(Opcode != -1);
6997
6998 SmallVector<Register, 12> Ops;
6999 if (UseNSA && IsGFX11Plus) {
7000 auto packLanes = [&Ops, &S32, &V3S32, &B](Register Src) {
7001 auto Unmerge = B.buildUnmerge({S32, S32, S32}, Src);
7002 auto Merged = B.buildMergeLikeInstr(
7003 V3S32, {Unmerge.getReg(0), Unmerge.getReg(1), Unmerge.getReg(2)});
7004 Ops.push_back(Merged.getReg(0));
7005 };
7006
7007 Ops.push_back(NodePtr);
7008 Ops.push_back(RayExtent);
7009 packLanes(RayOrigin);
7010
7011 if (IsA16) {
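 // A16 form: ray_dir and ray_inv_dir are interleaved, each dword holding one
 // inv_dir component in its low half and the matching dir component in its
 // high half.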
7012 auto UnmergeRayDir = B.buildUnmerge({S16, S16, S16}, RayDir);
7013 auto UnmergeRayInvDir = B.buildUnmerge({S16, S16, S16}, RayInvDir);
7014 auto MergedDir = B.buildMergeLikeInstr(
7015 V3S32,
7016 {B.buildBitcast(
7017 S32, B.buildMergeLikeInstr(V2S16, {UnmergeRayInvDir.getReg(0),
7018 UnmergeRayDir.getReg(0)}))
7019 .getReg(0),
7020 B.buildBitcast(
7021 S32, B.buildMergeLikeInstr(V2S16, {UnmergeRayInvDir.getReg(1),
7022 UnmergeRayDir.getReg(1)}))
7023 .getReg(0),
7024 B.buildBitcast(
7025 S32, B.buildMergeLikeInstr(V2S16, {UnmergeRayInvDir.getReg(2),
7026 UnmergeRayDir.getReg(2)}))
7027 .getReg(0)});
7028 Ops.push_back(MergedDir.getReg(0));
7029 } else {
7030 packLanes(RayDir);
7031 packLanes(RayInvDir);
7032 }
7033 } else {
7034 if (Is64) {
7035 auto Unmerge = B.buildUnmerge({S32, S32}, NodePtr);
7036 Ops.push_back(Unmerge.getReg(0));
7037 Ops.push_back(Unmerge.getReg(1));
7038 } else {
7039 Ops.push_back(NodePtr);
7040 }
7041 Ops.push_back(RayExtent);
7042
7043 auto packLanes = [&Ops, &S32, &B](Register Src) {
7044 auto Unmerge = B.buildUnmerge({S32, S32, S32}, Src);
7045 Ops.push_back(Unmerge.getReg(0));
7046 Ops.push_back(Unmerge.getReg(1));
7047 Ops.push_back(Unmerge.getReg(2));
7048 };
7049
7050 packLanes(RayOrigin);
7051 if (IsA16) {
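 // A16 form for the older encodings: pack the six half components pairwise
 // into three dwords as (dir.x, dir.y), (dir.z, inv_dir.x), (inv_dir.y, inv_dir.z).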
7052 auto UnmergeRayDir = B.buildUnmerge({S16, S16, S16}, RayDir);
7053 auto UnmergeRayInvDir = B.buildUnmerge({S16, S16, S16}, RayInvDir);
7054 Register R1 = MRI.createGenericVirtualRegister(S32);
7055 Register R2 = MRI.createGenericVirtualRegister(S32);
7056 Register R3 = MRI.createGenericVirtualRegister(S32);
7057 B.buildMergeLikeInstr(R1,
7058 {UnmergeRayDir.getReg(0), UnmergeRayDir.getReg(1)});
7059 B.buildMergeLikeInstr(
7060 R2, {UnmergeRayDir.getReg(2), UnmergeRayInvDir.getReg(0)});
7061 B.buildMergeLikeInstr(
7062 R3, {UnmergeRayInvDir.getReg(1), UnmergeRayInvDir.getReg(2)});
7063 Ops.push_back(R1);
7064 Ops.push_back(R2);
7065 Ops.push_back(R3);
7066 } else {
7067 packLanes(RayDir);
7068 packLanes(RayInvDir);
7069 }
7070 }
7071
7072 if (!UseNSA) {
7073 // Build a single vector containing all the operands so far prepared.
7074 LLT OpTy = LLT::fixed_vector(Ops.size(), 32);
7075 Register MergedOps = B.buildMergeLikeInstr(OpTy, Ops).getReg(0);
7076 Ops.clear();
7077 Ops.push_back(MergedOps);
7078 }
7079
7080 auto MIB = B.buildInstr(AMDGPU::G_AMDGPU_INTRIN_BVH_INTERSECT_RAY)
7081 .addDef(DstReg)
7082 .addImm(Opcode);
7083
7084 for (Register R : Ops) {
7085 MIB.addUse(R);
7086 }
7087
7088 MIB.addUse(TDescr)
7089 .addImm(IsA16 ? 1 : 0)
7090 .cloneMemRefs(MI);
7091
7092 MI.eraseFromParent();
7093 return true;
7094}
7095
7096bool AMDGPULegalizerInfo::legalizeFPTruncRound(MachineInstr &MI,
7097 MachineIRBuilder &B) const {
7098 MachineRegisterInfo &MRI = *B.getMRI();
7099 Register Src = MI.getOperand(1).getReg();
7100 if (MRI.getType(Src) != LLT::scalar(32))
7101 return false;
7102
7103 // Only support towardzero, tonearest, upward and downward.
7104 int RoundMode = MI.getOperand(2).getImm();
7105 if (RoundMode != (int)RoundingMode::TowardZero &&
7106 RoundMode != (int)RoundingMode::NearestTiesToEven &&
7107 RoundMode != (int)RoundingMode::TowardPositive &&
7108 RoundMode != (int)RoundingMode::TowardNegative)
7109 return false;
7110
7111 // "round.towardzero" -> TowardZero 0 -> FP_ROUND_ROUND_TO_ZERO 3
7112 // "round.tonearest" -> NearestTiesToEven 1 -> FP_ROUND_ROUND_TO_NEAREST 0
7113 // "round.upward" -> TowardPositive 2 -> FP_ROUND_ROUND_TO_INF 1
7114 // "round.downward" -> TowardNegative 3 -> FP_ROUND_ROUND_TO_NEGINF 2
7115 unsigned HW_Mode = (RoundMode + 3) % 4;
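 // Adding 3 modulo 4 rotates the IEEE rounding-mode values above into the
 // hardware encoding shown on the right of each line.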
7116 B.buildInstr(AMDGPU::G_FPTRUNC_ROUND)
7117 .addDef(MI.getOperand(0).getReg())
7118 .addUse(Src)
7119 .addImm(HW_Mode);
7120
7121 MI.eraseFromParent();
7122 return true;
7123}
7124
7125bool AMDGPULegalizerInfo::legalizeStackSave(MachineInstr &MI,
7126 MachineIRBuilder &B) const {
7127 const SITargetLowering *TLI = ST.getTargetLowering();
7128 Register StackPtr = TLI->getStackPointerRegisterToSaveRestore();
7129 Register DstReg = MI.getOperand(0).getReg();
7130 B.buildInstr(AMDGPU::G_AMDGPU_WAVE_ADDRESS, {DstReg}, {StackPtr});
7131 MI.eraseFromParent();
7132 return true;
7133}
7134
7135bool AMDGPULegalizerInfo::legalizeWaveID(MachineInstr &MI,
7136 MachineIRBuilder &B) const {
7137 // With architected SGPRs, waveIDinGroup is in TTMP8[29:25].
7138 if (!ST.hasArchitectedSGPRs())
7139 return false;
7140 LLT S32 = LLT::scalar(32);
7141 Register DstReg = MI.getOperand(0).getReg();
7142 auto TTMP8 = B.buildCopy(S32, Register(AMDGPU::TTMP8));
7143 auto LSB = B.buildConstant(S32, 25);
7144 auto Width = B.buildConstant(S32, 5);
7145 B.buildUbfx(DstReg, TTMP8, LSB, Width);
7146 MI.eraseFromParent();
7147 return true;
7148}
7149
7150static constexpr unsigned FPEnvModeBitField =
7152
7153static constexpr unsigned FPEnvTrapBitField =
7155
7156bool AMDGPULegalizerInfo::legalizeGetFPEnv(MachineInstr &MI,
7157 MachineRegisterInfo &MRI,
7158 MachineIRBuilder &B) const {
7159 Register Src = MI.getOperand(0).getReg();
7160 if (MRI.getType(Src) != S64)
7161 return false;
7162
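 // Assemble the 64-bit FP environment from two s_getreg reads: the MODE
 // register fields form the low 32 bits and the trap-status fields the high
 // 32 bits.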
7163 auto ModeReg =
7164 B.buildIntrinsic(Intrinsic::amdgcn_s_getreg, {S32},
7165 /*HasSideEffects=*/true, /*isConvergent=*/false)
7166 .addImm(FPEnvModeBitField);
7167 auto TrapReg =
7168 B.buildIntrinsic(Intrinsic::amdgcn_s_getreg, {S32},
7169 /*HasSideEffects=*/true, /*isConvergent=*/false)
7170 .addImm(FPEnvTrapBitField);
7171 B.buildMergeLikeInstr(Src, {ModeReg, TrapReg});
7172 MI.eraseFromParent();
7173 return true;
7174}
7175
7176bool AMDGPULegalizerInfo::legalizeSetFPEnv(MachineInstr &MI, MachineRegisterInfo &MRI,
7177 MachineIRBuilder &B) const {
7179 Register Src = MI.getOperand(0).getReg();
7180 if (MRI.getType(Src) != S64)
7181 return false;
7182
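 // Split the 64-bit environment value and write it back with two s_setreg
 // calls: the low half to the MODE fields, the high half to the trap-status
 // fields.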
7183 auto Unmerge = B.buildUnmerge({S32, S32}, MI.getOperand(0));
7184 B.buildIntrinsic(Intrinsic::amdgcn_s_setreg, ArrayRef<DstOp>(),
7185 /*HasSideEffects=*/true, /*isConvergent=*/false)
7186 .addImm(static_cast<int16_t>(FPEnvModeBitField))
7187 .addReg(Unmerge.getReg(0));
7188 B.buildIntrinsic(Intrinsic::amdgcn_s_setreg, ArrayRef<DstOp>(),
7189 /*HasSideEffects=*/true, /*isConvergent=*/false)
7190 .addImm(static_cast<int16_t>(FPEnvTrapBitField))
7191 .addReg(Unmerge.getReg(1));
7192 MI.eraseFromParent();
7193 return true;
7194}
7195
7196bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
7197 MachineInstr &MI) const {
7198 MachineIRBuilder &B = Helper.MIRBuilder;
7199 MachineRegisterInfo &MRI = *B.getMRI();
7200
7201 // Replace the use G_BRCOND with the exec manipulate and branch pseudos.
7202 auto IntrID = cast<GIntrinsic>(MI).getIntrinsicID();
7203 switch (IntrID) {
7204 case Intrinsic::amdgcn_if:
7205 case Intrinsic::amdgcn_else: {
7206 MachineInstr *Br = nullptr;
7207 MachineBasicBlock *UncondBrTarget = nullptr;
7208 bool Negated = false;
7209 if (MachineInstr *BrCond =
7210 verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) {
7211 const SIRegisterInfo *TRI
7212 = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
7213
7214 Register Def = MI.getOperand(1).getReg();
7215 Register Use = MI.getOperand(3).getReg();
7216
7217 MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
7218
7219 if (Negated)
7220 std::swap(CondBrTarget, UncondBrTarget);
7221
7222 B.setInsertPt(B.getMBB(), BrCond->getIterator());
7223 if (IntrID == Intrinsic::amdgcn_if) {
7224 B.buildInstr(AMDGPU::SI_IF)
7225 .addDef(Def)
7226 .addUse(Use)
7227 .addMBB(UncondBrTarget);
7228 } else {
7229 B.buildInstr(AMDGPU::SI_ELSE)
7230 .addDef(Def)
7231 .addUse(Use)
7232 .addMBB(UncondBrTarget);
7233 }
7234
7235 if (Br) {
7236 Br->getOperand(0).setMBB(CondBrTarget);
7237 } else {
7238 // The IRTranslator skips inserting the G_BR for fallthrough cases, but
7239 // since we're swapping branch targets it needs to be reinserted.
7240 // FIXME: IRTranslator should probably not do this
7241 B.buildBr(*CondBrTarget);
7242 }
7243
7244 MRI.setRegClass(Def, TRI->getWaveMaskRegClass());
7245 MRI.setRegClass(Use, TRI->getWaveMaskRegClass());
7246 MI.eraseFromParent();
7247 BrCond->eraseFromParent();
7248 return true;
7249 }
7250
7251 return false;
7252 }
7253 case Intrinsic::amdgcn_loop: {
7254 MachineInstr *Br = nullptr;
7255 MachineBasicBlock *UncondBrTarget = nullptr;
7256 bool Negated = false;
7257 if (MachineInstr *BrCond =
7258 verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) {
7259 const SIRegisterInfo *TRI
7260 = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
7261
7262 MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
7263 Register Reg = MI.getOperand(2).getReg();
7264
7265 if (Negated)
7266 std::swap(CondBrTarget, UncondBrTarget);
7267
7268 B.setInsertPt(B.getMBB(), BrCond->getIterator());
7269 B.buildInstr(AMDGPU::SI_LOOP)
7270 .addUse(Reg)
7271 .addMBB(UncondBrTarget);
7272
7273 if (Br)
7274 Br->getOperand(0).setMBB(CondBrTarget);
7275 else
7276 B.buildBr(*CondBrTarget);
7277
7278 MI.eraseFromParent();
7279 BrCond->eraseFromParent();
7280 MRI.setRegClass(Reg, TRI->getWaveMaskRegClass());
7281 return true;
7282 }
7283
7284 return false;
7285 }
7286 case Intrinsic::amdgcn_addrspacecast_nonnull:
7287 return legalizeAddrSpaceCast(MI, MRI, B);
7288 case Intrinsic::amdgcn_make_buffer_rsrc:
7289 return legalizePointerAsRsrcIntrin(MI, MRI, B);
7290 case Intrinsic::amdgcn_kernarg_segment_ptr:
7291 if (!AMDGPU::isKernel(B.getMF().getFunction().getCallingConv())) {
7292 // This only makes sense to call in a kernel, so just lower to null.
7293 B.buildConstant(MI.getOperand(0).getReg(), 0);
7294 MI.eraseFromParent();
7295 return true;
7296 }
7297
7298 return legalizePreloadedArgIntrin(
7299 MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
7300 case Intrinsic::amdgcn_implicitarg_ptr:
7301 return legalizeImplicitArgPtr(MI, MRI, B);
7302 case Intrinsic::amdgcn_workitem_id_x:
7303 return legalizeWorkitemIDIntrinsic(MI, MRI, B, 0,
7304 AMDGPUFunctionArgInfo::WORKITEM_ID_X);
7305 case Intrinsic::amdgcn_workitem_id_y:
7306 return legalizeWorkitemIDIntrinsic(MI, MRI, B, 1,
7307 AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
7308 case Intrinsic::amdgcn_workitem_id_z:
7309 return legalizeWorkitemIDIntrinsic(MI, MRI, B, 2,
7310 AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
7311 case Intrinsic::amdgcn_workgroup_id_x:
7312 return legalizePreloadedArgIntrin(MI, MRI, B,
7313 AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
7314 case Intrinsic::amdgcn_workgroup_id_y:
7315 return legalizePreloadedArgIntrin(MI, MRI, B,
7316 AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
7317 case Intrinsic::amdgcn_workgroup_id_z:
7318 return legalizePreloadedArgIntrin(MI, MRI, B,
7319 AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
7320 case Intrinsic::amdgcn_wave_id:
7321 return legalizeWaveID(MI, B);
7322 case Intrinsic::amdgcn_lds_kernel_id:
7323 return legalizePreloadedArgIntrin(MI, MRI, B,
7324 AMDGPUFunctionArgInfo::LDS_KERNEL_ID);
7325 case Intrinsic::amdgcn_dispatch_ptr:
7326 return legalizePreloadedArgIntrin(MI, MRI, B,
7327 AMDGPUFunctionArgInfo::DISPATCH_PTR);
7328 case Intrinsic::amdgcn_queue_ptr:
7329 return legalizePreloadedArgIntrin(MI, MRI, B,
7330 AMDGPUFunctionArgInfo::QUEUE_PTR);
7331 case Intrinsic::amdgcn_implicit_buffer_ptr:
7332 return legalizePreloadedArgIntrin(MI, MRI, B,
7333 AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
7334 case Intrinsic::amdgcn_dispatch_id:
7335 return legalizePreloadedArgIntrin(MI, MRI, B,
7336 AMDGPUFunctionArgInfo::DISPATCH_ID);
7337 case Intrinsic::r600_read_ngroups_x:
7338 // TODO: Emit error for hsa
7339 return legalizeKernargMemParameter(MI, B,
7340 SI::KernelInputOffsets::NGROUPS_X);
7341 case Intrinsic::r600_read_ngroups_y:
7342 return legalizeKernargMemParameter(MI, B,
7343 SI::KernelInputOffsets::NGROUPS_Y);
7344 case Intrinsic::r600_read_ngroups_z:
7345 return legalizeKernargMemParameter(MI, B,
7346 SI::KernelInputOffsets::NGROUPS_Z);
7347 case Intrinsic::r600_read_local_size_x:
7348 // TODO: Could insert G_ASSERT_ZEXT from s16
7349 return legalizeKernargMemParameter(MI, B, SI::KernelInputOffsets::LOCAL_SIZE_X);
7350 case Intrinsic::r600_read_local_size_y:
7351 // TODO: Could insert G_ASSERT_ZEXT from s16
7352 return legalizeKernargMemParameter(MI, B, SI::KernelInputOffsets::LOCAL_SIZE_Y);
7353 // TODO: Could insert G_ASSERT_ZEXT from s16
7354 case Intrinsic::r600_read_local_size_z:
7355 return legalizeKernargMemParameter(MI, B, SI::KernelInputOffsets::LOCAL_SIZE_Z);
7356 case Intrinsic::r600_read_global_size_x:
7357 return legalizeKernargMemParameter(MI, B, SI::KernelInputOffsets::GLOBAL_SIZE_X);
7358 case Intrinsic::r600_read_global_size_y:
7359 return legalizeKernargMemParameter(MI, B, SI::KernelInputOffsets::GLOBAL_SIZE_Y);
7360 case Intrinsic::r600_read_global_size_z:
7361 return legalizeKernargMemParameter(MI, B, SI::KernelInputOffsets::GLOBAL_SIZE_Z);
7362 case Intrinsic::amdgcn_fdiv_fast:
7363 return legalizeFDIVFastIntrin(MI, MRI, B);
7364 case Intrinsic::amdgcn_is_shared:
7365 return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS);
7366 case Intrinsic::amdgcn_is_private:
7367 return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS);
7368 case Intrinsic::amdgcn_wavefrontsize: {
7369 B.buildConstant(MI.getOperand(0), ST.getWavefrontSize());
7370 MI.eraseFromParent();
7371 return true;
7372 }
7373 case Intrinsic::amdgcn_s_buffer_load:
7374 return legalizeSBufferLoad(Helper, MI);
7375 case Intrinsic::amdgcn_raw_buffer_store:
7376 case Intrinsic::amdgcn_raw_ptr_buffer_store:
7377 case Intrinsic::amdgcn_struct_buffer_store:
7378 case Intrinsic::amdgcn_struct_ptr_buffer_store:
7379 return legalizeBufferStore(MI, MRI, B, false, false);
7380 case Intrinsic::amdgcn_raw_buffer_store_format:
7381 case Intrinsic::amdgcn_raw_ptr_buffer_store_format:
7382 case Intrinsic::amdgcn_struct_buffer_store_format:
7383 case Intrinsic::amdgcn_struct_ptr_buffer_store_format:
7384 return legalizeBufferStore(MI, MRI, B, false, true);
7385 case Intrinsic::amdgcn_raw_tbuffer_store:
7386 case Intrinsic::amdgcn_raw_ptr_tbuffer_store:
7387 case Intrinsic::amdgcn_struct_tbuffer_store:
7388 case Intrinsic::amdgcn_struct_ptr_tbuffer_store:
7389 return legalizeBufferStore(MI, MRI, B, true, true);
7390 case Intrinsic::amdgcn_raw_buffer_load:
7391 case Intrinsic::amdgcn_raw_ptr_buffer_load:
7392 case Intrinsic::amdgcn_raw_atomic_buffer_load:
7393 case Intrinsic::amdgcn_raw_ptr_atomic_buffer_load:
7394 case Intrinsic::amdgcn_struct_buffer_load:
7395 case Intrinsic::amdgcn_struct_ptr_buffer_load:
7396 case Intrinsic::amdgcn_struct_atomic_buffer_load:
7397 case Intrinsic::amdgcn_struct_ptr_atomic_buffer_load:
7398 return legalizeBufferLoad(MI, MRI, B, false, false);
7399 case Intrinsic::amdgcn_raw_buffer_load_format:
7400 case Intrinsic::amdgcn_raw_ptr_buffer_load_format:
7401 case Intrinsic::amdgcn_struct_buffer_load_format:
7402 case Intrinsic::amdgcn_struct_ptr_buffer_load_format:
7403 return legalizeBufferLoad(MI, MRI, B, true, false);
7404 case Intrinsic::amdgcn_raw_tbuffer_load:
7405 case Intrinsic::amdgcn_raw_ptr_tbuffer_load:
7406 case Intrinsic::amdgcn_struct_tbuffer_load:
7407 case Intrinsic::amdgcn_struct_ptr_tbuffer_load:
7408 return legalizeBufferLoad(MI, MRI, B, true, true);
7409 case Intrinsic::amdgcn_raw_buffer_atomic_swap:
7410 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_swap:
7411 case Intrinsic::amdgcn_struct_buffer_atomic_swap:
7412 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_swap:
7413 case Intrinsic::amdgcn_raw_buffer_atomic_add:
7414 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_add:
7415 case Intrinsic::amdgcn_struct_buffer_atomic_add:
7416 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_add:
7417 case Intrinsic::amdgcn_raw_buffer_atomic_sub:
7418 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub:
7419 case Intrinsic::amdgcn_struct_buffer_atomic_sub:
7420 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub:
7421 case Intrinsic::amdgcn_raw_buffer_atomic_smin:
7422 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smin:
7423 case Intrinsic::amdgcn_struct_buffer_atomic_smin:
7424 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smin:
7425 case Intrinsic::amdgcn_raw_buffer_atomic_umin:
7426 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umin:
7427 case Intrinsic::amdgcn_struct_buffer_atomic_umin:
7428 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umin:
7429 case Intrinsic::amdgcn_raw_buffer_atomic_smax:
7430 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smax:
7431 case Intrinsic::amdgcn_struct_buffer_atomic_smax:
7432 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smax:
7433 case Intrinsic::amdgcn_raw_buffer_atomic_umax:
7434 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umax:
7435 case Intrinsic::amdgcn_struct_buffer_atomic_umax:
7436 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umax:
7437 case Intrinsic::amdgcn_raw_buffer_atomic_and:
7438 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_and:
7439 case Intrinsic::amdgcn_struct_buffer_atomic_and:
7440 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_and:
7441 case Intrinsic::amdgcn_raw_buffer_atomic_or:
7442 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_or:
7443 case Intrinsic::amdgcn_struct_buffer_atomic_or:
7444 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_or:
7445 case Intrinsic::amdgcn_raw_buffer_atomic_xor:
7446 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_xor:
7447 case Intrinsic::amdgcn_struct_buffer_atomic_xor:
7448 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_xor:
7449 case Intrinsic::amdgcn_raw_buffer_atomic_inc:
7450 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_inc:
7451 case Intrinsic::amdgcn_struct_buffer_atomic_inc:
7452 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_inc:
7453 case Intrinsic::amdgcn_raw_buffer_atomic_dec:
7454 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_dec:
7455 case Intrinsic::amdgcn_struct_buffer_atomic_dec:
7456 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_dec:
7457 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
7458 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap:
7459 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
7460 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cmpswap:
7461 case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
7462 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmin:
7463 case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
7464 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmin:
7465 case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
7466 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmax:
7467 case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
7468 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmax:
7469 case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
7470 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fadd:
7471 case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
7472 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fadd:
7473 return legalizeBufferAtomic(MI, B, IntrID);
7474 case Intrinsic::amdgcn_rsq_clamp:
7475 return legalizeRsqClampIntrinsic(MI, MRI, B);
7476 case Intrinsic::amdgcn_image_bvh_intersect_ray:
7477 return legalizeBVHIntrinsic(MI, B);
7478 case Intrinsic::amdgcn_swmmac_f16_16x16x32_f16:
7479 case Intrinsic::amdgcn_swmmac_bf16_16x16x32_bf16:
7480 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf16:
7481 case Intrinsic::amdgcn_swmmac_f32_16x16x32_f16:
7482 case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_fp8:
7483 case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_bf8:
7484 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_fp8:
7485 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_bf8: {
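 // The sparsity index operand must be a 32-bit scalar for selection;
 // any-extend narrower indices in place.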
7486 Register Index = MI.getOperand(5).getReg();
7487 LLT S32 = LLT::scalar(32);
7488 if (MRI.getType(Index) != S32)
7489 MI.getOperand(5).setReg(B.buildAnyExt(S32, Index).getReg(0));
7490 return true;
7491 }
7492 case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu4:
7493 case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu8:
7494 case Intrinsic::amdgcn_swmmac_i32_16x16x64_iu4: {
7495 Register Index = MI.getOperand(7).getReg();
7496 LLT S32 = LLT::scalar(32);
7497 if (MRI.getType(Index) != S32)
7498 MI.getOperand(7).setReg(B.buildAnyExt(S32, Index).getReg(0));
7499 return true;
7500 }
7501 case Intrinsic::amdgcn_fmed3: {
7502 GISelChangeObserver &Observer = Helper.Observer;
7503
7504 // FIXME: This is to workaround the inability of tablegen match combiners to
7505 // match intrinsics in patterns.
7506 Observer.changingInstr(MI);
7507 MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_FMED3));
7508 MI.removeOperand(1);
7509 Observer.changedInstr(MI);
7510 return true;
7511 }
7512 case Intrinsic::amdgcn_readlane:
7513 case Intrinsic::amdgcn_writelane:
7514 case Intrinsic::amdgcn_readfirstlane:
7515 case Intrinsic::amdgcn_permlane16:
7516 case Intrinsic::amdgcn_permlanex16:
7517 case Intrinsic::amdgcn_permlane64:
7518 return legalizeLaneOp(Helper, MI, IntrID);
7519 default: {
7520 if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
7521 AMDGPU::getImageDimIntrinsicInfo(IntrID))
7522 return legalizeImageIntrinsic(MI, B, Helper.Observer, ImageDimIntr);
7523 return true;
7524 }
7525 }
7526
7527 return true;
7528}
unsigned const MachineRegisterInfo * MRI
MachineInstrBuilder & UseMI
MachineInstrBuilder MachineInstrBuilder & DefMI
static unsigned getIntrinsicID(const SDNode *N)
unsigned RegSize
unsigned Intr
static SDValue extractF64Exponent(SDValue Hi, const SDLoc &SL, SelectionDAG &DAG)
static SDValue getMad(SelectionDAG &DAG, const SDLoc &SL, EVT VT, SDValue X, SDValue Y, SDValue C, SDNodeFlags Flags=SDNodeFlags())
static bool valueIsKnownNeverF32Denorm(SDValue Src)
Return true if it's known that Src can never be an f32 denormal value.
Contains the definition of a TargetInstrInfo class that is common to all AMD GPUs.
static void packImage16bitOpsToDwords(MachineIRBuilder &B, MachineInstr &MI, SmallVectorImpl< Register > &PackedAddrs, unsigned ArgOffset, const AMDGPU::ImageDimIntrinsicInfo *Intr, bool IsA16, bool IsG16)
Turn a set of s16 typed registers in AddrRegs into a dword sized vector with s16 typed elements.
static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID)
static LLT getBufferRsrcScalarType(const LLT Ty)
static cl::opt< bool > EnableNewLegality("amdgpu-global-isel-new-legality", cl::desc("Use GlobalISel desired legality, rather than try to use" "rules compatible with selection patterns"), cl::init(false), cl::ReallyHidden)
static const LLT V3S64
static bool needsDenormHandlingF32(const MachineFunction &MF, Register Src, unsigned Flags)
static const LLT V16S16
static const LLT S128
static LegalizeMutation bitcastToVectorElement32(unsigned TypeIdx)
static const LLT V4S32
static const LLT V2S32
static const LLT V8S64
static LegalityPredicate isSmallOddVector(unsigned TypeIdx)
static LegalizeMutation oneMoreElement(unsigned TypeIdx)
static const LLT V12S32
static const LLT V8S32
static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size)
static bool allowApproxFunc(const MachineFunction &MF, unsigned Flags)
static const LLT V2S16
static bool shouldBitcastLoadStoreType(const GCNSubtarget &ST, const LLT Ty, const LLT MemTy)
Return true if a load or store of the type should be lowered with a bitcast to a different type.
static constexpr unsigned FPEnvModeBitField
static const LLT V4S64
static const LLT S1
static const LLT V3S32
static const LLT S64
static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size)
static bool shouldWidenLoad(const GCNSubtarget &ST, LLT MemoryTy, uint64_t AlignInBits, unsigned AddrSpace, unsigned Opcode)
Return true if we should legalize a load by widening an odd sized memory access up to the alignment.
static bool isRegisterType(LLT Ty)
static bool isRegisterVectorElementType(LLT EltTy)
static const LLT S32
static bool isRegisterSize(unsigned Size)
static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx)
static LegalityPredicate isWideVec16(unsigned TypeIdx)
static const LLT V6S32
static LegalizeMutation moreElementsToNextExistingRegClass(unsigned TypeIdx)
static std::initializer_list< LLT > AllS32Vectors
static const LLT V7S32
static const LLT V5S32
static const LLT V4S16
static const LLT V11S32
static const LLT F64
static Register castBufferRsrcToV4I32(Register Pointer, MachineIRBuilder &B)
Cast a buffer resource (an address space 8 pointer) into a 4xi32, which is the form in which the valu...
static std::initializer_list< LLT > AllS16Vectors
static const LLT V32S32
static std::pair< Register, Register > emitReciprocalU64(MachineIRBuilder &B, Register Val)
static LLT getBitcastRegisterType(const LLT Ty)
static LLT getBufferRsrcRegisterType(const LLT Ty)
static LegalizeMutation bitcastToRegisterType(unsigned TypeIdx)
static Register stripAnySourceMods(Register OrigSrc, MachineRegisterInfo &MRI)
static const LLT V9S32
static const LLT V10S32
static LLT castBufferRsrcFromV4I32(MachineInstr &MI, MachineIRBuilder &B, MachineRegisterInfo &MRI, unsigned Idx)
Mutates IR (typicaly a load instruction) to use a <4 x s32> as the initial type of the operand idx an...
static bool replaceWithConstant(MachineIRBuilder &B, MachineInstr &MI, int64_t C)
static constexpr unsigned SPDenormModeBitField
static const LLT V12S16
static const LLT V16S64
static const LLT S512
static unsigned maxSizeForAddrSpace(const GCNSubtarget &ST, unsigned AS, bool IsLoad, bool IsAtomic)
static bool isLoadStoreSizeLegal(const GCNSubtarget &ST, const LegalityQuery &Query)
static const LLT V16S32
static const LLT V7S64
static MachineInstr * verifyCFIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI, MachineInstr *&Br, MachineBasicBlock *&UncondBrTarget, bool &Negated)
static bool isRegisterClassType(LLT Ty)
static const LLT V5S64
static const LLT S160
static LegalityPredicate numElementsNotEven(unsigned TypeIdx)
static void castBufferRsrcArgToV4I32(MachineInstr &MI, MachineIRBuilder &B, unsigned Idx)
static const LLT V4S128
static constexpr unsigned FPEnvTrapBitField
static LegalityPredicate isIllegalRegisterType(unsigned TypeIdx)
static const LLT V2BF16
static const LLT V6S64
static constexpr unsigned MaxRegisterSize
static const LLT V2S8
static LegalityPredicate isWideScalarExtLoadTruncStore(unsigned TypeIdx)
static const LLT MaxScalar
static bool hasBufferRsrcWorkaround(const LLT Ty)
static void toggleSPDenormMode(bool Enable, MachineIRBuilder &B, const GCNSubtarget &ST, SIModeRegisterDefaults Mode)
static std::initializer_list< LLT > AllS64Vectors
static bool loadStoreBitcastWorkaround(const LLT Ty)
static const LLT S96
static const LLT V2S64
static LLT widenToNextPowerOf2(LLT Ty)
static bool isNot(const MachineRegisterInfo &MRI, const MachineInstr &MI)
static const LLT S16
static const LLT V10S16
static void convertImageAddrToPacked(MachineIRBuilder &B, MachineInstr &MI, int DimIdx, int NumVAddrs)
Convert from separate vaddr components to a single vector address register, and replace the remaining...
static bool isLoadStoreLegal(const GCNSubtarget &ST, const LegalityQuery &Query)
static const LLT V2S128
static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx)
static std::initializer_list< LLT > AllScalarTypes
static const LLT V2F16
static const LLT S256
static LLT getPow2VectorType(LLT Ty)
static void buildBufferLoad(unsigned Opc, Register LoadDstReg, Register RSrc, Register VIndex, Register VOffset, Register SOffset, unsigned ImmOffset, unsigned Format, unsigned AuxiliaryData, MachineMemOperand *MMO, bool IsTyped, bool HasVIndex, MachineIRBuilder &B)
static LLT getPow2ScalarType(LLT Ty)
static LegalityPredicate elementTypeIsLegal(unsigned TypeIdx)
static const LLT S8
static const LLT V6S16
static bool isRegisterVectorType(LLT Ty)
static const LLT S224
static const LLT V8S16
static LegalityPredicate sizeIsMultipleOf32(unsigned TypeIdx)
static bool isKnownNonNull(Register Val, MachineRegisterInfo &MRI, const AMDGPUTargetMachine &TM, unsigned AddrSpace)
Return true if the value is a known valid address, such that a null check is not necessary.
static const LLT F32
This file declares the targeting of the Machinelegalizer class for AMDGPU.
Provides AMDGPU specific target descriptions.
The AMDGPU TargetMachine interface definition for hw codegen targets.
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
static Error unsupported(const char *Str, const Triple &T)
Definition: MachO.cpp:71
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
Analysis containing CSE Info
Definition: CSEInfo.cpp:27
Returns the sub type a function will return at a given Idx Should correspond to the result type of an ExtractValue instruction executed with just that one unsigned Idx
uint64_t Size
static GCMetadataPrinterRegistry::Add< ErlangGCPrinter > X("erlang", "erlang-compatible garbage collector")
Declares convenience wrapper classes for interpreting MachineInstr instances as specific generic oper...
IRTranslator LLVM IR MI
static LVOptions Options
Definition: LVOptions.cpp:25
#define F(x, y, z)
Definition: MD5.cpp:55
#define I(x, y, z)
Definition: MD5.cpp:58
Contains matchers for matching SSA Machine Instructions.
This file declares the MachineIRBuilder class.
unsigned const TargetRegisterInfo * TRI
#define R2(n)
static unsigned getReg(const MCDisassembler *D, unsigned RC, unsigned RegNo)
static GCMetadataPrinterRegistry::Add< OcamlGCMetadataPrinter > Y("ocaml", "ocaml 3.10-compatible collector")
ppc ctr loops verify
R600 Clause Merge
const SmallVectorImpl< MachineOperand > & Cond
#define CH(x, y, z)
Definition: SHA256.cpp:34
#define FP_DENORM_FLUSH_NONE
Definition: SIDefines.h:1172
Interface definition for SIInstrInfo.
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
Interface definition for SIRegisterInfo.
This file defines the make_scope_exit function, which executes user-defined cleanup logic at scope ex...
static constexpr int Concat[]
Value * RHS
Value * LHS
void buildMultiply(LegalizerHelper &Helper, MutableArrayRef< Register > Accum, ArrayRef< Register > Src0, ArrayRef< Register > Src1, bool UsePartialMad64_32, bool SeparateOddAlignedProducts) const
bool legalizeGlobalValue(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFPTruncRound(MachineInstr &MI, MachineIRBuilder &B) const
bool legalizeFSQRTF16(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeIntrinsicTrunc(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
std::pair< Register, unsigned > splitBufferOffsets(MachineIRBuilder &B, Register OrigOffset) const
bool legalizeIsAddrSpace(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B, unsigned AddrSpace) const
bool legalizeUnsignedDIV_REM(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFSQRTF32(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeAtomicCmpXChg(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeTrapHsa(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeMul(LegalizerHelper &Helper, MachineInstr &MI) const
bool legalizeFFREXP(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
Register getSegmentAperture(unsigned AddrSpace, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFDIV64(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizePointerAsRsrcIntrin(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
To create a buffer resource from a 64-bit pointer, mask off the upper 32 bits of the pointer and repl...
bool legalizeFlogCommon(MachineInstr &MI, MachineIRBuilder &B) const
bool getLDSKernelId(Register DstReg, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFExp2(MachineInstr &MI, MachineIRBuilder &B) const
bool legalizeTrap(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeBufferAtomic(MachineInstr &MI, MachineIRBuilder &B, Intrinsic::ID IID) const
void legalizeUnsignedDIV_REM32Impl(MachineIRBuilder &B, Register DstDivReg, Register DstRemReg, Register Num, Register Den) const
Register handleD16VData(MachineIRBuilder &B, MachineRegisterInfo &MRI, Register Reg, bool ImageStore=false) const
Handle register layout difference for f16 images for some subtargets.
bool legalizeCTLZ_CTTZ(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeBuildVector(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFFloor(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
AMDGPULegalizerInfo(const GCNSubtarget &ST, const GCNTargetMachine &TM)
bool legalizeFDIV32(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFMad(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFDIV(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFExp(MachineInstr &MI, MachineIRBuilder &B) const
bool legalizeIntrinsic(LegalizerHelper &Helper, MachineInstr &MI) const override
bool legalizeFrem(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizePreloadedArgIntrin(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B, AMDGPUFunctionArgInfo::PreloadedValue ArgType) const
bool legalizeBufferStore(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B, bool IsTyped, bool IsFormat) const
bool legalizeStore(LegalizerHelper &Helper, MachineInstr &MI) const
bool legalizeCustom(LegalizerHelper &Helper, MachineInstr &MI, LostDebugLocObserver &LocObserver) const override
Called for instructions with the Custom LegalizationAction.
bool buildPCRelGlobalAddress(Register DstReg, LLT PtrTy, MachineIRBuilder &B, const GlobalValue *GV, int64_t Offset, unsigned GAFlags=SIInstrInfo::MO_NONE) const
bool legalizeFDIV16(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeRsqClampIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
std::pair< Register, Register > getScaledLogInput(MachineIRBuilder &B, Register Src, unsigned Flags) const
bool legalizeFDIVFastIntrin(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeInsertVectorElt(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFExpUnsafe(MachineIRBuilder &B, Register Dst, Register Src, unsigned Flags) const
bool legalizeAddrSpaceCast(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
Register fixStoreSourceType(MachineIRBuilder &B, Register VData, bool IsFormat) const
bool legalizeImplicitArgPtr(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeMinNumMaxNum(LegalizerHelper &Helper, MachineInstr &MI) const
void legalizeUnsignedDIV_REM64Impl(MachineIRBuilder &B, Register DstDivReg, Register DstRemReg, Register Num, Register Den) const
bool legalizeBVHIntrinsic(MachineInstr &MI, MachineIRBuilder &B) const
bool legalizeDebugTrap(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFastUnsafeFDIV(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeSinCos(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeWaveID(MachineInstr &MI, MachineIRBuilder &B) const
bool legalizeFroundeven(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeLDSKernelId(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeSignedDIV_REM(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeITOFP(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B, bool Signed) const
bool legalizeBufferLoad(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B, bool IsFormat, bool IsTyped) const
bool legalizeFPow(MachineInstr &MI, MachineIRBuilder &B) const
bool legalizeFastUnsafeFDIV64(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFPTOI(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B, bool Signed) const
bool legalizeStackSave(MachineInstr &MI, MachineIRBuilder &B) const
bool legalizeFlogUnsafe(MachineIRBuilder &B, Register Dst, Register Src, bool IsLog10, unsigned Flags) const
bool legalizeKernargMemParameter(MachineInstr &MI, MachineIRBuilder &B, uint64_t Offset, Align Alignment=Align(4)) const
Legalize a value that's loaded from kernel arguments.
bool legalizeImageIntrinsic(MachineInstr &MI, MachineIRBuilder &B, GISelChangeObserver &Observer, const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr) const
Rewrite image intrinsics to use register layouts expected by the subtarget.
void buildAbsGlobalAddress(Register DstReg, LLT PtrTy, MachineIRBuilder &B, const GlobalValue *GV, MachineRegisterInfo &MRI) const
bool legalizeGetFPEnv(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool getImplicitArgPtr(Register DstReg, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFSQRT(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
Register getKernargParameterPtr(MachineIRBuilder &B, int64_t Offset) const
bool legalizeSBufferLoad(LegalizerHelper &Helper, MachineInstr &MI) const
bool legalizeFceil(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool loadInputValue(Register DstReg, MachineIRBuilder &B, const ArgDescriptor *Arg, const TargetRegisterClass *ArgRC, LLT ArgTy) const
bool legalizeFSQRTF64(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeExtractVectorElt(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeLoad(LegalizerHelper &Helper, MachineInstr &MI) const
bool legalizeCTLZ_ZERO_UNDEF(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeLaneOp(LegalizerHelper &Helper, MachineInstr &MI, Intrinsic::ID IID) const
bool legalizeSetFPEnv(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeWorkitemIDIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B, unsigned Dim, AMDGPUFunctionArgInfo::PreloadedValue ArgType) const
bool legalizeTrapHsaQueuePtr(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFlog2(MachineInstr &MI, MachineIRBuilder &B) const
bool legalizeTrapEndpgm(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
static std::optional< uint32_t > getLDSKernelIdMetadata(const Function &F)
void setDynLDSAlign(const Function &F, const GlobalVariable &GV)
unsigned allocateLDSGlobal(const DataLayout &DL, const GlobalVariable &GV)
bool hasMadMacF32Insts() const
unsigned getMaxWorkitemID(const Function &Kernel, unsigned Dimension) const
Return the maximum workitem ID value in the function, for the given (0, 1, 2) dimension.
bool has16BitInsts() const
bool hasFastFMAF32() const
bool hasTrigReducedRange() const
unsigned getWavefrontSize() const
bool hasVOP3PInsts() const
uint32_t getImplicitParameterOffset(const MachineFunction &MF, const ImplicitParameter Param) const
Helper function that returns the byte offset of the given type of implicit parameter.
static APFloat getSmallestNormalized(const fltSemantics &Sem, bool Negative=false)
Returns the smallest (by magnitude) normalized finite number in the given semantics.
Definition: APFloat.h:1070
static APFloat getLargest(const fltSemantics &Sem, bool Negative=false)
Returns the largest finite number in the given semantics.
Definition: APFloat.h:1050
static APFloat getInf(const fltSemantics &Sem, bool Negative=false)
Factory for Positive and Negative Infinity.
Definition: APFloat.h:1010
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition: ArrayRef.h:41
size_t size() const
size - Get the array size.
Definition: ArrayRef.h:165
@ ICMP_SLT
signed less than
Definition: InstrTypes.h:786
@ FCMP_OLT
0 1 0 0 True if ordered and less than
Definition: InstrTypes.h:763
@ FCMP_OGT
0 0 1 0 True if ordered and greater than
Definition: InstrTypes.h:761
@ ICMP_UGE
unsigned greater or equal
Definition: InstrTypes.h:781
@ ICMP_SGT
signed greater than
Definition: InstrTypes.h:784
@ FCMP_ONE
0 1 1 0 True if ordered and operands are unequal
Definition: InstrTypes.h:765
@ FCMP_OLE
0 1 0 1 True if ordered and less than or equal
Definition: InstrTypes.h:764
@ FCMP_ORD
0 1 1 1 True if ordered (no nans)
Definition: InstrTypes.h:766
@ ICMP_EQ
equal
Definition: InstrTypes.h:778
@ ICMP_NE
not equal
Definition: InstrTypes.h:779
This is the shared class of boolean and integer constants.
Definition: Constants.h:81
int64_t getSExtValue() const
Return the constant as a 64-bit integer value after it has been sign extended as appropriate for the ...
Definition: Constants.h:161
This class represents an Operation in the Expression.
A debug info location.
Definition: DebugLoc.h:33
Diagnostic information for unsupported feature in backend.
static constexpr ElementCount getFixed(ScalarTy MinVal)
Definition: TypeSize.h:311
LLVMContext & getContext() const
getContext - Return a reference to the LLVMContext associated with this function.
Definition: Function.cpp:380
bool hasA16() const
bool hasImageStoreD16Bug() const
bool hasUsableDivScaleConditionOutput() const
Condition output from div_scale is usable.
Definition: GCNSubtarget.h:477
bool hasArchitectedSGPRs() const
bool hasPrivEnabledTrap2NopBug() const
const SIInstrInfo * getInstrInfo() const override
Definition: GCNSubtarget.h:266
bool hasScalarSubwordLoads() const
Definition: GCNSubtarget.h:455
bool supportsGetDoorbellID() const
Definition: GCNSubtarget.h:461
bool hasMad64_32() const
Definition: GCNSubtarget.h:741
TrapHandlerAbi getTrapHandlerAbi() const
Definition: GCNSubtarget.h:457
bool hasIntClamp() const
Definition: GCNSubtarget.h:357
bool hasGFX10_AEncoding() const
const SITargetLowering * getTargetLowering() const override
Definition: GCNSubtarget.h:274
bool hasPackedFP32Ops() const
bool hasFullRate64Ops() const
Definition: GCNSubtarget.h:377
bool isTrapHandlerEnabled() const
Definition: GCNSubtarget.h:601
unsigned getNSAThreshold(const MachineFunction &MF) const
bool hasScalarSMulU64() const
Definition: GCNSubtarget.h:730
bool hasNSAEncoding() const
bool hasG16() const
unsigned getNSAMaxSize(bool HasSampler=false) const
bool hasScalarDwordx3Loads() const
Generation getGeneration() const
Definition: GCNSubtarget.h:317
bool hasScalarAddSub64() const
Definition: GCNSubtarget.h:728
bool hasUnpackedD16VMem() const
Definition: GCNSubtarget.h:732
bool hasAddNoCarry() const
Definition: GCNSubtarget.h:724
bool hasFractBug() const
Definition: GCNSubtarget.h:395
bool hasPartialNSAEncoding() const
Abstract class that contains various methods for clients to notify about changes.
virtual void changingInstr(MachineInstr &MI)=0
This instruction is about to be mutated in some way.
virtual void changedInstr(MachineInstr &MI)=0
This instruction was mutated in some way.
KnownBits getKnownBits(Register R)
Simple wrapper observer that takes several observers, and calls each one for each event.
bool hasExternalLinkage() const
Definition: GlobalValue.h:511
Module * getParent()
Get the module that this global value is contained inside of...
Definition: GlobalValue.h:656
Type * getValueType() const
Definition: GlobalValue.h:296
static constexpr LLT float64()
Get a 64-bit IEEE double value.
Definition: LowLevelType.h:94
constexpr unsigned getScalarSizeInBits() const
Definition: LowLevelType.h:267
constexpr bool isScalar() const
Definition: LowLevelType.h:146
constexpr LLT changeElementType(LLT NewEltTy) const
If this type is a vector, return a vector with the same number of elements but the new element type.
Definition: LowLevelType.h:214
static constexpr LLT vector(ElementCount EC, unsigned ScalarSizeInBits)
Get a low-level vector of some number of elements and element width.
Definition: LowLevelType.h:64
constexpr bool isPointerVector() const
Definition: LowLevelType.h:152
static constexpr LLT scalar(unsigned SizeInBits)
Get a low-level scalar or aggregate "bag of bits".
Definition: LowLevelType.h:42
constexpr uint16_t getNumElements() const
Returns the number of elements in a vector LLT.
Definition: LowLevelType.h:159
constexpr bool isVector() const
Definition: LowLevelType.h:148
static constexpr LLT pointer(unsigned AddressSpace, unsigned SizeInBits)
Get a low-level pointer in the given address space.
Definition: LowLevelType.h:57
constexpr TypeSize getSizeInBits() const
Returns the total size of the type. Must only be called on sized types.
Definition: LowLevelType.h:193
constexpr bool isPointer() const
Definition: LowLevelType.h:149
constexpr LLT getElementType() const
Returns the vector's element type. Only valid for vector types.
Definition: LowLevelType.h:290
constexpr ElementCount getElementCount() const
Definition: LowLevelType.h:184
constexpr LLT changeElementSize(unsigned NewEltSize) const
If this type is a vector, return a vector with the same number of elements but the new element size.
Definition: LowLevelType.h:221
static constexpr LLT float16()
Get a 16-bit IEEE half value.
Definition: LowLevelType.h:84
constexpr unsigned getAddressSpace() const
Definition: LowLevelType.h:280
static constexpr LLT fixed_vector(unsigned NumElements, unsigned ScalarSizeInBits)
Get a low-level fixed-width vector of some number of elements and element width.
Definition: LowLevelType.h:100
constexpr bool isPointerOrPointerVector() const
Definition: LowLevelType.h:153
constexpr LLT changeElementCount(ElementCount EC) const
Return a vector or scalar with the same element type and the new element count.
Definition: LowLevelType.h:230
constexpr LLT getScalarType() const
Definition: LowLevelType.h:208
static constexpr LLT scalarOrVector(ElementCount EC, LLT ScalarTy)
Definition: LowLevelType.h:124
static constexpr LLT float32()
Get a 32-bit IEEE float value.
Definition: LowLevelType.h:89
This is an important class for using LLVM in a threaded context.
Definition: LLVMContext.h:67
void diagnose(const DiagnosticInfo &DI)
Report a message to the currently installed diagnostic handler.
LegalizeRuleSet & minScalar(unsigned TypeIdx, const LLT Ty)
Ensure the scalar is at least as wide as Ty.
LegalizeRuleSet & legalFor(std::initializer_list< LLT > Types)
The instruction is legal when type index 0 is any type in the given list.
LegalizeRuleSet & fewerElementsIf(LegalityPredicate Predicate, LegalizeMutation Mutation)
Remove elements to reach the type selected by the mutation if the predicate is true.
LegalizeRuleSet & clampScalarOrElt(unsigned TypeIdx, const LLT MinTy, const LLT MaxTy)
Limit the range of scalar sizes to MinTy and MaxTy.
LegalizeRuleSet & maxScalar(unsigned TypeIdx, const LLT Ty)
Ensure the scalar is at most as wide as Ty.
LegalizeRuleSet & clampMaxNumElements(unsigned TypeIdx, const LLT EltTy, unsigned MaxElements)
Limit the number of elements in EltTy vectors to at most MaxElements.
LegalizeRuleSet & lower()
The instruction is lowered.
LegalizeRuleSet & moreElementsIf(LegalityPredicate Predicate, LegalizeMutation Mutation)
Add more elements to reach the type selected by the mutation if the predicate is true.
LegalizeRuleSet & clampScalar(unsigned TypeIdx, const LLT MinTy, const LLT MaxTy)
Limit the range of scalar sizes to MinTy and MaxTy.
LegalizeRuleSet & custom()
Unconditionally custom lower.
LegalizeRuleSet & clampMaxNumElementsStrict(unsigned TypeIdx, const LLT EltTy, unsigned NumElts)
Express EltTy vectors strictly using vectors with NumElts elements (or scalars when NumElts equals 1)...
LegalizeRuleSet & alwaysLegal()
LegalizeRuleSet & customIf(LegalityPredicate Predicate)
LegalizeRuleSet & widenScalarToNextPow2(unsigned TypeIdx, unsigned MinSize=0)
Widen the scalar to the next power of two that is at least MinSize.
LegalizeRuleSet & scalarize(unsigned TypeIdx)
LegalizeRuleSet & legalIf(LegalityPredicate Predicate)
The instruction is legal if predicate is true.
LegalizeRuleSet & customFor(std::initializer_list< LLT > Types)
LegalizeRuleSet & widenScalarToNextMultipleOf(unsigned TypeIdx, unsigned Size)
Widen the scalar to the next multiple of Size.
LegalizeResult lowerFMinNumMaxNum(MachineInstr &MI)
void moreElementsVectorDst(MachineInstr &MI, LLT MoreTy, unsigned OpIdx)
Legalize a single operand OpIdx of the machine instruction MI as a Def by performing it with addition...
@ Legalized
Instruction has been legalized and the MachineFunction changed.
GISelChangeObserver & Observer
To keep track of changes made by the LegalizerHelper.
void bitcastDst(MachineInstr &MI, LLT CastTy, unsigned OpIdx)
Legalize a single operand OpIdx of the machine instruction MI as a def by inserting a G_BITCAST from ...
LegalizeResult lowerFMad(MachineInstr &MI)
GISelKnownBits * getKnownBits() const
MachineIRBuilder & MIRBuilder
Expose MIRBuilder so clients can set their own RecordInsertInstruction functions.
void widenScalarDst(MachineInstr &MI, LLT WideTy, unsigned OpIdx=0, unsigned TruncOpcode=TargetOpcode::G_TRUNC)
Legalize a single operand OpIdx of the machine instruction MI as a Def by extending the operand's typ...
LegalizeRuleSet & getActionDefinitionsBuilder(unsigned Opcode)
Get the action definition builder for the given opcode.
TypeSize getValue() const
Wrapper class representing physical registers. Should be passed by value.
Definition: MCRegister.h:33
void addSuccessor(MachineBasicBlock *Succ, BranchProbability Prob=BranchProbability::getUnknown())
Add Succ as a successor of this MachineBasicBlock.
MachineBasicBlock * splitAt(MachineInstr &SplitInst, bool UpdateLiveIns=true, LiveIntervals *LIS=nullptr)
Split a basic block into 2 pieces at SplitPoint.
const MachineFunction * getParent() const
Return the MachineFunction containing this basic block.
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
MachineMemOperand * getMachineMemOperand(MachinePointerInfo PtrInfo, MachineMemOperand::Flags f, LLT MemTy, Align base_alignment, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr, SyncScope::ID SSID=SyncScope::System, AtomicOrdering Ordering=AtomicOrdering::NotAtomic, AtomicOrdering FailureOrdering=AtomicOrdering::NotAtomic)
getMachineMemOperand - Allocate a new MachineMemOperand.
DenormalMode getDenormalMode(const fltSemantics &FPType) const
Returns the denormal handling type for the default rounding mode of the function.
void push_back(MachineBasicBlock *MBB)
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
Function & getFunction()
Return the LLVM function that this machine code represents.
const LLVMTargetMachine & getTarget() const
getTarget - Return the target machine this machine code is compiled with
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do...
MachineBasicBlock * CreateMachineBasicBlock(const BasicBlock *BB=nullptr, std::optional< UniqueBBID > BBID=std::nullopt)
CreateMachineBasicBlock - Allocate a new MachineBasicBlock.
Helper class to build MachineInstr.
MachineFunction & getMF()
Getter for the function we currently build.
Register getReg(unsigned Idx) const
Get the register for the operand index.
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
const MachineInstrBuilder & addGlobalAddress(const GlobalValue *GV, int64_t Offset=0, unsigned TargetFlags=0) const
const MachineInstrBuilder & addMBB(MachineBasicBlock *MBB, unsigned TargetFlags=0) const
Representation of each machine instruction.
Definition: MachineInstr.h:69
unsigned getOpcode() const
Returns the opcode of this MachineInstr.
Definition: MachineInstr.h:569
const MachineBasicBlock * getParent() const
Definition: MachineInstr.h:346
const MachineOperand & getOperand(unsigned i) const
Definition: MachineInstr.h:579
A description of a memory reference used in the backend.
LocationSize getSize() const
Return the size in bytes of the memory reference.
LLT getMemoryType() const
Return the memory type of the memory reference.
@ MODereferenceable
The memory access is dereferenceable (i.e., doesn't trap).
@ MOLoad
The memory access reads data.
@ MOInvariant
The memory access always returns the same value (or traps).
Align getAlign() const
Return the minimum known alignment in bytes of the actual memory reference.
MachineOperand class - Representation of each machine instruction operand.
MachineBasicBlock * getMBB() const
void setReg(Register Reg)
Change the register this operand corresponds to.
void setMBB(MachineBasicBlock *MBB)
static MachineOperand CreateImm(int64_t Val)
Register getReg() const
getReg - Returns the register number.
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
MutableArrayRef - Represent a mutable reference to an array (0 or more elements consecutively in memo...
Definition: ArrayRef.h:307
MutableArrayRef< T > drop_front(size_t N=1) const
Drop the first N elements of the array.
Definition: ArrayRef.h:387
MutableArrayRef< T > take_front(size_t N=1) const
Return a copy of *this with only the first N elements.
Definition: ArrayRef.h:412
Wrapper class representing virtual and physical registers.
Definition: Register.h:19
constexpr bool isValid() const
Definition: Register.h:116
constexpr bool isVirtual() const
Return true if the specified register number is in the virtual register namespace.
Definition: Register.h:91
static constexpr bool isPhysicalRegister(unsigned Reg)
Return true if the specified register number is in the physical register namespace.
Definition: Register.h:65
static unsigned getMaxMUBUFImmOffset(const GCNSubtarget &ST)
MachineBasicBlock * insertSimulatedTrap(MachineRegisterInfo &MRI, MachineBasicBlock &MBB, MachineInstr &MI, const DebugLoc &DL) const
Build instructions that simulate the behavior of a s_trap 2 instructions for hardware (namely,...
This class keeps track of the SPI_SP_INPUT_ADDR config register, which tells the hardware which inter...
SIModeRegisterDefaults getMode() const
std::tuple< const ArgDescriptor *, const TargetRegisterClass *, LLT > getPreloadedValue(AMDGPUFunctionArgInfo::PreloadedValue Value) const
static LLVM_READONLY const TargetRegisterClass * getSGPRClassForBitWidth(unsigned BitWidth)
bool allowsMisalignedMemoryAccessesImpl(unsigned Size, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *IsFast=nullptr) const
bool shouldEmitFixup(const GlobalValue *GV) const
bool shouldUseLDSConstAddress(const GlobalValue *GV) const
bool shouldEmitPCReloc(const GlobalValue *GV) const
size_t size() const
Definition: SmallVector.h:91
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
Definition: SmallVector.h:586
void truncate(size_type N)
Like resize, but requires that N is less than size().
Definition: SmallVector.h:657
void resize(size_type N)
Definition: SmallVector.h:651
void push_back(const T &Elt)
Definition: SmallVector.h:426
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1209
int64_t getImm() const
Register getReg() const
Register getStackPointerRegisterToSaveRestore() const
If a physical register, this specifies the register that llvm.savestack/llvm.restorestack should save and restore.
TargetOptions Options
unsigned UnsafeFPMath
UnsafeFPMath - This flag is enabled when the -enable-unsafe-fp-math flag is specified on the command line.
The instances of the Type class are immutable: once they are created, they are never changed.
Definition: Type.h:45
A Use represents the edge between a Value definition and its users.
Definition: Use.h:43
StringRef getName() const
Return a constant reference to the value's name.
Definition: Value.cpp:309
constexpr ScalarTy getFixedValue() const
Definition: TypeSize.h:202
self_iterator getIterator()
Definition: ilist_node.h:132
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
@ CONSTANT_ADDRESS_32BIT
Address space for 32-bit constant memory.
@ BUFFER_STRIDED_POINTER
Address space for 192-bit fat buffer pointers with an additional index.
@ REGION_ADDRESS
Address space for region memory. (GDS)
@ LOCAL_ADDRESS
Address space for local memory.
@ CONSTANT_ADDRESS
Address space for constant memory (VTX2).
@ FLAT_ADDRESS
Address space for flat memory.
@ GLOBAL_ADDRESS
Address space for global memory (RAT0, VTX0).
@ BUFFER_FAT_POINTER
Address space for 160-bit buffer fat pointers.
@ PRIVATE_ADDRESS
Address space for private memory.
@ BUFFER_RESOURCE
Address space for 128-bit buffer resources.
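For orientation, these enumerators are typically combined with LLT::pointer to form pointer types; the 64-/32-bit widths below follow the usual AMDGPU convention and are shown only as an illustration, not as the definitions used in this file:
const LLT GlobalPtr = LLT::pointer(AMDGPUAS::GLOBAL_ADDRESS, 64);
const LLT LocalPtr  = LLT::pointer(AMDGPUAS::LOCAL_ADDRESS, 32);
const LLT FlatPtr   = LLT::pointer(AMDGPUAS::FLAT_ADDRESS, 64);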
int getMIMGOpcode(unsigned BaseOpcode, unsigned MIMGEncoding, unsigned VDataDwords, unsigned VAddrDwords)
bool isFlatGlobalAddrSpace(unsigned AS)
Definition: AMDGPU.h:458
bool isGFX12Plus(const MCSubtargetInfo &STI)
bool isEntryFunctionCC(CallingConv::ID CC)
LLVM_READNONE bool isKernel(CallingConv::ID CC)
bool isGFX11(const MCSubtargetInfo &STI)
bool isCompute(CallingConv::ID cc)
unsigned getAMDHSACodeObjectVersion(const Module &M)
bool isGFX11Plus(const MCSubtargetInfo &STI)
LLVM_READONLY const MIMGBaseOpcodeInfo * getMIMGBaseOpcodeInfo(unsigned BaseOpcode)
const ImageDimIntrinsicInfo * getImageDimIntrinsicInfo(unsigned Intr)
std::pair< Register, unsigned > getBaseWithConstantOffset(MachineRegisterInfo &MRI, Register Reg, GISelKnownBits *KnownBits=nullptr, bool CheckNUW=false)
Returns base register and constant offset.
@ MaxID
The highest possible ID. Must be some 2^k - 1.
Definition: CallingConv.h:274
@ AMDGPU_Gfx
Used for AMD graphics targets.
Definition: CallingConv.h:232
@ Fast
Attempts to make calls as fast as possible (e.g. by passing things in registers).
Definition: CallingConv.h:41
@ C
The default llvm calling convention, compatible with C.
Definition: CallingConv.h:34
LegalityPredicate scalarOrEltWiderThan(unsigned TypeIdx, unsigned Size)
True iff the specified type index is a scalar or a vector with an element type that's wider than the given size.
LegalityPredicate isScalar(unsigned TypeIdx)
True iff the specified type index is a scalar.
LegalityPredicate isPointer(unsigned TypeIdx)
True iff the specified type index is a pointer (with any address space).
LegalityPredicate typeInSet(unsigned TypeIdx, std::initializer_list< LLT > TypesInit)
True iff the given type index is one of the specified types.
LegalityPredicate smallerThan(unsigned TypeIdx0, unsigned TypeIdx1)
True iff the first type index has a smaller total bit size than the second type index.
LegalityPredicate largerThan(unsigned TypeIdx0, unsigned TypeIdx1)
True iff the first type index has a larger total bit size than the second type index.
LegalityPredicate elementTypeIs(unsigned TypeIdx, LLT EltTy)
True if the type index is a vector with element type EltTy.
LegalityPredicate sameSize(unsigned TypeIdx0, unsigned TypeIdx1)
True iff the specified type indices are both the same bit size.
LegalityPredicate scalarOrEltNarrowerThan(unsigned TypeIdx, unsigned Size)
True iff the specified type index is a scalar or vector with an element type that's narrower than the given size.
LegalityPredicate sizeIs(unsigned TypeIdx, unsigned Size)
True if the total bitwidth of the specified type index is Size bits.
LegalityPredicate typeIsNot(unsigned TypeIdx, LLT Type)
True iff the given type index is not the specified type.
Predicate all(Predicate P0, Predicate P1)
True iff P0 and P1 are true.
LegalityPredicate typeIs(unsigned TypeIdx, LLT TypesInit)
True iff the given type index is the specified type.
LegalityPredicate scalarNarrowerThan(unsigned TypeIdx, unsigned Size)
True iff the specified type index is a scalar that's narrower than the given size.
LegalizeMutation scalarize(unsigned TypeIdx)
Break up the vector type for the given type index into the element type.
LegalizeMutation widenScalarOrEltToNextPow2(unsigned TypeIdx, unsigned Min=0)
Widen the scalar type or vector element type for the given type index to the next power of 2.
LegalizeMutation changeTo(unsigned TypeIdx, LLT Ty)
Select this specific type for the given type index.
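A hedged sketch of how these predicates and mutations compose inside a LegalizerInfo subclass constructor; the opcode and type choices are arbitrary examples, not the rules this file installs:
const LLT S32 = LLT::scalar(32);
const LLT V2S16 = LLT::fixed_vector(2, 16);
getActionDefinitionsBuilder(TargetOpcode::G_AND)
    .legalFor({S32, V2S16})
    // Widen any scalar narrower than 32 bits to s32.
    .widenScalarIf(LegalityPredicates::scalarNarrowerThan(0, 32),
                   LegalizeMutations::changeTo(0, S32))
    // Split 16-bit element vectors down to at most two elements.
    .clampMaxNumElements(0, LLT::scalar(16), 2);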
@ Implicit
Not emitted register (e.g. carry, or temporary result).
@ ReallyHidden
Definition: CommandLine.h:138
initializer< Ty > init(const Ty &Val)
Definition: CommandLine.h:443
constexpr double inv_pi
Definition: MathExtras.h:54
constexpr double ln2
Definition: MathExtras.h:49
constexpr double ln10
Definition: MathExtras.h:50
constexpr float log2ef
Definition: MathExtras.h:66
constexpr double log2e
Definition: MathExtras.h:51
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:18
Register getFunctionLiveInPhysReg(MachineFunction &MF, const TargetInstrInfo &TII, MCRegister PhysReg, const TargetRegisterClass &RC, const DebugLoc &DL, LLT RegTy=LLT())
Return a virtual register corresponding to the incoming argument register PhysReg.
Definition: Utils.cpp:910
unsigned Log2_32_Ceil(uint32_t Value)
Return the ceil log base 2 of the specified value, 32 if the value is zero.
Definition: MathExtras.h:353
@ Offset
Definition: DWP.cpp:480
Type * getTypeForLLT(LLT Ty, LLVMContext &C)
Get the type back from LLT.
Definition: Utils.cpp:1981
MachineInstr * getOpcodeDef(unsigned Opcode, Register Reg, const MachineRegisterInfo &MRI)
See if Reg is defined by a single def instruction that is Opcode.
Definition: Utils.cpp:646
int popcount(T Value) noexcept
Count the number of set bits in a value.
Definition: bit.h:385
const ConstantFP * getConstantFPVRegVal(Register VReg, const MachineRegisterInfo &MRI)
Definition: Utils.cpp:459
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
detail::scope_exit< std::decay_t< Callable > > make_scope_exit(Callable &&F)
Definition: ScopeExit.h:59
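A minimal sketch of make_scope_exit for observer bookkeeping; Observer (a GISelChangeObserver) and MI are assumed to be in scope, and llvm/ADT/ScopeExit.h is required:
Observer.changingInstr(MI);
auto NotifyDone = make_scope_exit([&]() { Observer.changedInstr(MI); });
// ... rewrite MI's operands; changedInstr fires on every exit path ...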
const llvm::fltSemantics & getFltSemanticForLLT(LLT Ty)
Get the appropriate floating point arithmetic semantic based on the bit size of the given scalar LLT.
void * PointerTy
Definition: GenericValue.h:21
int bit_width(T Value)
Returns the number of bits needed to represent Value if Value is nonzero.
Definition: bit.h:317
uint64_t PowerOf2Ceil(uint64_t A)
Returns the power of two which is greater than or equal to the given value.
Definition: MathExtras.h:394
std::optional< int64_t > getIConstantVRegSExtVal(Register VReg, const MachineRegisterInfo &MRI)
If VReg is defined by a G_CONSTANT that fits in int64_t, returns it.
Definition: Utils.cpp:314
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
Definition: MathExtras.h:291
std::function< std::pair< unsigned, LLT >(const LegalityQuery &)> LegalizeMutation
constexpr T divideCeil(U Numerator, V Denominator)
Returns the integer ceil(Numerator / Denominator).
Definition: MathExtras.h:403
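Worked values for the bit-math helpers above (requires llvm/Support/MathExtras.h; variable names are illustrative):
unsigned CeilLog = Log2_32_Ceil(24);    // 5, since 32 is the next power of two
uint64_t RoundUp = PowerOf2Ceil(24);    // 32
bool     IsPow2  = isPowerOf2_32(24);   // false
uint64_t Chunks  = divideCeil(24u, 7u); // 4 == ceil(24 / 7)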
@ Mul
Product of integers.
@ FMul
Product of floats.
@ Add
Sum of integers.
@ TowardZero
roundTowardZero.
@ NearestTiesToEven
roundTiesToEven.
@ TowardPositive
roundTowardPositive.
@ TowardNegative
roundTowardNegative.
void eraseInstr(MachineInstr &MI, MachineRegisterInfo &MRI, LostDebugLocObserver *LocObserver=nullptr)
Definition: Utils.cpp:1673
@ DS_Warning
std::optional< ValueAndVReg > getIConstantVRegValWithLookThrough(Register VReg, const MachineRegisterInfo &MRI, bool LookThroughInstrs=true)
If VReg is defined by a statically evaluable chain of instructions rooted on a G_CONSTANT, returns its APInt value and def register.
Definition: Utils.cpp:433
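A sketch of how these two constant-lookup helpers are commonly combined when folding immediates; the wrapper name is invented, and llvm/CodeGen/GlobalISel/Utils.h plus <optional> are assumed:
static std::optional<int64_t>
lookThroughForImm(Register Reg, const MachineRegisterInfo &MRI) {
  // Fast path: Reg is directly defined by a G_CONSTANT.
  if (std::optional<int64_t> Imm = getIConstantVRegSExtVal(Reg, MRI))
    return Imm;
  // Otherwise look through copies/extensions to find the constant.
  if (auto C = getIConstantVRegValWithLookThrough(Reg, MRI))
    return C->Value.getSExtValue();
  return std::nullopt;
}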
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Definition: STLExtras.h:1886
Align commonAlignment(Align A, uint64_t Offset)
Returns the alignment that satisfies both alignments.
Definition: Alignment.h:212
unsigned Log2(Align A)
Returns the log2 of the alignment.
Definition: Alignment.h:208
T bit_floor(T Value)
Returns the largest integral power of two no greater than Value if Value is nonzero.
Definition: bit.h:327
constexpr uint64_t NextPowerOf2(uint64_t A)
Returns the next power of two (in 64-bits) that is strictly greater than A.
Definition: MathExtras.h:382
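Worked values for the alignment and power-of-two helpers above (requires llvm/Support/Alignment.h and llvm/ADT/bit.h; variable names are illustrative):
Align Common   = commonAlignment(Align(16), /*Offset=*/4); // Align(4)
unsigned Shift = Log2(Align(16));                          // 4
unsigned Floor = bit_floor(24u);                           // 16
uint64_t Next  = NextPowerOf2(16);                         // 32 (strictly greater)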
@ Enable
Enable colors.
std::function< bool(const LegalityQuery &)> LegalityPredicate
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition: BitVector.h:860
#define N
static constexpr uint64_t encode(Fields... Values)
static const fltSemantics & IEEEsingle() LLVM_READNONE
Definition: APFloat.cpp:281
static const fltSemantics & IEEEdouble() LLVM_READNONE
Definition: APFloat.cpp:282
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition: Alignment.h:39
uint64_t value() const
This is a hole in the type system and should not be abused.
Definition: Alignment.h:85
MCRegister getRegister() const
static ArgDescriptor createRegister(Register Reg, unsigned Mask=~0u)
DenormalModeKind Input
Denormal treatment kind for floating point instruction inputs in the default floating-point environment.
@ PreserveSign
The sign of a flushed-to-zero number is preserved in the sign of 0.
@ Dynamic
Denormals have unknown treatment.
static constexpr DenormalMode getPreserveSign()
static constexpr DenormalMode getIEEE()
bool isZero() const
Returns true if value is all zero.
Definition: KnownBits.h:76
The LegalityQuery object bundles together all the information that's needed to decide whether a given operation is legal or not.
ArrayRef< MemDesc > MMODescrs
Operations which require memory can use this to place requirements on the memory type for each MMO.
ArrayRef< LLT > Types
Matching combinators.
This class contains a discriminated union of information about pointers in memory operands,...
static MachinePointerInfo getGOT(MachineFunction &MF)
Return a MachinePointerInfo record that refers to a GOT entry.
DenormalMode FP64FP16Denormals
If this is set, neither input nor output denormals are flushed for both f64 and f16/v2f16 instructions.
bool IEEE
Floating point opcodes that support exception flag gathering quiet and propagate signaling NaN inputs per IEEE 754-2008.
DenormalMode FP32Denormals
If this is set, neither input nor output denormals are flushed for most f32 instructions.
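A hedged sketch of querying this mode state during legalization; MFI is an assumed SIMachineFunctionInfo pointer, and the derived booleans are named only for the example:
const SIModeRegisterDefaults Mode = MFI->getMode();
// Denormals flushed to zero with the sign preserved for f32?
const bool FlushesF32Denorms =
    Mode.FP32Denormals == DenormalMode::getPreserveSign();
// IEEE-compliant NaN quieting/propagation enabled?
const bool StrictNaNPropagation = Mode.IEEE;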