AMDGPULegalizerInfo.cpp
1//===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8/// \file
9/// This file implements the targeting of the MachineLegalizer class for
10/// AMDGPU.
11/// \todo This should be generated by TableGen.
12//===----------------------------------------------------------------------===//
13
14#include "AMDGPULegalizerInfo.h"
15
16#include "AMDGPU.h"
18#include "AMDGPUInstrInfo.h"
19#include "AMDGPUTargetMachine.h"
21#include "SIInstrInfo.h"
23#include "SIRegisterInfo.h"
25#include "llvm/ADT/ScopeExit.h"
34#include "llvm/IR/IntrinsicsAMDGPU.h"
35#include "llvm/IR/IntrinsicsR600.h"
36
37#define DEBUG_TYPE "amdgpu-legalinfo"
38
39using namespace llvm;
40using namespace LegalizeActions;
41using namespace LegalizeMutations;
42using namespace LegalityPredicates;
43using namespace MIPatternMatch;
44
45// Hack until load/store selection patterns support any tuple of legal types.
47 "amdgpu-global-isel-new-legality",
48 cl::desc("Use GlobalISel desired legality, rather than try to use"
49 "rules compatible with selection patterns"),
50 cl::init(false),
52
53static constexpr unsigned MaxRegisterSize = 1024;
54
55// Round the number of elements to the next power of two elements
56static LLT getPow2VectorType(LLT Ty) {
57  unsigned NElts = Ty.getNumElements();
58 unsigned Pow2NElts = 1 << Log2_32_Ceil(NElts);
59 return Ty.changeElementCount(ElementCount::getFixed(Pow2NElts));
60}
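// For example, <3 x s16> becomes <4 x s16>: Log2_32_Ceil(3) == 2 and 1 << 2 == 4.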
61
62// Round the number of bits to the next power of two bits
63static LLT getPow2ScalarType(LLT Ty) {
64  unsigned Bits = Ty.getSizeInBits();
65 unsigned Pow2Bits = 1 << Log2_32_Ceil(Bits);
66 return LLT::scalar(Pow2Bits);
67}
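// For example, s24 rounds up to s32 and s65 rounds up to s128.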
68
69/// \returns true if this is an odd sized vector which should be widened by adding an
70/// additional element. This is mostly to handle <3 x s16> -> <4 x s16>. This
71/// excludes s1 vectors, which should always be scalarized.
72static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
73 return [=](const LegalityQuery &Query) {
74 const LLT Ty = Query.Types[TypeIdx];
75 if (!Ty.isVector())
76 return false;
77
78 const LLT EltTy = Ty.getElementType();
79 const unsigned EltSize = EltTy.getSizeInBits();
80 return Ty.getNumElements() % 2 != 0 &&
81 EltSize > 1 && EltSize < 32 &&
82 Ty.getSizeInBits() % 32 != 0;
83 };
84}
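// For example, <3 x s16> (48 bits) matches: odd element count, sub-32-bit
// elements, and a total size that is not a multiple of 32. <3 x s32> does not.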
85
86static LegalityPredicate sizeIsMultipleOf32(unsigned TypeIdx) {
87 return [=](const LegalityQuery &Query) {
88 const LLT Ty = Query.Types[TypeIdx];
89 return Ty.getSizeInBits() % 32 == 0;
90 };
91}
92
93static LegalityPredicate isWideVec16(unsigned TypeIdx) {
94 return [=](const LegalityQuery &Query) {
95 const LLT Ty = Query.Types[TypeIdx];
96 const LLT EltTy = Ty.getScalarType();
97 return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2;
98 };
99}
100
101static LegalizeMutation oneMoreElement(unsigned TypeIdx) {
102 return [=](const LegalityQuery &Query) {
103 const LLT Ty = Query.Types[TypeIdx];
104 const LLT EltTy = Ty.getElementType();
105 return std::pair(TypeIdx,
106 LLT::fixed_vector(Ty.getNumElements() + 1, EltTy));
107 };
108}
109
110static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) {
111  return [=](const LegalityQuery &Query) {
112 const LLT Ty = Query.Types[TypeIdx];
113 const LLT EltTy = Ty.getElementType();
114 unsigned Size = Ty.getSizeInBits();
115 unsigned Pieces = (Size + 63) / 64;
116 unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
117 return std::pair(TypeIdx, LLT::scalarOrVector(
118 ElementCount::getFixed(NewNumElts), EltTy));
119 };
120}
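// For example, <6 x s32> (192 bits) is cut into (192 + 63) / 64 == 3 pieces,
// giving a new element count of (6 + 1) / 3 == 2, i.e. <2 x s32> parts.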
121
122// Increase the number of vector elements to reach the next multiple of 32-bit
123// type.
124static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) {
125 return [=](const LegalityQuery &Query) {
126 const LLT Ty = Query.Types[TypeIdx];
127
128 const LLT EltTy = Ty.getElementType();
129 const int Size = Ty.getSizeInBits();
130 const int EltSize = EltTy.getSizeInBits();
131 const int NextMul32 = (Size + 31) / 32;
132
133 assert(EltSize < 32);
134
135 const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize;
136 return std::pair(TypeIdx, LLT::fixed_vector(NewNumElts, EltTy));
137 };
138}
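// For example, <3 x s8> (24 bits) pads up to the next 32-bit boundary:
// (32 * 1 + 8 - 1) / 8 == 4 elements, i.e. <4 x s8>.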
139
140// Increase the number of vector elements to reach the next legal RegClass.
141static LegalizeMutation moreElementsToNextExistingRegClass(unsigned TypeIdx) {
142  return [=](const LegalityQuery &Query) {
143 const LLT Ty = Query.Types[TypeIdx];
144 const unsigned NumElts = Ty.getNumElements();
145 const unsigned EltSize = Ty.getElementType().getSizeInBits();
146 const unsigned MaxNumElts = MaxRegisterSize / EltSize;
147
148 assert(EltSize == 32 || EltSize == 64);
150
151 unsigned NewNumElts;
152 // Find the nearest legal RegClass that is larger than the current type.
153 for (NewNumElts = NumElts; NewNumElts < MaxNumElts; ++NewNumElts) {
154 if (SIRegisterInfo::getSGPRClassForBitWidth(NewNumElts * EltSize))
155 break;
156 }
157
158 return std::pair(TypeIdx, LLT::fixed_vector(NewNumElts, EltSize));
159 };
160}
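// For example, if a vector's total width (NumElts * EltSize) has no matching
// SGPR register class, the element count is bumped until one exists; a
// 416-bit <13 x s32> would grow to the next width that does have a class.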
161
162static LLT getBufferRsrcScalarType(const LLT Ty) {
163  if (!Ty.isVector())
164 return LLT::scalar(128);
165 const ElementCount NumElems = Ty.getElementCount();
166 return LLT::vector(NumElems, LLT::scalar(128));
167}
168
169static LLT getBufferRsrcRegisterType(const LLT Ty) {
170  if (!Ty.isVector())
171 return LLT::fixed_vector(4, LLT::scalar(32));
172 const unsigned NumElems = Ty.getElementCount().getFixedValue();
173 return LLT::fixed_vector(NumElems * 4, LLT::scalar(32));
174}
175
176static LLT getBitcastRegisterType(const LLT Ty) {
177  const unsigned Size = Ty.getSizeInBits();
178
179 if (Size <= 32) {
180 // <2 x s8> -> s16
181 // <4 x s8> -> s32
182 return LLT::scalar(Size);
183 }
184
185  return LLT::scalarOrVector(ElementCount::getFixed(Size / 32), 32);
186}
187
188static LegalizeMutation bitcastToRegisterType(unsigned TypeIdx) {
189 return [=](const LegalityQuery &Query) {
190 const LLT Ty = Query.Types[TypeIdx];
191 return std::pair(TypeIdx, getBitcastRegisterType(Ty));
192 };
193}
194
195static LegalizeMutation bitcastToVectorElement32(unsigned TypeIdx) {
196  return [=](const LegalityQuery &Query) {
197 const LLT Ty = Query.Types[TypeIdx];
198 unsigned Size = Ty.getSizeInBits();
199 assert(Size % 32 == 0);
200    return std::pair(
201        TypeIdx, LLT::scalarOrVector(ElementCount::getFixed(Size / 32), 32));
202  };
203}
204
205static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) {
206 return [=](const LegalityQuery &Query) {
207 const LLT QueryTy = Query.Types[TypeIdx];
208 return QueryTy.isVector() && QueryTy.getSizeInBits() < Size;
209 };
210}
211
212static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) {
213 return [=](const LegalityQuery &Query) {
214 const LLT QueryTy = Query.Types[TypeIdx];
215 return QueryTy.isVector() && QueryTy.getSizeInBits() > Size;
216 };
217}
218
219static LegalityPredicate numElementsNotEven(unsigned TypeIdx) {
220 return [=](const LegalityQuery &Query) {
221 const LLT QueryTy = Query.Types[TypeIdx];
222 return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0;
223 };
224}
225
226static bool isRegisterSize(unsigned Size) {
227 return Size % 32 == 0 && Size <= MaxRegisterSize;
228}
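// For example, 32, 64, 96, ..., 1024 bits all qualify; 48 bits (not a multiple
// of 32) and 2048 bits (above MaxRegisterSize) do not.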
229
230static bool isRegisterVectorElementType(LLT EltTy) {
231  const int EltSize = EltTy.getSizeInBits();
232 return EltSize == 16 || EltSize % 32 == 0;
233}
234
235static bool isRegisterVectorType(LLT Ty) {
236 const int EltSize = Ty.getElementType().getSizeInBits();
237 return EltSize == 32 || EltSize == 64 ||
238 (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
239 EltSize == 128 || EltSize == 256;
240}
241
242// TODO: replace all uses of isRegisterType with isRegisterClassType
243static bool isRegisterType(LLT Ty) {
244 if (!isRegisterSize(Ty.getSizeInBits()))
245 return false;
246
247 if (Ty.isVector())
248 return isRegisterVectorType(Ty);
249
250 return true;
251}
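// For example, s96, <2 x s16>, <4 x s32> and <2 x s64> are register types,
// while <3 x s16> (48 bits) and <5 x s16> (80 bits) fail isRegisterSize.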
252
253// Any combination of 32 or 64-bit elements up to the maximum register size, and
254// multiples of v2s16.
255static LegalityPredicate isRegisterType(unsigned TypeIdx) {
256 return [=](const LegalityQuery &Query) {
257 return isRegisterType(Query.Types[TypeIdx]);
258 };
259}
260
261// RegisterType that doesn't have a corresponding RegClass.
262// TODO: Once `isRegisterType` is replaced with `isRegisterClassType` this
263// should be removed.
264static LegalityPredicate isIllegalRegisterType(unsigned TypeIdx) {
265 return [=](const LegalityQuery &Query) {
266 LLT Ty = Query.Types[TypeIdx];
267    return isRegisterType(Ty) &&
268           !SIRegisterInfo::getSGPRClassForBitWidth(Ty.getSizeInBits());
269  };
270}
271
272static LegalityPredicate elementTypeIsLegal(unsigned TypeIdx) {
273 return [=](const LegalityQuery &Query) {
274 const LLT QueryTy = Query.Types[TypeIdx];
275 if (!QueryTy.isVector())
276 return false;
277 const LLT EltTy = QueryTy.getElementType();
278 return EltTy == LLT::scalar(16) || EltTy.getSizeInBits() >= 32;
279 };
280}
281
282static const LLT S1 = LLT::scalar(1);
283static const LLT S8 = LLT::scalar(8);
284static const LLT S16 = LLT::scalar(16);
285static const LLT S32 = LLT::scalar(32);
286static const LLT F32 = LLT::float32();
287static const LLT S64 = LLT::scalar(64);
288static const LLT F64 = LLT::float64();
289static const LLT S96 = LLT::scalar(96);
290static const LLT S128 = LLT::scalar(128);
291static const LLT S160 = LLT::scalar(160);
292static const LLT S224 = LLT::scalar(224);
293static const LLT S256 = LLT::scalar(256);
294static const LLT S512 = LLT::scalar(512);
295static const LLT S1024 = LLT::scalar(1024);
296
297static const LLT V2S8 = LLT::fixed_vector(2, 8);
298static const LLT V2S16 = LLT::fixed_vector(2, 16);
299static const LLT V4S16 = LLT::fixed_vector(4, 16);
300static const LLT V6S16 = LLT::fixed_vector(6, 16);
301static const LLT V8S16 = LLT::fixed_vector(8, 16);
302static const LLT V10S16 = LLT::fixed_vector(10, 16);
303static const LLT V12S16 = LLT::fixed_vector(12, 16);
304static const LLT V16S16 = LLT::fixed_vector(16, 16);
305
306static const LLT V2F16 = LLT::fixed_vector(2, LLT::float16());
307static const LLT V2BF16 = V2F16; // FIXME
308
309static const LLT V2S32 = LLT::fixed_vector(2, 32);
310static const LLT V3S32 = LLT::fixed_vector(3, 32);
311static const LLT V4S32 = LLT::fixed_vector(4, 32);
312static const LLT V5S32 = LLT::fixed_vector(5, 32);
313static const LLT V6S32 = LLT::fixed_vector(6, 32);
314static const LLT V7S32 = LLT::fixed_vector(7, 32);
315static const LLT V8S32 = LLT::fixed_vector(8, 32);
316static const LLT V9S32 = LLT::fixed_vector(9, 32);
317static const LLT V10S32 = LLT::fixed_vector(10, 32);
318static const LLT V11S32 = LLT::fixed_vector(11, 32);
319static const LLT V12S32 = LLT::fixed_vector(12, 32);
320static const LLT V16S32 = LLT::fixed_vector(16, 32);
321static const LLT V32S32 = LLT::fixed_vector(32, 32);
322
323static const LLT V2S64 = LLT::fixed_vector(2, 64);
324static const LLT V3S64 = LLT::fixed_vector(3, 64);
325static const LLT V4S64 = LLT::fixed_vector(4, 64);
326static const LLT V5S64 = LLT::fixed_vector(5, 64);
327static const LLT V6S64 = LLT::fixed_vector(6, 64);
328static const LLT V7S64 = LLT::fixed_vector(7, 64);
329static const LLT V8S64 = LLT::fixed_vector(8, 64);
330static const LLT V16S64 = LLT::fixed_vector(16, 64);
331
332static const LLT V2S128 = LLT::fixed_vector(2, 128);
333static const LLT V4S128 = LLT::fixed_vector(4, 128);
334
335static std::initializer_list<LLT> AllScalarTypes = {S32, S64, S96, S128,
336 S160, S224, S256, S512};
337
338static std::initializer_list<LLT> AllS16Vectors{
339    V2S16, V4S16, V6S16, V8S16, V10S16, V12S16, V16S16, V2S128, V4S128};
340
341static std::initializer_list<LLT> AllS32Vectors = {
342    V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32,
343    V9S32, V10S32, V11S32, V12S32, V16S32, V32S32};
344
345static std::initializer_list<LLT> AllS64Vectors = {V2S64, V3S64, V4S64, V5S64,
346                                                   V6S64, V7S64, V8S64, V16S64};
347
348// Checks whether a type is in the list of legal register types.
349static bool isRegisterClassType(LLT Ty) {
350  if (Ty.isPointerOrPointerVector())
351    Ty = Ty.changeElementType(LLT::scalar(Ty.getScalarSizeInBits()));
352
353  return is_contained(AllS32Vectors, Ty) || is_contained(AllS64Vectors, Ty) ||
354         is_contained(AllScalarTypes, Ty) || is_contained(AllS16Vectors, Ty);
355}
356
357static LegalityPredicate isRegisterClassType(unsigned TypeIdx) {
358 return [TypeIdx](const LegalityQuery &Query) {
359 return isRegisterClassType(Query.Types[TypeIdx]);
360 };
361}
362
363// If we have a truncating store or an extending load with a data size larger
364// than 32-bits, we need to reduce to a 32-bit type.
365static LegalityPredicate isWideScalarExtLoadTruncStore(unsigned TypeIdx) {
366  return [=](const LegalityQuery &Query) {
367 const LLT Ty = Query.Types[TypeIdx];
368 return !Ty.isVector() && Ty.getSizeInBits() > 32 &&
369 Query.MMODescrs[0].MemoryTy.getSizeInBits() < Ty.getSizeInBits();
370 };
371}
372
373// TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we
374// handle some operations by just promoting the register during
375// selection. There are also d16 loads on GFX9+ which preserve the high bits.
376static unsigned maxSizeForAddrSpace(const GCNSubtarget &ST, unsigned AS,
377 bool IsLoad, bool IsAtomic) {
378 switch (AS) {
379  case AMDGPUAS::PRIVATE_ADDRESS:
380    // FIXME: Private element size.
381    return ST.enableFlatScratch() ? 128 : 32;
382  case AMDGPUAS::LOCAL_ADDRESS:
383    return ST.useDS128() ? 128 : 64;
384  case AMDGPUAS::GLOBAL_ADDRESS:
385  case AMDGPUAS::CONSTANT_ADDRESS:
386  case AMDGPUAS::CONSTANT_ADDRESS_32BIT:
387  case AMDGPUAS::BUFFER_RESOURCE:
388 // Treat constant and global as identical. SMRD loads are sometimes usable for
389 // global loads (ideally constant address space should be eliminated)
390 // depending on the context. Legality cannot be context dependent, but
391 // RegBankSelect can split the load as necessary depending on the pointer
392 // register bank/uniformity and if the memory is invariant or not written in a
393 // kernel.
394 return IsLoad ? 512 : 128;
395 default:
396 // FIXME: Flat addresses may contextually need to be split to 32-bit parts
397 // if they may alias scratch depending on the subtarget. This needs to be
398 // moved to custom handling to use addressMayBeAccessedAsPrivate
399 return ST.hasMultiDwordFlatScratchAddressing() || IsAtomic ? 128 : 32;
400 }
401}
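// For example, global and constant loads may be up to 512 bits wide, while
// private (scratch) accesses are capped at 128 bits with flat scratch enabled
// (32 bits otherwise), so wider private accesses are split by the callers.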
402
403static bool isLoadStoreSizeLegal(const GCNSubtarget &ST,
404 const LegalityQuery &Query) {
405 const LLT Ty = Query.Types[0];
406
407 // Handle G_LOAD, G_ZEXTLOAD, G_SEXTLOAD
408 const bool IsLoad = Query.Opcode != AMDGPU::G_STORE;
409
410 unsigned RegSize = Ty.getSizeInBits();
411 uint64_t MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
412 uint64_t AlignBits = Query.MMODescrs[0].AlignInBits;
413 unsigned AS = Query.Types[1].getAddressSpace();
414
415 // All of these need to be custom lowered to cast the pointer operand.
416  if (AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT)
417    return false;
418
419 // Do not handle extending vector loads.
420 if (Ty.isVector() && MemSize != RegSize)
421 return false;
422
423 // TODO: We should be able to widen loads if the alignment is high enough, but
424 // we also need to modify the memory access size.
425#if 0
426 // Accept widening loads based on alignment.
427 if (IsLoad && MemSize < Size)
428 MemSize = std::max(MemSize, Align);
429#endif
430
431 // Only 1-byte and 2-byte to 32-bit extloads are valid.
432 if (MemSize != RegSize && RegSize != 32)
433 return false;
434
435 if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad,
436 Query.MMODescrs[0].Ordering !=
437 AtomicOrdering::NotAtomic))
438 return false;
439
440 switch (MemSize) {
441 case 8:
442 case 16:
443 case 32:
444 case 64:
445 case 128:
446 break;
447 case 96:
448 if (!ST.hasDwordx3LoadStores())
449 return false;
450 break;
451 case 256:
452 case 512:
453 // These may contextually need to be broken down.
454 break;
455 default:
456 return false;
457 }
458
459 assert(RegSize >= MemSize);
460
461 if (AlignBits < MemSize) {
462 const SITargetLowering *TLI = ST.getTargetLowering();
463 if (!TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS,
464 Align(AlignBits / 8)))
465 return false;
466 }
467
468 return true;
469}
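// For example, a naturally aligned extending load of s8 or s16 into an s32
// register is accepted, while an extending load into s64 is rejected because
// MemSize != RegSize and RegSize != 32.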
470
471// The newer buffer intrinsic forms take their resource arguments as
472// pointers in address space 8, aka s128 values. However, in order to not break
473// SelectionDAG, the underlying operations have to continue to take v4i32
474// arguments. Therefore, we convert resource pointers - or vectors of them -
475// to integer values here.
476static bool hasBufferRsrcWorkaround(const LLT Ty) {
477  if (Ty.isPointer() && Ty.getAddressSpace() == AMDGPUAS::BUFFER_RESOURCE)
478    return true;
479 if (Ty.isVector()) {
480 const LLT ElemTy = Ty.getElementType();
481 return hasBufferRsrcWorkaround(ElemTy);
482 }
483 return false;
484}
485
486// The current selector can't handle <6 x s16>, <8 x s16>, s96, s128 etc., so we
487// work around this. Eventually it should ignore the type for loads and only care
488// about the size. Return true in cases where we will workaround this for now by
489// bitcasting.
490static bool loadStoreBitcastWorkaround(const LLT Ty) {
491  if (EnableNewLegality)
492    return false;
493
494 const unsigned Size = Ty.getSizeInBits();
495 if (Size <= 64)
496 return false;
497 // Address space 8 pointers get their own workaround.
498  if (hasBufferRsrcWorkaround(Ty))
499    return false;
500 if (!Ty.isVector())
501 return true;
502
503 if (Ty.isPointerVector())
504 return true;
505
506 unsigned EltSize = Ty.getScalarSizeInBits();
507 return EltSize != 32 && EltSize != 64;
508}
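// For example, <6 x s16> (96 bits, 16-bit elements) is flagged here, so the
// load or store is instead bitcast to <3 x s32> (see getBitcastRegisterType).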
509
510static bool isLoadStoreLegal(const GCNSubtarget &ST, const LegalityQuery &Query) {
511 const LLT Ty = Query.Types[0];
512  return isRegisterType(Ty) && isLoadStoreSizeLegal(ST, Query) &&
513         !hasBufferRsrcWorkaround(Ty) && !loadStoreBitcastWorkaround(Ty);
514}
515
516/// Return true if a load or store of the type should be lowered with a bitcast
517/// to a different type.
518static bool shouldBitcastLoadStoreType(const GCNSubtarget &ST, const LLT Ty,
519 const LLT MemTy) {
520 const unsigned MemSizeInBits = MemTy.getSizeInBits();
521 const unsigned Size = Ty.getSizeInBits();
522 if (Size != MemSizeInBits)
523 return Size <= 32 && Ty.isVector();
524
525  if (loadStoreBitcastWorkaround(Ty) && isRegisterType(Ty))
526    return true;
527
528 // Don't try to handle bitcasting vector ext loads for now.
529 return Ty.isVector() && (!MemTy.isVector() || MemTy == Ty) &&
530 (Size <= 32 || isRegisterSize(Size)) &&
531         !isRegisterVectorElementType(Ty.getScalarType());
532}
533
534/// Return true if we should legalize a load by widening an odd sized memory
535/// access up to the alignment. Note that in this case the memory access itself
536/// changes, not the size of the result register.
537static bool shouldWidenLoad(const GCNSubtarget &ST, LLT MemoryTy,
538 uint64_t AlignInBits, unsigned AddrSpace,
539 unsigned Opcode) {
540 unsigned SizeInBits = MemoryTy.getSizeInBits();
541 // We don't want to widen cases that are naturally legal.
542 if (isPowerOf2_32(SizeInBits))
543 return false;
544
545 // If we have 96-bit memory operations, we shouldn't touch them. Note we may
546 // end up widening these for a scalar load during RegBankSelect, if we don't
547 // have 96-bit scalar loads.
548 if (SizeInBits == 96 && ST.hasDwordx3LoadStores())
549 return false;
550
551 if (SizeInBits >= maxSizeForAddrSpace(ST, AddrSpace, Opcode, false))
552 return false;
553
554 // A load is known dereferenceable up to the alignment, so it's legal to widen
555 // to it.
556 //
557 // TODO: Could check dereferenceable for less aligned cases.
558 unsigned RoundedSize = NextPowerOf2(SizeInBits);
559 if (AlignInBits < RoundedSize)
560 return false;
561
562 // Do not widen if it would introduce a slow unaligned load.
563 const SITargetLowering *TLI = ST.getTargetLowering();
564 unsigned Fast = 0;
565  return TLI->allowsMisalignedMemoryAccessesImpl(
566             RoundedSize, AddrSpace, Align(AlignInBits / 8),
567             MachineMemOperand::MOLoad, &Fast) &&
568         Fast;
569}
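// For example, a 96-bit load that is 128-bit aligned may be widened to a
// 128-bit load on subtargets without dwordx3 load/store support, provided the
// wider access is reported as fast.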
570
571static bool shouldWidenLoad(const GCNSubtarget &ST, const LegalityQuery &Query,
572 unsigned Opcode) {
573 if (Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic)
574 return false;
575
576 return shouldWidenLoad(ST, Query.MMODescrs[0].MemoryTy,
577 Query.MMODescrs[0].AlignInBits,
578 Query.Types[1].getAddressSpace(), Opcode);
579}
580
581/// Mutates IR (typically a load instruction) to use a <4 x s32> as the initial
582/// type of the operand `idx` and then to transform it to a `p8` via bitcasts
583/// and inttoptr. In addition, handle vectors of p8. Returns the new type.
584static LLT castBufferRsrcFromV4I32(MachineInstr &MI, MachineIRBuilder &B,
585                                   MachineRegisterInfo &MRI, unsigned Idx) {
586 MachineOperand &MO = MI.getOperand(Idx);
587
588 const LLT PointerTy = MRI.getType(MO.getReg());
589
590 // Paranoidly prevent us from doing this multiple times.
591  if (!hasBufferRsrcWorkaround(PointerTy))
592    return PointerTy;
593
594 const LLT ScalarTy = getBufferRsrcScalarType(PointerTy);
595 const LLT VectorTy = getBufferRsrcRegisterType(PointerTy);
596 if (!PointerTy.isVector()) {
597 // Happy path: (4 x s32) -> (s32, s32, s32, s32) -> (p8)
598 const unsigned NumParts = PointerTy.getSizeInBits() / 32;
599 const LLT S32 = LLT::scalar(32);
600
601 Register VectorReg = MRI.createGenericVirtualRegister(VectorTy);
602 std::array<Register, 4> VectorElems;
603 B.setInsertPt(B.getMBB(), ++B.getInsertPt());
604 for (unsigned I = 0; I < NumParts; ++I)
605 VectorElems[I] =
606 B.buildExtractVectorElementConstant(S32, VectorReg, I).getReg(0);
607 B.buildMergeValues(MO, VectorElems);
608 MO.setReg(VectorReg);
609 return VectorTy;
610 }
611 Register BitcastReg = MRI.createGenericVirtualRegister(VectorTy);
612 B.setInsertPt(B.getMBB(), ++B.getInsertPt());
613 auto Scalar = B.buildBitcast(ScalarTy, BitcastReg);
614 B.buildIntToPtr(MO, Scalar);
615 MO.setReg(BitcastReg);
616
617 return VectorTy;
618}
619
620/// Cast a buffer resource (an address space 8 pointer) into a 4xi32, which is
621/// the form in which the value must be in order to be passed to the low-level
622/// representations used for MUBUF/MTBUF intrinsics. This is a hack, which is
623/// needed in order to account for the fact that we can't define a register
624/// class for s128 without breaking SelectionDAG.
625static Register castBufferRsrcToV4I32(Register Pointer, MachineIRBuilder &B) {
626  MachineRegisterInfo &MRI = *B.getMRI();
627 const LLT PointerTy = MRI.getType(Pointer);
628 const LLT ScalarTy = getBufferRsrcScalarType(PointerTy);
629 const LLT VectorTy = getBufferRsrcRegisterType(PointerTy);
630
631 if (!PointerTy.isVector()) {
632 // Special case: p8 -> (s32, s32, s32, s32) -> (4xs32)
633 SmallVector<Register, 4> PointerParts;
634 const unsigned NumParts = PointerTy.getSizeInBits() / 32;
635 auto Unmerged = B.buildUnmerge(LLT::scalar(32), Pointer);
636 for (unsigned I = 0; I < NumParts; ++I)
637 PointerParts.push_back(Unmerged.getReg(I));
638 return B.buildBuildVector(VectorTy, PointerParts).getReg(0);
639 }
640 Register Scalar = B.buildPtrToInt(ScalarTy, Pointer).getReg(0);
641 return B.buildBitcast(VectorTy, Scalar).getReg(0);
642}
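// For example, a scalar p8 resource is unmerged into four s32 values and
// rebuilt as <4 x s32>, while a <2 x p8> operand goes through G_PTRTOINT to
// <2 x s128> and a G_BITCAST to <8 x s32>.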
643
644static void castBufferRsrcArgToV4I32(MachineInstr &MI, MachineIRBuilder &B,
645                                     unsigned Idx) {
646 MachineOperand &MO = MI.getOperand(Idx);
647
648 const LLT PointerTy = B.getMRI()->getType(MO.getReg());
649 // Paranoidly prevent us from doing this multiple times.
650  if (!hasBufferRsrcWorkaround(PointerTy))
651    return;
652  MO.setReg(castBufferRsrcToV4I32(MO.getReg(), B));
653}
654
655AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
656                                         const GCNTargetMachine &TM)
657 : ST(ST_) {
658 using namespace TargetOpcode;
659
660 auto GetAddrSpacePtr = [&TM](unsigned AS) {
661 return LLT::pointer(AS, TM.getPointerSizeInBits(AS));
662 };
663
664 const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS);
665 const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS);
666 const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT);
667 const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS);
668 const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS);
669 const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS);
670 const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS);
671 const LLT BufferFatPtr = GetAddrSpacePtr(AMDGPUAS::BUFFER_FAT_POINTER);
672 const LLT RsrcPtr = GetAddrSpacePtr(AMDGPUAS::BUFFER_RESOURCE);
673 const LLT BufferStridedPtr =
674 GetAddrSpacePtr(AMDGPUAS::BUFFER_STRIDED_POINTER);
675
676 const LLT CodePtr = FlatPtr;
677
678 const std::initializer_list<LLT> AddrSpaces64 = {
679 GlobalPtr, ConstantPtr, FlatPtr
680 };
681
682 const std::initializer_list<LLT> AddrSpaces32 = {
683 LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr
684 };
685
686 const std::initializer_list<LLT> AddrSpaces128 = {RsrcPtr};
687
688 const std::initializer_list<LLT> FPTypesBase = {
689 S32, S64
690 };
691
692 const std::initializer_list<LLT> FPTypes16 = {
693 S32, S64, S16
694 };
695
696 const std::initializer_list<LLT> FPTypesPK16 = {
697 S32, S64, S16, V2S16
698 };
699
700 const LLT MinScalarFPTy = ST.has16BitInsts() ? S16 : S32;
701
702 // s1 for VCC branches, s32 for SCC branches.
703  getActionDefinitionsBuilder(G_BRCOND).legalFor({S1, S32});
704
705 // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more
706 // elements for v3s16
707  getActionDefinitionsBuilder(G_PHI)
708      .legalFor({S32, S64, V2S16, S16, V4S16, S1, S128, S256})
709      .legalFor(AllS32Vectors)
710      .legalFor(AllS64Vectors)
711      .legalFor(AddrSpaces64)
712 .legalFor(AddrSpaces32)
713 .legalFor(AddrSpaces128)
714 .legalIf(isPointer(0))
715 .clampScalar(0, S16, S256)
716      .widenScalarToNextPow2(0, 32)
717      .clampMaxNumElements(0, S32, 16)
718      .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
719      .scalarize(0);
720
721 if (ST.hasVOP3PInsts() && ST.hasAddNoCarry() && ST.hasIntClamp()) {
722 // Full set of gfx9 features.
723 if (ST.hasScalarAddSub64()) {
724 getActionDefinitionsBuilder({G_ADD, G_SUB})
725 .legalFor({S64, S32, S16, V2S16})
726 .clampMaxNumElementsStrict(0, S16, 2)
727 .scalarize(0)
728 .minScalar(0, S16)
729          .widenScalarToNextMultipleOf(0, 32)
730          .maxScalar(0, S32);
731 } else {
732 getActionDefinitionsBuilder({G_ADD, G_SUB})
733 .legalFor({S32, S16, V2S16})
734 .clampMaxNumElementsStrict(0, S16, 2)
735 .scalarize(0)
736 .minScalar(0, S16)
737          .widenScalarToNextMultipleOf(0, 32)
738          .maxScalar(0, S32);
739 }
740
741 if (ST.hasScalarSMulU64()) {
742      getActionDefinitionsBuilder(G_MUL)
743          .legalFor({S64, S32, S16, V2S16})
744 .clampMaxNumElementsStrict(0, S16, 2)
745 .scalarize(0)
746 .minScalar(0, S16)
747          .widenScalarToNextMultipleOf(0, 32)
748          .custom();
749 } else {
750      getActionDefinitionsBuilder(G_MUL)
751          .legalFor({S32, S16, V2S16})
752 .clampMaxNumElementsStrict(0, S16, 2)
753 .scalarize(0)
754 .minScalar(0, S16)
755          .widenScalarToNextMultipleOf(0, 32)
756          .custom();
757 }
758 assert(ST.hasMad64_32());
759
760 getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT, G_SADDSAT, G_SSUBSAT})
761 .legalFor({S32, S16, V2S16}) // Clamp modifier
762 .minScalarOrElt(0, S16)
764 .scalarize(0)
766 .lower();
767 } else if (ST.has16BitInsts()) {
768 getActionDefinitionsBuilder({G_ADD, G_SUB})
769 .legalFor({S32, S16})
770 .minScalar(0, S16)
771        .widenScalarToNextMultipleOf(0, 32)
772        .maxScalar(0, S32)
773 .scalarize(0);
774
775    getActionDefinitionsBuilder(G_MUL)
776        .legalFor({S32, S16})
777 .scalarize(0)
778 .minScalar(0, S16)
779 .widenScalarToNextMultipleOf(0, 32)
780 .custom();
781 assert(ST.hasMad64_32());
782
783 // Technically the saturating operations require clamp bit support, but this
784 // was introduced at the same time as 16-bit operations.
785 getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
786 .legalFor({S32, S16}) // Clamp modifier
787 .minScalar(0, S16)
788 .scalarize(0)
790 .lower();
791
792 // We're just lowering this, but it helps get a better result to try to
793 // coerce to the desired type first.
794 getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
795 .minScalar(0, S16)
796 .scalarize(0)
797 .lower();
798 } else {
799 getActionDefinitionsBuilder({G_ADD, G_SUB})
800 .legalFor({S32})
801 .widenScalarToNextMultipleOf(0, 32)
802 .clampScalar(0, S32, S32)
803 .scalarize(0);
804
805 auto &Mul = getActionDefinitionsBuilder(G_MUL)
806 .legalFor({S32})
807 .scalarize(0)
808 .minScalar(0, S32)
809 .widenScalarToNextMultipleOf(0, 32);
810
811 if (ST.hasMad64_32())
812 Mul.custom();
813 else
814 Mul.maxScalar(0, S32);
815
816 if (ST.hasIntClamp()) {
817 getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
818 .legalFor({S32}) // Clamp modifier.
819 .scalarize(0)
820 .minScalarOrElt(0, S32)
821 .lower();
822 } else {
823 // Clamp bit support was added in VI, along with 16-bit operations.
824 getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
825 .minScalar(0, S32)
826 .scalarize(0)
827 .lower();
828 }
829
830 // FIXME: DAG expansion gets better results. The widening uses the smaller
831 // range values and goes for the min/max lowering directly.
832 getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
833 .minScalar(0, S32)
834 .scalarize(0)
835 .lower();
836 }
837
838  getActionDefinitionsBuilder(
839      {G_SDIV, G_UDIV, G_SREM, G_UREM, G_SDIVREM, G_UDIVREM})
840 .customFor({S32, S64})
841 .clampScalar(0, S32, S64)
843 .scalarize(0);
844
845 auto &Mulh = getActionDefinitionsBuilder({G_UMULH, G_SMULH})
846 .legalFor({S32})
847 .maxScalar(0, S32);
848
849 if (ST.hasVOP3PInsts()) {
850 Mulh
851 .clampMaxNumElements(0, S8, 2)
852 .lowerFor({V2S8});
853 }
854
855 Mulh
856 .scalarize(0)
857 .lower();
858
859 // Report legal for any types we can handle anywhere. For the cases only legal
860 // on the SALU, RegBankSelect will be able to re-legalize.
861 getActionDefinitionsBuilder({G_AND, G_OR, G_XOR})
862 .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16})
863 .clampScalar(0, S32, S64)
867 .scalarize(0);
868
869  getActionDefinitionsBuilder(
870      {G_UADDO, G_USUBO, G_UADDE, G_SADDE, G_USUBE, G_SSUBE})
871 .legalFor({{S32, S1}, {S32, S32}})
872 .clampScalar(0, S32, S32)
873 .scalarize(0);
874
876 // Don't worry about the size constraint.
878 .lower();
879
880  getActionDefinitionsBuilder(G_CONSTANT)
881      .legalFor({S1, S32, S64, S16, GlobalPtr,
882 LocalPtr, ConstantPtr, PrivatePtr, FlatPtr })
883 .legalIf(isPointer(0))
884 .clampScalar(0, S32, S64)
885      .widenScalarToNextPow2(0);
886
887 getActionDefinitionsBuilder(G_FCONSTANT)
888 .legalFor({S32, S64, S16})
889 .clampScalar(0, S16, S64);
890
891 getActionDefinitionsBuilder({G_IMPLICIT_DEF, G_FREEZE})
892 .legalIf(isRegisterType(0))
893 // s1 and s16 are special cases because they have legal operations on
894 // them, but don't really occupy registers in the normal way.
895 .legalFor({S1, S16})
896 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
899 .clampMaxNumElements(0, S32, 16);
900
901 getActionDefinitionsBuilder(G_FRAME_INDEX).legalFor({PrivatePtr});
902
903 // If the amount is divergent, we have to do a wave reduction to get the
904 // maximum value, so this is expanded during RegBankSelect.
905 getActionDefinitionsBuilder(G_DYN_STACKALLOC)
906 .legalFor({{PrivatePtr, S32}});
907
908 getActionDefinitionsBuilder(G_STACKSAVE)
909 .customFor({PrivatePtr});
910 getActionDefinitionsBuilder(G_STACKRESTORE)
911 .legalFor({PrivatePtr});
912
913 getActionDefinitionsBuilder({G_GET_FPENV, G_SET_FPENV}).customFor({S64});
914
915 getActionDefinitionsBuilder(G_GLOBAL_VALUE)
916 .customIf(typeIsNot(0, PrivatePtr));
917
918 getActionDefinitionsBuilder(G_BLOCK_ADDR).legalFor({CodePtr});
919
920 auto &FPOpActions = getActionDefinitionsBuilder(
921 { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE,
922 G_STRICT_FADD, G_STRICT_FMUL, G_STRICT_FMA})
923 .legalFor({S32, S64});
924 auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS})
925 .customFor({S32, S64});
926 auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV)
927 .customFor({S32, S64});
928
929 if (ST.has16BitInsts()) {
930 if (ST.hasVOP3PInsts())
931 FPOpActions.legalFor({S16, V2S16});
932 else
933 FPOpActions.legalFor({S16});
934
935 TrigActions.customFor({S16});
936 FDIVActions.customFor({S16});
937 }
938
939 if (ST.hasPackedFP32Ops()) {
940 FPOpActions.legalFor({V2S32});
941 FPOpActions.clampMaxNumElementsStrict(0, S32, 2);
942 }
943
944 auto &MinNumMaxNum = getActionDefinitionsBuilder({
945 G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE});
946
947 if (ST.hasVOP3PInsts()) {
948 MinNumMaxNum.customFor(FPTypesPK16)
951 .clampScalar(0, S16, S64)
952 .scalarize(0);
953 } else if (ST.has16BitInsts()) {
954 MinNumMaxNum.customFor(FPTypes16)
955 .clampScalar(0, S16, S64)
956 .scalarize(0);
957 } else {
958 MinNumMaxNum.customFor(FPTypesBase)
959 .clampScalar(0, S32, S64)
960 .scalarize(0);
961 }
962
963 if (ST.hasVOP3PInsts())
964 FPOpActions.clampMaxNumElementsStrict(0, S16, 2);
965
966 FPOpActions
967 .scalarize(0)
968 .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
969
970 TrigActions
971 .scalarize(0)
972 .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
973
974 FDIVActions
975 .scalarize(0)
976 .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
977
978 getActionDefinitionsBuilder({G_FNEG, G_FABS})
979 .legalFor(FPTypesPK16)
981 .scalarize(0)
982 .clampScalar(0, S16, S64);
983
984 if (ST.has16BitInsts()) {
986 .legalFor({S16})
987 .customFor({S32, S64})
988 .scalarize(0)
989 .unsupported();
991 .legalFor({S32, S64, S16})
992 .scalarize(0)
993 .clampScalar(0, S16, S64);
994
995 getActionDefinitionsBuilder({G_FLDEXP, G_STRICT_FLDEXP})
996 .legalFor({{S32, S32}, {S64, S32}, {S16, S16}})
997 .scalarize(0)
998 .maxScalarIf(typeIs(0, S16), 1, S16)
999 .clampScalar(1, S32, S32)
1000 .lower();
1001
1003 .customFor({{S32, S32}, {S64, S32}, {S16, S16}, {S16, S32}})
1004 .scalarize(0)
1005 .lower();
1006 } else {
1008 .customFor({S32, S64, S16})
1009 .scalarize(0)
1010 .unsupported();
1011
1012
1013 if (ST.hasFractBug()) {
1015 .customFor({S64})
1016 .legalFor({S32, S64})
1017 .scalarize(0)
1018 .clampScalar(0, S32, S64);
1019 } else {
1021 .legalFor({S32, S64})
1022 .scalarize(0)
1023 .clampScalar(0, S32, S64);
1024 }
1025
1026 getActionDefinitionsBuilder({G_FLDEXP, G_STRICT_FLDEXP})
1027 .legalFor({{S32, S32}, {S64, S32}})
1028 .scalarize(0)
1029 .clampScalar(0, S32, S64)
1030 .clampScalar(1, S32, S32)
1031 .lower();
1032
1034 .customFor({{S32, S32}, {S64, S32}})
1035 .scalarize(0)
1036 .minScalar(0, S32)
1037 .clampScalar(1, S32, S32)
1038 .lower();
1039 }
1040
1042 .legalFor({{S32, S64}, {S16, S32}})
1043 .scalarize(0)
1044 .lower();
1045
1047 .legalFor({{S64, S32}, {S32, S16}})
1048 .narrowScalarFor({{S64, S16}}, changeTo(0, S32))
1049 .scalarize(0);
1050
1051 auto &FSubActions = getActionDefinitionsBuilder({G_FSUB, G_STRICT_FSUB});
1052 if (ST.has16BitInsts()) {
1053 FSubActions
1054 // Use actual fsub instruction
1055 .legalFor({S32, S16})
1056 // Must use fadd + fneg
1057 .lowerFor({S64, V2S16});
1058 } else {
1059 FSubActions
1060 // Use actual fsub instruction
1061 .legalFor({S32})
1062 // Must use fadd + fneg
1063 .lowerFor({S64, S16, V2S16});
1064 }
1065
1066 FSubActions
1067 .scalarize(0)
1068 .clampScalar(0, S32, S64);
1069
1070 // Whether this is legal depends on the floating point mode for the function.
1071 auto &FMad = getActionDefinitionsBuilder(G_FMAD);
1072 if (ST.hasMadF16() && ST.hasMadMacF32Insts())
1073 FMad.customFor({S32, S16});
1074 else if (ST.hasMadMacF32Insts())
1075 FMad.customFor({S32});
1076 else if (ST.hasMadF16())
1077 FMad.customFor({S16});
1078 FMad.scalarize(0)
1079 .lower();
1080
1081 auto &FRem = getActionDefinitionsBuilder(G_FREM);
1082 if (ST.has16BitInsts()) {
1083 FRem.customFor({S16, S32, S64});
1084 } else {
1085 FRem.minScalar(0, S32)
1086 .customFor({S32, S64});
1087 }
1088 FRem.scalarize(0);
1089
1090 // TODO: Do we need to clamp maximum bitwidth?
1092 .legalIf(isScalar(0))
1093 .legalFor({{V2S16, V2S32}})
1094 .clampMaxNumElements(0, S16, 2)
1095 // Avoid scalarizing in cases that should be truly illegal. In unresolvable
1096 // situations (like an invalid implicit use), we don't want to infinite loop
1097 // in the legalizer.
1099 .alwaysLegal();
1100
1101 getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT})
1102 .legalFor({{S64, S32}, {S32, S16}, {S64, S16},
1103 {S32, S1}, {S64, S1}, {S16, S1}})
1104 .scalarize(0)
1105 .clampScalar(0, S32, S64)
1106 .widenScalarToNextPow2(1, 32);
1107
1108 // TODO: Split s1->s64 during regbankselect for VALU.
1109 auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
1110 .legalFor({{S32, S32}, {S64, S32}, {S16, S32}})
1111 .lowerIf(typeIs(1, S1))
1112 .customFor({{S32, S64}, {S64, S64}});
1113 if (ST.has16BitInsts())
1114 IToFP.legalFor({{S16, S16}});
1115 IToFP.clampScalar(1, S32, S64)
1116 .minScalar(0, S32)
1117 .scalarize(0)
1119
1120 auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
1121 .legalFor({{S32, S32}, {S32, S64}, {S32, S16}})
1122 .customFor({{S64, S32}, {S64, S64}})
1123 .narrowScalarFor({{S64, S16}}, changeTo(0, S32));
1124 if (ST.has16BitInsts())
1125 FPToI.legalFor({{S16, S16}});
1126 else
1127 FPToI.minScalar(1, S32);
1128
1129 FPToI.minScalar(0, S32)
1130 .widenScalarToNextPow2(0, 32)
1131 .scalarize(0)
1132 .lower();
1133
1134 getActionDefinitionsBuilder(G_INTRINSIC_FPTRUNC_ROUND)
1135 .customFor({S16, S32})
1136 .scalarize(0)
1137 .lower();
1138
1139 // Lower G_FNEARBYINT and G_FRINT into G_INTRINSIC_ROUNDEVEN
1140 getActionDefinitionsBuilder({G_INTRINSIC_ROUND, G_FRINT, G_FNEARBYINT})
1141 .scalarize(0)
1142 .lower();
1143
1144 if (ST.has16BitInsts()) {
1145 getActionDefinitionsBuilder(
1146 {G_INTRINSIC_TRUNC, G_FCEIL, G_INTRINSIC_ROUNDEVEN})
1147 .legalFor({S16, S32, S64})
1148 .clampScalar(0, S16, S64)
1149 .scalarize(0);
1150 } else if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
1151 getActionDefinitionsBuilder(
1152 {G_INTRINSIC_TRUNC, G_FCEIL, G_INTRINSIC_ROUNDEVEN})
1153 .legalFor({S32, S64})
1154 .clampScalar(0, S32, S64)
1155 .scalarize(0);
1156 } else {
1157 getActionDefinitionsBuilder(
1158 {G_INTRINSIC_TRUNC, G_FCEIL, G_INTRINSIC_ROUNDEVEN})
1159 .legalFor({S32})
1160 .customFor({S64})
1161 .clampScalar(0, S32, S64)
1162 .scalarize(0);
1163 }
1164
1165 getActionDefinitionsBuilder(G_PTR_ADD)
1166 .unsupportedFor({BufferFatPtr, BufferStridedPtr, RsrcPtr})
1167 .legalIf(all(isPointer(0), sameSize(0, 1)))
1168 .scalarize(0)
1169 .scalarSameSizeAs(1, 0);
1170
1171 getActionDefinitionsBuilder(G_PTRMASK)
1172 .legalIf(all(sameSize(0, 1), typeInSet(1, {S64, S32})))
1173 .scalarSameSizeAs(1, 0)
1174 .scalarize(0);
1175
1176 auto &CmpBuilder =
1177 getActionDefinitionsBuilder(G_ICMP)
1178 // The compare output type differs based on the register bank of the output,
1179 // so make both s1 and s32 legal.
1180 //
1181 // Scalar compares producing output in scc will be promoted to s32, as that
1182 // is the allocatable register type that will be needed for the copy from
1183 // scc. This will be promoted during RegBankSelect, and we assume something
1184 // before that won't try to use s32 result types.
1185 //
1186 // Vector compares producing an output in vcc/SGPR will use s1 in VCC reg
1187 // bank.
1188 .legalForCartesianProduct(
1189 {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
1190 .legalForCartesianProduct(
1191 {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr});
1192 if (ST.has16BitInsts()) {
1193 CmpBuilder.legalFor({{S1, S16}});
1194 }
1195
1196 CmpBuilder
1197 .widenScalarToNextPow2(1)
1198 .clampScalar(1, S32, S64)
1199 .scalarize(0)
1200 .legalIf(all(typeInSet(0, {S1, S32}), isPointer(1)));
1201
1202 auto &FCmpBuilder =
1203 getActionDefinitionsBuilder(G_FCMP).legalForCartesianProduct(
1204 {S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase);
1205
1206 if (ST.hasSALUFloatInsts())
1207 FCmpBuilder.legalForCartesianProduct({S32}, {S16, S32});
1208
1209 FCmpBuilder
1210 .widenScalarToNextPow2(1)
1211 .clampScalar(1, S32, S64)
1212 .scalarize(0);
1213
1214 // FIXME: fpow has a selection pattern that should move to custom lowering.
1215 auto &ExpOps = getActionDefinitionsBuilder(G_FPOW);
1216 if (ST.has16BitInsts())
1217 ExpOps.customFor({{S32}, {S16}});
1218 else
1219 ExpOps.customFor({S32});
1220 ExpOps.clampScalar(0, MinScalarFPTy, S32)
1221 .scalarize(0);
1222
1223 getActionDefinitionsBuilder(G_FPOWI)
1224 .clampScalar(0, MinScalarFPTy, S32)
1225 .lower();
1226
1227 auto &Log2Ops = getActionDefinitionsBuilder({G_FLOG2, G_FEXP2});
1228 Log2Ops.customFor({S32});
1229 if (ST.has16BitInsts())
1230 Log2Ops.legalFor({S16});
1231 else
1232 Log2Ops.customFor({S16});
1233 Log2Ops.scalarize(0)
1234 .lower();
1235
1236 auto &LogOps =
1237 getActionDefinitionsBuilder({G_FLOG, G_FLOG10, G_FEXP, G_FEXP10});
1238 LogOps.customFor({S32, S16});
1239 LogOps.clampScalar(0, MinScalarFPTy, S32)
1240 .scalarize(0);
1241
1242 // The 64-bit versions produce 32-bit results, but only on the SALU.
1243 getActionDefinitionsBuilder(G_CTPOP)
1244 .legalFor({{S32, S32}, {S32, S64}})
1245 .clampScalar(0, S32, S32)
1246 .widenScalarToNextPow2(1, 32)
1247 .clampScalar(1, S32, S64)
1248 .scalarize(0)
1249 .widenScalarToNextPow2(0, 32);
1250
1251  // If no 16-bit instruction is available, lower into different instructions.
1252 if (ST.has16BitInsts())
1253 getActionDefinitionsBuilder(G_IS_FPCLASS)
1254 .legalForCartesianProduct({S1}, FPTypes16)
1255 .widenScalarToNextPow2(1)
1256 .scalarize(0)
1257 .lower();
1258 else
1259 getActionDefinitionsBuilder(G_IS_FPCLASS)
1260 .legalForCartesianProduct({S1}, FPTypesBase)
1261 .lowerFor({S1, S16})
1262 .widenScalarToNextPow2(1)
1263 .scalarize(0)
1264 .lower();
1265
1266 // The hardware instructions return a different result on 0 than the generic
1267 // instructions expect. The hardware produces -1, but these produce the
1268 // bitwidth.
1269 getActionDefinitionsBuilder({G_CTLZ, G_CTTZ})
1270 .scalarize(0)
1271 .clampScalar(0, S32, S32)
1272 .clampScalar(1, S32, S64)
1273 .widenScalarToNextPow2(0, 32)
1274 .widenScalarToNextPow2(1, 32)
1275 .custom();
1276
1277 // The 64-bit versions produce 32-bit results, but only on the SALU.
1278 getActionDefinitionsBuilder(G_CTLZ_ZERO_UNDEF)
1279 .legalFor({{S32, S32}, {S32, S64}})
1280 .customIf(scalarNarrowerThan(1, 32))
1281 .clampScalar(0, S32, S32)
1282 .clampScalar(1, S32, S64)
1283 .scalarize(0)
1284 .widenScalarToNextPow2(0, 32)
1285 .widenScalarToNextPow2(1, 32);
1286
1287 getActionDefinitionsBuilder(G_CTTZ_ZERO_UNDEF)
1288 .legalFor({{S32, S32}, {S32, S64}})
1289 .clampScalar(0, S32, S32)
1290 .clampScalar(1, S32, S64)
1291 .scalarize(0)
1292 .widenScalarToNextPow2(0, 32)
1293 .widenScalarToNextPow2(1, 32);
1294
1295 // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
1296 // RegBankSelect.
1297 getActionDefinitionsBuilder(G_BITREVERSE)
1298 .legalFor({S32, S64})
1299 .clampScalar(0, S32, S64)
1300 .scalarize(0)
1301 .widenScalarToNextPow2(0);
1302
1303 if (ST.has16BitInsts()) {
1304 getActionDefinitionsBuilder(G_BSWAP)
1305 .legalFor({S16, S32, V2S16})
1306 .clampMaxNumElementsStrict(0, S16, 2)
1307 // FIXME: Fixing non-power-of-2 before clamp is workaround for
1308 // narrowScalar limitation.
1309 .widenScalarToNextPow2(0)
1310 .clampScalar(0, S16, S32)
1311 .scalarize(0);
1312
1313 if (ST.hasVOP3PInsts()) {
1314 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
1315 .legalFor({S32, S16, V2S16})
1316 .clampMaxNumElements(0, S16, 2)
1317 .minScalar(0, S16)
1318 .widenScalarToNextPow2(0)
1319 .scalarize(0)
1320 .lower();
1321 } else {
1322 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
1323 .legalFor({S32, S16})
1324 .widenScalarToNextPow2(0)
1325 .minScalar(0, S16)
1326 .scalarize(0)
1327 .lower();
1328 }
1329 } else {
1330 // TODO: Should have same legality without v_perm_b32
1331 getActionDefinitionsBuilder(G_BSWAP)
1332 .legalFor({S32})
1333 .lowerIf(scalarNarrowerThan(0, 32))
1334 // FIXME: Fixing non-power-of-2 before clamp is workaround for
1335 // narrowScalar limitation.
1336 .widenScalarToNextPow2(0)
1337 .maxScalar(0, S32)
1338 .scalarize(0)
1339 .lower();
1340
1341 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
1342 .legalFor({S32})
1343 .minScalar(0, S32)
1344 .widenScalarToNextPow2(0)
1345 .scalarize(0)
1346 .lower();
1347 }
1348
1349 getActionDefinitionsBuilder(G_INTTOPTR)
1350 // List the common cases
1351 .legalForCartesianProduct(AddrSpaces64, {S64})
1352 .legalForCartesianProduct(AddrSpaces32, {S32})
1353 .scalarize(0)
1354 // Accept any address space as long as the size matches
1355 .legalIf(sameSize(0, 1))
1356 .widenScalarIf(smallerThan(1, 0),
1357 [](const LegalityQuery &Query) {
1358 return std::pair(
1359 1, LLT::scalar(Query.Types[0].getSizeInBits()));
1360 })
1361 .narrowScalarIf(largerThan(1, 0), [](const LegalityQuery &Query) {
1362 return std::pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
1363 });
1364
1365 getActionDefinitionsBuilder(G_PTRTOINT)
1366 // List the common cases
1367 .legalForCartesianProduct(AddrSpaces64, {S64})
1368 .legalForCartesianProduct(AddrSpaces32, {S32})
1369 .scalarize(0)
1370 // Accept any address space as long as the size matches
1371 .legalIf(sameSize(0, 1))
1372 .widenScalarIf(smallerThan(0, 1),
1373 [](const LegalityQuery &Query) {
1374 return std::pair(
1375 0, LLT::scalar(Query.Types[1].getSizeInBits()));
1376 })
1377 .narrowScalarIf(largerThan(0, 1), [](const LegalityQuery &Query) {
1378 return std::pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
1379 });
1380
1381 getActionDefinitionsBuilder(G_ADDRSPACE_CAST)
1382 .scalarize(0)
1383 .custom();
1384
1385 const auto needToSplitMemOp = [=](const LegalityQuery &Query,
1386 bool IsLoad) -> bool {
1387 const LLT DstTy = Query.Types[0];
1388
1389 // Split vector extloads.
1390 unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
1391
1392 if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize)
1393 return true;
1394
1395 const LLT PtrTy = Query.Types[1];
1396 unsigned AS = PtrTy.getAddressSpace();
1397 if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad,
1398 Query.MMODescrs[0].Ordering !=
1400 return true;
1401
1402 // Catch weird sized loads that don't evenly divide into the access sizes
1403 // TODO: May be able to widen depending on alignment etc.
1404 unsigned NumRegs = (MemSize + 31) / 32;
1405 if (NumRegs == 3) {
1406 if (!ST.hasDwordx3LoadStores())
1407 return true;
1408 } else {
1409 // If the alignment allows, these should have been widened.
1410 if (!isPowerOf2_32(NumRegs))
1411 return true;
1412 }
1413
1414 return false;
1415 };
1416
1417 unsigned GlobalAlign32 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 32;
1418 unsigned GlobalAlign16 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 16;
1419 unsigned GlobalAlign8 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 8;
1420
1421 // TODO: Refine based on subtargets which support unaligned access or 128-bit
1422 // LDS
1423 // TODO: Unsupported flat for SI.
1424
1425 for (unsigned Op : {G_LOAD, G_STORE}) {
1426 const bool IsStore = Op == G_STORE;
1427
1428 auto &Actions = getActionDefinitionsBuilder(Op);
1429 // Explicitly list some common cases.
1430 // TODO: Does this help compile time at all?
1431 Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, S32, GlobalAlign32},
1432 {V2S32, GlobalPtr, V2S32, GlobalAlign32},
1433 {V4S32, GlobalPtr, V4S32, GlobalAlign32},
1434 {S64, GlobalPtr, S64, GlobalAlign32},
1435 {V2S64, GlobalPtr, V2S64, GlobalAlign32},
1436 {V2S16, GlobalPtr, V2S16, GlobalAlign32},
1437 {S32, GlobalPtr, S8, GlobalAlign8},
1438 {S32, GlobalPtr, S16, GlobalAlign16},
1439
1440 {S32, LocalPtr, S32, 32},
1441 {S64, LocalPtr, S64, 32},
1442 {V2S32, LocalPtr, V2S32, 32},
1443 {S32, LocalPtr, S8, 8},
1444 {S32, LocalPtr, S16, 16},
1445 {V2S16, LocalPtr, S32, 32},
1446
1447 {S32, PrivatePtr, S32, 32},
1448 {S32, PrivatePtr, S8, 8},
1449 {S32, PrivatePtr, S16, 16},
1450 {V2S16, PrivatePtr, S32, 32},
1451
1452 {S32, ConstantPtr, S32, GlobalAlign32},
1453 {V2S32, ConstantPtr, V2S32, GlobalAlign32},
1454 {V4S32, ConstantPtr, V4S32, GlobalAlign32},
1455 {S64, ConstantPtr, S64, GlobalAlign32},
1456 {V2S32, ConstantPtr, V2S32, GlobalAlign32}});
1457 Actions.legalIf(
1458 [=](const LegalityQuery &Query) -> bool {
1459 return isLoadStoreLegal(ST, Query);
1460 });
1461
1462 // The custom pointers (fat pointers, buffer resources) don't work with load
1463 // and store at this level. Fat pointers should have been lowered to
1464 // intrinsics before the translation to MIR.
1465 Actions.unsupportedIf(
1466 typeInSet(1, {BufferFatPtr, BufferStridedPtr, RsrcPtr}));
1467
1468 // Address space 8 pointers are handled by a 4xs32 load, bitcast, and
1469 // ptrtoint. This is needed to account for the fact that we can't have i128
1470 // as a register class for SelectionDAG reasons.
1471 Actions.customIf([=](const LegalityQuery &Query) -> bool {
1472 return hasBufferRsrcWorkaround(Query.Types[0]);
1473 });
1474
1475 // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to
1476 // 64-bits.
1477 //
1478 // TODO: Should generalize bitcast action into coerce, which will also cover
1479 // inserting addrspacecasts.
1480 Actions.customIf(typeIs(1, Constant32Ptr));
1481
1482 // Turn any illegal element vectors into something easier to deal
1483 // with. These will ultimately produce 32-bit scalar shifts to extract the
1484 // parts anyway.
1485 //
1486 // For odd 16-bit element vectors, prefer to split those into pieces with
1487 // 16-bit vector parts.
1488 Actions.bitcastIf(
1489 [=](const LegalityQuery &Query) -> bool {
1490 return shouldBitcastLoadStoreType(ST, Query.Types[0],
1491 Query.MMODescrs[0].MemoryTy);
1492 }, bitcastToRegisterType(0));
1493
1494 if (!IsStore) {
1495 // Widen suitably aligned loads by loading extra bytes. The standard
1496 // legalization actions can't properly express widening memory operands.
1497 Actions.customIf([=](const LegalityQuery &Query) -> bool {
1498 return shouldWidenLoad(ST, Query, G_LOAD);
1499 });
1500 }
1501
1502 // FIXME: load/store narrowing should be moved to lower action
1503 Actions
1504 .narrowScalarIf(
1505 [=](const LegalityQuery &Query) -> bool {
1506 return !Query.Types[0].isVector() &&
1507 needToSplitMemOp(Query, Op == G_LOAD);
1508 },
1509 [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
1510 const LLT DstTy = Query.Types[0];
1511 const LLT PtrTy = Query.Types[1];
1512
1513 const unsigned DstSize = DstTy.getSizeInBits();
1514 unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
1515
1516 // Split extloads.
1517 if (DstSize > MemSize)
1518 return std::pair(0, LLT::scalar(MemSize));
1519
1520 unsigned MaxSize = maxSizeForAddrSpace(
1521 ST, PtrTy.getAddressSpace(), Op == G_LOAD,
1522 Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic);
1523 if (MemSize > MaxSize)
1524 return std::pair(0, LLT::scalar(MaxSize));
1525
1526 uint64_t Align = Query.MMODescrs[0].AlignInBits;
1527 return std::pair(0, LLT::scalar(Align));
1528 })
1529 .fewerElementsIf(
1530 [=](const LegalityQuery &Query) -> bool {
1531 return Query.Types[0].isVector() &&
1532 needToSplitMemOp(Query, Op == G_LOAD);
1533 },
1534 [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
1535 const LLT DstTy = Query.Types[0];
1536 const LLT PtrTy = Query.Types[1];
1537
1538 LLT EltTy = DstTy.getElementType();
1539 unsigned MaxSize = maxSizeForAddrSpace(
1540 ST, PtrTy.getAddressSpace(), Op == G_LOAD,
1541 Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic);
1542
1543 // FIXME: Handle widened to power of 2 results better. This ends
1544 // up scalarizing.
1545 // FIXME: 3 element stores scalarized on SI
1546
1547 // Split if it's too large for the address space.
1548 unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
1549 if (MemSize > MaxSize) {
1550 unsigned NumElts = DstTy.getNumElements();
1551 unsigned EltSize = EltTy.getSizeInBits();
1552
1553 if (MaxSize % EltSize == 0) {
1554 return std::pair(
1556 ElementCount::getFixed(MaxSize / EltSize), EltTy));
1557 }
1558
1559 unsigned NumPieces = MemSize / MaxSize;
1560
1561 // FIXME: Refine when odd breakdowns handled
1562 // The scalars will need to be re-legalized.
1563 if (NumPieces == 1 || NumPieces >= NumElts ||
1564 NumElts % NumPieces != 0)
1565 return std::pair(0, EltTy);
1566
1567 return std::pair(0,
1568 LLT::fixed_vector(NumElts / NumPieces, EltTy));
1569 }
1570
1571 // FIXME: We could probably handle weird extending loads better.
1572 if (DstTy.getSizeInBits() > MemSize)
1573 return std::pair(0, EltTy);
1574
1575 unsigned EltSize = EltTy.getSizeInBits();
1576 unsigned DstSize = DstTy.getSizeInBits();
1577 if (!isPowerOf2_32(DstSize)) {
1578 // We're probably decomposing an odd sized store. Try to split
1579 // to the widest type. TODO: Account for alignment. As-is it
1580 // should be OK, since the new parts will be further legalized.
1581 unsigned FloorSize = llvm::bit_floor(DstSize);
1582 return std::pair(
1584 ElementCount::getFixed(FloorSize / EltSize), EltTy));
1585 }
1586
1587 // May need relegalization for the scalars.
1588 return std::pair(0, EltTy);
1589 })
1590 .minScalar(0, S32)
1591 .narrowScalarIf(isWideScalarExtLoadTruncStore(0), changeTo(0, S32))
1592 .widenScalarToNextPow2(0)
1593 .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0))
1594 .lower();
1595 }
1596
1597 // FIXME: Unaligned accesses not lowered.
1598 auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD})
1599 .legalForTypesWithMemDesc({{S32, GlobalPtr, S8, 8},
1600 {S32, GlobalPtr, S16, 2 * 8},
1601 {S32, LocalPtr, S8, 8},
1602 {S32, LocalPtr, S16, 16},
1603 {S32, PrivatePtr, S8, 8},
1604 {S32, PrivatePtr, S16, 16},
1605 {S32, ConstantPtr, S8, 8},
1606 {S32, ConstantPtr, S16, 2 * 8}})
1607 .legalIf(
1608 [=](const LegalityQuery &Query) -> bool {
1609 return isLoadStoreLegal(ST, Query);
1610 });
1611
1612 if (ST.hasFlatAddressSpace()) {
1613 ExtLoads.legalForTypesWithMemDesc(
1614 {{S32, FlatPtr, S8, 8}, {S32, FlatPtr, S16, 16}});
1615 }
1616
1617 // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to
1618 // 64-bits.
1619 //
1620 // TODO: Should generalize bitcast action into coerce, which will also cover
1621 // inserting addrspacecasts.
1622 ExtLoads.customIf(typeIs(1, Constant32Ptr));
1623
1624 ExtLoads.clampScalar(0, S32, S32)
1625 .widenScalarToNextPow2(0)
1626 .lower();
1627
1628 auto &Atomics = getActionDefinitionsBuilder(
1629 {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB,
1630 G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
1631 G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
1632 G_ATOMICRMW_UMIN, G_ATOMICRMW_UINC_WRAP, G_ATOMICRMW_UDEC_WRAP})
1633 .legalFor({{S32, GlobalPtr}, {S32, LocalPtr},
1634 {S64, GlobalPtr}, {S64, LocalPtr},
1635 {S32, RegionPtr}, {S64, RegionPtr}});
1636 if (ST.hasFlatAddressSpace()) {
1637 Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}});
1638 }
1639
1640 // TODO: v2bf16 operations, and fat buffer pointer support.
1641 auto &Atomic = getActionDefinitionsBuilder(G_ATOMICRMW_FADD);
1642 if (ST.hasLDSFPAtomicAddF32()) {
1643 Atomic.legalFor({{S32, LocalPtr}, {S32, RegionPtr}});
1644 if (ST.hasLdsAtomicAddF64())
1645 Atomic.legalFor({{S64, LocalPtr}});
1646 if (ST.hasAtomicDsPkAdd16Insts())
1647 Atomic.legalFor({{V2F16, LocalPtr}, {V2BF16, LocalPtr}});
1648 }
1649 if (ST.hasAtomicFaddInsts())
1650 Atomic.legalFor({{S32, GlobalPtr}});
1651 if (ST.hasFlatAtomicFaddF32Inst())
1652 Atomic.legalFor({{S32, FlatPtr}});
1653
1654 if (ST.hasGFX90AInsts()) {
1655 // These are legal with some caveats, and should have undergone expansion in
1656 // the IR in most situations
1657 // TODO: Move atomic expansion into legalizer
1658 Atomic.legalFor({
1659 {S32, GlobalPtr},
1660 {S64, GlobalPtr},
1661 {S64, FlatPtr}
1662 });
1663 }
1664
1665 if (ST.hasAtomicBufferGlobalPkAddF16NoRtnInsts() ||
1666 ST.hasAtomicBufferGlobalPkAddF16Insts())
1667 Atomic.legalFor({{V2F16, GlobalPtr}, {V2F16, BufferFatPtr}});
1668 if (ST.hasAtomicGlobalPkAddBF16Inst())
1669 Atomic.legalFor({{V2BF16, GlobalPtr}});
1670 if (ST.hasAtomicFlatPkAdd16Insts())
1671 Atomic.legalFor({{V2F16, FlatPtr}, {V2BF16, FlatPtr}});
1672
1673 // FIXME: Handle flat, global and buffer cases.
1674 getActionDefinitionsBuilder({G_ATOMICRMW_FMIN, G_ATOMICRMW_FMAX})
1675 .legalFor({{F32, LocalPtr}, {F64, LocalPtr}});
1676
1677 // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling, and output
1678 // demarshalling
1679 getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG)
1680 .customFor({{S32, GlobalPtr}, {S64, GlobalPtr},
1681 {S32, FlatPtr}, {S64, FlatPtr}})
1682 .legalFor({{S32, LocalPtr}, {S64, LocalPtr},
1683 {S32, RegionPtr}, {S64, RegionPtr}});
1684 // TODO: Pointer types, any 32-bit or 64-bit vector
1685
1686 // Condition should be s32 for scalar, s1 for vector.
1687 getActionDefinitionsBuilder(G_SELECT)
1688 .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16, GlobalPtr,
1689 LocalPtr, FlatPtr, PrivatePtr,
1690 LLT::fixed_vector(2, LocalPtr),
1691 LLT::fixed_vector(2, PrivatePtr)},
1692 {S1, S32})
1693 .clampScalar(0, S16, S64)
1694 .scalarize(1)
1695 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
1696 .fewerElementsIf(numElementsNotEven(0), scalarize(0))
1697 .clampMaxNumElements(0, S32, 2)
1698 .clampMaxNumElements(0, LocalPtr, 2)
1699 .clampMaxNumElements(0, PrivatePtr, 2)
1700 .scalarize(0)
1701 .widenScalarToNextPow2(0)
1702 .legalIf(all(isPointer(0), typeInSet(1, {S1, S32})));
1703
1704 // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can
1705 // be more flexible with the shift amount type.
1706 auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR})
1707 .legalFor({{S32, S32}, {S64, S32}});
1708 if (ST.has16BitInsts()) {
1709 if (ST.hasVOP3PInsts()) {
1710 Shifts.legalFor({{S16, S16}, {V2S16, V2S16}})
1711 .clampMaxNumElements(0, S16, 2);
1712 } else
1713 Shifts.legalFor({{S16, S16}});
1714
1715 // TODO: Support 16-bit shift amounts for all types
1716 Shifts.widenScalarIf(
1717 [=](const LegalityQuery &Query) {
1718 // Use 16-bit shift amounts for any 16-bit shift. Otherwise we want a
1719 // 32-bit amount.
1720 const LLT ValTy = Query.Types[0];
1721 const LLT AmountTy = Query.Types[1];
1722 return ValTy.getSizeInBits() <= 16 &&
1723 AmountTy.getSizeInBits() < 16;
1724 }, changeTo(1, S16));
1725 Shifts.maxScalarIf(typeIs(0, S16), 1, S16);
1726 Shifts.clampScalar(1, S32, S32);
1727 Shifts.widenScalarToNextPow2(0, 16);
1728 Shifts.clampScalar(0, S16, S64);
1729
1730 getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
1731 .minScalar(0, S16)
1732 .scalarize(0)
1733 .lower();
1734 } else {
1735 // Make sure we legalize the shift amount type first, as the general
1736 // expansion for the shifted type will produce much worse code if it hasn't
1737 // been truncated already.
1738 Shifts.clampScalar(1, S32, S32);
1739 Shifts.widenScalarToNextPow2(0, 32);
1740 Shifts.clampScalar(0, S32, S64);
1741
1742 getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
1743 .minScalar(0, S32)
1744 .scalarize(0)
1745 .lower();
1746 }
1747 Shifts.scalarize(0);
1748
1749 for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) {
1750 unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0;
1751 unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1;
1752 unsigned IdxTypeIdx = 2;
1753
1754 getActionDefinitionsBuilder(Op)
1755 .customIf([=](const LegalityQuery &Query) {
1756 const LLT EltTy = Query.Types[EltTypeIdx];
1757 const LLT VecTy = Query.Types[VecTypeIdx];
1758 const LLT IdxTy = Query.Types[IdxTypeIdx];
1759 const unsigned EltSize = EltTy.getSizeInBits();
1760 const bool isLegalVecType =
1762 // Address space 8 pointers are 128-bit wide values, but the logic
1763 // below will try to bitcast them to 2N x s64, which will fail.
 1764 // Therefore, as an intermediate step, handle extracts/insertions by
 1765 // ptrtoint-ing the vector and scalar arguments (or inttoptr-ing the
1766 // extraction result) in order to produce a vector operation that can
1767 // be handled by the logic below.
1768 if (EltTy.isPointer() && EltSize > 64)
1769 return true;
1770 return (EltSize == 32 || EltSize == 64) &&
1771 VecTy.getSizeInBits() % 32 == 0 &&
1772 VecTy.getSizeInBits() <= MaxRegisterSize &&
1773 IdxTy.getSizeInBits() == 32 &&
1774 isLegalVecType;
1775 })
1776 .bitcastIf(all(sizeIsMultipleOf32(VecTypeIdx), scalarOrEltNarrowerThan(VecTypeIdx, 32)),
1777 bitcastToVectorElement32(VecTypeIdx))
1778 //.bitcastIf(vectorSmallerThan(1, 32), bitcastToScalar(1))
1779 .bitcastIf(
1780 all(sizeIsMultipleOf32(VecTypeIdx), scalarOrEltWiderThan(VecTypeIdx, 64)),
1781 [=](const LegalityQuery &Query) {
1782 // For > 64-bit element types, try to turn this into a 64-bit
1783 // element vector since we may be able to do better indexing
1784 // if this is scalar. If not, fall back to 32.
1785 const LLT EltTy = Query.Types[EltTypeIdx];
1786 const LLT VecTy = Query.Types[VecTypeIdx];
1787 const unsigned DstEltSize = EltTy.getSizeInBits();
1788 const unsigned VecSize = VecTy.getSizeInBits();
1789
1790 const unsigned TargetEltSize = DstEltSize % 64 == 0 ? 64 : 32;
1791 return std::pair(
1792 VecTypeIdx,
1793 LLT::fixed_vector(VecSize / TargetEltSize, TargetEltSize));
1794 })
1795 .clampScalar(EltTypeIdx, S32, S64)
1796 .clampScalar(VecTypeIdx, S32, S64)
1797 .clampScalar(IdxTypeIdx, S32, S32)
1798 .clampMaxNumElements(VecTypeIdx, S32, 32)
1799 // TODO: Clamp elements for 64-bit vectors?
1800 .moreElementsIf(
1801 isIllegalRegisterType(VecTypeIdx),
1803 // It should only be necessary with variable indexes.
1804 // As a last resort, lower to the stack
1805 .lower();
1806 }
1807
1808 getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
1809 .unsupportedIf([=](const LegalityQuery &Query) {
1810 const LLT &EltTy = Query.Types[1].getElementType();
1811 return Query.Types[0] != EltTy;
1812 });
1813
1814 for (unsigned Op : {G_EXTRACT, G_INSERT}) {
1815 unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0;
1816 unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1;
1817
1818 // FIXME: Doesn't handle extract of illegal sizes.
1819 getActionDefinitionsBuilder(Op)
1820 .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32)))
1821 .lowerIf([=](const LegalityQuery &Query) {
 1822 // Sub-vector (or single element) insert and extract.
1823 // TODO: verify immediate offset here since lower only works with
1824 // whole elements.
1825 const LLT BigTy = Query.Types[BigTyIdx];
1826 return BigTy.isVector();
1827 })
1828 // FIXME: Multiples of 16 should not be legal.
1829 .legalIf([=](const LegalityQuery &Query) {
1830 const LLT BigTy = Query.Types[BigTyIdx];
1831 const LLT LitTy = Query.Types[LitTyIdx];
1832 return (BigTy.getSizeInBits() % 32 == 0) &&
1833 (LitTy.getSizeInBits() % 16 == 0);
1834 })
1835 .widenScalarIf(
1836 [=](const LegalityQuery &Query) {
1837 const LLT BigTy = Query.Types[BigTyIdx];
1838 return (BigTy.getScalarSizeInBits() < 16);
1839 },
1841 .widenScalarIf(
1842 [=](const LegalityQuery &Query) {
1843 const LLT LitTy = Query.Types[LitTyIdx];
1844 return (LitTy.getScalarSizeInBits() < 16);
1845 },
1847 .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
1848 .widenScalarToNextPow2(BigTyIdx, 32);
1849
1850 }
1851
1852 auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR)
1853 .legalForCartesianProduct(AllS32Vectors, {S32})
1854 .legalForCartesianProduct(AllS64Vectors, {S64})
1855 .clampNumElements(0, V16S32, V32S32)
1856 .clampNumElements(0, V2S64, V16S64)
1857 .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16))
1858 .moreElementsIf(
1861
1862 if (ST.hasScalarPackInsts()) {
1863 BuildVector
1864 // FIXME: Should probably widen s1 vectors straight to s32
1865 .minScalarOrElt(0, S16)
1866 .minScalar(1, S16);
1867
1868 getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1869 .legalFor({V2S16, S32})
1870 .lower();
1871 } else {
1872 BuildVector.customFor({V2S16, S16});
1873 BuildVector.minScalarOrElt(0, S32);
1874
1875 getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1876 .customFor({V2S16, S32})
1877 .lower();
1878 }
1879
1880 BuildVector.legalIf(isRegisterType(0));
1881
1882 // FIXME: Clamp maximum size
1883 getActionDefinitionsBuilder(G_CONCAT_VECTORS)
1884 .legalIf(all(isRegisterType(0), isRegisterType(1)))
1885 .clampMaxNumElements(0, S32, 32)
1886 .clampMaxNumElements(1, S16, 2) // TODO: Make 4?
1887 .clampMaxNumElements(0, S16, 64);
1888
1889 getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower();
1890
1891 // Merge/Unmerge
1892 for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
1893 unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
1894 unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;
1895
1896 auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
1897 const LLT Ty = Query.Types[TypeIdx];
1898 if (Ty.isVector()) {
1899 const LLT &EltTy = Ty.getElementType();
1900 if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 512)
1901 return true;
1902 if (!llvm::has_single_bit<uint32_t>(EltTy.getSizeInBits()))
1903 return true;
1904 }
1905 return false;
1906 };
1907
1908 auto &Builder = getActionDefinitionsBuilder(Op)
1909 .legalIf(all(isRegisterType(0), isRegisterType(1)))
1910 .lowerFor({{S16, V2S16}})
1911 .lowerIf([=](const LegalityQuery &Query) {
1912 const LLT BigTy = Query.Types[BigTyIdx];
1913 return BigTy.getSizeInBits() == 32;
1914 })
1915 // Try to widen to s16 first for small types.
1916 // TODO: Only do this on targets with legal s16 shifts
1917 .minScalarOrEltIf(scalarNarrowerThan(LitTyIdx, 16), LitTyIdx, S16)
1918 .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16)
1919 .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
1920 .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32),
1921 elementTypeIs(1, S16)),
1922 changeTo(1, V2S16))
1923 // Clamp the little scalar to s8-s256 and make it a power of 2. It's not
1924 // worth considering the multiples of 64 since 2*192 and 2*384 are not
1925 // valid.
1926 .clampScalar(LitTyIdx, S32, S512)
1927 .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32)
1928 // Break up vectors with weird elements into scalars
1929 .fewerElementsIf(
1930 [=](const LegalityQuery &Query) { return notValidElt(Query, LitTyIdx); },
1931 scalarize(0))
1932 .fewerElementsIf(
1933 [=](const LegalityQuery &Query) { return notValidElt(Query, BigTyIdx); },
1934 scalarize(1))
1935 .clampScalar(BigTyIdx, S32, MaxScalar);
1936
1937 if (Op == G_MERGE_VALUES) {
1938 Builder.widenScalarIf(
1939 // TODO: Use 16-bit shifts if legal for 8-bit values?
1940 [=](const LegalityQuery &Query) {
1941 const LLT Ty = Query.Types[LitTyIdx];
1942 return Ty.getSizeInBits() < 32;
1943 },
1944 changeTo(LitTyIdx, S32));
1945 }
1946
1947 Builder.widenScalarIf(
1948 [=](const LegalityQuery &Query) {
1949 const LLT Ty = Query.Types[BigTyIdx];
1950 return Ty.getSizeInBits() % 16 != 0;
1951 },
1952 [=](const LegalityQuery &Query) {
 1953 // Pick the next power of 2, or a multiple of 64 over 128,
 1954 // whichever is smaller.
1955 const LLT &Ty = Query.Types[BigTyIdx];
1956 unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1);
1957 if (NewSizeInBits >= 256) {
1958 unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1);
1959 if (RoundedTo < NewSizeInBits)
1960 NewSizeInBits = RoundedTo;
1961 }
1962 return std::pair(BigTyIdx, LLT::scalar(NewSizeInBits));
1963 })
1964 // Any vectors left are the wrong size. Scalarize them.
1965 .scalarize(0)
1966 .scalarize(1);
1967 }
1968
1969 // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
1970 // RegBankSelect.
1971 auto &SextInReg = getActionDefinitionsBuilder(G_SEXT_INREG)
1972 .legalFor({{S32}, {S64}});
1973
1974 if (ST.hasVOP3PInsts()) {
1975 SextInReg.lowerFor({{V2S16}})
1976 // Prefer to reduce vector widths for 16-bit vectors before lowering, to
1977 // get more vector shift opportunities, since we'll get those when
1978 // expanded.
1979 .clampMaxNumElementsStrict(0, S16, 2);
1980 } else if (ST.has16BitInsts()) {
1981 SextInReg.lowerFor({{S32}, {S64}, {S16}});
1982 } else {
1983 // Prefer to promote to s32 before lowering if we don't have 16-bit
 1984 // shifts. This avoids a lot of intermediate truncate and extend operations.
1985 SextInReg.lowerFor({{S32}, {S64}});
1986 }
1987
1988 SextInReg
1989 .scalarize(0)
1990 .clampScalar(0, S32, S64)
1991 .lower();
1992
1993 getActionDefinitionsBuilder({G_ROTR, G_ROTL})
1994 .scalarize(0)
1995 .lower();
1996
 1997 // TODO: Only try to form v2s16 with legal packed instructions.
1998 getActionDefinitionsBuilder(G_FSHR)
1999 .legalFor({{S32, S32}})
2000 .lowerFor({{V2S16, V2S16}})
2001 .clampMaxNumElementsStrict(0, S16, 2)
2002 .scalarize(0)
2003 .lower();
2004
2005 if (ST.hasVOP3PInsts()) {
2006 getActionDefinitionsBuilder(G_FSHL)
2007 .lowerFor({{V2S16, V2S16}})
2008 .clampMaxNumElementsStrict(0, S16, 2)
2009 .scalarize(0)
2010 .lower();
2011 } else {
2012 getActionDefinitionsBuilder(G_FSHL)
2013 .scalarize(0)
2014 .lower();
2015 }
2016
2017 getActionDefinitionsBuilder(G_READCYCLECOUNTER)
2018 .legalFor({S64});
2019
2020 getActionDefinitionsBuilder(G_READSTEADYCOUNTER).legalFor({S64});
2021
2022 getActionDefinitionsBuilder(G_FENCE)
2023 .alwaysLegal();
2024
2025 getActionDefinitionsBuilder({G_SMULO, G_UMULO})
2026 .scalarize(0)
2027 .minScalar(0, S32)
2028 .lower();
2029
2030 getActionDefinitionsBuilder({G_SBFX, G_UBFX})
2031 .legalFor({{S32, S32}, {S64, S32}})
2032 .clampScalar(1, S32, S32)
2033 .clampScalar(0, S32, S64)
2034 .widenScalarToNextPow2(0)
2035 .scalarize(0);
2036
2037 getActionDefinitionsBuilder(
2038 {// TODO: Verify V_BFI_B32 is generated from expanded bit ops
2039 G_FCOPYSIGN,
2040
2041 G_ATOMIC_CMPXCHG_WITH_SUCCESS, G_ATOMICRMW_NAND, G_ATOMICRMW_FSUB,
2042 G_READ_REGISTER, G_WRITE_REGISTER,
2043
2044 G_SADDO, G_SSUBO})
2045 .lower();
2046
2047 if (ST.hasIEEEMinMax()) {
2048 getActionDefinitionsBuilder({G_FMINIMUM, G_FMAXIMUM})
2049 .legalFor(FPTypesPK16)
2050 .clampMaxNumElements(0, S16, 2)
2051 .scalarize(0);
2052 } else {
2053 // TODO: Implement
2054 getActionDefinitionsBuilder({G_FMINIMUM, G_FMAXIMUM}).lower();
2055 }
2056
2057 getActionDefinitionsBuilder({G_MEMCPY, G_MEMCPY_INLINE, G_MEMMOVE, G_MEMSET})
2058 .lower();
2059
2060 getActionDefinitionsBuilder({G_TRAP, G_DEBUGTRAP}).custom();
2061
2062 getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE,
2063 G_INDEXED_LOAD, G_INDEXED_SEXTLOAD,
2064 G_INDEXED_ZEXTLOAD, G_INDEXED_STORE})
2065 .unsupported();
2066
2067 getActionDefinitionsBuilder(G_PREFETCH).alwaysLegal();
2068
2069 getLegacyLegalizerInfo().computeTables();
2070 verify(*ST.getInstrInfo());
2071}
2072
2075 LostDebugLocObserver &LocObserver) const {
2076 MachineIRBuilder &B = Helper.MIRBuilder;
2077 MachineRegisterInfo &MRI = *B.getMRI();
2078
2079 switch (MI.getOpcode()) {
2080 case TargetOpcode::G_ADDRSPACE_CAST:
2081 return legalizeAddrSpaceCast(MI, MRI, B);
2082 case TargetOpcode::G_INTRINSIC_ROUNDEVEN:
2083 return legalizeFroundeven(MI, MRI, B);
2084 case TargetOpcode::G_FCEIL:
2085 return legalizeFceil(MI, MRI, B);
2086 case TargetOpcode::G_FREM:
2087 return legalizeFrem(MI, MRI, B);
2088 case TargetOpcode::G_INTRINSIC_TRUNC:
2089 return legalizeIntrinsicTrunc(MI, MRI, B);
2090 case TargetOpcode::G_SITOFP:
2091 return legalizeITOFP(MI, MRI, B, true);
2092 case TargetOpcode::G_UITOFP:
2093 return legalizeITOFP(MI, MRI, B, false);
2094 case TargetOpcode::G_FPTOSI:
2095 return legalizeFPTOI(MI, MRI, B, true);
2096 case TargetOpcode::G_FPTOUI:
2097 return legalizeFPTOI(MI, MRI, B, false);
2098 case TargetOpcode::G_FMINNUM:
2099 case TargetOpcode::G_FMAXNUM:
2100 case TargetOpcode::G_FMINNUM_IEEE:
2101 case TargetOpcode::G_FMAXNUM_IEEE:
2102 return legalizeMinNumMaxNum(Helper, MI);
2103 case TargetOpcode::G_EXTRACT_VECTOR_ELT:
2104 return legalizeExtractVectorElt(MI, MRI, B);
2105 case TargetOpcode::G_INSERT_VECTOR_ELT:
2106 return legalizeInsertVectorElt(MI, MRI, B);
2107 case TargetOpcode::G_FSIN:
2108 case TargetOpcode::G_FCOS:
2109 return legalizeSinCos(MI, MRI, B);
2110 case TargetOpcode::G_GLOBAL_VALUE:
2111 return legalizeGlobalValue(MI, MRI, B);
2112 case TargetOpcode::G_LOAD:
2113 case TargetOpcode::G_SEXTLOAD:
2114 case TargetOpcode::G_ZEXTLOAD:
2115 return legalizeLoad(Helper, MI);
2116 case TargetOpcode::G_STORE:
2117 return legalizeStore(Helper, MI);
2118 case TargetOpcode::G_FMAD:
2119 return legalizeFMad(MI, MRI, B);
2120 case TargetOpcode::G_FDIV:
2121 return legalizeFDIV(MI, MRI, B);
2122 case TargetOpcode::G_FFREXP:
2123 return legalizeFFREXP(MI, MRI, B);
2124 case TargetOpcode::G_FSQRT:
2125 return legalizeFSQRT(MI, MRI, B);
2126 case TargetOpcode::G_UDIV:
2127 case TargetOpcode::G_UREM:
2128 case TargetOpcode::G_UDIVREM:
2129 return legalizeUnsignedDIV_REM(MI, MRI, B);
2130 case TargetOpcode::G_SDIV:
2131 case TargetOpcode::G_SREM:
2132 case TargetOpcode::G_SDIVREM:
2133 return legalizeSignedDIV_REM(MI, MRI, B);
2134 case TargetOpcode::G_ATOMIC_CMPXCHG:
2135 return legalizeAtomicCmpXChg(MI, MRI, B);
2136 case TargetOpcode::G_FLOG2:
2137 return legalizeFlog2(MI, B);
2138 case TargetOpcode::G_FLOG:
2139 case TargetOpcode::G_FLOG10:
2140 return legalizeFlogCommon(MI, B);
2141 case TargetOpcode::G_FEXP2:
2142 return legalizeFExp2(MI, B);
2143 case TargetOpcode::G_FEXP:
2144 case TargetOpcode::G_FEXP10:
2145 return legalizeFExp(MI, B);
2146 case TargetOpcode::G_FPOW:
2147 return legalizeFPow(MI, B);
2148 case TargetOpcode::G_FFLOOR:
2149 return legalizeFFloor(MI, MRI, B);
2150 case TargetOpcode::G_BUILD_VECTOR:
2151 case TargetOpcode::G_BUILD_VECTOR_TRUNC:
2152 return legalizeBuildVector(MI, MRI, B);
2153 case TargetOpcode::G_MUL:
2154 return legalizeMul(Helper, MI);
2155 case TargetOpcode::G_CTLZ:
2156 case TargetOpcode::G_CTTZ:
2157 return legalizeCTLZ_CTTZ(MI, MRI, B);
2158 case TargetOpcode::G_CTLZ_ZERO_UNDEF:
2159 return legalizeCTLZ_ZERO_UNDEF(MI, MRI, B);
2160 case TargetOpcode::G_INTRINSIC_FPTRUNC_ROUND:
2161 return legalizeFPTruncRound(MI, B);
2162 case TargetOpcode::G_STACKSAVE:
2163 return legalizeStackSave(MI, B);
2164 case TargetOpcode::G_GET_FPENV:
2165 return legalizeGetFPEnv(MI, MRI, B);
2166 case TargetOpcode::G_SET_FPENV:
2167 return legalizeSetFPEnv(MI, MRI, B);
2168 case TargetOpcode::G_TRAP:
2169 return legalizeTrap(MI, MRI, B);
2170 case TargetOpcode::G_DEBUGTRAP:
2171 return legalizeDebugTrap(MI, MRI, B);
2172 default:
2173 return false;
2174 }
2175
2176 llvm_unreachable("expected switch to return");
2177}
2178
2180 unsigned AS,
2182 MachineIRBuilder &B) const {
2183 MachineFunction &MF = B.getMF();
2184 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
2185 const LLT S32 = LLT::scalar(32);
2186 const LLT S64 = LLT::scalar(64);
2187
2189
2190 if (ST.hasApertureRegs()) {
2191 // Note: this register is somewhat broken. When used as a 32-bit operand,
2192 // it only returns zeroes. The real value is in the upper 32 bits.
 2193 // Thus, we must extract the high 32 bits.
2194 const unsigned ApertureRegNo = (AS == AMDGPUAS::LOCAL_ADDRESS)
2195 ? AMDGPU::SRC_SHARED_BASE
2196 : AMDGPU::SRC_PRIVATE_BASE;
2197 // FIXME: It would be more natural to emit a COPY here, but then copy
2198 // coalescing would kick in and it would think it's okay to use the "HI"
2199 // subregister (instead of extracting the HI 32 bits) which is an artificial
2200 // (unusable) register.
2201 // Register TableGen definitions would need an overhaul to get rid of the
2202 // artificial "HI" aperture registers and prevent this kind of issue from
2203 // happening.
2204 Register Dst = MRI.createGenericVirtualRegister(S64);
2205 MRI.setRegClass(Dst, &AMDGPU::SReg_64RegClass);
2206 B.buildInstr(AMDGPU::S_MOV_B64, {Dst}, {Register(ApertureRegNo)});
2207 return B.buildUnmerge(S32, Dst).getReg(1);
2208 }
2209
2210 // TODO: can we be smarter about machine pointer info?
2212 Register LoadAddr = MRI.createGenericVirtualRegister(
2214 // For code object version 5, private_base and shared_base are passed through
2215 // implicit kernargs.
2222 ST.getTargetLowering()->getImplicitParameterOffset(B.getMF(), Param);
2223
2224 Register KernargPtrReg = MRI.createGenericVirtualRegister(
2226
2227 if (!loadInputValue(KernargPtrReg, B,
2229 return Register();
2230
2232 PtrInfo,
2236
2237 // Pointer address
2238 B.buildPtrAdd(LoadAddr, KernargPtrReg,
2239 B.buildConstant(LLT::scalar(64), Offset).getReg(0));
2240 // Load address
2241 return B.buildLoad(S32, LoadAddr, *MMO).getReg(0);
2242 }
2243
2244 Register QueuePtr = MRI.createGenericVirtualRegister(
2246
2248 return Register();
2249
2250 // Offset into amd_queue_t for group_segment_aperture_base_hi /
2251 // private_segment_aperture_base_hi.
2252 uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;
2253
2255 PtrInfo,
2258 LLT::scalar(32), commonAlignment(Align(64), StructOffset));
2259
2260 B.buildPtrAdd(LoadAddr, QueuePtr,
2261 B.buildConstant(LLT::scalar(64), StructOffset).getReg(0));
2262 return B.buildLoad(S32, LoadAddr, *MMO).getReg(0);
2263}
2264
2265/// Return true if the value is a known valid address, such that a null check is
2266/// not necessary.
2268 const AMDGPUTargetMachine &TM, unsigned AddrSpace) {
2269 MachineInstr *Def = MRI.getVRegDef(Val);
2270 switch (Def->getOpcode()) {
2271 case AMDGPU::G_FRAME_INDEX:
2272 case AMDGPU::G_GLOBAL_VALUE:
2273 case AMDGPU::G_BLOCK_ADDR:
2274 return true;
2275 case AMDGPU::G_CONSTANT: {
2276 const ConstantInt *CI = Def->getOperand(1).getCImm();
2277 return CI->getSExtValue() != TM.getNullPointerValue(AddrSpace);
2278 }
2279 default:
2280 return false;
2281 }
2282
2283 return false;
2284}
2285
2288 MachineIRBuilder &B) const {
2289 MachineFunction &MF = B.getMF();
2290
2291 // MI can either be a G_ADDRSPACE_CAST or a
2292 // G_INTRINSIC @llvm.amdgcn.addrspacecast.nonnull
2293 assert(MI.getOpcode() == TargetOpcode::G_ADDRSPACE_CAST ||
2294 (isa<GIntrinsic>(MI) && cast<GIntrinsic>(MI).getIntrinsicID() ==
2295 Intrinsic::amdgcn_addrspacecast_nonnull));
2296
2297 const LLT S32 = LLT::scalar(32);
2298 Register Dst = MI.getOperand(0).getReg();
2299 Register Src = isa<GIntrinsic>(MI) ? MI.getOperand(2).getReg()
2300 : MI.getOperand(1).getReg();
2301 LLT DstTy = MRI.getType(Dst);
2302 LLT SrcTy = MRI.getType(Src);
2303 unsigned DestAS = DstTy.getAddressSpace();
2304 unsigned SrcAS = SrcTy.getAddressSpace();
2305
2306 // TODO: Avoid reloading from the queue ptr for each cast, or at least each
2307 // vector element.
2308 assert(!DstTy.isVector());
2309
2310 const AMDGPUTargetMachine &TM
2311 = static_cast<const AMDGPUTargetMachine &>(MF.getTarget());
2312
2313 if (TM.isNoopAddrSpaceCast(SrcAS, DestAS)) {
2314 MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST));
2315 return true;
2316 }
2317
2318 if (SrcAS == AMDGPUAS::FLAT_ADDRESS &&
2319 (DestAS == AMDGPUAS::LOCAL_ADDRESS ||
2320 DestAS == AMDGPUAS::PRIVATE_ADDRESS)) {
2321 // For llvm.amdgcn.addrspacecast.nonnull we can always assume non-null, for
2322 // G_ADDRSPACE_CAST we need to guess.
2323 if (isa<GIntrinsic>(MI) || isKnownNonNull(Src, MRI, TM, SrcAS)) {
2324 // Extract low 32-bits of the pointer.
2325 B.buildExtract(Dst, Src, 0);
2326 MI.eraseFromParent();
2327 return true;
2328 }
2329
2330 unsigned NullVal = TM.getNullPointerValue(DestAS);
2331
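 // Null is not the same value in every address space: local and private
 // use -1 while flat uses 0, so a flat null must map to the segment's
 // null value rather than simply being truncated.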
2332 auto SegmentNull = B.buildConstant(DstTy, NullVal);
2333 auto FlatNull = B.buildConstant(SrcTy, 0);
2334
2335 // Extract low 32-bits of the pointer.
2336 auto PtrLo32 = B.buildExtract(DstTy, Src, 0);
2337
2338 auto CmpRes =
2339 B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, FlatNull.getReg(0));
2340 B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0));
2341
2342 MI.eraseFromParent();
2343 return true;
2344 }
2345
2346 if (DestAS == AMDGPUAS::FLAT_ADDRESS &&
2347 (SrcAS == AMDGPUAS::LOCAL_ADDRESS ||
2348 SrcAS == AMDGPUAS::PRIVATE_ADDRESS)) {
2349 Register ApertureReg = getSegmentAperture(SrcAS, MRI, B);
2350 if (!ApertureReg.isValid())
2351 return false;
2352
2353 // Coerce the type of the low half of the result so we can use merge_values.
2354 Register SrcAsInt = B.buildPtrToInt(S32, Src).getReg(0);
2355
2356 // TODO: Should we allow mismatched types but matching sizes in merges to
2357 // avoid the ptrtoint?
2358 auto BuildPtr = B.buildMergeLikeInstr(DstTy, {SrcAsInt, ApertureReg});
2359
2360 // For llvm.amdgcn.addrspacecast.nonnull we can always assume non-null, for
2361 // G_ADDRSPACE_CAST we need to guess.
2362 if (isa<GIntrinsic>(MI) || isKnownNonNull(Src, MRI, TM, SrcAS)) {
2363 B.buildCopy(Dst, BuildPtr);
2364 MI.eraseFromParent();
2365 return true;
2366 }
2367
2368 auto SegmentNull = B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS));
2369 auto FlatNull = B.buildConstant(DstTy, TM.getNullPointerValue(DestAS));
2370
2371 auto CmpRes = B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src,
2372 SegmentNull.getReg(0));
2373
2374 B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull);
2375
2376 MI.eraseFromParent();
2377 return true;
2378 }
2379
2380 if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
2381 SrcTy.getSizeInBits() == 64) {
2382 // Truncate.
2383 B.buildExtract(Dst, Src, 0);
2384 MI.eraseFromParent();
2385 return true;
2386 }
2387
2388 if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
2389 DstTy.getSizeInBits() == 64) {
2391 uint32_t AddrHiVal = Info->get32BitAddressHighBits();
2392 auto PtrLo = B.buildPtrToInt(S32, Src);
2393 auto HighAddr = B.buildConstant(S32, AddrHiVal);
2394 B.buildMergeLikeInstr(Dst, {PtrLo, HighAddr});
2395 MI.eraseFromParent();
2396 return true;
2397 }
2398
2399 DiagnosticInfoUnsupported InvalidAddrSpaceCast(
2400 MF.getFunction(), "invalid addrspacecast", B.getDebugLoc());
2401
2402 LLVMContext &Ctx = MF.getFunction().getContext();
2403 Ctx.diagnose(InvalidAddrSpaceCast);
2404 B.buildUndef(Dst);
2405 MI.eraseFromParent();
2406 return true;
2407}
2408
2411 MachineIRBuilder &B) const {
2412 Register Src = MI.getOperand(1).getReg();
2413 LLT Ty = MRI.getType(Src);
2414 assert(Ty.isScalar() && Ty.getSizeInBits() == 64);
2415
2416 APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
2417 APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");
2418
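 // Adding and then subtracting 2^52 (copysigned from the input) rounds the
 // value to an integer in the current (nearest-even) rounding mode. Inputs
 // with |x| > 0x1.fffffffffffffp+51 are already integers and are passed
 // through unchanged by the final select.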
2419 auto C1 = B.buildFConstant(Ty, C1Val);
2420 auto CopySign = B.buildFCopysign(Ty, C1, Src);
2421
2422 // TODO: Should this propagate fast-math-flags?
2423 auto Tmp1 = B.buildFAdd(Ty, Src, CopySign);
2424 auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign);
2425
2426 auto C2 = B.buildFConstant(Ty, C2Val);
2427 auto Fabs = B.buildFAbs(Ty, Src);
2428
2429 auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2);
2430 B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2);
2431 MI.eraseFromParent();
2432 return true;
2433}
2434
2437 MachineIRBuilder &B) const {
2438
2439 const LLT S1 = LLT::scalar(1);
2440 const LLT S64 = LLT::scalar(64);
2441
2442 Register Src = MI.getOperand(1).getReg();
2443 assert(MRI.getType(Src) == S64);
2444
2445 // result = trunc(src)
2446 // if (src > 0.0 && src != result)
2447 // result += 1.0
2448
2449 auto Trunc = B.buildIntrinsicTrunc(S64, Src);
2450
2451 const auto Zero = B.buildFConstant(S64, 0.0);
2452 const auto One = B.buildFConstant(S64, 1.0);
2453 auto Lt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero);
2454 auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc);
2455 auto And = B.buildAnd(S1, Lt0, NeTrunc);
2456 auto Add = B.buildSelect(S64, And, One, Zero);
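 // Add is 1.0 exactly when the pseudocode's condition holds (src > 0 and
 // src != trunc(src)) and 0.0 otherwise, so the fadd below applies the
 // increment without a branch.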
2457
2458 // TODO: Should this propagate fast-math-flags?
2459 B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add);
2460 MI.eraseFromParent();
2461 return true;
2462}
2463
2466 MachineIRBuilder &B) const {
2467 Register DstReg = MI.getOperand(0).getReg();
2468 Register Src0Reg = MI.getOperand(1).getReg();
2469 Register Src1Reg = MI.getOperand(2).getReg();
2470 auto Flags = MI.getFlags();
2471 LLT Ty = MRI.getType(DstReg);
2472
2473 auto Div = B.buildFDiv(Ty, Src0Reg, Src1Reg, Flags);
2474 auto Trunc = B.buildIntrinsicTrunc(Ty, Div, Flags);
2475 auto Neg = B.buildFNeg(Ty, Trunc, Flags);
2476 B.buildFMA(DstReg, Neg, Src1Reg, Src0Reg, Flags);
2477 MI.eraseFromParent();
2478 return true;
2479}
2480
2483 const unsigned FractBits = 52;
2484 const unsigned ExpBits = 11;
2485 LLT S32 = LLT::scalar(32);
2486
2487 auto Const0 = B.buildConstant(S32, FractBits - 32);
2488 auto Const1 = B.buildConstant(S32, ExpBits);
2489
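 // ubfe(Hi, 20, 11) extracts the 11-bit biased exponent field from the
 // high word of the f64; the bias (1023) is subtracted below.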
2490 auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32})
2491 .addUse(Hi)
2492 .addUse(Const0.getReg(0))
2493 .addUse(Const1.getReg(0));
2494
2495 return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023));
2496}
2497
2500 MachineIRBuilder &B) const {
2501 const LLT S1 = LLT::scalar(1);
2502 const LLT S32 = LLT::scalar(32);
2503 const LLT S64 = LLT::scalar(64);
2504
2505 Register Src = MI.getOperand(1).getReg();
2506 assert(MRI.getType(Src) == S64);
2507
2508 // TODO: Should this use extract since the low half is unused?
2509 auto Unmerge = B.buildUnmerge({S32, S32}, Src);
2510 Register Hi = Unmerge.getReg(1);
2511
2512 // Extract the upper half, since this is where we will find the sign and
2513 // exponent.
2514 auto Exp = extractF64Exponent(Hi, B);
2515
2516 const unsigned FractBits = 52;
2517
2518 // Extract the sign bit.
2519 const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31);
2520 auto SignBit = B.buildAnd(S32, Hi, SignBitMask);
2521
2522 const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1);
2523
2524 const auto Zero32 = B.buildConstant(S32, 0);
2525
2526 // Extend back to 64-bits.
2527 auto SignBit64 = B.buildMergeLikeInstr(S64, {Zero32, SignBit});
2528
2529 auto Shr = B.buildAShr(S64, FractMask, Exp);
2530 auto Not = B.buildNot(S64, Shr);
2531 auto Tmp0 = B.buildAnd(S64, Src, Not);
2532 auto FiftyOne = B.buildConstant(S32, FractBits - 1);
2533
2534 auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32);
2535 auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne);
2536
2537 auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0);
2538 B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1);
2539 MI.eraseFromParent();
2540 return true;
2541}
2542
2545 MachineIRBuilder &B, bool Signed) const {
2546
2547 Register Dst = MI.getOperand(0).getReg();
2548 Register Src = MI.getOperand(1).getReg();
2549
2550 const LLT S64 = LLT::scalar(64);
2551 const LLT S32 = LLT::scalar(32);
2552
2553 assert(MRI.getType(Src) == S64);
2554
2555 auto Unmerge = B.buildUnmerge({S32, S32}, Src);
2556 auto ThirtyTwo = B.buildConstant(S32, 32);
2557
2558 if (MRI.getType(Dst) == S64) {
2559 auto CvtHi = Signed ? B.buildSITOFP(S64, Unmerge.getReg(1))
2560 : B.buildUITOFP(S64, Unmerge.getReg(1));
2561
2562 auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0));
2563 auto LdExp = B.buildFLdexp(S64, CvtHi, ThirtyTwo);
2564
2565 // TODO: Should this propagate fast-math-flags?
2566 B.buildFAdd(Dst, LdExp, CvtLo);
2567 MI.eraseFromParent();
2568 return true;
2569 }
2570
2571 assert(MRI.getType(Dst) == S32);
2572
2573 auto One = B.buildConstant(S32, 1);
2574
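 // Normalize the 64-bit integer so its significant bits land in the high
 // 32 bits: shift left past the redundant leading sign/zero bits, fold any
 // nonzero bits left in the low word into a sticky bit, convert that
 // 32-bit value, and ldexp the result back by (32 - shift amount).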
2575 MachineInstrBuilder ShAmt;
2576 if (Signed) {
2577 auto ThirtyOne = B.buildConstant(S32, 31);
2578 auto X = B.buildXor(S32, Unmerge.getReg(0), Unmerge.getReg(1));
2579 auto OppositeSign = B.buildAShr(S32, X, ThirtyOne);
2580 auto MaxShAmt = B.buildAdd(S32, ThirtyTwo, OppositeSign);
2581 auto LS = B.buildIntrinsic(Intrinsic::amdgcn_sffbh, {S32})
2582 .addUse(Unmerge.getReg(1));
2583 auto LS2 = B.buildSub(S32, LS, One);
2584 ShAmt = B.buildUMin(S32, LS2, MaxShAmt);
2585 } else
2586 ShAmt = B.buildCTLZ(S32, Unmerge.getReg(1));
2587 auto Norm = B.buildShl(S64, Src, ShAmt);
2588 auto Unmerge2 = B.buildUnmerge({S32, S32}, Norm);
2589 auto Adjust = B.buildUMin(S32, One, Unmerge2.getReg(0));
2590 auto Norm2 = B.buildOr(S32, Unmerge2.getReg(1), Adjust);
2591 auto FVal = Signed ? B.buildSITOFP(S32, Norm2) : B.buildUITOFP(S32, Norm2);
2592 auto Scale = B.buildSub(S32, ThirtyTwo, ShAmt);
2593 B.buildFLdexp(Dst, FVal, Scale);
2594 MI.eraseFromParent();
2595 return true;
2596}
2597
2598// TODO: Copied from DAG implementation. Verify logic and document how this
2599// actually works.
2603 bool Signed) const {
2604
2605 Register Dst = MI.getOperand(0).getReg();
2606 Register Src = MI.getOperand(1).getReg();
2607
2608 const LLT S64 = LLT::scalar(64);
2609 const LLT S32 = LLT::scalar(32);
2610
2611 const LLT SrcLT = MRI.getType(Src);
2612 assert((SrcLT == S32 || SrcLT == S64) && MRI.getType(Dst) == S64);
2613
2614 unsigned Flags = MI.getFlags();
2615
2616 // The basic idea of converting a floating point number into a pair of 32-bit
2617 // integers is illustrated as follows:
2618 //
2619 // tf := trunc(val);
2620 // hif := floor(tf * 2^-32);
2621 // lof := tf - hif * 2^32; // lof is always positive due to floor.
2622 // hi := fptoi(hif);
2623 // lo := fptoi(lof);
2624 //
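 // For example, with val = 2^33 + 5 (exact in f64):
 //   tf = 2^33 + 5, hif = floor(tf * 2^-32) = 2, lof = tf - 2 * 2^32 = 5,
 //   so hi = 2 and lo = 5, and {lo, hi} reassembles 2^33 + 5.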
 2625 auto Trunc = B.buildIntrinsicTrunc(SrcLT, Src, Flags);
 2626 MachineInstrBuilder Sign;
2627 if (Signed && SrcLT == S32) {
2628 // However, a 32-bit floating point number has only 23 bits mantissa and
2629 // it's not enough to hold all the significant bits of `lof` if val is
 2630 // negative. To avoid the loss of precision, we need to take the absolute
2631 // value after truncating and flip the result back based on the original
2632 // signedness.
2633 Sign = B.buildAShr(S32, Src, B.buildConstant(S32, 31));
2634 Trunc = B.buildFAbs(S32, Trunc, Flags);
2635 }
2636 MachineInstrBuilder K0, K1;
2637 if (SrcLT == S64) {
2638 K0 = B.buildFConstant(
2639 S64, llvm::bit_cast<double>(UINT64_C(/*2^-32*/ 0x3df0000000000000)));
2640 K1 = B.buildFConstant(
2641 S64, llvm::bit_cast<double>(UINT64_C(/*-2^32*/ 0xc1f0000000000000)));
2642 } else {
2643 K0 = B.buildFConstant(
2644 S32, llvm::bit_cast<float>(UINT32_C(/*2^-32*/ 0x2f800000)));
2645 K1 = B.buildFConstant(
2646 S32, llvm::bit_cast<float>(UINT32_C(/*-2^32*/ 0xcf800000)));
2647 }
2648
2649 auto Mul = B.buildFMul(SrcLT, Trunc, K0, Flags);
2650 auto FloorMul = B.buildFFloor(SrcLT, Mul, Flags);
2651 auto Fma = B.buildFMA(SrcLT, FloorMul, K1, Trunc, Flags);
2652
2653 auto Hi = (Signed && SrcLT == S64) ? B.buildFPTOSI(S32, FloorMul)
2654 : B.buildFPTOUI(S32, FloorMul);
2655 auto Lo = B.buildFPTOUI(S32, Fma);
2656
2657 if (Signed && SrcLT == S32) {
2658 // Flip the result based on the signedness, which is either all 0s or 1s.
2659 Sign = B.buildMergeLikeInstr(S64, {Sign, Sign});
2660 // r := xor({lo, hi}, sign) - sign;
2661 B.buildSub(Dst, B.buildXor(S64, B.buildMergeLikeInstr(S64, {Lo, Hi}), Sign),
2662 Sign);
2663 } else
2664 B.buildMergeLikeInstr(Dst, {Lo, Hi});
2665 MI.eraseFromParent();
2666
2667 return true;
2668}
2669
2671 MachineInstr &MI) const {
2672 MachineFunction &MF = Helper.MIRBuilder.getMF();
2674
2675 const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE ||
2676 MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE;
2677
2678 // With ieee_mode disabled, the instructions have the correct behavior
2679 // already for G_FMINNUM/G_FMAXNUM
2680 if (!MFI->getMode().IEEE)
2681 return !IsIEEEOp;
2682
2683 if (IsIEEEOp)
2684 return true;
2685
 2686 return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized;
 2687}
2688
2691 MachineIRBuilder &B) const {
2692 // TODO: Should move some of this into LegalizerHelper.
2693
2694 // TODO: Promote dynamic indexing of s16 to s32
2695
2696 Register Dst = MI.getOperand(0).getReg();
2697 Register Vec = MI.getOperand(1).getReg();
2698
2699 LLT VecTy = MRI.getType(Vec);
2700 LLT EltTy = VecTy.getElementType();
2701 assert(EltTy == MRI.getType(Dst));
2702
2703 // Other legalization maps vector<? x [type bigger than 64 bits]> via bitcasts
 2704 // but we can't go directly to that logic because you can't bitcast a vector
2705 // of pointers to a vector of integers. Therefore, introduce an intermediate
2706 // vector of integers using ptrtoint (and inttoptr on the output) in order to
2707 // drive the legalization forward.
2708 if (EltTy.isPointer() && EltTy.getSizeInBits() > 64) {
2709 LLT IntTy = LLT::scalar(EltTy.getSizeInBits());
2710 LLT IntVecTy = VecTy.changeElementType(IntTy);
2711
2712 auto IntVec = B.buildPtrToInt(IntVecTy, Vec);
2713 auto IntElt = B.buildExtractVectorElement(IntTy, IntVec, MI.getOperand(2));
2714 B.buildIntToPtr(Dst, IntElt);
2715
2716 MI.eraseFromParent();
2717 return true;
2718 }
2719
2720 // FIXME: Artifact combiner probably should have replaced the truncated
2721 // constant before this, so we shouldn't need
2722 // getIConstantVRegValWithLookThrough.
2723 std::optional<ValueAndVReg> MaybeIdxVal =
2724 getIConstantVRegValWithLookThrough(MI.getOperand(2).getReg(), MRI);
2725 if (!MaybeIdxVal) // Dynamic case will be selected to register indexing.
2726 return true;
2727 const uint64_t IdxVal = MaybeIdxVal->Value.getZExtValue();
2728
2729 if (IdxVal < VecTy.getNumElements()) {
2730 auto Unmerge = B.buildUnmerge(EltTy, Vec);
2731 B.buildCopy(Dst, Unmerge.getReg(IdxVal));
2732 } else {
2733 B.buildUndef(Dst);
2734 }
2735
2736 MI.eraseFromParent();
2737 return true;
2738}
2739
2742 MachineIRBuilder &B) const {
2743 // TODO: Should move some of this into LegalizerHelper.
2744
2745 // TODO: Promote dynamic indexing of s16 to s32
2746
2747 Register Dst = MI.getOperand(0).getReg();
2748 Register Vec = MI.getOperand(1).getReg();
2749 Register Ins = MI.getOperand(2).getReg();
2750
2751 LLT VecTy = MRI.getType(Vec);
2752 LLT EltTy = VecTy.getElementType();
2753 assert(EltTy == MRI.getType(Ins));
2754
2755 // Other legalization maps vector<? x [type bigger than 64 bits]> via bitcasts
 2756 // but we can't go directly to that logic because you can't bitcast a vector
2757 // of pointers to a vector of integers. Therefore, make the pointer vector
2758 // into an equivalent vector of integers with ptrtoint, insert the ptrtoint'd
2759 // new value, and then inttoptr the result vector back. This will then allow
2760 // the rest of legalization to take over.
2761 if (EltTy.isPointer() && EltTy.getSizeInBits() > 64) {
2762 LLT IntTy = LLT::scalar(EltTy.getSizeInBits());
2763 LLT IntVecTy = VecTy.changeElementType(IntTy);
2764
2765 auto IntVecSource = B.buildPtrToInt(IntVecTy, Vec);
2766 auto IntIns = B.buildPtrToInt(IntTy, Ins);
2767 auto IntVecDest = B.buildInsertVectorElement(IntVecTy, IntVecSource, IntIns,
2768 MI.getOperand(3));
2769 B.buildIntToPtr(Dst, IntVecDest);
2770 MI.eraseFromParent();
2771 return true;
2772 }
2773
2774 // FIXME: Artifact combiner probably should have replaced the truncated
2775 // constant before this, so we shouldn't need
2776 // getIConstantVRegValWithLookThrough.
2777 std::optional<ValueAndVReg> MaybeIdxVal =
2778 getIConstantVRegValWithLookThrough(MI.getOperand(3).getReg(), MRI);
2779 if (!MaybeIdxVal) // Dynamic case will be selected to register indexing.
2780 return true;
2781
2782 const uint64_t IdxVal = MaybeIdxVal->Value.getZExtValue();
2783
2784 unsigned NumElts = VecTy.getNumElements();
2785 if (IdxVal < NumElts) {
2787 for (unsigned i = 0; i < NumElts; ++i)
2788 SrcRegs.push_back(MRI.createGenericVirtualRegister(EltTy));
2789 B.buildUnmerge(SrcRegs, Vec);
2790
2791 SrcRegs[IdxVal] = MI.getOperand(2).getReg();
2792 B.buildMergeLikeInstr(Dst, SrcRegs);
2793 } else {
2794 B.buildUndef(Dst);
2795 }
2796
2797 MI.eraseFromParent();
2798 return true;
2799}
2800
2803 MachineIRBuilder &B) const {
2804
2805 Register DstReg = MI.getOperand(0).getReg();
2806 Register SrcReg = MI.getOperand(1).getReg();
2807 LLT Ty = MRI.getType(DstReg);
2808 unsigned Flags = MI.getFlags();
2809
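 // The amdgcn_sin / amdgcn_cos intrinsics take the angle pre-scaled by
 // 1/(2*pi), hence the multiply below; subtargets with a reduced trig
 // input range additionally range-reduce the scaled value with
 // amdgcn_fract.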
2810 Register TrigVal;
2811 auto OneOver2Pi = B.buildFConstant(Ty, 0.5 * numbers::inv_pi);
2812 if (ST.hasTrigReducedRange()) {
2813 auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags);
2814 TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty})
2815 .addUse(MulVal.getReg(0))
2816 .setMIFlags(Flags)
2817 .getReg(0);
2818 } else
2819 TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0);
2820
2821 Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ?
2822 Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos;
2823 B.buildIntrinsic(TrigIntrin, ArrayRef<Register>(DstReg))
2824 .addUse(TrigVal)
2825 .setMIFlags(Flags);
2826 MI.eraseFromParent();
2827 return true;
2828}
2829
2832 const GlobalValue *GV,
2833 int64_t Offset,
2834 unsigned GAFlags) const {
2835 assert(isInt<32>(Offset + 4) && "32-bit offset is expected!");
2836 // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered
2837 // to the following code sequence:
2838 //
2839 // For constant address space:
2840 // s_getpc_b64 s[0:1]
2841 // s_add_u32 s0, s0, $symbol
2842 // s_addc_u32 s1, s1, 0
2843 //
2844 // s_getpc_b64 returns the address of the s_add_u32 instruction and then
2845 // a fixup or relocation is emitted to replace $symbol with a literal
2846 // constant, which is a pc-relative offset from the encoding of the $symbol
2847 // operand to the global variable.
2848 //
2849 // For global address space:
2850 // s_getpc_b64 s[0:1]
2851 // s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo
2852 // s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi
2853 //
2854 // s_getpc_b64 returns the address of the s_add_u32 instruction and then
2855 // fixups or relocations are emitted to replace $symbol@*@lo and
2856 // $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant,
2857 // which is a 64-bit pc-relative offset from the encoding of the $symbol
2858 // operand to the global variable.
2859
2861
2862 Register PCReg = PtrTy.getSizeInBits() != 32 ? DstReg :
2863 B.getMRI()->createGenericVirtualRegister(ConstPtrTy);
2864
2865 MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET)
2866 .addDef(PCReg);
2867
2868 MIB.addGlobalAddress(GV, Offset, GAFlags);
2869 if (GAFlags == SIInstrInfo::MO_NONE)
2870 MIB.addImm(0);
2871 else
2872 MIB.addGlobalAddress(GV, Offset, GAFlags + 1);
2873
2874 if (!B.getMRI()->getRegClassOrNull(PCReg))
2875 B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass);
2876
2877 if (PtrTy.getSizeInBits() == 32)
2878 B.buildExtract(DstReg, PCReg, 0);
2879 return true;
2880}
2881
 2882// Emit an ABS32_LO / ABS32_HI relocation stub.
2884 Register DstReg, LLT PtrTy, MachineIRBuilder &B, const GlobalValue *GV,
2885 MachineRegisterInfo &MRI) const {
2886 bool RequiresHighHalf = PtrTy.getSizeInBits() != 32;
2887
2888 LLT S32 = LLT::scalar(32);
2889
2890 // Use the destination directly, if and only if we store the lower address
2891 // part only and we don't have a register class being set.
2892 Register AddrLo = !RequiresHighHalf && !MRI.getRegClassOrNull(DstReg)
2893 ? DstReg
2894 : MRI.createGenericVirtualRegister(S32);
2895
2896 if (!MRI.getRegClassOrNull(AddrLo))
2897 MRI.setRegClass(AddrLo, &AMDGPU::SReg_32RegClass);
2898
2899 // Write the lower half.
2900 B.buildInstr(AMDGPU::S_MOV_B32)
2901 .addDef(AddrLo)
2902 .addGlobalAddress(GV, 0, SIInstrInfo::MO_ABS32_LO);
2903
2904 // If required, write the upper half as well.
2905 if (RequiresHighHalf) {
2906 assert(PtrTy.getSizeInBits() == 64 &&
2907 "Must provide a 64-bit pointer type!");
2908
2909 Register AddrHi = MRI.createGenericVirtualRegister(S32);
2910 MRI.setRegClass(AddrHi, &AMDGPU::SReg_32RegClass);
2911
2912 B.buildInstr(AMDGPU::S_MOV_B32)
2913 .addDef(AddrHi)
2914 .addGlobalAddress(GV, 0, SIInstrInfo::MO_ABS32_HI);
2915
2916 // Use the destination directly, if and only if we don't have a register
2917 // class being set.
2918 Register AddrDst = !MRI.getRegClassOrNull(DstReg)
2919 ? DstReg
2920 : MRI.createGenericVirtualRegister(LLT::scalar(64));
2921
2922 if (!MRI.getRegClassOrNull(AddrDst))
2923 MRI.setRegClass(AddrDst, &AMDGPU::SReg_64RegClass);
2924
2925 B.buildMergeValues(AddrDst, {AddrLo, AddrHi});
2926
2927 // If we created a new register for the destination, cast the result into
2928 // the final output.
2929 if (AddrDst != DstReg)
2930 B.buildCast(DstReg, AddrDst);
2931 } else if (AddrLo != DstReg) {
2932 // If we created a new register for the destination, cast the result into
2933 // the final output.
2934 B.buildCast(DstReg, AddrLo);
2935 }
2936}
2937
2940 MachineIRBuilder &B) const {
2941 Register DstReg = MI.getOperand(0).getReg();
2942 LLT Ty = MRI.getType(DstReg);
2943 unsigned AS = Ty.getAddressSpace();
2944
2945 const GlobalValue *GV = MI.getOperand(1).getGlobal();
2946 MachineFunction &MF = B.getMF();
2948
2950 if (!MFI->isModuleEntryFunction() &&
2951 GV->getName() != "llvm.amdgcn.module.lds") {
2952 const Function &Fn = MF.getFunction();
2953 DiagnosticInfoUnsupported BadLDSDecl(
2954 Fn, "local memory global used by non-kernel function", MI.getDebugLoc(),
2955 DS_Warning);
2956 Fn.getContext().diagnose(BadLDSDecl);
2957
2958 // We currently don't have a way to correctly allocate LDS objects that
2959 // aren't directly associated with a kernel. We do force inlining of
2960 // functions that use local objects. However, if these dead functions are
2961 // not eliminated, we don't want a compile time error. Just emit a warning
2962 // and a trap, since there should be no callable path here.
2963 B.buildTrap();
2964 B.buildUndef(DstReg);
2965 MI.eraseFromParent();
2966 return true;
2967 }
2968
2969 // TODO: We could emit code to handle the initialization somewhere.
2970 // We ignore the initializer for now and legalize it to allow selection.
 2971 // The initializer will be rejected during assembly emission anyway.
2972 const SITargetLowering *TLI = ST.getTargetLowering();
2973 if (!TLI->shouldUseLDSConstAddress(GV)) {
2974 MI.getOperand(1).setTargetFlags(SIInstrInfo::MO_ABS32_LO);
2975 return true; // Leave in place;
2976 }
2977
2978 if (AS == AMDGPUAS::LOCAL_ADDRESS && GV->hasExternalLinkage()) {
2979 Type *Ty = GV->getValueType();
2980 // HIP uses an unsized array `extern __shared__ T s[]` or similar
2981 // zero-sized type in other languages to declare the dynamic shared
 2982 // memory whose size is not known at compile time. They will be
 2983 // allocated by the runtime and placed directly after the statically
 2984 // allocated ones. They all share the same offset.
2985 if (B.getDataLayout().getTypeAllocSize(Ty).isZero()) {
2986 // Adjust alignment for that dynamic shared memory array.
2987 MFI->setDynLDSAlign(MF.getFunction(), *cast<GlobalVariable>(GV));
2988 LLT S32 = LLT::scalar(32);
2989 auto Sz = B.buildIntrinsic(Intrinsic::amdgcn_groupstaticsize, {S32});
2990 B.buildIntToPtr(DstReg, Sz);
2991 MI.eraseFromParent();
2992 return true;
2993 }
2994 }
2995
2996 B.buildConstant(DstReg, MFI->allocateLDSGlobal(B.getDataLayout(),
2997 *cast<GlobalVariable>(GV)));
2998 MI.eraseFromParent();
2999 return true;
3000 }
3001
3002 if (ST.isAmdPalOS() || ST.isMesa3DOS()) {
3003 buildAbsGlobalAddress(DstReg, Ty, B, GV, MRI);
3004 MI.eraseFromParent();
3005 return true;
3006 }
3007
3008 const SITargetLowering *TLI = ST.getTargetLowering();
3009
3010 if (TLI->shouldEmitFixup(GV)) {
3011 buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0);
3012 MI.eraseFromParent();
3013 return true;
3014 }
3015
3016 if (TLI->shouldEmitPCReloc(GV)) {
3017 buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32);
3018 MI.eraseFromParent();
3019 return true;
3020 }
3021
3023 Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy);
3024
3025 LLT LoadTy = Ty.getSizeInBits() == 32 ? PtrTy : Ty;
3030 LoadTy, Align(8));
3031
3032 buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32);
3033
3034 if (Ty.getSizeInBits() == 32) {
3035 // Truncate if this is a 32-bit constant address.
3036 auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO);
3037 B.buildExtract(DstReg, Load, 0);
3038 } else
3039 B.buildLoad(DstReg, GOTAddr, *GOTMMO);
3040
3041 MI.eraseFromParent();
3042 return true;
3043}
3044
3046 if (Ty.isVector())
3047 return Ty.changeElementCount(
3050}
3051
3053 MachineInstr &MI) const {
3054 MachineIRBuilder &B = Helper.MIRBuilder;
3055 MachineRegisterInfo &MRI = *B.getMRI();
3056 GISelChangeObserver &Observer = Helper.Observer;
3057
3058 Register PtrReg = MI.getOperand(1).getReg();
3059 LLT PtrTy = MRI.getType(PtrReg);
3060 unsigned AddrSpace = PtrTy.getAddressSpace();
3061
3062 if (AddrSpace == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
3064 auto Cast = B.buildAddrSpaceCast(ConstPtr, PtrReg);
3065 Observer.changingInstr(MI);
3066 MI.getOperand(1).setReg(Cast.getReg(0));
3067 Observer.changedInstr(MI);
3068 return true;
3069 }
3070
3071 if (MI.getOpcode() != AMDGPU::G_LOAD)
3072 return false;
3073
3074 Register ValReg = MI.getOperand(0).getReg();
3075 LLT ValTy = MRI.getType(ValReg);
3076
3077 if (hasBufferRsrcWorkaround(ValTy)) {
3078 Observer.changingInstr(MI);
3080 Observer.changedInstr(MI);
3081 return true;
3082 }
3083
3084 MachineMemOperand *MMO = *MI.memoperands_begin();
3085 const unsigned ValSize = ValTy.getSizeInBits();
3086 const LLT MemTy = MMO->getMemoryType();
3087 const Align MemAlign = MMO->getAlign();
3088 const unsigned MemSize = MemTy.getSizeInBits();
3089 const uint64_t AlignInBits = 8 * MemAlign.value();
3090
3091 // Widen non-power-of-2 loads to the alignment if needed
3092 if (shouldWidenLoad(ST, MemTy, AlignInBits, AddrSpace, MI.getOpcode())) {
3093 const unsigned WideMemSize = PowerOf2Ceil(MemSize);
3094
3095 // This was already the correct extending load result type, so just adjust
3096 // the memory type.
3097 if (WideMemSize == ValSize) {
3098 MachineFunction &MF = B.getMF();
3099
3100 MachineMemOperand *WideMMO =
3101 MF.getMachineMemOperand(MMO, 0, WideMemSize / 8);
3102 Observer.changingInstr(MI);
3103 MI.setMemRefs(MF, {WideMMO});
3104 Observer.changedInstr(MI);
3105 return true;
3106 }
3107
3108 // Don't bother handling edge case that should probably never be produced.
3109 if (ValSize > WideMemSize)
3110 return false;
3111
3112 LLT WideTy = widenToNextPowerOf2(ValTy);
3113
3114 Register WideLoad;
3115 if (!WideTy.isVector()) {
3116 WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
3117 B.buildTrunc(ValReg, WideLoad).getReg(0);
3118 } else {
3119 // Extract the subvector.
3120
3121 if (isRegisterType(ValTy)) {
 3122 // If this is a case where G_EXTRACT is legal, use it.
3123 // (e.g. <3 x s32> -> <4 x s32>)
3124 WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
3125 B.buildExtract(ValReg, WideLoad, 0);
3126 } else {
3127 // For cases where the widened type isn't a nice register value, unmerge
3128 // from a widened register (e.g. <3 x s16> -> <4 x s16>)
3129 WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
3130 B.buildDeleteTrailingVectorElements(ValReg, WideLoad);
3131 }
3132 }
3133
3134 MI.eraseFromParent();
3135 return true;
3136 }
3137
3138 return false;
3139}
3140
3142 MachineInstr &MI) const {
3143 MachineIRBuilder &B = Helper.MIRBuilder;
3144 MachineRegisterInfo &MRI = *B.getMRI();
3145 GISelChangeObserver &Observer = Helper.Observer;
3146
3147 Register DataReg = MI.getOperand(0).getReg();
3148 LLT DataTy = MRI.getType(DataReg);
3149
3150 if (hasBufferRsrcWorkaround(DataTy)) {
3151 Observer.changingInstr(MI);
3153 Observer.changedInstr(MI);
3154 return true;
3155 }
3156 return false;
3157}
3158
3161 MachineIRBuilder &B) const {
3162 LLT Ty = MRI.getType(MI.getOperand(0).getReg());
3163 assert(Ty.isScalar());
3164
3165 MachineFunction &MF = B.getMF();
3167
3168 // TODO: Always legal with future ftz flag.
3169 // FIXME: Do we need just output?
3170 if (Ty == LLT::float32() &&
3172 return true;
3173 if (Ty == LLT::float16() &&
3175 return true;
3176
3177 MachineIRBuilder HelperBuilder(MI);
3178 GISelObserverWrapper DummyObserver;
3179 LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
3180 return Helper.lowerFMad(MI) == LegalizerHelper::Legalized;
3181}
3182
3185 Register DstReg = MI.getOperand(0).getReg();
3186 Register PtrReg = MI.getOperand(1).getReg();
3187 Register CmpVal = MI.getOperand(2).getReg();
3188 Register NewVal = MI.getOperand(3).getReg();
3189
3190 assert(AMDGPU::isFlatGlobalAddrSpace(MRI.getType(PtrReg).getAddressSpace()) &&
3191 "this should not have been custom lowered");
3192
3193 LLT ValTy = MRI.getType(CmpVal);
3194 LLT VecTy = LLT::fixed_vector(2, ValTy);
3195
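 // Pack the new and compare values into a single <2 x ty> operand; the
 // G_AMDGPU_ATOMIC_CMPXCHG pseudo built below takes this marshalled pair
 // as its data operand (the input marshalling mentioned at the
 // G_ATOMIC_CMPXCHG rules above).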
3196 Register PackedVal = B.buildBuildVector(VecTy, { NewVal, CmpVal }).getReg(0);
3197
3198 B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG)
3199 .addDef(DstReg)
3200 .addUse(PtrReg)
3201 .addUse(PackedVal)
3202 .setMemRefs(MI.memoperands());
3203
3204 MI.eraseFromParent();
3205 return true;
3206}
3207
3208/// Return true if it's known that \p Src can never be an f32 denormal value.
3210 Register Src) {
3211 const MachineInstr *DefMI = MRI.getVRegDef(Src);
3212 switch (DefMI->getOpcode()) {
3213 case TargetOpcode::G_INTRINSIC: {
3214 switch (cast<GIntrinsic>(DefMI)->getIntrinsicID()) {
3215 case Intrinsic::amdgcn_frexp_mant:
3216 return true;
3217 default:
3218 break;
3219 }
3220
3221 break;
3222 }
3223 case TargetOpcode::G_FFREXP: {
3224 if (DefMI->getOperand(0).getReg() == Src)
3225 return true;
3226 break;
3227 }
3228 case TargetOpcode::G_FPEXT: {
3229 return MRI.getType(DefMI->getOperand(1).getReg()) == LLT::scalar(16);
3230 }
3231 default:
3232 return false;
3233 }
3234
3235 return false;
3236}
3237
3238static bool allowApproxFunc(const MachineFunction &MF, unsigned Flags) {
3239 if (Flags & MachineInstr::FmAfn)
3240 return true;
3241 const auto &Options = MF.getTarget().Options;
3242 return Options.UnsafeFPMath || Options.ApproxFuncFPMath;
3243}
3244
3246 unsigned Flags) {
3247 return !valueIsKnownNeverF32Denorm(MF.getRegInfo(), Src) &&
3250}
3251
3252std::pair<Register, Register>
3254 unsigned Flags) const {
3255 if (!needsDenormHandlingF32(B.getMF(), Src, Flags))
3256 return {};
3257
3258 const LLT F32 = LLT::scalar(32);
3259 auto SmallestNormal = B.buildFConstant(
3261 auto IsLtSmallestNormal =
3262 B.buildFCmp(CmpInst::FCMP_OLT, LLT::scalar(1), Src, SmallestNormal);
3263
3264 auto Scale32 = B.buildFConstant(F32, 0x1.0p+32);
3265 auto One = B.buildFConstant(F32, 1.0);
3266 auto ScaleFactor =
3267 B.buildSelect(F32, IsLtSmallestNormal, Scale32, One, Flags);
3268 auto ScaledInput = B.buildFMul(F32, Src, ScaleFactor, Flags);
3269
3270 return {ScaledInput.getReg(0), IsLtSmallestNormal.getReg(0)};
3271}
3272
3274 MachineIRBuilder &B) const {
3275 // v_log_f32 is good enough for OpenCL, except it doesn't handle denormals.
3276 // If we have to handle denormals, scale up the input and adjust the result.
3277
3278 // scaled = x * (is_denormal ? 0x1.0p+32 : 1.0)
3279 // log2 = amdgpu_log2 - (is_denormal ? 32.0 : 0.0)
3280
3281 Register Dst = MI.getOperand(0).getReg();
3282 Register Src = MI.getOperand(1).getReg();
3283 LLT Ty = B.getMRI()->getType(Dst);
3284 unsigned Flags = MI.getFlags();
3285
3286 if (Ty == LLT::scalar(16)) {
3287 const LLT F32 = LLT::scalar(32);
3288 // Nothing in half is a denormal when promoted to f32.
3289 auto Ext = B.buildFPExt(F32, Src, Flags);
3290 auto Log2 = B.buildIntrinsic(Intrinsic::amdgcn_log, {F32})
3291 .addUse(Ext.getReg(0))
3292 .setMIFlags(Flags);
3293 B.buildFPTrunc(Dst, Log2, Flags);
3294 MI.eraseFromParent();
3295 return true;
3296 }
3297
3298 assert(Ty == LLT::scalar(32));
3299
3300 auto [ScaledInput, IsLtSmallestNormal] = getScaledLogInput(B, Src, Flags);
3301 if (!ScaledInput) {
3302 B.buildIntrinsic(Intrinsic::amdgcn_log, {MI.getOperand(0)})
3303 .addUse(Src)
3304 .setMIFlags(Flags);
3305 MI.eraseFromParent();
3306 return true;
3307 }
3308
3309 auto Log2 = B.buildIntrinsic(Intrinsic::amdgcn_log, {Ty})
3310 .addUse(ScaledInput)
3311 .setMIFlags(Flags);
3312
3313 auto ThirtyTwo = B.buildFConstant(Ty, 32.0);
3314 auto Zero = B.buildFConstant(Ty, 0.0);
3315 auto ResultOffset =
3316 B.buildSelect(Ty, IsLtSmallestNormal, ThirtyTwo, Zero, Flags);
3317 B.buildFSub(Dst, Log2, ResultOffset, Flags);
3318
3319 MI.eraseFromParent();
3320 return true;
3321}
3322
3324 Register Z, unsigned Flags) {
3325 auto FMul = B.buildFMul(Ty, X, Y, Flags);
3326 return B.buildFAdd(Ty, FMul, Z, Flags).getReg(0);
3327}
3328
3330 MachineIRBuilder &B) const {
3331 const bool IsLog10 = MI.getOpcode() == TargetOpcode::G_FLOG10;
3332 assert(IsLog10 || MI.getOpcode() == TargetOpcode::G_FLOG);
3333
3334 MachineRegisterInfo &MRI = *B.getMRI();
3335 Register Dst = MI.getOperand(0).getReg();
3336 Register X = MI.getOperand(1).getReg();
3337 unsigned Flags = MI.getFlags();
3338 const LLT Ty = MRI.getType(X);
3339 MachineFunction &MF = B.getMF();
3340
3341 const LLT F32 = LLT::scalar(32);
3342 const LLT F16 = LLT::scalar(16);
3343
3344 const AMDGPUTargetMachine &TM =
3345 static_cast<const AMDGPUTargetMachine &>(MF.getTarget());
3346
3347 if (Ty == F16 || MI.getFlag(MachineInstr::FmAfn) ||
3348 TM.Options.ApproxFuncFPMath || TM.Options.UnsafeFPMath) {
3349 if (Ty == F16 && !ST.has16BitInsts()) {
3350 Register LogVal = MRI.createGenericVirtualRegister(F32);
3351 auto PromoteSrc = B.buildFPExt(F32, X);
3352 legalizeFlogUnsafe(B, LogVal, PromoteSrc.getReg(0), IsLog10, Flags);
3353 B.buildFPTrunc(Dst, LogVal);
3354 } else {
3355 legalizeFlogUnsafe(B, Dst, X, IsLog10, Flags);
3356 }
3357
3358 MI.eraseFromParent();
3359 return true;
3360 }
3361
3362 auto [ScaledInput, IsScaled] = getScaledLogInput(B, X, Flags);
3363 if (ScaledInput)
3364 X = ScaledInput;
3365
3366 auto Y =
3367 B.buildIntrinsic(Intrinsic::amdgcn_log, {Ty}).addUse(X).setMIFlags(Flags);
3368
3369 Register R;
3370 if (ST.hasFastFMAF32()) {
 3372 // c+cc is ln(2)/ln(10) to more than 49 bits
3372 const float c_log10 = 0x1.344134p-2f;
3373 const float cc_log10 = 0x1.09f79ep-26f;
3374
3375 // c + cc is ln(2) to more than 49 bits
3376 const float c_log = 0x1.62e42ep-1f;
3377 const float cc_log = 0x1.efa39ep-25f;
3378
3379 auto C = B.buildFConstant(Ty, IsLog10 ? c_log10 : c_log);
3380 auto CC = B.buildFConstant(Ty, IsLog10 ? cc_log10 : cc_log);
3381
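 // Compensated multiply: FMA0 recovers the rounding error of the product
 // Y*C, and FMA1 folds in the low-order part CC of the constant, so the
 // final sum approximates Y*(c+cc) to roughly twice the working precision.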
3382 R = B.buildFMul(Ty, Y, C, Flags).getReg(0);
3383 auto NegR = B.buildFNeg(Ty, R, Flags);
3384 auto FMA0 = B.buildFMA(Ty, Y, C, NegR, Flags);
3385 auto FMA1 = B.buildFMA(Ty, Y, CC, FMA0, Flags);
3386 R = B.buildFAdd(Ty, R, FMA1, Flags).getReg(0);
3387 } else {
3388 // ch+ct is ln(2)/ln(10) to more than 36 bits
3389 const float ch_log10 = 0x1.344000p-2f;
3390 const float ct_log10 = 0x1.3509f6p-18f;
3391
3392 // ch + ct is ln(2) to more than 36 bits
3393 const float ch_log = 0x1.62e000p-1f;
3394 const float ct_log = 0x1.0bfbe8p-15f;
3395
3396 auto CH = B.buildFConstant(Ty, IsLog10 ? ch_log10 : ch_log);
3397 auto CT = B.buildFConstant(Ty, IsLog10 ? ct_log10 : ct_log);
3398
3399 auto MaskConst = B.buildConstant(Ty, 0xfffff000);
3400 auto YH = B.buildAnd(Ty, Y, MaskConst);
3401 auto YT = B.buildFSub(Ty, Y, YH, Flags);
3402 auto YTCT = B.buildFMul(Ty, YT, CT, Flags);
3403
3404 Register Mad0 =
3405 getMad(B, Ty, YH.getReg(0), CT.getReg(0), YTCT.getReg(0), Flags);
3406 Register Mad1 = getMad(B, Ty, YT.getReg(0), CH.getReg(0), Mad0, Flags);
3407 R = getMad(B, Ty, YH.getReg(0), CH.getReg(0), Mad1, Flags);
3408 }
3409
3410 const bool IsFiniteOnly =
3411 (MI.getFlag(MachineInstr::FmNoNans) || TM.Options.NoNaNsFPMath) &&
3412 (MI.getFlag(MachineInstr::FmNoInfs) || TM.Options.NoInfsFPMath);
3413
3414 if (!IsFiniteOnly) {
3415 // Expand isfinite(x) => fabs(x) < inf
3416 auto Inf = B.buildFConstant(Ty, APFloat::getInf(APFloat::IEEEsingle()));
3417 auto Fabs = B.buildFAbs(Ty, Y);
3418 auto IsFinite =
3419 B.buildFCmp(CmpInst::FCMP_OLT, LLT::scalar(1), Fabs, Inf, Flags);
3420 R = B.buildSelect(Ty, IsFinite, R, Y, Flags).getReg(0);
3421 }
3422
3423 if (ScaledInput) {
3424 auto Zero = B.buildFConstant(Ty, 0.0);
3425 auto ShiftK =
3426 B.buildFConstant(Ty, IsLog10 ? 0x1.344136p+3f : 0x1.62e430p+4f);
3427 auto Shift = B.buildSelect(Ty, IsScaled, ShiftK, Zero, Flags);
3428 B.buildFSub(Dst, R, Shift, Flags);
3429 } else {
3430 B.buildCopy(Dst, R);
3431 }
3432
3433 MI.eraseFromParent();
3434 return true;
3435}
3436
3438 Register Src, bool IsLog10,
3439 unsigned Flags) const {
3440 const double Log2BaseInverted =
3442
3443 LLT Ty = B.getMRI()->getType(Dst);
3444
3445 if (Ty == LLT::scalar(32)) {
3446 auto [ScaledInput, IsScaled] = getScaledLogInput(B, Src, Flags);
3447 if (ScaledInput) {
3448 auto LogSrc = B.buildIntrinsic(Intrinsic::amdgcn_log, {Ty})
3449 .addUse(Src)
3450 .setMIFlags(Flags);
3451 auto ScaledResultOffset = B.buildFConstant(Ty, -32.0 * Log2BaseInverted);
3452 auto Zero = B.buildFConstant(Ty, 0.0);
3453 auto ResultOffset =
3454 B.buildSelect(Ty, IsScaled, ScaledResultOffset, Zero, Flags);
3455 auto Log2Inv = B.buildFConstant(Ty, Log2BaseInverted);
3456
3457 if (ST.hasFastFMAF32())
3458 B.buildFMA(Dst, LogSrc, Log2Inv, ResultOffset, Flags);
3459 else {
3460 auto Mul = B.buildFMul(Ty, LogSrc, Log2Inv, Flags);
3461 B.buildFAdd(Dst, Mul, ResultOffset, Flags);
3462 }
3463
3464 return true;
3465 }
3466 }
3467
3468 auto Log2Operand = Ty == LLT::scalar(16)
3469 ? B.buildFLog2(Ty, Src, Flags)
3470 : B.buildIntrinsic(Intrinsic::amdgcn_log, {Ty})
3471 .addUse(Src)
3472 .setMIFlags(Flags);
3473 auto Log2BaseInvertedOperand = B.buildFConstant(Ty, Log2BaseInverted);
3474 B.buildFMul(Dst, Log2Operand, Log2BaseInvertedOperand, Flags);
3475 return true;
3476}
3477
3479 MachineIRBuilder &B) const {
3480 // v_exp_f32 is good enough for OpenCL, except it doesn't handle denormals.
3481 // If we have to handle denormals, scale up the input and adjust the result.
3482
3483 Register Dst = MI.getOperand(0).getReg();
3484 Register Src = MI.getOperand(1).getReg();
3485 unsigned Flags = MI.getFlags();
3486 LLT Ty = B.getMRI()->getType(Dst);
3487 const LLT F16 = LLT::scalar(16);
3488 const LLT F32 = LLT::scalar(32);
3489
3490 if (Ty == F16) {
3491 // Nothing in half is a denormal when promoted to f32.
3492 auto Ext = B.buildFPExt(F32, Src, Flags);
3493 auto Log2 = B.buildIntrinsic(Intrinsic::amdgcn_exp2, {F32})
3494 .addUse(Ext.getReg(0))
3495 .setMIFlags(Flags);
3496 B.buildFPTrunc(Dst, Log2, Flags);
3497 MI.eraseFromParent();
3498 return true;
3499 }
3500
3501 assert(Ty == F32);
3502
3503 if (!needsDenormHandlingF32(B.getMF(), Src, Flags)) {
3504 B.buildIntrinsic(Intrinsic::amdgcn_exp2, ArrayRef<Register>{Dst})
3505 .addUse(Src)
3506 .setMIFlags(Flags);
3507 MI.eraseFromParent();
3508 return true;
3509 }
3510
3511 // bool needs_scaling = x < -0x1.f80000p+6f;
3512 // v_exp_f32(x + (s ? 0x1.0p+6f : 0.0f)) * (s ? 0x1.0p-64f : 1.0f);
3513
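 // A quick numeric check of the scaling identity used below (an illustrative
 // sketch, not part of the original comments): for x = -130.0f,
 //   v_exp_f32(-130 + 64) * 0x1.0p-64f = 2^-66 * 2^-64 = 2^-130,
 // so the scaled form reproduces exp2(x) without going through a denormal
 // intermediate.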
3514 // -nextafter(128.0, -1)
3515 auto RangeCheckConst = B.buildFConstant(Ty, -0x1.f80000p+6f);
3516 auto NeedsScaling = B.buildFCmp(CmpInst::FCMP_OLT, LLT::scalar(1), Src,
3517 RangeCheckConst, Flags);
3518
3519 auto SixtyFour = B.buildFConstant(Ty, 0x1.0p+6f);
3520 auto Zero = B.buildFConstant(Ty, 0.0);
3521 auto AddOffset = B.buildSelect(F32, NeedsScaling, SixtyFour, Zero, Flags);
3522 auto AddInput = B.buildFAdd(F32, Src, AddOffset, Flags);
3523
3524 auto Exp2 = B.buildIntrinsic(Intrinsic::amdgcn_exp2, {Ty})
3525 .addUse(AddInput.getReg(0))
3526 .setMIFlags(Flags);
3527
3528 auto TwoExpNeg64 = B.buildFConstant(Ty, 0x1.0p-64f);
3529 auto One = B.buildFConstant(Ty, 1.0);
3530 auto ResultScale = B.buildSelect(F32, NeedsScaling, TwoExpNeg64, One, Flags);
3531 B.buildFMul(Dst, Exp2, ResultScale, Flags);
3532 MI.eraseFromParent();
3533 return true;
3534}
3535
3536bool AMDGPULegalizerInfo::legalizeFExpUnsafe(MachineIRBuilder &B, Register Dst,
3537 Register X, unsigned Flags) const {
3538 LLT Ty = B.getMRI()->getType(Dst);
3539 LLT F32 = LLT::scalar(32);
3540
3541 if (Ty != F32 || !needsDenormHandlingF32(B.getMF(), X, Flags)) {
3542 auto Log2E = B.buildFConstant(Ty, numbers::log2e);
3543 auto Mul = B.buildFMul(Ty, X, Log2E, Flags);
3544
3545 if (Ty == F32) {
3546 B.buildIntrinsic(Intrinsic::amdgcn_exp2, ArrayRef<Register>{Dst})
3547 .addUse(Mul.getReg(0))
3548 .setMIFlags(Flags);
3549 } else {
3550 B.buildFExp2(Dst, Mul.getReg(0), Flags);
3551 }
3552
3553 return true;
3554 }
3555
3556 auto Threshold = B.buildFConstant(Ty, -0x1.5d58a0p+6f);
3557 auto NeedsScaling =
3558 B.buildFCmp(CmpInst::FCMP_OLT, LLT::scalar(1), X, Threshold, Flags);
3559 auto ScaleOffset = B.buildFConstant(Ty, 0x1.0p+6f);
3560 auto ScaledX = B.buildFAdd(Ty, X, ScaleOffset, Flags);
3561 auto AdjustedX = B.buildSelect(Ty, NeedsScaling, ScaledX, X, Flags);
3562
3563 auto Log2E = B.buildFConstant(Ty, numbers::log2e);
3564 auto ExpInput = B.buildFMul(Ty, AdjustedX, Log2E, Flags);
3565
3566 auto Exp2 = B.buildIntrinsic(Intrinsic::amdgcn_exp2, {Ty})
3567 .addUse(ExpInput.getReg(0))
3568 .setMIFlags(Flags);
3569
3570 auto ResultScaleFactor = B.buildFConstant(Ty, 0x1.969d48p-93f);
3571 auto AdjustedResult = B.buildFMul(Ty, Exp2, ResultScaleFactor, Flags);
3572 B.buildSelect(Dst, NeedsScaling, AdjustedResult, Exp2, Flags);
3573 return true;
3574}
3575
3576bool AMDGPULegalizerInfo::legalizeFExp(MachineInstr &MI,
3577 MachineIRBuilder &B) const {
3578 Register Dst = MI.getOperand(0).getReg();
3579 Register X = MI.getOperand(1).getReg();
3580 const unsigned Flags = MI.getFlags();
3581 MachineFunction &MF = B.getMF();
3582 MachineRegisterInfo &MRI = *B.getMRI();
3583 LLT Ty = MRI.getType(Dst);
3584 const LLT F16 = LLT::scalar(16);
3585 const LLT F32 = LLT::scalar(32);
3586 const bool IsExp10 = MI.getOpcode() == TargetOpcode::G_FEXP10;
3587
3588 if (Ty == F16) {
3589 // v_exp_f16 (fmul x, log2e)
3590 if (allowApproxFunc(MF, Flags)) {
3591 // TODO: Does this really require fast?
3592 legalizeFExpUnsafe(B, Dst, X, Flags);
3593 MI.eraseFromParent();
3594 return true;
3595 }
3596
3597 // exp(f16 x) ->
3598 // fptrunc (v_exp_f32 (fmul (fpext x), log2e))
3599
3600 // Nothing in half is a denormal when promoted to f32.
3601 auto Ext = B.buildFPExt(F32, X, Flags);
3602 Register Lowered = MRI.createGenericVirtualRegister(F32);
3603 legalizeFExpUnsafe(B, Lowered, Ext.getReg(0), Flags);
3604 B.buildFPTrunc(Dst, Lowered, Flags);
3605 MI.eraseFromParent();
3606 return true;
3607 }
3608
3609 assert(Ty == F32);
3610
3611 // TODO: Interpret allowApproxFunc as ignoring DAZ. This is currently copying
3612 // library behavior. Also, is known-not-daz source sufficient?
3613 if (allowApproxFunc(MF, Flags)) {
3614 legalizeFExpUnsafe(B, Dst, X, Flags);
3615 MI.eraseFromParent();
3616 return true;
3617 }
3618
3619 // Algorithm:
3620 //
3621 // e^x = 2^(x/ln(2)) = 2^(x*(64/ln(2))/64)
3622 //
3623 // x*(64/ln(2)) = n + f, |f| <= 0.5, n is integer
3624 // n = 64*m + j, 0 <= j < 64
3625 //
3626 // e^x = 2^((64*m + j + f)/64)
3627 // = (2^m) * (2^(j/64)) * 2^(f/64)
3628 // = (2^m) * (2^(j/64)) * e^(f*(ln(2)/64))
3629 //
3630 // f = x*(64/ln(2)) - n
3631 // r = f*(ln(2)/64) = x - n*(ln(2)/64)
3632 //
3633 // e^x = (2^m) * (2^(j/64)) * e^r
3634 //
3635 // (2^(j/64)) is precomputed
3636 //
3637 // e^r = 1 + r + (r^2)/2! + (r^3)/3! + (r^4)/4! + (r^5)/5!
3638 // e^r = 1 + q
3639 //
3640 // q = r + (r^2)/2! + (r^3)/3! + (r^4)/4! + (r^5)/5!
3641 //
3642 // e^x = (2^m) * ( (2^(j/64)) + q*(2^(j/64)) )
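 // A worked instance of the decomposition above (illustrative only): for
 // x = 1.0,
 //   x*(64/ln(2)) ~= 92.33  =>  n = 92, f ~= 0.33, m = 1, j = 28
 //   r = x - n*(ln(2)/64) ~= 0.0036
 //   e^1 ~= 2^1 * 2^(28/64) * e^0.0036 ~= 2 * 1.3543 * 1.0036 ~= 2.7183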
3643 const unsigned FlagsNoContract = Flags & ~MachineInstr::FmContract;
3644 Register PH, PL;
3645
3646 if (ST.hasFastFMAF32()) {
3647 const float c_exp = numbers::log2ef;
3648 const float cc_exp = 0x1.4ae0bep-26f; // c+cc are 49 bits
3649 const float c_exp10 = 0x1.a934f0p+1f;
3650 const float cc_exp10 = 0x1.2f346ep-24f;
3651
3652 auto C = B.buildFConstant(Ty, IsExp10 ? c_exp10 : c_exp);
3653 PH = B.buildFMul(Ty, X, C, Flags).getReg(0);
3654 auto NegPH = B.buildFNeg(Ty, PH, Flags);
3655 auto FMA0 = B.buildFMA(Ty, X, C, NegPH, Flags);
3656
3657 auto CC = B.buildFConstant(Ty, IsExp10 ? cc_exp10 : cc_exp);
3658 PL = B.buildFMA(Ty, X, CC, FMA0, Flags).getReg(0);
3659 } else {
3660 const float ch_exp = 0x1.714000p+0f;
3661 const float cl_exp = 0x1.47652ap-12f; // ch + cl are 36 bits
3662
3663 const float ch_exp10 = 0x1.a92000p+1f;
3664 const float cl_exp10 = 0x1.4f0978p-11f;
3665
3666 auto MaskConst = B.buildConstant(Ty, 0xfffff000);
3667 auto XH = B.buildAnd(Ty, X, MaskConst);
3668 auto XL = B.buildFSub(Ty, X, XH, Flags);
3669
3670 auto CH = B.buildFConstant(Ty, IsExp10 ? ch_exp10 : ch_exp);
3671 PH = B.buildFMul(Ty, XH, CH, Flags).getReg(0);
3672
3673 auto CL = B.buildFConstant(Ty, IsExp10 ? cl_exp10 : cl_exp);
3674 auto XLCL = B.buildFMul(Ty, XL, CL, Flags);
3675
3676 Register Mad0 =
3677 getMad(B, Ty, XL.getReg(0), CH.getReg(0), XLCL.getReg(0), Flags);
3678 PL = getMad(B, Ty, XH.getReg(0), CL.getReg(0), Mad0, Flags);
3679 }
3680
3681 auto E = B.buildIntrinsicRoundeven(Ty, PH, Flags);
3682
3683 // It is unsafe to contract this fsub into the PH multiply.
3684 auto PHSubE = B.buildFSub(Ty, PH, E, FlagsNoContract);
3685 auto A = B.buildFAdd(Ty, PHSubE, PL, Flags);
3686 auto IntE = B.buildFPTOSI(LLT::scalar(32), E);
3687
3688 auto Exp2 = B.buildIntrinsic(Intrinsic::amdgcn_exp2, {Ty})
3689 .addUse(A.getReg(0))
3690 .setMIFlags(Flags);
3691 auto R = B.buildFLdexp(Ty, Exp2, IntE, Flags);
3692
3693 auto UnderflowCheckConst =
3694 B.buildFConstant(Ty, IsExp10 ? -0x1.66d3e8p+5f : -0x1.9d1da0p+6f);
3695 auto Zero = B.buildFConstant(Ty, 0.0);
3696 auto Underflow =
3697 B.buildFCmp(CmpInst::FCMP_OLT, LLT::scalar(1), X, UnderflowCheckConst);
3698
3699 R = B.buildSelect(Ty, Underflow, Zero, R);
3700
3701 const auto &Options = MF.getTarget().Options;
3702
3703 if (!(Flags & MachineInstr::FmNoInfs) && !Options.NoInfsFPMath) {
3704 auto OverflowCheckConst =
3705 B.buildFConstant(Ty, IsExp10 ? 0x1.344136p+5f : 0x1.62e430p+6f);
3706
3707 auto Overflow =
3708 B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), X, OverflowCheckConst);
3709 auto Inf = B.buildFConstant(Ty, APFloat::getInf(APFloat::IEEEsingle()));
3710 R = B.buildSelect(Ty, Overflow, Inf, R, Flags);
3711 }
3712
3713 B.buildCopy(Dst, R);
3714 MI.eraseFromParent();
3715 return true;
3716}
3717
3718bool AMDGPULegalizerInfo::legalizeFPow(MachineInstr &MI,
3719 MachineIRBuilder &B) const {
3720 Register Dst = MI.getOperand(0).getReg();
3721 Register Src0 = MI.getOperand(1).getReg();
3722 Register Src1 = MI.getOperand(2).getReg();
3723 unsigned Flags = MI.getFlags();
3724 LLT Ty = B.getMRI()->getType(Dst);
3725 const LLT F16 = LLT::float16();
3726 const LLT F32 = LLT::float32();
3727
3728 if (Ty == F32) {
3729 auto Log = B.buildFLog2(F32, Src0, Flags);
3730 auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {F32})
3731 .addUse(Log.getReg(0))
3732 .addUse(Src1)
3733 .setMIFlags(Flags);
3734 B.buildFExp2(Dst, Mul, Flags);
3735 } else if (Ty == F16) {
3736 // There's no f16 fmul_legacy, so we need to convert for it.
3737 auto Log = B.buildFLog2(F16, Src0, Flags);
3738 auto Ext0 = B.buildFPExt(F32, Log, Flags);
3739 auto Ext1 = B.buildFPExt(F32, Src1, Flags);
3740 auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {F32})
3741 .addUse(Ext0.getReg(0))
3742 .addUse(Ext1.getReg(0))
3743 .setMIFlags(Flags);
3744 B.buildFExp2(Dst, B.buildFPTrunc(F16, Mul), Flags);
3745 } else
3746 return false;
3747
3748 MI.eraseFromParent();
3749 return true;
3750}
3751
3752// Find a source register, ignoring any possible source modifiers.
3753static Register stripAnySourceMods(Register OrigSrc, MachineRegisterInfo &MRI) {
3754 Register ModSrc = OrigSrc;
3755 if (MachineInstr *SrcFNeg = getOpcodeDef(AMDGPU::G_FNEG, ModSrc, MRI)) {
3756 ModSrc = SrcFNeg->getOperand(1).getReg();
3757 if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
3758 ModSrc = SrcFAbs->getOperand(1).getReg();
3759 } else if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
3760 ModSrc = SrcFAbs->getOperand(1).getReg();
3761 return ModSrc;
3762}
3763
3764bool AMDGPULegalizerInfo::legalizeFFloor(MachineInstr &MI,
3765 MachineRegisterInfo &MRI,
3766 MachineIRBuilder &B) const {
3767
3768 const LLT S1 = LLT::scalar(1);
3769 const LLT F64 = LLT::float64();
3770 Register Dst = MI.getOperand(0).getReg();
3771 Register OrigSrc = MI.getOperand(1).getReg();
3772 unsigned Flags = MI.getFlags();
3773 assert(ST.hasFractBug() && MRI.getType(Dst) == F64 &&
3774 "this should not have been custom lowered");
3775
3776 // V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x))
3777 // is used instead. However, SI doesn't have V_FLOOR_F64, so the most
3778 // efficient way to implement it is using V_FRACT_F64. The workaround for the
3779 // V_FRACT bug is:
3780 // fract(x) = isnan(x) ? x : min(V_FRACT(x), 0.99999999999999999)
3781 //
3782 // Convert floor(x) to (x - fract(x))
3783
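 // Quick check of the identity (illustrative, not from the original comment):
 // fract(-1.25) = 0.75, so floor(-1.25) = -1.25 - 0.75 = -2.0. The min with
 // 0x1.fffffffffffffp-1 below keeps a buggy fract result of exactly 1.0 from
 // pulling the answer down by a full unit.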
3784 auto Fract = B.buildIntrinsic(Intrinsic::amdgcn_fract, {F64})
3785 .addUse(OrigSrc)
3786 .setMIFlags(Flags);
3787
3788 // Give source modifier matching some assistance before obscuring a foldable
3789 // pattern.
3790
3791 // TODO: We can avoid the neg on the fract? The input sign to fract
3792 // shouldn't matter?
3793 Register ModSrc = stripAnySourceMods(OrigSrc, MRI);
3794
3795 auto Const =
3796 B.buildFConstant(F64, llvm::bit_cast<double>(0x3fefffffffffffff));
3797
3798 Register Min = MRI.createGenericVirtualRegister(F64);
3799
3800 // We don't need to concern ourselves with the snan handling difference, so
3801 // use the one which will directly select.
3802 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
3803 if (MFI->getMode().IEEE)
3804 B.buildFMinNumIEEE(Min, Fract, Const, Flags);
3805 else
3806 B.buildFMinNum(Min, Fract, Const, Flags);
3807
3808 Register CorrectedFract = Min;
3809 if (!MI.getFlag(MachineInstr::FmNoNans)) {
3810 auto IsNan = B.buildFCmp(CmpInst::FCMP_ORD, S1, ModSrc, ModSrc, Flags);
3811 CorrectedFract = B.buildSelect(F64, IsNan, ModSrc, Min, Flags).getReg(0);
3812 }
3813
3814 auto NegFract = B.buildFNeg(F64, CorrectedFract, Flags);
3815 B.buildFAdd(Dst, OrigSrc, NegFract, Flags);
3816
3817 MI.eraseFromParent();
3818 return true;
3819}
3820
3821// Turn an illegal packed v2s16 build vector into bit operations.
3822// TODO: This should probably be a bitcast action in LegalizerHelper.
3823bool AMDGPULegalizerInfo::legalizeBuildVector(
3824 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
3825 Register Dst = MI.getOperand(0).getReg();
3826 const LLT S32 = LLT::scalar(32);
3827 const LLT S16 = LLT::scalar(16);
3828 assert(MRI.getType(Dst) == LLT::fixed_vector(2, 16));
3829
3830 Register Src0 = MI.getOperand(1).getReg();
3831 Register Src1 = MI.getOperand(2).getReg();
3832
3833 if (MI.getOpcode() == AMDGPU::G_BUILD_VECTOR_TRUNC) {
3834 assert(MRI.getType(Src0) == S32);
3835 Src0 = B.buildTrunc(S16, MI.getOperand(1).getReg()).getReg(0);
3836 Src1 = B.buildTrunc(S16, MI.getOperand(2).getReg()).getReg(0);
3837 }
3838
3839 auto Merge = B.buildMergeLikeInstr(S32, {Src0, Src1});
3840 B.buildBitcast(Dst, Merge);
3841
3842 MI.eraseFromParent();
3843 return true;
3844}
3845
3846// Build a big integer multiply or multiply-add using MAD_64_32 instructions.
3847//
3848// Source and accumulation registers must all be 32-bits.
3849//
3850// TODO: When the multiply is uniform, we should produce a code sequence
3851// that is better suited to instruction selection on the SALU. Instead of
3852// the outer loop going over parts of the result, the outer loop should go
3853// over parts of one of the factors. This should result in instruction
3854// selection that makes full use of S_ADDC_U32 instructions.
3855void AMDGPULegalizerInfo::buildMultiply(LegalizerHelper &Helper,
3856 MutableArrayRef<Register> Accum,
3857 ArrayRef<Register> Src0,
3858 ArrayRef<Register> Src1,
3859 bool UsePartialMad64_32,
3860 bool SeparateOddAlignedProducts) const {
3861 // Use (possibly empty) vectors of S1 registers to represent the set of
3862 // carries from one pair of positions to the next.
3863 using Carry = SmallVector<Register, 2>;
3864
3865 MachineIRBuilder &B = Helper.MIRBuilder;
3866 GISelKnownBits &KB = *Helper.getKnownBits();
3867
3868 const LLT S1 = LLT::scalar(1);
3869 const LLT S32 = LLT::scalar(32);
3870 const LLT S64 = LLT::scalar(64);
3871
3872 Register Zero32;
3873 Register Zero64;
3874
3875 auto getZero32 = [&]() -> Register {
3876 if (!Zero32)
3877 Zero32 = B.buildConstant(S32, 0).getReg(0);
3878 return Zero32;
3879 };
3880 auto getZero64 = [&]() -> Register {
3881 if (!Zero64)
3882 Zero64 = B.buildConstant(S64, 0).getReg(0);
3883 return Zero64;
3884 };
3885
3886 SmallVector<bool, 2> Src0KnownZeros, Src1KnownZeros;
3887 for (unsigned i = 0; i < Src0.size(); ++i) {
3888 Src0KnownZeros.push_back(KB.getKnownBits(Src0[i]).isZero());
3889 Src1KnownZeros.push_back(KB.getKnownBits(Src1[i]).isZero());
3890 }
3891
3892 // Merge the given carries into the 32-bit LocalAccum, which is modified
3893 // in-place.
3894 //
3895 // Returns the carry-out, which is a single S1 register or null.
3896 auto mergeCarry =
3897 [&](Register &LocalAccum, const Carry &CarryIn) -> Register {
3898 if (CarryIn.empty())
3899 return Register();
3900
3901 bool HaveCarryOut = true;
3902 Register CarryAccum;
3903 if (CarryIn.size() == 1) {
3904 if (!LocalAccum) {
3905 LocalAccum = B.buildZExt(S32, CarryIn[0]).getReg(0);
3906 return Register();
3907 }
3908
3909 CarryAccum = getZero32();
3910 } else {
3911 CarryAccum = B.buildZExt(S32, CarryIn[0]).getReg(0);
3912 for (unsigned i = 1; i + 1 < CarryIn.size(); ++i) {
3913 CarryAccum =
3914 B.buildUAdde(S32, S1, CarryAccum, getZero32(), CarryIn[i])
3915 .getReg(0);
3916 }
3917
3918 if (!LocalAccum) {
3919 LocalAccum = getZero32();
3920 HaveCarryOut = false;
3921 }
3922 }
3923
3924 auto Add =
3925 B.buildUAdde(S32, S1, CarryAccum, LocalAccum, CarryIn.back());
3926 LocalAccum = Add.getReg(0);
3927 return HaveCarryOut ? Add.getReg(1) : Register();
3928 };
3929
3930 // Build a multiply-add chain to compute
3931 //
3932 // LocalAccum + (partial products at DstIndex)
3933 // + (opportunistic subset of CarryIn)
3934 //
3935 // LocalAccum is an array of one or two 32-bit registers that are updated
3936 // in-place. The incoming registers may be null.
3937 //
3938 // In some edge cases, carry-ins can be consumed "for free". In that case,
3939 // the consumed carry bits are removed from CarryIn in-place.
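 // Sketch of one call (illustrative): for DstIndex = 1 the chain folds the
 // column-1 partial products Src0[0]*Src1[1] and Src0[1]*Src1[0] into
 // LocalAccum, and any overflow out of that accumulator comes back through
 // the returned Carry vector.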
3940 auto buildMadChain =
3941 [&](MutableArrayRef<Register> LocalAccum, unsigned DstIndex, Carry &CarryIn)
3942 -> Carry {
3943 assert((DstIndex + 1 < Accum.size() && LocalAccum.size() == 2) ||
3944 (DstIndex + 1 >= Accum.size() && LocalAccum.size() == 1));
3945
3946 Carry CarryOut;
3947 unsigned j0 = 0;
3948
3949 // Use plain 32-bit multiplication for the most significant part of the
3950 // result by default.
3951 if (LocalAccum.size() == 1 &&
3952 (!UsePartialMad64_32 || !CarryIn.empty())) {
3953 do {
3954 // Skip multiplication if one of the operands is 0
3955 unsigned j1 = DstIndex - j0;
3956 if (Src0KnownZeros[j0] || Src1KnownZeros[j1]) {
3957 ++j0;
3958 continue;
3959 }
3960 auto Mul = B.buildMul(S32, Src0[j0], Src1[j1]);
3961 if (!LocalAccum[0] || KB.getKnownBits(LocalAccum[0]).isZero()) {
3962 LocalAccum[0] = Mul.getReg(0);
3963 } else {
3964 if (CarryIn.empty()) {
3965 LocalAccum[0] = B.buildAdd(S32, LocalAccum[0], Mul).getReg(0);
3966 } else {
3967 LocalAccum[0] =
3968 B.buildUAdde(S32, S1, LocalAccum[0], Mul, CarryIn.back())
3969 .getReg(0);
3970 CarryIn.pop_back();
3971 }
3972 }
3973 ++j0;
3974 } while (j0 <= DstIndex && (!UsePartialMad64_32 || !CarryIn.empty()));
3975 }
3976
3977 // Build full 64-bit multiplies.
3978 if (j0 <= DstIndex) {
3979 bool HaveSmallAccum = false;
3980 Register Tmp;
3981
3982 if (LocalAccum[0]) {
3983 if (LocalAccum.size() == 1) {
3984 Tmp = B.buildAnyExt(S64, LocalAccum[0]).getReg(0);
3985 HaveSmallAccum = true;
3986 } else if (LocalAccum[1]) {
3987 Tmp = B.buildMergeLikeInstr(S64, LocalAccum).getReg(0);
3988 HaveSmallAccum = false;
3989 } else {
3990 Tmp = B.buildZExt(S64, LocalAccum[0]).getReg(0);
3991 HaveSmallAccum = true;
3992 }
3993 } else {
3994 assert(LocalAccum.size() == 1 || !LocalAccum[1]);
3995 Tmp = getZero64();
3996 HaveSmallAccum = true;
3997 }
3998
3999 do {
4000 unsigned j1 = DstIndex - j0;
4001 if (Src0KnownZeros[j0] || Src1KnownZeros[j1]) {
4002 ++j0;
4003 continue;
4004 }
4005 auto Mad = B.buildInstr(AMDGPU::G_AMDGPU_MAD_U64_U32, {S64, S1},
4006 {Src0[j0], Src1[j1], Tmp});
4007 Tmp = Mad.getReg(0);
4008 if (!HaveSmallAccum)
4009 CarryOut.push_back(Mad.getReg(1));
4010 HaveSmallAccum = false;
4011
4012 ++j0;
4013 } while (j0 <= DstIndex);
4014
4015 auto Unmerge = B.buildUnmerge(S32, Tmp);
4016 LocalAccum[0] = Unmerge.getReg(0);
4017 if (LocalAccum.size() > 1)
4018 LocalAccum[1] = Unmerge.getReg(1);
4019 }
4020
4021 return CarryOut;
4022 };
4023
4024 // Outer multiply loop, iterating over destination parts from least
4025 // significant to most significant parts.
4026 //
4027 // The columns of the following diagram correspond to the destination parts
4028 // affected by one iteration of the outer loop (ignoring boundary
4029 // conditions).
4030 //
4031 // Dest index relative to 2 * i: 1 0 -1
4032 // ------
4033 // Carries from previous iteration: e o
4034 // Even-aligned partial product sum: E E .
4035 // Odd-aligned partial product sum: O O
4036 //
4037 // 'o' is OddCarry, 'e' is EvenCarry.
4038 // EE and OO are computed from partial products via buildMadChain and use
4039 // accumulation where possible and appropriate.
4040 //
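 // Concrete reading of the diagram for a 128-bit multiply (illustrative):
 // with A = sum a_k*2^(32k) and B = sum b_k*2^(32k), the even-aligned sums
 // are the columns a0*b0 and a0*b2 + a1*b1 + a2*b0 (offsets 0 and 2), the
 // odd-aligned sums are a0*b1 + a1*b0 and a0*b3 + a1*b2 + a2*b1 + a3*b0
 // (offsets 1 and 3), and the 'e'/'o' bits carry between the two groups.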
4041 Register SeparateOddCarry;
4042 Carry EvenCarry;
4043 Carry OddCarry;
4044
4045 for (unsigned i = 0; i <= Accum.size() / 2; ++i) {
4046 Carry OddCarryIn = std::move(OddCarry);
4047 Carry EvenCarryIn = std::move(EvenCarry);
4048 OddCarry.clear();
4049 EvenCarry.clear();
4050
4051 // Partial products at offset 2 * i.
4052 if (2 * i < Accum.size()) {
4053 auto LocalAccum = Accum.drop_front(2 * i).take_front(2);
4054 EvenCarry = buildMadChain(LocalAccum, 2 * i, EvenCarryIn);
4055 }
4056
4057 // Partial products at offset 2 * i - 1.
4058 if (i > 0) {
4059 if (!SeparateOddAlignedProducts) {
4060 auto LocalAccum = Accum.drop_front(2 * i - 1).take_front(2);
4061 OddCarry = buildMadChain(LocalAccum, 2 * i - 1, OddCarryIn);
4062 } else {
4063 bool IsHighest = 2 * i >= Accum.size();
4064 Register SeparateOddOut[2];
4065 auto LocalAccum = MutableArrayRef(SeparateOddOut)
4066 .take_front(IsHighest ? 1 : 2);
4067 OddCarry = buildMadChain(LocalAccum, 2 * i - 1, OddCarryIn);
4068
4069 MachineInstr *Lo;
4070
4071 if (i == 1) {
4072 if (!IsHighest)
4073 Lo = B.buildUAddo(S32, S1, Accum[2 * i - 1], SeparateOddOut[0]);
4074 else
4075 Lo = B.buildAdd(S32, Accum[2 * i - 1], SeparateOddOut[0]);
4076 } else {
4077 Lo = B.buildUAdde(S32, S1, Accum[2 * i - 1], SeparateOddOut[0],
4078 SeparateOddCarry);
4079 }
4080 Accum[2 * i - 1] = Lo->getOperand(0).getReg();
4081
4082 if (!IsHighest) {
4083 auto Hi = B.buildUAdde(S32, S1, Accum[2 * i], SeparateOddOut[1],
4084 Lo->getOperand(1).getReg());
4085 Accum[2 * i] = Hi.getReg(0);
4086 SeparateOddCarry = Hi.getReg(1);
4087 }
4088 }
4089 }
4090
4091 // Add in the carries from the previous iteration
4092 if (i > 0) {
4093 if (Register CarryOut = mergeCarry(Accum[2 * i - 1], OddCarryIn))
4094 EvenCarryIn.push_back(CarryOut);
4095
4096 if (2 * i < Accum.size()) {
4097 if (Register CarryOut = mergeCarry(Accum[2 * i], EvenCarryIn))
4098 OddCarry.push_back(CarryOut);
4099 }
4100 }
4101 }
4102}
4103
4104// Custom narrowing of wide multiplies using wide multiply-add instructions.
4105//
4106// TODO: If the multiply is followed by an addition, we should attempt to
4107// integrate it to make better use of V_MAD_U64_U32's multiply-add capabilities.
4108bool AMDGPULegalizerInfo::legalizeMul(LegalizerHelper &Helper,
4109 MachineInstr &MI) const {
4110 assert(ST.hasMad64_32());
4111 assert(MI.getOpcode() == TargetOpcode::G_MUL);
4112
4113 MachineIRBuilder &B = Helper.MIRBuilder;
4114 MachineRegisterInfo &MRI = *B.getMRI();
4115
4116 Register DstReg = MI.getOperand(0).getReg();
4117 Register Src0 = MI.getOperand(1).getReg();
4118 Register Src1 = MI.getOperand(2).getReg();
4119
4120 LLT Ty = MRI.getType(DstReg);
4121 assert(Ty.isScalar());
4122
4123 unsigned Size = Ty.getSizeInBits();
4124 unsigned NumParts = Size / 32;
4125 assert((Size % 32) == 0);
4126 assert(NumParts >= 2);
4127
4128 // Whether to use MAD_64_32 for partial products whose high half is
4129 // discarded. This avoids some ADD instructions but risks false dependency
4130 // stalls on some subtargets in some cases.
4131 const bool UsePartialMad64_32 = ST.getGeneration() < AMDGPUSubtarget::GFX10;
4132
4133 // Whether to compute odd-aligned partial products separately. This is
4134 // advisable on subtargets where the accumulator of MAD_64_32 must be placed
4135 // in an even-aligned VGPR.
4136 const bool SeparateOddAlignedProducts = ST.hasFullRate64Ops();
4137
4138 LLT S32 = LLT::scalar(32);
4139 SmallVector<Register, 2> Src0Parts, Src1Parts;
4140 for (unsigned i = 0; i < NumParts; ++i) {
4141 Src0Parts.push_back(MRI.createGenericVirtualRegister(S32));
4142 Src1Parts.push_back(MRI.createGenericVirtualRegister(S32));
4143 }
4144 B.buildUnmerge(Src0Parts, Src0);
4145 B.buildUnmerge(Src1Parts, Src1);
4146
4147 SmallVector<Register, 2> AccumRegs(NumParts);
4148 buildMultiply(Helper, AccumRegs, Src0Parts, Src1Parts, UsePartialMad64_32,
4149 SeparateOddAlignedProducts);
4150
4151 B.buildMergeLikeInstr(DstReg, AccumRegs);
4152 MI.eraseFromParent();
4153 return true;
4154}
4155
4156// Legalize ctlz/cttz to ffbh/ffbl instead of the default legalization to
4157// ctlz/cttz_zero_undef. This allows us to fix up the result for the zero input
4158// case with a single min instruction instead of a compare+select.
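// For example (illustrative): with a zero i32 input, FFBH/FFBL return -1
// (0xffffffff), and umin(0xffffffff, 32) = 32, which is the defined
// G_CTLZ/G_CTTZ result, so no extra compare+select is needed.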
4159bool AMDGPULegalizerInfo::legalizeCTLZ_CTTZ(MachineInstr &MI,
4160 MachineRegisterInfo &MRI,
4161 MachineIRBuilder &B) const {
4162 Register Dst = MI.getOperand(0).getReg();
4163 Register Src = MI.getOperand(1).getReg();
4164 LLT DstTy = MRI.getType(Dst);
4165 LLT SrcTy = MRI.getType(Src);
4166
4167 unsigned NewOpc = MI.getOpcode() == AMDGPU::G_CTLZ
4168 ? AMDGPU::G_AMDGPU_FFBH_U32
4169 : AMDGPU::G_AMDGPU_FFBL_B32;
4170 auto Tmp = B.buildInstr(NewOpc, {DstTy}, {Src});
4171 B.buildUMin(Dst, Tmp, B.buildConstant(DstTy, SrcTy.getSizeInBits()));
4172
4173 MI.eraseFromParent();
4174 return true;
4175}
4176
4177bool AMDGPULegalizerInfo::legalizeCTLZ_ZERO_UNDEF(MachineInstr &MI,
4178 MachineRegisterInfo &MRI,
4179 MachineIRBuilder &B) const {
4180 Register Dst = MI.getOperand(0).getReg();
4181 Register Src = MI.getOperand(1).getReg();
4182 LLT SrcTy = MRI.getType(Src);
4183 TypeSize NumBits = SrcTy.getSizeInBits();
4184
4185 assert(NumBits < 32u);
4186
4187 auto ShiftAmt = B.buildConstant(S32, 32u - NumBits);
4188 auto Extend = B.buildAnyExt(S32, {Src}).getReg(0u);
4189 auto Shift = B.buildShl(S32, Extend, ShiftAmt);
4190 auto Ctlz = B.buildInstr(AMDGPU::G_AMDGPU_FFBH_U32, {S32}, {Shift});
4191 B.buildTrunc(Dst, Ctlz);
4192 MI.eraseFromParent();
4193 return true;
4194}
4195
4196// Check that this is a G_XOR x, -1
4197static bool isNot(const MachineRegisterInfo &MRI, const MachineInstr &MI) {
4198 if (MI.getOpcode() != TargetOpcode::G_XOR)
4199 return false;
4200 auto ConstVal = getIConstantVRegSExtVal(MI.getOperand(2).getReg(), MRI);
4201 return ConstVal && *ConstVal == -1;
4202}
4203
4204// Return the use branch instruction, otherwise null if the usage is invalid.
4205static MachineInstr *
4206verifyCFIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI, MachineInstr *&Br,
4207 MachineBasicBlock *&UncondBrTarget, bool &Negated) {
4208 Register CondDef = MI.getOperand(0).getReg();
4209 if (!MRI.hasOneNonDBGUse(CondDef))
4210 return nullptr;
4211
4212 MachineBasicBlock *Parent = MI.getParent();
4213 MachineInstr *UseMI = &*MRI.use_instr_nodbg_begin(CondDef);
4214
4215 if (isNot(MRI, *UseMI)) {
4216 Register NegatedCond = UseMI->getOperand(0).getReg();
4217 if (!MRI.hasOneNonDBGUse(NegatedCond))
4218 return nullptr;
4219
4220 // We're deleting the def of this value, so we need to remove it.
4221 eraseInstr(*UseMI, MRI);
4222
4223 UseMI = &*MRI.use_instr_nodbg_begin(NegatedCond);
4224 Negated = true;
4225 }
4226
4227 if (UseMI->getParent() != Parent || UseMI->getOpcode() != AMDGPU::G_BRCOND)
4228 return nullptr;
4229
4230 // Make sure the cond br is followed by a G_BR, or is the last instruction.
4231 MachineBasicBlock::iterator Next = std::next(UseMI->getIterator());
4232 if (Next == Parent->end()) {
4233 MachineFunction::iterator NextMBB = std::next(Parent->getIterator());
4234 if (NextMBB == Parent->getParent()->end()) // Illegal intrinsic use.
4235 return nullptr;
4236 UncondBrTarget = &*NextMBB;
4237 } else {
4238 if (Next->getOpcode() != AMDGPU::G_BR)
4239 return nullptr;
4240 Br = &*Next;
4241 UncondBrTarget = Br->getOperand(0).getMBB();
4242 }
4243
4244 return UseMI;
4245}
4246
4247bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B,
4248 const ArgDescriptor *Arg,
4249 const TargetRegisterClass *ArgRC,
4250 LLT ArgTy) const {
4251 MCRegister SrcReg = Arg->getRegister();
4252 assert(Register::isPhysicalRegister(SrcReg) && "Physical register expected");
4253 assert(DstReg.isVirtual() && "Virtual register expected");
4254
4255 Register LiveIn = getFunctionLiveInPhysReg(B.getMF(), B.getTII(), SrcReg,
4256 *ArgRC, B.getDebugLoc(), ArgTy);
4257 if (Arg->isMasked()) {
4258 // TODO: Should we try to emit this once in the entry block?
4259 const LLT S32 = LLT::scalar(32);
4260 const unsigned Mask = Arg->getMask();
4261 const unsigned Shift = llvm::countr_zero<unsigned>(Mask);
4262
4263 Register AndMaskSrc = LiveIn;
4264
4265 // TODO: Avoid clearing the high bits if we know workitem id y/z are always
4266 // 0.
4267 if (Shift != 0) {
4268 auto ShiftAmt = B.buildConstant(S32, Shift);
4269 AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0);
4270 }
4271
4272 B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift));
4273 } else {
4274 B.buildCopy(DstReg, LiveIn);
4275 }
4276
4277 return true;
4278}
4279
4280bool AMDGPULegalizerInfo::loadInputValue(
4281 Register DstReg, MachineIRBuilder &B,
4282 AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
4283 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
4284 const ArgDescriptor *Arg = nullptr;
4285 const TargetRegisterClass *ArgRC;
4286 LLT ArgTy;
4287
4288 CallingConv::ID CC = B.getMF().getFunction().getCallingConv();
4289 const ArgDescriptor WorkGroupIDX =
4290 ArgDescriptor::createRegister(AMDGPU::TTMP9);
4291 // If GridZ is not programmed in an entry function then the hardware will set
4292 // it to all zeros, so there is no need to mask the GridY value in the low
4293 // order bits.
4294 const ArgDescriptor WorkGroupIDY = ArgDescriptor::createRegister(
4295 AMDGPU::TTMP7,
4296 AMDGPU::isEntryFunctionCC(CC) && !MFI->hasWorkGroupIDZ() ? ~0u : 0xFFFFu);
4297 const ArgDescriptor WorkGroupIDZ =
4298 ArgDescriptor::createRegister(AMDGPU::TTMP7, 0xFFFF0000u);
4299 if (ST.hasArchitectedSGPRs() &&
4300 (AMDGPU::isEntryFunctionCC(CC) || CC == CallingConv::AMDGPU_Gfx)) {
4301 switch (ArgType) {
4302 case AMDGPUFunctionArgInfo::WORKGROUP_ID_X:
4303 Arg = &WorkGroupIDX;
4304 ArgRC = &AMDGPU::SReg_32RegClass;
4305 ArgTy = LLT::scalar(32);
4306 break;
4307 case AMDGPUFunctionArgInfo::WORKGROUP_ID_Y:
4308 Arg = &WorkGroupIDY;
4309 ArgRC = &AMDGPU::SReg_32RegClass;
4310 ArgTy = LLT::scalar(32);
4311 break;
4312 case AMDGPUFunctionArgInfo::WORKGROUP_ID_Z:
4313 Arg = &WorkGroupIDZ;
4314 ArgRC = &AMDGPU::SReg_32RegClass;
4315 ArgTy = LLT::scalar(32);
4316 break;
4317 default:
4318 break;
4319 }
4320 }
4321
4322 if (!Arg)
4323 std::tie(Arg, ArgRC, ArgTy) = MFI->getPreloadedValue(ArgType);
4324
4325 if (!Arg) {
4326 if (ArgType == AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR) {
4327 // The intrinsic may appear when we have a 0 sized kernarg segment, in which
4328 // case the pointer argument may be missing and we use null.
4329 B.buildConstant(DstReg, 0);
4330 return true;
4331 }
4332
4333 // It's undefined behavior if a function marked with the amdgpu-no-*
4334 // attributes uses the corresponding intrinsic.
4335 B.buildUndef(DstReg);
4336 return true;
4337 }
4338
4339 if (!Arg->isRegister() || !Arg->getRegister().isValid())
4340 return false; // TODO: Handle these
4341 return loadInputValue(DstReg, B, Arg, ArgRC, ArgTy);
4342}
4343
4344bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin(
4345 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B,
4346 AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
4347 if (!loadInputValue(MI.getOperand(0).getReg(), B, ArgType))
4348 return false;
4349
4350 MI.eraseFromParent();
4351 return true;
4352}
4353
4354static bool replaceWithConstant(MachineIRBuilder &B, MachineInstr &MI,
4355 int64_t C) {
4356 B.buildConstant(MI.getOperand(0).getReg(), C);
4357 MI.eraseFromParent();
4358 return true;
4359}
4360
4361bool AMDGPULegalizerInfo::legalizeWorkitemIDIntrinsic(
4362 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B,
4363 unsigned Dim, AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
4364 unsigned MaxID = ST.getMaxWorkitemID(B.getMF().getFunction(), Dim);
4365 if (MaxID == 0)
4366 return replaceWithConstant(B, MI, 0);
4367
4368 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
4369 const ArgDescriptor *Arg;
4370 const TargetRegisterClass *ArgRC;
4371 LLT ArgTy;
4372 std::tie(Arg, ArgRC, ArgTy) = MFI->getPreloadedValue(ArgType);
4373
4374 Register DstReg = MI.getOperand(0).getReg();
4375 if (!Arg) {
4376 // It's undefined behavior if a function marked with the amdgpu-no-*
4377 // attributes uses the corresponding intrinsic.
4378 B.buildUndef(DstReg);
4379 MI.eraseFromParent();
4380 return true;
4381 }
4382
4383 if (Arg->isMasked()) {
4384 // Don't bother inserting AssertZext for packed IDs since we're emitting the
4385 // masking operations anyway.
4386 //
4387 // TODO: We could assert the top bit is 0 for the source copy.
4388 if (!loadInputValue(DstReg, B, ArgType))
4389 return false;
4390 } else {
4391 Register TmpReg = MRI.createGenericVirtualRegister(LLT::scalar(32));
4392 if (!loadInputValue(TmpReg, B, ArgType))
4393 return false;
4394 B.buildAssertZExt(DstReg, TmpReg, llvm::bit_width(MaxID));
4395 }
4396
4397 MI.eraseFromParent();
4398 return true;
4399}
4400
4401Register AMDGPULegalizerInfo::getKernargParameterPtr(MachineIRBuilder &B,
4402 int64_t Offset) const {
4403 LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
4404 Register KernArgReg = B.getMRI()->createGenericVirtualRegister(PtrTy);
4405
4406 // TODO: If we passed in the base kernel offset we could have a better
4407 // alignment than 4, but we don't really need it.
4408 if (!loadInputValue(KernArgReg, B,
4409 AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR))
4410 llvm_unreachable("failed to find kernarg segment ptr");
4411
4412 auto COffset = B.buildConstant(LLT::scalar(64), Offset);
4413 // TODO: Should get nuw
4414 return B.buildPtrAdd(PtrTy, KernArgReg, COffset).getReg(0);
4415}
4416
4417/// Legalize a value that's loaded from kernel arguments. This is only used by
4418/// legacy intrinsics.
4419bool AMDGPULegalizerInfo::legalizeKernargMemParameter(MachineInstr &MI,
4420 MachineIRBuilder &B,
4421 uint64_t Offset,
4422 Align Alignment) const {
4423 Register DstReg = MI.getOperand(0).getReg();
4424
4425 assert(B.getMRI()->getType(DstReg) == LLT::scalar(32) &&
4426 "unexpected kernarg parameter type");
4427
4428 Register Ptr = getKernargParameterPtr(B, Offset);
4429 MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
4430 B.buildLoad(DstReg, Ptr, PtrInfo, Align(4),
4431 MachineMemOperand::MODereferenceable |
4432 MachineMemOperand::MOInvariant);
4433 MI.eraseFromParent();
4434 return true;
4435}
4436
4437bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI,
4438 MachineRegisterInfo &MRI,
4439 MachineIRBuilder &B) const {
4440 Register Dst = MI.getOperand(0).getReg();
4441 LLT DstTy = MRI.getType(Dst);
4442 LLT S16 = LLT::scalar(16);
4443 LLT S32 = LLT::scalar(32);
4444 LLT S64 = LLT::scalar(64);
4445
4446 if (DstTy == S16)
4447 return legalizeFDIV16(MI, MRI, B);
4448 if (DstTy == S32)
4449 return legalizeFDIV32(MI, MRI, B);
4450 if (DstTy == S64)
4451 return legalizeFDIV64(MI, MRI, B);
4452
4453 return false;
4454}
4455
4456void AMDGPULegalizerInfo::legalizeUnsignedDIV_REM32Impl(MachineIRBuilder &B,
4457 Register DstDivReg,
4458 Register DstRemReg,
4459 Register X,
4460 Register Y) const {
4461 const LLT S1 = LLT::scalar(1);
4462 const LLT S32 = LLT::scalar(32);
4463
4464 // See AMDGPUCodeGenPrepare::expandDivRem32 for a description of the
4465 // algorithm used here.
4466
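 // Rough shape of that algorithm (a hedged sketch, not the authoritative
 // description): Z ~= 2^32 / Y comes from a float reciprocal plus one
 // Newton-Raphson step, Q = umulh(X, Z) can be short by at most two, and the
 // two refinement steps below bump Q (and reduce R by Y) while R >= Y.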
4467 // Initial estimate of inv(y).
4468 auto FloatY = B.buildUITOFP(S32, Y);
4469 auto RcpIFlag = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {FloatY});
4470 auto Scale = B.buildFConstant(S32, llvm::bit_cast<float>(0x4f7ffffe));
4471 auto ScaledY = B.buildFMul(S32, RcpIFlag, Scale);
4472 auto Z = B.buildFPTOUI(S32, ScaledY);
4473
4474 // One round of UNR.
4475 auto NegY = B.buildSub(S32, B.buildConstant(S32, 0), Y);
4476 auto NegYZ = B.buildMul(S32, NegY, Z);
4477 Z = B.buildAdd(S32, Z, B.buildUMulH(S32, Z, NegYZ));
4478
4479 // Quotient/remainder estimate.
4480 auto Q = B.buildUMulH(S32, X, Z);
4481 auto R = B.buildSub(S32, X, B.buildMul(S32, Q, Y));
4482
4483 // First quotient/remainder refinement.
4484 auto One = B.buildConstant(S32, 1);
4485 auto Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y);
4486 if (DstDivReg)
4487 Q = B.buildSelect(S32, Cond, B.buildAdd(S32, Q, One), Q);
4488 R = B.buildSelect(S32, Cond, B.buildSub(S32, R, Y), R);
4489
4490 // Second quotient/remainder refinement.
4491 Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y);
4492 if (DstDivReg)
4493 B.buildSelect(DstDivReg, Cond, B.buildAdd(S32, Q, One), Q);
4494
4495 if (DstRemReg)
4496 B.buildSelect(DstRemReg, Cond, B.buildSub(S32, R, Y), R);
4497}
4498
4499// Build integer reciprocal sequence around V_RCP_IFLAG_F32
4500//
4501// Return lo, hi of result
4502//
4503// %cvt.lo = G_UITOFP Val.lo
4504// %cvt.hi = G_UITOFP Val.hi
4505// %mad = G_FMAD %cvt.hi, 2**32, %cvt.lo
4506// %rcp = G_AMDGPU_RCP_IFLAG %mad
4507// %mul1 = G_FMUL %rcp, 0x5f7ffffc
4508// %mul2 = G_FMUL %mul1, 2**(-32)
4509// %trunc = G_INTRINSIC_TRUNC %mul2
4510// %mad2 = G_FMAD %trunc, -(2**32), %mul1
4511// return {G_FPTOUI %mad2, G_FPTOUI %trunc}
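// Informally (a hedged reading of the sequence above): the float math forms
// an approximation of 2**64 / Val (0x5f7ffffc is just below 2**64 as an f32),
// and the result is split into 32-bit halves, %trunc giving the high word and
// %mad2 the low word.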
4512static std::pair<Register, Register> emitReciprocalU64(MachineIRBuilder &B,
4513 Register Val) {
4514 const LLT S32 = LLT::scalar(32);
4515 auto Unmerge = B.buildUnmerge(S32, Val);
4516
4517 auto CvtLo = B.buildUITOFP(S32, Unmerge.getReg(0));
4518 auto CvtHi = B.buildUITOFP(S32, Unmerge.getReg(1));
4519
4520 auto Mad = B.buildFMAD(
4521 S32, CvtHi, // 2**32
4522 B.buildFConstant(S32, llvm::bit_cast<float>(0x4f800000)), CvtLo);
4523
4524 auto Rcp = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {Mad});
4525 auto Mul1 = B.buildFMul(
4526 S32, Rcp, B.buildFConstant(S32, llvm::bit_cast<float>(0x5f7ffffc)));
4527
4528 // 2**(-32)
4529 auto Mul2 = B.buildFMul(
4530 S32, Mul1, B.buildFConstant(S32, llvm::bit_cast<float>(0x2f800000)));
4531 auto Trunc = B.buildIntrinsicTrunc(S32, Mul2);
4532
4533 // -(2**32)
4534 auto Mad2 = B.buildFMAD(
4535 S32, Trunc, B.buildFConstant(S32, llvm::bit_cast<float>(0xcf800000)),
4536 Mul1);
4537
4538 auto ResultLo = B.buildFPTOUI(S32, Mad2);
4539 auto ResultHi = B.buildFPTOUI(S32, Trunc);
4540
4541 return {ResultLo.getReg(0), ResultHi.getReg(0)};
4542}
4543
4544void AMDGPULegalizerInfo::legalizeUnsignedDIV_REM64Impl(MachineIRBuilder &B,
4545 Register DstDivReg,
4546 Register DstRemReg,
4547 Register Numer,
4548 Register Denom) const {
4549 const LLT S32 = LLT::scalar(32);
4550 const LLT S64 = LLT::scalar(64);
4551 const LLT S1 = LLT::scalar(1);
4552 Register RcpLo, RcpHi;
4553
4554 std::tie(RcpLo, RcpHi) = emitReciprocalU64(B, Denom);
4555
4556 auto Rcp = B.buildMergeLikeInstr(S64, {RcpLo, RcpHi});
4557
4558 auto Zero64 = B.buildConstant(S64, 0);
4559 auto NegDenom = B.buildSub(S64, Zero64, Denom);
4560
4561 auto MulLo1 = B.buildMul(S64, NegDenom, Rcp);
4562 auto MulHi1 = B.buildUMulH(S64, Rcp, MulLo1);
4563
4564 auto UnmergeMulHi1 = B.buildUnmerge(S32, MulHi1);
4565 Register MulHi1_Lo = UnmergeMulHi1.getReg(0);
4566 Register MulHi1_Hi = UnmergeMulHi1.getReg(1);
4567
4568 auto Add1_Lo = B.buildUAddo(S32, S1, RcpLo, MulHi1_Lo);
4569 auto Add1_Hi = B.buildUAdde(S32, S1, RcpHi, MulHi1_Hi, Add1_Lo.getReg(1));
4570 auto Add1 = B.buildMergeLikeInstr(S64, {Add1_Lo, Add1_Hi});
4571
4572 auto MulLo2 = B.buildMul(S64, NegDenom, Add1);
4573 auto MulHi2 = B.buildUMulH(S64, Add1, MulLo2);
4574 auto UnmergeMulHi2 = B.buildUnmerge(S32, MulHi2);
4575 Register MulHi2_Lo = UnmergeMulHi2.getReg(0);
4576 Register MulHi2_Hi = UnmergeMulHi2.getReg(1);
4577
4578 auto Zero32 = B.buildConstant(S32, 0);
4579 auto Add2_Lo = B.buildUAddo(S32, S1, Add1_Lo, MulHi2_Lo);
4580 auto Add2_Hi = B.buildUAdde(S32, S1, Add1_Hi, MulHi2_Hi, Add2_Lo.getReg(1));
4581 auto Add2 = B.buildMergeLikeInstr(S64, {Add2_Lo, Add2_Hi});
4582
4583 auto UnmergeNumer = B.buildUnmerge(S32, Numer);
4584 Register NumerLo = UnmergeNumer.getReg(0);
4585 Register NumerHi = UnmergeNumer.getReg(1);
4586
4587 auto MulHi3 = B.buildUMulH(S64, Numer, Add2);
4588 auto Mul3 = B.buildMul(S64, Denom, MulHi3);
4589 auto UnmergeMul3 = B.buildUnmerge(S32, Mul3);
4590 Register Mul3_Lo = UnmergeMul3.getReg(0);
4591 Register Mul3_Hi = UnmergeMul3.getReg(1);
4592 auto Sub1_Lo = B.buildUSubo(S32, S1, NumerLo, Mul3_Lo);
4593 auto Sub1_Hi = B.buildUSube(S32, S1, NumerHi, Mul3_Hi, Sub1_Lo.getReg(1));
4594 auto Sub1_Mi = B.buildSub(S32, NumerHi, Mul3_Hi);
4595 auto Sub1 = B.buildMergeLikeInstr(S64, {Sub1_Lo, Sub1_Hi});
4596
4597 auto UnmergeDenom = B.buildUnmerge(S32, Denom);
4598 Register DenomLo = UnmergeDenom.getReg(0);
4599 Register DenomHi = UnmergeDenom.getReg(1);
4600
4601 auto CmpHi = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Hi, DenomHi);
4602 auto C1 = B.buildSExt(S32, CmpHi);
4603
4604 auto CmpLo = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Lo, DenomLo);
4605 auto C2 = B.buildSExt(S32, CmpLo);
4606
4607 auto CmpEq = B.buildICmp(CmpInst::ICMP_EQ, S1, Sub1_Hi, DenomHi);
4608 auto C3 = B.buildSelect(S32, CmpEq, C2, C1);
4609
4610 // TODO: Here and below portions of the code can be enclosed into if/endif.
4611 // Currently control flow is unconditional and we have 4 selects after
4612 // potential endif to substitute PHIs.
4613
4614 // if C3 != 0 ...
4615 auto Sub2_Lo = B.buildUSubo(S32, S1, Sub1_Lo, DenomLo);
4616 auto Sub2_Mi = B.buildUSube(S32, S1, Sub1_Mi, DenomHi, Sub1_Lo.getReg(1));
4617 auto Sub2_Hi = B.buildUSube(S32, S1, Sub2_Mi, Zero32, Sub2_Lo.getReg(1));
4618 auto Sub2 = B.buildMergeLikeInstr(S64, {Sub2_Lo, Sub2_Hi});
4619
4620 auto One64 = B.buildConstant(S64, 1);
4621 auto Add3 = B.buildAdd(S64, MulHi3, One64);
4622
4623 auto C4 =
4624 B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Hi, DenomHi));
4625 auto C5 =
4626 B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Lo, DenomLo));
4627 auto C6 = B.buildSelect(
4628 S32, B.buildICmp(CmpInst::ICMP_EQ, S1, Sub2_Hi, DenomHi), C5, C4);
4629
4630 // if (C6 != 0)
4631 auto Add4 = B.buildAdd(S64, Add3, One64);
4632 auto Sub3_Lo = B.buildUSubo(S32, S1, Sub2_Lo, DenomLo);
4633
4634 auto Sub3_Mi = B.buildUSube(S32, S1, Sub2_Mi, DenomHi, Sub2_Lo.getReg(1));
4635 auto Sub3_Hi = B.buildUSube(S32, S1, Sub3_Mi, Zero32, Sub3_Lo.getReg(1));
4636 auto Sub3 = B.buildMergeLikeInstr(S64, {Sub3_Lo, Sub3_Hi});
4637
4638 // endif C6
4639 // endif C3
4640
4641 if (DstDivReg) {
4642 auto Sel1 = B.buildSelect(
4643 S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Add4, Add3);
4644 B.buildSelect(DstDivReg, B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32),
4645 Sel1, MulHi3);
4646 }
4647
4648 if (DstRemReg) {
4649 auto Sel2 = B.buildSelect(
4650 S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Sub3, Sub2);
4651 B.buildSelect(DstRemReg, B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32),
4652 Sel2, Sub1);
4653 }
4654}
4655
4656bool AMDGPULegalizerInfo::legalizeUnsignedDIV_REM(MachineInstr &MI,
4657 MachineRegisterInfo &MRI,
4658 MachineIRBuilder &B) const {
4659 Register DstDivReg, DstRemReg;
4660 switch (MI.getOpcode()) {
4661 default:
4662 llvm_unreachable("Unexpected opcode!");
4663 case AMDGPU::G_UDIV: {
4664 DstDivReg = MI.getOperand(0).getReg();
4665 break;
4666 }
4667 case AMDGPU::G_UREM: {
4668 DstRemReg = MI.getOperand(0).getReg();
4669 break;
4670 }
4671 case AMDGPU::G_UDIVREM: {
4672 DstDivReg = MI.getOperand(0).getReg();
4673 DstRemReg = MI.getOperand(1).getReg();
4674 break;
4675 }
4676 }
4677
4678 const LLT S64 = LLT::scalar(64);
4679 const LLT S32 = LLT::scalar(32);
4680 const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs();
4681 Register Num = MI.getOperand(FirstSrcOpIdx).getReg();
4682 Register Den = MI.getOperand(FirstSrcOpIdx + 1).getReg();
4683 LLT Ty = MRI.getType(MI.getOperand(0).getReg());
4684
4685 if (Ty == S32)
4686 legalizeUnsignedDIV_REM32Impl(B, DstDivReg, DstRemReg, Num, Den);
4687 else if (Ty == S64)
4688 legalizeUnsignedDIV_REM64Impl(B, DstDivReg, DstRemReg, Num, Den);
4689 else
4690 return false;
4691
4692 MI.eraseFromParent();
4693 return true;
4694}
4695
4696bool AMDGPULegalizerInfo::legalizeSignedDIV_REM(MachineInstr &MI,
4697 MachineRegisterInfo &MRI,
4698 MachineIRBuilder &B) const {
4699 const LLT S64 = LLT::scalar(64);
4700 const LLT S32 = LLT::scalar(32);
4701
4702 LLT Ty = MRI.getType(MI.getOperand(0).getReg());
4703 if (Ty != S32 && Ty != S64)
4704 return false;
4705
4706 const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs();
4707 Register LHS = MI.getOperand(FirstSrcOpIdx).getReg();
4708 Register RHS = MI.getOperand(FirstSrcOpIdx + 1).getReg();
4709
4710 auto SignBitOffset = B.buildConstant(S32, Ty.getSizeInBits() - 1);
4711 auto LHSign = B.buildAShr(Ty, LHS, SignBitOffset);
4712 auto RHSign = B.buildAShr(Ty, RHS, SignBitOffset);
4713
4714 LHS = B.buildAdd(Ty, LHS, LHSign).getReg(0);
4715 RHS = B.buildAdd(Ty, RHS, RHSign).getReg(0);
4716
4717 LHS = B.buildXor(Ty, LHS, LHSign).getReg(0);
4718 RHS = B.buildXor(Ty, RHS, RHSign).getReg(0);
4719
4720 Register DstDivReg, DstRemReg, TmpDivReg, TmpRemReg;
4721 switch (MI.getOpcode()) {
4722 default:
4723 llvm_unreachable("Unexpected opcode!");
4724 case AMDGPU::G_SDIV: {
4725 DstDivReg = MI.getOperand(0).getReg();
4726 TmpDivReg = MRI.createGenericVirtualRegister(Ty);
4727 break;
4728 }
4729 case AMDGPU::G_SREM: {
4730 DstRemReg = MI.getOperand(0).getReg();
4731 TmpRemReg = MRI.createGenericVirtualRegister(Ty);
4732 break;
4733 }
4734 case AMDGPU::G_SDIVREM: {
4735 DstDivReg = MI.getOperand(0).getReg();
4736 DstRemReg = MI.getOperand(1).getReg();
4737 TmpDivReg = MRI.createGenericVirtualRegister(Ty);
4738 TmpRemReg = MRI.createGenericVirtualRegister(Ty);
4739 break;
4740 }
4741 }
4742
4743 if (Ty == S32)
4744 legalizeUnsignedDIV_REM32Impl(B, TmpDivReg, TmpRemReg, LHS, RHS);
4745 else
4746 legalizeUnsignedDIV_REM64Impl(B, TmpDivReg, TmpRemReg, LHS, RHS);
4747
4748 if (DstDivReg) {
4749 auto Sign = B.buildXor(Ty, LHSign, RHSign).getReg(0);
4750 auto SignXor = B.buildXor(Ty, TmpDivReg, Sign).getReg(0);
4751 B.buildSub(DstDivReg, SignXor, Sign);
4752 }
4753
4754 if (DstRemReg) {
4755 auto Sign = LHSign.getReg(0); // Remainder sign is the same as LHS
4756 auto SignXor = B.buildXor(Ty, TmpRemReg, Sign).getReg(0);
4757 B.buildSub(DstRemReg, SignXor, Sign);
4758 }
4759
4760 MI.eraseFromParent();
4761 return true;
4762}
4763
4764bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI,
4765 MachineRegisterInfo &MRI,
4766 MachineIRBuilder &B) const {
4767 Register Res = MI.getOperand(0).getReg();
4768 Register LHS = MI.getOperand(1).getReg();
4769 Register RHS = MI.getOperand(2).getReg();
4770 uint16_t Flags = MI.getFlags();
4771 LLT ResTy = MRI.getType(Res);
4772
4773 const MachineFunction &MF = B.getMF();
4774 bool AllowInaccurateRcp = MI.getFlag(MachineInstr::FmAfn) ||
4775 MF.getTarget().Options.UnsafeFPMath;
4776
4777 if (auto CLHS = getConstantFPVRegVal(LHS, MRI)) {
4778 if (!AllowInaccurateRcp && ResTy != LLT::scalar(16))
4779 return false;
4780
4781 // v_rcp_f32 and v_rsq_f32 do not support denormals, and according to
4782 // the CI documentation have a worst case error of 1 ulp.
4783 // OpenCL requires <= 2.5 ulp for 1.0 / x, so it should always be OK to
4784 // use it as long as we aren't trying to use denormals.
4785 //
4786 // v_rcp_f16 and v_rsq_f16 DO support denormals and 0.51ulp.
4787
4788 // 1 / x -> RCP(x)
4789 if (CLHS->isExactlyValue(1.0)) {
4790 B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res)
4791 .addUse(RHS)
4792 .setMIFlags(Flags);
4793
4794 MI.eraseFromParent();
4795 return true;
4796 }
4797
4798 // -1 / x -> RCP( FNEG(x) )
4799 if (CLHS->isExactlyValue(-1.0)) {
4800 auto FNeg = B.buildFNeg(ResTy, RHS, Flags);
4801 B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res)
4802 .addUse(FNeg.getReg(0))
4803 .setMIFlags(Flags);
4804
4805 MI.eraseFromParent();
4806 return true;
4807 }
4808 }
4809
4810 // For f16 require afn or arcp.
4811 // For f32 require afn.
4812 if (!AllowInaccurateRcp && (ResTy != LLT::scalar(16) ||
4813 !MI.getFlag(MachineInstr::FmArcp)))
4814 return false;
4815
4816 // x / y -> x * (1.0 / y)
4817 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy})
4818 .addUse(RHS)
4819 .setMIFlags(Flags);
4820 B.buildFMul(Res, LHS, RCP, Flags);
4821
4822 MI.eraseFromParent();
4823 return true;
4824}
4825
4826bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV64(MachineInstr &MI,
4827 MachineRegisterInfo &MRI,
4828 MachineIRBuilder &B) const {
4829 Register Res = MI.getOperand(0).getReg();
4830 Register X = MI.getOperand(1).getReg();
4831 Register Y = MI.getOperand(2).getReg();
4832 uint16_t Flags = MI.getFlags();
4833 LLT ResTy = MRI.getType(Res);
4834
4835 const MachineFunction &MF = B.getMF();
4836 bool AllowInaccurateRcp = MF.getTarget().Options.UnsafeFPMath ||
4837 MI.getFlag(MachineInstr::FmAfn);
4838
4839 if (!AllowInaccurateRcp)
4840 return false;
4841
4842 auto NegY = B.buildFNeg(ResTy, Y);
4843 auto One = B.buildFConstant(ResTy, 1.0);
4844
4845 auto R = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy})
4846 .addUse(Y)
4847 .setMIFlags(Flags);
4848
4849 auto Tmp0 = B.buildFMA(ResTy, NegY, R, One);
4850 R = B.buildFMA(ResTy, Tmp0, R, R);
4851
4852 auto Tmp1 = B.buildFMA(ResTy, NegY, R, One);
4853 R = B.buildFMA(ResTy, Tmp1, R, R);
4854
4855 auto Ret = B.buildFMul(ResTy, X, R);
4856 auto Tmp2 = B.buildFMA(ResTy, NegY, Ret, X);
4857
4858 B.buildFMA(Res, Tmp2, R, Ret);
4859 MI.eraseFromParent();
4860 return true;
4861}
4862
4863bool AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI,
4864 MachineRegisterInfo &MRI,
4865 MachineIRBuilder &B) const {
4866 if (legalizeFastUnsafeFDIV(MI, MRI, B))
4867 return true;
4868
4869 Register Res = MI.getOperand(0).getReg();
4870 Register LHS = MI.getOperand(1).getReg();
4871 Register RHS = MI.getOperand(2).getReg();
4872
4873 uint16_t Flags = MI.getFlags();
4874
4875 LLT S16 = LLT::scalar(16);
4876 LLT S32 = LLT::scalar(32);
4877
4878 auto LHSExt = B.buildFPExt(S32, LHS, Flags);
4879 auto RHSExt = B.buildFPExt(S32, RHS, Flags);
4880
4881 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32})
4882 .addUse(RHSExt.getReg(0))
4883 .setMIFlags(Flags);
4884
4885 auto QUOT = B.buildFMul(S32, LHSExt, RCP, Flags);
4886 auto RDst = B.buildFPTrunc(S16, QUOT, Flags);
4887
4888 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res)
4889 .addUse(RDst.getReg(0))
4890 .addUse(RHS)
4891 .addUse(LHS)
4892 .setMIFlags(Flags);
4893
4894 MI.eraseFromParent();
4895 return true;
4896}
4897
4898static constexpr unsigned SPDenormModeBitField =
4899 AMDGPU::Hwreg::HwregEncoding::encode(AMDGPU::Hwreg::ID_MODE, 4, 2);
4900
4901// Enable or disable FP32 denorm mode. When 'Enable' is true, emit instructions
4902// to enable denorm mode. When 'Enable' is false, disable denorm mode.
4903static void toggleSPDenormMode(bool Enable, MachineIRBuilder &B,
4904 const GCNSubtarget &ST,
4905 SIModeRegisterDefaults Mode) {
4906 // Set SP denorm mode to this value.
4907 unsigned SPDenormMode =
4908 Enable ? FP_DENORM_FLUSH_NONE : Mode.fpDenormModeSPValue();
4909
4910 if (ST.hasDenormModeInst()) {
4911 // Preserve default FP64FP16 denorm mode while updating FP32 mode.
4912 uint32_t DPDenormModeDefault = Mode.fpDenormModeDPValue();
4913
4914 uint32_t NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2);
4915 B.buildInstr(AMDGPU::S_DENORM_MODE)
4916 .addImm(NewDenormModeValue);
4917
4918 } else {
4919 B.buildInstr(AMDGPU::S_SETREG_IMM32_B32)
4920 .addImm(SPDenormMode)
4921 .addImm(SPDenormModeBitField);
4922 }
4923}
4924
4925bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI,
4926 MachineRegisterInfo &MRI,
4927 MachineIRBuilder &B) const {
4928 if (legalizeFastUnsafeFDIV(MI, MRI, B))
4929 return true;
4930
4931 Register Res = MI.getOperand(0).getReg();
4932 Register LHS = MI.getOperand(1).getReg();
4933 Register RHS = MI.getOperand(2).getReg();
4934 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
4935 SIModeRegisterDefaults Mode = MFI->getMode();
4936
4937 uint16_t Flags = MI.getFlags();
4938
4939 LLT S32 = LLT::scalar(32);
4940 LLT S1 = LLT::scalar(1);
4941
4942 auto One = B.buildFConstant(S32, 1.0f);
4943
4944 auto DenominatorScaled =
4945 B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1})
4946 .addUse(LHS)
4947 .addUse(RHS)
4948 .addImm(0)
4949 .setMIFlags(Flags);
4950 auto NumeratorScaled =
4951 B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1})
4952 .addUse(LHS)
4953 .addUse(RHS)
4954 .addImm(1)
4955 .setMIFlags(Flags);
4956
4957 auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32})
4958 .addUse(DenominatorScaled.getReg(0))
4959 .setMIFlags(Flags);
4960 auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags);
4961
4962 const bool PreservesDenormals = Mode.FP32Denormals == DenormalMode::getIEEE();
4963 const bool HasDynamicDenormals =
4964 (Mode.FP32Denormals.Input == DenormalMode::Dynamic) ||
4965 (Mode.FP32Denormals.Output == DenormalMode::Dynamic);
4966
4967 Register SavedSPDenormMode;
4968 if (!PreservesDenormals) {
4969 if (HasDynamicDenormals) {
4970 SavedSPDenormMode = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
4971 B.buildInstr(AMDGPU::S_GETREG_B32)
4972 .addDef(SavedSPDenormMode)
4973 .addImm(SPDenormModeBitField);
4974 }
4975 toggleSPDenormMode(true, B, ST, Mode);
4976 }
4977
4978 auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags);
4979 auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags);
4980 auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags);
4981 auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags);
4982 auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags);
4983 auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags);
4984
4985 if (!PreservesDenormals) {
4986 if (HasDynamicDenormals) {
4987 assert(SavedSPDenormMode);
4988 B.buildInstr(AMDGPU::S_SETREG_B32)
4989 .addReg(SavedSPDenormMode)
4990 .addImm(SPDenormModeBitField);
4991 } else
4992 toggleSPDenormMode(false, B, ST, Mode);
4993 }
4994
4995 auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32})
4996 .addUse(Fma4.getReg(0))
4997 .addUse(Fma1.getReg(0))
4998 .addUse(Fma3.getReg(0))
4999 .addUse(NumeratorScaled.getReg(1))
5000 .setMIFlags(Flags);
5001
5002 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res)
5003 .addUse(Fmas.getReg(0))
5004 .addUse(RHS)
5005 .addUse(LHS)
5006 .setMIFlags(Flags);
5007
5008 MI.eraseFromParent();
5009 return true;
5010}
5011
5012bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI,
5013 MachineRegisterInfo &MRI,
5014 MachineIRBuilder &B) const {
5015 if (legalizeFastUnsafeFDIV64(MI, MRI, B))
5016 return true;
5017
5018 Register Res = MI.getOperand(0).getReg();
5019 Register LHS = MI.getOperand(1).getReg();
5020 Register RHS = MI.getOperand(2).getReg();
5021
5022 uint16_t Flags = MI.getFlags();
5023
5024 LLT S64 = LLT::scalar(64);
5025 LLT S1 = LLT::scalar(1);
5026
5027 auto One = B.buildFConstant(S64, 1.0);
5028
5029 auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1})
5030 .addUse(LHS)
5031 .addUse(RHS)
5032 .addImm(0)
5033 .setMIFlags(Flags);
5034
5035 auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags);
5036
5037 auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64})
5038 .addUse(DivScale0.getReg(0))
5039 .setMIFlags(Flags);
5040
5041 auto Fma0 = B.buildFMA(S64, NegDivScale0, Rcp, One, Flags);
5042 auto Fma1 = B.buildFMA(S64, Rcp, Fma0, Rcp, Flags);
5043 auto Fma2 = B.buildFMA(S64, NegDivScale0, Fma1, One, Flags);
5044
5045 auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1})
5046 .addUse(LHS)
5047 .addUse(RHS)
5048 .addImm(1)
5049 .setMIFlags(Flags);
5050
5051 auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags);
5052 auto Mul = B.buildFMul(S64, DivScale1.getReg(0), Fma3, Flags);
5053 auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags);
5054
5055 Register Scale;
5056 if (!ST.hasUsableDivScaleConditionOutput()) {
5057 // Workaround a hardware bug on SI where the condition output from div_scale
5058 // is not usable.
5059
5060 LLT S32 = LLT::scalar(32);
5061
5062 auto NumUnmerge = B.buildUnmerge(S32, LHS);
5063 auto DenUnmerge = B.buildUnmerge(S32, RHS);
5064 auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0);
5065 auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1);
5066
5067 auto CmpNum = B.buildICmp(ICmpInst::ICMP_EQ, S1, NumUnmerge.getReg(1),
5068 Scale1Unmerge.getReg(1));
5069 auto CmpDen = B.buildICmp(ICmpInst::ICMP_EQ, S1, DenUnmerge.getReg(1),
5070 Scale0Unmerge.getReg(1));
5071 Scale = B.buildXor(S1, CmpNum, CmpDen).getReg(0);
5072 } else {
5073 Scale = DivScale1.getReg(1);
5074 }
5075
5076 auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64})
5077 .addUse(Fma4.getReg(0))
5078 .addUse(Fma3.getReg(0))
5079 .addUse(Mul.getReg(0))
5080 .addUse(Scale)
5081 .setMIFlags(Flags);
5082
5083 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, ArrayRef(Res))
5084 .addUse(Fmas.getReg(0))
5085 .addUse(RHS)
5086 .addUse(LHS)
5087 .setMIFlags(Flags);
5088
5089 MI.eraseFromParent();
5090 return true;
5091}
5092
5093bool AMDGPULegalizerInfo::legalizeFFREXP(MachineInstr &MI,
5094 MachineRegisterInfo &MRI,
5095 MachineIRBuilder &B) const {
5096 Register Res0 = MI.getOperand(0).getReg();
5097 Register Res1 = MI.getOperand(1).getReg();
5098 Register Val = MI.getOperand(2).getReg();
5099 uint16_t Flags = MI.getFlags();
5100
5101 LLT Ty = MRI.getType(Res0);
5102 LLT InstrExpTy = Ty == LLT::scalar(16) ? LLT::scalar(16) : LLT::scalar(32);
5103
5104 auto Mant = B.buildIntrinsic(Intrinsic::amdgcn_frexp_mant, {Ty})
5105 .addUse(Val)
5106 .setMIFlags(Flags);
5107 auto Exp = B.buildIntrinsic(Intrinsic::amdgcn_frexp_exp, {InstrExpTy})
5108 .addUse(Val)
5109 .setMIFlags(Flags);
5110
5111 if (ST.hasFractBug()) {
5112 auto Fabs = B.buildFAbs(Ty, Val);
5113 auto Inf = B.buildFConstant(Ty, APFloat::getInf(getFltSemanticForLLT(Ty)));
5114 auto IsFinite =
5115 B.buildFCmp(CmpInst::FCMP_OLT, LLT::scalar(1), Fabs, Inf, Flags);
5116 auto Zero = B.buildConstant(InstrExpTy, 0);
5117 Exp = B.buildSelect(InstrExpTy, IsFinite, Exp, Zero);
5118 Mant = B.buildSelect(Ty, IsFinite, Mant, Val);
5119 }
5120
5121 B.buildCopy(Res0, Mant);
5122 B.buildSExtOrTrunc(Res1, Exp);
5123
5124 MI.eraseFromParent();
5125 return true;
5126}
5127
5128bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI,
5129 MachineRegisterInfo &MRI,
5130 MachineIRBuilder &B) const {
5131 Register Res = MI.getOperand(0).getReg();
5132 Register LHS = MI.getOperand(2).getReg();
5133 Register RHS = MI.getOperand(3).getReg();
5134 uint16_t Flags = MI.getFlags();
5135
5136 LLT S32 = LLT::scalar(32);
5137 LLT S1 = LLT::scalar(1);
5138
5139 auto Abs = B.buildFAbs(S32, RHS, Flags);
5140 const APFloat C0Val(1.0f);
5141
5142 auto C0 = B.buildFConstant(S32, 0x1p+96f);
5143 auto C1 = B.buildFConstant(S32, 0x1p-32f);
5144 auto C2 = B.buildFConstant(S32, 1.0f);
5145
5146 auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags);
5147 auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags);
5148
5149 auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags);
5150
5151 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32})
5152 .addUse(Mul0.getReg(0))
5153 .setMIFlags(Flags);
5154
5155 auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags);
5156
5157 B.buildFMul(Res, Sel, Mul1, Flags);
5158
5159 MI.eraseFromParent();
5160 return true;
5161}
5162
5163bool AMDGPULegalizerInfo::legalizeFSQRTF16(MachineInstr &MI,
5164 MachineRegisterInfo &MRI,
5165 MachineIRBuilder &B) const {
5166 // Bypass the correct expansion that a standard promotion through G_FSQRT
5167 // would get. The f32 op is accurate enough for the f16 case.
5168 unsigned Flags = MI.getFlags();
5169 assert(!ST.has16BitInsts());
5170 const LLT F32 = LLT::scalar(32);
5171 auto Ext = B.buildFPExt(F32, MI.getOperand(1), Flags);
5172 auto Log2 = B.buildIntrinsic(Intrinsic::amdgcn_sqrt, {F32})
5173 .addUse(Ext.getReg(0))
5174 .setMIFlags(Flags);
5175 B.buildFPTrunc(MI.getOperand(0), Log2, Flags);
5176 MI.eraseFromParent();
5177 return true;
5178}
5179
5180bool AMDGPULegalizerInfo::legalizeFSQRTF32(MachineInstr &MI,
5181 MachineRegisterInfo &MRI,
5182 MachineIRBuilder &B) const {
5183 MachineFunction &MF = B.getMF();
5184 Register Dst = MI.getOperand(0).getReg();
5185 Register X = MI.getOperand(1).getReg();
5186 const unsigned Flags = MI.getFlags();
5187 const LLT S1 = LLT::scalar(1);
5188 const LLT F32 = LLT::scalar(32);
5189 const LLT I32 = LLT::scalar(32);
5190
5191 if (allowApproxFunc(MF, Flags)) {
5192 B.buildIntrinsic(Intrinsic::amdgcn_sqrt, ArrayRef<Register>({Dst}))
5193 .addUse(X)
5194 .setMIFlags(Flags);
5195 MI.eraseFromParent();
5196 return true;
5197 }
5198
5199 auto ScaleThreshold = B.buildFConstant(F32, 0x1.0p-96f);
5200 auto NeedScale = B.buildFCmp(CmpInst::FCMP_OGT, S1, ScaleThreshold, X, Flags);
5201 auto ScaleUpFactor = B.buildFConstant(F32, 0x1.0p+32f);
5202 auto ScaledX = B.buildFMul(F32, X, ScaleUpFactor, Flags);
5203 auto SqrtX = B.buildSelect(F32, NeedScale, ScaledX, X, Flags);
5204
5205 Register SqrtS = MRI.createGenericVirtualRegister(F32);
5206 if (needsDenormHandlingF32(MF, X, Flags)) {
5207 B.buildIntrinsic(Intrinsic::amdgcn_sqrt, ArrayRef<Register>({SqrtS}))
5208 .addUse(SqrtX.getReg(0))
5209 .setMIFlags(Flags);
5210
5211 auto NegOne = B.buildConstant(I32, -1);
5212 auto SqrtSNextDown = B.buildAdd(I32, SqrtS, NegOne);
5213
5214 auto NegSqrtSNextDown = B.buildFNeg(F32, SqrtSNextDown, Flags);
5215 auto SqrtVP = B.buildFMA(F32, NegSqrtSNextDown, SqrtS, SqrtX, Flags);
5216
5217 auto PosOne = B.buildConstant(I32, 1);
5218 auto SqrtSNextUp = B.buildAdd(I32, SqrtS, PosOne);
5219
5220 auto NegSqrtSNextUp = B.buildFNeg(F32, SqrtSNextUp, Flags);
5221 auto SqrtVS = B.buildFMA(F32, NegSqrtSNextUp, SqrtS, SqrtX, Flags);
5222
5223 auto Zero = B.buildFConstant(F32, 0.0f);
5224 auto SqrtVPLE0 = B.buildFCmp(CmpInst::FCMP_OLE, S1, SqrtVP, Zero, Flags);
5225
5226 SqrtS =
5227 B.buildSelect(F32, SqrtVPLE0, SqrtSNextDown, SqrtS, Flags).getReg(0);
5228
5229 auto SqrtVPVSGT0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, SqrtVS, Zero, Flags);
5230 SqrtS =
5231 B.buildSelect(F32, SqrtVPVSGT0, SqrtSNextUp, SqrtS, Flags).getReg(0);
5232 } else {
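// No denormal handling is needed here: start from rsq(SqrtX) and refine with
// a Newton-Raphson style sequence of FMAs to reach the required accuracy.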
5233 auto SqrtR =
5234 B.buildIntrinsic(Intrinsic::amdgcn_rsq, {F32}).addReg(SqrtX.getReg(0));
5235 B.buildFMul(SqrtS, SqrtX, SqrtR, Flags);
5236
5237 auto Half = B.buildFConstant(F32, 0.5f);
5238 auto SqrtH = B.buildFMul(F32, SqrtR, Half, Flags);
5239 auto NegSqrtH = B.buildFNeg(F32, SqrtH, Flags);
5240 auto SqrtE = B.buildFMA(F32, NegSqrtH, SqrtS, Half, Flags);
5241 SqrtH = B.buildFMA(F32, SqrtH, SqrtE, SqrtH, Flags);
5242 SqrtS = B.buildFMA(F32, SqrtS, SqrtE, SqrtS, Flags).getReg(0);
5243 auto NegSqrtS = B.buildFNeg(F32, SqrtS, Flags);
5244 auto SqrtD = B.buildFMA(F32, NegSqrtS, SqrtS, SqrtX, Flags);
5245 SqrtS = B.buildFMA(F32, SqrtD, SqrtH, SqrtS, Flags).getReg(0);
5246 }
5247
5248 auto ScaleDownFactor = B.buildFConstant(F32, 0x1.0p-16f);
5249
5250 auto ScaledDown = B.buildFMul(F32, SqrtS, ScaleDownFactor, Flags);
5251
5252 SqrtS = B.buildSelect(F32, NeedScale, ScaledDown, SqrtS, Flags).getReg(0);
5253
5254 auto IsZeroOrInf = B.buildIsFPClass(LLT::scalar(1), SqrtX, fcZero | fcPosInf);
5255 B.buildSelect(Dst, IsZeroOrInf, SqrtX, SqrtS, Flags);
5256
5257 MI.eraseFromParent();
5258 return true;
5259}
5260
5263 MachineIRBuilder &B) const {
5264 // For the double type, the SQRT and RSQ instructions don't have the required
5265 // precision, so we apply Goldschmidt's algorithm to improve the result:
5266 //
5267 // y0 = rsq(x)
5268 // g0 = x * y0
5269 // h0 = 0.5 * y0
5270 //
5271 // r0 = 0.5 - h0 * g0
5272 // g1 = g0 * r0 + g0
5273 // h1 = h0 * r0 + h0
5274 //
5275 // r1 = 0.5 - h1 * g1 => d0 = x - g1 * g1
5276 // g2 = g1 * r1 + g1 g2 = d0 * h1 + g1
5277 // h2 = h1 * r1 + h1
5278 //
5279 // r2 = 0.5 - h2 * g2 => d1 = x - g2 * g2
5280 // g3 = g2 * r2 + g2 g3 = d1 * h1 + g2
5281 //
5282 // sqrt(x) = g3
5283
5284 const LLT S1 = LLT::scalar(1);
5285 const LLT S32 = LLT::scalar(32);
5286 const LLT F64 = LLT::scalar(64);
5287
5288 Register Dst = MI.getOperand(0).getReg();
5289 assert(MRI.getType(Dst) == F64 && "only expect to lower f64 sqrt");
5290
5291 Register X = MI.getOperand(1).getReg();
5292 unsigned Flags = MI.getFlags();
5293
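// Inputs smaller than 2^-767 are scaled up by 2^256 (ldexp by +256) before
// the rsq; since sqrt(x * 2^256) == sqrt(x) * 2^128, the result is scaled
// back down by 2^-128 (ldexp by -128) at the end.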
5294 auto ScaleConstant = B.buildFConstant(F64, 0x1.0p-767);
5295
5296 auto ZeroInt = B.buildConstant(S32, 0);
5297 auto Scaling = B.buildFCmp(FCmpInst::FCMP_OLT, S1, X, ScaleConstant);
5298
5299 // Scale up input if it is too small.
5300 auto ScaleUpFactor = B.buildConstant(S32, 256);
5301 auto ScaleUp = B.buildSelect(S32, Scaling, ScaleUpFactor, ZeroInt);
5302 auto SqrtX = B.buildFLdexp(F64, X, ScaleUp, Flags);
5303
5304 auto SqrtY =
5305 B.buildIntrinsic(Intrinsic::amdgcn_rsq, {F64}).addReg(SqrtX.getReg(0));
5306
5307 auto Half = B.buildFConstant(F64, 0.5);
5308 auto SqrtH0 = B.buildFMul(F64, SqrtY, Half);
5309 auto SqrtS0 = B.buildFMul(F64, SqrtX, SqrtY);
5310
5311 auto NegSqrtH0 = B.buildFNeg(F64, SqrtH0);
5312 auto SqrtR0 = B.buildFMA(F64, NegSqrtH0, SqrtS0, Half);
5313
5314 auto SqrtS1 = B.buildFMA(F64, SqrtS0, SqrtR0, SqrtS0);
5315 auto SqrtH1 = B.buildFMA(F64, SqrtH0, SqrtR0, SqrtH0);
5316
5317 auto NegSqrtS1 = B.buildFNeg(F64, SqrtS1);
5318 auto SqrtD0 = B.buildFMA(F64, NegSqrtS1, SqrtS1, SqrtX);
5319
5320 auto SqrtS2 = B.buildFMA(F64, SqrtD0, SqrtH1, SqrtS1);
5321
5322 auto NegSqrtS2 = B.buildFNeg(F64, SqrtS2);
5323 auto SqrtD1 = B.buildFMA(F64, NegSqrtS2, SqrtS2, SqrtX);
5324
5325 auto SqrtRet = B.buildFMA(F64, SqrtD1, SqrtH1, SqrtS2);
5326
5327 // Scale down the result.
5328 auto ScaleDownFactor = B.buildConstant(S32, -128);
5329 auto ScaleDown = B.buildSelect(S32, Scaling, ScaleDownFactor, ZeroInt);
5330 SqrtRet = B.buildFLdexp(F64, SqrtRet, ScaleDown, Flags);
5331
5332 // TODO: Switch to fcmp oeq 0 for finite only. Can't fully remove this check
5333 // with finite only or nsz because rsq(+/-0) = +/-inf
5334
5335 // TODO: Check for DAZ and expand to subnormals
5336 auto IsZeroOrInf = B.buildIsFPClass(LLT::scalar(1), SqrtX, fcZero | fcPosInf);
5337
5338 // If x is +INF, +0, or -0, use its original value
5339 B.buildSelect(Dst, IsZeroOrInf, SqrtX, SqrtRet, Flags);
5340
5341 MI.eraseFromParent();
5342 return true;
5343}
5344
5347 MachineIRBuilder &B) const {
5348 LLT Ty = MRI.getType(MI.getOperand(0).getReg());
5349 if (Ty == LLT::scalar(32))
5350 return legalizeFSQRTF32(MI, MRI, B);
5351 if (Ty == LLT::scalar(64))
5352 return legalizeFSQRTF64(MI, MRI, B);
5353 if (Ty == LLT::scalar(16))
5354 return legalizeFSQRTF16(MI, MRI, B);
5355 return false;
5356}
5357
5358// Expand llvm.amdgcn.rsq.clamp on targets that don't support the instruction.
5359// FIXME: Why do we handle this one but not other removed instructions?
5360//
5361// Reciprocal square root. The clamp prevents infinite results, clamping
5362// infinities to max_float. D.f = 1.0 / sqrt(S0.f), result clamped to
5363// +-max_float.
5366 MachineIRBuilder &B) const {
5368 return true;
5369
5370 Register Dst = MI.getOperand(0).getReg();
5371 Register Src = MI.getOperand(2).getReg();
5372 auto Flags = MI.getFlags();
5373
5374 LLT Ty = MRI.getType(Dst);
5375
5376 const fltSemantics *FltSemantics;
5377 if (Ty == LLT::scalar(32))
5378 FltSemantics = &APFloat::IEEEsingle();
5379 else if (Ty == LLT::scalar(64))
5380 FltSemantics = &APFloat::IEEEdouble();
5381 else
5382 return false;
5383
5384 auto Rsq = B.buildIntrinsic(Intrinsic::amdgcn_rsq, {Ty})
5385 .addUse(Src)
5386 .setMIFlags(Flags);
5387
5388 // We don't need to concern ourselves with the snan handling difference: the
5389 // rsq already quieted (or not) any snan, so use the one which will directly select.
5390 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
5391 const bool UseIEEE = MFI->getMode().IEEE;
5392
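// Clamp the rsq result to [-max_float, +max_float]: take the minimum with
// +max_float, then the maximum with -max_float, using the IEEE min/max
// variants when IEEE mode is enabled.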
5393 auto MaxFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics));
5394 auto ClampMax = UseIEEE ? B.buildFMinNumIEEE(Ty, Rsq, MaxFlt, Flags) :
5395 B.buildFMinNum(Ty, Rsq, MaxFlt, Flags);
5396
5397 auto MinFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics, true));
5398
5399 if (UseIEEE)
5400 B.buildFMaxNumIEEE(Dst, ClampMax, MinFlt, Flags);
5401 else
5402 B.buildFMaxNum(Dst, ClampMax, MinFlt, Flags);
5403 MI.eraseFromParent();
5404 return true;
5405}
5406
5407// TODO: Fix pointer type handling
5410 Intrinsic::ID IID) const {
5411
5412 MachineIRBuilder &B = Helper.MIRBuilder;
5413 MachineRegisterInfo &MRI = *B.getMRI();
5414
5415 bool IsPermLane16 = IID == Intrinsic::amdgcn_permlane16 ||
5416 IID == Intrinsic::amdgcn_permlanex16;
5417
5418 auto createLaneOp = [&IID, &B, &MI](Register Src0, Register Src1,
5419 Register Src2, LLT VT) -> Register {
5420 auto LaneOp = B.buildIntrinsic(IID, {VT}).addUse(Src0);
5421 switch (IID) {
5422 case Intrinsic::amdgcn_readfirstlane:
5423 case Intrinsic::amdgcn_permlane64:
5424 return LaneOp.getReg(0);
5425 case Intrinsic::amdgcn_readlane:
5426 return LaneOp.addUse(Src1).getReg(0);
5427 case Intrinsic::amdgcn_writelane:
5428 return LaneOp.addUse(Src1).addUse(Src2).getReg(0);
5429 case Intrinsic::amdgcn_permlane16:
5430 case Intrinsic::amdgcn_permlanex16: {
5431 Register Src3 = MI.getOperand(5).getReg();
5432 int64_t Src4 = MI.getOperand(6).getImm();
5433 int64_t Src5 = MI.getOperand(7).getImm();
5434 return LaneOp.addUse(Src1)
5435 .addUse(Src2)
5436 .addUse(Src3)
5437 .addImm(Src4)
5438 .addImm(Src5)
5439 .getReg(0);
5440 }
5441 default:
5442 llvm_unreachable("unhandled lane op");
5443 }
5444 };
5445
5446 Register DstReg = MI.getOperand(0).getReg();
5447 Register Src0 = MI.getOperand(2).getReg();
5448 Register Src1, Src2;
5449 if (IID == Intrinsic::amdgcn_readlane || IID == Intrinsic::amdgcn_writelane ||
5450 IsPermLane16) {
5451 Src1 = MI.getOperand(3).getReg();
5452 if (IID == Intrinsic::amdgcn_writelane || IsPermLane16) {
5453 Src2 = MI.getOperand(4).getReg();
5454 }
5455 }
5456
5457 LLT Ty = MRI.getType(DstReg);
5458 unsigned Size = Ty.getSizeInBits();
5459
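// Legalization strategy: 32-bit values are already legal. Smaller values are
// any-extended to 32 bits and truncated back afterwards. Larger values must
// be a multiple of 32 bits and are split into 32-bit (or, for 16-bit element
// vectors, v2s16) pieces; the lane op is applied to each piece and the
// results are re-merged.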
5460 if (Size == 32) {
5461 // Already legal
5462 return true;
5463 }
5464
5465 if (Size < 32) {
5466 Src0 = B.buildAnyExt(S32, Src0).getReg(0);
5467
5468 if (IsPermLane16)
5469 Src1 = B.buildAnyExt(LLT::scalar(32), Src1).getReg(0);
5470
5471 if (IID == Intrinsic::amdgcn_writelane)
5472 Src2 = B.buildAnyExt(LLT::scalar(32), Src2).getReg(0);
5473
5474 Register LaneOpDst = createLaneOp(Src0, Src1, Src2, S32);
5475 B.buildTrunc(DstReg, LaneOpDst);
5476 MI.eraseFromParent();
5477 return true;
5478 }
5479
5480 if (Size % 32 != 0)
5481 return false;
5482
5483 LLT PartialResTy = S32;
5484 if (Ty.isVector()) {
5485 LLT EltTy = Ty.getElementType();
5486 switch (EltTy.getSizeInBits()) {
5487 case 16:
5488 PartialResTy = Ty.changeElementCount(ElementCount::getFixed(2));
5489 break;
5490 case 32:
5491 PartialResTy = EltTy;
5492 break;
5493 default:
5494 // Handle all other cases via S32 pieces.
5495 break;
5496 }
5497 }
5498
5499 SmallVector<Register, 2> PartialRes;
5500 unsigned NumParts = Size / 32;
5501 MachineInstrBuilder Src0Parts = B.buildUnmerge(PartialResTy, Src0);
5502 MachineInstrBuilder Src1Parts, Src2Parts;
5503
5504 if (IsPermLane16)
5505 Src1Parts = B.buildUnmerge(PartialResTy, Src1);
5506
5507 if (IID == Intrinsic::amdgcn_writelane)
5508 Src2Parts = B.buildUnmerge(PartialResTy, Src2);
5509
5510 for (unsigned i = 0; i < NumParts; ++i) {
5511 Src0 = Src0Parts.getReg(i);
5512
5513 if (IsPermLane16)
5514 Src1 = Src1Parts.getReg(i);
5515
5516 if (IID == Intrinsic::amdgcn_writelane)
5517 Src2 = Src2Parts.getReg(i);
5518
5519 PartialRes.push_back(createLaneOp(Src0, Src1, Src2, PartialResTy));
5520 }
5521
5522 B.buildMergeLikeInstr(DstReg, PartialRes);
5523 MI.eraseFromParent();
5524 return true;
5525}
5526
5529 MachineIRBuilder &B) const {
5533 LLT DstTy = MRI.getType(DstReg);
5534 LLT IdxTy = LLT::scalar(DstTy.getSizeInBits());
5535
5536 Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy);
5537 if (!loadInputValue(KernargPtrReg, B,
5539 return false;
5540
5541 // FIXME: This should be nuw
5542 B.buildPtrAdd(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0));
5543 return true;
5544}
5545
5546/// To create a buffer resource from a 64-bit pointer, mask off the upper 32
5547/// bits of the pointer and replace them with the stride argument, then
5548/// merge_values everything together. In the common case of a raw buffer (the
5549/// stride component is 0), we can just AND off the upper half.
5552 Register Result = MI.getOperand(0).getReg();
5553 Register Pointer = MI.getOperand(2).getReg();
5554 Register Stride = MI.getOperand(3).getReg();
5555 Register NumRecords = MI.getOperand(4).getReg();
5556 Register Flags = MI.getOperand(5).getReg();
5557
5558 LLT S32 = LLT::scalar(32);
5559
5560 B.setInsertPt(B.getMBB(), ++B.getInsertPt());
5561 auto Unmerge = B.buildUnmerge(S32, Pointer);
5562 Register LowHalf = Unmerge.getReg(0);
5563 Register HighHalf = Unmerge.getReg(1);
5564
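// Keep only the low 16 bits of the pointer's high half; the top 16 bits of
// that dword hold the stride, which is shifted into place and OR'd in below
// when it is not known to be zero.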
5565 auto AndMask = B.buildConstant(S32, 0x0000ffff);
5566 auto Masked = B.buildAnd(S32, HighHalf, AndMask);
5567
5568 MachineInstrBuilder NewHighHalf = Masked;
5569 std::optional<ValueAndVReg> StrideConst =
5571 if (!StrideConst || !StrideConst->Value.isZero()) {
5572 MachineInstrBuilder ShiftedStride;
5573 if (StrideConst) {
5574 uint32_t StrideVal = StrideConst->Value.getZExtValue();
5575 uint32_t ShiftedStrideVal = StrideVal << 16;
5576 ShiftedStride = B.buildConstant(S32, ShiftedStrideVal);
5577 } else {
5578 auto ExtStride = B.buildAnyExt(S32, Stride);
5579 auto ShiftConst = B.buildConstant(S32, 16);
5580 ShiftedStride = B.buildShl(S32, ExtStride, ShiftConst);
5581 }
5582 NewHighHalf = B.buildOr(S32, Masked, ShiftedStride);
5583 }
5584 Register NewHighHalfReg = NewHighHalf.getReg(0);
5585 B.buildMergeValues(Result, {LowHalf, NewHighHalfReg, NumRecords, Flags});
5586 MI.eraseFromParent();
5587 return true;
5588}
5589
5592 MachineIRBuilder &B) const {
5593 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
5594 if (!MFI->isEntryFunction()) {
5597 }
5598
5599 Register DstReg = MI.getOperand(0).getReg();
5600 if (!getImplicitArgPtr(DstReg, MRI, B))
5601 return false;
5602
5603 MI.eraseFromParent();
5604 return true;
5605}
5606
5609 MachineIRBuilder &B) const {
5610 Function &F = B.getMF().getFunction();
5611 std::optional<uint32_t> KnownSize =
5613 if (KnownSize.has_value())
5614 B.buildConstant(DstReg, *KnownSize);
5615 return false;
5616}
5617
5620 MachineIRBuilder &B) const {
5621
5622 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
5623 if (!MFI->isEntryFunction()) {
5626 }
5627
5628 Register DstReg = MI.getOperand(0).getReg();
5629 if (!getLDSKernelId(DstReg, MRI, B))
5630 return false;
5631
5632 MI.eraseFromParent();
5633 return true;
5634}
5635
5639 unsigned AddrSpace) const {
5640 Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B);
5641 auto Unmerge = B.buildUnmerge(LLT::scalar(32), MI.getOperand(2).getReg());
5642 Register Hi32 = Unmerge.getReg(1);
5643
5644 B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg);
5645 MI.eraseFromParent();
5646 return true;
5647}
5648
5649// The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args:
5650// offset (the offset that is included in bounds checking and swizzling, to be
5651// split between the instruction's voffset and immoffset fields) and soffset
5652// (the offset that is excluded from bounds checking and swizzling, to go in
5653// the instruction's soffset field). This function takes the first kind of
5654// offset and figures out how to split it between voffset and immoffset.
5655std::pair<Register, unsigned>
5657 Register OrigOffset) const {
5658 const unsigned MaxImm = SIInstrInfo::getMaxMUBUFImmOffset(ST);
5659 Register BaseReg;
5660 unsigned ImmOffset;
5661 const LLT S32 = LLT::scalar(32);
5662 MachineRegisterInfo &MRI = *B.getMRI();
5663
5664 std::tie(BaseReg, ImmOffset) =
5666
5667 // If BaseReg is a pointer, convert it to int.
5668 if (MRI.getType(BaseReg).isPointer())
5669 BaseReg = B.buildPtrToInt(MRI.getType(OrigOffset), BaseReg).getReg(0);
5670
5671 // If the immediate value is too big for the immoffset field, put only bits
5672 // that would normally fit in the immoffset field. The remaining value that
5673 // is copied/added for the voffset field is a large power of 2, and it
5674 // stands more chance of being CSEd with the copy/add for another similar
5675 // load/store.
5676 // However, do not do that rounding down if the remaining value would be
5677 // negative, as it appears to be illegal to have a negative offset in the
5678 // vgpr, even if adding the immediate offset makes it positive.
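// For example, if MaxImm were 0xfff, a constant offset of 0x1234 would be
// split into Overflow = 0x1000 (added to the voffset register) and
// ImmOffset = 0x234.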
5679 unsigned Overflow = ImmOffset & ~MaxImm;
5680 ImmOffset -= Overflow;
5681 if ((int32_t)Overflow < 0) {
5682 Overflow += ImmOffset;
5683 ImmOffset = 0;
5684 }
5685
5686 if (Overflow != 0) {
5687 if (!BaseReg) {
5688 BaseReg = B.buildConstant(S32, Overflow).getReg(0);
5689 } else {
5690 auto OverflowVal = B.buildConstant(S32, Overflow);
5691 BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0);
5692 }
5693 }
5694
5695 if (!BaseReg)
5696 BaseReg = B.buildConstant(S32, 0).getReg(0);
5697
5698 return std::pair(BaseReg, ImmOffset);
5699}
5700
5701/// Handle register layout difference for f16 images for some subtargets.
5704 Register Reg,
5705 bool ImageStore) const {
5706 const LLT S16 = LLT::scalar(16);
5707 const LLT S32 = LLT::scalar(32);
5708 LLT StoreVT = MRI.getType(Reg);
5709 assert(StoreVT.isVector() && StoreVT.getElementType() == S16);
5710
5711 if (ST.hasUnpackedD16VMem()) {
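// Unpacked subtargets keep each 16-bit element in the low half of its own
// 32-bit register, so split the vector into s16 pieces and any-extend each
// piece to s32.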
5712 auto Unmerge = B.buildUnmerge(S16, Reg);
5713
5714 SmallVector<Register, 4> WideRegs;
5715 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
5716 WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0));
5717
5718 int NumElts = StoreVT.getNumElements();
5719
5720 return B.buildBuildVector(LLT::fixed_vector(NumElts, S32), WideRegs)
5721 .getReg(0);
5722 }
5723
5724 if (ImageStore && ST.hasImageStoreD16Bug()) {
5725 if (StoreVT.getNumElements() == 2) {
5726 SmallVector<Register, 4> PackedRegs;
5727 Reg = B.buildBitcast(S32, Reg).getReg(0);
5728 PackedRegs.push_back(Reg);
5729 PackedRegs.resize(2, B.buildUndef(S32).getReg(0));
5730 return B.buildBuildVector(LLT::fixed_vector(2, S32), PackedRegs)
5731 .getReg(0);
5732 }
5733
5734 if (StoreVT.getNumElements() == 3) {
5735 SmallVector<Register, 4> PackedRegs;
5736 auto Unmerge = B.buildUnmerge(S16, Reg);
5737 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
5738 PackedRegs.push_back(Unmerge.getReg(I));
5739 PackedRegs.resize(6, B.buildUndef(S16).getReg(0));
5740 Reg = B.buildBuildVector(LLT::fixed_vector(6, S16), PackedRegs).getReg(0);
5741 return B.buildBitcast(LLT::fixed_vector(3, S32), Reg).getReg(0);
5742 }
5743
5744 if (StoreVT.getNumElements() == 4) {
5745 SmallVector<Register, 4> PackedRegs;
5746 Reg = B.buildBitcast(LLT::fixed_vector(2, S32), Reg).getReg(0);
5747 auto Unmerge = B.buildUnmerge(S32, Reg);
5748 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
5749 PackedRegs.push_back(Unmerge.getReg(I));
5750 PackedRegs.resize(4, B.buildUndef(S32).getReg(0));
5751 return B.buildBuildVector(LLT::fixed_vector(4, S32), PackedRegs)
5752 .getReg(0);
5753 }
5754
5755 llvm_unreachable("invalid data type");
5756 }
5757
5758 if (StoreVT == LLT::fixed_vector(3, S16)) {
5759 Reg = B.buildPadVectorWithUndefElements(LLT::fixed_vector(4, S16), Reg)
5760 .getReg(0);
5761 }
5762 return Reg;
5763}
5764
5766 MachineIRBuilder &B, Register VData, bool IsFormat) const {
5767 MachineRegisterInfo *MRI = B.getMRI();
5768 LLT Ty = MRI->getType(VData);
5769
5770 const LLT S16 = LLT::scalar(16);
5771
5772 // Fix up buffer resources themselves, which need to be cast to v4i32.
5774 return castBufferRsrcToV4I32(VData, B);
5775
5776 // Fixup illegal register types for i8 stores.
5777 if (Ty == LLT::scalar(8) || Ty == S16) {
5778 Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0);
5779 return AnyExt;
5780 }
5781
5782 if (Ty.isVector()) {
5783 if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) {
5784 if (IsFormat)
5785 return handleD16VData(B, *MRI, VData);
5786 }
5787 }
5788
5789 return VData;
5790}
5791
5795 bool IsTyped,
5796 bool IsFormat) const {
5797 Register VData = MI.getOperand(1).getReg();
5798 LLT Ty = MRI.getType(VData);
5799 LLT EltTy = Ty.getScalarType();
5800 const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
5801 const LLT S32 = LLT::scalar(32);
5802
5803 VData = fixStoreSourceType(B, VData, IsFormat);
5805 Register RSrc = MI.getOperand(2).getReg();
5806
5807 MachineMemOperand *MMO = *MI.memoperands_begin();
5808 const int MemSize = MMO->getSize().getValue();
5809
5810 unsigned ImmOffset;
5811
5812 // The typed intrinsics add an immediate after the registers.
5813 const unsigned NumVIndexOps = IsTyped ? 8 : 7;
5814
5815 // The struct intrinsic variants add one additional operand over raw.
5816 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
5817 Register VIndex;
5818 int OpOffset = 0;
5819 if (HasVIndex) {
5820 VIndex = MI.getOperand(3).getReg();
5821 OpOffset = 1;
5822 } else {
5823 VIndex = B.buildConstant(S32, 0).getReg(0);
5824 }
5825
5826 Register VOffset = MI.getOperand(3 + OpOffset).getReg();
5827 Register SOffset = MI.getOperand(4 + OpOffset).getReg();
5828
5829 unsigned Format = 0;
5830 if (IsTyped) {
5831 Format = MI.getOperand(5 + OpOffset).getImm();
5832 ++OpOffset;
5833 }
5834
5835 unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
5836
5837 std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset);
5838
5839 unsigned Opc;
5840 if (IsTyped) {
5841 Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16 :
5842 AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT;
5843 } else if (IsFormat) {
5844 Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16 :
5845 AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT;
5846 } else {
5847 switch (MemSize) {
5848 case 1:
5849 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE;
5850 break;
5851 case 2:
5852 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT;
5853 break;
5854 default:
5855 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE;
5856 break;
5857 }
5858 }
5859
5860 auto MIB = B.buildInstr(Opc)
5861 .addUse(VData) // vdata
5862 .addUse(RSrc) // rsrc
5863 .addUse(VIndex) // vindex
5864 .addUse(VOffset) // voffset
5865 .addUse(SOffset) // soffset
5866 .addImm(ImmOffset); // offset(imm)
5867
5868 if (IsTyped)
5869 MIB.addImm(Format);
5870
5871 MIB.addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm)
5872 .addImm(HasVIndex ? -1 : 0) // idxen(imm)
5873 .addMemOperand(MMO);
5874
5875 MI.eraseFromParent();
5876 return true;
5877}
5878
5879static void buildBufferLoad(unsigned Opc, Register LoadDstReg, Register RSrc,
5880 Register VIndex, Register VOffset, Register SOffset,
5881 unsigned ImmOffset, unsigned Format,
5882 unsigned AuxiliaryData, MachineMemOperand *MMO,
5883 bool IsTyped, bool HasVIndex, MachineIRBuilder &B) {
5884 auto MIB = B.buildInstr(Opc)
5885 .addDef(LoadDstReg) // vdata
5886 .addUse(RSrc) // rsrc
5887 .addUse(VIndex) // vindex
5888 .addUse(VOffset) // voffset
5889 .addUse(SOffset) // soffset
5890 .addImm(ImmOffset); // offset(imm)
5891
5892 if (IsTyped)
5893 MIB.addImm(Format);
5894
5895 MIB.addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm)
5896 .addImm(HasVIndex ? -1 : 0) // idxen(imm)
5897 .addMemOperand(MMO);
5898}
5899
5903 bool IsFormat,
5904 bool IsTyped) const {
5905 // FIXME: Verifier should enforce 1 MMO for these intrinsics.
5906 MachineMemOperand *MMO = *MI.memoperands_begin();
5907 const LLT MemTy = MMO->getMemoryType();
5908 const LLT S32 = LLT::scalar(32);
5909
5910 Register Dst = MI.getOperand(0).getReg();
5911
5912 Register StatusDst;
5913 int OpOffset = 0;
5914 assert(MI.getNumExplicitDefs() == 1 || MI.getNumExplicitDefs() == 2);
5915 bool IsTFE = MI.getNumExplicitDefs() == 2;
5916 if (IsTFE) {
5917 StatusDst = MI.getOperand(1).getReg();
5918 ++OpOffset;
5919 }
5920
5921 castBufferRsrcArgToV4I32(MI, B, 2 + OpOffset);
5922 Register RSrc = MI.getOperand(2 + OpOffset).getReg();
5923
5924 // The typed intrinsics add an immediate after the registers.
5925 const unsigned NumVIndexOps = IsTyped ? 8 : 7;
5926
5927 // The struct intrinsic variants add one additional operand over raw.
5928 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps + OpOffset;
5929 Register VIndex;
5930 if (HasVIndex) {
5931 VIndex = MI.getOperand(3 + OpOffset).getReg();
5932 ++OpOffset;
5933 } else {
5934 VIndex = B.buildConstant(S32, 0).getReg(0);
5935 }
5936
5937 Register VOffset = MI.getOperand(3 + OpOffset).getReg();
5938 Register SOffset = MI.getOperand(4 + OpOffset).getReg();
5939
5940 unsigned Format = 0;
5941 if (IsTyped) {
5942 Format = MI.getOperand(5 + OpOffset).getImm();
5943 ++OpOffset;
5944 }
5945
5946 unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
5947 unsigned ImmOffset;
5948
5949 LLT Ty = MRI.getType(Dst);
5950 // Turn loads of addrspace 8 pointers into 4 x s32 loads here, so the rest of
5951 // the logic doesn't have to handle that case.
5952 if (hasBufferRsrcWorkaround(Ty)) {
5953 Ty = castBufferRsrcFromV4I32(MI, B, MRI, 0);
5954 Dst = MI.getOperand(0).getReg();
5955 }
5956 LLT EltTy = Ty.getScalarType();
5957 const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
5958 const bool Unpacked = ST.hasUnpackedD16VMem();
5959
5960 std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset);
5961
5962 unsigned Opc;
5963
5964 // TODO: Support TFE for typed and narrow loads.
5965 if (IsTyped) {
5966 if (IsTFE)
5967 return false;
5968 Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 :
5969 AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT;
5970 } else if (IsFormat) {
5971 if (IsD16) {
5972 if (IsTFE)
5973 return false;
5974 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16;
5975 } else {
5976 Opc = IsTFE ? AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_TFE
5977 : AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT;
5978 }
5979 } else {
5980 switch (MemTy.getSizeInBits()) {
5981 case 8:
5982 Opc = IsTFE ? AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE_TFE
5983 : AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE;
5984 break;
5985 case 16:
5986 Opc = IsTFE ? AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT_TFE
5987 : AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT;
5988 break;
5989 default:
5990 Opc = IsTFE ? AMDGPU::G_AMDGPU_BUFFER_LOAD_TFE
5991 : AMDGPU::G_AMDGPU_BUFFER_LOAD;
5992 break;
5993 }
5994 }
5995
5996 if (IsTFE) {
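// With TFE the result includes one extra status dword after the loaded data,
// so load NumValueDWords + 1 dwords and unmerge the trailing dword into
// StatusDst.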
5997 unsigned NumValueDWords = divideCeil(Ty.getSizeInBits(), 32);
5998 unsigned NumLoadDWords = NumValueDWords + 1;
5999 LLT LoadTy = LLT::fixed_vector(NumLoadDWords, S32);
6000 Register LoadDstReg = B.getMRI()->createGenericVirtualRegister(LoadTy);
6001 buildBufferLoad(Opc, LoadDstReg, RSrc, VIndex, VOffset, SOffset, ImmOffset,
6002 Format, AuxiliaryData, MMO, IsTyped, HasVIndex, B);
6003 if (MemTy.getSizeInBits() < 32) {
6004 Register ExtDst = B.getMRI()->createGenericVirtualRegister(S32);
6005 B.buildUnmerge({ExtDst, StatusDst}, LoadDstReg);
6006 B.buildTrunc(Dst, ExtDst);
6007 } else if (NumValueDWords == 1) {
6008 B.buildUnmerge({Dst, StatusDst}, LoadDstReg);
6009 } else {
6010 SmallVector<Register, 5> LoadElts;
6011 for (unsigned I = 0; I != NumValueDWords; ++I)
6012 LoadElts.push_back(B.getMRI()->createGenericVirtualRegister(S32));
6013 LoadElts.push_back(StatusDst);
6014 B.buildUnmerge(LoadElts, LoadDstReg);
6015 LoadElts.truncate(NumValueDWords);
6016 B.buildMergeLikeInstr(Dst, LoadElts);
6017 }
6018 } else if ((!IsD16 && MemTy.getSizeInBits() < 32) ||
6019 (IsD16 && !Ty.isVector())) {
6020 Register LoadDstReg = B.getMRI()->createGenericVirtualRegister(S32);
6021 buildBufferLoad(Opc, LoadDstReg, RSrc, VIndex, VOffset, SOffset, ImmOffset,
6022 Format, AuxiliaryData, MMO, IsTyped, HasVIndex, B);
6023 B.setInsertPt(B.getMBB(), ++B.getInsertPt());
6024 B.buildTrunc(Dst, LoadDstReg);
6025 } else if (Unpacked && IsD16 && Ty.isVector()) {
6026 LLT UnpackedTy = Ty.changeElementSize(32);
6027 Register LoadDstReg = B.getMRI()->createGenericVirtualRegister(UnpackedTy);
6028 buildBufferLoad(Opc, LoadDstReg, RSrc, VIndex, VOffset, SOffset, ImmOffset,
6029 Format, AuxiliaryData, MMO, IsTyped, HasVIndex, B);
6030 B.setInsertPt(B.getMBB(), ++B.getInsertPt());
6031 // FIXME: G_TRUNC should work, but legalization currently fails
6032 auto Unmerge = B.buildUnmerge(S32, LoadDstReg);
6034 for (unsigned I = 0, N = Unmerge->getNumOperands() - 1; I != N; ++I)
6035 Repack.push_back(B.buildTrunc(EltTy, Unmerge.getReg(I)).getReg(0));
6036 B.buildMergeLikeInstr(Dst, Repack);
6037 } else {
6038 buildBufferLoad(Opc, Dst, RSrc, VIndex, VOffset, SOffset, ImmOffset, Format,
6039 AuxiliaryData, MMO, IsTyped, HasVIndex, B);
6040 }
6041
6042 MI.eraseFromParent();
6043 return true;
6044}
6045
6046static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) {
6047 switch (IntrID) {
6048 case Intrinsic::amdgcn_raw_buffer_atomic_swap:
6049 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_swap:
6050 case Intrinsic::amdgcn_struct_buffer_atomic_swap:
6051 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_swap:
6052 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP;
6053 case Intrinsic::amdgcn_raw_buffer_atomic_add:
6054 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_add:
6055 case Intrinsic::amdgcn_struct_buffer_atomic_add:
6056 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_add:
6057 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD;
6058 case Intrinsic::amdgcn_raw_buffer_atomic_sub:
6059 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub:
6060 case Intrinsic::amdgcn_struct_buffer_atomic_sub:
6061 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub:
6062 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB;
6063 case Intrinsic::amdgcn_raw_buffer_atomic_smin:
6064 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smin:
6065 case Intrinsic::amdgcn_struct_buffer_atomic_smin:
6066 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smin:
6067 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN;
6068 case Intrinsic::amdgcn_raw_buffer_atomic_umin:
6069 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umin:
6070 case Intrinsic::amdgcn_struct_buffer_atomic_umin:
6071 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umin:
6072 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN;
6073 case Intrinsic::amdgcn_raw_buffer_atomic_smax:
6074 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smax:
6075 case Intrinsic::amdgcn_struct_buffer_atomic_smax:
6076 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smax:
6077 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX;
6078 case Intrinsic::amdgcn_raw_buffer_atomic_umax:
6079 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umax:
6080 case Intrinsic::amdgcn_struct_buffer_atomic_umax:
6081 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umax:
6082 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX;
6083 case Intrinsic::amdgcn_raw_buffer_atomic_and:
6084 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_and:
6085 case Intrinsic::amdgcn_struct_buffer_atomic_and:
6086 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_and:
6087 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND;
6088 case Intrinsic::amdgcn_raw_buffer_atomic_or:
6089 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_or:
6090 case Intrinsic::amdgcn_struct_buffer_atomic_or:
6091 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_or:
6092 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR;
6093 case Intrinsic::amdgcn_raw_buffer_atomic_xor:
6094 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_xor:
6095 case Intrinsic::amdgcn_struct_buffer_atomic_xor:
6096 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_xor:
6097 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR;
6098 case Intrinsic::amdgcn_raw_buffer_atomic_inc:
6099 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_inc:
6100 case Intrinsic::amdgcn_struct_buffer_atomic_inc:
6101 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_inc:
6102 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC;
6103 case Intrinsic::amdgcn_raw_buffer_atomic_dec:
6104 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_dec:
6105 case Intrinsic::amdgcn_struct_buffer_atomic_dec:
6106 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_dec:
6107 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC;
6108 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
6109 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap:
6110 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
6111 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cmpswap:
6112 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP;
6113 case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
6114 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fadd:
6115 case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
6116 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fadd:
6117 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD;
6118 case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
6119 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmin:
6120 case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
6121 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmin:
6122 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN;
6123 case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
6124 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmax:
6125 case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
6126 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmax:
6127 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX;
6128 case Intrinsic::amdgcn_raw_buffer_atomic_cond_sub_u32:
6129 case Intrinsic::amdgcn_struct_buffer_atomic_cond_sub_u32:
6130 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_COND_SUB_U32;
6131 default:
6132 llvm_unreachable("unhandled atomic opcode");
6133 }
6134}
6135
6138 Intrinsic::ID IID) const {
6139 const bool IsCmpSwap =
6140 IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap ||
6141 IID == Intrinsic::amdgcn_struct_buffer_atomic_cmpswap ||
6142 IID == Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap ||
6143 IID == Intrinsic::amdgcn_struct_ptr_buffer_atomic_cmpswap;
6144
6145 Register Dst = MI.getOperand(0).getReg();
6146 // Since we don't have 128-bit atomics, we don't need to handle the case of
6147 // p8 arguments to the atomic itself.
6148 Register VData = MI.getOperand(2).getReg();
6149
6150 Register CmpVal;
6151 int OpOffset = 0;
6152
6153 if (IsCmpSwap) {
6154 CmpVal = MI.getOperand(3).getReg();
6155 ++OpOffset;
6156 }
6157
6158 castBufferRsrcArgToV4I32(MI, B, 3 + OpOffset);
6159 Register RSrc = MI.getOperand(3 + OpOffset).getReg();
6160 const unsigned NumVIndexOps = IsCmpSwap ? 9 : 8;
6161
6162 // The struct intrinsic variants add one additional operand over raw.
6163 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
6164 Register VIndex;
6165 if (HasVIndex) {
6166 VIndex = MI.getOperand(4 + OpOffset).getReg();
6167 ++OpOffset;
6168 } else {
6169 VIndex = B.buildConstant(LLT::scalar(32), 0).getReg(0);
6170 }
6171
6172 Register VOffset = MI.getOperand(4 + OpOffset).getReg();
6173 Register SOffset = MI.getOperand(5 + OpOffset).getReg();
6174 unsigned AuxiliaryData = MI.getOperand(6 + OpOffset).getImm();
6175
6176 MachineMemOperand *MMO = *MI.memoperands_begin();
6177
6178 unsigned ImmOffset;
6179 std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset);
6180
6181 auto MIB = B.buildInstr(getBufferAtomicPseudo(IID))
6182 .addDef(Dst)
6183 .addUse(VData); // vdata
6184
6185 if (IsCmpSwap)
6186 MIB.addReg(CmpVal);
6187
6188 MIB.addUse(RSrc) // rsrc
6189 .addUse(VIndex) // vindex
6190 .addUse(VOffset) // voffset
6191 .addUse(SOffset) // soffset
6192 .addImm(ImmOffset) // offset(imm)
6193 .addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm)
6194 .addImm(HasVIndex ? -1 : 0) // idxen(imm)
6195 .addMemOperand(MMO);
6196
6197 MI.eraseFromParent();
6198 return true;
6199}
6200
6201/// Turn a set of s16 typed registers in \p AddrRegs into a dword sized
6202/// vector with s16 typed elements.
6204 SmallVectorImpl<Register> &PackedAddrs,
6205 unsigned ArgOffset,
6207 bool IsA16, bool IsG16) {
6208 const LLT S16 = LLT::scalar(16);
6209 const LLT V2S16 = LLT::fixed_vector(2, 16);
6210 auto EndIdx = Intr->VAddrEnd;
6211
6212 for (unsigned I = Intr->VAddrStart; I < EndIdx; I++) {
6213 MachineOperand &SrcOp = MI.getOperand(ArgOffset + I);
6214 if (!SrcOp.isReg())
6215 continue; // _L to _LZ may have eliminated this.
6216
6217 Register AddrReg = SrcOp.getReg();
6218
6219 if ((I < Intr->GradientStart) ||
6220 (I >= Intr->GradientStart && I < Intr->CoordStart && !IsG16) ||
6221 (I >= Intr->CoordStart && !IsA16)) {
6222 if ((I < Intr->GradientStart) && IsA16 &&
6223 (B.getMRI()->getType(AddrReg) == S16)) {
6224 assert(I == Intr->BiasIndex && "Got unexpected 16-bit extra argument");
6225 // Special handling of bias when A16 is on. Bias is of type half but
6226 // occupies full 32-bit.
6227 PackedAddrs.push_back(
6228 B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)})
6229 .getReg(0));
6230 } else {
6231 assert((!IsA16 || Intr->NumBiasArgs == 0 || I != Intr->BiasIndex) &&
6232 "Bias needs to be converted to 16 bit in A16 mode");
6233 // Handle any gradient or coordinate operands that should not be packed
6234 AddrReg = B.buildBitcast(V2S16, AddrReg).getReg(0);
6235 PackedAddrs.push_back(AddrReg);
6236 }
6237 } else {
6238 // Dz/dh, dz/dv and the last odd coord are packed with undef. Also, in 1D,
6239 // derivatives dx/dh and dx/dv are packed with undef.
6240 if (((I + 1) >= EndIdx) ||
6241 ((Intr->NumGradients / 2) % 2 == 1 &&
6242 (I == static_cast<unsigned>(Intr->GradientStart +
6243 (Intr->NumGradients / 2) - 1) ||
6244 I == static_cast<unsigned>(Intr->GradientStart +
6245 Intr->NumGradients - 1))) ||
6246 // Check for _L to _LZ optimization
6247 !MI.getOperand(ArgOffset + I + 1).isReg()) {
6248 PackedAddrs.push_back(
6249 B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)})
6250 .getReg(0));
6251 } else {
6252 PackedAddrs.push_back(
6253 B.buildBuildVector(
6254 V2S16, {AddrReg, MI.getOperand(ArgOffset + I + 1).getReg()})
6255 .getReg(0));
6256 ++I;
6257 }
6258 }
6259 }
6260}
6261
6262/// Convert from separate vaddr components to a single vector address register,
6263/// and replace the remaining operands with $noreg.
6265 int DimIdx, int NumVAddrs) {
6266 const LLT S32 = LLT::scalar(32);
6267 (void)S32;
6268 SmallVector<Register, 8> AddrRegs;
6269 for (int I = 0; I != NumVAddrs; ++I) {
6270 MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
6271 if (SrcOp.isReg()) {
6272 AddrRegs.push_back(SrcOp.getReg());
6273 assert(B.getMRI()->getType(SrcOp.getReg()) == S32);
6274 }
6275 }
6276
6277 int NumAddrRegs = AddrRegs.size();
6278 if (NumAddrRegs != 1) {
6279 auto VAddr =
6280 B.buildBuildVector(LLT::fixed_vector(NumAddrRegs, 32), AddrRegs);
6281 MI.getOperand(DimIdx).setReg(VAddr.getReg(0));
6282 }
6283
6284 for (int I = 1; I != NumVAddrs; ++I) {
6285 MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
6286 if (SrcOp.isReg())
6287 MI.getOperand(DimIdx + I).setReg(AMDGPU::NoRegister);
6288 }
6289}
6290
6291/// Rewrite image intrinsics to use register layouts expected by the subtarget.
6292///
6293/// Depending on the subtarget, load/store with 16-bit element data need to be
6294/// rewritten to use the low half of 32-bit registers, or directly use a packed
6295/// layout. 16-bit addresses should also sometimes be packed into 32-bit
6296/// registers.
6297///
6298/// We don't want to directly select image instructions just yet, but also want
6299 /// to expose all register repacking to the legalizer/combiners. We also don't
6300/// want a selected instruction entering RegBankSelect. In order to avoid
6301/// defining a multitude of intermediate image instructions, directly hack on
6302/// the intrinsic's arguments. In cases like a16 addresses, this requires
6303/// padding now unnecessary arguments with $noreg.
6306 const AMDGPU::ImageDimIntrinsicInfo *Intr) const {
6307
6308 const MachineFunction &MF = *MI.getMF();
6309 const unsigned NumDefs = MI.getNumExplicitDefs();
6310 const unsigned ArgOffset = NumDefs + 1;
6311 bool IsTFE = NumDefs == 2;
6312 // We are only processing the operands of d16 image operations on subtargets
6313 // that use the unpacked register layout, or need to repack the TFE result.
6314
6315 // TODO: Do we need to guard against already legalized intrinsics?
6316 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
6318
6319 MachineRegisterInfo *MRI = B.getMRI();
6320 const LLT S32 = LLT::scalar(32);
6321 const LLT S16 = LLT::scalar(16);
6322 const LLT V2S16 = LLT::fixed_vector(2, 16);
6323
6324 unsigned DMask = 0;
6325 Register VData = MI.getOperand(NumDefs == 0 ? 1 : 0).getReg();
6326 LLT Ty = MRI->getType(VData);
6327
6328 const bool IsAtomicPacked16Bit =
6329 (BaseOpcode->BaseOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_F16 ||
6330 BaseOpcode->BaseOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_BF16);
6331
6332 // Check for 16 bit addresses and pack if true.
6333 LLT GradTy =
6334 MRI->getType(MI.getOperand(ArgOffset + Intr->GradientStart).getReg());
6335 LLT AddrTy =
6336 MRI->getType(MI.getOperand(ArgOffset + Intr->CoordStart).getReg());
6337 const bool IsG16 =
6338 ST.hasG16() ? (BaseOpcode->Gradients && GradTy == S16) : GradTy == S16;
6339 const bool IsA16 = AddrTy == S16;
6340 const bool IsD16 = !IsAtomicPacked16Bit && Ty.getScalarType() == S16;
6341
6342 int DMaskLanes = 0;
6343 if (!BaseOpcode->Atomic) {
6344 DMask = MI.getOperand(ArgOffset + Intr->DMaskIndex).getImm();
6345 if (BaseOpcode->Gather4) {
6346 DMaskLanes = 4;
6347 } else if (DMask != 0) {
6348 DMaskLanes = llvm::popcount(DMask);
6349 } else if (!IsTFE && !BaseOpcode->Store) {
6350 // If dmask is 0, this is a no-op load. This can be eliminated.
6351 B.buildUndef(MI.getOperand(0));
6352 MI.eraseFromParent();
6353 return true;
6354 }
6355 }
6356
6357 Observer.changingInstr(MI);
6358 auto ChangedInstr = make_scope_exit([&] { Observer.changedInstr(MI); });
6359
6360 const unsigned StoreOpcode = IsD16 ? AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE_D16
6361 : AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE;
6362 const unsigned LoadOpcode = IsD16 ? AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_D16
6363 : AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD;
6364 unsigned NewOpcode = NumDefs == 0 ? StoreOpcode : LoadOpcode;
6365
6366 // Track that we legalized this
6367 MI.setDesc(B.getTII().get(NewOpcode));
6368
6369 // We expect to get an error flag since TFE is on and dmask is 0. Force dmask
6370 // to be at least 1, otherwise the instruction will fail.
6371 if (IsTFE && DMask == 0) {
6372 DMask = 0x1;
6373 DMaskLanes = 1;
6374 MI.getOperand(ArgOffset + Intr->DMaskIndex).setImm(DMask);
6375 }
6376
6377 if (BaseOpcode->Atomic) {
6378 Register VData0 = MI.getOperand(2).getReg();
6379 LLT Ty = MRI->getType(VData0);
6380
6381 // TODO: Allow atomic swap and bit ops for v2s16/v4s16
6382 if (Ty.isVector() && !IsAtomicPacked16Bit)
6383 return false;
6384
6385 if (BaseOpcode->AtomicX2) {
6386 Register VData1 = MI.getOperand(3).getReg();
6387 // The two values are packed in one register.
6388 LLT PackedTy = LLT::fixed_vector(2, Ty);
6389 auto Concat = B.buildBuildVector(PackedTy, {VData0, VData1});
6390 MI.getOperand(2).setReg(Concat.getReg(0));
6391 MI.getOperand(3).setReg(AMDGPU::NoRegister);
6392 }
6393 }
6394
6395 unsigned CorrectedNumVAddrs = Intr->NumVAddrs;
6396
6397 // Rewrite the addressing register layout before doing anything else.
6398 if (BaseOpcode->Gradients && !ST.hasG16() && (IsA16 != IsG16)) {
6399 // 16 bit gradients are supported, but are tied to the A16 control
6400 // so both gradients and addresses must be 16 bit
6401 return false;
6402 }
6403
6404 if (IsA16 && !ST.hasA16()) {
6405 // A16 not supported
6406 return false;
6407 }
6408
6409 const unsigned NSAMaxSize = ST.getNSAMaxSize(BaseOpcode->Sampler);
6410 const unsigned HasPartialNSA = ST.hasPartialNSAEncoding();
6411
6412 if (IsA16 || IsG16) {
6413 // Even if NumVAddrs == 1 we should pack it into a 32-bit value, because the
6414 // instructions expect VGPR_32
6415 SmallVector<Register, 4> PackedRegs;
6416
6417 packImage16bitOpsToDwords(B, MI, PackedRegs, ArgOffset, Intr, IsA16, IsG16);
6418
6419 // See also below in the non-a16 branch
6420 const bool UseNSA = ST.hasNSAEncoding() &&
6421 PackedRegs.size() >= ST.getNSAThreshold(MF) &&
6422 (PackedRegs.size() <= NSAMaxSize || HasPartialNSA);
6423 const bool UsePartialNSA =
6424 UseNSA && HasPartialNSA && PackedRegs.size() > NSAMaxSize;
6425
6426 if (UsePartialNSA) {
6427 // Pack registers that would go over NSAMaxSize into last VAddr register
6428 LLT PackedAddrTy =
6429 LLT::fixed_vector(2 * (PackedRegs.size() - NSAMaxSize + 1), 16);
6430 auto Concat = B.buildConcatVectors(
6431 PackedAddrTy, ArrayRef(PackedRegs).slice(NSAMaxSize - 1));
6432 PackedRegs[NSAMaxSize - 1] = Concat.getReg(0);
6433 PackedRegs.resize(NSAMaxSize);
6434 } else if (!UseNSA && PackedRegs.size() > 1) {
6435 LLT PackedAddrTy = LLT::fixed_vector(2 * PackedRegs.size(), 16);
6436 auto Concat = B.buildConcatVectors(PackedAddrTy, PackedRegs);
6437 PackedRegs[0] = Concat.getReg(0);
6438 PackedRegs.resize(1);
6439 }
6440
6441 const unsigned NumPacked = PackedRegs.size();
6442 for (unsigned I = Intr->VAddrStart; I < Intr->VAddrEnd; I++) {
6443 MachineOperand &SrcOp = MI.getOperand(ArgOffset + I);
6444 if (!SrcOp.isReg()) {
6445 assert(SrcOp.isImm() && SrcOp.getImm() == 0);
6446 continue;
6447 }
6448
6449 assert(SrcOp.getReg() != AMDGPU::NoRegister);
6450
6451 if (I - Intr->VAddrStart < NumPacked)
6452 SrcOp.setReg(PackedRegs[I - Intr->VAddrStart]);
6453 else
6454 SrcOp.setReg(AMDGPU::NoRegister);
6455 }
6456 } else {
6457 // If the register allocator cannot place the address registers contiguously
6458 // without introducing moves, then using the non-sequential address encoding
6459 // is always preferable, since it saves VALU instructions and is usually a
6460 // wash in terms of code size or even better.
6461 //
6462 // However, we currently have no way of hinting to the register allocator
6463 // that MIMG addresses should be placed contiguously when it is possible to
6464 // do so, so force non-NSA for the common 2-address case as a heuristic.
6465 //
6466 // SIShrinkInstructions will convert NSA encodings to non-NSA after register
6467 // allocation when possible.
6468 //
6469 // Partial NSA is allowed on GFX11+ where the final register is a contiguous
6470 // set of the remaining addresses.
6471 const bool UseNSA = ST.hasNSAEncoding() &&
6472 CorrectedNumVAddrs >= ST.getNSAThreshold(MF) &&
6473 (CorrectedNumVAddrs <= NSAMaxSize || HasPartialNSA);
6474 const bool UsePartialNSA =
6475 UseNSA && HasPartialNSA && CorrectedNumVAddrs > NSAMaxSize;
6476
6477 if (UsePartialNSA) {
6479 ArgOffset + Intr->VAddrStart + NSAMaxSize - 1,
6480 Intr->NumVAddrs - NSAMaxSize + 1);
6481 } else if (!UseNSA && Intr->NumVAddrs > 1) {
6482 convertImageAddrToPacked(B, MI, ArgOffset + Intr->VAddrStart,
6483 Intr->NumVAddrs);
6484 }
6485 }
6486
6487 int Flags = 0;
6488 if (IsA16)
6489 Flags |= 1;
6490 if (IsG16)
6491 Flags |= 2;
6492 MI.addOperand(MachineOperand::CreateImm(Flags));
6493
6494 if (BaseOpcode->Store) { // No TFE for stores?
6495 // TODO: Handle dmask trim
6496 if (!Ty.isVector() || !IsD16)
6497 return true;
6498
6499 Register RepackedReg = handleD16VData(B, *MRI, VData, true);
6500 if (RepackedReg != VData) {
6501 MI.getOperand(1).setReg(RepackedReg);
6502 }
6503
6504 return true;
6505 }
6506
6507 Register DstReg = MI.getOperand(0).getReg();
6508 const LLT EltTy = Ty.getScalarType();
6509 const int NumElts = Ty.isVector() ? Ty.getNumElements() : 1;
6510
6511 // Confirm that the return type is large enough for the dmask specified
6512 if (NumElts < DMaskLanes)
6513 return false;
6514
6515 if (NumElts > 4 || DMaskLanes > 4)
6516 return false;
6517
6518 // Image atomic instructions use DMask to specify how many bits of
6519 // input/output data they have: 32 bits (s32, v2s16) or 64 bits (s64, v4s16).
6520 // DMaskLanes for image atomics has a default value of '0'.
6521 // We must make sure that atomic variants (especially packed ones) are not
6522 // truncated from v2s16 or v4s16 to s16.
6523 //
6524 // ChangeElementCount will be needed for image load where Ty is always scalar.
6525 const unsigned AdjustedNumElts = DMaskLanes == 0 ? 1 : DMaskLanes;
6526 const LLT AdjustedTy =
6527 DMaskLanes == 0
6528 ? Ty
6529 : Ty.changeElementCount(ElementCount::getFixed(AdjustedNumElts));
6530
6531 // The raw dword aligned data component of the load. The only legal cases
6532 // where this matters should be when using the packed D16 format, for
6533 // s16 -> <2 x s16>, and <3 x s16> -> <4 x s16>.
6534 LLT RoundedTy;
6535
6536 // S32 vector to cover all data, plus TFE result element.
6537 LLT TFETy;
6538
6539 // Register type to use for each loaded component. Will be S32 or V2S16.
6540 LLT RegTy;
6541
6542 if (IsD16 && ST.hasUnpackedD16VMem()) {
6543 RoundedTy =
6544 LLT::scalarOrVector(ElementCount::getFixed(AdjustedNumElts), 32);
6545 TFETy = LLT::fixed_vector(AdjustedNumElts + 1, 32);
6546 RegTy = S32;
6547 } else {
6548 unsigned EltSize = EltTy.getSizeInBits();
6549 unsigned RoundedElts = (AdjustedTy.getSizeInBits() + 31) / 32;
6550 unsigned RoundedSize = 32 * RoundedElts;
6551 RoundedTy = LLT::scalarOrVector(
6552 ElementCount::getFixed(RoundedSize / EltSize), EltSize);
6553 TFETy = LLT::fixed_vector(RoundedSize / 32 + 1, S32);
6554 RegTy = !IsTFE && EltSize == 16 ? V2S16 : S32;
6555 }
6556
6557 // The return type does not need adjustment.
6558 // TODO: Should we change s16 case to s32 or <2 x s16>?
6559 if (!IsTFE && (RoundedTy == Ty || !Ty.isVector()))
6560 return true;
6561
6562 Register Dst1Reg;
6563
6564 // Insert after the instruction.
6565 B.setInsertPt(*MI.getParent(), ++MI.getIterator());
6566
6567 // TODO: For TFE with d16, if we used a TFE type that was a multiple of <2 x
6568 // s16> instead of s32, we would only need 1 bitcast instead of multiple.
6569 const LLT LoadResultTy = IsTFE ? TFETy : RoundedTy;
6570 const int ResultNumRegs = LoadResultTy.getSizeInBits() / 32;
6571
6572 Register NewResultReg = MRI->createGenericVirtualRegister(LoadResultTy);
6573
6574 MI.getOperand(0).setReg(NewResultReg);
6575
6576 // In the IR, TFE is supposed to be used with a 2 element struct return
6577 // type. The instruction really returns these two values in one contiguous
6578 // register, with one additional dword beyond the loaded data. Rewrite the
6579 // return type to use a single register result.
6580
6581 if (IsTFE) {
6582 Dst1Reg = MI.getOperand(1).getReg();
6583 if (MRI->getType(Dst1Reg) != S32)
6584 return false;
6585
6586 // TODO: Make sure the TFE operand bit is set.
6587 MI.removeOperand(1);
6588
6589 // Handle the easy case that requires no repack instructions.
6590 if (Ty == S32) {
6591 B.buildUnmerge({DstReg, Dst1Reg}, NewResultReg);
6592 return true;
6593 }
6594 }
6595
6596 // Now figure out how to copy the new result register back into the old
6597 // result.
6598 SmallVector<Register, 5> ResultRegs(ResultNumRegs, Dst1Reg);
6599
6600 const int NumDataRegs = IsTFE ? ResultNumRegs - 1 : ResultNumRegs;
6601
6602 if (ResultNumRegs == 1) {
6603 assert(!IsTFE);
6604 ResultRegs[0] = NewResultReg;
6605 } else {
6606 // We have to repack into a new vector of some kind.
6607 for (int I = 0; I != NumDataRegs; ++I)
6608 ResultRegs[I] = MRI->createGenericVirtualRegister(RegTy);
6609 B.buildUnmerge(ResultRegs, NewResultReg);
6610
6611 // Drop the final TFE element to get the data part. The TFE result is
6612 // directly written to the right place already.
6613 if (IsTFE)
6614 ResultRegs.resize(NumDataRegs);
6615 }
6616
6617 // For an s16 scalar result, we form an s32 result with a truncate regardless
6618 // of packed vs. unpacked.
6619 if (IsD16 && !Ty.isVector()) {
6620 B.buildTrunc(DstReg, ResultRegs[0]);
6621 return true;
6622 }
6623
6624 // Avoid a build/concat_vector of 1 entry.
6625 if (Ty == V2S16 && NumDataRegs == 1 && !ST.hasUnpackedD16VMem()) {
6626 B.buildBitcast(DstReg, ResultRegs[0]);
6627 return true;
6628 }
6629
6630 assert(Ty.isVector());
6631
6632 if (IsD16) {
6633 // For packed D16 results with TFE enabled, all the data components are
6634 // S32. Cast back to the expected type.
6635 //
6636 // TODO: We don't really need to load s32 elements. We would only need one
6637 // cast for the TFE result if a multiple of v2s16 were used.
6638 if (RegTy != V2S16 && !ST.hasUnpackedD16VMem()) {
6639 for (Register &Reg : ResultRegs)
6640 Reg = B.buildBitcast(V2S16, Reg).getReg(0);
6641 } else if (ST.hasUnpackedD16VMem()) {
6642 for (Register &Reg : ResultRegs)
6643 Reg = B.buildTrunc(S16, Reg).getReg(0);
6644 }
6645 }
6646
6647 auto padWithUndef = [&](LLT Ty, int NumElts) {
6648 if (NumElts == 0)
6649 return;
6650 Register Undef = B.buildUndef(Ty).getReg(0);
6651 for (int I = 0; I != NumElts; ++I)
6652 ResultRegs.push_back(Undef);
6653 };
6654
6655 // Pad out any elements eliminated due to the dmask.
6656 LLT ResTy = MRI->getType(ResultRegs[0]);
6657 if (!ResTy.isVector()) {
6658 padWithUndef(ResTy, NumElts - ResultRegs.size());
6659 B.buildBuildVector(DstReg, ResultRegs);
6660 return true;
6661 }
6662
6663 assert(!ST.hasUnpackedD16VMem() && ResTy == V2S16);
6664 const int RegsToCover = (Ty.getSizeInBits() + 31) / 32;
6665
6666 // Deal with the one annoying legal case.
6667 const LLT V3S16 = LLT::fixed_vector(3, 16);
6668 if (Ty == V3S16) {
6669 if (IsTFE) {
6670 if (ResultRegs.size() == 1) {
6671 NewResultReg = ResultRegs[0];
6672 } else if (ResultRegs.size() == 2) {
6673 LLT V4S16 = LLT::fixed_vector(4, 16);
6674 NewResultReg = B.buildConcatVectors(V4S16, ResultRegs).getReg(0);
6675 } else {
6676 return false;
6677 }
6678 }
6679
6680 if (MRI->getType(DstReg).getNumElements() <
6681 MRI->getType(NewResultReg).getNumElements()) {
6682 B.buildDeleteTrailingVectorElements(DstReg, NewResultReg);
6683 } else {
6684 B.buildPadVectorWithUndefElements(DstReg, NewResultReg);
6685 }
6686 return true;
6687 }
6688
6689 padWithUndef(ResTy, RegsToCover - ResultRegs.size());
6690 B.buildConcatVectors(DstReg, ResultRegs);
6691 return true;
6692}
6693
6695 MachineInstr &MI) const {
6696 MachineIRBuilder &B = Helper.MIRBuilder;
6697 GISelChangeObserver &Observer = Helper.Observer;
6698
6699 Register OrigDst = MI.getOperand(0).getReg();
6700 Register Dst;
6701 LLT Ty = B.getMRI()->getType(OrigDst);
6702 unsigned Size = Ty.getSizeInBits();
6703 MachineFunction &MF = B.getMF();
6704 unsigned Opc = 0;
6705 if (Size < 32 && ST.hasScalarSubwordLoads()) {
6706 assert(Size == 8 || Size == 16);
6707 Opc = Size == 8 ? AMDGPU::G_AMDGPU_S_BUFFER_LOAD_UBYTE
6708 : AMDGPU::G_AMDGPU_S_BUFFER_LOAD_USHORT;
6709 // The 8-bit and 16-bit scalar buffer load instructions have 32-bit
6710 // destination register.
6711 Dst = B.getMRI()->createGenericVirtualRegister(LLT::scalar(32));
6712 } else {
6713 Opc = AMDGPU::G_AMDGPU_S_BUFFER_LOAD;
6714 Dst = OrigDst;
6715 }
6716
6717 Observer.changingInstr(MI);
6718
6719 // Handle needing to s.buffer.load() a p8 value.
6720 if (hasBufferRsrcWorkaround(Ty)) {
6721 Ty = castBufferRsrcFromV4I32(MI, B, *B.getMRI(), 0);
6722 B.setInsertPt(B.getMBB(), MI);
6723 }
6725 Ty = getBitcastRegisterType(Ty);
6726 Helper.bitcastDst(MI, Ty, 0);
6727 B.setInsertPt(B.getMBB(), MI);
6728 }
6729
6730 // FIXME: We don't really need this intermediate instruction. The intrinsic
6731 // should be fixed to have a memory operand. Since it's readnone, we're not
6732 // allowed to add one.
6733 MI.setDesc(B.getTII().get(Opc));
6734 MI.removeOperand(1); // Remove intrinsic ID
6735
6736 // FIXME: When intrinsic definition is fixed, this should have an MMO already.
6737 const unsigned MemSize = (Size + 7) / 8;
6738 const Align MemAlign = B.getDataLayout().getABITypeAlign(
6744 MemSize, MemAlign);
6745 MI.addMemOperand(MF, MMO);
6746 if (Dst != OrigDst) {
6747 MI.getOperand(0).setReg(Dst);
6748 B.setInsertPt(B.getMBB(), ++B.getInsertPt());
6749 B.buildTrunc(OrigDst, Dst);
6750 }
6751
6752 // If we don't have 96-bit result scalar loads, widening to 128-bit should
6753 // always be legal. We may need to restore this to a 96-bit result if it turns
6754 // out this needs to be converted to a vector load during RegBankSelect.
6755 if (!isPowerOf2_32(Size) && (Size != 96 || !ST.hasScalarDwordx3Loads())) {
6756 if (Ty.isVector())
6758 else
6759 Helper.widenScalarDst(MI, getPow2ScalarType(Ty), 0);
6760 }
6761
6762 Observer.changedInstr(MI);
6763 return true;
6764}
6765
6766// TODO: Move to selection
6769 MachineIRBuilder &B) const {
6770 if (!ST.isTrapHandlerEnabled() ||
6772 return legalizeTrapEndpgm(MI, MRI, B);
6773
6774 return ST.supportsGetDoorbellID() ?
6776}
6777
6780 const DebugLoc &DL = MI.getDebugLoc();
6781 MachineBasicBlock &BB = B.getMBB();
6782 MachineFunction *MF = BB.getParent();
6783
6784 if (BB.succ_empty() && std::next(MI.getIterator()) == BB.end()) {
6785 BuildMI(BB, BB.end(), DL, B.getTII().get(AMDGPU::S_ENDPGM))
6786 .addImm(0);
6787 MI.eraseFromParent();
6788 return true;
6789 }
6790
6791 // We need a block split to make the real endpgm a terminator. We also don't
6792 // want to break phis in successor blocks, so we can't just delete to the
6793 // end of the block.
6794 BB.splitAt(MI, false /*UpdateLiveIns*/);
6796 MF->push_back(TrapBB);
6797 BuildMI(*TrapBB, TrapBB->end(), DL, B.getTII().get(AMDGPU::S_ENDPGM))
6798 .addImm(0);
6799 BuildMI(BB, &MI, DL, B.getTII().get(AMDGPU::S_CBRANCH_EXECNZ))
6800 .addMBB(TrapBB);
6801
6802 BB.addSuccessor(TrapBB);
6803 MI.eraseFromParent();
6804 return true;
6805}
6806
6809 MachineFunction &MF = B.getMF();
6810 const LLT S64 = LLT::scalar(64);
6811
6812 Register SGPR01(AMDGPU::SGPR0_SGPR1);
6813 // For code object version 5, queue_ptr is passed through implicit kernarg.
6819 ST.getTargetLowering()->getImplicitParameterOffset(B.getMF(), Param);
6820
6821 Register KernargPtrReg = MRI.createGenericVirtualRegister(
6823
6824 if (!loadInputValue(KernargPtrReg, B,
6826 return false;
6827
6828 // TODO: can we be smarter about machine pointer info?
6831 PtrInfo,
6835
6836 // Pointer address
6837 Register LoadAddr = MRI.createGenericVirtualRegister(
6839 B.buildPtrAdd(LoadAddr, KernargPtrReg,
6840 B.buildConstant(LLT::scalar(64), Offset).getReg(0));
6841 // Load address
6842 Register Temp = B.buildLoad(S64, LoadAddr, *MMO).getReg(0);
6843 B.buildCopy(SGPR01, Temp);
6844 B.buildInstr(AMDGPU::S_TRAP)
6845 .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap))
6846 .addReg(SGPR01, RegState::Implicit);
6847 MI.eraseFromParent();
6848 return true;
6849 }
6850
6851 // Pass queue pointer to trap handler as input, and insert trap instruction
6852 // Reference: https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi
6853 Register LiveIn =
6854 MRI.createGenericVirtualRegister(LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
6856 return false;
6857
6858 B.buildCopy(SGPR01, LiveIn);
6859 B.buildInstr(AMDGPU::S_TRAP)
6860 .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap))
6861 .addReg(SGPR01, RegState::Implicit);
6862
6863 MI.eraseFromParent();
6864 return true;
6865}
6866
6869 MachineIRBuilder &B) const {
6870 // We need to simulate the 's_trap 2' instruction on targets that run in
6871 // PRIV=1 (where it is treated as a nop).
6872 if (ST.hasPrivEnabledTrap2NopBug()) {
6873 ST.getInstrInfo()->insertSimulatedTrap(MRI, B.getMBB(), MI,
6874 MI.getDebugLoc());
6875 MI.eraseFromParent();
6876 return true;
6877 }
6878
6879 B.buildInstr(AMDGPU::S_TRAP)
6880 .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap));
6881 MI.eraseFromParent();
6882 return true;
6883}
6884
6885 bool AMDGPULegalizerInfo::legalizeDebugTrap(MachineInstr &MI,
6886 MachineRegisterInfo &MRI,
6887 MachineIRBuilder &B) const {
6888 // Is non-HSA path or trap-handler disabled? Then, report a warning
6889 // accordingly
6890 if (!ST.isTrapHandlerEnabled() ||
6891 ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA) {
6892 DiagnosticInfoUnsupported NoTrap(B.getMF().getFunction(),
6893 "debugtrap handler not supported",
6894 MI.getDebugLoc(), DS_Warning);
6895 LLVMContext &Ctx = B.getMF().getFunction().getContext();
6896 Ctx.diagnose(NoTrap);
6897 } else {
6898 // Insert debug-trap instruction
6899 B.buildInstr(AMDGPU::S_TRAP)
6900 .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSADebugTrap));
6901 }
6902
6903 MI.eraseFromParent();
6904 return true;
6905}
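// Sketch: with a usable trap handler the debugtrap reduces to a single
//   S_TRAP n   ; n = GCNSubtarget::TrapID::LLVMAMDHSADebugTrap
// otherwise only the "debugtrap handler not supported" warning above is
// emitted and the instruction is deleted.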
6906
6907 bool AMDGPULegalizerInfo::legalizeBVHIntrinsic(MachineInstr &MI,
6908 MachineIRBuilder &B) const {
6909 MachineRegisterInfo &MRI = *B.getMRI();
6910 const LLT S16 = LLT::scalar(16);
6911 const LLT S32 = LLT::scalar(32);
6912 const LLT V2S16 = LLT::fixed_vector(2, 16);
6913 const LLT V3S32 = LLT::fixed_vector(3, 32);
6914
6915 Register DstReg = MI.getOperand(0).getReg();
6916 Register NodePtr = MI.getOperand(2).getReg();
6917 Register RayExtent = MI.getOperand(3).getReg();
6918 Register RayOrigin = MI.getOperand(4).getReg();
6919 Register RayDir = MI.getOperand(5).getReg();
6920 Register RayInvDir = MI.getOperand(6).getReg();
6921 Register TDescr = MI.getOperand(7).getReg();
6922
6923 if (!ST.hasGFX10_AEncoding()) {
6924 DiagnosticInfoUnsupported BadIntrin(B.getMF().getFunction(),
6925 "intrinsic not supported on subtarget",
6926 MI.getDebugLoc());
6927 B.getMF().getFunction().getContext().diagnose(BadIntrin);
6928 return false;
6929 }
6930
6931 const bool IsGFX11 = AMDGPU::isGFX11(ST);
6932 const bool IsGFX11Plus = AMDGPU::isGFX11Plus(ST);
6933 const bool IsGFX12Plus = AMDGPU::isGFX12Plus(ST);
6934 const bool IsA16 = MRI.getType(RayDir).getElementType().getSizeInBits() == 16;
6935 const bool Is64 = MRI.getType(NodePtr).getSizeInBits() == 64;
6936 const unsigned NumVDataDwords = 4;
6937 const unsigned NumVAddrDwords = IsA16 ? (Is64 ? 9 : 8) : (Is64 ? 12 : 11);
6938 const unsigned NumVAddrs = IsGFX11Plus ? (IsA16 ? 4 : 5) : NumVAddrDwords;
6939 const bool UseNSA =
6940 IsGFX12Plus || (ST.hasNSAEncoding() && NumVAddrs <= ST.getNSAMaxSize());
6941
6942 const unsigned BaseOpcodes[2][2] = {
6943 {AMDGPU::IMAGE_BVH_INTERSECT_RAY, AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16},
6944 {AMDGPU::IMAGE_BVH64_INTERSECT_RAY,
6945 AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16}};
6946 int Opcode;
6947 if (UseNSA) {
6948 Opcode = AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16],
6949 IsGFX12Plus ? AMDGPU::MIMGEncGfx12
6950 : IsGFX11 ? AMDGPU::MIMGEncGfx11NSA
6951 : AMDGPU::MIMGEncGfx10NSA,
6952 NumVDataDwords, NumVAddrDwords);
6953 } else {
6954 assert(!IsGFX12Plus);
6955 Opcode = AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16],
6956 IsGFX11 ? AMDGPU::MIMGEncGfx11Default
6957 : AMDGPU::MIMGEncGfx10Default,
6958 NumVDataDwords, NumVAddrDwords);
6959 }
6960 assert(Opcode != -1);
6961
6962 SmallVector<Register, 12> Ops;
6963 if (UseNSA && IsGFX11Plus) {
6964 auto packLanes = [&Ops, &S32, &V3S32, &B](Register Src) {
6965 auto Unmerge = B.buildUnmerge({S32, S32, S32}, Src);
6966 auto Merged = B.buildMergeLikeInstr(
6967 V3S32, {Unmerge.getReg(0), Unmerge.getReg(1), Unmerge.getReg(2)});
6968 Ops.push_back(Merged.getReg(0));
6969 };
6970
6971 Ops.push_back(NodePtr);
6972 Ops.push_back(RayExtent);
6973 packLanes(RayOrigin);
6974
6975 if (IsA16) {
6976 auto UnmergeRayDir = B.buildUnmerge({S16, S16, S16}, RayDir);
6977 auto UnmergeRayInvDir = B.buildUnmerge({S16, S16, S16}, RayInvDir);
6978 auto MergedDir = B.buildMergeLikeInstr(
6979 V3S32,
6980 {B.buildBitcast(
6981 S32, B.buildMergeLikeInstr(V2S16, {UnmergeRayInvDir.getReg(0),
6982 UnmergeRayDir.getReg(0)}))
6983 .getReg(0),
6984 B.buildBitcast(
6985 S32, B.buildMergeLikeInstr(V2S16, {UnmergeRayInvDir.getReg(1),
6986 UnmergeRayDir.getReg(1)}))
6987 .getReg(0),
6988 B.buildBitcast(
6989 S32, B.buildMergeLikeInstr(V2S16, {UnmergeRayInvDir.getReg(2),
6990 UnmergeRayDir.getReg(2)}))
6991 .getReg(0)});
6992 Ops.push_back(MergedDir.getReg(0));
6993 } else {
6994 packLanes(RayDir);
6995 packLanes(RayInvDir);
6996 }
6997 } else {
6998 if (Is64) {
6999 auto Unmerge = B.buildUnmerge({S32, S32}, NodePtr);
7000 Ops.push_back(Unmerge.getReg(0));
7001 Ops.push_back(Unmerge.getReg(1));
7002 } else {
7003 Ops.push_back(NodePtr);
7004 }
7005 Ops.push_back(RayExtent);
7006
7007 auto packLanes = [&Ops, &S32, &B](Register Src) {
7008 auto Unmerge = B.buildUnmerge({S32, S32, S32}, Src);
7009 Ops.push_back(Unmerge.getReg(0));
7010 Ops.push_back(Unmerge.getReg(1));
7011 Ops.push_back(Unmerge.getReg(2));
7012 };
7013
7014 packLanes(RayOrigin);
7015 if (IsA16) {
7016 auto UnmergeRayDir = B.buildUnmerge({S16, S16, S16}, RayDir);
7017 auto UnmergeRayInvDir = B.buildUnmerge({S16, S16, S16}, RayInvDir);
7018 Register R1 = MRI.createGenericVirtualRegister(S32);
7019 Register R2 = MRI.createGenericVirtualRegister(S32);
7020 Register R3 = MRI.createGenericVirtualRegister(S32);
7021 B.buildMergeLikeInstr(R1,
7022 {UnmergeRayDir.getReg(0), UnmergeRayDir.getReg(1)});
7023 B.buildMergeLikeInstr(
7024 R2, {UnmergeRayDir.getReg(2), UnmergeRayInvDir.getReg(0)});
7025 B.buildMergeLikeInstr(
7026 R3, {UnmergeRayInvDir.getReg(1), UnmergeRayInvDir.getReg(2)});
7027 Ops.push_back(R1);
7028 Ops.push_back(R2);
7029 Ops.push_back(R3);
7030 } else {
7031 packLanes(RayDir);
7032 packLanes(RayInvDir);
7033 }
7034 }
7035
7036 if (!UseNSA) {
7037 // Build a single vector containing all the operands so far prepared.
7038 LLT OpTy = LLT::fixed_vector(Ops.size(), 32);
7039 Register MergedOps = B.buildMergeLikeInstr(OpTy, Ops).getReg(0);
7040 Ops.clear();
7041 Ops.push_back(MergedOps);
7042 }
7043
7044 auto MIB = B.buildInstr(AMDGPU::G_AMDGPU_INTRIN_BVH_INTERSECT_RAY)
7045 .addDef(DstReg)
7046 .addImm(Opcode);
7047
7048 for (Register R : Ops) {
7049 MIB.addUse(R);
7050 }
7051
7052 MIB.addUse(TDescr)
7053 .addImm(IsA16 ? 1 : 0)
7054 .cloneMemRefs(MI);
7055
7056 MI.eraseFromParent();
7057 return true;
7058}
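// Operand-shape summary for the ray intrinsic handled above (an informal
// restatement, not additional behavior): the result is always 4 dwords, and
// the address takes 8/9 dwords for a16 rays (32/64-bit node pointer) or 11/12
// dwords otherwise. With NSA on gfx11+ the node pointer, ray extent and the
// packed origin/dir/inv-dir vectors stay separate operands; without NSA they
// are first merged into a single <N x s32> register in the !UseNSA block.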
7059
7060 bool AMDGPULegalizerInfo::legalizeFPTruncRound(MachineInstr &MI,
7061 MachineIRBuilder &B) const {
7062 unsigned Opc;
7063 int RoundMode = MI.getOperand(2).getImm();
7064
7065 if (RoundMode == (int)RoundingMode::TowardPositive)
7066 Opc = AMDGPU::G_FPTRUNC_ROUND_UPWARD;
7067 else if (RoundMode == (int)RoundingMode::TowardNegative)
7068 Opc = AMDGPU::G_FPTRUNC_ROUND_DOWNWARD;
7069 else
7070 return false;
7071
7072 B.buildInstr(Opc)
7073 .addDef(MI.getOperand(0).getReg())
7074 .addUse(MI.getOperand(1).getReg());
7075
7076 MI.eraseFromParent();
7077
7078 return true;
7079}
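// Example mapping (sketch; operands shown informally, names are
// placeholders): a trunc with round-toward-positive, e.g.
//   %r:_(s16) = G_INTRINSIC_FPTRUNC_ROUND %x(s32), 2   ; TowardPositive
// is rewritten to the target pseudo
//   %r:_(s16) = G_FPTRUNC_ROUND_UPWARD %x(s32)
// and TowardNegative likewise selects G_FPTRUNC_ROUND_DOWNWARD; any other
// rounding mode makes this legalization fail.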
7080
7081 bool AMDGPULegalizerInfo::legalizeStackSave(MachineInstr &MI,
7082 MachineIRBuilder &B) const {
7083 const SITargetLowering *TLI = ST.getTargetLowering();
7084 Register StackPtr = TLI->getStackPointerRegisterToSaveRestore();
7085 Register DstReg = MI.getOperand(0).getReg();
7086 B.buildInstr(AMDGPU::G_AMDGPU_WAVE_ADDRESS, {DstReg}, {StackPtr});
7087 MI.eraseFromParent();
7088 return true;
7089}
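// Sketch: G_STACKSAVE becomes a wave-address read of whichever register the
// target lowering reports as the stack pointer, roughly
//   %dst:_(p5) = G_AMDGPU_WAVE_ADDRESS $sgpr32
// where $sgpr32 merely stands in for getStackPointerRegisterToSaveRestore()
// on the actual subtarget.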
7090
7091 bool AMDGPULegalizerInfo::legalizeWaveID(MachineInstr &MI,
7092 MachineIRBuilder &B) const {
7093 // With architected SGPRs, waveIDinGroup is in TTMP8[29:25].
7094 if (!ST.hasArchitectedSGPRs())
7095 return false;
7096 LLT S32 = LLT::scalar(32);
7097 Register DstReg = MI.getOperand(0).getReg();
7098 auto TTMP8 = B.buildCopy(S32, Register(AMDGPU::TTMP8));
7099 auto LSB = B.buildConstant(S32, 25);
7100 auto Width = B.buildConstant(S32, 5);
7101 B.buildUbfx(DstReg, TTMP8, LSB, Width);
7102 MI.eraseFromParent();
7103 return true;
7104}
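// Sketch of the expansion above, which extracts waveIDinGroup from
// TTMP8[29:25] (virtual register names are placeholders):
//   %ttmp8:_(s32)  = COPY $ttmp8
//   %lsb:_(s32)    = G_CONSTANT i32 25
//   %width:_(s32)  = G_CONSTANT i32 5
//   %waveid:_(s32) = G_UBFX %ttmp8, %lsb(s32), %width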
7105
7106static constexpr unsigned FPEnvModeBitField =
7107 AMDGPU::Hwreg::HwregEncoding::encode(AMDGPU::Hwreg::ID_MODE, 0, 23);
7108
7109static constexpr unsigned FPEnvTrapBitField =
7110 AMDGPU::Hwreg::HwregEncoding::encode(AMDGPU::Hwreg::ID_TRAPSTS, 0, 5);
7111
7112 bool AMDGPULegalizerInfo::legalizeGetFPEnv(MachineInstr &MI,
7113 MachineRegisterInfo &MRI,
7114 MachineIRBuilder &B) const {
7115 Register Src = MI.getOperand(0).getReg();
7116 if (MRI.getType(Src) != S64)
7117 return false;
7118
7119 auto ModeReg =
7120 B.buildIntrinsic(Intrinsic::amdgcn_s_getreg, {S32},
7121 /*HasSideEffects=*/true, /*isConvergent=*/false)
7122 .addImm(FPEnvModeBitField);
7123 auto TrapReg =
7124 B.buildIntrinsic(Intrinsic::amdgcn_s_getreg, {S32},
7125 /*HasSideEffects=*/true, /*isConvergent=*/false)
7126 .addImm(FPEnvTrapBitField);
7127 B.buildMergeLikeInstr(Src, {ModeReg, TrapReg});
7128 MI.eraseFromParent();
7129 return true;
7130}
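// Sketch (pseudo-notation for the intrinsic calls built above): the s64 FP
// environment value is assembled from two s_getreg reads, mode bits in the
// low half and trap bits in the high half:
//   %mode:_(s32)  = s_getreg(FPEnvModeBitField)
//   %traps:_(s32) = s_getreg(FPEnvTrapBitField)
//   %env:_(s64)   = G_MERGE_VALUES %mode(s32), %traps(s32)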
7131
7132 bool AMDGPULegalizerInfo::legalizeSetFPEnv(MachineInstr &MI,
7133 MachineRegisterInfo &MRI,
7134 MachineIRBuilder &B) const {
7135 Register Src = MI.getOperand(0).getReg();
7136 if (MRI.getType(Src) != S64)
7137 return false;
7138
7139 auto Unmerge = B.buildUnmerge({S32, S32}, MI.getOperand(0));
7140 B.buildIntrinsic(Intrinsic::amdgcn_s_setreg, ArrayRef<DstOp>(),
7141 /*HasSideEffects=*/true, /*isConvergent=*/false)
7142 .addImm(static_cast<int16_t>(FPEnvModeBitField))
7143 .addReg(Unmerge.getReg(0));
7144 B.buildIntrinsic(Intrinsic::amdgcn_s_setreg, ArrayRef<DstOp>(),
7145 /*HasSideEffects=*/true, /*isConvergent=*/false)
7146 .addImm(static_cast<int16_t>(FPEnvTrapBitField))
7147 .addReg(Unmerge.getReg(1));
7148 MI.eraseFromParent();
7149 return true;
7150}
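// Sketch of the inverse of legalizeGetFPEnv, again in pseudo-notation: the
// s64 value is split and each half written back with s_setreg:
//   %lo:_(s32), %hi:_(s32) = G_UNMERGE_VALUES %env(s64)
//   s_setreg(FPEnvModeBitField, %lo)
//   s_setreg(FPEnvTrapBitField, %hi)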
7151
7152 bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
7153 MachineInstr &MI) const {
7154 MachineIRBuilder &B = Helper.MIRBuilder;
7155 MachineRegisterInfo &MRI = *B.getMRI();
7156
7157 // Replace the use G_BRCOND with the exec manipulate and branch pseudos.
7158 auto IntrID = cast<GIntrinsic>(MI).getIntrinsicID();
7159 switch (IntrID) {
7160 case Intrinsic::amdgcn_if:
7161 case Intrinsic::amdgcn_else: {
7162 MachineInstr *Br = nullptr;
7163 MachineBasicBlock *UncondBrTarget = nullptr;
7164 bool Negated = false;
7165 if (MachineInstr *BrCond =
7166 verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) {
7167 const SIRegisterInfo *TRI
7168 = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
7169
7170 Register Def = MI.getOperand(1).getReg();
7171 Register Use = MI.getOperand(3).getReg();
7172
7173 MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
7174
7175 if (Negated)
7176 std::swap(CondBrTarget, UncondBrTarget);
7177
7178 B.setInsertPt(B.getMBB(), BrCond->getIterator());
7179 if (IntrID == Intrinsic::amdgcn_if) {
7180 B.buildInstr(AMDGPU::SI_IF)
7181 .addDef(Def)
7182 .addUse(Use)
7183 .addMBB(UncondBrTarget);
7184 } else {
7185 B.buildInstr(AMDGPU::SI_ELSE)
7186 .addDef(Def)
7187 .addUse(Use)
7188 .addMBB(UncondBrTarget);
7189 }
7190
7191 if (Br) {
7192 Br->getOperand(0).setMBB(CondBrTarget);
7193 } else {
7194 // The IRTranslator skips inserting the G_BR for fallthrough cases, but
7195 // since we're swapping branch targets it needs to be reinserted.
7196 // FIXME: IRTranslator should probably not do this
7197 B.buildBr(*CondBrTarget);
7198 }
7199
7200 MRI.setRegClass(Def, TRI->getWaveMaskRegClass());
7201 MRI.setRegClass(Use, TRI->getWaveMaskRegClass());
7202 MI.eraseFromParent();
7203 BrCond->eraseFromParent();
7204 return true;
7205 }
7206
7207 return false;
7208 }
7209 case Intrinsic::amdgcn_loop: {
7210 MachineInstr *Br = nullptr;
7211 MachineBasicBlock *UncondBrTarget = nullptr;
7212 bool Negated = false;
7213 if (MachineInstr *BrCond =
7214 verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) {
7215 const SIRegisterInfo *TRI
7216 = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
7217
7218 MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
7219 Register Reg = MI.getOperand(2).getReg();
7220
7221 if (Negated)
7222 std::swap(CondBrTarget, UncondBrTarget);
7223
7224 B.setInsertPt(B.getMBB(), BrCond->getIterator());
7225 B.buildInstr(AMDGPU::SI_LOOP)
7226 .addUse(Reg)
7227 .addMBB(UncondBrTarget);
7228
7229 if (Br)
7230 Br->getOperand(0).setMBB(CondBrTarget);
7231 else
7232 B.buildBr(*CondBrTarget);
7233
7234 MI.eraseFromParent();
7235 BrCond->eraseFromParent();
7236 MRI.setRegClass(Reg, TRI->getWaveMaskRegClass());
7237 return true;
7238 }
7239
7240 return false;
7241 }
7242 case Intrinsic::amdgcn_addrspacecast_nonnull:
7243 return legalizeAddrSpaceCast(MI, MRI, B);
7244 case Intrinsic::amdgcn_make_buffer_rsrc:
7245 return legalizePointerAsRsrcIntrin(MI, MRI, B);
7246 case Intrinsic::amdgcn_kernarg_segment_ptr:
7247 if (!AMDGPU::isKernel(B.getMF().getFunction().getCallingConv())) {
7248 // This only makes sense to call in a kernel, so just lower to null.
7249 B.buildConstant(MI.getOperand(0).getReg(), 0);
7250 MI.eraseFromParent();
7251 return true;
7252 }
7253
7254 return legalizePreloadedArgIntrin(
7255 MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
7256 case Intrinsic::amdgcn_implicitarg_ptr:
7257 return legalizeImplicitArgPtr(MI, MRI, B);
7258 case Intrinsic::amdgcn_workitem_id_x:
7259 return legalizeWorkitemIDIntrinsic(MI, MRI, B, 0,
7260 AMDGPUFunctionArgInfo::WORKITEM_ID_X);
7261 case Intrinsic::amdgcn_workitem_id_y:
7262 return legalizeWorkitemIDIntrinsic(MI, MRI, B, 1,
7263 AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
7264 case Intrinsic::amdgcn_workitem_id_z:
7265 return legalizeWorkitemIDIntrinsic(MI, MRI, B, 2,
7266 AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
7267 case Intrinsic::amdgcn_workgroup_id_x:
7268 return legalizePreloadedArgIntrin(MI, MRI, B,
7269 AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
7270 case Intrinsic::amdgcn_workgroup_id_y:
7271 return legalizePreloadedArgIntrin(MI, MRI, B,
7272 AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
7273 case Intrinsic::amdgcn_workgroup_id_z:
7274 return legalizePreloadedArgIntrin(MI, MRI, B,
7275 AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
7276 case Intrinsic::amdgcn_wave_id:
7277 return legalizeWaveID(MI, B);
7278 case Intrinsic::amdgcn_lds_kernel_id:
7279 return legalizePreloadedArgIntrin(MI, MRI, B,
7280 AMDGPUFunctionArgInfo::LDS_KERNEL_ID);
7281 case Intrinsic::amdgcn_dispatch_ptr:
7282 return legalizePreloadedArgIntrin(MI, MRI, B,
7283 AMDGPUFunctionArgInfo::DISPATCH_PTR);
7284 case Intrinsic::amdgcn_queue_ptr:
7285 return legalizePreloadedArgIntrin(MI, MRI, B,
7286 AMDGPUFunctionArgInfo::QUEUE_PTR);
7287 case Intrinsic::amdgcn_implicit_buffer_ptr:
7288 return legalizePreloadedArgIntrin(
7289 MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
7290 case Intrinsic::amdgcn_dispatch_id:
7291 return legalizePreloadedArgIntrin(MI, MRI, B,
7292 AMDGPUFunctionArgInfo::DISPATCH_ID);
7293 case Intrinsic::r600_read_ngroups_x:
7294 // TODO: Emit error for hsa
7295 return legalizeKernargMemParameter(MI, B,
7296 SI::KernelInputOffsets::NGROUPS_X);
7297 case Intrinsic::r600_read_ngroups_y:
7298 return legalizeKernargMemParameter(MI, B,
7299 SI::KernelInputOffsets::NGROUPS_Y);
7300 case Intrinsic::r600_read_ngroups_z:
7301 return legalizeKernargMemParameter(MI, B,
7302 SI::KernelInputOffsets::NGROUPS_Z);
7303 case Intrinsic::r600_read_local_size_x:
7304 // TODO: Could insert G_ASSERT_ZEXT from s16
7305 return legalizeKernargMemParameter(MI, B, SI::KernelInputOffsets::LOCAL_SIZE_X);
7306 case Intrinsic::r600_read_local_size_y:
7307 // TODO: Could insert G_ASSERT_ZEXT from s16
7308 return legalizeKernargMemParameter(MI, B, SI::KernelInputOffsets::LOCAL_SIZE_Y);
7309 // TODO: Could insert G_ASSERT_ZEXT from s16
7310 case Intrinsic::r600_read_local_size_z:
7311 return legalizeKernargMemParameter(MI, B, SI::KernelInputOffsets::LOCAL_SIZE_Z);
7312 case Intrinsic::r600_read_global_size_x:
7313 return legalizeKernargMemParameter(MI, B, SI::KernelInputOffsets::GLOBAL_SIZE_X);
7314 case Intrinsic::r600_read_global_size_y:
7315 return legalizeKernargMemParameter(MI, B, SI::KernelInputOffsets::GLOBAL_SIZE_Y);
7316 case Intrinsic::r600_read_global_size_z:
7317 return legalizeKernargMemParameter(MI, B, SI::KernelInputOffsets::GLOBAL_SIZE_Z);
7318 case Intrinsic::amdgcn_fdiv_fast:
7319 return legalizeFDIVFastIntrin(MI, MRI, B);
7320 case Intrinsic::amdgcn_is_shared:
7321 return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS);
7322 case Intrinsic::amdgcn_is_private:
7323 return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS);
7324 case Intrinsic::amdgcn_wavefrontsize: {
7325 B.buildConstant(MI.getOperand(0), ST.getWavefrontSize());
7326 MI.eraseFromParent();
7327 return true;
7328 }
7329 case Intrinsic::amdgcn_s_buffer_load:
7330 return legalizeSBufferLoad(Helper, MI);
7331 case Intrinsic::amdgcn_raw_buffer_store:
7332 case Intrinsic::amdgcn_raw_ptr_buffer_store:
7333 case Intrinsic::amdgcn_struct_buffer_store:
7334 case Intrinsic::amdgcn_struct_ptr_buffer_store:
7335 return legalizeBufferStore(MI, MRI, B, false, false);
7336 case Intrinsic::amdgcn_raw_buffer_store_format:
7337 case Intrinsic::amdgcn_raw_ptr_buffer_store_format:
7338 case Intrinsic::amdgcn_struct_buffer_store_format:
7339 case Intrinsic::amdgcn_struct_ptr_buffer_store_format:
7340 return legalizeBufferStore(MI, MRI, B, false, true);
7341 case Intrinsic::amdgcn_raw_tbuffer_store:
7342 case Intrinsic::amdgcn_raw_ptr_tbuffer_store:
7343 case Intrinsic::amdgcn_struct_tbuffer_store:
7344 case Intrinsic::amdgcn_struct_ptr_tbuffer_store:
7345 return legalizeBufferStore(MI, MRI, B, true, true);
7346 case Intrinsic::amdgcn_raw_buffer_load:
7347 case Intrinsic::amdgcn_raw_ptr_buffer_load:
7348 case Intrinsic::amdgcn_struct_buffer_load:
7349 case Intrinsic::amdgcn_struct_ptr_buffer_load:
7350 return legalizeBufferLoad(MI, MRI, B, false, false);
7351 case Intrinsic::amdgcn_raw_buffer_load_format:
7352 case Intrinsic::amdgcn_raw_ptr_buffer_load_format:
7353 case Intrinsic::amdgcn_struct_buffer_load_format:
7354 case Intrinsic::amdgcn_struct_ptr_buffer_load_format:
7355 return legalizeBufferLoad(MI, MRI, B, true, false);
7356 case Intrinsic::amdgcn_raw_tbuffer_load:
7357 case Intrinsic::amdgcn_raw_ptr_tbuffer_load:
7358 case Intrinsic::amdgcn_struct_tbuffer_load:
7359 case Intrinsic::amdgcn_struct_ptr_tbuffer_load:
7360 return legalizeBufferLoad(MI, MRI, B, true, true);
7361 case Intrinsic::amdgcn_raw_buffer_atomic_swap:
7362 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_swap:
7363 case Intrinsic::amdgcn_struct_buffer_atomic_swap:
7364 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_swap:
7365 case Intrinsic::amdgcn_raw_buffer_atomic_add:
7366 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_add:
7367 case Intrinsic::amdgcn_struct_buffer_atomic_add:
7368 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_add:
7369 case Intrinsic::amdgcn_raw_buffer_atomic_sub:
7370 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub:
7371 case Intrinsic::amdgcn_struct_buffer_atomic_sub:
7372 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub:
7373 case Intrinsic::amdgcn_raw_buffer_atomic_smin:
7374 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smin:
7375 case Intrinsic::amdgcn_struct_buffer_atomic_smin:
7376 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smin:
7377 case Intrinsic::amdgcn_raw_buffer_atomic_umin:
7378 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umin:
7379 case Intrinsic::amdgcn_struct_buffer_atomic_umin:
7380 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umin:
7381 case Intrinsic::amdgcn_raw_buffer_atomic_smax:
7382 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smax:
7383 case Intrinsic::amdgcn_struct_buffer_atomic_smax:
7384 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smax:
7385 case Intrinsic::amdgcn_raw_buffer_atomic_umax:
7386 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umax:
7387 case Intrinsic::amdgcn_struct_buffer_atomic_umax:
7388 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umax:
7389 case Intrinsic::amdgcn_raw_buffer_atomic_and:
7390 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_and:
7391 case Intrinsic::amdgcn_struct_buffer_atomic_and:
7392 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_and:
7393 case Intrinsic::amdgcn_raw_buffer_atomic_or:
7394 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_or:
7395 case Intrinsic::amdgcn_struct_buffer_atomic_or:
7396 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_or:
7397 case Intrinsic::amdgcn_raw_buffer_atomic_xor:
7398 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_xor:
7399 case Intrinsic::amdgcn_struct_buffer_atomic_xor:
7400 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_xor:
7401 case Intrinsic::amdgcn_raw_buffer_atomic_inc:
7402 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_inc:
7403 case Intrinsic::amdgcn_struct_buffer_atomic_inc:
7404 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_inc:
7405 case Intrinsic::amdgcn_raw_buffer_atomic_dec:
7406 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_dec:
7407 case Intrinsic::amdgcn_struct_buffer_atomic_dec:
7408 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_dec:
7409 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
7410 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap:
7411 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
7412 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cmpswap:
7413 case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
7414 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmin:
7415 case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
7416 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmin:
7417 case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
7418 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmax:
7419 case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
7420 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmax:
7421 case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
7422 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fadd:
7423 case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
7424 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fadd:
7425 return legalizeBufferAtomic(MI, B, IntrID);
7426 case Intrinsic::amdgcn_rsq_clamp:
7427 return legalizeRsqClampIntrinsic(MI, MRI, B);
7428 case Intrinsic::amdgcn_image_bvh_intersect_ray:
7429 return legalizeBVHIntrinsic(MI, B);
7430 case Intrinsic::amdgcn_swmmac_f16_16x16x32_f16:
7431 case Intrinsic::amdgcn_swmmac_bf16_16x16x32_bf16:
7432 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf16:
7433 case Intrinsic::amdgcn_swmmac_f32_16x16x32_f16:
7434 case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_fp8:
7435 case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_bf8:
7436 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_fp8:
7437 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_bf8: {
7438 Register Index = MI.getOperand(5).getReg();
7439 LLT S32 = LLT::scalar(32);
7440 if (MRI.getType(Index) != S32)
7441 MI.getOperand(5).setReg(B.buildAnyExt(S32, Index).getReg(0));
7442 return true;
7443 }
7444 case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu4:
7445 case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu8:
7446 case Intrinsic::amdgcn_swmmac_i32_16x16x64_iu4: {
7447 Register Index = MI.getOperand(7).getReg();
7448 LLT S32 = LLT::scalar(32);
7449 if (MRI.getType(Index) != S32)
7450 MI.getOperand(7).setReg(B.buildAnyExt(S32, Index).getReg(0));
7451 return true;
7452 }
7453 case Intrinsic::amdgcn_fmed3: {
7454 GISelChangeObserver &Observer = Helper.Observer;
7455
7456 // FIXME: This is to workaround the inability of tablegen match combiners to
7457 // match intrinsics in patterns.
7458 Observer.changingInstr(MI);
7459 MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_FMED3));
7460 MI.removeOperand(1);
7461 Observer.changedInstr(MI);
7462 return true;
7463 }
7464 case Intrinsic::amdgcn_readlane:
7465 case Intrinsic::amdgcn_writelane:
7466 case Intrinsic::amdgcn_readfirstlane:
7467 case Intrinsic::amdgcn_permlane16:
7468 case Intrinsic::amdgcn_permlanex16:
7469 case Intrinsic::amdgcn_permlane64:
7470 return legalizeLaneOp(Helper, MI, IntrID);
7471 default: {
7472 if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
7473 AMDGPU::getImageDimIntrinsicInfo(IntrID))
7474 return legalizeImageIntrinsic(MI, B, Helper.Observer, ImageDimIntr);
7475 return true;
7476 }
7477 }
7478
7479 return true;
7480}
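// Example of one of the simpler cases above (sketch, register names are
// illustrative): amdgcn.fmed3 is only re-tagged so later combines can match a
// generic opcode,
//   %d:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.fmed3), %a, %b, %c
// becomes
//   %d:_(s32) = G_AMDGPU_FMED3 %a, %b, %c
// with the intrinsic-id operand dropped, matching the amdgcn_fmed3 case in
// the switch above.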
unsigned const MachineRegisterInfo * MRI
MachineInstrBuilder & UseMI
MachineInstrBuilder MachineInstrBuilder & DefMI
static unsigned getIntrinsicID(const SDNode *N)
unsigned RegSize
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
unsigned Intr
static SDValue extractF64Exponent(SDValue Hi, const SDLoc &SL, SelectionDAG &DAG)
static SDValue getMad(SelectionDAG &DAG, const SDLoc &SL, EVT VT, SDValue X, SDValue Y, SDValue C, SDNodeFlags Flags=SDNodeFlags())
static bool valueIsKnownNeverF32Denorm(SDValue Src)
Return true if it's known that Src can never be an f32 denormal value.
Contains the definition of a TargetInstrInfo class that is common to all AMD GPUs.
static void packImage16bitOpsToDwords(MachineIRBuilder &B, MachineInstr &MI, SmallVectorImpl< Register > &PackedAddrs, unsigned ArgOffset, const AMDGPU::ImageDimIntrinsicInfo *Intr, bool IsA16, bool IsG16)
Turn a set of s16 typed registers in AddrRegs into a dword sized vector with s16 typed elements.
static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID)
static LLT getBufferRsrcScalarType(const LLT Ty)
static cl::opt< bool > EnableNewLegality("amdgpu-global-isel-new-legality", cl::desc("Use GlobalISel desired legality, rather than try to use" "rules compatible with selection patterns"), cl::init(false), cl::ReallyHidden)
static const LLT V3S64
static bool needsDenormHandlingF32(const MachineFunction &MF, Register Src, unsigned Flags)
static const LLT V16S16
static const LLT S128
static LegalizeMutation bitcastToVectorElement32(unsigned TypeIdx)
static const LLT V4S32
static const LLT V2S32
static const LLT V8S64
static LegalityPredicate isSmallOddVector(unsigned TypeIdx)
static LegalizeMutation oneMoreElement(unsigned TypeIdx)
static const LLT V12S32
static const LLT V8S32
static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size)
static bool allowApproxFunc(const MachineFunction &MF, unsigned Flags)
static const LLT V2S16
static bool shouldBitcastLoadStoreType(const GCNSubtarget &ST, const LLT Ty, const LLT MemTy)
Return true if a load or store of the type should be lowered with a bitcast to a different type.
static constexpr unsigned FPEnvModeBitField
static const LLT V4S64
static const LLT S1
static const LLT V3S32
static const LLT S64
static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size)
static bool shouldWidenLoad(const GCNSubtarget &ST, LLT MemoryTy, uint64_t AlignInBits, unsigned AddrSpace, unsigned Opcode)
Return true if we should legalize a load by widening an odd sized memory access up to the alignment.
static bool isRegisterType(LLT Ty)
static bool isRegisterVectorElementType(LLT EltTy)
static const LLT S32
static bool isRegisterSize(unsigned Size)
static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx)
static LegalityPredicate isWideVec16(unsigned TypeIdx)
static const LLT V6S32
static LegalizeMutation moreElementsToNextExistingRegClass(unsigned TypeIdx)
static std::initializer_list< LLT > AllS32Vectors
static const LLT V7S32
static const LLT V5S32
static const LLT V4S16
static const LLT V11S32
static const LLT F64
static Register castBufferRsrcToV4I32(Register Pointer, MachineIRBuilder &B)
Cast a buffer resource (an address space 8 pointer) into a 4xi32, which is the form in which the valu...
static std::initializer_list< LLT > AllS16Vectors
static const LLT V32S32
static std::pair< Register, Register > emitReciprocalU64(MachineIRBuilder &B, Register Val)
static LLT getBitcastRegisterType(const LLT Ty)
static LLT getBufferRsrcRegisterType(const LLT Ty)
static LegalizeMutation bitcastToRegisterType(unsigned TypeIdx)
static Register stripAnySourceMods(Register OrigSrc, MachineRegisterInfo &MRI)
static const LLT V9S32
static const LLT V10S32
static LLT castBufferRsrcFromV4I32(MachineInstr &MI, MachineIRBuilder &B, MachineRegisterInfo &MRI, unsigned Idx)
Mutates IR (typicaly a load instruction) to use a <4 x s32> as the initial type of the operand idx an...
static bool replaceWithConstant(MachineIRBuilder &B, MachineInstr &MI, int64_t C)
static constexpr unsigned SPDenormModeBitField
static const LLT V12S16
static const LLT V16S64
static const LLT S512
static unsigned maxSizeForAddrSpace(const GCNSubtarget &ST, unsigned AS, bool IsLoad, bool IsAtomic)
static bool isLoadStoreSizeLegal(const GCNSubtarget &ST, const LegalityQuery &Query)
static const LLT V16S32
static const LLT V7S64
static MachineInstr * verifyCFIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI, MachineInstr *&Br, MachineBasicBlock *&UncondBrTarget, bool &Negated)
static bool isRegisterClassType(LLT Ty)
static const LLT V5S64
static const LLT S160
static LegalityPredicate numElementsNotEven(unsigned TypeIdx)
static void castBufferRsrcArgToV4I32(MachineInstr &MI, MachineIRBuilder &B, unsigned Idx)
static const LLT V4S128
static constexpr unsigned FPEnvTrapBitField
static LegalityPredicate isIllegalRegisterType(unsigned TypeIdx)
static const LLT V2BF16
static const LLT V6S64
static constexpr unsigned MaxRegisterSize
static const LLT V2S8
static LegalityPredicate isWideScalarExtLoadTruncStore(unsigned TypeIdx)
static const LLT MaxScalar
static bool hasBufferRsrcWorkaround(const LLT Ty)
static void toggleSPDenormMode(bool Enable, MachineIRBuilder &B, const GCNSubtarget &ST, SIModeRegisterDefaults Mode)
static std::initializer_list< LLT > AllS64Vectors
static bool loadStoreBitcastWorkaround(const LLT Ty)
static const LLT S96
static const LLT V2S64
static LLT widenToNextPowerOf2(LLT Ty)
static bool isNot(const MachineRegisterInfo &MRI, const MachineInstr &MI)
static const LLT S16
static const LLT V10S16
static void convertImageAddrToPacked(MachineIRBuilder &B, MachineInstr &MI, int DimIdx, int NumVAddrs)
Convert from separate vaddr components to a single vector address register, and replace the remaining...
static bool isLoadStoreLegal(const GCNSubtarget &ST, const LegalityQuery &Query)
static const LLT V2S128
static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx)
static std::initializer_list< LLT > AllScalarTypes
static const LLT V2F16
static const LLT S256
static LLT getPow2VectorType(LLT Ty)
static void buildBufferLoad(unsigned Opc, Register LoadDstReg, Register RSrc, Register VIndex, Register VOffset, Register SOffset, unsigned ImmOffset, unsigned Format, unsigned AuxiliaryData, MachineMemOperand *MMO, bool IsTyped, bool HasVIndex, MachineIRBuilder &B)
static LLT getPow2ScalarType(LLT Ty)
static LegalityPredicate elementTypeIsLegal(unsigned TypeIdx)
static const LLT S8
static const LLT V6S16
static bool isRegisterVectorType(LLT Ty)
static const LLT S224
static const LLT V8S16
static LegalityPredicate sizeIsMultipleOf32(unsigned TypeIdx)
static bool isKnownNonNull(Register Val, MachineRegisterInfo &MRI, const AMDGPUTargetMachine &TM, unsigned AddrSpace)
Return true if the value is a known valid address, such that a null check is not necessary.
static const LLT F32
This file declares the targeting of the Machinelegalizer class for AMDGPU.
Provides AMDGPU specific target descriptions.
The AMDGPU TargetMachine interface definition for hw codegen targets.
static Error unsupported(const char *Str, const Triple &T)
Definition: MachO.cpp:71
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
Analysis containing CSE Info
Definition: CSEInfo.cpp:27
Returns the sub type a function will return at a given Idx Should correspond to the result type of an ExtractValue instruction executed with just that one unsigned Idx
uint64_t Size
static GCMetadataPrinterRegistry::Add< ErlangGCPrinter > X("erlang", "erlang-compatible garbage collector")
Declares convenience wrapper classes for interpreting MachineInstr instances as specific generic oper...
IRTranslator LLVM IR MI
static LVOptions Options
Definition: LVOptions.cpp:25
#define F(x, y, z)
Definition: MD5.cpp:55
#define I(x, y, z)
Definition: MD5.cpp:58
Contains matchers for matching SSA Machine Instructions.
This file declares the MachineIRBuilder class.
unsigned const TargetRegisterInfo * TRI
#define R2(n)
static unsigned getReg(const MCDisassembler *D, unsigned RC, unsigned RegNo)
static GCMetadataPrinterRegistry::Add< OcamlGCMetadataPrinter > Y("ocaml", "ocaml 3.10-compatible collector")
ppc ctr loops verify
const char LLVMTargetMachineRef TM
R600 Clause Merge
const SmallVectorImpl< MachineOperand > & Cond
#define CH(x, y, z)
Definition: SHA256.cpp:34
#define FP_DENORM_FLUSH_NONE
Definition: SIDefines.h:1171
Interface definition for SIInstrInfo.
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
Interface definition for SIRegisterInfo.
This file defines the make_scope_exit function, which executes user-defined cleanup logic at scope ex...
static constexpr int Concat[]
Value * RHS
Value * LHS
void buildMultiply(LegalizerHelper &Helper, MutableArrayRef< Register > Accum, ArrayRef< Register > Src0, ArrayRef< Register > Src1, bool UsePartialMad64_32, bool SeparateOddAlignedProducts) const
bool legalizeGlobalValue(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFPTruncRound(MachineInstr &MI, MachineIRBuilder &B) const
bool legalizeFSQRTF16(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeIntrinsicTrunc(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
std::pair< Register, unsigned > splitBufferOffsets(MachineIRBuilder &B, Register OrigOffset) const
bool legalizeIsAddrSpace(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B, unsigned AddrSpace) const
bool legalizeUnsignedDIV_REM(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFSQRTF32(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeAtomicCmpXChg(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeTrapHsa(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeMul(LegalizerHelper &Helper, MachineInstr &MI) const
bool legalizeFFREXP(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
Register getSegmentAperture(unsigned AddrSpace, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFDIV64(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizePointerAsRsrcIntrin(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
To create a buffer resource from a 64-bit pointer, mask off the upper 32 bits of the pointer and repl...
bool legalizeFlogCommon(MachineInstr &MI, MachineIRBuilder &B) const
bool getLDSKernelId(Register DstReg, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFExp2(MachineInstr &MI, MachineIRBuilder &B) const
bool legalizeTrap(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeBufferAtomic(MachineInstr &MI, MachineIRBuilder &B, Intrinsic::ID IID) const
void legalizeUnsignedDIV_REM32Impl(MachineIRBuilder &B, Register DstDivReg, Register DstRemReg, Register Num, Register Den) const
Register handleD16VData(MachineIRBuilder &B, MachineRegisterInfo &MRI, Register Reg, bool ImageStore=false) const
Handle register layout difference for f16 images for some subtargets.
bool legalizeCTLZ_CTTZ(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeBuildVector(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFFloor(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
AMDGPULegalizerInfo(const GCNSubtarget &ST, const GCNTargetMachine &TM)
bool legalizeFDIV32(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFMad(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFDIV(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFExp(MachineInstr &MI, MachineIRBuilder &B) const
bool legalizeIntrinsic(LegalizerHelper &Helper, MachineInstr &MI) const override
bool legalizeFrem(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizePreloadedArgIntrin(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B, AMDGPUFunctionArgInfo::PreloadedValue ArgType) const
bool legalizeBufferStore(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B, bool IsTyped, bool IsFormat) const
bool legalizeStore(LegalizerHelper &Helper, MachineInstr &MI) const
bool legalizeCustom(LegalizerHelper &Helper, MachineInstr &MI, LostDebugLocObserver &LocObserver) const override
Called for instructions with the Custom LegalizationAction.
bool buildPCRelGlobalAddress(Register DstReg, LLT PtrTy, MachineIRBuilder &B, const GlobalValue *GV, int64_t Offset, unsigned GAFlags=SIInstrInfo::MO_NONE) const
bool legalizeFDIV16(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeRsqClampIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
std::pair< Register, Register > getScaledLogInput(MachineIRBuilder &B, Register Src, unsigned Flags) const
bool legalizeFDIVFastIntrin(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeInsertVectorElt(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFExpUnsafe(MachineIRBuilder &B, Register Dst, Register Src, unsigned Flags) const
bool legalizeAddrSpaceCast(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
Register fixStoreSourceType(MachineIRBuilder &B, Register VData, bool IsFormat) const
bool legalizeImplicitArgPtr(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeMinNumMaxNum(LegalizerHelper &Helper, MachineInstr &MI) const
void legalizeUnsignedDIV_REM64Impl(MachineIRBuilder &B, Register DstDivReg, Register DstRemReg, Register Num, Register Den) const
bool legalizeBVHIntrinsic(MachineInstr &MI, MachineIRBuilder &B) const
bool legalizeDebugTrap(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFastUnsafeFDIV(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeSinCos(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeWaveID(MachineInstr &MI, MachineIRBuilder &B) const
bool legalizeFroundeven(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeLDSKernelId(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeSignedDIV_REM(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeITOFP(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B, bool Signed) const
bool legalizeBufferLoad(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B, bool IsFormat, bool IsTyped) const
bool legalizeFPow(MachineInstr &MI, MachineIRBuilder &B) const
bool legalizeFastUnsafeFDIV64(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFPTOI(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B, bool Signed) const
bool legalizeStackSave(MachineInstr &MI, MachineIRBuilder &B) const
bool legalizeFlogUnsafe(MachineIRBuilder &B, Register Dst, Register Src, bool IsLog10, unsigned Flags) const
bool legalizeKernargMemParameter(MachineInstr &MI, MachineIRBuilder &B, uint64_t Offset, Align Alignment=Align(4)) const
Legalize a value that's loaded from kernel arguments.
bool legalizeImageIntrinsic(MachineInstr &MI, MachineIRBuilder &B, GISelChangeObserver &Observer, const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr) const
Rewrite image intrinsics to use register layouts expected by the subtarget.
void buildAbsGlobalAddress(Register DstReg, LLT PtrTy, MachineIRBuilder &B, const GlobalValue *GV, MachineRegisterInfo &MRI) const
bool legalizeGetFPEnv(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool getImplicitArgPtr(Register DstReg, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFSQRT(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
Register getKernargParameterPtr(MachineIRBuilder &B, int64_t Offset) const
bool legalizeSBufferLoad(LegalizerHelper &Helper, MachineInstr &MI) const
bool legalizeFceil(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool loadInputValue(Register DstReg, MachineIRBuilder &B, const ArgDescriptor *Arg, const TargetRegisterClass *ArgRC, LLT ArgTy) const
bool legalizeFSQRTF64(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeExtractVectorElt(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeLoad(LegalizerHelper &Helper, MachineInstr &MI) const
bool legalizeCTLZ_ZERO_UNDEF(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeLaneOp(LegalizerHelper &Helper, MachineInstr &MI, Intrinsic::ID IID) const
bool legalizeSetFPEnv(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeWorkitemIDIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B, unsigned Dim, AMDGPUFunctionArgInfo::PreloadedValue ArgType) const
bool legalizeTrapHsaQueuePtr(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFlog2(MachineInstr &MI, MachineIRBuilder &B) const
bool legalizeTrapEndpgm(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
static std::optional< uint32_t > getLDSKernelIdMetadata(const Function &F)
void setDynLDSAlign(const Function &F, const GlobalVariable &GV)
unsigned allocateLDSGlobal(const DataLayout &DL, const GlobalVariable &GV)
bool hasMadMacF32Insts() const
unsigned getMaxWorkitemID(const Function &Kernel, unsigned Dimension) const
Return the maximum workitem ID value in the function, for the given (0, 1, 2) dimension.
bool has16BitInsts() const
bool hasFastFMAF32() const
bool hasTrigReducedRange() const
unsigned getWavefrontSize() const
bool hasVOP3PInsts() const
uint32_t getImplicitParameterOffset(const MachineFunction &MF, const ImplicitParameter Param) const
Helper function that returns the byte offset of the given type of implicit parameter.
static APFloat getSmallestNormalized(const fltSemantics &Sem, bool Negative=false)
Returns the smallest (by magnitude) normalized finite number in the given semantics.
Definition: APFloat.h:1058
static APFloat getLargest(const fltSemantics &Sem, bool Negative=false)
Returns the largest finite number in the given semantics.
Definition: APFloat.h:1038
static APFloat getInf(const fltSemantics &Sem, bool Negative=false)
Factory for Positive and Negative Infinity.
Definition: APFloat.h:998
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition: ArrayRef.h:41
size_t size() const
size - Get the array size.
Definition: ArrayRef.h:165
@ ICMP_SLT
signed less than
Definition: InstrTypes.h:786
@ FCMP_OLT
0 1 0 0 True if ordered and less than
Definition: InstrTypes.h:763
@ FCMP_OGT
0 0 1 0 True if ordered and greater than
Definition: InstrTypes.h:761
@ ICMP_UGE
unsigned greater or equal
Definition: InstrTypes.h:781
@ ICMP_SGT
signed greater than
Definition: InstrTypes.h:784
@ FCMP_ONE
0 1 1 0 True if ordered and operands are unequal
Definition: InstrTypes.h:765
@ FCMP_OLE
0 1 0 1 True if ordered and less than or equal
Definition: InstrTypes.h:764
@ FCMP_ORD
0 1 1 1 True if ordered (no nans)
Definition: InstrTypes.h:766
@ ICMP_EQ
equal
Definition: InstrTypes.h:778
@ ICMP_NE
not equal
Definition: InstrTypes.h:779
This is the shared class of boolean and integer constants.
Definition: Constants.h:81
int64_t getSExtValue() const
Return the constant as a 64-bit integer value after it has been sign extended as appropriate for the ...
Definition: Constants.h:161
This class represents an Operation in the Expression.
A debug info location.
Definition: DebugLoc.h:33
Diagnostic information for unsupported feature in backend.
static constexpr ElementCount getFixed(ScalarTy MinVal)
Definition: TypeSize.h:308
LLVMContext & getContext() const
getContext - Return a reference to the LLVMContext associated with this function.
Definition: Function.cpp:358
bool hasA16() const
bool hasImageStoreD16Bug() const
bool hasUsableDivScaleConditionOutput() const
Condition output from div_scale is usable.
Definition: GCNSubtarget.h:473
bool hasArchitectedSGPRs() const
bool hasPrivEnabledTrap2NopBug() const
const SIInstrInfo * getInstrInfo() const override
Definition: GCNSubtarget.h:262
bool hasScalarSubwordLoads() const
Definition: GCNSubtarget.h:451
bool supportsGetDoorbellID() const
Definition: GCNSubtarget.h:457
bool hasMad64_32() const
Definition: GCNSubtarget.h:740
TrapHandlerAbi getTrapHandlerAbi() const
Definition: GCNSubtarget.h:453
bool hasIntClamp() const
Definition: GCNSubtarget.h:353
bool hasGFX10_AEncoding() const
const SITargetLowering * getTargetLowering() const override
Definition: GCNSubtarget.h:270
bool hasPackedFP32Ops() const
bool hasFullRate64Ops() const
Definition: GCNSubtarget.h:373
bool isTrapHandlerEnabled() const
Definition: GCNSubtarget.h:597
unsigned getNSAThreshold(const MachineFunction &MF) const
bool hasScalarSMulU64() const
Definition: GCNSubtarget.h:729
bool hasNSAEncoding() const
bool hasG16() const
unsigned getNSAMaxSize(bool HasSampler=false) const
bool hasScalarDwordx3Loads() const
Definition: GCNSubtarget.h:983
Generation getGeneration() const
Definition: GCNSubtarget.h:313
bool hasScalarAddSub64() const
Definition: GCNSubtarget.h:727
bool hasUnpackedD16VMem() const
Definition: GCNSubtarget.h:731
bool hasAddNoCarry() const
Definition: GCNSubtarget.h:723
bool hasFractBug() const
Definition: GCNSubtarget.h:391
bool hasPartialNSAEncoding() const
Abstract class that contains various methods for clients to notify about changes.
virtual void changingInstr(MachineInstr &MI)=0
This instruction is about to be mutated in some way.
virtual void changedInstr(MachineInstr &MI)=0
This instruction was mutated in some way.
KnownBits getKnownBits(Register R)
Simple wrapper observer that takes several observers, and calls each one for each event.
bool hasExternalLinkage() const
Definition: GlobalValue.h:511
Module * getParent()
Get the module that this global value is contained inside of...
Definition: GlobalValue.h:656
Type * getValueType() const
Definition: GlobalValue.h:296
static constexpr LLT float64()
Get a 64-bit IEEE double value.
Definition: LowLevelType.h:94
constexpr unsigned getScalarSizeInBits() const
Definition: LowLevelType.h:267
constexpr bool isScalar() const
Definition: LowLevelType.h:146
constexpr LLT changeElementType(LLT NewEltTy) const
If this type is a vector, return a vector with the same number of elements but the new element type.
Definition: LowLevelType.h:214
static constexpr LLT vector(ElementCount EC, unsigned ScalarSizeInBits)
Get a low-level vector of some number of elements and element width.
Definition: LowLevelType.h:64
constexpr bool isPointerVector() const
Definition: LowLevelType.h:152
static constexpr LLT scalar(unsigned SizeInBits)
Get a low-level scalar or aggregate "bag of bits".
Definition: LowLevelType.h:42
constexpr uint16_t getNumElements() const
Returns the number of elements in a vector LLT.
Definition: LowLevelType.h:159
constexpr bool isVector() const
Definition: LowLevelType.h:148
static constexpr LLT pointer(unsigned AddressSpace, unsigned SizeInBits)
Get a low-level pointer in the given address space.
Definition: LowLevelType.h:57
constexpr TypeSize getSizeInBits() const
Returns the total size of the type. Must only be called on sized types.
Definition: LowLevelType.h:193
constexpr bool isPointer() const
Definition: LowLevelType.h:149
constexpr LLT getElementType() const
Returns the vector's element type. Only valid for vector types.
Definition: LowLevelType.h:290
constexpr ElementCount getElementCount() const
Definition: LowLevelType.h:184
constexpr LLT changeElementSize(unsigned NewEltSize) const
If this type is a vector, return a vector with the same number of elements but the new element size.
Definition: LowLevelType.h:221
static constexpr LLT float16()
Get a 16-bit IEEE half value.
Definition: LowLevelType.h:84
constexpr unsigned getAddressSpace() const
Definition: LowLevelType.h:280
static constexpr LLT fixed_vector(unsigned NumElements, unsigned ScalarSizeInBits)
Get a low-level fixed-width vector of some number of elements and element width.
Definition: LowLevelType.h:100
constexpr bool isPointerOrPointerVector() const
Definition: LowLevelType.h:153
constexpr LLT changeElementCount(ElementCount EC) const
Return a vector or scalar with the same element type and the new element count.
Definition: LowLevelType.h:230
constexpr LLT getScalarType() const
Definition: LowLevelType.h:208
static constexpr LLT scalarOrVector(ElementCount EC, LLT ScalarTy)
Definition: LowLevelType.h:124
static constexpr LLT float32()
Get a 32-bit IEEE float value.
Definition: LowLevelType.h:89
This is an important class for using LLVM in a threaded context.
Definition: LLVMContext.h:67
void diagnose(const DiagnosticInfo &DI)
Report a message to the currently installed diagnostic handler.
LegalizeRuleSet & minScalar(unsigned TypeIdx, const LLT Ty)
Ensure the scalar is at least as wide as Ty.
LegalizeRuleSet & legalFor(std::initializer_list< LLT > Types)
The instruction is legal when type index 0 is any type in the given list.
LegalizeRuleSet & fewerElementsIf(LegalityPredicate Predicate, LegalizeMutation Mutation)
Remove elements to reach the type selected by the mutation if the predicate is true.
LegalizeRuleSet & clampScalarOrElt(unsigned TypeIdx, const LLT MinTy, const LLT MaxTy)
Limit the range of scalar sizes to MinTy and MaxTy.
LegalizeRuleSet & maxScalar(unsigned TypeIdx, const LLT Ty)
Ensure the scalar is at most as wide as Ty.
LegalizeRuleSet & clampMaxNumElements(unsigned TypeIdx, const LLT EltTy, unsigned MaxElements)
Limit the number of elements in EltTy vectors to at most MaxElements.
LegalizeRuleSet & lower()
The instruction is lowered.
LegalizeRuleSet & moreElementsIf(LegalityPredicate Predicate, LegalizeMutation Mutation)
Add more elements to reach the type selected by the mutation if the predicate is true.
LegalizeRuleSet & clampScalar(unsigned TypeIdx, const LLT MinTy, const LLT MaxTy)
Limit the range of scalar sizes to MinTy and MaxTy.
LegalizeRuleSet & custom()
Unconditionally custom lower.
LegalizeRuleSet & clampMaxNumElementsStrict(unsigned TypeIdx, const LLT EltTy, unsigned NumElts)
Express EltTy vectors strictly using vectors with NumElts elements (or scalars when NumElts equals 1)...
LegalizeRuleSet & alwaysLegal()
LegalizeRuleSet & customIf(LegalityPredicate Predicate)
LegalizeRuleSet & widenScalarToNextPow2(unsigned TypeIdx, unsigned MinSize=0)
Widen the scalar to the next power of two that is at least MinSize.
LegalizeRuleSet & scalarize(unsigned TypeIdx)
LegalizeRuleSet & legalIf(LegalityPredicate Predicate)
The instruction is legal if predicate is true.
LegalizeRuleSet & customFor(std::initializer_list< LLT > Types)
LegalizeRuleSet & widenScalarToNextMultipleOf(unsigned TypeIdx, unsigned Size)
Widen the scalar to the next multiple of Size.
LegalizeResult lowerFMinNumMaxNum(MachineInstr &MI)
void moreElementsVectorDst(MachineInstr &MI, LLT MoreTy, unsigned OpIdx)
Legalize a single operand OpIdx of the machine instruction MI as a Def by performing it with addition...
@ Legalized
Instruction has been legalized and the MachineFunction changed.
GISelChangeObserver & Observer
To keep track of changes made by the LegalizerHelper.
void bitcastDst(MachineInstr &MI, LLT CastTy, unsigned OpIdx)
Legalize a single operand OpIdx of the machine instruction MI as a def by inserting a G_BITCAST from ...
LegalizeResult lowerFMad(MachineInstr &MI)
GISelKnownBits * getKnownBits() const
MachineIRBuilder & MIRBuilder
Expose MIRBuilder so clients can set their own RecordInsertInstruction functions.
void widenScalarDst(MachineInstr &MI, LLT WideTy, unsigned OpIdx=0, unsigned TruncOpcode=TargetOpcode::G_TRUNC)
Legalize a single operand OpIdx of the machine instruction MI as a Def by extending the operand's typ...
LegalizeRuleSet & getActionDefinitionsBuilder(unsigned Opcode)
Get the action definition builder for the given opcode.
TypeSize getValue() const
Wrapper class representing physical registers. Should be passed by value.
Definition: MCRegister.h:33
void addSuccessor(MachineBasicBlock *Succ, BranchProbability Prob=BranchProbability::getUnknown())
Add Succ as a successor of this MachineBasicBlock.
MachineBasicBlock * splitAt(MachineInstr &SplitInst, bool UpdateLiveIns=true, LiveIntervals *LIS=nullptr)
Split a basic block into 2 pieces at SplitPoint.
const MachineFunction * getParent() const
Return the MachineFunction containing this basic block.
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
MachineMemOperand * getMachineMemOperand(MachinePointerInfo PtrInfo, MachineMemOperand::Flags f, LLT MemTy, Align base_alignment, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr, SyncScope::ID SSID=SyncScope::System, AtomicOrdering Ordering=AtomicOrdering::NotAtomic, AtomicOrdering FailureOrdering=AtomicOrdering::NotAtomic)
getMachineMemOperand - Allocate a new MachineMemOperand.
DenormalMode getDenormalMode(const fltSemantics &FPType) const
Returns the denormal handling type for the default rounding mode of the function.
void push_back(MachineBasicBlock *MBB)
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
Function & getFunction()
Return the LLVM function that this machine code represents.
const LLVMTargetMachine & getTarget() const
getTarget - Return the target machine this machine code is compiled with
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do...
MachineBasicBlock * CreateMachineBasicBlock(const BasicBlock *BB=nullptr, std::optional< UniqueBBID > BBID=std::nullopt)
CreateMachineBasicBlock - Allocate a new MachineBasicBlock.
Helper class to build MachineInstr.
MachineFunction & getMF()
Getter for the function we currently build.
Register getReg(unsigned Idx) const
Get the register for the operand index.
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
const MachineInstrBuilder & addGlobalAddress(const GlobalValue *GV, int64_t Offset=0, unsigned TargetFlags=0) const
const MachineInstrBuilder & addMBB(MachineBasicBlock *MBB, unsigned TargetFlags=0) const
Representation of each machine instruction.
Definition: MachineInstr.h:69
unsigned getOpcode() const
Returns the opcode of this MachineInstr.
Definition: MachineInstr.h:569
const MachineBasicBlock * getParent() const
Definition: MachineInstr.h:346
const MachineOperand & getOperand(unsigned i) const
Definition: MachineInstr.h:579
A description of a memory reference used in the backend.
LocationSize getSize() const
Return the size in bytes of the memory reference.
LLT getMemoryType() const
Return the memory type of the memory reference.
@ MODereferenceable
The memory access is dereferenceable (i.e., doesn't trap).
@ MOLoad
The memory access reads data.
@ MOInvariant
The memory access always returns the same value (or traps).
Align getAlign() const
Return the minimum known alignment in bytes of the actual memory reference.
MachineOperand class - Representation of each machine instruction operand.
MachineBasicBlock * getMBB() const
void setReg(Register Reg)
Change the register this operand corresponds to.
void setMBB(MachineBasicBlock *MBB)
static MachineOperand CreateImm(int64_t Val)
Register getReg() const
getReg - Returns the register number.
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
MutableArrayRef - Represent a mutable reference to an array (0 or more elements consecutively in memo...
Definition: ArrayRef.h:307
MutableArrayRef< T > drop_front(size_t N=1) const
Drop the first N elements of the array.
Definition: ArrayRef.h:387
MutableArrayRef< T > take_front(size_t N=1) const
Return a copy of *this with only the first N elements.
Definition: ArrayRef.h:412
Wrapper class representing virtual and physical registers.
Definition: Register.h:19
constexpr bool isValid() const
Definition: Register.h:116
constexpr bool isVirtual() const
Return true if the specified register number is in the virtual register namespace.
Definition: Register.h:91
static constexpr bool isPhysicalRegister(unsigned Reg)
Return true if the specified register number is in the physical register namespace.
Definition: Register.h:65
static unsigned getMaxMUBUFImmOffset(const GCNSubtarget &ST)
MachineBasicBlock * insertSimulatedTrap(MachineRegisterInfo &MRI, MachineBasicBlock &MBB, MachineInstr &MI, const DebugLoc &DL) const
Build instructions that simulate the behavior of a s_trap 2 instructions for hardware (namely,...
This class keeps track of the SPI_SP_INPUT_ADDR config register, which tells the hardware which inter...
SIModeRegisterDefaults getMode() const
std::tuple< const ArgDescriptor *, const TargetRegisterClass *, LLT > getPreloadedValue(AMDGPUFunctionArgInfo::PreloadedValue Value) const
static LLVM_READONLY const TargetRegisterClass * getSGPRClassForBitWidth(unsigned BitWidth)
bool allowsMisalignedMemoryAccessesImpl(unsigned Size, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *IsFast=nullptr) const
bool shouldEmitFixup(const GlobalValue *GV) const
bool shouldUseLDSConstAddress(const GlobalValue *GV) const
bool shouldEmitPCReloc(const GlobalValue *GV) const
size_t size() const
Definition: SmallVector.h:91
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
Definition: SmallVector.h:586
void truncate(size_type N)
Like resize, but requires that N is less than size().
Definition: SmallVector.h:657
void resize(size_type N)
Definition: SmallVector.h:651
void push_back(const T &Elt)
Definition: SmallVector.h:426
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1209
int64_t getImm() const
Register getReg() const
Register getStackPointerRegisterToSaveRestore() const
If a physical register, this specifies the register that llvm.savestack/llvm.restorestack should save...
TargetOptions Options
unsigned UnsafeFPMath
UnsafeFPMath - This flag is enabled when the -enable-unsafe-fp-math flag is specified on the command ...
The instances of the Type class are immutable: once they are created, they are never changed.
Definition: Type.h:45
A Use represents the edge between a Value definition and its users.
Definition: Use.h:43
StringRef getName() const
Return a constant reference to the value's name.
Definition: Value.cpp:309
constexpr ScalarTy getFixedValue() const
Definition: TypeSize.h:199
self_iterator getIterator()
Definition: ilist_node.h:132
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
@ CONSTANT_ADDRESS_32BIT
Address space for 32-bit constant memory.
@ BUFFER_STRIDED_POINTER
Address space for 192-bit fat buffer pointers with an additional index.
@ REGION_ADDRESS
Address space for region memory. (GDS)
@ LOCAL_ADDRESS
Address space for local memory.
@ CONSTANT_ADDRESS
Address space for constant memory (VTX2).
@ FLAT_ADDRESS
Address space for flat memory.
@ GLOBAL_ADDRESS
Address space for global memory (RAT0, VTX0).
@ BUFFER_FAT_POINTER
Address space for 160-bit buffer fat pointers.
@ PRIVATE_ADDRESS
Address space for private memory.
@ BUFFER_RESOURCE
Address space for 128-bit buffer resources.
int getMIMGOpcode(unsigned BaseOpcode, unsigned MIMGEncoding, unsigned VDataDwords, unsigned VAddrDwords)
bool isFlatGlobalAddrSpace(unsigned AS)
Definition: AMDGPU.h:415
bool isGFX12Plus(const MCSubtargetInfo &STI)
bool isEntryFunctionCC(CallingConv::ID CC)
LLVM_READNONE bool isKernel(CallingConv::ID CC)
bool isGFX11(const MCSubtargetInfo &STI)
bool isCompute(CallingConv::ID cc)
unsigned getAMDHSACodeObjectVersion(const Module &M)
bool isGFX11Plus(const MCSubtargetInfo &STI)
LLVM_READONLY const MIMGBaseOpcodeInfo * getMIMGBaseOpcodeInfo(unsigned BaseOpcode)
const ImageDimIntrinsicInfo * getImageDimIntrinsicInfo(unsigned Intr)
std::pair< Register, unsigned > getBaseWithConstantOffset(MachineRegisterInfo &MRI, Register Reg, GISelKnownBits *KnownBits=nullptr, bool CheckNUW=false)
Returns base register and constant offset.
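As a hedged illustration of the entry above, the returned pair is typically unpacked with a structured binding; the wrapper function, its name, and the assumption that the declaration lives in the target-local AMDGPUGlobalISelUtils.h header are inventions of this note, not code from this file.
#include "AMDGPUGlobalISelUtils.h"            // assumed location of the declaration
#include "llvm/CodeGen/MachineRegisterInfo.h"
// Hypothetical helper: peel a constant displacement off an address-producing chain.
static void splitAddress(llvm::MachineRegisterInfo &MRI, llvm::Register Reg) {
  auto [BaseReg, ImmOffset] = llvm::AMDGPU::getBaseWithConstantOffset(MRI, Reg);
  (void)BaseReg;   // remaining base pointer/value
  (void)ImmOffset; // constant displacement, e.g. a candidate immediate offset
}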
@ MaxID
The highest possible ID. Must be some 2^k - 1.
Definition: CallingConv.h:274
@ AMDGPU_Gfx
Used for AMD graphics targets.
Definition: CallingConv.h:232
@ Fast
Attempts to make calls as fast as possible (e.g.
Definition: CallingConv.h:41
@ C
The default llvm calling convention, compatible with C.
Definition: CallingConv.h:34
LegalityPredicate scalarOrEltWiderThan(unsigned TypeIdx, unsigned Size)
True iff the specified type index is a scalar or a vector with an element type that's wider than the ...
LegalityPredicate isScalar(unsigned TypeIdx)
True iff the specified type index is a scalar.
LegalityPredicate isPointer(unsigned TypeIdx)
True iff the specified type index is a pointer (with any address space).
LegalityPredicate typeInSet(unsigned TypeIdx, std::initializer_list< LLT > TypesInit)
True iff the given type index is one of the specified types.
LegalityPredicate smallerThan(unsigned TypeIdx0, unsigned TypeIdx1)
True iff the first type index has a smaller total bit size than the second type index.
LegalityPredicate largerThan(unsigned TypeIdx0, unsigned TypeIdx1)
True iff the first type index has a larger total bit size than the second type index.
LegalityPredicate elementTypeIs(unsigned TypeIdx, LLT EltTy)
True if the type index is a vector with element type EltTy.
LegalityPredicate sameSize(unsigned TypeIdx0, unsigned TypeIdx1)
True iff the specified type indices are both the same bit size.
LegalityPredicate scalarOrEltNarrowerThan(unsigned TypeIdx, unsigned Size)
True iff the specified type index is a scalar or vector with an element type that's narrower than the...
LegalityPredicate sizeIs(unsigned TypeIdx, unsigned Size)
True if the total bitwidth of the specified type index is Size bits.
LegalityPredicate typeIsNot(unsigned TypeIdx, LLT Type)
True iff the given type index is not the specified type.
Predicate all(Predicate P0, Predicate P1)
True iff P0 and P1 are true.
LegalityPredicate typeIs(unsigned TypeIdx, LLT TypesInit)
True iff the given type index is the specified type.
LegalityPredicate scalarNarrowerThan(unsigned TypeIdx, unsigned Size)
True iff the specified type index is a scalar that's narrower than the given size.
LegalizeMutation scalarize(unsigned TypeIdx)
Break up the vector type for the given type index into the element type.
LegalizeMutation widenScalarOrEltToNextPow2(unsigned TypeIdx, unsigned Min=0)
Widen the scalar type or vector element type for the given type index to the next power of 2.
LegalizeMutation changeTo(unsigned TypeIdx, LLT Ty)
Select this specific type for the given type index.
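The predicate and mutation combinators listed above are the building blocks of GlobalISel legalization rules. Below is a minimal sketch using a made-up LegalizerInfo subclass; the opcode and the 32-bit threshold are illustrative choices for this note, not rules taken from this file.
#include "llvm/CodeGen/GlobalISel/LegalizerInfo.h"
#include "llvm/CodeGen/TargetOpcodes.h"
using namespace llvm;
using namespace LegalityPredicates;
using namespace LegalizeMutations;
struct ExampleLegalizerInfo : public LegalizerInfo {
  ExampleLegalizerInfo() {
    const LLT S32 = LLT::scalar(32);
    getActionDefinitionsBuilder(TargetOpcode::G_ADD)
        // all(...) combines two predicates; both must hold for the rule to fire.
        .widenScalarIf(all(isScalar(0), scalarNarrowerThan(0, 32)),
                       // Mutation: widen type index 0 to the next power of two,
                       // and to at least 32 bits.
                       widenScalarOrEltToNextPow2(0, 32))
        .legalFor({S32});
    getLegacyLegalizerInfo().computeTables(); // typical finalization step
  }
};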
@ Implicit
Not emitted register (e.g. carry, or temporary result).
@ ReallyHidden
Definition: CommandLine.h:138
initializer< Ty > init(const Ty &Val)
Definition: CommandLine.h:443
constexpr double inv_pi
Definition: MathExtras.h:38
constexpr double ln2
Definition: MathExtras.h:33
constexpr double ln10
Definition: MathExtras.h:34
constexpr float log2ef
Definition: MathExtras.h:50
constexpr double log2e
Definition: MathExtras.h:35
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:18
Register getFunctionLiveInPhysReg(MachineFunction &MF, const TargetInstrInfo &TII, MCRegister PhysReg, const TargetRegisterClass &RC, const DebugLoc &DL, LLT RegTy=LLT())
Return a virtual register corresponding to the incoming argument register PhysReg.
Definition: Utils.cpp:897
unsigned Log2_32_Ceil(uint32_t Value)
Return the ceil log base 2 of the specified value, 32 if the value is zero.
Definition: MathExtras.h:337
@ Offset
Definition: DWP.cpp:480
Type * getTypeForLLT(LLT Ty, LLVMContext &C)
Get the type back from LLT.
Definition: Utils.cpp:1910
MachineInstr * getOpcodeDef(unsigned Opcode, Register Reg, const MachineRegisterInfo &MRI)
See if Reg is defined by a single def instruction that is Opcode.
Definition: Utils.cpp:639
int popcount(T Value) noexcept
Count the number of set bits in a value.
Definition: bit.h:385
const ConstantFP * getConstantFPVRegVal(Register VReg, const MachineRegisterInfo &MRI)
Definition: Utils.cpp:452
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
uint64_t divideCeil(uint64_t Numerator, uint64_t Denominator)
Returns the integer ceil(Numerator / Denominator).
Definition: MathExtras.h:431
detail::scope_exit< std::decay_t< Callable > > make_scope_exit(Callable &&F)
Definition: ScopeExit.h:59
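A generic usage sketch of make_scope_exit (the flag and the surrounding function are invented for this note): the callable runs when the returned scope_exit object is destroyed, however the scope is left.
#include "llvm/ADT/ScopeExit.h"
static void scopeExitExample(bool &InProgress) {
  InProgress = true;
  // Reset the flag on every exit path, including early returns.
  auto Restore = llvm::make_scope_exit([&] { InProgress = false; });
  // ... work that may return early ...
}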
const llvm::fltSemantics & getFltSemanticForLLT(LLT Ty)
Get the appropriate floating point arithmetic semantic based on the bit size of the given scalar LLT.
void * PointerTy
Definition: GenericValue.h:21
int bit_width(T Value)
Returns the number of bits needed to represent Value if Value is nonzero.
Definition: bit.h:317
uint64_t PowerOf2Ceil(uint64_t A)
Returns the power of two which is greater than or equal to the given value.
Definition: MathExtras.h:372
std::optional< int64_t > getIConstantVRegSExtVal(Register VReg, const MachineRegisterInfo &MRI)
If VReg is defined by a G_CONSTANT that fits in int64_t, returns it.
Definition: Utils.cpp:307
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
Definition: MathExtras.h:275
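Worked examples for the integer helpers referenced above; the inputs are arbitrary values chosen for this note.
#include <cassert>
#include "llvm/ADT/bit.h"
#include "llvm/Support/MathExtras.h"
static void integerHelperExamples() {
  assert(llvm::Log2_32_Ceil(96) == 7);    // next power of two >= 96 is 128 = 2^7
  assert(llvm::popcount(0xF0u) == 4);     // four bits set
  assert(llvm::divideCeil(70, 64) == 2);  // e.g. number of 64-bit pieces holding 70 bits
  assert(llvm::bit_width(96u) == 7);      // bits needed to represent 96
  assert(llvm::PowerOf2Ceil(96) == 128);  // smallest power of two >= 96
  static_assert(llvm::isPowerOf2_32(128) && !llvm::isPowerOf2_32(96),
                "isPowerOf2_32 is constexpr, so it can be checked at compile time");
}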
std::function< std::pair< unsigned, LLT >(const LegalityQuery &)> LegalizeMutation
@ Mul
Product of integers.
@ FMul
Product of floats.
@ Add
Sum of integers.
@ TowardPositive
roundTowardPositive.
@ TowardNegative
roundTowardNegative.
void eraseInstr(MachineInstr &MI, MachineRegisterInfo &MRI, LostDebugLocObserver *LocObserver=nullptr)
Definition: Utils.cpp:1660
@ DS_Warning
std::optional< ValueAndVReg > getIConstantVRegValWithLookThrough(Register VReg, const MachineRegisterInfo &MRI, bool LookThroughInstrs=true)
If VReg is defined by a statically evaluable chain of instructions rooted on a G_CONSTANT returns its...
Definition: Utils.cpp:426
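A hedged sketch of how the two constant-lookup utilities above are commonly combined; the helper name and the 16-bit threshold are assumptions of this note.
#include <optional>
#include "llvm/CodeGen/GlobalISel/Utils.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/Support/MathExtras.h"
static bool isSmallImmediate(llvm::Register Reg,
                             const llvm::MachineRegisterInfo &MRI) {
  // Fast path: Reg is directly defined by a G_CONSTANT.
  if (std::optional<int64_t> Imm = llvm::getIConstantVRegSExtVal(Reg, MRI))
    return llvm::isInt<16>(*Imm);
  // Otherwise also look through copies/extensions to reach the G_CONSTANT.
  if (auto ValAndVReg = llvm::getIConstantVRegValWithLookThrough(Reg, MRI))
    return ValAndVReg->Value.isSignedIntN(16);
  return false;
}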
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Definition: STLExtras.h:1879
Align commonAlignment(Align A, uint64_t Offset)
Returns the alignment that satisfies both alignments.
Definition: Alignment.h:212
unsigned Log2(Align A)
Returns the log2 of the alignment.
Definition: Alignment.h:208
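A small illustration of the alignment helpers above; the concrete values are examples for this note.
#include <cassert>
#include "llvm/Support/Alignment.h"
static void alignmentExamples() {
  const llvm::Align A(16);
  assert(llvm::Log2(A) == 4);                            // 16 == 1 << 4
  assert(llvm::commonAlignment(A, /*Offset=*/8) == llvm::Align(8));
  assert(llvm::commonAlignment(A, /*Offset=*/32) == A);  // offset keeps 16-byte alignment
}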
T bit_floor(T Value)
Returns the largest integral power of two no greater than Value if Value is nonzero.
Definition: bit.h:327
constexpr uint64_t NextPowerOf2(uint64_t A)
Returns the next power of two (in 64 bits) that is strictly greater than A.
Definition: MathExtras.h:360
@ Enable
Enable colors.
std::function< bool(const LegalityQuery &)> LegalityPredicate
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition: BitVector.h:860
#define N
static constexpr uint64_t encode(Fields... Values)
static const fltSemantics & IEEEsingle() LLVM_READNONE
Definition: APFloat.cpp:271
static const fltSemantics & IEEEdouble() LLVM_READNONE
Definition: APFloat.cpp:272
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition: Alignment.h:39
uint64_t value() const
This is a hole in the type system and should not be abused.
Definition: Alignment.h:85
MCRegister getRegister() const
static ArgDescriptor createRegister(Register Reg, unsigned Mask=~0u)
DenormalModeKind Input
Denormal treatment kind for floating point instruction inputs in the default floating-point environme...
@ PreserveSign
The sign of a flushed-to-zero number is preserved in the sign of 0.
@ Dynamic
Denormals have unknown treatment.
static constexpr DenormalMode getPreserveSign()
static constexpr DenormalMode getIEEE()
bool isZero() const
Returns true if value is all zero.
Definition: KnownBits.h:76
The LegalityQuery object bundles together all the information that's needed to decide whether a given...
ArrayRef< MemDesc > MMODescrs
Operations which require memory can use this to place requirements on the memory type for each MMO.
ArrayRef< LLT > Types
Matching combinators.
This class contains a discriminated union of information about pointers in memory operands,...
static MachinePointerInfo getGOT(MachineFunction &MF)
Return a MachinePointerInfo record that refers to a GOT entry.
DenormalMode FP64FP16Denormals
If this is set, neither input nor output denormals are flushed for both f64 and f16/v2f16 instructions...
bool IEEE
Floating point opcodes that support exception flag gathering, quiet and propagate signaling NaN inputs...
DenormalMode FP32Denormals
If this is set, neither input nor output denormals are flushed for most f32 instructions.
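A minimal sketch tying together the mode entries above; the query function is an invention of this note, and the header name is assumed to be the target-local SIModeRegisterDefaults.h.
#include "SIModeRegisterDefaults.h"       // assumed location of the struct
#include "llvm/ADT/FloatingPointMode.h"
// Hypothetical query: does this mode flush f32 denormals at all?
static bool flushesF32Denormals(const llvm::SIModeRegisterDefaults &Mode) {
  // getIEEE() models "no flushing"; e.g. getPreserveSign() flushes while
  // preserving the sign of zero, as described above.
  return Mode.FP32Denormals != llvm::DenormalMode::getIEEE();
}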