LLVM 23.0.0git
AMDGPULegalizerInfo.cpp
Go to the documentation of this file.
1//===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8/// \file
9/// This file implements the targeting of the Machinelegalizer class for
10/// AMDGPU.
11/// \todo This should be generated by TableGen.
12//===----------------------------------------------------------------------===//
13
14#include "AMDGPULegalizerInfo.h"
15
16#include "AMDGPU.h"
18#include "AMDGPUInstrInfo.h"
19#include "AMDGPUMemoryUtils.h"
20#include "AMDGPUTargetMachine.h"
22#include "SIInstrInfo.h"
24#include "SIRegisterInfo.h"
26#include "llvm/ADT/ScopeExit.h"
37#include "llvm/IR/IntrinsicsAMDGPU.h"
38#include "llvm/IR/IntrinsicsR600.h"
39
40#define DEBUG_TYPE "amdgpu-legalinfo"
41
42using namespace llvm;
43using namespace LegalizeActions;
44using namespace LegalizeMutations;
45using namespace LegalityPredicates;
46using namespace MIPatternMatch;
47
48// Hack until load/store selection patterns support any tuple of legal types.
50 "amdgpu-global-isel-new-legality",
51 cl::desc("Use GlobalISel desired legality, rather than try to use"
52 "rules compatible with selection patterns"),
53 cl::init(false),
55
56static constexpr unsigned MaxRegisterSize = 1024;
57
// Round the number of elements to the next power of two elements
// NOTE(review): the signature line of this definition is missing from this
// extraction; the body reads a vector LLT `Ty` — restore from upstream.
  unsigned NElts = Ty.getNumElements();
  // Next power of two >= NElts.
  unsigned Pow2NElts = 1 << Log2_32_Ceil(NElts);
  return Ty.changeElementCount(ElementCount::getFixed(Pow2NElts));
}
64
// Round the number of bits to the next power of two bits
// NOTE(review): the signature line of this definition is missing from this
// extraction; the body reads an LLT `Ty` — restore from upstream.
  unsigned Bits = Ty.getSizeInBits();
  // Next power of two >= Bits, returned as a scalar type.
  unsigned Pow2Bits = 1 << Log2_32_Ceil(Bits);
  return LLT::scalar(Pow2Bits);
}
71
72/// \returns true if this is an odd sized vector which should widen by adding an
73/// additional element. This is mostly to handle <3 x s16> -> <4 x s16>. This
74/// excludes s1 vectors, which should always be scalarized.
75static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
76 return [=](const LegalityQuery &Query) {
77 const LLT Ty = Query.Types[TypeIdx];
78 if (!Ty.isVector())
79 return false;
80
81 const LLT EltTy = Ty.getElementType();
82 const unsigned EltSize = EltTy.getSizeInBits();
83 return Ty.getNumElements() % 2 != 0 &&
84 EltSize > 1 && EltSize < 32 &&
85 Ty.getSizeInBits() % 32 != 0;
86 };
87}
88
89static LegalityPredicate sizeIsMultipleOf32(unsigned TypeIdx) {
90 return [=](const LegalityQuery &Query) {
91 const LLT Ty = Query.Types[TypeIdx];
92 return Ty.getSizeInBits() % 32 == 0;
93 };
94}
95
96static LegalityPredicate isWideVec16(unsigned TypeIdx) {
97 return [=](const LegalityQuery &Query) {
98 const LLT Ty = Query.Types[TypeIdx];
99 const LLT EltTy = Ty.getScalarType();
100 return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2;
101 };
102}
103
104static LegalizeMutation oneMoreElement(unsigned TypeIdx) {
105 return [=](const LegalityQuery &Query) {
106 const LLT Ty = Query.Types[TypeIdx];
107 const LLT EltTy = Ty.getElementType();
108 return std::pair(TypeIdx,
109 LLT::fixed_vector(Ty.getNumElements() + 1, EltTy));
110 };
111}
112
// NOTE(review): the signature line of this definition is missing from this
// extraction; it is a LegalizeMutation factory over `TypeIdx` — restore from
// upstream. The mutation splits a vector across 64-bit pieces.
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getElementType();
    unsigned Size = Ty.getSizeInBits();
    // Number of 64-bit pieces the type occupies, rounded up.
    unsigned Pieces = (Size + 63) / 64;
    // Element count per piece, rounded up.
    unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
    return std::pair(TypeIdx, LLT::scalarOrVector(
                                  ElementCount::getFixed(NewNumElts), EltTy));
  };
}
124
125// Increase the number of vector elements to reach the next multiple of 32-bit
126// type.
127static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) {
128 return [=](const LegalityQuery &Query) {
129 const LLT Ty = Query.Types[TypeIdx];
130
131 const LLT EltTy = Ty.getElementType();
132 const int Size = Ty.getSizeInBits();
133 const int EltSize = EltTy.getSizeInBits();
134 const int NextMul32 = (Size + 31) / 32;
135
136 assert(EltSize < 32);
137
138 const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize;
139 return std::pair(TypeIdx, LLT::fixed_vector(NewNumElts, EltTy));
140 };
141}
142
// Retrieves the scalar type that's the same size as the mem desc
// NOTE(review): the signature line is missing from this extraction; this is a
// mutation factory over `TypeIdx` — restore from upstream.
  return [=](const LegalityQuery &Query) {
    // Width in bits of the first memory operand's memory type.
    unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
    return std::make_pair(TypeIdx, LLT::scalar(MemSize));
  };
}
150
// Increase the number of vector elements to reach the next legal RegClass.
// NOTE(review): the signature line is missing from this extraction; this is a
// LegalizeMutation factory over `TypeIdx` — restore from upstream.
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const unsigned NumElts = Ty.getNumElements();
    const unsigned EltSize = Ty.getElementType().getSizeInBits();
    // Hard cap implied by the 1024-bit maximum register size.
    const unsigned MaxNumElts = MaxRegisterSize / EltSize;

    assert(EltSize == 32 || EltSize == 64);
    assert(Ty.getSizeInBits() < MaxRegisterSize);

    unsigned NewNumElts;
    // Find the nearest legal RegClass that is larger than the current type.
    for (NewNumElts = NumElts; NewNumElts < MaxNumElts; ++NewNumElts) {
      if (SIRegisterInfo::getSGPRClassForBitWidth(NewNumElts * EltSize))
        break;
    }
    return std::pair(TypeIdx,
                     LLT::fixed_vector(NewNumElts, Ty.getElementType()));
  };
}
172
// NOTE(review): the signature line is missing from this extraction; maps a
// buffer-resource pointer type (or vector thereof) to s128 / <N x s128>.
  if (!Ty.isVector())
    return LLT::scalar(128);
  const ElementCount NumElems = Ty.getElementCount();
  return LLT::vector(NumElems, LLT::scalar(128));
}
179
// NOTE(review): the signature line is missing from this extraction; maps a
// buffer-resource pointer type (or vector thereof) to <4 x s32> per pointer.
  if (!Ty.isVector())
    return LLT::fixed_vector(4, LLT::scalar(32));
  const unsigned NumElems = Ty.getElementCount().getFixedValue();
  return LLT::fixed_vector(NumElems * 4, LLT::scalar(32));
}
186
// NOTE(review): the signature line is missing from this extraction; callers
// (bitcastToRegisterType below) use this as getBitcastRegisterType(Ty).
  const unsigned Size = Ty.getSizeInBits();

  if (Size <= 32) {
    // <2 x s8> -> s16
    // <4 x s8> -> s32
    return LLT::scalar(Size);
  }

  // NOTE(review): the return statement for the > 32-bit case is missing from
  // this extraction — restore from upstream before use.
}
198
199static LegalizeMutation bitcastToRegisterType(unsigned TypeIdx) {
200 return [=](const LegalityQuery &Query) {
201 const LLT Ty = Query.Types[TypeIdx];
202 return std::pair(TypeIdx, getBitcastRegisterType(Ty));
203 };
204}
205
// NOTE(review): the signature line is missing from this extraction; appears to
// be a LegalizeMutation factory over `TypeIdx` — restore from upstream.
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    unsigned Size = Ty.getSizeInBits();
    // Only types that decompose evenly into 32-bit lanes are expected.
    assert(Size % 32 == 0);
    // NOTE(review): the second argument of this std::pair is missing from this
    // extraction — restore from upstream before use.
    return std::pair(
  };
}
215
216static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) {
217 return [=](const LegalityQuery &Query) {
218 const LLT QueryTy = Query.Types[TypeIdx];
219 return QueryTy.isVector() && QueryTy.getSizeInBits() < Size;
220 };
221}
222
223static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) {
224 return [=](const LegalityQuery &Query) {
225 const LLT QueryTy = Query.Types[TypeIdx];
226 return QueryTy.isVector() && QueryTy.getSizeInBits() > Size;
227 };
228}
229
230static LegalityPredicate numElementsNotEven(unsigned TypeIdx) {
231 return [=](const LegalityQuery &Query) {
232 const LLT QueryTy = Query.Types[TypeIdx];
233 return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0;
234 };
235}
236
static bool isRegisterSize(const GCNSubtarget &ST, unsigned Size) {
  // 16-bit values are register-sized only with real true16 instructions;
  // otherwise the size must be a 32-bit multiple.
  return ((ST.useRealTrue16Insts() && Size == 16) || Size % 32 == 0) &&
  // NOTE(review): the trailing conjunct (presumably an upper bound on Size) is
  // missing from this extraction — restore from upstream before use.
}
241
// NOTE(review): the signature line is missing from this extraction; the body
// takes a vector element type `EltTy` — restore from upstream.
  const int EltSize = EltTy.getSizeInBits();
  // 16-bit elements and 32-bit-multiple elements are directly representable.
  return EltSize == 16 || EltSize % 32 == 0;
}
246
247static bool isRegisterVectorType(LLT Ty) {
248 const int EltSize = Ty.getElementType().getSizeInBits();
249 return EltSize == 32 || EltSize == 64 ||
250 (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
251 EltSize == 128 || EltSize == 256;
252}
253
254// TODO: replace all uses of isRegisterType with isRegisterClassType
255static bool isRegisterType(const GCNSubtarget &ST, LLT Ty) {
256 if (!isRegisterSize(ST, Ty.getSizeInBits()))
257 return false;
258
259 if (Ty.isVector())
260 return isRegisterVectorType(Ty);
261
262 return true;
263}
264
// Any combination of 32 or 64-bit elements up the maximum register size, and
// multiples of v2s16.
// NOTE(review): the first line of this signature is missing from this
// extraction; it returns a LegalityPredicate bound to `ST` and `TypeIdx`.
                                              unsigned TypeIdx) {
  return [=, &ST](const LegalityQuery &Query) {
    return isRegisterType(ST, Query.Types[TypeIdx]);
  };
}
273
// RegisterType that doesn't have a corresponding RegClass.
// TODO: Once `isRegisterType` is replaced with `isRegisterClassType` this
// should be removed.
// NOTE(review): the first line of this signature is missing from this
// extraction; it returns a LegalityPredicate bound to `ST` and `TypeIdx`.
                                              unsigned TypeIdx) {
  return [=, &ST](const LegalityQuery &Query) {
    LLT Ty = Query.Types[TypeIdx];
    // A valid register type with no SGPR class of that exact bit width.
    return isRegisterType(ST, Ty) &&
           !SIRegisterInfo::getSGPRClassForBitWidth(Ty.getSizeInBits());
  };
}
285
286static LegalityPredicate elementTypeIsLegal(unsigned TypeIdx) {
287 return [=](const LegalityQuery &Query) {
288 const LLT QueryTy = Query.Types[TypeIdx];
289 if (!QueryTy.isVector())
290 return false;
291 const LLT EltTy = QueryTy.getElementType();
292 return EltTy == LLT::scalar(16) || EltTy.getSizeInBits() >= 32;
293 };
294}
295
296constexpr LLT S1 = LLT::scalar(1);
297constexpr LLT S8 = LLT::scalar(8);
298constexpr LLT S16 = LLT::scalar(16);
299constexpr LLT S32 = LLT::scalar(32);
300constexpr LLT F32 = LLT::float32();
301constexpr LLT S64 = LLT::scalar(64);
302constexpr LLT F64 = LLT::float64();
303constexpr LLT S96 = LLT::scalar(96);
304constexpr LLT S128 = LLT::scalar(128);
305constexpr LLT S160 = LLT::scalar(160);
306constexpr LLT S192 = LLT::scalar(192);
307constexpr LLT S224 = LLT::scalar(224);
308constexpr LLT S256 = LLT::scalar(256);
309constexpr LLT S512 = LLT::scalar(512);
310constexpr LLT S1024 = LLT::scalar(1024);
312
313constexpr LLT V2S8 = LLT::fixed_vector(2, 8);
314constexpr LLT V2S16 = LLT::fixed_vector(2, 16);
315constexpr LLT V4S16 = LLT::fixed_vector(4, 16);
316constexpr LLT V6S16 = LLT::fixed_vector(6, 16);
317constexpr LLT V8S16 = LLT::fixed_vector(8, 16);
318constexpr LLT V10S16 = LLT::fixed_vector(10, 16);
319constexpr LLT V12S16 = LLT::fixed_vector(12, 16);
320constexpr LLT V16S16 = LLT::fixed_vector(16, 16);
321
323constexpr LLT V2BF16 = V2F16; // FIXME
324
325constexpr LLT V2S32 = LLT::fixed_vector(2, 32);
326constexpr LLT V3S32 = LLT::fixed_vector(3, 32);
327constexpr LLT V4S32 = LLT::fixed_vector(4, 32);
328constexpr LLT V5S32 = LLT::fixed_vector(5, 32);
329constexpr LLT V6S32 = LLT::fixed_vector(6, 32);
330constexpr LLT V7S32 = LLT::fixed_vector(7, 32);
331constexpr LLT V8S32 = LLT::fixed_vector(8, 32);
332constexpr LLT V9S32 = LLT::fixed_vector(9, 32);
333constexpr LLT V10S32 = LLT::fixed_vector(10, 32);
334constexpr LLT V11S32 = LLT::fixed_vector(11, 32);
335constexpr LLT V12S32 = LLT::fixed_vector(12, 32);
336constexpr LLT V16S32 = LLT::fixed_vector(16, 32);
337constexpr LLT V32S32 = LLT::fixed_vector(32, 32);
338
339constexpr LLT V2S64 = LLT::fixed_vector(2, 64);
340constexpr LLT V3S64 = LLT::fixed_vector(3, 64);
341constexpr LLT V4S64 = LLT::fixed_vector(4, 64);
342constexpr LLT V5S64 = LLT::fixed_vector(5, 64);
343constexpr LLT V6S64 = LLT::fixed_vector(6, 64);
344constexpr LLT V7S64 = LLT::fixed_vector(7, 64);
345constexpr LLT V8S64 = LLT::fixed_vector(8, 64);
346constexpr LLT V16S64 = LLT::fixed_vector(16, 64);
347
348constexpr LLT V2S128 = LLT::fixed_vector(2, 128);
349constexpr LLT V4S128 = LLT::fixed_vector(4, 128);
350
351constexpr std::initializer_list<LLT> AllScalarTypes = {
353
354constexpr std::initializer_list<LLT> AllS16Vectors{
356
357constexpr std::initializer_list<LLT> AllS32Vectors = {
360
361constexpr std::initializer_list<LLT> AllS64Vectors = {
363
369
// Checks whether a type is in the list of legal register types.
static bool isRegisterClassType(const GCNSubtarget &ST, LLT Ty) {
  // Pointers are compared via their scalar-integer equivalent.
  if (Ty.isPointerOrPointerVector())
    Ty = Ty.changeElementType(LLT::scalar(Ty.getScalarSizeInBits()));

  // NOTE(review): the list-membership checks of this return are partially
  // missing from this extraction — only the true16 S16 clause survives;
  // restore from upstream before use.
         (ST.useRealTrue16Insts() && Ty == S16) ||
}
380
// NOTE(review): the first line of this signature is missing from this
// extraction; it returns a LegalityPredicate bound to `ST` and `TypeIdx`.
                                              unsigned TypeIdx) {
  return [&ST, TypeIdx](const LegalityQuery &Query) {
    return isRegisterClassType(ST, Query.Types[TypeIdx]);
  };
}
387
// If we have a truncating store or an extending load with a data size larger
// than 32-bits, we need to reduce to a 32-bit type.
// NOTE(review): the signature line is missing from this extraction; the
// predicate below invokes this as isWideScalarExtLoadTruncStore(TypeIdx).
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    // Scalar wider than 32 bits whose memory type is narrower than the
    // register type.
    return !Ty.isVector() && Ty.getSizeInBits() > 32 &&
           Query.MMODescrs[0].MemoryTy.getSizeInBits() < Ty.getSizeInBits();
  };
}
397
// If we have a truncating store or an extending load with a data size larger
// than 32-bits and mem location is a power of 2
// NOTE(review): the signature line is missing from this extraction — restore
// from upstream.
  return [=](const LegalityQuery &Query) {
    unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
    // Same condition as isWideScalarExtLoadTruncStore, plus power-of-two
    // memory size.
    return isWideScalarExtLoadTruncStore(TypeIdx)(Query) &&
           isPowerOf2_64(MemSize);
  };
}
407
// TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we
// handle some operations by just promoting the register during
// selection. There are also d16 loads on GFX9+ which preserve the high bits.
static unsigned maxSizeForAddrSpace(const GCNSubtarget &ST, unsigned AS,
                                    bool IsLoad, bool IsAtomic) {
  // Returns the maximum access size, in bits, for the given address space.
  // NOTE(review): the `case` labels of this switch are missing from this
  // extraction — restore from upstream before use.
  switch (AS) {
    // FIXME: Private element size.
    return ST.hasFlatScratchEnabled() ? 128 : 32;
    return ST.useDS128() ? 128 : 64;
  // Treat constant and global as identical. SMRD loads are sometimes usable for
  // global loads (ideally constant address space should be eliminated)
  // depending on the context. Legality cannot be context dependent, but
  // RegBankSelect can split the load as necessary depending on the pointer
  // register bank/uniformity and if the memory is invariant or not written in a
  // kernel.
  return IsLoad ? 512 : 128;
  default:
    // FIXME: Flat addresses may contextually need to be split to 32-bit parts
    // if they may alias scratch depending on the subtarget. This needs to be
    // moved to custom handling to use addressMayBeAccessedAsPrivate
    return ST.hasMultiDwordFlatScratchAddressing() || IsAtomic ? 128 : 32;
  }
}
437
// Returns true if the register/memory size combination of a load or store
// query is legal for the subtarget.
static bool isLoadStoreSizeLegal(const GCNSubtarget &ST,
                                 const LegalityQuery &Query) {
  const LLT Ty = Query.Types[0];

  // Handle G_LOAD, G_ZEXTLOAD, G_SEXTLOAD
  const bool IsLoad = Query.Opcode != AMDGPU::G_STORE;

  unsigned RegSize = Ty.getSizeInBits();
  uint64_t MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
  uint64_t AlignBits = Query.MMODescrs[0].AlignInBits;
  unsigned AS = Query.Types[1].getAddressSpace();

  // All of these need to be custom lowered to cast the pointer operand.
  // NOTE(review): the condition guarding this early-out is missing from this
  // extraction — restore from upstream before use.
    return false;

  // Do not handle extending vector loads.
  if (Ty.isVector() && MemSize != RegSize)
    return false;

  // TODO: We should be able to widen loads if the alignment is high enough, but
  // we also need to modify the memory access size.
#if 0
  // Accept widening loads based on alignment.
  if (IsLoad && MemSize < Size)
    MemSize = std::max(MemSize, Align);
#endif

  // Only 1-byte and 2-byte to 32-bit extloads are valid.
  if (MemSize != RegSize && RegSize != 32)
    return false;

  // NOTE(review): the tail of this condition (the ordering comparand and
  // closing parens) is truncated in this extraction — restore from upstream.
  if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad,
                                    Query.MMODescrs[0].Ordering !=
    return false;

  switch (MemSize) {
  case 8:
  case 16:
  case 32:
  case 64:
  case 128:
    break;
  case 96:
    if (!ST.hasDwordx3LoadStores())
      return false;
    break;
  case 256:
  case 512:
    // These may contextually need to be broken down.
    break;
  default:
    return false;
  }

  assert(RegSize >= MemSize);

  // Under-aligned accesses are only legal if the target tolerates the
  // misalignment for this address space.
  if (AlignBits < MemSize) {
    const SITargetLowering *TLI = ST.getTargetLowering();
    if (!TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS,
                                                 Align(AlignBits / 8)))
      return false;
  }

  return true;
}
505
506// The newer buffer intrinsic forms take their resource arguments as
507// pointers in address space 8, aka s128 values. However, in order to not break
508// SelectionDAG, the underlying operations have to continue to take v4i32
509// arguments. Therefore, we convert resource pointers - or vectors of them
510// to integer values here.
511static bool hasBufferRsrcWorkaround(const LLT Ty) {
512 if (Ty.isPointer() && Ty.getAddressSpace() == AMDGPUAS::BUFFER_RESOURCE)
513 return true;
514 if (Ty.isVector()) {
515 const LLT ElemTy = Ty.getElementType();
516 return hasBufferRsrcWorkaround(ElemTy);
517 }
518 return false;
519}
520
// The current selector can't handle <6 x s16>, <8 x s16>, s96, s128 etc, so
// workaround this. Eventually it should ignore the type for loads and only care
// about the size. Return true in cases where we will workaround this for now by
// bitcasting.
static bool loadStoreBitcastWorkaround(const LLT Ty) {
  // NOTE(review): the condition for this first early-out is missing from this
  // extraction — restore from upstream before use.
    return false;

  const unsigned Size = Ty.getSizeInBits();
  if (Ty.isPointerVector())
    return true;
  if (Size <= 64)
    return false;
  // Address space 8 pointers get their own workaround.
  // NOTE(review): the condition for this early-out is also missing from this
  // extraction.
    return false;
  if (!Ty.isVector())
    return true;

  // Wide vectors are bitcast unless the element type is 32 or 64 bits.
  unsigned EltSize = Ty.getScalarSizeInBits();
  return EltSize != 32 && EltSize != 64;
}
543
static bool isLoadStoreLegal(const GCNSubtarget &ST, const LegalityQuery &Query) {
  const LLT Ty = Query.Types[0];
  // Legal when the type is a register type and the size/alignment checks pass.
  // NOTE(review): the final conjunct of this return is truncated in this
  // extraction — restore from upstream before use.
  return isRegisterType(ST, Ty) && isLoadStoreSizeLegal(ST, Query) &&
}
549
/// Return true if a load or store of the type should be lowered with a bitcast
/// to a different type.
static bool shouldBitcastLoadStoreType(const GCNSubtarget &ST, const LLT Ty,
                                       const LLT MemTy) {
  const unsigned MemSizeInBits = MemTy.getSizeInBits();
  const unsigned Size = Ty.getSizeInBits();
  // Extending/truncating access: only small vectors get the bitcast lowering.
  if (Size != MemSizeInBits)
    return Size <= 32 && Ty.isVector();

  // NOTE(review): the condition guarding this early `return true` is missing
  // from this extraction — restore from upstream before use.
    return true;

  // Don't try to handle bitcasting vector ext loads for now.
  return Ty.isVector() && (!MemTy.isVector() || MemTy == Ty) &&
         (Size <= 32 || isRegisterSize(ST, Size)) &&
         !isRegisterVectorElementType(Ty.getElementType());
}
567
/// Return true if we should legalize a load by widening an odd sized memory
/// access up to the alignment. Note this case when the memory access itself
/// changes, not the size of the result register.
static bool shouldWidenLoad(const GCNSubtarget &ST, LLT MemoryTy,
                            uint64_t AlignInBits, unsigned AddrSpace,
                            unsigned Opcode) {
  unsigned SizeInBits = MemoryTy.getSizeInBits();
  // We don't want to widen cases that are naturally legal.
  if (isPowerOf2_32(SizeInBits))
    return false;

  // If we have 96-bit memory operations, we shouldn't touch them. Note we may
  // end up widening these for a scalar load during RegBankSelect, if we don't
  // have 96-bit scalar loads.
  if (SizeInBits == 96 && ST.hasDwordx3LoadStores())
    return false;

  // Never widen beyond the address space's maximum access size.
  if (SizeInBits >= maxSizeForAddrSpace(ST, AddrSpace, Opcode, false))
    return false;

  // A load is known dereferenceable up to the alignment, so it's legal to widen
  // to it.
  //
  // TODO: Could check dereferenceable for less aligned cases.
  unsigned RoundedSize = NextPowerOf2(SizeInBits);
  if (AlignInBits < RoundedSize)
    return false;

  // Do not widen if it would introduce a slow unaligned load.
  const SITargetLowering *TLI = ST.getTargetLowering();
  unsigned Fast = 0;
  // NOTE(review): the call that sets `Fast` (and the surrounding return
  // expression) is partially missing from this extraction — restore from
  // upstream before use.
      RoundedSize, AddrSpace, Align(AlignInBits / 8),
      Fast;
}
604
605static bool shouldWidenLoad(const GCNSubtarget &ST, const LegalityQuery &Query,
606 unsigned Opcode) {
607 if (Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic)
608 return false;
609
610 return shouldWidenLoad(ST, Query.MMODescrs[0].MemoryTy,
611 Query.MMODescrs[0].AlignInBits,
612 Query.Types[1].getAddressSpace(), Opcode);
613}
614
/// Mutates IR (typicaly a load instruction) to use a <4 x s32> as the initial
/// type of the operand `idx` and then to transform it to a `p8` via bitcasts
/// and inttoptr. In addition, handle vectors of p8. Returns the new type.
// NOTE(review): the first signature line is missing from this extraction; the
// body uses a MachineInstr `MI` and MachineIRBuilder `B` — restore from
// upstream.
                                 MachineRegisterInfo &MRI, unsigned Idx) {
  MachineOperand &MO = MI.getOperand(Idx);

  const LLT PointerTy = MRI.getType(MO.getReg());

  // Paranoidly prevent us from doing this multiple times.
  // NOTE(review): the condition for this early-out is missing from this
  // extraction — restore from upstream before use.
    return PointerTy;

  const LLT ScalarTy = getBufferRsrcScalarType(PointerTy);
  const LLT VectorTy = getBufferRsrcRegisterType(PointerTy);
  if (!PointerTy.isVector()) {
    // Happy path: (4 x s32) -> (s32, s32, s32, s32) -> (p8)
    const unsigned NumParts = PointerTy.getSizeInBits() / 32;
    const LLT S32 = LLT::scalar(32);

    Register VectorReg = MRI.createGenericVirtualRegister(VectorTy);
    std::array<Register, 4> VectorElems;
    // Insert the reassembly after the current instruction.
    B.setInsertPt(B.getMBB(), ++B.getInsertPt());
    for (unsigned I = 0; I < NumParts; ++I)
      VectorElems[I] =
          B.buildExtractVectorElementConstant(S32, VectorReg, I).getReg(0);
    B.buildMergeValues(MO, VectorElems);
    MO.setReg(VectorReg);
    return VectorTy;
  }
  // Vector-of-p8 path: bitcast through the s128 scalar form, then inttoptr.
  Register BitcastReg = MRI.createGenericVirtualRegister(VectorTy);
  B.setInsertPt(B.getMBB(), ++B.getInsertPt());
  auto Scalar = B.buildBitcast(ScalarTy, BitcastReg);
  B.buildIntToPtr(MO, Scalar);
  MO.setReg(BitcastReg);

  return VectorTy;
}
653
/// Cast a buffer resource (an address space 8 pointer) into a 4xi32, which is
/// the form in which the value must be in order to be passed to the low-level
/// representations used for MUBUF/MTBUF intrinsics. This is a hack, which is
/// needed in order to account for the fact that we can't define a register
/// class for s128 without breaking SelectionDAG.
// NOTE(review): the signature line is missing from this extraction; the body
// reads a `Pointer` register and a MachineIRBuilder `B` — restore from
// upstream.
  MachineRegisterInfo &MRI = *B.getMRI();
  const LLT PointerTy = MRI.getType(Pointer);
  const LLT ScalarTy = getBufferRsrcScalarType(PointerTy);
  const LLT VectorTy = getBufferRsrcRegisterType(PointerTy);

  if (!PointerTy.isVector()) {
    // Special case: p8 -> (s32, s32, s32, s32) -> (4xs32)
    SmallVector<Register, 4> PointerParts;
    const unsigned NumParts = PointerTy.getSizeInBits() / 32;
    auto Unmerged = B.buildUnmerge(LLT::scalar(32), Pointer);
    for (unsigned I = 0; I < NumParts; ++I)
      PointerParts.push_back(Unmerged.getReg(I));
    return B.buildBuildVector(VectorTy, PointerParts).getReg(0);
  }
  // Vector of p8: go through the s128 scalar form, then bitcast to s32 lanes.
  Register Scalar = B.buildPtrToInt(ScalarTy, Pointer).getReg(0);
  return B.buildBitcast(VectorTy, Scalar).getReg(0);
}
677
// NOTE(review): the first signature line is missing from this extraction; the
// body uses a MachineInstr `MI` and MachineIRBuilder `B` — restore from
// upstream.
                    unsigned Idx) {
  MachineOperand &MO = MI.getOperand(Idx);

  const LLT PointerTy = B.getMRI()->getType(MO.getReg());
  // Paranoidly prevent us from doing this multiple times.
  // NOTE(review): the guard condition and the final rewriting call are
  // truncated in this extraction — restore from upstream before use.
    return;
}
688
690 const GCNTargetMachine &TM)
691 : ST(ST_) {
692 using namespace TargetOpcode;
693
694 auto GetAddrSpacePtr = [&TM](unsigned AS) {
695 return LLT::pointer(AS, TM.getPointerSizeInBits(AS));
696 };
697
698 const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS);
699 const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS);
700 const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT);
701 const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS);
702 const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS);
703 const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS);
704 const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS);
705 const LLT BufferFatPtr = GetAddrSpacePtr(AMDGPUAS::BUFFER_FAT_POINTER);
706 const LLT RsrcPtr = GetAddrSpacePtr(AMDGPUAS::BUFFER_RESOURCE);
707 const LLT BufferStridedPtr =
708 GetAddrSpacePtr(AMDGPUAS::BUFFER_STRIDED_POINTER);
709
710 const LLT CodePtr = FlatPtr;
711
712 const std::initializer_list<LLT> AddrSpaces64 = {
713 GlobalPtr, ConstantPtr, FlatPtr
714 };
715
716 const std::initializer_list<LLT> AddrSpaces32 = {
717 LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr
718 };
719
720 const std::initializer_list<LLT> AddrSpaces128 = {RsrcPtr};
721
722 const std::initializer_list<LLT> FPTypesBase = {
723 S32, S64
724 };
725
726 const std::initializer_list<LLT> FPTypes16 = {
727 S32, S64, S16
728 };
729
730 const std::initializer_list<LLT> FPTypesPK16 = {
731 S32, S64, S16, V2S16
732 };
733
734 const LLT MinScalarFPTy = ST.has16BitInsts() ? S16 : S32;
735
736 // s1 for VCC branches, s32 for SCC branches.
738
739 // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more
740 // elements for v3s16
743 .legalFor(AllS32Vectors)
745 .legalFor(AddrSpaces64)
746 .legalFor(AddrSpaces32)
747 .legalFor(AddrSpaces128)
748 .legalIf(isPointer(0))
749 .clampScalar(0, S16, S256)
751 .clampMaxNumElements(0, S32, 16)
753 .scalarize(0);
754
755 if (ST.hasVOP3PInsts() && ST.hasAddNoCarryInsts() && ST.hasIntClamp()) {
756 // Full set of gfx9 features.
757 if (ST.hasScalarAddSub64()) {
758 getActionDefinitionsBuilder({G_ADD, G_SUB})
759 .legalFor({S64, S32, S16, V2S16})
760 .clampMaxNumElementsStrict(0, S16, 2)
761 .scalarize(0)
762 .minScalar(0, S16)
764 .maxScalar(0, S32);
765 } else {
766 getActionDefinitionsBuilder({G_ADD, G_SUB})
767 .legalFor({S32, S16, V2S16})
768 .clampMaxNumElementsStrict(0, S16, 2)
769 .scalarize(0)
770 .minScalar(0, S16)
772 .maxScalar(0, S32);
773 }
774
775 if (ST.hasScalarSMulU64()) {
777 .legalFor({S64, S32, S16, V2S16})
778 .clampMaxNumElementsStrict(0, S16, 2)
779 .scalarize(0)
780 .minScalar(0, S16)
782 .custom();
783 } else {
785 .legalFor({S32, S16, V2S16})
786 .clampMaxNumElementsStrict(0, S16, 2)
787 .scalarize(0)
788 .minScalar(0, S16)
790 .custom();
791 }
792 assert(ST.hasMad64_32());
793
794 getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT, G_SADDSAT, G_SSUBSAT})
795 .legalFor({S32, S16, V2S16}) // Clamp modifier
796 .minScalarOrElt(0, S16)
798 .scalarize(0)
800 .lower();
801 } else if (ST.has16BitInsts()) {
802 getActionDefinitionsBuilder({G_ADD, G_SUB})
803 .legalFor({S32, S16})
804 .minScalar(0, S16)
806 .maxScalar(0, S32)
807 .scalarize(0);
808
810 .legalFor({S32, S16})
811 .scalarize(0)
812 .minScalar(0, S16)
814 .custom();
815 assert(ST.hasMad64_32());
816
817 // Technically the saturating operations require clamp bit support, but this
818 // was introduced at the same time as 16-bit operations.
819 getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
820 .legalFor({S32, S16}) // Clamp modifier
821 .minScalar(0, S16)
822 .scalarize(0)
824 .lower();
825
826 // We're just lowering this, but it helps get a better result to try to
827 // coerce to the desired type first.
828 getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
829 .minScalar(0, S16)
830 .scalarize(0)
831 .lower();
832 } else {
833 getActionDefinitionsBuilder({G_ADD, G_SUB})
834 .legalFor({S32})
835 .widenScalarToNextMultipleOf(0, 32)
836 .clampScalar(0, S32, S32)
837 .scalarize(0);
838
839 auto &Mul = getActionDefinitionsBuilder(G_MUL)
840 .legalFor({S32})
841 .scalarize(0)
842 .minScalar(0, S32)
844
845 if (ST.hasMad64_32())
846 Mul.custom();
847 else
848 Mul.maxScalar(0, S32);
849
850 if (ST.hasIntClamp()) {
851 getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
852 .legalFor({S32}) // Clamp modifier.
853 .scalarize(0)
855 .lower();
856 } else {
857 // Clamp bit support was added in VI, along with 16-bit operations.
858 getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
859 .minScalar(0, S32)
860 .scalarize(0)
861 .lower();
862 }
863
864 // FIXME: DAG expansion gets better results. The widening uses the smaller
865 // range values and goes for the min/max lowering directly.
866 getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
867 .minScalar(0, S32)
868 .scalarize(0)
869 .lower();
870 }
871
873 {G_SDIV, G_UDIV, G_SREM, G_UREM, G_SDIVREM, G_UDIVREM})
874 .customFor({S32, S64})
875 .clampScalar(0, S32, S64)
877 .scalarize(0);
878
879 auto &Mulh = getActionDefinitionsBuilder({G_UMULH, G_SMULH})
880 .legalFor({S32})
881 .maxScalar(0, S32);
882
883 if (ST.hasVOP3PInsts()) {
884 Mulh
885 .clampMaxNumElements(0, S8, 2)
886 .lowerFor({V2S8});
887 }
888
889 Mulh
890 .scalarize(0)
891 .lower();
892
893 // Report legal for any types we can handle anywhere. For the cases only legal
894 // on the SALU, RegBankSelect will be able to re-legalize.
895 getActionDefinitionsBuilder({G_AND, G_OR, G_XOR})
896 .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16})
897 .clampScalar(0, S32, S64)
903 .scalarize(0);
904
906 {G_UADDO, G_USUBO, G_UADDE, G_SADDE, G_USUBE, G_SSUBE})
907 .legalFor({{S32, S1}, {S32, S32}})
908 .clampScalar(0, S32, S32)
909 .scalarize(0);
910
912 // Don't worry about the size constraint.
914 .lower();
915
917 .legalFor({S1, S32, S64, S16, GlobalPtr,
918 LocalPtr, ConstantPtr, PrivatePtr, FlatPtr })
919 .legalIf(isPointer(0))
920 .clampScalar(0, S32, S64)
922
923 getActionDefinitionsBuilder(G_FCONSTANT)
924 .legalFor({S32, S64, S16})
925 .clampScalar(0, S16, S64);
926
927 getActionDefinitionsBuilder({G_IMPLICIT_DEF, G_FREEZE})
928 .legalIf(isRegisterClassType(ST, 0))
929 // s1 and s16 are special cases because they have legal operations on
930 // them, but don't really occupy registers in the normal way.
931 .legalFor({S1, S16})
932 .clampNumElements(0, V16S32, V32S32)
936 .clampMaxNumElements(0, S32, 16);
937
938 getActionDefinitionsBuilder(G_FRAME_INDEX).legalFor({PrivatePtr});
939
940 // If the amount is divergent, we have to do a wave reduction to get the
941 // maximum value, so this is expanded during RegBankSelect.
942 getActionDefinitionsBuilder(G_DYN_STACKALLOC)
943 .legalFor({{PrivatePtr, S32}});
944
945 getActionDefinitionsBuilder(G_STACKSAVE)
946 .customFor({PrivatePtr});
947 getActionDefinitionsBuilder(G_STACKRESTORE)
948 .legalFor({PrivatePtr});
949
950 getActionDefinitionsBuilder({G_GET_FPENV, G_SET_FPENV}).customFor({S64});
951
952 getActionDefinitionsBuilder(G_GLOBAL_VALUE)
953 .customIf(typeIsNot(0, PrivatePtr));
954
955 getActionDefinitionsBuilder(G_BLOCK_ADDR).legalFor({CodePtr});
956
957 auto &FPOpActions = getActionDefinitionsBuilder(
958 { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE,
959 G_STRICT_FADD, G_STRICT_FMUL, G_STRICT_FMA})
960 .legalFor({S32, S64});
961 auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS})
962 .customFor({S32, S64});
963 auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV)
964 .customFor({S32, S64});
965
966 if (ST.has16BitInsts()) {
967 if (ST.hasVOP3PInsts())
968 FPOpActions.legalFor({S16, V2S16});
969 else
970 FPOpActions.legalFor({S16});
971
972 TrigActions.customFor({S16});
973 FDIVActions.customFor({S16});
974 }
975
976 if (ST.hasPackedFP32Ops()) {
977 FPOpActions.legalFor({V2S32});
978 FPOpActions.clampMaxNumElementsStrict(0, S32, 2);
979 }
980
981 auto &MinNumMaxNumIeee =
982 getActionDefinitionsBuilder({G_FMINNUM_IEEE, G_FMAXNUM_IEEE});
983
984 if (ST.hasVOP3PInsts()) {
985 MinNumMaxNumIeee.legalFor(FPTypesPK16)
986 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
987 .clampMaxNumElements(0, S16, 2)
988 .clampScalar(0, S16, S64)
989 .scalarize(0);
990 } else if (ST.has16BitInsts()) {
991 MinNumMaxNumIeee.legalFor(FPTypes16).clampScalar(0, S16, S64).scalarize(0);
992 } else {
993 MinNumMaxNumIeee.legalFor(FPTypesBase)
994 .clampScalar(0, S32, S64)
995 .scalarize(0);
996 }
997
998 auto &MinNumMaxNum = getActionDefinitionsBuilder(
999 {G_FMINNUM, G_FMAXNUM, G_FMINIMUMNUM, G_FMAXIMUMNUM});
1000
1001 if (ST.hasVOP3PInsts()) {
1002 MinNumMaxNum.customFor(FPTypesPK16)
1003 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
1004 .clampMaxNumElements(0, S16, 2)
1005 .clampScalar(0, S16, S64)
1006 .scalarize(0);
1007 } else if (ST.has16BitInsts()) {
1008 MinNumMaxNum.customFor(FPTypes16)
1009 .clampScalar(0, S16, S64)
1010 .scalarize(0);
1011 } else {
1012 MinNumMaxNum.customFor(FPTypesBase)
1013 .clampScalar(0, S32, S64)
1014 .scalarize(0);
1015 }
1016
1017 if (ST.hasVOP3PInsts())
1018 FPOpActions.clampMaxNumElementsStrict(0, S16, 2);
1019
1020 FPOpActions
1021 .scalarize(0)
1022 .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
1023
1024 TrigActions
1025 .scalarize(0)
1026 .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
1027
1028 FDIVActions
1029 .scalarize(0)
1030 .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
1031
1032 getActionDefinitionsBuilder({G_FNEG, G_FABS})
1033 .legalFor(FPTypesPK16)
1035 .scalarize(0)
1036 .clampScalar(0, S16, S64);
1037
1038 if (ST.has16BitInsts()) {
1040 .legalFor({S16})
1041 .customFor({S32, S64})
1042 .scalarize(0)
1043 .unsupported();
1045 .legalFor({S32, S64, S16})
1046 .scalarize(0)
1047 .clampScalar(0, S16, S64);
1048
1049 getActionDefinitionsBuilder({G_FLDEXP, G_STRICT_FLDEXP})
1050 .legalFor({{S32, S32}, {S64, S32}, {S16, S16}})
1051 .scalarize(0)
1052 .maxScalarIf(typeIs(0, S16), 1, S16)
1053 .clampScalar(1, S32, S32)
1054 .lower();
1055
1057 .customFor({{S32, S32}, {S64, S32}, {S16, S16}, {S16, S32}})
1058 .scalarize(0)
1059 .lower();
1060
1062 .lowerFor({S16, S32, S64})
1063 .scalarize(0)
1064 .lower();
1065 } else {
1067 .customFor({S32, S64, S16})
1068 .scalarize(0)
1069 .unsupported();
1070
1071
1072 if (ST.hasFractBug()) {
1074 .customFor({S64})
1075 .legalFor({S32, S64})
1076 .scalarize(0)
1077 .clampScalar(0, S32, S64);
1078 } else {
1080 .legalFor({S32, S64})
1081 .scalarize(0)
1082 .clampScalar(0, S32, S64);
1083 }
1084
1085 getActionDefinitionsBuilder({G_FLDEXP, G_STRICT_FLDEXP})
1086 .legalFor({{S32, S32}, {S64, S32}})
1087 .scalarize(0)
1088 .clampScalar(0, S32, S64)
1089 .clampScalar(1, S32, S32)
1090 .lower();
1091
1093 .customFor({{S32, S32}, {S64, S32}})
1094 .scalarize(0)
1095 .minScalar(0, S32)
1096 .clampScalar(1, S32, S32)
1097 .lower();
1098
1100 .lowerFor({S32, S64})
1101 .scalarize(0)
1102 .lower();
1103 }
1104
1105 auto &FPTruncActions = getActionDefinitionsBuilder(G_FPTRUNC);
1106 if (ST.hasCvtPkF16F32Inst()) {
1107 FPTruncActions.legalFor({{S32, S64}, {S16, S32}, {V2S16, V2S32}})
1108 .clampMaxNumElements(0, S16, 2);
1109 } else {
1110 FPTruncActions.legalFor({{S32, S64}, {S16, S32}});
1111 }
1112 FPTruncActions.scalarize(0).lower();
1113
1115 .legalFor({{S64, S32}, {S32, S16}})
1116 .narrowScalarFor({{S64, S16}}, changeTo(0, S32))
1117 .scalarize(0);
1118
1119 auto &FSubActions = getActionDefinitionsBuilder({G_FSUB, G_STRICT_FSUB});
1120 if (ST.has16BitInsts()) {
1121 FSubActions
1122 // Use actual fsub instruction
1123 .legalFor({S32, S16})
1124 // Must use fadd + fneg
1125 .lowerFor({S64, V2S16});
1126 } else {
1127 FSubActions
1128 // Use actual fsub instruction
1129 .legalFor({S32})
1130 // Must use fadd + fneg
1131 .lowerFor({S64, S16, V2S16});
1132 }
1133
1134 FSubActions
1135 .scalarize(0)
1136 .clampScalar(0, S32, S64);
1137
1138 // Whether this is legal depends on the floating point mode for the function.
1139 auto &FMad = getActionDefinitionsBuilder(G_FMAD);
1140 if (ST.hasMadF16() && ST.hasMadMacF32Insts())
1141 FMad.customFor({S32, S16});
1142 else if (ST.hasMadMacF32Insts())
1143 FMad.customFor({S32});
1144 else if (ST.hasMadF16())
1145 FMad.customFor({S16});
1146 FMad.scalarize(0)
1147 .lower();
1148
1149 auto &FRem = getActionDefinitionsBuilder(G_FREM);
1150 if (ST.has16BitInsts()) {
1151 FRem.customFor({S16, S32, S64});
1152 } else {
1153 FRem.minScalar(0, S32)
1154 .customFor({S32, S64});
1155 }
1156 FRem.scalarize(0);
1157
1158 // TODO: Do we need to clamp maximum bitwidth?
1160 .legalIf(isScalar(0))
1161 .legalFor({{V2S16, V2S32}})
1162 .clampMaxNumElements(0, S16, 2)
1163 // Avoid scalarizing in cases that should be truly illegal. In unresolvable
1164 // situations (like an invalid implicit use), we don't want to infinite loop
1165 // in the legalizer.
1167 .alwaysLegal();
1168
1169 getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT})
1170 .legalFor({{S64, S32}, {S32, S16}, {S64, S16},
1171 {S32, S1}, {S64, S1}, {S16, S1}})
1172 .scalarize(0)
1173 .clampScalar(0, S32, S64)
1174 .widenScalarToNextPow2(1, 32);
1175
1176 // TODO: Split s1->s64 during regbankselect for VALU.
1177 auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
1178 .legalFor({{S32, S32}, {S64, S32}, {S16, S32}})
1179 .lowerIf(typeIs(1, S1))
1180 .customFor({{S32, S64}, {S64, S64}});
1181 if (ST.has16BitInsts())
1182 IToFP.legalFor({{S16, S16}});
1183 IToFP.clampScalar(1, S32, S64)
1184 .minScalar(0, S32)
1185 .scalarize(0)
1187
1188 auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
1189 .legalFor({{S32, S32}, {S32, S64}, {S32, S16}})
1190 .customFor({{S64, S32}, {S64, S64}})
1191 .narrowScalarFor({{S64, S16}}, changeTo(0, S32));
1192 if (ST.has16BitInsts())
1193 FPToI.legalFor({{S16, S16}});
1194 else
1195 FPToI.minScalar(1, S32);
1196
1197 FPToI.minScalar(0, S32)
1198 .widenScalarToNextPow2(0, 32)
1199 .scalarize(0)
1200 .lower();
1201
1202 // clang-format off
1203 auto &FPToISat = getActionDefinitionsBuilder({G_FPTOSI_SAT, G_FPTOUI_SAT})
1204 .legalFor({{S32, S32}, {S32, S64}})
1205 .narrowScalarFor({{S64, S16}}, changeTo(0, S32));
1206 FPToISat.minScalar(1, S32);
1207 FPToISat.minScalar(0, S32)
1208 .widenScalarToNextPow2(0, 32)
1209 .scalarize(0)
1210 .lower();
1211 // clang-format on
1212
1213 getActionDefinitionsBuilder({G_LROUND, G_LLROUND})
1214 .clampScalar(0, S16, S64)
1215 .scalarize(0)
1216 .lower();
1217
1218 getActionDefinitionsBuilder(G_INTRINSIC_FPTRUNC_ROUND)
1219 .legalFor({S16, S32})
1220 .scalarize(0)
1221 .lower();
1222
1223 // Lower G_FNEARBYINT and G_FRINT into G_INTRINSIC_ROUNDEVEN
1224 getActionDefinitionsBuilder({G_INTRINSIC_ROUND, G_FRINT, G_FNEARBYINT})
1225 .scalarize(0)
1226 .lower();
1227
1228 getActionDefinitionsBuilder({G_INTRINSIC_LRINT, G_INTRINSIC_LLRINT})
1229 .clampScalar(0, S16, S64)
1230 .scalarize(0)
1231 .lower();
1232
1233 if (ST.has16BitInsts()) {
1235 {G_INTRINSIC_TRUNC, G_FCEIL, G_INTRINSIC_ROUNDEVEN})
1236 .legalFor({S16, S32, S64})
1237 .clampScalar(0, S16, S64)
1238 .scalarize(0);
1239 } else if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
1241 {G_INTRINSIC_TRUNC, G_FCEIL, G_INTRINSIC_ROUNDEVEN})
1242 .legalFor({S32, S64})
1243 .clampScalar(0, S32, S64)
1244 .scalarize(0);
1245 } else {
1247 {G_INTRINSIC_TRUNC, G_FCEIL, G_INTRINSIC_ROUNDEVEN})
1248 .legalFor({S32})
1249 .customFor({S64})
1250 .clampScalar(0, S32, S64)
1251 .scalarize(0);
1252 }
1253
1255 .unsupportedFor({BufferFatPtr, BufferStridedPtr, RsrcPtr})
1256 .legalIf(all(isPointer(0), sameSize(0, 1)))
1257 .scalarize(0)
1258 .scalarSameSizeAs(1, 0);
1259
1261 .legalIf(all(sameSize(0, 1), typeInSet(1, {S64, S32})))
1262 .scalarSameSizeAs(1, 0)
1263 .scalarize(0);
1264
1265 auto &CmpBuilder =
1267 // The compare output type differs based on the register bank of the output,
1268 // so make both s1 and s32 legal.
1269 //
1270 // Scalar compares producing output in scc will be promoted to s32, as that
1271 // is the allocatable register type that will be needed for the copy from
1272 // scc. This will be promoted during RegBankSelect, and we assume something
1273 // before that won't try to use s32 result types.
1274 //
1275 // Vector compares producing an output in vcc/SGPR will use s1 in VCC reg
1276 // bank.
1278 {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
1279 .legalForCartesianProduct(
1280 {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr});
1281 if (ST.has16BitInsts()) {
1282 CmpBuilder.legalFor({{S1, S16}});
1283 }
1284
1285 CmpBuilder
1287 .clampScalar(1, S32, S64)
1288 .scalarize(0)
1289 .legalIf(all(typeInSet(0, {S1, S32}), isPointer(1)));
1290
1291 auto &FCmpBuilder =
1293 {S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase);
1294
1295 if (ST.hasSALUFloatInsts())
1296 FCmpBuilder.legalForCartesianProduct({S32}, {S16, S32});
1297
1298 FCmpBuilder
1300 .clampScalar(1, S32, S64)
1301 .scalarize(0);
1302
1303 // FIXME: fpow has a selection pattern that should move to custom lowering.
1304 auto &ExpOps = getActionDefinitionsBuilder(G_FPOW);
1305 if (ST.has16BitInsts())
1306 ExpOps.customFor({{S32}, {S16}});
1307 else
1308 ExpOps.customFor({S32});
1309 ExpOps.clampScalar(0, MinScalarFPTy, S32)
1310 .scalarize(0);
1311
1313 .clampScalar(0, MinScalarFPTy, S32)
1314 .lower();
1315
1317 .legalFor(ST.has16BitInsts(), {S16})
1318 .customFor({S32, S16})
1319 .scalarize(0)
1320 .lower();
1321
1323 .legalFor(ST.has16BitInsts(), {S16})
1324 .customFor({S32, S64, S16})
1325 .scalarize(0)
1326 .lower();
1327
1328 auto &LogOps =
1329 getActionDefinitionsBuilder({G_FLOG, G_FLOG10, G_FEXP, G_FEXP10});
1330 LogOps.customFor({S32, S16, S64});
1331 LogOps.clampScalar(0, MinScalarFPTy, S32)
1332 .scalarize(0);
1333
1334 // The 64-bit versions produce 32-bit results, but only on the SALU.
1336 .legalFor({{S32, S32}, {S32, S64}})
1337 .clampScalar(0, S32, S32)
1338 .widenScalarToNextPow2(1, 32)
1339 .clampScalar(1, S32, S64)
1340 .scalarize(0)
1341 .widenScalarToNextPow2(0, 32);
1342
1343 // If no 16 bit instr is available, lower into different instructions.
1344 if (ST.has16BitInsts())
1345 getActionDefinitionsBuilder(G_IS_FPCLASS)
1346 .legalForCartesianProduct({S1}, FPTypes16)
1347 .widenScalarToNextPow2(1)
1348 .scalarize(0)
1349 .lower();
1350 else
1351 getActionDefinitionsBuilder(G_IS_FPCLASS)
1352 .legalForCartesianProduct({S1}, FPTypesBase)
1353 .lowerFor({S1, S16})
1354 .widenScalarToNextPow2(1)
1355 .scalarize(0)
1356 .lower();
1357
1358 // The hardware instructions return a different result on 0 than the generic
1359 // instructions expect. The hardware produces -1, but these produce the
1360 // bitwidth.
1361 getActionDefinitionsBuilder({G_CTLZ, G_CTTZ})
1362 .scalarize(0)
1363 .clampScalar(0, S32, S32)
1364 .clampScalar(1, S32, S64)
1365 .widenScalarToNextPow2(0, 32)
1366 .widenScalarToNextPow2(1, 32)
1367 .custom();
1368
1369 // The 64-bit versions produce 32-bit results, but only on the SALU.
1370 getActionDefinitionsBuilder(G_CTLZ_ZERO_UNDEF)
1371 .legalFor({{S32, S32}, {S32, S64}})
1372 .customIf(scalarNarrowerThan(1, 32))
1373 .clampScalar(0, S32, S32)
1374 .clampScalar(1, S32, S64)
1375 .scalarize(0)
1376 .widenScalarToNextPow2(0, 32)
1377 .widenScalarToNextPow2(1, 32);
1378
1379 getActionDefinitionsBuilder(G_CTTZ_ZERO_UNDEF)
1380 .legalFor({{S32, S32}, {S32, S64}})
1381 .clampScalar(0, S32, S32)
1382 .clampScalar(1, S32, S64)
1383 .scalarize(0)
1384 .widenScalarToNextPow2(0, 32)
1385 .widenScalarToNextPow2(1, 32);
1386
1387 // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
1388 // RegBankSelect.
1389 getActionDefinitionsBuilder(G_BITREVERSE)
1390 .legalFor({S32, S64})
1391 .clampScalar(0, S32, S64)
1392 .scalarize(0)
1394
1395 if (ST.has16BitInsts()) {
1397 .legalFor({S16, S32, V2S16})
1398 .clampMaxNumElementsStrict(0, S16, 2)
1399 // FIXME: Fixing non-power-of-2 before clamp is workaround for
1400 // narrowScalar limitation.
1402 .clampScalar(0, S16, S32)
1403 .scalarize(0);
1404
1405 if (ST.hasVOP3PInsts()) {
1407 .legalFor({S32, S16, V2S16})
1408 .clampMaxNumElements(0, S16, 2)
1409 .minScalar(0, S16)
1411 .scalarize(0)
1412 .lower();
1413 if (ST.hasIntMinMax64()) {
1414 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
1415 .legalFor({S32, S16, S64, V2S16})
1416 .clampMaxNumElements(0, S16, 2)
1417 .minScalar(0, S16)
1419 .scalarize(0)
1420 .lower();
1421 } else {
1422 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
1423 .legalFor({S32, S16, V2S16})
1424 .clampMaxNumElements(0, S16, 2)
1425 .minScalar(0, S16)
1427 .scalarize(0)
1428 .lower();
1429 }
1430 } else {
1431 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
1432 .legalFor({S32, S16})
1433 .widenScalarToNextPow2(0)
1434 .minScalar(0, S16)
1435 .scalarize(0)
1436 .lower();
1437 }
1438 } else {
1439 // TODO: Should have same legality without v_perm_b32
1441 .legalFor({S32})
1442 .lowerIf(scalarNarrowerThan(0, 32))
1443 // FIXME: Fixing non-power-of-2 before clamp is workaround for
1444 // narrowScalar limitation.
1446 .maxScalar(0, S32)
1447 .scalarize(0)
1448 .lower();
1449
1450 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
1451 .legalFor({S32})
1452 .minScalar(0, S32)
1454 .scalarize(0)
1455 .lower();
1456 }
1457
1458 getActionDefinitionsBuilder(G_INTTOPTR)
1459 // List the common cases
1460 .legalForCartesianProduct(AddrSpaces64, {S64})
1461 .legalForCartesianProduct(AddrSpaces32, {S32})
1462 .scalarize(0)
1463 // Accept any address space as long as the size matches
1464 .legalIf(sameSize(0, 1))
1466 [](const LegalityQuery &Query) {
1467 return std::pair(
1468 1, LLT::scalar(Query.Types[0].getSizeInBits()));
1469 })
1470 .narrowScalarIf(largerThan(1, 0), [](const LegalityQuery &Query) {
1471 return std::pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
1472 });
1473
1474 getActionDefinitionsBuilder(G_PTRTOINT)
1475 // List the common cases
1476 .legalForCartesianProduct(AddrSpaces64, {S64})
1477 .legalForCartesianProduct(AddrSpaces32, {S32})
1478 .scalarize(0)
1479 // Accept any address space as long as the size matches
1480 .legalIf(sameSize(0, 1))
1482 [](const LegalityQuery &Query) {
1483 return std::pair(
1484 0, LLT::scalar(Query.Types[1].getSizeInBits()));
1485 })
1486 .narrowScalarIf(largerThan(0, 1), [](const LegalityQuery &Query) {
1487 return std::pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
1488 });
1489
1490 getActionDefinitionsBuilder(G_ADDRSPACE_CAST)
1491 .scalarize(0)
1492 .custom();
1493
// Predicate captured by the G_LOAD/G_STORE rules below (invoked as
// needToSplitMemOp(Query, Op == G_LOAD)): returns true when the memory
// operation must be broken into smaller pieces, either by narrowing the
// scalar or by splitting the vector into fewer elements.
1494   const auto needToSplitMemOp = [=](const LegalityQuery &Query,
1495                                     bool IsLoad) -> bool {
1496     const LLT DstTy = Query.Types[0];
1497
1498     // Split vector extloads.
1499     unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
1500
         // A vector result wider than the accessed memory is an extending
         // vector load; split it rather than trying to widen the access.
1501     if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize)
1502       return true;
1503
1504     const LLT PtrTy = Query.Types[1];
1505     unsigned AS = PtrTy.getAddressSpace();
         // Split accesses larger than the per-address-space maximum.
         // NOTE(review): the continuation that closes this condition
         // (presumably `AtomicOrdering::NotAtomic))`, i.e. "is this access
         // atomic?") is missing from this rendering of the file — confirm
         // against the upstream source before relying on this comment.
1506     if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad,
1507                                       Query.MMODescrs[0].Ordering !=
1509       return true;
1510
1511     // Catch weird sized loads that don't evenly divide into the access sizes
1512     // TODO: May be able to widen depending on alignment etc.
         // Number of 32-bit registers the access occupies, rounded up.
1513     unsigned NumRegs = (MemSize + 31) / 32;
         // A 3-dword (96-bit) access is only a single instruction on
         // subtargets with dwordx3 load/store support; otherwise any
         // non-power-of-2 register count has no single covering access and
         // must be split.
1514     if (NumRegs == 3) {
1515       if (!ST.hasDwordx3LoadStores())
1516         return true;
1517     } else {
1518       // If the alignment allows, these should have been widened.
1519       if (!isPowerOf2_32(NumRegs))
1520         return true;
1521     }
1522
         // Fits in one legal access — no split needed.
1523     return false;
1524   };
1525
1526 unsigned GlobalAlign32 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 32;
1527 unsigned GlobalAlign16 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 16;
1528 unsigned GlobalAlign8 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 8;
1529
1530 // TODO: Refine based on subtargets which support unaligned access or 128-bit
1531 // LDS
1532 // TODO: Unsupported flat for SI.
1533
1534 for (unsigned Op : {G_LOAD, G_STORE}) {
1535 const bool IsStore = Op == G_STORE;
1536
1537 auto &Actions = getActionDefinitionsBuilder(Op);
1538 // Explicitly list some common cases.
1539 // TODO: Does this help compile time at all?
1540 Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, S32, GlobalAlign32},
1541 {V2S32, GlobalPtr, V2S32, GlobalAlign32},
1542 {V4S32, GlobalPtr, V4S32, GlobalAlign32},
1543 {S64, GlobalPtr, S64, GlobalAlign32},
1544 {V2S64, GlobalPtr, V2S64, GlobalAlign32},
1545 {V2S16, GlobalPtr, V2S16, GlobalAlign32},
1546 {S32, GlobalPtr, S8, GlobalAlign8},
1547 {S32, GlobalPtr, S16, GlobalAlign16},
1548
1549 {S32, LocalPtr, S32, 32},
1550 {S64, LocalPtr, S64, 32},
1551 {V2S32, LocalPtr, V2S32, 32},
1552 {S32, LocalPtr, S8, 8},
1553 {S32, LocalPtr, S16, 16},
1554 {V2S16, LocalPtr, S32, 32},
1555
1556 {S32, PrivatePtr, S32, 32},
1557 {S32, PrivatePtr, S8, 8},
1558 {S32, PrivatePtr, S16, 16},
1559 {V2S16, PrivatePtr, S32, 32},
1560
1561 {S32, ConstantPtr, S32, GlobalAlign32},
1562 {V2S32, ConstantPtr, V2S32, GlobalAlign32},
1563 {V4S32, ConstantPtr, V4S32, GlobalAlign32},
1564 {S64, ConstantPtr, S64, GlobalAlign32},
1565 {V2S32, ConstantPtr, V2S32, GlobalAlign32}});
1566 Actions.legalIf(
1567 [=](const LegalityQuery &Query) -> bool {
1568 return isLoadStoreLegal(ST, Query);
1569 });
1570
1571 // The custom pointers (fat pointers, buffer resources) don't work with load
1572 // and store at this level. Fat pointers should have been lowered to
1573 // intrinsics before the translation to MIR.
1574 Actions.unsupportedIf(
1575 typeInSet(1, {BufferFatPtr, BufferStridedPtr, RsrcPtr}));
1576
1577 // Address space 8 pointers are handled by a 4xs32 load, bitcast, and
1578 // ptrtoint. This is needed to account for the fact that we can't have i128
1579 // as a register class for SelectionDAG reasons.
1580 Actions.customIf([=](const LegalityQuery &Query) -> bool {
1581 return hasBufferRsrcWorkaround(Query.Types[0]);
1582 });
1583
1584 // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to
1585 // 64-bits.
1586 //
1587 // TODO: Should generalize bitcast action into coerce, which will also cover
1588 // inserting addrspacecasts.
1589 Actions.customIf(typeIs(1, Constant32Ptr));
1590
1591 // Turn any illegal element vectors into something easier to deal
1592 // with. These will ultimately produce 32-bit scalar shifts to extract the
1593 // parts anyway.
1594 //
1595 // For odd 16-bit element vectors, prefer to split those into pieces with
1596 // 16-bit vector parts.
1597 Actions.bitcastIf(
1598 [=](const LegalityQuery &Query) -> bool {
1599 return shouldBitcastLoadStoreType(ST, Query.Types[0],
1600 Query.MMODescrs[0].MemoryTy);
1601 }, bitcastToRegisterType(0));
1602
1603 if (!IsStore) {
1604 // Widen suitably aligned loads by loading extra bytes. The standard
1605 // legalization actions can't properly express widening memory operands.
1606 Actions.customIf([=](const LegalityQuery &Query) -> bool {
1607 return shouldWidenLoad(ST, Query, G_LOAD);
1608 });
1609 }
1610
1611 // FIXME: load/store narrowing should be moved to lower action
1612 Actions
1613 .narrowScalarIf(
1614 [=](const LegalityQuery &Query) -> bool {
1615 return !Query.Types[0].isVector() &&
1616 needToSplitMemOp(Query, Op == G_LOAD);
1617 },
1618 [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
1619 const LLT DstTy = Query.Types[0];
1620 const LLT PtrTy = Query.Types[1];
1621
1622 const unsigned DstSize = DstTy.getSizeInBits();
1623 unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
1624
1625 // Split extloads.
1626 if (DstSize > MemSize)
1627 return std::pair(0, LLT::scalar(MemSize));
1628
1629 unsigned MaxSize = maxSizeForAddrSpace(
1630 ST, PtrTy.getAddressSpace(), Op == G_LOAD,
1631 Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic);
1632 if (MemSize > MaxSize)
1633 return std::pair(0, LLT::scalar(MaxSize));
1634
1635 uint64_t Align = Query.MMODescrs[0].AlignInBits;
1636 return std::pair(0, LLT::scalar(Align));
1637 })
1638 .fewerElementsIf(
1639 [=](const LegalityQuery &Query) -> bool {
1640 return Query.Types[0].isVector() &&
1641 needToSplitMemOp(Query, Op == G_LOAD);
1642 },
1643 [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
1644 const LLT DstTy = Query.Types[0];
1645 const LLT PtrTy = Query.Types[1];
1646
1647 LLT EltTy = DstTy.getElementType();
1648 unsigned MaxSize = maxSizeForAddrSpace(
1649 ST, PtrTy.getAddressSpace(), Op == G_LOAD,
1650 Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic);
1651
1652 // FIXME: Handle widened to power of 2 results better. This ends
1653 // up scalarizing.
1654 // FIXME: 3 element stores scalarized on SI
1655
1656 // Split if it's too large for the address space.
1657 unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
1658 if (MemSize > MaxSize) {
1659 unsigned NumElts = DstTy.getNumElements();
1660 unsigned EltSize = EltTy.getSizeInBits();
1661
1662 if (MaxSize % EltSize == 0) {
1663 return std::pair(
1665 ElementCount::getFixed(MaxSize / EltSize), EltTy));
1666 }
1667
1668 unsigned NumPieces = MemSize / MaxSize;
1669
1670 // FIXME: Refine when odd breakdowns handled
1671 // The scalars will need to be re-legalized.
1672 if (NumPieces == 1 || NumPieces >= NumElts ||
1673 NumElts % NumPieces != 0)
1674 return std::pair(0, EltTy);
1675
1676 return std::pair(0,
1677 LLT::fixed_vector(NumElts / NumPieces, EltTy));
1678 }
1679
1680 // FIXME: We could probably handle weird extending loads better.
1681 if (DstTy.getSizeInBits() > MemSize)
1682 return std::pair(0, EltTy);
1683
1684 unsigned EltSize = EltTy.getSizeInBits();
1685 unsigned DstSize = DstTy.getSizeInBits();
1686 if (!isPowerOf2_32(DstSize)) {
1687 // We're probably decomposing an odd sized store. Try to split
1688 // to the widest type. TODO: Account for alignment. As-is it
1689 // should be OK, since the new parts will be further legalized.
1690 unsigned FloorSize = llvm::bit_floor(DstSize);
1691 return std::pair(
1693 ElementCount::getFixed(FloorSize / EltSize), EltTy));
1694 }
1695
1696 // May need relegalization for the scalars.
1697 return std::pair(0, EltTy);
1698 })
1699 .minScalar(0, S32)
1700 .narrowScalarIf(isTruncStoreToSizePowerOf2(0),
1702 .widenScalarToNextPow2(0)
1703 .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0))
1704 .lower();
1705 }
1706
1707 // FIXME: Unaligned accesses not lowered.
1708 auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD})
1709 .legalForTypesWithMemDesc({{S32, GlobalPtr, S8, 8},
1710 {S32, GlobalPtr, S16, 2 * 8},
1711 {S32, LocalPtr, S8, 8},
1712 {S32, LocalPtr, S16, 16},
1713 {S32, PrivatePtr, S8, 8},
1714 {S32, PrivatePtr, S16, 16},
1715 {S32, ConstantPtr, S8, 8},
1716 {S32, ConstantPtr, S16, 2 * 8}})
1717 .legalIf(
1718 [=](const LegalityQuery &Query) -> bool {
1719 return isLoadStoreLegal(ST, Query);
1720 });
1721
1722 if (ST.hasFlatAddressSpace()) {
1723 ExtLoads.legalForTypesWithMemDesc(
1724 {{S32, FlatPtr, S8, 8}, {S32, FlatPtr, S16, 16}});
1725 }
1726
1727 // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to
1728 // 64-bits.
1729 //
1730 // TODO: Should generalize bitcast action into coerce, which will also cover
1731 // inserting addrspacecasts.
1732 ExtLoads.customIf(typeIs(1, Constant32Ptr));
1733
1734 ExtLoads.clampScalar(0, S32, S32)
1736 .lower();
1737
1738 auto &Atomics = getActionDefinitionsBuilder(
1739 {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB,
1740 G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
1741 G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
1742 G_ATOMICRMW_UMIN, G_ATOMICRMW_UINC_WRAP, G_ATOMICRMW_UDEC_WRAP})
1743 .legalFor({{S32, GlobalPtr}, {S32, LocalPtr},
1744 {S64, GlobalPtr}, {S64, LocalPtr},
1745 {S32, RegionPtr}, {S64, RegionPtr}});
1746 if (ST.hasFlatAddressSpace()) {
1747 Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}});
1748 }
1749
1750 auto &Atomics32 =
1751 getActionDefinitionsBuilder({G_ATOMICRMW_USUB_COND, G_ATOMICRMW_USUB_SAT})
1752 .legalFor({{S32, GlobalPtr}, {S32, LocalPtr}, {S32, RegionPtr}});
1753 if (ST.hasFlatAddressSpace()) {
1754 Atomics32.legalFor({{S32, FlatPtr}});
1755 }
1756
1757 // TODO: v2bf16 operations, and fat buffer pointer support.
1758 auto &Atomic = getActionDefinitionsBuilder(G_ATOMICRMW_FADD);
1759 if (ST.hasLDSFPAtomicAddF32()) {
1760 Atomic.legalFor({{S32, LocalPtr}, {S32, RegionPtr}});
1761 if (ST.hasLdsAtomicAddF64())
1762 Atomic.legalFor({{S64, LocalPtr}});
1763 if (ST.hasAtomicDsPkAdd16Insts())
1764 Atomic.legalFor({{V2F16, LocalPtr}, {V2BF16, LocalPtr}});
1765 }
1766 if (ST.hasAtomicFaddInsts())
1767 Atomic.legalFor({{S32, GlobalPtr}});
1768 if (ST.hasFlatAtomicFaddF32Inst())
1769 Atomic.legalFor({{S32, FlatPtr}});
1770
1771 if (ST.hasGFX90AInsts() || ST.hasGFX1250Insts()) {
1772 // These are legal with some caveats, and should have undergone expansion in
1773 // the IR in most situations
1774 // TODO: Move atomic expansion into legalizer
1775 Atomic.legalFor({
1776 {S32, GlobalPtr},
1777 {S64, GlobalPtr},
1778 {S64, FlatPtr}
1779 });
1780 }
1781
1782 if (ST.hasAtomicBufferGlobalPkAddF16NoRtnInsts() ||
1783 ST.hasAtomicBufferGlobalPkAddF16Insts())
1784 Atomic.legalFor({{V2F16, GlobalPtr}, {V2F16, BufferFatPtr}});
1785 if (ST.hasAtomicGlobalPkAddBF16Inst())
1786 Atomic.legalFor({{V2BF16, GlobalPtr}});
1787 if (ST.hasAtomicFlatPkAdd16Insts())
1788 Atomic.legalFor({{V2F16, FlatPtr}, {V2BF16, FlatPtr}});
1789
1790
1791 // Most of the legalization work here is done by AtomicExpand. We could
1792 // probably use a simpler legality rule that just assumes anything is OK.
1793 auto &AtomicFMinFMax =
1794 getActionDefinitionsBuilder({G_ATOMICRMW_FMIN, G_ATOMICRMW_FMAX})
1795 .legalFor({{F32, LocalPtr}, {F64, LocalPtr}});
1796
1797 if (ST.hasAtomicFMinFMaxF32GlobalInsts())
1798 AtomicFMinFMax.legalFor({{F32, GlobalPtr},{F32, BufferFatPtr}});
1799 if (ST.hasAtomicFMinFMaxF64GlobalInsts())
1800 AtomicFMinFMax.legalFor({{F64, GlobalPtr}, {F64, BufferFatPtr}});
1801 if (ST.hasAtomicFMinFMaxF32FlatInsts())
1802 AtomicFMinFMax.legalFor({F32, FlatPtr});
1803 if (ST.hasAtomicFMinFMaxF64FlatInsts())
1804 AtomicFMinFMax.legalFor({F64, FlatPtr});
1805
1806 // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling, and output
1807 // demarshalling
1808 getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG)
1809 .customFor({{S32, GlobalPtr}, {S64, GlobalPtr},
1810 {S32, FlatPtr}, {S64, FlatPtr}})
1811 .legalFor({{S32, LocalPtr}, {S64, LocalPtr},
1812 {S32, RegionPtr}, {S64, RegionPtr}});
1813 // TODO: Pointer types, any 32-bit or 64-bit vector
1814
1815 // Condition should be s32 for scalar, s1 for vector.
1818 LocalPtr, FlatPtr, PrivatePtr,
1819 LLT::fixed_vector(2, LocalPtr),
1820 LLT::fixed_vector(2, PrivatePtr)},
1821 {S1, S32})
1822 .clampScalar(0, S16, S64)
1823 .scalarize(1)
1826 .clampMaxNumElements(0, S32, 2)
1827 .clampMaxNumElements(0, LocalPtr, 2)
1828 .clampMaxNumElements(0, PrivatePtr, 2)
1829 .scalarize(0)
1831 .legalIf(all(isPointer(0), typeInSet(1, {S1, S32})));
1832
1833 // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can
1834 // be more flexible with the shift amount type.
1835 auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR})
1836 .legalFor({{S32, S32}, {S64, S32}});
1837 if (ST.has16BitInsts()) {
1838 if (ST.hasVOP3PInsts()) {
1839 Shifts.legalFor({{S16, S16}, {V2S16, V2S16}})
1840 .clampMaxNumElements(0, S16, 2);
1841 } else
1842 Shifts.legalFor({{S16, S16}});
1843
1844 // TODO: Support 16-bit shift amounts for all types
1845 Shifts.widenScalarIf(
1846 [=](const LegalityQuery &Query) {
1847 // Use 16-bit shift amounts for any 16-bit shift. Otherwise we want a
1848 // 32-bit amount.
1849 const LLT ValTy = Query.Types[0];
1850 const LLT AmountTy = Query.Types[1];
1851 return ValTy.isScalar() && ValTy.getSizeInBits() <= 16 &&
1852 AmountTy.getSizeInBits() < 16;
1853 }, changeTo(1, S16));
1854 Shifts.maxScalarIf(typeIs(0, S16), 1, S16);
1855 Shifts.clampScalar(1, S32, S32);
1856 Shifts.widenScalarToNextPow2(0, 16);
1857 Shifts.clampScalar(0, S16, S64);
1858
1859 getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
1860 .minScalar(0, S16)
1861 .scalarize(0)
1862 .lower();
1863 } else {
1864 // Make sure we legalize the shift amount type first, as the general
1865 // expansion for the shifted type will produce much worse code if it hasn't
1866 // been truncated already.
1867 Shifts.clampScalar(1, S32, S32);
1868 Shifts.widenScalarToNextPow2(0, 32);
1869 Shifts.clampScalar(0, S32, S64);
1870
1871 getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
1872 .minScalar(0, S32)
1873 .scalarize(0)
1874 .lower();
1875 }
1876 Shifts.scalarize(0);
1877
1878 for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) {
1879 unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0;
1880 unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1;
1881 unsigned IdxTypeIdx = 2;
1882
1884 .customIf([=](const LegalityQuery &Query) {
1885 const LLT EltTy = Query.Types[EltTypeIdx];
1886 const LLT VecTy = Query.Types[VecTypeIdx];
1887 const LLT IdxTy = Query.Types[IdxTypeIdx];
1888 const unsigned EltSize = EltTy.getSizeInBits();
1889 const bool isLegalVecType =
1891 // Address space 8 pointers are 128-bit wide values, but the logic
1892 // below will try to bitcast them to 2N x s64, which will fail.
1893 // Therefore, as an intermediate step, wrap extracts/insertions from a
1894 // ptrtoint-ing the vector and scalar arguments (or inttoptring the
1895 // extraction result) in order to produce a vector operation that can
1896 // be handled by the logic below.
1897 if (EltTy.isPointer() && EltSize > 64)
1898 return true;
1899 return (EltSize == 32 || EltSize == 64) &&
1900 VecTy.getSizeInBits() % 32 == 0 &&
1901 VecTy.getSizeInBits() <= MaxRegisterSize &&
1902 IdxTy.getSizeInBits() == 32 &&
1903 isLegalVecType;
1904 })
1905 .bitcastIf(all(sizeIsMultipleOf32(VecTypeIdx),
1906 scalarOrEltNarrowerThan(VecTypeIdx, 32)),
1907 bitcastToVectorElement32(VecTypeIdx))
1908 //.bitcastIf(vectorSmallerThan(1, 32), bitcastToScalar(1))
1909 .bitcastIf(all(sizeIsMultipleOf32(VecTypeIdx),
1910 scalarOrEltWiderThan(VecTypeIdx, 64)),
1911 [=](const LegalityQuery &Query) {
1912 // For > 64-bit element types, try to turn this into a
1913 // 64-bit element vector since we may be able to do better
1914 // indexing if this is scalar. If not, fall back to 32.
1915 const LLT EltTy = Query.Types[EltTypeIdx];
1916 const LLT VecTy = Query.Types[VecTypeIdx];
1917 const unsigned DstEltSize = EltTy.getSizeInBits();
1918 const unsigned VecSize = VecTy.getSizeInBits();
1919
1920 const unsigned TargetEltSize =
1921 DstEltSize % 64 == 0 ? 64 : 32;
1922 return std::pair(VecTypeIdx,
1923 LLT::fixed_vector(VecSize / TargetEltSize,
1924 TargetEltSize));
1925 })
1926 .clampScalar(EltTypeIdx, S32, S64)
1927 .clampScalar(VecTypeIdx, S32, S64)
1928 .clampScalar(IdxTypeIdx, S32, S32)
1929 .clampMaxNumElements(VecTypeIdx, S32, 32)
1930 // TODO: Clamp elements for 64-bit vectors?
1931 .moreElementsIf(isIllegalRegisterType(ST, VecTypeIdx),
1933 // It should only be necessary with variable indexes.
1934 // As a last resort, lower to the stack
1935 .lower();
1936 }
1937
1938 getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
1939 .unsupportedIf([=](const LegalityQuery &Query) {
1940 const LLT &EltTy = Query.Types[1].getElementType();
1941 return Query.Types[0] != EltTy;
1942 });
1943
1944 for (unsigned Op : {G_EXTRACT, G_INSERT}) {
1945 unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0;
1946 unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1;
1947
1948 // FIXME: Doesn't handle extract of illegal sizes.
1950 .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32)))
1951 .lowerIf([=](const LegalityQuery &Query) {
1952 // Sub-vector(or single element) insert and extract.
1953 // TODO: verify immediate offset here since lower only works with
1954 // whole elements.
1955 const LLT BigTy = Query.Types[BigTyIdx];
1956 return BigTy.isVector();
1957 })
1958 // FIXME: Multiples of 16 should not be legal.
1959 .legalIf([=](const LegalityQuery &Query) {
1960 const LLT BigTy = Query.Types[BigTyIdx];
1961 const LLT LitTy = Query.Types[LitTyIdx];
1962 return (BigTy.getSizeInBits() % 32 == 0) &&
1963 (LitTy.getSizeInBits() % 16 == 0);
1964 })
1965 .widenScalarIf(
1966 [=](const LegalityQuery &Query) {
1967 const LLT BigTy = Query.Types[BigTyIdx];
1968 return (BigTy.getScalarSizeInBits() < 16);
1969 },
1971 .widenScalarIf(
1972 [=](const LegalityQuery &Query) {
1973 const LLT LitTy = Query.Types[LitTyIdx];
1974 return (LitTy.getScalarSizeInBits() < 16);
1975 },
1977 .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
1978 .widenScalarToNextPow2(BigTyIdx, 32);
1979
1980 }
1981
1982 auto &BuildVector =
1983 getActionDefinitionsBuilder(G_BUILD_VECTOR)
1985 .legalForCartesianProduct(AllS64Vectors, {S64})
1986 .clampNumElements(0, V16S32, V32S32)
1991
1992 if (ST.hasScalarPackInsts()) {
1993 BuildVector
1994 // FIXME: Should probably widen s1 vectors straight to s32
1995 .minScalarOrElt(0, S16)
1996 .minScalar(1, S16);
1997
1998 getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1999 .legalFor({V2S16, S32})
2000 .lower();
2001 } else {
2002 BuildVector.customFor({V2S16, S16});
2003 BuildVector.minScalarOrElt(0, S32);
2004
2005 getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
2006 .customFor({V2S16, S32})
2007 .lower();
2008 }
2009
2010 BuildVector.legalIf(isRegisterType(ST, 0));
2011
2012 // FIXME: Clamp maximum size
2013 getActionDefinitionsBuilder(G_CONCAT_VECTORS)
2014 .legalIf(all(isRegisterType(ST, 0), isRegisterType(ST, 1)))
2015 .clampMaxNumElements(0, S32, 32)
2016 .clampMaxNumElements(1, S16, 2) // TODO: Make 4?
2017 .clampMaxNumElements(0, S16, 64);
2018
2019 getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower();
2020
2021 // Merge/Unmerge
2022 for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
2023 unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
2024 unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;
2025
2026 auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
2027 const LLT Ty = Query.Types[TypeIdx];
2028 if (Ty.isVector()) {
2029 const LLT &EltTy = Ty.getElementType();
2030 if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 512)
2031 return true;
2033 return true;
2034 }
2035 return false;
2036 };
2037
2038 auto &Builder =
2040 .legalIf(all(isRegisterType(ST, 0), isRegisterType(ST, 1)))
2041 .lowerFor({{S16, V2S16}})
2042 .lowerIf([=](const LegalityQuery &Query) {
2043 const LLT BigTy = Query.Types[BigTyIdx];
2044 return BigTy.getSizeInBits() == 32;
2045 })
2046 // Try to widen to s16 first for small types.
2047 // TODO: Only do this on targets with legal s16 shifts
2048 .minScalarOrEltIf(scalarNarrowerThan(LitTyIdx, 16), LitTyIdx, S16)
2049 .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16)
2051 oneMoreElement(BigTyIdx))
2053 elementTypeIs(1, S16)),
2054 changeTo(1, V2S16))
2055 // Clamp the little scalar to s8-s256 and make it a power of 2. It's
2056 // not worth considering the multiples of 64 since 2*192 and 2*384
2057 // are not valid.
2058 .clampScalar(LitTyIdx, S32, S512)
2059 .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32)
2060 // Break up vectors with weird elements into scalars
2062 [=](const LegalityQuery &Query) {
2063 return notValidElt(Query, LitTyIdx);
2064 },
2065 scalarize(0))
2066 .fewerElementsIf(
2067 [=](const LegalityQuery &Query) {
2068 return notValidElt(Query, BigTyIdx);
2069 },
2070 scalarize(1))
2071 .clampScalar(BigTyIdx, S32, MaxScalar);
2072
2073 if (Op == G_MERGE_VALUES) {
2074 Builder.widenScalarIf(
2075 // TODO: Use 16-bit shifts if legal for 8-bit values?
2076 [=](const LegalityQuery &Query) {
2077 const LLT Ty = Query.Types[LitTyIdx];
2078 return Ty.getSizeInBits() < 32;
2079 },
2080 changeTo(LitTyIdx, S32));
2081 }
2082
2083 Builder.widenScalarIf(
2084 [=](const LegalityQuery &Query) {
2085 const LLT Ty = Query.Types[BigTyIdx];
2086 return Ty.getSizeInBits() % 16 != 0;
2087 },
2088 [=](const LegalityQuery &Query) {
2089 // Pick the next power of 2, or a multiple of 64 over 128.
2090 // Whichever is smaller.
2091 const LLT &Ty = Query.Types[BigTyIdx];
2092 unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1);
2093 if (NewSizeInBits >= 256) {
2094 unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1);
2095 if (RoundedTo < NewSizeInBits)
2096 NewSizeInBits = RoundedTo;
2097 }
2098 return std::pair(BigTyIdx, LLT::scalar(NewSizeInBits));
2099 })
2100 // Any vectors left are the wrong size. Scalarize them.
2101 .scalarize(0)
2102 .scalarize(1);
2103 }
2104
2105 // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
2106 // RegBankSelect.
2107 auto &SextInReg = getActionDefinitionsBuilder(G_SEXT_INREG)
2108 .legalFor({{S32}, {S64}})
2109 .clampScalar(0, S32, S64);
2110
2111 if (ST.hasVOP3PInsts()) {
2112 SextInReg.lowerFor({{V2S16}})
2113 // Prefer to reduce vector widths for 16-bit vectors before lowering, to
2114 // get more vector shift opportunities, since we'll get those when
2115 // expanded.
2116 .clampMaxNumElementsStrict(0, S16, 2);
2117 } else if (ST.has16BitInsts()) {
2118 SextInReg.lowerFor({{S32}, {S64}, {S16}});
2119 } else {
2120 // Prefer to promote to s32 before lowering if we don't have 16-bit
2121 // shifts. This avoid a lot of intermediate truncate and extend operations.
2122 SextInReg.lowerFor({{S32}, {S64}});
2123 }
2124
2125 SextInReg
2126 .scalarize(0)
2127 .clampScalar(0, S32, S64)
2128 .lower();
2129
2130 getActionDefinitionsBuilder({G_ROTR, G_ROTL})
2131 .scalarize(0)
2132 .lower();
2133
2134 auto &FSHRActionDefs = getActionDefinitionsBuilder(G_FSHR);
2135 FSHRActionDefs.legalFor({{S32, S32}})
2136 .clampMaxNumElementsStrict(0, S16, 2);
2137 if (ST.hasVOP3PInsts())
2138 FSHRActionDefs.lowerFor({{V2S16, V2S16}});
2139 FSHRActionDefs.scalarize(0).lower();
2140
2141 if (ST.hasVOP3PInsts()) {
2143 .lowerFor({{V2S16, V2S16}})
2144 .clampMaxNumElementsStrict(0, S16, 2)
2145 .scalarize(0)
2146 .lower();
2147 } else {
2149 .scalarize(0)
2150 .lower();
2151 }
2152
2153 getActionDefinitionsBuilder(G_READCYCLECOUNTER)
2154 .legalFor({S64});
2155
2156 getActionDefinitionsBuilder(G_READSTEADYCOUNTER).legalFor({S64});
2157
2159 .alwaysLegal();
2160
2161 getActionDefinitionsBuilder({G_SMULO, G_UMULO})
2162 .scalarize(0)
2163 .minScalar(0, S32)
2164 .lower();
2165
2166 getActionDefinitionsBuilder({G_SBFX, G_UBFX})
2167 .legalFor({{S32, S32}, {S64, S32}})
2168 .clampScalar(1, S32, S32)
2169 .clampScalar(0, S32, S64)
2171 .scalarize(0);
2172
2174 {// TODO: Verify V_BFI_B32 is generated from expanded bit ops
2175 G_FCOPYSIGN,
2176
2177 G_ATOMIC_CMPXCHG_WITH_SUCCESS, G_ATOMICRMW_NAND, G_ATOMICRMW_FSUB,
2178 G_READ_REGISTER, G_WRITE_REGISTER,
2179
2180 G_SADDO, G_SSUBO})
2181 .lower();
2182
2183 if (ST.hasIEEEMinimumMaximumInsts()) {
2184 getActionDefinitionsBuilder({G_FMINIMUM, G_FMAXIMUM})
2185 .legalFor(FPTypesPK16)
2186 .clampMaxNumElements(0, S16, 2)
2187 .scalarize(0);
2188 } else if (ST.hasVOP3PInsts()) {
2189 getActionDefinitionsBuilder({G_FMINIMUM, G_FMAXIMUM})
2190 .lowerFor({V2S16})
2191 .clampMaxNumElementsStrict(0, S16, 2)
2192 .scalarize(0)
2193 .lower();
2194 } else {
2195 getActionDefinitionsBuilder({G_FMINIMUM, G_FMAXIMUM})
2196 .scalarize(0)
2197 .clampScalar(0, S32, S64)
2198 .lower();
2199 }
2200
2201 getActionDefinitionsBuilder({G_MEMCPY, G_MEMCPY_INLINE, G_MEMMOVE, G_MEMSET})
2202 .lower();
2203
2204 getActionDefinitionsBuilder({G_TRAP, G_DEBUGTRAP}).custom();
2205
2206 getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE,
2207 G_INDEXED_LOAD, G_INDEXED_SEXTLOAD,
2208 G_INDEXED_ZEXTLOAD, G_INDEXED_STORE})
2209 .unsupported();
2210
2212
2214 {G_VECREDUCE_SMIN, G_VECREDUCE_SMAX, G_VECREDUCE_UMIN, G_VECREDUCE_UMAX,
2215 G_VECREDUCE_ADD, G_VECREDUCE_MUL, G_VECREDUCE_FMUL, G_VECREDUCE_FMIN,
2216 G_VECREDUCE_FMAX, G_VECREDUCE_FMINIMUM, G_VECREDUCE_FMAXIMUM,
2217 G_VECREDUCE_OR, G_VECREDUCE_AND, G_VECREDUCE_XOR})
2218 .legalFor(AllVectors)
2219 .scalarize(1)
2220 .lower();
2221
2223 verify(*ST.getInstrInfo());
2224}
2225
2228 LostDebugLocObserver &LocObserver) const {
// Entry point for all "custom" legalization actions registered in the
// constructor: dispatch on the generic opcode to the opcode-specific helper.
// Returns false for opcodes with no custom handling here.
2229 MachineIRBuilder &B = Helper.MIRBuilder;
2230 MachineRegisterInfo &MRI = *B.getMRI();
2231
2232 switch (MI.getOpcode()) {
2233 case TargetOpcode::G_ADDRSPACE_CAST:
2234 return legalizeAddrSpaceCast(MI, MRI, B);
2235 case TargetOpcode::G_INTRINSIC_ROUNDEVEN:
2236 return legalizeFroundeven(MI, MRI, B);
2237 case TargetOpcode::G_FCEIL:
2238 return legalizeFceil(MI, MRI, B);
2239 case TargetOpcode::G_FREM:
2240 return legalizeFrem(MI, MRI, B);
2241 case TargetOpcode::G_INTRINSIC_TRUNC:
2242 return legalizeIntrinsicTrunc(MI, MRI, B);
// Integer <-> FP conversions share one helper each, parameterized on
// signedness (true = signed variant).
2243 case TargetOpcode::G_SITOFP:
2244 return legalizeITOFP(MI, MRI, B, true);
2245 case TargetOpcode::G_UITOFP:
2246 return legalizeITOFP(MI, MRI, B, false);
2247 case TargetOpcode::G_FPTOSI:
2248 return legalizeFPTOI(MI, MRI, B, true);
2249 case TargetOpcode::G_FPTOUI:
2250 return legalizeFPTOI(MI, MRI, B, false);
2251 case TargetOpcode::G_FMINNUM:
2252 case TargetOpcode::G_FMAXNUM:
2253 case TargetOpcode::G_FMINIMUMNUM:
2254 case TargetOpcode::G_FMAXIMUMNUM:
2255 return legalizeMinNumMaxNum(Helper, MI);
2256 case TargetOpcode::G_EXTRACT_VECTOR_ELT:
2257 return legalizeExtractVectorElt(MI, MRI, B);
2258 case TargetOpcode::G_INSERT_VECTOR_ELT:
2259 return legalizeInsertVectorElt(MI, MRI, B);
2260 case TargetOpcode::G_FSIN:
2261 case TargetOpcode::G_FCOS:
2262 return legalizeSinCos(MI, MRI, B);
2263 case TargetOpcode::G_GLOBAL_VALUE:
2264 return legalizeGlobalValue(MI, MRI, B);
2265 case TargetOpcode::G_LOAD:
2266 case TargetOpcode::G_SEXTLOAD:
2267 case TargetOpcode::G_ZEXTLOAD:
2268 return legalizeLoad(Helper, MI);
2269 case TargetOpcode::G_STORE:
2270 return legalizeStore(Helper, MI);
2271 case TargetOpcode::G_FMAD:
2272 return legalizeFMad(MI, MRI, B);
2273 case TargetOpcode::G_FDIV:
2274 return legalizeFDIV(MI, MRI, B);
2275 case TargetOpcode::G_FFREXP:
2276 return legalizeFFREXP(MI, MRI, B);
2277 case TargetOpcode::G_FSQRT:
2278 return legalizeFSQRT(MI, MRI, B);
2279 case TargetOpcode::G_UDIV:
2280 case TargetOpcode::G_UREM:
2281 case TargetOpcode::G_UDIVREM:
2282 return legalizeUnsignedDIV_REM(MI, MRI, B);
2283 case TargetOpcode::G_SDIV:
2284 case TargetOpcode::G_SREM:
2285 case TargetOpcode::G_SDIVREM:
2286 return legalizeSignedDIV_REM(MI, MRI, B);
2287 case TargetOpcode::G_ATOMIC_CMPXCHG:
2288 return legalizeAtomicCmpXChg(MI, MRI, B);
2289 case TargetOpcode::G_FLOG2:
2290 return legalizeFlog2(MI, B);
2291 case TargetOpcode::G_FLOG:
2292 case TargetOpcode::G_FLOG10:
2293 return legalizeFlogCommon(MI, B);
2294 case TargetOpcode::G_FEXP2:
2295 return legalizeFExp2(MI, B);
2296 case TargetOpcode::G_FEXP:
2297 case TargetOpcode::G_FEXP10:
2298 return legalizeFExp(MI, B);
2299 case TargetOpcode::G_FPOW:
2300 return legalizeFPow(MI, B);
2301 case TargetOpcode::G_FFLOOR:
2302 return legalizeFFloor(MI, MRI, B);
2303 case TargetOpcode::G_BUILD_VECTOR:
2304 case TargetOpcode::G_BUILD_VECTOR_TRUNC:
2305 return legalizeBuildVector(MI, MRI, B);
2306 case TargetOpcode::G_MUL:
2307 return legalizeMul(Helper, MI);
2308 case TargetOpcode::G_CTLZ:
2309 case TargetOpcode::G_CTTZ:
2310 return legalizeCTLZ_CTTZ(MI, MRI, B);
2311 case TargetOpcode::G_CTLZ_ZERO_UNDEF:
2312 return legalizeCTLZ_ZERO_UNDEF(MI, MRI, B);
2313 case TargetOpcode::G_STACKSAVE:
2314 return legalizeStackSave(MI, B);
2315 case TargetOpcode::G_GET_FPENV:
2316 return legalizeGetFPEnv(MI, MRI, B);
2317 case TargetOpcode::G_SET_FPENV:
2318 return legalizeSetFPEnv(MI, MRI, B);
2319 case TargetOpcode::G_TRAP:
2320 return legalizeTrap(MI, MRI, B);
2321 case TargetOpcode::G_DEBUGTRAP:
2322 return legalizeDebugTrap(MI, MRI, B);
2323 default:
2324 return false;
2325 }
2326
// Every case above returns; reaching here indicates a broken switch.
2327 llvm_unreachable("expected switch to return");
2328}
2329
2331 unsigned AS,
2333 MachineIRBuilder &B) const {
// Produces an S32 register holding the high 32 bits (aperture base) of the
// given segment address space (LOCAL or PRIVATE), used when widening a
// 32-bit segment pointer to a 64-bit flat pointer. Three strategies below:
// dedicated aperture registers, implicit kernargs, or the queue pointer.
// NOTE(review): several statements (PtrInfo/MMO construction, loadInputValue
// argument) were elided in this extraction — verify against the full source.
2334 MachineFunction &MF = B.getMF();
2335 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
2336 const LLT S32 = LLT::scalar(32);
2337 const LLT S64 = LLT::scalar(64);
2338
2340
2341 if (ST.hasApertureRegs()) {
2342 // Note: this register is somewhat broken. When used as a 32-bit operand,
2343 // it only returns zeroes. The real value is in the upper 32 bits.
2344 // Thus, we must emit extract the high 32 bits.
2345 const unsigned ApertureRegNo = (AS == AMDGPUAS::LOCAL_ADDRESS)
2346 ? AMDGPU::SRC_SHARED_BASE
2347 : AMDGPU::SRC_PRIVATE_BASE;
2348 assert((ApertureRegNo != AMDGPU::SRC_PRIVATE_BASE ||
2349 !ST.hasGloballyAddressableScratch()) &&
2350 "Cannot use src_private_base with globally addressable scratch!");
// Copy the 64-bit aperture register, then unmerge and keep only the high
// half (register index 1), which holds the real aperture value.
2351 Register Dst = MRI.createGenericVirtualRegister(S64);
2352 MRI.setRegClass(Dst, &AMDGPU::SReg_64RegClass);
2353 B.buildCopy({Dst}, {Register(ApertureRegNo)});
2354 return B.buildUnmerge(S32, Dst).getReg(1);
2355 }
2356
2357 Register LoadAddr = MRI.createGenericVirtualRegister(
2359 // For code object version 5, private_base and shared_base are passed through
2360 // implicit kernargs.
2364
2369 ST.getTargetLowering()->getImplicitParameterOffset(B.getMF(), Param);
2370
2371 Register KernargPtrReg = MRI.createGenericVirtualRegister(
2373
// An empty Register signals failure to the caller (see the
// ApertureReg.isValid() check in legalizeAddrSpaceCast).
2374 if (!loadInputValue(KernargPtrReg, B,
2376 return Register();
2377
2379 PtrInfo.getWithOffset(Offset),
2383
2384 // Pointer address
2385 B.buildObjectPtrOffset(LoadAddr, KernargPtrReg,
2386 B.buildConstant(LLT::scalar(64), Offset).getReg(0));
2387 // Load address
2388 return B.buildLoad(S32, LoadAddr, *MMO).getReg(0);
2389 }
2390
// Fallback: load the aperture from the HSA queue pointer.
2391 Register QueuePtr = MRI.createGenericVirtualRegister(
2393
2395 return Register();
2396
2397 // TODO: Use custom PseudoSourceValue
2399
2400 // Offset into amd_queue_t for group_segment_aperture_base_hi /
2401 // private_segment_aperture_base_hi.
2402 uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;
2403
2405 PtrInfo,
2408 LLT::scalar(32), commonAlignment(Align(64), StructOffset));
2409
2410 B.buildObjectPtrOffset(
2411 LoadAddr, QueuePtr,
2412 B.buildConstant(LLT::scalar(64), StructOffset).getReg(0));
2413 return B.buildLoad(S32, LoadAddr, *MMO).getReg(0);
2414}
2415
2416/// Return true if the value is a known valid address, such that a null check is
2417/// not necessary.
2419 const AMDGPUTargetMachine &TM, unsigned AddrSpace) {
// Look at the defining instruction of the vreg: frame indexes, globals and
// block addresses can never be the null pointer; a constant is non-null iff
// it differs from the target's null value for this address space.
2420 MachineInstr *Def = MRI.getVRegDef(Val);
2421 switch (Def->getOpcode()) {
2422 case AMDGPU::G_FRAME_INDEX:
2423 case AMDGPU::G_GLOBAL_VALUE:
2424 case AMDGPU::G_BLOCK_ADDR:
2425 return true;
2426 case AMDGPU::G_CONSTANT: {
2427 const ConstantInt *CI = Def->getOperand(1).getCImm();
2428 return CI->getSExtValue() != TM.getNullPointerValue(AddrSpace);
2429 }
2430 default:
2431 return false;
2432 }
2433
// NOTE(review): this trailing return is unreachable — every path through the
// switch above already returns. Candidate for removal.
2434 return false;
2435}
2436
2439 MachineIRBuilder &B) const {
// Lower an address-space cast. Handles (in order): no-op casts (bitcast),
// flat -> local/private (truncate to low 32 bits, with null check unless the
// source is known non-null), local/private -> flat (merge with the segment
// aperture high bits, with null check unless known non-null), 64-bit ->
// constant-32bit (truncate), constant-32bit -> 64-bit (zero/merge with the
// known high bits), and finally any invalid cast (result is undef).
2440 MachineFunction &MF = B.getMF();
2441
2442 // MI can either be a G_ADDRSPACE_CAST or a
2443 // G_INTRINSIC @llvm.amdgcn.addrspacecast.nonnull
2444 assert(MI.getOpcode() == TargetOpcode::G_ADDRSPACE_CAST ||
2445 (isa<GIntrinsic>(MI) && cast<GIntrinsic>(MI).getIntrinsicID() ==
2446 Intrinsic::amdgcn_addrspacecast_nonnull));
2447
2448 const LLT S32 = LLT::scalar(32);
2449 Register Dst = MI.getOperand(0).getReg();
// For the intrinsic form, operand 1 is the intrinsic ID, so the pointer
// source is operand 2; for G_ADDRSPACE_CAST it is operand 1.
2450 Register Src = isa<GIntrinsic>(MI) ? MI.getOperand(2).getReg()
2451 : MI.getOperand(1).getReg();
2452 LLT DstTy = MRI.getType(Dst);
2453 LLT SrcTy = MRI.getType(Src);
2454 unsigned DestAS = DstTy.getAddressSpace();
2455 unsigned SrcAS = SrcTy.getAddressSpace();
2456
2457 // TODO: Avoid reloading from the queue ptr for each cast, or at least each
2458 // vector element.
2459 assert(!DstTy.isVector());
2460
2461 const AMDGPUTargetMachine &TM
2462 = static_cast<const AMDGPUTargetMachine &>(MF.getTarget());
2463
2464 if (TM.isNoopAddrSpaceCast(SrcAS, DestAS)) {
2465 MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST));
2466 return true;
2467 }
2468
2469 if (SrcAS == AMDGPUAS::FLAT_ADDRESS &&
2470 (DestAS == AMDGPUAS::LOCAL_ADDRESS ||
2471 DestAS == AMDGPUAS::PRIVATE_ADDRESS)) {
2472 auto castFlatToLocalOrPrivate = [&](const DstOp &Dst) -> Register {
2473 if (DestAS == AMDGPUAS::PRIVATE_ADDRESS &&
2474 ST.hasGloballyAddressableScratch()) {
2475 // flat -> private with globally addressable scratch: subtract
2476 // src_flat_scratch_base_lo.
2477 const LLT S32 = LLT::scalar(32);
2478 Register SrcLo = B.buildExtract(S32, Src, 0).getReg(0);
2479 Register FlatScratchBaseLo =
2480 B.buildInstr(AMDGPU::S_MOV_B32, {S32},
2481 {Register(AMDGPU::SRC_FLAT_SCRATCH_BASE_LO)})
2482 .getReg(0);
2483 MRI.setRegClass(FlatScratchBaseLo, &AMDGPU::SReg_32RegClass);
2484 Register Sub = B.buildSub(S32, SrcLo, FlatScratchBaseLo).getReg(0);
2485 return B.buildIntToPtr(Dst, Sub).getReg(0);
2486 }
2487
2488 // Extract low 32-bits of the pointer.
2489 return B.buildExtract(Dst, Src, 0).getReg(0);
2490 };
2491
2492 // For llvm.amdgcn.addrspacecast.nonnull we can always assume non-null, for
2493 // G_ADDRSPACE_CAST we need to guess.
2494 if (isa<GIntrinsic>(MI) || isKnownNonNull(Src, MRI, TM, SrcAS)) {
2495 castFlatToLocalOrPrivate(Dst);
2496 MI.eraseFromParent();
2497 return true;
2498 }
2499
2500 unsigned NullVal = TM.getNullPointerValue(DestAS);
2501
2502 auto SegmentNull = B.buildConstant(DstTy, NullVal);
2503 auto FlatNull = B.buildConstant(SrcTy, 0);
2504
2505 // Extract low 32-bits of the pointer.
2506 auto PtrLo32 = castFlatToLocalOrPrivate(DstTy);
2507
// A null flat pointer must map to the segment's null value, not to the
// truncated bits, hence the compare + select.
2508 auto CmpRes =
2509 B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, FlatNull.getReg(0));
2510 B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0));
2511
2512 MI.eraseFromParent();
2513 return true;
2514 }
2515
2516 if (DestAS == AMDGPUAS::FLAT_ADDRESS &&
2517 (SrcAS == AMDGPUAS::LOCAL_ADDRESS ||
2518 SrcAS == AMDGPUAS::PRIVATE_ADDRESS)) {
2519 auto castLocalOrPrivateToFlat = [&](const DstOp &Dst) -> Register {
2520 // Coerce the type of the low half of the result so we can use
2521 // merge_values.
2522 Register SrcAsInt = B.buildPtrToInt(S32, Src).getReg(0);
2523
2524 if (SrcAS == AMDGPUAS::PRIVATE_ADDRESS &&
2525 ST.hasGloballyAddressableScratch()) {
2526 // For wave32: Addr = (TID[4:0] << 52) + FLAT_SCRATCH_BASE + privateAddr
2527 // For wave64: Addr = (TID[5:0] << 51) + FLAT_SCRATCH_BASE + privateAddr
// mbcnt_lo(~0, 0) (plus mbcnt_hi on wave64) computes the lane index within
// the wave, which becomes the TID field in the high half of the address.
2528 Register AllOnes = B.buildConstant(S32, -1).getReg(0);
2529 Register ThreadID = B.buildConstant(S32, 0).getReg(0);
2530 ThreadID = B.buildIntrinsic(Intrinsic::amdgcn_mbcnt_lo, {S32})
2531 .addUse(AllOnes)
2532 .addUse(ThreadID)
2533 .getReg(0);
2534 if (ST.isWave64()) {
2535 ThreadID = B.buildIntrinsic(Intrinsic::amdgcn_mbcnt_hi, {S32})
2536 .addUse(AllOnes)
2537 .addUse(ThreadID)
2538 .getReg(0);
2539 }
2540 Register ShAmt =
2541 B.buildConstant(S32, 57 - 32 - ST.getWavefrontSizeLog2()).getReg(0);
2542 Register SrcHi = B.buildShl(S32, ThreadID, ShAmt).getReg(0);
2543 Register CvtPtr =
2544 B.buildMergeLikeInstr(DstTy, {SrcAsInt, SrcHi}).getReg(0);
2545 // Accessing src_flat_scratch_base_lo as a 64-bit operand gives the full
2546 // 64-bit hi:lo value.
2547 Register FlatScratchBase =
2548 B.buildInstr(AMDGPU::S_MOV_B64, {S64},
2549 {Register(AMDGPU::SRC_FLAT_SCRATCH_BASE)})
2550 .getReg(0);
2551 MRI.setRegClass(FlatScratchBase, &AMDGPU::SReg_64RegClass);
2552 return B.buildPtrAdd(Dst, CvtPtr, FlatScratchBase).getReg(0);
2553 }
2554
2555 Register ApertureReg = getSegmentAperture(SrcAS, MRI, B);
2556 if (!ApertureReg.isValid())
2557 return false;
2558
2559 // TODO: Should we allow mismatched types but matching sizes in merges to
2560 // avoid the ptrtoint?
// Flat pointer = { low: segment offset, high: aperture base }.
2561 return B.buildMergeLikeInstr(Dst, {SrcAsInt, ApertureReg}).getReg(0);
2562 };
2563
2564 // For llvm.amdgcn.addrspacecast.nonnull we can always assume non-null, for
2565 // G_ADDRSPACE_CAST we need to guess.
2566 if (isa<GIntrinsic>(MI) || isKnownNonNull(Src, MRI, TM, SrcAS)) {
2567 castLocalOrPrivateToFlat(Dst);
2568 MI.eraseFromParent();
2569 return true;
2570 }
2571
2572 Register BuildPtr = castLocalOrPrivateToFlat(DstTy);
2573
2574 auto SegmentNull = B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS));
2575 auto FlatNull = B.buildConstant(DstTy, TM.getNullPointerValue(DestAS));
2576
2577 auto CmpRes = B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src,
2578 SegmentNull.getReg(0));
2579
2580 B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull);
2581
2582 MI.eraseFromParent();
2583 return true;
2584 }
2585
2586 if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
2587 SrcTy.getSizeInBits() == 64) {
2588 // Truncate.
2589 B.buildExtract(Dst, Src, 0);
2590 MI.eraseFromParent();
2591 return true;
2592 }
2593
2594 if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
2595 DstTy.getSizeInBits() == 64) {
2597 uint32_t AddrHiVal = Info->get32BitAddressHighBits();
2598 auto PtrLo = B.buildPtrToInt(S32, Src);
2599 if (AddrHiVal == 0) {
2600 auto Zext = B.buildZExt(LLT::scalar(64), PtrLo);
2601 B.buildIntToPtr(Dst, Zext);
2602 } else {
2603 auto HighAddr = B.buildConstant(S32, AddrHiVal);
2604 B.buildMergeLikeInstr(Dst, {PtrLo, HighAddr});
2605 }
2606
2607 MI.eraseFromParent();
2608 return true;
2609 }
2610
2611 // Invalid casts are poison.
2612 // TODO: Should return poison
2613 B.buildUndef(Dst);
2614 MI.eraseFromParent();
2615 return true;
2616}
2617
2620 MachineIRBuilder &B) const {
// Lower f64 G_INTRINSIC_ROUNDEVEN using the classic 2^52 add/sub trick:
// adding and subtracting copysign(2^52, src) forces rounding to integer in
// the current (nearest-even) mode. Values with |src| > 0x1.fffffffffffffp+51
// are already integral, so the original source is selected for them.
2621 Register Src = MI.getOperand(1).getReg();
2622 LLT Ty = MRI.getType(Src);
2623 assert(Ty.isScalar() && Ty.getSizeInBits() == 64);
2624
2625 APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
2626 APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");
2627
2628 auto C1 = B.buildFConstant(Ty, C1Val);
// copysign keeps the trick correct for negative inputs.
2629 auto CopySign = B.buildFCopysign(Ty, C1, Src);
2630
2631 // TODO: Should this propagate fast-math-flags?
2632 auto Tmp1 = B.buildFAdd(Ty, Src, CopySign);
2633 auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign);
2634
2635 auto C2 = B.buildFConstant(Ty, C2Val);
2636 auto Fabs = B.buildFAbs(Ty, Src);
2637
2638 auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2);
2639 B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2);
2640 MI.eraseFromParent();
2641 return true;
2642}
2643
2646 MachineIRBuilder &B) const {
// Lower f64 G_FCEIL in terms of trunc: ceil(x) = trunc(x) + (x > 0 &&
// x != trunc(x) ? 1.0 : 0.0).
2647
2648 const LLT S1 = LLT::scalar(1);
2649 const LLT S64 = LLT::scalar(64);
2650
2651 Register Src = MI.getOperand(1).getReg();
2652 assert(MRI.getType(Src) == S64);
2653
2654 // result = trunc(src)
2655 // if (src > 0.0 && src != result)
2656 // result += 1.0
2657
2658 auto Trunc = B.buildIntrinsicTrunc(S64, Src);
2659
2660 const auto Zero = B.buildFConstant(S64, 0.0);
2661 const auto One = B.buildFConstant(S64, 1.0);
2662 auto Lt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero);
// ONE (ordered not-equal) is false for NaN, so NaN takes the +0.0 path.
2663 auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc);
2664 auto And = B.buildAnd(S1, Lt0, NeTrunc);
2665 auto Add = B.buildSelect(S64, And, One, Zero);
2666
2667 // TODO: Should this propagate fast-math-flags?
2668 B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add);
2669 MI.eraseFromParent();
2670 return true;
2671}
2672
2675 MachineIRBuilder &B) const {
// Lower G_FREM as: rem = fma(-trunc(x / y), y, x), i.e. x - trunc(x/y)*y,
// using an FMA for the final multiply-subtract. Fast-math flags from the
// original instruction are propagated to every expanded operation.
2676 Register DstReg = MI.getOperand(0).getReg();
2677 Register Src0Reg = MI.getOperand(1).getReg();
2678 Register Src1Reg = MI.getOperand(2).getReg();
2679 auto Flags = MI.getFlags();
2680 LLT Ty = MRI.getType(DstReg);
2681
2682 auto Div = B.buildFDiv(Ty, Src0Reg, Src1Reg, Flags);
2683 auto Trunc = B.buildIntrinsicTrunc(Ty, Div, Flags);
2684 auto Neg = B.buildFNeg(Ty, Trunc, Flags);
2685 B.buildFMA(DstReg, Neg, Src1Reg, Src0Reg, Flags);
2686 MI.eraseFromParent();
2687 return true;
2688}
2689
// Extract the unbiased exponent of an f64 whose high 32 bits are in Hi:
// ubfe pulls the 11 exponent bits starting at bit 20 (52 - 32) of the high
// dword, then the IEEE-754 double bias (1023) is subtracted.
2692 const unsigned FractBits = 52;
2693 const unsigned ExpBits = 11;
2694 LLT S32 = LLT::scalar(32);
2695
2696 auto Const0 = B.buildConstant(S32, FractBits - 32);
2697 auto Const1 = B.buildConstant(S32, ExpBits);
2698
2699 auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32})
2700 .addUse(Hi)
2701 .addUse(Const0.getReg(0))
2702 .addUse(Const1.getReg(0));
2703
2704 return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023));
2705}
2706
2709 MachineIRBuilder &B) const {
2710 const LLT S1 = LLT::scalar(1);
2711 const LLT S32 = LLT::scalar(32);
2712 const LLT S64 = LLT::scalar(64);
2713
2714 Register Src = MI.getOperand(1).getReg();
2715 assert(MRI.getType(Src) == S64);
2716
2717 // TODO: Should this use extract since the low half is unused?
2718 auto Unmerge = B.buildUnmerge({S32, S32}, Src);
2719 Register Hi = Unmerge.getReg(1);
2720
2721 // Extract the upper half, since this is where we will find the sign and
2722 // exponent.
2723 auto Exp = extractF64Exponent(Hi, B);
2724
2725 const unsigned FractBits = 52;
2726
2727 // Extract the sign bit.
2728 const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31);
2729 auto SignBit = B.buildAnd(S32, Hi, SignBitMask);
2730
2731 const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1);
2732
2733 const auto Zero32 = B.buildConstant(S32, 0);
2734
2735 // Extend back to 64-bits.
2736 auto SignBit64 = B.buildMergeLikeInstr(S64, {Zero32, SignBit});
2737
2738 auto Shr = B.buildAShr(S64, FractMask, Exp);
2739 auto Not = B.buildNot(S64, Shr);
2740 auto Tmp0 = B.buildAnd(S64, Src, Not);
2741 auto FiftyOne = B.buildConstant(S32, FractBits - 1);
2742
2743 auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32);
2744 auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne);
2745
2746 auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0);
2747 B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1);
2748 MI.eraseFromParent();
2749 return true;
2750}
2751
2754 MachineIRBuilder &B, bool Signed) const {
// Lower i64 -> f32/f64 conversion (G_SITOFP / G_UITOFP with a 64-bit
// source). For f64 results: convert the halves separately and combine with
// ldexp. For f32 results: normalize the i64 with a shift first so the
// significant bits survive the 24-bit mantissa, then scale back with ldexp.
2755
2756 Register Dst = MI.getOperand(0).getReg();
2757 Register Src = MI.getOperand(1).getReg();
2758
2759 const LLT S64 = LLT::scalar(64);
2760 const LLT S32 = LLT::scalar(32);
2761
2762 assert(MRI.getType(Src) == S64);
2763
2764 auto Unmerge = B.buildUnmerge({S32, S32}, Src);
2765 auto ThirtyTwo = B.buildConstant(S32, 32);
2766
2767 if (MRI.getType(Dst) == S64) {
// Only the high half carries the sign, so only it uses the signed convert.
2768 auto CvtHi = Signed ? B.buildSITOFP(S64, Unmerge.getReg(1))
2769 : B.buildUITOFP(S64, Unmerge.getReg(1));
2770
2771 auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0));
2772 auto LdExp = B.buildFLdexp(S64, CvtHi, ThirtyTwo);
2773
2774 // TODO: Should this propagate fast-math-flags?
2775 B.buildFAdd(Dst, LdExp, CvtLo);
2776 MI.eraseFromParent();
2777 return true;
2778 }
2779
2780 assert(MRI.getType(Dst) == S32);
2781
2782 auto One = B.buildConstant(S32, 1);
2783
2784 MachineInstrBuilder ShAmt;
2785 if (Signed) {
// For the signed case, count leading sign bits (sffbh) of the high half to
// find the normalization shift, clamped so the sign bit is preserved.
2786 auto ThirtyOne = B.buildConstant(S32, 31);
2787 auto X = B.buildXor(S32, Unmerge.getReg(0), Unmerge.getReg(1));
2788 auto OppositeSign = B.buildAShr(S32, X, ThirtyOne);
2789 auto MaxShAmt = B.buildAdd(S32, ThirtyTwo, OppositeSign);
2790 auto LS = B.buildIntrinsic(Intrinsic::amdgcn_sffbh, {S32})
2791 .addUse(Unmerge.getReg(1));
2792 auto LS2 = B.buildSub(S32, LS, One);
2793 ShAmt = B.buildUMin(S32, LS2, MaxShAmt);
2794 } else
2795 ShAmt = B.buildCTLZ(S32, Unmerge.getReg(1));
2796 auto Norm = B.buildShl(S64, Src, ShAmt);
2797 auto Unmerge2 = B.buildUnmerge({S32, S32}, Norm);
// Sticky bit: OR in 1 if any bits were shifted into the (discarded) low
// half so rounding sees the lost precision.
2798 auto Adjust = B.buildUMin(S32, One, Unmerge2.getReg(0));
2799 auto Norm2 = B.buildOr(S32, Unmerge2.getReg(1), Adjust);
2800 auto FVal = Signed ? B.buildSITOFP(S32, Norm2) : B.buildUITOFP(S32, Norm2);
2801 auto Scale = B.buildSub(S32, ThirtyTwo, ShAmt);
2802 B.buildFLdexp(Dst, FVal, Scale);
2803 MI.eraseFromParent();
2804 return true;
2805}
2806
2807// TODO: Copied from DAG implementation. Verify logic and document how this
2808// actually works.
2812 bool Signed) const {
// Lower f32/f64 -> i64 conversion (G_FPTOSI / G_FPTOUI with a 64-bit
// destination) by splitting the truncated value into two 32-bit converts
// (algorithm described in the comment block below). The f32 signed case
// converts |trunc(x)| and re-applies the sign at the end to avoid losing
// mantissa precision.
2813
2814 Register Dst = MI.getOperand(0).getReg();
2815 Register Src = MI.getOperand(1).getReg();
2816
2817 const LLT S64 = LLT::scalar(64);
2818 const LLT S32 = LLT::scalar(32);
2819
2820 const LLT SrcLT = MRI.getType(Src);
2821 assert((SrcLT == S32 || SrcLT == S64) && MRI.getType(Dst) == S64);
2822
2823 unsigned Flags = MI.getFlags();
2824
2825 // The basic idea of converting a floating point number into a pair of 32-bit
2826 // integers is illustrated as follows:
2827 //
2828 // tf := trunc(val);
2829 // hif := floor(tf * 2^-32);
2830 // lof := tf - hif * 2^32; // lof is always positive due to floor.
2831 // hi := fptoi(hif);
2832 // lo := fptoi(lof);
2833 //
2834 auto Trunc = B.buildIntrinsicTrunc(SrcLT, Src, Flags);
2836 if (Signed && SrcLT == S32) {
2837 // However, a 32-bit floating point number has only 23 bits mantissa and
2838 // it's not enough to hold all the significant bits of `lof` if val is
2839 // negative. To avoid the loss of precision, We need to take the absolute
2840 // value after truncating and flip the result back based on the original
2841 // signedness.
// Sign is all-ones if negative, all-zeros otherwise (arithmetic shift of
// the sign bit across the word).
2842 Sign = B.buildAShr(S32, Src, B.buildConstant(S32, 31));
2843 Trunc = B.buildFAbs(S32, Trunc, Flags);
2844 }
// K0 = 2^-32, K1 = -2^32, in the source's own FP format.
2845 MachineInstrBuilder K0, K1;
2846 if (SrcLT == S64) {
2847 K0 = B.buildFConstant(
2848 S64, llvm::bit_cast<double>(UINT64_C(/*2^-32*/ 0x3df0000000000000)));
2849 K1 = B.buildFConstant(
2850 S64, llvm::bit_cast<double>(UINT64_C(/*-2^32*/ 0xc1f0000000000000)));
2851 } else {
2852 K0 = B.buildFConstant(
2853 S32, llvm::bit_cast<float>(UINT32_C(/*2^-32*/ 0x2f800000)));
2854 K1 = B.buildFConstant(
2855 S32, llvm::bit_cast<float>(UINT32_C(/*-2^32*/ 0xcf800000)));
2856 }
2857
2858 auto Mul = B.buildFMul(SrcLT, Trunc, K0, Flags);
2859 auto FloorMul = B.buildFFloor(SrcLT, Mul, Flags);
// Fma computes lof = trunc - hif * 2^32 exactly.
2860 auto Fma = B.buildFMA(SrcLT, FloorMul, K1, Trunc, Flags);
2861
2862 auto Hi = (Signed && SrcLT == S64) ? B.buildFPTOSI(S32, FloorMul)
2863 : B.buildFPTOUI(S32, FloorMul);
2864 auto Lo = B.buildFPTOUI(S32, Fma);
2865
2866 if (Signed && SrcLT == S32) {
2867 // Flip the result based on the signedness, which is either all 0s or 1s.
2868 Sign = B.buildMergeLikeInstr(S64, {Sign, Sign});
2869 // r := xor({lo, hi}, sign) - sign;
2870 B.buildSub(Dst, B.buildXor(S64, B.buildMergeLikeInstr(S64, {Lo, Hi}), Sign),
2871 Sign);
2872 } else
2873 B.buildMergeLikeInstr(Dst, {Lo, Hi});
2874 MI.eraseFromParent();
2875
2876 return true;
2877}
2878
2880 MachineInstr &MI) const {
// Custom handling for G_FMINNUM/G_FMAXNUM (and the *IMUMNUM forms): with
// ieee_mode disabled the hardware instructions already match the required
// semantics, so the instruction is left as-is.
2881 MachineFunction &MF = Helper.MIRBuilder.getMF();
// NOTE(review): the definition of MFI (line 2882) and the final return
// (line 2888) were elided in this extraction — verify against full source.
2883
2884 // With ieee_mode disabled, the instructions have the correct behavior.
2885 if (!MFI->getMode().IEEE)
2886 return true;
2887
2889}
2890
2893 MachineIRBuilder &B) const {
// Custom G_EXTRACT_VECTOR_ELT handling: wide (>64-bit) pointer elements are
// routed through an integer vector via ptrtoint/inttoptr, and constant-index
// extracts are expanded to an unmerge + copy (out-of-range index -> undef).
// Dynamic indices are left for selection to handle via register indexing.
2894 // TODO: Should move some of this into LegalizerHelper.
2895
2896 // TODO: Promote dynamic indexing of s16 to s32
2897
2898 Register Dst = MI.getOperand(0).getReg();
2899 Register Vec = MI.getOperand(1).getReg();
2900
2901 LLT VecTy = MRI.getType(Vec);
2902 LLT EltTy = VecTy.getElementType();
2903 assert(EltTy == MRI.getType(Dst));
2904
2905 // Other legalization maps vector<? x [type bigger than 64 bits]> via bitcasts
2906 // but we can't go directly to that logic becasue you can't bitcast a vector
2907 // of pointers to a vector of integers. Therefore, introduce an intermediate
2908 // vector of integers using ptrtoint (and inttoptr on the output) in order to
2909 // drive the legalization forward.
2910 if (EltTy.isPointer() && EltTy.getSizeInBits() > 64) {
2911 LLT IntTy = LLT::scalar(EltTy.getSizeInBits());
2912 LLT IntVecTy = VecTy.changeElementType(IntTy);
2913
2914 auto IntVec = B.buildPtrToInt(IntVecTy, Vec);
2915 auto IntElt = B.buildExtractVectorElement(IntTy, IntVec, MI.getOperand(2));
2916 B.buildIntToPtr(Dst, IntElt);
2917
2918 MI.eraseFromParent();
2919 return true;
2920 }
2921
2922 // FIXME: Artifact combiner probably should have replaced the truncated
2923 // constant before this, so we shouldn't need
2924 // getIConstantVRegValWithLookThrough.
2925 std::optional<ValueAndVReg> MaybeIdxVal =
2926 getIConstantVRegValWithLookThrough(MI.getOperand(2).getReg(), MRI);
2927 if (!MaybeIdxVal) // Dynamic case will be selected to register indexing.
2928 return true;
2929 const uint64_t IdxVal = MaybeIdxVal->Value.getZExtValue();
2930
2931 if (IdxVal < VecTy.getNumElements()) {
2932 auto Unmerge = B.buildUnmerge(EltTy, Vec);
2933 B.buildCopy(Dst, Unmerge.getReg(IdxVal));
2934 } else {
// Out-of-bounds constant index: the result is undefined.
2935 B.buildUndef(Dst);
2936 }
2937
2938 MI.eraseFromParent();
2939 return true;
2940}
2941
2944 MachineIRBuilder &B) const {
// Custom G_INSERT_VECTOR_ELT handling, mirroring legalizeExtractVectorElt:
// wide (>64-bit) pointer elements go through an integer vector via
// ptrtoint/inttoptr; constant-index inserts are expanded to unmerge,
// replace-one-element, remerge (out-of-range index -> undef). Dynamic
// indices are left for selection to handle via register indexing.
2945 // TODO: Should move some of this into LegalizerHelper.
2946
2947 // TODO: Promote dynamic indexing of s16 to s32
2948
2949 Register Dst = MI.getOperand(0).getReg();
2950 Register Vec = MI.getOperand(1).getReg();
2951 Register Ins = MI.getOperand(2).getReg();
2952
2953 LLT VecTy = MRI.getType(Vec);
2954 LLT EltTy = VecTy.getElementType();
2955 assert(EltTy == MRI.getType(Ins));
2956
2957 // Other legalization maps vector<? x [type bigger than 64 bits]> via bitcasts
2958 // but we can't go directly to that logic becasue you can't bitcast a vector
2959 // of pointers to a vector of integers. Therefore, make the pointer vector
2960 // into an equivalent vector of integers with ptrtoint, insert the ptrtoint'd
2961 // new value, and then inttoptr the result vector back. This will then allow
2962 // the rest of legalization to take over.
2963 if (EltTy.isPointer() && EltTy.getSizeInBits() > 64) {
2964 LLT IntTy = LLT::scalar(EltTy.getSizeInBits());
2965 LLT IntVecTy = VecTy.changeElementType(IntTy);
2966
2967 auto IntVecSource = B.buildPtrToInt(IntVecTy, Vec);
2968 auto IntIns = B.buildPtrToInt(IntTy, Ins);
2969 auto IntVecDest = B.buildInsertVectorElement(IntVecTy, IntVecSource, IntIns,
2970 MI.getOperand(3));
2971 B.buildIntToPtr(Dst, IntVecDest);
2972 MI.eraseFromParent();
2973 return true;
2974 }
2975
2976 // FIXME: Artifact combiner probably should have replaced the truncated
2977 // constant before this, so we shouldn't need
2978 // getIConstantVRegValWithLookThrough.
2979 std::optional<ValueAndVReg> MaybeIdxVal =
2980 getIConstantVRegValWithLookThrough(MI.getOperand(3).getReg(), MRI);
2981 if (!MaybeIdxVal) // Dynamic case will be selected to register indexing.
2982 return true;
2983
2984 const uint64_t IdxVal = MaybeIdxVal->Value.getZExtValue();
2985
2986 unsigned NumElts = VecTy.getNumElements();
2987 if (IdxVal < NumElts) {
// Unmerge into per-element registers, overwrite the target slot with the
// inserted value, then merge back into the destination vector.
2989 for (unsigned i = 0; i < NumElts; ++i)
2990 SrcRegs.push_back(MRI.createGenericVirtualRegister(EltTy));
2991 B.buildUnmerge(SrcRegs, Vec);
2992
2993 SrcRegs[IdxVal] = MI.getOperand(2).getReg();
2994 B.buildMergeLikeInstr(Dst, SrcRegs);
2995 } else {
// Out-of-bounds constant index: the result is undefined.
2996 B.buildUndef(Dst);
2997 }
2998
2999 MI.eraseFromParent();
3000 return true;
3001}
3002
// Custom-legalize G_FSIN / G_FCOS to the hardware trig intrinsics
// (amdgcn_sin / amdgcn_cos). The input is pre-multiplied by 1/(2*pi)
// (presumably the hardware ops take revolutions rather than radians —
// the 0.5 * inv_pi constant below does that conversion). On subtargets
// with a reduced trig input range, the scaled value is first wrapped
// into [0, 1) with amdgcn_fract.
// NOTE(review): the declaration line of this function falls on a line
// not present in this excerpt.
3005                                        MachineIRBuilder &B) const {
3006
3007   Register DstReg = MI.getOperand(0).getReg();
3008   Register SrcReg = MI.getOperand(1).getReg();
3009   LLT Ty = MRI.getType(DstReg);
3010   unsigned Flags = MI.getFlags();
3011
3012   Register TrigVal;
  // 0.5 * (1/pi) == 1/(2*pi).
3013   auto OneOver2Pi = B.buildFConstant(Ty, 0.5 * numbers::inv_pi);
3014   if (ST.hasTrigReducedRange()) {
3015     auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags);
  // Wrap the scaled input into [0, 1) for reduced-range trig hardware.
3016     TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty})
3017                   .addUse(MulVal.getReg(0))
3018                   .setMIFlags(Flags)
3019                   .getReg(0);
3020   } else
3021     TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0);
3022
3023   Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ?
3024     Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos;
3025   B.buildIntrinsic(TrigIntrin, ArrayRef<Register>(DstReg))
3026       .addUse(TrigVal)
3027       .setMIFlags(Flags);
3028   MI.eraseFromParent();
3029   return true;
3030}
3031
// Materialize a PC-relative address of \p GV into \p DstReg using the
// SI_PC_ADD_REL_OFFSET pseudo(s); see the large comment below for the
// emitted machine sequences. Always returns true.
// NOTE(review): the function's declaration line, the ConstPtrTy local
// (original line 3062) and the MachineInstrBuilder initializer lines
// (3070/3074) fall on lines not present in this excerpt.
3034                                             const GlobalValue *GV,
3035                                             int64_t Offset,
3036                                             unsigned GAFlags) const {
3037   assert(isInt<32>(Offset + 4) && "32-bit offset is expected!");
3038   // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered
3039   // to the following code sequence:
3040   //
3041   // For constant address space:
3042   //   s_getpc_b64 s[0:1]
3043   //   s_add_u32 s0, s0, $symbol
3044   //   s_addc_u32 s1, s1, 0
3045   //
3046   //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
3047   //   a fixup or relocation is emitted to replace $symbol with a literal
3048   //   constant, which is a pc-relative offset from the encoding of the $symbol
3049   //   operand to the global variable.
3050   //
3051   // For global address space:
3052   //   s_getpc_b64 s[0:1]
3053   //   s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo
3054   //   s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi
3055   //
3056   //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
3057   //   fixups or relocations are emitted to replace $symbol@*@lo and
3058   //   $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant,
3059   //   which is a 64-bit pc-relative offset from the encoding of the $symbol
3060   //   operand to the global variable.
3061
3063
  // The PC-relative computation is inherently 64-bit; for a 32-bit result
  // pointer, compute into a temporary 64-bit register and extract the low
  // half at the end.
3064   Register PCReg = PtrTy.getSizeInBits() != 32 ? DstReg :
3065     B.getMRI()->createGenericVirtualRegister(ConstPtrTy);
3066
3067   if (ST.has64BitLiterals()) {
3068     assert(GAFlags != SIInstrInfo::MO_NONE);
3070
3071         B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET64).addDef(PCReg);
  // GAFlags + 2 / GAFlags + 1 select sibling variants of the base
  // relocation flag (e.g. the 64-bit or HI form) — assumes the MO_* enum
  // ordering in SIInstrInfo; TODO confirm.
3072     MIB.addGlobalAddress(GV, Offset, GAFlags + 2);
3073   } else {
3075         B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET).addDef(PCReg);
3076
3077     MIB.addGlobalAddress(GV, Offset, GAFlags);
3078     if (GAFlags == SIInstrInfo::MO_NONE)
3079       MIB.addImm(0);
3080     else
3081       MIB.addGlobalAddress(GV, Offset, GAFlags + 1);
3082   }
3083
3084   if (!B.getMRI()->getRegClassOrNull(PCReg))
3085     B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass);
3086
3087   if (PtrTy.getSizeInBits() == 32)
3088     B.buildExtract(DstReg, PCReg, 0);
3089   return true;
3090}
3091
3092// Emit an ABS32_LO / ABS32_HI relocation stub.
// Materialize the absolute address of \p GV into \p DstReg with S_MOV
// instructions carrying MO_ABS* relocation operands. When the subtarget
// supports 64-bit literals, a single S_MOV_B64 with MO_ABS64 is used;
// otherwise the low (and, for 64-bit pointers, high) halves are written
// separately and merged.
3094     Register DstReg, LLT PtrTy, MachineIRBuilder &B, const GlobalValue *GV,
3095     MachineRegisterInfo &MRI) const {
3096   bool RequiresHighHalf = PtrTy.getSizeInBits() != 32;
3097
3098   if (RequiresHighHalf && ST.has64BitLiterals()) {
3099     if (!MRI.getRegClassOrNull(DstReg))
3100       MRI.setRegClass(DstReg, &AMDGPU::SReg_64RegClass);
3101     B.buildInstr(AMDGPU::S_MOV_B64)
3102         .addDef(DstReg)
3103         .addGlobalAddress(GV, 0, SIInstrInfo::MO_ABS64);
3104     return;
3105   }
3106
3107   LLT S32 = LLT::scalar(32);
3108
3109   // Use the destination directly, if and only if we store the lower address
3110   // part only and we don't have a register class being set.
3111   Register AddrLo = !RequiresHighHalf && !MRI.getRegClassOrNull(DstReg)
3112                         ? DstReg
3113                         : MRI.createGenericVirtualRegister(S32);
3114
3115   if (!MRI.getRegClassOrNull(AddrLo))
3116     MRI.setRegClass(AddrLo, &AMDGPU::SReg_32RegClass);
3117
3118   // Write the lower half.
3119   B.buildInstr(AMDGPU::S_MOV_B32)
3120       .addDef(AddrLo)
3121       .addGlobalAddress(GV, 0, SIInstrInfo::MO_ABS32_LO);
3122
3123   // If required, write the upper half as well.
3124   if (RequiresHighHalf) {
3125     assert(PtrTy.getSizeInBits() == 64 &&
3126            "Must provide a 64-bit pointer type!");
3127
3128     Register AddrHi = MRI.createGenericVirtualRegister(S32);
3129     MRI.setRegClass(AddrHi, &AMDGPU::SReg_32RegClass);
3130
3131     B.buildInstr(AMDGPU::S_MOV_B32)
3132         .addDef(AddrHi)
3133         .addGlobalAddress(GV, 0, SIInstrInfo::MO_ABS32_HI);
3134
3135     // Use the destination directly, if and only if we don't have a register
3136     // class being set.
3137     Register AddrDst = !MRI.getRegClassOrNull(DstReg)
3138                            ? DstReg
3139                            : MRI.createGenericVirtualRegister(LLT::scalar(64));
3140
3141     if (!MRI.getRegClassOrNull(AddrDst))
3142       MRI.setRegClass(AddrDst, &AMDGPU::SReg_64RegClass);
3143
3144     B.buildMergeValues(AddrDst, {AddrLo, AddrHi});
3145
3146     // If we created a new register for the destination, cast the result into
3147     // the final output.
3148     if (AddrDst != DstReg)
3149       B.buildCast(DstReg, AddrDst);
3150   } else if (AddrLo != DstReg) {
3151     // If we created a new register for the destination, cast the result into
3152     // the final output.
3153     B.buildCast(DstReg, AddrLo);
3154   }
3155}
3156
// Custom-legalize G_GLOBAL_VALUE. Handles, in order: LDS (local/region)
// globals — warning+trap in non-kernel functions, ABS32_LO rewriting,
// dynamic shared memory (zero-sized extern arrays), and static LDS
// allocation; absolute addressing for PAL/Mesa; then PC-relative fixup,
// PC relocation, or a GOT load for everything else.
// NOTE(review): the function's declaration line, the SIMachineFunctionInfo
// *MFI declaration, part of the LDS address-space condition, and the GOT
// pointer-type/memory-operand declarations fall on lines not present in
// this excerpt.
3159                                              MachineIRBuilder &B) const {
3160   Register DstReg = MI.getOperand(0).getReg();
3161   LLT Ty = MRI.getType(DstReg);
3162   unsigned AS = Ty.getAddressSpace();
3163
3164   const GlobalValue *GV = MI.getOperand(1).getGlobal();
3166   MachineFunction &MF = B.getMF();
3167
3169     if (!MFI->isModuleEntryFunction() &&
3170         GV->getName() != "llvm.amdgcn.module.lds" &&
3172       const Function &Fn = MF.getFunction();
3174           Fn, "local memory global used by non-kernel function",
3175           MI.getDebugLoc(), DS_Warning));
3176
3177       // We currently don't have a way to correctly allocate LDS objects that
3178       // aren't directly associated with a kernel. We do force inlining of
3179       // functions that use local objects. However, if these dead functions are
3180       // not eliminated, we don't want a compile time error. Just emit a warning
3181       // and a trap, since there should be no callable path here.
3182       B.buildTrap();
3183       B.buildUndef(DstReg);
3184       MI.eraseFromParent();
3185       return true;
3186     }
3187
3188     // TODO: We could emit code to handle the initialization somewhere.
3189     // We ignore the initializer for now and legalize it to allow selection.
3190     // The initializer will anyway get errored out during assembly emission.
3191     const SITargetLowering *TLI = ST.getTargetLowering();
3192     if (!TLI->shouldUseLDSConstAddress(GV)) {
3193       MI.getOperand(1).setTargetFlags(SIInstrInfo::MO_ABS32_LO);
3194       return true; // Leave in place;
3195     }
3196
3197     const GlobalVariable &GVar = *cast<GlobalVariable>(GV);
3198     if (AS == AMDGPUAS::LOCAL_ADDRESS && GV->hasExternalLinkage()) {
3199       // HIP uses an unsized array `extern __shared__ T s[]` or similar
3200       // zero-sized type in other languages to declare the dynamic shared
3201       // memory which size is not known at the compile time. They will be
3202       // allocated by the runtime and placed directly after the static
3203       // allocated ones. They all share the same offset.
3204       if (GVar.getGlobalSize(GVar.getDataLayout()) == 0) {
3205         // Adjust alignment for that dynamic shared memory array.
3206         MFI->setDynLDSAlign(MF.getFunction(), GVar);
3207         LLT S32 = LLT::scalar(32);
  // Dynamic LDS starts right after all static LDS: its offset is the total
  // static group size, queried via amdgcn_groupstaticsize.
3208         auto Sz = B.buildIntrinsic(Intrinsic::amdgcn_groupstaticsize, {S32});
3209         B.buildIntToPtr(DstReg, Sz);
3210         MI.eraseFromParent();
3211         return true;
3212       }
3213     }
3214
3215     B.buildConstant(DstReg, MFI->allocateLDSGlobal(B.getDataLayout(), GVar));
3216     MI.eraseFromParent();
3217     return true;
3218   }
3219
3220   if (ST.isAmdPalOS() || ST.isMesa3DOS()) {
3221     buildAbsGlobalAddress(DstReg, Ty, B, GV, MRI);
3222     MI.eraseFromParent();
3223     return true;
3224   }
3225
3226   const SITargetLowering *TLI = ST.getTargetLowering();
3227
3228   if (TLI->shouldEmitFixup(GV)) {
3229     buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0);
3230     MI.eraseFromParent();
3231     return true;
3232   }
3233
3234   if (TLI->shouldEmitPCReloc(GV)) {
3235     buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32);
3236     MI.eraseFromParent();
3237     return true;
3238   }
3240
  // Fallback: load the address from the GOT. The GOT entry itself is
  // addressed PC-relatively with MO_GOTPCREL32.
3241   Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy);
3242
3243   LLT LoadTy = Ty.getSizeInBits() == 32 ? PtrTy : Ty;
3248       LoadTy, Align(8));
3250   buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32);
3251
3252   if (Ty.getSizeInBits() == 32) {
3253     // Truncate if this is a 32-bit constant address.
3254     auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO);
3255     B.buildExtract(DstReg, Load, 0);
3256   } else
3257     B.buildLoad(DstReg, GOTAddr, *GOTMMO);
3258
3259   MI.eraseFromParent();
3260   return true;
3261}
3262
// Round a type up to the next power of two: the element count for vectors,
// the bit width for scalars.
// NOTE(review): the defining line of this helper falls on a line not
// present in this excerpt.
3264   if (Ty.isVector())
3265     return Ty.changeElementCount(
3266         ElementCount::getFixed(PowerOf2Ceil(Ty.getNumElements())));
3267   return LLT::scalar(PowerOf2Ceil(Ty.getSizeInBits()));
3268}
3269
// Custom-legalize loads: rewrite 32-bit constant-address-space pointers via
// an addrspacecast, apply the buffer-resource workaround, and widen
// under-sized / under-aligned G_LOADs to the next power-of-two memory size,
// extracting or truncating the wide result back to the requested type.
// Returns true if the instruction was changed.
3271                                        MachineInstr &MI) const {
3272   MachineIRBuilder &B = Helper.MIRBuilder;
3273   MachineRegisterInfo &MRI = *B.getMRI();
3274   GISelChangeObserver &Observer = Helper.Observer;
3275
3276   Register PtrReg = MI.getOperand(1).getReg();
3277   LLT PtrTy = MRI.getType(PtrReg);
3278   unsigned AddrSpace = PtrTy.getAddressSpace();
3279
  // CONSTANT_ADDRESS_32BIT pointers are re-expressed through a cast to the
  // (64-bit) constant address space so only one form needs selection.
3280   if (AddrSpace == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
3282     auto Cast = B.buildAddrSpaceCast(ConstPtr, PtrReg);
3283     Observer.changingInstr(MI);
3284     MI.getOperand(1).setReg(Cast.getReg(0));
3285     Observer.changedInstr(MI);
3286     return true;
3287   }
3288
3289   if (MI.getOpcode() != AMDGPU::G_LOAD)
3290     return false;
3291
3292   Register ValReg = MI.getOperand(0).getReg();
3293   LLT ValTy = MRI.getType(ValReg);
3294
3295   if (hasBufferRsrcWorkaround(ValTy)) {
3296     Observer.changingInstr(MI);
3298     Observer.changedInstr(MI);
3299     return true;
3300   }
3301
3302   MachineMemOperand *MMO = *MI.memoperands_begin();
3303   const unsigned ValSize = ValTy.getSizeInBits();
3304   const LLT MemTy = MMO->getMemoryType();
3305   const Align MemAlign = MMO->getAlign();
3306   const unsigned MemSize = MemTy.getSizeInBits();
3307   const uint64_t AlignInBits = 8 * MemAlign.value();
3308
3309   // Widen non-power-of-2 loads to the alignment if needed
3310   if (shouldWidenLoad(ST, MemTy, AlignInBits, AddrSpace, MI.getOpcode())) {
3311     const unsigned WideMemSize = PowerOf2Ceil(MemSize);
3312
3313     // This was already the correct extending load result type, so just adjust
3314     // the memory type.
3315     if (WideMemSize == ValSize) {
3316       MachineFunction &MF = B.getMF();
3317
3318       MachineMemOperand *WideMMO =
3319           MF.getMachineMemOperand(MMO, 0, WideMemSize / 8);
3320       Observer.changingInstr(MI);
3321       MI.setMemRefs(MF, {WideMMO});
3322       Observer.changedInstr(MI);
3323       return true;
3324     }
3325
3326     // Don't bother handling edge case that should probably never be produced.
3327     if (ValSize > WideMemSize)
3328       return false;
3329
3330     LLT WideTy = widenToNextPowerOf2(ValTy);
3331
3332     Register WideLoad;
3333     if (!WideTy.isVector()) {
3334       WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
3335       B.buildTrunc(ValReg, WideLoad).getReg(0);
3336     } else {
3337       // Extract the subvector.
3338
3339       if (isRegisterType(ST, ValTy)) {
3340         // If this is a case where G_EXTRACT is legal, use it.
3341         // (e.g. <3 x s32> -> <4 x s32>)
3342         WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
3343         B.buildExtract(ValReg, WideLoad, 0);
3344       } else {
3345         // For cases where the widened type isn't a nice register value, unmerge
3346         // from a widened register (e.g. <3 x s16> -> <4 x s16>)
3347         WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
3348         B.buildDeleteTrailingVectorElements(ValReg, WideLoad);
3349       }
3350     }
3351
3352     MI.eraseFromParent();
3353     return true;
3354   }
3355
3356   return false;
3357}
3358
// Custom-legalize stores: only applies the buffer-resource workaround to
// the stored value; returns false (no change) otherwise.
// NOTE(review): the declaration line and the workaround call inside the if
// (original lines 3359/3370) fall on lines not present in this excerpt.
3360                                         MachineInstr &MI) const {
3361   MachineIRBuilder &B = Helper.MIRBuilder;
3362   MachineRegisterInfo &MRI = *B.getMRI();
3363   GISelChangeObserver &Observer = Helper.Observer;
3364
3365   Register DataReg = MI.getOperand(0).getReg();
3366   LLT DataTy = MRI.getType(DataReg);
3367
3368   if (hasBufferRsrcWorkaround(DataTy)) {
3369     Observer.changingInstr(MI);
3371     Observer.changedInstr(MI);
3372     return true;
3373   }
3374   return false;
3375}
3376
// Custom-legalize G_FMAD: keep it legal for f32/f16 when the (not fully
// visible) mode condition holds, otherwise lower it through
// LegalizerHelper::lowerFMad with a local builder/observer.
// NOTE(review): the declaration line and the right-hand sides of the two
// conditions (original lines 3389/3392, presumably denormal-mode checks)
// fall on lines not present in this excerpt — confirm against upstream.
3379                                       MachineIRBuilder &B) const {
3380   LLT Ty = MRI.getType(MI.getOperand(0).getReg());
3381   assert(Ty.isScalar());
3382
3383   MachineFunction &MF = B.getMF();
3385
3386   // TODO: Always legal with future ftz flag.
3387   // FIXME: Do we need just output?
3388   if (Ty == LLT::float32() &&
3390     return true;
3391   if (Ty == LLT::float16() &&
3393     return true;
3394
  // Fall back to the target-independent lowering (mul + add expansion).
3395   MachineIRBuilder HelperBuilder(MI);
3396   GISelObserverWrapper DummyObserver;
3397   LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
3398   return Helper.lowerFMad(MI) == LegalizerHelper::Legalized;
3399}
3400
// Custom-legalize G_ATOMIC_CMPXCHG for flat/global address spaces: pack
// {new, cmp} into a 2-element vector operand and emit the target pseudo
// G_AMDGPU_ATOMIC_CMPXCHG, preserving the memory operands.
// NOTE(review): the function's declaration line falls on a line not
// present in this excerpt.
3403   Register DstReg = MI.getOperand(0).getReg();
3404   Register PtrReg = MI.getOperand(1).getReg();
3405   Register CmpVal = MI.getOperand(2).getReg();
3406   Register NewVal = MI.getOperand(3).getReg();
3407
3408   assert(AMDGPU::isFlatGlobalAddrSpace(MRI.getType(PtrReg).getAddressSpace()) &&
3409          "this should not have been custom lowered");
3410
3411   LLT ValTy = MRI.getType(CmpVal);
3412   LLT VecTy = LLT::fixed_vector(2, ValTy);
3413
  // Operand order is {NewVal, CmpVal}, matching the pseudo's expectation.
3414   Register PackedVal = B.buildBuildVector(VecTy, { NewVal, CmpVal }).getReg(0);
3415
3416   B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG)
3417       .addDef(DstReg)
3418       .addUse(PtrReg)
3419       .addUse(PackedVal)
3420       .setMemRefs(MI.memoperands());
3421
3422   MI.eraseFromParent();
3423   return true;
3424}
3425
3426/// Return true if it's known that \p Src can never be an f32 denormal value.
// Walks the defining instruction of \p Src: certain intrinsic and generic
// opcodes (log/exp/sqrt variants, frexp mantissa, fpext-from-f16) are known
// to produce normalized f32 results.
// NOTE(review): the defining line and the inner switch header over the
// intrinsic ID (original lines 3427/3432) fall on lines not present in
// this excerpt.
3428                                        Register Src) {
3429   const MachineInstr *DefMI = MRI.getVRegDef(Src);
3430   switch (DefMI->getOpcode()) {
3431   case TargetOpcode::G_INTRINSIC: {
3433     case Intrinsic::amdgcn_frexp_mant:
3434     case Intrinsic::amdgcn_log:
3435     case Intrinsic::amdgcn_log_clamp:
3436     case Intrinsic::amdgcn_exp2:
3437     case Intrinsic::amdgcn_sqrt:
3438       return true;
3439     default:
3440       break;
3441     }
3442
3443     break;
3444   }
3445   case TargetOpcode::G_FSQRT:
3446     return true;
3447   case TargetOpcode::G_FFREXP: {
  // Only the first result (the mantissa) is known non-denormal.
3448     if (DefMI->getOperand(0).getReg() == Src)
3449       return true;
3450     break;
3451   }
3452   case TargetOpcode::G_FPEXT: {
  // No f16 value extends to an f32 denormal.
3453     return MRI.getType(DefMI->getOperand(1).getReg()) == LLT::scalar(16);
3454   }
3455   default:
3456     return false;
3457   }
3458
3459   return false;
3460}
3461
3462static bool allowApproxFunc(const MachineFunction &MF, unsigned Flags) {
3463 return Flags & MachineInstr::FmAfn;
3464}
3465
// True when f32 denormal inputs must be handled explicitly: the value is
// not provably non-denormal (see valueIsKnownNeverF32Denorm) and the
// remaining (not visible here) conditions hold.
// NOTE(review): the defining line and the rest of the conjunction
// (original lines 3466/3469-3470, presumably approx-func and denormal-mode
// checks) fall on lines not present in this excerpt.
3467                                   unsigned Flags) {
3468   return !valueIsKnownNeverF32Denorm(MF.getRegInfo(), Src) &&
3471}
3472
// If f32 denormal handling is needed for \p Src, scale it into the normal
// range by multiplying with 2^32 when it is below the smallest normal, and
// return {scaled input, is-scaled condition}; callers undo the scale on the
// result. Returns {} (null registers) when no scaling is required.
3473std::pair<Register, Register>
3475                                        unsigned Flags) const {
3476   if (!needsDenormHandlingF32(B.getMF(), Src, Flags))
3477     return {};
3478
3479   const LLT F32 = LLT::scalar(32);
  // NOTE(review): the constant's second line (original 3481, presumably
  // APFloat::getSmallestNormalized for IEEE single) is not present in this
  // excerpt.
3480   auto SmallestNormal = B.buildFConstant(
3482   auto IsLtSmallestNormal =
3483       B.buildFCmp(CmpInst::FCMP_OLT, LLT::scalar(1), Src, SmallestNormal);
3484
3485   auto Scale32 = B.buildFConstant(F32, 0x1.0p+32);
3486   auto One = B.buildFConstant(F32, 1.0);
3487   auto ScaleFactor =
3488       B.buildSelect(F32, IsLtSmallestNormal, Scale32, One, Flags);
3489   auto ScaledInput = B.buildFMul(F32, Src, ScaleFactor, Flags);
3490
3491   return {ScaledInput.getReg(0), IsLtSmallestNormal.getReg(0)};
3492}
3493
// Custom-legalize G_FLOG2 via the amdgcn_log intrinsic. f16 promotes
// through f32 (no f16 value is denormal once extended); f32 optionally
// scales denormal inputs up by 2^32 and subtracts 32 from the result.
3495                                       MachineIRBuilder &B) const {
3496   // v_log_f32 is good enough for OpenCL, except it doesn't handle denormals.
3497   // If we have to handle denormals, scale up the input and adjust the result.
3498
3499   // scaled = x * (is_denormal ? 0x1.0p+32 : 1.0)
3500   // log2 = amdgpu_log2 - (is_denormal ? 32.0 : 0.0)
3501
3502   Register Dst = MI.getOperand(0).getReg();
3503   Register Src = MI.getOperand(1).getReg();
3504   LLT Ty = B.getMRI()->getType(Dst);
3505   unsigned Flags = MI.getFlags();
3506
3507   if (Ty == LLT::scalar(16)) {
3508     const LLT F32 = LLT::scalar(32);
3509     // Nothing in half is a denormal when promoted to f32.
3510     auto Ext = B.buildFPExt(F32, Src, Flags);
3511     auto Log2 = B.buildIntrinsic(Intrinsic::amdgcn_log, {F32})
3512                     .addUse(Ext.getReg(0))
3513                     .setMIFlags(Flags);
3514     B.buildFPTrunc(Dst, Log2, Flags);
3515     MI.eraseFromParent();
3516     return true;
3517   }
3518
3519   assert(Ty == LLT::scalar(32));
3520
3521   auto [ScaledInput, IsLtSmallestNormal] = getScaledLogInput(B, Src, Flags);
3522   if (!ScaledInput) {
  // No denormal handling needed: emit the raw hardware log.
3523     B.buildIntrinsic(Intrinsic::amdgcn_log, {MI.getOperand(0)})
3524         .addUse(Src)
3525         .setMIFlags(Flags);
3526     MI.eraseFromParent();
3527     return true;
3528   }
3529
3530   auto Log2 = B.buildIntrinsic(Intrinsic::amdgcn_log, {Ty})
3531                   .addUse(ScaledInput)
3532                   .setMIFlags(Flags);
3533
  // log2(x * 2^32) == log2(x) + 32, so subtract 32 when scaling occurred.
3534   auto ThirtyTwo = B.buildFConstant(Ty, 32.0);
3535   auto Zero = B.buildFConstant(Ty, 0.0);
3536   auto ResultOffset =
3537       B.buildSelect(Ty, IsLtSmallestNormal, ThirtyTwo, Zero, Flags);
3538   B.buildFSub(Dst, Log2, ResultOffset, Flags);
3539
3540   MI.eraseFromParent();
3541   return true;
3542}
3543
// Build an unfused multiply-add: X*Y + Z as separate fmul/fadd (not FMA).
// NOTE(review): the defining line of this helper falls on a line not
// present in this excerpt.
3545                        Register Z, unsigned Flags) {
3546   auto FMul = B.buildFMul(Ty, X, Y, Flags);
3547   return B.buildFAdd(Ty, FMul, Z, Flags).getReg(0);
3548}
3549
// Shared custom legalization of G_FLOG / G_FLOG10. f16 or "afn" ops take
// the fast path via legalizeFlogUnsafe; otherwise the f32 hardware log2 is
// refined with two-term constant splits of ln(2) / ln(2)/ln(10) (FMA-based
// when fast FMA is available), guarded for non-finite inputs, and adjusted
// for the denormal input scaling applied by getScaledLogInput.
3551                                           MachineIRBuilder &B) const {
3552   const bool IsLog10 = MI.getOpcode() == TargetOpcode::G_FLOG10;
3553   assert(IsLog10 || MI.getOpcode() == TargetOpcode::G_FLOG);
3554
3555   MachineRegisterInfo &MRI = *B.getMRI();
3556   Register Dst = MI.getOperand(0).getReg();
3557   Register X = MI.getOperand(1).getReg();
3558   unsigned Flags = MI.getFlags();
3559   const LLT Ty = MRI.getType(X);
3560
3561   const LLT F32 = LLT::scalar(32);
3562   const LLT F16 = LLT::scalar(16);
3563
3564   if (Ty == F16 || MI.getFlag(MachineInstr::FmAfn)) {
3565     // TODO: The direct f16 path is 1.79 ulp for f16. This should be used
3566     // depending on !fpmath metadata.
3567     bool PromoteToF32 =
3568         Ty == F16 && (!MI.getFlag(MachineInstr::FmAfn) || !ST.has16BitInsts());
3569     if (PromoteToF32) {
3570       Register LogVal = MRI.createGenericVirtualRegister(F32);
3571       auto PromoteSrc = B.buildFPExt(F32, X);
3572       legalizeFlogUnsafe(B, LogVal, PromoteSrc.getReg(0), IsLog10, Flags);
3573       B.buildFPTrunc(Dst, LogVal);
3574     } else {
3575       legalizeFlogUnsafe(B, Dst, X, IsLog10, Flags);
3576     }
3577
3578     MI.eraseFromParent();
3579     return true;
3580   }
3581
3582   auto [ScaledInput, IsScaled] = getScaledLogInput(B, X, Flags);
3583   if (ScaledInput)
3584     X = ScaledInput;
3585
3586   auto Y =
3587       B.buildIntrinsic(Intrinsic::amdgcn_log, {Ty}).addUse(X).setMIFlags(Flags);
3588
3589   Register R;
3590   if (ST.hasFastFMAF32()) {
3591     // c+cc are ln(2)/ln(10) to more than 49 bits
3592     const float c_log10 = 0x1.344134p-2f;
3593     const float cc_log10 = 0x1.09f79ep-26f;
3594
3595     // c + cc is ln(2) to more than 49 bits
3596     const float c_log = 0x1.62e42ep-1f;
3597     const float cc_log = 0x1.efa39ep-25f;
3598
3599     auto C = B.buildFConstant(Ty, IsLog10 ? c_log10 : c_log);
3600     auto CC = B.buildFConstant(Ty, IsLog10 ? cc_log10 : cc_log);
3601     // This adds correction terms for which contraction may lead to an increase
3602     // in the error of the approximation, so disable it.
3603     auto NewFlags = Flags & ~(MachineInstr::FmContract);
3604     R = B.buildFMul(Ty, Y, C, NewFlags).getReg(0);
3605     auto NegR = B.buildFNeg(Ty, R, NewFlags);
3606     auto FMA0 = B.buildFMA(Ty, Y, C, NegR, NewFlags);
3607     auto FMA1 = B.buildFMA(Ty, Y, CC, FMA0, NewFlags);
3608     R = B.buildFAdd(Ty, R, FMA1, NewFlags).getReg(0);
3609   } else {
3610     // ch+ct is ln(2)/ln(10) to more than 36 bits
3611     const float ch_log10 = 0x1.344000p-2f;
3612     const float ct_log10 = 0x1.3509f6p-18f;
3613
3614     // ch + ct is ln(2) to more than 36 bits
3615     const float ch_log = 0x1.62e000p-1f;
3616     const float ct_log = 0x1.0bfbe8p-15f;
3617
3618     auto CH = B.buildFConstant(Ty, IsLog10 ? ch_log10 : ch_log);
3619     auto CT = B.buildFConstant(Ty, IsLog10 ? ct_log10 : ct_log);
3620
  // Split Y into a high part (mantissa bits masked) and a low remainder so
  // the product with the split constant can be accumulated accurately.
3621     auto MaskConst = B.buildConstant(Ty, 0xfffff000);
3622     auto YH = B.buildAnd(Ty, Y, MaskConst);
3623     auto YT = B.buildFSub(Ty, Y, YH, Flags);
3624     // This adds correction terms for which contraction may lead to an increase
3625     // in the error of the approximation, so disable it.
3626     auto NewFlags = Flags & ~(MachineInstr::FmContract);
3627     auto YTCT = B.buildFMul(Ty, YT, CT, NewFlags);
3628
3629     Register Mad0 =
3630         getMad(B, Ty, YH.getReg(0), CT.getReg(0), YTCT.getReg(0), NewFlags);
3631     Register Mad1 = getMad(B, Ty, YT.getReg(0), CH.getReg(0), Mad0, NewFlags);
3632     R = getMad(B, Ty, YH.getReg(0), CH.getReg(0), Mad1, NewFlags);
3633   }
3634
  // NOTE(review): the right-hand side of this condition (original line
  // 3636, presumably ninf/nnan flag checks) is not present in this excerpt.
3635   const bool IsFiniteOnly =
3637
3638   if (!IsFiniteOnly) {
3639     // Expand isfinite(x) => fabs(x) < inf
3640     auto Inf = B.buildFConstant(Ty, APFloat::getInf(APFloat::IEEEsingle()));
3641     auto Fabs = B.buildFAbs(Ty, Y);
3642     auto IsFinite =
3643         B.buildFCmp(CmpInst::FCMP_OLT, LLT::scalar(1), Fabs, Inf, Flags);
  // Pass non-finite hardware results (inf/nan) through unrefined.
3644     R = B.buildSelect(Ty, IsFinite, R, Y, Flags).getReg(0);
3645   }
3646
3647   if (ScaledInput) {
  // Undo the 2^32 input scale: subtract 32*log10(2) or 32*ln(2)... expressed
  // here directly as the precomputed constants below.
3648     auto Zero = B.buildFConstant(Ty, 0.0);
3649     auto ShiftK =
3650         B.buildFConstant(Ty, IsLog10 ? 0x1.344136p+3f : 0x1.62e430p+4f);
3651     auto Shift = B.buildSelect(Ty, IsScaled, ShiftK, Zero, Flags);
3652     B.buildFSub(Dst, R, Shift, Flags);
3653   } else {
3654     B.buildCopy(Dst, R);
3655   }
3656
3657   MI.eraseFromParent();
3658   return true;
3659}
3660
// Fast ("unsafe") lowering of log/log10 as log2(x) * (1/log2(base)).
// For f32 with denormal handling, the input is pre-scaled by 2^32 and the
// result compensated by -32/log2(base). Always returns true.
// NOTE(review): the defining line and the Log2BaseInverted initializer
// (original lines 3661/3665) fall on lines not present in this excerpt.
3662                                          Register Src, bool IsLog10,
3663                                          unsigned Flags) const {
3664   const double Log2BaseInverted =
3666
3667   LLT Ty = B.getMRI()->getType(Dst);
3668
3669   if (Ty == LLT::scalar(32)) {
3670     auto [ScaledInput, IsScaled] = getScaledLogInput(B, Src, Flags);
3671     if (ScaledInput) {
3672       auto LogSrc = B.buildIntrinsic(Intrinsic::amdgcn_log, {Ty})
3673                         .addUse(Src)
3674                         .setMIFlags(Flags);
  // Compensation for the 2^32 input scale: -32 * (1/log2(base)).
3675       auto ScaledResultOffset = B.buildFConstant(Ty, -32.0 * Log2BaseInverted);
3676       auto Zero = B.buildFConstant(Ty, 0.0);
3677       auto ResultOffset =
3678           B.buildSelect(Ty, IsScaled, ScaledResultOffset, Zero, Flags);
3679       auto Log2Inv = B.buildFConstant(Ty, Log2BaseInverted);
3680
3681       if (ST.hasFastFMAF32())
3682         B.buildFMA(Dst, LogSrc, Log2Inv, ResultOffset, Flags);
3683       else {
3684         auto Mul = B.buildFMul(Ty, LogSrc, Log2Inv, Flags);
3685         B.buildFAdd(Dst, Mul, ResultOffset, Flags);
3686       }
3687
3688       return true;
3689     }
3690   }
3691
  // f16 uses generic G_FLOG2 (later legalized); f32 uses the hardware log.
3692   auto Log2Operand = Ty == LLT::scalar(16)
3693                          ? B.buildFLog2(Ty, Src, Flags)
3694                          : B.buildIntrinsic(Intrinsic::amdgcn_log, {Ty})
3695                                .addUse(Src)
3696                                .setMIFlags(Flags);
3697   auto Log2BaseInvertedOperand = B.buildFConstant(Ty, Log2BaseInverted);
3698   B.buildFMul(Dst, Log2Operand, Log2BaseInvertedOperand, Flags);
3699   return true;
3700}
3701
// Custom-legalize G_FEXP2 via the amdgcn_exp2 intrinsic. f64 dispatches to
// legalizeFEXPF64; f16 promotes through f32; f32 optionally pre-offsets
// inputs below ~-63 by +64 and rescales the result by 2^-64 to keep
// denormal outputs representable.
3703                                        MachineIRBuilder &B) const {
3704   // v_exp_f32 is good enough for OpenCL, except it doesn't handle denormals.
3705   // If we have to handle denormals, scale up the input and adjust the result.
3706
3707   Register Dst = MI.getOperand(0).getReg();
3708   Register Src = MI.getOperand(1).getReg();
3709   unsigned Flags = MI.getFlags();
3710   LLT Ty = B.getMRI()->getType(Dst);
3711   const LLT F16 = LLT::scalar(16);
3712   const LLT F32 = LLT::scalar(32);
3713   const LLT F64 = LLT::scalar(64);
3714
3715   if (Ty == F64)
3716     return legalizeFEXPF64(MI, B);
3717
3718   if (Ty == F16) {
3719     // Nothing in half is a denormal when promoted to f32.
3720     auto Ext = B.buildFPExt(F32, Src, Flags);
3721     auto Log2 = B.buildIntrinsic(Intrinsic::amdgcn_exp2, {F32})
3722                     .addUse(Ext.getReg(0))
3723                     .setMIFlags(Flags);
3724     B.buildFPTrunc(Dst, Log2, Flags);
3725     MI.eraseFromParent();
3726     return true;
3727   }
3728
3729   assert(Ty == F32);
3730
3731   if (!needsDenormHandlingF32(B.getMF(), Src, Flags)) {
3732     B.buildIntrinsic(Intrinsic::amdgcn_exp2, ArrayRef<Register>{Dst})
3733         .addUse(Src)
3734         .setMIFlags(Flags);
3735     MI.eraseFromParent();
3736     return true;
3737   }
3738
3739   // bool needs_scaling = x < -0x1.f80000p+6f;
3740   // v_exp_f32(x + (s ? 0x1.0p+6f : 0.0f)) * (s ? 0x1.0p-64f : 1.0f);
3741
3742   // -nextafter(128.0, -1)
3743   auto RangeCheckConst = B.buildFConstant(Ty, -0x1.f80000p+6f);
3744   auto NeedsScaling = B.buildFCmp(CmpInst::FCMP_OLT, LLT::scalar(1), Src,
3745                                   RangeCheckConst, Flags);
3746
  // 2^(x+64) * 2^-64 == 2^x, but the intermediate stays in normal range.
3747   auto SixtyFour = B.buildFConstant(Ty, 0x1.0p+6f);
3748   auto Zero = B.buildFConstant(Ty, 0.0);
3749   auto AddOffset = B.buildSelect(F32, NeedsScaling, SixtyFour, Zero, Flags);
3750   auto AddInput = B.buildFAdd(F32, Src, AddOffset, Flags);
3751
3752   auto Exp2 = B.buildIntrinsic(Intrinsic::amdgcn_exp2, {Ty})
3753                   .addUse(AddInput.getReg(0))
3754                   .setMIFlags(Flags);
3755
3756   auto TwoExpNeg64 = B.buildFConstant(Ty, 0x1.0p-64f);
3757   auto One = B.buildFConstant(Ty, 1.0);
3758   auto ResultScale = B.buildSelect(F32, NeedsScaling, TwoExpNeg64, One, Flags);
3759   B.buildFMul(Dst, Exp2, ResultScale, Flags);
3760   MI.eraseFromParent();
3761   return true;
3762}
3763
// Build an exp2 of \p Src into \p Dst: the amdgcn_exp2 intrinsic for f32,
// the generic G_FEXP2 otherwise.
// NOTE(review): the defining line of this helper falls on a line not
// present in this excerpt.
3765                                    const SrcOp &Src, unsigned Flags) {
3766   LLT Ty = Dst.getLLTTy(*B.getMRI());
3767
3768   if (Ty == LLT::scalar(32)) {
3769     return B.buildIntrinsic(Intrinsic::amdgcn_exp2, {Dst})
3770         .addUse(Src.getReg())
3771         .setMIFlags(Flags);
3772   }
3773   return B.buildFExp2(Dst, Src, Flags);
3774}
3775
// Fast exp/exp10 lowering without denormal handling:
//   exp(x)   -> exp2(log2(e)  * x)
//   exp10(x) -> exp2(log2(10) * x)
// Always returns true.
3777                                               Register Dst, Register X,
3778                                               unsigned Flags,
3779                                               bool IsExp10) const {
3780   LLT Ty = B.getMRI()->getType(X);
3781
3782   // exp(x) -> exp2(M_LOG2E_F * x);
3783   // exp10(x) -> exp2(log2(10) * x);
  // 0x1.a934f0p+1f ~= log2(10).
3784   auto Const = B.buildFConstant(Ty, IsExp10 ? 0x1.a934f0p+1f : numbers::log2e);
3785   auto Mul = B.buildFMul(Ty, X, Const, Flags);
3786   buildExp(B, Dst, Mul, Flags);
3787   return true;
3788}
3789
// Fast exp lowering. Without f32 denormal concerns, defers to
// legalizeFExpUnsafeImpl. Otherwise, inputs below the threshold are
// offset by +64 (in the exp domain) before exp2 and the result is scaled
// back down so denormal-range outputs survive. Always returns true.
3791                                            Register X, unsigned Flags) const {
3792   LLT Ty = B.getMRI()->getType(Dst);
3793   LLT F32 = LLT::scalar(32);
3794
3795   if (Ty != F32 || !needsDenormHandlingF32(B.getMF(), X, Flags)) {
3796     return legalizeFExpUnsafeImpl(B, Dst, X, Flags, /*IsExp10=*/false);
3797   }
3798
3799   auto Threshold = B.buildFConstant(Ty, -0x1.5d58a0p+6f);
3800   auto NeedsScaling =
3801       B.buildFCmp(CmpInst::FCMP_OLT, LLT::scalar(1), X, Threshold, Flags);
3802   auto ScaleOffset = B.buildFConstant(Ty, 0x1.0p+6f);
3803   auto ScaledX = B.buildFAdd(Ty, X, ScaleOffset, Flags);
3804   auto AdjustedX = B.buildSelect(Ty, NeedsScaling, ScaledX, X, Flags);
3805
3806   auto Log2E = B.buildFConstant(Ty, numbers::log2e);
3807   auto ExpInput = B.buildFMul(Ty, AdjustedX, Log2E, Flags);
3808
3809   auto Exp2 = B.buildIntrinsic(Intrinsic::amdgcn_exp2, {Ty})
3810                   .addUse(ExpInput.getReg(0))
3811                   .setMIFlags(Flags);
3812
  // Presumably ~2^-(64*log2(e)) — undoes the +64 input offset; TODO confirm.
3813   auto ResultScaleFactor = B.buildFConstant(Ty, 0x1.969d48p-93f);
3814   auto AdjustedResult = B.buildFMul(Ty, Exp2, ResultScaleFactor, Flags);
3815   B.buildSelect(Dst, NeedsScaling, AdjustedResult, Exp2, Flags);
3816   return true;
3817}
3818
// Fast exp10 lowering as a product of two exp2 terms whose exponents split
// log2(10) into high/low constant parts. With f32 denormal handling, the
// input is additionally offset by +32 below a threshold and the result
// rescaled. Always returns true.
3820                                              Register Dst, Register X,
3821                                              unsigned Flags) const {
3822   LLT Ty = B.getMRI()->getType(Dst);
3823   LLT F32 = LLT::scalar(32);
3824
3825   if (Ty != F32 || !needsDenormHandlingF32(B.getMF(), X, Flags)) {
3826     // exp2(x * 0x1.a92000p+1f) * exp2(x * 0x1.4f0978p-11f);
  // K0 + K1 together approximate log2(10) with extra precision.
3827     auto K0 = B.buildFConstant(Ty, 0x1.a92000p+1f);
3828     auto K1 = B.buildFConstant(Ty, 0x1.4f0978p-11f);
3829
3830     auto Mul1 = B.buildFMul(Ty, X, K1, Flags);
3831     auto Exp2_1 = buildExp(B, Ty, Mul1, Flags);
3832     auto Mul0 = B.buildFMul(Ty, X, K0, Flags);
3833     auto Exp2_0 = buildExp(B, Ty, Mul0, Flags);
3834     B.buildFMul(Dst, Exp2_0, Exp2_1, Flags);
3835     return true;
3836   }
3837
3838   // bool s = x < -0x1.2f7030p+5f;
3839   // x += s ? 0x1.0p+5f : 0.0f;
3840   // exp10 = exp2(x * 0x1.a92000p+1f) *
3841   //         exp2(x * 0x1.4f0978p-11f) *
3842   //         (s ? 0x1.9f623ep-107f : 1.0f);
3843
3844   auto Threshold = B.buildFConstant(Ty, -0x1.2f7030p+5f);
3845   auto NeedsScaling =
3846       B.buildFCmp(CmpInst::FCMP_OLT, LLT::scalar(1), X, Threshold);
3847
3848   auto ScaleOffset = B.buildFConstant(Ty, 0x1.0p+5f);
3849   auto ScaledX = B.buildFAdd(Ty, X, ScaleOffset, Flags);
3850   auto AdjustedX = B.buildSelect(Ty, NeedsScaling, ScaledX, X);
3851
3852   auto K0 = B.buildFConstant(Ty, 0x1.a92000p+1f);
3853   auto K1 = B.buildFConstant(Ty, 0x1.4f0978p-11f);
3854
3855   auto Mul1 = B.buildFMul(Ty, AdjustedX, K1, Flags);
3856   auto Exp2_1 = buildExp(B, Ty, Mul1, Flags);
3857   auto Mul0 = B.buildFMul(Ty, AdjustedX, K0, Flags);
3858   auto Exp2_0 = buildExp(B, Ty, Mul0, Flags);
3859
3860   auto MulExps = B.buildFMul(Ty, Exp2_0, Exp2_1, Flags);
3861   auto ResultScaleFactor = B.buildFConstant(Ty, 0x1.9f623ep-107f);
3862   auto AdjustedResult = B.buildFMul(Ty, MulExps, ResultScaleFactor, Flags);
3863
3864   B.buildSelect(Dst, NeedsScaling, AdjustedResult, MulExps);
3865   return true;
3866}
3867
3868// This expansion gives a result slightly better than 1ulp.
// f64 expansion shared by G_FEXP / G_FEXP2 / G_FEXP10: argument-reduce to
// x = Dn + T (Dn an integer multiple in the exp2 domain, T the residual in
// the natural-log domain), evaluate a polynomial in T, then rebuild the
// result with ldexp(P, Dn) plus explicit overflow/underflow guards.
3870                                           MachineIRBuilder &B) const {
3871
3872   Register X = MI.getOperand(1).getReg();
3873   LLT S64 = LLT::scalar(64);
3874   LLT S32 = LLT::scalar(32);
3875   LLT S1 = LLT::scalar(1);
3876
3877   // TODO: Check if reassoc is safe. There is an output change in exp2 and
3878   // exp10, which slightly increases ulp.
3879   unsigned Flags = MI.getFlags() & ~MachineInstr::FmReassoc;
3880
3881   Register Dn, F, T;
3882
3883   if (MI.getOpcode() == TargetOpcode::G_FEXP2) {
3884     // Dn = rint(X)
3885     Dn = B.buildFRint(S64, X, Flags).getReg(0);
3886     // F = X - Dn
3887     F = B.buildFSub(S64, X, Dn, Flags).getReg(0);
3888     // T = F*C1 + F*C2
  // C1 + C2 split ln(2) into high/low double parts for extra precision.
3889     auto C1 = B.buildFConstant(S64, APFloat(0x1.62e42fefa39efp-1));
3890     auto C2 = B.buildFConstant(S64, APFloat(0x1.abc9e3b39803fp-56));
3891     auto Mul2 = B.buildFMul(S64, F, C2, Flags).getReg(0);
3892     T = B.buildFMA(S64, F, C1, Mul2, Flags).getReg(0);
3893
3894   } else if (MI.getOpcode() == TargetOpcode::G_FEXP10) {
  // Dn = rint(x * log2(10)); residual computed with a split of 1/log2(10).
3895     auto C1 = B.buildFConstant(S64, APFloat(0x1.a934f0979a371p+1));
3896     auto Mul = B.buildFMul(S64, X, C1, Flags).getReg(0);
3897     Dn = B.buildFRint(S64, Mul, Flags).getReg(0);
3898
3899     auto NegDn = B.buildFNeg(S64, Dn, Flags).getReg(0);
3900     auto C2 = B.buildFConstant(S64, APFloat(-0x1.9dc1da994fd21p-59));
3901     auto C3 = B.buildFConstant(S64, APFloat(0x1.34413509f79ffp-2));
3902     auto Inner = B.buildFMA(S64, NegDn, C3, X, Flags).getReg(0);
3903     F = B.buildFMA(S64, NegDn, C2, Inner, Flags).getReg(0);
3904
3905     auto C4 = B.buildFConstant(S64, APFloat(0x1.26bb1bbb55516p+1));
3906     auto C5 = B.buildFConstant(S64, APFloat(-0x1.f48ad494ea3e9p-53));
3907     auto MulF = B.buildFMul(S64, F, C5, Flags).getReg(0);
3908     T = B.buildFMA(S64, F, C4, MulF, Flags).getReg(0);
3909
3910   } else { // G_FEXP
  // Dn = rint(x * log2(e)); residual computed with a split of ln(2).
3911     auto C1 = B.buildFConstant(S64, APFloat(0x1.71547652b82fep+0));
3912     auto Mul = B.buildFMul(S64, X, C1, Flags).getReg(0);
3913     Dn = B.buildFRint(S64, Mul, Flags).getReg(0);
3914
3915     auto NegDn = B.buildFNeg(S64, Dn, Flags).getReg(0);
3916     auto C2 = B.buildFConstant(S64, APFloat(0x1.abc9e3b39803fp-56));
3917     auto C3 = B.buildFConstant(S64, APFloat(0x1.62e42fefa39efp-1));
3918     auto Inner = B.buildFMA(S64, NegDn, C3, X, Flags).getReg(0);
3919     T = B.buildFMA(S64, NegDn, C2, Inner, Flags).getReg(0);
3920   }
3921
3922   // Polynomial chain for P
  // Horner evaluation of the exp(T) Taylor-like polynomial via chained FMAs.
3923   auto P = B.buildFConstant(S64, 0x1.ade156a5dcb37p-26);
3924   P = B.buildFMA(S64, T, P, B.buildFConstant(S64, 0x1.28af3fca7ab0cp-22),
3925                  Flags);
3926   P = B.buildFMA(S64, T, P, B.buildFConstant(S64, 0x1.71dee623fde64p-19),
3927                  Flags);
3928   P = B.buildFMA(S64, T, P, B.buildFConstant(S64, 0x1.a01997c89e6b0p-16),
3929                  Flags);
3930   P = B.buildFMA(S64, T, P, B.buildFConstant(S64, 0x1.a01a014761f6ep-13),
3931                  Flags);
3932   P = B.buildFMA(S64, T, P, B.buildFConstant(S64, 0x1.6c16c1852b7b0p-10),
3933                  Flags);
3934   P = B.buildFMA(S64, T, P, B.buildFConstant(S64, 0x1.1111111122322p-7), Flags);
3935   P = B.buildFMA(S64, T, P, B.buildFConstant(S64, 0x1.55555555502a1p-5), Flags);
3936   P = B.buildFMA(S64, T, P, B.buildFConstant(S64, 0x1.5555555555511p-3), Flags);
3937   P = B.buildFMA(S64, T, P, B.buildFConstant(S64, 0x1.000000000000bp-1), Flags);
3938
3939   auto One = B.buildFConstant(S64, 1.0);
3940   P = B.buildFMA(S64, T, P, One, Flags);
3941   P = B.buildFMA(S64, T, P, One, Flags);
3942
3943   // Z = FLDEXP(P, (int)Dn)
3944   auto DnInt = B.buildFPTOSI(S32, Dn);
3945   auto Z = B.buildFLdexp(S64, P, DnInt, Flags);
3946
3947   if (!(Flags & MachineInstr::FmNoInfs)) {
3948     // Overflow guard: if X <= 1024.0 then Z else +inf
3949     auto CondHi = B.buildFCmp(CmpInst::FCMP_ULE, S1, X,
3950                               B.buildFConstant(S64, APFloat(1024.0)));
3951     auto PInf = B.buildFConstant(S64, APFloat::getInf(APFloat::IEEEdouble()));
3952     Z = B.buildSelect(S64, CondHi, Z, PInf, Flags);
3953   }
3954
3955   // Underflow guard: if X >= -1075.0 then Z else 0.0
3956   auto CondLo = B.buildFCmp(CmpInst::FCMP_UGE, S1, X,
3957                             B.buildFConstant(S64, APFloat(-1075.0)));
3958   auto Zero = B.buildFConstant(S64, APFloat(0.0));
3959   B.buildSelect(MI.getOperand(0).getReg(), CondLo, Z, Zero, Flags);
3960
3961   MI.eraseFromParent();
3962   return true;
3963}
3964
// Legalize G_FEXP / G_FEXP10 by expanding to amdgcn_exp2 plus range
// reduction. f64 is delegated to legalizeFEXPF64; f16 is promoted to f32;
// f32 gets either the fast unsafe expansion or the accurate extended-precision
// expansion below.
// NOTE(review): the declaration line naming this function is missing from this
// extracted view — presumably `bool AMDGPULegalizerInfo::legalizeFExp(
// MachineInstr &MI, ...)`; confirm against upstream.
3966 MachineIRBuilder &B) const {
3967 Register Dst = MI.getOperand(0).getReg();
3968 Register X = MI.getOperand(1).getReg();
3969 const unsigned Flags = MI.getFlags();
3970 MachineFunction &MF = B.getMF();
3971 MachineRegisterInfo &MRI = *B.getMRI();
3972 LLT Ty = MRI.getType(Dst);
3973
3974 const LLT F64 = LLT::scalar(64);
3975
3976 if (Ty == F64)
3977 return legalizeFEXPF64(MI, B);
3978
3979 const LLT F16 = LLT::scalar(16);
3980 const LLT F32 = LLT::scalar(32);
3981 const bool IsExp10 = MI.getOpcode() == TargetOpcode::G_FEXP10;
3982
3983 if (Ty == F16) {
3984 // v_exp_f16 (fmul x, log2e)
3985 if (allowApproxFunc(MF, Flags)) {
3986 // TODO: Does this really require fast?
3987 IsExp10 ? legalizeFExp10Unsafe(B, Dst, X, Flags)
3988 : legalizeFExpUnsafe(B, Dst, X, Flags);
3989 MI.eraseFromParent();
3990 return true;
3991 }
3992
3993 // Nothing in half is a denormal when promoted to f32.
3994 //
3995 // exp(f16 x) ->
3996 // fptrunc (v_exp_f32 (fmul (fpext x), log2e))
3997 //
3998 // exp10(f16 x) ->
3999 // fptrunc (v_exp_f32 (fmul (fpext x), log2(10)))
4000 auto Ext = B.buildFPExt(F32, X, Flags);
4001 Register Lowered = MRI.createGenericVirtualRegister(F32);
4002 legalizeFExpUnsafeImpl(B, Lowered, Ext.getReg(0), Flags, IsExp10);
4003 B.buildFPTrunc(Dst, Lowered, Flags);
4004 MI.eraseFromParent();
4005 return true;
4006 }
4007
4008 assert(Ty == F32);
4009
4010 // TODO: Interpret allowApproxFunc as ignoring DAZ. This is currently copying
4011 // library behavior. Also, is known-not-daz source sufficient?
4012 if (allowApproxFunc(MF, Flags)) {
4013 IsExp10 ? legalizeFExp10Unsafe(B, Dst, X, Flags)
4014 : legalizeFExpUnsafe(B, Dst, X, Flags);
4015 MI.eraseFromParent();
4016 return true;
4017 }
4018
4019 // Algorithm:
4020 //
4021 // e^x = 2^(x/ln(2)) = 2^(x*(64/ln(2))/64)
4022 //
4023 // x*(64/ln(2)) = n + f, |f| <= 0.5, n is integer
4024 // n = 64*m + j, 0 <= j < 64
4025 //
4026 // e^x = 2^((64*m + j + f)/64)
4027 // = (2^m) * (2^(j/64)) * 2^(f/64)
4028 // = (2^m) * (2^(j/64)) * e^(f*(ln(2)/64))
4029 //
4030 // f = x*(64/ln(2)) - n
4031 // r = f*(ln(2)/64) = x - n*(ln(2)/64)
4032 //
4033 // e^x = (2^m) * (2^(j/64)) * e^r
4034 //
4035 // (2^(j/64)) is precomputed
4036 //
4037 // e^r = 1 + r + (r^2)/2! + (r^3)/3! + (r^4)/4! + (r^5)/5!
4038 // e^r = 1 + q
4039 //
4040 // q = r + (r^2)/2! + (r^3)/3! + (r^4)/4! + (r^5)/5!
4041 //
4042 // e^x = (2^m) * ( (2^(j/64)) + q*(2^(j/64)) )
4043 const unsigned FlagsNoContract = Flags & ~MachineInstr::FmContract;
4044 Register PH, PL;
4045
// Split the constant log2(e) (or log2(10) for exp10) into a high and a low
// part so that x*C is computed in extended precision: PH holds the rounded
// product, PL the residual.
4046 if (ST.hasFastFMAF32()) {
4047 const float c_exp = numbers::log2ef;
4048 const float cc_exp = 0x1.4ae0bep-26f; // c+cc are 49 bits
4049 const float c_exp10 = 0x1.a934f0p+1f;
4050 const float cc_exp10 = 0x1.2f346ep-24f;
4051
4052 auto C = B.buildFConstant(Ty, IsExp10 ? c_exp10 : c_exp);
4053 PH = B.buildFMul(Ty, X, C, Flags).getReg(0);
4054 auto NegPH = B.buildFNeg(Ty, PH, Flags);
4055 auto FMA0 = B.buildFMA(Ty, X, C, NegPH, Flags);
4056
4057 auto CC = B.buildFConstant(Ty, IsExp10 ? cc_exp10 : cc_exp);
4058 PL = B.buildFMA(Ty, X, CC, FMA0, Flags).getReg(0);
4059 } else {
4060 const float ch_exp = 0x1.714000p+0f;
4061 const float cl_exp = 0x1.47652ap-12f; // ch + cl are 36 bits
4062
4063 const float ch_exp10 = 0x1.a92000p+1f;
4064 const float cl_exp10 = 0x1.4f0978p-11f;
4065
// Without fast FMA, split x itself into high/low halves by masking the
// mantissa so the partial products are exact.
4066 auto MaskConst = B.buildConstant(Ty, 0xfffff000);
4067 auto XH = B.buildAnd(Ty, X, MaskConst);
4068 auto XL = B.buildFSub(Ty, X, XH, Flags);
4069
4070 auto CH = B.buildFConstant(Ty, IsExp10 ? ch_exp10 : ch_exp);
4071 PH = B.buildFMul(Ty, XH, CH, Flags).getReg(0);
4072
4073 auto CL = B.buildFConstant(Ty, IsExp10 ? cl_exp10 : cl_exp);
4074 auto XLCL = B.buildFMul(Ty, XL, CL, Flags);
4075
4076 Register Mad0 =
4077 getMad(B, Ty, XL.getReg(0), CH.getReg(0), XLCL.getReg(0), Flags);
4078 PL = getMad(B, Ty, XH.getReg(0), CL.getReg(0), Mad0, Flags);
4079 }
4080
4081 auto E = B.buildIntrinsicRoundeven(Ty, PH, Flags);
4082
4083 // It is unsafe to contract this fsub into the PH multiply.
4084 auto PHSubE = B.buildFSub(Ty, PH, E, FlagsNoContract);
4085 auto A = B.buildFAdd(Ty, PHSubE, PL, Flags);
4086 auto IntE = B.buildFPTOSI(LLT::scalar(32), E);
4087
// exp2 of the fractional part, scaled by 2^E via ldexp.
4088 auto Exp2 = B.buildIntrinsic(Intrinsic::amdgcn_exp2, {Ty})
4089 .addUse(A.getReg(0))
4090 .setMIFlags(Flags);
4091 auto R = B.buildFLdexp(Ty, Exp2, IntE, Flags);
4092
// Clamp results outside the representable exponent range: inputs below the
// underflow threshold produce 0, above the overflow threshold produce +inf
// (skipped under nnan-style FmNoInfs).
4093 auto UnderflowCheckConst =
4094 B.buildFConstant(Ty, IsExp10 ? -0x1.66d3e8p+5f : -0x1.9d1da0p+6f);
4095 auto Zero = B.buildFConstant(Ty, 0.0);
4096 auto Underflow =
4097 B.buildFCmp(CmpInst::FCMP_OLT, LLT::scalar(1), X, UnderflowCheckConst);
4098
4099 R = B.buildSelect(Ty, Underflow, Zero, R);
4100
4101 if (!(Flags & MachineInstr::FmNoInfs)) {
4102 auto OverflowCheckConst =
4103 B.buildFConstant(Ty, IsExp10 ? 0x1.344136p+5f : 0x1.62e430p+6f);
4104
4105 auto Overflow =
4106 B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), X, OverflowCheckConst);
4107 auto Inf = B.buildFConstant(Ty, APFloat::getInf(APFloat::IEEEsingle()));
4108 R = B.buildSelect(Ty, Overflow, Inf, R, Flags);
4109 }
4110
4111 B.buildCopy(Dst, R);
4112 MI.eraseFromParent();
4113 return true;
4114}
4115
// Legalize G_FPOW as exp2(log2(x) * y) using the fmul_legacy intrinsic
// (which returns 0 for 0 * anything). Only f32 and f16 are handled; other
// types return false so the generic legalizer can take over.
// NOTE(review): the declaration line naming this function is missing from this
// extracted view — presumably AMDGPULegalizerInfo::legalizeFPow; confirm.
4117 MachineIRBuilder &B) const {
4118 Register Dst = MI.getOperand(0).getReg();
4119 Register Src0 = MI.getOperand(1).getReg();
4120 Register Src1 = MI.getOperand(2).getReg();
4121 unsigned Flags = MI.getFlags();
4122 LLT Ty = B.getMRI()->getType(Dst);
4123 const LLT F16 = LLT::float16();
4124 const LLT F32 = LLT::float32();
4125
4126 if (Ty == F32) {
// pow(x, y) -> exp2(log2(x) * y), with the multiply done as fmul_legacy.
4127 auto Log = B.buildFLog2(F32, Src0, Flags);
4128 auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {F32})
4129 .addUse(Log.getReg(0))
4130 .addUse(Src1)
4131 .setMIFlags(Flags);
4132 B.buildFExp2(Dst, Mul, Flags);
4133 } else if (Ty == F16) {
4134 // There's no f16 fmul_legacy, so we need to convert for it.
4135 auto Log = B.buildFLog2(F16, Src0, Flags);
4136 auto Ext0 = B.buildFPExt(F32, Log, Flags);
4137 auto Ext1 = B.buildFPExt(F32, Src1, Flags);
4138 auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {F32})
4139 .addUse(Ext0.getReg(0))
4140 .addUse(Ext1.getReg(0))
4141 .setMIFlags(Flags);
4142 B.buildFExp2(Dst, B.buildFPTrunc(F16, Mul), Flags);
4143 } else
4144 return false;
4145
4146 MI.eraseFromParent();
4147 return true;
4148}
4149
4150// Find a source register, ignoring any possible source modifiers.
// Walk through at most one G_FNEG followed by at most one G_FABS (or a lone
// G_FABS) and return the underlying source register. Mirrors the source
// modifiers (neg/abs) that instruction selection can fold.
// NOTE(review): the signature line is missing from this extracted view —
// presumably `static Register stripAnySourceMods(Register OrigSrc,
// MachineRegisterInfo &MRI)`; confirm against upstream.
4152 Register ModSrc = OrigSrc;
4153 if (MachineInstr *SrcFNeg = getOpcodeDef(AMDGPU::G_FNEG, ModSrc, MRI)) {
4154 ModSrc = SrcFNeg->getOperand(1).getReg();
4155 if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
4156 ModSrc = SrcFAbs->getOperand(1).getReg();
4157 } else if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
4158 ModSrc = SrcFAbs->getOperand(1).getReg();
4159 return ModSrc;
4160}
4161
// Lower f64 G_FFLOOR as x - fract(x) for subtargets with the V_FRACT bug
// (no V_FLOOR_F64 on SI). The workaround clamps V_FRACT's result and, unless
// nnan is set, passes NaN inputs through unchanged.
// NOTE(review): the declaration line naming this function is missing from this
// extracted view — presumably AMDGPULegalizerInfo::legalizeFFloor; confirm.
4164 MachineIRBuilder &B) const {
4165
4166 const LLT S1 = LLT::scalar(1);
4167 const LLT F64 = LLT::float64();
4168 Register Dst = MI.getOperand(0).getReg();
4169 Register OrigSrc = MI.getOperand(1).getReg();
4170 unsigned Flags = MI.getFlags();
4171 assert(ST.hasFractBug() && MRI.getType(Dst) == F64 &&
4172 "this should not have been custom lowered");
4173
4174 // V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x))
4175 // is used instead. However, SI doesn't have V_FLOOR_F64, so the most
4176 // efficient way to implement it is using V_FRACT_F64. The workaround for the
4177 // V_FRACT bug is:
4178 // fract(x) = isnan(x) ? x : min(V_FRACT(x), 0.99999999999999999)
4179 //
4180 // Convert floor(x) to (x - fract(x))
4181
4182 auto Fract = B.buildIntrinsic(Intrinsic::amdgcn_fract, {F64})
4183 .addUse(OrigSrc)
4184 .setMIFlags(Flags);
4185
4186 // Give source modifier matching some assistance before obscuring a foldable
4187 // pattern.
4188
4189 // TODO: We can avoid the neg on the fract? The input sign to fract
4190 // shouldn't matter?
4191 Register ModSrc = stripAnySourceMods(OrigSrc, MRI);
4192
// 0x3fefffffffffffff == nextafter(1.0, 0.0): the largest double < 1.0.
4193 auto Const =
4194 B.buildFConstant(F64, llvm::bit_cast<double>(0x3fefffffffffffff));
4195
4196 Register Min = MRI.createGenericVirtualRegister(F64);
4197
4198 // We don't need to concern ourselves with the snan handling difference, so
4199 // use the one which will directly select.
4200 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
4201 if (MFI->getMode().IEEE)
4202 B.buildFMinNumIEEE(Min, Fract, Const, Flags);
4203 else
4204 B.buildFMinNum(Min, Fract, Const, Flags);
4205
4206 Register CorrectedFract = Min;
4207 if (!MI.getFlag(MachineInstr::FmNoNans)) {
// BUGFIX: use FCMP_UNO ("unordered", i.e. true iff an operand is NaN), not
// FCMP_ORD. FCMP_ORD is true for every non-NaN input, which would make the
// select below return the raw source for all normal values and defeat the
// clamp entirely (floor(x) would become x - x). The select implements
// isnan(x) ? x : min(fract(x), 0.999...), matching the comment above.
4208 auto IsNan = B.buildFCmp(CmpInst::FCMP_UNO, S1, ModSrc, ModSrc, Flags);
4209 CorrectedFract = B.buildSelect(F64, IsNan, ModSrc, Min, Flags).getReg(0);
4210 }
4211
4212 auto NegFract = B.buildFNeg(F64, CorrectedFract, Flags);
4213 B.buildFAdd(Dst, OrigSrc, NegFract, Flags);
4214
4215 MI.eraseFromParent();
4216 return true;
4217}
4218
4219// Turn an illegal packed v2s16 build vector into bit operations.
4220// TODO: This should probably be a bitcast action in LegalizerHelper.
// Lower a v2s16 G_BUILD_VECTOR(_TRUNC) into a 32-bit merge + bitcast.
// NOTE(review): the declaration line naming this function is missing from this
// extracted view — presumably AMDGPULegalizerInfo::legalizeBuildVector;
// confirm against upstream.
4223 Register Dst = MI.getOperand(0).getReg();
4224 const LLT S32 = LLT::scalar(32);
4225 const LLT S16 = LLT::scalar(16);
4226 assert(MRI.getType(Dst) == LLT::fixed_vector(2, 16));
4227
4228 Register Src0 = MI.getOperand(1).getReg();
4229 Register Src1 = MI.getOperand(2).getReg();
4230
// For the TRUNC variant, the operands are s32 and must be truncated to s16
// before merging.
4231 if (MI.getOpcode() == AMDGPU::G_BUILD_VECTOR_TRUNC) {
4232 assert(MRI.getType(Src0) == S32);
4233 Src0 = B.buildTrunc(S16, MI.getOperand(1).getReg()).getReg(0);
4234 Src1 = B.buildTrunc(S16, MI.getOperand(2).getReg()).getReg(0);
4235 }
4236
// Pack the two halves into one s32 and reinterpret it as <2 x s16>.
4237 auto Merge = B.buildMergeLikeInstr(S32, {Src0, Src1});
4238 B.buildBitcast(Dst, Merge);
4239
4240 MI.eraseFromParent();
4241 return true;
4242}
4243
4244// Build a big integer multiply or multiply-add using MAD_64_32 instructions.
4245//
4246// Source and accumulation registers must all be 32-bits.
4247//
4248// TODO: When the multiply is uniform, we should produce a code sequence
4249// that is better suited to instruction selection on the SALU. Instead of
4250// the outer loop going over parts of the result, the outer loop should go
4251// over parts of one of the factors. This should result in instruction
4252// selection that makes full use of S_ADDC_U32 instructions.
// Emit a wide (multiple-of-32-bit) multiply/multiply-add chain built from
// 32-bit multiplies and G_AMDGPU_MAD_U64_U32. Accum holds the 32-bit result
// parts and is updated in place. Known-zero source parts are skipped.
// NOTE(review): the leading declaration lines (function name and the Accum
// parameter) are missing from this extracted view — presumably
// `void AMDGPULegalizerInfo::buildMultiply(LegalizerHelper &Helper,
// MutableArrayRef<Register> Accum, ...)`; confirm against upstream.
4255 ArrayRef<Register> Src0,
4256 ArrayRef<Register> Src1,
4257 bool UsePartialMad64_32,
4258 bool SeparateOddAlignedProducts) const {
4259 // Use (possibly empty) vectors of S1 registers to represent the set of
4260 // carries from one pair of positions to the next.
4261 using Carry = SmallVector<Register, 2>;
4262
4263 MachineIRBuilder &B = Helper.MIRBuilder;
4264 GISelValueTracking &VT = *Helper.getValueTracking();
4265
4266 const LLT S1 = LLT::scalar(1);
4267 const LLT S32 = LLT::scalar(32);
4268 const LLT S64 = LLT::scalar(64);
4269
// Lazily-materialized zero constants, shared across the whole expansion.
4270 Register Zero32;
4271 Register Zero64;
4272
4273 auto getZero32 = [&]() -> Register {
4274 if (!Zero32)
4275 Zero32 = B.buildConstant(S32, 0).getReg(0);
4276 return Zero32;
4277 };
4278 auto getZero64 = [&]() -> Register {
4279 if (!Zero64)
4280 Zero64 = B.buildConstant(S64, 0).getReg(0);
4281 return Zero64;
4282 };
4283
// Precompute which 32-bit source parts are provably zero so their partial
// products can be skipped below.
4284 SmallVector<bool, 2> Src0KnownZeros, Src1KnownZeros;
4285 for (unsigned i = 0; i < Src0.size(); ++i) {
4286 Src0KnownZeros.push_back(VT.getKnownBits(Src0[i]).isZero());
4287 Src1KnownZeros.push_back(VT.getKnownBits(Src1[i]).isZero());
4288 }
4289
4290 // Merge the given carries into the 32-bit LocalAccum, which is modified
4291 // in-place.
4292 //
4293 // Returns the carry-out, which is a single S1 register or null.
4294 auto mergeCarry =
4295 [&](Register &LocalAccum, const Carry &CarryIn) -> Register {
4296 if (CarryIn.empty())
4297 return Register();
4298
4299 bool HaveCarryOut = true;
4300 Register CarryAccum;
4301 if (CarryIn.size() == 1) {
4302 if (!LocalAccum) {
// A single carry into an empty accumulator needs no add at all.
4303 LocalAccum = B.buildZExt(S32, CarryIn[0]).getReg(0);
4304 return Register();
4305 }
4306
4307 CarryAccum = getZero32();
4308 } else {
// Fold all but the last carry into CarryAccum via a uadde chain.
4309 CarryAccum = B.buildZExt(S32, CarryIn[0]).getReg(0);
4310 for (unsigned i = 1; i + 1 < CarryIn.size(); ++i) {
4311 CarryAccum =
4312 B.buildUAdde(S32, S1, CarryAccum, getZero32(), CarryIn[i])
4313 .getReg(0);
4314 }
4315
4316 if (!LocalAccum) {
4317 LocalAccum = getZero32();
4318 HaveCarryOut = false;
4319 }
4320 }
4321
4322 auto Add =
4323 B.buildUAdde(S32, S1, CarryAccum, LocalAccum, CarryIn.back());
4324 LocalAccum = Add.getReg(0);
4325 return HaveCarryOut ? Add.getReg(1) : Register();
4326 };
4327
4328 // Build a multiply-add chain to compute
4329 //
4330 // LocalAccum + (partial products at DstIndex)
4331 // + (opportunistic subset of CarryIn)
4332 //
4333 // LocalAccum is an array of one or two 32-bit registers that are updated
4334 // in-place. The incoming registers may be null.
4335 //
4336 // In some edge cases, carry-ins can be consumed "for free". In that case,
4337 // the consumed carry bits are removed from CarryIn in-place.
4338 auto buildMadChain =
4339 [&](MutableArrayRef<Register> LocalAccum, unsigned DstIndex, Carry &CarryIn)
4340 -> Carry {
4341 assert((DstIndex + 1 < Accum.size() && LocalAccum.size() == 2) ||
4342 (DstIndex + 1 >= Accum.size() && LocalAccum.size() == 1));
4343
4344 Carry CarryOut;
4345 unsigned j0 = 0;
4346
4347 // Use plain 32-bit multiplication for the most significant part of the
4348 // result by default.
4349 if (LocalAccum.size() == 1 &&
4350 (!UsePartialMad64_32 || !CarryIn.empty())) {
4351 do {
4352 // Skip multiplication if one of the operands is 0
4353 unsigned j1 = DstIndex - j0;
4354 if (Src0KnownZeros[j0] || Src1KnownZeros[j1]) {
4355 ++j0;
4356 continue;
4357 }
4358 auto Mul = B.buildMul(S32, Src0[j0], Src1[j1]);
4359 if (!LocalAccum[0] || VT.getKnownBits(LocalAccum[0]).isZero()) {
4360 LocalAccum[0] = Mul.getReg(0);
4361 } else {
4362 if (CarryIn.empty()) {
4363 LocalAccum[0] = B.buildAdd(S32, LocalAccum[0], Mul).getReg(0);
4364 } else {
// Consume one pending carry "for free" by using add-with-carry.
4365 LocalAccum[0] =
4366 B.buildUAdde(S32, S1, LocalAccum[0], Mul, CarryIn.back())
4367 .getReg(0);
4368 CarryIn.pop_back();
4369 }
4370 }
4371 ++j0;
4372 } while (j0 <= DstIndex && (!UsePartialMad64_32 || !CarryIn.empty()));
4373 }
4374
4375 // Build full 64-bit multiplies.
4376 if (j0 <= DstIndex) {
4377 bool HaveSmallAccum = false;
4378 Register Tmp;
4379
// Seed the 64-bit accumulator from whatever 32-bit parts exist.
// HaveSmallAccum records that the high half is known zero, so the first
// MAD cannot produce a carry-out worth tracking.
4380 if (LocalAccum[0]) {
4381 if (LocalAccum.size() == 1) {
4382 Tmp = B.buildAnyExt(S64, LocalAccum[0]).getReg(0);
4383 HaveSmallAccum = true;
4384 } else if (LocalAccum[1]) {
4385 Tmp = B.buildMergeLikeInstr(S64, LocalAccum).getReg(0);
4386 HaveSmallAccum = false;
4387 } else {
4388 Tmp = B.buildZExt(S64, LocalAccum[0]).getReg(0);
4389 HaveSmallAccum = true;
4390 }
4391 } else {
4392 assert(LocalAccum.size() == 1 || !LocalAccum[1]);
4393 Tmp = getZero64();
4394 HaveSmallAccum = true;
4395 }
4396
4397 do {
4398 unsigned j1 = DstIndex - j0;
4399 if (Src0KnownZeros[j0] || Src1KnownZeros[j1]) {
4400 ++j0;
4401 continue;
4402 }
4403 auto Mad = B.buildInstr(AMDGPU::G_AMDGPU_MAD_U64_U32, {S64, S1},
4404 {Src0[j0], Src1[j1], Tmp});
4405 Tmp = Mad.getReg(0);
4406 if (!HaveSmallAccum)
4407 CarryOut.push_back(Mad.getReg(1));
4408 HaveSmallAccum = false;
4409
4410 ++j0;
4411 } while (j0 <= DstIndex);
4412
4413 auto Unmerge = B.buildUnmerge(S32, Tmp);
4414 LocalAccum[0] = Unmerge.getReg(0);
4415 if (LocalAccum.size() > 1)
4416 LocalAccum[1] = Unmerge.getReg(1);
4417 }
4418
4419 return CarryOut;
4420 };
4421
4422 // Outer multiply loop, iterating over destination parts from least
4423 // significant to most significant parts.
4424 //
4425 // The columns of the following diagram correspond to the destination parts
4426 // affected by one iteration of the outer loop (ignoring boundary
4427 // conditions).
4428 //
4429 // Dest index relative to 2 * i: 1 0 -1
4430 // ------
4431 // Carries from previous iteration: e o
4432 // Even-aligned partial product sum: E E .
4433 // Odd-aligned partial product sum: O O
4434 //
4435 // 'o' is OddCarry, 'e' is EvenCarry.
4436 // EE and OO are computed from partial products via buildMadChain and use
4437 // accumulation where possible and appropriate.
4438 //
4439 Register SeparateOddCarry;
4440 Carry EvenCarry;
4441 Carry OddCarry;
4442
4443 for (unsigned i = 0; i <= Accum.size() / 2; ++i) {
4444 Carry OddCarryIn = std::move(OddCarry);
4445 Carry EvenCarryIn = std::move(EvenCarry);
4446 OddCarry.clear();
4447 EvenCarry.clear();
4448
4449 // Partial products at offset 2 * i.
4450 if (2 * i < Accum.size()) {
4451 auto LocalAccum = Accum.drop_front(2 * i).take_front(2);
4452 EvenCarry = buildMadChain(LocalAccum, 2 * i, EvenCarryIn);
4453 }
4454
4455 // Partial products at offset 2 * i - 1.
4456 if (i > 0) {
4457 if (!SeparateOddAlignedProducts) {
4458 auto LocalAccum = Accum.drop_front(2 * i - 1).take_front(2);
4459 OddCarry = buildMadChain(LocalAccum, 2 * i - 1, OddCarryIn);
4460 } else {
// Compute odd-aligned products into a scratch pair and add them to Accum
// afterwards, so MAD_64_32's accumulator stays even-aligned.
4461 bool IsHighest = 2 * i >= Accum.size();
4462 Register SeparateOddOut[2];
4463 auto LocalAccum = MutableArrayRef(SeparateOddOut)
4464 .take_front(IsHighest ? 1 : 2);
4465 OddCarry = buildMadChain(LocalAccum, 2 * i - 1, OddCarryIn);
4466
// NOTE(review): original line 4467 is missing from this extracted view —
// presumably it declares `Lo` (used below); confirm against upstream.
4468
4469 if (i == 1) {
4470 if (!IsHighest)
4471 Lo = B.buildUAddo(S32, S1, Accum[2 * i - 1], SeparateOddOut[0]);
4472 else
4473 Lo = B.buildAdd(S32, Accum[2 * i - 1], SeparateOddOut[0]);
4474 } else {
4475 Lo = B.buildUAdde(S32, S1, Accum[2 * i - 1], SeparateOddOut[0],
4476 SeparateOddCarry);
4477 }
4478 Accum[2 * i - 1] = Lo->getOperand(0).getReg();
4479
4480 if (!IsHighest) {
4481 auto Hi = B.buildUAdde(S32, S1, Accum[2 * i], SeparateOddOut[1],
4482 Lo->getOperand(1).getReg());
4483 Accum[2 * i] = Hi.getReg(0);
4484 SeparateOddCarry = Hi.getReg(1);
4485 }
4486 }
4487 }
4488
4489 // Add in the carries from the previous iteration
4490 if (i > 0) {
4491 if (Register CarryOut = mergeCarry(Accum[2 * i - 1], OddCarryIn))
4492 EvenCarryIn.push_back(CarryOut);
4493
4494 if (2 * i < Accum.size()) {
4495 if (Register CarryOut = mergeCarry(Accum[2 * i], EvenCarryIn))
4496 OddCarry.push_back(CarryOut);
4497 }
4498 }
4499 }
4500}
4501
4502// Custom narrowing of wide multiplies using wide multiply-add instructions.
4503//
4504// TODO: If the multiply is followed by an addition, we should attempt to
4505// integrate it to make better use of V_MAD_U64_U32's multiply-add capabilities.
// Custom-narrow a wide scalar G_MUL into 32-bit parts and delegate the
// actual expansion to buildMultiply. Returns true (already legal) for 64-bit
// multiplies on subtargets with native vector mul_u64.
// NOTE(review): the declaration line naming this function is missing from this
// extracted view — presumably AMDGPULegalizerInfo::legalizeMul; confirm.
4507 MachineInstr &MI) const {
4508 assert(ST.hasMad64_32());
4509 assert(MI.getOpcode() == TargetOpcode::G_MUL);
4510
4511 MachineIRBuilder &B = Helper.MIRBuilder;
4512 MachineRegisterInfo &MRI = *B.getMRI();
4513
4514 Register DstReg = MI.getOperand(0).getReg();
4515 Register Src0 = MI.getOperand(1).getReg();
4516 Register Src1 = MI.getOperand(2).getReg();
4517
4518 LLT Ty = MRI.getType(DstReg);
4519 assert(Ty.isScalar());
4520
4521 unsigned Size = Ty.getSizeInBits();
4522 if (ST.hasVectorMulU64() && Size == 64)
4523 return true;
4524
4525 unsigned NumParts = Size / 32;
4526 assert((Size % 32) == 0);
4527 assert(NumParts >= 2);
4528
4529 // Whether to use MAD_64_32 for partial products whose high half is
4530 // discarded. This avoids some ADD instructions but risks false dependency
4531 // stalls on some subtargets in some cases.
4532 const bool UsePartialMad64_32 = ST.getGeneration() < AMDGPUSubtarget::GFX10;
4533
4534 // Whether to compute odd-aligned partial products separately. This is
4535 // advisable on subtargets where the accumulator of MAD_64_32 must be placed
4536 // in an even-aligned VGPR.
4537 const bool SeparateOddAlignedProducts = ST.hasFullRate64Ops();
4538
// Split both operands into 32-bit pieces.
4539 LLT S32 = LLT::scalar(32);
4540 SmallVector<Register, 2> Src0Parts, Src1Parts;
4541 for (unsigned i = 0; i < NumParts; ++i) {
4542 Src0Parts.push_back(MRI.createGenericVirtualRegister(S32));
4543 Src1Parts.push_back(MRI.createGenericVirtualRegister(S32));
4544 }
4545 B.buildUnmerge(Src0Parts, Src0);
4546 B.buildUnmerge(Src1Parts, Src1);
4547
4548 SmallVector<Register, 2> AccumRegs(NumParts);
4549 buildMultiply(Helper, AccumRegs, Src0Parts, Src1Parts, UsePartialMad64_32,
4550 SeparateOddAlignedProducts);
4551
4552 B.buildMergeLikeInstr(DstReg, AccumRegs);
4553 MI.eraseFromParent();
4554 return true;
4555}
4556
4557// Legalize ctlz/cttz to ffbh/ffbl instead of the default legalization to
4558// ctlz/cttz_zero_undef. This allows us to fix up the result for the zero input
4559// case with a single min instruction instead of a compare+select.
// Lower G_CTLZ/G_CTTZ to the target's FFBH/FFBL (which return -1 for zero
// input) followed by an unsigned min with the bit width, fixing up the
// zero-input case without a compare+select.
// NOTE(review): the declaration line naming this function is missing from this
// extracted view — presumably AMDGPULegalizerInfo::legalizeCTLZ_CTTZ; confirm.
4562 MachineIRBuilder &B) const {
4563 Register Dst = MI.getOperand(0).getReg();
4564 Register Src = MI.getOperand(1).getReg();
4565 LLT DstTy = MRI.getType(Dst);
4566 LLT SrcTy = MRI.getType(Src);
4567
4568 unsigned NewOpc = MI.getOpcode() == AMDGPU::G_CTLZ
4569 ? AMDGPU::G_AMDGPU_FFBH_U32
4570 : AMDGPU::G_AMDGPU_FFBL_B32;
4571 auto Tmp = B.buildInstr(NewOpc, {DstTy}, {Src});
// FFBH/FFBL yield -1 (all ones) on zero input; umin clamps that to the
// source bit width, the defined G_CTLZ/G_CTTZ result for zero.
4572 B.buildUMin(Dst, Tmp, B.buildConstant(DstTy, SrcTy.getSizeInBits()));
4573
4574 MI.eraseFromParent();
4575 return true;
4576}
4577
// Lower a sub-32-bit G_CTLZ_ZERO_UNDEF: widen to 32 bits, shift the value
// into the top bits so the leading-zero count is unchanged, then count with
// FFBH and truncate back.
// NOTE(review): the declaration lines naming this function (and, presumably,
// declaring the local S32 used below) are missing from this extracted view;
// confirm against upstream.
4580 MachineIRBuilder &B) const {
4581 Register Dst = MI.getOperand(0).getReg();
4582 Register Src = MI.getOperand(1).getReg();
4583 LLT SrcTy = MRI.getType(Src);
4584 TypeSize NumBits = SrcTy.getSizeInBits();
4585
4586 assert(NumBits < 32u);
4587
// Left-shift so the NumBits value occupies the top of the 32-bit register;
// leading zeros are then identical to the original's.
4588 auto ShiftAmt = B.buildConstant(S32, 32u - NumBits);
4589 auto Extend = B.buildAnyExt(S32, {Src}).getReg(0u);
4590 auto Shift = B.buildShl(S32, Extend, ShiftAmt);
4591 auto Ctlz = B.buildInstr(AMDGPU::G_AMDGPU_FFBH_U32, {S32}, {Shift});
4592 B.buildTrunc(Dst, Ctlz);
4593 MI.eraseFromParent();
4594 return true;
4595}
4596
4597// Check that this is a G_XOR x, -1
4598static bool isNot(const MachineRegisterInfo &MRI, const MachineInstr &MI) {
4599 if (MI.getOpcode() != TargetOpcode::G_XOR)
4600 return false;
4601 auto ConstVal = getIConstantVRegSExtVal(MI.getOperand(2).getReg(), MRI);
4602 return ConstVal == -1;
4603}
4604
4605// Return the use branch instruction, otherwise null if the usage is invalid.
// Given the def of a control-flow intrinsic's boolean result, verify it feeds
// exactly one G_BRCOND in the same block (optionally through a single-use
// G_XOR -1, which is erased and reported via Negated) and locate the
// unconditional branch target that follows. Returns the G_BRCOND on success,
// null otherwise.
// NOTE(review): the line between the two below (the function name and leading
// parameters) is missing from this extracted view; confirm against upstream.
4606static MachineInstr *
4608 MachineBasicBlock *&UncondBrTarget, bool &Negated) {
4609 Register CondDef = MI.getOperand(0).getReg();
4610 if (!MRI.hasOneNonDBGUse(CondDef))
4611 return nullptr;
4612
4613 MachineBasicBlock *Parent = MI.getParent();
4614 MachineInstr *UseMI = &*MRI.use_instr_nodbg_begin(CondDef);
4615
// Look through a single-use logical not of the condition.
4616 if (isNot(MRI, *UseMI)) {
4617 Register NegatedCond = UseMI->getOperand(0).getReg();
4618 if (!MRI.hasOneNonDBGUse(NegatedCond))
4619 return nullptr;
4620
4621 // We're deleting the def of this value, so we need to remove it.
4622 eraseInstr(*UseMI, MRI);
4623
4624 UseMI = &*MRI.use_instr_nodbg_begin(NegatedCond);
4625 Negated = true;
4626 }
4627
4628 if (UseMI->getParent() != Parent || UseMI->getOpcode() != AMDGPU::G_BRCOND)
4629 return nullptr;
4630
4631 // Make sure the cond br is followed by a G_BR, or is the last instruction.
4632 MachineBasicBlock::iterator Next = std::next(UseMI->getIterator());
4633 if (Next == Parent->end()) {
// Fallthrough case: the implicit target is the next block in layout order.
4634 MachineFunction::iterator NextMBB = std::next(Parent->getIterator());
4635 if (NextMBB == Parent->getParent()->end()) // Illegal intrinsic use.
4636 return nullptr;
4637 UncondBrTarget = &*NextMBB;
4638 } else {
4639 if (Next->getOpcode() != AMDGPU::G_BR)
4640 return nullptr;
4641 Br = &*Next;
4642 UncondBrTarget = Br->getOperand(0).getMBB();
4643 }
4644
4645 return UseMI;
4646}
4647
// Copy a preloaded argument from its physical input register into DstReg,
// applying the descriptor's shift+mask when the argument is packed with
// others in the same register.
// NOTE(review): the leading declaration lines (function name, DstReg and B
// parameters) are missing from this extracted view — presumably
// AMDGPULegalizerInfo::buildLoadInputValue; confirm against upstream.
4650 const ArgDescriptor *Arg,
4651 const TargetRegisterClass *ArgRC,
4652 LLT ArgTy) const {
4653 MCRegister SrcReg = Arg->getRegister();
4654 assert(SrcReg.isPhysical() && "Physical register expected");
4655 assert(DstReg.isVirtual() && "Virtual register expected");
4656
4657 Register LiveIn = getFunctionLiveInPhysReg(B.getMF(), B.getTII(), SrcReg,
4658 *ArgRC, B.getDebugLoc(), ArgTy);
4659 if (Arg->isMasked()) {
4660 // TODO: Should we try to emit this once in the entry block?
4661 const LLT S32 = LLT::scalar(32);
4662 const unsigned Mask = Arg->getMask();
4663 const unsigned Shift = llvm::countr_zero<unsigned>(Mask);
4664
4665 Register AndMaskSrc = LiveIn;
4666
4667 // TODO: Avoid clearing the high bits if we know workitem id y/z are always
4668 // 0.
4669 if (Shift != 0) {
4670 auto ShiftAmt = B.buildConstant(S32, Shift);
4671 AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0);
4672 }
4673
// Mask off any higher-packed fields after shifting the target field down.
4674 B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift));
4675 } else {
4676 B.buildCopy(DstReg, LiveIn);
4677 }
4678}
4679
// Legalize a workgroup-id intrinsic. Without cluster support this is a plain
// input-value load; with clusters, the grid-global workgroup id is
// reconstructed from cluster id, cluster workgroup id, and cluster max id.
// NOTE(review): the leading declaration lines (function name and the first
// PreloadedValue parameters, presumably WorkGroupIdPV / ClusterMaxIdPV) are
// missing from this extracted view; confirm against upstream.
4684 AMDGPUFunctionArgInfo::PreloadedValue ClusterWorkGroupIdPV) const {
4685 Register DstReg = MI.getOperand(0).getReg();
4686 if (!ST.hasClusters()) {
4687 if (!loadInputValue(DstReg, B, WorkGroupIdPV))
4688 return false;
4689 MI.eraseFromParent();
4690 return true;
4691 }
4692
4693 // Clusters are supported. Return the global position in the grid. If clusters
4694 // are enabled, WorkGroupIdPV returns the cluster ID not the workgroup ID.
4695
4696 // WorkGroupIdXYZ = ClusterId == 0 ?
4697 // ClusterIdXYZ :
4698 // ClusterIdXYZ * (ClusterMaxIdXYZ + 1) + ClusterWorkGroupIdXYZ
4699 MachineRegisterInfo &MRI = *B.getMRI();
4700 const LLT S32 = LLT::scalar(32);
4701 Register ClusterIdXYZ = MRI.createGenericVirtualRegister(S32);
4702 Register ClusterMaxIdXYZ = MRI.createGenericVirtualRegister(S32);
4703 Register ClusterWorkGroupIdXYZ = MRI.createGenericVirtualRegister(S32);
4704 if (!loadInputValue(ClusterIdXYZ, B, WorkGroupIdPV) ||
4705 !loadInputValue(ClusterWorkGroupIdXYZ, B, ClusterWorkGroupIdPV) ||
4706 !loadInputValue(ClusterMaxIdXYZ, B, ClusterMaxIdPV))
4707 return false;
4708
4709 auto One = B.buildConstant(S32, 1);
4710 auto ClusterSizeXYZ = B.buildAdd(S32, ClusterMaxIdXYZ, One);
4711 auto GlobalIdXYZ = B.buildAdd(S32, ClusterWorkGroupIdXYZ,
4712 B.buildMul(S32, ClusterIdXYZ, ClusterSizeXYZ));
4713
4714 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
4715
// NOTE(review): the `case` labels of this switch (lines 4717-4718, 4723,
// 4728) are missing from this extracted view; the three arms appear to be
// fixed-dims, no-cluster, and variable-dims kinds respectively — confirm
// against upstream before editing.
4716 switch (MFI->getClusterDims().getKind()) {
4719 B.buildCopy(DstReg, GlobalIdXYZ);
4720 MI.eraseFromParent();
4721 return true;
4722 }
4724 B.buildCopy(DstReg, ClusterIdXYZ);
4725 MI.eraseFromParent();
4726 return true;
4727 }
4729 using namespace AMDGPU::Hwreg;
4730 unsigned ClusterIdField = HwregEncoding::encode(ID_IB_STS2, 6, 4);
4731 Register ClusterId = MRI.createGenericVirtualRegister(S32);
4732 MRI.setRegClass(ClusterId, &AMDGPU::SReg_32RegClass);
4733 B.buildInstr(AMDGPU::S_GETREG_B32_const)
4734 .addDef(ClusterId)
4735 .addImm(ClusterIdField);
4736 auto Zero = B.buildConstant(S32, 0);
// Read the cluster id from the hardware register at runtime; a value of 0
// means clusters are not in use, so fall back to the raw id.
4737 auto NoClusters =
4738 B.buildICmp(CmpInst::ICMP_EQ, LLT::scalar(1), ClusterId, Zero);
4739 B.buildSelect(DstReg, NoClusters, ClusterIdXYZ, GlobalIdXYZ);
4740 MI.eraseFromParent();
4741 return true;
4742 }
4743 }
4744
4745 llvm_unreachable("nothing should reach here");
4746}
4747
// Load a preloaded function argument (workgroup ids, cluster ids, kernarg
// segment pointer, ...) into DstReg. On subtargets with architected SGPRs,
// workgroup/cluster ids live in fixed TTMP registers described by the local
// ArgDescriptors below; otherwise the value comes from the calling
// convention via getPreloadedValue.
// NOTE(review): the function-name line and the ArgType parameter line are
// missing from this extracted view, as are this switch's `case` labels;
// confirm against upstream before editing.
4749 Register DstReg, MachineIRBuilder &B,
4751 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
4752 const ArgDescriptor *Arg = nullptr;
4753 const TargetRegisterClass *ArgRC;
4754 LLT ArgTy;
4755
4756 CallingConv::ID CC = B.getMF().getFunction().getCallingConv();
4757 const ArgDescriptor WorkGroupIDX =
4758 ArgDescriptor::createRegister(AMDGPU::TTMP9);
4759 // If GridZ is not programmed in an entry function then the hardware will set
4760 // it to all zeros, so there is no need to mask the GridY value in the low
4761 // order bits.
4762 const ArgDescriptor WorkGroupIDY = ArgDescriptor::createRegister(
4763 AMDGPU::TTMP7,
4764 AMDGPU::isEntryFunctionCC(CC) && !MFI->hasWorkGroupIDZ() ? ~0u : 0xFFFFu);
4765 const ArgDescriptor WorkGroupIDZ =
4766 ArgDescriptor::createRegister(AMDGPU::TTMP7, 0xFFFF0000u);
// Cluster-related ids are packed as 4-bit nibbles within TTMP6.
4767 const ArgDescriptor ClusterWorkGroupIDX =
4768 ArgDescriptor::createRegister(AMDGPU::TTMP6, 0x0000000Fu);
4769 const ArgDescriptor ClusterWorkGroupIDY =
4770 ArgDescriptor::createRegister(AMDGPU::TTMP6, 0x000000F0u);
4771 const ArgDescriptor ClusterWorkGroupIDZ =
4772 ArgDescriptor::createRegister(AMDGPU::TTMP6, 0x00000F00u);
4773 const ArgDescriptor ClusterWorkGroupMaxIDX =
4774 ArgDescriptor::createRegister(AMDGPU::TTMP6, 0x0000F000u);
4775 const ArgDescriptor ClusterWorkGroupMaxIDY =
4776 ArgDescriptor::createRegister(AMDGPU::TTMP6, 0x000F0000u);
4777 const ArgDescriptor ClusterWorkGroupMaxIDZ =
4778 ArgDescriptor::createRegister(AMDGPU::TTMP6, 0x00F00000u);
4779 const ArgDescriptor ClusterWorkGroupMaxFlatID =
4780 ArgDescriptor::createRegister(AMDGPU::TTMP6, 0x0F000000u);
4781
4782 auto LoadConstant = [&](unsigned N) {
4783 B.buildConstant(DstReg, N);
4784 return true;
4785 };
4786
4787 if (ST.hasArchitectedSGPRs() &&
4789 AMDGPU::ClusterDimsAttr ClusterDims = MFI->getClusterDims();
4790 bool HasFixedDims = ClusterDims.isFixedDims();
4791
// When cluster dimensions are known at compile time, fold ids/max-ids to
// constants where possible instead of reading TTMP6.
4792 switch (ArgType) {
4794 Arg = &WorkGroupIDX;
4795 ArgRC = &AMDGPU::SReg_32RegClass;
4796 ArgTy = LLT::scalar(32);
4797 break;
4799 Arg = &WorkGroupIDY;
4800 ArgRC = &AMDGPU::SReg_32RegClass;
4801 ArgTy = LLT::scalar(32);
4802 break;
4804 Arg = &WorkGroupIDZ;
4805 ArgRC = &AMDGPU::SReg_32RegClass;
4806 ArgTy = LLT::scalar(32);
4807 break;
4809 if (HasFixedDims && ClusterDims.getDims()[0] == 1)
4810 return LoadConstant(0);
4811 Arg = &ClusterWorkGroupIDX;
4812 ArgRC = &AMDGPU::SReg_32RegClass;
4813 ArgTy = LLT::scalar(32);
4814 break;
4816 if (HasFixedDims && ClusterDims.getDims()[1] == 1)
4817 return LoadConstant(0);
4818 Arg = &ClusterWorkGroupIDY;
4819 ArgRC = &AMDGPU::SReg_32RegClass;
4820 ArgTy = LLT::scalar(32);
4821 break;
4823 if (HasFixedDims && ClusterDims.getDims()[2] == 1)
4824 return LoadConstant(0);
4825 Arg = &ClusterWorkGroupIDZ;
4826 ArgRC = &AMDGPU::SReg_32RegClass;
4827 ArgTy = LLT::scalar(32);
4828 break;
4830 if (HasFixedDims)
4831 return LoadConstant(ClusterDims.getDims()[0] - 1);
4832 Arg = &ClusterWorkGroupMaxIDX;
4833 ArgRC = &AMDGPU::SReg_32RegClass;
4834 ArgTy = LLT::scalar(32);
4835 break;
4837 if (HasFixedDims)
4838 return LoadConstant(ClusterDims.getDims()[1] - 1);
4839 Arg = &ClusterWorkGroupMaxIDY;
4840 ArgRC = &AMDGPU::SReg_32RegClass;
4841 ArgTy = LLT::scalar(32);
4842 break;
4844 if (HasFixedDims)
4845 return LoadConstant(ClusterDims.getDims()[2] - 1);
4846 Arg = &ClusterWorkGroupMaxIDZ;
4847 ArgRC = &AMDGPU::SReg_32RegClass;
4848 ArgTy = LLT::scalar(32);
4849 break;
4851 Arg = &ClusterWorkGroupMaxFlatID;
4852 ArgRC = &AMDGPU::SReg_32RegClass;
4853 ArgTy = LLT::scalar(32);
4854 break;
4855 default:
4856 break;
4857 }
4858 }
4859
4860 if (!Arg)
4861 std::tie(Arg, ArgRC, ArgTy) = MFI->getPreloadedValue(ArgType);
4862
4863 if (!Arg) {
// NOTE(review): line 4864 (presumably a check that ArgType is the kernarg
// segment pointer) is missing from this extracted view; confirm upstream.
4865 // The intrinsic may appear when we have a 0 sized kernarg segment, in
4866 // which case the pointer argument may be missing and we use null.
4867 return LoadConstant(0);
4868 }
4869
4870 // It's undefined behavior if a function marked with the amdgpu-no-*
4871 // attributes uses the corresponding intrinsic.
4872 B.buildUndef(DstReg);
4873 return true;
4874 }
4875
4876 if (!Arg->isRegister() || !Arg->getRegister().isValid())
4877 return false; // TODO: Handle these
4878 buildLoadInputValue(DstReg, B, Arg, ArgRC, ArgTy);
4879 return true;
4880}
4881
// Thin wrapper: load the preloaded argument into the intrinsic's result
// register, then erase the intrinsic.
// NOTE(review): the leading declaration lines naming this function are
// missing from this extracted view — presumably
// AMDGPULegalizerInfo::legalizePreloadedArgIntrinsic; confirm upstream.
4885 if (!loadInputValue(MI.getOperand(0).getReg(), B, ArgType))
4886 return false;
4887
4888 MI.eraseFromParent();
4889 return true;
4890}
4891
// Replace the instruction's result with the constant C and erase it.
// NOTE(review): the signature line is missing from this extracted view —
// presumably `static bool replaceWithConstant(MachineIRBuilder &B,
// MachineInstr &MI, int64_t C)`; confirm against upstream.
4893 int64_t C) {
4894 B.buildConstant(MI.getOperand(0).getReg(), C);
4895 MI.eraseFromParent();
4896 return true;
4897}
4898
// Legalize a workitem-id intrinsic for dimension Dim: fold to 0 when the
// maximum id is 0, emit undef when the argument was elided via amdgpu-no-*
// attributes, otherwise load the input value (with an AssertZExt bound for
// unpacked ids).
// NOTE(review): the function-name line is missing from this extracted view —
// presumably AMDGPULegalizerInfo::legalizeWorkitemIDIntrinsic; confirm.
4901 unsigned Dim, AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
4902 unsigned MaxID = ST.getMaxWorkitemID(B.getMF().getFunction(), Dim);
4903 if (MaxID == 0)
4904 return replaceWithConstant(B, MI, 0);
4905
4906 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
4907 const ArgDescriptor *Arg;
4908 const TargetRegisterClass *ArgRC;
4909 LLT ArgTy;
4910 std::tie(Arg, ArgRC, ArgTy) = MFI->getPreloadedValue(ArgType);
4911
4912 Register DstReg = MI.getOperand(0).getReg();
4913 if (!Arg) {
4914 // It's undefined behavior if a function marked with the amdgpu-no-*
4915 // attributes uses the corresponding intrinsic.
4916 B.buildUndef(DstReg);
4917 MI.eraseFromParent();
4918 return true;
4919 }
4920
4921 if (Arg->isMasked()) {
4922 // Don't bother inserting AssertZext for packed IDs since we're emitting the
4923 // masking operations anyway.
4924 //
4925 // TODO: We could assert the top bit is 0 for the source copy.
4926 if (!loadInputValue(DstReg, B, ArgType))
4927 return false;
4928 } else {
// Unpacked id: assert the value fits in bit_width(MaxID) bits so later
// combines can exploit the known range.
4929 Register TmpReg = MRI.createGenericVirtualRegister(LLT::scalar(32));
4930 if (!loadInputValue(TmpReg, B, ArgType))
4931 return false;
4932 B.buildAssertZExt(DstReg, TmpReg, llvm::bit_width(MaxID));
4933 }
4934
4935 MI.eraseFromParent();
4936 return true;
4937}
4938
// NOTE(review): the body of this helper was dropped by the doc extraction;
// only the return of a MachinePointerInfo is visible. Confirm against the
// upstream source before relying on any detail here.
4941 // This isn't really a constant pool but close enough.
4944 return PtrInfo;
4945}
4946
// Materialize a pointer to the kernarg segment at byte Offset: load the
// preloaded kernarg segment pointer and add the constant offset to it.
// NOTE(review): the extraction dropped the line declaring PtrTy and the
// PreloadedValue enum argument to loadInputValue — confirm upstream.
4948 int64_t Offset) const {
4950 Register KernArgReg = B.getMRI()->createGenericVirtualRegister(PtrTy);
4951
4952 // TODO: If we passed in the base kernel offset we could have a better
4953 // alignment than 4, but we don't really need it.
4954 if (!loadInputValue(KernArgReg, B,
4956 llvm_unreachable("failed to find kernarg segment ptr");
4957
4958 auto COffset = B.buildConstant(LLT::scalar(64), Offset);
4959 return B.buildObjectPtrOffset(PtrTy, KernArgReg, COffset).getReg(0);
4960}
4961
4962 /// Legalize a value that's loaded from kernel arguments. This is only used by
4963 /// legacy intrinsics.
// NOTE(review): the signature and the lines computing Ptr/PtrInfo plus the
// memory-operand flags of the load were dropped by the extraction — confirm
// against the upstream source.
4967 Align Alignment) const {
4968 Register DstReg = MI.getOperand(0).getReg();
4969
// Legacy kernarg intrinsics only produce 32-bit values.
4970 assert(B.getMRI()->getType(DstReg) == LLT::scalar(32) &&
4971 "unexpected kernarg parameter type");
4972
4975 B.buildLoad(DstReg, Ptr, PtrInfo.getWithOffset(Offset), Align(4),
4978 MI.eraseFromParent();
4979 return true;
4980}
4981
// Dispatch G_FDIV legalization by result scalar type (f16/f32/f64).
// Returns false for any other type, leaving the instruction for other
// legalization rules.
4984 MachineIRBuilder &B) const {
4985 Register Dst = MI.getOperand(0).getReg();
4986 LLT DstTy = MRI.getType(Dst);
4987 LLT S16 = LLT::scalar(16);
4988 LLT S32 = LLT::scalar(32);
4989 LLT S64 = LLT::scalar(64);
4990
4991 if (DstTy == S16)
4992 return legalizeFDIV16(MI, MRI, B);
4993 if (DstTy == S32)
4994 return legalizeFDIV32(MI, MRI, B);
4995 if (DstTy == S64)
4996 return legalizeFDIV64(MI, MRI, B);
4997
4998 return false;
4999}
5000
// Expand 32-bit unsigned division/remainder X / Y using a float-reciprocal
// initial estimate plus one Newton-Raphson step and two quotient/remainder
// refinements. Either DstDivReg or DstRemReg may be a null Register, in
// which case that result is not emitted.
5002 Register DstDivReg,
5003 Register DstRemReg,
5004 Register X,
5005 Register Y) const {
5006 const LLT S1 = LLT::scalar(1);
5007 const LLT S32 = LLT::scalar(32);
5008
5009 // See AMDGPUCodeGenPrepare::expandDivRem32 for a description of the
5010 // algorithm used here.
5011
5012 // Initial estimate of inv(y).
5013 auto FloatY = B.buildUITOFP(S32, Y);
5014 auto RcpIFlag = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {FloatY});
// 0x4f7ffffe is the float just below 2^32; scaling the reciprocal by it
// yields a fixed-point estimate of 2^32 / y.
5015 auto Scale = B.buildFConstant(S32, llvm::bit_cast<float>(0x4f7ffffe));
5016 auto ScaledY = B.buildFMul(S32, RcpIFlag, Scale);
5017 auto Z = B.buildFPTOUI(S32, ScaledY);
5018
5019 // One round of UNR.
5020 auto NegY = B.buildSub(S32, B.buildConstant(S32, 0), Y);
5021 auto NegYZ = B.buildMul(S32, NegY, Z);
5022 Z = B.buildAdd(S32, Z, B.buildUMulH(S32, Z, NegYZ));
5023
5024 // Quotient/remainder estimate.
5025 auto Q = B.buildUMulH(S32, X, Z);
5026 auto R = B.buildSub(S32, X, B.buildMul(S32, Q, Y));
5027
5028 // First quotient/remainder refinement.
5029 auto One = B.buildConstant(S32, 1);
5030 auto Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y);
5031 if (DstDivReg)
5032 Q = B.buildSelect(S32, Cond, B.buildAdd(S32, Q, One), Q);
5033 R = B.buildSelect(S32, Cond, B.buildSub(S32, R, Y), R);
5034
5035 // Second quotient/remainder refinement.
5036 Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y);
5037 if (DstDivReg)
5038 B.buildSelect(DstDivReg, Cond, B.buildAdd(S32, Q, One), Q);
5039
5040 if (DstRemReg)
5041 B.buildSelect(DstRemReg, Cond, B.buildSub(S32, R, Y), R);
5042}
5043
5044// Build integer reciprocal sequence around V_RCP_IFLAG_F32
5045//
5046// Return lo, hi of result
5047//
5048// %cvt.lo = G_UITOFP Val.lo
5049// %cvt.hi = G_UITOFP Val.hi
5050// %mad = G_FMAD %cvt.hi, 2**32, %cvt.lo
5051// %rcp = G_AMDGPU_RCP_IFLAG %mad
5052// %mul1 = G_FMUL %rcp, 0x5f7ffffc
5053// %mul2 = G_FMUL %mul1, 2**(-32)
5054// %trunc = G_INTRINSIC_TRUNC %mul2
5055// %mad2 = G_FMAD %trunc, -(2**32), %mul1
5056// return {G_FPTOUI %mad2, G_FPTOUI %trunc}
5057static std::pair<Register, Register> emitReciprocalU64(MachineIRBuilder &B,
5058 Register Val) {
5059 const LLT S32 = LLT::scalar(32);
5060 auto Unmerge = B.buildUnmerge(S32, Val);
5061
5062 auto CvtLo = B.buildUITOFP(S32, Unmerge.getReg(0));
5063 auto CvtHi = B.buildUITOFP(S32, Unmerge.getReg(1));
5064
5065 auto Mad = B.buildFMAD(
5066 S32, CvtHi, // 2**32
5067 B.buildFConstant(S32, llvm::bit_cast<float>(0x4f800000)), CvtLo);
5068
5069 auto Rcp = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {Mad});
5070 auto Mul1 = B.buildFMul(
5071 S32, Rcp, B.buildFConstant(S32, llvm::bit_cast<float>(0x5f7ffffc)));
5072
5073 // 2**(-32)
5074 auto Mul2 = B.buildFMul(
5075 S32, Mul1, B.buildFConstant(S32, llvm::bit_cast<float>(0x2f800000)));
5076 auto Trunc = B.buildIntrinsicTrunc(S32, Mul2);
5077
5078 // -(2**32)
5079 auto Mad2 = B.buildFMAD(
5080 S32, Trunc, B.buildFConstant(S32, llvm::bit_cast<float>(0xcf800000)),
5081 Mul1);
5082
5083 auto ResultLo = B.buildFPTOUI(S32, Mad2);
5084 auto ResultHi = B.buildFPTOUI(S32, Trunc);
5085
5086 return {ResultLo.getReg(0), ResultHi.getReg(0)};
5087}
5088
// Expand 64-bit unsigned division/remainder Numer / Denom. Starts from the
// reciprocal estimate of emitReciprocalU64, runs two 64-bit Newton-Raphson
// refinements built from 32-bit add/carry chains, then applies up to two
// conditional +1/-Denom corrections to the quotient/remainder. Either
// destination register may be null to skip that result.
5090 Register DstDivReg,
5091 Register DstRemReg,
5092 Register Numer,
5093 Register Denom) const {
5094 const LLT S32 = LLT::scalar(32);
5095 const LLT S64 = LLT::scalar(64);
5096 const LLT S1 = LLT::scalar(1);
5097 Register RcpLo, RcpHi;
5098
5099 std::tie(RcpLo, RcpHi) = emitReciprocalU64(B, Denom);
5100
5101 auto Rcp = B.buildMergeLikeInstr(S64, {RcpLo, RcpHi});
5102
5103 auto Zero64 = B.buildConstant(S64, 0);
5104 auto NegDenom = B.buildSub(S64, Zero64, Denom);
5105
// First Newton-Raphson iteration: Rcp += Rcp * umulh(Rcp, -Denom * Rcp).
5106 auto MulLo1 = B.buildMul(S64, NegDenom, Rcp);
5107 auto MulHi1 = B.buildUMulH(S64, Rcp, MulLo1);
5108
5109 auto UnmergeMulHi1 = B.buildUnmerge(S32, MulHi1);
5110 Register MulHi1_Lo = UnmergeMulHi1.getReg(0);
5111 Register MulHi1_Hi = UnmergeMulHi1.getReg(1);
5112
5113 auto Add1_Lo = B.buildUAddo(S32, S1, RcpLo, MulHi1_Lo);
5114 auto Add1_Hi = B.buildUAdde(S32, S1, RcpHi, MulHi1_Hi, Add1_Lo.getReg(1));
5115 auto Add1 = B.buildMergeLikeInstr(S64, {Add1_Lo, Add1_Hi});
5116
// Second Newton-Raphson iteration on the refined estimate Add1.
5117 auto MulLo2 = B.buildMul(S64, NegDenom, Add1);
5118 auto MulHi2 = B.buildUMulH(S64, Add1, MulLo2);
5119 auto UnmergeMulHi2 = B.buildUnmerge(S32, MulHi2);
5120 Register MulHi2_Lo = UnmergeMulHi2.getReg(0);
5121 Register MulHi2_Hi = UnmergeMulHi2.getReg(1);
5122
5123 auto Zero32 = B.buildConstant(S32, 0);
5124 auto Add2_Lo = B.buildUAddo(S32, S1, Add1_Lo, MulHi2_Lo);
5125 auto Add2_Hi = B.buildUAdde(S32, S1, Add1_Hi, MulHi2_Hi, Add2_Lo.getReg(1));
5126 auto Add2 = B.buildMergeLikeInstr(S64, {Add2_Lo, Add2_Hi});
5127
5128 auto UnmergeNumer = B.buildUnmerge(S32, Numer);
5129 Register NumerLo = UnmergeNumer.getReg(0);
5130 Register NumerHi = UnmergeNumer.getReg(1);
5131
// Quotient estimate MulHi3 = umulh(Numer, Add2); remainder estimate
// Sub1 = Numer - Denom * MulHi3 (built from 32-bit borrow chains).
5132 auto MulHi3 = B.buildUMulH(S64, Numer, Add2);
5133 auto Mul3 = B.buildMul(S64, Denom, MulHi3);
5134 auto UnmergeMul3 = B.buildUnmerge(S32, Mul3);
5135 Register Mul3_Lo = UnmergeMul3.getReg(0);
5136 Register Mul3_Hi = UnmergeMul3.getReg(1);
5137 auto Sub1_Lo = B.buildUSubo(S32, S1, NumerLo, Mul3_Lo);
5138 auto Sub1_Hi = B.buildUSube(S32, S1, NumerHi, Mul3_Hi, Sub1_Lo.getReg(1));
5139 auto Sub1_Mi = B.buildSub(S32, NumerHi, Mul3_Hi);
5140 auto Sub1 = B.buildMergeLikeInstr(S64, {Sub1_Lo, Sub1_Hi});
5141
5142 auto UnmergeDenom = B.buildUnmerge(S32, Denom);
5143 Register DenomLo = UnmergeDenom.getReg(0);
5144 Register DenomHi = UnmergeDenom.getReg(1);
5145
// C3 is an all-ones/zero mask for "remainder >= Denom" (first correction).
5146 auto CmpHi = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Hi, DenomHi);
5147 auto C1 = B.buildSExt(S32, CmpHi);
5148
5149 auto CmpLo = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Lo, DenomLo);
5150 auto C2 = B.buildSExt(S32, CmpLo);
5151
5152 auto CmpEq = B.buildICmp(CmpInst::ICMP_EQ, S1, Sub1_Hi, DenomHi);
5153 auto C3 = B.buildSelect(S32, CmpEq, C2, C1);
5154
5155 // TODO: Here and below portions of the code can be enclosed into if/endif.
5156 // Currently control flow is unconditional and we have 4 selects after
5157 // potential endif to substitute PHIs.
5158
5159 // if C3 != 0 ...
5160 auto Sub2_Lo = B.buildUSubo(S32, S1, Sub1_Lo, DenomLo);
5161 auto Sub2_Mi = B.buildUSube(S32, S1, Sub1_Mi, DenomHi, Sub1_Lo.getReg(1));
5162 auto Sub2_Hi = B.buildUSube(S32, S1, Sub2_Mi, Zero32, Sub2_Lo.getReg(1));
5163 auto Sub2 = B.buildMergeLikeInstr(S64, {Sub2_Lo, Sub2_Hi});
5164
5165 auto One64 = B.buildConstant(S64, 1);
5166 auto Add3 = B.buildAdd(S64, MulHi3, One64);
5167
// C6: second "remainder still >= Denom" correction mask.
5168 auto C4 =
5169 B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Hi, DenomHi));
5170 auto C5 =
5171 B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Lo, DenomLo));
5172 auto C6 = B.buildSelect(
5173 S32, B.buildICmp(CmpInst::ICMP_EQ, S1, Sub2_Hi, DenomHi), C5, C4);
5174
5175 // if (C6 != 0)
5176 auto Add4 = B.buildAdd(S64, Add3, One64);
5177 auto Sub3_Lo = B.buildUSubo(S32, S1, Sub2_Lo, DenomLo);
5178
5179 auto Sub3_Mi = B.buildUSube(S32, S1, Sub2_Mi, DenomHi, Sub2_Lo.getReg(1));
5180 auto Sub3_Hi = B.buildUSube(S32, S1, Sub3_Mi, Zero32, Sub3_Lo.getReg(1));
5181 auto Sub3 = B.buildMergeLikeInstr(S64, {Sub3_Lo, Sub3_Hi});
5182
5183 // endif C6
5184 // endif C3
5185
// Select the corrected quotient/remainder based on the two masks.
5186 if (DstDivReg) {
5187 auto Sel1 = B.buildSelect(
5188 S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Add4, Add3);
5189 B.buildSelect(DstDivReg, B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32),
5190 Sel1, MulHi3);
5191 }
5192
5193 if (DstRemReg) {
5194 auto Sel2 = B.buildSelect(
5195 S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Sub3, Sub2);
5196 B.buildSelect(DstRemReg, B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32),
5197 Sel2, Sub1);
5198 }
5199}
5200
// Legalize G_UDIV / G_UREM / G_UDIVREM: pick which destination registers
// are live from the opcode, then expand via the 32- or 64-bit unsigned
// DIV/REM implementation. Returns false for unsupported widths.
5203 MachineIRBuilder &B) const {
5204 Register DstDivReg, DstRemReg;
5205 switch (MI.getOpcode()) {
5206 default:
5207 llvm_unreachable("Unexpected opcode!");
5208 case AMDGPU::G_UDIV: {
5209 DstDivReg = MI.getOperand(0).getReg();
5210 break;
5211 }
5212 case AMDGPU::G_UREM: {
5213 DstRemReg = MI.getOperand(0).getReg();
5214 break;
5215 }
5216 case AMDGPU::G_UDIVREM: {
5217 DstDivReg = MI.getOperand(0).getReg();
5218 DstRemReg = MI.getOperand(1).getReg();
5219 break;
5220 }
5221 }
5222
5223 const LLT S64 = LLT::scalar(64);
5224 const LLT S32 = LLT::scalar(32);
// G_UDIVREM has two defs, so locate the sources after the defs.
5225 const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs();
5226 Register Num = MI.getOperand(FirstSrcOpIdx).getReg();
5227 Register Den = MI.getOperand(FirstSrcOpIdx + 1).getReg();
5228 LLT Ty = MRI.getType(MI.getOperand(0).getReg());
5229
5230 if (Ty == S32)
5231 legalizeUnsignedDIV_REM32Impl(B, DstDivReg, DstRemReg, Num, Den);
5232 else if (Ty == S64)
5233 legalizeUnsignedDIV_REM64Impl(B, DstDivReg, DstRemReg, Num, Den);
5234 else
5235 return false;
5236
5237 MI.eraseFromParent();
5238 return true;
5239}
5240
// Legalize G_SDIV / G_SREM / G_SDIVREM by reducing to the unsigned
// expansion: take absolute values via the sign-mask add/xor trick, divide
// unsigned into temporaries, then re-apply the proper result signs.
5243 MachineIRBuilder &B) const {
5244 const LLT S64 = LLT::scalar(64);
5245 const LLT S32 = LLT::scalar(32);
5246
5247 LLT Ty = MRI.getType(MI.getOperand(0).getReg());
5248 if (Ty != S32 && Ty != S64)
5249 return false;
5250
5251 const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs();
5252 Register LHS = MI.getOperand(FirstSrcOpIdx).getReg();
5253 Register RHS = MI.getOperand(FirstSrcOpIdx + 1).getReg();
5254
// Sign masks: all-ones when negative, zero otherwise.
5255 auto SignBitOffset = B.buildConstant(S32, Ty.getSizeInBits() - 1);
5256 auto LHSign = B.buildAShr(Ty, LHS, SignBitOffset);
5257 auto RHSign = B.buildAShr(Ty, RHS, SignBitOffset);
5258
// abs(x) == (x + sign) ^ sign.
5259 LHS = B.buildAdd(Ty, LHS, LHSign).getReg(0);
5260 RHS = B.buildAdd(Ty, RHS, RHSign).getReg(0);
5261
5262 LHS = B.buildXor(Ty, LHS, LHSign).getReg(0);
5263 RHS = B.buildXor(Ty, RHS, RHSign).getReg(0);
5264
5265 Register DstDivReg, DstRemReg, TmpDivReg, TmpRemReg;
5266 switch (MI.getOpcode()) {
5267 default:
5268 llvm_unreachable("Unexpected opcode!");
5269 case AMDGPU::G_SDIV: {
5270 DstDivReg = MI.getOperand(0).getReg();
5271 TmpDivReg = MRI.createGenericVirtualRegister(Ty);
5272 break;
5273 }
5274 case AMDGPU::G_SREM: {
5275 DstRemReg = MI.getOperand(0).getReg();
5276 TmpRemReg = MRI.createGenericVirtualRegister(Ty);
5277 break;
5278 }
5279 case AMDGPU::G_SDIVREM: {
5280 DstDivReg = MI.getOperand(0).getReg();
5281 DstRemReg = MI.getOperand(1).getReg();
5282 TmpDivReg = MRI.createGenericVirtualRegister(Ty);
5283 TmpRemReg = MRI.createGenericVirtualRegister(Ty);
5284 break;
5285 }
5286 }
5287
5288 if (Ty == S32)
5289 legalizeUnsignedDIV_REM32Impl(B, TmpDivReg, TmpRemReg, LHS, RHS);
5290 else
5291 legalizeUnsignedDIV_REM64Impl(B, TmpDivReg, TmpRemReg, LHS, RHS);
5292
// Quotient is negative iff the operand signs differ; negate via xor/sub.
5293 if (DstDivReg) {
5294 auto Sign = B.buildXor(Ty, LHSign, RHSign).getReg(0);
5295 auto SignXor = B.buildXor(Ty, TmpDivReg, Sign).getReg(0);
5296 B.buildSub(DstDivReg, SignXor, Sign);
5297 }
5298
5299 if (DstRemReg) {
5300 auto Sign = LHSign.getReg(0); // Remainder sign is the same as LHS
5301 auto SignXor = B.buildXor(Ty, TmpRemReg, Sign).getReg(0);
5302 B.buildSub(DstRemReg, SignXor, Sign);
5303 }
5304
5305 MI.eraseFromParent();
5306 return true;
5307}
5308
// Try to lower an FDIV using the fast-but-inexact rcp path when the
// instruction's fast-math flags permit it. Special-cases 1/x and -1/x as a
// bare rcp; otherwise emits x * rcp(y). Returns false when precision rules
// forbid the shortcut, leaving the slow expansion to the caller.
5311 MachineIRBuilder &B) const {
5312 Register Res = MI.getOperand(0).getReg();
5313 Register LHS = MI.getOperand(1).getReg();
5314 Register RHS = MI.getOperand(2).getReg();
5315 uint16_t Flags = MI.getFlags();
5316 LLT ResTy = MRI.getType(Res);
5317
5318 bool AllowInaccurateRcp = MI.getFlag(MachineInstr::FmAfn);
5319
5320 if (const auto *CLHS = getConstantFPVRegVal(LHS, MRI)) {
// rcp is always accurate enough for f16; other types need afn.
5321 if (!AllowInaccurateRcp && ResTy != LLT::scalar(16))
5322 return false;
5323
5324 // v_rcp_f32 and v_rsq_f32 do not support denormals, and according to
5325 // the CI documentation has a worst case error of 1 ulp.
5326 // OpenCL requires <= 2.5 ulp for 1.0 / x, so it should always be OK to
5327 // use it as long as we aren't trying to use denormals.
5328 //
5329 // v_rcp_f16 and v_rsq_f16 DO support denormals and 0.51ulp.
5330
5331 // 1 / x -> RCP(x)
5332 if (CLHS->isExactlyValue(1.0)) {
5333 B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res)
5334 .addUse(RHS)
5335 .setMIFlags(Flags);
5336
5337 MI.eraseFromParent();
5338 return true;
5339 }
5340
5341 // -1 / x -> RCP( FNEG(x) )
5342 if (CLHS->isExactlyValue(-1.0)) {
5343 auto FNeg = B.buildFNeg(ResTy, RHS, Flags);
5344 B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res)
5345 .addUse(FNeg.getReg(0))
5346 .setMIFlags(Flags);
5347
5348 MI.eraseFromParent();
5349 return true;
5350 }
5351 }
5352
5353 // For f16 require afn or arcp.
5354 // For f32 require afn.
5355 if (!AllowInaccurateRcp && (ResTy != LLT::scalar(16) ||
5356 !MI.getFlag(MachineInstr::FmArcp)))
5357 return false;
5358
5359 // x / y -> x * (1.0 / y)
5360 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy})
5361 .addUse(RHS)
5362 .setMIFlags(Flags);
5363 B.buildFMul(Res, LHS, RCP, Flags);
5364
5365 MI.eraseFromParent();
5366 return true;
5367}
5368
5371 MachineIRBuilder &B) const {
5372 Register Res = MI.getOperand(0).getReg();
5373 Register X = MI.getOperand(1).getReg();
5374 Register Y = MI.getOperand(2).getReg();
5375 uint16_t Flags = MI.getFlags();
5376 LLT ResTy = MRI.getType(Res);
5377
5378 bool AllowInaccurateRcp = MI.getFlag(MachineInstr::FmAfn);
5379
5380 if (!AllowInaccurateRcp)
5381 return false;
5382
5383 auto NegY = B.buildFNeg(ResTy, Y);
5384 auto One = B.buildFConstant(ResTy, 1.0);
5385
5386 auto R = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy})
5387 .addUse(Y)
5388 .setMIFlags(Flags);
5389
5390 auto Tmp0 = B.buildFMA(ResTy, NegY, R, One);
5391 R = B.buildFMA(ResTy, Tmp0, R, R);
5392
5393 auto Tmp1 = B.buildFMA(ResTy, NegY, R, One);
5394 R = B.buildFMA(ResTy, Tmp1, R, R);
5395
5396 auto Ret = B.buildFMul(ResTy, X, R);
5397 auto Tmp2 = B.buildFMA(ResTy, NegY, Ret, X);
5398
5399 B.buildFMA(Res, Tmp2, R, Ret);
5400 MI.eraseFromParent();
5401 return true;
5402}
5403
// Legalize f16 FDIV: compute in f32 (extend, rcp, two error-correction
// steps), truncate back to f16, and fix up with v_div_fixup_f16.
// NOTE(review): the extraction dropped the line preceding `return true;`
// (presumably an early-out via the fast/unsafe path) and the declaration of
// `Err` before the hasMadMacF32Insts() branch — confirm upstream.
5406 MachineIRBuilder &B) const {
5408 return true;
5409
5410 Register Res = MI.getOperand(0).getReg();
5411 Register LHS = MI.getOperand(1).getReg();
5412 Register RHS = MI.getOperand(2).getReg();
5413
5414 uint16_t Flags = MI.getFlags();
5415
5416 LLT S16 = LLT::scalar(16);
5417 LLT S32 = LLT::scalar(32);
5418
5419 // a32.u = opx(V_CVT_F32_F16, a.u); // CVT to F32
5420 // b32.u = opx(V_CVT_F32_F16, b.u); // CVT to F32
5421 // r32.u = opx(V_RCP_F32, b32.u); // rcp = 1 / d
5422 // q32.u = opx(V_MUL_F32, a32.u, r32.u); // q = n * rcp
5423 // e32.u = opx(V_MAD_F32, (b32.u^_neg32), q32.u, a32.u); // err = -d * q + n
5424 // q32.u = opx(V_MAD_F32, e32.u, r32.u, q32.u); // q = n * rcp
5425 // e32.u = opx(V_MAD_F32, (b32.u^_neg32), q32.u, a32.u); // err = -d * q + n
5426 // tmp.u = opx(V_MUL_F32, e32.u, r32.u);
5427 // tmp.u = opx(V_AND_B32, tmp.u, 0xff800000)
5428 // q32.u = opx(V_ADD_F32, tmp.u, q32.u);
5429 // q16.u = opx(V_CVT_F16_F32, q32.u);
5430 // q16.u = opx(V_DIV_FIXUP_F16, q16.u, b.u, a.u); // q = touchup(q, d, n)
5431
5432 auto LHSExt = B.buildFPExt(S32, LHS, Flags);
5433 auto RHSExt = B.buildFPExt(S32, RHS, Flags);
5434 auto NegRHSExt = B.buildFNeg(S32, RHSExt);
5435 auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32})
5436 .addUse(RHSExt.getReg(0))
5437 .setMIFlags(Flags);
5438 auto Quot = B.buildFMul(S32, LHSExt, Rcp, Flags);
5440 if (ST.hasMadMacF32Insts()) {
5441 Err = B.buildFMAD(S32, NegRHSExt, Quot, LHSExt, Flags);
5442 Quot = B.buildFMAD(S32, Err, Rcp, Quot, Flags);
5443 Err = B.buildFMAD(S32, NegRHSExt, Quot, LHSExt, Flags);
5444 } else {
5445 Err = B.buildFMA(S32, NegRHSExt, Quot, LHSExt, Flags);
5446 Quot = B.buildFMA(S32, Err, Rcp, Quot, Flags);
5447 Err = B.buildFMA(S32, NegRHSExt, Quot, LHSExt, Flags);
5448 }
5449 auto Tmp = B.buildFMul(S32, Err, Rcp, Flags);
// Mask to sign+exponent bits (0xff800000) before folding in the correction.
5450 Tmp = B.buildAnd(S32, Tmp, B.buildConstant(S32, 0xff800000));
5451 Quot = B.buildFAdd(S32, Tmp, Quot, Flags);
5452 auto RDst = B.buildFPTrunc(S16, Quot, Flags);
5453 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res)
5454 .addUse(RDst.getReg(0))
5455 .addUse(RHS)
5456 .addUse(LHS)
5457 .setMIFlags(Flags);
5458
5459 MI.eraseFromParent();
5460 return true;
5461}
5462
// NOTE(review): the extraction dropped the field value after the `=`, the
// toggleSPDenormMode signature line, and its final parameter line — confirm
// against the upstream source.
5463 static constexpr unsigned SPDenormModeBitField =
5465
5466 // Enable or disable FP32 denorm mode. When 'Enable' is true, emit instructions
5467 // to enable denorm mode. When 'Enable' is false, disable denorm mode.
5469 const GCNSubtarget &ST,
5471 // Set SP denorm mode to this value.
5472 unsigned SPDenormMode =
5473 Enable ? FP_DENORM_FLUSH_NONE : Mode.fpDenormModeSPValue();
5474
// Prefer the dedicated S_DENORM_MODE instruction when available; otherwise
// fall back to writing the mode register field with S_SETREG.
5475 if (ST.hasDenormModeInst()) {
5476 // Preserve default FP64FP16 denorm mode while updating FP32 mode.
5477 uint32_t DPDenormModeDefault = Mode.fpDenormModeDPValue();
5478
5479 uint32_t NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2);
5480 B.buildInstr(AMDGPU::S_DENORM_MODE)
5481 .addImm(NewDenormModeValue);
5482
5483 } else {
5484 B.buildInstr(AMDGPU::S_SETREG_IMM32_B32)
5485 .addImm(SPDenormMode)
5486 .addImm(SPDenormModeBitField);
5487 }
5488}
5489
// Legalize f32 FDIV with the full-precision div_scale / div_fmas /
// div_fixup sequence, temporarily enabling FP32 denormal support around the
// FMA core when the current mode would flush denormals.
// NOTE(review): the extraction dropped the line preceding `return true;`
// (presumably an early-out via the fast/unsafe path) — confirm upstream.
5492 MachineIRBuilder &B) const {
5494 return true;
5495
5496 Register Res = MI.getOperand(0).getReg();
5497 Register LHS = MI.getOperand(1).getReg();
5498 Register RHS = MI.getOperand(2).getReg();
5499 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
5500 SIModeRegisterDefaults Mode = MFI->getMode();
5501
5502 uint16_t Flags = MI.getFlags();
5503
5504 LLT S32 = LLT::scalar(32);
5505 LLT S1 = LLT::scalar(1);
5506
5507 auto One = B.buildFConstant(S32, 1.0f);
5508
5509 auto DenominatorScaled =
5510 B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1})
5511 .addUse(LHS)
5512 .addUse(RHS)
5513 .addImm(0)
5514 .setMIFlags(Flags);
5515 auto NumeratorScaled =
5516 B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1})
5517 .addUse(LHS)
5518 .addUse(RHS)
5519 .addImm(1)
5520 .setMIFlags(Flags);
5521
5522 auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32})
5523 .addUse(DenominatorScaled.getReg(0))
5524 .setMIFlags(Flags);
5525 auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags);
5526
5527 const bool PreservesDenormals = Mode.FP32Denormals == DenormalMode::getIEEE();
5528 const bool HasDynamicDenormals =
5529 (Mode.FP32Denormals.Input == DenormalMode::Dynamic) ||
5530 (Mode.FP32Denormals.Output == DenormalMode::Dynamic);
5531
// If the mode register may flush denormals, save it (dynamic mode) or
// force denormals on for the duration of the FMA sequence.
5532 Register SavedSPDenormMode;
5533 if (!PreservesDenormals) {
5534 if (HasDynamicDenormals) {
5535 SavedSPDenormMode = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5536 B.buildInstr(AMDGPU::S_GETREG_B32)
5537 .addDef(SavedSPDenormMode)
5538 .addImm(SPDenormModeBitField);
5539 }
5540 toggleSPDenormMode(true, B, ST, Mode);
5541 }
5542
5543 auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags);
5544 auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags);
5545 auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags);
5546 auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags);
5547 auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags);
5548 auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags);
5549
// Restore the saved mode (dynamic) or flip denormals back off.
5550 if (!PreservesDenormals) {
5551 if (HasDynamicDenormals) {
5552 assert(SavedSPDenormMode);
5553 B.buildInstr(AMDGPU::S_SETREG_B32)
5554 .addReg(SavedSPDenormMode)
5555 .addImm(SPDenormModeBitField);
5556 } else
5557 toggleSPDenormMode(false, B, ST, Mode);
5558 }
5559
5560 auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32})
5561 .addUse(Fma4.getReg(0))
5562 .addUse(Fma1.getReg(0))
5563 .addUse(Fma3.getReg(0))
5564 .addUse(NumeratorScaled.getReg(1))
5565 .setMIFlags(Flags);
5566
5567 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res)
5568 .addUse(Fmas.getReg(0))
5569 .addUse(RHS)
5570 .addUse(LHS)
5571 .setMIFlags(Flags);
5572
5573 MI.eraseFromParent();
5574 return true;
5575}
5576
// Legalize f64 FDIV with div_scale / FMA refinement / div_fmas / div_fixup,
// including the SI workaround that recomputes the div_scale condition bit
// from the operand exponents when the hardware output is unusable.
// NOTE(review): the extraction dropped the line preceding `return true;`
// (presumably an early-out via the fast/unsafe path) — confirm upstream.
5579 MachineIRBuilder &B) const {
5581 return true;
5582
5583 Register Res = MI.getOperand(0).getReg();
5584 Register LHS = MI.getOperand(1).getReg();
5585 Register RHS = MI.getOperand(2).getReg();
5586
5587 uint16_t Flags = MI.getFlags();
5588
5589 LLT S64 = LLT::scalar(64);
5590 LLT S1 = LLT::scalar(1);
5591
5592 auto One = B.buildFConstant(S64, 1.0);
5593
5594 auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1})
5595 .addUse(LHS)
5596 .addUse(RHS)
5597 .addImm(0)
5598 .setMIFlags(Flags);
5599
5600 auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags);
5601
5602 auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64})
5603 .addUse(DivScale0.getReg(0))
5604 .setMIFlags(Flags);
5605
5606 auto Fma0 = B.buildFMA(S64, NegDivScale0, Rcp, One, Flags);
5607 auto Fma1 = B.buildFMA(S64, Rcp, Fma0, Rcp, Flags);
5608 auto Fma2 = B.buildFMA(S64, NegDivScale0, Fma1, One, Flags);
5609
5610 auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1})
5611 .addUse(LHS)
5612 .addUse(RHS)
5613 .addImm(1)
5614 .setMIFlags(Flags);
5615
5616 auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags);
5617 auto Mul = B.buildFMul(S64, DivScale1.getReg(0), Fma3, Flags);
5618 auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags);
5619
5620 Register Scale;
5621 if (!ST.hasUsableDivScaleConditionOutput()) {
5622 // Workaround a hardware bug on SI where the condition output from div_scale
5623 // is not usable.
5624
5625 LLT S32 = LLT::scalar(32);
5626
// Compare the high (exponent-carrying) halves to reconstruct which operand
// was scaled, xor-combining the two tests into the condition bit.
5627 auto NumUnmerge = B.buildUnmerge(S32, LHS);
5628 auto DenUnmerge = B.buildUnmerge(S32, RHS);
5629 auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0);
5630 auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1);
5631
5632 auto CmpNum = B.buildICmp(ICmpInst::ICMP_EQ, S1, NumUnmerge.getReg(1),
5633 Scale1Unmerge.getReg(1));
5634 auto CmpDen = B.buildICmp(ICmpInst::ICMP_EQ, S1, DenUnmerge.getReg(1),
5635 Scale0Unmerge.getReg(1));
5636 Scale = B.buildXor(S1, CmpNum, CmpDen).getReg(0);
5637 } else {
5638 Scale = DivScale1.getReg(1);
5639 }
5640
5641 auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64})
5642 .addUse(Fma4.getReg(0))
5643 .addUse(Fma3.getReg(0))
5644 .addUse(Mul.getReg(0))
5645 .addUse(Scale)
5646 .setMIFlags(Flags);
5647
5648 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, ArrayRef(Res))
5649 .addUse(Fmas.getReg(0))
5650 .addUse(RHS)
5651 .addUse(LHS)
5652 .setMIFlags(Flags);
5653
5654 MI.eraseFromParent();
5655 return true;
5656}
5657
// Legalize G_FFREXP (mantissa + exponent) via the amdgcn frexp_mant /
// frexp_exp intrinsics. On subtargets with the fract bug the intrinsics
// misbehave for inf/nan inputs, so select back the original value / zero
// exponent when the input is not finite.
5660 MachineIRBuilder &B) const {
5661 Register Res0 = MI.getOperand(0).getReg();
5662 Register Res1 = MI.getOperand(1).getReg();
5663 Register Val = MI.getOperand(2).getReg();
5664 uint16_t Flags = MI.getFlags();
5665
5666 LLT Ty = MRI.getType(Res0);
// The hardware exponent result is 16-bit only for f16, else 32-bit.
5667 LLT InstrExpTy = Ty == LLT::scalar(16) ? LLT::scalar(16) : LLT::scalar(32);
5668
5669 auto Mant = B.buildIntrinsic(Intrinsic::amdgcn_frexp_mant, {Ty})
5670 .addUse(Val)
5671 .setMIFlags(Flags);
5672 auto Exp = B.buildIntrinsic(Intrinsic::amdgcn_frexp_exp, {InstrExpTy})
5673 .addUse(Val)
5674 .setMIFlags(Flags);
5675
5676 if (ST.hasFractBug()) {
5677 auto Fabs = B.buildFAbs(Ty, Val);
5678 auto Inf = B.buildFConstant(Ty, APFloat::getInf(getFltSemanticForLLT(Ty)));
5679 auto IsFinite =
5680 B.buildFCmp(CmpInst::FCMP_OLT, LLT::scalar(1), Fabs, Inf, Flags);
5681 auto Zero = B.buildConstant(InstrExpTy, 0);
5682 Exp = B.buildSelect(InstrExpTy, IsFinite, Exp, Zero);
5683 Mant = B.buildSelect(Ty, IsFinite, Mant, Val);
5684 }
5685
5686 B.buildCopy(Res0, Mant);
5687 B.buildSExtOrTrunc(Res1, Exp);
5688
5689 MI.eraseFromParent();
5690 return true;
5691}
5692
// Lower the fdiv.fast-style intrinsic: pre-scale a huge denominator down by
// 2^-32 (selected when |RHS| > 2^96) so rcp stays in range, multiply by the
// reciprocal, then multiply the same scale factor back into the result.
5694 MachineIRBuilder &B) const {
5695 Register Res = MI.getOperand(0).getReg();
5696 Register LHS = MI.getOperand(2).getReg();
5697 Register RHS = MI.getOperand(3).getReg();
5698 uint16_t Flags = MI.getFlags();
5699
5700 LLT S32 = LLT::scalar(32);
5701 LLT S1 = LLT::scalar(1);
5702
5703 auto Abs = B.buildFAbs(S32, RHS, Flags);
5704 const APFloat C0Val(1.0f);
5705
5706 auto C0 = B.buildFConstant(S32, 0x1p+96f);
5707 auto C1 = B.buildFConstant(S32, 0x1p-32f);
5708 auto C2 = B.buildFConstant(S32, 1.0f);
5709
5710 auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags);
5711 auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags);
5712
5713 auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags);
5714
5715 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32})
5716 .addUse(Mul0.getReg(0))
5717 .setMIFlags(Flags);
5718
5719 auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags);
5720
5721 B.buildFMul(Res, Sel, Mul1, Flags);
5722
5723 MI.eraseFromParent();
5724 return true;
5725}
5727
// Legalize f16 sqrt on subtargets without 16-bit instructions: extend to
// f32, use the hardware sqrt intrinsic, and truncate back.
5729 MachineIRBuilder &B) const {
5731 // Bypass the correct expansion that a standard promotion through G_FSQRT
5732 // would get. The f32 op is accurate enough for the f16 case.
5733 unsigned Flags = MI.getFlags();
5734 assert(!ST.has16BitInsts());
5735 const LLT F32 = LLT::scalar(32);
5736 auto Ext = B.buildFPExt(F32, MI.getOperand(1), Flags);
5737 auto Log2 = B.buildIntrinsic(Intrinsic::amdgcn_sqrt, {F32})
5738 .addUse(Ext.getReg(0))
5739 .setMIFlags(Flags);
5740 B.buildFPTrunc(MI.getOperand(0), Log2, Flags);
5741 MI.eraseFromParent();
5742 return true;
5743}
5744
// Legalize f32 sqrt. With approximation allowed, emit the hardware sqrt
// directly. Otherwise: scale tiny inputs up by 2^32, then either adjust the
// hardware sqrt by +/-1 ulp via residual FMAs (denorm-correct path) or run
// an rsq-based Goldschmidt refinement, and finally undo the scaling and
// pass through zero/inf inputs unchanged.
5746 MachineIRBuilder &B) const {
5748 MachineFunction &MF = B.getMF();
5749 Register Dst = MI.getOperand(0).getReg();
5750 Register X = MI.getOperand(1).getReg();
5751 const unsigned Flags = MI.getFlags();
5752 const LLT S1 = LLT::scalar(1);
5753 const LLT F32 = LLT::scalar(32);
5754 const LLT I32 = LLT::scalar(32);
5755
5756 if (allowApproxFunc(MF, Flags)) {
5757 B.buildIntrinsic(Intrinsic::amdgcn_sqrt, ArrayRef<Register>({Dst}))
5758 .addUse(X)
5759 .setMIFlags(Flags);
5760 MI.eraseFromParent();
5761 return true;
5762 }
5763
// Inputs below 2^-96 are scaled by 2^32 to avoid precision loss.
5764 auto ScaleThreshold = B.buildFConstant(F32, 0x1.0p-96f);
5765 auto NeedScale = B.buildFCmp(CmpInst::FCMP_OGT, S1, ScaleThreshold, X, Flags);
5766 auto ScaleUpFactor = B.buildFConstant(F32, 0x1.0p+32f);
5767 auto ScaledX = B.buildFMul(F32, X, ScaleUpFactor, Flags);
5768 auto SqrtX = B.buildSelect(F32, NeedScale, ScaledX, X, Flags);
5769
5770 Register SqrtS = MRI.createGenericVirtualRegister(F32);
5771 if (needsDenormHandlingF32(MF, X, Flags)) {
5772 B.buildIntrinsic(Intrinsic::amdgcn_sqrt, ArrayRef<Register>({SqrtS}))
5773 .addUse(SqrtX.getReg(0))
5774 .setMIFlags(Flags);
5775
// Test the neighbors one ulp down/up; pick whichever makes the residual
// x - s*s straddle zero correctly.
5776 auto NegOne = B.buildConstant(I32, -1);
5777 auto SqrtSNextDown = B.buildAdd(I32, SqrtS, NegOne);
5778
5779 auto NegSqrtSNextDown = B.buildFNeg(F32, SqrtSNextDown, Flags);
5780 auto SqrtVP = B.buildFMA(F32, NegSqrtSNextDown, SqrtS, SqrtX, Flags);
5781
5782 auto PosOne = B.buildConstant(I32, 1);
5783 auto SqrtSNextUp = B.buildAdd(I32, SqrtS, PosOne);
5784
5785 auto NegSqrtSNextUp = B.buildFNeg(F32, SqrtSNextUp, Flags);
5786 auto SqrtVS = B.buildFMA(F32, NegSqrtSNextUp, SqrtS, SqrtX, Flags);
5787
5788 auto Zero = B.buildFConstant(F32, 0.0f);
5789 auto SqrtVPLE0 = B.buildFCmp(CmpInst::FCMP_OLE, S1, SqrtVP, Zero, Flags);
5790
5791 SqrtS =
5792 B.buildSelect(F32, SqrtVPLE0, SqrtSNextDown, SqrtS, Flags).getReg(0);
5793
5794 auto SqrtVPVSGT0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, SqrtVS, Zero, Flags);
5795 SqrtS =
5796 B.buildSelect(F32, SqrtVPVSGT0, SqrtSNextUp, SqrtS, Flags).getReg(0);
5797 } else {
// Goldschmidt-style refinement starting from rsq.
5798 auto SqrtR =
5799 B.buildIntrinsic(Intrinsic::amdgcn_rsq, {F32}).addReg(SqrtX.getReg(0));
5800 B.buildFMul(SqrtS, SqrtX, SqrtR, Flags);
5801
5802 auto Half = B.buildFConstant(F32, 0.5f);
5803 auto SqrtH = B.buildFMul(F32, SqrtR, Half, Flags);
5804 auto NegSqrtH = B.buildFNeg(F32, SqrtH, Flags);
5805 auto SqrtE = B.buildFMA(F32, NegSqrtH, SqrtS, Half, Flags);
5806 SqrtH = B.buildFMA(F32, SqrtH, SqrtE, SqrtH, Flags);
5807 SqrtS = B.buildFMA(F32, SqrtS, SqrtE, SqrtS, Flags).getReg(0);
5808 auto NegSqrtS = B.buildFNeg(F32, SqrtS, Flags);
5809 auto SqrtD = B.buildFMA(F32, NegSqrtS, SqrtS, SqrtX, Flags);
5810 SqrtS = B.buildFMA(F32, SqrtD, SqrtH, SqrtS, Flags).getReg(0);
5811 }
5812
// Undo the 2^32 pre-scale with 2^-16 (sqrt halves the exponent shift).
5813 auto ScaleDownFactor = B.buildFConstant(F32, 0x1.0p-16f);
5814
5815 auto ScaledDown = B.buildFMul(F32, SqrtS, ScaleDownFactor, Flags);
5816
5817 SqrtS = B.buildSelect(F32, NeedScale, ScaledDown, SqrtS, Flags).getReg(0);
5818
5819 auto IsZeroOrInf = B.buildIsFPClass(LLT::scalar(1), SqrtX, fcZero | fcPosInf);
5820 B.buildSelect(Dst, IsZeroOrInf, SqrtX, SqrtS, Flags);
5821
5822 MI.eraseFromParent();
5823 return true;
5824}
5825
// Legalize f64 sqrt via rsq plus Goldschmidt refinement (see the derivation
// in the comment below); tiny inputs are pre-scaled by 2^256 (ldexp 256)
// and the result rescaled by 2^-128, with zero/inf passed through.
5827 MachineIRBuilder &B) const {
5829 // For double type, the SQRT and RSQ instructions don't have required
5830 // precision, we apply Goldschmidt's algorithm to improve the result:
5831 //
5832 // y0 = rsq(x)
5833 // g0 = x * y0
5834 // h0 = 0.5 * y0
5835 //
5836 // r0 = 0.5 - h0 * g0
5837 // g1 = g0 * r0 + g0
5838 // h1 = h0 * r0 + h0
5839 //
5840 // r1 = 0.5 - h1 * g1 => d0 = x - g1 * g1
5841 // g2 = g1 * r1 + g1 g2 = d0 * h1 + g1
5842 // h2 = h1 * r1 + h1
5843 //
5844 // r2 = 0.5 - h2 * g2 => d1 = x - g2 * g2
5845 // g3 = g2 * r2 + g2 g3 = d1 * h1 + g2
5846 //
5847 // sqrt(x) = g3
5848
5849 const LLT S1 = LLT::scalar(1);
5850 const LLT S32 = LLT::scalar(32);
5851 const LLT F64 = LLT::scalar(64);
5852
5853 Register Dst = MI.getOperand(0).getReg();
5854 assert(MRI.getType(Dst) == F64 && "only expect to lower f64 sqrt");
5855
5856 Register X = MI.getOperand(1).getReg();
5857 unsigned Flags = MI.getFlags();
5858
5859 auto ScaleConstant = B.buildFConstant(F64, 0x1.0p-767);
5860
5861 auto ZeroInt = B.buildConstant(S32, 0);
5862 auto Scaling = B.buildFCmp(FCmpInst::FCMP_OLT, S1, X, ScaleConstant);
5863
5864 // Scale up input if it is too small.
5865 auto ScaleUpFactor = B.buildConstant(S32, 256);
5866 auto ScaleUp = B.buildSelect(S32, Scaling, ScaleUpFactor, ZeroInt);
5867 auto SqrtX = B.buildFLdexp(F64, X, ScaleUp, Flags);
5868
5869 auto SqrtY =
5870 B.buildIntrinsic(Intrinsic::amdgcn_rsq, {F64}).addReg(SqrtX.getReg(0));
5871
5872 auto Half = B.buildFConstant(F64, 0.5);
5873 auto SqrtH0 = B.buildFMul(F64, SqrtY, Half);
5874 auto SqrtS0 = B.buildFMul(F64, SqrtX, SqrtY);
5875
5876 auto NegSqrtH0 = B.buildFNeg(F64, SqrtH0);
5877 auto SqrtR0 = B.buildFMA(F64, NegSqrtH0, SqrtS0, Half);
5878
5879 auto SqrtS1 = B.buildFMA(F64, SqrtS0, SqrtR0, SqrtS0);
5880 auto SqrtH1 = B.buildFMA(F64, SqrtH0, SqrtR0, SqrtH0);
5881
5882 auto NegSqrtS1 = B.buildFNeg(F64, SqrtS1);
5883 auto SqrtD0 = B.buildFMA(F64, NegSqrtS1, SqrtS1, SqrtX);
5884
5885 auto SqrtS2 = B.buildFMA(F64, SqrtD0, SqrtH1, SqrtS1);
5886
5887 auto NegSqrtS2 = B.buildFNeg(F64, SqrtS2);
5888 auto SqrtD1 = B.buildFMA(F64, NegSqrtS2, SqrtS2, SqrtX);
5889
5890 auto SqrtRet = B.buildFMA(F64, SqrtD1, SqrtH1, SqrtS2);
5891
5892 // Scale down the result.
5893 auto ScaleDownFactor = B.buildConstant(S32, -128);
5894 auto ScaleDown = B.buildSelect(S32, Scaling, ScaleDownFactor, ZeroInt);
5895 SqrtRet = B.buildFLdexp(F64, SqrtRet, ScaleDown, Flags);
5896
5897 // TODO: Switch to fcmp oeq 0 for finite only. Can't fully remove this check
5898 // with finite only or nsz because rsq(+/-0) = +/-inf
5899
5900 // TODO: Check for DAZ and expand to subnormals
5901 auto IsZeroOrInf = B.buildIsFPClass(LLT::scalar(1), SqrtX, fcZero | fcPosInf);
5902
5903 // If x is +INF, +0, or -0, use its original value
5904 B.buildSelect(Dst, IsZeroOrInf, SqrtX, SqrtRet, Flags);
5905
5906 MI.eraseFromParent();
5907 return true;
5908}
5909
5912 MachineIRBuilder &B) const {
5913 LLT Ty = MRI.getType(MI.getOperand(0).getReg());
5914 if (Ty == LLT::scalar(32))
5915 return legalizeFSQRTF32(MI, MRI, B);
5916 if (Ty == LLT::scalar(64))
5917 return legalizeFSQRTF64(MI, MRI, B);
5918 if (Ty == LLT::scalar(16))
5919 return legalizeFSQRTF16(MI, MRI, B);
5920 return false;
5921}
5922
5923// Expand llvm.amdgcn.rsq.clamp on targets that don't support the instruction.
5924// FIXME: Why do we handle this one but not other removed instructions?
5925//
5926// Reciprocal square root. The clamp prevents infinite results, clamping
5927// infinities to max_float. D.f = 1.0 / sqrt(S0.f), result clamped to
5928// +-max_float.
5931 MachineIRBuilder &B) const {
5932 if (ST.getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS)
5933 return true;
5934
5935 Register Dst = MI.getOperand(0).getReg();
5936 Register Src = MI.getOperand(2).getReg();
5937 auto Flags = MI.getFlags();
5938
5939 LLT Ty = MRI.getType(Dst);
5940
5941 const fltSemantics *FltSemantics;
5942 if (Ty == LLT::scalar(32))
5943 FltSemantics = &APFloat::IEEEsingle();
5944 else if (Ty == LLT::scalar(64))
5945 FltSemantics = &APFloat::IEEEdouble();
5946 else
5947 return false;
5948
5949 auto Rsq = B.buildIntrinsic(Intrinsic::amdgcn_rsq, {Ty})
5950 .addUse(Src)
5951 .setMIFlags(Flags);
5952
5953 // We don't need to concern ourselves with the snan handling difference, since
5954 // the rsq quieted (or not) so use the one which will directly select.
5955 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
5956 const bool UseIEEE = MFI->getMode().IEEE;
5957
5958 auto MaxFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics));
5959 auto ClampMax = UseIEEE ? B.buildFMinNumIEEE(Ty, Rsq, MaxFlt, Flags) :
5960 B.buildFMinNum(Ty, Rsq, MaxFlt, Flags);
5961
5962 auto MinFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics, true));
5963
5964 if (UseIEEE)
5965 B.buildFMaxNumIEEE(Dst, ClampMax, MinFlt, Flags);
5966 else
5967 B.buildFMaxNum(Dst, ClampMax, MinFlt, Flags);
5968 MI.eraseFromParent();
5969 return true;
5970}
5971
5972// TODO: Fix pointer type handling
5975 Intrinsic::ID IID) const {
5976
5977 MachineIRBuilder &B = Helper.MIRBuilder;
5978 MachineRegisterInfo &MRI = *B.getMRI();
5979
5980 bool IsPermLane16 = IID == Intrinsic::amdgcn_permlane16 ||
5981 IID == Intrinsic::amdgcn_permlanex16;
5982 bool IsSetInactive = IID == Intrinsic::amdgcn_set_inactive ||
5983 IID == Intrinsic::amdgcn_set_inactive_chain_arg;
5984
5985 auto createLaneOp = [&IID, &B, &MI](Register Src0, Register Src1,
5986 Register Src2, LLT VT) -> Register {
5987 auto LaneOp = B.buildIntrinsic(IID, {VT}).addUse(Src0);
5988 switch (IID) {
5989 case Intrinsic::amdgcn_readfirstlane:
5990 case Intrinsic::amdgcn_permlane64:
5991 return LaneOp.getReg(0);
5992 case Intrinsic::amdgcn_readlane:
5993 case Intrinsic::amdgcn_set_inactive:
5994 case Intrinsic::amdgcn_set_inactive_chain_arg:
5995 return LaneOp.addUse(Src1).getReg(0);
5996 case Intrinsic::amdgcn_writelane:
5997 return LaneOp.addUse(Src1).addUse(Src2).getReg(0);
5998 case Intrinsic::amdgcn_permlane16:
5999 case Intrinsic::amdgcn_permlanex16: {
6000 Register Src3 = MI.getOperand(5).getReg();
6001 int64_t Src4 = MI.getOperand(6).getImm();
6002 int64_t Src5 = MI.getOperand(7).getImm();
6003 return LaneOp.addUse(Src1)
6004 .addUse(Src2)
6005 .addUse(Src3)
6006 .addImm(Src4)
6007 .addImm(Src5)
6008 .getReg(0);
6009 }
6010 case Intrinsic::amdgcn_mov_dpp8:
6011 return LaneOp.addImm(MI.getOperand(3).getImm()).getReg(0);
6012 case Intrinsic::amdgcn_update_dpp:
6013 return LaneOp.addUse(Src1)
6014 .addImm(MI.getOperand(4).getImm())
6015 .addImm(MI.getOperand(5).getImm())
6016 .addImm(MI.getOperand(6).getImm())
6017 .addImm(MI.getOperand(7).getImm())
6018 .getReg(0);
6019 default:
6020 llvm_unreachable("unhandled lane op");
6021 }
6022 };
6023
6024 Register DstReg = MI.getOperand(0).getReg();
6025 Register Src0 = MI.getOperand(2).getReg();
6026 Register Src1, Src2;
6027 if (IID == Intrinsic::amdgcn_readlane || IID == Intrinsic::amdgcn_writelane ||
6028 IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16) {
6029 Src1 = MI.getOperand(3).getReg();
6030 if (IID == Intrinsic::amdgcn_writelane || IsPermLane16) {
6031 Src2 = MI.getOperand(4).getReg();
6032 }
6033 }
6034
6035 LLT Ty = MRI.getType(DstReg);
6036 unsigned Size = Ty.getSizeInBits();
6037
6038 unsigned SplitSize = 32;
6039 if (IID == Intrinsic::amdgcn_update_dpp && (Size % 64 == 0) &&
6040 ST.hasDPALU_DPP() &&
6041 AMDGPU::isLegalDPALU_DPPControl(ST, MI.getOperand(4).getImm()))
6042 SplitSize = 64;
6043
6044 if (Size == SplitSize) {
6045 // Already legal
6046 return true;
6047 }
6048
6049 if (Size < 32) {
6050 Src0 = B.buildAnyExt(S32, Src0).getReg(0);
6051
6052 if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16)
6053 Src1 = B.buildAnyExt(LLT::scalar(32), Src1).getReg(0);
6054
6055 if (IID == Intrinsic::amdgcn_writelane)
6056 Src2 = B.buildAnyExt(LLT::scalar(32), Src2).getReg(0);
6057
6058 Register LaneOpDst = createLaneOp(Src0, Src1, Src2, S32);
6059 B.buildTrunc(DstReg, LaneOpDst);
6060 MI.eraseFromParent();
6061 return true;
6062 }
6063
6064 if (Size % SplitSize != 0)
6065 return false;
6066
6067 LLT PartialResTy = LLT::scalar(SplitSize);
6068 bool NeedsBitcast = false;
6069 if (Ty.isVector()) {
6070 LLT EltTy = Ty.getElementType();
6071 unsigned EltSize = EltTy.getSizeInBits();
6072 if (EltSize == SplitSize) {
6073 PartialResTy = EltTy;
6074 } else if (EltSize == 16 || EltSize == 32) {
6075 unsigned NElem = SplitSize / EltSize;
6076 PartialResTy = Ty.changeElementCount(ElementCount::getFixed(NElem));
6077 } else {
6078 // Handle all other cases via S32/S64 pieces
6079 NeedsBitcast = true;
6080 }
6081 }
6082
6083 SmallVector<Register, 4> PartialRes;
6084 unsigned NumParts = Size / SplitSize;
6085 MachineInstrBuilder Src0Parts = B.buildUnmerge(PartialResTy, Src0);
6086 MachineInstrBuilder Src1Parts, Src2Parts;
6087
6088 if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16)
6089 Src1Parts = B.buildUnmerge(PartialResTy, Src1);
6090
6091 if (IID == Intrinsic::amdgcn_writelane)
6092 Src2Parts = B.buildUnmerge(PartialResTy, Src2);
6093
6094 for (unsigned i = 0; i < NumParts; ++i) {
6095 Src0 = Src0Parts.getReg(i);
6096
6097 if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16)
6098 Src1 = Src1Parts.getReg(i);
6099
6100 if (IID == Intrinsic::amdgcn_writelane)
6101 Src2 = Src2Parts.getReg(i);
6102
6103 PartialRes.push_back(createLaneOp(Src0, Src1, Src2, PartialResTy));
6104 }
6105
6106 if (NeedsBitcast)
6107 B.buildBitcast(DstReg, B.buildMergeLikeInstr(
6108 LLT::scalar(Ty.getSizeInBits()), PartialRes));
6109 else
6110 B.buildMergeLikeInstr(DstReg, PartialRes);
6111
6112 MI.eraseFromParent();
6113 return true;
6114}
6115
6118 MachineIRBuilder &B) const {
6120 ST.getTargetLowering()->getImplicitParameterOffset(
6122 LLT DstTy = MRI.getType(DstReg);
6123 LLT IdxTy = LLT::scalar(DstTy.getSizeInBits());
6124
6125 Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy);
6126 if (!loadInputValue(KernargPtrReg, B,
6128 return false;
6129
6130 B.buildObjectPtrOffset(DstReg, KernargPtrReg,
6131 B.buildConstant(IdxTy, Offset).getReg(0));
6132 return true;
6133}
6134
6135/// To create a buffer resource from a 64-bit pointer, mask off the upper 32
6136/// bits of the pointer and replace them with the stride argument, then
6137/// merge_values everything together. In the common case of a raw buffer (the
6138/// stride component is 0), we can just AND off the upper half.
6141 Register Result = MI.getOperand(0).getReg();
6142 Register Pointer = MI.getOperand(2).getReg();
6143 Register Stride = MI.getOperand(3).getReg();
6144 Register NumRecords = MI.getOperand(4).getReg();
6145 Register Flags = MI.getOperand(5).getReg();
6146
6147 LLT S32 = LLT::scalar(32);
6148 LLT S64 = LLT::scalar(64);
6149
6150 B.setInsertPt(B.getMBB(), ++B.getInsertPt());
6151
6152 auto ExtStride = B.buildAnyExt(S32, Stride);
6153
6154 if (ST.has45BitNumRecordsBufferResource()) {
6155 Register Zero = B.buildConstant(S32, 0).getReg(0);
6156 // Build the lower 64-bit value, which has a 57-bit base and the lower 7-bit
6157 // num_records.
6158 LLT PtrIntTy = LLT::scalar(MRI.getType(Pointer).getSizeInBits());
6159 auto PointerInt = B.buildPtrToInt(PtrIntTy, Pointer);
6160 auto ExtPointer = B.buildAnyExtOrTrunc(S64, PointerInt);
6161 auto NumRecordsLHS = B.buildShl(S64, NumRecords, B.buildConstant(S32, 57));
6162 Register LowHalf = B.buildOr(S64, ExtPointer, NumRecordsLHS).getReg(0);
6163
6164 // Build the higher 64-bit value, which has the higher 38-bit num_records,
6165 // 6-bit zero (omit), 16-bit stride and scale and 4-bit flag.
6166 auto NumRecordsRHS = B.buildLShr(S64, NumRecords, B.buildConstant(S32, 7));
6167 auto ShiftedStride = B.buildShl(S32, ExtStride, B.buildConstant(S32, 12));
6168 auto ExtShiftedStride =
6169 B.buildMergeValues(S64, {Zero, ShiftedStride.getReg(0)});
6170 auto ShiftedFlags = B.buildShl(S32, Flags, B.buildConstant(S32, 28));
6171 auto ExtShiftedFlags =
6172 B.buildMergeValues(S64, {Zero, ShiftedFlags.getReg(0)});
6173 auto CombinedFields = B.buildOr(S64, NumRecordsRHS, ExtShiftedStride);
6174 Register HighHalf =
6175 B.buildOr(S64, CombinedFields, ExtShiftedFlags).getReg(0);
6176 B.buildMergeValues(Result, {LowHalf, HighHalf});
6177 } else {
6178 NumRecords = B.buildTrunc(S32, NumRecords).getReg(0);
6179 auto Unmerge = B.buildUnmerge(S32, Pointer);
6180 auto LowHalf = Unmerge.getReg(0);
6181 auto HighHalf = Unmerge.getReg(1);
6182
6183 auto AndMask = B.buildConstant(S32, 0x0000ffff);
6184 auto Masked = B.buildAnd(S32, HighHalf, AndMask);
6185 auto ShiftConst = B.buildConstant(S32, 16);
6186 auto ShiftedStride = B.buildShl(S32, ExtStride, ShiftConst);
6187 auto NewHighHalf = B.buildOr(S32, Masked, ShiftedStride);
6188 Register NewHighHalfReg = NewHighHalf.getReg(0);
6189 B.buildMergeValues(Result, {LowHalf, NewHighHalfReg, NumRecords, Flags});
6190 }
6191
6192 MI.eraseFromParent();
6193 return true;
6194}
6195
6198 MachineIRBuilder &B) const {
6199 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
6200 if (!MFI->isEntryFunction()) {
6203 }
6204
6205 Register DstReg = MI.getOperand(0).getReg();
6206 if (!getImplicitArgPtr(DstReg, MRI, B))
6207 return false;
6208
6209 MI.eraseFromParent();
6210 return true;
6211}
6212
6215 MachineIRBuilder &B) const {
6216 Function &F = B.getMF().getFunction();
6217 std::optional<uint32_t> KnownSize =
6219 if (KnownSize.has_value())
6220 B.buildConstant(DstReg, *KnownSize);
6221 return false;
6222}
6223
6226 MachineIRBuilder &B) const {
6227
6228 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
6229 if (!MFI->isEntryFunction()) {
6232 }
6233
6234 Register DstReg = MI.getOperand(0).getReg();
6235 if (!getLDSKernelId(DstReg, MRI, B))
6236 return false;
6237
6238 MI.eraseFromParent();
6239 return true;
6240}
6241
6245 unsigned AddrSpace) const {
6246 const LLT S32 = LLT::scalar(32);
6247 auto Unmerge = B.buildUnmerge(S32, MI.getOperand(2).getReg());
6248 Register Hi32 = Unmerge.getReg(1);
6249
6250 if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS &&
6251 ST.hasGloballyAddressableScratch()) {
6252 Register FlatScratchBaseHi =
6253 B.buildInstr(AMDGPU::S_MOV_B32, {S32},
6254 {Register(AMDGPU::SRC_FLAT_SCRATCH_BASE_HI)})
6255 .getReg(0);
6256 MRI.setRegClass(FlatScratchBaseHi, &AMDGPU::SReg_32RegClass);
6257 // Test bits 63..58 against the aperture address.
6258 Register XOR = B.buildXor(S32, Hi32, FlatScratchBaseHi).getReg(0);
6259 B.buildICmp(ICmpInst::ICMP_ULT, MI.getOperand(0), XOR,
6260 B.buildConstant(S32, 1u << 26));
6261 } else {
6262 Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B);
6263 B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg);
6264 }
6265 MI.eraseFromParent();
6266 return true;
6267}
6268
6269// The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args:
6270// offset (the offset that is included in bounds checking and swizzling, to be
6271// split between the instruction's voffset and immoffset fields) and soffset
6272// (the offset that is excluded from bounds checking and swizzling, to go in
6273// the instruction's soffset field). This function takes the first kind of
6274// offset and figures out how to split it between voffset and immoffset.
6275std::pair<Register, unsigned>
6277 Register OrigOffset) const {
6278 const unsigned MaxImm = SIInstrInfo::getMaxMUBUFImmOffset(ST);
6279 Register BaseReg;
6280 unsigned ImmOffset;
6281 const LLT S32 = LLT::scalar(32);
6282 MachineRegisterInfo &MRI = *B.getMRI();
6283
6284 // On GFX1250+, voffset and immoffset are zero-extended from 32 bits before
6285 // being added, so we can only safely match a 32-bit addition with no unsigned
6286 // overflow.
6287 bool CheckNUW = ST.hasGFX1250Insts();
6288 std::tie(BaseReg, ImmOffset) = AMDGPU::getBaseWithConstantOffset(
6289 MRI, OrigOffset, /*KnownBits=*/nullptr, CheckNUW);
6290
6291 // If BaseReg is a pointer, convert it to int.
6292 if (MRI.getType(BaseReg).isPointer())
6293 BaseReg = B.buildPtrToInt(MRI.getType(OrigOffset), BaseReg).getReg(0);
6294
6295 // If the immediate value is too big for the immoffset field, put only bits
6296 // that would normally fit in the immoffset field. The remaining value that
6297 // is copied/added for the voffset field is a large power of 2, and it
6298 // stands more chance of being CSEd with the copy/add for another similar
6299 // load/store.
6300 // However, do not do that rounding down if that is a negative
6301 // number, as it appears to be illegal to have a negative offset in the
6302 // vgpr, even if adding the immediate offset makes it positive.
6303 unsigned Overflow = ImmOffset & ~MaxImm;
6304 ImmOffset -= Overflow;
6305 if ((int32_t)Overflow < 0) {
6306 Overflow += ImmOffset;
6307 ImmOffset = 0;
6308 }
6309
6310 if (Overflow != 0) {
6311 if (!BaseReg) {
6312 BaseReg = B.buildConstant(S32, Overflow).getReg(0);
6313 } else {
6314 auto OverflowVal = B.buildConstant(S32, Overflow);
6315 BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0);
6316 }
6317 }
6318
6319 if (!BaseReg)
6320 BaseReg = B.buildConstant(S32, 0).getReg(0);
6321
6322 return std::pair(BaseReg, ImmOffset);
6323}
6324
6325/// Handle register layout difference for f16 images for some subtargets.
6328 Register Reg,
6329 bool ImageStore) const {
6330 const LLT S16 = LLT::scalar(16);
6331 const LLT S32 = LLT::scalar(32);
6332 LLT StoreVT = MRI.getType(Reg);
6333 assert(StoreVT.isVector() && StoreVT.getElementType() == S16);
6334
6335 if (ST.hasUnpackedD16VMem()) {
6336 auto Unmerge = B.buildUnmerge(S16, Reg);
6337
6338 SmallVector<Register, 4> WideRegs;
6339 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
6340 WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0));
6341
6342 int NumElts = StoreVT.getNumElements();
6343
6344 return B.buildBuildVector(LLT::fixed_vector(NumElts, S32), WideRegs)
6345 .getReg(0);
6346 }
6347
6348 if (ImageStore && ST.hasImageStoreD16Bug()) {
6349 if (StoreVT.getNumElements() == 2) {
6350 SmallVector<Register, 4> PackedRegs;
6351 Reg = B.buildBitcast(S32, Reg).getReg(0);
6352 PackedRegs.push_back(Reg);
6353 PackedRegs.resize(2, B.buildUndef(S32).getReg(0));
6354 return B.buildBuildVector(LLT::fixed_vector(2, S32), PackedRegs)
6355 .getReg(0);
6356 }
6357
6358 if (StoreVT.getNumElements() == 3) {
6359 SmallVector<Register, 4> PackedRegs;
6360 auto Unmerge = B.buildUnmerge(S16, Reg);
6361 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
6362 PackedRegs.push_back(Unmerge.getReg(I));
6363 PackedRegs.resize(6, B.buildUndef(S16).getReg(0));
6364 Reg = B.buildBuildVector(LLT::fixed_vector(6, S16), PackedRegs).getReg(0);
6365 return B.buildBitcast(LLT::fixed_vector(3, S32), Reg).getReg(0);
6366 }
6367
6368 if (StoreVT.getNumElements() == 4) {
6369 SmallVector<Register, 4> PackedRegs;
6370 Reg = B.buildBitcast(LLT::fixed_vector(2, S32), Reg).getReg(0);
6371 auto Unmerge = B.buildUnmerge(S32, Reg);
6372 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
6373 PackedRegs.push_back(Unmerge.getReg(I));
6374 PackedRegs.resize(4, B.buildUndef(S32).getReg(0));
6375 return B.buildBuildVector(LLT::fixed_vector(4, S32), PackedRegs)
6376 .getReg(0);
6377 }
6378
6379 llvm_unreachable("invalid data type");
6380 }
6381
6382 if (StoreVT == LLT::fixed_vector(3, S16)) {
6383 Reg = B.buildPadVectorWithUndefElements(LLT::fixed_vector(4, S16), Reg)
6384 .getReg(0);
6385 }
6386 return Reg;
6387}
6388
6390 Register VData, LLT MemTy,
6391 bool IsFormat) const {
6392 MachineRegisterInfo *MRI = B.getMRI();
6393 LLT Ty = MRI->getType(VData);
6394
6395 const LLT S16 = LLT::scalar(16);
6396
6397 // Fixup buffer resources themselves needing to be v4i128.
6399 return castBufferRsrcToV4I32(VData, B);
6400
6401 if (shouldBitcastLoadStoreType(ST, Ty, MemTy)) {
6402 Ty = getBitcastRegisterType(Ty);
6403 VData = B.buildBitcast(Ty, VData).getReg(0);
6404 }
6405 // Fixup illegal register types for i8 stores.
6406 if (Ty == LLT::scalar(8) || Ty == S16) {
6407 Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0);
6408 return AnyExt;
6409 }
6410
6411 if (Ty.isVector()) {
6412 if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) {
6413 if (IsFormat)
6414 return handleD16VData(B, *MRI, VData);
6415 }
6416 }
6417
6418 return VData;
6419}
6420
6422 LegalizerHelper &Helper,
6423 bool IsTyped,
6424 bool IsFormat) const {
6425 MachineIRBuilder &B = Helper.MIRBuilder;
6426 MachineRegisterInfo &MRI = *B.getMRI();
6427
6428 Register VData = MI.getOperand(1).getReg();
6429 LLT Ty = MRI.getType(VData);
6430 LLT EltTy = Ty.getScalarType();
6431 const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
6432 const LLT S32 = LLT::scalar(32);
6433
6434 MachineMemOperand *MMO = *MI.memoperands_begin();
6435 const int MemSize = MMO->getSize().getValue();
6436 LLT MemTy = MMO->getMemoryType();
6437
6438 VData = fixStoreSourceType(B, VData, MemTy, IsFormat);
6439
6441 Register RSrc = MI.getOperand(2).getReg();
6442
6443 unsigned ImmOffset;
6444
6445 // The typed intrinsics add an immediate after the registers.
6446 const unsigned NumVIndexOps = IsTyped ? 8 : 7;
6447
6448 // The struct intrinsic variants add one additional operand over raw.
6449 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
6450 Register VIndex;
6451 int OpOffset = 0;
6452 if (HasVIndex) {
6453 VIndex = MI.getOperand(3).getReg();
6454 OpOffset = 1;
6455 } else {
6456 VIndex = B.buildConstant(S32, 0).getReg(0);
6457 }
6458
6459 Register VOffset = MI.getOperand(3 + OpOffset).getReg();
6460 Register SOffset = MI.getOperand(4 + OpOffset).getReg();
6461
6462 unsigned Format = 0;
6463 if (IsTyped) {
6464 Format = MI.getOperand(5 + OpOffset).getImm();
6465 ++OpOffset;
6466 }
6467
6468 unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
6469
6470 std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset);
6471
6472 unsigned Opc;
6473 if (IsTyped) {
6474 Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16 :
6475 AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT;
6476 } else if (IsFormat) {
6477 Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16 :
6478 AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT;
6479 } else {
6480 switch (MemSize) {
6481 case 1:
6482 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE;
6483 break;
6484 case 2:
6485 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT;
6486 break;
6487 default:
6488 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE;
6489 break;
6490 }
6491 }
6492
6493 auto MIB = B.buildInstr(Opc)
6494 .addUse(VData) // vdata
6495 .addUse(RSrc) // rsrc
6496 .addUse(VIndex) // vindex
6497 .addUse(VOffset) // voffset
6498 .addUse(SOffset) // soffset
6499 .addImm(ImmOffset); // offset(imm)
6500
6501 if (IsTyped)
6502 MIB.addImm(Format);
6503
6504 MIB.addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm)
6505 .addImm(HasVIndex ? -1 : 0) // idxen(imm)
6506 .addMemOperand(MMO);
6507
6508 MI.eraseFromParent();
6509 return true;
6510}
6511
6512static void buildBufferLoad(unsigned Opc, Register LoadDstReg, Register RSrc,
6513 Register VIndex, Register VOffset, Register SOffset,
6514 unsigned ImmOffset, unsigned Format,
6515 unsigned AuxiliaryData, MachineMemOperand *MMO,
6516 bool IsTyped, bool HasVIndex, MachineIRBuilder &B) {
6517 auto MIB = B.buildInstr(Opc)
6518 .addDef(LoadDstReg) // vdata
6519 .addUse(RSrc) // rsrc
6520 .addUse(VIndex) // vindex
6521 .addUse(VOffset) // voffset
6522 .addUse(SOffset) // soffset
6523 .addImm(ImmOffset); // offset(imm)
6524
6525 if (IsTyped)
6526 MIB.addImm(Format);
6527
6528 MIB.addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm)
6529 .addImm(HasVIndex ? -1 : 0) // idxen(imm)
6530 .addMemOperand(MMO);
6531}
6532
6534 LegalizerHelper &Helper,
6535 bool IsFormat,
6536 bool IsTyped) const {
6537 MachineIRBuilder &B = Helper.MIRBuilder;
6538 MachineRegisterInfo &MRI = *B.getMRI();
6539 GISelChangeObserver &Observer = Helper.Observer;
6540
6541 // FIXME: Verifier should enforce 1 MMO for these intrinsics.
6542 MachineMemOperand *MMO = *MI.memoperands_begin();
6543 const LLT MemTy = MMO->getMemoryType();
6544 const LLT S32 = LLT::scalar(32);
6545
6546 Register Dst = MI.getOperand(0).getReg();
6547
6548 Register StatusDst;
6549 int OpOffset = 0;
6550 assert(MI.getNumExplicitDefs() == 1 || MI.getNumExplicitDefs() == 2);
6551 bool IsTFE = MI.getNumExplicitDefs() == 2;
6552 if (IsTFE) {
6553 StatusDst = MI.getOperand(1).getReg();
6554 ++OpOffset;
6555 }
6556
6557 castBufferRsrcArgToV4I32(MI, B, 2 + OpOffset);
6558 Register RSrc = MI.getOperand(2 + OpOffset).getReg();
6559
6560 // The typed intrinsics add an immediate after the registers.
6561 const unsigned NumVIndexOps = IsTyped ? 8 : 7;
6562
6563 // The struct intrinsic variants add one additional operand over raw.
6564 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps + OpOffset;
6565 Register VIndex;
6566 if (HasVIndex) {
6567 VIndex = MI.getOperand(3 + OpOffset).getReg();
6568 ++OpOffset;
6569 } else {
6570 VIndex = B.buildConstant(S32, 0).getReg(0);
6571 }
6572
6573 Register VOffset = MI.getOperand(3 + OpOffset).getReg();
6574 Register SOffset = MI.getOperand(4 + OpOffset).getReg();
6575
6576 unsigned Format = 0;
6577 if (IsTyped) {
6578 Format = MI.getOperand(5 + OpOffset).getImm();
6579 ++OpOffset;
6580 }
6581
6582 unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
6583 unsigned ImmOffset;
6584
6585 LLT Ty = MRI.getType(Dst);
6586 // Make addrspace 8 pointers loads into 4xs32 loads here, so the rest of the
6587 // logic doesn't have to handle that case.
6588 if (hasBufferRsrcWorkaround(Ty)) {
6589 Observer.changingInstr(MI);
6590 Ty = castBufferRsrcFromV4I32(MI, B, MRI, 0);
6591 Observer.changedInstr(MI);
6592 Dst = MI.getOperand(0).getReg();
6593 B.setInsertPt(B.getMBB(), MI);
6594 }
6595 if (shouldBitcastLoadStoreType(ST, Ty, MemTy)) {
6596 Ty = getBitcastRegisterType(Ty);
6597 Observer.changingInstr(MI);
6598 Helper.bitcastDst(MI, Ty, 0);
6599 Observer.changedInstr(MI);
6600 Dst = MI.getOperand(0).getReg();
6601 B.setInsertPt(B.getMBB(), MI);
6602 }
6603
6604 LLT EltTy = Ty.getScalarType();
6605 const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
6606 const bool Unpacked = ST.hasUnpackedD16VMem();
6607
6608 std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset);
6609
6610 unsigned Opc;
6611
6612 // TODO: Support TFE for typed and narrow loads.
6613 if (IsTyped) {
6614 if (IsTFE)
6615 return false;
6616 Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 :
6617 AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT;
6618 } else if (IsFormat) {
6619 if (IsD16) {
6620 if (IsTFE)
6621 return false;
6622 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16;
6623 } else {
6624 Opc = IsTFE ? AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_TFE
6625 : AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT;
6626 }
6627 } else {
6628 switch (MemTy.getSizeInBits()) {
6629 case 8:
6630 Opc = IsTFE ? AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE_TFE
6631 : AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE;
6632 break;
6633 case 16:
6634 Opc = IsTFE ? AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT_TFE
6635 : AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT;
6636 break;
6637 default:
6638 Opc = IsTFE ? AMDGPU::G_AMDGPU_BUFFER_LOAD_TFE
6639 : AMDGPU::G_AMDGPU_BUFFER_LOAD;
6640 break;
6641 }
6642 }
6643
6644 if (IsTFE) {
6645 unsigned NumValueDWords = divideCeil(Ty.getSizeInBits(), 32);
6646 unsigned NumLoadDWords = NumValueDWords + 1;
6647 LLT LoadTy = LLT::fixed_vector(NumLoadDWords, S32);
6648 Register LoadDstReg = B.getMRI()->createGenericVirtualRegister(LoadTy);
6649 buildBufferLoad(Opc, LoadDstReg, RSrc, VIndex, VOffset, SOffset, ImmOffset,
6650 Format, AuxiliaryData, MMO, IsTyped, HasVIndex, B);
6651 if (MemTy.getSizeInBits() < 32) {
6652 Register ExtDst = B.getMRI()->createGenericVirtualRegister(S32);
6653 B.buildUnmerge({ExtDst, StatusDst}, LoadDstReg);
6654 B.buildTrunc(Dst, ExtDst);
6655 } else if (NumValueDWords == 1) {
6656 B.buildUnmerge({Dst, StatusDst}, LoadDstReg);
6657 } else {
6658 SmallVector<Register, 5> LoadElts;
6659 for (unsigned I = 0; I != NumValueDWords; ++I)
6660 LoadElts.push_back(B.getMRI()->createGenericVirtualRegister(S32));
6661 LoadElts.push_back(StatusDst);
6662 B.buildUnmerge(LoadElts, LoadDstReg);
6663 LoadElts.truncate(NumValueDWords);
6664 B.buildMergeLikeInstr(Dst, LoadElts);
6665 }
6666 } else if ((!IsD16 && MemTy.getSizeInBits() < 32) ||
6667 (IsD16 && !Ty.isVector())) {
6668 Register LoadDstReg = B.getMRI()->createGenericVirtualRegister(S32);
6669 buildBufferLoad(Opc, LoadDstReg, RSrc, VIndex, VOffset, SOffset, ImmOffset,
6670 Format, AuxiliaryData, MMO, IsTyped, HasVIndex, B);
6671 B.setInsertPt(B.getMBB(), ++B.getInsertPt());
6672 B.buildTrunc(Dst, LoadDstReg);
6673 } else if (Unpacked && IsD16 && Ty.isVector()) {
6674 LLT UnpackedTy = Ty.changeElementSize(32);
6675 Register LoadDstReg = B.getMRI()->createGenericVirtualRegister(UnpackedTy);
6676 buildBufferLoad(Opc, LoadDstReg, RSrc, VIndex, VOffset, SOffset, ImmOffset,
6677 Format, AuxiliaryData, MMO, IsTyped, HasVIndex, B);
6678 B.setInsertPt(B.getMBB(), ++B.getInsertPt());
6679 // FIXME: G_TRUNC should work, but legalization currently fails
6680 auto Unmerge = B.buildUnmerge(S32, LoadDstReg);
6682 for (unsigned I = 0, N = Unmerge->getNumOperands() - 1; I != N; ++I)
6683 Repack.push_back(B.buildTrunc(EltTy, Unmerge.getReg(I)).getReg(0));
6684 B.buildMergeLikeInstr(Dst, Repack);
6685 } else {
6686 buildBufferLoad(Opc, Dst, RSrc, VIndex, VOffset, SOffset, ImmOffset, Format,
6687 AuxiliaryData, MMO, IsTyped, HasVIndex, B);
6688 }
6689
6690 MI.eraseFromParent();
6691 return true;
6692}
6693
// Map a raw/struct buffer atomic intrinsic to its target
// G_AMDGPU_BUFFER_ATOMIC_* pseudo opcode. All four spellings of each
// operation (raw/struct x register-resource/ptr-resource) share one pseudo.
static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) {
  switch (IntrID) {
  case Intrinsic::amdgcn_raw_buffer_atomic_swap:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_swap:
  case Intrinsic::amdgcn_struct_buffer_atomic_swap:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_swap:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP;
  case Intrinsic::amdgcn_raw_buffer_atomic_add:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_add:
  case Intrinsic::amdgcn_struct_buffer_atomic_add:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_add:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD;
  case Intrinsic::amdgcn_raw_buffer_atomic_sub:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub:
  case Intrinsic::amdgcn_struct_buffer_atomic_sub:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB;
  case Intrinsic::amdgcn_raw_buffer_atomic_smin:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smin:
  case Intrinsic::amdgcn_struct_buffer_atomic_smin:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smin:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN;
  case Intrinsic::amdgcn_raw_buffer_atomic_umin:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umin:
  case Intrinsic::amdgcn_struct_buffer_atomic_umin:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umin:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN;
  case Intrinsic::amdgcn_raw_buffer_atomic_smax:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smax:
  case Intrinsic::amdgcn_struct_buffer_atomic_smax:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smax:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX;
  case Intrinsic::amdgcn_raw_buffer_atomic_umax:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umax:
  case Intrinsic::amdgcn_struct_buffer_atomic_umax:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umax:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX;
  case Intrinsic::amdgcn_raw_buffer_atomic_and:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_and:
  case Intrinsic::amdgcn_struct_buffer_atomic_and:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_and:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND;
  case Intrinsic::amdgcn_raw_buffer_atomic_or:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_or:
  case Intrinsic::amdgcn_struct_buffer_atomic_or:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_or:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR;
  case Intrinsic::amdgcn_raw_buffer_atomic_xor:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_xor:
  case Intrinsic::amdgcn_struct_buffer_atomic_xor:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_xor:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR;
  case Intrinsic::amdgcn_raw_buffer_atomic_inc:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_inc:
  case Intrinsic::amdgcn_struct_buffer_atomic_inc:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_inc:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC;
  case Intrinsic::amdgcn_raw_buffer_atomic_dec:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_dec:
  case Intrinsic::amdgcn_struct_buffer_atomic_dec:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_dec:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC;
  case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap:
  case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cmpswap:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP;
  case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fadd:
  case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fadd:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD;
  case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmin:
  case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmin:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN;
  case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmax:
  case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmax:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX;
  case Intrinsic::amdgcn_raw_buffer_atomic_sub_clamp_u32:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub_clamp_u32:
  case Intrinsic::amdgcn_struct_buffer_atomic_sub_clamp_u32:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub_clamp_u32:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB_CLAMP_U32;
  case Intrinsic::amdgcn_raw_buffer_atomic_cond_sub_u32:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cond_sub_u32:
  case Intrinsic::amdgcn_struct_buffer_atomic_cond_sub_u32:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cond_sub_u32:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_COND_SUB_U32;
  default:
    llvm_unreachable("unhandled atomic opcode");
  }
}
6790
// Lower a raw/struct buffer atomic intrinsic into the generic
// G_AMDGPU_BUFFER_ATOMIC_* pseudo selected by getBufferAtomicPseudo(IID),
// splitting the immediate portion out of voffset via splitBufferOffsets.
// NOTE(review): the HTML extraction dropped source lines 6791-6792 (the
// start of this method's signature, presumably
// `bool AMDGPULegalizerInfo::legalizeBufferAtomic(MachineInstr &MI,
// MachineIRBuilder &B,`) -- restore from upstream before compiling.
6793                                               Intrinsic::ID IID) const {
  // cmpswap variants carry one extra source operand (the compare value),
  // which shifts every later operand index by one (OpOffset below).
6794  const bool IsCmpSwap =
6795      IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap ||
6796      IID == Intrinsic::amdgcn_struct_buffer_atomic_cmpswap ||
6797      IID == Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap ||
6798      IID == Intrinsic::amdgcn_struct_ptr_buffer_atomic_cmpswap;
6799
6800  Register Dst = MI.getOperand(0).getReg();
6801  // Since we don't have 128-bit atomics, we don't need to handle the case of
6802  // p8 arguments to the atomic itself
6803  Register VData = MI.getOperand(2).getReg();
6804
6805  Register CmpVal;
6806  int OpOffset = 0;
6807
6808  if (IsCmpSwap) {
6809    CmpVal = MI.getOperand(3).getReg();
6810    ++OpOffset;
6811  }
6812
  // Rewrite a p8 rsrc operand to <4 x s32> form expected by selection.
6813  castBufferRsrcArgToV4I32(MI, B, 3 + OpOffset);
6814  Register RSrc = MI.getOperand(3 + OpOffset).getReg();
6815  const unsigned NumVIndexOps = IsCmpSwap ? 9 : 8;
6816
6817  // The struct intrinsic variants add one additional operand over raw.
6818  const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
6819  Register VIndex;
6820  if (HasVIndex) {
6821    VIndex = MI.getOperand(4 + OpOffset).getReg();
6822    ++OpOffset;
6823  } else {
    // Raw variants have no vindex; use a constant 0 so the pseudo always
    // has the same operand layout.
6824    VIndex = B.buildConstant(LLT::scalar(32), 0).getReg(0);
6825  }
6826
6827  Register VOffset = MI.getOperand(4 + OpOffset).getReg();
6828  Register SOffset = MI.getOperand(5 + OpOffset).getReg();
6829  unsigned AuxiliaryData = MI.getOperand(6 + OpOffset).getImm();
6830
6831  MachineMemOperand *MMO = *MI.memoperands_begin();
6832
6833  unsigned ImmOffset;
  // Fold as much of voffset as possible into the instruction's immediate
  // offset field; the remainder stays in the VOffset register.
6834  std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset);
6835
6836  auto MIB = B.buildInstr(getBufferAtomicPseudo(IID))
6837                 .addDef(Dst)
6838                 .addUse(VData); // vdata
6839
6840  if (IsCmpSwap)
6841    MIB.addReg(CmpVal);
6842
6843  MIB.addUse(RSrc)               // rsrc
6844      .addUse(VIndex)            // vindex
6845      .addUse(VOffset)           // voffset
6846      .addUse(SOffset)           // soffset
6847      .addImm(ImmOffset)         // offset(imm)
6848      .addImm(AuxiliaryData)     // cachepolicy, swizzled buffer(imm)
6849      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
6850      .addMemOperand(MMO);
6851
6852  MI.eraseFromParent();
6853  return true;
6854}
6855
6856/// Turn a set of s16 typed registers in \p AddrRegs into a dword sized
6857/// vector with s16 typed elements.
// NOTE(review): the HTML extraction dropped source line 6858 (the start of
// this function's signature, presumably
// `static void packImage16bitOpsToDwords(MachineIRBuilder &B, MachineInstr &MI,`)
// and line 6861 (presumably `const AMDGPU::ImageDimIntrinsicInfo *Intr,`) --
// restore from upstream before compiling.
6859                                      SmallVectorImpl<Register> &PackedAddrs,
6860                                      unsigned ArgOffset,
6862                                      bool IsA16, bool IsG16) {
6863  const LLT S16 = LLT::scalar(16);
6864  const LLT V2S16 = LLT::fixed_vector(2, 16);
6865  auto EndIdx = Intr->VAddrEnd;
6866
  // Walk every vaddr operand; each iteration emits exactly one packed dword
  // register (pairing with the following operand where possible).
6867  for (unsigned I = Intr->VAddrStart; I < EndIdx; I++) {
6868    MachineOperand &SrcOp = MI.getOperand(ArgOffset + I);
6869    if (!SrcOp.isReg())
6870      continue; // _L to _LZ may have eliminated this.
6871
6872    Register AddrReg = SrcOp.getReg();
6873
    // First branch: operands that are (or must remain) full 32-bit --
    // anything before the gradients, gradients when !IsG16, coords when
    // !IsA16.
6874    if ((I < Intr->GradientStart) ||
6875        (I >= Intr->GradientStart && I < Intr->CoordStart && !IsG16) ||
6876        (I >= Intr->CoordStart && !IsA16)) {
6877      if ((I < Intr->GradientStart) && IsA16 &&
6878          (B.getMRI()->getType(AddrReg) == S16)) {
6879        assert(I == Intr->BiasIndex && "Got unexpected 16-bit extra argument");
6880        // Special handling of bias when A16 is on. Bias is of type half but
6881        // occupies full 32-bit.
6882        PackedAddrs.push_back(
6883            B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)})
6884                .getReg(0));
6885      } else {
6886        assert((!IsA16 || Intr->NumBiasArgs == 0 || I != Intr->BiasIndex) &&
6887               "Bias needs to be converted to 16 bit in A16 mode");
6888        // Handle any gradient or coordinate operands that should not be packed
6889        AddrReg = B.buildBitcast(V2S16, AddrReg).getReg(0);
6890        PackedAddrs.push_back(AddrReg);
6891      }
6892    } else {
6893      // Dz/dh, dz/dv and the last odd coord are packed with undef. Also, in 1D,
6894      // derivatives dx/dh and dx/dv are packed with undef.
6895      if (((I + 1) >= EndIdx) ||
6896          ((Intr->NumGradients / 2) % 2 == 1 &&
6897           (I == static_cast<unsigned>(Intr->GradientStart +
6898                                       (Intr->NumGradients / 2) - 1) ||
6899            I == static_cast<unsigned>(Intr->GradientStart +
6900                                       Intr->NumGradients - 1))) ||
6901          // Check for _L to _LZ optimization
6902          !MI.getOperand(ArgOffset + I + 1).isReg()) {
6903        PackedAddrs.push_back(
6904            B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)})
6905                .getReg(0));
6906      } else {
        // Pack this s16 operand with the next one into a single dword and
        // skip the consumed operand.
6907        PackedAddrs.push_back(
6908            B.buildBuildVector(
6909                 V2S16, {AddrReg, MI.getOperand(ArgOffset + I + 1).getReg()})
6910                .getReg(0));
6911        ++I;
6912      }
6913    }
6914  }
6915}
6916
6917/// Convert from separate vaddr components to a single vector address register,
6918/// and replace the remaining operands with $noreg.
// NOTE(review): the HTML extraction dropped source line 6919 (the function
// signature, presumably
// `static void convertImageAddrToPacked(MachineIRBuilder &B, MachineInstr &MI,`)
// -- restore from upstream before compiling.
6920                                     int DimIdx, int NumVAddrs) {
6921  const LLT S32 = LLT::scalar(32);
  // S32 is only used inside the assert below; silence unused-variable
  // warnings in NDEBUG builds.
6922  (void)S32;
6923  SmallVector<Register, 8> AddrRegs;
6924  for (int I = 0; I != NumVAddrs; ++I) {
6925    MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
6926    if (SrcOp.isReg()) {
6927      AddrRegs.push_back(SrcOp.getReg());
6928      assert(B.getMRI()->getType(SrcOp.getReg()) == S32);
6929    }
6930  }
6931
6932  int NumAddrRegs = AddrRegs.size();
6933  if (NumAddrRegs != 1) {
6934    auto VAddr =
6935        B.buildBuildVector(LLT::fixed_vector(NumAddrRegs, 32), AddrRegs);
6936    MI.getOperand(DimIdx).setReg(VAddr.getReg(0));
6937  }
6938
  // The first operand now holds the whole packed address; null out the rest
  // so selection sees a fixed operand layout.
6939  for (int I = 1; I != NumVAddrs; ++I) {
6940    MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
6941    if (SrcOp.isReg())
6942      MI.getOperand(DimIdx + I).setReg(AMDGPU::NoRegister);
6943  }
6944}
6945
6946/// Rewrite image intrinsics to use register layouts expected by the subtarget.
6947///
6948/// Depending on the subtarget, load/store with 16-bit element data need to be
6949/// rewritten to use the low half of 32-bit registers, or directly use a packed
6950/// layout. 16-bit addresses should also sometimes be packed into 32-bit
6951/// registers.
6952///
6953/// We don't want to directly select image instructions just yet, but also want
6954/// to expose all register repacking to the legalizer/combiners. We also don't
6955/// want a selected instruction entering RegBankSelect. In order to avoid
6956/// defining a multitude of intermediate image instructions, directly hack on
6957/// the intrinsic's arguments. In cases like a16 addresses, this requires
6958/// padding now unnecessary arguments with $noreg.
// NOTE(review): the HTML extraction dropped source lines 6959-6960 (the start
// of this method's signature, presumably
// `bool AMDGPULegalizerInfo::legalizeImageIntrinsic(MachineInstr &MI,
// MachineIRBuilder &B, GISelChangeObserver &Observer,`) -- restore from
// upstream before compiling.
6961    const AMDGPU::ImageDimIntrinsicInfo *Intr) const {
6962
6963  const MachineFunction &MF = *MI.getMF();
6964  const unsigned NumDefs = MI.getNumExplicitDefs();
6965  const unsigned ArgOffset = NumDefs + 1;
  // Two explicit defs means the intrinsic also returns the TFE status dword.
6966  bool IsTFE = NumDefs == 2;
6967  // We are only processing the operands of d16 image operations on subtargets
6968  // that use the unpacked register layout, or need to repack the TFE result.
6969
6970  // TODO: Do we need to guard against already legalized intrinsics?
6971  const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
// NOTE(review): source line 6972 was dropped by the extraction (presumably
// `AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode);`) -- restore from
// upstream.
6973
6974  MachineRegisterInfo *MRI = B.getMRI();
6975  const LLT S32 = LLT::scalar(32);
6976  const LLT S16 = LLT::scalar(16);
6977  const LLT V2S16 = LLT::fixed_vector(2, 16);
6978
6979  unsigned DMask = 0;
6980  Register VData;
6981  LLT Ty;
6982
  // VData is the stored value (stores) or the returned value (loads/atomics
  // with a result); NoReturn ops without Store have neither.
6983  if (!BaseOpcode->NoReturn || BaseOpcode->Store) {
6984    VData = MI.getOperand(NumDefs == 0 ? 1 : 0).getReg();
6985    Ty = MRI->getType(VData);
6986  }
6987
6988  const bool IsAtomicPacked16Bit =
6989      (BaseOpcode->BaseOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_F16 ||
6990       BaseOpcode->BaseOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_BF16);
6991
6992  // Check for 16 bit addresses and pack if true.
6993  LLT GradTy =
6994      MRI->getType(MI.getOperand(ArgOffset + Intr->GradientStart).getReg());
6995  LLT AddrTy =
6996      MRI->getType(MI.getOperand(ArgOffset + Intr->CoordStart).getReg());
6997  const bool IsG16 =
6998      ST.hasG16() ? (BaseOpcode->Gradients && GradTy == S16) : GradTy == S16;
6999  const bool IsA16 = AddrTy == S16;
7000  const bool IsD16 = !IsAtomicPacked16Bit && Ty.getScalarType() == S16;
7001
7002  int DMaskLanes = 0;
7003  if (!BaseOpcode->Atomic) {
7004    DMask = MI.getOperand(ArgOffset + Intr->DMaskIndex).getImm();
7005    if (BaseOpcode->Gather4) {
7006      DMaskLanes = 4;
7007    } else if (DMask != 0) {
7008      DMaskLanes = llvm::popcount(DMask);
7009    } else if (!IsTFE && !BaseOpcode->Store) {
7010      // If dmask is 0, this is a no-op load. This can be eliminated.
7011      B.buildUndef(MI.getOperand(0));
7012      MI.eraseFromParent();
7013      return true;
7014    }
7015  }
7016
7017  Observer.changingInstr(MI);
  // Guarantee changedInstr() fires on every exit path below, including the
  // early returns.
7018  scope_exit ChangedInstr([&] { Observer.changedInstr(MI); });
7019
7020  const unsigned StoreOpcode = IsD16 ? AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE_D16
7021                                     : AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE;
7022  const unsigned LoadOpcode = IsD16 ? AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_D16
7023                                    : AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD;
7024  unsigned NewOpcode = LoadOpcode;
7025  if (BaseOpcode->Store)
7026    NewOpcode = StoreOpcode;
7027  else if (BaseOpcode->NoReturn)
7028    NewOpcode = AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_NORET;
7029
7030  // Track that we legalized this
7031  MI.setDesc(B.getTII().get(NewOpcode));
7032
7033  // Expecting to get an error flag since TFC is on - and dmask is 0 Force
7034  // dmask to be at least 1 otherwise the instruction will fail
7035  if (IsTFE && DMask == 0) {
7036    DMask = 0x1;
7037    DMaskLanes = 1;
7038    MI.getOperand(ArgOffset + Intr->DMaskIndex).setImm(DMask);
7039  }
7040
7041  if (BaseOpcode->Atomic) {
7042    Register VData0 = MI.getOperand(2).getReg();
7043    LLT Ty = MRI->getType(VData0);
7044
7045    // TODO: Allow atomic swap and bit ops for v2s16/v4s16
7046    if (Ty.isVector() && !IsAtomicPacked16Bit)
7047      return false;
7048
7049    if (BaseOpcode->AtomicX2) {
7050      Register VData1 = MI.getOperand(3).getReg();
7051      // The two values are packed in one register.
7052      LLT PackedTy = LLT::fixed_vector(2, Ty);
7053      auto Concat = B.buildBuildVector(PackedTy, {VData0, VData1});
7054      MI.getOperand(2).setReg(Concat.getReg(0));
7055      MI.getOperand(3).setReg(AMDGPU::NoRegister);
7056    }
7057  }
7058
7059  unsigned CorrectedNumVAddrs = Intr->NumVAddrs;
7060
7061  // Rewrite the addressing register layout before doing anything else.
7062  if (BaseOpcode->Gradients && !ST.hasG16() && (IsA16 != IsG16)) {
7063    // 16 bit gradients are supported, but are tied to the A16 control
7064    // so both gradients and addresses must be 16 bit
7065    return false;
7066  }
7067
7068  if (IsA16 && !ST.hasA16()) {
7069    // A16 not supported
7070    return false;
7071  }
7072
7073  const unsigned NSAMaxSize = ST.getNSAMaxSize(BaseOpcode->Sampler);
7074  const unsigned HasPartialNSA = ST.hasPartialNSAEncoding();
7075
7076  if (IsA16 || IsG16) {
7077    // Even if NumVAddrs == 1 we should pack it into a 32-bit value, because the
7078    // instructions expect VGPR_32
7079    SmallVector<Register, 4> PackedRegs;
7080
7081    packImage16bitOpsToDwords(B, MI, PackedRegs, ArgOffset, Intr, IsA16, IsG16);
7082
7083    // See also below in the non-a16 branch
7084    const bool UseNSA = ST.hasNSAEncoding() &&
7085                        PackedRegs.size() >= ST.getNSAThreshold(MF) &&
7086                        (PackedRegs.size() <= NSAMaxSize || HasPartialNSA);
7087    const bool UsePartialNSA =
7088        UseNSA && HasPartialNSA && PackedRegs.size() > NSAMaxSize;
7089
7090    if (UsePartialNSA) {
7091      // Pack registers that would go over NSAMaxSize into last VAddr register
7092      LLT PackedAddrTy =
7093          LLT::fixed_vector(2 * (PackedRegs.size() - NSAMaxSize + 1), 16);
7094      auto Concat = B.buildConcatVectors(
7095          PackedAddrTy, ArrayRef(PackedRegs).slice(NSAMaxSize - 1));
7096      PackedRegs[NSAMaxSize - 1] = Concat.getReg(0);
7097      PackedRegs.resize(NSAMaxSize);
7098    } else if (!UseNSA && PackedRegs.size() > 1) {
      // Non-NSA: collapse everything into one contiguous vector register.
7099      LLT PackedAddrTy = LLT::fixed_vector(2 * PackedRegs.size(), 16);
7100      auto Concat = B.buildConcatVectors(PackedAddrTy, PackedRegs);
7101      PackedRegs[0] = Concat.getReg(0);
7102      PackedRegs.resize(1);
7103    }
7104
    // Write the packed registers back over the original vaddr operands and
    // null out whatever remains.
7105    const unsigned NumPacked = PackedRegs.size();
7106    for (unsigned I = Intr->VAddrStart; I < Intr->VAddrEnd; I++) {
7107      MachineOperand &SrcOp = MI.getOperand(ArgOffset + I);
7108      if (!SrcOp.isReg()) {
7109        assert(SrcOp.isImm() && SrcOp.getImm() == 0);
7110        continue;
7111      }
7112
7113      assert(SrcOp.getReg() != AMDGPU::NoRegister);
7114
7115      if (I - Intr->VAddrStart < NumPacked)
7116        SrcOp.setReg(PackedRegs[I - Intr->VAddrStart]);
7117      else
7118        SrcOp.setReg(AMDGPU::NoRegister);
7119    }
7120  } else {
7121    // If the register allocator cannot place the address registers contiguously
7122    // without introducing moves, then using the non-sequential address encoding
7123    // is always preferable, since it saves VALU instructions and is usually a
7124    // wash in terms of code size or even better.
7125    //
7126    // However, we currently have no way of hinting to the register allocator
7127    // that MIMG addresses should be placed contiguously when it is possible to
7128    // do so, so force non-NSA for the common 2-address case as a heuristic.
7129    //
7130    // SIShrinkInstructions will convert NSA encodings to non-NSA after register
7131    // allocation when possible.
7132    //
7133    // Partial NSA is allowed on GFX11+ where the final register is a contiguous
7134    // set of the remaining addresses.
7135    const bool UseNSA = ST.hasNSAEncoding() &&
7136                        CorrectedNumVAddrs >= ST.getNSAThreshold(MF) &&
7137                        (CorrectedNumVAddrs <= NSAMaxSize || HasPartialNSA);
7138    const bool UsePartialNSA =
7139        UseNSA && HasPartialNSA && CorrectedNumVAddrs > NSAMaxSize;
7140
7141    if (UsePartialNSA) {
// NOTE(review): source line 7142 was dropped by the extraction (presumably
// `convertImageAddrToPacked(B, MI,`) -- restore from upstream.
7143          ArgOffset + Intr->VAddrStart + NSAMaxSize - 1,
7144          Intr->NumVAddrs - NSAMaxSize + 1);
7145    } else if (!UseNSA && Intr->NumVAddrs > 1) {
7146      convertImageAddrToPacked(B, MI, ArgOffset + Intr->VAddrStart,
7147                               Intr->NumVAddrs);
7148    }
7149  }
7150
  // Record the a16/g16 state as a trailing flags immediate for selection.
7151  int Flags = 0;
7152  if (IsA16)
7153    Flags |= 1;
7154  if (IsG16)
7155    Flags |= 2;
7156  MI.addOperand(MachineOperand::CreateImm(Flags));
7157
7158  if (BaseOpcode->NoReturn) { // No TFE for stores?
7159    // TODO: Handle dmask trim
7160    if (!Ty.isVector() || !IsD16)
7161      return true;
7162
7163    Register RepackedReg = handleD16VData(B, *MRI, VData, true);
7164    if (RepackedReg != VData) {
7165      MI.getOperand(1).setReg(RepackedReg);
7166    }
7167
7168    return true;
7169  }
7170
7171  Register DstReg = MI.getOperand(0).getReg();
7172  const LLT EltTy = Ty.getScalarType();
7173  const int NumElts = Ty.isVector() ? Ty.getNumElements() : 1;
7174
7175  // Confirm that the return type is large enough for the dmask specified
7176  if (NumElts < DMaskLanes)
7177    return false;
7178
7179  if (NumElts > 4 || DMaskLanes > 4)
7180    return false;
7181
7182  // Image atomic instructions are using DMask to specify how many bits
7183  // input/output data will have. 32-bits (s32, v2s16) or 64-bits (s64, v4s16).
7184  // DMaskLanes for image atomic has default value '0'.
7185  // We must be sure that atomic variants (especially packed) will not be
7186  // truncated from v2s16 or v4s16 to s16 type.
7187  //
7188  // ChangeElementCount will be needed for image load where Ty is always scalar.
7189  const unsigned AdjustedNumElts = DMaskLanes == 0 ? 1 : DMaskLanes;
7190  const LLT AdjustedTy =
7191      DMaskLanes == 0
7192          ? Ty
7193          : Ty.changeElementCount(ElementCount::getFixed(AdjustedNumElts));
7194
7195  // The raw dword aligned data component of the load. The only legal cases
7196  // where this matters should be when using the packed D16 format, for
7197  // s16 -> <2 x s16>, and <3 x s16> -> <4 x s16>,
7198  LLT RoundedTy;
7199
7200  // S32 vector to cover all data, plus TFE result element.
7201  LLT TFETy;
7202
7203  // Register type to use for each loaded component. Will be S32 or V2S16.
7204  LLT RegTy;
7205
7206  if (IsD16 && ST.hasUnpackedD16VMem()) {
7207    RoundedTy =
7208        LLT::scalarOrVector(ElementCount::getFixed(AdjustedNumElts), 32);
7209    TFETy = LLT::fixed_vector(AdjustedNumElts + 1, 32);
7210    RegTy = S32;
7211  } else {
7212    unsigned EltSize = EltTy.getSizeInBits();
7213    unsigned RoundedElts = (AdjustedTy.getSizeInBits() + 31) / 32;
7214    unsigned RoundedSize = 32 * RoundedElts;
7215    RoundedTy = LLT::scalarOrVector(
7216        ElementCount::getFixed(RoundedSize / EltSize), EltSize);
7217    TFETy = LLT::fixed_vector(RoundedSize / 32 + 1, S32);
7218    RegTy = !IsTFE && EltSize == 16 ? V2S16 : S32;
7219  }
7220
7221  // The return type does not need adjustment.
7222  // TODO: Should we change s16 case to s32 or <2 x s16>?
7223  if (!IsTFE && (RoundedTy == Ty || !Ty.isVector()))
7224    return true;
7225
7226  Register Dst1Reg;
7227
7228  // Insert after the instruction.
7229  B.setInsertPt(*MI.getParent(), ++MI.getIterator());
7230
7231  // TODO: For TFE with d16, if we used a TFE type that was a multiple of <2 x
7232  // s16> instead of s32, we would only need 1 bitcast instead of multiple.
7233  const LLT LoadResultTy = IsTFE ? TFETy : RoundedTy;
7234  const int ResultNumRegs = LoadResultTy.getSizeInBits() / 32;
7235
7236  Register NewResultReg = MRI->createGenericVirtualRegister(LoadResultTy);
7237
7238  MI.getOperand(0).setReg(NewResultReg);
7239
7240  // In the IR, TFE is supposed to be used with a 2 element struct return
7241  // type. The instruction really returns these two values in one contiguous
7242  // register, with one additional dword beyond the loaded data. Rewrite the
7243  // return type to use a single register result.
7244
7245  if (IsTFE) {
7246    Dst1Reg = MI.getOperand(1).getReg();
7247    if (MRI->getType(Dst1Reg) != S32)
7248      return false;
7249
7250    // TODO: Make sure the TFE operand bit is set.
7251    MI.removeOperand(1);
7252
7253    // Handle the easy case that requires no repack instructions.
7254    if (Ty == S32) {
7255      B.buildUnmerge({DstReg, Dst1Reg}, NewResultReg);
7256      return true;
7257    }
7258  }
7259
7260  // Now figure out how to copy the new result register back into the old
7261  // result.
7262  SmallVector<Register, 5> ResultRegs(ResultNumRegs, Dst1Reg);
7263
7264  const int NumDataRegs = IsTFE ? ResultNumRegs - 1 : ResultNumRegs;
7265
7266  if (ResultNumRegs == 1) {
7267    assert(!IsTFE);
7268    ResultRegs[0] = NewResultReg;
7269  } else {
7270    // We have to repack into a new vector of some kind.
7271    for (int I = 0; I != NumDataRegs; ++I)
7272      ResultRegs[I] = MRI->createGenericVirtualRegister(RegTy);
7273    B.buildUnmerge(ResultRegs, NewResultReg);
7274
7275    // Drop the final TFE element to get the data part. The TFE result is
7276    // directly written to the right place already.
7277    if (IsTFE)
7278      ResultRegs.resize(NumDataRegs);
7279  }
7280
7281  // For an s16 scalar result, we form an s32 result with a truncate regardless
7282  // of packed vs. unpacked.
7283  if (IsD16 && !Ty.isVector()) {
7284    B.buildTrunc(DstReg, ResultRegs[0]);
7285    return true;
7286  }
7287
7288  // Avoid a build/concat_vector of 1 entry.
7289  if (Ty == V2S16 && NumDataRegs == 1 && !ST.hasUnpackedD16VMem()) {
7290    B.buildBitcast(DstReg, ResultRegs[0]);
7291    return true;
7292  }
7293
7294  assert(Ty.isVector());
7295
7296  if (IsD16) {
7297    // For packed D16 results with TFE enabled, all the data components are
7298    // S32. Cast back to the expected type.
7299    //
7300    // TODO: We don't really need to use load s32 elements. We would only need one
7301    // cast for the TFE result if a multiple of v2s16 was used.
7302    if (RegTy != V2S16 && !ST.hasUnpackedD16VMem()) {
7303      for (Register &Reg : ResultRegs)
7304        Reg = B.buildBitcast(V2S16, Reg).getReg(0);
7305    } else if (ST.hasUnpackedD16VMem()) {
7306      for (Register &Reg : ResultRegs)
7307        Reg = B.buildTrunc(S16, Reg).getReg(0);
7308    }
7309  }
7310
7311  auto padWithUndef = [&](LLT Ty, int NumElts) {
7312    if (NumElts == 0)
7313      return;
7314    Register Undef = B.buildUndef(Ty).getReg(0);
7315    for (int I = 0; I != NumElts; ++I)
7316      ResultRegs.push_back(Undef);
7317  };
7318
7319  // Pad out any elements eliminated due to the dmask.
7320  LLT ResTy = MRI->getType(ResultRegs[0]);
7321  if (!ResTy.isVector()) {
7322    padWithUndef(ResTy, NumElts - ResultRegs.size());
7323    B.buildBuildVector(DstReg, ResultRegs);
7324    return true;
7325  }
7326
7327  assert(!ST.hasUnpackedD16VMem() && ResTy == V2S16);
7328  const int RegsToCover = (Ty.getSizeInBits() + 31) / 32;
7329
7330  // Deal with the one annoying legal case.
7331  const LLT V3S16 = LLT::fixed_vector(3, 16);
7332  if (Ty == V3S16) {
7333    if (IsTFE) {
7334      if (ResultRegs.size() == 1) {
7335        NewResultReg = ResultRegs[0];
7336      } else if (ResultRegs.size() == 2) {
7337        LLT V4S16 = LLT::fixed_vector(4, 16);
7338        NewResultReg = B.buildConcatVectors(V4S16, ResultRegs).getReg(0);
7339      } else {
7340        return false;
7341      }
7342    }
7343
7344    if (MRI->getType(DstReg).getNumElements() <
7345        MRI->getType(NewResultReg).getNumElements()) {
7346      B.buildDeleteTrailingVectorElements(DstReg, NewResultReg);
7347    } else {
7348      B.buildPadVectorWithUndefElements(DstReg, NewResultReg);
7349    }
7350    return true;
7351  }
7352
7353  padWithUndef(ResTy, RegsToCover - ResultRegs.size());
7354  B.buildConcatVectors(DstReg, ResultRegs);
7355  return true;
7356}
7357
// Legalize amdgcn.s.buffer.load into G_AMDGPU_S_BUFFER_LOAD* pseudos,
// attaching the memory operand the intrinsic lacks and widening non-pow2
// results. NOTE(review): the HTML extraction dropped source line 7358 (the
// start of this method's signature, presumably
// `bool AMDGPULegalizerInfo::legalizeSBufferLoad(LegalizerHelper &Helper,`)
// -- restore from upstream before compiling.
7359                                              MachineInstr &MI) const {
7360  MachineIRBuilder &B = Helper.MIRBuilder;
7361  GISelChangeObserver &Observer = Helper.Observer;
7362
7363  Register OrigDst = MI.getOperand(0).getReg();
7364  Register Dst;
7365  LLT Ty = B.getMRI()->getType(OrigDst);
7366  unsigned Size = Ty.getSizeInBits();
7367  MachineFunction &MF = B.getMF();
7368  unsigned Opc = 0;
7369  if (Size < 32 && ST.hasScalarSubwordLoads()) {
7370    assert(Size == 8 || Size == 16);
7371    Opc = Size == 8 ? AMDGPU::G_AMDGPU_S_BUFFER_LOAD_UBYTE
7372                    : AMDGPU::G_AMDGPU_S_BUFFER_LOAD_USHORT;
7373    // The 8-bit and 16-bit scalar buffer load instructions have 32-bit
7374    // destination register.
7375    Dst = B.getMRI()->createGenericVirtualRegister(LLT::scalar(32));
7376  } else {
7377    Opc = AMDGPU::G_AMDGPU_S_BUFFER_LOAD;
7378    Dst = OrigDst;
7379  }
7380
7381  Observer.changingInstr(MI);
7382
7383  // Handle needing to s.buffer.load() a p8 value.
7384  if (hasBufferRsrcWorkaround(Ty)) {
7385    Ty = castBufferRsrcFromV4I32(MI, B, *B.getMRI(), 0);
7386    B.setInsertPt(B.getMBB(), MI);
7387  }
// NOTE(review): source line 7388 was dropped by the extraction (presumably
// the guarding condition, e.g. `if (shouldBitcastLoadStoreType(ST, Ty,
// LLT::scalar(Size))) {`) -- restore from upstream.
7389    Ty = getBitcastRegisterType(Ty);
7390    Helper.bitcastDst(MI, Ty, 0);
7391    B.setInsertPt(B.getMBB(), MI);
7392  }
7393
7394  // FIXME: We don't really need this intermediate instruction. The intrinsic
7395  // should be fixed to have a memory operand. Since it's readnone, we're not
7396  // allowed to add one.
7397  MI.setDesc(B.getTII().get(Opc));
7398  MI.removeOperand(1); // Remove intrinsic ID
7399
7400  // FIXME: When intrinsic definition is fixed, this should have an MMO already.
7401  const unsigned MemSize = (Size + 7) / 8;
7402  const Align MemAlign = B.getDataLayout().getABITypeAlign(
// NOTE(review): source lines 7403-7407 were dropped by the extraction
// (presumably the getTypeForLLT(...) argument plus the
// MF.getMachineMemOperand(...) call creating `MMO`) -- restore from
// upstream.
7408      MemSize, MemAlign);
7409  MI.addMemOperand(MF, MMO);
7410  if (Dst != OrigDst) {
    // Sub-dword loads produced a fresh 32-bit Dst; truncate it back into
    // the original narrow destination after the load.
7411    MI.getOperand(0).setReg(Dst);
7412    B.setInsertPt(B.getMBB(), ++B.getInsertPt());
7413    B.buildTrunc(OrigDst, Dst);
7414  }
7415
7416  // If we don't have 96-bit result scalar loads, widening to 128-bit should
7417  // always be legal. We may need to restore this to a 96-bit result if it turns
7418  // out this needs to be converted to a vector load during RegBankSelect.
7419  if (!isPowerOf2_32(Size) && (Size != 96 || !ST.hasScalarDwordx3Loads())) {
7420    if (Ty.isVector())
// NOTE(review): source line 7421 was dropped by the extraction (presumably
// `Helper.moreElementsVectorDst(MI, getPow2VectorType(Ty), 0);`) -- restore
// from upstream.
7422    else
7423      Helper.widenScalarDst(MI, getPow2ScalarType(Ty), 0);
7424  }
7425
7426  Observer.changedInstr(MI);
7427  return true;
7428}
7429
// Legalize amdgcn.s.buffer.prefetch by retargeting the instruction to the
// G_AMDGPU_S_BUFFER_PREFETCH pseudo and dropping the intrinsic-ID operand.
// NOTE(review): the HTML extraction dropped the start of this method's
// signature (presumably
// `bool AMDGPULegalizerInfo::legalizeSBufferPrefetch(LegalizerHelper &Helper,`)
// -- restore from upstream before compiling.
7431                                                  MachineInstr &MI) const {
7432  MachineIRBuilder &B = Helper.MIRBuilder;
7433  GISelChangeObserver &Observer = Helper.Observer;
7434  Observer.changingInstr(MI);
7435  MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_S_BUFFER_PREFETCH));
7436  MI.removeOperand(0); // Remove intrinsic ID
// NOTE(review): source line 7437 was dropped by the extraction (presumably
// `castBufferRsrcArgToV4I32(MI, B, 0);`) -- restore from upstream.
7438  Observer.changedInstr(MI);
7439  return true;
7440}
7441
7442// TODO: Move to selection
// Dispatch trap legalization: no HSA trap handler -> plain endpgm lowering;
// otherwise pick the doorbell-ID or queue-pointer HSA path.
// NOTE(review): the HTML extraction dropped source lines 7443-7444 (the
// start of this method's signature, presumably
// `bool AMDGPULegalizerInfo::legalizeTrapIntrinsic(MachineInstr &MI,
// MachineRegisterInfo &MRI,`) -- restore from upstream before compiling.
7445                                                MachineIRBuilder &B) const {
7446  if (!ST.hasTrapHandler() ||
7447      ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA)
7448    return legalizeTrapEndpgm(MI, MRI, B);
7449
7450  return ST.supportsGetDoorbellID() ?
// NOTE(review): source line 7451 was dropped by the extraction (presumably
// `legalizeTrapHsa(MI, MRI, B) : legalizeTrapHsaQueuePtr(MI, MRI, B);`) --
// restore from upstream.
7452}
7453
// Lower a trap to S_ENDPGM on targets without an HSA trap handler. If the
// trap is already at the end of a successor-less block, just append
// S_ENDPGM; otherwise split the block and branch to a new trap block so the
// endpgm is a proper terminator. NOTE(review): the HTML extraction dropped
// source lines 7454-7455 (the start of this method's signature, presumably
// `bool AMDGPULegalizerInfo::legalizeTrapEndpgm(MachineInstr &MI,
// MachineRegisterInfo &MRI, MachineIRBuilder &B) const {`) -- restore from
// upstream before compiling.
7456  const DebugLoc &DL = MI.getDebugLoc();
7457  MachineBasicBlock &BB = B.getMBB();
7458  MachineFunction *MF = BB.getParent();
7459
7460  if (BB.succ_empty() && std::next(MI.getIterator()) == BB.end()) {
7461    BuildMI(BB, BB.end(), DL, B.getTII().get(AMDGPU::S_ENDPGM))
7462      .addImm(0);
7463    MI.eraseFromParent();
7464    return true;
7465  }
7466
7467  // We need a block split to make the real endpgm a terminator. We also don't
7468  // want to break phis in successor blocks, so we can't just delete to the
7469  // end of the block.
7470  BB.splitAt(MI, false /*UpdateLiveIns*/);
// NOTE(review): source line 7471 was dropped by the extraction (presumably
// `MachineBasicBlock *TrapBB = MF->CreateMachineBasicBlock();`) -- restore
// from upstream.
7472  MF->push_back(TrapBB);
7473  BuildMI(*TrapBB, TrapBB->end(), DL, B.getTII().get(AMDGPU::S_ENDPGM))
7474    .addImm(0);
7475  BuildMI(BB, &MI, DL, B.getTII().get(AMDGPU::S_CBRANCH_EXECNZ))
7476    .addMBB(TrapBB);
7477
7478  BB.addSuccessor(TrapBB);
7479  MI.eraseFromParent();
7480  return true;
7481}
7482
// HSA trap lowering for targets without getDoorbellID support: place the
// queue pointer in SGPR0_SGPR1 and issue S_TRAP with the HSA trap ID.
// NOTE(review): the HTML extraction dropped source lines 7483-7484 (the
// start of this method's signature, presumably
// `bool AMDGPULegalizerInfo::legalizeTrapHsaQueuePtr(MachineInstr &MI,
// MachineRegisterInfo &MRI, MachineIRBuilder &B) const {`) -- restore from
// upstream before compiling.
7485  MachineFunction &MF = B.getMF();
7486  const LLT S64 = LLT::scalar(64);
7487
7488  Register SGPR01(AMDGPU::SGPR0_SGPR1);
7489  // For code object version 5, queue_ptr is passed through implicit kernarg.
// NOTE(review): source lines 7490-7494 were dropped by the extraction
// (presumably the code-object-version check opening this branch and the
// declarations of `Param`/`Offset` used below) -- restore from upstream.
7495        ST.getTargetLowering()->getImplicitParameterOffset(B.getMF(), Param);
7496
7497    Register KernargPtrReg = MRI.createGenericVirtualRegister(
// NOTE(review): source line 7498 (presumably the pointer-type argument
// closing this call) was dropped by the extraction -- restore from upstream.
7499
7500    if (!loadInputValue(KernargPtrReg, B,
// NOTE(review): source line 7501 (presumably the
// `AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR))` argument) was dropped by
// the extraction -- restore from upstream.
7502      return false;
7503
7504    // TODO: can we be smarter about machine pointer info?
// NOTE(review): source lines 7505-7506 (presumably the MachinePointerInfo
// setup and the start of the getMachineMemOperand call producing `MMO`)
// were dropped by the extraction -- restore from upstream.
7507        PtrInfo.getWithOffset(Offset),
// NOTE(review): source lines 7508-7510 (presumably the MMO flags/size/align
// arguments) were dropped by the extraction -- restore from upstream.
7511
7512    // Pointer address
7513    Register LoadAddr = MRI.createGenericVirtualRegister(
// NOTE(review): source line 7514 (presumably the pointer-type argument
// closing this call) was dropped by the extraction -- restore from upstream.
7515    B.buildObjectPtrOffset(LoadAddr, KernargPtrReg,
7516                           B.buildConstant(LLT::scalar(64), Offset).getReg(0));
7517    // Load address
7518    Register Temp = B.buildLoad(S64, LoadAddr, *MMO).getReg(0);
7519    B.buildCopy(SGPR01, Temp);
7520    B.buildInstr(AMDGPU::S_TRAP)
7521        .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap))
7522        .addReg(SGPR01, RegState::Implicit);
7523    MI.eraseFromParent();
7524    return true;
7525  }
7526
7527  // Pass queue pointer to trap handler as input, and insert trap instruction
7528  // Reference: https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi
7529  Register LiveIn =
7530    MRI.createGenericVirtualRegister(LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
// NOTE(review): source line 7531 (presumably the
// `if (!loadInputValue(LiveIn, B, AMDGPUFunctionArgInfo::QUEUE_PTR))`
// condition) was dropped by the extraction -- restore from upstream.
7532    return false;
7533
7534  B.buildCopy(SGPR01, LiveIn);
7535  B.buildInstr(AMDGPU::S_TRAP)
7536      .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap))
7537      .addReg(SGPR01, RegState::Implicit);
7538
7539  MI.eraseFromParent();
7540  return true;
7541}
7542
// HSA trap lowering for targets with doorbell-ID support: a bare S_TRAP
// with the HSA trap ID, or a simulated trap sequence on subtargets with the
// PRIV=1 s_trap-as-nop bug. NOTE(review): the HTML extraction dropped
// source lines 7543-7544 (the start of this method's signature, presumably
// `bool AMDGPULegalizerInfo::legalizeTrapHsa(MachineInstr &MI,
// MachineRegisterInfo &MRI,`) -- restore from upstream before compiling.
7545                                          MachineIRBuilder &B) const {
7546  // We need to simulate the 's_trap 2' instruction on targets that run in
7547  // PRIV=1 (where it is treated as a nop).
7548  if (ST.hasPrivEnabledTrap2NopBug()) {
7549    ST.getInstrInfo()->insertSimulatedTrap(MRI, B.getMBB(), MI,
7550                                           MI.getDebugLoc());
7551    MI.eraseFromParent();
7552    return true;
7553  }
7554
7555  B.buildInstr(AMDGPU::S_TRAP)
7556      .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap));
7557  MI.eraseFromParent();
7558  return true;
7559}
7560
// Lower llvm.debugtrap: emit S_TRAP with the HSA debug-trap ID when an HSA
// trap handler is available; otherwise only warn and drop the instruction.
// NOTE(review): the HTML extraction dropped source lines 7561-7562 (the
// start of this method's signature, presumably
// `bool AMDGPULegalizerInfo::legalizeDebugTrap(MachineInstr &MI,
// MachineRegisterInfo &MRI,`) -- restore from upstream before compiling.
7563                                              MachineIRBuilder &B) const {
7564  // Is non-HSA path or trap-handler disabled? Then, report a warning
7565  // accordingly
7566  if (!ST.hasTrapHandler() ||
7567      ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA) {
7568    Function &Fn = B.getMF().getFunction();
// NOTE(review): source line 7569 was dropped by the extraction (presumably
// `Fn.getContext().diagnose(DiagnosticInfoUnsupported(`) -- restore from
// upstream.
7570        Fn, "debugtrap handler not supported", MI.getDebugLoc(), DS_Warning));
7571  } else {
7572    // Insert debug-trap instruction
7573    B.buildInstr(AMDGPU::S_TRAP)
7574        .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSADebugTrap));
7575  }
7576
7577  MI.eraseFromParent();
7578  return true;
7579}
7580
// Lower amdgcn.image.bvh.intersect.ray into G_AMDGPU_BVH_INTERSECT_RAY,
// choosing the MIMG opcode/encoding for the subtarget and repacking the
// node pointer, ray origin, direction and inverse-direction operands into
// the vaddr layout (NSA vs. single-vector, a16 vs. full precision).
// NOTE(review): the HTML extraction dropped source line 7581 (the start of
// this method's signature, presumably
// `bool AMDGPULegalizerInfo::legalizeBVHIntrinsic(`) -- restore from
// upstream before compiling.
7582    MachineInstr &MI, MachineIRBuilder &B) const {
7583  MachineRegisterInfo &MRI = *B.getMRI();
7584  const LLT S16 = LLT::scalar(16);
7585  const LLT S32 = LLT::scalar(32);
7586  const LLT V2S16 = LLT::fixed_vector(2, 16);
7587  const LLT V3S32 = LLT::fixed_vector(3, 32);
7588
7589  Register DstReg = MI.getOperand(0).getReg();
7590  Register NodePtr = MI.getOperand(2).getReg();
7591  Register RayExtent = MI.getOperand(3).getReg();
7592  Register RayOrigin = MI.getOperand(4).getReg();
7593  Register RayDir = MI.getOperand(5).getReg();
7594  Register RayInvDir = MI.getOperand(6).getReg();
7595  Register TDescr = MI.getOperand(7).getReg();
7596
7597  if (!ST.hasGFX10_AEncoding()) {
7598    Function &Fn = B.getMF().getFunction();
// NOTE(review): source line 7599 was dropped by the extraction (presumably
// `Fn.getContext().diagnose(DiagnosticInfoUnsupported(`) -- restore from
// upstream.
7600        Fn, "intrinsic not supported on subtarget", MI.getDebugLoc()));
7601    return false;
7602  }
7603
7604  const bool IsGFX11 = AMDGPU::isGFX11(ST);
7605  const bool IsGFX11Plus = AMDGPU::isGFX11Plus(ST);
7606  const bool IsGFX12Plus = AMDGPU::isGFX12Plus(ST);
7607  const bool IsA16 = MRI.getType(RayDir).getElementType().getSizeInBits() == 16;
7608  const bool Is64 = MRI.getType(NodePtr).getSizeInBits() == 64;
7609  const unsigned NumVDataDwords = 4;
7610  const unsigned NumVAddrDwords = IsA16 ? (Is64 ? 9 : 8) : (Is64 ? 12 : 11);
7611  const unsigned NumVAddrs = IsGFX11Plus ? (IsA16 ? 4 : 5) : NumVAddrDwords;
7612  const bool UseNSA =
7613      IsGFX12Plus || (ST.hasNSAEncoding() && NumVAddrs <= ST.getNSAMaxSize());
7614
  // Indexed as BaseOpcodes[Is64][IsA16].
7615  const unsigned BaseOpcodes[2][2] = {
7616      {AMDGPU::IMAGE_BVH_INTERSECT_RAY, AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16},
7617      {AMDGPU::IMAGE_BVH64_INTERSECT_RAY,
7618       AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16}};
7619  int Opcode;
7620  if (UseNSA) {
7621    Opcode = AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16],
7622                                   IsGFX12Plus ? AMDGPU::MIMGEncGfx12
7623                                   : IsGFX11   ? AMDGPU::MIMGEncGfx11NSA
7624                                               : AMDGPU::MIMGEncGfx10NSA,
7625                                   NumVDataDwords, NumVAddrDwords);
7626  } else {
7627    assert(!IsGFX12Plus);
7628    Opcode = AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16],
7629                                   IsGFX11 ? AMDGPU::MIMGEncGfx11Default
7630                                           : AMDGPU::MIMGEncGfx10Default,
7631                                   NumVDataDwords, NumVAddrDwords);
7632  }
7633  assert(Opcode != -1);
7634
// NOTE(review): source line 7635 was dropped by the extraction (presumably
// the declaration of the operand list, e.g.
// `SmallVector<Register, 12> Ops;`) -- restore from upstream.
7636  if (UseNSA && IsGFX11Plus) {
    // GFX11+ NSA form: each xyz triple is carried as one v3s32 operand.
7637    auto packLanes = [&Ops, &S32, &V3S32, &B](Register Src) {
7638      auto Unmerge = B.buildUnmerge({S32, S32, S32}, Src);
7639      auto Merged = B.buildMergeLikeInstr(
7640          V3S32, {Unmerge.getReg(0), Unmerge.getReg(1), Unmerge.getReg(2)});
7641      Ops.push_back(Merged.getReg(0));
7642    };
7643
7644    Ops.push_back(NodePtr);
7645    Ops.push_back(RayExtent);
7646    packLanes(RayOrigin);
7647
7648    if (IsA16) {
      // a16: interleave inv_dir/dir halves, one dword per component pair.
7649      auto UnmergeRayDir = B.buildUnmerge({S16, S16, S16}, RayDir);
7650      auto UnmergeRayInvDir = B.buildUnmerge({S16, S16, S16}, RayInvDir);
7651      auto MergedDir = B.buildMergeLikeInstr(
7652          V3S32,
7653          {B.buildBitcast(
7654                S32, B.buildMergeLikeInstr(V2S16, {UnmergeRayInvDir.getReg(0),
7655                                                   UnmergeRayDir.getReg(0)}))
7656               .getReg(0),
7657           B.buildBitcast(
7658                S32, B.buildMergeLikeInstr(V2S16, {UnmergeRayInvDir.getReg(1),
7659                                                   UnmergeRayDir.getReg(1)}))
7660               .getReg(0),
7661           B.buildBitcast(
7662                S32, B.buildMergeLikeInstr(V2S16, {UnmergeRayInvDir.getReg(2),
7663                                                   UnmergeRayDir.getReg(2)}))
7664               .getReg(0)});
7665      Ops.push_back(MergedDir.getReg(0));
7666    } else {
7667      packLanes(RayDir);
7668      packLanes(RayInvDir);
7669    }
7670  } else {
    // Pre-GFX11 / non-NSA form: operands are individual s32 dwords.
7671    if (Is64) {
7672      auto Unmerge = B.buildUnmerge({S32, S32}, NodePtr);
7673      Ops.push_back(Unmerge.getReg(0));
7674      Ops.push_back(Unmerge.getReg(1));
7675    } else {
7676      Ops.push_back(NodePtr);
7677    }
7678    Ops.push_back(RayExtent);
7679
7680    auto packLanes = [&Ops, &S32, &B](Register Src) {
7681      auto Unmerge = B.buildUnmerge({S32, S32, S32}, Src);
7682      Ops.push_back(Unmerge.getReg(0));
7683      Ops.push_back(Unmerge.getReg(1));
7684      Ops.push_back(Unmerge.getReg(2));
7685    };
7686
7687    packLanes(RayOrigin);
7688    if (IsA16) {
      // a16: six s16 components of dir/inv_dir are packed into three dwords.
7689      auto UnmergeRayDir = B.buildUnmerge({S16, S16, S16}, RayDir);
7690      auto UnmergeRayInvDir = B.buildUnmerge({S16, S16, S16}, RayInvDir);
7691      Register R1 = MRI.createGenericVirtualRegister(S32);
7692      Register R2 = MRI.createGenericVirtualRegister(S32);
7693      Register R3 = MRI.createGenericVirtualRegister(S32);
7694      B.buildMergeLikeInstr(R1,
7695                            {UnmergeRayDir.getReg(0), UnmergeRayDir.getReg(1)});
7696      B.buildMergeLikeInstr(
7697          R2, {UnmergeRayDir.getReg(2), UnmergeRayInvDir.getReg(0)});
7698      B.buildMergeLikeInstr(
7699          R3, {UnmergeRayInvDir.getReg(1), UnmergeRayInvDir.getReg(2)});
7700      Ops.push_back(R1);
7701      Ops.push_back(R2);
7702      Ops.push_back(R3);
7703    } else {
7704      packLanes(RayDir);
7705      packLanes(RayInvDir);
7706    }
7707  }
7708
7709  if (!UseNSA) {
7710    // Build a single vector containing all the operands so far prepared.
7711    LLT OpTy = LLT::fixed_vector(Ops.size(), 32);
7712    Register MergedOps = B.buildMergeLikeInstr(OpTy, Ops).getReg(0);
7713    Ops.clear();
7714    Ops.push_back(MergedOps);
7715  }
7716
7717  auto MIB = B.buildInstr(AMDGPU::G_AMDGPU_BVH_INTERSECT_RAY)
7718                 .addDef(DstReg)
7719                 .addImm(Opcode);
7720
7721  for (Register R : Ops) {
7722    MIB.addUse(R);
7723  }
7724
7725  MIB.addUse(TDescr)
7726      .addImm(IsA16 ? 1 : 0)
7727      .cloneMemRefs(MI);
7728
7729  MI.eraseFromParent();
7730  return true;
7731}
7732
7734 MachineInstr &MI, MachineIRBuilder &B) const {
7735 const LLT S32 = LLT::scalar(32);
7736 const LLT V2S32 = LLT::fixed_vector(2, 32);
7737
7738 Register DstReg = MI.getOperand(0).getReg();
7739 Register DstOrigin = MI.getOperand(1).getReg();
7740 Register DstDir = MI.getOperand(2).getReg();
7741 Register NodePtr = MI.getOperand(4).getReg();
7742 Register RayExtent = MI.getOperand(5).getReg();
7743 Register InstanceMask = MI.getOperand(6).getReg();
7744 Register RayOrigin = MI.getOperand(7).getReg();
7745 Register RayDir = MI.getOperand(8).getReg();
7746 Register Offsets = MI.getOperand(9).getReg();
7747 Register TDescr = MI.getOperand(10).getReg();
7748
7749 if (!ST.hasBVHDualAndBVH8Insts()) {
7750 Function &Fn = B.getMF().getFunction();
7752 Fn, "intrinsic not supported on subtarget", MI.getDebugLoc()));
7753 return false;
7754 }
7755
7756 bool IsBVH8 = cast<GIntrinsic>(MI).getIntrinsicID() ==
7757 Intrinsic::amdgcn_image_bvh8_intersect_ray;
7758 const unsigned NumVDataDwords = 10;
7759 const unsigned NumVAddrDwords = IsBVH8 ? 11 : 12;
7760 int Opcode = AMDGPU::getMIMGOpcode(
7761 IsBVH8 ? AMDGPU::IMAGE_BVH8_INTERSECT_RAY
7762 : AMDGPU::IMAGE_BVH_DUAL_INTERSECT_RAY,
7763 AMDGPU::MIMGEncGfx12, NumVDataDwords, NumVAddrDwords);
7764 assert(Opcode != -1);
7765
7766 auto RayExtentInstanceMaskVec = B.buildMergeLikeInstr(
7767 V2S32, {RayExtent, B.buildAnyExt(S32, InstanceMask)});
7768
7769 B.buildInstr(IsBVH8 ? AMDGPU::G_AMDGPU_BVH8_INTERSECT_RAY
7770 : AMDGPU::G_AMDGPU_BVH_DUAL_INTERSECT_RAY)
7771 .addDef(DstReg)
7772 .addDef(DstOrigin)
7773 .addDef(DstDir)
7774 .addImm(Opcode)
7775 .addUse(NodePtr)
7776 .addUse(RayExtentInstanceMaskVec.getReg(0))
7777 .addUse(RayOrigin)
7778 .addUse(RayDir)
7779 .addUse(Offsets)
7780 .addUse(TDescr)
7781 .cloneMemRefs(MI);
7782
7783 MI.eraseFromParent();
7784 return true;
7785}
7786
7788 MachineIRBuilder &B) const {
7789 const SITargetLowering *TLI = ST.getTargetLowering();
7791 Register DstReg = MI.getOperand(0).getReg();
7792 B.buildInstr(AMDGPU::G_AMDGPU_WAVE_ADDRESS, {DstReg}, {StackPtr});
7793 MI.eraseFromParent();
7794 return true;
7795}
7796
7798 MachineIRBuilder &B) const {
7799 // With architected SGPRs, waveIDinGroup is in TTMP8[29:25].
7800 if (!ST.hasArchitectedSGPRs())
7801 return false;
7802 LLT S32 = LLT::scalar(32);
7803 Register DstReg = MI.getOperand(0).getReg();
7804 auto TTMP8 = B.buildCopy(S32, Register(AMDGPU::TTMP8));
7805 auto LSB = B.buildConstant(S32, 25);
7806 auto Width = B.buildConstant(S32, 5);
7807 B.buildUbfx(DstReg, TTMP8, LSB, Width);
7808 MI.eraseFromParent();
7809 return true;
7810}
7811
7814 AMDGPU::Hwreg::Id HwReg,
7815 unsigned LowBit,
7816 unsigned Width) const {
7817 MachineRegisterInfo &MRI = *B.getMRI();
7818 Register DstReg = MI.getOperand(0).getReg();
7819 if (!MRI.getRegClassOrNull(DstReg))
7820 MRI.setRegClass(DstReg, &AMDGPU::SReg_32RegClass);
7821 B.buildInstr(AMDGPU::S_GETREG_B32_const)
7822 .addDef(DstReg)
7823 .addImm(AMDGPU::Hwreg::HwregEncoding::encode(HwReg, LowBit, Width));
7824 MI.eraseFromParent();
7825 return true;
7826}
7827
7828static constexpr unsigned FPEnvModeBitField =
7830
7831static constexpr unsigned FPEnvTrapBitField =
7833
7836 MachineIRBuilder &B) const {
7837 Register Src = MI.getOperand(0).getReg();
7838 if (MRI.getType(Src) != S64)
7839 return false;
7840
7841 auto ModeReg =
7842 B.buildIntrinsic(Intrinsic::amdgcn_s_getreg, {S32},
7843 /*HasSideEffects=*/true, /*isConvergent=*/false)
7844 .addImm(FPEnvModeBitField);
7845 auto TrapReg =
7846 B.buildIntrinsic(Intrinsic::amdgcn_s_getreg, {S32},
7847 /*HasSideEffects=*/true, /*isConvergent=*/false)
7848 .addImm(FPEnvTrapBitField);
7849 B.buildMergeLikeInstr(Src, {ModeReg, TrapReg});
7850 MI.eraseFromParent();
7851 return true;
7852}
7853
7856 MachineIRBuilder &B) const {
7857 Register Src = MI.getOperand(0).getReg();
7858 if (MRI.getType(Src) != S64)
7859 return false;
7860
7861 auto Unmerge = B.buildUnmerge({S32, S32}, MI.getOperand(0));
7862 B.buildIntrinsic(Intrinsic::amdgcn_s_setreg, ArrayRef<DstOp>(),
7863 /*HasSideEffects=*/true, /*isConvergent=*/false)
7864 .addImm(static_cast<int16_t>(FPEnvModeBitField))
7865 .addReg(Unmerge.getReg(0));
7866 B.buildIntrinsic(Intrinsic::amdgcn_s_setreg, ArrayRef<DstOp>(),
7867 /*HasSideEffects=*/true, /*isConvergent=*/false)
7868 .addImm(static_cast<int16_t>(FPEnvTrapBitField))
7869 .addReg(Unmerge.getReg(1));
7870 MI.eraseFromParent();
7871 return true;
7872}
7873
// NOTE(review): interior of AMDGPULegalizerInfo::legalizeIntrinsic as rendered
// by a doc/HTML extraction. The opening signature line and many
// return-expression / argument lines were elided by the scrape, and each line
// still carries its original source line number. Code tokens are preserved
// verbatim below; only comments have been added. Consult upstream before
// relying on any line in this span.
7875 MachineInstr &MI) const {
7876 MachineIRBuilder &B = Helper.MIRBuilder;
7877 MachineRegisterInfo &MRI = *B.getMRI();
7878
7879 // Replace the use G_BRCOND with the exec manipulate and branch pseudos.
7880 auto IntrID = cast<GIntrinsic>(MI).getIntrinsicID();
7881 switch (IntrID) {
// sponentry: either materialize via G_AMDGPU_SPONENTRY (bottom of stack) or
// point at a fixed frame object.
7882 case Intrinsic::sponentry:
7883 if (B.getMF().getInfo<SIMachineFunctionInfo>()->isBottomOfStack()) {
7884 // FIXME: The imported pattern checks for i32 instead of p5; if we fix
7885 // that we can remove this cast.
7886 const LLT S32 = LLT::scalar(32);
7887 Register TmpReg = MRI.createGenericVirtualRegister(S32);
7888 B.buildInstr(AMDGPU::G_AMDGPU_SPONENTRY).addDef(TmpReg);
7889
7890 Register DstReg = MI.getOperand(0).getReg();
7891 B.buildIntToPtr(DstReg, TmpReg);
7892 MI.eraseFromParent();
7893 } else {
7894 int FI = B.getMF().getFrameInfo().CreateFixedObject(
7895 1, 0, /*IsImmutable=*/false);
7896 B.buildFrameIndex(MI.getOperand(0), FI);
7897 MI.eraseFromParent();
7898 }
7899 return true;
// amdgcn.if/else: replace the paired G_BRCOND with SI_IF / SI_ELSE and
// constrain the mask registers to the wave mask class.
7900 case Intrinsic::amdgcn_if:
7901 case Intrinsic::amdgcn_else: {
7902 MachineInstr *Br = nullptr;
7903 MachineBasicBlock *UncondBrTarget = nullptr;
7904 bool Negated = false;
7905 if (MachineInstr *BrCond =
7906 verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) {
7907 const SIRegisterInfo *TRI
7908 = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
7909
7910 Register Def = MI.getOperand(1).getReg();
7911 Register Use = MI.getOperand(3).getReg();
7912
7913 MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
7914
7915 if (Negated)
7916 std::swap(CondBrTarget, UncondBrTarget);
7917
7918 B.setInsertPt(B.getMBB(), BrCond->getIterator());
7919 if (IntrID == Intrinsic::amdgcn_if) {
7920 B.buildInstr(AMDGPU::SI_IF)
7921 .addDef(Def)
7922 .addUse(Use)
7923 .addMBB(UncondBrTarget);
7924 } else {
7925 B.buildInstr(AMDGPU::SI_ELSE)
7926 .addDef(Def)
7927 .addUse(Use)
7928 .addMBB(UncondBrTarget);
7929 }
7930
7931 if (Br) {
7932 Br->getOperand(0).setMBB(CondBrTarget);
7933 } else {
7934 // The IRTranslator skips inserting the G_BR for fallthrough cases, but
7935 // since we're swapping branch targets it needs to be reinserted.
7936 // FIXME: IRTranslator should probably not do this
7937 B.buildBr(*CondBrTarget);
7938 }
7939
7940 MRI.setRegClass(Def, TRI->getWaveMaskRegClass());
7941 MRI.setRegClass(Use, TRI->getWaveMaskRegClass());
7942 MI.eraseFromParent();
7943 BrCond->eraseFromParent();
7944 return true;
7945 }
7946
7947 return false;
7948 }
// amdgcn.loop: same CF rewrite, emitting SI_LOOP.
7949 case Intrinsic::amdgcn_loop: {
7950 MachineInstr *Br = nullptr;
7951 MachineBasicBlock *UncondBrTarget = nullptr;
7952 bool Negated = false;
7953 if (MachineInstr *BrCond =
7954 verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) {
7955 const SIRegisterInfo *TRI
7956 = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
7957
7958 MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
7959 Register Reg = MI.getOperand(2).getReg();
7960
7961 if (Negated)
7962 std::swap(CondBrTarget, UncondBrTarget);
7963
7964 B.setInsertPt(B.getMBB(), BrCond->getIterator());
7965 B.buildInstr(AMDGPU::SI_LOOP)
7966 .addUse(Reg)
7967 .addMBB(UncondBrTarget);
7968
7969 if (Br)
7970 Br->getOperand(0).setMBB(CondBrTarget);
7971 else
7972 B.buildBr(*CondBrTarget);
7973
7974 MI.eraseFromParent();
7975 BrCond->eraseFromParent();
7976 MRI.setRegClass(Reg, TRI->getWaveMaskRegClass());
7977 return true;
7978 }
7979
7980 return false;
7981 }
7982 case Intrinsic::amdgcn_addrspacecast_nonnull:
7983 return legalizeAddrSpaceCast(MI, MRI, B);
7984 case Intrinsic::amdgcn_make_buffer_rsrc:
7986 case Intrinsic::amdgcn_kernarg_segment_ptr:
7987 if (!AMDGPU::isKernel(B.getMF().getFunction())) {
7988 // This only makes sense to call in a kernel, so just lower to null.
7989 B.buildConstant(MI.getOperand(0).getReg(), 0);
7990 MI.eraseFromParent();
7991 return true;
7992 }
7993
// NOTE(review): many of the return statements below lost their argument
// lists (and in some cases the whole call) to the extraction — e.g. the
// workitem/workgroup/cluster id cases and the preloaded-argument cases.
7996 case Intrinsic::amdgcn_implicitarg_ptr:
7997 return legalizeImplicitArgPtr(MI, MRI, B);
7998 case Intrinsic::amdgcn_workitem_id_x:
7999 return legalizeWorkitemIDIntrinsic(MI, MRI, B, 0,
8001 case Intrinsic::amdgcn_workitem_id_y:
8002 return legalizeWorkitemIDIntrinsic(MI, MRI, B, 1,
8004 case Intrinsic::amdgcn_workitem_id_z:
8005 return legalizeWorkitemIDIntrinsic(MI, MRI, B, 2,
8007 case Intrinsic::amdgcn_workgroup_id_x:
8008 return legalizeWorkGroupId(
8012 case Intrinsic::amdgcn_workgroup_id_y:
8013 return legalizeWorkGroupId(
8017 case Intrinsic::amdgcn_workgroup_id_z:
8018 return legalizeWorkGroupId(
8022 case Intrinsic::amdgcn_cluster_id_x:
8023 return ST.hasClusters() &&
8026 case Intrinsic::amdgcn_cluster_id_y:
8027 return ST.hasClusters() &&
8030 case Intrinsic::amdgcn_cluster_id_z:
8031 return ST.hasClusters() &&
8034 case Intrinsic::amdgcn_cluster_workgroup_id_x:
8035 return ST.hasClusters() &&
8038 case Intrinsic::amdgcn_cluster_workgroup_id_y:
8039 return ST.hasClusters() &&
8042 case Intrinsic::amdgcn_cluster_workgroup_id_z:
8043 return ST.hasClusters() &&
8046 case Intrinsic::amdgcn_cluster_workgroup_flat_id:
8047 return ST.hasClusters() &&
8049 case Intrinsic::amdgcn_cluster_workgroup_max_id_x:
8050 return ST.hasClusters() &&
8053 case Intrinsic::amdgcn_cluster_workgroup_max_id_y:
8054 return ST.hasClusters() &&
8057 case Intrinsic::amdgcn_cluster_workgroup_max_id_z:
8058 return ST.hasClusters() &&
8061 case Intrinsic::amdgcn_cluster_workgroup_max_flat_id:
8062 return ST.hasClusters() &&
8064 MI, MRI, B,
8066 case Intrinsic::amdgcn_wave_id:
8067 return legalizeWaveID(MI, B);
8068 case Intrinsic::amdgcn_lds_kernel_id:
8071 case Intrinsic::amdgcn_dispatch_ptr:
8074 case Intrinsic::amdgcn_queue_ptr:
8077 case Intrinsic::amdgcn_implicit_buffer_ptr:
8080 case Intrinsic::amdgcn_dispatch_id:
8083 case Intrinsic::r600_read_ngroups_x:
8084 // TODO: Emit error for hsa
8087 case Intrinsic::r600_read_ngroups_y:
8090 case Intrinsic::r600_read_ngroups_z:
8093 case Intrinsic::r600_read_local_size_x:
8094 // TODO: Could insert G_ASSERT_ZEXT from s16
8096 case Intrinsic::r600_read_local_size_y:
8097 // TODO: Could insert G_ASSERT_ZEXT from s16
8099 // TODO: Could insert G_ASSERT_ZEXT from s16
8100 case Intrinsic::r600_read_local_size_z:
8103 case Intrinsic::amdgcn_fdiv_fast:
8104 return legalizeFDIVFastIntrin(MI, MRI, B);
8105 case Intrinsic::amdgcn_is_shared:
8107 case Intrinsic::amdgcn_is_private:
8109 case Intrinsic::amdgcn_wavefrontsize: {
8110 B.buildConstant(MI.getOperand(0), ST.getWavefrontSize());
8111 MI.eraseFromParent();
8112 return true;
8113 }
// Buffer load/store/tbuffer variants share the common legalize helpers; the
// bool flags select typed/format handling.
8114 case Intrinsic::amdgcn_s_buffer_load:
8115 return legalizeSBufferLoad(Helper, MI);
8116 case Intrinsic::amdgcn_raw_buffer_store:
8117 case Intrinsic::amdgcn_raw_ptr_buffer_store:
8118 case Intrinsic::amdgcn_struct_buffer_store:
8119 case Intrinsic::amdgcn_struct_ptr_buffer_store:
8120 return legalizeBufferStore(MI, Helper, false, false);
8121 case Intrinsic::amdgcn_raw_buffer_store_format:
8122 case Intrinsic::amdgcn_raw_ptr_buffer_store_format:
8123 case Intrinsic::amdgcn_struct_buffer_store_format:
8124 case Intrinsic::amdgcn_struct_ptr_buffer_store_format:
8125 return legalizeBufferStore(MI, Helper, false, true);
8126 case Intrinsic::amdgcn_raw_tbuffer_store:
8127 case Intrinsic::amdgcn_raw_ptr_tbuffer_store:
8128 case Intrinsic::amdgcn_struct_tbuffer_store:
8129 case Intrinsic::amdgcn_struct_ptr_tbuffer_store:
8130 return legalizeBufferStore(MI, Helper, true, true);
8131 case Intrinsic::amdgcn_raw_buffer_load:
8132 case Intrinsic::amdgcn_raw_ptr_buffer_load:
8133 case Intrinsic::amdgcn_raw_atomic_buffer_load:
8134 case Intrinsic::amdgcn_raw_ptr_atomic_buffer_load:
8135 case Intrinsic::amdgcn_struct_buffer_load:
8136 case Intrinsic::amdgcn_struct_ptr_buffer_load:
8137 case Intrinsic::amdgcn_struct_atomic_buffer_load:
8138 case Intrinsic::amdgcn_struct_ptr_atomic_buffer_load:
8139 return legalizeBufferLoad(MI, Helper, false, false);
8140 case Intrinsic::amdgcn_raw_buffer_load_format:
8141 case Intrinsic::amdgcn_raw_ptr_buffer_load_format:
8142 case Intrinsic::amdgcn_struct_buffer_load_format:
8143 case Intrinsic::amdgcn_struct_ptr_buffer_load_format:
8144 return legalizeBufferLoad(MI, Helper, true, false);
8145 case Intrinsic::amdgcn_raw_tbuffer_load:
8146 case Intrinsic::amdgcn_raw_ptr_tbuffer_load:
8147 case Intrinsic::amdgcn_struct_tbuffer_load:
8148 case Intrinsic::amdgcn_struct_ptr_tbuffer_load:
8149 return legalizeBufferLoad(MI, Helper, true, true);
// All buffer atomics funnel into legalizeBufferAtomic with the intrinsic ID.
8150 case Intrinsic::amdgcn_raw_buffer_atomic_swap:
8151 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_swap:
8152 case Intrinsic::amdgcn_struct_buffer_atomic_swap:
8153 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_swap:
8154 case Intrinsic::amdgcn_raw_buffer_atomic_add:
8155 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_add:
8156 case Intrinsic::amdgcn_struct_buffer_atomic_add:
8157 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_add:
8158 case Intrinsic::amdgcn_raw_buffer_atomic_sub:
8159 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub:
8160 case Intrinsic::amdgcn_struct_buffer_atomic_sub:
8161 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub:
8162 case Intrinsic::amdgcn_raw_buffer_atomic_smin:
8163 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smin:
8164 case Intrinsic::amdgcn_struct_buffer_atomic_smin:
8165 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smin:
8166 case Intrinsic::amdgcn_raw_buffer_atomic_umin:
8167 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umin:
8168 case Intrinsic::amdgcn_struct_buffer_atomic_umin:
8169 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umin:
8170 case Intrinsic::amdgcn_raw_buffer_atomic_smax:
8171 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smax:
8172 case Intrinsic::amdgcn_struct_buffer_atomic_smax:
8173 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smax:
8174 case Intrinsic::amdgcn_raw_buffer_atomic_umax:
8175 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umax:
8176 case Intrinsic::amdgcn_struct_buffer_atomic_umax:
8177 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umax:
8178 case Intrinsic::amdgcn_raw_buffer_atomic_and:
8179 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_and:
8180 case Intrinsic::amdgcn_struct_buffer_atomic_and:
8181 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_and:
8182 case Intrinsic::amdgcn_raw_buffer_atomic_or:
8183 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_or:
8184 case Intrinsic::amdgcn_struct_buffer_atomic_or:
8185 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_or:
8186 case Intrinsic::amdgcn_raw_buffer_atomic_xor:
8187 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_xor:
8188 case Intrinsic::amdgcn_struct_buffer_atomic_xor:
8189 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_xor:
8190 case Intrinsic::amdgcn_raw_buffer_atomic_inc:
8191 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_inc:
8192 case Intrinsic::amdgcn_struct_buffer_atomic_inc:
8193 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_inc:
8194 case Intrinsic::amdgcn_raw_buffer_atomic_dec:
8195 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_dec:
8196 case Intrinsic::amdgcn_struct_buffer_atomic_dec:
8197 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_dec:
8198 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
8199 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap:
8200 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
8201 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cmpswap:
8202 case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
8203 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmin:
8204 case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
8205 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmin:
8206 case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
8207 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmax:
8208 case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
8209 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmax:
8210 case Intrinsic::amdgcn_raw_buffer_atomic_sub_clamp_u32:
8211 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub_clamp_u32:
8212 case Intrinsic::amdgcn_struct_buffer_atomic_sub_clamp_u32:
8213 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub_clamp_u32:
8214 case Intrinsic::amdgcn_raw_buffer_atomic_cond_sub_u32:
8215 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cond_sub_u32:
8216 case Intrinsic::amdgcn_struct_buffer_atomic_cond_sub_u32:
8217 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cond_sub_u32:
8218 case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
8219 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fadd:
8220 case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
8221 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fadd:
8222 return legalizeBufferAtomic(MI, B, IntrID);
8223 case Intrinsic::amdgcn_rsq_clamp:
8225 case Intrinsic::amdgcn_image_bvh_intersect_ray:
8227 case Intrinsic::amdgcn_image_bvh_dual_intersect_ray:
8228 case Intrinsic::amdgcn_image_bvh8_intersect_ray:
// SWMMAC: coerce the sparse-index operand to the scalar type the selected
// instruction expects (bitcast for vectors, any-extend for narrow scalars).
8230 case Intrinsic::amdgcn_swmmac_f32_16x16x128_fp8_fp8:
8231 case Intrinsic::amdgcn_swmmac_f32_16x16x128_fp8_bf8:
8232 case Intrinsic::amdgcn_swmmac_f32_16x16x128_bf8_fp8:
8233 case Intrinsic::amdgcn_swmmac_f32_16x16x128_bf8_bf8:
8234 case Intrinsic::amdgcn_swmmac_f16_16x16x128_fp8_fp8:
8235 case Intrinsic::amdgcn_swmmac_f16_16x16x128_fp8_bf8:
8236 case Intrinsic::amdgcn_swmmac_f16_16x16x128_bf8_fp8:
8237 case Intrinsic::amdgcn_swmmac_f16_16x16x128_bf8_bf8: {
8238 Register Index = MI.getOperand(5).getReg();
8239 LLT S64 = LLT::scalar(64);
8240 LLT IndexArgTy = MRI.getType(Index);
8241 if (IndexArgTy != S64) {
8242 auto NewIndex = IndexArgTy.isVector() ? B.buildBitcast(S64, Index)
8243 : B.buildAnyExt(S64, Index);
8244 MI.getOperand(5).setReg(NewIndex.getReg(0));
8245 }
8246 return true;
8247 }
8248 case Intrinsic::amdgcn_swmmac_f16_16x16x32_f16:
8249 case Intrinsic::amdgcn_swmmac_bf16_16x16x32_bf16:
8250 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf16:
8251 case Intrinsic::amdgcn_swmmac_f32_16x16x32_f16:
8252 case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_fp8:
8253 case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_bf8:
8254 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_fp8:
8255 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_bf8: {
8256 Register Index = MI.getOperand(5).getReg();
8257 LLT S32 = LLT::scalar(32);
8258 if (MRI.getType(Index) != S32)
8259 MI.getOperand(5).setReg(B.buildAnyExt(S32, Index).getReg(0));
8260 return true;
8261 }
8262 case Intrinsic::amdgcn_swmmac_f16_16x16x64_f16:
8263 case Intrinsic::amdgcn_swmmac_bf16_16x16x64_bf16:
8264 case Intrinsic::amdgcn_swmmac_f32_16x16x64_bf16:
8265 case Intrinsic::amdgcn_swmmac_bf16f32_16x16x64_bf16:
8266 case Intrinsic::amdgcn_swmmac_f32_16x16x64_f16:
8267 case Intrinsic::amdgcn_swmmac_i32_16x16x128_iu8:
8268 case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu4:
8269 case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu8:
8270 case Intrinsic::amdgcn_swmmac_i32_16x16x64_iu4: {
8271 Register Index = MI.getOperand(7).getReg();
8272 LLT IdxTy = IntrID == Intrinsic::amdgcn_swmmac_i32_16x16x128_iu8
8273 ? LLT::scalar(64)
8274 : LLT::scalar(32);
8275 LLT IndexArgTy = MRI.getType(Index);
8276 if (IndexArgTy != IdxTy) {
8277 auto NewIndex = IndexArgTy.isVector() ? B.buildBitcast(IdxTy, Index)
8278 : B.buildAnyExt(IdxTy, Index);
8279 MI.getOperand(7).setReg(NewIndex.getReg(0));
8280 }
8281 return true;
8282 }
8283
8284 case Intrinsic::amdgcn_fmed3: {
8285 GISelChangeObserver &Observer = Helper.Observer;
8286
8287 // FIXME: This is to workaround the inability of tablegen match combiners to
8288 // match intrinsics in patterns.
8289 Observer.changingInstr(MI);
8290 MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_FMED3));
8291 MI.removeOperand(1);
8292 Observer.changedInstr(MI);
8293 return true;
8294 }
// Cross-lane ops share a single helper keyed on the intrinsic ID.
8295 case Intrinsic::amdgcn_readlane:
8296 case Intrinsic::amdgcn_writelane:
8297 case Intrinsic::amdgcn_readfirstlane:
8298 case Intrinsic::amdgcn_permlane16:
8299 case Intrinsic::amdgcn_permlanex16:
8300 case Intrinsic::amdgcn_permlane64:
8301 case Intrinsic::amdgcn_set_inactive:
8302 case Intrinsic::amdgcn_set_inactive_chain_arg:
8303 case Intrinsic::amdgcn_mov_dpp8:
8304 case Intrinsic::amdgcn_update_dpp:
8305 return legalizeLaneOp(Helper, MI, IntrID);
8306 case Intrinsic::amdgcn_s_buffer_prefetch_data:
8307 return legalizeSBufferPrefetch(Helper, MI);
8308 case Intrinsic::amdgcn_dead: {
8309 // TODO: Use poison instead of undef
8310 for (const MachineOperand &Def : MI.defs())
8311 B.buildUndef(Def)
8312 MI.eraseFromParent();
8313 return true;
8314 }
// Cooperative atomics lower to plain G_LOAD/G_STORE carrying the original
// memory operand.
8315 case Intrinsic::amdgcn_cooperative_atomic_load_32x4B:
8316 case Intrinsic::amdgcn_cooperative_atomic_load_16x8B:
8317 case Intrinsic::amdgcn_cooperative_atomic_load_8x16B:
8318 assert(MI.hasOneMemOperand() && "Expected IRTranslator to set MemOp!");
8319 B.buildLoad(MI.getOperand(0), MI.getOperand(2), **MI.memoperands_begin());
8320 MI.eraseFromParent();
8321 return true;
8322 case Intrinsic::amdgcn_cooperative_atomic_store_32x4B:
8323 case Intrinsic::amdgcn_cooperative_atomic_store_16x8B:
8324 case Intrinsic::amdgcn_cooperative_atomic_store_8x16B:
8325 assert(MI.hasOneMemOperand() && "Expected IRTranslator to set MemOp!");
8326 B.buildStore(MI.getOperand(2), MI.getOperand(1), **MI.memoperands_begin());
8327 MI.eraseFromParent();
8328 return true;
8329 case Intrinsic::amdgcn_flat_load_monitor_b32:
8330 case Intrinsic::amdgcn_flat_load_monitor_b64:
8331 case Intrinsic::amdgcn_flat_load_monitor_b128:
8332 assert(MI.hasOneMemOperand() && "Expected IRTranslator to set MemOp!");
8333 B.buildInstr(AMDGPU::G_AMDGPU_FLAT_LOAD_MONITOR)
8334 .add(MI.getOperand(0))
8335 .add(MI.getOperand(2))
8336 .addMemOperand(*MI.memoperands_begin());
8337 MI.eraseFromParent();
8338 return true;
8339 case Intrinsic::amdgcn_global_load_monitor_b32:
8340 case Intrinsic::amdgcn_global_load_monitor_b64:
8341 case Intrinsic::amdgcn_global_load_monitor_b128:
8342 assert(MI.hasOneMemOperand() && "Expected IRTranslator to set MemOp!");
8343 B.buildInstr(AMDGPU::G_AMDGPU_GLOBAL_LOAD_MONITOR)
8344 .add(MI.getOperand(0))
8345 .add(MI.getOperand(2))
8346 .addMemOperand(*MI.memoperands_begin());
8347 MI.eraseFromParent();
8348 return true;
// Default: image-dim intrinsics go through the generic image legalizer;
// anything else needs no work here.
8349 default: {
8350 if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
8352 return legalizeImageIntrinsic(MI, B, Helper.Observer, ImageDimIntr);
8353 return true;
8354 }
8355 }
8356
8357 return true;
8358}
unsigned const MachineRegisterInfo * MRI
MachineInstrBuilder & UseMI
MachineInstrBuilder MachineInstrBuilder & DefMI
static unsigned getIntrinsicID(const SDNode *N)
unsigned RegSize
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
static SDValue extractF64Exponent(SDValue Hi, const SDLoc &SL, SelectionDAG &DAG)
static SDValue getMad(SelectionDAG &DAG, const SDLoc &SL, EVT VT, SDValue X, SDValue Y, SDValue C, SDNodeFlags Flags=SDNodeFlags())
static bool valueIsKnownNeverF32Denorm(SDValue Src)
Return true if it's known that Src can never be an f32 denormal value.
Contains the definition of a TargetInstrInfo class that is common to all AMD GPUs.
static void packImage16bitOpsToDwords(MachineIRBuilder &B, MachineInstr &MI, SmallVectorImpl< Register > &PackedAddrs, unsigned ArgOffset, const AMDGPU::ImageDimIntrinsicInfo *Intr, bool IsA16, bool IsG16)
Turn a set of s16 typed registers in AddrRegs into a dword sized vector with s16 typed elements.
static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID)
static LLT getBufferRsrcScalarType(const LLT Ty)
static LegalityPredicate isIllegalRegisterType(const GCNSubtarget &ST, unsigned TypeIdx)
static cl::opt< bool > EnableNewLegality("amdgpu-global-isel-new-legality", cl::desc("Use GlobalISel desired legality, rather than try to use" "rules compatible with selection patterns"), cl::init(false), cl::ReallyHidden)
static MachineInstrBuilder buildExp(MachineIRBuilder &B, const DstOp &Dst, const SrcOp &Src, unsigned Flags)
static bool needsDenormHandlingF32(const MachineFunction &MF, Register Src, unsigned Flags)
constexpr std::initializer_list< LLT > AllVectors
static LegalizeMutation bitcastToVectorElement32(unsigned TypeIdx)
static LegalityPredicate isSmallOddVector(unsigned TypeIdx)
static LegalizeMutation oneMoreElement(unsigned TypeIdx)
constexpr LLT F64
static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size)
constexpr LLT V2S8
static bool allowApproxFunc(const MachineFunction &MF, unsigned Flags)
constexpr LLT V4S128
constexpr LLT S16
constexpr LLT S1
static bool shouldBitcastLoadStoreType(const GCNSubtarget &ST, const LLT Ty, const LLT MemTy)
Return true if a load or store of the type should be lowered with a bitcast to a different type.
constexpr LLT S1024
static constexpr unsigned FPEnvModeBitField
constexpr LLT V7S64
static LegalizeMutation getScalarTypeFromMemDesc(unsigned TypeIdx)
static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size)
static bool shouldWidenLoad(const GCNSubtarget &ST, LLT MemoryTy, uint64_t AlignInBits, unsigned AddrSpace, unsigned Opcode)
Return true if we should legalize a load by widening an odd sized memory access up to the alignment.
static bool isRegisterVectorElementType(LLT EltTy)
static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx)
static LegalityPredicate isWideVec16(unsigned TypeIdx)
constexpr std::initializer_list< LLT > AllScalarTypes
static LegalityPredicate isTruncStoreToSizePowerOf2(unsigned TypeIdx)
constexpr LLT V2S16
constexpr LLT V8S16
constexpr LLT V9S32
constexpr std::initializer_list< LLT > AllS32Vectors
constexpr LLT S224
static LegalizeMutation moreElementsToNextExistingRegClass(unsigned TypeIdx)
constexpr LLT S512
constexpr LLT MaxScalar
static Register castBufferRsrcToV4I32(Register Pointer, MachineIRBuilder &B)
Cast a buffer resource (an address space 8 pointer) into a 4xi32, which is the form in which the valu...
constexpr LLT V11S32
static bool isRegisterClassType(const GCNSubtarget &ST, LLT Ty)
constexpr LLT V6S64
constexpr LLT V2S64
static std::pair< Register, Register > emitReciprocalU64(MachineIRBuilder &B, Register Val)
static LLT getBitcastRegisterType(const LLT Ty)
static LLT getBufferRsrcRegisterType(const LLT Ty)
constexpr LLT S32
constexpr LLT V2F16
static LegalizeMutation bitcastToRegisterType(unsigned TypeIdx)
static Register stripAnySourceMods(Register OrigSrc, MachineRegisterInfo &MRI)
constexpr LLT V8S32
constexpr LLT V2BF16
constexpr LLT S192
static LLT castBufferRsrcFromV4I32(MachineInstr &MI, MachineIRBuilder &B, MachineRegisterInfo &MRI, unsigned Idx)
Mutates IR (typicaly a load instruction) to use a <4 x s32> as the initial type of the operand idx an...
static bool replaceWithConstant(MachineIRBuilder &B, MachineInstr &MI, int64_t C)
static constexpr unsigned SPDenormModeBitField
constexpr LLT F32
static unsigned maxSizeForAddrSpace(const GCNSubtarget &ST, unsigned AS, bool IsLoad, bool IsAtomic)
constexpr LLT V6S32
static bool isLoadStoreSizeLegal(const GCNSubtarget &ST, const LegalityQuery &Query)
constexpr LLT S160
static MachineInstr * verifyCFIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI, MachineInstr *&Br, MachineBasicBlock *&UncondBrTarget, bool &Negated)
constexpr LLT V4S16
constexpr LLT V2S128
constexpr LLT V10S16
static LegalityPredicate numElementsNotEven(unsigned TypeIdx)
constexpr LLT V4S32
constexpr LLT V3S32
constexpr LLT V6S16
constexpr std::initializer_list< LLT > AllS64Vectors
constexpr LLT S256
static void castBufferRsrcArgToV4I32(MachineInstr &MI, MachineIRBuilder &B, unsigned Idx)
constexpr LLT V4S64
static constexpr unsigned FPEnvTrapBitField
constexpr LLT V10S32
constexpr LLT V16S32
static constexpr unsigned MaxRegisterSize
constexpr LLT V7S32
constexpr LLT S96
constexpr LLT V12S16
constexpr LLT V16S64
static bool isRegisterSize(const GCNSubtarget &ST, unsigned Size)
static LegalityPredicate isWideScalarExtLoadTruncStore(unsigned TypeIdx)
static bool hasBufferRsrcWorkaround(const LLT Ty)
constexpr LLT V32S32
static void toggleSPDenormMode(bool Enable, MachineIRBuilder &B, const GCNSubtarget &ST, SIModeRegisterDefaults Mode)
constexpr LLT S64
constexpr std::initializer_list< LLT > AllS16Vectors
static bool loadStoreBitcastWorkaround(const LLT Ty)
static LLT widenToNextPowerOf2(LLT Ty)
static bool isNot(const MachineRegisterInfo &MRI, const MachineInstr &MI)
constexpr LLT V16S16
static void convertImageAddrToPacked(MachineIRBuilder &B, MachineInstr &MI, int DimIdx, int NumVAddrs)
Convert from separate vaddr components to a single vector address register, and replace the remaining...
static bool isLoadStoreLegal(const GCNSubtarget &ST, const LegalityQuery &Query)
static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx)
constexpr LLT V5S32
constexpr LLT V5S64
constexpr LLT V3S64
static LLT getPow2VectorType(LLT Ty)
static void buildBufferLoad(unsigned Opc, Register LoadDstReg, Register RSrc, Register VIndex, Register VOffset, Register SOffset, unsigned ImmOffset, unsigned Format, unsigned AuxiliaryData, MachineMemOperand *MMO, bool IsTyped, bool HasVIndex, MachineIRBuilder &B)
constexpr LLT V8S64
static LLT getPow2ScalarType(LLT Ty)
static LegalityPredicate elementTypeIsLegal(unsigned TypeIdx)
constexpr LLT V2S32
static bool isRegisterVectorType(LLT Ty)
constexpr LLT V12S32
constexpr LLT S128
static LegalityPredicate sizeIsMultipleOf32(unsigned TypeIdx)
constexpr LLT S8
static bool isRegisterType(const GCNSubtarget &ST, LLT Ty)
static bool isKnownNonNull(Register Val, MachineRegisterInfo &MRI, const AMDGPUTargetMachine &TM, unsigned AddrSpace)
Return true if the value is a known valid address, such that a null check is not necessary.
This file declares the targeting of the Machinelegalizer class for AMDGPU.
Provides AMDGPU specific target descriptions.
The AMDGPU TargetMachine interface definition for hw codegen targets.
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
static Error unsupported(const char *Str, const Triple &T)
Definition MachO.cpp:71
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
@ Enable
Declares convenience wrapper classes for interpreting MachineInstr instances as specific generic oper...
IRTranslator LLVM IR MI
const AbstractManglingParser< Derived, Alloc >::OperatorInfo AbstractManglingParser< Derived, Alloc >::Ops[]
Interface for Targets to specify which operations they can successfully select and how the others sho...
#define F(x, y, z)
Definition MD5.cpp:54
#define I(x, y, z)
Definition MD5.cpp:57
Contains matchers for matching SSA Machine Instructions.
This file declares the MachineIRBuilder class.
Register const TargetRegisterInfo * TRI
#define R2(n)
Promote Memory to Register
Definition Mem2Reg.cpp:110
#define T
static MCRegister getReg(const MCDisassembler *D, unsigned RC, unsigned RegNo)
#define P(N)
ppc ctr loops verify
R600 Clause Merge
const SmallVectorImpl< MachineOperand > & Cond
static cl::opt< RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode > Mode("regalloc-enable-advisor", cl::Hidden, cl::init(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Default), cl::desc("Enable regalloc advisor mode"), cl::values(clEnumValN(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Default, "default", "Default"), clEnumValN(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Release, "release", "precompiled"), clEnumValN(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Development, "development", "for training")))
#define CH(x, y, z)
Definition SHA256.cpp:34
#define FP_DENORM_FLUSH_NONE
Definition SIDefines.h:1268
Interface definition for SIInstrInfo.
Interface definition for SIRegisterInfo.
This file defines the make_scope_exit function, which executes user-defined cleanup logic at scope ex...
static TableGen::Emitter::Opt Y("gen-skeleton-entry", EmitSkeleton, "Generate example skeleton entry")
static TableGen::Emitter::OptClass< SkeletonEmitter > X("gen-skeleton-class", "Generate example skeleton class")
static constexpr int Concat[]
bool legalizeConstHwRegRead(MachineInstr &MI, MachineIRBuilder &B, AMDGPU::Hwreg::Id HwReg, unsigned LowBit, unsigned Width) const
void buildMultiply(LegalizerHelper &Helper, MutableArrayRef< Register > Accum, ArrayRef< Register > Src0, ArrayRef< Register > Src1, bool UsePartialMad64_32, bool SeparateOddAlignedProducts) const
bool legalizeGlobalValue(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFSQRTF16(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeIntrinsicTrunc(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
std::pair< Register, unsigned > splitBufferOffsets(MachineIRBuilder &B, Register OrigOffset) const
bool legalizeBVHIntersectRayIntrinsic(MachineInstr &MI, MachineIRBuilder &B) const
bool legalizeIsAddrSpace(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B, unsigned AddrSpace) const
bool legalizeUnsignedDIV_REM(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFSQRTF32(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeAtomicCmpXChg(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeTrapHsa(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeBufferStore(MachineInstr &MI, LegalizerHelper &Helper, bool IsTyped, bool IsFormat) const
bool legalizeMul(LegalizerHelper &Helper, MachineInstr &MI) const
bool legalizeFFREXP(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
Register getSegmentAperture(unsigned AddrSpace, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFDIV64(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizePointerAsRsrcIntrin(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
To create a buffer resource from a 64-bit pointer, mask off the upper 32 bits of the pointer and repl...
bool legalizeFlogCommon(MachineInstr &MI, MachineIRBuilder &B) const
bool getLDSKernelId(Register DstReg, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFExp2(MachineInstr &MI, MachineIRBuilder &B) const
bool legalizeTrap(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeBufferAtomic(MachineInstr &MI, MachineIRBuilder &B, Intrinsic::ID IID) const
void legalizeUnsignedDIV_REM32Impl(MachineIRBuilder &B, Register DstDivReg, Register DstRemReg, Register Num, Register Den) const
Register handleD16VData(MachineIRBuilder &B, MachineRegisterInfo &MRI, Register Reg, bool ImageStore=false) const
Handle register layout difference for f16 images for some subtargets.
bool legalizeCTLZ_CTTZ(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeBuildVector(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFFloor(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
AMDGPULegalizerInfo(const GCNSubtarget &ST, const GCNTargetMachine &TM)
bool legalizeFDIV32(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFMad(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFDIV(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeSBufferPrefetch(LegalizerHelper &Helper, MachineInstr &MI) const
bool legalizeFExp10Unsafe(MachineIRBuilder &B, Register Dst, Register Src, unsigned Flags) const
bool legalizeFExp(MachineInstr &MI, MachineIRBuilder &B) const
bool legalizeIntrinsic(LegalizerHelper &Helper, MachineInstr &MI) const override
bool legalizeFrem(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizePreloadedArgIntrin(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B, AMDGPUFunctionArgInfo::PreloadedValue ArgType) const
bool legalizeStore(LegalizerHelper &Helper, MachineInstr &MI) const
bool legalizeCustom(LegalizerHelper &Helper, MachineInstr &MI, LostDebugLocObserver &LocObserver) const override
Called for instructions with the Custom LegalizationAction.
bool buildPCRelGlobalAddress(Register DstReg, LLT PtrTy, MachineIRBuilder &B, const GlobalValue *GV, int64_t Offset, unsigned GAFlags=SIInstrInfo::MO_NONE) const
MachinePointerInfo getKernargSegmentPtrInfo(MachineFunction &MF) const
bool legalizeFDIV16(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeRsqClampIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFExpUnsafeImpl(MachineIRBuilder &B, Register Dst, Register Src, unsigned Flags, bool IsExp10) const
std::pair< Register, Register > getScaledLogInput(MachineIRBuilder &B, Register Src, unsigned Flags) const
bool legalizeFDIVFastIntrin(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool loadInputValue(Register DstReg, MachineIRBuilder &B, AMDGPUFunctionArgInfo::PreloadedValue ArgType) const
bool legalizeBVHDualOrBVH8IntersectRayIntrinsic(MachineInstr &MI, MachineIRBuilder &B) const
bool legalizeInsertVectorElt(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFExpUnsafe(MachineIRBuilder &B, Register Dst, Register Src, unsigned Flags) const
bool legalizeFEXPF64(MachineInstr &MI, MachineIRBuilder &B) const
bool legalizeAddrSpaceCast(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeBufferLoad(MachineInstr &MI, LegalizerHelper &Helper, bool IsFormat, bool IsTyped) const
bool legalizeImplicitArgPtr(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeMinNumMaxNum(LegalizerHelper &Helper, MachineInstr &MI) const
void legalizeUnsignedDIV_REM64Impl(MachineIRBuilder &B, Register DstDivReg, Register DstRemReg, Register Num, Register Den) const
bool legalizeDebugTrap(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFastUnsafeFDIV(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeSinCos(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeWaveID(MachineInstr &MI, MachineIRBuilder &B) const
bool legalizeFroundeven(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeLDSKernelId(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeWorkGroupId(MachineInstr &MI, MachineIRBuilder &B, AMDGPUFunctionArgInfo::PreloadedValue ClusterIdPV, AMDGPUFunctionArgInfo::PreloadedValue ClusterMaxIdPV, AMDGPUFunctionArgInfo::PreloadedValue ClusterWorkGroupIdPV) const
bool legalizeSignedDIV_REM(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeITOFP(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B, bool Signed) const
bool legalizeFPow(MachineInstr &MI, MachineIRBuilder &B) const
bool legalizeFastUnsafeFDIV64(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFPTOI(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B, bool Signed) const
bool legalizeStackSave(MachineInstr &MI, MachineIRBuilder &B) const
bool legalizeFlogUnsafe(MachineIRBuilder &B, Register Dst, Register Src, bool IsLog10, unsigned Flags) const
bool legalizeKernargMemParameter(MachineInstr &MI, MachineIRBuilder &B, uint64_t Offset, Align Alignment=Align(4)) const
Legalize a value that's loaded from kernel arguments.
bool legalizeImageIntrinsic(MachineInstr &MI, MachineIRBuilder &B, GISelChangeObserver &Observer, const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr) const
Rewrite image intrinsics to use register layouts expected by the subtarget.
void buildAbsGlobalAddress(Register DstReg, LLT PtrTy, MachineIRBuilder &B, const GlobalValue *GV, MachineRegisterInfo &MRI) const
bool legalizeGetFPEnv(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool getImplicitArgPtr(Register DstReg, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFSQRT(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
Register getKernargParameterPtr(MachineIRBuilder &B, int64_t Offset) const
bool legalizeSBufferLoad(LegalizerHelper &Helper, MachineInstr &MI) const
bool legalizeFceil(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFSQRTF64(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeExtractVectorElt(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeLoad(LegalizerHelper &Helper, MachineInstr &MI) const
bool legalizeCTLZ_ZERO_UNDEF(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
Register fixStoreSourceType(MachineIRBuilder &B, Register VData, LLT MemTy, bool IsFormat) const
bool legalizeLaneOp(LegalizerHelper &Helper, MachineInstr &MI, Intrinsic::ID IID) const
bool legalizeSetFPEnv(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeWorkitemIDIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B, unsigned Dim, AMDGPUFunctionArgInfo::PreloadedValue ArgType) const
void buildLoadInputValue(Register DstReg, MachineIRBuilder &B, const ArgDescriptor *Arg, const TargetRegisterClass *ArgRC, LLT ArgTy) const
bool legalizeTrapHsaQueuePtr(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFlog2(MachineInstr &MI, MachineIRBuilder &B) const
bool legalizeTrapEndpgm(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
static std::optional< uint32_t > getLDSKernelIdMetadata(const Function &F)
void setDynLDSAlign(const Function &F, const GlobalVariable &GV)
unsigned allocateLDSGlobal(const DataLayout &DL, const GlobalVariable &GV)
static int64_t getNullPointerValue(unsigned AddrSpace)
Get the integer value of a null pointer in the given address space.
bool isNoopAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const override
Returns true if a cast between SrcAS and DestAS is a noop.
const std::array< unsigned, 3 > & getDims() const
static const fltSemantics & IEEEsingle()
Definition APFloat.h:296
static const fltSemantics & IEEEdouble()
Definition APFloat.h:297
static APFloat getSmallestNormalized(const fltSemantics &Sem, bool Negative=false)
Returns the smallest (by magnitude) normalized finite number in the given semantics.
Definition APFloat.h:1209
static APFloat getLargest(const fltSemantics &Sem, bool Negative=false)
Returns the largest finite number in the given semantics.
Definition APFloat.h:1189
static APFloat getInf(const fltSemantics &Sem, bool Negative=false)
Factory for Positive and Negative Infinity.
Definition APFloat.h:1149
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:40
size_t size() const
size - Get the array size.
Definition ArrayRef.h:142
@ ICMP_SLT
signed less than
Definition InstrTypes.h:705
@ FCMP_OLT
0 1 0 0 True if ordered and less than
Definition InstrTypes.h:682
@ FCMP_ULE
1 1 0 1 True if unordered, less than, or equal
Definition InstrTypes.h:691
@ FCMP_OGT
0 0 1 0 True if ordered and greater than
Definition InstrTypes.h:680
@ ICMP_UGE
unsigned greater or equal
Definition InstrTypes.h:700
@ ICMP_SGT
signed greater than
Definition InstrTypes.h:703
@ FCMP_ONE
0 1 1 0 True if ordered and operands are unequal
Definition InstrTypes.h:684
@ ICMP_ULT
unsigned less than
Definition InstrTypes.h:701
@ FCMP_OLE
0 1 0 1 True if ordered and less than or equal
Definition InstrTypes.h:683
@ FCMP_ORD
0 1 1 1 True if ordered (no nans)
Definition InstrTypes.h:685
@ ICMP_NE
not equal
Definition InstrTypes.h:698
@ FCMP_UGE
1 0 1 1 True if unordered, greater than, or equal
Definition InstrTypes.h:689
This is the shared class of boolean and integer constants.
Definition Constants.h:87
int64_t getSExtValue() const
Return the constant as a 64-bit integer value after it has been sign extended as appropriate for the ...
Definition Constants.h:174
A debug info location.
Definition DebugLoc.h:123
Diagnostic information for unsupported feature in backend.
static constexpr ElementCount getFixed(ScalarTy MinVal)
Definition TypeSize.h:309
LLVMContext & getContext() const
getContext - Return a reference to the LLVMContext associated with this function.
Definition Function.cpp:358
Abstract class that contains various methods for clients to notify about changes.
virtual void changingInstr(MachineInstr &MI)=0
This instruction is about to be mutated in some way.
virtual void changedInstr(MachineInstr &MI)=0
This instruction was mutated in some way.
Simple wrapper observer that takes several observers, and calls each one for each event.
KnownBits getKnownBits(Register R)
bool hasExternalLinkage() const
Module * getParent()
Get the module that this global value is contained inside of...
LLVM_ABI const DataLayout & getDataLayout() const
Get the data layout of the module this global belongs to.
Definition Globals.cpp:133
LLVM_ABI uint64_t getGlobalSize(const DataLayout &DL) const
Get the size of this global variable in bytes.
Definition Globals.cpp:561
static constexpr LLT float64()
Get a 64-bit IEEE double value.
constexpr unsigned getScalarSizeInBits() const
constexpr LLT changeElementType(LLT NewEltTy) const
If this type is a vector, return a vector with the same number of elements but the new element type.
static constexpr LLT vector(ElementCount EC, unsigned ScalarSizeInBits)
Get a low-level vector of some number of elements and element width.
static constexpr LLT scalar(unsigned SizeInBits)
Get a low-level scalar or aggregate "bag of bits".
constexpr uint16_t getNumElements() const
Returns the number of elements in a vector LLT.
constexpr bool isVector() const
static constexpr LLT pointer(unsigned AddressSpace, unsigned SizeInBits)
Get a low-level pointer in the given address space.
constexpr TypeSize getSizeInBits() const
Returns the total size of the type. Must only be called on sized types.
constexpr bool isPointer() const
constexpr LLT getElementType() const
Returns the vector's element type. Only valid for vector types.
constexpr LLT changeElementSize(unsigned NewEltSize) const
If this type is a vector, return a vector with the same number of elements but the new element size.
static constexpr LLT float16()
Get a 16-bit IEEE half value.
constexpr unsigned getAddressSpace() const
static constexpr LLT fixed_vector(unsigned NumElements, unsigned ScalarSizeInBits)
Get a low-level fixed-width vector of some number of elements and element width.
constexpr LLT changeElementCount(ElementCount EC) const
Return a vector or scalar with the same element type and the new element count.
constexpr LLT getScalarType() const
static constexpr LLT scalarOrVector(ElementCount EC, LLT ScalarTy)
static constexpr LLT float32()
Get a 32-bit IEEE float value.
LLVM_ABI void diagnose(const DiagnosticInfo &DI)
Report a message to the currently installed diagnostic handler.
LLVM_ABI void computeTables()
Compute any ancillary tables needed to quickly decide how an operation should be handled.
LegalizeRuleSet & minScalar(unsigned TypeIdx, const LLT Ty)
Ensure the scalar is at least as wide as Ty.
LegalizeRuleSet & legalFor(std::initializer_list< LLT > Types)
The instruction is legal when type index 0 is any type in the given list.
LegalizeRuleSet & unsupported()
The instruction is unsupported.
LegalizeRuleSet & scalarSameSizeAs(unsigned TypeIdx, unsigned SameSizeIdx)
Change the type TypeIdx to have the same scalar size as type SameSizeIdx.
LegalizeRuleSet & fewerElementsIf(LegalityPredicate Predicate, LegalizeMutation Mutation)
Remove elements to reach the type selected by the mutation if the predicate is true.
LegalizeRuleSet & clampScalarOrElt(unsigned TypeIdx, const LLT MinTy, const LLT MaxTy)
Limit the range of scalar sizes to MinTy and MaxTy.
LegalizeRuleSet & bitcastIf(LegalityPredicate Predicate, LegalizeMutation Mutation)
The specified type index is coerced if predicate is true.
LegalizeRuleSet & maxScalar(unsigned TypeIdx, const LLT Ty)
Ensure the scalar is at most as wide as Ty.
LegalizeRuleSet & minScalarOrElt(unsigned TypeIdx, const LLT Ty)
Ensure the scalar or element is at least as wide as Ty.
LegalizeRuleSet & clampMaxNumElements(unsigned TypeIdx, const LLT EltTy, unsigned MaxElements)
Limit the number of elements in EltTy vectors to at most MaxElements.
LegalizeRuleSet & unsupportedFor(std::initializer_list< LLT > Types)
LegalizeRuleSet & lower()
The instruction is lowered.
LegalizeRuleSet & moreElementsIf(LegalityPredicate Predicate, LegalizeMutation Mutation)
Add more elements to reach the type selected by the mutation if the predicate is true.
LegalizeRuleSet & lowerFor(std::initializer_list< LLT > Types)
The instruction is lowered when type index 0 is any type in the given list.
LegalizeRuleSet & lowerIf(LegalityPredicate Predicate)
The instruction is lowered if predicate is true.
LegalizeRuleSet & clampScalar(unsigned TypeIdx, const LLT MinTy, const LLT MaxTy)
Limit the range of scalar sizes to MinTy and MaxTy.
LegalizeRuleSet & custom()
Unconditionally custom lower.
LegalizeRuleSet & clampMaxNumElementsStrict(unsigned TypeIdx, const LLT EltTy, unsigned NumElts)
Express EltTy vectors strictly using vectors with NumElts elements (or scalars when NumElts equals 1)...
LegalizeRuleSet & unsupportedIf(LegalityPredicate Predicate)
LegalizeRuleSet & widenScalarIf(LegalityPredicate Predicate, LegalizeMutation Mutation)
Widen the scalar to the one selected by the mutation if the predicate is true.
LegalizeRuleSet & alwaysLegal()
LegalizeRuleSet & clampNumElements(unsigned TypeIdx, const LLT MinTy, const LLT MaxTy)
Limit the number of elements for the given vectors to at least MinTy's number of elements and at most...
LegalizeRuleSet & maxScalarIf(LegalityPredicate Predicate, unsigned TypeIdx, const LLT Ty)
Conditionally limit the maximum size of the scalar.
LegalizeRuleSet & customIf(LegalityPredicate Predicate)
LegalizeRuleSet & widenScalarToNextPow2(unsigned TypeIdx, unsigned MinSize=0)
Widen the scalar to the next power of two that is at least MinSize.
LegalizeRuleSet & scalarize(unsigned TypeIdx)
LegalizeRuleSet & legalForCartesianProduct(std::initializer_list< LLT > Types)
The instruction is legal when type indexes 0 and 1 are both in the given list.
LegalizeRuleSet & legalIf(LegalityPredicate Predicate)
The instruction is legal if predicate is true.
LegalizeRuleSet & customFor(std::initializer_list< LLT > Types)
LegalizeRuleSet & widenScalarToNextMultipleOf(unsigned TypeIdx, unsigned Size)
Widen the scalar to the next multiple of Size.
LLVM_ABI LegalizeResult lowerFMinNumMaxNum(MachineInstr &MI)
LLVM_ABI void moreElementsVectorDst(MachineInstr &MI, LLT MoreTy, unsigned OpIdx)
Legalize a single operand OpIdx of the machine instruction MI as a Def by performing it with addition...
GISelValueTracking * getValueTracking() const
@ Legalized
Instruction has been legalized and the MachineFunction changed.
GISelChangeObserver & Observer
To keep track of changes made by the LegalizerHelper.
LLVM_ABI void bitcastDst(MachineInstr &MI, LLT CastTy, unsigned OpIdx)
Legalize a single operand OpIdx of the machine instruction MI as a def by inserting a G_BITCAST from ...
LLVM_ABI LegalizeResult lowerFMad(MachineInstr &MI)
MachineIRBuilder & MIRBuilder
Expose MIRBuilder so clients can set their own RecordInsertInstruction functions.
LLVM_ABI void widenScalarDst(MachineInstr &MI, LLT WideTy, unsigned OpIdx=0, unsigned TruncOpcode=TargetOpcode::G_TRUNC)
Legalize a single operand OpIdx of the machine instruction MI as a Def by extending the operand's typ...
LegalizeRuleSet & getActionDefinitionsBuilder(unsigned Opcode)
Get the action definition builder for the given opcode.
const LegacyLegalizerInfo & getLegacyLegalizerInfo() const
TypeSize getValue() const
Wrapper class representing physical registers. Should be passed by value.
Definition MCRegister.h:41
constexpr bool isPhysical() const
Return true if the specified register number is in the physical register namespace.
Definition MCRegister.h:72
LLVM_ABI void addSuccessor(MachineBasicBlock *Succ, BranchProbability Prob=BranchProbability::getUnknown())
Add Succ as a successor of this MachineBasicBlock.
LLVM_ABI MachineBasicBlock * splitAt(MachineInstr &SplitInst, bool UpdateLiveIns=true, LiveIntervals *LIS=nullptr)
Split a basic block into 2 pieces at SplitPoint.
const MachineFunction * getParent() const
Return the MachineFunction containing this basic block.
MachineInstrBundleIterator< MachineInstr > iterator
PseudoSourceValueManager & getPSVManager() const
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
MachineMemOperand * getMachineMemOperand(MachinePointerInfo PtrInfo, MachineMemOperand::Flags f, LLT MemTy, Align base_alignment, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr, SyncScope::ID SSID=SyncScope::System, AtomicOrdering Ordering=AtomicOrdering::NotAtomic, AtomicOrdering FailureOrdering=AtomicOrdering::NotAtomic)
getMachineMemOperand - Allocate a new MachineMemOperand.
DenormalMode getDenormalMode(const fltSemantics &FPType) const
Returns the denormal handling type for the default rounding mode of the function.
void push_back(MachineBasicBlock *MBB)
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
Function & getFunction()
Return the LLVM function that this machine code represents.
BasicBlockListType::iterator iterator
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do...
MachineBasicBlock * CreateMachineBasicBlock(const BasicBlock *BB=nullptr, std::optional< UniqueBBID > BBID=std::nullopt)
CreateMachineInstr - Allocate a new MachineInstr.
const TargetMachine & getTarget() const
getTarget - Return the target machine this machine code is compiled with
Helper class to build MachineInstr.
MachineFunction & getMF()
Getter for the function we currently build.
Register getReg(unsigned Idx) const
Get the register for the operand index.
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
const MachineInstrBuilder & addGlobalAddress(const GlobalValue *GV, int64_t Offset=0, unsigned TargetFlags=0) const
const MachineInstrBuilder & addMBB(MachineBasicBlock *MBB, unsigned TargetFlags=0) const
Representation of each machine instruction.
const MachineOperand & getOperand(unsigned i) const
A description of a memory reference used in the backend.
LocationSize getSize() const
Return the size in bytes of the memory reference.
LLT getMemoryType() const
Return the memory type of the memory reference.
@ MODereferenceable
The memory access is dereferenceable (i.e., doesn't trap).
@ MOLoad
The memory access reads data.
@ MOInvariant
The memory access always returns the same value (or traps).
LLVM_ABI Align getAlign() const
Return the minimum known alignment in bytes of the actual memory reference.
MachineOperand class - Representation of each machine instruction operand.
MachineBasicBlock * getMBB() const
LLVM_ABI void setReg(Register Reg)
Change the register this operand corresponds to.
void setMBB(MachineBasicBlock *MBB)
static MachineOperand CreateImm(int64_t Val)
Register getReg() const
getReg - Returns the register number.
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
MutableArrayRef - Represent a mutable reference to an array (0 or more elements consecutively in memo...
Definition ArrayRef.h:298
MutableArrayRef< T > drop_front(size_t N=1) const
Drop the first N elements of the array.
Definition ArrayRef.h:387
LLVM_ABI const PseudoSourceValue * getConstantPool()
Return a pseudo source value referencing the constant pool.
Wrapper class representing virtual and physical registers.
Definition Register.h:20
constexpr bool isValid() const
Definition Register.h:112
constexpr bool isVirtual() const
Return true if the specified register number is in the virtual register namespace.
Definition Register.h:79
static unsigned getMaxMUBUFImmOffset(const GCNSubtarget &ST)
This class keeps track of the SPI_SP_INPUT_ADDR config register, which tells the hardware which inter...
AMDGPU::ClusterDimsAttr getClusterDims() const
SIModeRegisterDefaults getMode() const
std::tuple< const ArgDescriptor *, const TargetRegisterClass *, LLT > getPreloadedValue(AMDGPUFunctionArgInfo::PreloadedValue Value) const
static LLVM_READONLY const TargetRegisterClass * getSGPRClassForBitWidth(unsigned BitWidth)
bool allowsMisalignedMemoryAccessesImpl(unsigned Size, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *IsFast=nullptr) const
bool shouldEmitFixup(const GlobalValue *GV) const
bool shouldUseLDSConstAddress(const GlobalValue *GV) const
bool shouldEmitPCReloc(const GlobalValue *GV) const
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
void truncate(size_type N)
Like resize, but requires that N is less than size().
void resize(size_type N)
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
int64_t getImm() const
Register getReg() const
Register getStackPointerRegisterToSaveRestore() const
If a physical register, this specifies the register that llvm.savestack/llvm.restorestack should save...
unsigned getPointerSizeInBits(unsigned AS) const
A Use represents the edge between a Value definition and its users.
Definition Use.h:35
LLVM_ABI StringRef getName() const
Return a constant reference to the value's name.
Definition Value.cpp:322
self_iterator getIterator()
Definition ilist_node.h:123
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
@ CONSTANT_ADDRESS_32BIT
Address space for 32-bit constant memory.
@ BUFFER_STRIDED_POINTER
Address space for 192-bit fat buffer pointers with an additional index.
@ REGION_ADDRESS
Address space for region memory. (GDS)
@ LOCAL_ADDRESS
Address space for local memory.
@ CONSTANT_ADDRESS
Address space for constant memory (VTX2).
@ FLAT_ADDRESS
Address space for flat memory.
@ GLOBAL_ADDRESS
Address space for global memory (RAT0, VTX0).
@ BUFFER_FAT_POINTER
Address space for 160-bit buffer fat pointers.
@ PRIVATE_ADDRESS
Address space for private memory.
@ BUFFER_RESOURCE
Address space for 128-bit buffer resources.
int getMIMGOpcode(unsigned BaseOpcode, unsigned MIMGEncoding, unsigned VDataDwords, unsigned VAddrDwords)
bool isFlatGlobalAddrSpace(unsigned AS)
bool isGFX12Plus(const MCSubtargetInfo &STI)
bool isGFX11(const MCSubtargetInfo &STI)
LLVM_READNONE bool isLegalDPALU_DPPControl(const MCSubtargetInfo &ST, unsigned DC)
unsigned getAMDHSACodeObjectVersion(const Module &M)
LLVM_READNONE constexpr bool isKernel(CallingConv::ID CC)
LLVM_READNONE constexpr bool isEntryFunctionCC(CallingConv::ID CC)
LLVM_READNONE constexpr bool isCompute(CallingConv::ID CC)
TargetExtType * isNamedBarrier(const GlobalVariable &GV)
bool isGFX11Plus(const MCSubtargetInfo &STI)
LLVM_READONLY const MIMGBaseOpcodeInfo * getMIMGBaseOpcodeInfo(unsigned BaseOpcode)
std::pair< Register, unsigned > getBaseWithConstantOffset(MachineRegisterInfo &MRI, Register Reg, GISelValueTracking *ValueTracking=nullptr, bool CheckNUW=false)
Returns base register and constant offset.
const ImageDimIntrinsicInfo * getImageDimIntrinsicInfo(unsigned Intr)
unsigned ID
LLVM IR allows to use arbitrary numbers as calling convention identifiers.
Definition CallingConv.h:24
@ MaxID
The highest possible ID. Must be some 2^k - 1.
@ AMDGPU_Gfx
Used for AMD graphics targets.
@ Fast
Attempts to make calls as fast as possible (e.g.
Definition CallingConv.h:41
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
LLVM_ABI LegalityPredicate scalarOrEltWiderThan(unsigned TypeIdx, unsigned Size)
True iff the specified type index is a scalar or a vector with an element type that's wider than the ...
LLVM_ABI LegalityPredicate isScalar(unsigned TypeIdx)
True iff the specified type index is a scalar.
LLVM_ABI LegalityPredicate isPointer(unsigned TypeIdx)
True iff the specified type index is a pointer (with any address space).
LLVM_ABI LegalityPredicate typeInSet(unsigned TypeIdx, std::initializer_list< LLT > TypesInit)
True iff the given type index is one of the specified types.
LLVM_ABI LegalityPredicate smallerThan(unsigned TypeIdx0, unsigned TypeIdx1)
True iff the first type index has a smaller total bit size than second type index.
LLVM_ABI LegalityPredicate largerThan(unsigned TypeIdx0, unsigned TypeIdx1)
True iff the first type index has a larger total bit size than second type index.
LLVM_ABI LegalityPredicate elementTypeIs(unsigned TypeIdx, LLT EltTy)
True if the type index is a vector with element type EltTy.
LLVM_ABI LegalityPredicate sameSize(unsigned TypeIdx0, unsigned TypeIdx1)
True iff the specified type indices are both the same bit size.
LLVM_ABI LegalityPredicate scalarOrEltNarrowerThan(unsigned TypeIdx, unsigned Size)
True iff the specified type index is a scalar or vector with an element type that's narrower than the...
LLVM_ABI LegalityPredicate sizeIs(unsigned TypeIdx, unsigned Size)
True if the total bitwidth of the specified type index is Size bits.
LegalityPredicate typeIsNot(unsigned TypeIdx, LLT Type)
True iff the given type index is not the specified type.
Predicate all(Predicate P0, Predicate P1)
True iff P0 and P1 are true.
LLVM_ABI LegalityPredicate typeIs(unsigned TypeIdx, LLT TypesInit)
True iff the given type index is the specified type.
LLVM_ABI LegalityPredicate scalarNarrowerThan(unsigned TypeIdx, unsigned Size)
True iff the specified type index is a scalar that's narrower than the given size.
LLVM_ABI LegalizeMutation scalarize(unsigned TypeIdx)
Break up the vector type for the given type index into the element type.
LLVM_ABI LegalizeMutation widenScalarOrEltToNextPow2(unsigned TypeIdx, unsigned Min=0)
Widen the scalar type or vector element type for the given type index to the next power of 2.
LLVM_ABI LegalizeMutation changeTo(unsigned TypeIdx, LLT Ty)
Select this specific type for the given type index.
Invariant opcodes: All instruction sets have these as their low opcodes.
initializer< Ty > init(const Ty &Val)
constexpr double inv_pi
constexpr double ln2
constexpr double ln10
constexpr float log2ef
Definition MathExtras.h:51
constexpr double log2e
This is an optimization pass for GlobalISel generic memory operations.
Definition Types.h:26
LLVM_ABI Register getFunctionLiveInPhysReg(MachineFunction &MF, const TargetInstrInfo &TII, MCRegister PhysReg, const TargetRegisterClass &RC, const DebugLoc &DL, LLT RegTy=LLT())
Return a virtual register corresponding to the incoming argument register PhysReg.
Definition Utils.cpp:921
unsigned Log2_32_Ceil(uint32_t Value)
Return the ceil log base 2 of the specified value, 32 if the value is zero.
Definition MathExtras.h:344
@ Offset
Definition DWP.cpp:532
LLVM_ABI Type * getTypeForLLT(LLT Ty, LLVMContext &C)
Get the type back from LLT.
Definition Utils.cpp:2041
LLVM_ABI MachineInstr * getOpcodeDef(unsigned Opcode, Register Reg, const MachineRegisterInfo &MRI)
See if Reg is defined by an single def instruction that is Opcode.
Definition Utils.cpp:653
LLVM_ABI const ConstantFP * getConstantFPVRegVal(Register VReg, const MachineRegisterInfo &MRI)
Definition Utils.cpp:461
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
constexpr bool isInt(int64_t x)
Checks if an integer fits into the given bit width.
Definition MathExtras.h:165
@ Implicit
Not emitted register (e.g. carry, or temporary result).
@ Undef
Value of the register doesn't matter.
LLVM_ABI const llvm::fltSemantics & getFltSemanticForLLT(LLT Ty)
Get the appropriate floating point arithmetic semantic based on the bit size of the given scalar LLT.
std::function< std::pair< unsigned, LLT >(const LegalityQuery &)> LegalizeMutation
int bit_width(T Value)
Returns the number of bits needed to represent Value if Value is nonzero.
Definition bit.h:303
void * PointerTy
constexpr bool isPowerOf2_64(uint64_t Value)
Return true if the argument is a power of two > 0 (64 bit edition.)
Definition MathExtras.h:284
constexpr int popcount(T Value) noexcept
Count the number of set bits in a value.
Definition bit.h:154
uint64_t PowerOf2Ceil(uint64_t A)
Returns the power of two which is greater than or equal to the given value.
Definition MathExtras.h:385
LLVM_ABI std::optional< int64_t > getIConstantVRegSExtVal(Register VReg, const MachineRegisterInfo &MRI)
If VReg is defined by a G_CONSTANT whose value fits in int64_t, returns it.
Definition Utils.cpp:314
int countr_zero(T Val)
Count number of 0's from the least significant bit to the most stopping at the first 1.
Definition bit.h:202
constexpr bool has_single_bit(T Value) noexcept
Definition bit.h:147
std::function< bool(const LegalityQuery &)> LegalityPredicate
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
Definition MathExtras.h:279
bool isa(const From &Val)
isa&lt;X&gt; - Return true if the parameter to the template is an instance of one of the template type arguments.
Definition Casting.h:547
MutableArrayRef(T &OneElt) -> MutableArrayRef< T >
constexpr T divideCeil(U Numerator, V Denominator)
Returns the integer ceil(Numerator / Denominator).
Definition MathExtras.h:394
To bit_cast(const From &from) noexcept
Definition bit.h:90
@ Mul
Product of integers.
@ FMul
Product of floats.
@ Sub
Subtraction of integers.
@ Add
Sum of integers.
uint64_t alignTo(uint64_t Size, Align A)
Returns a multiple of A needed to store Size bytes.
Definition Alignment.h:144
FunctionAddr VTableAddr Next
Definition InstrProf.h:141
DWARFExpression::Operation Op
ArrayRef(const T &OneElt) -> ArrayRef< T >
LLVM_ABI void eraseInstr(MachineInstr &MI, MachineRegisterInfo &MRI, LostDebugLocObserver *LocObserver=nullptr)
Definition Utils.cpp:1725
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:559
LLVM_ABI std::optional< ValueAndVReg > getIConstantVRegValWithLookThrough(Register VReg, const MachineRegisterInfo &MRI, bool LookThroughInstrs=true)
If VReg is defined by a statically evaluable chain of instructions rooted on a G_CONSTANT, returns its APInt value and def register.
Definition Utils.cpp:433
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Definition STLExtras.h:1947
Align commonAlignment(Align A, uint64_t Offset)
Returns the alignment that satisfies both alignments.
Definition Alignment.h:201
unsigned Log2(Align A)
Returns the log2 of the alignment.
Definition Alignment.h:197
T bit_floor(T Value)
Returns the largest integral power of two no greater than Value if Value is nonzero.
Definition bit.h:330
constexpr uint64_t NextPowerOf2(uint64_t A)
Returns the next power of two (in 64-bits) that is strictly greater than A.
Definition MathExtras.h:373
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition BitVector.h:872
#define N
static constexpr uint64_t encode(Fields... Values)
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition Alignment.h:39
constexpr uint64_t value() const
This is a hole in the type system and should not be abused.
Definition Alignment.h:77
MCRegister getRegister() const
static ArgDescriptor createRegister(Register Reg, unsigned Mask=~0u)
DenormalModeKind Input
Denormal treatment kind for floating point instruction inputs in the default floating-point environment.
@ PreserveSign
The sign of a flushed-to-zero number is preserved in the sign of 0.
@ Dynamic
Denormals have unknown treatment.
static constexpr DenormalMode getPreserveSign()
static constexpr DenormalMode getIEEE()
bool isZero() const
Returns true if value is all zero.
Definition KnownBits.h:80
The LegalityQuery object bundles together all the information that's needed to decide whether a given operation is legal or not.
ArrayRef< MemDesc > MMODescrs
Operations which require memory can use this to place requirements on the memory type for each MMO.
ArrayRef< LLT > Types
Matching combinators.
This class contains a discriminated union of information about pointers in memory operands,...
MachinePointerInfo getWithOffset(int64_t O) const
static LLVM_ABI MachinePointerInfo getGOT(MachineFunction &MF)
Return a MachinePointerInfo record that refers to a GOT entry.
DenormalMode FP64FP16Denormals
If this is set, neither input nor output denormals are flushed for both f64 and f16/v2f16 instructions.
bool IEEE
Floating point opcodes that support exception flag gathering quiet and propagate signaling NaN inputs.
DenormalMode FP32Denormals
If this is set, neither input or output denormals are flushed for most f32 instructions.