LLVM 23.0.0git
AMDGPULegalizerInfo.cpp
Go to the documentation of this file.
1//===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8/// \file
9/// This file implements the targeting of the Machinelegalizer class for
10/// AMDGPU.
11/// \todo This should be generated by TableGen.
12//===----------------------------------------------------------------------===//
13
14#include "AMDGPULegalizerInfo.h"
15
16#include "AMDGPU.h"
18#include "AMDGPUInstrInfo.h"
19#include "AMDGPUMemoryUtils.h"
20#include "AMDGPUTargetMachine.h"
22#include "SIInstrInfo.h"
24#include "SIRegisterInfo.h"
26#include "llvm/ADT/ScopeExit.h"
37#include "llvm/IR/IntrinsicsAMDGPU.h"
38#include "llvm/IR/IntrinsicsR600.h"
39
40#define DEBUG_TYPE "amdgpu-legalinfo"
41
42using namespace llvm;
43using namespace LegalizeActions;
44using namespace LegalizeMutations;
45using namespace LegalityPredicates;
46using namespace MIPatternMatch;
47
48// Hack until load/store selection patterns support any tuple of legal types.
50 "amdgpu-global-isel-new-legality",
51 cl::desc("Use GlobalISel desired legality, rather than try to use"
52 "rules compatible with selection patterns"),
53 cl::init(false),
55
56static constexpr unsigned MaxRegisterSize = 1024;
57
58// Round the number of elements to the next power of two elements
60 unsigned NElts = Ty.getNumElements();
61 unsigned Pow2NElts = 1 << Log2_32_Ceil(NElts);
62 return Ty.changeElementCount(ElementCount::getFixed(Pow2NElts));
63}
64
65// Round the number of bits to the next power of two bits
67 unsigned Bits = Ty.getSizeInBits();
68 unsigned Pow2Bits = 1 << Log2_32_Ceil(Bits);
69 return LLT::scalar(Pow2Bits);
70}
71
72/// \returns true if this is an odd sized vector which should widen by adding an
73/// additional element. This is mostly to handle <3 x s16> -> <4 x s16>. This
74/// excludes s1 vectors, which should always be scalarized.
75static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
76 return [=](const LegalityQuery &Query) {
77 const LLT Ty = Query.Types[TypeIdx];
78 if (!Ty.isVector())
79 return false;
80
81 const LLT EltTy = Ty.getElementType();
82 const unsigned EltSize = EltTy.getSizeInBits();
83 return Ty.getNumElements() % 2 != 0 &&
84 EltSize > 1 && EltSize < 32 &&
85 Ty.getSizeInBits() % 32 != 0;
86 };
87}
88
89static LegalityPredicate sizeIsMultipleOf32(unsigned TypeIdx) {
90 return [=](const LegalityQuery &Query) {
91 const LLT Ty = Query.Types[TypeIdx];
92 return Ty.getSizeInBits() % 32 == 0;
93 };
94}
95
96static LegalityPredicate isWideVec16(unsigned TypeIdx) {
97 return [=](const LegalityQuery &Query) {
98 const LLT Ty = Query.Types[TypeIdx];
99 const LLT EltTy = Ty.getScalarType();
100 return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2;
101 };
102}
103
104static LegalizeMutation oneMoreElement(unsigned TypeIdx) {
105 return [=](const LegalityQuery &Query) {
106 const LLT Ty = Query.Types[TypeIdx];
107 const LLT EltTy = Ty.getElementType();
108 return std::pair(TypeIdx,
109 LLT::fixed_vector(Ty.getNumElements() + 1, EltTy));
110 };
111}
112
114 return [=](const LegalityQuery &Query) {
115 const LLT Ty = Query.Types[TypeIdx];
116 const LLT EltTy = Ty.getElementType();
117 unsigned Size = Ty.getSizeInBits();
118 unsigned Pieces = (Size + 63) / 64;
119 unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
120 return std::pair(TypeIdx, LLT::scalarOrVector(
121 ElementCount::getFixed(NewNumElts), EltTy));
122 };
123}
124
125// Increase the number of vector elements to reach the next multiple of 32-bit
126// type.
127static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) {
128 return [=](const LegalityQuery &Query) {
129 const LLT Ty = Query.Types[TypeIdx];
130
131 const LLT EltTy = Ty.getElementType();
132 const int Size = Ty.getSizeInBits();
133 const int EltSize = EltTy.getSizeInBits();
134 const int NextMul32 = (Size + 31) / 32;
135
136 assert(EltSize < 32);
137
138 const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize;
139 return std::pair(TypeIdx, LLT::fixed_vector(NewNumElts, EltTy));
140 };
141}
142
143// Retrieves the scalar type that's the same size as the mem desc
145 return [=](const LegalityQuery &Query) {
146 unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
147 return std::make_pair(TypeIdx, LLT::scalar(MemSize));
148 };
149}
150
151// Increase the number of vector elements to reach the next legal RegClass.
153 return [=](const LegalityQuery &Query) {
154 const LLT Ty = Query.Types[TypeIdx];
155 const unsigned NumElts = Ty.getNumElements();
156 const unsigned EltSize = Ty.getElementType().getSizeInBits();
157 const unsigned MaxNumElts = MaxRegisterSize / EltSize;
158
159 assert(EltSize == 32 || EltSize == 64);
160 assert(Ty.getSizeInBits() < MaxRegisterSize);
161
162 unsigned NewNumElts;
163 // Find the nearest legal RegClass that is larger than the current type.
164 for (NewNumElts = NumElts; NewNumElts < MaxNumElts; ++NewNumElts) {
165 if (SIRegisterInfo::getSGPRClassForBitWidth(NewNumElts * EltSize))
166 break;
167 }
168 return std::pair(TypeIdx,
169 LLT::fixed_vector(NewNumElts, Ty.getElementType()));
170 };
171}
172
174 if (!Ty.isVector())
175 return LLT::scalar(128);
176 const ElementCount NumElems = Ty.getElementCount();
177 return LLT::vector(NumElems, LLT::scalar(128));
178}
179
181 if (!Ty.isVector())
182 return LLT::fixed_vector(4, LLT::scalar(32));
183 const unsigned NumElems = Ty.getElementCount().getFixedValue();
184 return LLT::fixed_vector(NumElems * 4, LLT::scalar(32));
185}
186
188 const unsigned Size = Ty.getSizeInBits();
189
190 if (Size <= 32) {
191 // <2 x s8> -> s16
192 // <4 x s8> -> s32
193 return LLT::scalar(Size);
194 }
195
197}
198
199static LegalizeMutation bitcastToRegisterType(unsigned TypeIdx) {
200 return [=](const LegalityQuery &Query) {
201 const LLT Ty = Query.Types[TypeIdx];
202 return std::pair(TypeIdx, getBitcastRegisterType(Ty));
203 };
204}
205
207 return [=](const LegalityQuery &Query) {
208 const LLT Ty = Query.Types[TypeIdx];
209 unsigned Size = Ty.getSizeInBits();
210 assert(Size % 32 == 0);
211 return std::pair(
213 };
214}
215
216static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) {
217 return [=](const LegalityQuery &Query) {
218 const LLT QueryTy = Query.Types[TypeIdx];
219 return QueryTy.isVector() && QueryTy.getSizeInBits() < Size;
220 };
221}
222
223static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) {
224 return [=](const LegalityQuery &Query) {
225 const LLT QueryTy = Query.Types[TypeIdx];
226 return QueryTy.isVector() && QueryTy.getSizeInBits() > Size;
227 };
228}
229
230static LegalityPredicate numElementsNotEven(unsigned TypeIdx) {
231 return [=](const LegalityQuery &Query) {
232 const LLT QueryTy = Query.Types[TypeIdx];
233 return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0;
234 };
235}
236
237static bool isRegisterSize(const GCNSubtarget &ST, unsigned Size) {
238 return ((ST.useRealTrue16Insts() && Size == 16) || Size % 32 == 0) &&
240}
241
243 const int EltSize = EltTy.getSizeInBits();
244 return EltSize == 16 || EltSize % 32 == 0;
245}
246
247static bool isRegisterVectorType(LLT Ty) {
248 const int EltSize = Ty.getElementType().getSizeInBits();
249 return EltSize == 32 || EltSize == 64 ||
250 (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
251 EltSize == 128 || EltSize == 256;
252}
253
254// TODO: replace all uses of isRegisterType with isRegisterClassType
255static bool isRegisterType(const GCNSubtarget &ST, LLT Ty) {
256 if (!isRegisterSize(ST, Ty.getSizeInBits()))
257 return false;
258
259 if (Ty.isVector())
260 return isRegisterVectorType(Ty);
261
262 return true;
263}
264
265// Any combination of 32 or 64-bit elements up the maximum register size, and
266// multiples of v2s16.
268 unsigned TypeIdx) {
269 return [=, &ST](const LegalityQuery &Query) {
270 return isRegisterType(ST, Query.Types[TypeIdx]);
271 };
272}
273
274// RegisterType that doesn't have a corresponding RegClass.
275// TODO: Once `isRegisterType` is replaced with `isRegisterClassType` this
276// should be removed.
278 unsigned TypeIdx) {
279 return [=, &ST](const LegalityQuery &Query) {
280 LLT Ty = Query.Types[TypeIdx];
281 return isRegisterType(ST, Ty) &&
282 !SIRegisterInfo::getSGPRClassForBitWidth(Ty.getSizeInBits());
283 };
284}
285
286static LegalityPredicate elementTypeIsLegal(unsigned TypeIdx) {
287 return [=](const LegalityQuery &Query) {
288 const LLT QueryTy = Query.Types[TypeIdx];
289 if (!QueryTy.isVector())
290 return false;
291 const LLT EltTy = QueryTy.getElementType();
292 return EltTy == LLT::scalar(16) || EltTy.getSizeInBits() >= 32;
293 };
294}
295
296constexpr LLT S1 = LLT::scalar(1);
297constexpr LLT S8 = LLT::scalar(8);
298constexpr LLT S16 = LLT::scalar(16);
299constexpr LLT S32 = LLT::scalar(32);
300constexpr LLT F32 = LLT::float32();
301constexpr LLT S64 = LLT::scalar(64);
302constexpr LLT F64 = LLT::float64();
303constexpr LLT S96 = LLT::scalar(96);
304constexpr LLT S128 = LLT::scalar(128);
305constexpr LLT S160 = LLT::scalar(160);
306constexpr LLT S192 = LLT::scalar(192);
307constexpr LLT S224 = LLT::scalar(224);
308constexpr LLT S256 = LLT::scalar(256);
309constexpr LLT S512 = LLT::scalar(512);
310constexpr LLT S1024 = LLT::scalar(1024);
312
313constexpr LLT V2S8 = LLT::fixed_vector(2, 8);
314constexpr LLT V2S16 = LLT::fixed_vector(2, 16);
315constexpr LLT V4S16 = LLT::fixed_vector(4, 16);
316constexpr LLT V6S16 = LLT::fixed_vector(6, 16);
317constexpr LLT V8S16 = LLT::fixed_vector(8, 16);
318constexpr LLT V10S16 = LLT::fixed_vector(10, 16);
319constexpr LLT V12S16 = LLT::fixed_vector(12, 16);
320constexpr LLT V16S16 = LLT::fixed_vector(16, 16);
321
323constexpr LLT V2BF16 = V2F16; // FIXME
324
325constexpr LLT V2S32 = LLT::fixed_vector(2, 32);
326constexpr LLT V3S32 = LLT::fixed_vector(3, 32);
327constexpr LLT V4S32 = LLT::fixed_vector(4, 32);
328constexpr LLT V5S32 = LLT::fixed_vector(5, 32);
329constexpr LLT V6S32 = LLT::fixed_vector(6, 32);
330constexpr LLT V7S32 = LLT::fixed_vector(7, 32);
331constexpr LLT V8S32 = LLT::fixed_vector(8, 32);
332constexpr LLT V9S32 = LLT::fixed_vector(9, 32);
333constexpr LLT V10S32 = LLT::fixed_vector(10, 32);
334constexpr LLT V11S32 = LLT::fixed_vector(11, 32);
335constexpr LLT V12S32 = LLT::fixed_vector(12, 32);
336constexpr LLT V16S32 = LLT::fixed_vector(16, 32);
337constexpr LLT V32S32 = LLT::fixed_vector(32, 32);
338
339constexpr LLT V2S64 = LLT::fixed_vector(2, 64);
340constexpr LLT V3S64 = LLT::fixed_vector(3, 64);
341constexpr LLT V4S64 = LLT::fixed_vector(4, 64);
342constexpr LLT V5S64 = LLT::fixed_vector(5, 64);
343constexpr LLT V6S64 = LLT::fixed_vector(6, 64);
344constexpr LLT V7S64 = LLT::fixed_vector(7, 64);
345constexpr LLT V8S64 = LLT::fixed_vector(8, 64);
346constexpr LLT V16S64 = LLT::fixed_vector(16, 64);
347
348constexpr LLT V2S128 = LLT::fixed_vector(2, 128);
349constexpr LLT V4S128 = LLT::fixed_vector(4, 128);
350
351constexpr std::initializer_list<LLT> AllScalarTypes = {
353
354constexpr std::initializer_list<LLT> AllS16Vectors{
356
357constexpr std::initializer_list<LLT> AllS32Vectors = {
360
361constexpr std::initializer_list<LLT> AllS64Vectors = {
363
369
370// Checks whether a type is in the list of legal register types.
371static bool isRegisterClassType(const GCNSubtarget &ST, LLT Ty) {
372 if (Ty.isPointerOrPointerVector())
373 Ty = Ty.changeElementType(LLT::scalar(Ty.getScalarSizeInBits()));
374
377 (ST.useRealTrue16Insts() && Ty == S16) ||
379}
380
382 unsigned TypeIdx) {
383 return [&ST, TypeIdx](const LegalityQuery &Query) {
384 return isRegisterClassType(ST, Query.Types[TypeIdx]);
385 };
386}
387
388// If we have a truncating store or an extending load with a data size larger
389// than 32-bits, we need to reduce to a 32-bit type.
391 return [=](const LegalityQuery &Query) {
392 const LLT Ty = Query.Types[TypeIdx];
393 return !Ty.isVector() && Ty.getSizeInBits() > 32 &&
394 Query.MMODescrs[0].MemoryTy.getSizeInBits() < Ty.getSizeInBits();
395 };
396}
397
398// If we have a truncating store or an extending load with a data size larger
399// than 32-bits and mem location is a power of 2
401 return [=](const LegalityQuery &Query) {
402 unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
403 return isWideScalarExtLoadTruncStore(TypeIdx)(Query) &&
404 isPowerOf2_64(MemSize);
405 };
406}
407
408// TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we
409// handle some operations by just promoting the register during
410// selection. There are also d16 loads on GFX9+ which preserve the high bits.
411static unsigned maxSizeForAddrSpace(const GCNSubtarget &ST, unsigned AS,
412 bool IsLoad, bool IsAtomic) {
413 switch (AS) {
415 // FIXME: Private element size.
416 return ST.hasFlatScratchEnabled() ? 128 : 32;
418 return ST.useDS128() ? 128 : 64;
423 // Treat constant and global as identical. SMRD loads are sometimes usable for
424 // global loads (ideally constant address space should be eliminated)
425 // depending on the context. Legality cannot be context dependent, but
426 // RegBankSelect can split the load as necessary depending on the pointer
427 // register bank/uniformity and if the memory is invariant or not written in a
428 // kernel.
429 return IsLoad ? 512 : 128;
430 default:
431 // FIXME: Flat addresses may contextually need to be split to 32-bit parts
432 // if they may alias scratch depending on the subtarget. This needs to be
433 // moved to custom handling to use addressMayBeAccessedAsPrivate
434 return ST.hasMultiDwordFlatScratchAddressing() || IsAtomic ? 128 : 32;
435 }
436}
437
438static bool isLoadStoreSizeLegal(const GCNSubtarget &ST,
439 const LegalityQuery &Query) {
440 const LLT Ty = Query.Types[0];
441
442 // Handle G_LOAD, G_ZEXTLOAD, G_SEXTLOAD
443 const bool IsLoad = Query.Opcode != AMDGPU::G_STORE;
444
445 unsigned RegSize = Ty.getSizeInBits();
446 uint64_t MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
447 uint64_t AlignBits = Query.MMODescrs[0].AlignInBits;
448 unsigned AS = Query.Types[1].getAddressSpace();
449
450 // All of these need to be custom lowered to cast the pointer operand.
452 return false;
453
454 // Do not handle extending vector loads.
455 if (Ty.isVector() && MemSize != RegSize)
456 return false;
457
458 // TODO: We should be able to widen loads if the alignment is high enough, but
459 // we also need to modify the memory access size.
460#if 0
461 // Accept widening loads based on alignment.
462 if (IsLoad && MemSize < Size)
463 MemSize = std::max(MemSize, Align);
464#endif
465
466 // Only 1-byte and 2-byte to 32-bit extloads are valid.
467 if (MemSize != RegSize && RegSize != 32)
468 return false;
469
470 if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad,
471 Query.MMODescrs[0].Ordering !=
473 return false;
474
475 switch (MemSize) {
476 case 8:
477 case 16:
478 case 32:
479 case 64:
480 case 128:
481 break;
482 case 96:
483 if (!ST.hasDwordx3LoadStores())
484 return false;
485 break;
486 case 256:
487 case 512:
488 // These may contextually need to be broken down.
489 break;
490 default:
491 return false;
492 }
493
494 assert(RegSize >= MemSize);
495
496 if (AlignBits < MemSize) {
497 const SITargetLowering *TLI = ST.getTargetLowering();
498 if (!TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS,
499 Align(AlignBits / 8)))
500 return false;
501 }
502
503 return true;
504}
505
506// The newer buffer intrinsic forms take their resource arguments as
507// pointers in address space 8, aka s128 values. However, in order to not break
508// SelectionDAG, the underlying operations have to continue to take v4i32
509// arguments. Therefore, we convert resource pointers - or vectors of them
510// to integer values here.
511static bool hasBufferRsrcWorkaround(const LLT Ty) {
512 if (Ty.isPointer() && Ty.getAddressSpace() == AMDGPUAS::BUFFER_RESOURCE)
513 return true;
514 if (Ty.isVector()) {
515 const LLT ElemTy = Ty.getElementType();
516 return hasBufferRsrcWorkaround(ElemTy);
517 }
518 return false;
519}
520
521// The current selector can't handle <6 x s16>, <8 x s16>, s96, s128 etc, so
522// workaround this. Eventually it should ignore the type for loads and only care
523// about the size. Return true in cases where we will workaround this for now by
524// bitcasting.
525static bool loadStoreBitcastWorkaround(const LLT Ty) {
527 return false;
528
529 const unsigned Size = Ty.getSizeInBits();
530 if (Ty.isPointerVector())
531 return true;
532 if (Size <= 64)
533 return false;
534 // Address space 8 pointers get their own workaround.
536 return false;
537 if (!Ty.isVector())
538 return true;
539
540 unsigned EltSize = Ty.getScalarSizeInBits();
541 return EltSize != 32 && EltSize != 64;
542}
543
544static bool isLoadStoreLegal(const GCNSubtarget &ST, const LegalityQuery &Query) {
545 const LLT Ty = Query.Types[0];
546 return isRegisterType(ST, Ty) && isLoadStoreSizeLegal(ST, Query) &&
548}
549
550/// Return true if a load or store of the type should be lowered with a bitcast
551/// to a different type.
552static bool shouldBitcastLoadStoreType(const GCNSubtarget &ST, const LLT Ty,
553 const LLT MemTy) {
554 const unsigned MemSizeInBits = MemTy.getSizeInBits();
555 const unsigned Size = Ty.getSizeInBits();
556 if (Size != MemSizeInBits)
557 return Size <= 32 && Ty.isVector();
558
560 return true;
561
562 // Don't try to handle bitcasting vector ext loads for now.
563 return Ty.isVector() && (!MemTy.isVector() || MemTy == Ty) &&
564 (Size <= 32 || isRegisterSize(ST, Size)) &&
565 !isRegisterVectorElementType(Ty.getElementType());
566}
567
568/// Return true if we should legalize a load by widening an odd sized memory
569/// access up to the alignment. Note this case when the memory access itself
570/// changes, not the size of the result register.
571static bool shouldWidenLoad(const GCNSubtarget &ST, LLT MemoryTy,
572 uint64_t AlignInBits, unsigned AddrSpace,
573 unsigned Opcode) {
574 unsigned SizeInBits = MemoryTy.getSizeInBits();
575 // We don't want to widen cases that are naturally legal.
576 if (isPowerOf2_32(SizeInBits))
577 return false;
578
579 // If we have 96-bit memory operations, we shouldn't touch them. Note we may
580 // end up widening these for a scalar load during RegBankSelect, if we don't
581 // have 96-bit scalar loads.
582 if (SizeInBits == 96 && ST.hasDwordx3LoadStores())
583 return false;
584
585 if (SizeInBits >= maxSizeForAddrSpace(ST, AddrSpace, Opcode, false))
586 return false;
587
588 // A load is known dereferenceable up to the alignment, so it's legal to widen
589 // to it.
590 //
591 // TODO: Could check dereferenceable for less aligned cases.
592 unsigned RoundedSize = NextPowerOf2(SizeInBits);
593 if (AlignInBits < RoundedSize)
594 return false;
595
596 // Do not widen if it would introduce a slow unaligned load.
597 const SITargetLowering *TLI = ST.getTargetLowering();
598 unsigned Fast = 0;
600 RoundedSize, AddrSpace, Align(AlignInBits / 8),
602 Fast;
603}
604
605static bool shouldWidenLoad(const GCNSubtarget &ST, const LegalityQuery &Query,
606 unsigned Opcode) {
607 if (Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic)
608 return false;
609
610 return shouldWidenLoad(ST, Query.MMODescrs[0].MemoryTy,
611 Query.MMODescrs[0].AlignInBits,
612 Query.Types[1].getAddressSpace(), Opcode);
613}
614
615/// Mutates IR (typicaly a load instruction) to use a <4 x s32> as the initial
616/// type of the operand `idx` and then to transform it to a `p8` via bitcasts
617/// and inttoptr. In addition, handle vectors of p8. Returns the new type.
619 MachineRegisterInfo &MRI, unsigned Idx) {
620 MachineOperand &MO = MI.getOperand(Idx);
621
622 const LLT PointerTy = MRI.getType(MO.getReg());
623
624 // Paranoidly prevent us from doing this multiple times.
626 return PointerTy;
627
628 const LLT ScalarTy = getBufferRsrcScalarType(PointerTy);
629 const LLT VectorTy = getBufferRsrcRegisterType(PointerTy);
630 if (!PointerTy.isVector()) {
631 // Happy path: (4 x s32) -> (s32, s32, s32, s32) -> (p8)
632 const unsigned NumParts = PointerTy.getSizeInBits() / 32;
633 const LLT S32 = LLT::scalar(32);
634
635 Register VectorReg = MRI.createGenericVirtualRegister(VectorTy);
636 std::array<Register, 4> VectorElems;
637 B.setInsertPt(B.getMBB(), ++B.getInsertPt());
638 for (unsigned I = 0; I < NumParts; ++I)
639 VectorElems[I] =
640 B.buildExtractVectorElementConstant(S32, VectorReg, I).getReg(0);
641 B.buildMergeValues(MO, VectorElems);
642 MO.setReg(VectorReg);
643 return VectorTy;
644 }
645 Register BitcastReg = MRI.createGenericVirtualRegister(VectorTy);
646 B.setInsertPt(B.getMBB(), ++B.getInsertPt());
647 auto Scalar = B.buildBitcast(ScalarTy, BitcastReg);
648 B.buildIntToPtr(MO, Scalar);
649 MO.setReg(BitcastReg);
650
651 return VectorTy;
652}
653
654/// Cast a buffer resource (an address space 8 pointer) into a 4xi32, which is
655/// the form in which the value must be in order to be passed to the low-level
656/// representations used for MUBUF/MTBUF intrinsics. This is a hack, which is
657/// needed in order to account for the fact that we can't define a register
658/// class for s128 without breaking SelectionDAG.
660 MachineRegisterInfo &MRI = *B.getMRI();
661 const LLT PointerTy = MRI.getType(Pointer);
662 const LLT ScalarTy = getBufferRsrcScalarType(PointerTy);
663 const LLT VectorTy = getBufferRsrcRegisterType(PointerTy);
664
665 if (!PointerTy.isVector()) {
666 // Special case: p8 -> (s32, s32, s32, s32) -> (4xs32)
667 SmallVector<Register, 4> PointerParts;
668 const unsigned NumParts = PointerTy.getSizeInBits() / 32;
669 auto Unmerged = B.buildUnmerge(LLT::scalar(32), Pointer);
670 for (unsigned I = 0; I < NumParts; ++I)
671 PointerParts.push_back(Unmerged.getReg(I));
672 return B.buildBuildVector(VectorTy, PointerParts).getReg(0);
673 }
674 Register Scalar = B.buildPtrToInt(ScalarTy, Pointer).getReg(0);
675 return B.buildBitcast(VectorTy, Scalar).getReg(0);
676}
677
679 unsigned Idx) {
680 MachineOperand &MO = MI.getOperand(Idx);
681
682 const LLT PointerTy = B.getMRI()->getType(MO.getReg());
683 // Paranoidly prevent us from doing this multiple times.
685 return;
687}
688
690 const GCNTargetMachine &TM)
691 : ST(ST_) {
692 using namespace TargetOpcode;
693
694 auto GetAddrSpacePtr = [&TM](unsigned AS) {
695 return LLT::pointer(AS, TM.getPointerSizeInBits(AS));
696 };
697
698 const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS);
699 const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS);
700 const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT);
701 const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS);
702 const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS);
703 const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS);
704 const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS);
705 const LLT BufferFatPtr = GetAddrSpacePtr(AMDGPUAS::BUFFER_FAT_POINTER);
706 const LLT RsrcPtr = GetAddrSpacePtr(AMDGPUAS::BUFFER_RESOURCE);
707 const LLT BufferStridedPtr =
708 GetAddrSpacePtr(AMDGPUAS::BUFFER_STRIDED_POINTER);
709
710 const LLT CodePtr = FlatPtr;
711
712 const std::initializer_list<LLT> AddrSpaces64 = {
713 GlobalPtr, ConstantPtr, FlatPtr
714 };
715
716 const std::initializer_list<LLT> AddrSpaces32 = {
717 LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr
718 };
719
720 const std::initializer_list<LLT> AddrSpaces128 = {RsrcPtr};
721
722 const std::initializer_list<LLT> FPTypesBase = {
723 S32, S64
724 };
725
726 const std::initializer_list<LLT> FPTypes16 = {
727 S32, S64, S16
728 };
729
730 const std::initializer_list<LLT> FPTypesPK16 = {
731 S32, S64, S16, V2S16
732 };
733
734 const LLT MinScalarFPTy = ST.has16BitInsts() ? S16 : S32;
735
736 // s1 for VCC branches, s32 for SCC branches.
738
739 // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more
740 // elements for v3s16
743 .legalFor(AllS32Vectors)
745 .legalFor(AddrSpaces64)
746 .legalFor(AddrSpaces32)
747 .legalFor(AddrSpaces128)
748 .legalIf(isPointer(0))
749 .clampScalar(0, S16, S256)
751 .clampMaxNumElements(0, S32, 16)
753 .scalarize(0);
754
755 if (ST.hasVOP3PInsts() && ST.hasAddNoCarryInsts() && ST.hasIntClamp()) {
756 // Full set of gfx9 features.
757 if (ST.hasScalarAddSub64()) {
758 getActionDefinitionsBuilder({G_ADD, G_SUB})
759 .legalFor({S64, S32, S16, V2S16})
760 .clampMaxNumElementsStrict(0, S16, 2)
761 .scalarize(0)
762 .minScalar(0, S16)
764 .maxScalar(0, S32);
765 } else {
766 getActionDefinitionsBuilder({G_ADD, G_SUB})
767 .legalFor({S32, S16, V2S16})
768 .clampMaxNumElementsStrict(0, S16, 2)
769 .scalarize(0)
770 .minScalar(0, S16)
772 .maxScalar(0, S32);
773 }
774
775 if (ST.hasScalarSMulU64()) {
777 .legalFor({S64, S32, S16, V2S16})
778 .clampMaxNumElementsStrict(0, S16, 2)
779 .scalarize(0)
780 .minScalar(0, S16)
782 .custom();
783 } else {
785 .legalFor({S32, S16, V2S16})
786 .clampMaxNumElementsStrict(0, S16, 2)
787 .scalarize(0)
788 .minScalar(0, S16)
790 .custom();
791 }
792 assert(ST.hasMad64_32());
793
794 getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT, G_SADDSAT, G_SSUBSAT})
795 .legalFor({S32, S16, V2S16}) // Clamp modifier
796 .minScalarOrElt(0, S16)
798 .scalarize(0)
800 .lower();
801 } else if (ST.has16BitInsts()) {
802 getActionDefinitionsBuilder({G_ADD, G_SUB})
803 .legalFor({S32, S16})
804 .minScalar(0, S16)
806 .maxScalar(0, S32)
807 .scalarize(0);
808
810 .legalFor({S32, S16})
811 .scalarize(0)
812 .minScalar(0, S16)
814 .custom();
815 assert(ST.hasMad64_32());
816
817 // Technically the saturating operations require clamp bit support, but this
818 // was introduced at the same time as 16-bit operations.
819 getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
820 .legalFor({S32, S16}) // Clamp modifier
821 .minScalar(0, S16)
822 .scalarize(0)
824 .lower();
825
826 // We're just lowering this, but it helps get a better result to try to
827 // coerce to the desired type first.
828 getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
829 .minScalar(0, S16)
830 .scalarize(0)
831 .lower();
832 } else {
833 getActionDefinitionsBuilder({G_ADD, G_SUB})
834 .legalFor({S32})
835 .widenScalarToNextMultipleOf(0, 32)
836 .clampScalar(0, S32, S32)
837 .scalarize(0);
838
839 auto &Mul = getActionDefinitionsBuilder(G_MUL)
840 .legalFor({S32})
841 .scalarize(0)
842 .minScalar(0, S32)
844
845 if (ST.hasMad64_32())
846 Mul.custom();
847 else
848 Mul.maxScalar(0, S32);
849
850 if (ST.hasIntClamp()) {
851 getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
852 .legalFor({S32}) // Clamp modifier.
853 .scalarize(0)
855 .lower();
856 } else {
857 // Clamp bit support was added in VI, along with 16-bit operations.
858 getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
859 .minScalar(0, S32)
860 .scalarize(0)
861 .lower();
862 }
863
864 // FIXME: DAG expansion gets better results. The widening uses the smaller
865 // range values and goes for the min/max lowering directly.
866 getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
867 .minScalar(0, S32)
868 .scalarize(0)
869 .lower();
870 }
871
873 {G_SDIV, G_UDIV, G_SREM, G_UREM, G_SDIVREM, G_UDIVREM})
874 .customFor({S32, S64})
875 .clampScalar(0, S32, S64)
877 .scalarize(0);
878
879 auto &Mulh = getActionDefinitionsBuilder({G_UMULH, G_SMULH})
880 .legalFor({S32})
881 .maxScalar(0, S32);
882
883 if (ST.hasVOP3PInsts()) {
884 Mulh
885 .clampMaxNumElements(0, S8, 2)
886 .lowerFor({V2S8});
887 }
888
889 Mulh
890 .scalarize(0)
891 .lower();
892
893 // Report legal for any types we can handle anywhere. For the cases only legal
894 // on the SALU, RegBankSelect will be able to re-legalize.
895 getActionDefinitionsBuilder({G_AND, G_OR, G_XOR})
896 .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16})
897 .clampScalar(0, S32, S64)
903 .scalarize(0);
904
906 {G_UADDO, G_USUBO, G_UADDE, G_SADDE, G_USUBE, G_SSUBE})
907 .legalFor({{S32, S1}, {S32, S32}})
908 .clampScalar(0, S32, S32)
909 .scalarize(0);
910
912 // Don't worry about the size constraint.
914 .lower();
915
917 .legalFor({S1, S32, S64, S16, GlobalPtr,
918 LocalPtr, ConstantPtr, PrivatePtr, FlatPtr })
919 .legalIf(isPointer(0))
920 .clampScalar(0, S32, S64)
922
923 getActionDefinitionsBuilder(G_FCONSTANT)
924 .legalFor({S32, S64, S16})
925 .clampScalar(0, S16, S64);
926
927 getActionDefinitionsBuilder({G_IMPLICIT_DEF, G_FREEZE})
928 .legalIf(isRegisterClassType(ST, 0))
929 // s1 and s16 are special cases because they have legal operations on
930 // them, but don't really occupy registers in the normal way.
931 .legalFor({S1, S16})
932 .clampNumElements(0, V16S32, V32S32)
936 .clampMaxNumElements(0, S32, 16);
937
938 getActionDefinitionsBuilder(G_FRAME_INDEX).legalFor({PrivatePtr});
939
940 // If the amount is divergent, we have to do a wave reduction to get the
941 // maximum value, so this is expanded during RegBankSelect.
942 getActionDefinitionsBuilder(G_DYN_STACKALLOC)
943 .legalFor({{PrivatePtr, S32}});
944
945 getActionDefinitionsBuilder(G_STACKSAVE)
946 .customFor({PrivatePtr});
947 getActionDefinitionsBuilder(G_STACKRESTORE)
948 .legalFor({PrivatePtr});
949
950 getActionDefinitionsBuilder({G_GET_FPENV, G_SET_FPENV}).customFor({S64});
951
952 getActionDefinitionsBuilder(G_GLOBAL_VALUE)
953 .customIf(typeIsNot(0, PrivatePtr));
954
955 getActionDefinitionsBuilder(G_BLOCK_ADDR).legalFor({CodePtr});
956
957 auto &FPOpActions = getActionDefinitionsBuilder(
958 { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE,
959 G_STRICT_FADD, G_STRICT_FMUL, G_STRICT_FMA})
960 .legalFor({S32, S64});
961 auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS})
962 .customFor({S32, S64});
963 auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV)
964 .customFor({S32, S64});
965
966 if (ST.has16BitInsts()) {
967 if (ST.hasVOP3PInsts())
968 FPOpActions.legalFor({S16, V2S16});
969 else
970 FPOpActions.legalFor({S16});
971
972 TrigActions.customFor({S16});
973 FDIVActions.customFor({S16});
974 }
975
976 if (ST.hasPackedFP32Ops()) {
977 FPOpActions.legalFor({V2S32});
978 FPOpActions.clampMaxNumElementsStrict(0, S32, 2);
979 }
980
981 auto &MinNumMaxNumIeee =
982 getActionDefinitionsBuilder({G_FMINNUM_IEEE, G_FMAXNUM_IEEE});
983
984 if (ST.hasVOP3PInsts()) {
985 MinNumMaxNumIeee.legalFor(FPTypesPK16)
986 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
987 .clampMaxNumElements(0, S16, 2)
988 .clampScalar(0, S16, S64)
989 .scalarize(0);
990 } else if (ST.has16BitInsts()) {
991 MinNumMaxNumIeee.legalFor(FPTypes16).clampScalar(0, S16, S64).scalarize(0);
992 } else {
993 MinNumMaxNumIeee.legalFor(FPTypesBase)
994 .clampScalar(0, S32, S64)
995 .scalarize(0);
996 }
997
998 auto &MinNumMaxNum = getActionDefinitionsBuilder(
999 {G_FMINNUM, G_FMAXNUM, G_FMINIMUMNUM, G_FMAXIMUMNUM});
1000
1001 if (ST.hasVOP3PInsts()) {
1002 MinNumMaxNum.customFor(FPTypesPK16)
1003 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
1004 .clampMaxNumElements(0, S16, 2)
1005 .clampScalar(0, S16, S64)
1006 .scalarize(0);
1007 } else if (ST.has16BitInsts()) {
1008 MinNumMaxNum.customFor(FPTypes16)
1009 .clampScalar(0, S16, S64)
1010 .scalarize(0);
1011 } else {
1012 MinNumMaxNum.customFor(FPTypesBase)
1013 .clampScalar(0, S32, S64)
1014 .scalarize(0);
1015 }
1016
1017 if (ST.hasVOP3PInsts())
1018 FPOpActions.clampMaxNumElementsStrict(0, S16, 2);
1019
1020 FPOpActions
1021 .scalarize(0)
1022 .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
1023
1024 TrigActions
1025 .scalarize(0)
1026 .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
1027
1028 FDIVActions
1029 .scalarize(0)
1030 .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
1031
1032 getActionDefinitionsBuilder({G_FNEG, G_FABS})
1033 .legalFor(FPTypesPK16)
1035 .scalarize(0)
1036 .clampScalar(0, S16, S64);
1037
1038 if (ST.has16BitInsts()) {
1040 .legalFor({S16})
1041 .customFor({S32, S64})
1042 .scalarize(0)
1043 .unsupported();
1045 .legalFor({S32, S64, S16})
1046 .scalarize(0)
1047 .clampScalar(0, S16, S64);
1048
1049 getActionDefinitionsBuilder({G_FLDEXP, G_STRICT_FLDEXP})
1050 .legalFor({{S32, S32}, {S64, S32}, {S16, S16}})
1051 .scalarize(0)
1052 .maxScalarIf(typeIs(0, S16), 1, S16)
1053 .clampScalar(1, S32, S32)
1054 .lower();
1055
1057 .customFor({{S32, S32}, {S64, S32}, {S16, S16}, {S16, S32}})
1058 .scalarize(0)
1059 .lower();
1060
1062 .lowerFor({S16, S32, S64})
1063 .scalarize(0)
1064 .lower();
1065 } else {
1067 .customFor({S32, S64, S16})
1068 .scalarize(0)
1069 .unsupported();
1070
1071
1072 if (ST.hasFractBug()) {
1074 .customFor({S64})
1075 .legalFor({S32, S64})
1076 .scalarize(0)
1077 .clampScalar(0, S32, S64);
1078 } else {
1080 .legalFor({S32, S64})
1081 .scalarize(0)
1082 .clampScalar(0, S32, S64);
1083 }
1084
1085 getActionDefinitionsBuilder({G_FLDEXP, G_STRICT_FLDEXP})
1086 .legalFor({{S32, S32}, {S64, S32}})
1087 .scalarize(0)
1088 .clampScalar(0, S32, S64)
1089 .clampScalar(1, S32, S32)
1090 .lower();
1091
1093 .customFor({{S32, S32}, {S64, S32}})
1094 .scalarize(0)
1095 .minScalar(0, S32)
1096 .clampScalar(1, S32, S32)
1097 .lower();
1098
1100 .lowerFor({S32, S64})
1101 .scalarize(0)
1102 .lower();
1103 }
1104
1105 auto &FPTruncActions = getActionDefinitionsBuilder(G_FPTRUNC);
1106 if (ST.hasCvtPkF16F32Inst()) {
1107 FPTruncActions.legalFor({{S32, S64}, {S16, S32}, {V2S16, V2S32}})
1108 .clampMaxNumElements(0, S16, 2);
1109 } else {
1110 FPTruncActions.legalFor({{S32, S64}, {S16, S32}});
1111 }
1112 FPTruncActions.scalarize(0).lower();
1113
1115 .legalFor({{S64, S32}, {S32, S16}})
1116 .narrowScalarFor({{S64, S16}}, changeTo(0, S32))
1117 .scalarize(0);
1118
1119 auto &FSubActions = getActionDefinitionsBuilder({G_FSUB, G_STRICT_FSUB});
1120 if (ST.has16BitInsts()) {
1121 FSubActions
1122 // Use actual fsub instruction
1123 .legalFor({S32, S16})
1124 // Must use fadd + fneg
1125 .lowerFor({S64, V2S16});
1126 } else {
1127 FSubActions
1128 // Use actual fsub instruction
1129 .legalFor({S32})
1130 // Must use fadd + fneg
1131 .lowerFor({S64, S16, V2S16});
1132 }
1133
1134 FSubActions
1135 .scalarize(0)
1136 .clampScalar(0, S32, S64);
1137
1138 // Whether this is legal depends on the floating point mode for the function.
1139 auto &FMad = getActionDefinitionsBuilder(G_FMAD);
1140 if (ST.hasMadF16() && ST.hasMadMacF32Insts())
1141 FMad.customFor({S32, S16});
1142 else if (ST.hasMadMacF32Insts())
1143 FMad.customFor({S32});
1144 else if (ST.hasMadF16())
1145 FMad.customFor({S16});
1146 FMad.scalarize(0)
1147 .lower();
1148
1149 auto &FRem = getActionDefinitionsBuilder(G_FREM);
1150 if (ST.has16BitInsts()) {
1151 FRem.customFor({S16, S32, S64});
1152 } else {
1153 FRem.minScalar(0, S32)
1154 .customFor({S32, S64});
1155 }
1156 FRem.scalarize(0);
1157
1158 // TODO: Do we need to clamp maximum bitwidth?
1160 .legalIf(isScalar(0))
1161 .legalFor({{V2S16, V2S32}})
1162 .clampMaxNumElements(0, S16, 2)
1163 // Avoid scalarizing in cases that should be truly illegal. In unresolvable
1164 // situations (like an invalid implicit use), we don't want to infinite loop
1165 // in the legalizer.
1167 .alwaysLegal();
1168
1169 getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT})
1170 .legalFor({{S64, S32}, {S32, S16}, {S64, S16},
1171 {S32, S1}, {S64, S1}, {S16, S1}})
1172 .scalarize(0)
1173 .clampScalar(0, S32, S64)
1174 .widenScalarToNextPow2(1, 32);
1175
1176 // TODO: Split s1->s64 during regbankselect for VALU.
1177 auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
1178 .legalFor({{S32, S32}, {S64, S32}, {S16, S32}})
1179 .lowerIf(typeIs(1, S1))
1180 .customFor({{S32, S64}, {S64, S64}});
1181 if (ST.has16BitInsts())
1182 IToFP.legalFor({{S16, S16}});
1183 IToFP.clampScalar(1, S32, S64)
1184 .minScalar(0, S32)
1185 .scalarize(0)
1187
1188 auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
1189 .legalFor({{S32, S32}, {S32, S64}, {S32, S16}})
1190 .customFor({{S64, S32}, {S64, S64}})
1191 .narrowScalarFor({{S64, S16}}, changeTo(0, S32));
1192 if (ST.has16BitInsts())
1193 FPToI.legalFor({{S16, S16}});
1194 else
1195 FPToI.minScalar(1, S32);
1196
1197 FPToI.minScalar(0, S32)
1198 .widenScalarToNextPow2(0, 32)
1199 .scalarize(0)
1200 .lower();
1201
1202 // clang-format off
1203 auto &FPToISat = getActionDefinitionsBuilder({G_FPTOSI_SAT, G_FPTOUI_SAT})
1204 .legalFor({{S32, S32}, {S32, S64}})
1205 .narrowScalarFor({{S64, S16}}, changeTo(0, S32));
1206 if (ST.has16BitInsts())
1207 FPToISat.legalFor({{S16, S16}});
1208
1209 FPToISat.minScalar(1, S32);
1210 FPToISat.minScalar(0, S32)
1211 .widenScalarToNextPow2(0, 32)
1212 .scalarize(0)
1213 .lower();
1214 // clang-format on
1215
1216 getActionDefinitionsBuilder({G_LROUND, G_LLROUND})
1217 .clampScalar(0, S16, S64)
1218 .scalarize(0)
1219 .lower();
1220
1221 getActionDefinitionsBuilder(G_INTRINSIC_FPTRUNC_ROUND)
1222 .legalFor({S16, S32})
1223 .scalarize(0)
1224 .lower();
1225
1226 // Lower G_FNEARBYINT and G_FRINT into G_INTRINSIC_ROUNDEVEN
1227 getActionDefinitionsBuilder({G_INTRINSIC_ROUND, G_FRINT, G_FNEARBYINT})
1228 .scalarize(0)
1229 .lower();
1230
1231 getActionDefinitionsBuilder({G_INTRINSIC_LRINT, G_INTRINSIC_LLRINT})
1232 .clampScalar(0, S16, S64)
1233 .scalarize(0)
1234 .lower();
1235
1236 if (ST.has16BitInsts()) {
1238 {G_INTRINSIC_TRUNC, G_FCEIL, G_INTRINSIC_ROUNDEVEN})
1239 .legalFor({S16, S32, S64})
1240 .clampScalar(0, S16, S64)
1241 .scalarize(0);
1242 } else if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
1244 {G_INTRINSIC_TRUNC, G_FCEIL, G_INTRINSIC_ROUNDEVEN})
1245 .legalFor({S32, S64})
1246 .clampScalar(0, S32, S64)
1247 .scalarize(0);
1248 } else {
1250 {G_INTRINSIC_TRUNC, G_FCEIL, G_INTRINSIC_ROUNDEVEN})
1251 .legalFor({S32})
1252 .customFor({S64})
1253 .clampScalar(0, S32, S64)
1254 .scalarize(0);
1255 }
1256
1258 .unsupportedFor({BufferFatPtr, BufferStridedPtr, RsrcPtr})
1259 .legalIf(all(isPointer(0), sameSize(0, 1)))
1260 .scalarize(0)
1261 .scalarSameSizeAs(1, 0);
1262
1264 .legalIf(all(sameSize(0, 1), typeInSet(1, {S64, S32})))
1265 .scalarSameSizeAs(1, 0)
1266 .scalarize(0);
1267
1268 auto &CmpBuilder =
1270 // The compare output type differs based on the register bank of the output,
1271 // so make both s1 and s32 legal.
1272 //
1273 // Scalar compares producing output in scc will be promoted to s32, as that
1274 // is the allocatable register type that will be needed for the copy from
1275 // scc. This will be promoted during RegBankSelect, and we assume something
1276 // before that won't try to use s32 result types.
1277 //
1278 // Vector compares producing an output in vcc/SGPR will use s1 in VCC reg
1279 // bank.
1281 {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
1282 .legalForCartesianProduct(
1283 {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr});
1284 if (ST.has16BitInsts()) {
1285 CmpBuilder.legalFor({{S1, S16}});
1286 }
1287
1288 CmpBuilder
1290 .clampScalar(1, S32, S64)
1291 .scalarize(0)
1292 .legalIf(all(typeInSet(0, {S1, S32}), isPointer(1)));
1293
1294 auto &FCmpBuilder =
1296 {S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase);
1297
1298 if (ST.hasSALUFloatInsts())
1299 FCmpBuilder.legalForCartesianProduct({S32}, {S16, S32});
1300
1301 FCmpBuilder
1303 .clampScalar(1, S32, S64)
1304 .scalarize(0);
1305
1306 // FIXME: fpow has a selection pattern that should move to custom lowering.
1307 auto &ExpOps = getActionDefinitionsBuilder(G_FPOW);
1308 if (ST.has16BitInsts())
1309 ExpOps.customFor({{S32}, {S16}});
1310 else
1311 ExpOps.customFor({S32});
1312 ExpOps.clampScalar(0, MinScalarFPTy, S32)
1313 .scalarize(0);
1314
1316 .clampScalar(0, MinScalarFPTy, S32)
1317 .lower();
1318
1320 .legalFor(ST.has16BitInsts(), {S16})
1321 .customFor({S32, S16})
1322 .scalarize(0)
1323 .lower();
1324
1326 .legalFor(ST.has16BitInsts(), {S16})
1327 .customFor({S32, S64, S16})
1328 .scalarize(0)
1329 .lower();
1330
1331 auto &LogOps =
1332 getActionDefinitionsBuilder({G_FLOG, G_FLOG10, G_FEXP, G_FEXP10});
1333 LogOps.customFor({S32, S16, S64});
1334 LogOps.clampScalar(0, MinScalarFPTy, S32)
1335 .scalarize(0);
1336
1337 // The 64-bit versions produce 32-bit results, but only on the SALU.
1339 .legalFor({{S32, S32}, {S32, S64}})
1340 .clampScalar(0, S32, S32)
1341 .widenScalarToNextPow2(1, 32)
1342 .clampScalar(1, S32, S64)
1343 .scalarize(0)
1344 .widenScalarToNextPow2(0, 32);
1345
1346 // If no 16 bit instr is available, lower into different instructions.
1347 if (ST.has16BitInsts())
1348 getActionDefinitionsBuilder(G_IS_FPCLASS)
1349 .legalForCartesianProduct({S1}, FPTypes16)
1350 .widenScalarToNextPow2(1)
1351 .scalarize(0)
1352 .lower();
1353 else
1354 getActionDefinitionsBuilder(G_IS_FPCLASS)
1355 .legalForCartesianProduct({S1}, FPTypesBase)
1356 .lowerFor({S1, S16})
1357 .widenScalarToNextPow2(1)
1358 .scalarize(0)
1359 .lower();
1360
1361 // The hardware instructions return a different result on 0 than the generic
1362 // instructions expect. The hardware produces -1, but these produce the
1363 // bitwidth.
1364 getActionDefinitionsBuilder({G_CTLZ, G_CTTZ})
1365 .scalarize(0)
1366 .clampScalar(0, S32, S32)
1367 .clampScalar(1, S32, S64)
1368 .widenScalarToNextPow2(0, 32)
1369 .widenScalarToNextPow2(1, 32)
1370 .custom();
1371
1372 // The 64-bit versions produce 32-bit results, but only on the SALU.
1373 getActionDefinitionsBuilder(G_CTLZ_ZERO_UNDEF)
1374 .legalFor({{S32, S32}, {S32, S64}})
1375 .customIf(scalarNarrowerThan(1, 32))
1376 .clampScalar(0, S32, S32)
1377 .clampScalar(1, S32, S64)
1378 .scalarize(0)
1379 .widenScalarToNextPow2(0, 32)
1380 .widenScalarToNextPow2(1, 32);
1381
1382 getActionDefinitionsBuilder(G_CTTZ_ZERO_UNDEF)
1383 .legalFor({{S32, S32}, {S32, S64}})
1384 .clampScalar(0, S32, S32)
1385 .clampScalar(1, S32, S64)
1386 .scalarize(0)
1387 .widenScalarToNextPow2(0, 32)
1388 .widenScalarToNextPow2(1, 32);
1389
1390 // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
1391 // RegBankSelect.
1392 getActionDefinitionsBuilder(G_BITREVERSE)
1393 .legalFor({S32, S64})
1394 .clampScalar(0, S32, S64)
1395 .scalarize(0)
1397
1398 if (ST.has16BitInsts()) {
1400 .legalFor({S16, S32, V2S16})
1401 .clampMaxNumElementsStrict(0, S16, 2)
1402 // FIXME: Fixing non-power-of-2 before clamp is workaround for
1403 // narrowScalar limitation.
1405 .clampScalar(0, S16, S32)
1406 .scalarize(0);
1407
1408 if (ST.hasVOP3PInsts()) {
1410 .legalFor({S32, S16, V2S16})
1411 .clampMaxNumElements(0, S16, 2)
1412 .minScalar(0, S16)
1414 .scalarize(0)
1415 .lower();
1416 if (ST.hasIntMinMax64()) {
1417 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
1418 .legalFor({S32, S16, S64, V2S16})
1419 .clampMaxNumElements(0, S16, 2)
1420 .minScalar(0, S16)
1422 .scalarize(0)
1423 .lower();
1424 } else {
1425 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
1426 .legalFor({S32, S16, V2S16})
1427 .clampMaxNumElements(0, S16, 2)
1428 .minScalar(0, S16)
1430 .scalarize(0)
1431 .lower();
1432 }
1433 } else {
1434 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
1435 .legalFor({S32, S16})
1436 .widenScalarToNextPow2(0)
1437 .minScalar(0, S16)
1438 .scalarize(0)
1439 .lower();
1440 }
1441 } else {
1442 // TODO: Should have same legality without v_perm_b32
1444 .legalFor({S32})
1445 .lowerIf(scalarNarrowerThan(0, 32))
1446 // FIXME: Fixing non-power-of-2 before clamp is workaround for
1447 // narrowScalar limitation.
1449 .maxScalar(0, S32)
1450 .scalarize(0)
1451 .lower();
1452
1453 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
1454 .legalFor({S32})
1455 .minScalar(0, S32)
1457 .scalarize(0)
1458 .lower();
1459 }
1460
1461 getActionDefinitionsBuilder(G_INTTOPTR)
1462 // List the common cases
1463 .legalForCartesianProduct(AddrSpaces64, {S64})
1464 .legalForCartesianProduct(AddrSpaces32, {S32})
1465 .scalarize(0)
1466 // Accept any address space as long as the size matches
1467 .legalIf(sameSize(0, 1))
1469 [](const LegalityQuery &Query) {
1470 return std::pair(
1471 1, LLT::scalar(Query.Types[0].getSizeInBits()));
1472 })
1473 .narrowScalarIf(largerThan(1, 0), [](const LegalityQuery &Query) {
1474 return std::pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
1475 });
1476
1477 getActionDefinitionsBuilder(G_PTRTOINT)
1478 // List the common cases
1479 .legalForCartesianProduct(AddrSpaces64, {S64})
1480 .legalForCartesianProduct(AddrSpaces32, {S32})
1481 .scalarize(0)
1482 // Accept any address space as long as the size matches
1483 .legalIf(sameSize(0, 1))
1485 [](const LegalityQuery &Query) {
1486 return std::pair(
1487 0, LLT::scalar(Query.Types[1].getSizeInBits()));
1488 })
1489 .narrowScalarIf(largerThan(0, 1), [](const LegalityQuery &Query) {
1490 return std::pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
1491 });
1492
1493 getActionDefinitionsBuilder(G_ADDRSPACE_CAST)
1494 .scalarize(0)
1495 .custom();
1496
1497 const auto needToSplitMemOp = [=](const LegalityQuery &Query,
1498 bool IsLoad) -> bool {
1499 const LLT DstTy = Query.Types[0];
1500
1501 // Split vector extloads.
1502 unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
1503
1504 if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize)
1505 return true;
1506
1507 const LLT PtrTy = Query.Types[1];
1508 unsigned AS = PtrTy.getAddressSpace();
1509 if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad,
1510 Query.MMODescrs[0].Ordering !=
1512 return true;
1513
1514 // Catch weird sized loads that don't evenly divide into the access sizes
1515 // TODO: May be able to widen depending on alignment etc.
1516 unsigned NumRegs = (MemSize + 31) / 32;
1517 if (NumRegs == 3) {
1518 if (!ST.hasDwordx3LoadStores())
1519 return true;
1520 } else {
1521 // If the alignment allows, these should have been widened.
1522 if (!isPowerOf2_32(NumRegs))
1523 return true;
1524 }
1525
1526 return false;
1527 };
1528
1529 unsigned GlobalAlign32 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 32;
1530 unsigned GlobalAlign16 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 16;
1531 unsigned GlobalAlign8 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 8;
1532
1533 // TODO: Refine based on subtargets which support unaligned access or 128-bit
1534 // LDS
1535 // TODO: Unsupported flat for SI.
1536
1537 for (unsigned Op : {G_LOAD, G_STORE}) {
1538 const bool IsStore = Op == G_STORE;
1539
1540 auto &Actions = getActionDefinitionsBuilder(Op);
1541 // Explicitly list some common cases.
1542 // TODO: Does this help compile time at all?
1543 Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, S32, GlobalAlign32},
1544 {V2S32, GlobalPtr, V2S32, GlobalAlign32},
1545 {V4S32, GlobalPtr, V4S32, GlobalAlign32},
1546 {S64, GlobalPtr, S64, GlobalAlign32},
1547 {V2S64, GlobalPtr, V2S64, GlobalAlign32},
1548 {V2S16, GlobalPtr, V2S16, GlobalAlign32},
1549 {S32, GlobalPtr, S8, GlobalAlign8},
1550 {S32, GlobalPtr, S16, GlobalAlign16},
1551
1552 {S32, LocalPtr, S32, 32},
1553 {S64, LocalPtr, S64, 32},
1554 {V2S32, LocalPtr, V2S32, 32},
1555 {S32, LocalPtr, S8, 8},
1556 {S32, LocalPtr, S16, 16},
1557 {V2S16, LocalPtr, S32, 32},
1558
1559 {S32, PrivatePtr, S32, 32},
1560 {S32, PrivatePtr, S8, 8},
1561 {S32, PrivatePtr, S16, 16},
1562 {V2S16, PrivatePtr, S32, 32},
1563
1564 {S32, ConstantPtr, S32, GlobalAlign32},
1565 {V2S32, ConstantPtr, V2S32, GlobalAlign32},
1566 {V4S32, ConstantPtr, V4S32, GlobalAlign32},
1567 {S64, ConstantPtr, S64, GlobalAlign32},
1568 {V2S32, ConstantPtr, V2S32, GlobalAlign32}});
1569 Actions.legalIf(
1570 [=](const LegalityQuery &Query) -> bool {
1571 return isLoadStoreLegal(ST, Query);
1572 });
1573
1574 // The custom pointers (fat pointers, buffer resources) don't work with load
1575 // and store at this level. Fat pointers should have been lowered to
1576 // intrinsics before the translation to MIR.
1577 Actions.unsupportedIf(
1578 typeInSet(1, {BufferFatPtr, BufferStridedPtr, RsrcPtr}));
1579
1580 // Address space 8 pointers are handled by a 4xs32 load, bitcast, and
1581 // ptrtoint. This is needed to account for the fact that we can't have i128
1582 // as a register class for SelectionDAG reasons.
1583 Actions.customIf([=](const LegalityQuery &Query) -> bool {
1584 return hasBufferRsrcWorkaround(Query.Types[0]);
1585 });
1586
1587 // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to
1588 // 64-bits.
1589 //
1590 // TODO: Should generalize bitcast action into coerce, which will also cover
1591 // inserting addrspacecasts.
1592 Actions.customIf(typeIs(1, Constant32Ptr));
1593
1594 // Turn any illegal element vectors into something easier to deal
1595 // with. These will ultimately produce 32-bit scalar shifts to extract the
1596 // parts anyway.
1597 //
1598 // For odd 16-bit element vectors, prefer to split those into pieces with
1599 // 16-bit vector parts.
1600 Actions.bitcastIf(
1601 [=](const LegalityQuery &Query) -> bool {
1602 return shouldBitcastLoadStoreType(ST, Query.Types[0],
1603 Query.MMODescrs[0].MemoryTy);
1604 }, bitcastToRegisterType(0));
1605
1606 if (!IsStore) {
1607 // Widen suitably aligned loads by loading extra bytes. The standard
1608 // legalization actions can't properly express widening memory operands.
1609 Actions.customIf([=](const LegalityQuery &Query) -> bool {
1610 return shouldWidenLoad(ST, Query, G_LOAD);
1611 });
1612 }
1613
1614 // FIXME: load/store narrowing should be moved to lower action
1615 Actions
1616 .narrowScalarIf(
1617 [=](const LegalityQuery &Query) -> bool {
1618 return !Query.Types[0].isVector() &&
1619 needToSplitMemOp(Query, Op == G_LOAD);
1620 },
1621 [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
1622 const LLT DstTy = Query.Types[0];
1623 const LLT PtrTy = Query.Types[1];
1624
1625 const unsigned DstSize = DstTy.getSizeInBits();
1626 unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
1627
1628 // Split extloads.
1629 if (DstSize > MemSize)
1630 return std::pair(0, LLT::scalar(MemSize));
1631
1632 unsigned MaxSize = maxSizeForAddrSpace(
1633 ST, PtrTy.getAddressSpace(), Op == G_LOAD,
1634 Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic);
1635 if (MemSize > MaxSize)
1636 return std::pair(0, LLT::scalar(MaxSize));
1637
1638 uint64_t Align = Query.MMODescrs[0].AlignInBits;
1639 return std::pair(0, LLT::scalar(Align));
1640 })
1641 .fewerElementsIf(
1642 [=](const LegalityQuery &Query) -> bool {
1643 return Query.Types[0].isVector() &&
1644 needToSplitMemOp(Query, Op == G_LOAD);
1645 },
1646 [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
1647 const LLT DstTy = Query.Types[0];
1648 const LLT PtrTy = Query.Types[1];
1649
1650 LLT EltTy = DstTy.getElementType();
1651 unsigned MaxSize = maxSizeForAddrSpace(
1652 ST, PtrTy.getAddressSpace(), Op == G_LOAD,
1653 Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic);
1654
1655 // FIXME: Handle widened to power of 2 results better. This ends
1656 // up scalarizing.
1657 // FIXME: 3 element stores scalarized on SI
1658
1659 // Split if it's too large for the address space.
1660 unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
1661 if (MemSize > MaxSize) {
1662 unsigned NumElts = DstTy.getNumElements();
1663 unsigned EltSize = EltTy.getSizeInBits();
1664
1665 if (MaxSize % EltSize == 0) {
1666 return std::pair(
1668 ElementCount::getFixed(MaxSize / EltSize), EltTy));
1669 }
1670
1671 unsigned NumPieces = MemSize / MaxSize;
1672
1673 // FIXME: Refine when odd breakdowns handled
1674 // The scalars will need to be re-legalized.
1675 if (NumPieces == 1 || NumPieces >= NumElts ||
1676 NumElts % NumPieces != 0)
1677 return std::pair(0, EltTy);
1678
1679 return std::pair(0,
1680 LLT::fixed_vector(NumElts / NumPieces, EltTy));
1681 }
1682
1683 // FIXME: We could probably handle weird extending loads better.
1684 if (DstTy.getSizeInBits() > MemSize)
1685 return std::pair(0, EltTy);
1686
1687 unsigned EltSize = EltTy.getSizeInBits();
1688 unsigned DstSize = DstTy.getSizeInBits();
1689 if (!isPowerOf2_32(DstSize)) {
1690 // We're probably decomposing an odd sized store. Try to split
1691 // to the widest type. TODO: Account for alignment. As-is it
1692 // should be OK, since the new parts will be further legalized.
1693 unsigned FloorSize = llvm::bit_floor(DstSize);
1694 return std::pair(
1696 ElementCount::getFixed(FloorSize / EltSize), EltTy));
1697 }
1698
1699 // May need relegalization for the scalars.
1700 return std::pair(0, EltTy);
1701 })
1702 .minScalar(0, S32)
1703 .narrowScalarIf(isTruncStoreToSizePowerOf2(0),
1705 .widenScalarToNextPow2(0)
1706 .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0))
1707 .lower();
1708 }
1709
1710 // FIXME: Unaligned accesses not lowered.
1711 auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD})
1712 .legalForTypesWithMemDesc({{S32, GlobalPtr, S8, 8},
1713 {S32, GlobalPtr, S16, 2 * 8},
1714 {S32, LocalPtr, S8, 8},
1715 {S32, LocalPtr, S16, 16},
1716 {S32, PrivatePtr, S8, 8},
1717 {S32, PrivatePtr, S16, 16},
1718 {S32, ConstantPtr, S8, 8},
1719 {S32, ConstantPtr, S16, 2 * 8}})
1720 .legalIf(
1721 [=](const LegalityQuery &Query) -> bool {
1722 return isLoadStoreLegal(ST, Query);
1723 });
1724
1725 if (ST.hasFlatAddressSpace()) {
1726 ExtLoads.legalForTypesWithMemDesc(
1727 {{S32, FlatPtr, S8, 8}, {S32, FlatPtr, S16, 16}});
1728 }
1729
1730 // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to
1731 // 64-bits.
1732 //
1733 // TODO: Should generalize bitcast action into coerce, which will also cover
1734 // inserting addrspacecasts.
1735 ExtLoads.customIf(typeIs(1, Constant32Ptr));
1736
1737 ExtLoads.clampScalar(0, S32, S32)
1739 .lower();
1740
1741 auto &Atomics = getActionDefinitionsBuilder(
1742 {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB,
1743 G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
1744 G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
1745 G_ATOMICRMW_UMIN, G_ATOMICRMW_UINC_WRAP, G_ATOMICRMW_UDEC_WRAP})
1746 .legalFor({{S32, GlobalPtr}, {S32, LocalPtr},
1747 {S64, GlobalPtr}, {S64, LocalPtr},
1748 {S32, RegionPtr}, {S64, RegionPtr}});
1749 if (ST.hasFlatAddressSpace()) {
1750 Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}});
1751 }
1752
1753 auto &Atomics32 =
1754 getActionDefinitionsBuilder({G_ATOMICRMW_USUB_COND, G_ATOMICRMW_USUB_SAT})
1755 .legalFor({{S32, GlobalPtr}, {S32, LocalPtr}, {S32, RegionPtr}});
1756 if (ST.hasFlatAddressSpace()) {
1757 Atomics32.legalFor({{S32, FlatPtr}});
1758 }
1759
1760 // TODO: v2bf16 operations, and fat buffer pointer support.
1761 auto &Atomic = getActionDefinitionsBuilder(G_ATOMICRMW_FADD);
1762 if (ST.hasLDSFPAtomicAddF32()) {
1763 Atomic.legalFor({{S32, LocalPtr}, {S32, RegionPtr}});
1764 if (ST.hasLdsAtomicAddF64())
1765 Atomic.legalFor({{S64, LocalPtr}});
1766 if (ST.hasAtomicDsPkAdd16Insts())
1767 Atomic.legalFor({{V2F16, LocalPtr}, {V2BF16, LocalPtr}});
1768 }
1769 if (ST.hasAtomicFaddInsts())
1770 Atomic.legalFor({{S32, GlobalPtr}});
1771 if (ST.hasFlatAtomicFaddF32Inst())
1772 Atomic.legalFor({{S32, FlatPtr}});
1773
1774 if (ST.hasGFX90AInsts() || ST.hasGFX1250Insts()) {
1775 // These are legal with some caveats, and should have undergone expansion in
1776 // the IR in most situations
1777 // TODO: Move atomic expansion into legalizer
1778 Atomic.legalFor({
1779 {S32, GlobalPtr},
1780 {S64, GlobalPtr},
1781 {S64, FlatPtr}
1782 });
1783 }
1784
1785 if (ST.hasAtomicBufferGlobalPkAddF16NoRtnInsts() ||
1786 ST.hasAtomicBufferGlobalPkAddF16Insts())
1787 Atomic.legalFor({{V2F16, GlobalPtr}, {V2F16, BufferFatPtr}});
1788 if (ST.hasAtomicGlobalPkAddBF16Inst())
1789 Atomic.legalFor({{V2BF16, GlobalPtr}});
1790 if (ST.hasAtomicFlatPkAdd16Insts())
1791 Atomic.legalFor({{V2F16, FlatPtr}, {V2BF16, FlatPtr}});
1792
1793
1794 // Most of the legalization work here is done by AtomicExpand. We could
1795 // probably use a simpler legality rule that just assumes anything is OK.
1796 auto &AtomicFMinFMax =
1797 getActionDefinitionsBuilder({G_ATOMICRMW_FMIN, G_ATOMICRMW_FMAX})
1798 .legalFor({{F32, LocalPtr}, {F64, LocalPtr}});
1799
1800 if (ST.hasAtomicFMinFMaxF32GlobalInsts())
1801 AtomicFMinFMax.legalFor({{F32, GlobalPtr},{F32, BufferFatPtr}});
1802 if (ST.hasAtomicFMinFMaxF64GlobalInsts())
1803 AtomicFMinFMax.legalFor({{F64, GlobalPtr}, {F64, BufferFatPtr}});
1804 if (ST.hasAtomicFMinFMaxF32FlatInsts())
1805 AtomicFMinFMax.legalFor({F32, FlatPtr});
1806 if (ST.hasAtomicFMinFMaxF64FlatInsts())
1807 AtomicFMinFMax.legalFor({F64, FlatPtr});
1808
1809 // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling, and output
1810 // demarshalling
1811 getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG)
1812 .customFor({{S32, GlobalPtr}, {S64, GlobalPtr},
1813 {S32, FlatPtr}, {S64, FlatPtr}})
1814 .legalFor({{S32, LocalPtr}, {S64, LocalPtr},
1815 {S32, RegionPtr}, {S64, RegionPtr}});
1816 // TODO: Pointer types, any 32-bit or 64-bit vector
1817
1818 // Condition should be s32 for scalar, s1 for vector.
1821 LocalPtr, FlatPtr, PrivatePtr,
1822 LLT::fixed_vector(2, LocalPtr),
1823 LLT::fixed_vector(2, PrivatePtr)},
1824 {S1, S32})
1825 .clampScalar(0, S16, S64)
1826 .scalarize(1)
1829 .clampMaxNumElements(0, S32, 2)
1830 .clampMaxNumElements(0, LocalPtr, 2)
1831 .clampMaxNumElements(0, PrivatePtr, 2)
1832 .scalarize(0)
1834 .legalIf(all(isPointer(0), typeInSet(1, {S1, S32})));
1835
1836 // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can
1837 // be more flexible with the shift amount type.
1838 auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR})
1839 .legalFor({{S32, S32}, {S64, S32}});
1840 if (ST.has16BitInsts()) {
1841 if (ST.hasVOP3PInsts()) {
1842 Shifts.legalFor({{S16, S16}, {V2S16, V2S16}})
1843 .clampMaxNumElements(0, S16, 2);
1844 } else
1845 Shifts.legalFor({{S16, S16}});
1846
1847 // TODO: Support 16-bit shift amounts for all types
1848 Shifts.widenScalarIf(
1849 [=](const LegalityQuery &Query) {
1850 // Use 16-bit shift amounts for any 16-bit shift. Otherwise we want a
1851 // 32-bit amount.
1852 const LLT ValTy = Query.Types[0];
1853 const LLT AmountTy = Query.Types[1];
1854 return ValTy.isScalar() && ValTy.getSizeInBits() <= 16 &&
1855 AmountTy.getSizeInBits() < 16;
1856 }, changeTo(1, S16));
1857 Shifts.maxScalarIf(typeIs(0, S16), 1, S16);
1858 Shifts.clampScalar(1, S32, S32);
1859 Shifts.widenScalarToNextPow2(0, 16);
1860 Shifts.clampScalar(0, S16, S64);
1861
1862 getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
1863 .minScalar(0, S16)
1864 .scalarize(0)
1865 .lower();
1866 } else {
1867 // Make sure we legalize the shift amount type first, as the general
1868 // expansion for the shifted type will produce much worse code if it hasn't
1869 // been truncated already.
1870 Shifts.clampScalar(1, S32, S32);
1871 Shifts.widenScalarToNextPow2(0, 32);
1872 Shifts.clampScalar(0, S32, S64);
1873
1874 getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
1875 .minScalar(0, S32)
1876 .scalarize(0)
1877 .lower();
1878 }
1879 Shifts.scalarize(0);
1880
1881 for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) {
1882 unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0;
1883 unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1;
1884 unsigned IdxTypeIdx = 2;
1885
1887 .customIf([=](const LegalityQuery &Query) {
1888 const LLT EltTy = Query.Types[EltTypeIdx];
1889 const LLT VecTy = Query.Types[VecTypeIdx];
1890 const LLT IdxTy = Query.Types[IdxTypeIdx];
1891 const unsigned EltSize = EltTy.getSizeInBits();
1892 const bool isLegalVecType =
1894 // Address space 8 pointers are 128-bit wide values, but the logic
1895 // below will try to bitcast them to 2N x s64, which will fail.
1896 // Therefore, as an intermediate step, wrap extracts/insertions from a
1897 // ptrtoint-ing the vector and scalar arguments (or inttoptring the
1898 // extraction result) in order to produce a vector operation that can
1899 // be handled by the logic below.
1900 if (EltTy.isPointer() && EltSize > 64)
1901 return true;
1902 return (EltSize == 32 || EltSize == 64) &&
1903 VecTy.getSizeInBits() % 32 == 0 &&
1904 VecTy.getSizeInBits() <= MaxRegisterSize &&
1905 IdxTy.getSizeInBits() == 32 &&
1906 isLegalVecType;
1907 })
1908 .bitcastIf(all(sizeIsMultipleOf32(VecTypeIdx),
1909 scalarOrEltNarrowerThan(VecTypeIdx, 32)),
1910 bitcastToVectorElement32(VecTypeIdx))
1911 //.bitcastIf(vectorSmallerThan(1, 32), bitcastToScalar(1))
1912 .bitcastIf(all(sizeIsMultipleOf32(VecTypeIdx),
1913 scalarOrEltWiderThan(VecTypeIdx, 64)),
1914 [=](const LegalityQuery &Query) {
1915 // For > 64-bit element types, try to turn this into a
1916 // 64-bit element vector since we may be able to do better
1917 // indexing if this is scalar. If not, fall back to 32.
1918 const LLT EltTy = Query.Types[EltTypeIdx];
1919 const LLT VecTy = Query.Types[VecTypeIdx];
1920 const unsigned DstEltSize = EltTy.getSizeInBits();
1921 const unsigned VecSize = VecTy.getSizeInBits();
1922
1923 const unsigned TargetEltSize =
1924 DstEltSize % 64 == 0 ? 64 : 32;
1925 return std::pair(VecTypeIdx,
1926 LLT::fixed_vector(VecSize / TargetEltSize,
1927 TargetEltSize));
1928 })
1929 .clampScalar(EltTypeIdx, S32, S64)
1930 .clampScalar(VecTypeIdx, S32, S64)
1931 .clampScalar(IdxTypeIdx, S32, S32)
1932 .clampMaxNumElements(VecTypeIdx, S32, 32)
1933 // TODO: Clamp elements for 64-bit vectors?
1934 .moreElementsIf(isIllegalRegisterType(ST, VecTypeIdx),
1936 // It should only be necessary with variable indexes.
1937 // As a last resort, lower to the stack
1938 .lower();
1939 }
1940
1941 getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
1942 .unsupportedIf([=](const LegalityQuery &Query) {
1943 const LLT &EltTy = Query.Types[1].getElementType();
1944 return Query.Types[0] != EltTy;
1945 });
1946
1947 for (unsigned Op : {G_EXTRACT, G_INSERT}) {
1948 unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0;
1949 unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1;
1952 [=](const LegalityQuery &Query) {
1953 const LLT BigTy = Query.Types[BigTyIdx];
1954 return (BigTy.getScalarSizeInBits() < 16);
1955 },
1957 .widenScalarIf(
1958 [=](const LegalityQuery &Query) {
1959 const LLT LitTy = Query.Types[LitTyIdx];
1960 return (LitTy.getScalarSizeInBits() < 16);
1961 },
1963 .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
1964 .widenScalarToNextPow2(BigTyIdx, 32)
1965 .customIf([=](const LegalityQuery &Query) {
1966 // Generic lower operates on the full-width value, producing
1967 // shift+trunc/mask sequences. For simple cases where extract/insert
1968 // values are 32-bit aligned, we can instead unmerge/merge and work on
1969 // the 32-bit components. However, we can't check the offset here so
1970 // custom lower function will have to call generic lowering if offset
1971 // is not 32-bit aligned.
1972 const LLT BigTy = Query.Types[BigTyIdx];
1973 const LLT LitTy = Query.Types[LitTyIdx];
1974 return !BigTy.isVector() && BigTy.getSizeInBits() % 32 == 0 &&
1975 LitTy.getSizeInBits() % 32 == 0;
1976 })
1977 .lower();
1978 }
1979
1980 auto &BuildVector =
1981 getActionDefinitionsBuilder(G_BUILD_VECTOR)
1983 .legalForCartesianProduct(AllS64Vectors, {S64})
1984 .clampNumElements(0, V16S32, V32S32)
1989
1990 if (ST.hasScalarPackInsts()) {
1991 BuildVector
1992 // FIXME: Should probably widen s1 vectors straight to s32
1993 .minScalarOrElt(0, S16)
1994 .minScalar(1, S16);
1995
1996 getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1997 .legalFor({V2S16, S32})
1998 .lower();
1999 } else {
2000 BuildVector.customFor({V2S16, S16});
2001 BuildVector.minScalarOrElt(0, S32);
2002
2003 getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
2004 .customFor({V2S16, S32})
2005 .lower();
2006 }
2007
2008 BuildVector.legalIf(isRegisterType(ST, 0));
2009
2010 // FIXME: Clamp maximum size
2011 getActionDefinitionsBuilder(G_CONCAT_VECTORS)
2012 .legalIf(all(isRegisterType(ST, 0), isRegisterType(ST, 1)))
2013 .clampMaxNumElements(0, S32, 32)
2014 .clampMaxNumElements(1, S16, 2) // TODO: Make 4?
2015 .clampMaxNumElements(0, S16, 64);
2016
2017 getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower();
2018
2019 // Merge/Unmerge
2020 for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
2021 unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
2022 unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;
2023
2024 auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
2025 const LLT Ty = Query.Types[TypeIdx];
2026 if (Ty.isVector()) {
2027 const LLT &EltTy = Ty.getElementType();
2028 if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 512)
2029 return true;
2031 return true;
2032 }
2033 return false;
2034 };
2035
2036 auto &Builder =
2038 .legalIf(all(isRegisterType(ST, 0), isRegisterType(ST, 1)))
2039 .lowerFor({{S16, V2S16}})
2040 .lowerIf([=](const LegalityQuery &Query) {
2041 const LLT BigTy = Query.Types[BigTyIdx];
2042 return BigTy.getSizeInBits() == 32;
2043 })
2044 // Try to widen to s16 first for small types.
2045 // TODO: Only do this on targets with legal s16 shifts
2046 .minScalarOrEltIf(scalarNarrowerThan(LitTyIdx, 16), LitTyIdx, S16)
2047 .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16)
2049 oneMoreElement(BigTyIdx))
2051 elementTypeIs(1, S16)),
2052 changeTo(1, V2S16))
2053 // Clamp the little scalar to s8-s256 and make it a power of 2. It's
2054 // not worth considering the multiples of 64 since 2*192 and 2*384
2055 // are not valid.
2056 .clampScalar(LitTyIdx, S32, S512)
2057 .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32)
2058 // Break up vectors with weird elements into scalars
2060 [=](const LegalityQuery &Query) {
2061 return notValidElt(Query, LitTyIdx);
2062 },
2063 scalarize(0))
2064 .fewerElementsIf(
2065 [=](const LegalityQuery &Query) {
2066 return notValidElt(Query, BigTyIdx);
2067 },
2068 scalarize(1))
2069 .clampScalar(BigTyIdx, S32, MaxScalar);
2070
2071 if (Op == G_MERGE_VALUES) {
2072 Builder.widenScalarIf(
2073 // TODO: Use 16-bit shifts if legal for 8-bit values?
2074 [=](const LegalityQuery &Query) {
2075 const LLT Ty = Query.Types[LitTyIdx];
2076 return Ty.getSizeInBits() < 32;
2077 },
2078 changeTo(LitTyIdx, S32));
2079 }
2080
2081 Builder.widenScalarIf(
2082 [=](const LegalityQuery &Query) {
2083 const LLT Ty = Query.Types[BigTyIdx];
2084 return Ty.getSizeInBits() % 16 != 0;
2085 },
2086 [=](const LegalityQuery &Query) {
2087 // Pick the next power of 2, or a multiple of 64 over 128.
2088 // Whichever is smaller.
2089 const LLT &Ty = Query.Types[BigTyIdx];
2090 unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1);
2091 if (NewSizeInBits >= 256) {
2092 unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1);
2093 if (RoundedTo < NewSizeInBits)
2094 NewSizeInBits = RoundedTo;
2095 }
2096 return std::pair(BigTyIdx, LLT::scalar(NewSizeInBits));
2097 })
2098 // Any vectors left are the wrong size. Scalarize them.
2099 .scalarize(0)
2100 .scalarize(1);
2101 }
2102
2103 // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
2104 // RegBankSelect.
2105 auto &SextInReg = getActionDefinitionsBuilder(G_SEXT_INREG)
2106 .legalFor({{S32}, {S64}})
2107 .clampScalar(0, S32, S64);
2108
2109 if (ST.hasVOP3PInsts()) {
2110 SextInReg.lowerFor({{V2S16}})
2111 // Prefer to reduce vector widths for 16-bit vectors before lowering, to
2112 // get more vector shift opportunities, since we'll get those when
2113 // expanded.
2114 .clampMaxNumElementsStrict(0, S16, 2);
2115 } else if (ST.has16BitInsts()) {
2116 SextInReg.lowerFor({{S32}, {S64}, {S16}});
2117 } else {
2118 // Prefer to promote to s32 before lowering if we don't have 16-bit
2119 // shifts. This avoids a lot of intermediate truncate and extend operations.
2120 SextInReg.lowerFor({{S32}, {S64}});
2121 }
2122
2123 SextInReg
2124 .scalarize(0)
2125 .clampScalar(0, S32, S64)
2126 .lower();
2127
2128 getActionDefinitionsBuilder({G_ROTR, G_ROTL})
2129 .scalarize(0)
2130 .lower();
2131
2132 auto &FSHRActionDefs = getActionDefinitionsBuilder(G_FSHR);
2133 FSHRActionDefs.legalFor({{S32, S32}})
2134 .clampMaxNumElementsStrict(0, S16, 2);
2135 if (ST.hasVOP3PInsts())
2136 FSHRActionDefs.lowerFor({{V2S16, V2S16}});
2137 FSHRActionDefs.scalarize(0).lower();
2138
2139 if (ST.hasVOP3PInsts()) {
2141 .lowerFor({{V2S16, V2S16}})
2142 .clampMaxNumElementsStrict(0, S16, 2)
2143 .scalarize(0)
2144 .lower();
2145 } else {
2147 .scalarize(0)
2148 .lower();
2149 }
2150
2151 getActionDefinitionsBuilder(G_READCYCLECOUNTER)
2152 .legalFor({S64});
2153
2154 getActionDefinitionsBuilder(G_READSTEADYCOUNTER).legalFor({S64});
2155
2157 .alwaysLegal();
2158
2159 getActionDefinitionsBuilder({G_SMULO, G_UMULO})
2160 .scalarize(0)
2161 .minScalar(0, S32)
2162 .lower();
2163
2164 getActionDefinitionsBuilder({G_SBFX, G_UBFX})
2165 .legalFor({{S32, S32}, {S64, S32}})
2166 .clampScalar(1, S32, S32)
2167 .clampScalar(0, S32, S64)
2169 .scalarize(0);
2170
2172 {// TODO: Verify V_BFI_B32 is generated from expanded bit ops
2173 G_FCOPYSIGN,
2174
2175 G_ATOMIC_CMPXCHG_WITH_SUCCESS, G_ATOMICRMW_NAND, G_ATOMICRMW_FSUB,
2176 G_READ_REGISTER, G_WRITE_REGISTER,
2177
2178 G_SADDO, G_SSUBO})
2179 .lower();
2180
2181 if (ST.hasIEEEMinimumMaximumInsts()) {
2182 getActionDefinitionsBuilder({G_FMINIMUM, G_FMAXIMUM})
2183 .legalFor(FPTypesPK16)
2184 .clampMaxNumElements(0, S16, 2)
2185 .scalarize(0);
2186 } else if (ST.hasVOP3PInsts()) {
2187 getActionDefinitionsBuilder({G_FMINIMUM, G_FMAXIMUM})
2188 .lowerFor({V2S16})
2189 .clampMaxNumElementsStrict(0, S16, 2)
2190 .scalarize(0)
2191 .lower();
2192 } else {
2193 getActionDefinitionsBuilder({G_FMINIMUM, G_FMAXIMUM})
2194 .scalarize(0)
2195 .clampScalar(0, S32, S64)
2196 .lower();
2197 }
2198
2199 getActionDefinitionsBuilder({G_MEMCPY, G_MEMCPY_INLINE, G_MEMMOVE, G_MEMSET})
2200 .lower();
2201
2202 getActionDefinitionsBuilder({G_TRAP, G_DEBUGTRAP}).custom();
2203
2204 getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE,
2205 G_INDEXED_LOAD, G_INDEXED_SEXTLOAD,
2206 G_INDEXED_ZEXTLOAD, G_INDEXED_STORE})
2207 .unsupported();
2208
2210
2212 {G_VECREDUCE_SMIN, G_VECREDUCE_SMAX, G_VECREDUCE_UMIN, G_VECREDUCE_UMAX,
2213 G_VECREDUCE_ADD, G_VECREDUCE_MUL, G_VECREDUCE_FMUL, G_VECREDUCE_FMIN,
2214 G_VECREDUCE_FMAX, G_VECREDUCE_FMINIMUM, G_VECREDUCE_FMAXIMUM,
2215 G_VECREDUCE_OR, G_VECREDUCE_AND, G_VECREDUCE_XOR})
2216 .legalFor(AllVectors)
2217 .scalarize(1)
2218 .lower();
2219
2221 verify(*ST.getInstrInfo());
2222}
2223
2226 LostDebugLocObserver &LocObserver) const {
  // Central dispatcher for every opcode the legalizer rules above marked as
  // custom(): each case forwards the instruction to its dedicated lowering
  // helper. Returning false reports a legalization failure for any opcode
  // that unexpectedly reaches here.
2227 MachineIRBuilder &B = Helper.MIRBuilder;
2228 MachineRegisterInfo &MRI = *B.getMRI();
2229
2230 switch (MI.getOpcode()) {
2231 case TargetOpcode::G_ADDRSPACE_CAST:
2232 return legalizeAddrSpaceCast(MI, MRI, B);
2233 case TargetOpcode::G_INTRINSIC_ROUNDEVEN:
2234 return legalizeFroundeven(MI, MRI, B);
2235 case TargetOpcode::G_FCEIL:
2236 return legalizeFceil(MI, MRI, B);
2237 case TargetOpcode::G_FREM:
2238 return legalizeFrem(MI, MRI, B);
2239 case TargetOpcode::G_INTRINSIC_TRUNC:
2240 return legalizeIntrinsicTrunc(MI, MRI, B);
  // int <-> fp conversions share helpers; the bool selects signedness.
2241 case TargetOpcode::G_SITOFP:
2242 return legalizeITOFP(MI, MRI, B, true);
2243 case TargetOpcode::G_UITOFP:
2244 return legalizeITOFP(MI, MRI, B, false);
2245 case TargetOpcode::G_FPTOSI:
2246 return legalizeFPTOI(MI, MRI, B, true);
2247 case TargetOpcode::G_FPTOUI:
2248 return legalizeFPTOI(MI, MRI, B, false);
2249 case TargetOpcode::G_FMINNUM:
2250 case TargetOpcode::G_FMAXNUM:
2251 case TargetOpcode::G_FMINIMUMNUM:
2252 case TargetOpcode::G_FMAXIMUMNUM:
2253 return legalizeMinNumMaxNum(Helper, MI);
2254 case TargetOpcode::G_EXTRACT:
2255 return legalizeExtract(Helper, MI);
2256 case TargetOpcode::G_INSERT:
2257 return legalizeInsert(Helper, MI);
2258 case TargetOpcode::G_EXTRACT_VECTOR_ELT:
2259 return legalizeExtractVectorElt(MI, MRI, B);
2260 case TargetOpcode::G_INSERT_VECTOR_ELT:
2261 return legalizeInsertVectorElt(MI, MRI, B);
2262 case TargetOpcode::G_FSIN:
2263 case TargetOpcode::G_FCOS:
2264 return legalizeSinCos(MI, MRI, B);
2265 case TargetOpcode::G_GLOBAL_VALUE:
2266 return legalizeGlobalValue(MI, MRI, B);
2267 case TargetOpcode::G_LOAD:
2268 case TargetOpcode::G_SEXTLOAD:
2269 case TargetOpcode::G_ZEXTLOAD:
2270 return legalizeLoad(Helper, MI);
2271 case TargetOpcode::G_STORE:
2272 return legalizeStore(Helper, MI);
2273 case TargetOpcode::G_FMAD:
2274 return legalizeFMad(MI, MRI, B);
2275 case TargetOpcode::G_FDIV:
2276 return legalizeFDIV(MI, MRI, B);
2277 case TargetOpcode::G_FFREXP:
2278 return legalizeFFREXP(MI, MRI, B);
2279 case TargetOpcode::G_FSQRT:
2280 return legalizeFSQRT(MI, MRI, B);
2281 case TargetOpcode::G_UDIV:
2282 case TargetOpcode::G_UREM:
2283 case TargetOpcode::G_UDIVREM:
2284 return legalizeUnsignedDIV_REM(MI, MRI, B);
2285 case TargetOpcode::G_SDIV:
2286 case TargetOpcode::G_SREM:
2287 case TargetOpcode::G_SDIVREM:
2288 return legalizeSignedDIV_REM(MI, MRI, B);
2289 case TargetOpcode::G_ATOMIC_CMPXCHG:
2290 return legalizeAtomicCmpXChg(MI, MRI, B);
2291 case TargetOpcode::G_FLOG2:
2292 return legalizeFlog2(MI, B);
2293 case TargetOpcode::G_FLOG:
2294 case TargetOpcode::G_FLOG10:
2295 return legalizeFlogCommon(MI, B);
2296 case TargetOpcode::G_FEXP2:
2297 return legalizeFExp2(MI, B);
2298 case TargetOpcode::G_FEXP:
2299 case TargetOpcode::G_FEXP10:
2300 return legalizeFExp(MI, B);
2301 case TargetOpcode::G_FPOW:
2302 return legalizeFPow(MI, B);
2303 case TargetOpcode::G_FFLOOR:
2304 return legalizeFFloor(MI, MRI, B);
2305 case TargetOpcode::G_BUILD_VECTOR:
2306 case TargetOpcode::G_BUILD_VECTOR_TRUNC:
2307 return legalizeBuildVector(MI, MRI, B);
2308 case TargetOpcode::G_MUL:
2309 return legalizeMul(Helper, MI);
2310 case TargetOpcode::G_CTLZ:
2311 case TargetOpcode::G_CTTZ:
2312 return legalizeCTLZ_CTTZ(MI, MRI, B);
2313 case TargetOpcode::G_CTLZ_ZERO_UNDEF:
2314 return legalizeCTLZ_ZERO_UNDEF(MI, MRI, B);
2315 case TargetOpcode::G_STACKSAVE:
2316 return legalizeStackSave(MI, B);
2317 case TargetOpcode::G_GET_FPENV:
2318 return legalizeGetFPEnv(MI, MRI, B);
2319 case TargetOpcode::G_SET_FPENV:
2320 return legalizeSetFPEnv(MI, MRI, B);
2321 case TargetOpcode::G_TRAP:
2322 return legalizeTrap(MI, MRI, B);
2323 case TargetOpcode::G_DEBUGTRAP:
2324 return legalizeDebugTrap(MI, MRI, B);
2325 default:
2326 return false;
2327 }
2328
  // Every switch arm (including default) returns, so this is unreachable.
2329 llvm_unreachable("expected switch to return");
2330}
2331
2333 unsigned AS,
2335 MachineIRBuilder &B) const {
  // Produce an s32 virtual register holding the aperture for address space
  // AS (the high 32 bits of the 64-bit segment base, selected between the
  // LOCAL and PRIVATE segments below), or an invalid Register() on failure.
  // There are three strategies, tried in order: dedicated aperture
  // registers, implicit kernargs, and a load from the queue pointer.
  // NOTE(review): this listing is missing several upstream source lines
  // (gaps in the embedded numbering, e.g. 2341, 2353, 2359-2370, 2374,
  // 2377-2384, 2393-2400, 2406-2409) -- consult the upstream file before
  // modifying this function.
2336 MachineFunction &MF = B.getMF();
2337 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
2338 const LLT S32 = LLT::scalar(32);
2339 const LLT S64 = LLT::scalar(64);
2340
2342
2343 if (ST.hasApertureRegs()) {
2344 // Note: this register is somewhat broken. When used as a 32-bit operand,
2345 // it only returns zeroes. The real value is in the upper 32 bits.
2346 // Thus, we must emit code to extract the high 32 bits.
2347 const unsigned ApertureRegNo = (AS == AMDGPUAS::LOCAL_ADDRESS)
2348 ? AMDGPU::SRC_SHARED_BASE
2349 : AMDGPU::SRC_PRIVATE_BASE;
2350 assert((ApertureRegNo != AMDGPU::SRC_PRIVATE_BASE ||
2351 !ST.hasGloballyAddressableScratch()) &&
2352 "Cannot use src_private_base with globally addressable scratch!");
2354 MRI.setRegClass(Dst, &AMDGPU::SReg_64RegClass);
2355 B.buildCopy({Dst}, {Register(ApertureRegNo)});
  // Keep only the high half -- per the note above, the aperture value
  // lives in the upper 32 bits of the 64-bit register.
2356 return B.buildUnmerge(S32, Dst).getReg(1);
2357 }
2358
2361 // For code object version 5, private_base and shared_base are passed through
2362 // implicit kernargs.
2366
2371 ST.getTargetLowering()->getImplicitParameterOffset(B.getMF(), Param);
2372
2373 Register KernargPtrReg = MRI.createGenericVirtualRegister(
2375
  // Bail out with an invalid register if the kernarg pointer input is
  // unavailable; the caller treats this as a legalization failure.
2376 if (!loadInputValue(KernargPtrReg, B,
2378 return Register();
2379
2381 PtrInfo.getWithOffset(Offset),
2385
2386 // Pointer address
2387 B.buildObjectPtrOffset(LoadAddr, KernargPtrReg,
2388 B.buildConstant(LLT::scalar(64), Offset).getReg(0));
2389 // Load address
2390 return B.buildLoad(S32, LoadAddr, *MMO).getReg(0);
2391 }
2392
2395
2397 return Register();
2398
2399 // TODO: Use custom PseudoSourceValue
2401
2402 // Offset into amd_queue_t for group_segment_aperture_base_hi /
2403 // private_segment_aperture_base_hi.
2404 uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;
2405
2407 PtrInfo,
2410 LLT::scalar(32), commonAlignment(Align(64), StructOffset));
2411
2412 B.buildObjectPtrOffset(
2413 LoadAddr, QueuePtr,
2414 B.buildConstant(LLT::scalar(64), StructOffset).getReg(0));
2415 return B.buildLoad(S32, LoadAddr, *MMO).getReg(0);
2416}
2417
2418/// Return true if the value is a known valid address, such that a null check is
2419/// not necessary.
///
/// Frame indices, global values, and block addresses are treated as never
/// null; a constant is non-null iff it differs from the address space's
/// null value. Anything else is conservatively assumed possibly-null.
2421 const AMDGPUTargetMachine &TM, unsigned AddrSpace) {
2422 MachineInstr *Def = MRI.getVRegDef(Val);
2423 switch (Def->getOpcode()) {
2424 case AMDGPU::G_FRAME_INDEX:
2425 case AMDGPU::G_GLOBAL_VALUE:
2426 case AMDGPU::G_BLOCK_ADDR:
2427 return true;
2428 case AMDGPU::G_CONSTANT: {
2429 const ConstantInt *CI = Def->getOperand(1).getCImm();
  // Non-null iff the constant differs from this address space's null value
  // (which is not always zero on AMDGPU).
2430 return CI->getSExtValue() != AMDGPU::getNullPointerValue(AddrSpace);
2431 }
2432 default:
2433 return false;
2434 }
2435
  // NOTE(review): dead code -- every switch path above already returns.
2436 return false;
2437}
2438
2441 MachineIRBuilder &B) const {
  // Custom lowering for address-space casts. Handled cases, in order:
  //   * no-op casts -> rewritten in place to G_BITCAST;
  //   * flat -> local/private: take the low 32 bits (or subtract the flat
  //     scratch base for globally addressable scratch), with a null-compare
  //     select unless the source is known non-null;
  //   * local/private -> flat: merge the 32-bit segment offset with the
  //     segment aperture (or add the flat scratch base), again with a
  //     null-compare select unless known non-null;
  //   * 32-bit constant address space <-> 64-bit pointers;
  //   * anything else: poison (currently emitted as undef).
2442 MachineFunction &MF = B.getMF();
2443
2444 // MI can either be a G_ADDRSPACE_CAST or a
2445 // G_INTRINSIC @llvm.amdgcn.addrspacecast.nonnull
2446 assert(MI.getOpcode() == TargetOpcode::G_ADDRSPACE_CAST ||
2447 (isa<GIntrinsic>(MI) && cast<GIntrinsic>(MI).getIntrinsicID() ==
2448 Intrinsic::amdgcn_addrspacecast_nonnull));
2449
2450 const LLT S32 = LLT::scalar(32);
2451 Register Dst = MI.getOperand(0).getReg();
  // For the intrinsic form, operand 1 is the intrinsic ID, so the pointer
  // source is operand 2.
2452 Register Src = isa<GIntrinsic>(MI) ? MI.getOperand(2).getReg()
2453 : MI.getOperand(1).getReg();
2454 LLT DstTy = MRI.getType(Dst);
2455 LLT SrcTy = MRI.getType(Src);
2456 unsigned DestAS = DstTy.getAddressSpace();
2457 unsigned SrcAS = SrcTy.getAddressSpace();
2458
2459 // TODO: Avoid reloading from the queue ptr for each cast, or at least each
2460 // vector element.
2461 assert(!DstTy.isVector());
2462
2463 const AMDGPUTargetMachine &TM
2464 = static_cast<const AMDGPUTargetMachine &>(MF.getTarget());
2465
2466 if (TM.isNoopAddrSpaceCast(SrcAS, DestAS)) {
2467 MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST));
2468 return true;
2469 }
2470
2471 if (SrcAS == AMDGPUAS::FLAT_ADDRESS &&
2472 (DestAS == AMDGPUAS::LOCAL_ADDRESS ||
2473 DestAS == AMDGPUAS::PRIVATE_ADDRESS)) {
2474 auto castFlatToLocalOrPrivate = [&](const DstOp &Dst) -> Register {
2475 if (DestAS == AMDGPUAS::PRIVATE_ADDRESS &&
2476 ST.hasGloballyAddressableScratch()) {
2477 // flat -> private with globally addressable scratch: subtract
2478 // src_flat_scratch_base_lo.
2479 const LLT S32 = LLT::scalar(32);
2480 Register SrcLo = B.buildExtract(S32, Src, 0).getReg(0);
2481 Register FlatScratchBaseLo =
2482 B.buildInstr(AMDGPU::S_MOV_B32, {S32},
2483 {Register(AMDGPU::SRC_FLAT_SCRATCH_BASE_LO)})
2484 .getReg(0);
2485 MRI.setRegClass(FlatScratchBaseLo, &AMDGPU::SReg_32RegClass);
2486 Register Sub = B.buildSub(S32, SrcLo, FlatScratchBaseLo).getReg(0);
2487 return B.buildIntToPtr(Dst, Sub).getReg(0);
2488 }
2489
2490 // Extract low 32-bits of the pointer.
2491 return B.buildExtract(Dst, Src, 0).getReg(0);
2492 };
2493
2494 // For llvm.amdgcn.addrspacecast.nonnull we can always assume non-null, for
2495 // G_ADDRSPACE_CAST we need to guess.
2496 if (isa<GIntrinsic>(MI) || isKnownNonNull(Src, MRI, TM, SrcAS)) {
2497 castFlatToLocalOrPrivate(Dst);
2498 MI.eraseFromParent();
2499 return true;
2500 }
2501
  // Possibly-null source: null flat pointers must map to the segment's
  // null value, so wrap the conversion in a compare + select.
2502 unsigned NullVal = AMDGPU::getNullPointerValue(DestAS);
2503
2504 auto SegmentNull = B.buildConstant(DstTy, NullVal);
2505 auto FlatNull = B.buildConstant(SrcTy, 0);
2506
2507 // Extract low 32-bits of the pointer.
2508 auto PtrLo32 = castFlatToLocalOrPrivate(DstTy);
2509
2510 auto CmpRes =
2511 B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, FlatNull.getReg(0));
2512 B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0));
2513
2514 MI.eraseFromParent();
2515 return true;
2516 }
2517
2518 if (DestAS == AMDGPUAS::FLAT_ADDRESS &&
2519 (SrcAS == AMDGPUAS::LOCAL_ADDRESS ||
2520 SrcAS == AMDGPUAS::PRIVATE_ADDRESS)) {
2521 auto castLocalOrPrivateToFlat = [&](const DstOp &Dst) -> Register {
2522 // Coerce the type of the low half of the result so we can use
2523 // merge_values.
2524 Register SrcAsInt = B.buildPtrToInt(S32, Src).getReg(0);
2525
2526 if (SrcAS == AMDGPUAS::PRIVATE_ADDRESS &&
2527 ST.hasGloballyAddressableScratch()) {
2528 // For wave32: Addr = (TID[4:0] << 52) + FLAT_SCRATCH_BASE + privateAddr
2529 // For wave64: Addr = (TID[5:0] << 51) + FLAT_SCRATCH_BASE + privateAddr
2530 Register AllOnes = B.buildConstant(S32, -1).getReg(0);
2531 Register ThreadID = B.buildConstant(S32, 0).getReg(0);
  // mbcnt_lo (then mbcnt_hi for wave64) over an all-ones mask yields the
  // lane's thread ID within the wave.
2532 ThreadID = B.buildIntrinsic(Intrinsic::amdgcn_mbcnt_lo, {S32})
2533 .addUse(AllOnes)
2534 .addUse(ThreadID)
2535 .getReg(0);
2536 if (ST.isWave64()) {
2537 ThreadID = B.buildIntrinsic(Intrinsic::amdgcn_mbcnt_hi, {S32})
2538 .addUse(AllOnes)
2539 .addUse(ThreadID)
2540 .getReg(0);
2541 }
2542 Register ShAmt =
2543 B.buildConstant(S32, 57 - 32 - ST.getWavefrontSizeLog2()).getReg(0);
2544 Register SrcHi = B.buildShl(S32, ThreadID, ShAmt).getReg(0);
2545 Register CvtPtr =
2546 B.buildMergeLikeInstr(DstTy, {SrcAsInt, SrcHi}).getReg(0);
2547 // Accessing src_flat_scratch_base_lo as a 64-bit operand gives the full
2548 // 64-bit hi:lo value.
2549 Register FlatScratchBase =
2550 B.buildInstr(AMDGPU::S_MOV_B64, {S64},
2551 {Register(AMDGPU::SRC_FLAT_SCRATCH_BASE)})
2552 .getReg(0);
2553 MRI.setRegClass(FlatScratchBase, &AMDGPU::SReg_64RegClass);
2554 return B.buildPtrAdd(Dst, CvtPtr, FlatScratchBase).getReg(0);
2555 }
2556
2557 Register ApertureReg = getSegmentAperture(SrcAS, MRI, B);
2558 if (!ApertureReg.isValid())
2559 return false;
2560
2561 // TODO: Should we allow mismatched types but matching sizes in merges to
2562 // avoid the ptrtoint?
  // 64-bit flat pointer = {low: segment offset, high: segment aperture}.
2563 return B.buildMergeLikeInstr(Dst, {SrcAsInt, ApertureReg}).getReg(0);
2564 };
2565
2566 // For llvm.amdgcn.addrspacecast.nonnull we can always assume non-null, for
2567 // G_ADDRSPACE_CAST we need to guess.
2568 if (isa<GIntrinsic>(MI) || isKnownNonNull(Src, MRI, TM, SrcAS)) {
2569 castLocalOrPrivateToFlat(Dst);
2570 MI.eraseFromParent();
2571 return true;
2572 }
2573
2574 Register BuildPtr = castLocalOrPrivateToFlat(DstTy);
2575
2576 auto SegmentNull =
2577 B.buildConstant(SrcTy, AMDGPU::getNullPointerValue(SrcAS));
2578 auto FlatNull = B.buildConstant(DstTy, AMDGPU::getNullPointerValue(DestAS));
2579
2580 auto CmpRes = B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src,
2581 SegmentNull.getReg(0));
2582
2583 B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull);
2584
2585 MI.eraseFromParent();
2586 return true;
2587 }
2588
2589 if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
2590 SrcTy.getSizeInBits() == 64) {
2591 // Truncate.
2592 B.buildExtract(Dst, Src, 0);
2593 MI.eraseFromParent();
2594 return true;
2595 }
2596
2597 if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
2598 DstTy.getSizeInBits() == 64) {
  // NOTE(review): the declaration of `Info` (upstream source line 2599) is
  // missing from this listing.
2600 uint32_t AddrHiVal = Info->get32BitAddressHighBits();
2601 auto PtrLo = B.buildPtrToInt(S32, Src);
2602 if (AddrHiVal == 0) {
2603 auto Zext = B.buildZExt(LLT::scalar(64), PtrLo);
2604 B.buildIntToPtr(Dst, Zext);
2605 } else {
2606 auto HighAddr = B.buildConstant(S32, AddrHiVal);
2607 B.buildMergeLikeInstr(Dst, {PtrLo, HighAddr});
2608 }
2609
2610 MI.eraseFromParent();
2611 return true;
2612 }
2613
2614 // Invalid casts are poison.
2615 // TODO: Should return poison
2616 B.buildUndef(Dst);
2617 MI.eraseFromParent();
2618 return true;
2619}
2620
2623 MachineIRBuilder &B) const {
  // Lower G_INTRINSIC_ROUNDEVEN for f64 with the magic-number trick:
  // adding a sign-matched 2^52 forces the hardware to round the value to an
  // integer (nearest-even under the default rounding mode, matching the
  // roundeven semantics), and subtracting it back recovers the result.
  // C2 = 0x1.fffffffffffffp+51 (the largest double below 2^52): values with
  // |src| > C2 are already integral and are passed through unchanged.
2624 Register Src = MI.getOperand(1).getReg();
2625 LLT Ty = MRI.getType(Src);
2626 assert(Ty.isScalar() && Ty.getSizeInBits() == 64);
2627
2628 APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
2629 APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");
2630
2631 auto C1 = B.buildFConstant(Ty, C1Val);
  // Match C1's sign to Src so the add/sub round trip works for negatives.
2632 auto CopySign = B.buildFCopysign(Ty, C1, Src);
2633
2634 // TODO: Should this propagate fast-math-flags?
2635 auto Tmp1 = B.buildFAdd(Ty, Src, CopySign);
2636 auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign);
2637
2638 auto C2 = B.buildFConstant(Ty, C2Val);
2639 auto Fabs = B.buildFAbs(Ty, Src);
2640
2641 auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2);
2642 B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2);
2643 MI.eraseFromParent();
2644 return true;
2645}
2646
2649 MachineIRBuilder &B) const {
  // Lower G_FCEIL for f64 in terms of intrinsic trunc: truncate toward
  // zero, then add 1.0 back when the source was positive and non-integral
  // (branchlessly, via a select of the 0.0/1.0 adjustment).
2650
2651 const LLT S1 = LLT::scalar(1);
2652 const LLT S64 = LLT::scalar(64);
2653
2654 Register Src = MI.getOperand(1).getReg();
2655 assert(MRI.getType(Src) == S64);
2656
2657 // result = trunc(src)
2658 // if (src > 0.0 && src != result)
2659 // result += 1.0
2660
2661 auto Trunc = B.buildIntrinsicTrunc(S64, Src);
2662
2663 const auto Zero = B.buildFConstant(S64, 0.0);
2664 const auto One = B.buildFConstant(S64, 1.0);
2665 auto Lt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero);
2666 auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc);
2667 auto And = B.buildAnd(S1, Lt0, NeTrunc);
2668 auto Add = B.buildSelect(S64, And, One, Zero);
2669
2670 // TODO: Should this propagate fast-math-flags?
2671 B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add);
2672 MI.eraseFromParent();
2673 return true;
2674}
2675
2678 MachineIRBuilder &B) const {
  // Lower G_FREM as x - trunc(x / y) * y, expressed with a fused
  // multiply-add: dst = fma(-trunc(x / y), y, x). Fast-math flags from the
  // original instruction are propagated to every emitted operation.
2679 Register DstReg = MI.getOperand(0).getReg();
2680 Register Src0Reg = MI.getOperand(1).getReg();
2681 Register Src1Reg = MI.getOperand(2).getReg();
2682 auto Flags = MI.getFlags();
2683 LLT Ty = MRI.getType(DstReg);
2684
2685 auto Div = B.buildFDiv(Ty, Src0Reg, Src1Reg, Flags);
2686 auto Trunc = B.buildIntrinsicTrunc(Ty, Div, Flags);
2687 auto Neg = B.buildFNeg(Ty, Trunc, Flags);
2688 B.buildFMA(DstReg, Neg, Src1Reg, Src0Reg, Flags);
2689 MI.eraseFromParent();
2690 return true;
2691}
2692
2695 const unsigned FractBits = 52;
  // Extract the unbiased exponent of an f64 from its high 32 bits (`Hi`):
  // amdgcn.ubfe pulls the 11 exponent bits starting at bit 20 (52 - 32 --
  // the exponent field's position within the high word), then the IEEE-754
  // double bias of 1023 is subtracted.
2696 const unsigned ExpBits = 11;
2697 LLT S32 = LLT::scalar(32);
2698
2699 auto Const0 = B.buildConstant(S32, FractBits - 32);
2700 auto Const1 = B.buildConstant(S32, ExpBits);
2701
2702 auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32})
2703 .addUse(Hi)
2704 .addUse(Const0.getReg(0))
2705 .addUse(Const1.getReg(0));
2706
2707 return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023));
2708}
2709
2712 MachineIRBuilder &B) const {
  // Lower G_INTRINSIC_TRUNC for f64 with integer bit manipulation:
  // mask off the fraction bits that lie below the binary point (determined
  // by the exponent). Exponent < 0 means |src| < 1, so the result is just
  // the sign (+/-0.0); exponent > 51 means the value is already integral
  // and is passed through unchanged.
2713 const LLT S1 = LLT::scalar(1);
2714 const LLT S32 = LLT::scalar(32);
2715 const LLT S64 = LLT::scalar(64);
2716
2717 Register Src = MI.getOperand(1).getReg();
2718 assert(MRI.getType(Src) == S64);
2719
2720 // TODO: Should this use extract since the low half is unused?
2721 auto Unmerge = B.buildUnmerge({S32, S32}, Src);
2722 Register Hi = Unmerge.getReg(1);
2723
2724 // Extract the upper half, since this is where we will find the sign and
2725 // exponent.
2726 auto Exp = extractF64Exponent(Hi, B);
2727
2728 const unsigned FractBits = 52;
2729
2730 // Extract the sign bit.
2731 const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31);
2732 auto SignBit = B.buildAnd(S32, Hi, SignBitMask);
2733
2734 const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1);
2735
2736 const auto Zero32 = B.buildConstant(S32, 0);
2737
2738 // Extend back to 64-bits.
2739 auto SignBit64 = B.buildMergeLikeInstr(S64, {Zero32, SignBit});
2740
  // Shifting the fraction mask right by the exponent leaves set bits only
  // in the sub-integer positions; clearing those bits truncates the value.
2741 auto Shr = B.buildAShr(S64, FractMask, Exp);
2742 auto Not = B.buildNot(S64, Shr);
2743 auto Tmp0 = B.buildAnd(S64, Src, Not);
2744 auto FiftyOne = B.buildConstant(S32, FractBits - 1);
2745
2746 auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32);
2747 auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne);
2748
2749 auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0);
2750 B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1);
2751 MI.eraseFromParent();
2752 return true;
2753}
2754
2757 MachineIRBuilder &B, bool Signed) const {
  // Lower G_SITOFP/G_UITOFP (selected by `Signed`) for an s64 source.
  // s64 -> f64: convert the halves independently and combine as
  //   ldexp(fp(hi), 32) + fp(lo).
  // s64 -> f32: normalize the value with a left shift first so the
  // significant bits survive the conversion, then scale back with ldexp.
2758
2759 Register Dst = MI.getOperand(0).getReg();
2760 Register Src = MI.getOperand(1).getReg();
2761
2762 const LLT S64 = LLT::scalar(64);
2763 const LLT S32 = LLT::scalar(32);
2764
2765 assert(MRI.getType(Src) == S64);
2766
2767 auto Unmerge = B.buildUnmerge({S32, S32}, Src);
2768 auto ThirtyTwo = B.buildConstant(S32, 32);
2769
2770 if (MRI.getType(Dst) == S64) {
  // Only the high half carries the sign, so only it uses a signed convert.
2771 auto CvtHi = Signed ? B.buildSITOFP(S64, Unmerge.getReg(1))
2772 : B.buildUITOFP(S64, Unmerge.getReg(1));
2773
2774 auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0));
2775 auto LdExp = B.buildFLdexp(S64, CvtHi, ThirtyTwo);
2776
2777 // TODO: Should this propagate fast-math-flags?
2778 B.buildFAdd(Dst, LdExp, CvtLo);
2779 MI.eraseFromParent();
2780 return true;
2781 }
2782
2783 assert(MRI.getType(Dst) == S32);
2784
2785 auto One = B.buildConstant(S32, 1);
2786
2787 MachineInstrBuilder ShAmt;
2788 if (Signed) {
  // Signed case: the shift amount is one less than the position of the
  // highest significant (non-sign) bit, clamped so that operands whose
  // halves have opposite signs are not over-shifted.
2789 auto ThirtyOne = B.buildConstant(S32, 31);
2790 auto X = B.buildXor(S32, Unmerge.getReg(0), Unmerge.getReg(1));
2791 auto OppositeSign = B.buildAShr(S32, X, ThirtyOne);
2792 auto MaxShAmt = B.buildAdd(S32, ThirtyTwo, OppositeSign);
2793 auto LS = B.buildIntrinsic(Intrinsic::amdgcn_sffbh, {S32})
2794 .addUse(Unmerge.getReg(1));
2795 auto LS2 = B.buildSub(S32, LS, One);
2796 ShAmt = B.buildUMin(S32, LS2, MaxShAmt);
2797 } else
2798 ShAmt = B.buildCTLZ(S32, Unmerge.getReg(1));
2799 auto Norm = B.buildShl(S64, Src, ShAmt);
2800 auto Unmerge2 = B.buildUnmerge({S32, S32}, Norm);
  // Fold any remaining low bits into a sticky bit so rounding is correct.
2801 auto Adjust = B.buildUMin(S32, One, Unmerge2.getReg(0));
2802 auto Norm2 = B.buildOr(S32, Unmerge2.getReg(1), Adjust);
2803 auto FVal = Signed ? B.buildSITOFP(S32, Norm2) : B.buildUITOFP(S32, Norm2);
2804 auto Scale = B.buildSub(S32, ThirtyTwo, ShAmt);
2805 B.buildFLdexp(Dst, FVal, Scale);
2806 MI.eraseFromParent();
2807 return true;
2808}
2809
2810// TODO: Copied from DAG implementation. Verify logic and document how this
2811// actually works.
2815 bool Signed) const {
  // Lower G_FPTOSI/G_FPTOUI (selected by `Signed`) producing an s64 result
  // from an s32 or s64 floating-point source; see the algorithm sketch in
  // the comment below.
2816
2817 Register Dst = MI.getOperand(0).getReg();
2818 Register Src = MI.getOperand(1).getReg();
2819
2820 const LLT S64 = LLT::scalar(64);
2821 const LLT S32 = LLT::scalar(32);
2822
2823 const LLT SrcLT = MRI.getType(Src);
2824 assert((SrcLT == S32 || SrcLT == S64) && MRI.getType(Dst) == S64);
2825
2826 unsigned Flags = MI.getFlags();
2827
2828 // The basic idea of converting a floating point number into a pair of 32-bit
2829 // integers is illustrated as follows:
2830 //
2831 // tf := trunc(val);
2832 // hif := floor(tf * 2^-32);
2833 // lof := tf - hif * 2^32; // lof is always positive due to floor.
2834 // hi := fptoi(hif);
2835 // lo := fptoi(lof);
2836 //
2837 auto Trunc = B.buildIntrinsicTrunc(SrcLT, Src, Flags);
  // NOTE(review): the declaration of `Sign` (upstream source line 2838,
  // presumably `MachineInstrBuilder Sign;`) is missing from this listing.
2839 if (Signed && SrcLT == S32) {
2840 // However, a 32-bit floating point number has only 23 bits mantissa and
2841 // it's not enough to hold all the significant bits of `lof` if val is
2842 // negative. To avoid the loss of precision, We need to take the absolute
2843 // value after truncating and flip the result back based on the original
2844 // signedness.
2845 Sign = B.buildAShr(S32, Src, B.buildConstant(S32, 31));
2846 Trunc = B.buildFAbs(S32, Trunc, Flags);
2847 }
2848 MachineInstrBuilder K0, K1;
2849 if (SrcLT == S64) {
2850 K0 = B.buildFConstant(
2851 S64, llvm::bit_cast<double>(UINT64_C(/*2^-32*/ 0x3df0000000000000)));
2852 K1 = B.buildFConstant(
2853 S64, llvm::bit_cast<double>(UINT64_C(/*-2^32*/ 0xc1f0000000000000)));
2854 } else {
2855 K0 = B.buildFConstant(
2856 S32, llvm::bit_cast<float>(UINT32_C(/*2^-32*/ 0x2f800000)));
2857 K1 = B.buildFConstant(
2858 S32, llvm::bit_cast<float>(UINT32_C(/*-2^32*/ 0xcf800000)));
2859 }
2860
2861 auto Mul = B.buildFMul(SrcLT, Trunc, K0, Flags);
2862 auto FloorMul = B.buildFFloor(SrcLT, Mul, Flags);
  // lof = fma(hif, -2^32, tf), matching the sketch above.
2863 auto Fma = B.buildFMA(SrcLT, FloorMul, K1, Trunc, Flags);
2864
2865 auto Hi = (Signed && SrcLT == S64) ? B.buildFPTOSI(S32, FloorMul)
2866 : B.buildFPTOUI(S32, FloorMul);
2867 auto Lo = B.buildFPTOUI(S32, Fma);
2868
2869 if (Signed && SrcLT == S32) {
2870 // Flip the result based on the signedness, which is either all 0s or 1s.
2871 Sign = B.buildMergeLikeInstr(S64, {Sign, Sign});
2872 // r := xor({lo, hi}, sign) - sign;
2873 B.buildSub(Dst, B.buildXor(S64, B.buildMergeLikeInstr(S64, {Lo, Hi}), Sign),
2874 Sign);
2875 } else
2876 B.buildMergeLikeInstr(Dst, {Lo, Hi});
2877 MI.eraseFromParent();
2878
2879 return true;
2880}
2881
2883 MachineInstr &MI) const {
  // Custom handling for G_FMINNUM/G_FMAXNUM (and the *IMUMNUM variants,
  // per the dispatch in legalizeCustom): when IEEE mode is off the
  // instructions are usable as-is, so no rewrite is required.
  // NOTE(review): two upstream lines are missing from this listing: 2885
  // (the `MFI` declaration read below) and 2891 (the lowering call used
  // when IEEE mode is enabled).
2884 MachineFunction &MF = Helper.MIRBuilder.getMF();
2886
2887 // With ieee_mode disabled, the instructions have the correct behavior.
2888 if (!MFI->getMode().IEEE)
2889 return true;
2890
2892}
2893
2895 MachineInstr &MI) const {
  // Custom lowering for G_EXTRACT: when both the offset and all widths are
  // 32-bit aligned, replace the extract with an unmerge of the source into
  // s32 pieces and a re-merge of the selected pieces, avoiding the generic
  // shift+trunc expansion.
2896 MachineIRBuilder &B = Helper.MIRBuilder;
2897 MachineRegisterInfo &MRI = *B.getMRI();
2898 Register DstReg = MI.getOperand(0).getReg();
2899 Register SrcReg = MI.getOperand(1).getReg();
2900 uint64_t Offset = MI.getOperand(2).getImm();
2901
2902 // Fall back to generic lowering for offset 0 (trivial trunc) and
2903 // non-32-bit-aligned cases which require shift+trunc sequences
2904 // that generic code handles correctly.
2905 if (Offset == 0 || Offset % 32 != 0)
2906 return Helper.lowerExtract(MI) == LegalizerHelper::Legalized;
2907
2908 const LLT DstTy = MRI.getType(DstReg);
2909 unsigned StartIdx = Offset / 32;
2910 unsigned DstCount = DstTy.getSizeInBits() / 32;
2911 auto Unmerge = B.buildUnmerge(LLT::scalar(32), SrcReg);
2912
2913 if (DstCount == 1) {
  // Single-piece result: forward the s32 piece directly; pointers need an
  // inttoptr since the unmerged pieces are scalars.
2914 if (DstTy.isPointer())
2915 B.buildIntToPtr(DstReg, Unmerge.getReg(StartIdx));
2916 else
2917 MRI.replaceRegWith(DstReg, Unmerge.getReg(StartIdx));
2918 } else {
2919 SmallVector<Register, 8> MergeVec;
2920 for (unsigned I = 0; I < DstCount; ++I)
2921 MergeVec.push_back(Unmerge.getReg(StartIdx + I));
2922 B.buildMergeLikeInstr(DstReg, MergeVec);
2923 }
2924
2925 MI.eraseFromParent();
2926 return true;
2927}
2928
2930 MachineInstr &MI) const {
  // Custom lowering for G_INSERT: when the offset and all sizes are 32-bit
  // aligned, unmerge the destination-source into s32 pieces, splice in the
  // (unmerged) inserted value, and re-merge -- avoiding the generic
  // shift+mask expansion.
2931 MachineIRBuilder &B = Helper.MIRBuilder;
2932 MachineRegisterInfo &MRI = *B.getMRI();
2933 Register DstReg = MI.getOperand(0).getReg();
2934 Register SrcReg = MI.getOperand(1).getReg();
2935 Register InsertSrc = MI.getOperand(2).getReg();
2936 uint64_t Offset = MI.getOperand(3).getImm();
2937
2938 unsigned DstSize = MRI.getType(DstReg).getSizeInBits();
2939 const LLT InsertTy = MRI.getType(InsertSrc);
2940 unsigned InsertSize = InsertTy.getSizeInBits();
2941
2942 // Fall back to generic lowering for non-32-bit-aligned cases which
2943 // require shift+mask sequences that generic code handles correctly.
2944 if (Offset % 32 != 0 || DstSize % 32 != 0 || InsertSize % 32 != 0)
2945 return Helper.lowerInsert(MI) == LegalizerHelper::Legalized;
2946
2947 const LLT S32 = LLT::scalar(32);
2948 unsigned DstCount = DstSize / 32;
2949 unsigned InsertCount = InsertSize / 32;
2950 unsigned StartIdx = Offset / 32;
2951
2952 auto SrcUnmerge = B.buildUnmerge(S32, SrcReg);
2953
  // Pieces below the insertion point come from the original source...
2954 SmallVector<Register, 8> MergeVec;
2955 for (unsigned I = 0; I < StartIdx; ++I)
2956 MergeVec.push_back(SrcUnmerge.getReg(I));
2957
  // ...then the inserted value's piece(s)...
2958 if (InsertCount == 1) {
2959 // Merge-like instructions require same source types. Convert pointer
2960 // to scalar when inserting a pointer value into a scalar.
2961 if (InsertTy.isPointer())
2962 InsertSrc = B.buildPtrToInt(S32, InsertSrc).getReg(0);
2963 MergeVec.push_back(InsertSrc);
2964 } else {
2965 auto InsertUnmerge = B.buildUnmerge(S32, InsertSrc);
2966 for (unsigned I = 0; I < InsertCount; ++I)
2967 MergeVec.push_back(InsertUnmerge.getReg(I));
2968 }
2969
  // ...and the remainder of the original source above it.
2970 for (unsigned I = StartIdx + InsertCount; I < DstCount; ++I)
2971 MergeVec.push_back(SrcUnmerge.getReg(I));
2972
2973 B.buildMergeLikeInstr(DstReg, MergeVec);
2974
2975 MI.eraseFromParent();
2976 return true;
2977}
2978
2981 MachineIRBuilder &B) const {
  // Custom lowering for G_EXTRACT_VECTOR_ELT. Wide pointer elements are
  // routed through integer vectors; constant indices are resolved to an
  // unmerge+copy (out-of-range becomes undef); dynamic indices are left
  // for register-indexing selection.
2982 // TODO: Should move some of this into LegalizerHelper.
2983
2984 // TODO: Promote dynamic indexing of s16 to s32
2985
2986 Register Dst = MI.getOperand(0).getReg();
2987 Register Vec = MI.getOperand(1).getReg();
2988
2989 LLT VecTy = MRI.getType(Vec);
2990 LLT EltTy = VecTy.getElementType();
2991 assert(EltTy == MRI.getType(Dst));
2992
2993 // Other legalization maps vector<? x [type bigger than 64 bits]> via bitcasts
2994 // but we can't go directly to that logic because you can't bitcast a vector
2995 // of pointers to a vector of integers. Therefore, introduce an intermediate
2996 // vector of integers using ptrtoint (and inttoptr on the output) in order to
2997 // drive the legalization forward.
2998 if (EltTy.isPointer() && EltTy.getSizeInBits() > 64) {
2999 LLT IntTy = LLT::scalar(EltTy.getSizeInBits());
3000 LLT IntVecTy = VecTy.changeElementType(IntTy);
3001
3002 auto IntVec = B.buildPtrToInt(IntVecTy, Vec);
3003 auto IntElt = B.buildExtractVectorElement(IntTy, IntVec, MI.getOperand(2));
3004 B.buildIntToPtr(Dst, IntElt);
3005
3006 MI.eraseFromParent();
3007 return true;
3008 }
3009
3010 // FIXME: Artifact combiner probably should have replaced the truncated
3011 // constant before this, so we shouldn't need
3012 // getIConstantVRegValWithLookThrough.
3013 std::optional<ValueAndVReg> MaybeIdxVal =
3014 getIConstantVRegValWithLookThrough(MI.getOperand(2).getReg(), MRI);
3015 if (!MaybeIdxVal) // Dynamic case will be selected to register indexing.
3016 return true;
3017 const uint64_t IdxVal = MaybeIdxVal->Value.getZExtValue();
3018
3019 if (IdxVal < VecTy.getNumElements()) {
3020 auto Unmerge = B.buildUnmerge(EltTy, Vec);
3021 B.buildCopy(Dst, Unmerge.getReg(IdxVal));
3022 } else {
  // Out-of-bounds constant index: the result is undefined.
3023 B.buildUndef(Dst);
3024 }
3025
3026 MI.eraseFromParent();
3027 return true;
3028}
3029
3032 MachineIRBuilder &B) const {
  // Custom legalization of G_INSERT_VECTOR_ELT. Constant-index inserts are
  // lowered via unmerge, replacing one piece, and re-merging; wide (>64-bit)
  // pointer elements are routed through an integer vector first.
3033 // TODO: Should move some of this into LegalizerHelper.
3034
3035 // TODO: Promote dynamic indexing of s16 to s32
3036
3037 Register Dst = MI.getOperand(0).getReg();
3038 Register Vec = MI.getOperand(1).getReg();
3039 Register Ins = MI.getOperand(2).getReg();
3040
3041 LLT VecTy = MRI.getType(Vec);
3042 LLT EltTy = VecTy.getElementType();
3043 assert(EltTy == MRI.getType(Ins));
3044
3045 // Other legalization maps vector<? x [type bigger than 64 bits]> via bitcasts
3046 // but we can't go directly to that logic because you can't bitcast a vector
3047 // of pointers to a vector of integers. Therefore, make the pointer vector
3048 // into an equivalent vector of integers with ptrtoint, insert the ptrtoint'd
3049 // new value, and then inttoptr the result vector back. This will then allow
3050 // the rest of legalization to take over.
3051 if (EltTy.isPointer() && EltTy.getSizeInBits() > 64) {
3052 LLT IntTy = LLT::scalar(EltTy.getSizeInBits());
3053 LLT IntVecTy = VecTy.changeElementType(IntTy);
3054
3055 auto IntVecSource = B.buildPtrToInt(IntVecTy, Vec);
3056 auto IntIns = B.buildPtrToInt(IntTy, Ins);
3057 auto IntVecDest = B.buildInsertVectorElement(IntVecTy, IntVecSource, IntIns,
3058 MI.getOperand(3));
3059 B.buildIntToPtr(Dst, IntVecDest);
3060 MI.eraseFromParent();
3061 return true;
3062 }
3063
3064 // FIXME: Artifact combiner probably should have replaced the truncated
3065 // constant before this, so we shouldn't need
3066 // getIConstantVRegValWithLookThrough.
3067 std::optional<ValueAndVReg> MaybeIdxVal =
3068 getIConstantVRegValWithLookThrough(MI.getOperand(3).getReg(), MRI);
3069 if (!MaybeIdxVal) // Dynamic case will be selected to register indexing.
3070 return true;
3071
3072 const uint64_t IdxVal = MaybeIdxVal->Value.getZExtValue();
3073
3074 unsigned NumElts = VecTy.getNumElements();
3075 if (IdxVal < NumElts) {
  // Unmerge all elements, overwrite the one at IdxVal with the new value,
  // then rebuild the vector.
3077 for (unsigned i = 0; i < NumElts; ++i)
3078 SrcRegs.push_back(MRI.createGenericVirtualRegister(EltTy));
3079 B.buildUnmerge(SrcRegs, Vec);
3080
3081 SrcRegs[IdxVal] = MI.getOperand(2).getReg();
3082 B.buildMergeLikeInstr(Dst, SrcRegs);
3083 } else {
  // Out-of-bounds constant index produces an undefined result.
3084 B.buildUndef(Dst);
3085 }
3086
3087 MI.eraseFromParent();
3088 return true;
3089}
3090
3093 MachineIRBuilder &B) const {
  // Lower G_FSIN / G_FCOS to the amdgcn_sin / amdgcn_cos intrinsics. The
  // input is pre-scaled by 1/(2*pi); on subtargets with reduced trig range
  // the scaled value is additionally wrapped into [0,1) with amdgcn_fract.
3094
3095 Register DstReg = MI.getOperand(0).getReg();
3096 Register SrcReg = MI.getOperand(1).getReg();
3097 LLT Ty = MRI.getType(DstReg);
3098 unsigned Flags = MI.getFlags();
3099
3100 Register TrigVal;
3101 auto OneOver2Pi = B.buildFConstant(Ty, 0.5 * numbers::inv_pi);
3102 if (ST.hasTrigReducedRange()) {
3103 auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags);
3104 TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty})
3105 .addUse(MulVal.getReg(0))
3106 .setMIFlags(Flags)
3107 .getReg(0);
3108 } else
3109 TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0);
3110
  // Pick the matching intrinsic for the opcode being legalized.
3111 Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ?
3112 Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos;
3113 B.buildIntrinsic(TrigIntrin, ArrayRef<Register>(DstReg))
3114 .addUse(TrigVal)
3115 .setMIFlags(Flags);
3116 MI.eraseFromParent();
3117 return true;
3118}
3119
3122 const GlobalValue *GV,
3123 int64_t Offset,
3124 unsigned GAFlags) const {
  // Materialize a pc-relative address for \p GV into \p DstReg using
  // SI_PC_ADD_REL_OFFSET (or the 64-bit-literal variant). For 32-bit
  // destination pointers the 64-bit PC-relative value is computed in a
  // temporary and the low half extracted at the end.
3125 assert(isInt<32>(Offset + 4) && "32-bit offset is expected!");
3126 // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered
3127 // to the following code sequence:
3128 //
3129 // For constant address space:
3130 // s_getpc_b64 s[0:1]
3131 // s_add_u32 s0, s0, $symbol
3132 // s_addc_u32 s1, s1, 0
3133 //
3134 // s_getpc_b64 returns the address of the s_add_u32 instruction and then
3135 // a fixup or relocation is emitted to replace $symbol with a literal
3136 // constant, which is a pc-relative offset from the encoding of the $symbol
3137 // operand to the global variable.
3138 //
3139 // For global address space:
3140 // s_getpc_b64 s[0:1]
3141 // s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo
3142 // s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi
3143 //
3144 // s_getpc_b64 returns the address of the s_add_u32 instruction and then
3145 // fixups or relocations are emitted to replace $symbol@*@lo and
3146 // $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant,
3147 // which is a 64-bit pc-relative offset from the encoding of the $symbol
3148 // operand to the global variable.
3149
3151
3152 Register PCReg = PtrTy.getSizeInBits() != 32 ? DstReg :
3153 B.getMRI()->createGenericVirtualRegister(ConstPtrTy);
3154
3155 if (ST.has64BitLiterals()) {
3156 assert(GAFlags != SIInstrInfo::MO_NONE);
3157
3159 B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET64).addDef(PCReg);
  // GAFlags + 2 selects the 64-bit form of the relocation variant.
3160 MIB.addGlobalAddress(GV, Offset, GAFlags + 2);
3161 } else {
3163 B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET).addDef(PCReg);
3164
3165 MIB.addGlobalAddress(GV, Offset, GAFlags);
3166 if (GAFlags == SIInstrInfo::MO_NONE)
3167 MIB.addImm(0);
3168 else
  // GAFlags + 1 is the matching @hi relocation for the second operand.
3169 MIB.addGlobalAddress(GV, Offset, GAFlags + 1);
3170 }
3171
3172 if (!B.getMRI()->getRegClassOrNull(PCReg))
3173 B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass);
3174
3175 if (PtrTy.getSizeInBits() == 32)
3176 B.buildExtract(DstReg, PCReg, 0);
3177 return true;
3178}
3179
3180// Emit an ABS32_LO / ABS32_HI relocation stub.
3182 Register DstReg, LLT PtrTy, MachineIRBuilder &B, const GlobalValue *GV,
3183 MachineRegisterInfo &MRI) const {
  // Materialize an absolute address for \p GV: a single S_MOV_B64 with an
  // ABS64 relocation when 64-bit literals are available, otherwise one or
  // two S_MOV_B32s with ABS32_LO/ABS32_HI relocations merged into DstReg.
3184 bool RequiresHighHalf = PtrTy.getSizeInBits() != 32;
3185
3186 if (RequiresHighHalf && ST.has64BitLiterals()) {
3187 if (!MRI.getRegClassOrNull(DstReg))
3188 MRI.setRegClass(DstReg, &AMDGPU::SReg_64RegClass);
3189 B.buildInstr(AMDGPU::S_MOV_B64)
3190 .addDef(DstReg)
3191 .addGlobalAddress(GV, 0, SIInstrInfo::MO_ABS64);
3192 return;
3193 }
3194
3195 LLT S32 = LLT::scalar(32);
3196
3197 // Use the destination directly, if and only if we store the lower address
3198 // part only and we don't have a register class being set.
3199 Register AddrLo = !RequiresHighHalf && !MRI.getRegClassOrNull(DstReg)
3200 ? DstReg
3202
3203 if (!MRI.getRegClassOrNull(AddrLo))
3204 MRI.setRegClass(AddrLo, &AMDGPU::SReg_32RegClass);
3205
3206 // Write the lower half.
3207 B.buildInstr(AMDGPU::S_MOV_B32)
3208 .addDef(AddrLo)
3209 .addGlobalAddress(GV, 0, SIInstrInfo::MO_ABS32_LO)
3210
3211 // If required, write the upper half as well.
3212 if (RequiresHighHalf) {
3213 assert(PtrTy.getSizeInBits() == 64 &&
3214 "Must provide a 64-bit pointer type!");
3215
3217 MRI.setRegClass(AddrHi, &AMDGPU::SReg_32RegClass);
3218
3219 B.buildInstr(AMDGPU::S_MOV_B32)
3220 .addDef(AddrHi)
3221 .addGlobalAddress(GV, 0, SIInstrInfo::MO_ABS32_HI);
3222
3223 // Use the destination directly, if and only if we don't have a register
3224 // class being set.
3225 Register AddrDst = !MRI.getRegClassOrNull(DstReg)
3226 ? DstReg
3228
3229 if (!MRI.getRegClassOrNull(AddrDst))
3230 MRI.setRegClass(AddrDst, &AMDGPU::SReg_64RegClass);
3231
3232 B.buildMergeValues(AddrDst, {AddrLo, AddrHi});
3233
3234 // If we created a new register for the destination, cast the result into
3235 // the final output.
3236 if (AddrDst != DstReg)
3237 B.buildCast(DstReg, AddrDst);
3238 } else if (AddrLo != DstReg) {
3239 // If we created a new register for the destination, cast the result into
3240 // the final output.
3241 B.buildCast(DstReg, AddrLo);
3242 }
3243}
3244
3247 MachineIRBuilder &B) const {
  // Legalize G_GLOBAL_VALUE. LDS (local) globals become constant offsets into
  // the kernel's LDS allocation; absolute addressing is used on PAL/Mesa;
  // otherwise the address is formed pc-relatively (fixup, REL32, or a load
  // through the GOT).
3248 Register DstReg = MI.getOperand(0).getReg();
3249 LLT Ty = MRI.getType(DstReg);
3250 unsigned AS = Ty.getAddressSpace();
3251
3252 const GlobalValue *GV = MI.getOperand(1).getGlobal();
3253 MachineFunction &MF = B.getMF();
3255
3257 if (!MFI->isModuleEntryFunction() &&
3258 GV->getName() != "llvm.amdgcn.module.lds" &&
3260 const Function &Fn = MF.getFunction();
3262 Fn, "local memory global used by non-kernel function",
3263 MI.getDebugLoc(), DS_Warning);
3264
3265 // We currently don't have a way to correctly allocate LDS objects that
3266 // aren't directly associated with a kernel. We do force inlining of
3267 // functions that use local objects. However, if these dead functions are
3268 // not eliminated, we don't want a compile time error. Just emit a warning
3269 // and a trap, since there should be no callable path here.
3270 B.buildTrap();
3271 B.buildUndef(DstReg);
3272 MI.eraseFromParent();
3273 return true;
3274 }
3275
3276 // TODO: We could emit code to handle the initialization somewhere.
3277 // We ignore the initializer for now and legalize it to allow selection.
3278 // The initializer will anyway get errored out during assembly emission.
3279 const SITargetLowering *TLI = ST.getTargetLowering();
3280 if (!TLI->shouldUseLDSConstAddress(GV)) {
3281 MI.getOperand(1).setTargetFlags(SIInstrInfo::MO_ABS32_LO);
3282 return true; // Leave in place;
3283 }
3284
3285 const GlobalVariable &GVar = *cast<GlobalVariable>(GV);
3286 if (AS == AMDGPUAS::LOCAL_ADDRESS && GV->hasExternalLinkage()) {
3287 // HIP uses an unsized array `extern __shared__ T s[]` or similar
3288 // zero-sized type in other languages to declare the dynamic shared
3289 // memory which size is not known at the compile time. They will be
3290 // allocated by the runtime and placed directly after the static
3291 // allocated ones. They all share the same offset.
3292 if (GVar.getGlobalSize(GVar.getDataLayout()) == 0) {
3293 // Adjust alignment for that dynamic shared memory array.
3294 MFI->setDynLDSAlign(MF.getFunction(), GVar);
3295 LLT S32 = LLT::scalar(32);
3296 auto Sz = B.buildIntrinsic(Intrinsic::amdgcn_groupstaticsize, {S32});
3297 B.buildIntToPtr(DstReg, Sz);
3298 MI.eraseFromParent();
3299 return true;
3300 }
3301 }
3302
  // Static LDS: the address is simply the allocated offset within LDS.
3303 B.buildConstant(DstReg, MFI->allocateLDSGlobal(B.getDataLayout(), GVar));
3304 MI.eraseFromParent();
3305 return true;
3306 }
3307
3308 if (ST.isAmdPalOS() || ST.isMesa3DOS()) {
3309 buildAbsGlobalAddress(DstReg, Ty, B, GV, MRI);
3310 MI.eraseFromParent();
3311 return true;
3312 }
3313
3314 const SITargetLowering *TLI = ST.getTargetLowering();
3315
3316 if (TLI->shouldEmitFixup(GV)) {
3317 buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0);
3318 MI.eraseFromParent();
3319 return true;
3320 }
3321
3322 if (TLI->shouldEmitPCReloc(GV)) {
3323 buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32);
3324 MI.eraseFromParent();
3325 return true;
3326 }
3327
  // Fall back to loading the address from the GOT.
3329 Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy);
3330
3331 LLT LoadTy = Ty.getSizeInBits() == 32 ? PtrTy : Ty;
3336 LoadTy, Align(8));
3337
3338 buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32);
3339
3340 if (Ty.getSizeInBits() == 32) {
3341 // Truncate if this is a 32-bit constant address.
3342 auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO);
3343 B.buildExtract(DstReg, Load, 0);
3344 } else
3345 B.buildLoad(DstReg, GOTAddr, *GOTMMO);
3346
3347 MI.eraseFromParent();
3348 return true;
3349}
3350
  // Round a type up to the next power of two: vectors by element count,
  // scalars by bit width.
3352 if (Ty.isVector())
3353 return Ty.changeElementCount(
3354 ElementCount::getFixed(PowerOf2Ceil(Ty.getNumElements())));
3355 return LLT::scalar(PowerOf2Ceil(Ty.getSizeInBits()));
3356}
3357
3359 MachineInstr &MI) const {
  // Custom legalization for loads: casts 32-bit constant-address-space
  // pointers to the 64-bit constant address space, applies the buffer
  // resource (v4i32) workaround, and widens non-power-of-2 loads when the
  // alignment allows it.
3360 MachineIRBuilder &B = Helper.MIRBuilder;
3361 MachineRegisterInfo &MRI = *B.getMRI();
3362 GISelChangeObserver &Observer = Helper.Observer;
3363
3364 Register PtrReg = MI.getOperand(1).getReg();
3365 LLT PtrTy = MRI.getType(PtrReg);
3366 unsigned AddrSpace = PtrTy.getAddressSpace();
3367
3368 if (AddrSpace == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
3370 auto Cast = B.buildAddrSpaceCast(ConstPtr, PtrReg);
3371 Observer.changingInstr(MI);
3372 MI.getOperand(1).setReg(Cast.getReg(0));
3373 Observer.changedInstr(MI);
3374 return true;
3375 }
3376
3377 if (MI.getOpcode() != AMDGPU::G_LOAD)
3378 return false;
3379
3380 Register ValReg = MI.getOperand(0).getReg();
3381 LLT ValTy = MRI.getType(ValReg);
3382
3383 if (hasBufferRsrcWorkaround(ValTy)) {
3384 Observer.changingInstr(MI);
3385 castBufferRsrcFromV4I32(MI, B, MRI, 0);
3386 Observer.changedInstr(MI);
3387 return true;
3388 }
3389
3390 MachineMemOperand *MMO = *MI.memoperands_begin();
3391 const unsigned ValSize = ValTy.getSizeInBits();
3392 const LLT MemTy = MMO->getMemoryType();
3393 const Align MemAlign = MMO->getAlign();
3394 const unsigned MemSize = MemTy.getSizeInBits();
3395 const uint64_t AlignInBits = 8 * MemAlign.value();
3396
3397 // Widen non-power-of-2 loads to the alignment if needed
3398 if (shouldWidenLoad(ST, MemTy, AlignInBits, AddrSpace, MI.getOpcode())) {
3399 const unsigned WideMemSize = PowerOf2Ceil(MemSize);
3400
3401 // This was already the correct extending load result type, so just adjust
3402 // the memory type.
3403 if (WideMemSize == ValSize) {
3404 MachineFunction &MF = B.getMF();
3405
3406 MachineMemOperand *WideMMO =
3407 MF.getMachineMemOperand(MMO, 0, WideMemSize / 8);
3408 Observer.changingInstr(MI);
3409 MI.setMemRefs(MF, {WideMMO});
3410 Observer.changedInstr(MI);
3411 return true;
3412 }
3413
3414 // Don't bother handling edge case that should probably never be produced.
3415 if (ValSize > WideMemSize)
3416 return false;
3417
3418 LLT WideTy = widenToNextPowerOf2(ValTy);
3419
3420 Register WideLoad;
3421 if (!WideTy.isVector()) {
  // Scalar: load wide, then truncate back to the requested width.
3422 WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
3423 B.buildTrunc(ValReg, WideLoad).getReg(0);
3424 } else {
3425 // Extract the subvector.
3426
3427 if (isRegisterType(ST, ValTy)) {
3428 // If this is a case where G_EXTRACT is legal, use it.
3429 // (e.g. <3 x s32> -> <4 x s32>)
3430 WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
3431 B.buildExtract(ValReg, WideLoad, 0);
3432 } else {
3433 // For cases where the widened type isn't a nice register value, unmerge
3434 // from a widened register (e.g. <3 x s16> -> <4 x s16>)
3435 WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
3436 B.buildDeleteTrailingVectorElements(ValReg, WideLoad);
3437 }
3438 }
3439
3440 MI.eraseFromParent();
3441 return true;
3442 }
3443
3444 return false;
3445}
3446
3448 MachineInstr &MI) const {
  // Custom legalization for stores: only the buffer resource (v4i32)
  // workaround is applied; everything else is left to common legalization.
3449 MachineIRBuilder &B = Helper.MIRBuilder;
3450 MachineRegisterInfo &MRI = *B.getMRI();
3451 GISelChangeObserver &Observer = Helper.Observer;
3452
3453 Register DataReg = MI.getOperand(0).getReg();
3454 LLT DataTy = MRI.getType(DataReg);
3455
3456 if (hasBufferRsrcWorkaround(DataTy)) {
3457 Observer.changingInstr(MI);
3459 Observer.changedInstr(MI);
3460 return true;
3461 }
3462 return false;
3463}
3464
3467 MachineIRBuilder &B) const {
  // G_FMAD is legal only when denormals are flushed for the type; otherwise
  // fall back to the generic mul+add lowering via LegalizerHelper.
3468 LLT Ty = MRI.getType(MI.getOperand(0).getReg());
3469 assert(Ty.isScalar());
3470
3471 MachineFunction &MF = B.getMF();
3473
3474 // TODO: Always legal with future ftz flag.
3475 // FIXME: Do we need just output?
3476 if (Ty == LLT::float32() &&
3478 return true;
3479 if (Ty == LLT::float16() &&
3481 return true;
3482
  // Use a local helper/observer so the lowering does not notify the outer
  // legalizer's observer.
3483 MachineIRBuilder HelperBuilder(MI);
3484 GISelObserverWrapper DummyObserver;
3485 LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
3486 return Helper.lowerFMad(MI) == LegalizerHelper::Legalized;
3487}
3488
  // Lower G_ATOMIC_CMPXCHG by packing {new, cmp} into a 2-element vector
  // operand of the target G_AMDGPU_ATOMIC_CMPXCHG pseudo.
3491 Register DstReg = MI.getOperand(0).getReg();
3492 Register PtrReg = MI.getOperand(1).getReg();
3493 Register CmpVal = MI.getOperand(2).getReg();
3494 Register NewVal = MI.getOperand(3).getReg();
3495
3497 "this should not have been custom lowered");
3498
3499 LLT ValTy = MRI.getType(CmpVal);
3500 LLT VecTy = LLT::fixed_vector(2, ValTy);
3501
  // Operand order is {new value, compare value}.
3502 Register PackedVal = B.buildBuildVector(VecTy, { NewVal, CmpVal }).getReg(0);
3503
3504 B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG)
3505 .addDef(DstReg)
3506 .addUse(PtrReg)
3507 .addUse(PackedVal)
3508 .setMemRefs(MI.memoperands());
3509
3510 MI.eraseFromParent();
3511 return true;
3512}
3513
3514/// Return true if it's known that \p Src can never be an f32 denormal value.
3516 Register Src) {
3517 const MachineInstr *DefMI = MRI.getVRegDef(Src);
3518 switch (DefMI->getOpcode()) {
3519 case TargetOpcode::G_INTRINSIC: {
  // These intrinsics are known to produce non-denormal f32 results.
3521 case Intrinsic::amdgcn_frexp_mant:
3522 case Intrinsic::amdgcn_log:
3523 case Intrinsic::amdgcn_log_clamp:
3524 case Intrinsic::amdgcn_exp2:
3525 case Intrinsic::amdgcn_sqrt:
3526 return true;
3527 default:
3528 break;
3529 }
3530
3531 break;
3532 }
3533 case TargetOpcode::G_FSQRT:
3534 return true;
3535 case TargetOpcode::G_FFREXP: {
  // Only the mantissa result (operand 0) is known never-denormal.
3536 if (DefMI->getOperand(0).getReg() == Src)
3537 return true;
3538 break;
3539 }
3540 case TargetOpcode::G_FPEXT: {
  // f16 values extended to f32 cannot land in the f32 denormal range.
3541 return MRI.getType(DefMI->getOperand(1).getReg()) == LLT::scalar(16);
3542 }
3543 default:
3544 return false;
3545 }
3546
3547 return false;
3548}
3549
3550static bool allowApproxFunc(const MachineFunction &MF, unsigned Flags) {
3551 return Flags & MachineInstr::FmAfn;
3552}
3553
  // True when the f32 input may be a denormal and therefore needs to be
  // scaled before feeding the hardware log/exp intrinsics.
3555 unsigned Flags) {
3556 return !valueIsKnownNeverF32Denorm(MF.getRegInfo(), Src) &&
3559}
3560
3561std::pair<Register, Register>
3563 unsigned Flags) const {
  // If denormal handling is needed, scale denormal inputs by 2^32 so the
  // hardware log intrinsic sees a normal value. Returns {scaled input,
  // i1 "was scaled" flag}, or {} when no scaling is required.
3564 if (!needsDenormHandlingF32(B.getMF(), Src, Flags))
3565 return {};
3566
3567 const LLT F32 = LLT::scalar(32);
3568 auto SmallestNormal = B.buildFConstant(
3570 auto IsLtSmallestNormal =
3571 B.buildFCmp(CmpInst::FCMP_OLT, LLT::scalar(1), Src, SmallestNormal);
3572
  // Multiply denormal inputs by 2^32; leave normal inputs unchanged.
3573 auto Scale32 = B.buildFConstant(F32, 0x1.0p+32);
3574 auto One = B.buildFConstant(F32, 1.0);
3575 auto ScaleFactor =
3576 B.buildSelect(F32, IsLtSmallestNormal, Scale32, One, Flags);
3577 auto ScaledInput = B.buildFMul(F32, Src, ScaleFactor, Flags);
3578
3579 return {ScaledInput.getReg(0), IsLtSmallestNormal.getReg(0)};
3580}
3581
3583 MachineIRBuilder &B) const {
3584 // v_log_f32 is good enough for OpenCL, except it doesn't handle denormals.
3585 // If we have to handle denormals, scale up the input and adjust the result.
3586
3587 // scaled = x * (is_denormal ? 0x1.0p+32 : 1.0)
3588 // log2 = amdgpu_log2 - (is_denormal ? 32.0 : 0.0)
3589
3590 Register Dst = MI.getOperand(0).getReg();
3591 Register Src = MI.getOperand(1).getReg();
3592 LLT Ty = B.getMRI()->getType(Dst);
3593 unsigned Flags = MI.getFlags();
3594
3595 if (Ty == LLT::scalar(16)) {
3596 const LLT F32 = LLT::scalar(32);
3597 // Nothing in half is a denormal when promoted to f32.
3598 auto Ext = B.buildFPExt(F32, Src, Flags);
3599 auto Log2 = B.buildIntrinsic(Intrinsic::amdgcn_log, {F32})
3600 .addUse(Ext.getReg(0))
3601 .setMIFlags(Flags);
3602 B.buildFPTrunc(Dst, Log2, Flags);
3603 MI.eraseFromParent();
3604 return true;
3605 }
3606
3607 assert(Ty == LLT::scalar(32));
3608
3609 auto [ScaledInput, IsLtSmallestNormal] = getScaledLogInput(B, Src, Flags);
3610 if (!ScaledInput) {
  // No denormal handling needed: emit the hardware log directly.
3611 B.buildIntrinsic(Intrinsic::amdgcn_log, {MI.getOperand(0)})
3612 .addUse(Src)
3613 .setMIFlags(Flags);
3614 MI.eraseFromParent();
3615 return true;
3616 }
3617
3618 auto Log2 = B.buildIntrinsic(Intrinsic::amdgcn_log, {Ty})
3619 .addUse(ScaledInput)
3620 .setMIFlags(Flags);
3621
  // Undo the 2^32 scaling: log2(x * 2^32) - 32 == log2(x).
3622 auto ThirtyTwo = B.buildFConstant(Ty, 32.0);
3623 auto Zero = B.buildFConstant(Ty, 0.0);
3624 auto ResultOffset =
3625 B.buildSelect(Ty, IsLtSmallestNormal, ThirtyTwo, Zero, Flags);
3626 B.buildFSub(Dst, Log2, ResultOffset, Flags);
3627
3628 MI.eraseFromParent();
3629 return true;
3630}
3631
3633 Register Z, unsigned Flags) {
  // Emit X * Y + Z as an explicit (unfused) multiply followed by an add.
3634 auto FMul = B.buildFMul(Ty, X, Y, Flags);
3635 return B.buildFAdd(Ty, FMul, Z, Flags).getReg(0);
3636}
3637
3639 MachineIRBuilder &B) const {
  // Shared lowering for G_FLOG and G_FLOG10: computes a refined natural/
  // common log from the hardware log2 using split-constant correction terms,
  // with denormal pre-scaling and a finite-only fast path.
3640 const bool IsLog10 = MI.getOpcode() == TargetOpcode::G_FLOG10;
3641 assert(IsLog10 || MI.getOpcode() == TargetOpcode::G_FLOG);
3642
3643 MachineRegisterInfo &MRI = *B.getMRI();
3644 Register Dst = MI.getOperand(0).getReg();
3645 Register X = MI.getOperand(1).getReg();
3646 unsigned Flags = MI.getFlags();
3647 const LLT Ty = MRI.getType(X);
3648
3649 const LLT F32 = LLT::scalar(32);
3650 const LLT F16 = LLT::scalar(16);
3651
3652 if (Ty == F16 || MI.getFlag(MachineInstr::FmAfn)) {
3653 // TODO: The direct f16 path is 1.79 ulp for f16. This should be used
3654 // depending on !fpmath metadata.
3655 bool PromoteToF32 =
3656 Ty == F16 && (!MI.getFlag(MachineInstr::FmAfn) || !ST.has16BitInsts());
3657 if (PromoteToF32) {
3659 auto PromoteSrc = B.buildFPExt(F32, X);
3660 legalizeFlogUnsafe(B, LogVal, PromoteSrc.getReg(0), IsLog10, Flags);
3661 B.buildFPTrunc(Dst, LogVal);
3662 } else {
3663 legalizeFlogUnsafe(B, Dst, X, IsLog10, Flags);
3664 }
3665
3666 MI.eraseFromParent();
3667 return true;
3668 }
3669
3670 auto [ScaledInput, IsScaled] = getScaledLogInput(B, X, Flags);
3671 if (ScaledInput)
3672 X = ScaledInput;
3673
3674 auto Y =
3675 B.buildIntrinsic(Intrinsic::amdgcn_log, {Ty}).addUse(X).setMIFlags(Flags);
3676
3677 Register R;
3678 if (ST.hasFastFMAF32()) {
3679 // c+cc are ln(2)/ln(10) to more than 49 bits
3680 const float c_log10 = 0x1.344134p-2f;
3681 const float cc_log10 = 0x1.09f79ep-26f;
3682
3683 // c + cc is ln(2) to more than 49 bits
3684 const float c_log = 0x1.62e42ep-1f;
3685 const float cc_log = 0x1.efa39ep-25f;
3686
3687 auto C = B.buildFConstant(Ty, IsLog10 ? c_log10 : c_log);
3688 auto CC = B.buildFConstant(Ty, IsLog10 ? cc_log10 : cc_log);
3689 // This adds correction terms for which contraction may lead to an increase
3690 // in the error of the approximation, so disable it.
3691 auto NewFlags = Flags & ~(MachineInstr::FmContract);
3692 R = B.buildFMul(Ty, Y, C, NewFlags).getReg(0);
3693 auto NegR = B.buildFNeg(Ty, R, NewFlags);
3694 auto FMA0 = B.buildFMA(Ty, Y, C, NegR, NewFlags);
3695 auto FMA1 = B.buildFMA(Ty, Y, CC, FMA0, NewFlags);
3696 R = B.buildFAdd(Ty, R, FMA1, NewFlags).getReg(0);
3697 } else {
3698 // ch+ct is ln(2)/ln(10) to more than 36 bits
3699 const float ch_log10 = 0x1.344000p-2f;
3700 const float ct_log10 = 0x1.3509f6p-18f;
3701
3702 // ch + ct is ln(2) to more than 36 bits
3703 const float ch_log = 0x1.62e000p-1f;
3704 const float ct_log = 0x1.0bfbe8p-15f;
3705
3706 auto CH = B.buildFConstant(Ty, IsLog10 ? ch_log10 : ch_log);
3707 auto CT = B.buildFConstant(Ty, IsLog10 ? ct_log10 : ct_log);
3708
  // Split Y into a high part with the low mantissa bits masked off and a
  // low remainder, so the products with the split constants stay exact.
3709 auto MaskConst = B.buildConstant(Ty, 0xfffff000);
3710 auto YH = B.buildAnd(Ty, Y, MaskConst);
3711 auto YT = B.buildFSub(Ty, Y, YH, Flags);
3712 // This adds correction terms for which contraction may lead to an increase
3713 // in the error of the approximation, so disable it.
3714 auto NewFlags = Flags & ~(MachineInstr::FmContract);
3715 auto YTCT = B.buildFMul(Ty, YT, CT, NewFlags);
3716
3717 Register Mad0 =
3718 getMad(B, Ty, YH.getReg(0), CT.getReg(0), YTCT.getReg(0), NewFlags);
3719 Register Mad1 = getMad(B, Ty, YT.getReg(0), CH.getReg(0), Mad0, NewFlags);
3720 R = getMad(B, Ty, YH.getReg(0), CH.getReg(0), Mad1, NewFlags);
3721 }
3722
3723 const bool IsFiniteOnly =
3725
3726 if (!IsFiniteOnly) {
3727 // Expand isfinite(x) => fabs(x) < inf
  // Pass non-finite log2 results (inf/nan) through unmodified.
3728 auto Inf = B.buildFConstant(Ty, APFloat::getInf(APFloat::IEEEsingle()));
3729 auto Fabs = B.buildFAbs(Ty, Y);
3730 auto IsFinite =
3731 B.buildFCmp(CmpInst::FCMP_OLT, LLT::scalar(1), Fabs, Inf, Flags);
3732 R = B.buildSelect(Ty, IsFinite, R, Y, Flags).getReg(0);
3733 }
3734
3735 if (ScaledInput) {
  // Subtract 32*log(2)/log(base) to undo the 2^32 input pre-scaling.
3736 auto Zero = B.buildFConstant(Ty, 0.0);
3737 auto ShiftK =
3738 B.buildFConstant(Ty, IsLog10 ? 0x1.344136p+3f : 0x1.62e430p+4f);
3739 auto Shift = B.buildSelect(Ty, IsScaled, ShiftK, Zero, Flags);
3740 B.buildFSub(Dst, R, Shift, Flags);
3741 } else {
3742 B.buildCopy(Dst, R);
3743 }
3744
3745 MI.eraseFromParent();
3746 return true;
3747}
3748
3750 Register Src, bool IsLog10,
3751 unsigned Flags) const {
  // Fast (approximate) log lowering: log(x) ~= amdgcn_log(x) * log(2)/log(base),
  // with denormal pre-scaling for f32 and a result offset to compensate.
3752 const double Log2BaseInverted =
3754
3755 LLT Ty = B.getMRI()->getType(Dst);
3756
3757 if (Ty == LLT::scalar(32)) {
3758 auto [ScaledInput, IsScaled] = getScaledLogInput(B, Src, Flags);
3759 if (ScaledInput) {
  // NOTE(review): the intrinsic consumes Src rather than ScaledInput here,
  // while the ResultOffset below compensates for the 2^32 scaling — verify
  // this matches the SelectionDAG path (lowerFLOGUnsafe).
3760 auto LogSrc = B.buildIntrinsic(Intrinsic::amdgcn_log, {Ty})
3761 .addUse(Src)
3762 .setMIFlags(Flags);
3763 auto ScaledResultOffset = B.buildFConstant(Ty, -32.0 * Log2BaseInverted);
3764 auto Zero = B.buildFConstant(Ty, 0.0);
3765 auto ResultOffset =
3766 B.buildSelect(Ty, IsScaled, ScaledResultOffset, Zero, Flags);
3767 auto Log2Inv = B.buildFConstant(Ty, Log2BaseInverted);
3768
3769 if (ST.hasFastFMAF32())
3770 B.buildFMA(Dst, LogSrc, Log2Inv, ResultOffset, Flags);
3771 else {
3772 auto Mul = B.buildFMul(Ty, LogSrc, Log2Inv, Flags);
3773 B.buildFAdd(Dst, Mul, ResultOffset, Flags);
3774 }
3775
3776 return true;
3777 }
3778 }
3779
  // No scaling needed (or f16): a single multiply by log(2)/log(base).
3780 auto Log2Operand = Ty == LLT::scalar(16)
3781 ? B.buildFLog2(Ty, Src, Flags)
3782 : B.buildIntrinsic(Intrinsic::amdgcn_log, {Ty})
3783 .addUse(Src)
3784 .setMIFlags(Flags);
3785 auto Log2BaseInvertedOperand = B.buildFConstant(Ty, Log2BaseInverted);
3786 B.buildFMul(Dst, Log2Operand, Log2BaseInvertedOperand, Flags);
3787 return true;
3788}
3789
3791 MachineIRBuilder &B) const {
3792 // v_exp_f32 is good enough for OpenCL, except it doesn't handle denormals.
3793 // If we have to handle denormals, scale up the input and adjust the result.
3794
3795 Register Dst = MI.getOperand(0).getReg();
3796 Register Src = MI.getOperand(1).getReg();
3797 unsigned Flags = MI.getFlags();
3798 LLT Ty = B.getMRI()->getType(Dst);
3799 const LLT F16 = LLT::scalar(16);
3800 const LLT F32 = LLT::scalar(32);
3801 const LLT F64 = LLT::scalar(64);
3802
3803 if (Ty == F64)
3804 return legalizeFEXPF64(MI, B);
3805
3806 if (Ty == F16) {
3807 // Nothing in half is a denormal when promoted to f32.
3808 auto Ext = B.buildFPExt(F32, Src, Flags);
  // (Despite the name, this holds the exp2 intrinsic result.)
3809 auto Log2 = B.buildIntrinsic(Intrinsic::amdgcn_exp2, {F32})
3810 .addUse(Ext.getReg(0))
3811 .setMIFlags(Flags);
3812 B.buildFPTrunc(Dst, Log2, Flags);
3813 MI.eraseFromParent();
3814 return true;
3815 }
3816
3817 assert(Ty == F32);
3818
3819 if (!needsDenormHandlingF32(B.getMF(), Src, Flags)) {
3820 B.buildIntrinsic(Intrinsic::amdgcn_exp2, ArrayRef<Register>{Dst})
3821 .addUse(Src)
3822 .setMIFlags(Flags);
3823 MI.eraseFromParent();
3824 return true;
3825 }
3826
3827 // bool needs_scaling = x < -0x1.f80000p+6f;
3828 // v_exp_f32(x + (s ? 0x1.0p+6f : 0.0f)) * (s ? 0x1.0p-64f : 1.0f);
3829
3830 // -nextafter(128.0, -1)
3831 auto RangeCheckConst = B.buildFConstant(Ty, -0x1.f80000p+6f);
3832 auto NeedsScaling = B.buildFCmp(CmpInst::FCMP_OLT, LLT::scalar(1), Src,
3833 RangeCheckConst, Flags);
3834
  // Shift very negative inputs up by 64 before exp2...
3835 auto SixtyFour = B.buildFConstant(Ty, 0x1.0p+6f);
3836 auto Zero = B.buildFConstant(Ty, 0.0);
3837 auto AddOffset = B.buildSelect(F32, NeedsScaling, SixtyFour, Zero, Flags);
3838 auto AddInput = B.buildFAdd(F32, Src, AddOffset, Flags);
3839
3840 auto Exp2 = B.buildIntrinsic(Intrinsic::amdgcn_exp2, {Ty})
3841 .addUse(AddInput.getReg(0))
3842 .setMIFlags(Flags);
3843
  // ...and multiply by 2^-64 afterwards to compensate.
3844 auto TwoExpNeg64 = B.buildFConstant(Ty, 0x1.0p-64f);
3845 auto One = B.buildFConstant(Ty, 1.0);
3846 auto ResultScale = B.buildSelect(F32, NeedsScaling, TwoExpNeg64, One, Flags);
3847 B.buildFMul(Dst, Exp2, ResultScale, Flags);
3848 MI.eraseFromParent();
3849 return true;
3850}
3851
3853 const SrcOp &Src, unsigned Flags) {
  // Build a base-2 exponential: use the amdgcn_exp2 intrinsic for f32,
  // otherwise the generic G_FEXP2.
3854 LLT Ty = Dst.getLLTTy(*B.getMRI());
3855
3856 if (Ty == LLT::scalar(32)) {
3857 return B.buildIntrinsic(Intrinsic::amdgcn_exp2, {Dst})
3858 .addUse(Src.getReg())
3859 .setMIFlags(Flags);
3860 }
3861 return B.buildFExp2(Dst, Src, Flags);
3862}
3863
3865 Register Dst, Register X,
3866 unsigned Flags,
3867 bool IsExp10) const {
  // Approximate exp/exp10 by rescaling the argument into an exp2.
3868 LLT Ty = B.getMRI()->getType(X);
3869
3870 // exp(x) -> exp2(M_LOG2E_F * x);
3871 // exp10(x) -> exp2(log2(10) * x);
3872 auto Const = B.buildFConstant(Ty, IsExp10 ? 0x1.a934f0p+1f : numbers::log2e);
3873 auto Mul = B.buildFMul(Ty, X, Const, Flags);
3874 buildExp(B, Dst, Mul, Flags);
3875 return true;
3876}
3877
3879 Register X, unsigned Flags) const {
  // Fast exp lowering. When f32 denormal results are possible, offset very
  // negative inputs by 64 (in the exp2 domain) and rescale the result by
  // 2^-64 * adjustment so the hardware exp2 stays in its accurate range.
3880 LLT Ty = B.getMRI()->getType(Dst);
3881 LLT F32 = LLT::scalar(32);
3882
3883 if (Ty != F32 || !needsDenormHandlingF32(B.getMF(), X, Flags)) {
3884 return legalizeFExpUnsafeImpl(B, Dst, X, Flags, /*IsExp10=*/false);
3885 }
3886
3887 auto Threshold = B.buildFConstant(Ty, -0x1.5d58a0p+6f);
3888 auto NeedsScaling =
3889 B.buildFCmp(CmpInst::FCMP_OLT, LLT::scalar(1), X, Threshold, Flags);
3890 auto ScaleOffset = B.buildFConstant(Ty, 0x1.0p+6f);
3891 auto ScaledX = B.buildFAdd(Ty, X, ScaleOffset, Flags);
3892 auto AdjustedX = B.buildSelect(Ty, NeedsScaling, ScaledX, X, Flags);
3893
3894 auto Log2E = B.buildFConstant(Ty, numbers::log2e);
3895 auto ExpInput = B.buildFMul(Ty, AdjustedX, Log2E, Flags);
3896
3897 auto Exp2 = B.buildIntrinsic(Intrinsic::amdgcn_exp2, {Ty})
3898 .addUse(ExpInput.getReg(0))
3899 .setMIFlags(Flags);
3900
  // 0x1.969d48p-93f compensates for the +64 offset applied above.
3901 auto ResultScaleFactor = B.buildFConstant(Ty, 0x1.969d48p-93f);
3902 auto AdjustedResult = B.buildFMul(Ty, Exp2, ResultScaleFactor, Flags);
3903 B.buildSelect(Dst, NeedsScaling, AdjustedResult, Exp2, Flags);
3904 return true;
3905}
3906
3908 Register Dst, Register X,
3909 unsigned Flags) const {
  // Fast exp10 lowering using a split constant for log2(10) so each product
  // stays exact: exp10(x) = exp2(x*K0) * exp2(x*K1). With f32 denormal
  // handling, very negative inputs are offset by 32 and the result rescaled.
3910 LLT Ty = B.getMRI()->getType(Dst);
3911 LLT F32 = LLT::scalar(32);
3912
3913 if (Ty != F32 || !needsDenormHandlingF32(B.getMF(), X, Flags)) {
3914 // exp2(x * 0x1.a92000p+1f) * exp2(x * 0x1.4f0978p-11f);
3915 auto K0 = B.buildFConstant(Ty, 0x1.a92000p+1f);
3916 auto K1 = B.buildFConstant(Ty, 0x1.4f0978p-11f);
3917
3918 auto Mul1 = B.buildFMul(Ty, X, K1, Flags);
3919 auto Exp2_1 = buildExp(B, Ty, Mul1, Flags);
3920 auto Mul0 = B.buildFMul(Ty, X, K0, Flags);
3921 auto Exp2_0 = buildExp(B, Ty, Mul0, Flags);
3922 B.buildFMul(Dst, Exp2_0, Exp2_1, Flags);
3923 return true;
3924 }
3925
3926 // bool s = x < -0x1.2f7030p+5f;
3927 // x += s ? 0x1.0p+5f : 0.0f;
3928 // exp10 = exp2(x * 0x1.a92000p+1f) *
3929 // exp2(x * 0x1.4f0978p-11f) *
3930 // (s ? 0x1.9f623ep-107f : 1.0f);
3931
3932 auto Threshold = B.buildFConstant(Ty, -0x1.2f7030p+5f);
3933 auto NeedsScaling =
3934 B.buildFCmp(CmpInst::FCMP_OLT, LLT::scalar(1), X, Threshold);
3935
3936 auto ScaleOffset = B.buildFConstant(Ty, 0x1.0p+5f);
3937 auto ScaledX = B.buildFAdd(Ty, X, ScaleOffset, Flags);
3938 auto AdjustedX = B.buildSelect(Ty, NeedsScaling, ScaledX, X);
3939
3940 auto K0 = B.buildFConstant(Ty, 0x1.a92000p+1f);
3941 auto K1 = B.buildFConstant(Ty, 0x1.4f0978p-11f);
3942
3943 auto Mul1 = B.buildFMul(Ty, AdjustedX, K1, Flags);
3944 auto Exp2_1 = buildExp(B, Ty, Mul1, Flags);
3945 auto Mul0 = B.buildFMul(Ty, AdjustedX, K0, Flags);
3946 auto Exp2_0 = buildExp(B, Ty, Mul0, Flags);
3947
3948 auto MulExps = B.buildFMul(Ty, Exp2_0, Exp2_1, Flags);
  // 0x1.9f623ep-107f == 10^-32, compensating for the +32 input offset.
3949 auto ResultScaleFactor = B.buildFConstant(Ty, 0x1.9f623ep-107f);
3950 auto AdjustedResult = B.buildFMul(Ty, MulExps, ResultScaleFactor, Flags);
3951
3952 B.buildSelect(Dst, NeedsScaling, AdjustedResult, MulExps);
3953 return true;
3954}
3955
3956// This expansion gives a result slightly better than 1ulp.
3958 MachineIRBuilder &B) const {
3959
3960 Register X = MI.getOperand(1).getReg();
3961 LLT S64 = LLT::scalar(64);
3962 LLT S32 = LLT::scalar(32);
3963 LLT S1 = LLT::scalar(1);
3964
3965 // TODO: Check if reassoc is safe. There is an output change in exp2 and
3966 // exp10, which slightly increases ulp.
3967 unsigned Flags = MI.getFlags() & ~MachineInstr::FmReassoc;
3968
3969 Register Dn, F, T;
3970
3971 if (MI.getOpcode() == TargetOpcode::G_FEXP2) {
3972 // Dn = rint(X)
3973 Dn = B.buildFRint(S64, X, Flags).getReg(0);
3974 // F = X - Dn
3975 F = B.buildFSub(S64, X, Dn, Flags).getReg(0);
3976 // T = F*C1 + F*C2
3977 auto C1 = B.buildFConstant(S64, APFloat(0x1.62e42fefa39efp-1));
3978 auto C2 = B.buildFConstant(S64, APFloat(0x1.abc9e3b39803fp-56));
3979 auto Mul2 = B.buildFMul(S64, F, C2, Flags).getReg(0);
3980 T = B.buildFMA(S64, F, C1, Mul2, Flags).getReg(0);
3981
3982 } else if (MI.getOpcode() == TargetOpcode::G_FEXP10) {
3983 auto C1 = B.buildFConstant(S64, APFloat(0x1.a934f0979a371p+1));
3984 auto Mul = B.buildFMul(S64, X, C1, Flags).getReg(0);
3985 Dn = B.buildFRint(S64, Mul, Flags).getReg(0);
3986
3987 auto NegDn = B.buildFNeg(S64, Dn, Flags).getReg(0);
3988 auto C2 = B.buildFConstant(S64, APFloat(-0x1.9dc1da994fd21p-59));
3989 auto C3 = B.buildFConstant(S64, APFloat(0x1.34413509f79ffp-2));
3990 auto Inner = B.buildFMA(S64, NegDn, C3, X, Flags).getReg(0);
3991 F = B.buildFMA(S64, NegDn, C2, Inner, Flags).getReg(0);
3992
3993 auto C4 = B.buildFConstant(S64, APFloat(0x1.26bb1bbb55516p+1));
3994 auto C5 = B.buildFConstant(S64, APFloat(-0x1.f48ad494ea3e9p-53));
3995 auto MulF = B.buildFMul(S64, F, C5, Flags).getReg(0);
3996 T = B.buildFMA(S64, F, C4, MulF, Flags).getReg(0);
3997
3998 } else { // G_FEXP
3999 auto C1 = B.buildFConstant(S64, APFloat(0x1.71547652b82fep+0));
4000 auto Mul = B.buildFMul(S64, X, C1, Flags).getReg(0);
4001 Dn = B.buildFRint(S64, Mul, Flags).getReg(0);
4002
4003 auto NegDn = B.buildFNeg(S64, Dn, Flags).getReg(0);
4004 auto C2 = B.buildFConstant(S64, APFloat(0x1.abc9e3b39803fp-56));
4005 auto C3 = B.buildFConstant(S64, APFloat(0x1.62e42fefa39efp-1));
4006 auto Inner = B.buildFMA(S64, NegDn, C3, X, Flags).getReg(0);
4007 T = B.buildFMA(S64, NegDn, C2, Inner, Flags).getReg(0);
4008 }
4009
4010 // Polynomial chain for P
4011 auto P = B.buildFConstant(S64, 0x1.ade156a5dcb37p-26);
4012 P = B.buildFMA(S64, T, P, B.buildFConstant(S64, 0x1.28af3fca7ab0cp-22),
4013 Flags);
4014 P = B.buildFMA(S64, T, P, B.buildFConstant(S64, 0x1.71dee623fde64p-19),
4015 Flags);
4016 P = B.buildFMA(S64, T, P, B.buildFConstant(S64, 0x1.a01997c89e6b0p-16),
4017 Flags);
4018 P = B.buildFMA(S64, T, P, B.buildFConstant(S64, 0x1.a01a014761f6ep-13),
4019 Flags);
4020 P = B.buildFMA(S64, T, P, B.buildFConstant(S64, 0x1.6c16c1852b7b0p-10),
4021 Flags);
4022 P = B.buildFMA(S64, T, P, B.buildFConstant(S64, 0x1.1111111122322p-7), Flags);
4023 P = B.buildFMA(S64, T, P, B.buildFConstant(S64, 0x1.55555555502a1p-5), Flags);
4024 P = B.buildFMA(S64, T, P, B.buildFConstant(S64, 0x1.5555555555511p-3), Flags);
4025 P = B.buildFMA(S64, T, P, B.buildFConstant(S64, 0x1.000000000000bp-1), Flags);
4026
4027 auto One = B.buildFConstant(S64, 1.0);
4028 P = B.buildFMA(S64, T, P, One, Flags);
4029 P = B.buildFMA(S64, T, P, One, Flags);
4030
4031 // Z = FLDEXP(P, (int)Dn)
4032 auto DnInt = B.buildFPTOSI(S32, Dn);
4033 auto Z = B.buildFLdexp(S64, P, DnInt, Flags);
4034
4035 if (!(Flags & MachineInstr::FmNoInfs)) {
4036 // Overflow guard: if X <= 1024.0 then Z else +inf
4037 auto CondHi = B.buildFCmp(CmpInst::FCMP_ULE, S1, X,
4038 B.buildFConstant(S64, APFloat(1024.0)));
4039 auto PInf = B.buildFConstant(S64, APFloat::getInf(APFloat::IEEEdouble()));
4040 Z = B.buildSelect(S64, CondHi, Z, PInf, Flags);
4041 }
4042
4043 // Underflow guard: if X >= -1075.0 then Z else 0.0
4044 auto CondLo = B.buildFCmp(CmpInst::FCMP_UGE, S1, X,
4045 B.buildFConstant(S64, APFloat(-1075.0)));
4046 auto Zero = B.buildFConstant(S64, APFloat(0.0));
4047 B.buildSelect(MI.getOperand(0).getReg(), CondLo, Z, Zero, Flags);
4048
4049 MI.eraseFromParent();
4050 return true;
4051}
4052
4054 MachineIRBuilder &B) const {
4055 Register Dst = MI.getOperand(0).getReg();
4056 Register X = MI.getOperand(1).getReg();
4057 const unsigned Flags = MI.getFlags();
4058 MachineFunction &MF = B.getMF();
4059 MachineRegisterInfo &MRI = *B.getMRI();
4060 LLT Ty = MRI.getType(Dst);
4061
4062 const LLT F64 = LLT::scalar(64);
4063
4064 if (Ty == F64)
4065 return legalizeFEXPF64(MI, B);
4066
4067 const LLT F16 = LLT::scalar(16);
4068 const LLT F32 = LLT::scalar(32);
4069 const bool IsExp10 = MI.getOpcode() == TargetOpcode::G_FEXP10;
4070
4071 if (Ty == F16) {
4072 // v_exp_f16 (fmul x, log2e)
4073 if (allowApproxFunc(MF, Flags)) {
4074 // TODO: Does this really require fast?
4075 IsExp10 ? legalizeFExp10Unsafe(B, Dst, X, Flags)
4076 : legalizeFExpUnsafe(B, Dst, X, Flags);
4077 MI.eraseFromParent();
4078 return true;
4079 }
4080
4081 // Nothing in half is a denormal when promoted to f32.
4082 //
4083 // exp(f16 x) ->
4084 // fptrunc (v_exp_f32 (fmul (fpext x), log2e))
4085 //
4086 // exp10(f16 x) ->
4087 // fptrunc (v_exp_f32 (fmul (fpext x), log2(10)))
4088 auto Ext = B.buildFPExt(F32, X, Flags);
4090 legalizeFExpUnsafeImpl(B, Lowered, Ext.getReg(0), Flags, IsExp10);
4091 B.buildFPTrunc(Dst, Lowered, Flags);
4092 MI.eraseFromParent();
4093 return true;
4094 }
4095
4096 assert(Ty == F32);
4097
4098 // TODO: Interpret allowApproxFunc as ignoring DAZ. This is currently copying
4099 // library behavior. Also, is known-not-daz source sufficient?
4100 if (allowApproxFunc(MF, Flags)) {
4101 IsExp10 ? legalizeFExp10Unsafe(B, Dst, X, Flags)
4102 : legalizeFExpUnsafe(B, Dst, X, Flags);
4103 MI.eraseFromParent();
4104 return true;
4105 }
4106
4107 // Algorithm:
4108 //
4109 // e^x = 2^(x/ln(2)) = 2^(x*(64/ln(2))/64)
4110 //
4111 // x*(64/ln(2)) = n + f, |f| <= 0.5, n is integer
4112 // n = 64*m + j, 0 <= j < 64
4113 //
4114 // e^x = 2^((64*m + j + f)/64)
4115 // = (2^m) * (2^(j/64)) * 2^(f/64)
4116 // = (2^m) * (2^(j/64)) * e^(f*(ln(2)/64))
4117 //
4118 // f = x*(64/ln(2)) - n
4119 // r = f*(ln(2)/64) = x - n*(ln(2)/64)
4120 //
4121 // e^x = (2^m) * (2^(j/64)) * e^r
4122 //
4123 // (2^(j/64)) is precomputed
4124 //
4125 // e^r = 1 + r + (r^2)/2! + (r^3)/3! + (r^4)/4! + (r^5)/5!
4126 // e^r = 1 + q
4127 //
4128 // q = r + (r^2)/2! + (r^3)/3! + (r^4)/4! + (r^5)/5!
4129 //
4130 // e^x = (2^m) * ( (2^(j/64)) + q*(2^(j/64)) )
4131 const unsigned FlagsNoContract = Flags & ~MachineInstr::FmContract;
4132 Register PH, PL;
4133
4134 if (ST.hasFastFMAF32()) {
4135 const float c_exp = numbers::log2ef;
4136 const float cc_exp = 0x1.4ae0bep-26f; // c+cc are 49 bits
4137 const float c_exp10 = 0x1.a934f0p+1f;
4138 const float cc_exp10 = 0x1.2f346ep-24f;
4139
4140 auto C = B.buildFConstant(Ty, IsExp10 ? c_exp10 : c_exp);
4141 PH = B.buildFMul(Ty, X, C, Flags).getReg(0);
4142 auto NegPH = B.buildFNeg(Ty, PH, Flags);
4143 auto FMA0 = B.buildFMA(Ty, X, C, NegPH, Flags);
4144
4145 auto CC = B.buildFConstant(Ty, IsExp10 ? cc_exp10 : cc_exp);
4146 PL = B.buildFMA(Ty, X, CC, FMA0, Flags).getReg(0);
4147 } else {
4148 const float ch_exp = 0x1.714000p+0f;
4149 const float cl_exp = 0x1.47652ap-12f; // ch + cl are 36 bits
4150
4151 const float ch_exp10 = 0x1.a92000p+1f;
4152 const float cl_exp10 = 0x1.4f0978p-11f;
4153
4154 auto MaskConst = B.buildConstant(Ty, 0xfffff000);
4155 auto XH = B.buildAnd(Ty, X, MaskConst);
4156 auto XL = B.buildFSub(Ty, X, XH, Flags);
4157
4158 auto CH = B.buildFConstant(Ty, IsExp10 ? ch_exp10 : ch_exp);
4159 PH = B.buildFMul(Ty, XH, CH, Flags).getReg(0);
4160
4161 auto CL = B.buildFConstant(Ty, IsExp10 ? cl_exp10 : cl_exp);
4162 auto XLCL = B.buildFMul(Ty, XL, CL, Flags);
4163
4164 Register Mad0 =
4165 getMad(B, Ty, XL.getReg(0), CH.getReg(0), XLCL.getReg(0), Flags);
4166 PL = getMad(B, Ty, XH.getReg(0), CL.getReg(0), Mad0, Flags);
4167 }
4168
4169 auto E = B.buildIntrinsicRoundeven(Ty, PH, Flags);
4170
4171 // It is unsafe to contract this fsub into the PH multiply.
4172 auto PHSubE = B.buildFSub(Ty, PH, E, FlagsNoContract);
4173 auto A = B.buildFAdd(Ty, PHSubE, PL, Flags);
4174 auto IntE = B.buildFPTOSI(LLT::scalar(32), E);
4175
4176 auto Exp2 = B.buildIntrinsic(Intrinsic::amdgcn_exp2, {Ty})
4177 .addUse(A.getReg(0))
4178 .setMIFlags(Flags);
4179 auto R = B.buildFLdexp(Ty, Exp2, IntE, Flags);
4180
4181 auto UnderflowCheckConst =
4182 B.buildFConstant(Ty, IsExp10 ? -0x1.66d3e8p+5f : -0x1.9d1da0p+6f);
4183 auto Zero = B.buildFConstant(Ty, 0.0);
4184 auto Underflow =
4185 B.buildFCmp(CmpInst::FCMP_OLT, LLT::scalar(1), X, UnderflowCheckConst);
4186
4187 R = B.buildSelect(Ty, Underflow, Zero, R);
4188
4189 if (!(Flags & MachineInstr::FmNoInfs)) {
4190 auto OverflowCheckConst =
4191 B.buildFConstant(Ty, IsExp10 ? 0x1.344136p+5f : 0x1.62e430p+6f);
4192
4193 auto Overflow =
4194 B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), X, OverflowCheckConst);
4195 auto Inf = B.buildFConstant(Ty, APFloat::getInf(APFloat::IEEEsingle()));
4196 R = B.buildSelect(Ty, Overflow, Inf, R, Flags);
4197 }
4198
4199 B.buildCopy(Dst, R);
4200 MI.eraseFromParent();
4201 return true;
4202}
4203
4205 MachineIRBuilder &B) const {
4206 Register Dst = MI.getOperand(0).getReg();
4207 Register Src0 = MI.getOperand(1).getReg();
4208 Register Src1 = MI.getOperand(2).getReg();
4209 unsigned Flags = MI.getFlags();
4210 LLT Ty = B.getMRI()->getType(Dst);
4211 const LLT F16 = LLT::float16();
4212 const LLT F32 = LLT::float32();
4213
4214 if (Ty == F32) {
4215 auto Log = B.buildFLog2(F32, Src0, Flags);
4216 auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {F32})
4217 .addUse(Log.getReg(0))
4218 .addUse(Src1)
4219 .setMIFlags(Flags);
4220 B.buildFExp2(Dst, Mul, Flags);
4221 } else if (Ty == F16) {
4222 // There's no f16 fmul_legacy, so we need to convert for it.
4223 auto Log = B.buildFLog2(F16, Src0, Flags);
4224 auto Ext0 = B.buildFPExt(F32, Log, Flags);
4225 auto Ext1 = B.buildFPExt(F32, Src1, Flags);
4226 auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {F32})
4227 .addUse(Ext0.getReg(0))
4228 .addUse(Ext1.getReg(0))
4229 .setMIFlags(Flags);
4230 B.buildFExp2(Dst, B.buildFPTrunc(F16, Mul), Flags);
4231 } else
4232 return false;
4233
4234 MI.eraseFromParent();
4235 return true;
4236}
4237
4238// Find a source register, ignoring any possible source modifiers.
4240 Register ModSrc = OrigSrc;
4241 if (MachineInstr *SrcFNeg = getOpcodeDef(AMDGPU::G_FNEG, ModSrc, MRI)) {
4242 ModSrc = SrcFNeg->getOperand(1).getReg();
4243 if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
4244 ModSrc = SrcFAbs->getOperand(1).getReg();
4245 } else if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
4246 ModSrc = SrcFAbs->getOperand(1).getReg();
4247 return ModSrc;
4248}
4249
4252 MachineIRBuilder &B) const {
4253
4254 const LLT S1 = LLT::scalar(1);
4255 const LLT F64 = LLT::float64();
4256 Register Dst = MI.getOperand(0).getReg();
4257 Register OrigSrc = MI.getOperand(1).getReg();
4258 unsigned Flags = MI.getFlags();
4259 assert(ST.hasFractBug() && MRI.getType(Dst) == F64 &&
4260 "this should not have been custom lowered");
4261
4262 // V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x))
4263 // is used instead. However, SI doesn't have V_FLOOR_F64, so the most
4264 // efficient way to implement it is using V_FRACT_F64. The workaround for the
4265 // V_FRACT bug is:
4266 // fract(x) = isnan(x) ? x : min(V_FRACT(x), 0.99999999999999999)
4267 //
4268 // Convert floor(x) to (x - fract(x))
4269
4270 auto Fract = B.buildIntrinsic(Intrinsic::amdgcn_fract, {F64})
4271 .addUse(OrigSrc)
4272 .setMIFlags(Flags);
4273
4274 // Give source modifier matching some assistance before obscuring a foldable
4275 // pattern.
4276
4277 // TODO: We can avoid the neg on the fract? The input sign to fract
4278 // shouldn't matter?
4279 Register ModSrc = stripAnySourceMods(OrigSrc, MRI);
4280
4281 auto Const =
4282 B.buildFConstant(F64, llvm::bit_cast<double>(0x3fefffffffffffff));
4283
4285
4286 // We don't need to concern ourselves with the snan handling difference, so
4287 // use the one which will directly select.
4288 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
4289 if (MFI->getMode().IEEE)
4290 B.buildFMinNumIEEE(Min, Fract, Const, Flags);
4291 else
4292 B.buildFMinNum(Min, Fract, Const, Flags);
4293
4294 Register CorrectedFract = Min;
4295 if (!MI.getFlag(MachineInstr::FmNoNans)) {
4296 auto IsNan = B.buildFCmp(CmpInst::FCMP_ORD, S1, ModSrc, ModSrc, Flags);
4297 CorrectedFract = B.buildSelect(F64, IsNan, ModSrc, Min, Flags).getReg(0);
4298 }
4299
4300 auto NegFract = B.buildFNeg(F64, CorrectedFract, Flags);
4301 B.buildFAdd(Dst, OrigSrc, NegFract, Flags);
4302
4303 MI.eraseFromParent();
4304 return true;
4305}
4306
4307// Turn an illegal packed v2s16 build vector into bit operations.
4308// TODO: This should probably be a bitcast action in LegalizerHelper.
4311 Register Dst = MI.getOperand(0).getReg();
4312 const LLT S32 = LLT::scalar(32);
4313 const LLT S16 = LLT::scalar(16);
4314 assert(MRI.getType(Dst) == LLT::fixed_vector(2, 16));
4315
4316 Register Src0 = MI.getOperand(1).getReg();
4317 Register Src1 = MI.getOperand(2).getReg();
4318
4319 if (MI.getOpcode() == AMDGPU::G_BUILD_VECTOR_TRUNC) {
4320 assert(MRI.getType(Src0) == S32);
4321 Src0 = B.buildTrunc(S16, MI.getOperand(1).getReg()).getReg(0);
4322 Src1 = B.buildTrunc(S16, MI.getOperand(2).getReg()).getReg(0);
4323 }
4324
4325 auto Merge = B.buildMergeLikeInstr(S32, {Src0, Src1});
4326 B.buildBitcast(Dst, Merge);
4327
4328 MI.eraseFromParent();
4329 return true;
4330}
4331
4332// Build a big integer multiply or multiply-add using MAD_64_32 instructions.
4333//
4334// Source and accumulation registers must all be 32-bits.
4335//
4336// TODO: When the multiply is uniform, we should produce a code sequence
4337// that is better suited to instruction selection on the SALU. Instead of
4338// the outer loop going over parts of the result, the outer loop should go
4339// over parts of one of the factors. This should result in instruction
4340// selection that makes full use of S_ADDC_U32 instructions.
4343 ArrayRef<Register> Src0,
4344 ArrayRef<Register> Src1,
4345 bool UsePartialMad64_32,
4346 bool SeparateOddAlignedProducts) const {
4347 // Use (possibly empty) vectors of S1 registers to represent the set of
4348 // carries from one pair of positions to the next.
4349 using Carry = SmallVector<Register, 2>;
4350
4351 MachineIRBuilder &B = Helper.MIRBuilder;
4352 GISelValueTracking &VT = *Helper.getValueTracking();
4353
4354 const LLT S1 = LLT::scalar(1);
4355 const LLT S32 = LLT::scalar(32);
4356 const LLT S64 = LLT::scalar(64);
4357
4358 Register Zero32;
4359 Register Zero64;
4360
4361 auto getZero32 = [&]() -> Register {
4362 if (!Zero32)
4363 Zero32 = B.buildConstant(S32, 0).getReg(0);
4364 return Zero32;
4365 };
4366 auto getZero64 = [&]() -> Register {
4367 if (!Zero64)
4368 Zero64 = B.buildConstant(S64, 0).getReg(0);
4369 return Zero64;
4370 };
4371
4372 SmallVector<bool, 2> Src0KnownZeros, Src1KnownZeros;
4373 for (unsigned i = 0; i < Src0.size(); ++i) {
4374 Src0KnownZeros.push_back(VT.getKnownBits(Src0[i]).isZero());
4375 Src1KnownZeros.push_back(VT.getKnownBits(Src1[i]).isZero());
4376 }
4377
4378 // Merge the given carries into the 32-bit LocalAccum, which is modified
4379 // in-place.
4380 //
4381 // Returns the carry-out, which is a single S1 register or null.
4382 auto mergeCarry =
4383 [&](Register &LocalAccum, const Carry &CarryIn) -> Register {
4384 if (CarryIn.empty())
4385 return Register();
4386
4387 bool HaveCarryOut = true;
4388 Register CarryAccum;
4389 if (CarryIn.size() == 1) {
4390 if (!LocalAccum) {
4391 LocalAccum = B.buildZExt(S32, CarryIn[0]).getReg(0);
4392 return Register();
4393 }
4394
4395 CarryAccum = getZero32();
4396 } else {
4397 CarryAccum = B.buildZExt(S32, CarryIn[0]).getReg(0);
4398 for (unsigned i = 1; i + 1 < CarryIn.size(); ++i) {
4399 CarryAccum =
4400 B.buildUAdde(S32, S1, CarryAccum, getZero32(), CarryIn[i])
4401 .getReg(0);
4402 }
4403
4404 if (!LocalAccum) {
4405 LocalAccum = getZero32();
4406 HaveCarryOut = false;
4407 }
4408 }
4409
4410 auto Add =
4411 B.buildUAdde(S32, S1, CarryAccum, LocalAccum, CarryIn.back());
4412 LocalAccum = Add.getReg(0);
4413 return HaveCarryOut ? Add.getReg(1) : Register();
4414 };
4415
4416 // Build a multiply-add chain to compute
4417 //
4418 // LocalAccum + (partial products at DstIndex)
4419 // + (opportunistic subset of CarryIn)
4420 //
4421 // LocalAccum is an array of one or two 32-bit registers that are updated
4422 // in-place. The incoming registers may be null.
4423 //
4424 // In some edge cases, carry-ins can be consumed "for free". In that case,
4425 // the consumed carry bits are removed from CarryIn in-place.
4426 auto buildMadChain =
4427 [&](MutableArrayRef<Register> LocalAccum, unsigned DstIndex, Carry &CarryIn)
4428 -> Carry {
4429 assert((DstIndex + 1 < Accum.size() && LocalAccum.size() == 2) ||
4430 (DstIndex + 1 >= Accum.size() && LocalAccum.size() == 1));
4431
4432 Carry CarryOut;
4433 unsigned j0 = 0;
4434
4435 // Use plain 32-bit multiplication for the most significant part of the
4436 // result by default.
4437 if (LocalAccum.size() == 1 &&
4438 (!UsePartialMad64_32 || !CarryIn.empty())) {
4439 do {
4440 // Skip multiplication if one of the operands is 0
4441 unsigned j1 = DstIndex - j0;
4442 if (Src0KnownZeros[j0] || Src1KnownZeros[j1]) {
4443 ++j0;
4444 continue;
4445 }
4446 auto Mul = B.buildMul(S32, Src0[j0], Src1[j1]);
4447 if (!LocalAccum[0] || VT.getKnownBits(LocalAccum[0]).isZero()) {
4448 LocalAccum[0] = Mul.getReg(0);
4449 } else {
4450 if (CarryIn.empty()) {
4451 LocalAccum[0] = B.buildAdd(S32, LocalAccum[0], Mul).getReg(0);
4452 } else {
4453 LocalAccum[0] =
4454 B.buildUAdde(S32, S1, LocalAccum[0], Mul, CarryIn.back())
4455 .getReg(0);
4456 CarryIn.pop_back();
4457 }
4458 }
4459 ++j0;
4460 } while (j0 <= DstIndex && (!UsePartialMad64_32 || !CarryIn.empty()));
4461 }
4462
4463 // Build full 64-bit multiplies.
4464 if (j0 <= DstIndex) {
4465 bool HaveSmallAccum = false;
4466 Register Tmp;
4467
4468 if (LocalAccum[0]) {
4469 if (LocalAccum.size() == 1) {
4470 Tmp = B.buildAnyExt(S64, LocalAccum[0]).getReg(0);
4471 HaveSmallAccum = true;
4472 } else if (LocalAccum[1]) {
4473 Tmp = B.buildMergeLikeInstr(S64, LocalAccum).getReg(0);
4474 HaveSmallAccum = false;
4475 } else {
4476 Tmp = B.buildZExt(S64, LocalAccum[0]).getReg(0);
4477 HaveSmallAccum = true;
4478 }
4479 } else {
4480 assert(LocalAccum.size() == 1 || !LocalAccum[1]);
4481 Tmp = getZero64();
4482 HaveSmallAccum = true;
4483 }
4484
4485 do {
4486 unsigned j1 = DstIndex - j0;
4487 if (Src0KnownZeros[j0] || Src1KnownZeros[j1]) {
4488 ++j0;
4489 continue;
4490 }
4491 auto Mad = B.buildInstr(AMDGPU::G_AMDGPU_MAD_U64_U32, {S64, S1},
4492 {Src0[j0], Src1[j1], Tmp});
4493 Tmp = Mad.getReg(0);
4494 if (!HaveSmallAccum)
4495 CarryOut.push_back(Mad.getReg(1));
4496 HaveSmallAccum = false;
4497
4498 ++j0;
4499 } while (j0 <= DstIndex);
4500
4501 auto Unmerge = B.buildUnmerge(S32, Tmp);
4502 LocalAccum[0] = Unmerge.getReg(0);
4503 if (LocalAccum.size() > 1)
4504 LocalAccum[1] = Unmerge.getReg(1);
4505 }
4506
4507 return CarryOut;
4508 };
4509
4510 // Outer multiply loop, iterating over destination parts from least
4511 // significant to most significant parts.
4512 //
4513 // The columns of the following diagram correspond to the destination parts
4514 // affected by one iteration of the outer loop (ignoring boundary
4515 // conditions).
4516 //
4517 // Dest index relative to 2 * i: 1 0 -1
4518 // ------
4519 // Carries from previous iteration: e o
4520 // Even-aligned partial product sum: E E .
4521 // Odd-aligned partial product sum: O O
4522 //
4523 // 'o' is OddCarry, 'e' is EvenCarry.
4524 // EE and OO are computed from partial products via buildMadChain and use
4525 // accumulation where possible and appropriate.
4526 //
4527 Register SeparateOddCarry;
4528 Carry EvenCarry;
4529 Carry OddCarry;
4530
4531 for (unsigned i = 0; i <= Accum.size() / 2; ++i) {
4532 Carry OddCarryIn = std::move(OddCarry);
4533 Carry EvenCarryIn = std::move(EvenCarry);
4534 OddCarry.clear();
4535 EvenCarry.clear();
4536
4537 // Partial products at offset 2 * i.
4538 if (2 * i < Accum.size()) {
4539 auto LocalAccum = Accum.drop_front(2 * i).take_front(2);
4540 EvenCarry = buildMadChain(LocalAccum, 2 * i, EvenCarryIn);
4541 }
4542
4543 // Partial products at offset 2 * i - 1.
4544 if (i > 0) {
4545 if (!SeparateOddAlignedProducts) {
4546 auto LocalAccum = Accum.drop_front(2 * i - 1).take_front(2);
4547 OddCarry = buildMadChain(LocalAccum, 2 * i - 1, OddCarryIn);
4548 } else {
4549 bool IsHighest = 2 * i >= Accum.size();
4550 Register SeparateOddOut[2];
4551 auto LocalAccum = MutableArrayRef(SeparateOddOut)
4552 .take_front(IsHighest ? 1 : 2);
4553 OddCarry = buildMadChain(LocalAccum, 2 * i - 1, OddCarryIn);
4554
4556
4557 if (i == 1) {
4558 if (!IsHighest)
4559 Lo = B.buildUAddo(S32, S1, Accum[2 * i - 1], SeparateOddOut[0]);
4560 else
4561 Lo = B.buildAdd(S32, Accum[2 * i - 1], SeparateOddOut[0]);
4562 } else {
4563 Lo = B.buildUAdde(S32, S1, Accum[2 * i - 1], SeparateOddOut[0],
4564 SeparateOddCarry);
4565 }
4566 Accum[2 * i - 1] = Lo->getOperand(0).getReg();
4567
4568 if (!IsHighest) {
4569 auto Hi = B.buildUAdde(S32, S1, Accum[2 * i], SeparateOddOut[1],
4570 Lo->getOperand(1).getReg());
4571 Accum[2 * i] = Hi.getReg(0);
4572 SeparateOddCarry = Hi.getReg(1);
4573 }
4574 }
4575 }
4576
4577 // Add in the carries from the previous iteration
4578 if (i > 0) {
4579 if (Register CarryOut = mergeCarry(Accum[2 * i - 1], OddCarryIn))
4580 EvenCarryIn.push_back(CarryOut);
4581
4582 if (2 * i < Accum.size()) {
4583 if (Register CarryOut = mergeCarry(Accum[2 * i], EvenCarryIn))
4584 OddCarry.push_back(CarryOut);
4585 }
4586 }
4587 }
4588}
4589
4590// Custom narrowing of wide multiplies using wide multiply-add instructions.
4591//
4592// TODO: If the multiply is followed by an addition, we should attempt to
4593// integrate it to make better use of V_MAD_U64_U32's multiply-add capabilities.
4595 MachineInstr &MI) const {
4596 assert(ST.hasMad64_32());
4597 assert(MI.getOpcode() == TargetOpcode::G_MUL);
4598
4599 MachineIRBuilder &B = Helper.MIRBuilder;
4600 MachineRegisterInfo &MRI = *B.getMRI();
4601
4602 Register DstReg = MI.getOperand(0).getReg();
4603 Register Src0 = MI.getOperand(1).getReg();
4604 Register Src1 = MI.getOperand(2).getReg();
4605
4606 LLT Ty = MRI.getType(DstReg);
4607 assert(Ty.isScalar());
4608
4609 unsigned Size = Ty.getSizeInBits();
4610 if (ST.hasVectorMulU64() && Size == 64)
4611 return true;
4612
4613 unsigned NumParts = Size / 32;
4614 assert((Size % 32) == 0);
4615 assert(NumParts >= 2);
4616
4617 // Whether to use MAD_64_32 for partial products whose high half is
4618 // discarded. This avoids some ADD instructions but risks false dependency
4619 // stalls on some subtargets in some cases.
4620 const bool UsePartialMad64_32 = ST.getGeneration() < AMDGPUSubtarget::GFX10;
4621
4622 // Whether to compute odd-aligned partial products separately. This is
4623 // advisable on subtargets where the accumulator of MAD_64_32 must be placed
4624 // in an even-aligned VGPR.
4625 const bool SeparateOddAlignedProducts = ST.hasFullRate64Ops();
4626
4627 LLT S32 = LLT::scalar(32);
4628 SmallVector<Register, 2> Src0Parts, Src1Parts;
4629 for (unsigned i = 0; i < NumParts; ++i) {
4632 }
4633 B.buildUnmerge(Src0Parts, Src0);
4634 B.buildUnmerge(Src1Parts, Src1);
4635
4636 SmallVector<Register, 2> AccumRegs(NumParts);
4637 buildMultiply(Helper, AccumRegs, Src0Parts, Src1Parts, UsePartialMad64_32,
4638 SeparateOddAlignedProducts);
4639
4640 B.buildMergeLikeInstr(DstReg, AccumRegs);
4641 MI.eraseFromParent();
4642 return true;
4643}
4644
4645// Legalize ctlz/cttz to ffbh/ffbl instead of the default legalization to
4646// ctlz/cttz_zero_undef. This allows us to fix up the result for the zero input
4647// case with a single min instruction instead of a compare+select.
4650 MachineIRBuilder &B) const {
4651 Register Dst = MI.getOperand(0).getReg();
4652 Register Src = MI.getOperand(1).getReg();
4653 LLT DstTy = MRI.getType(Dst);
4654 LLT SrcTy = MRI.getType(Src);
4655
4656 unsigned NewOpc = MI.getOpcode() == AMDGPU::G_CTLZ
4657 ? AMDGPU::G_AMDGPU_FFBH_U32
4658 : AMDGPU::G_AMDGPU_FFBL_B32;
4659 auto Tmp = B.buildInstr(NewOpc, {DstTy}, {Src});
4660 B.buildUMin(Dst, Tmp, B.buildConstant(DstTy, SrcTy.getSizeInBits()));
4661
4662 MI.eraseFromParent();
4663 return true;
4664}
4665
4668 MachineIRBuilder &B) const {
4669 Register Dst = MI.getOperand(0).getReg();
4670 Register Src = MI.getOperand(1).getReg();
4671 LLT SrcTy = MRI.getType(Src);
4672 TypeSize NumBits = SrcTy.getSizeInBits();
4673
4674 assert(NumBits < 32u);
4675
4676 auto ShiftAmt = B.buildConstant(S32, 32u - NumBits);
4677 auto Extend = B.buildAnyExt(S32, {Src}).getReg(0u);
4678 auto Shift = B.buildShl(S32, Extend, ShiftAmt);
4679 auto Ctlz = B.buildInstr(AMDGPU::G_AMDGPU_FFBH_U32, {S32}, {Shift});
4680 B.buildTrunc(Dst, Ctlz);
4681 MI.eraseFromParent();
4682 return true;
4683}
4684
4685// Check that this is a G_XOR x, -1
4686static bool isNot(const MachineRegisterInfo &MRI, const MachineInstr &MI) {
4687 if (MI.getOpcode() != TargetOpcode::G_XOR)
4688 return false;
4689 auto ConstVal = getIConstantVRegSExtVal(MI.getOperand(2).getReg(), MRI);
4690 return ConstVal == -1;
4691}
4692
4693// Return the use branch instruction, otherwise null if the usage is invalid.
4694static MachineInstr *
4696 MachineBasicBlock *&UncondBrTarget, bool &Negated) {
4697 Register CondDef = MI.getOperand(0).getReg();
4698 if (!MRI.hasOneNonDBGUse(CondDef))
4699 return nullptr;
4700
4701 MachineBasicBlock *Parent = MI.getParent();
4702 MachineInstr *UseMI = &*MRI.use_instr_nodbg_begin(CondDef);
4703
4704 if (isNot(MRI, *UseMI)) {
4705 Register NegatedCond = UseMI->getOperand(0).getReg();
4706 if (!MRI.hasOneNonDBGUse(NegatedCond))
4707 return nullptr;
4708
4709 // We're deleting the def of this value, so we need to remove it.
4710 eraseInstr(*UseMI, MRI);
4711
4712 UseMI = &*MRI.use_instr_nodbg_begin(NegatedCond);
4713 Negated = true;
4714 }
4715
4716 if (UseMI->getParent() != Parent || UseMI->getOpcode() != AMDGPU::G_BRCOND)
4717 return nullptr;
4718
4719 // Make sure the cond br is followed by a G_BR, or is the last instruction.
4720 MachineBasicBlock::iterator Next = std::next(UseMI->getIterator());
4721 if (Next == Parent->end()) {
4722 MachineFunction::iterator NextMBB = std::next(Parent->getIterator());
4723 if (NextMBB == Parent->getParent()->end()) // Illegal intrinsic use.
4724 return nullptr;
4725 UncondBrTarget = &*NextMBB;
4726 } else {
4727 if (Next->getOpcode() != AMDGPU::G_BR)
4728 return nullptr;
4729 Br = &*Next;
4730 UncondBrTarget = Br->getOperand(0).getMBB();
4731 }
4732
4733 return UseMI;
4734}
4735
4738 const ArgDescriptor *Arg,
4739 const TargetRegisterClass *ArgRC,
4740 LLT ArgTy) const {
4741 MCRegister SrcReg = Arg->getRegister();
4742 assert(SrcReg.isPhysical() && "Physical register expected");
4743 assert(DstReg.isVirtual() && "Virtual register expected");
4744
4745 Register LiveIn = getFunctionLiveInPhysReg(B.getMF(), B.getTII(), SrcReg,
4746 *ArgRC, B.getDebugLoc(), ArgTy);
4747 if (Arg->isMasked()) {
4748 // TODO: Should we try to emit this once in the entry block?
4749 const LLT S32 = LLT::scalar(32);
4750 const unsigned Mask = Arg->getMask();
4751 const unsigned Shift = llvm::countr_zero<unsigned>(Mask);
4752
4753 Register AndMaskSrc = LiveIn;
4754
4755 // TODO: Avoid clearing the high bits if we know workitem id y/z are always
4756 // 0.
4757 if (Shift != 0) {
4758 auto ShiftAmt = B.buildConstant(S32, Shift);
4759 AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0);
4760 }
4761
4762 B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift));
4763 } else {
4764 B.buildCopy(DstReg, LiveIn);
4765 }
4766}
4767
    AMDGPUFunctionArgInfo::PreloadedValue ClusterWorkGroupIdPV) const {
  // Legalizes a workgroup-id intrinsic. Without cluster support, the
  // preloaded WorkGroupIdPV value already is the global workgroup ID.
  // NOTE(review): the leading signature lines of this function were lost in
  // extraction; the visible parameter list suggests it also takes MI, B,
  // WorkGroupIdPV and ClusterMaxIdPV — confirm against upstream.
  Register DstReg = MI.getOperand(0).getReg();
  if (!ST.hasClusters()) {
    if (!loadInputValue(DstReg, B, WorkGroupIdPV))
      return false;
    MI.eraseFromParent();
    return true;
  }

  // Clusters are supported. Return the global position in the grid. If clusters
  // are enabled, WorkGroupIdPV returns the cluster ID not the workgroup ID.

  // WorkGroupIdXYZ = ClusterId == 0 ?
  //     ClusterIdXYZ :
  //     ClusterIdXYZ * (ClusterMaxIdXYZ + 1) + ClusterWorkGroupIdXYZ
  MachineRegisterInfo &MRI = *B.getMRI();
  const LLT S32 = LLT::scalar(32);
  Register ClusterIdXYZ = MRI.createGenericVirtualRegister(S32);
  Register ClusterMaxIdXYZ = MRI.createGenericVirtualRegister(S32);
  Register ClusterWorkGroupIdXYZ = MRI.createGenericVirtualRegister(S32);
  if (!loadInputValue(ClusterIdXYZ, B, WorkGroupIdPV) ||
      !loadInputValue(ClusterWorkGroupIdXYZ, B, ClusterWorkGroupIdPV) ||
      !loadInputValue(ClusterMaxIdXYZ, B, ClusterMaxIdPV))
    return false;

  // Cluster size = max id + 1; global id = cluster id * size + id in cluster.
  auto One = B.buildConstant(S32, 1);
  auto ClusterSizeXYZ = B.buildAdd(S32, ClusterMaxIdXYZ, One);
  auto GlobalIdXYZ = B.buildAdd(S32, ClusterWorkGroupIdXYZ,
                                B.buildMul(S32, ClusterIdXYZ, ClusterSizeXYZ));

  const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();

  switch (MFI->getClusterDims().getKind()) {
  // NOTE(review): the case labels of this switch were lost in extraction —
  // each arm below originally followed a ClusterDimsAttr kind label; restore
  // them from upstream before relying on this chunk.
    B.buildCopy(DstReg, GlobalIdXYZ);
    MI.eraseFromParent();
    return true;
  }
    B.buildCopy(DstReg, ClusterIdXYZ);
    MI.eraseFromParent();
    return true;
  }
    // Cluster configuration unknown at compile time: read the cluster id
    // field from IB_STS2 and select between the plain and cluster-scaled id.
    using namespace AMDGPU::Hwreg;
    unsigned ClusterIdField = HwregEncoding::encode(ID_IB_STS2, 6, 4);
    Register ClusterId = MRI.createGenericVirtualRegister(S32);
    MRI.setRegClass(ClusterId, &AMDGPU::SReg_32RegClass);
    B.buildInstr(AMDGPU::S_GETREG_B32_const)
        .addDef(ClusterId)
        .addImm(ClusterIdField);
    auto Zero = B.buildConstant(S32, 0);
    auto NoClusters =
        B.buildICmp(CmpInst::ICMP_EQ, LLT::scalar(1), ClusterId, Zero);
    B.buildSelect(DstReg, NoClusters, ClusterIdXYZ, GlobalIdXYZ);
    MI.eraseFromParent();
    return true;
  }
  }

  llvm_unreachable("nothing should reach here");
}
4835
    Register DstReg, MachineIRBuilder &B,
  // Load a preloaded function argument (workgroup ids, cluster ids, etc.)
  // into DstReg. Returns false for argument kinds this cannot handle yet.
  // NOTE(review): the leading/trailing signature lines and all `case` labels
  // in the switch below were lost in extraction; restore them from upstream
  // before relying on this chunk.
  const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
  const ArgDescriptor *Arg = nullptr;
  const TargetRegisterClass *ArgRC;
  LLT ArgTy;

  CallingConv::ID CC = B.getMF().getFunction().getCallingConv();
  // Architected-SGPR locations: workgroup ids live in TTMP9/TTMP7, cluster
  // fields are packed into nibbles of TTMP6.
  const ArgDescriptor WorkGroupIDX =
      ArgDescriptor::createRegister(AMDGPU::TTMP9);
  // If GridZ is not programmed in an entry function then the hardware will set
  // it to all zeros, so there is no need to mask the GridY value in the low
  // order bits.
  const ArgDescriptor WorkGroupIDY = ArgDescriptor::createRegister(
      AMDGPU::TTMP7,
      AMDGPU::isEntryFunctionCC(CC) && !MFI->hasWorkGroupIDZ() ? ~0u : 0xFFFFu);
  const ArgDescriptor WorkGroupIDZ =
      ArgDescriptor::createRegister(AMDGPU::TTMP7, 0xFFFF0000u);
  const ArgDescriptor ClusterWorkGroupIDX =
      ArgDescriptor::createRegister(AMDGPU::TTMP6, 0x0000000Fu);
  const ArgDescriptor ClusterWorkGroupIDY =
      ArgDescriptor::createRegister(AMDGPU::TTMP6, 0x000000F0u);
  const ArgDescriptor ClusterWorkGroupIDZ =
      ArgDescriptor::createRegister(AMDGPU::TTMP6, 0x00000F00u);
  const ArgDescriptor ClusterWorkGroupMaxIDX =
      ArgDescriptor::createRegister(AMDGPU::TTMP6, 0x0000F000u);
  const ArgDescriptor ClusterWorkGroupMaxIDY =
      ArgDescriptor::createRegister(AMDGPU::TTMP6, 0x000F0000u);
  const ArgDescriptor ClusterWorkGroupMaxIDZ =
      ArgDescriptor::createRegister(AMDGPU::TTMP6, 0x00F00000u);
  const ArgDescriptor ClusterWorkGroupMaxFlatID =
      ArgDescriptor::createRegister(AMDGPU::TTMP6, 0x0F000000u);

  // Helper to materialize a compile-time-known answer directly.
  auto LoadConstant = [&](unsigned N) {
    B.buildConstant(DstReg, N);
    return true;
  };

  // NOTE(review): the continuation of this condition was lost in extraction.
  if (ST.hasArchitectedSGPRs() &&
    AMDGPU::ClusterDimsAttr ClusterDims = MFI->getClusterDims();
    bool HasFixedDims = ClusterDims.isFixedDims();

    switch (ArgType) {
    // NOTE(review): case labels below were lost in extraction; arms map, in
    // order, to workgroup id X/Y/Z, cluster workgroup id X/Y/Z, cluster
    // workgroup max id X/Y/Z, and cluster workgroup max flat id.
      Arg = &WorkGroupIDX;
      ArgRC = &AMDGPU::SReg_32RegClass;
      ArgTy = LLT::scalar(32);
      break;
      Arg = &WorkGroupIDY;
      ArgRC = &AMDGPU::SReg_32RegClass;
      ArgTy = LLT::scalar(32);
      break;
      Arg = &WorkGroupIDZ;
      ArgRC = &AMDGPU::SReg_32RegClass;
      ArgTy = LLT::scalar(32);
      break;
      // Fixed 1-wide cluster dimension means the in-cluster id is always 0.
      if (HasFixedDims && ClusterDims.getDims()[0] == 1)
        return LoadConstant(0);
      Arg = &ClusterWorkGroupIDX;
      ArgRC = &AMDGPU::SReg_32RegClass;
      ArgTy = LLT::scalar(32);
      break;
      if (HasFixedDims && ClusterDims.getDims()[1] == 1)
        return LoadConstant(0);
      Arg = &ClusterWorkGroupIDY;
      ArgRC = &AMDGPU::SReg_32RegClass;
      ArgTy = LLT::scalar(32);
      break;
      if (HasFixedDims && ClusterDims.getDims()[2] == 1)
        return LoadConstant(0);
      Arg = &ClusterWorkGroupIDZ;
      ArgRC = &AMDGPU::SReg_32RegClass;
      ArgTy = LLT::scalar(32);
      break;
      // Fixed dims make max id a compile-time constant (dim - 1).
      if (HasFixedDims)
        return LoadConstant(ClusterDims.getDims()[0] - 1);
      Arg = &ClusterWorkGroupMaxIDX;
      ArgRC = &AMDGPU::SReg_32RegClass;
      ArgTy = LLT::scalar(32);
      break;
      if (HasFixedDims)
        return LoadConstant(ClusterDims.getDims()[1] - 1);
      Arg = &ClusterWorkGroupMaxIDY;
      ArgRC = &AMDGPU::SReg_32RegClass;
      ArgTy = LLT::scalar(32);
      break;
      if (HasFixedDims)
        return LoadConstant(ClusterDims.getDims()[2] - 1);
      Arg = &ClusterWorkGroupMaxIDZ;
      ArgRC = &AMDGPU::SReg_32RegClass;
      ArgTy = LLT::scalar(32);
      break;
      Arg = &ClusterWorkGroupMaxFlatID;
      ArgRC = &AMDGPU::SReg_32RegClass;
      ArgTy = LLT::scalar(32);
      break;
    default:
      break;
    }
  }

  if (!Arg)
    std::tie(Arg, ArgRC, ArgTy) = MFI->getPreloadedValue(ArgType);

  if (!Arg) {
    // NOTE(review): an inner condition line was lost in extraction here.
      // The intrinsic may appear when we have a 0 sized kernarg segment, in
      // which case the pointer argument may be missing and we use null.
      return LoadConstant(0);
    }

    // It's undefined behavior if a function marked with the amdgpu-no-*
    // attributes uses the corresponding intrinsic.
    B.buildUndef(DstReg);
    return true;
  }

  if (!Arg->isRegister() || !Arg->getRegister().isValid())
    return false; // TODO: Handle these
  buildLoadInputValue(DstReg, B, Arg, ArgRC, ArgTy);
  return true;
}
4969
4973 if (!loadInputValue(MI.getOperand(0).getReg(), B, ArgType))
4974 return false;
4975
4976 MI.eraseFromParent();
4977 return true;
4978}
4979
4981 int64_t C) {
4982 B.buildConstant(MI.getOperand(0).getReg(), C);
4983 MI.eraseFromParent();
4984 return true;
4985}
4986
4989 unsigned Dim, AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
4990 unsigned MaxID = ST.getMaxWorkitemID(B.getMF().getFunction(), Dim);
4991 if (MaxID == 0)
4992 return replaceWithConstant(B, MI, 0);
4993
4994 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
4995 const ArgDescriptor *Arg;
4996 const TargetRegisterClass *ArgRC;
4997 LLT ArgTy;
4998 std::tie(Arg, ArgRC, ArgTy) = MFI->getPreloadedValue(ArgType);
4999
5000 Register DstReg = MI.getOperand(0).getReg();
5001 if (!Arg) {
5002 // It's undefined behavior if a function marked with the amdgpu-no-*
5003 // attributes uses the corresponding intrinsic.
5004 B.buildUndef(DstReg);
5005 MI.eraseFromParent();
5006 return true;
5007 }
5008
5009 if (Arg->isMasked()) {
5010 // Don't bother inserting AssertZext for packed IDs since we're emitting the
5011 // masking operations anyway.
5012 //
5013 // TODO: We could assert the top bit is 0 for the source copy.
5014 if (!loadInputValue(DstReg, B, ArgType))
5015 return false;
5016 } else {
5018 if (!loadInputValue(TmpReg, B, ArgType))
5019 return false;
5020 B.buildAssertZExt(DstReg, TmpReg, llvm::bit_width(MaxID));
5021 }
5022
5023 MI.eraseFromParent();
5024 return true;
5025}
5026
// NOTE(review): the declaration and PtrInfo-construction lines of this small
// helper were lost in extraction (original file lines 5027-5028 and
// 5030-5031 are missing); restore them from upstream LLVM before compiling.
// The surviving lines show it builds and returns the MachinePointerInfo used
// for kernarg-segment loads, modeled via the constant-pool pseudo source
// value — presumably because kernargs are read-only/invariant; confirm
// against upstream.
 5029 // This isn't really a constant pool but close enough.
 5032 return PtrInfo;
 5033 }
5034
5036 int64_t Offset) const {
5038 Register KernArgReg = B.getMRI()->createGenericVirtualRegister(PtrTy);
5039
5040 // TODO: If we passed in the base kernel offset we could have a better
5041 // alignment than 4, but we don't really need it.
5042 if (!loadInputValue(KernArgReg, B,
5044 llvm_unreachable("failed to find kernarg segment ptr");
5045
5046 auto COffset = B.buildConstant(LLT::scalar(64), Offset);
5047 return B.buildObjectPtrOffset(PtrTy, KernArgReg, COffset).getReg(0);
5048}
5049
// NOTE(review): the opening of this function's signature (original lines
// 5052-5054), the lines computing `Ptr`/`PtrInfo` (5061-5062), and the
// memory-operand flag continuation of buildLoad (5064-5065) were lost in
// extraction; restore from upstream LLVM before compiling. The visible lines
// show: the result is asserted to be s32, a 4-byte-aligned load is built
// from a kernarg pointer at `Offset`, and the instruction is erased.
 5050 /// Legalize a value that's loaded from kernel arguments. This is only used by
 5051 /// legacy intrinsics.
 5055 Align Alignment) const {
 5056 Register DstReg = MI.getOperand(0).getReg();
 5057
 5058 assert(B.getMRI()->getType(DstReg) == LLT::scalar(32) &&
 5059 "unexpected kernarg parameter type");
 5060
 5063 B.buildLoad(DstReg, Ptr, PtrInfo.getWithOffset(Offset), Align(4),
 5066 MI.eraseFromParent();
 5067 return true;
 5068 }
5069
5072 MachineIRBuilder &B) const {
5073 Register Dst = MI.getOperand(0).getReg();
5074 LLT DstTy = MRI.getType(Dst);
5075 LLT S16 = LLT::scalar(16);
5076 LLT S32 = LLT::scalar(32);
5077 LLT S64 = LLT::scalar(64);
5078
5079 if (DstTy == S16)
5080 return legalizeFDIV16(MI, MRI, B);
5081 if (DstTy == S32)
5082 return legalizeFDIV32(MI, MRI, B);
5083 if (DstTy == S64)
5084 return legalizeFDIV64(MI, MRI, B);
5085
5086 return false;
5087}
5088
5090 Register DstDivReg,
5091 Register DstRemReg,
5092 Register X,
5093 Register Y) const {
5094 const LLT S1 = LLT::scalar(1);
5095 const LLT S32 = LLT::scalar(32);
5096
5097 // See AMDGPUCodeGenPrepare::expandDivRem32 for a description of the
5098 // algorithm used here.
5099
5100 // Initial estimate of inv(y).
5101 auto FloatY = B.buildUITOFP(S32, Y);
5102 auto RcpIFlag = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {FloatY});
5103 auto Scale = B.buildFConstant(S32, llvm::bit_cast<float>(0x4f7ffffe));
5104 auto ScaledY = B.buildFMul(S32, RcpIFlag, Scale);
5105 auto Z = B.buildFPTOUI(S32, ScaledY);
5106
5107 // One round of UNR.
5108 auto NegY = B.buildSub(S32, B.buildConstant(S32, 0), Y);
5109 auto NegYZ = B.buildMul(S32, NegY, Z);
5110 Z = B.buildAdd(S32, Z, B.buildUMulH(S32, Z, NegYZ));
5111
5112 // Quotient/remainder estimate.
5113 auto Q = B.buildUMulH(S32, X, Z);
5114 auto R = B.buildSub(S32, X, B.buildMul(S32, Q, Y));
5115
5116 // First quotient/remainder refinement.
5117 auto One = B.buildConstant(S32, 1);
5118 auto Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y);
5119 if (DstDivReg)
5120 Q = B.buildSelect(S32, Cond, B.buildAdd(S32, Q, One), Q);
5121 R = B.buildSelect(S32, Cond, B.buildSub(S32, R, Y), R);
5122
5123 // Second quotient/remainder refinement.
5124 Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y);
5125 if (DstDivReg)
5126 B.buildSelect(DstDivReg, Cond, B.buildAdd(S32, Q, One), Q);
5127
5128 if (DstRemReg)
5129 B.buildSelect(DstRemReg, Cond, B.buildSub(S32, R, Y), R);
5130}
5131
5132// Build integer reciprocal sequence around V_RCP_IFLAG_F32
5133//
5134// Return lo, hi of result
5135//
5136// %cvt.lo = G_UITOFP Val.lo
5137// %cvt.hi = G_UITOFP Val.hi
5138// %mad = G_FMAD %cvt.hi, 2**32, %cvt.lo
5139// %rcp = G_AMDGPU_RCP_IFLAG %mad
5140// %mul1 = G_FMUL %rcp, 0x5f7ffffc
5141// %mul2 = G_FMUL %mul1, 2**(-32)
5142// %trunc = G_INTRINSIC_TRUNC %mul2
5143// %mad2 = G_FMAD %trunc, -(2**32), %mul1
5144// return {G_FPTOUI %mad2, G_FPTOUI %trunc}
5145static std::pair<Register, Register> emitReciprocalU64(MachineIRBuilder &B,
5146 Register Val) {
5147 const LLT S32 = LLT::scalar(32);
5148 auto Unmerge = B.buildUnmerge(S32, Val);
5149
5150 auto CvtLo = B.buildUITOFP(S32, Unmerge.getReg(0));
5151 auto CvtHi = B.buildUITOFP(S32, Unmerge.getReg(1));
5152
5153 auto Mad = B.buildFMAD(
5154 S32, CvtHi, // 2**32
5155 B.buildFConstant(S32, llvm::bit_cast<float>(0x4f800000)), CvtLo);
5156
5157 auto Rcp = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {Mad});
5158 auto Mul1 = B.buildFMul(
5159 S32, Rcp, B.buildFConstant(S32, llvm::bit_cast<float>(0x5f7ffffc)));
5160
5161 // 2**(-32)
5162 auto Mul2 = B.buildFMul(
5163 S32, Mul1, B.buildFConstant(S32, llvm::bit_cast<float>(0x2f800000)));
5164 auto Trunc = B.buildIntrinsicTrunc(S32, Mul2);
5165
5166 // -(2**32)
5167 auto Mad2 = B.buildFMAD(
5168 S32, Trunc, B.buildFConstant(S32, llvm::bit_cast<float>(0xcf800000)),
5169 Mul1);
5170
5171 auto ResultLo = B.buildFPTOUI(S32, Mad2);
5172 auto ResultHi = B.buildFPTOUI(S32, Trunc);
5173
5174 return {ResultLo.getReg(0), ResultHi.getReg(0)};
5175}
5176
5178 Register DstDivReg,
5179 Register DstRemReg,
5180 Register Numer,
5181 Register Denom) const {
5182 const LLT S32 = LLT::scalar(32);
5183 const LLT S64 = LLT::scalar(64);
5184 const LLT S1 = LLT::scalar(1);
5185 Register RcpLo, RcpHi;
5186
5187 std::tie(RcpLo, RcpHi) = emitReciprocalU64(B, Denom);
5188
5189 auto Rcp = B.buildMergeLikeInstr(S64, {RcpLo, RcpHi});
5190
5191 auto Zero64 = B.buildConstant(S64, 0);
5192 auto NegDenom = B.buildSub(S64, Zero64, Denom);
5193
5194 auto MulLo1 = B.buildMul(S64, NegDenom, Rcp);
5195 auto MulHi1 = B.buildUMulH(S64, Rcp, MulLo1);
5196
5197 auto UnmergeMulHi1 = B.buildUnmerge(S32, MulHi1);
5198 Register MulHi1_Lo = UnmergeMulHi1.getReg(0);
5199 Register MulHi1_Hi = UnmergeMulHi1.getReg(1);
5200
5201 auto Add1_Lo = B.buildUAddo(S32, S1, RcpLo, MulHi1_Lo);
5202 auto Add1_Hi = B.buildUAdde(S32, S1, RcpHi, MulHi1_Hi, Add1_Lo.getReg(1));
5203 auto Add1 = B.buildMergeLikeInstr(S64, {Add1_Lo, Add1_Hi});
5204
5205 auto MulLo2 = B.buildMul(S64, NegDenom, Add1);
5206 auto MulHi2 = B.buildUMulH(S64, Add1, MulLo2);
5207 auto UnmergeMulHi2 = B.buildUnmerge(S32, MulHi2);
5208 Register MulHi2_Lo = UnmergeMulHi2.getReg(0);
5209 Register MulHi2_Hi = UnmergeMulHi2.getReg(1);
5210
5211 auto Zero32 = B.buildConstant(S32, 0);
5212 auto Add2_Lo = B.buildUAddo(S32, S1, Add1_Lo, MulHi2_Lo);
5213 auto Add2_Hi = B.buildUAdde(S32, S1, Add1_Hi, MulHi2_Hi, Add2_Lo.getReg(1));
5214 auto Add2 = B.buildMergeLikeInstr(S64, {Add2_Lo, Add2_Hi});
5215
5216 auto UnmergeNumer = B.buildUnmerge(S32, Numer);
5217 Register NumerLo = UnmergeNumer.getReg(0);
5218 Register NumerHi = UnmergeNumer.getReg(1);
5219
5220 auto MulHi3 = B.buildUMulH(S64, Numer, Add2);
5221 auto Mul3 = B.buildMul(S64, Denom, MulHi3);
5222 auto UnmergeMul3 = B.buildUnmerge(S32, Mul3);
5223 Register Mul3_Lo = UnmergeMul3.getReg(0);
5224 Register Mul3_Hi = UnmergeMul3.getReg(1);
5225 auto Sub1_Lo = B.buildUSubo(S32, S1, NumerLo, Mul3_Lo);
5226 auto Sub1_Hi = B.buildUSube(S32, S1, NumerHi, Mul3_Hi, Sub1_Lo.getReg(1));
5227 auto Sub1_Mi = B.buildSub(S32, NumerHi, Mul3_Hi);
5228 auto Sub1 = B.buildMergeLikeInstr(S64, {Sub1_Lo, Sub1_Hi});
5229
5230 auto UnmergeDenom = B.buildUnmerge(S32, Denom);
5231 Register DenomLo = UnmergeDenom.getReg(0);
5232 Register DenomHi = UnmergeDenom.getReg(1);
5233
5234 auto CmpHi = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Hi, DenomHi);
5235 auto C1 = B.buildSExt(S32, CmpHi);
5236
5237 auto CmpLo = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Lo, DenomLo);
5238 auto C2 = B.buildSExt(S32, CmpLo);
5239
5240 auto CmpEq = B.buildICmp(CmpInst::ICMP_EQ, S1, Sub1_Hi, DenomHi);
5241 auto C3 = B.buildSelect(S32, CmpEq, C2, C1);
5242
5243 // TODO: Here and below portions of the code can be enclosed into if/endif.
5244 // Currently control flow is unconditional and we have 4 selects after
5245 // potential endif to substitute PHIs.
5246
5247 // if C3 != 0 ...
5248 auto Sub2_Lo = B.buildUSubo(S32, S1, Sub1_Lo, DenomLo);
5249 auto Sub2_Mi = B.buildUSube(S32, S1, Sub1_Mi, DenomHi, Sub1_Lo.getReg(1));
5250 auto Sub2_Hi = B.buildUSube(S32, S1, Sub2_Mi, Zero32, Sub2_Lo.getReg(1));
5251 auto Sub2 = B.buildMergeLikeInstr(S64, {Sub2_Lo, Sub2_Hi});
5252
5253 auto One64 = B.buildConstant(S64, 1);
5254 auto Add3 = B.buildAdd(S64, MulHi3, One64);
5255
5256 auto C4 =
5257 B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Hi, DenomHi));
5258 auto C5 =
5259 B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Lo, DenomLo));
5260 auto C6 = B.buildSelect(
5261 S32, B.buildICmp(CmpInst::ICMP_EQ, S1, Sub2_Hi, DenomHi), C5, C4);
5262
5263 // if (C6 != 0)
5264 auto Add4 = B.buildAdd(S64, Add3, One64);
5265 auto Sub3_Lo = B.buildUSubo(S32, S1, Sub2_Lo, DenomLo);
5266
5267 auto Sub3_Mi = B.buildUSube(S32, S1, Sub2_Mi, DenomHi, Sub2_Lo.getReg(1));
5268 auto Sub3_Hi = B.buildUSube(S32, S1, Sub3_Mi, Zero32, Sub3_Lo.getReg(1));
5269 auto Sub3 = B.buildMergeLikeInstr(S64, {Sub3_Lo, Sub3_Hi});
5270
5271 // endif C6
5272 // endif C3
5273
5274 if (DstDivReg) {
5275 auto Sel1 = B.buildSelect(
5276 S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Add4, Add3);
5277 B.buildSelect(DstDivReg, B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32),
5278 Sel1, MulHi3);
5279 }
5280
5281 if (DstRemReg) {
5282 auto Sel2 = B.buildSelect(
5283 S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Sub3, Sub2);
5284 B.buildSelect(DstRemReg, B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32),
5285 Sel2, Sub1);
5286 }
5287}
5288
5291 MachineIRBuilder &B) const {
5292 Register DstDivReg, DstRemReg;
5293 switch (MI.getOpcode()) {
5294 default:
5295 llvm_unreachable("Unexpected opcode!");
5296 case AMDGPU::G_UDIV: {
5297 DstDivReg = MI.getOperand(0).getReg();
5298 break;
5299 }
5300 case AMDGPU::G_UREM: {
5301 DstRemReg = MI.getOperand(0).getReg();
5302 break;
5303 }
5304 case AMDGPU::G_UDIVREM: {
5305 DstDivReg = MI.getOperand(0).getReg();
5306 DstRemReg = MI.getOperand(1).getReg();
5307 break;
5308 }
5309 }
5310
5311 const LLT S64 = LLT::scalar(64);
5312 const LLT S32 = LLT::scalar(32);
5313 const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs();
5314 Register Num = MI.getOperand(FirstSrcOpIdx).getReg();
5315 Register Den = MI.getOperand(FirstSrcOpIdx + 1).getReg();
5316 LLT Ty = MRI.getType(MI.getOperand(0).getReg());
5317
5318 if (Ty == S32)
5319 legalizeUnsignedDIV_REM32Impl(B, DstDivReg, DstRemReg, Num, Den);
5320 else if (Ty == S64)
5321 legalizeUnsignedDIV_REM64Impl(B, DstDivReg, DstRemReg, Num, Den);
5322 else
5323 return false;
5324
5325 MI.eraseFromParent();
5326 return true;
5327}
5328
5331 MachineIRBuilder &B) const {
5332 const LLT S64 = LLT::scalar(64);
5333 const LLT S32 = LLT::scalar(32);
5334
5335 LLT Ty = MRI.getType(MI.getOperand(0).getReg());
5336 if (Ty != S32 && Ty != S64)
5337 return false;
5338
5339 const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs();
5340 Register LHS = MI.getOperand(FirstSrcOpIdx).getReg();
5341 Register RHS = MI.getOperand(FirstSrcOpIdx + 1).getReg();
5342
5343 auto SignBitOffset = B.buildConstant(S32, Ty.getSizeInBits() - 1);
5344 auto LHSign = B.buildAShr(Ty, LHS, SignBitOffset);
5345 auto RHSign = B.buildAShr(Ty, RHS, SignBitOffset);
5346
5347 LHS = B.buildAdd(Ty, LHS, LHSign).getReg(0);
5348 RHS = B.buildAdd(Ty, RHS, RHSign).getReg(0);
5349
5350 LHS = B.buildXor(Ty, LHS, LHSign).getReg(0);
5351 RHS = B.buildXor(Ty, RHS, RHSign).getReg(0);
5352
5353 Register DstDivReg, DstRemReg, TmpDivReg, TmpRemReg;
5354 switch (MI.getOpcode()) {
5355 default:
5356 llvm_unreachable("Unexpected opcode!");
5357 case AMDGPU::G_SDIV: {
5358 DstDivReg = MI.getOperand(0).getReg();
5359 TmpDivReg = MRI.createGenericVirtualRegister(Ty);
5360 break;
5361 }
5362 case AMDGPU::G_SREM: {
5363 DstRemReg = MI.getOperand(0).getReg();
5364 TmpRemReg = MRI.createGenericVirtualRegister(Ty);
5365 break;
5366 }
5367 case AMDGPU::G_SDIVREM: {
5368 DstDivReg = MI.getOperand(0).getReg();
5369 DstRemReg = MI.getOperand(1).getReg();
5370 TmpDivReg = MRI.createGenericVirtualRegister(Ty);
5371 TmpRemReg = MRI.createGenericVirtualRegister(Ty);
5372 break;
5373 }
5374 }
5375
5376 if (Ty == S32)
5377 legalizeUnsignedDIV_REM32Impl(B, TmpDivReg, TmpRemReg, LHS, RHS);
5378 else
5379 legalizeUnsignedDIV_REM64Impl(B, TmpDivReg, TmpRemReg, LHS, RHS);
5380
5381 if (DstDivReg) {
5382 auto Sign = B.buildXor(Ty, LHSign, RHSign).getReg(0);
5383 auto SignXor = B.buildXor(Ty, TmpDivReg, Sign).getReg(0);
5384 B.buildSub(DstDivReg, SignXor, Sign);
5385 }
5386
5387 if (DstRemReg) {
5388 auto Sign = LHSign.getReg(0); // Remainder sign is the same as LHS
5389 auto SignXor = B.buildXor(Ty, TmpRemReg, Sign).getReg(0);
5390 B.buildSub(DstRemReg, SignXor, Sign);
5391 }
5392
5393 MI.eraseFromParent();
5394 return true;
5395}
5396
5399 MachineIRBuilder &B) const {
5400 Register Res = MI.getOperand(0).getReg();
5401 Register LHS = MI.getOperand(1).getReg();
5402 Register RHS = MI.getOperand(2).getReg();
5403 uint16_t Flags = MI.getFlags();
5404 LLT ResTy = MRI.getType(Res);
5405
5406 bool AllowInaccurateRcp = MI.getFlag(MachineInstr::FmAfn);
5407
5408 if (const auto *CLHS = getConstantFPVRegVal(LHS, MRI)) {
5409 if (!AllowInaccurateRcp && ResTy != LLT::scalar(16))
5410 return false;
5411
5412 // v_rcp_f32 and v_rsq_f32 do not support denormals, and according to
5413 // the CI documentation has a worst case error of 1 ulp.
5414 // OpenCL requires <= 2.5 ulp for 1.0 / x, so it should always be OK to
5415 // use it as long as we aren't trying to use denormals.
5416 //
5417 // v_rcp_f16 and v_rsq_f16 DO support denormals and 0.51ulp.
5418
5419 // 1 / x -> RCP(x)
5420 if (CLHS->isExactlyValue(1.0)) {
5421 B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res)
5422 .addUse(RHS)
5423 .setMIFlags(Flags);
5424
5425 MI.eraseFromParent();
5426 return true;
5427 }
5428
5429 // -1 / x -> RCP( FNEG(x) )
5430 if (CLHS->isExactlyValue(-1.0)) {
5431 auto FNeg = B.buildFNeg(ResTy, RHS, Flags);
5432 B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res)
5433 .addUse(FNeg.getReg(0))
5434 .setMIFlags(Flags);
5435
5436 MI.eraseFromParent();
5437 return true;
5438 }
5439 }
5440
5441 // For f16 require afn or arcp.
5442 // For f32 require afn.
5443 if (!AllowInaccurateRcp && (ResTy != LLT::scalar(16) ||
5444 !MI.getFlag(MachineInstr::FmArcp)))
5445 return false;
5446
5447 // x / y -> x * (1.0 / y)
5448 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy})
5449 .addUse(RHS)
5450 .setMIFlags(Flags);
5451 B.buildFMul(Res, LHS, RCP, Flags);
5452
5453 MI.eraseFromParent();
5454 return true;
5455}
5456
5459 MachineIRBuilder &B) const {
5460 Register Res = MI.getOperand(0).getReg();
5461 Register X = MI.getOperand(1).getReg();
5462 Register Y = MI.getOperand(2).getReg();
5463 uint16_t Flags = MI.getFlags();
5464 LLT ResTy = MRI.getType(Res);
5465
5466 bool AllowInaccurateRcp = MI.getFlag(MachineInstr::FmAfn);
5467
5468 if (!AllowInaccurateRcp)
5469 return false;
5470
5471 auto NegY = B.buildFNeg(ResTy, Y);
5472 auto One = B.buildFConstant(ResTy, 1.0);
5473
5474 auto R = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy})
5475 .addUse(Y)
5476 .setMIFlags(Flags);
5477
5478 auto Tmp0 = B.buildFMA(ResTy, NegY, R, One);
5479 R = B.buildFMA(ResTy, Tmp0, R, R);
5480
5481 auto Tmp1 = B.buildFMA(ResTy, NegY, R, One);
5482 R = B.buildFMA(ResTy, Tmp1, R, R);
5483
5484 auto Ret = B.buildFMul(ResTy, X, R);
5485 auto Tmp2 = B.buildFMA(ResTy, NegY, Ret, X);
5486
5487 B.buildFMA(Res, Tmp2, R, Ret);
5488 MI.eraseFromParent();
5489 return true;
5490}
5491
5494 MachineIRBuilder &B) const {
5495 if (legalizeFastUnsafeFDIV(MI, MRI, B))
5496 return true;
5497
5498 Register Res = MI.getOperand(0).getReg();
5499 Register LHS = MI.getOperand(1).getReg();
5500 Register RHS = MI.getOperand(2).getReg();
5501
5502 uint16_t Flags = MI.getFlags();
5503
5504 LLT S16 = LLT::scalar(16);
5505 LLT S32 = LLT::scalar(32);
5506
5507 // a32.u = opx(V_CVT_F32_F16, a.u); // CVT to F32
5508 // b32.u = opx(V_CVT_F32_F16, b.u); // CVT to F32
5509 // r32.u = opx(V_RCP_F32, b32.u); // rcp = 1 / d
5510 // q32.u = opx(V_MUL_F32, a32.u, r32.u); // q = n * rcp
5511 // e32.u = opx(V_MAD_F32, (b32.u^_neg32), q32.u, a32.u); // err = -d * q + n
5512 // q32.u = opx(V_MAD_F32, e32.u, r32.u, q32.u); // q = n * rcp
5513 // e32.u = opx(V_MAD_F32, (b32.u^_neg32), q32.u, a32.u); // err = -d * q + n
5514 // tmp.u = opx(V_MUL_F32, e32.u, r32.u);
5515 // tmp.u = opx(V_AND_B32, tmp.u, 0xff800000)
5516 // q32.u = opx(V_ADD_F32, tmp.u, q32.u);
5517 // q16.u = opx(V_CVT_F16_F32, q32.u);
5518 // q16.u = opx(V_DIV_FIXUP_F16, q16.u, b.u, a.u); // q = touchup(q, d, n)
5519
5520 auto LHSExt = B.buildFPExt(S32, LHS, Flags);
5521 auto RHSExt = B.buildFPExt(S32, RHS, Flags);
5522 auto NegRHSExt = B.buildFNeg(S32, RHSExt);
5523 auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32})
5524 .addUse(RHSExt.getReg(0))
5525 .setMIFlags(Flags);
5526 auto Quot = B.buildFMul(S32, LHSExt, Rcp, Flags);
5528 if (ST.hasMadMacF32Insts()) {
5529 Err = B.buildFMAD(S32, NegRHSExt, Quot, LHSExt, Flags);
5530 Quot = B.buildFMAD(S32, Err, Rcp, Quot, Flags);
5531 Err = B.buildFMAD(S32, NegRHSExt, Quot, LHSExt, Flags);
5532 } else {
5533 Err = B.buildFMA(S32, NegRHSExt, Quot, LHSExt, Flags);
5534 Quot = B.buildFMA(S32, Err, Rcp, Quot, Flags);
5535 Err = B.buildFMA(S32, NegRHSExt, Quot, LHSExt, Flags);
5536 }
5537 auto Tmp = B.buildFMul(S32, Err, Rcp, Flags);
5538 Tmp = B.buildAnd(S32, Tmp, B.buildConstant(S32, 0xff800000));
5539 Quot = B.buildFAdd(S32, Tmp, Quot, Flags);
5540 auto RDst = B.buildFPTrunc(S16, Quot, Flags);
5541 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res)
5542 .addUse(RDst.getReg(0))
5543 .addUse(RHS)
5544 .addUse(LHS)
5545 .setMIFlags(Flags);
5546
5547 MI.eraseFromParent();
5548 return true;
5549}
5550
5551static constexpr unsigned SPDenormModeBitField =
5553
5554// Enable or disable FP32 denorm mode. When 'Enable' is true, emit instructions
5555// to enable denorm mode. When 'Enable' is false, disable denorm mode.
5557 const GCNSubtarget &ST,
5559 // Set SP denorm mode to this value.
5560 unsigned SPDenormMode =
5561 Enable ? FP_DENORM_FLUSH_NONE : Mode.fpDenormModeSPValue();
5562
5563 if (ST.hasDenormModeInst()) {
5564 // Preserve default FP64FP16 denorm mode while updating FP32 mode.
5565 uint32_t DPDenormModeDefault = Mode.fpDenormModeDPValue();
5566
5567 uint32_t NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2);
5568 B.buildInstr(AMDGPU::S_DENORM_MODE)
5569 .addImm(NewDenormModeValue);
5570
5571 } else {
5572 B.buildInstr(AMDGPU::S_SETREG_IMM32_B32)
5573 .addImm(SPDenormMode)
5574 .addImm(SPDenormModeBitField);
5575 }
5576}
5577
5580 MachineIRBuilder &B) const {
5581 if (legalizeFastUnsafeFDIV(MI, MRI, B))
5582 return true;
5583
5584 Register Res = MI.getOperand(0).getReg();
5585 Register LHS = MI.getOperand(1).getReg();
5586 Register RHS = MI.getOperand(2).getReg();
5587 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
5588 SIModeRegisterDefaults Mode = MFI->getMode();
5589
5590 uint16_t Flags = MI.getFlags();
5591
5592 LLT S32 = LLT::scalar(32);
5593 LLT S1 = LLT::scalar(1);
5594
5595 auto One = B.buildFConstant(S32, 1.0f);
5596
5597 auto DenominatorScaled =
5598 B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1})
5599 .addUse(LHS)
5600 .addUse(RHS)
5601 .addImm(0)
5602 .setMIFlags(Flags);
5603 auto NumeratorScaled =
5604 B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1})
5605 .addUse(LHS)
5606 .addUse(RHS)
5607 .addImm(1)
5608 .setMIFlags(Flags);
5609
5610 auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32})
5611 .addUse(DenominatorScaled.getReg(0))
5612 .setMIFlags(Flags);
5613 auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags);
5614
5615 const bool PreservesDenormals = Mode.FP32Denormals == DenormalMode::getIEEE();
5616 const bool HasDynamicDenormals =
5617 (Mode.FP32Denormals.Input == DenormalMode::Dynamic) ||
5618 (Mode.FP32Denormals.Output == DenormalMode::Dynamic);
5619
5620 Register SavedSPDenormMode;
5621 if (!PreservesDenormals) {
5622 if (HasDynamicDenormals) {
5623 SavedSPDenormMode = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5624 B.buildInstr(AMDGPU::S_GETREG_B32)
5625 .addDef(SavedSPDenormMode)
5626 .addImm(SPDenormModeBitField);
5627 }
5628 toggleSPDenormMode(true, B, ST, Mode);
5629 }
5630
5631 auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags);
5632 auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags);
5633 auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags);
5634 auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags);
5635 auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags);
5636 auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags);
5637
5638 if (!PreservesDenormals) {
5639 if (HasDynamicDenormals) {
5640 assert(SavedSPDenormMode);
5641 B.buildInstr(AMDGPU::S_SETREG_B32)
5642 .addReg(SavedSPDenormMode)
5643 .addImm(SPDenormModeBitField);
5644 } else
5645 toggleSPDenormMode(false, B, ST, Mode);
5646 }
5647
5648 auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32})
5649 .addUse(Fma4.getReg(0))
5650 .addUse(Fma1.getReg(0))
5651 .addUse(Fma3.getReg(0))
5652 .addUse(NumeratorScaled.getReg(1))
5653 .setMIFlags(Flags);
5654
5655 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res)
5656 .addUse(Fmas.getReg(0))
5657 .addUse(RHS)
5658 .addUse(LHS)
5659 .setMIFlags(Flags);
5660
5661 MI.eraseFromParent();
5662 return true;
5663}
5664
5667 MachineIRBuilder &B) const {
5668 if (legalizeFastUnsafeFDIV64(MI, MRI, B))
5669 return true;
5670
5671 Register Res = MI.getOperand(0).getReg();
5672 Register LHS = MI.getOperand(1).getReg();
5673 Register RHS = MI.getOperand(2).getReg();
5674
5675 uint16_t Flags = MI.getFlags();
5676
5677 LLT S64 = LLT::scalar(64);
5678 LLT S1 = LLT::scalar(1);
5679
5680 auto One = B.buildFConstant(S64, 1.0);
5681
5682 auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1})
5683 .addUse(LHS)
5684 .addUse(RHS)
5685 .addImm(0)
5686 .setMIFlags(Flags);
5687
5688 auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags);
5689
5690 auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64})
5691 .addUse(DivScale0.getReg(0))
5692 .setMIFlags(Flags);
5693
5694 auto Fma0 = B.buildFMA(S64, NegDivScale0, Rcp, One, Flags);
5695 auto Fma1 = B.buildFMA(S64, Rcp, Fma0, Rcp, Flags);
5696 auto Fma2 = B.buildFMA(S64, NegDivScale0, Fma1, One, Flags);
5697
5698 auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1})
5699 .addUse(LHS)
5700 .addUse(RHS)
5701 .addImm(1)
5702 .setMIFlags(Flags);
5703
5704 auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags);
5705 auto Mul = B.buildFMul(S64, DivScale1.getReg(0), Fma3, Flags);
5706 auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags);
5707
5708 Register Scale;
5709 if (!ST.hasUsableDivScaleConditionOutput()) {
5710 // Workaround a hardware bug on SI where the condition output from div_scale
5711 // is not usable.
5712
5713 LLT S32 = LLT::scalar(32);
5714
5715 auto NumUnmerge = B.buildUnmerge(S32, LHS);
5716 auto DenUnmerge = B.buildUnmerge(S32, RHS);
5717 auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0);
5718 auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1);
5719
5720 auto CmpNum = B.buildICmp(ICmpInst::ICMP_EQ, S1, NumUnmerge.getReg(1),
5721 Scale1Unmerge.getReg(1));
5722 auto CmpDen = B.buildICmp(ICmpInst::ICMP_EQ, S1, DenUnmerge.getReg(1),
5723 Scale0Unmerge.getReg(1));
5724 Scale = B.buildXor(S1, CmpNum, CmpDen).getReg(0);
5725 } else {
5726 Scale = DivScale1.getReg(1);
5727 }
5728
5729 auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64})
5730 .addUse(Fma4.getReg(0))
5731 .addUse(Fma3.getReg(0))
5732 .addUse(Mul.getReg(0))
5733 .addUse(Scale)
5734 .setMIFlags(Flags);
5735
5736 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, ArrayRef(Res))
5737 .addUse(Fmas.getReg(0))
5738 .addUse(RHS)
5739 .addUse(LHS)
5740 .setMIFlags(Flags);
5741
5742 MI.eraseFromParent();
5743 return true;
5744}
5745
5748 MachineIRBuilder &B) const {
5749 Register Res0 = MI.getOperand(0).getReg();
5750 Register Res1 = MI.getOperand(1).getReg();
5751 Register Val = MI.getOperand(2).getReg();
5752 uint16_t Flags = MI.getFlags();
5753
5754 LLT Ty = MRI.getType(Res0);
5755 LLT InstrExpTy = Ty == LLT::scalar(16) ? LLT::scalar(16) : LLT::scalar(32);
5756
5757 auto Mant = B.buildIntrinsic(Intrinsic::amdgcn_frexp_mant, {Ty})
5758 .addUse(Val)
5759 .setMIFlags(Flags);
5760 auto Exp = B.buildIntrinsic(Intrinsic::amdgcn_frexp_exp, {InstrExpTy})
5761 .addUse(Val)
5762 .setMIFlags(Flags);
5763
5764 if (ST.hasFractBug()) {
5765 auto Fabs = B.buildFAbs(Ty, Val);
5766 auto Inf = B.buildFConstant(Ty, APFloat::getInf(getFltSemanticForLLT(Ty)));
5767 auto IsFinite =
5768 B.buildFCmp(CmpInst::FCMP_OLT, LLT::scalar(1), Fabs, Inf, Flags);
5769 auto Zero = B.buildConstant(InstrExpTy, 0);
5770 Exp = B.buildSelect(InstrExpTy, IsFinite, Exp, Zero);
5771 Mant = B.buildSelect(Ty, IsFinite, Mant, Val);
5772 }
5773
5774 B.buildCopy(Res0, Mant);
5775 B.buildSExtOrTrunc(Res1, Exp);
5776
5777 MI.eraseFromParent();
5778 return true;
5779}
5780
5783 MachineIRBuilder &B) const {
5784 Register Res = MI.getOperand(0).getReg();
5785 Register LHS = MI.getOperand(2).getReg();
5786 Register RHS = MI.getOperand(3).getReg();
5787 uint16_t Flags = MI.getFlags();
5788
5789 LLT S32 = LLT::scalar(32);
5790 LLT S1 = LLT::scalar(1);
5791
5792 auto Abs = B.buildFAbs(S32, RHS, Flags);
5793 const APFloat C0Val(1.0f);
5794
5795 auto C0 = B.buildFConstant(S32, 0x1p+96f);
5796 auto C1 = B.buildFConstant(S32, 0x1p-32f);
5797 auto C2 = B.buildFConstant(S32, 1.0f);
5798
5799 auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags);
5800 auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags);
5801
5802 auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags);
5803
5804 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32})
5805 .addUse(Mul0.getReg(0))
5806 .setMIFlags(Flags);
5807
5808 auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags);
5809
5810 B.buildFMul(Res, Sel, Mul1, Flags);
5811
5812 MI.eraseFromParent();
5813 return true;
5814}
5815
5818 MachineIRBuilder &B) const {
5819 // Bypass the correct expansion a standard promotion through G_FSQRT would
5820 // get. The f32 op is accurate enough for the f16 cas.
5821 unsigned Flags = MI.getFlags();
5822 assert(!ST.has16BitInsts());
5823 const LLT F32 = LLT::scalar(32);
5824 auto Ext = B.buildFPExt(F32, MI.getOperand(1), Flags);
5825 auto Log2 = B.buildIntrinsic(Intrinsic::amdgcn_sqrt, {F32})
5826 .addUse(Ext.getReg(0))
5827 .setMIFlags(Flags);
5828 B.buildFPTrunc(MI.getOperand(0), Log2, Flags);
5829 MI.eraseFromParent();
5830 return true;
5831}
5832
5835 MachineIRBuilder &B) const {
5836 MachineFunction &MF = B.getMF();
5837 Register Dst = MI.getOperand(0).getReg();
5838 Register X = MI.getOperand(1).getReg();
5839 const unsigned Flags = MI.getFlags();
5840 const LLT S1 = LLT::scalar(1);
5841 const LLT F32 = LLT::scalar(32);
5842 const LLT I32 = LLT::scalar(32);
5843
5844 if (allowApproxFunc(MF, Flags)) {
5845 B.buildIntrinsic(Intrinsic::amdgcn_sqrt, ArrayRef<Register>({Dst}))
5846 .addUse(X)
5847 .setMIFlags(Flags);
5848 MI.eraseFromParent();
5849 return true;
5850 }
5851
5852 auto ScaleThreshold = B.buildFConstant(F32, 0x1.0p-96f);
5853 auto NeedScale = B.buildFCmp(CmpInst::FCMP_OGT, S1, ScaleThreshold, X, Flags);
5854 auto ScaleUpFactor = B.buildFConstant(F32, 0x1.0p+32f);
5855 auto ScaledX = B.buildFMul(F32, X, ScaleUpFactor, Flags);
5856 auto SqrtX = B.buildSelect(F32, NeedScale, ScaledX, X, Flags);
5857
5859 if (needsDenormHandlingF32(MF, X, Flags)) {
5860 B.buildIntrinsic(Intrinsic::amdgcn_sqrt, ArrayRef<Register>({SqrtS}))
5861 .addUse(SqrtX.getReg(0))
5862 .setMIFlags(Flags);
5863
5864 auto NegOne = B.buildConstant(I32, -1);
5865 auto SqrtSNextDown = B.buildAdd(I32, SqrtS, NegOne);
5866
5867 auto NegSqrtSNextDown = B.buildFNeg(F32, SqrtSNextDown, Flags);
5868 auto SqrtVP = B.buildFMA(F32, NegSqrtSNextDown, SqrtS, SqrtX, Flags);
5869
5870 auto PosOne = B.buildConstant(I32, 1);
5871 auto SqrtSNextUp = B.buildAdd(I32, SqrtS, PosOne);
5872
5873 auto NegSqrtSNextUp = B.buildFNeg(F32, SqrtSNextUp, Flags);
5874 auto SqrtVS = B.buildFMA(F32, NegSqrtSNextUp, SqrtS, SqrtX, Flags);
5875
5876 auto Zero = B.buildFConstant(F32, 0.0f);
5877 auto SqrtVPLE0 = B.buildFCmp(CmpInst::FCMP_OLE, S1, SqrtVP, Zero, Flags);
5878
5879 SqrtS =
5880 B.buildSelect(F32, SqrtVPLE0, SqrtSNextDown, SqrtS, Flags).getReg(0);
5881
5882 auto SqrtVPVSGT0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, SqrtVS, Zero, Flags);
5883 SqrtS =
5884 B.buildSelect(F32, SqrtVPVSGT0, SqrtSNextUp, SqrtS, Flags).getReg(0);
5885 } else {
5886 auto SqrtR =
5887 B.buildIntrinsic(Intrinsic::amdgcn_rsq, {F32}).addReg(SqrtX.getReg(0));
5888 B.buildFMul(SqrtS, SqrtX, SqrtR, Flags);
5889
5890 auto Half = B.buildFConstant(F32, 0.5f);
5891 auto SqrtH = B.buildFMul(F32, SqrtR, Half, Flags);
5892 auto NegSqrtH = B.buildFNeg(F32, SqrtH, Flags);
5893 auto SqrtE = B.buildFMA(F32, NegSqrtH, SqrtS, Half, Flags);
5894 SqrtH = B.buildFMA(F32, SqrtH, SqrtE, SqrtH, Flags);
5895 SqrtS = B.buildFMA(F32, SqrtS, SqrtE, SqrtS, Flags).getReg(0);
5896 auto NegSqrtS = B.buildFNeg(F32, SqrtS, Flags);
5897 auto SqrtD = B.buildFMA(F32, NegSqrtS, SqrtS, SqrtX, Flags);
5898 SqrtS = B.buildFMA(F32, SqrtD, SqrtH, SqrtS, Flags).getReg(0);
5899 }
5900
5901 auto ScaleDownFactor = B.buildFConstant(F32, 0x1.0p-16f);
5902
5903 auto ScaledDown = B.buildFMul(F32, SqrtS, ScaleDownFactor, Flags);
5904
5905 SqrtS = B.buildSelect(F32, NeedScale, ScaledDown, SqrtS, Flags).getReg(0);
5906
5907 auto IsZeroOrInf = B.buildIsFPClass(LLT::scalar(1), SqrtX, fcZero | fcPosInf);
5908 B.buildSelect(Dst, IsZeroOrInf, SqrtX, SqrtS, Flags);
5909
5910 MI.eraseFromParent();
5911 return true;
5912}
5913
bool AMDGPULegalizerInfo::legalizeFSQRTF64(MachineInstr &MI,
                                           MachineRegisterInfo &MRI,
                                           MachineIRBuilder &B) const {
  // For double type, the SQRT and RSQ instructions don't have required
  // precision, we apply Goldschmidt's algorithm to improve the result:
  //
  // y0 = rsq(x)
  // g0 = x * y0
  // h0 = 0.5 * y0
  //
  // r0 = 0.5 - h0 * g0
  // g1 = g0 * r0 + g0
  // h1 = h0 * r0 + h0
  //
  // r1 = 0.5 - h1 * g1 => d0 = x - g1 * g1
  // g2 = g1 * r1 + g1       g2 = d0 * h1 + g1
  // h2 = h1 * r1 + h1
  //
  // r2 = 0.5 - h2 * g2 => d1 = x - g2 * g2
  // g3 = g2 * r2 + g2       g3 = d1 * h1 + g2
  //
  // sqrt(x) = g3

  const LLT S1 = LLT::scalar(1);
  const LLT S32 = LLT::scalar(32);
  const LLT F64 = LLT::scalar(64);

  Register Dst = MI.getOperand(0).getReg();
  assert(MRI.getType(Dst) == F64 && "only expect to lower f64 sqrt");

  Register X = MI.getOperand(1).getReg();
  unsigned Flags = MI.getFlags();

  // Inputs below 2^-767 are scaled up so rsq stays in a well-conditioned
  // range; the result is compensated after the iteration.
  auto ScaleConstant = B.buildFConstant(F64, 0x1.0p-767);

  auto ZeroInt = B.buildConstant(S32, 0);
  auto Scaling = B.buildFCmp(FCmpInst::FCMP_OLT, S1, X, ScaleConstant);

  // Scale up input if it is too small.
  auto ScaleUpFactor = B.buildConstant(S32, 256);
  auto ScaleUp = B.buildSelect(S32, Scaling, ScaleUpFactor, ZeroInt);
  auto SqrtX = B.buildFLdexp(F64, X, ScaleUp, Flags);

  // y0 = rsq(x)
  auto SqrtY =
      B.buildIntrinsic(Intrinsic::amdgcn_rsq, {F64}).addReg(SqrtX.getReg(0));

  auto Half = B.buildFConstant(F64, 0.5);
  auto SqrtH0 = B.buildFMul(F64, SqrtY, Half);  // h0 = 0.5 * y0
  auto SqrtS0 = B.buildFMul(F64, SqrtX, SqrtY); // g0 = x * y0

  auto NegSqrtH0 = B.buildFNeg(F64, SqrtH0);
  auto SqrtR0 = B.buildFMA(F64, NegSqrtH0, SqrtS0, Half); // r0 = 0.5 - h0*g0

  auto SqrtS1 = B.buildFMA(F64, SqrtS0, SqrtR0, SqrtS0); // g1
  auto SqrtH1 = B.buildFMA(F64, SqrtH0, SqrtR0, SqrtH0); // h1

  auto NegSqrtS1 = B.buildFNeg(F64, SqrtS1);
  auto SqrtD0 = B.buildFMA(F64, NegSqrtS1, SqrtS1, SqrtX); // d0 = x - g1*g1

  auto SqrtS2 = B.buildFMA(F64, SqrtD0, SqrtH1, SqrtS1); // g2 = d0*h1 + g1

  auto NegSqrtS2 = B.buildFNeg(F64, SqrtS2);
  auto SqrtD1 = B.buildFMA(F64, NegSqrtS2, SqrtS2, SqrtX); // d1 = x - g2*g2

  auto SqrtRet = B.buildFMA(F64, SqrtD1, SqrtH1, SqrtS2); // g3 = d1*h1 + g2

  // Scale down the result. sqrt halves the exponent, so scaling the input by
  // 2^256 requires scaling the result back by 2^-128.
  auto ScaleDownFactor = B.buildConstant(S32, -128);
  auto ScaleDown = B.buildSelect(S32, Scaling, ScaleDownFactor, ZeroInt);
  SqrtRet = B.buildFLdexp(F64, SqrtRet, ScaleDown, Flags);

  // TODO: Switch to fcmp oeq 0 for finite only. Can't fully remove this check
  // with finite only or nsz because rsq(+/-0) = +/-inf

  // TODO: Check for DAZ and expand to subnormals
  auto IsZeroOrInf = B.buildIsFPClass(LLT::scalar(1), SqrtX, fcZero | fcPosInf);

  // If x is +INF, +0, or -0, use its original value
  B.buildSelect(Dst, IsZeroOrInf, SqrtX, SqrtRet, Flags);

  MI.eraseFromParent();
  return true;
}
5997
// Dispatch G_FSQRT lowering on the result type. Only f16, f32 and f64 are
// handled here; any other type reports failure to the legalizer.
bool AMDGPULegalizerInfo::legalizeFSQRT(MachineInstr &MI,
                                        MachineRegisterInfo &MRI,
                                        MachineIRBuilder &B) const {
  LLT Ty = MRI.getType(MI.getOperand(0).getReg());
  if (Ty == LLT::scalar(32))
    return legalizeFSQRTF32(MI, MRI, B);
  if (Ty == LLT::scalar(64))
    return legalizeFSQRTF64(MI, MRI, B);
  if (Ty == LLT::scalar(16))
    return legalizeFSQRTF16(MI, MRI, B);
  return false;
}
6010
// Expand llvm.amdgcn.rsq.clamp on targets that don't support the instruction.
// FIXME: Why do we handle this one but not other removed instructions?
//
// Reciprocal square root. The clamp prevents infinite results, clamping
// infinities to max_float. D.f = 1.0 / sqrt(S0.f), result clamped to
// +-max_float.
bool AMDGPULegalizerInfo::legalizeRsqClampIntrinsic(MachineInstr &MI,
                                                    MachineRegisterInfo &MRI,
                                                    MachineIRBuilder &B) const {
  // Targets before Volcanic Islands still have the clamped instruction; keep
  // the intrinsic as-is.
  if (ST.getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS)
    return true;

  Register Dst = MI.getOperand(0).getReg();
  Register Src = MI.getOperand(2).getReg();
  auto Flags = MI.getFlags();

  LLT Ty = MRI.getType(Dst);

  // Only the f32 and f64 forms are expanded; reject other types.
  const fltSemantics *FltSemantics;
  if (Ty == LLT::scalar(32))
    FltSemantics = &APFloat::IEEEsingle();
  else if (Ty == LLT::scalar(64))
    FltSemantics = &APFloat::IEEEdouble();
  else
    return false;

  auto Rsq = B.buildIntrinsic(Intrinsic::amdgcn_rsq, {Ty})
                 .addUse(Src)
                 .setMIFlags(Flags);

  // We don't need to concern ourselves with the snan handling difference, since
  // the rsq quieted (or not) so use the one which will directly select.
  const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
  const bool UseIEEE = MFI->getMode().IEEE;

  // Clamp to [-max_float, +max_float], picking the IEEE or non-IEEE min/max
  // variant to match the function's FP mode so it selects directly.
  auto MaxFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics));
  auto ClampMax = UseIEEE ? B.buildFMinNumIEEE(Ty, Rsq, MaxFlt, Flags) :
                            B.buildFMinNum(Ty, Rsq, MaxFlt, Flags);

  auto MinFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics, true));

  if (UseIEEE)
    B.buildFMaxNumIEEE(Dst, ClampMax, MinFlt, Flags);
  else
    B.buildFMaxNum(Dst, ClampMax, MinFlt, Flags);
  MI.eraseFromParent();
  return true;
}
6059
// TODO: Fix pointer type handling
// Legalize the cross-lane intrinsics (readlane, writelane, permlane*,
// set.inactive, update.dpp, mov.dpp8, ...) to operate on 32-bit pieces
// (or 64-bit pieces for update.dpp on DP-ALU-DPP-capable subtargets).
bool AMDGPULegalizerInfo::legalizeLaneOp(LegalizerHelper &Helper,
                                         MachineInstr &MI,
                                         Intrinsic::ID IID) const {

  MachineIRBuilder &B = Helper.MIRBuilder;
  MachineRegisterInfo &MRI = *B.getMRI();

  bool IsPermLane16 = IID == Intrinsic::amdgcn_permlane16 ||
                      IID == Intrinsic::amdgcn_permlanex16;
  bool IsSetInactive = IID == Intrinsic::amdgcn_set_inactive ||
                       IID == Intrinsic::amdgcn_set_inactive_chain_arg;

  // Re-emit the lane intrinsic at type VT with the given (already widened or
  // split) sources, copying any trailing register/immediate operands that the
  // particular intrinsic carries from the original instruction.
  auto createLaneOp = [&IID, &B, &MI](Register Src0, Register Src1,
                                      Register Src2, LLT VT) -> Register {
    auto LaneOp = B.buildIntrinsic(IID, {VT}).addUse(Src0);
    switch (IID) {
    case Intrinsic::amdgcn_readfirstlane:
    case Intrinsic::amdgcn_permlane64:
      return LaneOp.getReg(0);
    case Intrinsic::amdgcn_readlane:
    case Intrinsic::amdgcn_set_inactive:
    case Intrinsic::amdgcn_set_inactive_chain_arg:
      return LaneOp.addUse(Src1).getReg(0);
    case Intrinsic::amdgcn_writelane:
      return LaneOp.addUse(Src1).addUse(Src2).getReg(0);
    case Intrinsic::amdgcn_permlane16:
    case Intrinsic::amdgcn_permlanex16: {
      Register Src3 = MI.getOperand(5).getReg();
      int64_t Src4 = MI.getOperand(6).getImm();
      int64_t Src5 = MI.getOperand(7).getImm();
      return LaneOp.addUse(Src1)
          .addUse(Src2)
          .addUse(Src3)
          .addImm(Src4)
          .addImm(Src5)
          .getReg(0);
    }
    case Intrinsic::amdgcn_mov_dpp8:
      return LaneOp.addImm(MI.getOperand(3).getImm()).getReg(0);
    case Intrinsic::amdgcn_update_dpp:
      return LaneOp.addUse(Src1)
          .addImm(MI.getOperand(4).getImm())
          .addImm(MI.getOperand(5).getImm())
          .addImm(MI.getOperand(6).getImm())
          .addImm(MI.getOperand(7).getImm())
          .getReg(0);
    default:
      llvm_unreachable("unhandled lane op");
    }
  };

  Register DstReg = MI.getOperand(0).getReg();
  Register Src0 = MI.getOperand(2).getReg();
  // Src1/Src2 stay invalid for intrinsics that don't take them.
  Register Src1, Src2;
  if (IID == Intrinsic::amdgcn_readlane || IID == Intrinsic::amdgcn_writelane ||
      IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16) {
    Src1 = MI.getOperand(3).getReg();
    if (IID == Intrinsic::amdgcn_writelane || IsPermLane16) {
      Src2 = MI.getOperand(4).getReg();
    }
  }

  LLT Ty = MRI.getType(DstReg);
  unsigned Size = Ty.getSizeInBits();

  // Default split granularity is 32 bits; update.dpp may split at 64 bits
  // when the DP ALU supports 64-bit DPP with a legal DPP control value.
  unsigned SplitSize = 32;
  if (IID == Intrinsic::amdgcn_update_dpp && (Size % 64 == 0) &&
      ST.hasDPALU_DPP() &&
      AMDGPU::isLegalDPALU_DPPControl(ST, MI.getOperand(4).getImm()))
    SplitSize = 64;

  if (Size == SplitSize) {
    // Already legal
    return true;
  }

  if (Size < 32) {
    // Narrow values: any-extend sources to 32 bits, run the op, truncate back.
    Src0 = B.buildAnyExt(S32, Src0).getReg(0);

    if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16)
      Src1 = B.buildAnyExt(LLT::scalar(32), Src1).getReg(0);

    if (IID == Intrinsic::amdgcn_writelane)
      Src2 = B.buildAnyExt(LLT::scalar(32), Src2).getReg(0);

    Register LaneOpDst = createLaneOp(Src0, Src1, Src2, S32);
    B.buildTrunc(DstReg, LaneOpDst);
    MI.eraseFromParent();
    return true;
  }

  if (Size % SplitSize != 0)
    return false;

  // Choose the per-piece type: prefer keeping the vector element type (or a
  // short vector of elements) when it tiles SplitSize evenly; otherwise fall
  // back to plain scalar pieces and bitcast the reassembled result.
  LLT PartialResTy = LLT::scalar(SplitSize);
  bool NeedsBitcast = false;
  if (Ty.isVector()) {
    LLT EltTy = Ty.getElementType();
    unsigned EltSize = EltTy.getSizeInBits();
    if (EltSize == SplitSize) {
      PartialResTy = EltTy;
    } else if (EltSize == 16 || EltSize == 32) {
      unsigned NElem = SplitSize / EltSize;
      PartialResTy = Ty.changeElementCount(ElementCount::getFixed(NElem));
    } else {
      // Handle all other cases via S32/S64 pieces
      NeedsBitcast = true;
    }
  }

  SmallVector<Register, 4> PartialRes;
  unsigned NumParts = Size / SplitSize;
  MachineInstrBuilder Src0Parts = B.buildUnmerge(PartialResTy, Src0);
  MachineInstrBuilder Src1Parts, Src2Parts;

  if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16)
    Src1Parts = B.buildUnmerge(PartialResTy, Src1);

  if (IID == Intrinsic::amdgcn_writelane)
    Src2Parts = B.buildUnmerge(PartialResTy, Src2);

  // Apply the lane op piece by piece, then reassemble the full-width result.
  for (unsigned i = 0; i < NumParts; ++i) {
    Src0 = Src0Parts.getReg(i);

    if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16)
      Src1 = Src1Parts.getReg(i);

    if (IID == Intrinsic::amdgcn_writelane)
      Src2 = Src2Parts.getReg(i);

    PartialRes.push_back(createLaneOp(Src0, Src1, Src2, PartialResTy));
  }

  if (NeedsBitcast)
    B.buildBitcast(DstReg, B.buildMergeLikeInstr(
                               LLT::scalar(Ty.getSizeInBits()), PartialRes));
  else
    B.buildMergeLikeInstr(DstReg, PartialRes);

  MI.eraseFromParent();
  return true;
}
6203
// Materialize the implicit-argument pointer into DstReg: the kernarg segment
// pointer plus the target-specific implicit-parameter offset.
bool AMDGPULegalizerInfo::getImplicitArgPtr(Register DstReg,
                                            MachineRegisterInfo &MRI,
                                            MachineIRBuilder &B) const {
  uint64_t Offset =
      ST.getTargetLowering()->getImplicitParameterOffset(
          B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT);
  LLT DstTy = MRI.getType(DstReg);
  // Index type matches the pointer width for the ptr-offset add.
  LLT IdxTy = LLT::scalar(DstTy.getSizeInBits());

  Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy);
  if (!loadInputValue(KernargPtrReg, B,
                      AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR))
    return false;

  B.buildObjectPtrOffset(DstReg, KernargPtrReg,
                         B.buildConstant(IdxTy, Offset).getReg(0));
  return true;
}
6222
/// To create a buffer resource from a 64-bit pointer, mask off the upper 32
/// bits of the pointer and replace them with the stride argument, then
/// merge_values everything together. In the common case of a raw buffer (the
/// stride component is 0), we can just AND off the upper half.
bool AMDGPULegalizerInfo::legalizePointerAsRsrcIntrin(
    MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
  Register Result = MI.getOperand(0).getReg();
  Register Pointer = MI.getOperand(2).getReg();
  Register Stride = MI.getOperand(3).getReg();
  Register NumRecords = MI.getOperand(4).getReg();
  Register Flags = MI.getOperand(5).getReg();

  LLT S32 = LLT::scalar(32);
  LLT S64 = LLT::scalar(64);

  // Insert the expansion after MI so the operands are already defined.
  B.setInsertPt(B.getMBB(), ++B.getInsertPt());

  auto ExtStride = B.buildAnyExt(S32, Stride);

  if (ST.has45BitNumRecordsBufferResource()) {
    Register Zero = B.buildConstant(S32, 0).getReg(0);
    // Build the lower 64-bit value, which has a 57-bit base and the lower 7-bit
    // num_records.
    LLT PtrIntTy = LLT::scalar(MRI.getType(Pointer).getSizeInBits());
    auto PointerInt = B.buildPtrToInt(PtrIntTy, Pointer);
    auto ExtPointer = B.buildAnyExtOrTrunc(S64, PointerInt);
    auto NumRecordsLHS = B.buildShl(S64, NumRecords, B.buildConstant(S32, 57));
    Register LowHalf = B.buildOr(S64, ExtPointer, NumRecordsLHS).getReg(0);

    // Build the higher 64-bit value, which has the higher 38-bit num_records,
    // 6-bit zero (omit), 16-bit stride and scale and 4-bit flag.
    auto NumRecordsRHS = B.buildLShr(S64, NumRecords, B.buildConstant(S32, 7));
    auto ShiftedStride = B.buildShl(S32, ExtStride, B.buildConstant(S32, 12));
    auto ExtShiftedStride =
        B.buildMergeValues(S64, {Zero, ShiftedStride.getReg(0)});
    auto ShiftedFlags = B.buildShl(S32, Flags, B.buildConstant(S32, 28));
    auto ExtShiftedFlags =
        B.buildMergeValues(S64, {Zero, ShiftedFlags.getReg(0)});
    auto CombinedFields = B.buildOr(S64, NumRecordsRHS, ExtShiftedStride);
    Register HighHalf =
        B.buildOr(S64, CombinedFields, ExtShiftedFlags).getReg(0);
    B.buildMergeValues(Result, {LowHalf, HighHalf});
  } else {
    // Classic v4i32 descriptor: {ptr_lo, (ptr_hi & 0xffff) | (stride << 16),
    // num_records, flags}.
    NumRecords = B.buildTrunc(S32, NumRecords).getReg(0);
    auto Unmerge = B.buildUnmerge(S32, Pointer);
    auto LowHalf = Unmerge.getReg(0);
    auto HighHalf = Unmerge.getReg(1);

    auto AndMask = B.buildConstant(S32, 0x0000ffff);
    auto Masked = B.buildAnd(S32, HighHalf, AndMask);
    auto ShiftConst = B.buildConstant(S32, 16);
    auto ShiftedStride = B.buildShl(S32, ExtStride, ShiftConst);
    auto NewHighHalf = B.buildOr(S32, Masked, ShiftedStride);
    Register NewHighHalfReg = NewHighHalf.getReg(0);
    B.buildMergeValues(Result, {LowHalf, NewHighHalfReg, NumRecords, Flags});
  }

  MI.eraseFromParent();
  return true;
}
6283
// Lower llvm.amdgcn.implicitarg.ptr. In entry functions the pointer is
// computed from the kernarg segment; elsewhere it is forwarded as a
// preloaded function argument.
bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI,
                                                 MachineRegisterInfo &MRI,
                                                 MachineIRBuilder &B) const {
  const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
  if (!MFI->isEntryFunction()) {
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
  }

  Register DstReg = MI.getOperand(0).getReg();
  if (!getImplicitArgPtr(DstReg, MRI, B))
    return false;

  MI.eraseFromParent();
  return true;
}
6300
// Materialize the LDS kernel id into DstReg when it is known from function
// metadata.
// NOTE(review): this returns false unconditionally, even after building the
// constant, and the caller (legalizeLDSKernelId) treats false as legalization
// failure — confirm whether this should return KnownSize.has_value().
bool AMDGPULegalizerInfo::getLDSKernelId(Register DstReg,
                                         MachineRegisterInfo &MRI,
                                         MachineIRBuilder &B) const {
  Function &F = B.getMF().getFunction();
  std::optional<uint32_t> KnownSize =
      AMDGPUMachineFunction::getLDSKernelIdMetadata(F);
  if (KnownSize.has_value())
    B.buildConstant(DstReg, *KnownSize);
  return false;
}
6311
// Lower llvm.amdgcn.lds.kernel.id. In entry functions the id comes from
// function metadata; elsewhere it is forwarded as a preloaded argument.
bool AMDGPULegalizerInfo::legalizeLDSKernelId(MachineInstr &MI,
                                              MachineRegisterInfo &MRI,
                                              MachineIRBuilder &B) const {

  const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
  if (!MFI->isEntryFunction()) {
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::LDS_KERNEL_ID);
  }

  Register DstReg = MI.getOperand(0).getReg();
  if (!getLDSKernelId(DstReg, MRI, B))
    return false;

  MI.eraseFromParent();
  return true;
}
6329
// Lower an address-space test of a flat pointer by inspecting the high 32
// bits of the pointer and comparing them against the segment aperture.
bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI,
                                              MachineRegisterInfo &MRI,
                                              MachineIRBuilder &B,
                                              unsigned AddrSpace) const {
  const LLT S32 = LLT::scalar(32);
  auto Unmerge = B.buildUnmerge(S32, MI.getOperand(2).getReg());
  Register Hi32 = Unmerge.getReg(1);

  if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS &&
      ST.hasGloballyAddressableScratch()) {
    // With globally addressable scratch there is no fixed private aperture;
    // compare against the flat scratch base's high half instead.
    Register FlatScratchBaseHi =
        B.buildInstr(AMDGPU::S_MOV_B32, {S32},
                     {Register(AMDGPU::SRC_FLAT_SCRATCH_BASE_HI)})
            .getReg(0);
    MRI.setRegClass(FlatScratchBaseHi, &AMDGPU::SReg_32RegClass);
    // Test bits 63..58 against the aperture address.
    Register XOR = B.buildXor(S32, Hi32, FlatScratchBaseHi).getReg(0);
    B.buildICmp(ICmpInst::ICMP_ULT, MI.getOperand(0), XOR,
                B.buildConstant(S32, 1u << 26));
  } else {
    // A flat pointer is in the segment iff its high half equals the aperture.
    Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B);
    B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg);
  }
  MI.eraseFromParent();
  return true;
}
6356
// The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args:
// offset (the offset that is included in bounds checking and swizzling, to be
// split between the instruction's voffset and immoffset fields) and soffset
// (the offset that is excluded from bounds checking and swizzling, to go in
// the instruction's soffset field). This function takes the first kind of
// offset and figures out how to split it between voffset and immoffset.
// \returns the {voffset register, immoffset} pair.
std::pair<Register, unsigned>
AMDGPULegalizerInfo::splitBufferOffsets(MachineIRBuilder &B,
                                        Register OrigOffset) const {
  const unsigned MaxImm = SIInstrInfo::getMaxMUBUFImmOffset(ST);
  Register BaseReg;
  unsigned ImmOffset;
  const LLT S32 = LLT::scalar(32);
  MachineRegisterInfo &MRI = *B.getMRI();

  // On GFX1250+, voffset and immoffset are zero-extended from 32 bits before
  // being added, so we can only safely match a 32-bit addition with no unsigned
  // overflow.
  bool CheckNUW = ST.hasGFX1250Insts();
  std::tie(BaseReg, ImmOffset) = AMDGPU::getBaseWithConstantOffset(
      MRI, OrigOffset, /*KnownBits=*/nullptr, CheckNUW);

  // If BaseReg is a pointer, convert it to int.
  if (MRI.getType(BaseReg).isPointer())
    BaseReg = B.buildPtrToInt(MRI.getType(OrigOffset), BaseReg).getReg(0);

  // If the immediate value is too big for the immoffset field, put only bits
  // that would normally fit in the immoffset field. The remaining value that
  // is copied/added for the voffset field is a large power of 2, and it
  // stands more chance of being CSEd with the copy/add for another similar
  // load/store.
  // However, do not do that rounding down if that is a negative
  // number, as it appears to be illegal to have a negative offset in the
  // vgpr, even if adding the immediate offset makes it positive.
  unsigned Overflow = ImmOffset & ~MaxImm;
  ImmOffset -= Overflow;
  if ((int32_t)Overflow < 0) {
    Overflow += ImmOffset;
    ImmOffset = 0;
  }

  // Fold any overflow back into the voffset register.
  if (Overflow != 0) {
    if (!BaseReg) {
      BaseReg = B.buildConstant(S32, Overflow).getReg(0);
    } else {
      auto OverflowVal = B.buildConstant(S32, Overflow);
      BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0);
    }
  }

  // The instruction always needs a voffset operand, even if it is zero.
  if (!BaseReg)
    BaseReg = B.buildConstant(S32, 0).getReg(0);

  return std::pair(BaseReg, ImmOffset);
}
6412
/// Handle register layout difference for f16 images for some subtargets.
/// Returns a (possibly rewritten) register holding \p Reg's v*s16 data in the
/// layout the subtarget's D16 memory instructions expect.
Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B,
                                             MachineRegisterInfo &MRI,
                                             Register Reg,
                                             bool ImageStore) const {
  const LLT S16 = LLT::scalar(16);
  const LLT S32 = LLT::scalar(32);
  LLT StoreVT = MRI.getType(Reg);
  assert(StoreVT.isVector() && StoreVT.getElementType() == S16);

  // Unpacked-D16 subtargets want one 16-bit element per 32-bit register.
  if (ST.hasUnpackedD16VMem()) {
    auto Unmerge = B.buildUnmerge(S16, Reg);

    SmallVector<Register, 4> WideRegs;
    for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
      WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0));

    int NumElts = StoreVT.getNumElements();

    return B.buildBuildVector(LLT::fixed_vector(NumElts, S32), WideRegs)
        .getReg(0);
  }

  // Workaround for the image-store D16 bug: pad the packed data out to the
  // register count the hardware reads.
  if (ImageStore && ST.hasImageStoreD16Bug()) {
    if (StoreVT.getNumElements() == 2) {
      SmallVector<Register, 4> PackedRegs;
      Reg = B.buildBitcast(S32, Reg).getReg(0);
      PackedRegs.push_back(Reg);
      PackedRegs.resize(2, B.buildUndef(S32).getReg(0));
      return B.buildBuildVector(LLT::fixed_vector(2, S32), PackedRegs)
          .getReg(0);
    }

    if (StoreVT.getNumElements() == 3) {
      SmallVector<Register, 4> PackedRegs;
      auto Unmerge = B.buildUnmerge(S16, Reg);
      for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
        PackedRegs.push_back(Unmerge.getReg(I));
      PackedRegs.resize(6, B.buildUndef(S16).getReg(0));
      Reg = B.buildBuildVector(LLT::fixed_vector(6, S16), PackedRegs).getReg(0);
      return B.buildBitcast(LLT::fixed_vector(3, S32), Reg).getReg(0);
    }

    if (StoreVT.getNumElements() == 4) {
      SmallVector<Register, 4> PackedRegs;
      Reg = B.buildBitcast(LLT::fixed_vector(2, S32), Reg).getReg(0);
      auto Unmerge = B.buildUnmerge(S32, Reg);
      for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
        PackedRegs.push_back(Unmerge.getReg(I));
      PackedRegs.resize(4, B.buildUndef(S32).getReg(0));
      return B.buildBuildVector(LLT::fixed_vector(4, S32), PackedRegs)
          .getReg(0);
    }

    llvm_unreachable("invalid data type");
  }

  // v3s16 is stored as v4s16 with an undef fourth element.
  if (StoreVT == LLT::fixed_vector(3, S16)) {
    Reg = B.buildPadVectorWithUndefElements(LLT::fixed_vector(4, S16), Reg)
              .getReg(0);
  }
  return Reg;
}
6476
// Rewrite a buffer-store source value into a register type that is legal for
// the selected store instruction, returning the (possibly new) data register.
Register AMDGPULegalizerInfo::fixStoreSourceType(MachineIRBuilder &B,
                                                 Register VData, LLT MemTy,
                                                 bool IsFormat) const {
  MachineRegisterInfo *MRI = B.getMRI();
  LLT Ty = MRI->getType(VData);

  const LLT S16 = LLT::scalar(16);

  // Fixup buffer resources themselves needing to be v4i128.
  if (hasBufferRsrcWorkaround(Ty))
    return castBufferRsrcToV4I32(VData, B);

  if (shouldBitcastLoadStoreType(ST, Ty, MemTy)) {
    Ty = getBitcastRegisterType(Ty);
    VData = B.buildBitcast(Ty, VData).getReg(0);
  }
  // Fixup illegal register types for i8 stores.
  if (Ty == LLT::scalar(8) || Ty == S16) {
    Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0);
    return AnyExt;
  }

  if (Ty.isVector()) {
    // Small f16 vectors for format stores may need subtarget-specific
    // repacking (see handleD16VData).
    if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) {
      if (IsFormat)
        return handleD16VData(B, *MRI, VData);
    }
  }

  return VData;
}
6508
// Lower the raw/struct (t)buffer store intrinsics to the target's
// G_AMDGPU_*BUFFER_STORE* pseudo instructions, splitting the bounds-checked
// offset between the voffset and immediate-offset fields.
bool AMDGPULegalizerInfo::legalizeBufferStore(MachineInstr &MI,
                                              LegalizerHelper &Helper,
                                              bool IsTyped,
                                              bool IsFormat) const {
  MachineIRBuilder &B = Helper.MIRBuilder;
  MachineRegisterInfo &MRI = *B.getMRI();

  Register VData = MI.getOperand(1).getReg();
  LLT Ty = MRI.getType(VData);
  LLT EltTy = Ty.getScalarType();
  // D16 forms are used for 16-bit element format stores.
  const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
  const LLT S32 = LLT::scalar(32);

  MachineMemOperand *MMO = *MI.memoperands_begin();
  const int MemSize = MMO->getSize().getValue();
  LLT MemTy = MMO->getMemoryType();

  VData = fixStoreSourceType(B, VData, MemTy, IsFormat);

  castBufferRsrcArgToV4I32(MI, B, 2);
  Register RSrc = MI.getOperand(2).getReg();

  unsigned ImmOffset;

  // The typed intrinsics add an immediate after the registers.
  const unsigned NumVIndexOps = IsTyped ? 8 : 7;

  // The struct intrinsic variants add one additional operand over raw.
  const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
  Register VIndex;
  int OpOffset = 0;
  if (HasVIndex) {
    VIndex = MI.getOperand(3).getReg();
    OpOffset = 1;
  } else {
    // Raw variants have no vindex; the instruction still takes a zero operand.
    VIndex = B.buildConstant(S32, 0).getReg(0);
  }

  Register VOffset = MI.getOperand(3 + OpOffset).getReg();
  Register SOffset = MI.getOperand(4 + OpOffset).getReg();

  unsigned Format = 0;
  if (IsTyped) {
    Format = MI.getOperand(5 + OpOffset).getImm();
    ++OpOffset;
  }

  unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();

  std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset);

  // Pick the pseudo: typed/format variants by D16-ness, plain stores by the
  // memory access size.
  unsigned Opc;
  if (IsTyped) {
    Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16 :
                  AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT;
  } else if (IsFormat) {
    Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16 :
                  AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT;
  } else {
    switch (MemSize) {
    case 1:
      Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE;
      break;
    case 2:
      Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT;
      break;
    default:
      Opc = AMDGPU::G_AMDGPU_BUFFER_STORE;
      break;
    }
  }

  auto MIB = B.buildInstr(Opc)
                 .addUse(VData)    // vdata
                 .addUse(RSrc)     // rsrc
                 .addUse(VIndex)   // vindex
                 .addUse(VOffset)  // voffset
                 .addUse(SOffset)  // soffset
                 .addImm(ImmOffset); // offset(imm)

  if (IsTyped)
    MIB.addImm(Format);

  MIB.addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
      .addMemOperand(MMO);

  MI.eraseFromParent();
  return true;
}
6599
6600static void buildBufferLoad(unsigned Opc, Register LoadDstReg, Register RSrc,
6601 Register VIndex, Register VOffset, Register SOffset,
6602 unsigned ImmOffset, unsigned Format,
6603 unsigned AuxiliaryData, MachineMemOperand *MMO,
6604 bool IsTyped, bool HasVIndex, MachineIRBuilder &B) {
6605 auto MIB = B.buildInstr(Opc)
6606 .addDef(LoadDstReg) // vdata
6607 .addUse(RSrc) // rsrc
6608 .addUse(VIndex) // vindex
6609 .addUse(VOffset) // voffset
6610 .addUse(SOffset) // soffset
6611 .addImm(ImmOffset); // offset(imm)
6612
6613 if (IsTyped)
6614 MIB.addImm(Format);
6615
6616 MIB.addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm)
6617 .addImm(HasVIndex ? -1 : 0) // idxen(imm)
6618 .addMemOperand(MMO);
6619}
6620
// Lower the raw/struct (t)buffer load intrinsics to G_AMDGPU_*BUFFER_LOAD*
// pseudos, handling the optional TFE status result, D16 packing differences,
// and sub-dword result widening.
bool AMDGPULegalizerInfo::legalizeBufferLoad(MachineInstr &MI,
                                             LegalizerHelper &Helper,
                                             bool IsFormat,
                                             bool IsTyped) const {
  MachineIRBuilder &B = Helper.MIRBuilder;
  MachineRegisterInfo &MRI = *B.getMRI();
  GISelChangeObserver &Observer = Helper.Observer;

  // FIXME: Verifier should enforce 1 MMO for these intrinsics.
  MachineMemOperand *MMO = *MI.memoperands_begin();
  const LLT MemTy = MMO->getMemoryType();
  const LLT S32 = LLT::scalar(32);

  Register Dst = MI.getOperand(0).getReg();

  // TFE variants carry a second def holding the access status.
  Register StatusDst;
  int OpOffset = 0;
  assert(MI.getNumExplicitDefs() == 1 || MI.getNumExplicitDefs() == 2);
  bool IsTFE = MI.getNumExplicitDefs() == 2;
  if (IsTFE) {
    StatusDst = MI.getOperand(1).getReg();
    ++OpOffset;
  }

  castBufferRsrcArgToV4I32(MI, B, 2 + OpOffset);
  Register RSrc = MI.getOperand(2 + OpOffset).getReg();

  // The typed intrinsics add an immediate after the registers.
  const unsigned NumVIndexOps = IsTyped ? 8 : 7;

  // The struct intrinsic variants add one additional operand over raw.
  const bool HasVIndex = MI.getNumOperands() == NumVIndexOps + OpOffset;
  Register VIndex;
  if (HasVIndex) {
    VIndex = MI.getOperand(3 + OpOffset).getReg();
    ++OpOffset;
  } else {
    VIndex = B.buildConstant(S32, 0).getReg(0);
  }

  Register VOffset = MI.getOperand(3 + OpOffset).getReg();
  Register SOffset = MI.getOperand(4 + OpOffset).getReg();

  unsigned Format = 0;
  if (IsTyped) {
    Format = MI.getOperand(5 + OpOffset).getImm();
    ++OpOffset;
  }

  unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
  unsigned ImmOffset;

  LLT Ty = MRI.getType(Dst);
  // Make addrspace 8 pointers loads into 4xs32 loads here, so the rest of the
  // logic doesn't have to handle that case.
  if (hasBufferRsrcWorkaround(Ty)) {
    Observer.changingInstr(MI);
    Ty = castBufferRsrcFromV4I32(MI, B, MRI, 0);
    Observer.changedInstr(MI);
    Dst = MI.getOperand(0).getReg();
    B.setInsertPt(B.getMBB(), MI);
  }
  if (shouldBitcastLoadStoreType(ST, Ty, MemTy)) {
    Ty = getBitcastRegisterType(Ty);
    Observer.changingInstr(MI);
    Helper.bitcastDst(MI, Ty, 0);
    Observer.changedInstr(MI);
    Dst = MI.getOperand(0).getReg();
    B.setInsertPt(B.getMBB(), MI);
  }

  LLT EltTy = Ty.getScalarType();
  const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
  const bool Unpacked = ST.hasUnpackedD16VMem();

  std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset);

  unsigned Opc;

  // TODO: Support TFE for typed and narrow loads.
  if (IsTyped) {
    if (IsTFE)
      return false;
    Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 :
                  AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT;
  } else if (IsFormat) {
    if (IsD16) {
      if (IsTFE)
        return false;
      Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16;
    } else {
      Opc = IsTFE ? AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_TFE
                  : AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT;
    }
  } else {
    switch (MemTy.getSizeInBits()) {
    case 8:
      Opc = IsTFE ? AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE_TFE
                  : AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE;
      break;
    case 16:
      Opc = IsTFE ? AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT_TFE
                  : AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT;
      break;
    default:
      Opc = IsTFE ? AMDGPU::G_AMDGPU_BUFFER_LOAD_TFE
                  : AMDGPU::G_AMDGPU_BUFFER_LOAD;
      break;
    }
  }

  if (IsTFE) {
    // TFE loads return the value dwords plus one status dword; load into a
    // wide register and unmerge the status from the data.
    unsigned NumValueDWords = divideCeil(Ty.getSizeInBits(), 32);
    unsigned NumLoadDWords = NumValueDWords + 1;
    LLT LoadTy = LLT::fixed_vector(NumLoadDWords, S32);
    Register LoadDstReg = B.getMRI()->createGenericVirtualRegister(LoadTy);
    buildBufferLoad(Opc, LoadDstReg, RSrc, VIndex, VOffset, SOffset, ImmOffset,
                    Format, AuxiliaryData, MMO, IsTyped, HasVIndex, B);
    if (MemTy.getSizeInBits() < 32) {
      Register ExtDst = B.getMRI()->createGenericVirtualRegister(S32);
      B.buildUnmerge({ExtDst, StatusDst}, LoadDstReg);
      B.buildTrunc(Dst, ExtDst);
    } else if (NumValueDWords == 1) {
      B.buildUnmerge({Dst, StatusDst}, LoadDstReg);
    } else {
      SmallVector<Register, 5> LoadElts;
      for (unsigned I = 0; I != NumValueDWords; ++I)
        LoadElts.push_back(B.getMRI()->createGenericVirtualRegister(S32));
      LoadElts.push_back(StatusDst);
      B.buildUnmerge(LoadElts, LoadDstReg);
      LoadElts.truncate(NumValueDWords);
      B.buildMergeLikeInstr(Dst, LoadElts);
    }
  } else if ((!IsD16 && MemTy.getSizeInBits() < 32) ||
             (IsD16 && !Ty.isVector())) {
    // Sub-dword results are loaded into a full 32-bit register and truncated.
    Register LoadDstReg = B.getMRI()->createGenericVirtualRegister(S32);
    buildBufferLoad(Opc, LoadDstReg, RSrc, VIndex, VOffset, SOffset, ImmOffset,
                    Format, AuxiliaryData, MMO, IsTyped, HasVIndex, B);
    B.setInsertPt(B.getMBB(), ++B.getInsertPt());
    B.buildTrunc(Dst, LoadDstReg);
  } else if (Unpacked && IsD16 && Ty.isVector()) {
    // Unpacked-D16 subtargets return one element per dword; repack to the
    // expected 16-bit element vector.
    LLT UnpackedTy = Ty.changeElementSize(32);
    Register LoadDstReg = B.getMRI()->createGenericVirtualRegister(UnpackedTy);
    buildBufferLoad(Opc, LoadDstReg, RSrc, VIndex, VOffset, SOffset, ImmOffset,
                    Format, AuxiliaryData, MMO, IsTyped, HasVIndex, B);
    B.setInsertPt(B.getMBB(), ++B.getInsertPt());
    // FIXME: G_TRUNC should work, but legalization currently fails
    auto Unmerge = B.buildUnmerge(S32, LoadDstReg);
    SmallVector<Register, 4> Repack;
    for (unsigned I = 0, N = Unmerge->getNumOperands() - 1; I != N; ++I)
      Repack.push_back(B.buildTrunc(EltTy, Unmerge.getReg(I)).getReg(0));
    B.buildMergeLikeInstr(Dst, Repack);
  } else {
    buildBufferLoad(Opc, Dst, RSrc, VIndex, VOffset, SOffset, ImmOffset, Format,
                    AuxiliaryData, MMO, IsTyped, HasVIndex, B);
  }

  MI.eraseFromParent();
  return true;
}
6781
6782static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) {
6783 switch (IntrID) {
6784 case Intrinsic::amdgcn_raw_buffer_atomic_swap:
6785 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_swap:
6786 case Intrinsic::amdgcn_struct_buffer_atomic_swap:
6787 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_swap:
6788 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP;
6789 case Intrinsic::amdgcn_raw_buffer_atomic_add:
6790 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_add:
6791 case Intrinsic::amdgcn_struct_buffer_atomic_add:
6792 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_add:
6793 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD;
6794 case Intrinsic::amdgcn_raw_buffer_atomic_sub:
6795 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub:
6796 case Intrinsic::amdgcn_struct_buffer_atomic_sub:
6797 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub:
6798 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB;
6799 case Intrinsic::amdgcn_raw_buffer_atomic_smin:
6800 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smin:
6801 case Intrinsic::amdgcn_struct_buffer_atomic_smin:
6802 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smin:
6803 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN;
6804 case Intrinsic::amdgcn_raw_buffer_atomic_umin:
6805 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umin:
6806 case Intrinsic::amdgcn_struct_buffer_atomic_umin:
6807 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umin:
6808 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN;
6809 case Intrinsic::amdgcn_raw_buffer_atomic_smax:
6810 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smax:
6811 case Intrinsic::amdgcn_struct_buffer_atomic_smax:
6812 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smax:
6813 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX;
6814 case Intrinsic::amdgcn_raw_buffer_atomic_umax:
6815 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umax:
6816 case Intrinsic::amdgcn_struct_buffer_atomic_umax:
6817 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umax:
6818 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX;
6819 case Intrinsic::amdgcn_raw_buffer_atomic_and:
6820 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_and:
6821 case Intrinsic::amdgcn_struct_buffer_atomic_and:
6822 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_and:
6823 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND;
6824 case Intrinsic::amdgcn_raw_buffer_atomic_or:
6825 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_or:
6826 case Intrinsic::amdgcn_struct_buffer_atomic_or:
6827 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_or:
6828 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR;
6829 case Intrinsic::amdgcn_raw_buffer_atomic_xor:
6830 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_xor:
6831 case Intrinsic::amdgcn_struct_buffer_atomic_xor:
6832 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_xor:
6833 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR;
6834 case Intrinsic::amdgcn_raw_buffer_atomic_inc:
6835 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_inc:
6836 case Intrinsic::amdgcn_struct_buffer_atomic_inc:
6837 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_inc:
6838 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC;
6839 case Intrinsic::amdgcn_raw_buffer_atomic_dec:
6840 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_dec:
6841 case Intrinsic::amdgcn_struct_buffer_atomic_dec:
6842 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_dec:
6843 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC;
6844 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
6845 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap:
6846 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
6847 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cmpswap:
6848 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP;
6849 case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
6850 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fadd:
6851 case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
6852 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fadd:
6853 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD;
6854 case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
6855 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmin:
6856 case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
6857 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmin:
6858 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN;
6859 case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
6860 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmax:
6861 case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
6862 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmax:
6863 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX;
6864 case Intrinsic::amdgcn_raw_buffer_atomic_sub_clamp_u32:
6865 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub_clamp_u32:
6866 case Intrinsic::amdgcn_struct_buffer_atomic_sub_clamp_u32:
6867 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub_clamp_u32:
6868 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB_CLAMP_U32;
6869 case Intrinsic::amdgcn_raw_buffer_atomic_cond_sub_u32:
6870 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cond_sub_u32:
6871 case Intrinsic::amdgcn_struct_buffer_atomic_cond_sub_u32:
6872 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cond_sub_u32:
6873 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_COND_SUB_U32;
6874 default:
6875 llvm_unreachable("unhandled atomic opcode");
6876 }
6877}
6878
// Legalize a raw/struct buffer atomic intrinsic into the matching
// G_AMDGPU_BUFFER_ATOMIC_* pseudo (chosen by getBufferAtomicPseudo) with a
// normalized operand layout: vdata, [cmp], rsrc, vindex, voffset, soffset,
// imm offset, cachepolicy, idxen flag.
// NOTE(review): the opening signature line(s) (rendered lines 6879-6880) are
// missing from this listing; presumably
// bool AMDGPULegalizerInfo::legalizeBufferAtomic(MachineInstr &MI,
// MachineIRBuilder &B, Intrinsic::ID IID) — confirm against upstream.
6881 Intrinsic::ID IID) const {
// cmpswap carries an extra compare-value operand, which shifts every later
// operand index by one; IsCmpSwap drives that adjustment below.
6882 const bool IsCmpSwap =
6883 IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap ||
6884 IID == Intrinsic::amdgcn_struct_buffer_atomic_cmpswap ||
6885 IID == Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap ||
6886 IID == Intrinsic::amdgcn_struct_ptr_buffer_atomic_cmpswap;
6887
6888 Register Dst = MI.getOperand(0).getReg();
6889 // Since we don't have 128-bit atomics, we don't need to handle the case of
6890 // p8 arguments to the atomic itself
6891 Register VData = MI.getOperand(2).getReg();
6892
6893 Register CmpVal;
// Running count of optional operands seen so far; added to the fixed indices.
6894 int OpOffset = 0;
6895
6896 if (IsCmpSwap) {
6897 CmpVal = MI.getOperand(3).getReg();
6898 ++OpOffset;
6899 }
6900
// Rewrite a buffer-fat-pointer rsrc argument to the <4 x i32> form expected
// by the pseudo.
6901 castBufferRsrcArgToV4I32(MI, B, 3 + OpOffset);
6902 Register RSrc = MI.getOperand(3 + OpOffset).getReg();
6903 const unsigned NumVIndexOps = IsCmpSwap ? 9 : 8;
6904
6905 // The struct intrinsic variants add one additional operand over raw.
6906 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
6907 Register VIndex;
6908 if (HasVIndex) {
6909 VIndex = MI.getOperand(4 + OpOffset).getReg();
6910 ++OpOffset;
6911 } else {
// Raw variants have no vindex operand; use a constant zero.
6912 VIndex = B.buildConstant(LLT::scalar(32), 0).getReg(0);
6913 }
6914
6915 Register VOffset = MI.getOperand(4 + OpOffset).getReg();
6916 Register SOffset = MI.getOperand(5 + OpOffset).getReg();
6917 unsigned AuxiliaryData = MI.getOperand(6 + OpOffset).getImm();
6918
6919 MachineMemOperand *MMO = *MI.memoperands_begin();
6920
// Split the variable voffset into a register part plus an immediate field
// that fits the instruction's offset encoding.
6921 unsigned ImmOffset;
6922 std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset);
6923
6924 auto MIB = B.buildInstr(getBufferAtomicPseudo(IID))
6925 .addDef(Dst)
6926 .addUse(VData); // vdata
6927
6928 if (IsCmpSwap)
6929 MIB.addReg(CmpVal);
6930
6931 MIB.addUse(RSrc) // rsrc
6932 .addUse(VIndex) // vindex
6933 .addUse(VOffset) // voffset
6934 .addUse(SOffset) // soffset
6935 .addImm(ImmOffset) // offset(imm)
6936 .addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm)
6937 .addImm(HasVIndex ? -1 : 0) // idxen(imm)
6938 .addMemOperand(MMO);
6939
6940 MI.eraseFromParent();
6941 return true;
6942 }
6943
6944 /// Turn a set of s16 typed registers in \p AddrRegs into a dword sized
6945 /// vector with s16 typed elements.
// NOTE(review): rendered lines 6946 and 6949 (the first signature line and the
// `const AMDGPU::ImageDimIntrinsicInfo *Intr,` parameter line) are missing
// from this listing — confirm the full signature against upstream.
6947 SmallVectorImpl<Register> &PackedAddrs,
6948 unsigned ArgOffset,
6950 bool IsA16, bool IsG16) {
6951 const LLT S16 = LLT::scalar(16);
6952 const LLT V2S16 = LLT::fixed_vector(2, 16);
6953 auto EndIdx = Intr->VAddrEnd;
6954
// Walk every vaddr operand of the image intrinsic, emitting one packed
// <2 x s16> (or bitcast/undef-padded) register per dword of address.
6955 for (unsigned I = Intr->VAddrStart; I < EndIdx; I++) {
6956 MachineOperand &SrcOp = MI.getOperand(ArgOffset + I);
6957 if (!SrcOp.isReg())
6958 continue; // _L to _LZ may have eliminated this.
6959
6960 Register AddrReg = SrcOp.getReg();
6961
// This branch covers operands that stay 32-bit: anything before the
// gradients, gradients when !IsG16, and coordinates when !IsA16.
6962 if ((I < Intr->GradientStart) ||
6963 (I >= Intr->GradientStart && I < Intr->CoordStart && !IsG16) ||
6964 (I >= Intr->CoordStart && !IsA16)) {
6965 if ((I < Intr->GradientStart) && IsA16 &&
6966 (B.getMRI()->getType(AddrReg) == S16)) {
6967 assert(I == Intr->BiasIndex && "Got unexpected 16-bit extra argument");
6968 // Special handling of bias when A16 is on. Bias is of type half but
6969 // occupies full 32-bit.
6970 PackedAddrs.push_back(
6971 B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)})
6972 .getReg(0));
6973 } else {
6974 assert((!IsA16 || Intr->NumBiasArgs == 0 || I != Intr->BiasIndex) &&
6975 "Bias needs to be converted to 16 bit in A16 mode");
6976 // Handle any gradient or coordinate operands that should not be packed
// Reinterpret the 32-bit register as <2 x s16> without changing bits.
6977 AddrReg = B.buildBitcast(V2S16, AddrReg).getReg(0);
6978 PackedAddrs.push_back(AddrReg);
6979 }
6980 } else {
6981 // Dz/dh, dz/dv and the last odd coord are packed with undef. Also, in 1D,
6982 // derivatives dx/dh and dx/dv are packed with undef.
6983 if (((I + 1) >= EndIdx) ||
6984 ((Intr->NumGradients / 2) % 2 == 1 &&
6985 (I == static_cast<unsigned>(Intr->GradientStart +
6986 (Intr->NumGradients / 2) - 1) ||
6987 I == static_cast<unsigned>(Intr->GradientStart +
6988 Intr->NumGradients - 1))) ||
6989 // Check for _L to _LZ optimization
6990 !MI.getOperand(ArgOffset + I + 1).isReg()) {
// No partner operand available: pad the high half with undef.
6991 PackedAddrs.push_back(
6992 B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)})
6993 .getReg(0));
6994 } else {
// Pack this s16 operand together with the next one, consuming both.
6995 PackedAddrs.push_back(
6996 B.buildBuildVector(
6997 V2S16, {AddrReg, MI.getOperand(ArgOffset + I + 1).getReg()})
6998 .getReg(0));
6999 ++I;
7000 }
7001 }
7002 }
7003 }
7004
7005 /// Convert from separate vaddr components to a single vector address register,
7006 /// and replace the remaining operands with $noreg.
// NOTE(review): rendered line 7007 (the first signature line, presumably
// `static void convertImageAddrToPacked(MachineIRBuilder &B, MachineInstr &MI,`)
// is missing from this listing — confirm against upstream.
7008 int DimIdx, int NumVAddrs) {
7009 const LLT S32 = LLT::scalar(32);
// S32 is only referenced inside the assert below; silence unused-variable
// warnings in release (NDEBUG) builds.
7010 (void)S32;
7011 SmallVector<Register, 8> AddrRegs;
// Collect every register vaddr component; non-register operands (already
// eliminated addresses) are skipped. Each component must be s32.
7012 for (int I = 0; I != NumVAddrs; ++I) {
7013 MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
7014 if (SrcOp.isReg()) {
7015 AddrRegs.push_back(SrcOp.getReg());
7016 assert(B.getMRI()->getType(SrcOp.getReg()) == S32);
7017 }
7018 }
7019
// Merge all components into one <N x s32> build_vector placed in the first
// vaddr slot; a single component is left in place unchanged.
7020 int NumAddrRegs = AddrRegs.size();
7021 if (NumAddrRegs != 1) {
7022 auto VAddr =
7023 B.buildBuildVector(LLT::fixed_vector(NumAddrRegs, 32), AddrRegs);
7024 MI.getOperand(DimIdx).setReg(VAddr.getReg(0));
7025 }
7026
// Clear the now-redundant trailing vaddr operands to $noreg.
7027 for (int I = 1; I != NumVAddrs; ++I) {
7028 MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
7029 if (SrcOp.isReg())
7030 MI.getOperand(DimIdx + I).setReg(AMDGPU::NoRegister);
7031 }
7032 }
7033
7034 /// Rewrite image intrinsics to use register layouts expected by the subtarget.
7035 ///
7036 /// Depending on the subtarget, load/store with 16-bit element data need to be
7037 /// rewritten to use the low half of 32-bit registers, or directly use a packed
7038 /// layout. 16-bit addresses should also sometimes be packed into 32-bit
7039 /// registers.
7040 ///
7041 /// We don't want to directly select image instructions just yet, but also want
7042 /// to expose all register repacking to the legalizer/combiners. We also don't
7043 /// want a selected instruction entering RegBankSelect. In order to avoid
7044 /// defining a multitude of intermediate image instructions, directly hack on
7045 /// the intrinsic's arguments. In cases like a16 addresses, this requires
7046 /// padding now unnecessary arguments with $noreg.
// NOTE(review): rendered lines 7047-7048 (the first signature lines,
// presumably `bool AMDGPULegalizerInfo::legalizeImageIntrinsic(MachineInstr
// &MI, MachineIRBuilder &B, GISelChangeObserver &Observer,`) are missing from
// this listing — confirm against upstream.
7049 const AMDGPU::ImageDimIntrinsicInfo *Intr) const {
7050
7051 const MachineFunction &MF = *MI.getMF();
7052 const unsigned NumDefs = MI.getNumExplicitDefs();
// Operand 0..NumDefs-1 are defs, operand NumDefs is the intrinsic ID, so the
// intrinsic arguments proper start at NumDefs + 1.
7053 const unsigned ArgOffset = NumDefs + 1;
// Two explicit defs means the TFE (texture-fail-enable) variant: data + flag.
7054 bool IsTFE = NumDefs == 2;
7055 // We are only processing the operands of d16 image operations on subtargets
7056 // that use the unpacked register layout, or need to repack the TFE result.
7057
7058 // TODO: Do we need to guard against already legalized intrinsics?
7059 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
// NOTE(review): rendered line 7060 lost its content — presumably
// `AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode);` — confirm upstream.
7061
7062 MachineRegisterInfo *MRI = B.getMRI();
7063 const LLT S32 = LLT::scalar(32);
7064 const LLT S16 = LLT::scalar(16);
7065 const LLT V2S16 = LLT::fixed_vector(2, 16);
7066
7067 unsigned DMask = 0;
7068 Register VData;
7069 LLT Ty;
7070
// VData is the stored value for stores, or the result register for ops that
// return data; no-return non-store ops have neither.
7071 if (!BaseOpcode->NoReturn || BaseOpcode->Store) {
7072 VData = MI.getOperand(NumDefs == 0 ? 1 : 0).getReg();
7073 Ty = MRI->getType(VData);
7074 }
7075
7076 const bool IsAtomicPacked16Bit =
7077 (BaseOpcode->BaseOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_F16 ||
7078 BaseOpcode->BaseOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_BF16);
7079
7080 // Check for 16 bit addresses and pack if true.
7081 LLT GradTy =
7082 MRI->getType(MI.getOperand(ArgOffset + Intr->GradientStart).getReg());
7083 LLT AddrTy =
7084 MRI->getType(MI.getOperand(ArgOffset + Intr->CoordStart).getReg());
7085 const bool IsG16 =
7086 ST.hasG16() ? (BaseOpcode->Gradients && GradTy == S16) : GradTy == S16;
7087 const bool IsA16 = AddrTy == S16;
7088 const bool IsD16 = !IsAtomicPacked16Bit && Ty.getScalarType() == S16;
7089
7090 int DMaskLanes = 0;
7091 if (!BaseOpcode->Atomic) {
7092 DMask = MI.getOperand(ArgOffset + Intr->DMaskIndex).getImm();
7093 if (BaseOpcode->Gather4) {
7094 DMaskLanes = 4;
7095 } else if (DMask != 0) {
7096 DMaskLanes = llvm::popcount(DMask);
7097 } else if (!IsTFE && !BaseOpcode->Store) {
7098 // If dmask is 0, this is a no-op load. This can be eliminated.
7099 B.buildUndef(MI.getOperand(0));
7100 MI.eraseFromParent();
7101 return true;
7102 }
7103 }
7104
// Notify the observer on every exit path via scope_exit, since this function
// returns from many places after mutating MI.
7105 Observer.changingInstr(MI);
7106 scope_exit ChangedInstr([&] { Observer.changedInstr(MI); });
7107
7108 const unsigned StoreOpcode = IsD16 ? AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE_D16
7109 : AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE;
7110 const unsigned LoadOpcode = IsD16 ? AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_D16
7111 : AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD;
7112 unsigned NewOpcode = LoadOpcode;
7113 if (BaseOpcode->Store)
7114 NewOpcode = StoreOpcode;
7115 else if (BaseOpcode->NoReturn)
7116 NewOpcode = AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_NORET;
7117
7118 // Track that we legalized this
7119 MI.setDesc(B.getTII().get(NewOpcode));
7120
7121 // Expecting to get an error flag since TFC is on - and dmask is 0 Force
7122 // dmask to be at least 1 otherwise the instruction will fail
7123 if (IsTFE && DMask == 0) {
7124 DMask = 0x1;
7125 DMaskLanes = 1;
7126 MI.getOperand(ArgOffset + Intr->DMaskIndex).setImm(DMask);
7127 }
7128
7129 if (BaseOpcode->Atomic) {
7130 Register VData0 = MI.getOperand(2).getReg();
7131 LLT Ty = MRI->getType(VData0);
7132
7133 // TODO: Allow atomic swap and bit ops for v2s16/v4s16
7134 if (Ty.isVector() && !IsAtomicPacked16Bit)
7135 return false;
7136
7137 if (BaseOpcode->AtomicX2) {
7138 Register VData1 = MI.getOperand(3).getReg();
7139 // The two values are packed in one register.
7140 LLT PackedTy = LLT::fixed_vector(2, Ty);
7141 auto Concat = B.buildBuildVector(PackedTy, {VData0, VData1});
7142 MI.getOperand(2).setReg(Concat.getReg(0));
7143 MI.getOperand(3).setReg(AMDGPU::NoRegister);
7144 }
7145 }
7146
7147 unsigned CorrectedNumVAddrs = Intr->NumVAddrs;
7148
7149 // Rewrite the addressing register layout before doing anything else.
7150 if (BaseOpcode->Gradients && !ST.hasG16() && (IsA16 != IsG16)) {
7151 // 16 bit gradients are supported, but are tied to the A16 control
7152 // so both gradients and addresses must be 16 bit
7153 return false;
7154 }
7155
7156 if (IsA16 && !ST.hasA16()) {
7157 // A16 not supported
7158 return false;
7159 }
7160
7161 const unsigned NSAMaxSize = ST.getNSAMaxSize(BaseOpcode->Sampler);
7162 const unsigned HasPartialNSA = ST.hasPartialNSAEncoding();
7163
7164 if (IsA16 || IsG16) {
7165 // Even if NumVAddrs == 1 we should pack it into a 32-bit value, because the
7166 // instructions expect VGPR_32
7167 SmallVector<Register, 4> PackedRegs;
7168
7169 packImage16bitOpsToDwords(B, MI, PackedRegs, ArgOffset, Intr, IsA16, IsG16);
7170
7171 // See also below in the non-a16 branch
7172 const bool UseNSA = ST.hasNSAEncoding() &&
7173 PackedRegs.size() >= ST.getNSAThreshold(MF) &&
7174 (PackedRegs.size() <= NSAMaxSize || HasPartialNSA);
7175 const bool UsePartialNSA =
7176 UseNSA && HasPartialNSA && PackedRegs.size() > NSAMaxSize;
7177
7178 if (UsePartialNSA) {
7179 // Pack registers that would go over NSAMaxSize into last VAddr register
7180 LLT PackedAddrTy =
7181 LLT::fixed_vector(2 * (PackedRegs.size() - NSAMaxSize + 1), 16);
7182 auto Concat = B.buildConcatVectors(
7183 PackedAddrTy, ArrayRef(PackedRegs).slice(NSAMaxSize - 1));
7184 PackedRegs[NSAMaxSize - 1] = Concat.getReg(0);
7185 PackedRegs.resize(NSAMaxSize);
7186 } else if (!UseNSA && PackedRegs.size() > 1) {
// Non-NSA: collapse everything into a single contiguous vector register.
7187 LLT PackedAddrTy = LLT::fixed_vector(2 * PackedRegs.size(), 16);
7188 auto Concat = B.buildConcatVectors(PackedAddrTy, PackedRegs);
7189 PackedRegs[0] = Concat.getReg(0);
7190 PackedRegs.resize(1);
7191 }
7192
// Rewrite the vaddr operands with the packed registers, padding the tail
// with $noreg for slots the packing made redundant.
7193 const unsigned NumPacked = PackedRegs.size();
7194 for (unsigned I = Intr->VAddrStart; I < Intr->VAddrEnd; I++) {
7195 MachineOperand &SrcOp = MI.getOperand(ArgOffset + I);
7196 if (!SrcOp.isReg()) {
7197 assert(SrcOp.isImm() && SrcOp.getImm() == 0);
7198 continue;
7199 }
7200
7201 assert(SrcOp.getReg() != AMDGPU::NoRegister);
7202
7203 if (I - Intr->VAddrStart < NumPacked)
7204 SrcOp.setReg(PackedRegs[I - Intr->VAddrStart]);
7205 else
7206 SrcOp.setReg(AMDGPU::NoRegister);
7207 }
7208 } else {
7209 // If the register allocator cannot place the address registers contiguously
7210 // without introducing moves, then using the non-sequential address encoding
7211 // is always preferable, since it saves VALU instructions and is usually a
7212 // wash in terms of code size or even better.
7213 //
7214 // However, we currently have no way of hinting to the register allocator
7215 // that MIMG addresses should be placed contiguously when it is possible to
7216 // do so, so force non-NSA for the common 2-address case as a heuristic.
7217 //
7218 // SIShrinkInstructions will convert NSA encodings to non-NSA after register
7219 // allocation when possible.
7220 //
7221 // Partial NSA is allowed on GFX11+ where the final register is a contiguous
7222 // set of the remaining addresses.
7223 const bool UseNSA = ST.hasNSAEncoding() &&
7224 CorrectedNumVAddrs >= ST.getNSAThreshold(MF) &&
7225 (CorrectedNumVAddrs <= NSAMaxSize || HasPartialNSA);
7226 const bool UsePartialNSA =
7227 UseNSA && HasPartialNSA && CorrectedNumVAddrs > NSAMaxSize;
7228
7229 if (UsePartialNSA) {
// NOTE(review): rendered line 7230 lost its content — presumably the call
// `convertImageAddrToPacked(B, MI,` — confirm upstream.
7231 ArgOffset + Intr->VAddrStart + NSAMaxSize - 1,
7232 Intr->NumVAddrs - NSAMaxSize + 1);
7233 } else if (!UseNSA && Intr->NumVAddrs > 1) {
7234 convertImageAddrToPacked(B, MI, ArgOffset + Intr->VAddrStart,
7235 Intr->NumVAddrs);
7236 }
7237 }
7238
// Append an immediate recording the A16/G16 decisions for later selection:
// bit 0 = A16, bit 1 = G16.
7239 int Flags = 0;
7240 if (IsA16)
7241 Flags |= 1;
7242 if (IsG16)
7243 Flags |= 2;
7244 MI.addOperand(MachineOperand::CreateImm(Flags));
7245
7246 if (BaseOpcode->NoReturn) { // No TFE for stores?
7247 // TODO: Handle dmask trim
7248 if (!Ty.isVector() || !IsD16)
7249 return true;
7250
7251 Register RepackedReg = handleD16VData(B, *MRI, VData, true);
7252 if (RepackedReg != VData) {
7253 MI.getOperand(1).setReg(RepackedReg);
7254 }
7255
7256 return true;
7257 }
7258
7259 Register DstReg = MI.getOperand(0).getReg();
7260 const LLT EltTy = Ty.getScalarType();
7261 const int NumElts = Ty.isVector() ? Ty.getNumElements() : 1;
7262
7263 // Confirm that the return type is large enough for the dmask specified
7264 if (NumElts < DMaskLanes)
7265 return false;
7266
7267 if (NumElts > 4 || DMaskLanes > 4)
7268 return false;
7269
7270 // Image atomic instructions are using DMask to specify how many bits
7271 // input/output data will have. 32-bits (s32, v2s16) or 64-bits (s64, v4s16).
7272 // DMaskLanes for image atomic has default value '0'.
7273 // We must be sure that atomic variants (especially packed) will not be
7274 // truncated from v2s16 or v4s16 to s16 type.
7275 //
7276 // ChangeElementCount will be needed for image load where Ty is always scalar.
7277 const unsigned AdjustedNumElts = DMaskLanes == 0 ? 1 : DMaskLanes;
7278 const LLT AdjustedTy =
7279 DMaskLanes == 0
7280 ? Ty
7281 : Ty.changeElementCount(ElementCount::getFixed(AdjustedNumElts));
7282
7283 // The raw dword aligned data component of the load. The only legal cases
7284 // where this matters should be when using the packed D16 format, for
7285 // s16 -> <2 x s16>, and <3 x s16> -> <4 x s16>,
7286 LLT RoundedTy;
7287
7288 // S32 vector to cover all data, plus TFE result element.
7289 LLT TFETy;
7290
7291 // Register type to use for each loaded component. Will be S32 or V2S16.
7292 LLT RegTy;
7293
7294 if (IsD16 && ST.hasUnpackedD16VMem()) {
7295 RoundedTy =
7296 LLT::scalarOrVector(ElementCount::getFixed(AdjustedNumElts), 32);
7297 TFETy = LLT::fixed_vector(AdjustedNumElts + 1, 32);
7298 RegTy = S32;
7299 } else {
7300 unsigned EltSize = EltTy.getSizeInBits();
// Round the data size up to a whole number of dwords.
7301 unsigned RoundedElts = (AdjustedTy.getSizeInBits() + 31) / 32;
7302 unsigned RoundedSize = 32 * RoundedElts;
7303 RoundedTy = LLT::scalarOrVector(
7304 ElementCount::getFixed(RoundedSize / EltSize), EltSize);
7305 TFETy = LLT::fixed_vector(RoundedSize / 32 + 1, S32);
7306 RegTy = !IsTFE && EltSize == 16 ? V2S16 : S32;
7307 }
7308
7309 // The return type does not need adjustment.
7310 // TODO: Should we change s16 case to s32 or <2 x s16>?
7311 if (!IsTFE && (RoundedTy == Ty || !Ty.isVector()))
7312 return true;
7313
7314 Register Dst1Reg;
7315
7316 // Insert after the instruction.
7317 B.setInsertPt(*MI.getParent(), ++MI.getIterator());
7318
7319 // TODO: For TFE with d16, if we used a TFE type that was a multiple of <2 x
7320 // s16> instead of s32, we would only need 1 bitcast instead of multiple.
7321 const LLT LoadResultTy = IsTFE ? TFETy : RoundedTy;
7322 const int ResultNumRegs = LoadResultTy.getSizeInBits() / 32;
7323
7324 Register NewResultReg = MRI->createGenericVirtualRegister(LoadResultTy);
7325
7326 MI.getOperand(0).setReg(NewResultReg);
7327
7328 // In the IR, TFE is supposed to be used with a 2 element struct return
7329 // type. The instruction really returns these two values in one contiguous
7330 // register, with one additional dword beyond the loaded data. Rewrite the
7331 // return type to use a single register result.
7332
7333 if (IsTFE) {
7334 Dst1Reg = MI.getOperand(1).getReg();
7335 if (MRI->getType(Dst1Reg) != S32)
7336 return false;
7337
7338 // TODO: Make sure the TFE operand bit is set.
7339 MI.removeOperand(1);
7340
7341 // Handle the easy case that requires no repack instructions.
7342 if (Ty == S32) {
7343 B.buildUnmerge({DstReg, Dst1Reg}, NewResultReg);
7344 return true;
7345 }
7346 }
7347
7348 // Now figure out how to copy the new result register back into the old
7349 // result.
// Pre-fill with Dst1Reg so the final (TFE) slot lands in the right register.
7350 SmallVector<Register, 5> ResultRegs(ResultNumRegs, Dst1Reg);
7351
7352 const int NumDataRegs = IsTFE ? ResultNumRegs - 1 : ResultNumRegs;
7353
7354 if (ResultNumRegs == 1) {
7355 assert(!IsTFE);
7356 ResultRegs[0] = NewResultReg;
7357 } else {
7358 // We have to repack into a new vector of some kind.
7359 for (int I = 0; I != NumDataRegs; ++I)
7360 ResultRegs[I] = MRI->createGenericVirtualRegister(RegTy);
7361 B.buildUnmerge(ResultRegs, NewResultReg);
7362
7363 // Drop the final TFE element to get the data part. The TFE result is
7364 // directly written to the right place already.
7365 if (IsTFE)
7366 ResultRegs.resize(NumDataRegs);
7367 }
7368
7369 // For an s16 scalar result, we form an s32 result with a truncate regardless
7370 // of packed vs. unpacked.
7371 if (IsD16 && !Ty.isVector()) {
7372 B.buildTrunc(DstReg, ResultRegs[0]);
7373 return true;
7374 }
7375
7376 // Avoid a build/concat_vector of 1 entry.
7377 if (Ty == V2S16 && NumDataRegs == 1 && !ST.hasUnpackedD16VMem()) {
7378 B.buildBitcast(DstReg, ResultRegs[0]);
7379 return true;
7380 }
7381
7382 assert(Ty.isVector());
7383
7384 if (IsD16) {
7385 // For packed D16 results with TFE enabled, all the data components are
7386 // S32. Cast back to the expected type.
7387 //
7388 // TODO: We don't really need to use load s32 elements. We would only need one
7389 // cast for the TFE result if a multiple of v2s16 was used.
7390 if (RegTy != V2S16 && !ST.hasUnpackedD16VMem()) {
7391 for (Register &Reg : ResultRegs)
7392 Reg = B.buildBitcast(V2S16, Reg).getReg(0);
7393 } else if (ST.hasUnpackedD16VMem()) {
7394 for (Register &Reg : ResultRegs)
7395 Reg = B.buildTrunc(S16, Reg).getReg(0);
7396 }
7397 }
7398
// Helper: append NumElts copies of one undef register to ResultRegs.
7399 auto padWithUndef = [&](LLT Ty, int NumElts) {
7400 if (NumElts == 0)
7401 return;
7402 Register Undef = B.buildUndef(Ty).getReg(0);
7403 for (int I = 0; I != NumElts; ++I)
7404 ResultRegs.push_back(Undef);
7405 };
7406
7407 // Pad out any elements eliminated due to the dmask.
7408 LLT ResTy = MRI->getType(ResultRegs[0]);
7409 if (!ResTy.isVector()) {
7410 padWithUndef(ResTy, NumElts - ResultRegs.size());
7411 B.buildBuildVector(DstReg, ResultRegs);
7412 return true;
7413 }
7414
7415 assert(!ST.hasUnpackedD16VMem() && ResTy == V2S16);
7416 const int RegsToCover = (Ty.getSizeInBits() + 31) / 32;
7417
7418 // Deal with the one annoying legal case.
7419 const LLT V3S16 = LLT::fixed_vector(3, 16);
7420 if (Ty == V3S16) {
7421 if (IsTFE) {
7422 if (ResultRegs.size() == 1) {
7423 NewResultReg = ResultRegs[0];
7424 } else if (ResultRegs.size() == 2) {
7425 LLT V4S16 = LLT::fixed_vector(4, 16);
7426 NewResultReg = B.buildConcatVectors(V4S16, ResultRegs).getReg(0);
7427 } else {
7428 return false;
7429 }
7430 }
7431
7432 if (MRI->getType(DstReg).getNumElements() <
7433 MRI->getType(NewResultReg).getNumElements()) {
7434 B.buildDeleteTrailingVectorElements(DstReg, NewResultReg);
7435 } else {
7436 B.buildPadVectorWithUndefElements(DstReg, NewResultReg);
7437 }
7438 return true;
7439 }
7440
7441 padWithUndef(ResTy, RegsToCover - ResultRegs.size());
7442 B.buildConcatVectors(DstReg, ResultRegs);
7443 return true;
7444 }
7445
// Legalize amdgcn.s.buffer.load into a G_AMDGPU_S_BUFFER_LOAD* pseudo,
// attaching the memory operand the intrinsic lacks, handling sub-dword
// results and widening non-power-of-2 result sizes.
// NOTE(review): the opening signature line (rendered line 7446) is missing
// from this listing; presumably
// bool AMDGPULegalizerInfo::legalizeSBufferLoad(LegalizerHelper &Helper,
// MachineInstr &MI) — confirm against upstream.
7447 MachineInstr &MI) const {
7448 MachineIRBuilder &B = Helper.MIRBuilder;
7449 GISelChangeObserver &Observer = Helper.Observer;
7450
7451 Register OrigDst = MI.getOperand(0).getReg();
7452 Register Dst;
7453 LLT Ty = B.getMRI()->getType(OrigDst);
7454 unsigned Size = Ty.getSizeInBits();
7455 MachineFunction &MF = B.getMF();
7456 unsigned Opc = 0;
// Sub-dword (8/16-bit) loads use dedicated zero-extending pseudos with a
// 32-bit destination; the original dst is produced by a trunc afterwards.
7457 if (Size < 32 && ST.hasScalarSubwordLoads()) {
7458 assert(Size == 8 || Size == 16);
7459 Opc = Size == 8 ? AMDGPU::G_AMDGPU_S_BUFFER_LOAD_UBYTE
7460 : AMDGPU::G_AMDGPU_S_BUFFER_LOAD_USHORT;
7461 // The 8-bit and 16-bit scalar buffer load instructions have 32-bit
7462 // destination register.
7463 Dst = B.getMRI()->createGenericVirtualRegister(LLT::scalar(32));
7464 } else {
7465 Opc = AMDGPU::G_AMDGPU_S_BUFFER_LOAD;
7466 Dst = OrigDst;
7467 }
7468
7469 Observer.changingInstr(MI);
7470
7471 // Handle needing to s.buffer.load() a p8 value.
7472 if (hasBufferRsrcWorkaround(Ty)) {
7473 Ty = castBufferRsrcFromV4I32(MI, B, *B.getMRI(), 0);
7474 B.setInsertPt(B.getMBB(), MI);
7475 }
// NOTE(review): rendered line 7476 lost its content — presumably
// `if (shouldBitcastLoadStoreType(ST, Ty, LLT::scalar(Size))) {` — confirm
// against upstream; the closing brace below pairs with it.
7477 Ty = getBitcastRegisterType(Ty);
7478 Helper.bitcastDst(MI, Ty, 0);
7479 B.setInsertPt(B.getMBB(), MI);
7480 }
7481
7482 // FIXME: We don't really need this intermediate instruction. The intrinsic
7483 // should be fixed to have a memory operand. Since it's readnone, we're not
7484 // allowed to add one.
7485 MI.setDesc(B.getTII().get(Opc));
7486 MI.removeOperand(1); // Remove intrinsic ID
7487
7488 // FIXME: When intrinsic definition is fixed, this should have an MMO already.
7489 const unsigned MemSize = (Size + 7) / 8;
7490 const Align MemAlign = B.getDataLayout().getABITypeAlign(
// NOTE(review): rendered lines 7491-7495 lost their content — presumably the
// IR type argument plus the MachineMemOperand construction
// (`MF.getMachineMemOperand(MachinePointerInfo(), MOLoad | MODereferenceable
// | MOInvariant, ...)`) — confirm against upstream.
7496 MemSize, MemAlign);
7497 MI.addMemOperand(MF, MMO);
// For the sub-dword path, re-point the load at the 32-bit temp and truncate
// back into the caller's original narrow destination.
7498 if (Dst != OrigDst) {
7499 MI.getOperand(0).setReg(Dst);
7500 B.setInsertPt(B.getMBB(), ++B.getInsertPt());
7501 B.buildTrunc(OrigDst, Dst);
7502 }
7503
7504 // If we don't have 96-bit result scalar loads, widening to 128-bit should
7505 // always be legal. We may need to restore this to a 96-bit result if it turns
7506 // out this needs to be converted to a vector load during RegBankSelect.
7507 if (!isPowerOf2_32(Size) && (Size != 96 || !ST.hasScalarDwordx3Loads())) {
7508 if (Ty.isVector())
// NOTE(review): rendered line 7509 lost its content — presumably
// `Helper.moreElementsVectorDst(MI, getPow2VectorType(Ty), 0);` — confirm
// against upstream.
7510 else
7511 Helper.widenScalarDst(MI, getPow2ScalarType(Ty), 0);
7512 }
7513
7514 Observer.changedInstr(MI);
7515 return true;
7516 }
7517
// Legalize amdgcn.s.buffer.prefetch by retargeting the instruction to the
// G_AMDGPU_S_BUFFER_PREFETCH pseudo and dropping the intrinsic ID operand.
// NOTE(review): the opening signature line (rendered line 7518) is missing
// from this listing; presumably
// bool AMDGPULegalizerInfo::legalizeSBufferPrefetch(LegalizerHelper &Helper,
// MachineInstr &MI) — confirm against upstream.
7519 MachineInstr &MI) const {
7520 MachineIRBuilder &B = Helper.MIRBuilder;
7521 GISelChangeObserver &Observer = Helper.Observer;
7522 Observer.changingInstr(MI);
7523 MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_S_BUFFER_PREFETCH));
7524 MI.removeOperand(0); // Remove intrinsic ID
// NOTE(review): rendered line 7525 lost its content — presumably
// `castBufferRsrcArgToV4I32(MI, B, 0);` — confirm against upstream.
7526 Observer.changedInstr(MI);
7527 return true;
7528 }
7529
7530 // TODO: Move to selection
// Dispatch trap legalization: no usable AMDHSA trap handler means the trap
// degrades to endpgm; otherwise choose the HSA variant based on doorbell
// support.
// NOTE(review): the opening signature lines (rendered 7531-7532) are missing
// from this listing; presumably
// bool AMDGPULegalizerInfo::legalizeTrap(MachineInstr &MI,
// MachineRegisterInfo &MRI, MachineIRBuilder &B) — confirm against upstream.
7533 MachineIRBuilder &B) const {
7534 if (!ST.hasTrapHandler() ||
7535 ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA)
7536 return legalizeTrapEndpgm(MI, MRI, B);
7537
7538 return ST.supportsGetDoorbellID() ?
// NOTE(review): rendered line 7539 lost its content — presumably
// `legalizeTrapHsa(MI, MRI, B) : legalizeTrapHsaQueuePtr(MI, MRI, B);` —
// confirm against upstream.
7540 }
7541
// Lower a trap to S_ENDPGM when no trap handler is available. If the trap is
// not already at the end of a terminating block, split the block and branch
// to a fresh block containing the S_ENDPGM so it is a proper terminator.
// NOTE(review): the opening signature lines (rendered 7542-7543) are missing
// from this listing; presumably
// bool AMDGPULegalizerInfo::legalizeTrapEndpgm(MachineInstr &MI,
// MachineRegisterInfo &MRI, MachineIRBuilder &B) — confirm against upstream.
7544 const DebugLoc &DL = MI.getDebugLoc();
7545 MachineBasicBlock &BB = B.getMBB();
7546 MachineFunction *MF = BB.getParent();
7547
// Fast path: trap is the final instruction of a block with no successors, so
// S_ENDPGM can simply replace it in place.
7548 if (BB.succ_empty() && std::next(MI.getIterator()) == BB.end()) {
7549 BuildMI(BB, BB.end(), DL, B.getTII().get(AMDGPU::S_ENDPGM))
7550 .addImm(0);
7551 MI.eraseFromParent();
7552 return true;
7553 }
7554
7555 // We need a block split to make the real endpgm a terminator. We also don't
7556 // want to break phis in successor blocks, so we can't just delete to the
7557 // end of the block.
7558 BB.splitAt(MI, false /*UpdateLiveIns*/);
// NOTE(review): rendered line 7559 lost its content — presumably
// `MachineBasicBlock *TrapBB = MF->CreateMachineBasicBlock();` — confirm
// against upstream.
7560 MF->push_back(TrapBB);
7561 BuildMI(*TrapBB, TrapBB->end(), DL, B.getTII().get(AMDGPU::S_ENDPGM))
7562 .addImm(0);
7563 BuildMI(BB, &MI, DL, B.getTII().get(AMDGPU::S_CBRANCH_EXECNZ))
7564 .addMBB(TrapBB);
7565
7566 BB.addSuccessor(TrapBB);
7567 MI.eraseFromParent();
7568 return true;
7569 }
7570
// HSA trap lowering for targets without getDoorbellID support: place the
// queue pointer in SGPR0_SGPR1 (loaded from the implicit kernarg for code
// object v5+, otherwise from the queue-ptr preloaded input) and emit S_TRAP
// with the LLVMAMDHSATrap ID.
// NOTE(review): this listing is heavily stripped — the signature (rendered
// 7571-7572) and several interior lines (7578-7582, 7586, 7589, 7593-7598,
// 7601-7602, 7618-7619) are missing; the hedged notes below reconstruct the
// presumed content. Confirm every one against upstream before relying on it.
7573 MachineFunction &MF = B.getMF();
7574 const LLT S64 = LLT::scalar(64);
7575
7576 Register SGPR01(AMDGPU::SGPR0_SGPR1);
7577 // For code object version 5, queue_ptr is passed through implicit kernarg.
// NOTE(review): rendered lines 7578-7582 lost their content — presumably the
// code-object-version check and the computation of `Offset` via
7583 ST.getTargetLowering()->getImplicitParameterOffset(B.getMF(), Param);
7584
7585 Register KernargPtrReg = MRI.createGenericVirtualRegister(
// NOTE(review): rendered line 7586 lost its content — presumably the
// constant-address-space pointer LLT argument — confirm upstream.
7587
7588 if (!loadInputValue(KernargPtrReg, B,
// NOTE(review): rendered line 7589 lost its content — presumably
// `AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR))` — confirm upstream.
7590 return false;
7591
7592 // TODO: can we be smarter about machine pointer info?
// NOTE(review): rendered lines 7593-7594 and 7596-7598 lost their content —
// presumably the MachinePointerInfo setup and MachineMemOperand creation
// surrounding the two surviving argument lines below — confirm upstream.
7595 PtrInfo.getWithOffset(Offset),
7599
7600 // Pointer address
// NOTE(review): rendered lines 7601-7602 lost their content — presumably the
// creation of `LoadAddr` as a constant-space pointer vreg — confirm upstream.
7603 B.buildObjectPtrOffset(LoadAddr, KernargPtrReg,
7604 B.buildConstant(LLT::scalar(64), Offset).getReg(0));
7605 // Load address
7606 Register Temp = B.buildLoad(S64, LoadAddr, *MMO).getReg(0);
7607 B.buildCopy(SGPR01, Temp);
7608 B.buildInstr(AMDGPU::S_TRAP)
7609 .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap))
7610 .addReg(SGPR01, RegState::Implicit);
7611 MI.eraseFromParent();
7612 return true;
7613 }
7614
7615 // Pass queue pointer to trap handler as input, and insert trap instruction
7616 // Reference: https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi
7617 Register LiveIn =
// NOTE(review): rendered lines 7618-7619 lost their content — presumably the
// getFunctionLiveInPhysReg(...) call and the loadInputValue guard whose
// failure path is the `return false` below — confirm upstream.
7620 return false;
7621
7622 B.buildCopy(SGPR01, LiveIn);
7623 B.buildInstr(AMDGPU::S_TRAP)
7624 .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap))
7625 .addReg(SGPR01, RegState::Implicit);
7626
7627 MI.eraseFromParent();
7628 return true;
7629 }
7630
// HSA trap lowering for targets with getDoorbellID support: emit a bare
// S_TRAP with the LLVMAMDHSATrap ID, or a simulated trap sequence on parts
// where 's_trap 2' is a nop under PRIV=1.
// NOTE(review): the opening signature lines (rendered 7631-7632) are missing
// from this listing; presumably
// bool AMDGPULegalizerInfo::legalizeTrapHsa(MachineInstr &MI,
// MachineRegisterInfo &MRI, MachineIRBuilder &B) — confirm against upstream.
7633 MachineIRBuilder &B) const {
7634 // We need to simulate the 's_trap 2' instruction on targets that run in
7635 // PRIV=1 (where it is treated as a nop).
7636 if (ST.hasPrivEnabledTrap2NopBug()) {
7637 ST.getInstrInfo()->insertSimulatedTrap(MRI, B.getMBB(), MI,
7638 MI.getDebugLoc());
7639 MI.eraseFromParent();
7640 return true;
7641 }
7642
7643 B.buildInstr(AMDGPU::S_TRAP)
7644 .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap));
7645 MI.eraseFromParent();
7646 return true;
7647 }
7648
// Lower llvm.debugtrap: emit S_TRAP with the LLVMAMDHSADebugTrap ID when an
// AMDHSA trap handler is available, otherwise warn and drop the trap.
// NOTE(review): the opening signature lines (rendered 7649-7650) are missing
// from this listing; presumably
// bool AMDGPULegalizerInfo::legalizeDebugTrap(MachineInstr &MI,
// MachineRegisterInfo &MRI, MachineIRBuilder &B) — confirm against upstream.
7651 MachineIRBuilder &B) const {
7652 // Is non-HSA path or trap-handler disabled? Then, report a warning
7653 // accordingly
7654 if (!ST.hasTrapHandler() ||
7655 ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA) {
7656 Function &Fn = B.getMF().getFunction();
// NOTE(review): rendered line 7657 lost its content — presumably
// `Fn.getContext().diagnose(DiagnosticInfoUnsupported(` — confirm upstream.
7658 Fn, "debugtrap handler not supported", MI.getDebugLoc(), DS_Warning));
7659 } else {
7660 // Insert debug-trap instruction
7661 B.buildInstr(AMDGPU::S_TRAP)
7662 .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSADebugTrap));
7663 }
7664
// Either way the generic trap instruction is consumed.
7665 MI.eraseFromParent();
7666 return true;
7667 }
7668
7670 MachineInstr &MI, MachineIRBuilder &B) const {
7671 MachineRegisterInfo &MRI = *B.getMRI();
7672 const LLT S16 = LLT::scalar(16);
7673 const LLT S32 = LLT::scalar(32);
7674 const LLT V2S16 = LLT::fixed_vector(2, 16);
7675 const LLT V3S32 = LLT::fixed_vector(3, 32);
7676
7677 Register DstReg = MI.getOperand(0).getReg();
7678 Register NodePtr = MI.getOperand(2).getReg();
7679 Register RayExtent = MI.getOperand(3).getReg();
7680 Register RayOrigin = MI.getOperand(4).getReg();
7681 Register RayDir = MI.getOperand(5).getReg();
7682 Register RayInvDir = MI.getOperand(6).getReg();
7683 Register TDescr = MI.getOperand(7).getReg();
7684
7685 if (!ST.hasGFX10_AEncoding()) {
7686 Function &Fn = B.getMF().getFunction();
7688 Fn, "intrinsic not supported on subtarget", MI.getDebugLoc()));
7689 return false;
7690 }
7691
7692 const bool IsGFX11 = AMDGPU::isGFX11(ST);
7693 const bool IsGFX11Plus = AMDGPU::isGFX11Plus(ST);
7694 const bool IsGFX12Plus = AMDGPU::isGFX12Plus(ST);
7695 const bool IsA16 = MRI.getType(RayDir).getElementType().getSizeInBits() == 16;
7696 const bool Is64 = MRI.getType(NodePtr).getSizeInBits() == 64;
7697 const unsigned NumVDataDwords = 4;
7698 const unsigned NumVAddrDwords = IsA16 ? (Is64 ? 9 : 8) : (Is64 ? 12 : 11);
7699 const unsigned NumVAddrs = IsGFX11Plus ? (IsA16 ? 4 : 5) : NumVAddrDwords;
7700 const bool UseNSA =
7701 IsGFX12Plus || (ST.hasNSAEncoding() && NumVAddrs <= ST.getNSAMaxSize());
7702
7703 const unsigned BaseOpcodes[2][2] = {
7704 {AMDGPU::IMAGE_BVH_INTERSECT_RAY, AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16},
7705 {AMDGPU::IMAGE_BVH64_INTERSECT_RAY,
7706 AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16}};
7707 int Opcode;
7708 if (UseNSA) {
7709 Opcode = AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16],
7710 IsGFX12Plus ? AMDGPU::MIMGEncGfx12
7711 : IsGFX11 ? AMDGPU::MIMGEncGfx11NSA
7712 : AMDGPU::MIMGEncGfx10NSA,
7713 NumVDataDwords, NumVAddrDwords);
7714 } else {
7715 assert(!IsGFX12Plus);
7716 Opcode = AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16],
7717 IsGFX11 ? AMDGPU::MIMGEncGfx11Default
7718 : AMDGPU::MIMGEncGfx10Default,
7719 NumVDataDwords, NumVAddrDwords);
7720 }
7721 assert(Opcode != -1);
7722
7724 if (UseNSA && IsGFX11Plus) {
7725 auto packLanes = [&Ops, &S32, &V3S32, &B](Register Src) {
7726 auto Unmerge = B.buildUnmerge({S32, S32, S32}, Src);
7727 auto Merged = B.buildMergeLikeInstr(
7728 V3S32, {Unmerge.getReg(0), Unmerge.getReg(1), Unmerge.getReg(2)});
7729 Ops.push_back(Merged.getReg(0));
7730 };
7731
7732 Ops.push_back(NodePtr);
7733 Ops.push_back(RayExtent);
7734 packLanes(RayOrigin);
7735
7736 if (IsA16) {
7737 auto UnmergeRayDir = B.buildUnmerge({S16, S16, S16}, RayDir);
7738 auto UnmergeRayInvDir = B.buildUnmerge({S16, S16, S16}, RayInvDir);
7739 auto MergedDir = B.buildMergeLikeInstr(
7740 V3S32,
7741 {B.buildBitcast(
7742 S32, B.buildMergeLikeInstr(V2S16, {UnmergeRayInvDir.getReg(0),
7743 UnmergeRayDir.getReg(0)}))
7744 .getReg(0),
7745 B.buildBitcast(
7746 S32, B.buildMergeLikeInstr(V2S16, {UnmergeRayInvDir.getReg(1),
7747 UnmergeRayDir.getReg(1)}))
7748 .getReg(0),
7749 B.buildBitcast(
7750 S32, B.buildMergeLikeInstr(V2S16, {UnmergeRayInvDir.getReg(2),
7751 UnmergeRayDir.getReg(2)}))
7752 .getReg(0)});
7753 Ops.push_back(MergedDir.getReg(0));
7754 } else {
7755 packLanes(RayDir);
7756 packLanes(RayInvDir);
7757 }
7758 } else {
7759 if (Is64) {
7760 auto Unmerge = B.buildUnmerge({S32, S32}, NodePtr);
7761 Ops.push_back(Unmerge.getReg(0));
7762 Ops.push_back(Unmerge.getReg(1));
7763 } else {
7764 Ops.push_back(NodePtr);
7765 }
7766 Ops.push_back(RayExtent);
7767
7768 auto packLanes = [&Ops, &S32, &B](Register Src) {
7769 auto Unmerge = B.buildUnmerge({S32, S32, S32}, Src);
7770 Ops.push_back(Unmerge.getReg(0));
7771 Ops.push_back(Unmerge.getReg(1));
7772 Ops.push_back(Unmerge.getReg(2));
7773 };
7774
7775 packLanes(RayOrigin);
7776 if (IsA16) {
7777 auto UnmergeRayDir = B.buildUnmerge({S16, S16, S16}, RayDir);
7778 auto UnmergeRayInvDir = B.buildUnmerge({S16, S16, S16}, RayInvDir);
7782 B.buildMergeLikeInstr(R1,
7783 {UnmergeRayDir.getReg(0), UnmergeRayDir.getReg(1)});
7784 B.buildMergeLikeInstr(
7785 R2, {UnmergeRayDir.getReg(2), UnmergeRayInvDir.getReg(0)});
7786 B.buildMergeLikeInstr(
7787 R3, {UnmergeRayInvDir.getReg(1), UnmergeRayInvDir.getReg(2)});
7788 Ops.push_back(R1);
7789 Ops.push_back(R2);
7790 Ops.push_back(R3);
7791 } else {
7792 packLanes(RayDir);
7793 packLanes(RayInvDir);
7794 }
7795 }
7796
7797 if (!UseNSA) {
7798 // Build a single vector containing all the operands so far prepared.
7799 LLT OpTy = LLT::fixed_vector(Ops.size(), 32);
7800 Register MergedOps = B.buildMergeLikeInstr(OpTy, Ops).getReg(0);
7801 Ops.clear();
7802 Ops.push_back(MergedOps);
7803 }
7804
7805 auto MIB = B.buildInstr(AMDGPU::G_AMDGPU_BVH_INTERSECT_RAY)
7806 .addDef(DstReg)
7807 .addImm(Opcode);
7808
7809 for (Register R : Ops) {
7810 MIB.addUse(R);
7811 }
7812
7813 MIB.addUse(TDescr)
7814 .addImm(IsA16 ? 1 : 0)
7815 .cloneMemRefs(MI);
7816
7817 MI.eraseFromParent();
7818 return true;
7819}
7820
7822 MachineInstr &MI, MachineIRBuilder &B) const {
7823 const LLT S32 = LLT::scalar(32);
7824 const LLT V2S32 = LLT::fixed_vector(2, 32);
7825
7826 Register DstReg = MI.getOperand(0).getReg();
7827 Register DstOrigin = MI.getOperand(1).getReg();
7828 Register DstDir = MI.getOperand(2).getReg();
7829 Register NodePtr = MI.getOperand(4).getReg();
7830 Register RayExtent = MI.getOperand(5).getReg();
7831 Register InstanceMask = MI.getOperand(6).getReg();
7832 Register RayOrigin = MI.getOperand(7).getReg();
7833 Register RayDir = MI.getOperand(8).getReg();
7834 Register Offsets = MI.getOperand(9).getReg();
7835 Register TDescr = MI.getOperand(10).getReg();
7836
7837 if (!ST.hasBVHDualAndBVH8Insts()) {
7838 Function &Fn = B.getMF().getFunction();
7840 Fn, "intrinsic not supported on subtarget", MI.getDebugLoc()));
7841 return false;
7842 }
7843
7844 bool IsBVH8 = cast<GIntrinsic>(MI).getIntrinsicID() ==
7845 Intrinsic::amdgcn_image_bvh8_intersect_ray;
7846 const unsigned NumVDataDwords = 10;
7847 const unsigned NumVAddrDwords = IsBVH8 ? 11 : 12;
7848 int Opcode = AMDGPU::getMIMGOpcode(
7849 IsBVH8 ? AMDGPU::IMAGE_BVH8_INTERSECT_RAY
7850 : AMDGPU::IMAGE_BVH_DUAL_INTERSECT_RAY,
7851 AMDGPU::MIMGEncGfx12, NumVDataDwords, NumVAddrDwords);
7852 assert(Opcode != -1);
7853
7854 auto RayExtentInstanceMaskVec = B.buildMergeLikeInstr(
7855 V2S32, {RayExtent, B.buildAnyExt(S32, InstanceMask)});
7856
7857 B.buildInstr(IsBVH8 ? AMDGPU::G_AMDGPU_BVH8_INTERSECT_RAY
7858 : AMDGPU::G_AMDGPU_BVH_DUAL_INTERSECT_RAY)
7859 .addDef(DstReg)
7860 .addDef(DstOrigin)
7861 .addDef(DstDir)
7862 .addImm(Opcode)
7863 .addUse(NodePtr)
7864 .addUse(RayExtentInstanceMaskVec.getReg(0))
7865 .addUse(RayOrigin)
7866 .addUse(RayDir)
7867 .addUse(Offsets)
7868 .addUse(TDescr)
7869 .cloneMemRefs(MI);
7870
7871 MI.eraseFromParent();
7872 return true;
7873}
7874
7876 MachineIRBuilder &B) const {
7877 const SITargetLowering *TLI = ST.getTargetLowering();
7879 Register DstReg = MI.getOperand(0).getReg();
7880 B.buildInstr(AMDGPU::G_AMDGPU_WAVE_ADDRESS, {DstReg}, {StackPtr});
7881 MI.eraseFromParent();
7882 return true;
7883}
7884
7886 MachineIRBuilder &B) const {
7887 // With architected SGPRs, waveIDinGroup is in TTMP8[29:25].
7888 if (!ST.hasArchitectedSGPRs())
7889 return false;
7890 LLT S32 = LLT::scalar(32);
7891 Register DstReg = MI.getOperand(0).getReg();
7892 auto TTMP8 = B.buildCopy(S32, Register(AMDGPU::TTMP8));
7893 auto LSB = B.buildConstant(S32, 25);
7894 auto Width = B.buildConstant(S32, 5);
7895 B.buildUbfx(DstReg, TTMP8, LSB, Width);
7896 MI.eraseFromParent();
7897 return true;
7898}
7899
7902 AMDGPU::Hwreg::Id HwReg,
7903 unsigned LowBit,
7904 unsigned Width) const {
7905 MachineRegisterInfo &MRI = *B.getMRI();
7906 Register DstReg = MI.getOperand(0).getReg();
7907 if (!MRI.getRegClassOrNull(DstReg))
7908 MRI.setRegClass(DstReg, &AMDGPU::SReg_32RegClass);
7909 B.buildInstr(AMDGPU::S_GETREG_B32_const)
7910 .addDef(DstReg)
7911 .addImm(AMDGPU::Hwreg::HwregEncoding::encode(HwReg, LowBit, Width));
7912 MI.eraseFromParent();
7913 return true;
7914}
7915
7916static constexpr unsigned FPEnvModeBitField =
7918
7919static constexpr unsigned FPEnvTrapBitField =
7921
7924 MachineIRBuilder &B) const {
7925 Register Src = MI.getOperand(0).getReg();
7926 if (MRI.getType(Src) != S64)
7927 return false;
7928
7929 auto ModeReg =
7930 B.buildIntrinsic(Intrinsic::amdgcn_s_getreg, {S32},
7931 /*HasSideEffects=*/true, /*isConvergent=*/false)
7932 .addImm(FPEnvModeBitField);
7933 auto TrapReg =
7934 B.buildIntrinsic(Intrinsic::amdgcn_s_getreg, {S32},
7935 /*HasSideEffects=*/true, /*isConvergent=*/false)
7936 .addImm(FPEnvTrapBitField);
7937 B.buildMergeLikeInstr(Src, {ModeReg, TrapReg});
7938 MI.eraseFromParent();
7939 return true;
7940}
7941
7944 MachineIRBuilder &B) const {
7945 Register Src = MI.getOperand(0).getReg();
7946 if (MRI.getType(Src) != S64)
7947 return false;
7948
7949 auto Unmerge = B.buildUnmerge({S32, S32}, MI.getOperand(0));
7950 B.buildIntrinsic(Intrinsic::amdgcn_s_setreg, ArrayRef<DstOp>(),
7951 /*HasSideEffects=*/true, /*isConvergent=*/false)
7952 .addImm(static_cast<int16_t>(FPEnvModeBitField))
7953 .addReg(Unmerge.getReg(0));
7954 B.buildIntrinsic(Intrinsic::amdgcn_s_setreg, ArrayRef<DstOp>(),
7955 /*HasSideEffects=*/true, /*isConvergent=*/false)
7956 .addImm(static_cast<int16_t>(FPEnvTrapBitField))
7957 .addReg(Unmerge.getReg(1));
7958 MI.eraseFromParent();
7959 return true;
7960}
7961
7963 MachineInstr &MI) const {
7964 MachineIRBuilder &B = Helper.MIRBuilder;
7965 MachineRegisterInfo &MRI = *B.getMRI();
7966
7967 // Replace the use G_BRCOND with the exec manipulate and branch pseudos.
7968 auto IntrID = cast<GIntrinsic>(MI).getIntrinsicID();
7969 switch (IntrID) {
7970 case Intrinsic::sponentry:
7971 if (B.getMF().getInfo<SIMachineFunctionInfo>()->isBottomOfStack()) {
7972 // FIXME: The imported pattern checks for i32 instead of p5; if we fix
7973 // that we can remove this cast.
7974 const LLT S32 = LLT::scalar(32);
7976 B.buildInstr(AMDGPU::G_AMDGPU_SPONENTRY).addDef(TmpReg);
7977
7978 Register DstReg = MI.getOperand(0).getReg();
7979 B.buildIntToPtr(DstReg, TmpReg);
7980 MI.eraseFromParent();
7981 } else {
7982 int FI = B.getMF().getFrameInfo().CreateFixedObject(
7983 1, 0, /*IsImmutable=*/false);
7984 B.buildFrameIndex(MI.getOperand(0), FI);
7985 MI.eraseFromParent();
7986 }
7987 return true;
7988 case Intrinsic::amdgcn_if:
7989 case Intrinsic::amdgcn_else: {
7990 MachineInstr *Br = nullptr;
7991 MachineBasicBlock *UncondBrTarget = nullptr;
7992 bool Negated = false;
7993 if (MachineInstr *BrCond =
7994 verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) {
7995 const SIRegisterInfo *TRI
7996 = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
7997
7998 Register Def = MI.getOperand(1).getReg();
7999 Register Use = MI.getOperand(3).getReg();
8000
8001 MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
8002
8003 if (Negated)
8004 std::swap(CondBrTarget, UncondBrTarget);
8005
8006 B.setInsertPt(B.getMBB(), BrCond->getIterator());
8007 if (IntrID == Intrinsic::amdgcn_if) {
8008 B.buildInstr(AMDGPU::SI_IF)
8009 .addDef(Def)
8010 .addUse(Use)
8011 .addMBB(UncondBrTarget);
8012 } else {
8013 B.buildInstr(AMDGPU::SI_ELSE)
8014 .addDef(Def)
8015 .addUse(Use)
8016 .addMBB(UncondBrTarget);
8017 }
8018
8019 if (Br) {
8020 Br->getOperand(0).setMBB(CondBrTarget);
8021 } else {
8022 // The IRTranslator skips inserting the G_BR for fallthrough cases, but
8023 // since we're swapping branch targets it needs to be reinserted.
8024 // FIXME: IRTranslator should probably not do this
8025 B.buildBr(*CondBrTarget);
8026 }
8027
8028 MRI.setRegClass(Def, TRI->getWaveMaskRegClass());
8029 MRI.setRegClass(Use, TRI->getWaveMaskRegClass());
8030 MI.eraseFromParent();
8031 BrCond->eraseFromParent();
8032 return true;
8033 }
8034
8035 return false;
8036 }
8037 case Intrinsic::amdgcn_loop: {
8038 MachineInstr *Br = nullptr;
8039 MachineBasicBlock *UncondBrTarget = nullptr;
8040 bool Negated = false;
8041 if (MachineInstr *BrCond =
8042 verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) {
8043 const SIRegisterInfo *TRI
8044 = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
8045
8046 MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
8047 Register Reg = MI.getOperand(2).getReg();
8048
8049 if (Negated)
8050 std::swap(CondBrTarget, UncondBrTarget);
8051
8052 B.setInsertPt(B.getMBB(), BrCond->getIterator());
8053 B.buildInstr(AMDGPU::SI_LOOP)
8054 .addUse(Reg)
8055 .addMBB(UncondBrTarget);
8056
8057 if (Br)
8058 Br->getOperand(0).setMBB(CondBrTarget);
8059 else
8060 B.buildBr(*CondBrTarget);
8061
8062 MI.eraseFromParent();
8063 BrCond->eraseFromParent();
8064 MRI.setRegClass(Reg, TRI->getWaveMaskRegClass());
8065 return true;
8066 }
8067
8068 return false;
8069 }
8070 case Intrinsic::amdgcn_addrspacecast_nonnull:
8071 return legalizeAddrSpaceCast(MI, MRI, B);
8072 case Intrinsic::amdgcn_make_buffer_rsrc:
8073 return legalizePointerAsRsrcIntrin(MI, MRI, B);
8074 case Intrinsic::amdgcn_kernarg_segment_ptr:
8075 if (!AMDGPU::isKernel(B.getMF().getFunction())) {
8076 // This only makes sense to call in a kernel, so just lower to null.
8077 B.buildConstant(MI.getOperand(0).getReg(), 0);
8078 MI.eraseFromParent();
8079 return true;
8080 }
8081
8084 case Intrinsic::amdgcn_implicitarg_ptr:
8085 return legalizeImplicitArgPtr(MI, MRI, B);
8086 case Intrinsic::amdgcn_workitem_id_x:
8087 return legalizeWorkitemIDIntrinsic(MI, MRI, B, 0,
8089 case Intrinsic::amdgcn_workitem_id_y:
8090 return legalizeWorkitemIDIntrinsic(MI, MRI, B, 1,
8092 case Intrinsic::amdgcn_workitem_id_z:
8093 return legalizeWorkitemIDIntrinsic(MI, MRI, B, 2,
8095 case Intrinsic::amdgcn_workgroup_id_x:
8096 return legalizeWorkGroupId(
8100 case Intrinsic::amdgcn_workgroup_id_y:
8101 return legalizeWorkGroupId(
8105 case Intrinsic::amdgcn_workgroup_id_z:
8106 return legalizeWorkGroupId(
8110 case Intrinsic::amdgcn_cluster_id_x:
8111 return ST.hasClusters() &&
8114 case Intrinsic::amdgcn_cluster_id_y:
8115 return ST.hasClusters() &&
8118 case Intrinsic::amdgcn_cluster_id_z:
8119 return ST.hasClusters() &&
8122 case Intrinsic::amdgcn_cluster_workgroup_id_x:
8123 return ST.hasClusters() &&
8126 case Intrinsic::amdgcn_cluster_workgroup_id_y:
8127 return ST.hasClusters() &&
8130 case Intrinsic::amdgcn_cluster_workgroup_id_z:
8131 return ST.hasClusters() &&
8134 case Intrinsic::amdgcn_cluster_workgroup_flat_id:
8135 return ST.hasClusters() &&
8137 case Intrinsic::amdgcn_cluster_workgroup_max_id_x:
8138 return ST.hasClusters() &&
8141 case Intrinsic::amdgcn_cluster_workgroup_max_id_y:
8142 return ST.hasClusters() &&
8145 case Intrinsic::amdgcn_cluster_workgroup_max_id_z:
8146 return ST.hasClusters() &&
8149 case Intrinsic::amdgcn_cluster_workgroup_max_flat_id:
8150 return ST.hasClusters() &&
8152 MI, MRI, B,
8154 case Intrinsic::amdgcn_wave_id:
8155 return legalizeWaveID(MI, B);
8156 case Intrinsic::amdgcn_lds_kernel_id:
8157 return legalizePreloadedArgIntrin(MI, MRI, B,
8159 case Intrinsic::amdgcn_dispatch_ptr:
8160 return legalizePreloadedArgIntrin(MI, MRI, B,
8162 case Intrinsic::amdgcn_queue_ptr:
8163 return legalizePreloadedArgIntrin(MI, MRI, B,
8165 case Intrinsic::amdgcn_implicit_buffer_ptr:
8168 case Intrinsic::amdgcn_dispatch_id:
8169 return legalizePreloadedArgIntrin(MI, MRI, B,
8171 case Intrinsic::r600_read_ngroups_x:
8172 // TODO: Emit error for hsa
8175 case Intrinsic::r600_read_ngroups_y:
8178 case Intrinsic::r600_read_ngroups_z:
8181 case Intrinsic::r600_read_local_size_x:
8182 // TODO: Could insert G_ASSERT_ZEXT from s16
8184 case Intrinsic::r600_read_local_size_y:
8185 // TODO: Could insert G_ASSERT_ZEXT from s16
8187 // TODO: Could insert G_ASSERT_ZEXT from s16
8188 case Intrinsic::r600_read_local_size_z:
8191 case Intrinsic::amdgcn_fdiv_fast:
8192 return legalizeFDIVFastIntrin(MI, MRI, B);
8193 case Intrinsic::amdgcn_is_shared:
8195 case Intrinsic::amdgcn_is_private:
8197 case Intrinsic::amdgcn_wavefrontsize: {
8198 B.buildConstant(MI.getOperand(0), ST.getWavefrontSize());
8199 MI.eraseFromParent();
8200 return true;
8201 }
8202 case Intrinsic::amdgcn_s_buffer_load:
8203 return legalizeSBufferLoad(Helper, MI);
8204 case Intrinsic::amdgcn_raw_buffer_store:
8205 case Intrinsic::amdgcn_raw_ptr_buffer_store:
8206 case Intrinsic::amdgcn_struct_buffer_store:
8207 case Intrinsic::amdgcn_struct_ptr_buffer_store:
8208 return legalizeBufferStore(MI, Helper, false, false);
8209 case Intrinsic::amdgcn_raw_buffer_store_format:
8210 case Intrinsic::amdgcn_raw_ptr_buffer_store_format:
8211 case Intrinsic::amdgcn_struct_buffer_store_format:
8212 case Intrinsic::amdgcn_struct_ptr_buffer_store_format:
8213 return legalizeBufferStore(MI, Helper, false, true);
8214 case Intrinsic::amdgcn_raw_tbuffer_store:
8215 case Intrinsic::amdgcn_raw_ptr_tbuffer_store:
8216 case Intrinsic::amdgcn_struct_tbuffer_store:
8217 case Intrinsic::amdgcn_struct_ptr_tbuffer_store:
8218 return legalizeBufferStore(MI, Helper, true, true);
8219 case Intrinsic::amdgcn_raw_buffer_load:
8220 case Intrinsic::amdgcn_raw_ptr_buffer_load:
8221 case Intrinsic::amdgcn_raw_atomic_buffer_load:
8222 case Intrinsic::amdgcn_raw_ptr_atomic_buffer_load:
8223 case Intrinsic::amdgcn_struct_buffer_load:
8224 case Intrinsic::amdgcn_struct_ptr_buffer_load:
8225 case Intrinsic::amdgcn_struct_atomic_buffer_load:
8226 case Intrinsic::amdgcn_struct_ptr_atomic_buffer_load:
8227 return legalizeBufferLoad(MI, Helper, false, false);
8228 case Intrinsic::amdgcn_raw_buffer_load_format:
8229 case Intrinsic::amdgcn_raw_ptr_buffer_load_format:
8230 case Intrinsic::amdgcn_struct_buffer_load_format:
8231 case Intrinsic::amdgcn_struct_ptr_buffer_load_format:
8232 return legalizeBufferLoad(MI, Helper, true, false);
8233 case Intrinsic::amdgcn_raw_tbuffer_load:
8234 case Intrinsic::amdgcn_raw_ptr_tbuffer_load:
8235 case Intrinsic::amdgcn_struct_tbuffer_load:
8236 case Intrinsic::amdgcn_struct_ptr_tbuffer_load:
8237 return legalizeBufferLoad(MI, Helper, true, true);
8238 case Intrinsic::amdgcn_raw_buffer_atomic_swap:
8239 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_swap:
8240 case Intrinsic::amdgcn_struct_buffer_atomic_swap:
8241 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_swap:
8242 case Intrinsic::amdgcn_raw_buffer_atomic_add:
8243 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_add:
8244 case Intrinsic::amdgcn_struct_buffer_atomic_add:
8245 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_add:
8246 case Intrinsic::amdgcn_raw_buffer_atomic_sub:
8247 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub:
8248 case Intrinsic::amdgcn_struct_buffer_atomic_sub:
8249 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub:
8250 case Intrinsic::amdgcn_raw_buffer_atomic_smin:
8251 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smin:
8252 case Intrinsic::amdgcn_struct_buffer_atomic_smin:
8253 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smin:
8254 case Intrinsic::amdgcn_raw_buffer_atomic_umin:
8255 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umin:
8256 case Intrinsic::amdgcn_struct_buffer_atomic_umin:
8257 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umin:
8258 case Intrinsic::amdgcn_raw_buffer_atomic_smax:
8259 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smax:
8260 case Intrinsic::amdgcn_struct_buffer_atomic_smax:
8261 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smax:
8262 case Intrinsic::amdgcn_raw_buffer_atomic_umax:
8263 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umax:
8264 case Intrinsic::amdgcn_struct_buffer_atomic_umax:
8265 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umax:
8266 case Intrinsic::amdgcn_raw_buffer_atomic_and:
8267 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_and:
8268 case Intrinsic::amdgcn_struct_buffer_atomic_and:
8269 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_and:
8270 case Intrinsic::amdgcn_raw_buffer_atomic_or:
8271 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_or:
8272 case Intrinsic::amdgcn_struct_buffer_atomic_or:
8273 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_or:
8274 case Intrinsic::amdgcn_raw_buffer_atomic_xor:
8275 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_xor:
8276 case Intrinsic::amdgcn_struct_buffer_atomic_xor:
8277 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_xor:
8278 case Intrinsic::amdgcn_raw_buffer_atomic_inc:
8279 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_inc:
8280 case Intrinsic::amdgcn_struct_buffer_atomic_inc:
8281 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_inc:
8282 case Intrinsic::amdgcn_raw_buffer_atomic_dec:
8283 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_dec:
8284 case Intrinsic::amdgcn_struct_buffer_atomic_dec:
8285 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_dec:
8286 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
8287 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap:
8288 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
8289 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cmpswap:
8290 case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
8291 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmin:
8292 case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
8293 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmin:
8294 case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
8295 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmax:
8296 case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
8297 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmax:
8298 case Intrinsic::amdgcn_raw_buffer_atomic_sub_clamp_u32:
8299 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub_clamp_u32:
8300 case Intrinsic::amdgcn_struct_buffer_atomic_sub_clamp_u32:
8301 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub_clamp_u32:
8302 case Intrinsic::amdgcn_raw_buffer_atomic_cond_sub_u32:
8303 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cond_sub_u32:
8304 case Intrinsic::amdgcn_struct_buffer_atomic_cond_sub_u32:
8305 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cond_sub_u32:
8306 case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
8307 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fadd:
8308 case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
8309 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fadd:
8310 return legalizeBufferAtomic(MI, B, IntrID);
8311 case Intrinsic::amdgcn_rsq_clamp:
8312 return legalizeRsqClampIntrinsic(MI, MRI, B);
8313 case Intrinsic::amdgcn_image_bvh_intersect_ray:
8315 case Intrinsic::amdgcn_image_bvh_dual_intersect_ray:
8316 case Intrinsic::amdgcn_image_bvh8_intersect_ray:
8318 case Intrinsic::amdgcn_swmmac_f32_16x16x128_fp8_fp8:
8319 case Intrinsic::amdgcn_swmmac_f32_16x16x128_fp8_bf8:
8320 case Intrinsic::amdgcn_swmmac_f32_16x16x128_bf8_fp8:
8321 case Intrinsic::amdgcn_swmmac_f32_16x16x128_bf8_bf8:
8322 case Intrinsic::amdgcn_swmmac_f16_16x16x128_fp8_fp8:
8323 case Intrinsic::amdgcn_swmmac_f16_16x16x128_fp8_bf8:
8324 case Intrinsic::amdgcn_swmmac_f16_16x16x128_bf8_fp8:
8325 case Intrinsic::amdgcn_swmmac_f16_16x16x128_bf8_bf8: {
8326 Register Index = MI.getOperand(5).getReg();
8327 LLT S64 = LLT::scalar(64);
8328 LLT IndexArgTy = MRI.getType(Index);
8329 if (IndexArgTy != S64) {
8330 auto NewIndex = IndexArgTy.isVector() ? B.buildBitcast(S64, Index)
8331 : B.buildAnyExt(S64, Index);
8332 MI.getOperand(5).setReg(NewIndex.getReg(0));
8333 }
8334 return true;
8335 }
8336 case Intrinsic::amdgcn_swmmac_f16_16x16x32_f16:
8337 case Intrinsic::amdgcn_swmmac_bf16_16x16x32_bf16:
8338 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf16:
8339 case Intrinsic::amdgcn_swmmac_f32_16x16x32_f16:
8340 case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_fp8:
8341 case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_bf8:
8342 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_fp8:
8343 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_bf8: {
8344 Register Index = MI.getOperand(5).getReg();
8345 LLT S32 = LLT::scalar(32);
8346 if (MRI.getType(Index) != S32)
8347 MI.getOperand(5).setReg(B.buildAnyExt(S32, Index).getReg(0));
8348 return true;
8349 }
8350 case Intrinsic::amdgcn_swmmac_f16_16x16x64_f16:
8351 case Intrinsic::amdgcn_swmmac_bf16_16x16x64_bf16:
8352 case Intrinsic::amdgcn_swmmac_f32_16x16x64_bf16:
8353 case Intrinsic::amdgcn_swmmac_bf16f32_16x16x64_bf16:
8354 case Intrinsic::amdgcn_swmmac_f32_16x16x64_f16:
8355 case Intrinsic::amdgcn_swmmac_i32_16x16x128_iu8:
8356 case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu4:
8357 case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu8:
8358 case Intrinsic::amdgcn_swmmac_i32_16x16x64_iu4: {
8359 Register Index = MI.getOperand(7).getReg();
8360 LLT IdxTy = IntrID == Intrinsic::amdgcn_swmmac_i32_16x16x128_iu8
8361 ? LLT::scalar(64)
8362 : LLT::scalar(32);
8363 LLT IndexArgTy = MRI.getType(Index);
8364 if (IndexArgTy != IdxTy) {
8365 auto NewIndex = IndexArgTy.isVector() ? B.buildBitcast(IdxTy, Index)
8366 : B.buildAnyExt(IdxTy, Index);
8367 MI.getOperand(7).setReg(NewIndex.getReg(0));
8368 }
8369 return true;
8370 }
8371
8372 case Intrinsic::amdgcn_fmed3: {
8373 GISelChangeObserver &Observer = Helper.Observer;
8374
8375 // FIXME: This is to workaround the inability of tablegen match combiners to
8376 // match intrinsics in patterns.
8377 Observer.changingInstr(MI);
8378 MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_FMED3));
8379 MI.removeOperand(1);
8380 Observer.changedInstr(MI);
8381 return true;
8382 }
8383 case Intrinsic::amdgcn_readlane:
8384 case Intrinsic::amdgcn_writelane:
8385 case Intrinsic::amdgcn_readfirstlane:
8386 case Intrinsic::amdgcn_permlane16:
8387 case Intrinsic::amdgcn_permlanex16:
8388 case Intrinsic::amdgcn_permlane64:
8389 case Intrinsic::amdgcn_set_inactive:
8390 case Intrinsic::amdgcn_set_inactive_chain_arg:
8391 case Intrinsic::amdgcn_mov_dpp8:
8392 case Intrinsic::amdgcn_update_dpp:
8393 return legalizeLaneOp(Helper, MI, IntrID);
8394 case Intrinsic::amdgcn_s_buffer_prefetch_data:
8395 return legalizeSBufferPrefetch(Helper, MI);
8396 case Intrinsic::amdgcn_dead: {
8397 // TODO: Use poison instead of undef
8398 for (const MachineOperand &Def : MI.defs())
8399 B.buildUndef(Def);
8400 MI.eraseFromParent();
8401 return true;
8402 }
8403 case Intrinsic::amdgcn_cooperative_atomic_load_32x4B:
8404 case Intrinsic::amdgcn_cooperative_atomic_load_16x8B:
8405 case Intrinsic::amdgcn_cooperative_atomic_load_8x16B:
8406 assert(MI.hasOneMemOperand() && "Expected IRTranslator to set MemOp!");
8407 B.buildLoad(MI.getOperand(0), MI.getOperand(2), **MI.memoperands_begin());
8408 MI.eraseFromParent();
8409 return true;
8410 case Intrinsic::amdgcn_cooperative_atomic_store_32x4B:
8411 case Intrinsic::amdgcn_cooperative_atomic_store_16x8B:
8412 case Intrinsic::amdgcn_cooperative_atomic_store_8x16B:
8413 assert(MI.hasOneMemOperand() && "Expected IRTranslator to set MemOp!");
8414 B.buildStore(MI.getOperand(2), MI.getOperand(1), **MI.memoperands_begin());
8415 MI.eraseFromParent();
8416 return true;
8417 case Intrinsic::amdgcn_flat_load_monitor_b32:
8418 case Intrinsic::amdgcn_flat_load_monitor_b64:
8419 case Intrinsic::amdgcn_flat_load_monitor_b128:
8420 assert(MI.hasOneMemOperand() && "Expected IRTranslator to set MemOp!");
8421 B.buildInstr(AMDGPU::G_AMDGPU_FLAT_LOAD_MONITOR)
8422 .add(MI.getOperand(0))
8423 .add(MI.getOperand(2))
8424 .addMemOperand(*MI.memoperands_begin());
8425 MI.eraseFromParent();
8426 return true;
8427 case Intrinsic::amdgcn_global_load_monitor_b32:
8428 case Intrinsic::amdgcn_global_load_monitor_b64:
8429 case Intrinsic::amdgcn_global_load_monitor_b128:
8430 assert(MI.hasOneMemOperand() && "Expected IRTranslator to set MemOp!");
8431 B.buildInstr(AMDGPU::G_AMDGPU_GLOBAL_LOAD_MONITOR)
8432 .add(MI.getOperand(0))
8433 .add(MI.getOperand(2))
8434 .addMemOperand(*MI.memoperands_begin());
8435 MI.eraseFromParent();
8436 return true;
8437 default: {
8438 if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
8440 return legalizeImageIntrinsic(MI, B, Helper.Observer, ImageDimIntr);
8441 return true;
8442 }
8443 }
8444
8445 return true;
8446}
MachineInstrBuilder & UseMI
MachineInstrBuilder MachineInstrBuilder & DefMI
static unsigned getIntrinsicID(const SDNode *N)
unsigned RegSize
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
static SDValue extractF64Exponent(SDValue Hi, const SDLoc &SL, SelectionDAG &DAG)
static SDValue getMad(SelectionDAG &DAG, const SDLoc &SL, EVT VT, SDValue X, SDValue Y, SDValue C, SDNodeFlags Flags=SDNodeFlags())
static bool valueIsKnownNeverF32Denorm(SDValue Src)
Return true if it's known that Src can never be an f32 denormal value.
Contains the definition of a TargetInstrInfo class that is common to all AMD GPUs.
static void packImage16bitOpsToDwords(MachineIRBuilder &B, MachineInstr &MI, SmallVectorImpl< Register > &PackedAddrs, unsigned ArgOffset, const AMDGPU::ImageDimIntrinsicInfo *Intr, bool IsA16, bool IsG16)
Turn a set of s16 typed registers in AddrRegs into a dword sized vector with s16 typed elements.
static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID)
static LLT getBufferRsrcScalarType(const LLT Ty)
static LegalityPredicate isIllegalRegisterType(const GCNSubtarget &ST, unsigned TypeIdx)
static cl::opt< bool > EnableNewLegality("amdgpu-global-isel-new-legality", cl::desc("Use GlobalISel desired legality, rather than try to use" "rules compatible with selection patterns"), cl::init(false), cl::ReallyHidden)
static MachineInstrBuilder buildExp(MachineIRBuilder &B, const DstOp &Dst, const SrcOp &Src, unsigned Flags)
static bool needsDenormHandlingF32(const MachineFunction &MF, Register Src, unsigned Flags)
constexpr std::initializer_list< LLT > AllVectors
static LegalizeMutation bitcastToVectorElement32(unsigned TypeIdx)
static LegalityPredicate isSmallOddVector(unsigned TypeIdx)
static LegalizeMutation oneMoreElement(unsigned TypeIdx)
constexpr LLT F64
static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size)
constexpr LLT V2S8
static bool allowApproxFunc(const MachineFunction &MF, unsigned Flags)
constexpr LLT V4S128
constexpr LLT S16
constexpr LLT S1
static bool shouldBitcastLoadStoreType(const GCNSubtarget &ST, const LLT Ty, const LLT MemTy)
Return true if a load or store of the type should be lowered with a bitcast to a different type.
constexpr LLT S1024
static constexpr unsigned FPEnvModeBitField
constexpr LLT V7S64
static LegalizeMutation getScalarTypeFromMemDesc(unsigned TypeIdx)
static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size)
static bool shouldWidenLoad(const GCNSubtarget &ST, LLT MemoryTy, uint64_t AlignInBits, unsigned AddrSpace, unsigned Opcode)
Return true if we should legalize a load by widening an odd sized memory access up to the alignment.
static bool isRegisterVectorElementType(LLT EltTy)
static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx)
static LegalityPredicate isWideVec16(unsigned TypeIdx)
constexpr std::initializer_list< LLT > AllScalarTypes
static LegalityPredicate isTruncStoreToSizePowerOf2(unsigned TypeIdx)
constexpr LLT V2S16
constexpr LLT V8S16
constexpr LLT V9S32
constexpr std::initializer_list< LLT > AllS32Vectors
constexpr LLT S224
static LegalizeMutation moreElementsToNextExistingRegClass(unsigned TypeIdx)
constexpr LLT S512
constexpr LLT MaxScalar
static Register castBufferRsrcToV4I32(Register Pointer, MachineIRBuilder &B)
Cast a buffer resource (an address space 8 pointer) into a 4xi32, which is the form in which the valu...
constexpr LLT V11S32
static bool isRegisterClassType(const GCNSubtarget &ST, LLT Ty)
constexpr LLT V6S64
constexpr LLT V2S64
static std::pair< Register, Register > emitReciprocalU64(MachineIRBuilder &B, Register Val)
static LLT getBitcastRegisterType(const LLT Ty)
static LLT getBufferRsrcRegisterType(const LLT Ty)
constexpr LLT S32
constexpr LLT V2F16
static LegalizeMutation bitcastToRegisterType(unsigned TypeIdx)
static Register stripAnySourceMods(Register OrigSrc, MachineRegisterInfo &MRI)
constexpr LLT V8S32
constexpr LLT V2BF16
constexpr LLT S192
static LLT castBufferRsrcFromV4I32(MachineInstr &MI, MachineIRBuilder &B, MachineRegisterInfo &MRI, unsigned Idx)
Mutates IR (typicaly a load instruction) to use a <4 x s32> as the initial type of the operand idx an...
static bool replaceWithConstant(MachineIRBuilder &B, MachineInstr &MI, int64_t C)
static constexpr unsigned SPDenormModeBitField
constexpr LLT F32
static unsigned maxSizeForAddrSpace(const GCNSubtarget &ST, unsigned AS, bool IsLoad, bool IsAtomic)
constexpr LLT V6S32
static bool isLoadStoreSizeLegal(const GCNSubtarget &ST, const LegalityQuery &Query)
constexpr LLT S160
static MachineInstr * verifyCFIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI, MachineInstr *&Br, MachineBasicBlock *&UncondBrTarget, bool &Negated)
constexpr LLT V4S16
constexpr LLT V2S128
constexpr LLT V10S16
static LegalityPredicate numElementsNotEven(unsigned TypeIdx)
constexpr LLT V4S32
constexpr LLT V3S32
constexpr LLT V6S16
constexpr std::initializer_list< LLT > AllS64Vectors
constexpr LLT S256
static void castBufferRsrcArgToV4I32(MachineInstr &MI, MachineIRBuilder &B, unsigned Idx)
constexpr LLT V4S64
static constexpr unsigned FPEnvTrapBitField
constexpr LLT V10S32
constexpr LLT V16S32
static constexpr unsigned MaxRegisterSize
constexpr LLT V7S32
constexpr LLT S96
constexpr LLT V12S16
constexpr LLT V16S64
static bool isRegisterSize(const GCNSubtarget &ST, unsigned Size)
static LegalityPredicate isWideScalarExtLoadTruncStore(unsigned TypeIdx)
static bool hasBufferRsrcWorkaround(const LLT Ty)
constexpr LLT V32S32
static void toggleSPDenormMode(bool Enable, MachineIRBuilder &B, const GCNSubtarget &ST, SIModeRegisterDefaults Mode)
constexpr LLT S64
constexpr std::initializer_list< LLT > AllS16Vectors
static bool loadStoreBitcastWorkaround(const LLT Ty)
static LLT widenToNextPowerOf2(LLT Ty)
static bool isNot(const MachineRegisterInfo &MRI, const MachineInstr &MI)
constexpr LLT V16S16
static void convertImageAddrToPacked(MachineIRBuilder &B, MachineInstr &MI, int DimIdx, int NumVAddrs)
Convert from separate vaddr components to a single vector address register, and replace the remaining...
static bool isLoadStoreLegal(const GCNSubtarget &ST, const LegalityQuery &Query)
static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx)
constexpr LLT V5S32
constexpr LLT V5S64
constexpr LLT V3S64
static LLT getPow2VectorType(LLT Ty)
static void buildBufferLoad(unsigned Opc, Register LoadDstReg, Register RSrc, Register VIndex, Register VOffset, Register SOffset, unsigned ImmOffset, unsigned Format, unsigned AuxiliaryData, MachineMemOperand *MMO, bool IsTyped, bool HasVIndex, MachineIRBuilder &B)
constexpr LLT V8S64
static LLT getPow2ScalarType(LLT Ty)
static LegalityPredicate elementTypeIsLegal(unsigned TypeIdx)
constexpr LLT V2S32
static bool isRegisterVectorType(LLT Ty)
constexpr LLT V12S32
constexpr LLT S128
static LegalityPredicate sizeIsMultipleOf32(unsigned TypeIdx)
constexpr LLT S8
static bool isRegisterType(const GCNSubtarget &ST, LLT Ty)
static bool isKnownNonNull(Register Val, MachineRegisterInfo &MRI, const AMDGPUTargetMachine &TM, unsigned AddrSpace)
Return true if the value is a known valid address, such that a null check is not necessary.
This file declares the targeting of the Machinelegalizer class for AMDGPU.
Provides AMDGPU specific target descriptions.
The AMDGPU TargetMachine interface definition for hw codegen targets.
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
#define X(NUM, ENUM, NAME)
Definition ELF.h:849
static Error unsupported(const char *Str, const Triple &T)
Definition MachO.cpp:71
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
@ Enable
Declares convenience wrapper classes for interpreting MachineInstr instances as specific generic oper...
IRTranslator LLVM IR MI
const AbstractManglingParser< Derived, Alloc >::OperatorInfo AbstractManglingParser< Derived, Alloc >::Ops[]
Interface for Targets to specify which operations they can successfully select and how the others sho...
#define F(x, y, z)
Definition MD5.cpp:54
#define I(x, y, z)
Definition MD5.cpp:57
Contains matchers for matching SSA Machine Instructions.
This file declares the MachineIRBuilder class.
Register const TargetRegisterInfo * TRI
#define R2(n)
Promote Memory to Register
Definition Mem2Reg.cpp:110
#define T
static MCRegister getReg(const MCDisassembler *D, unsigned RC, unsigned RegNo)
#define P(N)
ppc ctr loops verify
R600 Clause Merge
const SmallVectorImpl< MachineOperand > & Cond
static cl::opt< RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode > Mode("regalloc-enable-advisor", cl::Hidden, cl::init(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Default), cl::desc("Enable regalloc advisor mode"), cl::values(clEnumValN(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Default, "default", "Default"), clEnumValN(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Release, "release", "precompiled"), clEnumValN(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Development, "development", "for training")))
#define CH(x, y, z)
Definition SHA256.cpp:34
#define FP_DENORM_FLUSH_NONE
Definition SIDefines.h:1269
Interface definition for SIInstrInfo.
Interface definition for SIRegisterInfo.
This file defines the make_scope_exit function, which executes user-defined cleanup logic at scope ex...
static TableGen::Emitter::Opt Y("gen-skeleton-entry", EmitSkeleton, "Generate example skeleton entry")
static constexpr int Concat[]
bool legalizeConstHwRegRead(MachineInstr &MI, MachineIRBuilder &B, AMDGPU::Hwreg::Id HwReg, unsigned LowBit, unsigned Width) const
void buildMultiply(LegalizerHelper &Helper, MutableArrayRef< Register > Accum, ArrayRef< Register > Src0, ArrayRef< Register > Src1, bool UsePartialMad64_32, bool SeparateOddAlignedProducts) const
bool legalizeGlobalValue(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFSQRTF16(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeIntrinsicTrunc(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeInsert(LegalizerHelper &Helper, MachineInstr &MI) const
std::pair< Register, unsigned > splitBufferOffsets(MachineIRBuilder &B, Register OrigOffset) const
bool legalizeBVHIntersectRayIntrinsic(MachineInstr &MI, MachineIRBuilder &B) const
bool legalizeIsAddrSpace(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B, unsigned AddrSpace) const
bool legalizeUnsignedDIV_REM(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFSQRTF32(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeAtomicCmpXChg(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeTrapHsa(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeBufferStore(MachineInstr &MI, LegalizerHelper &Helper, bool IsTyped, bool IsFormat) const
bool legalizeMul(LegalizerHelper &Helper, MachineInstr &MI) const
bool legalizeFFREXP(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
Register getSegmentAperture(unsigned AddrSpace, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFDIV64(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizePointerAsRsrcIntrin(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
To create a buffer resource from a 64-bit pointer, mask off the upper 32 bits of the pointer and repl...
bool legalizeFlogCommon(MachineInstr &MI, MachineIRBuilder &B) const
bool getLDSKernelId(Register DstReg, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFExp2(MachineInstr &MI, MachineIRBuilder &B) const
bool legalizeTrap(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeBufferAtomic(MachineInstr &MI, MachineIRBuilder &B, Intrinsic::ID IID) const
void legalizeUnsignedDIV_REM32Impl(MachineIRBuilder &B, Register DstDivReg, Register DstRemReg, Register Num, Register Den) const
Register handleD16VData(MachineIRBuilder &B, MachineRegisterInfo &MRI, Register Reg, bool ImageStore=false) const
Handle register layout difference for f16 images for some subtargets.
bool legalizeCTLZ_CTTZ(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeBuildVector(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFFloor(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
AMDGPULegalizerInfo(const GCNSubtarget &ST, const GCNTargetMachine &TM)
bool legalizeFDIV32(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFMad(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFDIV(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeSBufferPrefetch(LegalizerHelper &Helper, MachineInstr &MI) const
bool legalizeFExp10Unsafe(MachineIRBuilder &B, Register Dst, Register Src, unsigned Flags) const
bool legalizeFExp(MachineInstr &MI, MachineIRBuilder &B) const
bool legalizeIntrinsic(LegalizerHelper &Helper, MachineInstr &MI) const override
bool legalizeFrem(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizePreloadedArgIntrin(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B, AMDGPUFunctionArgInfo::PreloadedValue ArgType) const
bool legalizeStore(LegalizerHelper &Helper, MachineInstr &MI) const
bool legalizeCustom(LegalizerHelper &Helper, MachineInstr &MI, LostDebugLocObserver &LocObserver) const override
Called for instructions with the Custom LegalizationAction.
bool buildPCRelGlobalAddress(Register DstReg, LLT PtrTy, MachineIRBuilder &B, const GlobalValue *GV, int64_t Offset, unsigned GAFlags=SIInstrInfo::MO_NONE) const
MachinePointerInfo getKernargSegmentPtrInfo(MachineFunction &MF) const
bool legalizeFDIV16(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeRsqClampIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFExpUnsafeImpl(MachineIRBuilder &B, Register Dst, Register Src, unsigned Flags, bool IsExp10) const
std::pair< Register, Register > getScaledLogInput(MachineIRBuilder &B, Register Src, unsigned Flags) const
bool legalizeFDIVFastIntrin(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool loadInputValue(Register DstReg, MachineIRBuilder &B, AMDGPUFunctionArgInfo::PreloadedValue ArgType) const
bool legalizeBVHDualOrBVH8IntersectRayIntrinsic(MachineInstr &MI, MachineIRBuilder &B) const
bool legalizeInsertVectorElt(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFExpUnsafe(MachineIRBuilder &B, Register Dst, Register Src, unsigned Flags) const
bool legalizeFEXPF64(MachineInstr &MI, MachineIRBuilder &B) const
bool legalizeAddrSpaceCast(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeExtract(LegalizerHelper &Helper, MachineInstr &MI) const
bool legalizeBufferLoad(MachineInstr &MI, LegalizerHelper &Helper, bool IsFormat, bool IsTyped) const
bool legalizeImplicitArgPtr(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeMinNumMaxNum(LegalizerHelper &Helper, MachineInstr &MI) const
void legalizeUnsignedDIV_REM64Impl(MachineIRBuilder &B, Register DstDivReg, Register DstRemReg, Register Num, Register Den) const
bool legalizeDebugTrap(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFastUnsafeFDIV(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeSinCos(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeWaveID(MachineInstr &MI, MachineIRBuilder &B) const
bool legalizeFroundeven(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeLDSKernelId(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeWorkGroupId(MachineInstr &MI, MachineIRBuilder &B, AMDGPUFunctionArgInfo::PreloadedValue ClusterIdPV, AMDGPUFunctionArgInfo::PreloadedValue ClusterMaxIdPV, AMDGPUFunctionArgInfo::PreloadedValue ClusterWorkGroupIdPV) const
bool legalizeSignedDIV_REM(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeITOFP(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B, bool Signed) const
bool legalizeFPow(MachineInstr &MI, MachineIRBuilder &B) const
bool legalizeFastUnsafeFDIV64(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFPTOI(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B, bool Signed) const
bool legalizeStackSave(MachineInstr &MI, MachineIRBuilder &B) const
bool legalizeFlogUnsafe(MachineIRBuilder &B, Register Dst, Register Src, bool IsLog10, unsigned Flags) const
bool legalizeKernargMemParameter(MachineInstr &MI, MachineIRBuilder &B, uint64_t Offset, Align Alignment=Align(4)) const
Legalize a value that's loaded from kernel arguments.
bool legalizeImageIntrinsic(MachineInstr &MI, MachineIRBuilder &B, GISelChangeObserver &Observer, const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr) const
Rewrite image intrinsics to use register layouts expected by the subtarget.
void buildAbsGlobalAddress(Register DstReg, LLT PtrTy, MachineIRBuilder &B, const GlobalValue *GV, MachineRegisterInfo &MRI) const
bool legalizeGetFPEnv(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool getImplicitArgPtr(Register DstReg, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFSQRT(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
Register getKernargParameterPtr(MachineIRBuilder &B, int64_t Offset) const
bool legalizeSBufferLoad(LegalizerHelper &Helper, MachineInstr &MI) const
bool legalizeFceil(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFSQRTF64(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeExtractVectorElt(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeLoad(LegalizerHelper &Helper, MachineInstr &MI) const
bool legalizeCTLZ_ZERO_UNDEF(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
Register fixStoreSourceType(MachineIRBuilder &B, Register VData, LLT MemTy, bool IsFormat) const
bool legalizeLaneOp(LegalizerHelper &Helper, MachineInstr &MI, Intrinsic::ID IID) const
bool legalizeSetFPEnv(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeWorkitemIDIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B, unsigned Dim, AMDGPUFunctionArgInfo::PreloadedValue ArgType) const
void buildLoadInputValue(Register DstReg, MachineIRBuilder &B, const ArgDescriptor *Arg, const TargetRegisterClass *ArgRC, LLT ArgTy) const
bool legalizeTrapHsaQueuePtr(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFlog2(MachineInstr &MI, MachineIRBuilder &B) const
bool legalizeTrapEndpgm(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
static std::optional< uint32_t > getLDSKernelIdMetadata(const Function &F)
void setDynLDSAlign(const Function &F, const GlobalVariable &GV)
unsigned allocateLDSGlobal(const DataLayout &DL, const GlobalVariable &GV)
bool isNoopAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const override
Returns true if a cast between SrcAS and DestAS is a noop.
const std::array< unsigned, 3 > & getDims() const
static const fltSemantics & IEEEsingle()
Definition APFloat.h:296
static const fltSemantics & IEEEdouble()
Definition APFloat.h:297
static APFloat getSmallestNormalized(const fltSemantics &Sem, bool Negative=false)
Returns the smallest (by magnitude) normalized finite number in the given semantics.
Definition APFloat.h:1213
static APFloat getLargest(const fltSemantics &Sem, bool Negative=false)
Returns the largest finite number in the given semantics.
Definition APFloat.h:1193
static APFloat getInf(const fltSemantics &Sem, bool Negative=false)
Factory for Positive and Negative Infinity.
Definition APFloat.h:1153
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:40
size_t size() const
size - Get the array size.
Definition ArrayRef.h:142
@ ICMP_SLT
signed less than
Definition InstrTypes.h:705
@ FCMP_OLT
0 1 0 0 True if ordered and less than
Definition InstrTypes.h:682
@ FCMP_ULE
1 1 0 1 True if unordered, less than, or equal
Definition InstrTypes.h:691
@ FCMP_OGT
0 0 1 0 True if ordered and greater than
Definition InstrTypes.h:680
@ ICMP_UGE
unsigned greater or equal
Definition InstrTypes.h:700
@ ICMP_SGT
signed greater than
Definition InstrTypes.h:703
@ FCMP_ONE
0 1 1 0 True if ordered and operands are unequal
Definition InstrTypes.h:684
@ ICMP_ULT
unsigned less than
Definition InstrTypes.h:701
@ FCMP_OLE
0 1 0 1 True if ordered and less than or equal
Definition InstrTypes.h:683
@ FCMP_ORD
0 1 1 1 True if ordered (no nans)
Definition InstrTypes.h:685
@ ICMP_NE
not equal
Definition InstrTypes.h:698
@ FCMP_UGE
1 0 1 1 True if unordered, greater than, or equal
Definition InstrTypes.h:689
This is the shared class of boolean and integer constants.
Definition Constants.h:87
int64_t getSExtValue() const
Return the constant as a 64-bit integer value after it has been sign extended as appropriate for the ...
Definition Constants.h:174
A debug info location.
Definition DebugLoc.h:123
Diagnostic information for unsupported feature in backend.
static constexpr ElementCount getFixed(ScalarTy MinVal)
Definition TypeSize.h:309
LLVMContext & getContext() const
getContext - Return a reference to the LLVMContext associated with this function.
Definition Function.cpp:358
Abstract class that contains various methods for clients to notify about changes.
virtual void changingInstr(MachineInstr &MI)=0
This instruction is about to be mutated in some way.
virtual void changedInstr(MachineInstr &MI)=0
This instruction was mutated in some way.
Simple wrapper observer that takes several observers, and calls each one for each event.
KnownBits getKnownBits(Register R)
bool hasExternalLinkage() const
Module * getParent()
Get the module that this global value is contained inside of...
LLVM_ABI const DataLayout & getDataLayout() const
Get the data layout of the module this global belongs to.
Definition Globals.cpp:133
LLVM_ABI uint64_t getGlobalSize(const DataLayout &DL) const
Get the size of this global variable in bytes.
Definition Globals.cpp:561
static constexpr LLT float64()
Get a 64-bit IEEE double value.
constexpr unsigned getScalarSizeInBits() const
constexpr LLT changeElementType(LLT NewEltTy) const
If this type is a vector, return a vector with the same number of elements but the new element type.
static constexpr LLT vector(ElementCount EC, unsigned ScalarSizeInBits)
Get a low-level vector of some number of elements and element width.
static constexpr LLT scalar(unsigned SizeInBits)
Get a low-level scalar or aggregate "bag of bits".
constexpr uint16_t getNumElements() const
Returns the number of elements in a vector LLT.
constexpr bool isVector() const
static constexpr LLT pointer(unsigned AddressSpace, unsigned SizeInBits)
Get a low-level pointer in the given address space.
constexpr TypeSize getSizeInBits() const
Returns the total size of the type. Must only be called on sized types.
constexpr bool isPointer() const
constexpr LLT getElementType() const
Returns the vector's element type. Only valid for vector types.
constexpr LLT changeElementSize(unsigned NewEltSize) const
If this type is a vector, return a vector with the same number of elements but the new element size.
static constexpr LLT float16()
Get a 16-bit IEEE half value.
constexpr unsigned getAddressSpace() const
static constexpr LLT fixed_vector(unsigned NumElements, unsigned ScalarSizeInBits)
Get a low-level fixed-width vector of some number of elements and element width.
constexpr LLT changeElementCount(ElementCount EC) const
Return a vector or scalar with the same element type and the new element count.
constexpr LLT getScalarType() const
static constexpr LLT scalarOrVector(ElementCount EC, LLT ScalarTy)
static constexpr LLT float32()
Get a 32-bit IEEE float value.
LLVM_ABI void diagnose(const DiagnosticInfo &DI)
Report a message to the currently installed diagnostic handler.
LLVM_ABI void computeTables()
Compute any ancillary tables needed to quickly decide how an operation should be handled.
LegalizeRuleSet & minScalar(unsigned TypeIdx, const LLT Ty)
Ensure the scalar is at least as wide as Ty.
LegalizeRuleSet & legalFor(std::initializer_list< LLT > Types)
The instruction is legal when type index 0 is any type in the given list.
LegalizeRuleSet & unsupported()
The instruction is unsupported.
LegalizeRuleSet & scalarSameSizeAs(unsigned TypeIdx, unsigned SameSizeIdx)
Change the type TypeIdx to have the same scalar size as type SameSizeIdx.
LegalizeRuleSet & fewerElementsIf(LegalityPredicate Predicate, LegalizeMutation Mutation)
Remove elements to reach the type selected by the mutation if the predicate is true.
LegalizeRuleSet & clampScalarOrElt(unsigned TypeIdx, const LLT MinTy, const LLT MaxTy)
Limit the range of scalar sizes to MinTy and MaxTy.
LegalizeRuleSet & bitcastIf(LegalityPredicate Predicate, LegalizeMutation Mutation)
The specified type index is coerced if predicate is true.
LegalizeRuleSet & maxScalar(unsigned TypeIdx, const LLT Ty)
Ensure the scalar is at most as wide as Ty.
LegalizeRuleSet & minScalarOrElt(unsigned TypeIdx, const LLT Ty)
Ensure the scalar or element is at least as wide as Ty.
LegalizeRuleSet & clampMaxNumElements(unsigned TypeIdx, const LLT EltTy, unsigned MaxElements)
Limit the number of elements in EltTy vectors to at most MaxElements.
LegalizeRuleSet & unsupportedFor(std::initializer_list< LLT > Types)
LegalizeRuleSet & lower()
The instruction is lowered.
LegalizeRuleSet & moreElementsIf(LegalityPredicate Predicate, LegalizeMutation Mutation)
Add more elements to reach the type selected by the mutation if the predicate is true.
LegalizeRuleSet & lowerFor(std::initializer_list< LLT > Types)
The instruction is lowered when type index 0 is any type in the given list.
LegalizeRuleSet & clampScalar(unsigned TypeIdx, const LLT MinTy, const LLT MaxTy)
Limit the range of scalar sizes to MinTy and MaxTy.
LegalizeRuleSet & custom()
Unconditionally custom lower.
LegalizeRuleSet & clampMaxNumElementsStrict(unsigned TypeIdx, const LLT EltTy, unsigned NumElts)
Express EltTy vectors strictly using vectors with NumElts elements (or scalars when NumElts equals 1)...
LegalizeRuleSet & unsupportedIf(LegalityPredicate Predicate)
LegalizeRuleSet & widenScalarIf(LegalityPredicate Predicate, LegalizeMutation Mutation)
Widen the scalar to the one selected by the mutation if the predicate is true.
LegalizeRuleSet & alwaysLegal()
LegalizeRuleSet & clampNumElements(unsigned TypeIdx, const LLT MinTy, const LLT MaxTy)
Limit the number of elements for the given vectors to at least MinTy's number of elements and at most...
LegalizeRuleSet & maxScalarIf(LegalityPredicate Predicate, unsigned TypeIdx, const LLT Ty)
Conditionally limit the maximum size of the scalar.
LegalizeRuleSet & customIf(LegalityPredicate Predicate)
LegalizeRuleSet & widenScalarToNextPow2(unsigned TypeIdx, unsigned MinSize=0)
Widen the scalar to the next power of two that is at least MinSize.
LegalizeRuleSet & scalarize(unsigned TypeIdx)
LegalizeRuleSet & legalForCartesianProduct(std::initializer_list< LLT > Types)
The instruction is legal when type indexes 0 and 1 are both in the given list.
LegalizeRuleSet & legalIf(LegalityPredicate Predicate)
The instruction is legal if predicate is true.
LegalizeRuleSet & customFor(std::initializer_list< LLT > Types)
LegalizeRuleSet & widenScalarToNextMultipleOf(unsigned TypeIdx, unsigned Size)
Widen the scalar to the next multiple of Size.
LLVM_ABI LegalizeResult lowerFMinNumMaxNum(MachineInstr &MI)
LLVM_ABI void moreElementsVectorDst(MachineInstr &MI, LLT MoreTy, unsigned OpIdx)
Legalize a single operand OpIdx of the machine instruction MI as a Def by performing it with addition...
LLVM_ABI LegalizeResult lowerInsert(MachineInstr &MI)
LLVM_ABI LegalizeResult lowerExtract(MachineInstr &MI)
GISelValueTracking * getValueTracking() const
@ Legalized
Instruction has been legalized and the MachineFunction changed.
GISelChangeObserver & Observer
To keep track of changes made by the LegalizerHelper.
LLVM_ABI void bitcastDst(MachineInstr &MI, LLT CastTy, unsigned OpIdx)
Legalize a single operand OpIdx of the machine instruction MI as a def by inserting a G_BITCAST from ...
LLVM_ABI LegalizeResult lowerFMad(MachineInstr &MI)
MachineIRBuilder & MIRBuilder
Expose MIRBuilder so clients can set their own RecordInsertInstruction functions.
LLVM_ABI void widenScalarDst(MachineInstr &MI, LLT WideTy, unsigned OpIdx=0, unsigned TruncOpcode=TargetOpcode::G_TRUNC)
Legalize a single operand OpIdx of the machine instruction MI as a Def by extending the operand's typ...
LegalizeRuleSet & getActionDefinitionsBuilder(unsigned Opcode)
Get the action definition builder for the given opcode.
const LegacyLegalizerInfo & getLegacyLegalizerInfo() const
TypeSize getValue() const
Wrapper class representing physical registers. Should be passed by value.
Definition MCRegister.h:41
constexpr bool isPhysical() const
Return true if the specified register number is in the physical register namespace.
Definition MCRegister.h:72
LLVM_ABI void addSuccessor(MachineBasicBlock *Succ, BranchProbability Prob=BranchProbability::getUnknown())
Add Succ as a successor of this MachineBasicBlock.
LLVM_ABI MachineBasicBlock * splitAt(MachineInstr &SplitInst, bool UpdateLiveIns=true, LiveIntervals *LIS=nullptr)
Split a basic block into 2 pieces at SplitPoint.
const MachineFunction * getParent() const
Return the MachineFunction containing this basic block.
MachineInstrBundleIterator< MachineInstr > iterator
PseudoSourceValueManager & getPSVManager() const
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
MachineMemOperand * getMachineMemOperand(MachinePointerInfo PtrInfo, MachineMemOperand::Flags f, LLT MemTy, Align base_alignment, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr, SyncScope::ID SSID=SyncScope::System, AtomicOrdering Ordering=AtomicOrdering::NotAtomic, AtomicOrdering FailureOrdering=AtomicOrdering::NotAtomic)
getMachineMemOperand - Allocate a new MachineMemOperand.
DenormalMode getDenormalMode(const fltSemantics &FPType) const
Returns the denormal handling type for the default rounding mode of the function.
void push_back(MachineBasicBlock *MBB)
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
Function & getFunction()
Return the LLVM function that this machine code represents.
BasicBlockListType::iterator iterator
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do...
MachineBasicBlock * CreateMachineBasicBlock(const BasicBlock *BB=nullptr, std::optional< UniqueBBID > BBID=std::nullopt)
CreateMachineInstr - Allocate a new MachineInstr.
const TargetMachine & getTarget() const
getTarget - Return the target machine this machine code is compiled with
Helper class to build MachineInstr.
MachineFunction & getMF()
Getter for the function we currently build.
Register getReg(unsigned Idx) const
Get the register for the operand index.
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
const MachineInstrBuilder & addGlobalAddress(const GlobalValue *GV, int64_t Offset=0, unsigned TargetFlags=0) const
const MachineInstrBuilder & addMBB(MachineBasicBlock *MBB, unsigned TargetFlags=0) const
Representation of each machine instruction.
const MachineOperand & getOperand(unsigned i) const
A description of a memory reference used in the backend.
LocationSize getSize() const
Return the size in bytes of the memory reference.
LLT getMemoryType() const
Return the memory type of the memory reference.
@ MODereferenceable
The memory access is dereferenceable (i.e., doesn't trap).
@ MOLoad
The memory access reads data.
@ MOInvariant
The memory access always returns the same value (or traps).
LLVM_ABI Align getAlign() const
Return the minimum known alignment in bytes of the actual memory reference.
MachineOperand class - Representation of each machine instruction operand.
MachineBasicBlock * getMBB() const
LLVM_ABI void setReg(Register Reg)
Change the register this operand corresponds to.
void setMBB(MachineBasicBlock *MBB)
static MachineOperand CreateImm(int64_t Val)
Register getReg() const
getReg - Returns the register number.
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
LLVM_ABI bool hasOneNonDBGUse(Register RegNo) const
hasOneNonDBGUse - Return true if there is exactly one non-Debug use of the specified register.
LLVM_ABI MachineInstr * getVRegDef(Register Reg) const
getVRegDef - Return the machine instr that defines the specified virtual register or null if none is ...
LLVM_ABI Register createVirtualRegister(const TargetRegisterClass *RegClass, StringRef Name="")
createVirtualRegister - Create and return a new virtual register in the function with the specified r...
LLT getType(Register Reg) const
Get the low-level type of Reg or LLT{} if Reg is not a generic (target independent) virtual register.
use_instr_nodbg_iterator use_instr_nodbg_begin(Register RegNo) const
LLVM_ABI void setRegClass(Register Reg, const TargetRegisterClass *RC)
setRegClass - Set the register class of the specified virtual register.
LLVM_ABI Register createGenericVirtualRegister(LLT Ty, StringRef Name="")
Create and return a new generic virtual register with low-level type Ty.
const TargetRegisterClass * getRegClassOrNull(Register Reg) const
Return the register class of Reg, or null if Reg has not been assigned a register class yet.
const TargetRegisterInfo * getTargetRegisterInfo() const
LLVM_ABI void replaceRegWith(Register FromReg, Register ToReg)
replaceRegWith - Replace all instances of FromReg with ToReg in the machine function.
MutableArrayRef - Represent a mutable reference to an array (0 or more elements consecutively in memo...
Definition ArrayRef.h:298
MutableArrayRef< T > drop_front(size_t N=1) const
Drop the first N elements of the array.
Definition ArrayRef.h:387
LLVM_ABI const PseudoSourceValue * getConstantPool()
Return a pseudo source value referencing the constant pool.
Wrapper class representing virtual and physical registers.
Definition Register.h:20
constexpr bool isValid() const
Definition Register.h:112
constexpr bool isVirtual() const
Return true if the specified register number is in the virtual register namespace.
Definition Register.h:79
static unsigned getMaxMUBUFImmOffset(const GCNSubtarget &ST)
This class keeps track of the SPI_SP_INPUT_ADDR config register, which tells the hardware which inter...
AMDGPU::ClusterDimsAttr getClusterDims() const
SIModeRegisterDefaults getMode() const
std::tuple< const ArgDescriptor *, const TargetRegisterClass *, LLT > getPreloadedValue(AMDGPUFunctionArgInfo::PreloadedValue Value) const
static LLVM_READONLY const TargetRegisterClass * getSGPRClassForBitWidth(unsigned BitWidth)
bool allowsMisalignedMemoryAccessesImpl(unsigned Size, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *IsFast=nullptr) const
bool shouldEmitFixup(const GlobalValue *GV) const
bool shouldUseLDSConstAddress(const GlobalValue *GV) const
bool shouldEmitPCReloc(const GlobalValue *GV) const
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
void truncate(size_type N)
Like resize, but requires that N is less than size().
void resize(size_type N)
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
int64_t getImm() const
Register getReg() const
Register getStackPointerRegisterToSaveRestore() const
If a physical register, this specifies the register that llvm.savestack/llvm.restorestack should save...
unsigned getPointerSizeInBits(unsigned AS) const
A Use represents the edge between a Value definition and its users.
Definition Use.h:35
LLVM_ABI StringRef getName() const
Return a constant reference to the value's name.
Definition Value.cpp:322
self_iterator getIterator()
Definition ilist_node.h:123
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
@ CONSTANT_ADDRESS_32BIT
Address space for 32-bit constant memory.
@ BUFFER_STRIDED_POINTER
Address space for 192-bit fat buffer pointers with an additional index.
@ REGION_ADDRESS
Address space for region memory. (GDS)
@ LOCAL_ADDRESS
Address space for local memory.
@ CONSTANT_ADDRESS
Address space for constant memory (VTX2).
@ FLAT_ADDRESS
Address space for flat memory.
@ GLOBAL_ADDRESS
Address space for global memory (RAT0, VTX0).
@ BUFFER_FAT_POINTER
Address space for 160-bit buffer fat pointers.
@ PRIVATE_ADDRESS
Address space for private memory.
@ BUFFER_RESOURCE
Address space for 128-bit buffer resources.
int getMIMGOpcode(unsigned BaseOpcode, unsigned MIMGEncoding, unsigned VDataDwords, unsigned VAddrDwords)
bool isFlatGlobalAddrSpace(unsigned AS)
bool isGFX12Plus(const MCSubtargetInfo &STI)
constexpr int64_t getNullPointerValue(unsigned AS)
Get the null pointer value for the given address space.
bool isGFX11(const MCSubtargetInfo &STI)
LLVM_READNONE bool isLegalDPALU_DPPControl(const MCSubtargetInfo &ST, unsigned DC)
unsigned getAMDHSACodeObjectVersion(const Module &M)
LLVM_READNONE constexpr bool isKernel(CallingConv::ID CC)
LLVM_READNONE constexpr bool isEntryFunctionCC(CallingConv::ID CC)
LLVM_READNONE constexpr bool isCompute(CallingConv::ID CC)
TargetExtType * isNamedBarrier(const GlobalVariable &GV)
bool isGFX11Plus(const MCSubtargetInfo &STI)
LLVM_READONLY const MIMGBaseOpcodeInfo * getMIMGBaseOpcodeInfo(unsigned BaseOpcode)
std::pair< Register, unsigned > getBaseWithConstantOffset(MachineRegisterInfo &MRI, Register Reg, GISelValueTracking *ValueTracking=nullptr, bool CheckNUW=false)
Returns base register and constant offset.
const ImageDimIntrinsicInfo * getImageDimIntrinsicInfo(unsigned Intr)
unsigned ID
LLVM IR allows the use of arbitrary numbers as calling convention identifiers.
Definition CallingConv.h:24
@ MaxID
The highest possible ID. Must be some 2^k - 1.
@ AMDGPU_Gfx
Used for AMD graphics targets.
@ Fast
Attempts to make calls as fast as possible (e.g. by passing things in registers).
Definition CallingConv.h:41
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
LLVM_ABI LegalityPredicate scalarOrEltWiderThan(unsigned TypeIdx, unsigned Size)
True iff the specified type index is a scalar or a vector with an element type that's wider than the given size.
LLVM_ABI LegalityPredicate isScalar(unsigned TypeIdx)
True iff the specified type index is a scalar.
LLVM_ABI LegalityPredicate isPointer(unsigned TypeIdx)
True iff the specified type index is a pointer (with any address space).
LLVM_ABI LegalityPredicate typeInSet(unsigned TypeIdx, std::initializer_list< LLT > TypesInit)
True iff the given type index is one of the specified types.
LLVM_ABI LegalityPredicate smallerThan(unsigned TypeIdx0, unsigned TypeIdx1)
True iff the first type index has a smaller total bit size than second type index.
LLVM_ABI LegalityPredicate largerThan(unsigned TypeIdx0, unsigned TypeIdx1)
True iff the first type index has a larger total bit size than second type index.
LLVM_ABI LegalityPredicate elementTypeIs(unsigned TypeIdx, LLT EltTy)
True if the type index is a vector with element type EltTy.
LLVM_ABI LegalityPredicate sameSize(unsigned TypeIdx0, unsigned TypeIdx1)
True iff the specified type indices are both the same bit size.
LLVM_ABI LegalityPredicate scalarOrEltNarrowerThan(unsigned TypeIdx, unsigned Size)
True iff the specified type index is a scalar or vector with an element type that's narrower than the given size.
LegalityPredicate typeIsNot(unsigned TypeIdx, LLT Type)
True iff the given type index is not the specified type.
Predicate all(Predicate P0, Predicate P1)
True iff P0 and P1 are true.
LLVM_ABI LegalityPredicate typeIs(unsigned TypeIdx, LLT TypesInit)
True iff the given type index is the specified type.
LLVM_ABI LegalityPredicate scalarNarrowerThan(unsigned TypeIdx, unsigned Size)
True iff the specified type index is a scalar that's narrower than the given size.
LLVM_ABI LegalizeMutation scalarize(unsigned TypeIdx)
Break up the vector type for the given type index into the element type.
LLVM_ABI LegalizeMutation widenScalarOrEltToNextPow2(unsigned TypeIdx, unsigned Min=0)
Widen the scalar type or vector element type for the given type index to the next power of 2.
LLVM_ABI LegalizeMutation changeTo(unsigned TypeIdx, LLT Ty)
Select this specific type for the given type index.
Invariant opcodes: All instruction sets have these as their low opcodes.
initializer< Ty > init(const Ty &Val)
constexpr double inv_pi
constexpr double ln2
constexpr double ln10
constexpr float log2ef
Definition MathExtras.h:51
constexpr double log2e
This is an optimization pass for GlobalISel generic memory operations.
Definition Types.h:26
LLVM_ABI Register getFunctionLiveInPhysReg(MachineFunction &MF, const TargetInstrInfo &TII, MCRegister PhysReg, const TargetRegisterClass &RC, const DebugLoc &DL, LLT RegTy=LLT())
Return a virtual register corresponding to the incoming argument register PhysReg.
Definition Utils.cpp:916
unsigned Log2_32_Ceil(uint32_t Value)
Return the ceil log base 2 of the specified value, 32 if the value is zero.
Definition MathExtras.h:344
@ Offset
Definition DWP.cpp:532
LLVM_ABI Type * getTypeForLLT(LLT Ty, LLVMContext &C)
Get the type back from LLT.
Definition Utils.cpp:2036
LLVM_ABI MachineInstr * getOpcodeDef(unsigned Opcode, Register Reg, const MachineRegisterInfo &MRI)
See if Reg is defined by a single def instruction that is Opcode.
Definition Utils.cpp:652
LLVM_ABI const ConstantFP * getConstantFPVRegVal(Register VReg, const MachineRegisterInfo &MRI)
Definition Utils.cpp:460
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
constexpr bool isInt(int64_t x)
Checks if an integer fits into the given bit width.
Definition MathExtras.h:165
@ Implicit
Not emitted register (e.g. carry, or temporary result).
@ Undef
Value of the register doesn't matter.
LLVM_ABI const llvm::fltSemantics & getFltSemanticForLLT(LLT Ty)
Get the appropriate floating point arithmetic semantic based on the bit size of the given scalar LLT.
std::function< std::pair< unsigned, LLT >(const LegalityQuery &)> LegalizeMutation
int bit_width(T Value)
Returns the number of bits needed to represent Value if Value is nonzero.
Definition bit.h:303
void * PointerTy
constexpr bool isPowerOf2_64(uint64_t Value)
Return true if the argument is a power of two > 0 (64 bit edition.)
Definition MathExtras.h:284
constexpr int popcount(T Value) noexcept
Count the number of set bits in a value.
Definition bit.h:154
uint64_t PowerOf2Ceil(uint64_t A)
Returns the power of two which is greater than or equal to the given value.
Definition MathExtras.h:385
LLVM_ABI std::optional< int64_t > getIConstantVRegSExtVal(Register VReg, const MachineRegisterInfo &MRI)
If VReg is defined by a G_CONSTANT fits in int64_t returns it.
Definition Utils.cpp:313
int countr_zero(T Val)
Count number of 0's from the least significant bit to the most stopping at the first 1.
Definition bit.h:202
constexpr bool has_single_bit(T Value) noexcept
Definition bit.h:147
std::function< bool(const LegalityQuery &)> LegalityPredicate
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
Definition MathExtras.h:279
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition Casting.h:547
MutableArrayRef(T &OneElt) -> MutableArrayRef< T >
constexpr T divideCeil(U Numerator, V Denominator)
Returns the integer ceil(Numerator / Denominator).
Definition MathExtras.h:394
To bit_cast(const From &from) noexcept
Definition bit.h:90
@ Mul
Product of integers.
@ FMul
Product of floats.
@ Sub
Subtraction of integers.
@ Add
Sum of integers.
uint64_t alignTo(uint64_t Size, Align A)
Returns a multiple of A needed to store Size bytes.
Definition Alignment.h:144
FunctionAddr VTableAddr Next
Definition InstrProf.h:141
DWARFExpression::Operation Op
ArrayRef(const T &OneElt) -> ArrayRef< T >
LLVM_ABI void eraseInstr(MachineInstr &MI, MachineRegisterInfo &MRI, LostDebugLocObserver *LocObserver=nullptr)
Definition Utils.cpp:1720
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:559
LLVM_ABI std::optional< ValueAndVReg > getIConstantVRegValWithLookThrough(Register VReg, const MachineRegisterInfo &MRI, bool LookThroughInstrs=true)
If VReg is defined by a statically evaluable chain of instructions rooted on a G_CONSTANT, returns its APInt value and defining register.
Definition Utils.cpp:432
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Definition STLExtras.h:1947
Align commonAlignment(Align A, uint64_t Offset)
Returns the alignment that satisfies both alignments.
Definition Alignment.h:201
unsigned Log2(Align A)
Returns the log2 of the alignment.
Definition Alignment.h:197
T bit_floor(T Value)
Returns the largest integral power of two no greater than Value if Value is nonzero.
Definition bit.h:330
constexpr uint64_t NextPowerOf2(uint64_t A)
Returns the next power of two (in 64-bits) that is strictly greater than A.
Definition MathExtras.h:373
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition BitVector.h:872
#define N
static constexpr uint64_t encode(Fields... Values)
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition Alignment.h:39
constexpr uint64_t value() const
This is a hole in the type system and should not be abused.
Definition Alignment.h:77
MCRegister getRegister() const
static ArgDescriptor createRegister(Register Reg, unsigned Mask=~0u)
DenormalModeKind Input
Denormal treatment kind for floating point instruction inputs in the default floating-point environme...
@ PreserveSign
The sign of a flushed-to-zero number is preserved in the sign of 0.
@ Dynamic
Denormals have unknown treatment.
static constexpr DenormalMode getPreserveSign()
static constexpr DenormalMode getIEEE()
bool isZero() const
Returns true if value is all zero.
Definition KnownBits.h:80
The LegalityQuery object bundles together all the information that's needed to decide whether a given...
ArrayRef< MemDesc > MMODescrs
Operations which require memory can use this to place requirements on the memory type for each MMO.
ArrayRef< LLT > Types
Matching combinators.
This class contains a discriminated union of information about pointers in memory operands,...
MachinePointerInfo getWithOffset(int64_t O) const
static LLVM_ABI MachinePointerInfo getGOT(MachineFunction &MF)
Return a MachinePointerInfo record that refers to a GOT entry.
DenormalMode FP64FP16Denormals
If this is set, neither input nor output denormals are flushed for both f64 and f16/v2f16 instructions.
bool IEEE
Floating point opcodes that support exception flag gathering quiet and propagate signaling NaN inputs per IEEE 754-2008.
DenormalMode FP32Denormals
If this is set, neither input or output denormals are flushed for most f32 instructions.