1//===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8/// \file
9/// This file implements the targeting of the MachineLegalizer class for
10/// AMDGPU.
11/// \todo This should be generated by TableGen.
12//===----------------------------------------------------------------------===//
13
14#include "AMDGPULegalizerInfo.h"
15
16#include "AMDGPU.h"
17#include "AMDGPUGlobalISelUtils.h"
18#include "AMDGPUInstrInfo.h"
19#include "AMDGPUTargetMachine.h"
20#include "GCNSubtarget.h"
21#include "SIInstrInfo.h"
22#include "SIMachineFunctionInfo.h"
23#include "SIRegisterInfo.h"
24#include "Utils/AMDGPUBaseInfo.h"
25#include "llvm/ADT/ScopeExit.h"
34#include "llvm/IR/IntrinsicsAMDGPU.h"
35#include "llvm/IR/IntrinsicsR600.h"
36
37#define DEBUG_TYPE "amdgpu-legalinfo"
38
39using namespace llvm;
40using namespace LegalizeActions;
41using namespace LegalizeMutations;
42using namespace LegalityPredicates;
43using namespace MIPatternMatch;
44
45// Hack until load/store selection patterns support any tuple of legal types.
46static cl::opt<bool> EnableNewLegality(
47 "amdgpu-global-isel-new-legality",
48 cl::desc("Use GlobalISel desired legality, rather than try to use "
49 "rules compatible with selection patterns"),
50 cl::init(false),
51 cl::ReallyHidden);
52
53static constexpr unsigned MaxRegisterSize = 1024;
54
55// Round the number of elements to the next power of two elements
56static LLT getPow2VectorType(LLT Ty) {
57 unsigned NElts = Ty.getNumElements();
58 unsigned Pow2NElts = 1 << Log2_32_Ceil(NElts);
59 return Ty.changeElementCount(ElementCount::getFixed(Pow2NElts));
60}
61
62// Round the number of bits to the next power of two bits
63static LLT getPow2ScalarType(LLT Ty) {
64 unsigned Bits = Ty.getSizeInBits();
65 unsigned Pow2Bits = 1 << Log2_32_Ceil(Bits);
66 return LLT::scalar(Pow2Bits);
67}
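
// Worked examples for the two rounding helpers above (illustrative sketch;
// nextPow2Sketch is a local stand-in for 1 << Log2_32_Ceil(N)): a 48-bit type
// such as <3 x s16> rounds up to s64, and a 3-element vector gains a fourth
// element.
static constexpr unsigned nextPow2Sketch(unsigned V) {
  unsigned P = 1;
  while (P < V)
    P *= 2;
  return P;
}
static_assert(nextPow2Sketch(48) == 64, "s48 (<3 x s16>) rounds up to s64");
static_assert(nextPow2Sketch(3) == 4, "3 elements round up to 4 elements");
static_assert(nextPow2Sketch(96) == 128, "s96 rounds up to s128");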
68
69/// \returns true if this is an odd sized vector which should widen by adding an
70/// additional element. This is mostly to handle <3 x s16> -> <4 x s16>. This
71/// excludes s1 vectors, which should always be scalarized.
72static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
73 return [=](const LegalityQuery &Query) {
74 const LLT Ty = Query.Types[TypeIdx];
75 if (!Ty.isVector())
76 return false;
77
78 const LLT EltTy = Ty.getElementType();
79 const unsigned EltSize = EltTy.getSizeInBits();
80 return Ty.getNumElements() % 2 != 0 &&
81 EltSize > 1 && EltSize < 32 &&
82 Ty.getSizeInBits() % 32 != 0;
83 };
84}
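
// Illustrative examples for the predicate above: <3 x s16> (48 bits) is the
// canonical match and is widened to <4 x s16>, while <2 x s16> and <4 x s16>
// are already 32-bit multiples, and <N x s1> vectors are scalarized instead.
[[maybe_unused]] static void smallOddVectorExamples() {
  const LLT V3x16 = LLT::fixed_vector(3, 16);
  assert(V3x16.getNumElements() % 2 != 0 && V3x16.getSizeInBits() % 32 != 0 &&
         "matches: widened by one element");
  assert(LLT::fixed_vector(4, 16).getSizeInBits() % 32 == 0 &&
         "does not match: already a register-sized type");
}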
85
86static LegalityPredicate sizeIsMultipleOf32(unsigned TypeIdx) {
87 return [=](const LegalityQuery &Query) {
88 const LLT Ty = Query.Types[TypeIdx];
89 return Ty.getSizeInBits() % 32 == 0;
90 };
91}
92
93static LegalityPredicate isWideVec16(unsigned TypeIdx) {
94 return [=](const LegalityQuery &Query) {
95 const LLT Ty = Query.Types[TypeIdx];
96 const LLT EltTy = Ty.getScalarType();
97 return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2;
98 };
99}
100
101static LegalizeMutation oneMoreElement(unsigned TypeIdx) {
102 return [=](const LegalityQuery &Query) {
103 const LLT Ty = Query.Types[TypeIdx];
104 const LLT EltTy = Ty.getElementType();
105 return std::pair(TypeIdx,
106 LLT::fixed_vector(Ty.getNumElements() + 1, EltTy));
107 };
108}
109
109
110static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) {
111 return [=](const LegalityQuery &Query) {
112 const LLT Ty = Query.Types[TypeIdx];
113 const LLT EltTy = Ty.getElementType();
114 unsigned Size = Ty.getSizeInBits();
115 unsigned Pieces = (Size + 63) / 64;
116 unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
117 return std::pair(TypeIdx, LLT::scalarOrVector(
118 ElementCount::getFixed(NewNumElts), EltTy));
119 };
120}
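
// Worked example of the split arithmetic above: a <16 x s32> value is 512 bits
// and is carved into (512 + 63) / 64 == 8 pieces of (16 + 1) / 8 == 2 elements
// each, i.e. 64-bit <2 x s32> chunks; likewise <8 x s16> (128 bits) becomes
// two <4 x s16> chunks.
static_assert((512 + 63) / 64 == 8 && (16 + 1) / 8 == 2,
              "<16 x s32> splits into <2 x s32> pieces");
static_assert((128 + 63) / 64 == 2 && (8 + 1) / 2 == 4,
              "<8 x s16> splits into <4 x s16> pieces");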
121
122// Increase the number of vector elements to reach the next multiple of 32-bit
123// type.
124static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) {
125 return [=](const LegalityQuery &Query) {
126 const LLT Ty = Query.Types[TypeIdx];
127
128 const LLT EltTy = Ty.getElementType();
129 const int Size = Ty.getSizeInBits();
130 const int EltSize = EltTy.getSizeInBits();
131 const int NextMul32 = (Size + 31) / 32;
132
133 assert(EltSize < 32);
134
135 const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize;
136 return std::pair(TypeIdx, LLT::fixed_vector(NewNumElts, EltTy));
137 };
138}
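
// Worked example of the rounding above: <3 x s8> is 24 bits, so it needs
// (24 + 31) / 32 == 1 dword and (32 * 1 + 7) / 8 == 4 elements, i.e. it grows
// to <4 x s8>; <5 x s16> (80 bits) needs 3 dwords and grows to <6 x s16>.
static_assert((32 * ((24 + 31) / 32) + 8 - 1) / 8 == 4,
              "<3 x s8> widens to <4 x s8>");
static_assert((32 * ((80 + 31) / 32) + 16 - 1) / 16 == 6,
              "<5 x s16> widens to <6 x s16>");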
139
140// Increase the number of vector elements to reach the next legal RegClass.
141static LegalizeMutation moreElementsToNextExistingRegClass(unsigned TypeIdx) {
142 return [=](const LegalityQuery &Query) {
143 const LLT Ty = Query.Types[TypeIdx];
144 const unsigned NumElts = Ty.getNumElements();
145 const unsigned EltSize = Ty.getElementType().getSizeInBits();
146 const unsigned MaxNumElts = MaxRegisterSize / EltSize;
147
148 assert(EltSize == 32 || EltSize == 64);
150
151 unsigned NewNumElts;
152 // Find the nearest legal RegClass that is larger than the current type.
153 for (NewNumElts = NumElts; NewNumElts < MaxNumElts; ++NewNumElts) {
154 if (SIRegisterInfo::getSGPRClassForBitWidth(NewNumElts * EltSize))
155 break;
156 }
157
158 return std::pair(TypeIdx, LLT::fixed_vector(NewNumElts, EltSize));
159 };
160}
161
162static LLT getBufferRsrcScalarType(const LLT Ty) {
163 if (!Ty.isVector())
164 return LLT::scalar(128);
165 const ElementCount NumElems = Ty.getElementCount();
166 return LLT::vector(NumElems, LLT::scalar(128));
167}
168
169static LLT getBufferRsrcRegisterType(const LLT Ty) {
170 if (!Ty.isVector())
171 return LLT::fixed_vector(4, LLT::scalar(32));
172 const unsigned NumElems = Ty.getElementCount().getFixedValue();
173 return LLT::fixed_vector(NumElems * 4, LLT::scalar(32));
174}
175
176static LLT getBitcastRegisterType(const LLT Ty) {
177 const unsigned Size = Ty.getSizeInBits();
178
179 if (Size <= 32) {
180 // <2 x s8> -> s16
181 // <4 x s8> -> s32
182 return LLT::scalar(Size);
183 }
184
185 return LLT::scalarOrVector(ElementCount::getFixed(Size / 32), 32);
186}
187
188static LegalizeMutation bitcastToRegisterType(unsigned TypeIdx) {
189 return [=](const LegalityQuery &Query) {
190 const LLT Ty = Query.Types[TypeIdx];
191 return std::pair(TypeIdx, getBitcastRegisterType(Ty));
192 };
193}
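
// Illustrative examples for the bitcast mapping above, matching the cases
// listed in the comment: sub-dword vectors collapse to a single scalar, and
// anything larger becomes a multiple of s32.
[[maybe_unused]] static void bitcastRegisterTypeExamples() {
  assert(getBitcastRegisterType(LLT::fixed_vector(2, 8)) == LLT::scalar(16));
  assert(getBitcastRegisterType(LLT::fixed_vector(4, 8)) == LLT::scalar(32));
  assert(getBitcastRegisterType(LLT::fixed_vector(8, 8)) ==
         LLT::fixed_vector(2, 32));
}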
194
195static LegalizeMutation bitcastToVectorElement32(unsigned TypeIdx) {
196 return [=](const LegalityQuery &Query) {
197 const LLT Ty = Query.Types[TypeIdx];
198 unsigned Size = Ty.getSizeInBits();
199 assert(Size % 32 == 0);
200 return std::pair(
201 TypeIdx, LLT::scalarOrVector(ElementCount::getFixed(Size / 32), 32));
202 };
203}
204
205static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) {
206 return [=](const LegalityQuery &Query) {
207 const LLT QueryTy = Query.Types[TypeIdx];
208 return QueryTy.isVector() && QueryTy.getSizeInBits() < Size;
209 };
210}
211
212static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) {
213 return [=](const LegalityQuery &Query) {
214 const LLT QueryTy = Query.Types[TypeIdx];
215 return QueryTy.isVector() && QueryTy.getSizeInBits() > Size;
216 };
217}
218
219static LegalityPredicate numElementsNotEven(unsigned TypeIdx) {
220 return [=](const LegalityQuery &Query) {
221 const LLT QueryTy = Query.Types[TypeIdx];
222 return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0;
223 };
224}
225
226static bool isRegisterSize(unsigned Size) {
227 return Size % 32 == 0 && Size <= MaxRegisterSize;
228}
229
230static bool isRegisterVectorElementType(LLT EltTy) {
231 const int EltSize = EltTy.getSizeInBits();
232 return EltSize == 16 || EltSize % 32 == 0;
233}
234
235static bool isRegisterVectorType(LLT Ty) {
236 const int EltSize = Ty.getElementType().getSizeInBits();
237 return EltSize == 32 || EltSize == 64 ||
238 (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
239 EltSize == 128 || EltSize == 256;
240}
241
242// TODO: replace all uses of isRegisterType with isRegisterClassType
243static bool isRegisterType(LLT Ty) {
244 if (!isRegisterSize(Ty.getSizeInBits()))
245 return false;
246
247 if (Ty.isVector())
248 return isRegisterVectorType(Ty);
249
250 return true;
251}
252
253// Any combination of 32 or 64-bit elements up to the maximum register size, and
254// multiples of v2s16.
255static LegalityPredicate isRegisterType(unsigned TypeIdx) {
256 return [=](const LegalityQuery &Query) {
257 return isRegisterType(Query.Types[TypeIdx]);
258 };
259}
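
// Illustrative examples: dword-multiple scalars up to 1024 bits and even-width
// 16-bit vectors qualify; a 48-bit <3 x s16> or an over-wide scalar does not.
[[maybe_unused]] static void registerTypeExamples() {
  assert(isRegisterType(LLT::scalar(96)));           // s96: three dwords.
  assert(isRegisterType(LLT::fixed_vector(4, 16)));  // v4s16: 64 bits.
  assert(!isRegisterType(LLT::fixed_vector(3, 16))); // 48 bits, not a dword multiple.
  assert(!isRegisterType(LLT::scalar(2048)));        // Above MaxRegisterSize.
}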
260
261// RegisterType that doesn't have a corresponding RegClass.
262// TODO: Once `isRegisterType` is replaced with `isRegisterClassType` this
263// should be removed.
264static LegalityPredicate isIllegalRegisterType(unsigned TypeIdx) {
265 return [=](const LegalityQuery &Query) {
266 LLT Ty = Query.Types[TypeIdx];
267 return isRegisterType(Ty) &&
268 !SIRegisterInfo::getSGPRClassForBitWidth(Ty.getSizeInBits());
269 };
270}
271
272static LegalityPredicate elementTypeIsLegal(unsigned TypeIdx) {
273 return [=](const LegalityQuery &Query) {
274 const LLT QueryTy = Query.Types[TypeIdx];
275 if (!QueryTy.isVector())
276 return false;
277 const LLT EltTy = QueryTy.getElementType();
278 return EltTy == LLT::scalar(16) || EltTy.getSizeInBits() >= 32;
279 };
280}
281
282static const LLT S1 = LLT::scalar(1);
283static const LLT S8 = LLT::scalar(8);
284static const LLT S16 = LLT::scalar(16);
285static const LLT S32 = LLT::scalar(32);
286static const LLT S64 = LLT::scalar(64);
287static const LLT S96 = LLT::scalar(96);
288static const LLT S128 = LLT::scalar(128);
289static const LLT S160 = LLT::scalar(160);
290static const LLT S224 = LLT::scalar(224);
291static const LLT S256 = LLT::scalar(256);
292static const LLT S512 = LLT::scalar(512);
294
295static const LLT V2S8 = LLT::fixed_vector(2, 8);
296static const LLT V2S16 = LLT::fixed_vector(2, 16);
297static const LLT V4S16 = LLT::fixed_vector(4, 16);
298static const LLT V6S16 = LLT::fixed_vector(6, 16);
299static const LLT V8S16 = LLT::fixed_vector(8, 16);
300static const LLT V10S16 = LLT::fixed_vector(10, 16);
301static const LLT V12S16 = LLT::fixed_vector(12, 16);
302static const LLT V16S16 = LLT::fixed_vector(16, 16);
303
304static const LLT V2S32 = LLT::fixed_vector(2, 32);
305static const LLT V3S32 = LLT::fixed_vector(3, 32);
306static const LLT V4S32 = LLT::fixed_vector(4, 32);
307static const LLT V5S32 = LLT::fixed_vector(5, 32);
308static const LLT V6S32 = LLT::fixed_vector(6, 32);
309static const LLT V7S32 = LLT::fixed_vector(7, 32);
310static const LLT V8S32 = LLT::fixed_vector(8, 32);
311static const LLT V9S32 = LLT::fixed_vector(9, 32);
312static const LLT V10S32 = LLT::fixed_vector(10, 32);
313static const LLT V11S32 = LLT::fixed_vector(11, 32);
314static const LLT V12S32 = LLT::fixed_vector(12, 32);
315static const LLT V16S32 = LLT::fixed_vector(16, 32);
316static const LLT V32S32 = LLT::fixed_vector(32, 32);
317
318static const LLT V2S64 = LLT::fixed_vector(2, 64);
319static const LLT V3S64 = LLT::fixed_vector(3, 64);
320static const LLT V4S64 = LLT::fixed_vector(4, 64);
321static const LLT V5S64 = LLT::fixed_vector(5, 64);
322static const LLT V6S64 = LLT::fixed_vector(6, 64);
323static const LLT V7S64 = LLT::fixed_vector(7, 64);
324static const LLT V8S64 = LLT::fixed_vector(8, 64);
325static const LLT V16S64 = LLT::fixed_vector(16, 64);
326
327static const LLT V2S128 = LLT::fixed_vector(2, 128);
328static const LLT V4S128 = LLT::fixed_vector(4, 128);
329
330static std::initializer_list<LLT> AllScalarTypes = {S32, S64, S96, S128,
331 S160, S224, S256, S512};
332
333static std::initializer_list<LLT> AllS16Vectors{
334 V2S16, V4S16, V6S16, V8S16, V10S16, V12S16, V16S16, V2S128, V4S128};
335
336static std::initializer_list<LLT> AllS32Vectors = {
337 V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32, V9S32,
338 V10S32, V11S32, V12S32, V16S32, V32S32};
339
340static std::initializer_list<LLT> AllS64Vectors = {V2S64, V3S64, V4S64, V5S64,
341 V6S64, V7S64, V8S64, V16S64};
342
343// Checks whether a type is in the list of legal register types.
344static bool isRegisterClassType(LLT Ty) {
345 if (Ty.isPointerOrPointerVector())
346 Ty = Ty.changeElementType(LLT::scalar(Ty.getScalarSizeInBits()));
347
348 return is_contained(AllS32Vectors, Ty) || is_contained(AllS64Vectors, Ty) ||
349 is_contained(AllScalarTypes, Ty) || is_contained(AllS16Vectors, Ty);
350}
351
352static LegalityPredicate isRegisterClassType(unsigned TypeIdx) {
353 return [TypeIdx](const LegalityQuery &Query) {
354 return isRegisterClassType(Query.Types[TypeIdx]);
355 };
356}
357
358// If we have a truncating store or an extending load with a data size larger
359// than 32-bits, we need to reduce to a 32-bit type.
360static LegalityPredicate isWideScalarExtLoadTruncStore(unsigned TypeIdx) {
361 return [=](const LegalityQuery &Query) {
362 const LLT Ty = Query.Types[TypeIdx];
363 return !Ty.isVector() && Ty.getSizeInBits() > 32 &&
364 Query.MMODescrs[0].MemoryTy.getSizeInBits() < Ty.getSizeInBits();
365 };
366}
367
368// TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we
369// handle some operations by just promoting the register during
370// selection. There are also d16 loads on GFX9+ which preserve the high bits.
371static unsigned maxSizeForAddrSpace(const GCNSubtarget &ST, unsigned AS,
372 bool IsLoad, bool IsAtomic) {
373 switch (AS) {
374 case AMDGPUAS::PRIVATE_ADDRESS:
375 // FIXME: Private element size.
376 return ST.enableFlatScratch() ? 128 : 32;
377 case AMDGPUAS::LOCAL_ADDRESS:
378 return ST.useDS128() ? 128 : 64;
383 // Treat constant and global as identical. SMRD loads are sometimes usable for
384 // global loads (ideally constant address space should be eliminated)
385 // depending on the context. Legality cannot be context dependent, but
386 // RegBankSelect can split the load as necessary depending on the pointer
387 // register bank/uniformity and if the memory is invariant or not written in a
388 // kernel.
389 return IsLoad ? 512 : 128;
390 default:
391 // FIXME: Flat addresses may contextually need to be split to 32-bit parts
392 // if they may alias scratch depending on the subtarget. This needs to be
393 // moved to custom handling to use addressMayBeAccessedAsPrivate
394 return ST.hasMultiDwordFlatScratchAddressing() || IsAtomic ? 128 : 32;
395 }
396}
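
// Simplified sketch of the size table above with the subtarget queries
// replaced by plain booleans (names invented for illustration); values are in
// bits and only the common address spaces are covered.
[[maybe_unused]] static unsigned
maxSizeForAddrSpaceSketch(unsigned AS, bool IsLoad, bool IsAtomic,
                          bool FlatScratch, bool DS128, bool MultiDwordFlat) {
  if (AS == AMDGPUAS::PRIVATE_ADDRESS)
    return FlatScratch ? 128 : 32;
  if (AS == AMDGPUAS::LOCAL_ADDRESS)
    return DS128 ? 128 : 64;
  if (AS == AMDGPUAS::GLOBAL_ADDRESS || AS == AMDGPUAS::CONSTANT_ADDRESS)
    return IsLoad ? 512 : 128;
  return MultiDwordFlat || IsAtomic ? 128 : 32;
}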
397
398static bool isLoadStoreSizeLegal(const GCNSubtarget &ST,
399 const LegalityQuery &Query) {
400 const LLT Ty = Query.Types[0];
401
402 // Handle G_LOAD, G_ZEXTLOAD, G_SEXTLOAD
403 const bool IsLoad = Query.Opcode != AMDGPU::G_STORE;
404
405 unsigned RegSize = Ty.getSizeInBits();
406 uint64_t MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
407 uint64_t AlignBits = Query.MMODescrs[0].AlignInBits;
408 unsigned AS = Query.Types[1].getAddressSpace();
409
410 // All of these need to be custom lowered to cast the pointer operand.
411 if (AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT)
412 return false;
413
414 // Do not handle extending vector loads.
415 if (Ty.isVector() && MemSize != RegSize)
416 return false;
417
418 // TODO: We should be able to widen loads if the alignment is high enough, but
419 // we also need to modify the memory access size.
420#if 0
421 // Accept widening loads based on alignment.
422 if (IsLoad && MemSize < Size)
423 MemSize = std::max(MemSize, Align);
424#endif
425
426 // Only 1-byte and 2-byte to 32-bit extloads are valid.
427 if (MemSize != RegSize && RegSize != 32)
428 return false;
429
430 if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad,
431 Query.MMODescrs[0].Ordering !=
432 AtomicOrdering::NotAtomic))
433 return false;
434
435 switch (MemSize) {
436 case 8:
437 case 16:
438 case 32:
439 case 64:
440 case 128:
441 break;
442 case 96:
443 if (!ST.hasDwordx3LoadStores())
444 return false;
445 break;
446 case 256:
447 case 512:
448 // These may contextually need to be broken down.
449 break;
450 default:
451 return false;
452 }
453
454 assert(RegSize >= MemSize);
455
456 if (AlignBits < MemSize) {
457 const SITargetLowering *TLI = ST.getTargetLowering();
458 if (!TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS,
459 Align(AlignBits / 8)))
460 return false;
461 }
462
463 return true;
464}
465
466// The newer buffer intrinsic forms take their resource arguments as
467// pointers in address space 8, aka s128 values. However, in order to not break
468// SelectionDAG, the underlying operations have to continue to take v4i32
469// arguments. Therefore, we convert resource pointers (or vectors of them)
470// to integer values here.
471static bool hasBufferRsrcWorkaround(const LLT Ty) {
472 if (Ty.isPointer() && Ty.getAddressSpace() == AMDGPUAS::BUFFER_RESOURCE)
473 return true;
474 if (Ty.isVector()) {
475 const LLT ElemTy = Ty.getElementType();
476 return hasBufferRsrcWorkaround(ElemTy);
477 }
478 return false;
479}
480
481// The current selector can't handle <6 x s16>, <8 x s16>, s96, s128 etc, so
482// work around this. Eventually it should ignore the type for loads and only care
483// about the size. Return true in cases where we will workaround this for now by
484// bitcasting.
485static bool loadStoreBitcastWorkaround(const LLT Ty) {
486 if (EnableNewLegality)
487 return false;
488
489 const unsigned Size = Ty.getSizeInBits();
490 if (Size <= 64)
491 return false;
492 // Address space 8 pointers get their own workaround.
493 if (hasBufferRsrcWorkaround(Ty))
494 return false;
495 if (!Ty.isVector())
496 return true;
497
498 if (Ty.isPointerVector())
499 return true;
500
501 unsigned EltSize = Ty.getScalarSizeInBits();
502 return EltSize != 32 && EltSize != 64;
503}
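
// Illustrative examples: s96 and <6 x s16> (both 96 bits) take the bitcast
// workaround, while vectors of 32- or 64-bit elements and anything of 64 bits
// or less are handled directly.
[[maybe_unused]] static void loadStoreBitcastWorkaroundExamples() {
  assert(loadStoreBitcastWorkaround(LLT::scalar(96)));
  assert(loadStoreBitcastWorkaround(LLT::fixed_vector(6, 16)));
  assert(!loadStoreBitcastWorkaround(LLT::fixed_vector(4, 32)));
  assert(!loadStoreBitcastWorkaround(LLT::scalar(64)));
}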
504
505static bool isLoadStoreLegal(const GCNSubtarget &ST, const LegalityQuery &Query) {
506 const LLT Ty = Query.Types[0];
507 return isRegisterType(Ty) && isLoadStoreSizeLegal(ST, Query) &&
508 !hasBufferRsrcWorkaround(Ty) && !loadStoreBitcastWorkaround(Ty);
509}
510
511/// Return true if a load or store of the type should be lowered with a bitcast
512/// to a different type.
513static bool shouldBitcastLoadStoreType(const GCNSubtarget &ST, const LLT Ty,
514 const LLT MemTy) {
515 const unsigned MemSizeInBits = MemTy.getSizeInBits();
516 const unsigned Size = Ty.getSizeInBits();
517 if (Size != MemSizeInBits)
518 return Size <= 32 && Ty.isVector();
519
520 if (loadStoreBitcastWorkaround(Ty) && isRegisterType(Ty))
521 return true;
522
523 // Don't try to handle bitcasting vector ext loads for now.
524 return Ty.isVector() && (!MemTy.isVector() || MemTy == Ty) &&
525 (Size <= 32 || isRegisterSize(Size)) &&
526 !isRegisterVectorElementType(Ty.getElementType());
527}
528
529/// Return true if we should legalize a load by widening an odd sized memory
530/// access up to the alignment. Note that in this case the memory access itself
531/// changes, not the size of the result register.
532static bool shouldWidenLoad(const GCNSubtarget &ST, LLT MemoryTy,
533 uint64_t AlignInBits, unsigned AddrSpace,
534 unsigned Opcode) {
535 unsigned SizeInBits = MemoryTy.getSizeInBits();
536 // We don't want to widen cases that are naturally legal.
537 if (isPowerOf2_32(SizeInBits))
538 return false;
539
540 // If we have 96-bit memory operations, we shouldn't touch them. Note we may
541 // end up widening these for a scalar load during RegBankSelect, if we don't
542 // have 96-bit scalar loads.
543 if (SizeInBits == 96 && ST.hasDwordx3LoadStores())
544 return false;
545
546 if (SizeInBits >= maxSizeForAddrSpace(ST, AddrSpace, Opcode, false))
547 return false;
548
549 // A load is known dereferenceable up to the alignment, so it's legal to widen
550 // to it.
551 //
552 // TODO: Could check dereferenceable for less aligned cases.
553 unsigned RoundedSize = NextPowerOf2(SizeInBits);
554 if (AlignInBits < RoundedSize)
555 return false;
556
557 // Do not widen if it would introduce a slow unaligned load.
558 const SITargetLowering *TLI = ST.getTargetLowering();
559 unsigned Fast = 0;
560 return TLI->allowsMisalignedMemoryAccessesImpl(
561 RoundedSize, AddrSpace, Align(AlignInBits / 8),
562 MachineMemOperand::MOLoad, &Fast) &&
563 Fast;
564}
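
// Worked example for the widening rule above: an s96 access is not a power of
// two, so it is a widening candidate; with 128-bit alignment the rounded-up
// 128-bit access is still within the known-dereferenceable region, while with
// only 32-bit alignment it is left for the splitting path instead.
static_assert((96 & (96 - 1)) != 0, "96 bits is not a power of two");
static_assert((128 & (128 - 1)) == 0 && 128 >= 96,
              "NextPowerOf2(96) is 128, which a 128-bit alignment covers");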
565
566static bool shouldWidenLoad(const GCNSubtarget &ST, const LegalityQuery &Query,
567 unsigned Opcode) {
568 if (Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic)
569 return false;
570
571 return shouldWidenLoad(ST, Query.MMODescrs[0].MemoryTy,
572 Query.MMODescrs[0].AlignInBits,
573 Query.Types[1].getAddressSpace(), Opcode);
574}
575
576/// Mutates IR (typically a load instruction) to use a <4 x s32> as the initial
577/// type of the operand `idx` and then to transform it to a `p8` via bitcasts
578/// and inttoptr. In addition, handle vectors of p8. Returns the new type.
579static LLT castBufferRsrcFromV4I32(MachineInstr &MI, MachineIRBuilder &B,
580 MachineRegisterInfo &MRI, unsigned Idx) {
581 MachineOperand &MO = MI.getOperand(Idx);
582
583 const LLT PointerTy = MRI.getType(MO.getReg());
584
585 // Paranoidly prevent us from doing this multiple times.
586 if (!hasBufferRsrcWorkaround(PointerTy))
587 return PointerTy;
588
589 const LLT ScalarTy = getBufferRsrcScalarType(PointerTy);
590 const LLT VectorTy = getBufferRsrcRegisterType(PointerTy);
591 if (!PointerTy.isVector()) {
592 // Happy path: (4 x s32) -> (s32, s32, s32, s32) -> (p8)
593 const unsigned NumParts = PointerTy.getSizeInBits() / 32;
594 const LLT S32 = LLT::scalar(32);
595
596 Register VectorReg = MRI.createGenericVirtualRegister(VectorTy);
597 std::array<Register, 4> VectorElems;
598 B.setInsertPt(B.getMBB(), ++B.getInsertPt());
599 for (unsigned I = 0; I < NumParts; ++I)
600 VectorElems[I] =
601 B.buildExtractVectorElementConstant(S32, VectorReg, I).getReg(0);
602 B.buildMergeValues(MO, VectorElems);
603 MO.setReg(VectorReg);
604 return VectorTy;
605 }
606 Register BitcastReg = MRI.createGenericVirtualRegister(VectorTy);
607 B.setInsertPt(B.getMBB(), ++B.getInsertPt());
608 auto Scalar = B.buildBitcast(ScalarTy, BitcastReg);
609 B.buildIntToPtr(MO, Scalar);
610 MO.setReg(BitcastReg);
611
612 return VectorTy;
613}
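
// Rough sketch of the rewrite above in generic MIR terms (register names
// invented for illustration). A definition of a buffer resource such as
//
//   %rsrc:_(p8) = G_LOAD %ptr(p4) :: (load (s128), ...)
//
// is retyped so the defining instruction produces <4 x s32>, and the original
// p8 value is immediately rebuilt from its four dwords:
//
//   %vec:_(<4 x s32>) = G_LOAD %ptr(p4) :: (load (s128), ...)
//   %e0:_(s32) = G_EXTRACT_VECTOR_ELT %vec, %c0   ; %c0..%c3 are G_CONSTANTs
//   ...
//   %rsrc:_(p8) = G_MERGE_VALUES %e0, %e1, %e2, %e3
//
// Vectors of p8 instead go through a G_BITCAST to a wide scalar followed by
// G_INTTOPTR, as in the second path of the function above.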
614
615/// Cast a buffer resource (an address space 8 pointer) into a 4xi32, which is
616/// the form in which the value must be in order to be passed to the low-level
617/// representations used for MUBUF/MTBUF intrinsics. This is a hack, which is
618/// needed in order to account for the fact that we can't define a register
619/// class for s128 without breaking SelectionDAG.
620static Register castBufferRsrcToV4I32(Register Pointer, MachineIRBuilder &B) {
621 MachineRegisterInfo &MRI = *B.getMRI();
622 const LLT PointerTy = MRI.getType(Pointer);
623 const LLT ScalarTy = getBufferRsrcScalarType(PointerTy);
624 const LLT VectorTy = getBufferRsrcRegisterType(PointerTy);
625
626 if (!PointerTy.isVector()) {
627 // Special case: p8 -> (s32, s32, s32, s32) -> (4xs32)
628 SmallVector<Register, 4> PointerParts;
629 const unsigned NumParts = PointerTy.getSizeInBits() / 32;
630 auto Unmerged = B.buildUnmerge(LLT::scalar(32), Pointer);
631 for (unsigned I = 0; I < NumParts; ++I)
632 PointerParts.push_back(Unmerged.getReg(I));
633 return B.buildBuildVector(VectorTy, PointerParts).getReg(0);
634 }
635 Register Scalar = B.buildPtrToInt(ScalarTy, Pointer).getReg(0);
636 return B.buildBitcast(VectorTy, Scalar).getReg(0);
637}
638
639static void castBufferRsrcArgToV4I32(MachineInstr &MI, MachineIRBuilder &B,
640 unsigned Idx) {
641 MachineOperand &MO = MI.getOperand(Idx);
642
643 const LLT PointerTy = B.getMRI()->getType(MO.getReg());
644 // Paranoidly prevent us from doing this multiple times.
645 if (!hasBufferRsrcWorkaround(PointerTy))
646 return;
647 MO.setReg(castBufferRsrcToV4I32(MO.getReg(), B));
648}
649
650AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
651 const GCNTargetMachine &TM)
652 : ST(ST_) {
653 using namespace TargetOpcode;
654
655 auto GetAddrSpacePtr = [&TM](unsigned AS) {
656 return LLT::pointer(AS, TM.getPointerSizeInBits(AS));
657 };
658
659 const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS);
660 const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS);
661 const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT);
662 const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS);
663 const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS);
664 const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS);
665 const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS);
666 const LLT BufferFatPtr = GetAddrSpacePtr(AMDGPUAS::BUFFER_FAT_POINTER);
667 const LLT RsrcPtr = GetAddrSpacePtr(AMDGPUAS::BUFFER_RESOURCE);
668 const LLT BufferStridedPtr =
669 GetAddrSpacePtr(AMDGPUAS::BUFFER_STRIDED_POINTER);
670
671 const LLT CodePtr = FlatPtr;
672
673 const std::initializer_list<LLT> AddrSpaces64 = {
674 GlobalPtr, ConstantPtr, FlatPtr
675 };
676
677 const std::initializer_list<LLT> AddrSpaces32 = {
678 LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr
679 };
680
681 const std::initializer_list<LLT> AddrSpaces128 = {RsrcPtr};
682
683 const std::initializer_list<LLT> FPTypesBase = {
684 S32, S64
685 };
686
687 const std::initializer_list<LLT> FPTypes16 = {
688 S32, S64, S16
689 };
690
691 const std::initializer_list<LLT> FPTypesPK16 = {
692 S32, S64, S16, V2S16
693 };
694
695 const LLT MinScalarFPTy = ST.has16BitInsts() ? S16 : S32;
696
697 // s1 for VCC branches, s32 for SCC branches.
698 getActionDefinitionsBuilder(G_BRCOND).legalFor({S1, S32});
699
700 // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more
701 // elements for v3s16
702 getActionDefinitionsBuilder(G_PHI)
703 .legalFor({S32, S64, V2S16, S16, V4S16, S1, S128, S256})
704 .legalFor(AllS32Vectors)
705 .legalFor(AllS64Vectors)
706 .legalFor(AddrSpaces64)
707 .legalFor(AddrSpaces32)
708 .legalFor(AddrSpaces128)
709 .legalIf(isPointer(0))
710 .clampScalar(0, S16, S256)
712 .clampMaxNumElements(0, S32, 16)
714 .scalarize(0);
715
716 if (ST.hasVOP3PInsts() && ST.hasAddNoCarry() && ST.hasIntClamp()) {
717 // Full set of gfx9 features.
718 if (ST.hasScalarAddSub64()) {
719 getActionDefinitionsBuilder({G_ADD, G_SUB})
720 .legalFor({S64, S32, S16, V2S16})
721 .clampMaxNumElementsStrict(0, S16, 2)
722 .scalarize(0)
723 .minScalar(0, S16)
725 .maxScalar(0, S32);
726 } else {
727 getActionDefinitionsBuilder({G_ADD, G_SUB})
728 .legalFor({S32, S16, V2S16})
729 .clampMaxNumElementsStrict(0, S16, 2)
730 .scalarize(0)
731 .minScalar(0, S16)
733 .maxScalar(0, S32);
734 }
735
736 if (ST.hasScalarSMulU64()) {
737 getActionDefinitionsBuilder(G_MUL)
738 .legalFor({S64, S32, S16, V2S16})
739 .clampMaxNumElementsStrict(0, S16, 2)
740 .scalarize(0)
741 .minScalar(0, S16)
743 .custom();
744 } else {
745 getActionDefinitionsBuilder(G_MUL)
746 .legalFor({S32, S16, V2S16})
747 .clampMaxNumElementsStrict(0, S16, 2)
748 .scalarize(0)
749 .minScalar(0, S16)
751 .custom();
752 }
753 assert(ST.hasMad64_32());
754
755 getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT, G_SADDSAT, G_SSUBSAT})
756 .legalFor({S32, S16, V2S16}) // Clamp modifier
757 .minScalarOrElt(0, S16)
759 .scalarize(0)
761 .lower();
762 } else if (ST.has16BitInsts()) {
763 getActionDefinitionsBuilder({G_ADD, G_SUB})
764 .legalFor({S32, S16})
765 .minScalar(0, S16)
767 .maxScalar(0, S32)
768 .scalarize(0);
769
770 getActionDefinitionsBuilder(G_MUL)
771 .legalFor({S32, S16})
772 .scalarize(0)
773 .minScalar(0, S16)
774 .widenScalarToNextMultipleOf(0, 32)
775 .custom();
776 assert(ST.hasMad64_32());
777
778 // Technically the saturating operations require clamp bit support, but this
779 // was introduced at the same time as 16-bit operations.
780 getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
781 .legalFor({S32, S16}) // Clamp modifier
782 .minScalar(0, S16)
783 .scalarize(0)
785 .lower();
786
787 // We're just lowering this, but it helps get a better result to try to
788 // coerce to the desired type first.
789 getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
790 .minScalar(0, S16)
791 .scalarize(0)
792 .lower();
793 } else {
794 getActionDefinitionsBuilder({G_ADD, G_SUB})
795 .legalFor({S32})
796 .widenScalarToNextMultipleOf(0, 32)
797 .clampScalar(0, S32, S32)
798 .scalarize(0);
799
800 auto &Mul = getActionDefinitionsBuilder(G_MUL)
801 .legalFor({S32})
802 .scalarize(0)
803 .minScalar(0, S32)
804 .widenScalarToNextMultipleOf(0, 32);
805
806 if (ST.hasMad64_32())
807 Mul.custom();
808 else
809 Mul.maxScalar(0, S32);
810
811 if (ST.hasIntClamp()) {
812 getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
813 .legalFor({S32}) // Clamp modifier.
814 .scalarize(0)
815 .minScalarOrElt(0, S32)
816 .lower();
817 } else {
818 // Clamp bit support was added in VI, along with 16-bit operations.
819 getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
820 .minScalar(0, S32)
821 .scalarize(0)
822 .lower();
823 }
824
825 // FIXME: DAG expansion gets better results. The widening uses the smaller
826 // range values and goes for the min/max lowering directly.
827 getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
828 .minScalar(0, S32)
829 .scalarize(0)
830 .lower();
831 }
832
833 getActionDefinitionsBuilder(
834 {G_SDIV, G_UDIV, G_SREM, G_UREM, G_SDIVREM, G_UDIVREM})
835 .customFor({S32, S64})
836 .clampScalar(0, S32, S64)
838 .scalarize(0);
839
840 auto &Mulh = getActionDefinitionsBuilder({G_UMULH, G_SMULH})
841 .legalFor({S32})
842 .maxScalar(0, S32);
843
844 if (ST.hasVOP3PInsts()) {
845 Mulh
846 .clampMaxNumElements(0, S8, 2)
847 .lowerFor({V2S8});
848 }
849
850 Mulh
851 .scalarize(0)
852 .lower();
853
854 // Report legal for any types we can handle anywhere. For the cases only legal
855 // on the SALU, RegBankSelect will be able to re-legalize.
856 getActionDefinitionsBuilder({G_AND, G_OR, G_XOR})
857 .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16})
858 .clampScalar(0, S32, S64)
859 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
860 .fewerElementsIf(vectorWiderThan(0, 64), fewerEltsToSize64Vector(0))
861 .widenScalarToNextPow2(0)
862 .scalarize(0);
863
865 {G_UADDO, G_USUBO, G_UADDE, G_SADDE, G_USUBE, G_SSUBE})
866 .legalFor({{S32, S1}, {S32, S32}})
867 .clampScalar(0, S32, S32)
868 .scalarize(0);
869
870 getActionDefinitionsBuilder(G_BITCAST)
871 // Don't worry about the size constraint.
872 .legalIf(all(isRegisterClassType(0), isRegisterClassType(1)))
873 .lower();
874
875 getActionDefinitionsBuilder(G_CONSTANT)
876 .legalFor({S1, S32, S64, S16, GlobalPtr,
877 LocalPtr, ConstantPtr, PrivatePtr, FlatPtr })
878 .legalIf(isPointer(0))
879 .clampScalar(0, S32, S64)
880 .widenScalarToNextPow2(0);
881
882 getActionDefinitionsBuilder(G_FCONSTANT)
883 .legalFor({S32, S64, S16})
884 .clampScalar(0, S16, S64);
885
886 getActionDefinitionsBuilder({G_IMPLICIT_DEF, G_FREEZE})
887 .legalIf(isRegisterType(0))
888 // s1 and s16 are special cases because they have legal operations on
889 // them, but don't really occupy registers in the normal way.
890 .legalFor({S1, S16})
891 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
894 .clampMaxNumElements(0, S32, 16);
895
896 getActionDefinitionsBuilder(G_FRAME_INDEX).legalFor({PrivatePtr});
897
898 // If the amount is divergent, we have to do a wave reduction to get the
899 // maximum value, so this is expanded during RegBankSelect.
900 getActionDefinitionsBuilder(G_DYN_STACKALLOC)
901 .legalFor({{PrivatePtr, S32}});
902
903 getActionDefinitionsBuilder(G_STACKSAVE)
904 .customFor({PrivatePtr});
905 getActionDefinitionsBuilder(G_STACKRESTORE)
906 .legalFor({PrivatePtr});
907
908 getActionDefinitionsBuilder({G_GET_FPENV, G_SET_FPENV}).customFor({S64});
909
910 getActionDefinitionsBuilder(G_GLOBAL_VALUE)
911 .customIf(typeIsNot(0, PrivatePtr));
912
913 getActionDefinitionsBuilder(G_BLOCK_ADDR).legalFor({CodePtr});
914
915 auto &FPOpActions = getActionDefinitionsBuilder(
916 { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE,
917 G_STRICT_FADD, G_STRICT_FMUL, G_STRICT_FMA})
918 .legalFor({S32, S64});
919 auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS})
920 .customFor({S32, S64});
921 auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV)
922 .customFor({S32, S64});
923
924 if (ST.has16BitInsts()) {
925 if (ST.hasVOP3PInsts())
926 FPOpActions.legalFor({S16, V2S16});
927 else
928 FPOpActions.legalFor({S16});
929
930 TrigActions.customFor({S16});
931 FDIVActions.customFor({S16});
932 }
933
934 if (ST.hasPackedFP32Ops()) {
935 FPOpActions.legalFor({V2S32});
936 FPOpActions.clampMaxNumElementsStrict(0, S32, 2);
937 }
938
939 auto &MinNumMaxNum = getActionDefinitionsBuilder({
940 G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE});
941
942 if (ST.hasVOP3PInsts()) {
943 MinNumMaxNum.customFor(FPTypesPK16)
946 .clampScalar(0, S16, S64)
947 .scalarize(0);
948 } else if (ST.has16BitInsts()) {
949 MinNumMaxNum.customFor(FPTypes16)
950 .clampScalar(0, S16, S64)
951 .scalarize(0);
952 } else {
953 MinNumMaxNum.customFor(FPTypesBase)
954 .clampScalar(0, S32, S64)
955 .scalarize(0);
956 }
957
958 if (ST.hasVOP3PInsts())
959 FPOpActions.clampMaxNumElementsStrict(0, S16, 2);
960
961 FPOpActions
962 .scalarize(0)
963 .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
964
965 TrigActions
966 .scalarize(0)
967 .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
968
969 FDIVActions
970 .scalarize(0)
971 .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
972
973 getActionDefinitionsBuilder({G_FNEG, G_FABS})
974 .legalFor(FPTypesPK16)
976 .scalarize(0)
977 .clampScalar(0, S16, S64);
978
979 if (ST.has16BitInsts()) {
980 getActionDefinitionsBuilder(G_FSQRT)
981 .legalFor({S16})
982 .customFor({S32, S64})
983 .scalarize(0)
984 .unsupported();
985 getActionDefinitionsBuilder(G_FFLOOR)
986 .legalFor({S32, S64, S16})
987 .scalarize(0)
988 .clampScalar(0, S16, S64);
989
990 getActionDefinitionsBuilder({G_FLDEXP, G_STRICT_FLDEXP})
991 .legalFor({{S32, S32}, {S64, S32}, {S16, S16}})
992 .scalarize(0)
993 .maxScalarIf(typeIs(0, S16), 1, S16)
994 .clampScalar(1, S32, S32)
995 .lower();
996
997 getActionDefinitionsBuilder(G_FFREXP)
998 .customFor({{S32, S32}, {S64, S32}, {S16, S16}, {S16, S32}})
999 .scalarize(0)
1000 .lower();
1001 } else {
1002 getActionDefinitionsBuilder(G_FSQRT)
1003 .customFor({S32, S64, S16})
1004 .scalarize(0)
1005 .unsupported();
1006
1007
1008 if (ST.hasFractBug()) {
1009 getActionDefinitionsBuilder(G_FFLOOR)
1010 .customFor({S64})
1011 .legalFor({S32, S64})
1012 .scalarize(0)
1013 .clampScalar(0, S32, S64);
1014 } else {
1015 getActionDefinitionsBuilder(G_FFLOOR)
1016 .legalFor({S32, S64})
1017 .scalarize(0)
1018 .clampScalar(0, S32, S64);
1019 }
1020
1021 getActionDefinitionsBuilder({G_FLDEXP, G_STRICT_FLDEXP})
1022 .legalFor({{S32, S32}, {S64, S32}})
1023 .scalarize(0)
1024 .clampScalar(0, S32, S64)
1025 .clampScalar(1, S32, S32)
1026 .lower();
1027
1028 getActionDefinitionsBuilder(G_FFREXP)
1029 .customFor({{S32, S32}, {S64, S32}})
1030 .scalarize(0)
1031 .minScalar(0, S32)
1032 .clampScalar(1, S32, S32)
1033 .lower();
1034 }
1035
1036 getActionDefinitionsBuilder(G_FPTRUNC)
1037 .legalFor({{S32, S64}, {S16, S32}})
1038 .scalarize(0)
1039 .lower();
1040
1041 getActionDefinitionsBuilder(G_FPEXT)
1042 .legalFor({{S64, S32}, {S32, S16}})
1043 .narrowScalarFor({{S64, S16}}, changeTo(0, S32))
1044 .scalarize(0);
1045
1046 auto &FSubActions = getActionDefinitionsBuilder({G_FSUB, G_STRICT_FSUB});
1047 if (ST.has16BitInsts()) {
1048 FSubActions
1049 // Use actual fsub instruction
1050 .legalFor({S32, S16})
1051 // Must use fadd + fneg
1052 .lowerFor({S64, V2S16});
1053 } else {
1054 FSubActions
1055 // Use actual fsub instruction
1056 .legalFor({S32})
1057 // Must use fadd + fneg
1058 .lowerFor({S64, S16, V2S16});
1059 }
1060
1061 FSubActions
1062 .scalarize(0)
1063 .clampScalar(0, S32, S64);
1064
1065 // Whether this is legal depends on the floating point mode for the function.
1066 auto &FMad = getActionDefinitionsBuilder(G_FMAD);
1067 if (ST.hasMadF16() && ST.hasMadMacF32Insts())
1068 FMad.customFor({S32, S16});
1069 else if (ST.hasMadMacF32Insts())
1070 FMad.customFor({S32});
1071 else if (ST.hasMadF16())
1072 FMad.customFor({S16});
1073 FMad.scalarize(0)
1074 .lower();
1075
1076 auto &FRem = getActionDefinitionsBuilder(G_FREM);
1077 if (ST.has16BitInsts()) {
1078 FRem.customFor({S16, S32, S64});
1079 } else {
1080 FRem.minScalar(0, S32)
1081 .customFor({S32, S64});
1082 }
1083 FRem.scalarize(0);
1084
1085 // TODO: Do we need to clamp maximum bitwidth?
1086 getActionDefinitionsBuilder(G_TRUNC)
1087 .legalIf(isScalar(0))
1088 .legalFor({{V2S16, V2S32}})
1089 .clampMaxNumElements(0, S16, 2)
1090 // Avoid scalarizing in cases that should be truly illegal. In unresolvable
1091 // situations (like an invalid implicit use), we don't want to infinite loop
1092 // in the legalizer.
1093 .fewerElementsIf(elementTypeIsLegal(0), scalarize(0))
1094 .alwaysLegal();
1095
1096 getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT})
1097 .legalFor({{S64, S32}, {S32, S16}, {S64, S16},
1098 {S32, S1}, {S64, S1}, {S16, S1}})
1099 .scalarize(0)
1100 .clampScalar(0, S32, S64)
1101 .widenScalarToNextPow2(1, 32);
1102
1103 // TODO: Split s1->s64 during regbankselect for VALU.
1104 auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
1105 .legalFor({{S32, S32}, {S64, S32}, {S16, S32}})
1106 .lowerIf(typeIs(1, S1))
1107 .customFor({{S32, S64}, {S64, S64}});
1108 if (ST.has16BitInsts())
1109 IToFP.legalFor({{S16, S16}});
1110 IToFP.clampScalar(1, S32, S64)
1111 .minScalar(0, S32)
1112 .scalarize(0)
1113 .widenScalarToNextPow2(1);
1114
1115 auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
1116 .legalFor({{S32, S32}, {S32, S64}, {S32, S16}})
1117 .customFor({{S64, S32}, {S64, S64}})
1118 .narrowScalarFor({{S64, S16}}, changeTo(0, S32));
1119 if (ST.has16BitInsts())
1120 FPToI.legalFor({{S16, S16}});
1121 else
1122 FPToI.minScalar(1, S32);
1123
1124 FPToI.minScalar(0, S32)
1125 .widenScalarToNextPow2(0, 32)
1126 .scalarize(0)
1127 .lower();
1128
1129 getActionDefinitionsBuilder(G_INTRINSIC_FPTRUNC_ROUND)
1130 .customFor({S16, S32})
1131 .scalarize(0)
1132 .lower();
1133
1134 // Lower G_FNEARBYINT and G_FRINT into G_INTRINSIC_ROUNDEVEN
1135 getActionDefinitionsBuilder({G_INTRINSIC_ROUND, G_FRINT, G_FNEARBYINT})
1136 .scalarize(0)
1137 .lower();
1138
1139 if (ST.has16BitInsts()) {
1140 getActionDefinitionsBuilder(
1141 {G_INTRINSIC_TRUNC, G_FCEIL, G_INTRINSIC_ROUNDEVEN})
1142 .legalFor({S16, S32, S64})
1143 .clampScalar(0, S16, S64)
1144 .scalarize(0);
1145 } else if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
1146 getActionDefinitionsBuilder(
1147 {G_INTRINSIC_TRUNC, G_FCEIL, G_INTRINSIC_ROUNDEVEN})
1148 .legalFor({S32, S64})
1149 .clampScalar(0, S32, S64)
1150 .scalarize(0);
1151 } else {
1152 getActionDefinitionsBuilder(
1153 {G_INTRINSIC_TRUNC, G_FCEIL, G_INTRINSIC_ROUNDEVEN})
1154 .legalFor({S32})
1155 .customFor({S64})
1156 .clampScalar(0, S32, S64)
1157 .scalarize(0);
1158 }
1159
1160 getActionDefinitionsBuilder(G_PTR_ADD)
1161 .unsupportedFor({BufferFatPtr, BufferStridedPtr, RsrcPtr})
1162 .legalIf(all(isPointer(0), sameSize(0, 1)))
1163 .scalarize(0)
1164 .scalarSameSizeAs(1, 0);
1165
1166 getActionDefinitionsBuilder(G_PTRMASK)
1167 .legalIf(all(sameSize(0, 1), typeInSet(1, {S64, S32})))
1168 .scalarSameSizeAs(1, 0)
1169 .scalarize(0);
1170
1171 auto &CmpBuilder =
1172 getActionDefinitionsBuilder(G_ICMP)
1173 // The compare output type differs based on the register bank of the output,
1174 // so make both s1 and s32 legal.
1175 //
1176 // Scalar compares producing output in scc will be promoted to s32, as that
1177 // is the allocatable register type that will be needed for the copy from
1178 // scc. This will be promoted during RegBankSelect, and we assume something
1179 // before that won't try to use s32 result types.
1180 //
1181 // Vector compares producing an output in vcc/SGPR will use s1 in VCC reg
1182 // bank.
1183 .legalForCartesianProduct(
1184 {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
1185 .legalForCartesianProduct(
1186 {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr});
1187 if (ST.has16BitInsts()) {
1188 CmpBuilder.legalFor({{S1, S16}});
1189 }
1190
1191 CmpBuilder
1192 .widenScalarToNextPow2(1)
1193 .clampScalar(1, S32, S64)
1194 .scalarize(0)
1195 .legalIf(all(typeInSet(0, {S1, S32}), isPointer(1)));
1196
1197 auto &FCmpBuilder =
1198 getActionDefinitionsBuilder(G_FCMP).legalForCartesianProduct(
1199 {S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase);
1200
1201 if (ST.hasSALUFloatInsts())
1202 FCmpBuilder.legalForCartesianProduct({S32}, {S16, S32});
1203
1204 FCmpBuilder
1205 .widenScalarToNextPow2(1)
1206 .clampScalar(1, S32, S64)
1207 .scalarize(0);
1208
1209 // FIXME: fpow has a selection pattern that should move to custom lowering.
1210 auto &ExpOps = getActionDefinitionsBuilder(G_FPOW);
1211 if (ST.has16BitInsts())
1212 ExpOps.customFor({{S32}, {S16}});
1213 else
1214 ExpOps.customFor({S32});
1215 ExpOps.clampScalar(0, MinScalarFPTy, S32)
1216 .scalarize(0);
1217
1218 getActionDefinitionsBuilder(G_FPOWI)
1219 .clampScalar(0, MinScalarFPTy, S32)
1220 .lower();
1221
1222 auto &Log2Ops = getActionDefinitionsBuilder({G_FLOG2, G_FEXP2});
1223 Log2Ops.customFor({S32});
1224 if (ST.has16BitInsts())
1225 Log2Ops.legalFor({S16});
1226 else
1227 Log2Ops.customFor({S16});
1228 Log2Ops.scalarize(0)
1229 .lower();
1230
1231 auto &LogOps =
1232 getActionDefinitionsBuilder({G_FLOG, G_FLOG10, G_FEXP, G_FEXP10});
1233 LogOps.customFor({S32, S16});
1234 LogOps.clampScalar(0, MinScalarFPTy, S32)
1235 .scalarize(0);
1236
1237 // The 64-bit versions produce 32-bit results, but only on the SALU.
1238 getActionDefinitionsBuilder(G_CTPOP)
1239 .legalFor({{S32, S32}, {S32, S64}})
1240 .clampScalar(0, S32, S32)
1241 .widenScalarToNextPow2(1, 32)
1242 .clampScalar(1, S32, S64)
1243 .scalarize(0)
1244 .widenScalarToNextPow2(0, 32);
1245
1246 // If no 16 bit instr is available, lower into different instructions.
1247 if (ST.has16BitInsts())
1248 getActionDefinitionsBuilder(G_IS_FPCLASS)
1249 .legalForCartesianProduct({S1}, FPTypes16)
1250 .widenScalarToNextPow2(1)
1251 .scalarize(0)
1252 .lower();
1253 else
1254 getActionDefinitionsBuilder(G_IS_FPCLASS)
1255 .legalForCartesianProduct({S1}, FPTypesBase)
1256 .lowerFor({S1, S16})
1257 .widenScalarToNextPow2(1)
1258 .scalarize(0)
1259 .lower();
1260
1261 // The hardware instructions return a different result on 0 than the generic
1262 // instructions expect. The hardware produces -1, but these produce the
1263 // bitwidth.
1264 getActionDefinitionsBuilder({G_CTLZ, G_CTTZ})
1265 .scalarize(0)
1266 .clampScalar(0, S32, S32)
1267 .clampScalar(1, S32, S64)
1268 .widenScalarToNextPow2(0, 32)
1269 .widenScalarToNextPow2(1, 32)
1270 .custom();
1271
1272 // The 64-bit versions produce 32-bit results, but only on the SALU.
1273 getActionDefinitionsBuilder({G_CTLZ_ZERO_UNDEF, G_CTTZ_ZERO_UNDEF})
1274 .legalFor({{S32, S32}, {S32, S64}})
1275 .clampScalar(0, S32, S32)
1276 .clampScalar(1, S32, S64)
1277 .scalarize(0)
1278 .widenScalarToNextPow2(0, 32)
1279 .widenScalarToNextPow2(1, 32);
1280
1281 // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
1282 // RegBankSelect.
1283 getActionDefinitionsBuilder(G_BITREVERSE)
1284 .legalFor({S32, S64})
1285 .clampScalar(0, S32, S64)
1286 .scalarize(0)
1287 .widenScalarToNextPow2(0);
1288
1289 if (ST.has16BitInsts()) {
1290 getActionDefinitionsBuilder(G_BSWAP)
1291 .legalFor({S16, S32, V2S16})
1292 .clampMaxNumElementsStrict(0, S16, 2)
1293 // FIXME: Fixing non-power-of-2 before clamp is workaround for
1294 // narrowScalar limitation.
1295 .widenScalarToNextPow2(0)
1296 .clampScalar(0, S16, S32)
1297 .scalarize(0);
1298
1299 if (ST.hasVOP3PInsts()) {
1300 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
1301 .legalFor({S32, S16, V2S16})
1302 .clampMaxNumElements(0, S16, 2)
1303 .minScalar(0, S16)
1304 .widenScalarToNextPow2(0)
1305 .scalarize(0)
1306 .lower();
1307 } else {
1308 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
1309 .legalFor({S32, S16})
1310 .widenScalarToNextPow2(0)
1311 .minScalar(0, S16)
1312 .scalarize(0)
1313 .lower();
1314 }
1315 } else {
1316 // TODO: Should have same legality without v_perm_b32
1317 getActionDefinitionsBuilder(G_BSWAP)
1318 .legalFor({S32})
1319 .lowerIf(scalarNarrowerThan(0, 32))
1320 // FIXME: Fixing non-power-of-2 before clamp is workaround for
1321 // narrowScalar limitation.
1322 .widenScalarToNextPow2(0)
1323 .maxScalar(0, S32)
1324 .scalarize(0)
1325 .lower();
1326
1327 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
1328 .legalFor({S32})
1329 .minScalar(0, S32)
1330 .widenScalarToNextPow2(0)
1331 .scalarize(0)
1332 .lower();
1333 }
1334
1335 getActionDefinitionsBuilder(G_INTTOPTR)
1336 // List the common cases
1337 .legalForCartesianProduct(AddrSpaces64, {S64})
1338 .legalForCartesianProduct(AddrSpaces32, {S32})
1339 .scalarize(0)
1340 // Accept any address space as long as the size matches
1341 .legalIf(sameSize(0, 1))
1342 .widenScalarIf(smallerThan(1, 0),
1343 [](const LegalityQuery &Query) {
1344 return std::pair(
1345 1, LLT::scalar(Query.Types[0].getSizeInBits()));
1346 })
1347 .narrowScalarIf(largerThan(1, 0), [](const LegalityQuery &Query) {
1348 return std::pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
1349 });
1350
1351 getActionDefinitionsBuilder(G_PTRTOINT)
1352 // List the common cases
1353 .legalForCartesianProduct(AddrSpaces64, {S64})
1354 .legalForCartesianProduct(AddrSpaces32, {S32})
1355 .scalarize(0)
1356 // Accept any address space as long as the size matches
1357 .legalIf(sameSize(0, 1))
1358 .widenScalarIf(smallerThan(0, 1),
1359 [](const LegalityQuery &Query) {
1360 return std::pair(
1361 0, LLT::scalar(Query.Types[1].getSizeInBits()));
1362 })
1363 .narrowScalarIf(largerThan(0, 1), [](const LegalityQuery &Query) {
1364 return std::pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
1365 });
1366
1367 getActionDefinitionsBuilder(G_ADDRSPACE_CAST)
1368 .scalarize(0)
1369 .custom();
1370
1371 const auto needToSplitMemOp = [=](const LegalityQuery &Query,
1372 bool IsLoad) -> bool {
1373 const LLT DstTy = Query.Types[0];
1374
1375 // Split vector extloads.
1376 unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
1377
1378 if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize)
1379 return true;
1380
1381 const LLT PtrTy = Query.Types[1];
1382 unsigned AS = PtrTy.getAddressSpace();
1383 if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad,
1384 Query.MMODescrs[0].Ordering !=
1385 AtomicOrdering::NotAtomic))
1386 return true;
1387
1388 // Catch weird sized loads that don't evenly divide into the access sizes
1389 // TODO: May be able to widen depending on alignment etc.
1390 unsigned NumRegs = (MemSize + 31) / 32;
1391 if (NumRegs == 3) {
1392 if (!ST.hasDwordx3LoadStores())
1393 return true;
1394 } else {
1395 // If the alignment allows, these should have been widened.
1396 if (!isPowerOf2_32(NumRegs))
1397 return true;
1398 }
1399
1400 return false;
1401 };
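
  // Worked example (illustrative) for the helper above: a 96-bit access
  // occupies (96 + 31) / 32 == 3 dwords, so it must be split on subtargets
  // without dwordx3 load/store; 256- and 512-bit accesses are a power-of-two
  // number of dwords and are split only when they exceed the limit from
  // maxSizeForAddrSpace.
  static_assert((96 + 31) / 32 == 3, "a 96-bit access is a dwordx3 access");
  static_assert((256 / 32 & (256 / 32 - 1)) == 0,
                "256 bits is eight dwords, a power of two");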
1402
1403 unsigned GlobalAlign32 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 32;
1404 unsigned GlobalAlign16 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 16;
1405 unsigned GlobalAlign8 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 8;
1406
1407 // TODO: Refine based on subtargets which support unaligned access or 128-bit
1408 // LDS
1409 // TODO: Unsupported flat for SI.
1410
1411 for (unsigned Op : {G_LOAD, G_STORE}) {
1412 const bool IsStore = Op == G_STORE;
1413
1414 auto &Actions = getActionDefinitionsBuilder(Op);
1415 // Explicitly list some common cases.
1416 // TODO: Does this help compile time at all?
1417 Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, S32, GlobalAlign32},
1418 {V2S32, GlobalPtr, V2S32, GlobalAlign32},
1419 {V4S32, GlobalPtr, V4S32, GlobalAlign32},
1420 {S64, GlobalPtr, S64, GlobalAlign32},
1421 {V2S64, GlobalPtr, V2S64, GlobalAlign32},
1422 {V2S16, GlobalPtr, V2S16, GlobalAlign32},
1423 {S32, GlobalPtr, S8, GlobalAlign8},
1424 {S32, GlobalPtr, S16, GlobalAlign16},
1425
1426 {S32, LocalPtr, S32, 32},
1427 {S64, LocalPtr, S64, 32},
1428 {V2S32, LocalPtr, V2S32, 32},
1429 {S32, LocalPtr, S8, 8},
1430 {S32, LocalPtr, S16, 16},
1431 {V2S16, LocalPtr, S32, 32},
1432
1433 {S32, PrivatePtr, S32, 32},
1434 {S32, PrivatePtr, S8, 8},
1435 {S32, PrivatePtr, S16, 16},
1436 {V2S16, PrivatePtr, S32, 32},
1437
1438 {S32, ConstantPtr, S32, GlobalAlign32},
1439 {V2S32, ConstantPtr, V2S32, GlobalAlign32},
1440 {V4S32, ConstantPtr, V4S32, GlobalAlign32},
1441 {S64, ConstantPtr, S64, GlobalAlign32},
1442 {V2S32, ConstantPtr, V2S32, GlobalAlign32}});
1443 Actions.legalIf(
1444 [=](const LegalityQuery &Query) -> bool {
1445 return isLoadStoreLegal(ST, Query);
1446 });
1447
1448 // The custom pointers (fat pointers, buffer resources) don't work with load
1449 // and store at this level. Fat pointers should have been lowered to
1450 // intrinsics before the translation to MIR.
1451 Actions.unsupportedIf(
1452 typeInSet(1, {BufferFatPtr, BufferStridedPtr, RsrcPtr}));
1453
1454 // Address space 8 pointers are handled by a 4xs32 load, bitcast, and
1455 // ptrtoint. This is needed to account for the fact that we can't have i128
1456 // as a register class for SelectionDAG reasons.
1457 Actions.customIf([=](const LegalityQuery &Query) -> bool {
1458 return hasBufferRsrcWorkaround(Query.Types[0]);
1459 });
1460
1461 // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to
1462 // 64-bits.
1463 //
1464 // TODO: Should generalize bitcast action into coerce, which will also cover
1465 // inserting addrspacecasts.
1466 Actions.customIf(typeIs(1, Constant32Ptr));
1467
1468 // Turn any illegal element vectors into something easier to deal
1469 // with. These will ultimately produce 32-bit scalar shifts to extract the
1470 // parts anyway.
1471 //
1472 // For odd 16-bit element vectors, prefer to split those into pieces with
1473 // 16-bit vector parts.
1474 Actions.bitcastIf(
1475 [=](const LegalityQuery &Query) -> bool {
1476 return shouldBitcastLoadStoreType(ST, Query.Types[0],
1477 Query.MMODescrs[0].MemoryTy);
1478 }, bitcastToRegisterType(0));
1479
1480 if (!IsStore) {
1481 // Widen suitably aligned loads by loading extra bytes. The standard
1482 // legalization actions can't properly express widening memory operands.
1483 Actions.customIf([=](const LegalityQuery &Query) -> bool {
1484 return shouldWidenLoad(ST, Query, G_LOAD);
1485 });
1486 }
1487
1488 // FIXME: load/store narrowing should be moved to lower action
1489 Actions
1490 .narrowScalarIf(
1491 [=](const LegalityQuery &Query) -> bool {
1492 return !Query.Types[0].isVector() &&
1493 needToSplitMemOp(Query, Op == G_LOAD);
1494 },
1495 [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
1496 const LLT DstTy = Query.Types[0];
1497 const LLT PtrTy = Query.Types[1];
1498
1499 const unsigned DstSize = DstTy.getSizeInBits();
1500 unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
1501
1502 // Split extloads.
1503 if (DstSize > MemSize)
1504 return std::pair(0, LLT::scalar(MemSize));
1505
1506 unsigned MaxSize = maxSizeForAddrSpace(
1507 ST, PtrTy.getAddressSpace(), Op == G_LOAD,
1508 Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic);
1509 if (MemSize > MaxSize)
1510 return std::pair(0, LLT::scalar(MaxSize));
1511
1512 uint64_t Align = Query.MMODescrs[0].AlignInBits;
1513 return std::pair(0, LLT::scalar(Align));
1514 })
1515 .fewerElementsIf(
1516 [=](const LegalityQuery &Query) -> bool {
1517 return Query.Types[0].isVector() &&
1518 needToSplitMemOp(Query, Op == G_LOAD);
1519 },
1520 [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
1521 const LLT DstTy = Query.Types[0];
1522 const LLT PtrTy = Query.Types[1];
1523
1524 LLT EltTy = DstTy.getElementType();
1525 unsigned MaxSize = maxSizeForAddrSpace(
1526 ST, PtrTy.getAddressSpace(), Op == G_LOAD,
1527 Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic);
1528
1529 // FIXME: Handle widened to power of 2 results better. This ends
1530 // up scalarizing.
1531 // FIXME: 3 element stores scalarized on SI
1532
1533 // Split if it's too large for the address space.
1534 unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
1535 if (MemSize > MaxSize) {
1536 unsigned NumElts = DstTy.getNumElements();
1537 unsigned EltSize = EltTy.getSizeInBits();
1538
1539 if (MaxSize % EltSize == 0) {
1540 return std::pair(
1541 0, LLT::scalarOrVector(
1542 ElementCount::getFixed(MaxSize / EltSize), EltTy));
1543 }
1544
1545 unsigned NumPieces = MemSize / MaxSize;
1546
1547 // FIXME: Refine when odd breakdowns handled
1548 // The scalars will need to be re-legalized.
1549 if (NumPieces == 1 || NumPieces >= NumElts ||
1550 NumElts % NumPieces != 0)
1551 return std::pair(0, EltTy);
1552
1553 return std::pair(0,
1554 LLT::fixed_vector(NumElts / NumPieces, EltTy));
1555 }
1556
1557 // FIXME: We could probably handle weird extending loads better.
1558 if (DstTy.getSizeInBits() > MemSize)
1559 return std::pair(0, EltTy);
1560
1561 unsigned EltSize = EltTy.getSizeInBits();
1562 unsigned DstSize = DstTy.getSizeInBits();
1563 if (!isPowerOf2_32(DstSize)) {
1564 // We're probably decomposing an odd sized store. Try to split
1565 // to the widest type. TODO: Account for alignment. As-is it
1566 // should be OK, since the new parts will be further legalized.
1567 unsigned FloorSize = llvm::bit_floor(DstSize);
1568 return std::pair(
1569 0, LLT::scalarOrVector(
1570 ElementCount::getFixed(FloorSize / EltSize), EltTy));
1571 }
1572
1573 // May need relegalization for the scalars.
1574 return std::pair(0, EltTy);
1575 })
1576 .minScalar(0, S32)
1577 .narrowScalarIf(isWideScalarExtLoadTruncStore(0), changeTo(0, S32))
1578 .widenScalarToNextPow2(0)
1579 .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0))
1580 .lower();
1581 }
1582
1583 // FIXME: Unaligned accesses not lowered.
1584 auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD})
1585 .legalForTypesWithMemDesc({{S32, GlobalPtr, S8, 8},
1586 {S32, GlobalPtr, S16, 2 * 8},
1587 {S32, LocalPtr, S8, 8},
1588 {S32, LocalPtr, S16, 16},
1589 {S32, PrivatePtr, S8, 8},
1590 {S32, PrivatePtr, S16, 16},
1591 {S32, ConstantPtr, S8, 8},
1592 {S32, ConstantPtr, S16, 2 * 8}})
1593 .legalIf(
1594 [=](const LegalityQuery &Query) -> bool {
1595 return isLoadStoreLegal(ST, Query);
1596 });
1597
1598 if (ST.hasFlatAddressSpace()) {
1599 ExtLoads.legalForTypesWithMemDesc(
1600 {{S32, FlatPtr, S8, 8}, {S32, FlatPtr, S16, 16}});
1601 }
1602
1603 // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to
1604 // 64-bits.
1605 //
1606 // TODO: Should generalize bitcast action into coerce, which will also cover
1607 // inserting addrspacecasts.
1608 ExtLoads.customIf(typeIs(1, Constant32Ptr));
1609
1610 ExtLoads.clampScalar(0, S32, S32)
1611 .widenScalarToNextPow2(0)
1612 .lower();
1613
1614 auto &Atomics = getActionDefinitionsBuilder(
1615 {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB,
1616 G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
1617 G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
1618 G_ATOMICRMW_UMIN, G_ATOMICRMW_UINC_WRAP, G_ATOMICRMW_UDEC_WRAP})
1619 .legalFor({{S32, GlobalPtr}, {S32, LocalPtr},
1620 {S64, GlobalPtr}, {S64, LocalPtr},
1621 {S32, RegionPtr}, {S64, RegionPtr}});
1622 if (ST.hasFlatAddressSpace()) {
1623 Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}});
1624 }
1625
1626 auto &Atomic = getActionDefinitionsBuilder(G_ATOMICRMW_FADD);
1627 if (ST.hasLDSFPAtomicAddF32()) {
1628 Atomic.legalFor({{S32, LocalPtr}, {S32, RegionPtr}});
1629 if (ST.hasLdsAtomicAddF64())
1630 Atomic.legalFor({{S64, LocalPtr}});
1631 if (ST.hasAtomicDsPkAdd16Insts())
1632 Atomic.legalFor({{V2S16, LocalPtr}});
1633 }
1634 if (ST.hasAtomicFaddInsts())
1635 Atomic.legalFor({{S32, GlobalPtr}});
1636 if (ST.hasFlatAtomicFaddF32Inst())
1637 Atomic.legalFor({{S32, FlatPtr}});
1638
1639 if (ST.hasGFX90AInsts()) {
1640 // These are legal with some caveats, and should have undergone expansion in
1641 // the IR in most situations
1642 // TODO: Move atomic expansion into legalizer
1643 Atomic.legalFor({
1644 {S32, GlobalPtr},
1645 {S64, GlobalPtr},
1646 {S64, FlatPtr}
1647 });
1648 }
1649
1650 // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling, and output
1651 // demarshalling
1652 getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG)
1653 .customFor({{S32, GlobalPtr}, {S64, GlobalPtr},
1654 {S32, FlatPtr}, {S64, FlatPtr}})
1655 .legalFor({{S32, LocalPtr}, {S64, LocalPtr},
1656 {S32, RegionPtr}, {S64, RegionPtr}});
1657 // TODO: Pointer types, any 32-bit or 64-bit vector
1658
1659 // Condition should be s32 for scalar, s1 for vector.
1660 getActionDefinitionsBuilder(G_SELECT)
1661 .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16, GlobalPtr,
1662 LocalPtr, FlatPtr, PrivatePtr,
1663 LLT::fixed_vector(2, LocalPtr),
1664 LLT::fixed_vector(2, PrivatePtr)},
1665 {S1, S32})
1666 .clampScalar(0, S16, S64)
1667 .scalarize(1)
1668 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
1669 .fewerElementsIf(numElementsNotEven(0), scalarize(0))
1670 .clampMaxNumElements(0, S32, 2)
1671 .clampMaxNumElements(0, LocalPtr, 2)
1672 .clampMaxNumElements(0, PrivatePtr, 2)
1673 .scalarize(0)
1674 .widenScalarToNextPow2(0)
1675 .legalIf(all(isPointer(0), typeInSet(1, {S1, S32})));
1676
1677 // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can
1678 // be more flexible with the shift amount type.
1679 auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR})
1680 .legalFor({{S32, S32}, {S64, S32}});
1681 if (ST.has16BitInsts()) {
1682 if (ST.hasVOP3PInsts()) {
1683 Shifts.legalFor({{S16, S16}, {V2S16, V2S16}})
1684 .clampMaxNumElements(0, S16, 2);
1685 } else
1686 Shifts.legalFor({{S16, S16}});
1687
1688 // TODO: Support 16-bit shift amounts for all types
1689 Shifts.widenScalarIf(
1690 [=](const LegalityQuery &Query) {
1691 // Use 16-bit shift amounts for any 16-bit shift. Otherwise we want a
1692 // 32-bit amount.
1693 const LLT ValTy = Query.Types[0];
1694 const LLT AmountTy = Query.Types[1];
1695 return ValTy.getSizeInBits() <= 16 &&
1696 AmountTy.getSizeInBits() < 16;
1697 }, changeTo(1, S16));
1698 Shifts.maxScalarIf(typeIs(0, S16), 1, S16);
1699 Shifts.clampScalar(1, S32, S32);
1700 Shifts.widenScalarToNextPow2(0, 16);
1701 Shifts.clampScalar(0, S16, S64);
1702
1703 getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
1704 .minScalar(0, S16)
1705 .scalarize(0)
1706 .lower();
1707 } else {
1708 // Make sure we legalize the shift amount type first, as the general
1709 // expansion for the shifted type will produce much worse code if it hasn't
1710 // been truncated already.
1711 Shifts.clampScalar(1, S32, S32);
1712 Shifts.widenScalarToNextPow2(0, 32);
1713 Shifts.clampScalar(0, S32, S64);
1714
1715 getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
1716 .minScalar(0, S32)
1717 .scalarize(0)
1718 .lower();
1719 }
1720 Shifts.scalarize(0);
1721
1722 for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) {
1723 unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0;
1724 unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1;
1725 unsigned IdxTypeIdx = 2;
1726
1727 getActionDefinitionsBuilder(Op)
1728 .customIf([=](const LegalityQuery &Query) {
1729 const LLT EltTy = Query.Types[EltTypeIdx];
1730 const LLT VecTy = Query.Types[VecTypeIdx];
1731 const LLT IdxTy = Query.Types[IdxTypeIdx];
1732 const unsigned EltSize = EltTy.getSizeInBits();
1733 const bool isLegalVecType =
1734 !!SIRegisterInfo::getSGPRClassForBitWidth(VecTy.getSizeInBits());
1735 // Address space 8 pointers are 128-bit wide values, but the logic
1736 // below will try to bitcast them to 2N x s64, which will fail.
1737 // Therefore, as an intermediate step, wrap extracts/insertions from a
1738 // ptrtoint-ing the vector and scalar arguments (or inttoptring the
1739 // extraction result) in order to produce a vector operation that can
1740 // be handled by the logic below.
1741 if (EltTy.isPointer() && EltSize > 64)
1742 return true;
1743 return (EltSize == 32 || EltSize == 64) &&
1744 VecTy.getSizeInBits() % 32 == 0 &&
1745 VecTy.getSizeInBits() <= MaxRegisterSize &&
1746 IdxTy.getSizeInBits() == 32 &&
1747 isLegalVecType;
1748 })
1749 .bitcastIf(all(sizeIsMultipleOf32(VecTypeIdx), scalarOrEltNarrowerThan(VecTypeIdx, 32)),
1750 bitcastToVectorElement32(VecTypeIdx))
1751 //.bitcastIf(vectorSmallerThan(1, 32), bitcastToScalar(1))
1752 .bitcastIf(
1753 all(sizeIsMultipleOf32(VecTypeIdx), scalarOrEltWiderThan(VecTypeIdx, 64)),
1754 [=](const LegalityQuery &Query) {
1755 // For > 64-bit element types, try to turn this into a 64-bit
1756 // element vector since we may be able to do better indexing
1757 // if this is scalar. If not, fall back to 32.
1758 const LLT EltTy = Query.Types[EltTypeIdx];
1759 const LLT VecTy = Query.Types[VecTypeIdx];
1760 const unsigned DstEltSize = EltTy.getSizeInBits();
1761 const unsigned VecSize = VecTy.getSizeInBits();
1762
1763 const unsigned TargetEltSize = DstEltSize % 64 == 0 ? 64 : 32;
1764 return std::pair(
1765 VecTypeIdx,
1766 LLT::fixed_vector(VecSize / TargetEltSize, TargetEltSize));
1767 })
1768 .clampScalar(EltTypeIdx, S32, S64)
1769 .clampScalar(VecTypeIdx, S32, S64)
1770 .clampScalar(IdxTypeIdx, S32, S32)
1771 .clampMaxNumElements(VecTypeIdx, S32, 32)
1772 // TODO: Clamp elements for 64-bit vectors?
1773 .moreElementsIf(
1774 isIllegalRegisterType(VecTypeIdx),
1776 // It should only be necessary with variable indexes.
1777 // As a last resort, lower to the stack
1778 .lower();
1779 }
1780
1781 getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
1782 .unsupportedIf([=](const LegalityQuery &Query) {
1783 const LLT &EltTy = Query.Types[1].getElementType();
1784 return Query.Types[0] != EltTy;
1785 });
1786
1787 for (unsigned Op : {G_EXTRACT, G_INSERT}) {
1788 unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0;
1789 unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1;
1790
1791 // FIXME: Doesn't handle extract of illegal sizes.
1792 getActionDefinitionsBuilder(Op)
1793 .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32)))
1794 .lowerIf([=](const LegalityQuery &Query) {
1795 // Sub-vector (or single element) insert and extract.
1796 // TODO: verify immediate offset here since lower only works with
1797 // whole elements.
1798 const LLT BigTy = Query.Types[BigTyIdx];
1799 return BigTy.isVector();
1800 })
1801 // FIXME: Multiples of 16 should not be legal.
1802 .legalIf([=](const LegalityQuery &Query) {
1803 const LLT BigTy = Query.Types[BigTyIdx];
1804 const LLT LitTy = Query.Types[LitTyIdx];
1805 return (BigTy.getSizeInBits() % 32 == 0) &&
1806 (LitTy.getSizeInBits() % 16 == 0);
1807 })
1808 .widenScalarIf(
1809 [=](const LegalityQuery &Query) {
1810 const LLT BigTy = Query.Types[BigTyIdx];
1811 return (BigTy.getScalarSizeInBits() < 16);
1812 },
1814 .widenScalarIf(
1815 [=](const LegalityQuery &Query) {
1816 const LLT LitTy = Query.Types[LitTyIdx];
1817 return (LitTy.getScalarSizeInBits() < 16);
1818 },
1820 .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
1821 .widenScalarToNextPow2(BigTyIdx, 32);
1822
1823 }
1824
1825 auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR)
1826 .legalForCartesianProduct(AllS32Vectors, {S32})
1827 .legalForCartesianProduct(AllS64Vectors, {S64})
1828 .clampNumElements(0, V16S32, V32S32)
1829 .clampNumElements(0, V2S64, V16S64)
1830 .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16))
1831 .moreElementsIf(
1834
1835 if (ST.hasScalarPackInsts()) {
1836 BuildVector
1837 // FIXME: Should probably widen s1 vectors straight to s32
1838 .minScalarOrElt(0, S16)
1839 .minScalar(1, S16);
1840
1841 getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1842 .legalFor({V2S16, S32})
1843 .lower();
1844 } else {
1845 BuildVector.customFor({V2S16, S16});
1846 BuildVector.minScalarOrElt(0, S32);
1847
1848 getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1849 .customFor({V2S16, S32})
1850 .lower();
1851 }
1852
1853 BuildVector.legalIf(isRegisterType(0));
1854
1855 // FIXME: Clamp maximum size
1856 getActionDefinitionsBuilder(G_CONCAT_VECTORS)
1857 .legalIf(all(isRegisterType(0), isRegisterType(1)))
1858 .clampMaxNumElements(0, S32, 32)
1859 .clampMaxNumElements(1, S16, 2) // TODO: Make 4?
1860 .clampMaxNumElements(0, S16, 64);
1861
1862 getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower();
1863
1864 // Merge/Unmerge
1865 for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
1866 unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
1867 unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;
1868
1869 auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
1870 const LLT Ty = Query.Types[TypeIdx];
1871 if (Ty.isVector()) {
1872 const LLT &EltTy = Ty.getElementType();
1873 if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 512)
1874 return true;
1875 if (!llvm::has_single_bit<uint32_t>(EltTy.getSizeInBits()))
1876 return true;
1877 }
1878 return false;
1879 };
1880
1881 auto &Builder = getActionDefinitionsBuilder(Op)
1882 .legalIf(all(isRegisterType(0), isRegisterType(1)))
1883 .lowerFor({{S16, V2S16}})
1884 .lowerIf([=](const LegalityQuery &Query) {
1885 const LLT BigTy = Query.Types[BigTyIdx];
1886 return BigTy.getSizeInBits() == 32;
1887 })
1888 // Try to widen to s16 first for small types.
1889 // TODO: Only do this on targets with legal s16 shifts
1890 .minScalarOrEltIf(scalarNarrowerThan(LitTyIdx, 16), LitTyIdx, S16)
1891 .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16)
1892 .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
1893 .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32),
1894 elementTypeIs(1, S16)),
1895 changeTo(1, V2S16))
1896 // Clamp the little scalar to s32-s512 and make it a power of 2. It's not
1897 // worth considering the multiples of 64 since 2*192 and 2*384 are not
1898 // valid.
1899 .clampScalar(LitTyIdx, S32, S512)
1900 .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32)
1901 // Break up vectors with weird elements into scalars
1902 .fewerElementsIf(
1903 [=](const LegalityQuery &Query) { return notValidElt(Query, LitTyIdx); },
1904 scalarize(0))
1905 .fewerElementsIf(
1906 [=](const LegalityQuery &Query) { return notValidElt(Query, BigTyIdx); },
1907 scalarize(1))
1908 .clampScalar(BigTyIdx, S32, MaxScalar);
1909
1910 if (Op == G_MERGE_VALUES) {
1911 Builder.widenScalarIf(
1912 // TODO: Use 16-bit shifts if legal for 8-bit values?
1913 [=](const LegalityQuery &Query) {
1914 const LLT Ty = Query.Types[LitTyIdx];
1915 return Ty.getSizeInBits() < 32;
1916 },
1917 changeTo(LitTyIdx, S32));
1918 }
1919
1920 Builder.widenScalarIf(
1921 [=](const LegalityQuery &Query) {
1922 const LLT Ty = Query.Types[BigTyIdx];
1923 return Ty.getSizeInBits() % 16 != 0;
1924 },
1925 [=](const LegalityQuery &Query) {
1926 // Pick the next power of 2, or a multiple of 64 over 128,
1927 // whichever is smaller.
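  // (Editorial worked example for the mutation below.) s100: the next power
  // of 2 is 128, which is below 256, so widen to s128. s270: the next power
  // of 2 is 512, but alignTo<64>(271) = 320 is smaller, so widen to s320.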
1928 const LLT &Ty = Query.Types[BigTyIdx];
1929 unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1);
1930 if (NewSizeInBits >= 256) {
1931 unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1);
1932 if (RoundedTo < NewSizeInBits)
1933 NewSizeInBits = RoundedTo;
1934 }
1935 return std::pair(BigTyIdx, LLT::scalar(NewSizeInBits));
1936 })
1937 // Any vectors left are the wrong size. Scalarize them.
1938 .scalarize(0)
1939 .scalarize(1);
1940 }
1941
1942 // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
1943 // RegBankSelect.
1944 auto &SextInReg = getActionDefinitionsBuilder(G_SEXT_INREG)
1945 .legalFor({{S32}, {S64}});
1946
1947 if (ST.hasVOP3PInsts()) {
1948 SextInReg.lowerFor({{V2S16}})
1949 // Prefer to reduce vector widths for 16-bit vectors before lowering, to
1950 // get more vector shift opportunities, since we'll get those when
1951 // expanded.
1952 .clampMaxNumElementsStrict(0, S16, 2);
1953 } else if (ST.has16BitInsts()) {
1954 SextInReg.lowerFor({{S32}, {S64}, {S16}});
1955 } else {
1956 // Prefer to promote to s32 before lowering if we don't have 16-bit
1957 // shifts. This avoids a lot of intermediate truncate and extend operations.
1958 SextInReg.lowerFor({{S32}, {S64}});
1959 }
1960
1961 SextInReg
1962 .scalarize(0)
1963 .clampScalar(0, S32, S64)
1964 .lower();
1965
1966 getActionDefinitionsBuilder({G_ROTR, G_ROTL})
1967 .scalarize(0)
1968 .lower();
1969
1970 // TODO: Only try to form v2s16 with legal packed instructions.
1971 getActionDefinitionsBuilder(G_FSHR)
1972 .legalFor({{S32, S32}})
1973 .lowerFor({{V2S16, V2S16}})
1974 .clampMaxNumElementsStrict(0, S16, 2)
1975 .scalarize(0)
1976 .lower();
1977
1978 if (ST.hasVOP3PInsts()) {
1979 getActionDefinitionsBuilder(G_FSHL)
1980 .lowerFor({{V2S16, V2S16}})
1981 .clampMaxNumElementsStrict(0, S16, 2)
1982 .scalarize(0)
1983 .lower();
1984 } else {
1985 getActionDefinitionsBuilder(G_FSHL)
1986 .scalarize(0)
1987 .lower();
1988 }
1989
1990 getActionDefinitionsBuilder(G_READCYCLECOUNTER)
1991 .legalFor({S64});
1992
1993 getActionDefinitionsBuilder(G_READSTEADYCOUNTER).legalFor({S64});
1994
1995 getActionDefinitionsBuilder(G_FENCE)
1996 .alwaysLegal();
1997
1998 getActionDefinitionsBuilder({G_SMULO, G_UMULO})
1999 .scalarize(0)
2000 .minScalar(0, S32)
2001 .lower();
2002
2003 getActionDefinitionsBuilder({G_SBFX, G_UBFX})
2004 .legalFor({{S32, S32}, {S64, S32}})
2005 .clampScalar(1, S32, S32)
2006 .clampScalar(0, S32, S64)
2007 .widenScalarToNextPow2(0)
2008 .scalarize(0);
2009
2010 getActionDefinitionsBuilder(
2011 {// TODO: Verify V_BFI_B32 is generated from expanded bit ops
2012 G_FCOPYSIGN,
2013
2014 G_ATOMIC_CMPXCHG_WITH_SUCCESS, G_ATOMICRMW_NAND, G_ATOMICRMW_FSUB,
2015 G_READ_REGISTER, G_WRITE_REGISTER,
2016
2017 G_SADDO, G_SSUBO})
2018 .lower();
2019
2020 if (ST.hasIEEEMinMax()) {
2021 getActionDefinitionsBuilder({G_FMINIMUM, G_FMAXIMUM})
2022 .legalFor(FPTypesPK16)
2023 .clampMaxNumElements(0, S16, 2)
2024 .scalarize(0);
2025 } else {
2026 // TODO: Implement
2027 getActionDefinitionsBuilder({G_FMINIMUM, G_FMAXIMUM}).lower();
2028 }
2029
2030 getActionDefinitionsBuilder({G_MEMCPY, G_MEMCPY_INLINE, G_MEMMOVE, G_MEMSET})
2031 .lower();
2032
2033 getActionDefinitionsBuilder({G_TRAP, G_DEBUGTRAP}).custom();
2034
2035 getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE,
2036 G_INDEXED_LOAD, G_INDEXED_SEXTLOAD,
2037 G_INDEXED_ZEXTLOAD, G_INDEXED_STORE})
2038 .unsupported();
2039
2040 getActionDefinitionsBuilder(G_PREFETCH).alwaysLegal();
2041
2042 getLegacyLegalizerInfo().computeTables();
2043 verify(*ST.getInstrInfo());
2044}
2045
2048 LostDebugLocObserver &LocObserver) const {
2049 MachineIRBuilder &B = Helper.MIRBuilder;
2050 MachineRegisterInfo &MRI = *B.getMRI();
2051
2052 switch (MI.getOpcode()) {
2053 case TargetOpcode::G_ADDRSPACE_CAST:
2054 return legalizeAddrSpaceCast(MI, MRI, B);
2055 case TargetOpcode::G_INTRINSIC_ROUNDEVEN:
2056 return legalizeFroundeven(MI, MRI, B);
2057 case TargetOpcode::G_FCEIL:
2058 return legalizeFceil(MI, MRI, B);
2059 case TargetOpcode::G_FREM:
2060 return legalizeFrem(MI, MRI, B);
2061 case TargetOpcode::G_INTRINSIC_TRUNC:
2062 return legalizeIntrinsicTrunc(MI, MRI, B);
2063 case TargetOpcode::G_SITOFP:
2064 return legalizeITOFP(MI, MRI, B, true);
2065 case TargetOpcode::G_UITOFP:
2066 return legalizeITOFP(MI, MRI, B, false);
2067 case TargetOpcode::G_FPTOSI:
2068 return legalizeFPTOI(MI, MRI, B, true);
2069 case TargetOpcode::G_FPTOUI:
2070 return legalizeFPTOI(MI, MRI, B, false);
2071 case TargetOpcode::G_FMINNUM:
2072 case TargetOpcode::G_FMAXNUM:
2073 case TargetOpcode::G_FMINNUM_IEEE:
2074 case TargetOpcode::G_FMAXNUM_IEEE:
2075 return legalizeMinNumMaxNum(Helper, MI);
2076 case TargetOpcode::G_EXTRACT_VECTOR_ELT:
2077 return legalizeExtractVectorElt(MI, MRI, B);
2078 case TargetOpcode::G_INSERT_VECTOR_ELT:
2079 return legalizeInsertVectorElt(MI, MRI, B);
2080 case TargetOpcode::G_FSIN:
2081 case TargetOpcode::G_FCOS:
2082 return legalizeSinCos(MI, MRI, B);
2083 case TargetOpcode::G_GLOBAL_VALUE:
2084 return legalizeGlobalValue(MI, MRI, B);
2085 case TargetOpcode::G_LOAD:
2086 case TargetOpcode::G_SEXTLOAD:
2087 case TargetOpcode::G_ZEXTLOAD:
2088 return legalizeLoad(Helper, MI);
2089 case TargetOpcode::G_STORE:
2090 return legalizeStore(Helper, MI);
2091 case TargetOpcode::G_FMAD:
2092 return legalizeFMad(MI, MRI, B);
2093 case TargetOpcode::G_FDIV:
2094 return legalizeFDIV(MI, MRI, B);
2095 case TargetOpcode::G_FFREXP:
2096 return legalizeFFREXP(MI, MRI, B);
2097 case TargetOpcode::G_FSQRT:
2098 return legalizeFSQRT(MI, MRI, B);
2099 case TargetOpcode::G_UDIV:
2100 case TargetOpcode::G_UREM:
2101 case TargetOpcode::G_UDIVREM:
2102 return legalizeUnsignedDIV_REM(MI, MRI, B);
2103 case TargetOpcode::G_SDIV:
2104 case TargetOpcode::G_SREM:
2105 case TargetOpcode::G_SDIVREM:
2106 return legalizeSignedDIV_REM(MI, MRI, B);
2107 case TargetOpcode::G_ATOMIC_CMPXCHG:
2108 return legalizeAtomicCmpXChg(MI, MRI, B);
2109 case TargetOpcode::G_FLOG2:
2110 return legalizeFlog2(MI, B);
2111 case TargetOpcode::G_FLOG:
2112 case TargetOpcode::G_FLOG10:
2113 return legalizeFlogCommon(MI, B);
2114 case TargetOpcode::G_FEXP2:
2115 return legalizeFExp2(MI, B);
2116 case TargetOpcode::G_FEXP:
2117 case TargetOpcode::G_FEXP10:
2118 return legalizeFExp(MI, B);
2119 case TargetOpcode::G_FPOW:
2120 return legalizeFPow(MI, B);
2121 case TargetOpcode::G_FFLOOR:
2122 return legalizeFFloor(MI, MRI, B);
2123 case TargetOpcode::G_BUILD_VECTOR:
2124 case TargetOpcode::G_BUILD_VECTOR_TRUNC:
2125 return legalizeBuildVector(MI, MRI, B);
2126 case TargetOpcode::G_MUL:
2127 return legalizeMul(Helper, MI);
2128 case TargetOpcode::G_CTLZ:
2129 case TargetOpcode::G_CTTZ:
2130 return legalizeCTLZ_CTTZ(MI, MRI, B);
2131 case TargetOpcode::G_INTRINSIC_FPTRUNC_ROUND:
2132 return legalizeFPTruncRound(MI, B);
2133 case TargetOpcode::G_STACKSAVE:
2134 return legalizeStackSave(MI, B);
2135 case TargetOpcode::G_GET_FPENV:
2136 return legalizeGetFPEnv(MI, MRI, B);
2137 case TargetOpcode::G_SET_FPENV:
2138 return legalizeSetFPEnv(MI, MRI, B);
2139 case TargetOpcode::G_TRAP:
2140 return legalizeTrap(MI, MRI, B);
2141 case TargetOpcode::G_DEBUGTRAP:
2142 return legalizeDebugTrap(MI, MRI, B);
2143 default:
2144 return false;
2145 }
2146
2147 llvm_unreachable("expected switch to return");
2148}
2149
2151 unsigned AS,
2153 MachineIRBuilder &B) const {
2154 MachineFunction &MF = B.getMF();
2155 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
2156 const LLT S32 = LLT::scalar(32);
2157 const LLT S64 = LLT::scalar(64);
2158
2160
2161 if (ST.hasApertureRegs()) {
2162 // Note: this register is somewhat broken. When used as a 32-bit operand,
2163 // it only returns zeroes. The real value is in the upper 32 bits.
2164 // Thus, we must extract the high 32 bits.
2165 const unsigned ApertureRegNo = (AS == AMDGPUAS::LOCAL_ADDRESS)
2166 ? AMDGPU::SRC_SHARED_BASE
2167 : AMDGPU::SRC_PRIVATE_BASE;
2168 // FIXME: It would be more natural to emit a COPY here, but then copy
2169 // coalescing would kick in and it would think it's okay to use the "HI"
2170 // subregister (instead of extracting the HI 32 bits) which is an artificial
2171 // (unusable) register.
2172 // Register TableGen definitions would need an overhaul to get rid of the
2173 // artificial "HI" aperture registers and prevent this kind of issue from
2174 // happening.
2175 Register Dst = MRI.createGenericVirtualRegister(S64);
2176 MRI.setRegClass(Dst, &AMDGPU::SReg_64RegClass);
2177 B.buildInstr(AMDGPU::S_MOV_B64, {Dst}, {Register(ApertureRegNo)});
2178 return B.buildUnmerge(S32, Dst).getReg(1);
2179 }
2180
2181 // TODO: can we be smarter about machine pointer info?
2183 Register LoadAddr = MRI.createGenericVirtualRegister(
2185 // For code object version 5, private_base and shared_base are passed through
2186 // implicit kernargs.
2193 ST.getTargetLowering()->getImplicitParameterOffset(B.getMF(), Param);
2194
2195 Register KernargPtrReg = MRI.createGenericVirtualRegister(
2197
2198 if (!loadInputValue(KernargPtrReg, B,
2200 return Register();
2201
2203 PtrInfo,
2207
2208 // Pointer address
2209 B.buildPtrAdd(LoadAddr, KernargPtrReg,
2210 B.buildConstant(LLT::scalar(64), Offset).getReg(0));
2211 // Load address
2212 return B.buildLoad(S32, LoadAddr, *MMO).getReg(0);
2213 }
2214
2215 Register QueuePtr = MRI.createGenericVirtualRegister(
2217
2219 return Register();
2220
2221 // Offset into amd_queue_t for group_segment_aperture_base_hi /
2222 // private_segment_aperture_base_hi.
2223 uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;
2224
2226 PtrInfo,
2229 LLT::scalar(32), commonAlignment(Align(64), StructOffset));
2230
2231 B.buildPtrAdd(LoadAddr, QueuePtr,
2232 B.buildConstant(LLT::scalar(64), StructOffset).getReg(0));
2233 return B.buildLoad(S32, LoadAddr, *MMO).getReg(0);
2234}
2235
2236/// Return true if the value is a known valid address, such that a null check is
2237/// not necessary.
2239 const AMDGPUTargetMachine &TM, unsigned AddrSpace) {
2240 MachineInstr *Def = MRI.getVRegDef(Val);
2241 switch (Def->getOpcode()) {
2242 case AMDGPU::G_FRAME_INDEX:
2243 case AMDGPU::G_GLOBAL_VALUE:
2244 case AMDGPU::G_BLOCK_ADDR:
2245 return true;
2246 case AMDGPU::G_CONSTANT: {
2247 const ConstantInt *CI = Def->getOperand(1).getCImm();
2248 return CI->getSExtValue() != TM.getNullPointerValue(AddrSpace);
2249 }
2250 default:
2251 return false;
2252 }
2253
2254 return false;
2255}
2256
2259 MachineIRBuilder &B) const {
2260 MachineFunction &MF = B.getMF();
2261
2262 // MI can either be a G_ADDRSPACE_CAST or a
2263 // G_INTRINSIC @llvm.amdgcn.addrspacecast.nonnull
2264 assert(MI.getOpcode() == TargetOpcode::G_ADDRSPACE_CAST ||
2265 (isa<GIntrinsic>(MI) && cast<GIntrinsic>(MI).getIntrinsicID() ==
2266 Intrinsic::amdgcn_addrspacecast_nonnull));
2267
2268 const LLT S32 = LLT::scalar(32);
2269 Register Dst = MI.getOperand(0).getReg();
2270 Register Src = isa<GIntrinsic>(MI) ? MI.getOperand(2).getReg()
2271 : MI.getOperand(1).getReg();
2272 LLT DstTy = MRI.getType(Dst);
2273 LLT SrcTy = MRI.getType(Src);
2274 unsigned DestAS = DstTy.getAddressSpace();
2275 unsigned SrcAS = SrcTy.getAddressSpace();
2276
2277 // TODO: Avoid reloading from the queue ptr for each cast, or at least each
2278 // vector element.
2279 assert(!DstTy.isVector());
2280
2281 const AMDGPUTargetMachine &TM
2282 = static_cast<const AMDGPUTargetMachine &>(MF.getTarget());
2283
2284 if (TM.isNoopAddrSpaceCast(SrcAS, DestAS)) {
2285 MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST));
2286 return true;
2287 }
2288
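  // (Editorial pseudo-IR sketch of the flat -> local/private lowering below,
  // for the case where the source is not known to be non-null.)
  //   %lo  = G_EXTRACT %src, 0           ; low 32 bits of the flat pointer
  //   %ne  = G_ICMP ne, %src, flat-null
  //   %dst = G_SELECT %ne, %lo, segment-null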
2289 if (SrcAS == AMDGPUAS::FLAT_ADDRESS &&
2290 (DestAS == AMDGPUAS::LOCAL_ADDRESS ||
2291 DestAS == AMDGPUAS::PRIVATE_ADDRESS)) {
2292 // For llvm.amdgcn.addrspacecast.nonnull we can always assume non-null, for
2293 // G_ADDRSPACE_CAST we need to guess.
2294 if (isa<GIntrinsic>(MI) || isKnownNonNull(Src, MRI, TM, SrcAS)) {
2295 // Extract low 32-bits of the pointer.
2296 B.buildExtract(Dst, Src, 0);
2297 MI.eraseFromParent();
2298 return true;
2299 }
2300
2301 unsigned NullVal = TM.getNullPointerValue(DestAS);
2302
2303 auto SegmentNull = B.buildConstant(DstTy, NullVal);
2304 auto FlatNull = B.buildConstant(SrcTy, 0);
2305
2306 // Extract low 32-bits of the pointer.
2307 auto PtrLo32 = B.buildExtract(DstTy, Src, 0);
2308
2309 auto CmpRes =
2310 B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, FlatNull.getReg(0));
2311 B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0));
2312
2313 MI.eraseFromParent();
2314 return true;
2315 }
2316
2317 if (DestAS == AMDGPUAS::FLAT_ADDRESS &&
2318 (SrcAS == AMDGPUAS::LOCAL_ADDRESS ||
2319 SrcAS == AMDGPUAS::PRIVATE_ADDRESS)) {
2320 Register ApertureReg = getSegmentAperture(SrcAS, MRI, B);
2321 if (!ApertureReg.isValid())
2322 return false;
2323
2324 // Coerce the type of the low half of the result so we can use merge_values.
2325 Register SrcAsInt = B.buildPtrToInt(S32, Src).getReg(0);
2326
2327 // TODO: Should we allow mismatched types but matching sizes in merges to
2328 // avoid the ptrtoint?
2329 auto BuildPtr = B.buildMergeLikeInstr(DstTy, {SrcAsInt, ApertureReg});
2330
2331 // For llvm.amdgcn.addrspacecast.nonnull we can always assume non-null, for
2332 // G_ADDRSPACE_CAST we need to guess.
2333 if (isa<GIntrinsic>(MI) || isKnownNonNull(Src, MRI, TM, SrcAS)) {
2334 B.buildCopy(Dst, BuildPtr);
2335 MI.eraseFromParent();
2336 return true;
2337 }
2338
2339 auto SegmentNull = B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS));
2340 auto FlatNull = B.buildConstant(DstTy, TM.getNullPointerValue(DestAS));
2341
2342 auto CmpRes = B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src,
2343 SegmentNull.getReg(0));
2344
2345 B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull);
2346
2347 MI.eraseFromParent();
2348 return true;
2349 }
2350
2351 if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
2352 SrcTy.getSizeInBits() == 64) {
2353 // Truncate.
2354 B.buildExtract(Dst, Src, 0);
2355 MI.eraseFromParent();
2356 return true;
2357 }
2358
2359 if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
2360 DstTy.getSizeInBits() == 64) {
2362 uint32_t AddrHiVal = Info->get32BitAddressHighBits();
2363 auto PtrLo = B.buildPtrToInt(S32, Src);
2364 auto HighAddr = B.buildConstant(S32, AddrHiVal);
2365 B.buildMergeLikeInstr(Dst, {PtrLo, HighAddr});
2366 MI.eraseFromParent();
2367 return true;
2368 }
2369
2370 DiagnosticInfoUnsupported InvalidAddrSpaceCast(
2371 MF.getFunction(), "invalid addrspacecast", B.getDebugLoc());
2372
2373 LLVMContext &Ctx = MF.getFunction().getContext();
2374 Ctx.diagnose(InvalidAddrSpaceCast);
2375 B.buildUndef(Dst);
2376 MI.eraseFromParent();
2377 return true;
2378}
2379
2382 MachineIRBuilder &B) const {
2383 Register Src = MI.getOperand(1).getReg();
2384 LLT Ty = MRI.getType(Src);
2385 assert(Ty.isScalar() && Ty.getSizeInBits() == 64);
2386
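  // (Editorial note on the constants below.) This is the classic
  // round-to-nearest-even trick: adding and then subtracting 2^52 (with the
  // input's sign copied onto it) forces the rounding to happen in the FP
  // adder, e.g. (2.5 + 2^52) - 2^52 == 2.0 since the tie rounds to even.
  // Inputs with |x| > 0x1.fffffffffffffp+51 are already integers, so the
  // original value is selected instead.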
2387 APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
2388 APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");
2389
2390 auto C1 = B.buildFConstant(Ty, C1Val);
2391 auto CopySign = B.buildFCopysign(Ty, C1, Src);
2392
2393 // TODO: Should this propagate fast-math-flags?
2394 auto Tmp1 = B.buildFAdd(Ty, Src, CopySign);
2395 auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign);
2396
2397 auto C2 = B.buildFConstant(Ty, C2Val);
2398 auto Fabs = B.buildFAbs(Ty, Src);
2399
2400 auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2);
2401 B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2);
2402 MI.eraseFromParent();
2403 return true;
2404}
2405
2408 MachineIRBuilder &B) const {
2409
2410 const LLT S1 = LLT::scalar(1);
2411 const LLT S64 = LLT::scalar(64);
2412
2413 Register Src = MI.getOperand(1).getReg();
2414 assert(MRI.getType(Src) == S64);
2415
2416 // result = trunc(src)
2417 // if (src > 0.0 && src != result)
2418 // result += 1.0
2419
2420 auto Trunc = B.buildIntrinsicTrunc(S64, Src);
2421
2422 const auto Zero = B.buildFConstant(S64, 0.0);
2423 const auto One = B.buildFConstant(S64, 1.0);
2424 auto Lt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero);
2425 auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc);
2426 auto And = B.buildAnd(S1, Lt0, NeTrunc);
2427 auto Add = B.buildSelect(S64, And, One, Zero);
2428
2429 // TODO: Should this propagate fast-math-flags?
2430 B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add);
2431 MI.eraseFromParent();
2432 return true;
2433}
2434
2437 MachineIRBuilder &B) const {
2438 Register DstReg = MI.getOperand(0).getReg();
2439 Register Src0Reg = MI.getOperand(1).getReg();
2440 Register Src1Reg = MI.getOperand(2).getReg();
2441 auto Flags = MI.getFlags();
2442 LLT Ty = MRI.getType(DstReg);
2443
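  // (Editorial note.) This expands frem as x - trunc(x / y) * y, written as a
  // single FMA: fma(-trunc(x / y), y, x).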
2444 auto Div = B.buildFDiv(Ty, Src0Reg, Src1Reg, Flags);
2445 auto Trunc = B.buildIntrinsicTrunc(Ty, Div, Flags);
2446 auto Neg = B.buildFNeg(Ty, Trunc, Flags);
2447 B.buildFMA(DstReg, Neg, Src1Reg, Src0Reg, Flags);
2448 MI.eraseFromParent();
2449 return true;
2450}
2451
2454 const unsigned FractBits = 52;
2455 const unsigned ExpBits = 11;
2456 LLT S32 = LLT::scalar(32);
2457
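  // (Editorial note.) Hi holds bits [63:32] of the f64, so the 11-bit biased
  // exponent (bits [62:52] of the double) starts at bit 20 of Hi; ubfe
  // extracts it and subtracting the bias (1023) gives the unbiased exponent.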
2458 auto Const0 = B.buildConstant(S32, FractBits - 32);
2459 auto Const1 = B.buildConstant(S32, ExpBits);
2460
2461 auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32})
2462 .addUse(Hi)
2463 .addUse(Const0.getReg(0))
2464 .addUse(Const1.getReg(0));
2465
2466 return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023));
2467}
2468
2471 MachineIRBuilder &B) const {
2472 const LLT S1 = LLT::scalar(1);
2473 const LLT S32 = LLT::scalar(32);
2474 const LLT S64 = LLT::scalar(64);
2475
2476 Register Src = MI.getOperand(1).getReg();
2477 assert(MRI.getType(Src) == S64);
2478
2479 // TODO: Should this use extract since the low half is unused?
2480 auto Unmerge = B.buildUnmerge({S32, S32}, Src);
2481 Register Hi = Unmerge.getReg(1);
2482
2483 // Extract the upper half, since this is where we will find the sign and
2484 // exponent.
2485 auto Exp = extractF64Exponent(Hi, B);
2486
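  // (Editorial note on the sequence below.) For exponent e, the low (52 - e)
  // fraction bits lie below the binary point; (FractMask >> e) selects exactly
  // those bits, and x & ~(FractMask >> e) clears them. e < 0 yields +/-0 (just
  // the sign bit), and e > 51 means x is already an integer.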
2487 const unsigned FractBits = 52;
2488
2489 // Extract the sign bit.
2490 const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31);
2491 auto SignBit = B.buildAnd(S32, Hi, SignBitMask);
2492
2493 const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1);
2494
2495 const auto Zero32 = B.buildConstant(S32, 0);
2496
2497 // Extend back to 64-bits.
2498 auto SignBit64 = B.buildMergeLikeInstr(S64, {Zero32, SignBit});
2499
2500 auto Shr = B.buildAShr(S64, FractMask, Exp);
2501 auto Not = B.buildNot(S64, Shr);
2502 auto Tmp0 = B.buildAnd(S64, Src, Not);
2503 auto FiftyOne = B.buildConstant(S32, FractBits - 1);
2504
2505 auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32);
2506 auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne);
2507
2508 auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0);
2509 B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1);
2510 MI.eraseFromParent();
2511 return true;
2512}
2513
2516 MachineIRBuilder &B, bool Signed) const {
2517
2518 Register Dst = MI.getOperand(0).getReg();
2519 Register Src = MI.getOperand(1).getReg();
2520
2521 const LLT S64 = LLT::scalar(64);
2522 const LLT S32 = LLT::scalar(32);
2523
2524 assert(MRI.getType(Src) == S64);
2525
2526 auto Unmerge = B.buildUnmerge({S32, S32}, Src);
2527 auto ThirtyTwo = B.buildConstant(S32, 32);
2528
2529 if (MRI.getType(Dst) == S64) {
2530 auto CvtHi = Signed ? B.buildSITOFP(S64, Unmerge.getReg(1))
2531 : B.buildUITOFP(S64, Unmerge.getReg(1));
2532
2533 auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0));
2534 auto LdExp = B.buildFLdexp(S64, CvtHi, ThirtyTwo);
2535
2536 // TODO: Should this propagate fast-math-flags?
2537 B.buildFAdd(Dst, LdExp, CvtLo);
2538 MI.eraseFromParent();
2539 return true;
2540 }
2541
2542 assert(MRI.getType(Dst) == S32);
2543
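  // (Editorial note, roughly describing the s32-result path below.) The
  // 64-bit input is shifted left so its significant bits land in the high
  // word (the signed case derives the shift from amdgcn_sffbh and clamps it),
  // any nonzero bits remaining in the low word are folded into bit 0 of the
  // high word as a sticky bit so rounding stays correct, the high word is
  // converted, and the result is rescaled with ldexp by (32 - shift amount).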
2544 auto One = B.buildConstant(S32, 1);
2545
2546 MachineInstrBuilder ShAmt;
2547 if (Signed) {
2548 auto ThirtyOne = B.buildConstant(S32, 31);
2549 auto X = B.buildXor(S32, Unmerge.getReg(0), Unmerge.getReg(1));
2550 auto OppositeSign = B.buildAShr(S32, X, ThirtyOne);
2551 auto MaxShAmt = B.buildAdd(S32, ThirtyTwo, OppositeSign);
2552 auto LS = B.buildIntrinsic(Intrinsic::amdgcn_sffbh, {S32})
2553 .addUse(Unmerge.getReg(1));
2554 auto LS2 = B.buildSub(S32, LS, One);
2555 ShAmt = B.buildUMin(S32, LS2, MaxShAmt);
2556 } else
2557 ShAmt = B.buildCTLZ(S32, Unmerge.getReg(1));
2558 auto Norm = B.buildShl(S64, Src, ShAmt);
2559 auto Unmerge2 = B.buildUnmerge({S32, S32}, Norm);
2560 auto Adjust = B.buildUMin(S32, One, Unmerge2.getReg(0));
2561 auto Norm2 = B.buildOr(S32, Unmerge2.getReg(1), Adjust);
2562 auto FVal = Signed ? B.buildSITOFP(S32, Norm2) : B.buildUITOFP(S32, Norm2);
2563 auto Scale = B.buildSub(S32, ThirtyTwo, ShAmt);
2564 B.buildFLdexp(Dst, FVal, Scale);
2565 MI.eraseFromParent();
2566 return true;
2567}
2568
2569// TODO: Copied from DAG implementation. Verify logic and document how this
2570// actually works.
2574 bool Signed) const {
2575
2576 Register Dst = MI.getOperand(0).getReg();
2577 Register Src = MI.getOperand(1).getReg();
2578
2579 const LLT S64 = LLT::scalar(64);
2580 const LLT S32 = LLT::scalar(32);
2581
2582 const LLT SrcLT = MRI.getType(Src);
2583 assert((SrcLT == S32 || SrcLT == S64) && MRI.getType(Dst) == S64);
2584
2585 unsigned Flags = MI.getFlags();
2586
2587 // The basic idea of converting a floating point number into a pair of 32-bit
2588 // integers is illustrated as follows:
2589 //
2590 // tf := trunc(val);
2591 // hif := floor(tf * 2^-32);
2592 // lof := tf - hif * 2^32; // lof is always positive due to floor.
2593 // hi := fptoi(hif);
2594 // lo := fptoi(lof);
2595 //
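  // (Editorial worked example.) For val = 2^33 + 5: tf = 2^33 + 5,
  // hif = floor(tf * 2^-32) = 2, lof = tf - hif * 2^32 = 5, so hi = 2 and
  // lo = 5, which reassemble to 2 * 2^32 + 5 = 2^33 + 5.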
2596 auto Trunc = B.buildIntrinsicTrunc(SrcLT, Src, Flags);
2598 if (Signed && SrcLT == S32) {
2599 // However, a 32-bit floating point number has only a 23-bit mantissa,
2600 // which is not enough to hold all the significant bits of `lof` if val is
2601 // negative. To avoid the loss of precision, we need to take the absolute
2602 // value after truncating and flip the result back based on the original
2603 // signedness.
2604 Sign = B.buildAShr(S32, Src, B.buildConstant(S32, 31));
2605 Trunc = B.buildFAbs(S32, Trunc, Flags);
2606 }
2607 MachineInstrBuilder K0, K1;
2608 if (SrcLT == S64) {
2609 K0 = B.buildFConstant(
2610 S64, llvm::bit_cast<double>(UINT64_C(/*2^-32*/ 0x3df0000000000000)));
2611 K1 = B.buildFConstant(
2612 S64, llvm::bit_cast<double>(UINT64_C(/*-2^32*/ 0xc1f0000000000000)));
2613 } else {
2614 K0 = B.buildFConstant(
2615 S32, llvm::bit_cast<float>(UINT32_C(/*2^-32*/ 0x2f800000)));
2616 K1 = B.buildFConstant(
2617 S32, llvm::bit_cast<float>(UINT32_C(/*-2^32*/ 0xcf800000)));
2618 }
2619
2620 auto Mul = B.buildFMul(SrcLT, Trunc, K0, Flags);
2621 auto FloorMul = B.buildFFloor(SrcLT, Mul, Flags);
2622 auto Fma = B.buildFMA(SrcLT, FloorMul, K1, Trunc, Flags);
2623
2624 auto Hi = (Signed && SrcLT == S64) ? B.buildFPTOSI(S32, FloorMul)
2625 : B.buildFPTOUI(S32, FloorMul);
2626 auto Lo = B.buildFPTOUI(S32, Fma);
2627
2628 if (Signed && SrcLT == S32) {
2629 // Flip the result based on the signedness, which is either all 0s or 1s.
2630 Sign = B.buildMergeLikeInstr(S64, {Sign, Sign});
2631 // r := xor({lo, hi}, sign) - sign;
2632 B.buildSub(Dst, B.buildXor(S64, B.buildMergeLikeInstr(S64, {Lo, Hi}), Sign),
2633 Sign);
2634 } else
2635 B.buildMergeLikeInstr(Dst, {Lo, Hi});
2636 MI.eraseFromParent();
2637
2638 return true;
2639}
2640
2642 MachineInstr &MI) const {
2643 MachineFunction &MF = Helper.MIRBuilder.getMF();
2645
2646 const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE ||
2647 MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE;
2648
2649 // With ieee_mode disabled, the instructions have the correct behavior
2650 // already for G_FMINNUM/G_FMAXNUM
2651 if (!MFI->getMode().IEEE)
2652 return !IsIEEEOp;
2653
2654 if (IsIEEEOp)
2655 return true;
2656
2658}
2659
2662 MachineIRBuilder &B) const {
2663 // TODO: Should move some of this into LegalizerHelper.
2664
2665 // TODO: Promote dynamic indexing of s16 to s32
2666
2667 Register Dst = MI.getOperand(0).getReg();
2668 Register Vec = MI.getOperand(1).getReg();
2669
2670 LLT VecTy = MRI.getType(Vec);
2671 LLT EltTy = VecTy.getElementType();
2672 assert(EltTy == MRI.getType(Dst));
2673
2674 // Other legalization maps vector<? x [type bigger than 64 bits]> via bitcasts
2675 // but we can't go directly to that logic because you can't bitcast a vector
2676 // of pointers to a vector of integers. Therefore, introduce an intermediate
2677 // vector of integers using ptrtoint (and inttoptr on the output) in order to
2678 // drive the legalization forward.
2679 if (EltTy.isPointer() && EltTy.getSizeInBits() > 64) {
2680 LLT IntTy = LLT::scalar(EltTy.getSizeInBits());
2681 LLT IntVecTy = VecTy.changeElementType(IntTy);
2682
2683 auto IntVec = B.buildPtrToInt(IntVecTy, Vec);
2684 auto IntElt = B.buildExtractVectorElement(IntTy, IntVec, MI.getOperand(2));
2685 B.buildIntToPtr(Dst, IntElt);
2686
2687 MI.eraseFromParent();
2688 return true;
2689 }
2690
2691 // FIXME: Artifact combiner probably should have replaced the truncated
2692 // constant before this, so we shouldn't need
2693 // getIConstantVRegValWithLookThrough.
2694 std::optional<ValueAndVReg> MaybeIdxVal =
2695 getIConstantVRegValWithLookThrough(MI.getOperand(2).getReg(), MRI);
2696 if (!MaybeIdxVal) // Dynamic case will be selected to register indexing.
2697 return true;
2698 const uint64_t IdxVal = MaybeIdxVal->Value.getZExtValue();
2699
2700 if (IdxVal < VecTy.getNumElements()) {
2701 auto Unmerge = B.buildUnmerge(EltTy, Vec);
2702 B.buildCopy(Dst, Unmerge.getReg(IdxVal));
2703 } else {
2704 B.buildUndef(Dst);
2705 }
2706
2707 MI.eraseFromParent();
2708 return true;
2709}
2710
2713 MachineIRBuilder &B) const {
2714 // TODO: Should move some of this into LegalizerHelper.
2715
2716 // TODO: Promote dynamic indexing of s16 to s32
2717
2718 Register Dst = MI.getOperand(0).getReg();
2719 Register Vec = MI.getOperand(1).getReg();
2720 Register Ins = MI.getOperand(2).getReg();
2721
2722 LLT VecTy = MRI.getType(Vec);
2723 LLT EltTy = VecTy.getElementType();
2724 assert(EltTy == MRI.getType(Ins));
2725
2726 // Other legalization maps vector<? x [type bigger than 64 bits]> via bitcasts
2727 // but we can't go directly to that logic because you can't bitcast a vector
2728 // of pointers to a vector of integers. Therefore, make the pointer vector
2729 // into an equivalent vector of integers with ptrtoint, insert the ptrtoint'd
2730 // new value, and then inttoptr the result vector back. This will then allow
2731 // the rest of legalization to take over.
2732 if (EltTy.isPointer() && EltTy.getSizeInBits() > 64) {
2733 LLT IntTy = LLT::scalar(EltTy.getSizeInBits());
2734 LLT IntVecTy = VecTy.changeElementType(IntTy);
2735
2736 auto IntVecSource = B.buildPtrToInt(IntVecTy, Vec);
2737 auto IntIns = B.buildPtrToInt(IntTy, Ins);
2738 auto IntVecDest = B.buildInsertVectorElement(IntVecTy, IntVecSource, IntIns,
2739 MI.getOperand(3));
2740 B.buildIntToPtr(Dst, IntVecDest);
2741 MI.eraseFromParent();
2742 return true;
2743 }
2744
2745 // FIXME: Artifact combiner probably should have replaced the truncated
2746 // constant before this, so we shouldn't need
2747 // getIConstantVRegValWithLookThrough.
2748 std::optional<ValueAndVReg> MaybeIdxVal =
2749 getIConstantVRegValWithLookThrough(MI.getOperand(3).getReg(), MRI);
2750 if (!MaybeIdxVal) // Dynamic case will be selected to register indexing.
2751 return true;
2752
2753 const uint64_t IdxVal = MaybeIdxVal->Value.getZExtValue();
2754
2755 unsigned NumElts = VecTy.getNumElements();
2756 if (IdxVal < NumElts) {
2758 for (unsigned i = 0; i < NumElts; ++i)
2759 SrcRegs.push_back(MRI.createGenericVirtualRegister(EltTy));
2760 B.buildUnmerge(SrcRegs, Vec);
2761
2762 SrcRegs[IdxVal] = MI.getOperand(2).getReg();
2763 B.buildMergeLikeInstr(Dst, SrcRegs);
2764 } else {
2765 B.buildUndef(Dst);
2766 }
2767
2768 MI.eraseFromParent();
2769 return true;
2770}
2771
2774 MachineIRBuilder &B) const {
2775
2776 Register DstReg = MI.getOperand(0).getReg();
2777 Register SrcReg = MI.getOperand(1).getReg();
2778 LLT Ty = MRI.getType(DstReg);
2779 unsigned Flags = MI.getFlags();
2780
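  // (Editorial note.) The hardware sin/cos intrinsics take their input in
  // revolutions rather than radians, so the source is pre-scaled by 1/(2*pi);
  // targets with a reduced trig input range additionally wrap the product
  // into [0, 1) with amdgcn_fract first.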
2781 Register TrigVal;
2782 auto OneOver2Pi = B.buildFConstant(Ty, 0.5 * numbers::inv_pi);
2783 if (ST.hasTrigReducedRange()) {
2784 auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags);
2785 TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty})
2786 .addUse(MulVal.getReg(0))
2787 .setMIFlags(Flags)
2788 .getReg(0);
2789 } else
2790 TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0);
2791
2792 Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ?
2793 Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos;
2794 B.buildIntrinsic(TrigIntrin, ArrayRef<Register>(DstReg))
2795 .addUse(TrigVal)
2796 .setMIFlags(Flags);
2797 MI.eraseFromParent();
2798 return true;
2799}
2800
2803 const GlobalValue *GV,
2804 int64_t Offset,
2805 unsigned GAFlags) const {
2806 assert(isInt<32>(Offset + 4) && "32-bit offset is expected!");
2807 // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered
2808 // to the following code sequence:
2809 //
2810 // For constant address space:
2811 // s_getpc_b64 s[0:1]
2812 // s_add_u32 s0, s0, $symbol
2813 // s_addc_u32 s1, s1, 0
2814 //
2815 // s_getpc_b64 returns the address of the s_add_u32 instruction and then
2816 // a fixup or relocation is emitted to replace $symbol with a literal
2817 // constant, which is a pc-relative offset from the encoding of the $symbol
2818 // operand to the global variable.
2819 //
2820 // For global address space:
2821 // s_getpc_b64 s[0:1]
2822 // s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo
2823 // s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi
2824 //
2825 // s_getpc_b64 returns the address of the s_add_u32 instruction and then
2826 // fixups or relocations are emitted to replace $symbol@*@lo and
2827 // $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant,
2828 // which is a 64-bit pc-relative offset from the encoding of the $symbol
2829 // operand to the global variable.
2830
2832
2833 Register PCReg = PtrTy.getSizeInBits() != 32 ? DstReg :
2834 B.getMRI()->createGenericVirtualRegister(ConstPtrTy);
2835
2836 MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET)
2837 .addDef(PCReg);
2838
2839 MIB.addGlobalAddress(GV, Offset, GAFlags);
2840 if (GAFlags == SIInstrInfo::MO_NONE)
2841 MIB.addImm(0);
2842 else
2843 MIB.addGlobalAddress(GV, Offset, GAFlags + 1);
2844
2845 if (!B.getMRI()->getRegClassOrNull(PCReg))
2846 B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass);
2847
2848 if (PtrTy.getSizeInBits() == 32)
2849 B.buildExtract(DstReg, PCReg, 0);
2850 return true;
2851}
2852
2853// Emit an ABS32_LO / ABS32_HI relocation stub.
2855 Register DstReg, LLT PtrTy, MachineIRBuilder &B, const GlobalValue *GV,
2856 MachineRegisterInfo &MRI) const {
2857 bool RequiresHighHalf = PtrTy.getSizeInBits() != 32;
2858
2859 LLT S32 = LLT::scalar(32);
2860
2861 // Use the destination directly if and only if we only store the lower
2862 // address part and no register class has been set.
2863 Register AddrLo = !RequiresHighHalf && !MRI.getRegClassOrNull(DstReg)
2864 ? DstReg
2865 : MRI.createGenericVirtualRegister(S32);
2866
2867 if (!MRI.getRegClassOrNull(AddrLo))
2868 MRI.setRegClass(AddrLo, &AMDGPU::SReg_32RegClass);
2869
2870 // Write the lower half.
2871 B.buildInstr(AMDGPU::S_MOV_B32)
2872 .addDef(AddrLo)
2873 .addGlobalAddress(GV, 0, SIInstrInfo::MO_ABS32_LO);
2874
2875 // If required, write the upper half as well.
2876 if (RequiresHighHalf) {
2877 assert(PtrTy.getSizeInBits() == 64 &&
2878 "Must provide a 64-bit pointer type!");
2879
2880 Register AddrHi = MRI.createGenericVirtualRegister(S32);
2881 MRI.setRegClass(AddrHi, &AMDGPU::SReg_32RegClass);
2882
2883 B.buildInstr(AMDGPU::S_MOV_B32)
2884 .addDef(AddrHi)
2885 .addGlobalAddress(GV, 0, SIInstrInfo::MO_ABS32_HI);
2886
2887 // Use the destination directly if and only if no register class has
2888 // been set.
2889 Register AddrDst = !MRI.getRegClassOrNull(DstReg)
2890 ? DstReg
2891 : MRI.createGenericVirtualRegister(LLT::scalar(64));
2892
2893 if (!MRI.getRegClassOrNull(AddrDst))
2894 MRI.setRegClass(AddrDst, &AMDGPU::SReg_64RegClass);
2895
2896 B.buildMergeValues(AddrDst, {AddrLo, AddrHi});
2897
2898 // If we created a new register for the destination, cast the result into
2899 // the final output.
2900 if (AddrDst != DstReg)
2901 B.buildCast(DstReg, AddrDst);
2902 } else if (AddrLo != DstReg) {
2903 // If we created a new register for the destination, cast the result into
2904 // the final output.
2905 B.buildCast(DstReg, AddrLo);
2906 }
2907}
2908
2911 MachineIRBuilder &B) const {
2912 Register DstReg = MI.getOperand(0).getReg();
2913 LLT Ty = MRI.getType(DstReg);
2914 unsigned AS = Ty.getAddressSpace();
2915
2916 const GlobalValue *GV = MI.getOperand(1).getGlobal();
2917 MachineFunction &MF = B.getMF();
2919
2921 if (!MFI->isModuleEntryFunction() &&
2922 !GV->getName().equals("llvm.amdgcn.module.lds")) {
2923 const Function &Fn = MF.getFunction();
2924 DiagnosticInfoUnsupported BadLDSDecl(
2925 Fn, "local memory global used by non-kernel function", MI.getDebugLoc(),
2926 DS_Warning);
2927 Fn.getContext().diagnose(BadLDSDecl);
2928
2929 // We currently don't have a way to correctly allocate LDS objects that
2930 // aren't directly associated with a kernel. We do force inlining of
2931 // functions that use local objects. However, if these dead functions are
2932 // not eliminated, we don't want a compile time error. Just emit a warning
2933 // and a trap, since there should be no callable path here.
2934 B.buildTrap();
2935 B.buildUndef(DstReg);
2936 MI.eraseFromParent();
2937 return true;
2938 }
2939
2940 // TODO: We could emit code to handle the initialization somewhere.
2941 // We ignore the initializer for now and legalize it to allow selection.
2942 // The initializer will be rejected during assembly emission anyway.
2943 const SITargetLowering *TLI = ST.getTargetLowering();
2944 if (!TLI->shouldUseLDSConstAddress(GV)) {
2945 MI.getOperand(1).setTargetFlags(SIInstrInfo::MO_ABS32_LO);
2946 return true; // Leave in place;
2947 }
2948
2949 if (AS == AMDGPUAS::LOCAL_ADDRESS && GV->hasExternalLinkage()) {
2950 Type *Ty = GV->getValueType();
2951 // HIP uses an unsized array `extern __shared__ T s[]` or a similar
2952 // zero-sized type in other languages to declare dynamic shared memory
2953 // whose size is not known at compile time. Such arrays are allocated by
2954 // the runtime and placed directly after the statically allocated ones,
2955 // so they all share the same offset.
2956 if (B.getDataLayout().getTypeAllocSize(Ty).isZero()) {
2957 // Adjust alignment for that dynamic shared memory array.
2958 MFI->setDynLDSAlign(MF.getFunction(), *cast<GlobalVariable>(GV));
2959 LLT S32 = LLT::scalar(32);
2960 auto Sz = B.buildIntrinsic(Intrinsic::amdgcn_groupstaticsize, {S32});
2961 B.buildIntToPtr(DstReg, Sz);
2962 MI.eraseFromParent();
2963 return true;
2964 }
2965 }
2966
2967 B.buildConstant(DstReg, MFI->allocateLDSGlobal(B.getDataLayout(),
2968 *cast<GlobalVariable>(GV)));
2969 MI.eraseFromParent();
2970 return true;
2971 }
2972
2973 if (ST.isAmdPalOS() || ST.isMesa3DOS()) {
2974 buildAbsGlobalAddress(DstReg, Ty, B, GV, MRI);
2975 MI.eraseFromParent();
2976 return true;
2977 }
2978
2979 const SITargetLowering *TLI = ST.getTargetLowering();
2980
2981 if (TLI->shouldEmitFixup(GV)) {
2982 buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0);
2983 MI.eraseFromParent();
2984 return true;
2985 }
2986
2987 if (TLI->shouldEmitPCReloc(GV)) {
2988 buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32);
2989 MI.eraseFromParent();
2990 return true;
2991 }
2992
2994 Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy);
2995
2996 LLT LoadTy = Ty.getSizeInBits() == 32 ? PtrTy : Ty;
3001 LoadTy, Align(8));
3002
3003 buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32);
3004
3005 if (Ty.getSizeInBits() == 32) {
3006 // Truncate if this is a 32-bit constant address.
3007 auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO);
3008 B.buildExtract(DstReg, Load, 0);
3009 } else
3010 B.buildLoad(DstReg, GOTAddr, *GOTMMO);
3011
3012 MI.eraseFromParent();
3013 return true;
3014}
3015
3017 if (Ty.isVector())
3018 return Ty.changeElementCount(
3021}
3022
3024 MachineInstr &MI) const {
3025 MachineIRBuilder &B = Helper.MIRBuilder;
3026 MachineRegisterInfo &MRI = *B.getMRI();
3027 GISelChangeObserver &Observer = Helper.Observer;
3028
3029 Register PtrReg = MI.getOperand(1).getReg();
3030 LLT PtrTy = MRI.getType(PtrReg);
3031 unsigned AddrSpace = PtrTy.getAddressSpace();
3032
3033 if (AddrSpace == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
3035 auto Cast = B.buildAddrSpaceCast(ConstPtr, PtrReg);
3036 Observer.changingInstr(MI);
3037 MI.getOperand(1).setReg(Cast.getReg(0));
3038 Observer.changedInstr(MI);
3039 return true;
3040 }
3041
3042 if (MI.getOpcode() != AMDGPU::G_LOAD)
3043 return false;
3044
3045 Register ValReg = MI.getOperand(0).getReg();
3046 LLT ValTy = MRI.getType(ValReg);
3047
3048 if (hasBufferRsrcWorkaround(ValTy)) {
3049 Observer.changingInstr(MI);
3051 Observer.changedInstr(MI);
3052 return true;
3053 }
3054
3055 MachineMemOperand *MMO = *MI.memoperands_begin();
3056 const unsigned ValSize = ValTy.getSizeInBits();
3057 const LLT MemTy = MMO->getMemoryType();
3058 const Align MemAlign = MMO->getAlign();
3059 const unsigned MemSize = MemTy.getSizeInBits();
3060 const uint64_t AlignInBits = 8 * MemAlign.value();
3061
3062 // Widen non-power-of-2 loads to the alignment if needed
3063 if (shouldWidenLoad(ST, MemTy, AlignInBits, AddrSpace, MI.getOpcode())) {
3064 const unsigned WideMemSize = PowerOf2Ceil(MemSize);
3065
3066 // This was already the correct extending load result type, so just adjust
3067 // the memory type.
3068 if (WideMemSize == ValSize) {
3069 MachineFunction &MF = B.getMF();
3070
3071 MachineMemOperand *WideMMO =
3072 MF.getMachineMemOperand(MMO, 0, WideMemSize / 8);
3073 Observer.changingInstr(MI);
3074 MI.setMemRefs(MF, {WideMMO});
3075 Observer.changedInstr(MI);
3076 return true;
3077 }
3078
3079 // Don't bother handling an edge case that should probably never be produced.
3080 if (ValSize > WideMemSize)
3081 return false;
3082
3083 LLT WideTy = widenToNextPowerOf2(ValTy);
3084
3085 Register WideLoad;
3086 if (!WideTy.isVector()) {
3087 WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
3088 B.buildTrunc(ValReg, WideLoad).getReg(0);
3089 } else {
3090 // Extract the subvector.
3091
3092 if (isRegisterType(ValTy)) {
3093 // If this is a case where G_EXTRACT is legal, use it.
3094 // (e.g. <3 x s32> -> <4 x s32>)
3095 WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
3096 B.buildExtract(ValReg, WideLoad, 0);
3097 } else {
3098 // For cases where the widened type isn't a nice register value, unmerge
3099 // from a widened register (e.g. <3 x s16> -> <4 x s16>)
3100 WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
3101 B.buildDeleteTrailingVectorElements(ValReg, WideLoad);
3102 }
3103 }
3104
3105 MI.eraseFromParent();
3106 return true;
3107 }
3108
3109 return false;
3110}
3111
3113 MachineInstr &MI) const {
3114 MachineIRBuilder &B = Helper.MIRBuilder;
3115 MachineRegisterInfo &MRI = *B.getMRI();
3116 GISelChangeObserver &Observer = Helper.Observer;
3117
3118 Register DataReg = MI.getOperand(0).getReg();
3119 LLT DataTy = MRI.getType(DataReg);
3120
3121 if (hasBufferRsrcWorkaround(DataTy)) {
3122 Observer.changingInstr(MI);
3124 Observer.changedInstr(MI);
3125 return true;
3126 }
3127 return false;
3128}
3129
3132 MachineIRBuilder &B) const {
3133 LLT Ty = MRI.getType(MI.getOperand(0).getReg());
3134 assert(Ty.isScalar());
3135
3136 MachineFunction &MF = B.getMF();
3138
3139 // TODO: Always legal with future ftz flag.
3140 // FIXME: Do we need just output?
3141 if (Ty == LLT::float32() &&
3143 return true;
3144 if (Ty == LLT::float16() &&
3146 return true;
3147
3148 MachineIRBuilder HelperBuilder(MI);
3149 GISelObserverWrapper DummyObserver;
3150 LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
3151 return Helper.lowerFMad(MI) == LegalizerHelper::Legalized;
3152}
3153
3156 Register DstReg = MI.getOperand(0).getReg();
3157 Register PtrReg = MI.getOperand(1).getReg();
3158 Register CmpVal = MI.getOperand(2).getReg();
3159 Register NewVal = MI.getOperand(3).getReg();
3160
3161 assert(AMDGPU::isFlatGlobalAddrSpace(MRI.getType(PtrReg).getAddressSpace()) &&
3162 "this should not have been custom lowered");
3163
3164 LLT ValTy = MRI.getType(CmpVal);
3165 LLT VecTy = LLT::fixed_vector(2, ValTy);
3166
3167 Register PackedVal = B.buildBuildVector(VecTy, { NewVal, CmpVal }).getReg(0);
3168
3169 B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG)
3170 .addDef(DstReg)
3171 .addUse(PtrReg)
3172 .addUse(PackedVal)
3173 .setMemRefs(MI.memoperands());
3174
3175 MI.eraseFromParent();
3176 return true;
3177}
3178
3179/// Return true if it's known that \p Src can never be an f32 denormal value.
3181 Register Src) {
3182 const MachineInstr *DefMI = MRI.getVRegDef(Src);
3183 switch (DefMI->getOpcode()) {
3184 case TargetOpcode::G_INTRINSIC: {
3185 switch (cast<GIntrinsic>(DefMI)->getIntrinsicID()) {
3186 case Intrinsic::amdgcn_frexp_mant:
3187 return true;
3188 default:
3189 break;
3190 }
3191
3192 break;
3193 }
3194 case TargetOpcode::G_FFREXP: {
3195 if (DefMI->getOperand(0).getReg() == Src)
3196 return true;
3197 break;
3198 }
3199 case TargetOpcode::G_FPEXT: {
3200 return MRI.getType(DefMI->getOperand(1).getReg()) == LLT::scalar(16);
3201 }
3202 default:
3203 return false;
3204 }
3205
3206 return false;
3207}
3208
3209static bool allowApproxFunc(const MachineFunction &MF, unsigned Flags) {
3210 if (Flags & MachineInstr::FmAfn)
3211 return true;
3212 const auto &Options = MF.getTarget().Options;
3213 return Options.UnsafeFPMath || Options.ApproxFuncFPMath;
3214}
3215
3217 unsigned Flags) {
3218 return !valueIsKnownNeverF32Denorm(MF.getRegInfo(), Src) &&
3221}
3222
3223std::pair<Register, Register>
3225 unsigned Flags) const {
3226 if (!needsDenormHandlingF32(B.getMF(), Src, Flags))
3227 return {};
3228
3229 const LLT F32 = LLT::scalar(32);
3230 auto SmallestNormal = B.buildFConstant(
3232 auto IsLtSmallestNormal =
3233 B.buildFCmp(CmpInst::FCMP_OLT, LLT::scalar(1), Src, SmallestNormal);
3234
3235 auto Scale32 = B.buildFConstant(F32, 0x1.0p+32);
3236 auto One = B.buildFConstant(F32, 1.0);
3237 auto ScaleFactor =
3238 B.buildSelect(F32, IsLtSmallestNormal, Scale32, One, Flags);
3239 auto ScaledInput = B.buildFMul(F32, Src, ScaleFactor, Flags);
3240
3241 return {ScaledInput.getReg(0), IsLtSmallestNormal.getReg(0)};
3242}
3243
3245 MachineIRBuilder &B) const {
3246 // v_log_f32 is good enough for OpenCL, except it doesn't handle denormals.
3247 // If we have to handle denormals, scale up the input and adjust the result.
3248
3249 // scaled = x * (is_denormal ? 0x1.0p+32 : 1.0)
3250 // log2 = amdgpu_log2 - (is_denormal ? 32.0 : 0.0)
3251
3252 Register Dst = MI.getOperand(0).getReg();
3253 Register Src = MI.getOperand(1).getReg();
3254 LLT Ty = B.getMRI()->getType(Dst);
3255 unsigned Flags = MI.getFlags();
3256
3257 if (Ty == LLT::scalar(16)) {
3258 const LLT F32 = LLT::scalar(32);
3259 // Nothing in half is a denormal when promoted to f32.
3260 auto Ext = B.buildFPExt(F32, Src, Flags);
3261 auto Log2 = B.buildIntrinsic(Intrinsic::amdgcn_log, {F32})
3262 .addUse(Ext.getReg(0))
3263 .setMIFlags(Flags);
3264 B.buildFPTrunc(Dst, Log2, Flags);
3265 MI.eraseFromParent();
3266 return true;
3267 }
3268
3269 assert(Ty == LLT::scalar(32));
3270
3271 auto [ScaledInput, IsLtSmallestNormal] = getScaledLogInput(B, Src, Flags);
3272 if (!ScaledInput) {
3273 B.buildIntrinsic(Intrinsic::amdgcn_log, {MI.getOperand(0)})
3274 .addUse(Src)
3275 .setMIFlags(Flags);
3276 MI.eraseFromParent();
3277 return true;
3278 }
3279
3280 auto Log2 = B.buildIntrinsic(Intrinsic::amdgcn_log, {Ty})
3281 .addUse(ScaledInput)
3282 .setMIFlags(Flags);
3283
3284 auto ThirtyTwo = B.buildFConstant(Ty, 32.0);
3285 auto Zero = B.buildFConstant(Ty, 0.0);
3286 auto ResultOffset =
3287 B.buildSelect(Ty, IsLtSmallestNormal, ThirtyTwo, Zero, Flags);
3288 B.buildFSub(Dst, Log2, ResultOffset, Flags);
3289
3290 MI.eraseFromParent();
3291 return true;
3292}
3293
3295 Register Z, unsigned Flags) {
3296 auto FMul = B.buildFMul(Ty, X, Y, Flags);
3297 return B.buildFAdd(Ty, FMul, Z, Flags).getReg(0);
3298}
3299
3301 MachineIRBuilder &B) const {
3302 const bool IsLog10 = MI.getOpcode() == TargetOpcode::G_FLOG10;
3303 assert(IsLog10 || MI.getOpcode() == TargetOpcode::G_FLOG);
3304
3305 MachineRegisterInfo &MRI = *B.getMRI();
3306 Register Dst = MI.getOperand(0).getReg();
3307 Register X = MI.getOperand(1).getReg();
3308 unsigned Flags = MI.getFlags();
3309 const LLT Ty = MRI.getType(X);
3310 MachineFunction &MF = B.getMF();
3311
3312 const LLT F32 = LLT::scalar(32);
3313 const LLT F16 = LLT::scalar(16);
3314
3315 const AMDGPUTargetMachine &TM =
3316 static_cast<const AMDGPUTargetMachine &>(MF.getTarget());
3317
3318 if (Ty == F16 || MI.getFlag(MachineInstr::FmAfn) ||
3319 TM.Options.ApproxFuncFPMath || TM.Options.UnsafeFPMath) {
3320 if (Ty == F16 && !ST.has16BitInsts()) {
3321 Register LogVal = MRI.createGenericVirtualRegister(F32);
3322 auto PromoteSrc = B.buildFPExt(F32, X);
3323 legalizeFlogUnsafe(B, LogVal, PromoteSrc.getReg(0), IsLog10, Flags);
3324 B.buildFPTrunc(Dst, LogVal);
3325 } else {
3326 legalizeFlogUnsafe(B, Dst, X, IsLog10, Flags);
3327 }
3328
3329 MI.eraseFromParent();
3330 return true;
3331 }
3332
3333 auto [ScaledInput, IsScaled] = getScaledLogInput(B, X, Flags);
3334 if (ScaledInput)
3335 X = ScaledInput;
3336
3337 auto Y =
3338 B.buildIntrinsic(Intrinsic::amdgcn_log, {Ty}).addUse(X).setMIFlags(Flags);
3339
3340 Register R;
3341 if (ST.hasFastFMAF32()) {
3342 // c+cc are ln(2)/ln(10) to more than 49 bits
3343 const float c_log10 = 0x1.344134p-2f;
3344 const float cc_log10 = 0x1.09f79ep-26f;
3345
3346 // c + cc is ln(2) to more than 49 bits
3347 const float c_log = 0x1.62e42ep-1f;
3348 const float cc_log = 0x1.efa39ep-25f;
3349
3350 auto C = B.buildFConstant(Ty, IsLog10 ? c_log10 : c_log);
3351 auto CC = B.buildFConstant(Ty, IsLog10 ? cc_log10 : cc_log);
3352
3353 R = B.buildFMul(Ty, Y, C, Flags).getReg(0);
3354 auto NegR = B.buildFNeg(Ty, R, Flags);
3355 auto FMA0 = B.buildFMA(Ty, Y, C, NegR, Flags);
3356 auto FMA1 = B.buildFMA(Ty, Y, CC, FMA0, Flags);
3357 R = B.buildFAdd(Ty, R, FMA1, Flags).getReg(0);
3358 } else {
3359 // ch+ct is ln(2)/ln(10) to more than 36 bits
3360 const float ch_log10 = 0x1.344000p-2f;
3361 const float ct_log10 = 0x1.3509f6p-18f;
3362
3363 // ch + ct is ln(2) to more than 36 bits
3364 const float ch_log = 0x1.62e000p-1f;
3365 const float ct_log = 0x1.0bfbe8p-15f;
3366
3367 auto CH = B.buildFConstant(Ty, IsLog10 ? ch_log10 : ch_log);
3368 auto CT = B.buildFConstant(Ty, IsLog10 ? ct_log10 : ct_log);
3369
3370 auto MaskConst = B.buildConstant(Ty, 0xfffff000);
3371 auto YH = B.buildAnd(Ty, Y, MaskConst);
3372 auto YT = B.buildFSub(Ty, Y, YH, Flags);
3373 auto YTCT = B.buildFMul(Ty, YT, CT, Flags);
3374
3375 Register Mad0 =
3376 getMad(B, Ty, YH.getReg(0), CT.getReg(0), YTCT.getReg(0), Flags);
3377 Register Mad1 = getMad(B, Ty, YT.getReg(0), CH.getReg(0), Mad0, Flags);
3378 R = getMad(B, Ty, YH.getReg(0), CH.getReg(0), Mad1, Flags);
3379 }
3380
3381 const bool IsFiniteOnly =
3382 (MI.getFlag(MachineInstr::FmNoNans) || TM.Options.NoNaNsFPMath) &&
3383 (MI.getFlag(MachineInstr::FmNoInfs) || TM.Options.NoInfsFPMath);
3384
3385 if (!IsFiniteOnly) {
3386 // Expand isfinite(x) => fabs(x) < inf
3387 auto Inf = B.buildFConstant(Ty, APFloat::getInf(APFloat::IEEEsingle()));
3388 auto Fabs = B.buildFAbs(Ty, Y);
3389 auto IsFinite =
3390 B.buildFCmp(CmpInst::FCMP_OLT, LLT::scalar(1), Fabs, Inf, Flags);
3391 R = B.buildSelect(Ty, IsFinite, R, Y, Flags).getReg(0);
3392 }
3393
3394 if (ScaledInput) {
3395 auto Zero = B.buildFConstant(Ty, 0.0);
3396 auto ShiftK =
3397 B.buildFConstant(Ty, IsLog10 ? 0x1.344136p+3f : 0x1.62e430p+4f);
3398 auto Shift = B.buildSelect(Ty, IsScaled, ShiftK, Zero, Flags);
3399 B.buildFSub(Dst, R, Shift, Flags);
3400 } else {
3401 B.buildCopy(Dst, R);
3402 }
3403
3404 MI.eraseFromParent();
3405 return true;
3406}
3407
3409 Register Src, bool IsLog10,
3410 unsigned Flags) const {
3411 const double Log2BaseInverted =
3413
3414 LLT Ty = B.getMRI()->getType(Dst);
3415
3416 if (Ty == LLT::scalar(32)) {
3417 auto [ScaledInput, IsScaled] = getScaledLogInput(B, Src, Flags);
3418 if (ScaledInput) {
3419 auto LogSrc = B.buildIntrinsic(Intrinsic::amdgcn_log, {Ty})
3420 .addUse(Src)
3421 .setMIFlags(Flags);
3422 auto ScaledResultOffset = B.buildFConstant(Ty, -32.0 * Log2BaseInverted);
3423 auto Zero = B.buildFConstant(Ty, 0.0);
3424 auto ResultOffset =
3425 B.buildSelect(Ty, IsScaled, ScaledResultOffset, Zero, Flags);
3426 auto Log2Inv = B.buildFConstant(Ty, Log2BaseInverted);
3427
3428 if (ST.hasFastFMAF32())
3429 B.buildFMA(Dst, LogSrc, Log2Inv, ResultOffset, Flags);
3430 else {
3431 auto Mul = B.buildFMul(Ty, LogSrc, Log2Inv, Flags);
3432 B.buildFAdd(Dst, Mul, ResultOffset, Flags);
3433 }
3434
3435 return true;
3436 }
3437 }
3438
3439 auto Log2Operand = Ty == LLT::scalar(16)
3440 ? B.buildFLog2(Ty, Src, Flags)
3441 : B.buildIntrinsic(Intrinsic::amdgcn_log, {Ty})
3442 .addUse(Src)
3443 .setMIFlags(Flags);
3444 auto Log2BaseInvertedOperand = B.buildFConstant(Ty, Log2BaseInverted);
3445 B.buildFMul(Dst, Log2Operand, Log2BaseInvertedOperand, Flags);
3446 return true;
3447}
3448
3450 MachineIRBuilder &B) const {
3451 // v_exp_f32 is good enough for OpenCL, except it doesn't handle denormals.
3452 // If we have to handle denormals, scale up the input and adjust the result.
3453
3454 Register Dst = MI.getOperand(0).getReg();
3455 Register Src = MI.getOperand(1).getReg();
3456 unsigned Flags = MI.getFlags();
3457 LLT Ty = B.getMRI()->getType(Dst);
3458 const LLT F16 = LLT::scalar(16);
3459 const LLT F32 = LLT::scalar(32);
3460
3461 if (Ty == F16) {
3462 // Nothing in half is a denormal when promoted to f32.
3463 auto Ext = B.buildFPExt(F32, Src, Flags);
3464 auto Log2 = B.buildIntrinsic(Intrinsic::amdgcn_exp2, {F32})
3465 .addUse(Ext.getReg(0))
3466 .setMIFlags(Flags);
3467 B.buildFPTrunc(Dst, Log2, Flags);
3468 MI.eraseFromParent();
3469 return true;
3470 }
3471
3472 assert(Ty == F32);
3473
3474 if (!needsDenormHandlingF32(B.getMF(), Src, Flags)) {
3475 B.buildIntrinsic(Intrinsic::amdgcn_exp2, ArrayRef<Register>{Dst})
3476 .addUse(Src)
3477 .setMIFlags(Flags);
3478 MI.eraseFromParent();
3479 return true;
3480 }
3481
3482 // bool needs_scaling = x < -0x1.f80000p+6f;
3483 // v_exp_f32(x + (s ? 0x1.0p+6f : 0.0f)) * (s ? 0x1.0p-64f : 1.0f);
3484
3485 // -nextafter(128.0, -1)
3486 auto RangeCheckConst = B.buildFConstant(Ty, -0x1.f80000p+6f);
3487 auto NeedsScaling = B.buildFCmp(CmpInst::FCMP_OLT, LLT::scalar(1), Src,
3488 RangeCheckConst, Flags);
3489
3490 auto SixtyFour = B.buildFConstant(Ty, 0x1.0p+6f);
3491 auto Zero = B.buildFConstant(Ty, 0.0);
3492 auto AddOffset = B.buildSelect(F32, NeedsScaling, SixtyFour, Zero, Flags);
3493 auto AddInput = B.buildFAdd(F32, Src, AddOffset, Flags);
3494
3495 auto Exp2 = B.buildIntrinsic(Intrinsic::amdgcn_exp2, {Ty})
3496 .addUse(AddInput.getReg(0))
3497 .setMIFlags(Flags);
3498
3499 auto TwoExpNeg64 = B.buildFConstant(Ty, 0x1.0p-64f);
3500 auto One = B.buildFConstant(Ty, 1.0);
3501 auto ResultScale = B.buildSelect(F32, NeedsScaling, TwoExpNeg64, One, Flags);
3502 B.buildFMul(Dst, Exp2, ResultScale, Flags);
3503 MI.eraseFromParent();
3504 return true;
3505}
3506
3507bool AMDGPULegalizerInfo::legalizeFExpUnsafe(MachineIRBuilder &B, Register Dst,
3508 Register X, unsigned Flags) const {
3509 LLT Ty = B.getMRI()->getType(Dst);
3510 LLT F32 = LLT::scalar(32);
3511
3512 if (Ty != F32 || !needsDenormHandlingF32(B.getMF(), X, Flags)) {
3513 auto Log2E = B.buildFConstant(Ty, numbers::log2e);
3514 auto Mul = B.buildFMul(Ty, X, Log2E, Flags);
3515
3516 if (Ty == F32) {
3517 B.buildIntrinsic(Intrinsic::amdgcn_exp2, ArrayRef<Register>{Dst})
3518 .addUse(Mul.getReg(0))
3519 .setMIFlags(Flags);
3520 } else {
3521 B.buildFExp2(Dst, Mul.getReg(0), Flags);
3522 }
3523
3524 return true;
3525 }
3526
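  // Scaled path, for reference: exp(x) == exp(x + 64) * e^-64. The threshold
  // below is roughly ln(2^-126) ~= -87.34, where the unscaled result would go
  // denormal, and the final scale factor 0x1.969d48p-93f ~= e^-64.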
3527 auto Threshold = B.buildFConstant(Ty, -0x1.5d58a0p+6f);
3528 auto NeedsScaling =
3529 B.buildFCmp(CmpInst::FCMP_OLT, LLT::scalar(1), X, Threshold, Flags);
3530 auto ScaleOffset = B.buildFConstant(Ty, 0x1.0p+6f);
3531 auto ScaledX = B.buildFAdd(Ty, X, ScaleOffset, Flags);
3532 auto AdjustedX = B.buildSelect(Ty, NeedsScaling, ScaledX, X, Flags);
3533
3534 auto Log2E = B.buildFConstant(Ty, numbers::log2e);
3535 auto ExpInput = B.buildFMul(Ty, AdjustedX, Log2E, Flags);
3536
3537 auto Exp2 = B.buildIntrinsic(Intrinsic::amdgcn_exp2, {Ty})
3538 .addUse(ExpInput.getReg(0))
3539 .setMIFlags(Flags);
3540
3541 auto ResultScaleFactor = B.buildFConstant(Ty, 0x1.969d48p-93f);
3542 auto AdjustedResult = B.buildFMul(Ty, Exp2, ResultScaleFactor, Flags);
3543 B.buildSelect(Dst, NeedsScaling, AdjustedResult, Exp2, Flags);
3544 return true;
3545}
3546
3547bool AMDGPULegalizerInfo::legalizeFExp(MachineInstr &MI,
3548 MachineIRBuilder &B) const {
3549 Register Dst = MI.getOperand(0).getReg();
3550 Register X = MI.getOperand(1).getReg();
3551 const unsigned Flags = MI.getFlags();
3552 MachineFunction &MF = B.getMF();
3553 MachineRegisterInfo &MRI = *B.getMRI();
3554 LLT Ty = MRI.getType(Dst);
3555 const LLT F16 = LLT::scalar(16);
3556 const LLT F32 = LLT::scalar(32);
3557 const bool IsExp10 = MI.getOpcode() == TargetOpcode::G_FEXP10;
3558
3559 if (Ty == F16) {
3560 // v_exp_f16 (fmul x, log2e)
3561 if (allowApproxFunc(MF, Flags)) {
3562 // TODO: Does this really require fast?
3563 legalizeFExpUnsafe(B, Dst, X, Flags);
3564 MI.eraseFromParent();
3565 return true;
3566 }
3567
3568 // exp(f16 x) ->
3569 // fptrunc (v_exp_f32 (fmul (fpext x), log2e))
3570
3571 // Nothing in half is a denormal when promoted to f32.
3572 auto Ext = B.buildFPExt(F32, X, Flags);
3573 Register Lowered = MRI.createGenericVirtualRegister(F32);
3574 legalizeFExpUnsafe(B, Lowered, Ext.getReg(0), Flags);
3575 B.buildFPTrunc(Dst, Lowered, Flags);
3576 MI.eraseFromParent();
3577 return true;
3578 }
3579
3580 assert(Ty == F32);
3581
3582 // TODO: Interpret allowApproxFunc as ignoring DAZ. This is currently copying
3583 // library behavior. Also, is known-not-daz source sufficient?
3584 if (allowApproxFunc(MF, Flags)) {
3585 legalizeFExpUnsafe(B, Dst, X, Flags);
3586 MI.eraseFromParent();
3587 return true;
3588 }
3589
3590 // Algorithm:
3591 //
3592 // e^x = 2^(x/ln(2)) = 2^(x*(64/ln(2))/64)
3593 //
3594 // x*(64/ln(2)) = n + f, |f| <= 0.5, n is integer
3595 // n = 64*m + j, 0 <= j < 64
3596 //
3597 // e^x = 2^((64*m + j + f)/64)
3598 // = (2^m) * (2^(j/64)) * 2^(f/64)
3599 // = (2^m) * (2^(j/64)) * e^(f*(ln(2)/64))
3600 //
3601 // f = x*(64/ln(2)) - n
3602 // r = f*(ln(2)/64) = x - n*(ln(2)/64)
3603 //
3604 // e^x = (2^m) * (2^(j/64)) * e^r
3605 //
3606 // (2^(j/64)) is precomputed
3607 //
3608 // e^r = 1 + r + (r^2)/2! + (r^3)/3! + (r^4)/4! + (r^5)/5!
3609 // e^r = 1 + q
3610 //
3611 // q = r + (r^2)/2! + (r^3)/3! + (r^4)/4! + (r^5)/5!
3612 //
3613 // e^x = (2^m) * ( (2^(j/64)) + q*(2^(j/64)) )
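  //
  // For reference, the same reduction in plain host code (illustrative sketch
  // only, not part of the lowering; assumes <cmath> with M_LN2, and x >= 0 so
  // that n >= 0 for brevity):
  //
  //   double exp_ref(double x) {
  //     double n = std::round(x * (64.0 / M_LN2)); // n = 64*m + j
  //     double r = x - n * (M_LN2 / 64.0);         // |r| <= ln(2)/128
  //     int m = (int)n / 64, j = (int)n % 64;
  //     return std::ldexp(std::exp2(j / 64.0) * std::exp(r), m);
  //   }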
3614 const unsigned FlagsNoContract = Flags & ~MachineInstr::FmContract;
3615 Register PH, PL;
3616
3617 if (ST.hasFastFMAF32()) {
3618 const float c_exp = numbers::log2ef;
3619 const float cc_exp = 0x1.4ae0bep-26f; // c+cc are 49 bits
3620 const float c_exp10 = 0x1.a934f0p+1f;
3621 const float cc_exp10 = 0x1.2f346ep-24f;
3622
3623 auto C = B.buildFConstant(Ty, IsExp10 ? c_exp10 : c_exp);
3624 PH = B.buildFMul(Ty, X, C, Flags).getReg(0);
3625 auto NegPH = B.buildFNeg(Ty, PH, Flags);
3626 auto FMA0 = B.buildFMA(Ty, X, C, NegPH, Flags);
3627
3628 auto CC = B.buildFConstant(Ty, IsExp10 ? cc_exp10 : cc_exp);
3629 PL = B.buildFMA(Ty, X, CC, FMA0, Flags).getReg(0);
3630 } else {
3631 const float ch_exp = 0x1.714000p+0f;
3632 const float cl_exp = 0x1.47652ap-12f; // ch + cl are 36 bits
3633
3634 const float ch_exp10 = 0x1.a92000p+1f;
3635 const float cl_exp10 = 0x1.4f0978p-11f;
3636
3637 auto MaskConst = B.buildConstant(Ty, 0xfffff000);
3638 auto XH = B.buildAnd(Ty, X, MaskConst);
3639 auto XL = B.buildFSub(Ty, X, XH, Flags);
3640
3641 auto CH = B.buildFConstant(Ty, IsExp10 ? ch_exp10 : ch_exp);
3642 PH = B.buildFMul(Ty, XH, CH, Flags).getReg(0);
3643
3644 auto CL = B.buildFConstant(Ty, IsExp10 ? cl_exp10 : cl_exp);
3645 auto XLCL = B.buildFMul(Ty, XL, CL, Flags);
3646
3647 Register Mad0 =
3648 getMad(B, Ty, XL.getReg(0), CH.getReg(0), XLCL.getReg(0), Flags);
3649 PL = getMad(B, Ty, XH.getReg(0), CL.getReg(0), Mad0, Flags);
3650 }
3651
3652 auto E = B.buildIntrinsicRoundeven(Ty, PH, Flags);
3653
3654 // It is unsafe to contract this fsub into the PH multiply.
3655 auto PHSubE = B.buildFSub(Ty, PH, E, FlagsNoContract);
3656 auto A = B.buildFAdd(Ty, PHSubE, PL, Flags);
3657 auto IntE = B.buildFPTOSI(LLT::scalar(32), E);
3658
3659 auto Exp2 = B.buildIntrinsic(Intrinsic::amdgcn_exp2, {Ty})
3660 .addUse(A.getReg(0))
3661 .setMIFlags(Flags);
3662 auto R = B.buildFLdexp(Ty, Exp2, IntE, Flags);
3663
3664 auto UnderflowCheckConst =
3665 B.buildFConstant(Ty, IsExp10 ? -0x1.66d3e8p+5f : -0x1.9d1da0p+6f);
3666 auto Zero = B.buildFConstant(Ty, 0.0);
3667 auto Underflow =
3668 B.buildFCmp(CmpInst::FCMP_OLT, LLT::scalar(1), X, UnderflowCheckConst);
3669
3670 R = B.buildSelect(Ty, Underflow, Zero, R);
3671
3672 const auto &Options = MF.getTarget().Options;
3673
3674 if (!(Flags & MachineInstr::FmNoInfs) && !Options.NoInfsFPMath) {
3675 auto OverflowCheckConst =
3676 B.buildFConstant(Ty, IsExp10 ? 0x1.344136p+5f : 0x1.62e430p+6f);
3677
3678 auto Overflow =
3679 B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), X, OverflowCheckConst);
3680 auto Inf = B.buildFConstant(Ty, APFloat::getInf(APFloat::IEEEsingle()));
3681 R = B.buildSelect(Ty, Overflow, Inf, R, Flags);
3682 }
3683
3684 B.buildCopy(Dst, R);
3685 MI.eraseFromParent();
3686 return true;
3687}
3688
3689bool AMDGPULegalizerInfo::legalizeFPow(MachineInstr &MI,
3690 MachineIRBuilder &B) const {
3691 Register Dst = MI.getOperand(0).getReg();
3692 Register Src0 = MI.getOperand(1).getReg();
3693 Register Src1 = MI.getOperand(2).getReg();
3694 unsigned Flags = MI.getFlags();
3695 LLT Ty = B.getMRI()->getType(Dst);
3696 const LLT F16 = LLT::float16();
3697 const LLT F32 = LLT::float32();
3698
3699 if (Ty == F32) {
3700 auto Log = B.buildFLog2(F32, Src0, Flags);
3701 auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {F32})
3702 .addUse(Log.getReg(0))
3703 .addUse(Src1)
3704 .setMIFlags(Flags);
3705 B.buildFExp2(Dst, Mul, Flags);
3706 } else if (Ty == F16) {
3707 // There's no f16 fmul_legacy, so we extend to f32 for it and truncate back.
3708 auto Log = B.buildFLog2(F16, Src0, Flags);
3709 auto Ext0 = B.buildFPExt(F32, Log, Flags);
3710 auto Ext1 = B.buildFPExt(F32, Src1, Flags);
3711 auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {F32})
3712 .addUse(Ext0.getReg(0))
3713 .addUse(Ext1.getReg(0))
3714 .setMIFlags(Flags);
3715 B.buildFExp2(Dst, B.buildFPTrunc(F16, Mul), Flags);
3716 } else
3717 return false;
3718
3719 MI.eraseFromParent();
3720 return true;
3721}
3722
3723// Find a source register, ignoring any possible source modifiers.
3724static Register stripAnySourceMods(Register OrigSrc, MachineRegisterInfo &MRI) {
3725 Register ModSrc = OrigSrc;
3726 if (MachineInstr *SrcFNeg = getOpcodeDef(AMDGPU::G_FNEG, ModSrc, MRI)) {
3727 ModSrc = SrcFNeg->getOperand(1).getReg();
3728 if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
3729 ModSrc = SrcFAbs->getOperand(1).getReg();
3730 } else if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
3731 ModSrc = SrcFAbs->getOperand(1).getReg();
3732 return ModSrc;
3733}
3734
3735bool AMDGPULegalizerInfo::legalizeFFloor(MachineInstr &MI,
3736 MachineRegisterInfo &MRI,
3737 MachineIRBuilder &B) const {
3738
3739 const LLT S1 = LLT::scalar(1);
3740 const LLT F64 = LLT::float64();
3741 Register Dst = MI.getOperand(0).getReg();
3742 Register OrigSrc = MI.getOperand(1).getReg();
3743 unsigned Flags = MI.getFlags();
3744 assert(ST.hasFractBug() && MRI.getType(Dst) == F64 &&
3745 "this should not have been custom lowered");
3746
3747 // V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x))
3748 // is used instead. However, SI doesn't have V_FLOOR_F64, so the most
3749 // efficient way to implement it is using V_FRACT_F64. The workaround for the
3750 // V_FRACT bug is:
3751 // fract(x) = isnan(x) ? x : min(V_FRACT(x), 0.99999999999999999)
3752 //
3753 // Convert floor(x) to (x - fract(x))
3754
3755 auto Fract = B.buildIntrinsic(Intrinsic::amdgcn_fract, {F64})
3756 .addUse(OrigSrc)
3757 .setMIFlags(Flags);
3758
3759 // Give source modifier matching some assistance before obscuring a foldable
3760 // pattern.
3761
3762 // TODO: We can avoid the neg on the fract? The input sign to fract
3763 // shouldn't matter?
3764 Register ModSrc = stripAnySourceMods(OrigSrc, MRI);
3765
3766 auto Const =
3767 B.buildFConstant(F64, llvm::bit_cast<double>(0x3fefffffffffffff));
3768
3769 Register Min = MRI.createGenericVirtualRegister(F64);
3770
3771 // We don't need to concern ourselves with the snan handling difference, so
3772 // use the one which will directly select.
3773 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
3774 if (MFI->getMode().IEEE)
3775 B.buildFMinNumIEEE(Min, Fract, Const, Flags);
3776 else
3777 B.buildFMinNum(Min, Fract, Const, Flags);
3778
3779 Register CorrectedFract = Min;
3780 if (!MI.getFlag(MachineInstr::FmNoNans)) {
3781 auto IsNan = B.buildFCmp(CmpInst::FCMP_ORD, S1, ModSrc, ModSrc, Flags);
3782 CorrectedFract = B.buildSelect(F64, IsNan, ModSrc, Min, Flags).getReg(0);
3783 }
3784
3785 auto NegFract = B.buildFNeg(F64, CorrectedFract, Flags);
3786 B.buildFAdd(Dst, OrigSrc, NegFract, Flags);
3787
3788 MI.eraseFromParent();
3789 return true;
3790}
3791
3792// Turn an illegal packed v2s16 build vector into bit operations.
3793// TODO: This should probably be a bitcast action in LegalizerHelper.
3794bool AMDGPULegalizerInfo::legalizeBuildVector(
3795 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
3796 Register Dst = MI.getOperand(0).getReg();
3797 const LLT S32 = LLT::scalar(32);
3798 const LLT S16 = LLT::scalar(16);
3799 assert(MRI.getType(Dst) == LLT::fixed_vector(2, 16));
3800
3801 Register Src0 = MI.getOperand(1).getReg();
3802 Register Src1 = MI.getOperand(2).getReg();
3803
3804 if (MI.getOpcode() == AMDGPU::G_BUILD_VECTOR_TRUNC) {
3805 assert(MRI.getType(Src0) == S32);
3806 Src0 = B.buildTrunc(S16, MI.getOperand(1).getReg()).getReg(0);
3807 Src1 = B.buildTrunc(S16, MI.getOperand(2).getReg()).getReg(0);
3808 }
3809
3810 auto Merge = B.buildMergeLikeInstr(S32, {Src0, Src1});
3811 B.buildBitcast(Dst, Merge);
3812
3813 MI.eraseFromParent();
3814 return true;
3815}
3816
3817// Build a big integer multiply or multiply-add using MAD_64_32 instructions.
3818//
3819// Source and accumulation registers must all be 32-bits.
3820//
3821// TODO: When the multiply is uniform, we should produce a code sequence
3822// that is better suited to instruction selection on the SALU. Instead of
3823// the outer loop going over parts of the result, the outer loop should go
3824// over parts of one of the factors. This should result in instruction
3825// selection that makes full use of S_ADDC_U32 instructions.
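//
// For reference, the accumulation pattern built below is ordinary schoolbook
// multiplication over 32-bit limbs (illustrative host-side sketch only; the
// names are not from this file and it assumes <cstdint>):
//
//   void mulLimbs(uint32_t *Acc, const uint32_t *A, const uint32_t *B,
//                 unsigned N) { // Acc has N zero-initialized limbs
//     for (unsigned k = 0; k < N; ++k)
//       for (unsigned j = 0; j <= k; ++j) {
//         uint64_t P = (uint64_t)A[j] * B[k - j];
//         for (unsigned d = k; d < N && P; ++d) { // add and propagate carry
//           P += Acc[d];
//           Acc[d] = (uint32_t)P;
//           P >>= 32;
//         }
//       }
//   }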
3826void AMDGPULegalizerInfo::buildMultiply(LegalizerHelper &Helper,
3827 MutableArrayRef<Register> Accum,
3828 ArrayRef<Register> Src0,
3829 ArrayRef<Register> Src1,
3830 bool UsePartialMad64_32,
3831 bool SeparateOddAlignedProducts) const {
3832 // Use (possibly empty) vectors of S1 registers to represent the set of
3833 // carries from one pair of positions to the next.
3834 using Carry = SmallVector<Register, 2>;
3835
3836 MachineIRBuilder &B = Helper.MIRBuilder;
3837 GISelKnownBits &KB = *Helper.getKnownBits();
3838
3839 const LLT S1 = LLT::scalar(1);
3840 const LLT S32 = LLT::scalar(32);
3841 const LLT S64 = LLT::scalar(64);
3842
3843 Register Zero32;
3844 Register Zero64;
3845
3846 auto getZero32 = [&]() -> Register {
3847 if (!Zero32)
3848 Zero32 = B.buildConstant(S32, 0).getReg(0);
3849 return Zero32;
3850 };
3851 auto getZero64 = [&]() -> Register {
3852 if (!Zero64)
3853 Zero64 = B.buildConstant(S64, 0).getReg(0);
3854 return Zero64;
3855 };
3856
3857 SmallVector<bool, 2> Src0KnownZeros, Src1KnownZeros;
3858 for (unsigned i = 0; i < Src0.size(); ++i) {
3859 Src0KnownZeros.push_back(KB.getKnownBits(Src0[i]).isZero());
3860 Src1KnownZeros.push_back(KB.getKnownBits(Src1[i]).isZero());
3861 }
3862
3863 // Merge the given carries into the 32-bit LocalAccum, which is modified
3864 // in-place.
3865 //
3866 // Returns the carry-out, which is a single S1 register or null.
3867 auto mergeCarry =
3868 [&](Register &LocalAccum, const Carry &CarryIn) -> Register {
3869 if (CarryIn.empty())
3870 return Register();
3871
3872 bool HaveCarryOut = true;
3873 Register CarryAccum;
3874 if (CarryIn.size() == 1) {
3875 if (!LocalAccum) {
3876 LocalAccum = B.buildZExt(S32, CarryIn[0]).getReg(0);
3877 return Register();
3878 }
3879
3880 CarryAccum = getZero32();
3881 } else {
3882 CarryAccum = B.buildZExt(S32, CarryIn[0]).getReg(0);
3883 for (unsigned i = 1; i + 1 < CarryIn.size(); ++i) {
3884 CarryAccum =
3885 B.buildUAdde(S32, S1, CarryAccum, getZero32(), CarryIn[i])
3886 .getReg(0);
3887 }
3888
3889 if (!LocalAccum) {
3890 LocalAccum = getZero32();
3891 HaveCarryOut = false;
3892 }
3893 }
3894
3895 auto Add =
3896 B.buildUAdde(S32, S1, CarryAccum, LocalAccum, CarryIn.back());
3897 LocalAccum = Add.getReg(0);
3898 return HaveCarryOut ? Add.getReg(1) : Register();
3899 };
3900
3901 // Build a multiply-add chain to compute
3902 //
3903 // LocalAccum + (partial products at DstIndex)
3904 // + (opportunistic subset of CarryIn)
3905 //
3906 // LocalAccum is an array of one or two 32-bit registers that are updated
3907 // in-place. The incoming registers may be null.
3908 //
3909 // In some edge cases, carry-ins can be consumed "for free". In that case,
3910 // the consumed carry bits are removed from CarryIn in-place.
3911 auto buildMadChain =
3912 [&](MutableArrayRef<Register> LocalAccum, unsigned DstIndex, Carry &CarryIn)
3913 -> Carry {
3914 assert((DstIndex + 1 < Accum.size() && LocalAccum.size() == 2) ||
3915 (DstIndex + 1 >= Accum.size() && LocalAccum.size() == 1));
3916
3917 Carry CarryOut;
3918 unsigned j0 = 0;
3919
3920 // Use plain 32-bit multiplication for the most significant part of the
3921 // result by default.
3922 if (LocalAccum.size() == 1 &&
3923 (!UsePartialMad64_32 || !CarryIn.empty())) {
3924 do {
3925 // Skip multiplication if one of the operands is 0
3926 unsigned j1 = DstIndex - j0;
3927 if (Src0KnownZeros[j0] || Src1KnownZeros[j1]) {
3928 ++j0;
3929 continue;
3930 }
3931 auto Mul = B.buildMul(S32, Src0[j0], Src1[j1]);
3932 if (!LocalAccum[0] || KB.getKnownBits(LocalAccum[0]).isZero()) {
3933 LocalAccum[0] = Mul.getReg(0);
3934 } else {
3935 if (CarryIn.empty()) {
3936 LocalAccum[0] = B.buildAdd(S32, LocalAccum[0], Mul).getReg(0);
3937 } else {
3938 LocalAccum[0] =
3939 B.buildUAdde(S32, S1, LocalAccum[0], Mul, CarryIn.back())
3940 .getReg(0);
3941 CarryIn.pop_back();
3942 }
3943 }
3944 ++j0;
3945 } while (j0 <= DstIndex && (!UsePartialMad64_32 || !CarryIn.empty()));
3946 }
3947
3948 // Build full 64-bit multiplies.
3949 if (j0 <= DstIndex) {
3950 bool HaveSmallAccum = false;
3951 Register Tmp;
3952
3953 if (LocalAccum[0]) {
3954 if (LocalAccum.size() == 1) {
3955 Tmp = B.buildAnyExt(S64, LocalAccum[0]).getReg(0);
3956 HaveSmallAccum = true;
3957 } else if (LocalAccum[1]) {
3958 Tmp = B.buildMergeLikeInstr(S64, LocalAccum).getReg(0);
3959 HaveSmallAccum = false;
3960 } else {
3961 Tmp = B.buildZExt(S64, LocalAccum[0]).getReg(0);
3962 HaveSmallAccum = true;
3963 }
3964 } else {
3965 assert(LocalAccum.size() == 1 || !LocalAccum[1]);
3966 Tmp = getZero64();
3967 HaveSmallAccum = true;
3968 }
3969
3970 do {
3971 unsigned j1 = DstIndex - j0;
3972 if (Src0KnownZeros[j0] || Src1KnownZeros[j1]) {
3973 ++j0;
3974 continue;
3975 }
3976 auto Mad = B.buildInstr(AMDGPU::G_AMDGPU_MAD_U64_U32, {S64, S1},
3977 {Src0[j0], Src1[j1], Tmp});
3978 Tmp = Mad.getReg(0);
3979 if (!HaveSmallAccum)
3980 CarryOut.push_back(Mad.getReg(1));
3981 HaveSmallAccum = false;
3982
3983 ++j0;
3984 } while (j0 <= DstIndex);
3985
3986 auto Unmerge = B.buildUnmerge(S32, Tmp);
3987 LocalAccum[0] = Unmerge.getReg(0);
3988 if (LocalAccum.size() > 1)
3989 LocalAccum[1] = Unmerge.getReg(1);
3990 }
3991
3992 return CarryOut;
3993 };
3994
3995 // Outer multiply loop, iterating over destination parts from least
3996 // significant to most significant parts.
3997 //
3998 // The columns of the following diagram correspond to the destination parts
3999 // affected by one iteration of the outer loop (ignoring boundary
4000 // conditions).
4001 //
4002 // Dest index relative to 2 * i: 1 0 -1
4003 // ------
4004 // Carries from previous iteration: e o
4005 // Even-aligned partial product sum: E E .
4006 // Odd-aligned partial product sum: O O
4007 //
4008 // 'o' is OddCarry, 'e' is EvenCarry.
4009 // EE and OO are computed from partial products via buildMadChain and use
4010 // accumulation where possible and appropriate.
4011 //
4012 Register SeparateOddCarry;
4013 Carry EvenCarry;
4014 Carry OddCarry;
4015
4016 for (unsigned i = 0; i <= Accum.size() / 2; ++i) {
4017 Carry OddCarryIn = std::move(OddCarry);
4018 Carry EvenCarryIn = std::move(EvenCarry);
4019 OddCarry.clear();
4020 EvenCarry.clear();
4021
4022 // Partial products at offset 2 * i.
4023 if (2 * i < Accum.size()) {
4024 auto LocalAccum = Accum.drop_front(2 * i).take_front(2);
4025 EvenCarry = buildMadChain(LocalAccum, 2 * i, EvenCarryIn);
4026 }
4027
4028 // Partial products at offset 2 * i - 1.
4029 if (i > 0) {
4030 if (!SeparateOddAlignedProducts) {
4031 auto LocalAccum = Accum.drop_front(2 * i - 1).take_front(2);
4032 OddCarry = buildMadChain(LocalAccum, 2 * i - 1, OddCarryIn);
4033 } else {
4034 bool IsHighest = 2 * i >= Accum.size();
4035 Register SeparateOddOut[2];
4036 auto LocalAccum = MutableArrayRef(SeparateOddOut)
4037 .take_front(IsHighest ? 1 : 2);
4038 OddCarry = buildMadChain(LocalAccum, 2 * i - 1, OddCarryIn);
4039
4040 MachineInstr *Lo;
4041
4042 if (i == 1) {
4043 if (!IsHighest)
4044 Lo = B.buildUAddo(S32, S1, Accum[2 * i - 1], SeparateOddOut[0]);
4045 else
4046 Lo = B.buildAdd(S32, Accum[2 * i - 1], SeparateOddOut[0]);
4047 } else {
4048 Lo = B.buildUAdde(S32, S1, Accum[2 * i - 1], SeparateOddOut[0],
4049 SeparateOddCarry);
4050 }
4051 Accum[2 * i - 1] = Lo->getOperand(0).getReg();
4052
4053 if (!IsHighest) {
4054 auto Hi = B.buildUAdde(S32, S1, Accum[2 * i], SeparateOddOut[1],
4055 Lo->getOperand(1).getReg());
4056 Accum[2 * i] = Hi.getReg(0);
4057 SeparateOddCarry = Hi.getReg(1);
4058 }
4059 }
4060 }
4061
4062 // Add in the carries from the previous iteration
4063 if (i > 0) {
4064 if (Register CarryOut = mergeCarry(Accum[2 * i - 1], OddCarryIn))
4065 EvenCarryIn.push_back(CarryOut);
4066
4067 if (2 * i < Accum.size()) {
4068 if (Register CarryOut = mergeCarry(Accum[2 * i], EvenCarryIn))
4069 OddCarry.push_back(CarryOut);
4070 }
4071 }
4072 }
4073}
4074
4075// Custom narrowing of wide multiplies using wide multiply-add instructions.
4076//
4077// TODO: If the multiply is followed by an addition, we should attempt to
4078// integrate it to make better use of V_MAD_U64_U32's multiply-add capabilities.
4079bool AMDGPULegalizerInfo::legalizeMul(LegalizerHelper &Helper,
4080 MachineInstr &MI) const {
4081 assert(ST.hasMad64_32());
4082 assert(MI.getOpcode() == TargetOpcode::G_MUL);
4083
4084 MachineIRBuilder &B = Helper.MIRBuilder;
4085 MachineRegisterInfo &MRI = *B.getMRI();
4086
4087 Register DstReg = MI.getOperand(0).getReg();
4088 Register Src0 = MI.getOperand(1).getReg();
4089 Register Src1 = MI.getOperand(2).getReg();
4090
4091 LLT Ty = MRI.getType(DstReg);
4092 assert(Ty.isScalar());
4093
4094 unsigned Size = Ty.getSizeInBits();
4095 unsigned NumParts = Size / 32;
4096 assert((Size % 32) == 0);
4097 assert(NumParts >= 2);
4098
4099 // Whether to use MAD_64_32 for partial products whose high half is
4100 // discarded. This avoids some ADD instructions but risks false dependency
4101 // stalls on some subtargets in some cases.
4102 const bool UsePartialMad64_32 = ST.getGeneration() < AMDGPUSubtarget::GFX10;
4103
4104 // Whether to compute odd-aligned partial products separately. This is
4105 // advisable on subtargets where the accumulator of MAD_64_32 must be placed
4106 // in an even-aligned VGPR.
4107 const bool SeparateOddAlignedProducts = ST.hasFullRate64Ops();
4108
4109 LLT S32 = LLT::scalar(32);
4110 SmallVector<Register, 2> Src0Parts, Src1Parts;
4111 for (unsigned i = 0; i < NumParts; ++i) {
4112 Src0Parts.push_back(MRI.createGenericVirtualRegister(S32));
4113 Src1Parts.push_back(MRI.createGenericVirtualRegister(S32));
4114 }
4115 B.buildUnmerge(Src0Parts, Src0);
4116 B.buildUnmerge(Src1Parts, Src1);
4117
4118 SmallVector<Register, 2> AccumRegs(NumParts);
4119 buildMultiply(Helper, AccumRegs, Src0Parts, Src1Parts, UsePartialMad64_32,
4120 SeparateOddAlignedProducts);
4121
4122 B.buildMergeLikeInstr(DstReg, AccumRegs);
4123 MI.eraseFromParent();
4124 return true;
4125}
4126
4127// Legalize ctlz/cttz to ffbh/ffbl instead of the default legalization to
4128// ctlz/cttz_zero_undef. This allows us to fix up the result for the zero input
4129// case with a single min instruction instead of a compare+select.
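// For example, ffbh_u32(0) returns all ones, so the unsigned min against the
// source bit width (32) produces the defined ctlz(0) == 32, while any nonzero
// input is left unchanged by the min.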
4130bool AMDGPULegalizerInfo::legalizeCTLZ_CTTZ(MachineInstr &MI,
4131 MachineRegisterInfo &MRI,
4132 MachineIRBuilder &B) const {
4133 Register Dst = MI.getOperand(0).getReg();
4134 Register Src = MI.getOperand(1).getReg();
4135 LLT DstTy = MRI.getType(Dst);
4136 LLT SrcTy = MRI.getType(Src);
4137
4138 unsigned NewOpc = MI.getOpcode() == AMDGPU::G_CTLZ
4139 ? AMDGPU::G_AMDGPU_FFBH_U32
4140 : AMDGPU::G_AMDGPU_FFBL_B32;
4141 auto Tmp = B.buildInstr(NewOpc, {DstTy}, {Src});
4142 B.buildUMin(Dst, Tmp, B.buildConstant(DstTy, SrcTy.getSizeInBits()));
4143
4144 MI.eraseFromParent();
4145 return true;
4146}
4147
4148// Check that this is a G_XOR x, -1
4149static bool isNot(const MachineRegisterInfo &MRI, const MachineInstr &MI) {
4150 if (MI.getOpcode() != TargetOpcode::G_XOR)
4151 return false;
4152 auto ConstVal = getIConstantVRegSExtVal(MI.getOperand(2).getReg(), MRI);
4153 return ConstVal && *ConstVal == -1;
4154}
4155
4156// Return the use branch instruction, otherwise null if the usage is invalid.
4157static MachineInstr *
4158verifyCFIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI, MachineInstr *&Br,
4159 MachineBasicBlock *&UncondBrTarget, bool &Negated) {
4160 Register CondDef = MI.getOperand(0).getReg();
4161 if (!MRI.hasOneNonDBGUse(CondDef))
4162 return nullptr;
4163
4164 MachineBasicBlock *Parent = MI.getParent();
4165 MachineInstr *UseMI = &*MRI.use_instr_nodbg_begin(CondDef);
4166
4167 if (isNot(MRI, *UseMI)) {
4168 Register NegatedCond = UseMI->getOperand(0).getReg();
4169 if (!MRI.hasOneNonDBGUse(NegatedCond))
4170 return nullptr;
4171
4172 // We're deleting the def of this value, so we need to remove it.
4173 eraseInstr(*UseMI, MRI);
4174
4175 UseMI = &*MRI.use_instr_nodbg_begin(NegatedCond);
4176 Negated = true;
4177 }
4178
4179 if (UseMI->getParent() != Parent || UseMI->getOpcode() != AMDGPU::G_BRCOND)
4180 return nullptr;
4181
4182 // Make sure the cond br is followed by a G_BR, or is the last instruction.
4183 MachineBasicBlock::iterator Next = std::next(UseMI->getIterator());
4184 if (Next == Parent->end()) {
4185 MachineFunction::iterator NextMBB = std::next(Parent->getIterator());
4186 if (NextMBB == Parent->getParent()->end()) // Illegal intrinsic use.
4187 return nullptr;
4188 UncondBrTarget = &*NextMBB;
4189 } else {
4190 if (Next->getOpcode() != AMDGPU::G_BR)
4191 return nullptr;
4192 Br = &*Next;
4193 UncondBrTarget = Br->getOperand(0).getMBB();
4194 }
4195
4196 return UseMI;
4197}
4198
4199bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B,
4200 const ArgDescriptor *Arg,
4201 const TargetRegisterClass *ArgRC,
4202 LLT ArgTy) const {
4203 MCRegister SrcReg = Arg->getRegister();
4204 assert(Register::isPhysicalRegister(SrcReg) && "Physical register expected");
4205 assert(DstReg.isVirtual() && "Virtual register expected");
4206
4207 Register LiveIn = getFunctionLiveInPhysReg(B.getMF(), B.getTII(), SrcReg,
4208 *ArgRC, B.getDebugLoc(), ArgTy);
4209 if (Arg->isMasked()) {
4210 // TODO: Should we try to emit this once in the entry block?
4211 const LLT S32 = LLT::scalar(32);
4212 const unsigned Mask = Arg->getMask();
4213 const unsigned Shift = llvm::countr_zero<unsigned>(Mask);
4214
4215 Register AndMaskSrc = LiveIn;
4216
4217 // TODO: Avoid clearing the high bits if we know workitem id y/z are always
4218 // 0.
4219 if (Shift != 0) {
4220 auto ShiftAmt = B.buildConstant(S32, Shift);
4221 AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0);
4222 }
4223
4224 B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift));
4225 } else {
4226 B.buildCopy(DstReg, LiveIn);
4227 }
4228
4229 return true;
4230}
4231
4232bool AMDGPULegalizerInfo::loadInputValue(
4233 Register DstReg, MachineIRBuilder &B,
4234 AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
4235 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
4236 const ArgDescriptor *Arg = nullptr;
4237 const TargetRegisterClass *ArgRC;
4238 LLT ArgTy;
4239
4240 CallingConv::ID CC = B.getMF().getFunction().getCallingConv();
4241 const ArgDescriptor WorkGroupIDX =
4242 ArgDescriptor::createRegister(AMDGPU::TTMP9);
4243 // If GridZ is not programmed in an entry function then the hardware will set
4244 // it to all zeros, so there is no need to mask the GridY value in the low
4245 // order bits.
4246 const ArgDescriptor WorkGroupIDY = ArgDescriptor::createRegister(
4247 AMDGPU::TTMP7,
4248 AMDGPU::isEntryFunctionCC(CC) && !MFI->hasWorkGroupIDZ() ? ~0u : 0xFFFFu);
4249 const ArgDescriptor WorkGroupIDZ =
4250 ArgDescriptor::createRegister(AMDGPU::TTMP7, 0xFFFF0000u);
4251 if (ST.hasArchitectedSGPRs() &&
4253 switch (ArgType) {
4254 case AMDGPUFunctionArgInfo::WORKGROUP_ID_X:
4255 Arg = &WorkGroupIDX;
4256 ArgRC = &AMDGPU::SReg_32RegClass;
4257 ArgTy = LLT::scalar(32);
4258 break;
4259 case AMDGPUFunctionArgInfo::WORKGROUP_ID_Y:
4260 Arg = &WorkGroupIDY;
4261 ArgRC = &AMDGPU::SReg_32RegClass;
4262 ArgTy = LLT::scalar(32);
4263 break;
4264 case AMDGPUFunctionArgInfo::WORKGROUP_ID_Z:
4265 Arg = &WorkGroupIDZ;
4266 ArgRC = &AMDGPU::SReg_32RegClass;
4267 ArgTy = LLT::scalar(32);
4268 break;
4269 default:
4270 break;
4271 }
4272 }
4273
4274 if (!Arg)
4275 std::tie(Arg, ArgRC, ArgTy) = MFI->getPreloadedValue(ArgType);
4276
4277 if (!Arg) {
4278 if (ArgType == AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR) {
4279 // The intrinsic may appear when we have a 0 sized kernarg segment, in which
4280 // case the pointer argument may be missing and we use null.
4281 B.buildConstant(DstReg, 0);
4282 return true;
4283 }
4284
4285 // It's undefined behavior if a function marked with the amdgpu-no-*
4286 // attributes uses the corresponding intrinsic.
4287 B.buildUndef(DstReg);
4288 return true;
4289 }
4290
4291 if (!Arg->isRegister() || !Arg->getRegister().isValid())
4292 return false; // TODO: Handle these
4293 return loadInputValue(DstReg, B, Arg, ArgRC, ArgTy);
4294}
4295
4296bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin(
4297 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B,
4298 AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
4299 if (!loadInputValue(MI.getOperand(0).getReg(), B, ArgType))
4300 return false;
4301
4302 MI.eraseFromParent();
4303 return true;
4304}
4305
4306static bool replaceWithConstant(MachineIRBuilder &B, MachineInstr &MI,
4307 int64_t C) {
4308 B.buildConstant(MI.getOperand(0).getReg(), C);
4309 MI.eraseFromParent();
4310 return true;
4311}
4312
4313bool AMDGPULegalizerInfo::legalizeWorkitemIDIntrinsic(
4314 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B,
4315 unsigned Dim, AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
4316 unsigned MaxID = ST.getMaxWorkitemID(B.getMF().getFunction(), Dim);
4317 if (MaxID == 0)
4318 return replaceWithConstant(B, MI, 0);
4319
4320 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
4321 const ArgDescriptor *Arg;
4322 const TargetRegisterClass *ArgRC;
4323 LLT ArgTy;
4324 std::tie(Arg, ArgRC, ArgTy) = MFI->getPreloadedValue(ArgType);
4325
4326 Register DstReg = MI.getOperand(0).getReg();
4327 if (!Arg) {
4328 // It's undefined behavior if a function marked with the amdgpu-no-*
4329 // attributes uses the corresponding intrinsic.
4330 B.buildUndef(DstReg);
4331 MI.eraseFromParent();
4332 return true;
4333 }
4334
4335 if (Arg->isMasked()) {
4336 // Don't bother inserting AssertZext for packed IDs since we're emitting the
4337 // masking operations anyway.
4338 //
4339 // TODO: We could assert the top bit is 0 for the source copy.
4340 if (!loadInputValue(DstReg, B, ArgType))
4341 return false;
4342 } else {
4343 Register TmpReg = MRI.createGenericVirtualRegister(LLT::scalar(32));
4344 if (!loadInputValue(TmpReg, B, ArgType))
4345 return false;
4346 B.buildAssertZExt(DstReg, TmpReg, llvm::bit_width(MaxID));
4347 }
4348
4349 MI.eraseFromParent();
4350 return true;
4351}
4352
4353Register AMDGPULegalizerInfo::getKernargParameterPtr(MachineIRBuilder &B,
4354 int64_t Offset) const {
4355 LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
4356 Register KernArgReg = B.getMRI()->createGenericVirtualRegister(PtrTy);
4357
4358 // TODO: If we passed in the base kernel offset we could have a better
4359 // alignment than 4, but we don't really need it.
4360 if (!loadInputValue(KernArgReg, B,
4361 AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR))
4362 llvm_unreachable("failed to find kernarg segment ptr");
4363
4364 auto COffset = B.buildConstant(LLT::scalar(64), Offset);
4365 // TODO: Should get nuw
4366 return B.buildPtrAdd(PtrTy, KernArgReg, COffset).getReg(0);
4367}
4368
4369/// Legalize a value that's loaded from kernel arguments. This is only used by
4370/// legacy intrinsics.
4371bool AMDGPULegalizerInfo::legalizeKernargMemParameter(MachineInstr &MI,
4372 MachineIRBuilder &B,
4373 uint64_t Offset,
4374 Align Alignment) const {
4375 Register DstReg = MI.getOperand(0).getReg();
4376
4377 assert(B.getMRI()->getType(DstReg) == LLT::scalar(32) &&
4378 "unexpected kernarg parameter type");
4379
4380 Register Ptr = getKernargParameterPtr(B, Offset);
4381 MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
4382 B.buildLoad(DstReg, Ptr, PtrInfo, Align(4),
4383 MachineMemOperand::MODereferenceable |
4384 MachineMemOperand::MOInvariant);
4385 MI.eraseFromParent();
4386 return true;
4387}
4388
4389bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI,
4390 MachineRegisterInfo &MRI,
4391 MachineIRBuilder &B) const {
4392 Register Dst = MI.getOperand(0).getReg();
4393 LLT DstTy = MRI.getType(Dst);
4394 LLT S16 = LLT::scalar(16);
4395 LLT S32 = LLT::scalar(32);
4396 LLT S64 = LLT::scalar(64);
4397
4398 if (DstTy == S16)
4399 return legalizeFDIV16(MI, MRI, B);
4400 if (DstTy == S32)
4401 return legalizeFDIV32(MI, MRI, B);
4402 if (DstTy == S64)
4403 return legalizeFDIV64(MI, MRI, B);
4404
4405 return false;
4406}
4407
4408void AMDGPULegalizerInfo::legalizeUnsignedDIV_REM32Impl(MachineIRBuilder &B,
4409 Register DstDivReg,
4410 Register DstRemReg,
4411 Register X,
4412 Register Y) const {
4413 const LLT S1 = LLT::scalar(1);
4414 const LLT S32 = LLT::scalar(32);
4415
4416 // See AMDGPUCodeGenPrepare::expandDivRem32 for a description of the
4417 // algorithm used here.
4418
4419 // Initial estimate of inv(y).
4420 auto FloatY = B.buildUITOFP(S32, Y);
4421 auto RcpIFlag = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {FloatY});
4422 auto Scale = B.buildFConstant(S32, llvm::bit_cast<float>(0x4f7ffffe));
4423 auto ScaledY = B.buildFMul(S32, RcpIFlag, Scale);
4424 auto Z = B.buildFPTOUI(S32, ScaledY);
4425
4426 // One round of UNR.
4427 auto NegY = B.buildSub(S32, B.buildConstant(S32, 0), Y);
4428 auto NegYZ = B.buildMul(S32, NegY, Z);
4429 Z = B.buildAdd(S32, Z, B.buildUMulH(S32, Z, NegYZ));
4430
4431 // Quotient/remainder estimate.
4432 auto Q = B.buildUMulH(S32, X, Z);
4433 auto R = B.buildSub(S32, X, B.buildMul(S32, Q, Y));
4434
4435 // First quotient/remainder refinement.
4436 auto One = B.buildConstant(S32, 1);
4437 auto Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y);
4438 if (DstDivReg)
4439 Q = B.buildSelect(S32, Cond, B.buildAdd(S32, Q, One), Q);
4440 R = B.buildSelect(S32, Cond, B.buildSub(S32, R, Y), R);
4441
4442 // Second quotient/remainder refinement.
4443 Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y);
4444 if (DstDivReg)
4445 B.buildSelect(DstDivReg, Cond, B.buildAdd(S32, Q, One), Q);
4446
4447 if (DstRemReg)
4448 B.buildSelect(DstRemReg, Cond, B.buildSub(S32, R, Y), R);
4449}
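// The sequence above, written as host code for reference (illustrative sketch
// only; names are not from this file, it assumes <cstdint>, and the real
// lowering's correctness argument rests on the exact RCP_IFLAG behavior and
// the two refinement steps):
//
//   void udivrem32(uint32_t X, uint32_t Y, uint32_t &Q, uint32_t &R) {
//     // Initial estimate of 2^32 / Y; 0x4f7ffffe is ~2^32 * (1 - 2^-23).
//     uint32_t Z = (uint32_t)((1.0f / (float)Y) * 0x1.fffffcp+31f);
//     Z += (uint32_t)(((uint64_t)Z * (uint32_t)(0u - Y * Z)) >> 32); // UNR
//     Q = (uint32_t)(((uint64_t)X * Z) >> 32);
//     R = X - Q * Y;
//     if (R >= Y) { ++Q; R -= Y; } // first refinement
//     if (R >= Y) { ++Q; R -= Y; } // second refinement
//   }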
4450
4451// Build integer reciprocal sequence around V_RCP_IFLAG_F32
4452//
4453// Return lo, hi of result
4454//
4455// %cvt.lo = G_UITOFP Val.lo
4456// %cvt.hi = G_UITOFP Val.hi
4457// %mad = G_FMAD %cvt.hi, 2**32, %cvt.lo
4458// %rcp = G_AMDGPU_RCP_IFLAG %mad
4459// %mul1 = G_FMUL %rcp, 0x5f7ffffc
4460// %mul2 = G_FMUL %mul1, 2**(-32)
4461// %trunc = G_INTRINSIC_TRUNC %mul2
4462// %mad2 = G_FMAD %trunc, -(2**32), %mul1
4463// return {G_FPTOUI %mad2, G_FPTOUI %trunc}
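//
// The float bit patterns decode to: 0x4f800000 = 0x1.0p+32,
// 0x5f7ffffc = 0x1.fffff8p+63 (just under 2^64), 0x2f800000 = 0x1.0p-32 and
// 0xcf800000 = -0x1.0p+32, so the sequence approximates 2^64 / Val split into
// {lo, hi} 32-bit halves.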
4464static std::pair<Register, Register> emitReciprocalU64(MachineIRBuilder &B,
4465 Register Val) {
4466 const LLT S32 = LLT::scalar(32);
4467 auto Unmerge = B.buildUnmerge(S32, Val);
4468
4469 auto CvtLo = B.buildUITOFP(S32, Unmerge.getReg(0));
4470 auto CvtHi = B.buildUITOFP(S32, Unmerge.getReg(1));
4471
4472 auto Mad = B.buildFMAD(
4473 S32, CvtHi, // 2**32
4474 B.buildFConstant(S32, llvm::bit_cast<float>(0x4f800000)), CvtLo);
4475
4476 auto Rcp = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {Mad});
4477 auto Mul1 = B.buildFMul(
4478 S32, Rcp, B.buildFConstant(S32, llvm::bit_cast<float>(0x5f7ffffc)));
4479
4480 // 2**(-32)
4481 auto Mul2 = B.buildFMul(
4482 S32, Mul1, B.buildFConstant(S32, llvm::bit_cast<float>(0x2f800000)));
4483 auto Trunc = B.buildIntrinsicTrunc(S32, Mul2);
4484
4485 // -(2**32)
4486 auto Mad2 = B.buildFMAD(
4487 S32, Trunc, B.buildFConstant(S32, llvm::bit_cast<float>(0xcf800000)),
4488 Mul1);
4489
4490 auto ResultLo = B.buildFPTOUI(S32, Mad2);
4491 auto ResultHi = B.buildFPTOUI(S32, Trunc);
4492
4493 return {ResultLo.getReg(0), ResultHi.getReg(0)};
4494}
4495
4496void AMDGPULegalizerInfo::legalizeUnsignedDIV_REM64Impl(MachineIRBuilder &B,
4497 Register DstDivReg,
4498 Register DstRemReg,
4499 Register Numer,
4500 Register Denom) const {
4501 const LLT S32 = LLT::scalar(32);
4502 const LLT S64 = LLT::scalar(64);
4503 const LLT S1 = LLT::scalar(1);
4504 Register RcpLo, RcpHi;
4505
4506 std::tie(RcpLo, RcpHi) = emitReciprocalU64(B, Denom);
4507
4508 auto Rcp = B.buildMergeLikeInstr(S64, {RcpLo, RcpHi});
4509
4510 auto Zero64 = B.buildConstant(S64, 0);
4511 auto NegDenom = B.buildSub(S64, Zero64, Denom);
4512
4513 auto MulLo1 = B.buildMul(S64, NegDenom, Rcp);
4514 auto MulHi1 = B.buildUMulH(S64, Rcp, MulLo1);
4515
4516 auto UnmergeMulHi1 = B.buildUnmerge(S32, MulHi1);
4517 Register MulHi1_Lo = UnmergeMulHi1.getReg(0);
4518 Register MulHi1_Hi = UnmergeMulHi1.getReg(1);
4519
4520 auto Add1_Lo = B.buildUAddo(S32, S1, RcpLo, MulHi1_Lo);
4521 auto Add1_Hi = B.buildUAdde(S32, S1, RcpHi, MulHi1_Hi, Add1_Lo.getReg(1));
4522 auto Add1 = B.buildMergeLikeInstr(S64, {Add1_Lo, Add1_Hi});
4523
4524 auto MulLo2 = B.buildMul(S64, NegDenom, Add1);
4525 auto MulHi2 = B.buildUMulH(S64, Add1, MulLo2);
4526 auto UnmergeMulHi2 = B.buildUnmerge(S32, MulHi2);
4527 Register MulHi2_Lo = UnmergeMulHi2.getReg(0);
4528 Register MulHi2_Hi = UnmergeMulHi2.getReg(1);
4529
4530 auto Zero32 = B.buildConstant(S32, 0);
4531 auto Add2_Lo = B.buildUAddo(S32, S1, Add1_Lo, MulHi2_Lo);
4532 auto Add2_Hi = B.buildUAdde(S32, S1, Add1_Hi, MulHi2_Hi, Add2_Lo.getReg(1));
4533 auto Add2 = B.buildMergeLikeInstr(S64, {Add2_Lo, Add2_Hi});
4534
4535 auto UnmergeNumer = B.buildUnmerge(S32, Numer);
4536 Register NumerLo = UnmergeNumer.getReg(0);
4537 Register NumerHi = UnmergeNumer.getReg(1);
4538
4539 auto MulHi3 = B.buildUMulH(S64, Numer, Add2);
4540 auto Mul3 = B.buildMul(S64, Denom, MulHi3);
4541 auto UnmergeMul3 = B.buildUnmerge(S32, Mul3);
4542 Register Mul3_Lo = UnmergeMul3.getReg(0);
4543 Register Mul3_Hi = UnmergeMul3.getReg(1);
4544 auto Sub1_Lo = B.buildUSubo(S32, S1, NumerLo, Mul3_Lo);
4545 auto Sub1_Hi = B.buildUSube(S32, S1, NumerHi, Mul3_Hi, Sub1_Lo.getReg(1));
4546 auto Sub1_Mi = B.buildSub(S32, NumerHi, Mul3_Hi);
4547 auto Sub1 = B.buildMergeLikeInstr(S64, {Sub1_Lo, Sub1_Hi});
4548
4549 auto UnmergeDenom = B.buildUnmerge(S32, Denom);
4550 Register DenomLo = UnmergeDenom.getReg(0);
4551 Register DenomHi = UnmergeDenom.getReg(1);
4552
4553 auto CmpHi = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Hi, DenomHi);
4554 auto C1 = B.buildSExt(S32, CmpHi);
4555
4556 auto CmpLo = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Lo, DenomLo);
4557 auto C2 = B.buildSExt(S32, CmpLo);
4558
4559 auto CmpEq = B.buildICmp(CmpInst::ICMP_EQ, S1, Sub1_Hi, DenomHi);
4560 auto C3 = B.buildSelect(S32, CmpEq, C2, C1);
4561
4562 // TODO: Here and below, portions of the code could be enclosed in if/endif.
4563 // Currently control flow is unconditional and we have 4 selects after the
4564 // potential endif to substitute PHIs.
4565
4566 // if C3 != 0 ...
4567 auto Sub2_Lo = B.buildUSubo(S32, S1, Sub1_Lo, DenomLo);
4568 auto Sub2_Mi = B.buildUSube(S32, S1, Sub1_Mi, DenomHi, Sub1_Lo.getReg(1));
4569 auto Sub2_Hi = B.buildUSube(S32, S1, Sub2_Mi, Zero32, Sub2_Lo.getReg(1));
4570 auto Sub2 = B.buildMergeLikeInstr(S64, {Sub2_Lo, Sub2_Hi});
4571
4572 auto One64 = B.buildConstant(S64, 1);
4573 auto Add3 = B.buildAdd(S64, MulHi3, One64);
4574
4575 auto C4 =
4576 B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Hi, DenomHi));
4577 auto C5 =
4578 B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Lo, DenomLo));
4579 auto C6 = B.buildSelect(
4580 S32, B.buildICmp(CmpInst::ICMP_EQ, S1, Sub2_Hi, DenomHi), C5, C4);
4581
4582 // if (C6 != 0)
4583 auto Add4 = B.buildAdd(S64, Add3, One64);
4584 auto Sub3_Lo = B.buildUSubo(S32, S1, Sub2_Lo, DenomLo);
4585
4586 auto Sub3_Mi = B.buildUSube(S32, S1, Sub2_Mi, DenomHi, Sub2_Lo.getReg(1));
4587 auto Sub3_Hi = B.buildUSube(S32, S1, Sub3_Mi, Zero32, Sub3_Lo.getReg(1));
4588 auto Sub3 = B.buildMergeLikeInstr(S64, {Sub3_Lo, Sub3_Hi});
4589
4590 // endif C6
4591 // endif C3
4592
4593 if (DstDivReg) {
4594 auto Sel1 = B.buildSelect(
4595 S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Add4, Add3);
4596 B.buildSelect(DstDivReg, B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32),
4597 Sel1, MulHi3);
4598 }
4599
4600 if (DstRemReg) {
4601 auto Sel2 = B.buildSelect(
4602 S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Sub3, Sub2);
4603 B.buildSelect(DstRemReg, B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32),
4604 Sel2, Sub1);
4605 }
4606}
4607
4608bool AMDGPULegalizerInfo::legalizeUnsignedDIV_REM(MachineInstr &MI,
4609 MachineRegisterInfo &MRI,
4610 MachineIRBuilder &B) const {
4611 Register DstDivReg, DstRemReg;
4612 switch (MI.getOpcode()) {
4613 default:
4614 llvm_unreachable("Unexpected opcode!");
4615 case AMDGPU::G_UDIV: {
4616 DstDivReg = MI.getOperand(0).getReg();
4617 break;
4618 }
4619 case AMDGPU::G_UREM: {
4620 DstRemReg = MI.getOperand(0).getReg();
4621 break;
4622 }
4623 case AMDGPU::G_UDIVREM: {
4624 DstDivReg = MI.getOperand(0).getReg();
4625 DstRemReg = MI.getOperand(1).getReg();
4626 break;
4627 }
4628 }
4629
4630 const LLT S64 = LLT::scalar(64);
4631 const LLT S32 = LLT::scalar(32);
4632 const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs();
4633 Register Num = MI.getOperand(FirstSrcOpIdx).getReg();
4634 Register Den = MI.getOperand(FirstSrcOpIdx + 1).getReg();
4635 LLT Ty = MRI.getType(MI.getOperand(0).getReg());
4636
4637 if (Ty == S32)
4638 legalizeUnsignedDIV_REM32Impl(B, DstDivReg, DstRemReg, Num, Den);
4639 else if (Ty == S64)
4640 legalizeUnsignedDIV_REM64Impl(B, DstDivReg, DstRemReg, Num, Den);
4641 else
4642 return false;
4643
4644 MI.eraseFromParent();
4645 return true;
4646}
4647
4648bool AMDGPULegalizerInfo::legalizeSignedDIV_REM(MachineInstr &MI,
4649 MachineRegisterInfo &MRI,
4650 MachineIRBuilder &B) const {
4651 const LLT S64 = LLT::scalar(64);
4652 const LLT S32 = LLT::scalar(32);
4653
4654 LLT Ty = MRI.getType(MI.getOperand(0).getReg());
4655 if (Ty != S32 && Ty != S64)
4656 return false;
4657
4658 const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs();
4659 Register LHS = MI.getOperand(FirstSrcOpIdx).getReg();
4660 Register RHS = MI.getOperand(FirstSrcOpIdx + 1).getReg();
4661
4662 auto SignBitOffset = B.buildConstant(S32, Ty.getSizeInBits() - 1);
4663 auto LHSign = B.buildAShr(Ty, LHS, SignBitOffset);
4664 auto RHSign = B.buildAShr(Ty, RHS, SignBitOffset);
4665
4666 LHS = B.buildAdd(Ty, LHS, LHSign).getReg(0);
4667 RHS = B.buildAdd(Ty, RHS, RHSign).getReg(0);
4668
4669 LHS = B.buildXor(Ty, LHS, LHSign).getReg(0);
4670 RHS = B.buildXor(Ty, RHS, RHSign).getReg(0);
4671
4672 Register DstDivReg, DstRemReg, TmpDivReg, TmpRemReg;
4673 switch (MI.getOpcode()) {
4674 default:
4675 llvm_unreachable("Unexpected opcode!");
4676 case AMDGPU::G_SDIV: {
4677 DstDivReg = MI.getOperand(0).getReg();
4678 TmpDivReg = MRI.createGenericVirtualRegister(Ty);
4679 break;
4680 }
4681 case AMDGPU::G_SREM: {
4682 DstRemReg = MI.getOperand(0).getReg();
4683 TmpRemReg = MRI.createGenericVirtualRegister(Ty);
4684 break;
4685 }
4686 case AMDGPU::G_SDIVREM: {
4687 DstDivReg = MI.getOperand(0).getReg();
4688 DstRemReg = MI.getOperand(1).getReg();
4689 TmpDivReg = MRI.createGenericVirtualRegister(Ty);
4690 TmpRemReg = MRI.createGenericVirtualRegister(Ty);
4691 break;
4692 }
4693 }
4694
4695 if (Ty == S32)
4696 legalizeUnsignedDIV_REM32Impl(B, TmpDivReg, TmpRemReg, LHS, RHS);
4697 else
4698 legalizeUnsignedDIV_REM64Impl(B, TmpDivReg, TmpRemReg, LHS, RHS);
4699
4700 if (DstDivReg) {
4701 auto Sign = B.buildXor(Ty, LHSign, RHSign).getReg(0);
4702 auto SignXor = B.buildXor(Ty, TmpDivReg, Sign).getReg(0);
4703 B.buildSub(DstDivReg, SignXor, Sign);
4704 }
4705
4706 if (DstRemReg) {
4707 auto Sign = LHSign.getReg(0); // Remainder sign is the same as LHS
4708 auto SignXor = B.buildXor(Ty, TmpRemReg, Sign).getReg(0);
4709 B.buildSub(DstRemReg, SignXor, Sign);
4710 }
4711
4712 MI.eraseFromParent();
4713 return true;
4714}
4715
4716bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI,
4717 MachineRegisterInfo &MRI,
4718 MachineIRBuilder &B) const {
4719 Register Res = MI.getOperand(0).getReg();
4720 Register LHS = MI.getOperand(1).getReg();
4721 Register RHS = MI.getOperand(2).getReg();
4722 uint16_t Flags = MI.getFlags();
4723 LLT ResTy = MRI.getType(Res);
4724
4725 const MachineFunction &MF = B.getMF();
4726 bool AllowInaccurateRcp = MI.getFlag(MachineInstr::FmAfn) ||
4727 MF.getTarget().Options.UnsafeFPMath;
4728
4729 if (auto CLHS = getConstantFPVRegVal(LHS, MRI)) {
4730 if (!AllowInaccurateRcp && ResTy != LLT::scalar(16))
4731 return false;
4732
4733 // v_rcp_f32 and v_rsq_f32 do not support denormals, and according to
4734 // the CI documentation have a worst case error of 1 ulp.
4735 // OpenCL requires <= 2.5 ulp for 1.0 / x, so it should always be OK to
4736 // use it as long as we aren't trying to use denormals.
4737 //
4738 // v_rcp_f16 and v_rsq_f16 DO support denormals and 0.51ulp.
4739
4740 // 1 / x -> RCP(x)
4741 if (CLHS->isExactlyValue(1.0)) {
4742 B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res)
4743 .addUse(RHS)
4744 .setMIFlags(Flags);
4745
4746 MI.eraseFromParent();
4747 return true;
4748 }
4749
4750 // -1 / x -> RCP( FNEG(x) )
4751 if (CLHS->isExactlyValue(-1.0)) {
4752 auto FNeg = B.buildFNeg(ResTy, RHS, Flags);
4753 B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res)
4754 .addUse(FNeg.getReg(0))
4755 .setMIFlags(Flags);
4756
4757 MI.eraseFromParent();
4758 return true;
4759 }
4760 }
4761
4762 // For f16 require afn or arcp.
4763 // For f32 require afn.
4764 if (!AllowInaccurateRcp && (ResTy != LLT::scalar(16) ||
4765 !MI.getFlag(MachineInstr::FmArcp)))
4766 return false;
4767
4768 // x / y -> x * (1.0 / y)
4769 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy})
4770 .addUse(RHS)
4771 .setMIFlags(Flags);
4772 B.buildFMul(Res, LHS, RCP, Flags);
4773
4774 MI.eraseFromParent();
4775 return true;
4776}
4777
4778bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV64(MachineInstr &MI,
4779 MachineRegisterInfo &MRI,
4780 MachineIRBuilder &B) const {
4781 Register Res = MI.getOperand(0).getReg();
4782 Register X = MI.getOperand(1).getReg();
4783 Register Y = MI.getOperand(2).getReg();
4784 uint16_t Flags = MI.getFlags();
4785 LLT ResTy = MRI.getType(Res);
4786
4787 const MachineFunction &MF = B.getMF();
4788 bool AllowInaccurateRcp = MF.getTarget().Options.UnsafeFPMath ||
4789 MI.getFlag(MachineInstr::FmAfn);
4790
4791 if (!AllowInaccurateRcp)
4792 return false;
4793
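  // What follows is two Newton-Raphson steps on r ~= 1/y,
  //   r' = r + r * (1 - y*r),
  // then one residual correction of the quotient q = x*r:
  //   result = q + r * (x - y*q).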
4794 auto NegY = B.buildFNeg(ResTy, Y);
4795 auto One = B.buildFConstant(ResTy, 1.0);
4796
4797 auto R = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy})
4798 .addUse(Y)
4799 .setMIFlags(Flags);
4800
4801 auto Tmp0 = B.buildFMA(ResTy, NegY, R, One);
4802 R = B.buildFMA(ResTy, Tmp0, R, R);
4803
4804 auto Tmp1 = B.buildFMA(ResTy, NegY, R, One);
4805 R = B.buildFMA(ResTy, Tmp1, R, R);
4806
4807 auto Ret = B.buildFMul(ResTy, X, R);
4808 auto Tmp2 = B.buildFMA(ResTy, NegY, Ret, X);
4809
4810 B.buildFMA(Res, Tmp2, R, Ret);
4811 MI.eraseFromParent();
4812 return true;
4813}
4814
4815bool AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI,
4816 MachineRegisterInfo &MRI,
4817 MachineIRBuilder &B) const {
4818 if (legalizeFastUnsafeFDIV(MI, MRI, B))
4819 return true;
4820
4821 Register Res = MI.getOperand(0).getReg();
4822 Register LHS = MI.getOperand(1).getReg();
4823 Register RHS = MI.getOperand(2).getReg();
4824
4825 uint16_t Flags = MI.getFlags();
4826
4827 LLT S16 = LLT::scalar(16);
4828 LLT S32 = LLT::scalar(32);
4829
4830 auto LHSExt = B.buildFPExt(S32, LHS, Flags);
4831 auto RHSExt = B.buildFPExt(S32, RHS, Flags);
4832
4833 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32})
4834 .addUse(RHSExt.getReg(0))
4835 .setMIFlags(Flags);
4836
4837 auto QUOT = B.buildFMul(S32, LHSExt, RCP, Flags);
4838 auto RDst = B.buildFPTrunc(S16, QUOT, Flags);
4839
4840 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res)
4841 .addUse(RDst.getReg(0))
4842 .addUse(RHS)
4843 .addUse(LHS)
4844 .setMIFlags(Flags);
4845
4846 MI.eraseFromParent();
4847 return true;
4848}
4849
4850static constexpr unsigned SPDenormModeBitField =
4852
4853// Enable or disable FP32 denorm mode. When 'Enable' is true, emit instructions
4854// to enable denorm mode. When 'Enable' is false, disable denorm mode.
4855static void toggleSPDenormMode(bool Enable, MachineIRBuilder &B,
4856 const GCNSubtarget &ST,
4857 SIModeRegisterDefaults Mode) {
4858 // Set SP denorm mode to this value.
4859 unsigned SPDenormMode =
4860 Enable ? FP_DENORM_FLUSH_NONE : Mode.fpDenormModeSPValue();
4861
4862 if (ST.hasDenormModeInst()) {
4863 // Preserve default FP64FP16 denorm mode while updating FP32 mode.
4864 uint32_t DPDenormModeDefault = Mode.fpDenormModeDPValue();
4865
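    // In the S_DENORM_MODE immediate, bits [1:0] select the FP32 (SP) denorm
    // mode and bits [3:2] the FP64/FP16 (DP) mode, hence the shift by 2 for
    // the DP default below.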
4866 uint32_t NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2);
4867 B.buildInstr(AMDGPU::S_DENORM_MODE)
4868 .addImm(NewDenormModeValue);
4869
4870 } else {
4871 B.buildInstr(AMDGPU::S_SETREG_IMM32_B32)
4872 .addImm(SPDenormMode)
4873 .addImm(SPDenormModeBitField);
4874 }
4875}
4876
4877bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI,
4878 MachineRegisterInfo &MRI,
4879 MachineIRBuilder &B) const {
4880 if (legalizeFastUnsafeFDIV(MI, MRI, B))
4881 return true;
4882
4883 Register Res = MI.getOperand(0).getReg();
4884 Register LHS = MI.getOperand(1).getReg();
4885 Register RHS = MI.getOperand(2).getReg();
4886 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
4887 SIModeRegisterDefaults Mode = MFI->getMode();
4888
4889 uint16_t Flags = MI.getFlags();
4890
4891 LLT S32 = LLT::scalar(32);
4892 LLT S1 = LLT::scalar(1);
4893
4894 auto One = B.buildFConstant(S32, 1.0f);
4895
4896 auto DenominatorScaled =
4897 B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1})
4898 .addUse(LHS)
4899 .addUse(RHS)
4900 .addImm(0)
4901 .setMIFlags(Flags);
4902 auto NumeratorScaled =
4903 B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1})
4904 .addUse(LHS)
4905 .addUse(RHS)
4906 .addImm(1)
4907 .setMIFlags(Flags);
4908
4909 auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32})
4910 .addUse(DenominatorScaled.getReg(0))
4911 .setMIFlags(Flags);
4912 auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags);
4913
4914 const bool PreservesDenormals = Mode.FP32Denormals == DenormalMode::getIEEE();
4915 const bool HasDynamicDenormals =
4916 (Mode.FP32Denormals.Input == DenormalMode::Dynamic) ||
4917 (Mode.FP32Denormals.Output == DenormalMode::Dynamic);
4918
4919 Register SavedSPDenormMode;
4920 if (!PreservesDenormals) {
4921 if (HasDynamicDenormals) {
4922 SavedSPDenormMode = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
4923 B.buildInstr(AMDGPU::S_GETREG_B32)
4924 .addDef(SavedSPDenormMode)
4925 .addImm(SPDenormModeBitField);
4926 }
4927 toggleSPDenormMode(true, B, ST, Mode);
4928 }
4929
4930 auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags);
4931 auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags);
4932 auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags);
4933 auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags);
4934 auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags);
4935 auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags);
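  // With d = DenominatorScaled, n = NumeratorScaled and r = ApproxRcp, the
  // chain above computes:
  //   e0 = 1 - d*r,    r1 = r + r*e0     (refined reciprocal)
  //   q0 = n * r1,     e1 = n - d*q0     (quotient estimate and residual)
  //   q1 = q0 + r1*e1, e2 = n - d*q1,
  // and div_fmas/div_fixup below combine (e2, r1, q1) with the div_scale flag
  // to produce the final result.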
4936
4937 if (!PreservesDenormals) {
4938 if (HasDynamicDenormals) {
4939 assert(SavedSPDenormMode);
4940 B.buildInstr(AMDGPU::S_SETREG_B32)
4941 .addReg(SavedSPDenormMode)
4942 .addImm(SPDenormModeBitField);
4943 } else
4944 toggleSPDenormMode(false, B, ST, Mode);
4945 }
4946
4947 auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32})
4948 .addUse(Fma4.getReg(0))
4949 .addUse(Fma1.getReg(0))
4950 .addUse(Fma3.getReg(0))
4951 .addUse(NumeratorScaled.getReg(1))
4952 .setMIFlags(Flags);
4953
4954 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res)
4955 .addUse(Fmas.getReg(0))
4956 .addUse(RHS)
4957 .addUse(LHS)
4958 .setMIFlags(Flags);
4959
4960 MI.eraseFromParent();
4961 return true;
4962}
4963
4964bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI,
4965 MachineRegisterInfo &MRI,
4966 MachineIRBuilder &B) const {
4967 if (legalizeFastUnsafeFDIV(MI, MRI, B))
4968 return true;
4969
4970 Register Res = MI.getOperand(0).getReg();
4971 Register LHS = MI.getOperand(1).getReg();
4972 Register RHS = MI.getOperand(2).getReg();
4973
4974 uint16_t Flags = MI.getFlags();
4975
4976 LLT S64 = LLT::scalar(64);
4977 LLT S1 = LLT::scalar(1);
4978
4979 auto One = B.buildFConstant(S64, 1.0);
4980
4981 auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1})
4982 .addUse(LHS)
4983 .addUse(RHS)
4984 .addImm(0)
4985 .setMIFlags(Flags);
4986
4987 auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags);
4988
4989 auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64})
4990 .addUse(DivScale0.getReg(0))
4991 .setMIFlags(Flags);
4992
4993 auto Fma0 = B.buildFMA(S64, NegDivScale0, Rcp, One, Flags);
4994 auto Fma1 = B.buildFMA(S64, Rcp, Fma0, Rcp, Flags);
4995 auto Fma2 = B.buildFMA(S64, NegDivScale0, Fma1, One, Flags);
4996
4997 auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1})
4998 .addUse(LHS)
4999 .addUse(RHS)
5000 .addImm(1)
5001 .setMIFlags(Flags);
5002
5003 auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags);
5004 auto Mul = B.buildFMul(S64, DivScale1.getReg(0), Fma3, Flags);
5005 auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags);
5006
5007 Register Scale;
5008 if (!ST.hasUsableDivScaleConditionOutput()) {
5009 // Work around a hardware bug on SI where the condition output from div_scale
5010 // is not usable.
5011
5012 LLT S32 = LLT::scalar(32);
5013
5014 auto NumUnmerge = B.buildUnmerge(S32, LHS);
5015 auto DenUnmerge = B.buildUnmerge(S32, RHS);
5016 auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0);
5017 auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1);
5018
5019 auto CmpNum = B.buildICmp(ICmpInst::ICMP_EQ, S1, NumUnmerge.getReg(1),
5020 Scale1Unmerge.getReg(1));
5021 auto CmpDen = B.buildICmp(ICmpInst::ICMP_EQ, S1, DenUnmerge.getReg(1),
5022 Scale0Unmerge.getReg(1));
5023 Scale = B.buildXor(S1, CmpNum, CmpDen).getReg(0);
5024 } else {
5025 Scale = DivScale1.getReg(1);
5026 }
5027
5028 auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64})
5029 .addUse(Fma4.getReg(0))
5030 .addUse(Fma3.getReg(0))
5031 .addUse(Mul.getReg(0))
5032 .addUse(Scale)
5033 .setMIFlags(Flags);
5034
5035 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, ArrayRef(Res))
5036 .addUse(Fmas.getReg(0))
5037 .addUse(RHS)
5038 .addUse(LHS)
5039 .setMIFlags(Flags);
5040
5041 MI.eraseFromParent();
5042 return true;
5043}
5044
5045bool AMDGPULegalizerInfo::legalizeFFREXP(MachineInstr &MI,
5046 MachineRegisterInfo &MRI,
5047 MachineIRBuilder &B) const {
5048 Register Res0 = MI.getOperand(0).getReg();
5049 Register Res1 = MI.getOperand(1).getReg();
5050 Register Val = MI.getOperand(2).getReg();
5051 uint16_t Flags = MI.getFlags();
5052
5053 LLT Ty = MRI.getType(Res0);
5054 LLT InstrExpTy = Ty == LLT::scalar(16) ? LLT::scalar(16) : LLT::scalar(32);
5055
5056 auto Mant = B.buildIntrinsic(Intrinsic::amdgcn_frexp_mant, {Ty})
5057 .addUse(Val)
5058 .setMIFlags(Flags);
5059 auto Exp = B.buildIntrinsic(Intrinsic::amdgcn_frexp_exp, {InstrExpTy})
5060 .addUse(Val)
5061 .setMIFlags(Flags);
5062
5063 if (ST.hasFractBug()) {
5064 auto Fabs = B.buildFAbs(Ty, Val);
5065 auto Inf = B.buildFConstant(Ty, APFloat::getInf(getFltSemanticForLLT(Ty)));
5066 auto IsFinite =
5067 B.buildFCmp(CmpInst::FCMP_OLT, LLT::scalar(1), Fabs, Inf, Flags);
5068 auto Zero = B.buildConstant(InstrExpTy, 0);
5069 Exp = B.buildSelect(InstrExpTy, IsFinite, Exp, Zero);
5070 Mant = B.buildSelect(Ty, IsFinite, Mant, Val);
5071 }
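  // (The selects above only change the result for non-finite inputs: they
  //  force exp = 0 and mant = val when |val| is inf or nan instead of using
  //  the raw instruction results on subtargets with the fract bug.)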
5072
5073 B.buildCopy(Res0, Mant);
5074 B.buildSExtOrTrunc(Res1, Exp);
5075
5076 MI.eraseFromParent();
5077 return true;
5078}
5079
5080bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI,
5081 MachineRegisterInfo &MRI,
5082 MachineIRBuilder &B) const {
5083 Register Res = MI.getOperand(0).getReg();
5084 Register LHS = MI.getOperand(2).getReg();
5085 Register RHS = MI.getOperand(3).getReg();
5086 uint16_t Flags = MI.getFlags();
5087
5088 LLT S32 = LLT::scalar(32);
5089 LLT S1 = LLT::scalar(1);
5090
5091 auto Abs = B.buildFAbs(S32, RHS, Flags);
5092 const APFloat C0Val(1.0f);
5093
5094 auto C0 = B.buildFConstant(S32, 0x1p+96f);
5095 auto C1 = B.buildFConstant(S32, 0x1p-32f);
5096 auto C2 = B.buildFConstant(S32, 1.0f);
5097
5098 auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags);
5099 auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags);
5100
5101 auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags);
5102
5103 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32})
5104 .addUse(Mul0.getReg(0))
5105 .setMIFlags(Flags);
5106
5107 auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags);
5108
5109 B.buildFMul(Res, Sel, Mul1, Flags);
5110
5111 MI.eraseFromParent();
5112 return true;
5113}
5114
5115bool AMDGPULegalizerInfo::legalizeFSQRTF16(MachineInstr &MI,
5116 MachineRegisterInfo &MRI,
5117 MachineIRBuilder &B) const {
5118 // Bypass the correct expansion that a standard promotion through G_FSQRT
5119 // would get. The f32 op is accurate enough for the f16 case.
5120 unsigned Flags = MI.getFlags();
5121 assert(!ST.has16BitInsts());
5122 const LLT F32 = LLT::scalar(32);
5123 auto Ext = B.buildFPExt(F32, MI.getOperand(1), Flags);
5124 auto Sqrt = B.buildIntrinsic(Intrinsic::amdgcn_sqrt, {F32})
5125 .addUse(Ext.getReg(0))
5126 .setMIFlags(Flags);
5127 B.buildFPTrunc(MI.getOperand(0), Sqrt, Flags);
5128 MI.eraseFromParent();
5129 return true;
5130}
5131
5132bool AMDGPULegalizerInfo::legalizeFSQRTF32(MachineInstr &MI,
5133 MachineRegisterInfo &MRI,
5134 MachineIRBuilder &B) const {
5135 MachineFunction &MF = B.getMF();
5136 Register Dst = MI.getOperand(0).getReg();
5137 Register X = MI.getOperand(1).getReg();
5138 const unsigned Flags = MI.getFlags();
5139 const LLT S1 = LLT::scalar(1);
5140 const LLT F32 = LLT::scalar(32);
5141 const LLT I32 = LLT::scalar(32);
5142
5143 if (allowApproxFunc(MF, Flags)) {
5144 B.buildIntrinsic(Intrinsic::amdgcn_sqrt, ArrayRef<Register>({Dst}))
5145 .addUse(X)
5146 .setMIFlags(Flags);
5147 MI.eraseFromParent();
5148 return true;
5149 }
5150
5151 auto ScaleThreshold = B.buildFConstant(F32, 0x1.0p-96f);
5152 auto NeedScale = B.buildFCmp(CmpInst::FCMP_OGT, S1, ScaleThreshold, X, Flags);
5153 auto ScaleUpFactor = B.buildFConstant(F32, 0x1.0p+32f);
5154 auto ScaledX = B.buildFMul(F32, X, ScaleUpFactor, Flags);
5155 auto SqrtX = B.buildSelect(F32, NeedScale, ScaledX, X, Flags);
5156
5157 Register SqrtS = MRI.createGenericVirtualRegister(F32);
5158 if (needsDenormHandlingF32(MF, X, Flags)) {
5159 B.buildIntrinsic(Intrinsic::amdgcn_sqrt, ArrayRef<Register>({SqrtS}))
5160 .addUse(SqrtX.getReg(0))
5161 .setMIFlags(Flags);
5162
5163 auto NegOne = B.buildConstant(I32, -1);
5164 auto SqrtSNextDown = B.buildAdd(I32, SqrtS, NegOne);
5165
5166 auto NegSqrtSNextDown = B.buildFNeg(F32, SqrtSNextDown, Flags);
5167 auto SqrtVP = B.buildFMA(F32, NegSqrtSNextDown, SqrtS, SqrtX, Flags);
5168
5169 auto PosOne = B.buildConstant(I32, 1);
5170 auto SqrtSNextUp = B.buildAdd(I32, SqrtS, PosOne);
5171
5172 auto NegSqrtSNextUp = B.buildFNeg(F32, SqrtSNextUp, Flags);
5173 auto SqrtVS = B.buildFMA(F32, NegSqrtSNextUp, SqrtS, SqrtX, Flags);
5174
5175 auto Zero = B.buildFConstant(F32, 0.0f);
5176 auto SqrtVPLE0 = B.buildFCmp(CmpInst::FCMP_OLE, S1, SqrtVP, Zero, Flags);
5177
5178 SqrtS =
5179 B.buildSelect(F32, SqrtVPLE0, SqrtSNextDown, SqrtS, Flags).getReg(0);
5180
5181 auto SqrtVPVSGT0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, SqrtVS, Zero, Flags);
5182 SqrtS =
5183 B.buildSelect(F32, SqrtVPVSGT0, SqrtSNextUp, SqrtS, Flags).getReg(0);
5184 } else {
5185 auto SqrtR =
5186 B.buildIntrinsic(Intrinsic::amdgcn_rsq, {F32}).addReg(SqrtX.getReg(0));
5187 B.buildFMul(SqrtS, SqrtX, SqrtR, Flags);
5188
5189 auto Half = B.buildFConstant(F32, 0.5f);
5190 auto SqrtH = B.buildFMul(F32, SqrtR, Half, Flags);
5191 auto NegSqrtH = B.buildFNeg(F32, SqrtH, Flags);
5192 auto SqrtE = B.buildFMA(F32, NegSqrtH, SqrtS, Half, Flags);
5193 SqrtH = B.buildFMA(F32, SqrtH, SqrtE, SqrtH, Flags);
5194 SqrtS = B.buildFMA(F32, SqrtS, SqrtE, SqrtS, Flags).getReg(0);
5195 auto NegSqrtS = B.buildFNeg(F32, SqrtS, Flags);
5196 auto SqrtD = B.buildFMA(F32, NegSqrtS, SqrtS, SqrtX, Flags);
5197 SqrtS = B.buildFMA(F32, SqrtD, SqrtH, SqrtS, Flags).getReg(0);
5198 }
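  // In the first path, SqrtS is the rounded v_sqrt result and the two FMA
  // residuals x - (s - 1ulp)*s and x - (s + 1ulp)*s decide whether to step the
  // result down or up by one ulp. The second path refines s ~= sqrt(x) and
  // h ~= 0.5/sqrt(x) together, then applies a final residual correction based
  // on d = x - s*s.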
5199
5200 auto ScaleDownFactor = B.buildFConstant(F32, 0x1.0p-16f);
5201
5202 auto ScaledDown = B.buildFMul(F32, SqrtS, ScaleDownFactor, Flags);
5203
5204 SqrtS = B.buildSelect(F32, NeedScale, ScaledDown, SqrtS, Flags).getReg(0);
5205
5206 auto IsZeroOrInf = B.buildIsFPClass(LLT::scalar(1), SqrtX, fcZero | fcPosInf);
5207 B.buildSelect(Dst, IsZeroOrInf, SqrtX, SqrtS, Flags);
5208
5209 MI.eraseFromParent();
5210 return true;
5211}
5212
5213bool AMDGPULegalizerInfo::legalizeFSQRTF64(MachineInstr &MI,
5214 MachineRegisterInfo &MRI,
5215 MachineIRBuilder &B) const {
5216 // For the double type, the SQRT and RSQ instructions don't have the required
5217 // precision, so we apply Goldschmidt's algorithm to improve the result:
5218 //
5219 // y0 = rsq(x)
5220 // g0 = x * y0
5221 // h0 = 0.5 * y0
5222 //
5223 // r0 = 0.5 - h0 * g0
5224 // g1 = g0 * r0 + g0
5225 // h1 = h0 * r0 + h0
5226 //
5227 // r1 = 0.5 - h1 * g1 => d0 = x - g1 * g1
5228 // g2 = g1 * r1 + g1  => g2 = d0 * h1 + g1
5229 // h2 = h1 * r1 + h1
5230 //
5231 // r2 = 0.5 - h2 * g2 => d1 = x - g2 * g2
5232 // g3 = g2 * r2 + g2  => g3 = d1 * h1 + g2
5233 //
5234 // sqrt(x) = g3
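  //
  // Illustration only (not how the legalizer emits it; the real expansion
  // below uses input scaling and FMAs): the same iteration as a minimal
  // scalar C++ sketch, assuming a rough reciprocal square root estimate y0
  // is already available:
  //
  //   double goldschmidtSqrt(double x, double y0 /* ~= 1/sqrt(x) */) {
  //     double g = x * y0, h = 0.5 * y0;   // g0, h0
  //     double r = 0.5 - h * g;            // r0
  //     g = g * r + g;                     // g1
  //     h = h * r + h;                     // h1
  //     double d = x - g * g;              // d0
  //     g = d * h + g;                     // g2
  //     d = x - g * g;                     // d1
  //     return d * h + g;                  // g3 ~= sqrt(x)
  //   }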
5235
5236 const LLT S1 = LLT::scalar(1);
5237 const LLT S32 = LLT::scalar(32);
5238 const LLT F64 = LLT::scalar(64);
5239
5240 Register Dst = MI.getOperand(0).getReg();
5241 assert(MRI.getType(Dst) == F64 && "only expect to lower f64 sqrt");
5242
5243 Register X = MI.getOperand(1).getReg();
5244 unsigned Flags = MI.getFlags();
5245
5246 auto ScaleConstant = B.buildFConstant(F64, 0x1.0p-767);
5247
5248 auto ZeroInt = B.buildConstant(S32, 0);
5249 auto Scaling = B.buildFCmp(FCmpInst::FCMP_OLT, S1, X, ScaleConstant);
5250
5251 // Scale up input if it is too small.
5252 auto ScaleUpFactor = B.buildConstant(S32, 256);
5253 auto ScaleUp = B.buildSelect(S32, Scaling, ScaleUpFactor, ZeroInt);
5254 auto SqrtX = B.buildFLdexp(F64, X, ScaleUp, Flags);
5255
5256 auto SqrtY =
5257 B.buildIntrinsic(Intrinsic::amdgcn_rsq, {F64}).addReg(SqrtX.getReg(0));
5258
5259 auto Half = B.buildFConstant(F64, 0.5);
5260 auto SqrtH0 = B.buildFMul(F64, SqrtY, Half);
5261 auto SqrtS0 = B.buildFMul(F64, SqrtX, SqrtY);
5262
5263 auto NegSqrtH0 = B.buildFNeg(F64, SqrtH0);
5264 auto SqrtR0 = B.buildFMA(F64, NegSqrtH0, SqrtS0, Half);
5265
5266 auto SqrtS1 = B.buildFMA(F64, SqrtS0, SqrtR0, SqrtS0);
5267 auto SqrtH1 = B.buildFMA(F64, SqrtH0, SqrtR0, SqrtH0);
5268
5269 auto NegSqrtS1 = B.buildFNeg(F64, SqrtS1);
5270 auto SqrtD0 = B.buildFMA(F64, NegSqrtS1, SqrtS1, SqrtX);
5271
5272 auto SqrtS2 = B.buildFMA(F64, SqrtD0, SqrtH1, SqrtS1);
5273
5274 auto NegSqrtS2 = B.buildFNeg(F64, SqrtS2);
5275 auto SqrtD1 = B.buildFMA(F64, NegSqrtS2, SqrtS2, SqrtX);
5276
5277 auto SqrtRet = B.buildFMA(F64, SqrtD1, SqrtH1, SqrtS2);
5278
5279 // Scale down the result.
5280 auto ScaleDownFactor = B.buildConstant(S32, -128);
5281 auto ScaleDown = B.buildSelect(S32, Scaling, ScaleDownFactor, ZeroInt);
5282 SqrtRet = B.buildFLdexp(F64, SqrtRet, ScaleDown, Flags);
5283
5284 // TODO: Switch to fcmp oeq 0 for finite only. Can't fully remove this check
5285 // with finite only or nsz because rsq(+/-0) = +/-inf
5286
5287 // TODO: Check for DAZ and expand to subnormals
5288 auto IsZeroOrInf = B.buildIsFPClass(LLT::scalar(1), SqrtX, fcZero | fcPosInf);
5289
5290 // If x is +INF, +0, or -0, use its original value
5291 B.buildSelect(Dst, IsZeroOrInf, SqrtX, SqrtRet, Flags);
5292
5293 MI.eraseFromParent();
5294 return true;
5295}
5296
5297bool AMDGPULegalizerInfo::legalizeFSQRT(MachineInstr &MI,
5298 MachineRegisterInfo &MRI,
5299 MachineIRBuilder &B) const {
5300 LLT Ty = MRI.getType(MI.getOperand(0).getReg());
5301 if (Ty == LLT::scalar(32))
5302 return legalizeFSQRTF32(MI, MRI, B);
5303 if (Ty == LLT::scalar(64))
5304 return legalizeFSQRTF64(MI, MRI, B);
5305 if (Ty == LLT::scalar(16))
5306 return legalizeFSQRTF16(MI, MRI, B);
5307 return false;
5308}
5309
5310// Expand llvm.amdgcn.rsq.clamp on targets that don't support the instruction.
5311// FIXME: Why do we handle this one but not other removed instructions?
5312//
5313// Reciprocal square root. The clamp prevents infinite results, clamping
5314// infinities to max_float. D.f = 1.0 / sqrt(S0.f), result clamped to
5315// +-max_float.
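//
// In other words, the expansion below computes
//   max(min(rsq(x), +max_float), -max_float),
// using the IEEE or non-IEEE min/max forms depending on the function's FP
// mode.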
5318 MachineIRBuilder &B) const {
5320 return true;
5321
5322 Register Dst = MI.getOperand(0).getReg();
5323 Register Src = MI.getOperand(2).getReg();
5324 auto Flags = MI.getFlags();
5325
5326 LLT Ty = MRI.getType(Dst);
5327
5328 const fltSemantics *FltSemantics;
5329 if (Ty == LLT::scalar(32))
5330 FltSemantics = &APFloat::IEEEsingle();
5331 else if (Ty == LLT::scalar(64))
5332 FltSemantics = &APFloat::IEEEdouble();
5333 else
5334 return false;
5335
5336 auto Rsq = B.buildIntrinsic(Intrinsic::amdgcn_rsq, {Ty})
5337 .addUse(Src)
5338 .setMIFlags(Flags);
5339
5340 // We don't need to concern ourselves with the snan handling difference, since
5341 // the rsq result is quieted (or not) either way; use the form which will directly select.
5342 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
5343 const bool UseIEEE = MFI->getMode().IEEE;
5344
5345 auto MaxFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics));
5346 auto ClampMax = UseIEEE ? B.buildFMinNumIEEE(Ty, Rsq, MaxFlt, Flags) :
5347 B.buildFMinNum(Ty, Rsq, MaxFlt, Flags);
5348
5349 auto MinFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics, true));
5350
5351 if (UseIEEE)
5352 B.buildFMaxNumIEEE(Dst, ClampMax, MinFlt, Flags);
5353 else
5354 B.buildFMaxNum(Dst, ClampMax, MinFlt, Flags);
5355 MI.eraseFromParent();
5356 return true;
5357}
5358
5359static unsigned getDSFPAtomicOpcode(Intrinsic::ID IID) {
5360 switch (IID) {
5361 case Intrinsic::amdgcn_ds_fadd:
5362 return AMDGPU::G_ATOMICRMW_FADD;
5363 case Intrinsic::amdgcn_ds_fmin:
5364 return AMDGPU::G_AMDGPU_ATOMIC_FMIN;
5365 case Intrinsic::amdgcn_ds_fmax:
5366 return AMDGPU::G_AMDGPU_ATOMIC_FMAX;
5367 default:
5368 llvm_unreachable("not a DS FP intrinsic");
5369 }
5370}
5371
5374 Intrinsic::ID IID) const {
5375 GISelChangeObserver &Observer = Helper.Observer;
5376 Observer.changingInstr(MI);
5377
5378 MI.setDesc(ST.getInstrInfo()->get(getDSFPAtomicOpcode(IID)));
5379
5380 // The remaining operands were used to set fields in the MemOperand on
5381 // construction.
5382 for (int I = 6; I > 3; --I)
5383 MI.removeOperand(I);
5384
5385 MI.removeOperand(1); // Remove the intrinsic ID.
5386 Observer.changedInstr(MI);
5387 return true;
5388}
5389
5392 MachineIRBuilder &B) const {
5396 LLT DstTy = MRI.getType(DstReg);
5397 LLT IdxTy = LLT::scalar(DstTy.getSizeInBits());
5398
5399 Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy);
5400 if (!loadInputValue(KernargPtrReg, B,
5402 return false;
5403
5404 // FIXME: This should be nuw
5405 B.buildPtrAdd(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0));
5406 return true;
5407}
5408
5409/// To create a buffer resource from a 64-bit pointer, mask off the top 16
5410/// bits of the pointer and replace them with the stride argument, then
5411/// merge_values everything together. In the common case of a raw buffer (the
5412/// stride component is 0), we can just AND off those top bits.
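///
/// For illustration: with a 64-bit pointer ptr, the v4i32 descriptor built
/// below is
///   { ptr[31:0], ptr[47:32] | (stride << 16), NumRecords, Flags };
/// when the stride is a known zero the OR is skipped and only the AND of the
/// high half remains.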
5415 Register Result = MI.getOperand(0).getReg();
5416 Register Pointer = MI.getOperand(2).getReg();
5417 Register Stride = MI.getOperand(3).getReg();
5418 Register NumRecords = MI.getOperand(4).getReg();
5419 Register Flags = MI.getOperand(5).getReg();
5420
5421 LLT S32 = LLT::scalar(32);
5422
5423 B.setInsertPt(B.getMBB(), ++B.getInsertPt());
5424 auto Unmerge = B.buildUnmerge(S32, Pointer);
5425 Register LowHalf = Unmerge.getReg(0);
5426 Register HighHalf = Unmerge.getReg(1);
5427
5428 auto AndMask = B.buildConstant(S32, 0x0000ffff);
5429 auto Masked = B.buildAnd(S32, HighHalf, AndMask);
5430
5431 MachineInstrBuilder NewHighHalf = Masked;
5432 std::optional<ValueAndVReg> StrideConst =
5434 if (!StrideConst || !StrideConst->Value.isZero()) {
5435 MachineInstrBuilder ShiftedStride;
5436 if (StrideConst) {
5437 uint32_t StrideVal = StrideConst->Value.getZExtValue();
5438 uint32_t ShiftedStrideVal = StrideVal << 16;
5439 ShiftedStride = B.buildConstant(S32, ShiftedStrideVal);
5440 } else {
5441 auto ExtStride = B.buildAnyExt(S32, Stride);
5442 auto ShiftConst = B.buildConstant(S32, 16);
5443 ShiftedStride = B.buildShl(S32, ExtStride, ShiftConst);
5444 }
5445 NewHighHalf = B.buildOr(S32, Masked, ShiftedStride);
5446 }
5447 Register NewHighHalfReg = NewHighHalf.getReg(0);
5448 B.buildMergeValues(Result, {LowHalf, NewHighHalfReg, NumRecords, Flags});
5449 MI.eraseFromParent();
5450 return true;
5451}
5452
5455 MachineIRBuilder &B) const {
5456 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
5457 if (!MFI->isEntryFunction()) {
5460 }
5461
5462 Register DstReg = MI.getOperand(0).getReg();
5463 if (!getImplicitArgPtr(DstReg, MRI, B))
5464 return false;
5465
5466 MI.eraseFromParent();
5467 return true;
5468}
5469
5472 MachineIRBuilder &B) const {
5473 Function &F = B.getMF().getFunction();
5474 std::optional<uint32_t> KnownSize =
5476 if (KnownSize.has_value())
5477 B.buildConstant(DstReg, *KnownSize);
5478 return false;
5479}
5480
5483 MachineIRBuilder &B) const {
5484
5485 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
5486 if (!MFI->isEntryFunction()) {
5489 }
5490
5491 Register DstReg = MI.getOperand(0).getReg();
5492 if (!getLDSKernelId(DstReg, MRI, B))
5493 return false;
5494
5495 MI.eraseFromParent();
5496 return true;
5497}
5498
5502 unsigned AddrSpace) const {
5503 Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B);
5504 auto Unmerge = B.buildUnmerge(LLT::scalar(32), MI.getOperand(2).getReg());
5505 Register Hi32 = Unmerge.getReg(1);
5506
5507 B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg);
5508 MI.eraseFromParent();
5509 return true;
5510}
5511
5512// The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args:
5513// offset (the offset that is included in bounds checking and swizzling, to be
5514// split between the instruction's voffset and immoffset fields) and soffset
5515// (the offset that is excluded from bounds checking and swizzling, to go in
5516// the instruction's soffset field). This function takes the first kind of
5517// offset and figures out how to split it between voffset and immoffset.
5518std::pair<Register, unsigned>
5520 Register OrigOffset) const {
5521 const unsigned MaxImm = SIInstrInfo::getMaxMUBUFImmOffset(ST);
5522 Register BaseReg;
5523 unsigned ImmOffset;
5524 const LLT S32 = LLT::scalar(32);
5525 MachineRegisterInfo &MRI = *B.getMRI();
5526
5527 std::tie(BaseReg, ImmOffset) =
5529
5530 // If BaseReg is a pointer, convert it to int.
5531 if (MRI.getType(BaseReg).isPointer())
5532 BaseReg = B.buildPtrToInt(MRI.getType(OrigOffset), BaseReg).getReg(0);
5533
5534 // If the immediate value is too big for the immoffset field, put only bits
5535 // that would normally fit in the immoffset field. The remaining value that
5536 // is copied/added for the voffset field is a large power of 2, and it
5537 // stands more chance of being CSEd with the copy/add for another similar
5538 // load/store.
5539 // However, do not do that rounding down if the value that would be left for
5540 // the vgpr is negative, as it appears to be illegal to have a negative offset
5541 // in the vgpr, even if adding the immediate offset makes it positive.
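  //
  // For example, assuming MaxImm == 4095: an incoming offset of 8200 becomes
  // Overflow = 8192 (added into the voffset register below) and ImmOffset = 8,
  // so the large power-of-two add has a better chance of being CSEd with
  // neighbouring accesses.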
5542 unsigned Overflow = ImmOffset & ~MaxImm;
5543 ImmOffset -= Overflow;
5544 if ((int32_t)Overflow < 0) {
5545 Overflow += ImmOffset;
5546 ImmOffset = 0;
5547 }
5548
5549 if (Overflow != 0) {
5550 if (!BaseReg) {
5551 BaseReg = B.buildConstant(S32, Overflow).getReg(0);
5552 } else {
5553 auto OverflowVal = B.buildConstant(S32, Overflow);
5554 BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0);
5555 }
5556 }
5557
5558 if (!BaseReg)
5559 BaseReg = B.buildConstant(S32, 0).getReg(0);
5560
5561 return std::pair(BaseReg, ImmOffset);
5562}
5563
5564/// Handle register layout difference for f16 images for some subtargets.
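/// For example, on a subtarget with unpacked D16 VMEM a <4 x s16> value is
/// widened to <4 x s32> with one half per dword, while on subtargets with the
/// image store D16 bug a <3 x s16> value is padded to six halves and bitcast
/// to <3 x s32>.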
5567 Register Reg,
5568 bool ImageStore) const {
5569 const LLT S16 = LLT::scalar(16);
5570 const LLT S32 = LLT::scalar(32);
5571 LLT StoreVT = MRI.getType(Reg);
5572 assert(StoreVT.isVector() && StoreVT.getElementType() == S16);
5573
5574 if (ST.hasUnpackedD16VMem()) {
5575 auto Unmerge = B.buildUnmerge(S16, Reg);
5576
5577 SmallVector<Register, 4> WideRegs;
5578 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
5579 WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0));
5580
5581 int NumElts = StoreVT.getNumElements();
5582
5583 return B.buildBuildVector(LLT::fixed_vector(NumElts, S32), WideRegs)
5584 .getReg(0);
5585 }
5586
5587 if (ImageStore && ST.hasImageStoreD16Bug()) {
5588 if (StoreVT.getNumElements() == 2) {
5589 SmallVector<Register, 4> PackedRegs;
5590 Reg = B.buildBitcast(S32, Reg).getReg(0);
5591 PackedRegs.push_back(Reg);
5592 PackedRegs.resize(2, B.buildUndef(S32).getReg(0));
5593 return B.buildBuildVector(LLT::fixed_vector(2, S32), PackedRegs)
5594 .getReg(0);
5595 }
5596
5597 if (StoreVT.getNumElements() == 3) {
5598 SmallVector<Register, 4> PackedRegs;
5599 auto Unmerge = B.buildUnmerge(S16, Reg);
5600 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
5601 PackedRegs.push_back(Unmerge.getReg(I));
5602 PackedRegs.resize(6, B.buildUndef(S16).getReg(0));
5603 Reg = B.buildBuildVector(LLT::fixed_vector(6, S16), PackedRegs).getReg(0);
5604 return B.buildBitcast(LLT::fixed_vector(3, S32), Reg).getReg(0);
5605 }
5606
5607 if (StoreVT.getNumElements() == 4) {
5608 SmallVector<Register, 4> PackedRegs;
5609 Reg = B.buildBitcast(LLT::fixed_vector(2, S32), Reg).getReg(0);
5610 auto Unmerge = B.buildUnmerge(S32, Reg);
5611 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
5612 PackedRegs.push_back(Unmerge.getReg(I));
5613 PackedRegs.resize(4, B.buildUndef(S32).getReg(0));
5614 return B.buildBuildVector(LLT::fixed_vector(4, S32), PackedRegs)
5615 .getReg(0);
5616 }
5617
5618 llvm_unreachable("invalid data type");
5619 }
5620
5621 if (StoreVT == LLT::fixed_vector(3, S16)) {
5622 Reg = B.buildPadVectorWithUndefElements(LLT::fixed_vector(4, S16), Reg)
5623 .getReg(0);
5624 }
5625 return Reg;
5626}
5627
5629 MachineIRBuilder &B, Register VData, bool IsFormat) const {
5630 MachineRegisterInfo *MRI = B.getMRI();
5631 LLT Ty = MRI->getType(VData);
5632
5633 const LLT S16 = LLT::scalar(16);
5634
5635 // Fixup buffer resources themselves, which need to be handled as v4i32.
5636 if (hasBufferRsrcWorkaround(Ty))
5637 return castBufferRsrcToV4I32(VData, B);
5638
5639 // Fixup illegal register types for i8 stores.
5640 if (Ty == LLT::scalar(8) || Ty == S16) {
5641 Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0);
5642 return AnyExt;
5643 }
5644
5645 if (Ty.isVector()) {
5646 if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) {
5647 if (IsFormat)
5648 return handleD16VData(B, *MRI, VData);
5649 }
5650 }
5651
5652 return VData;
5653}
5654
5658 bool IsTyped,
5659 bool IsFormat) const {
5660 Register VData = MI.getOperand(1).getReg();
5661 LLT Ty = MRI.getType(VData);
5662 LLT EltTy = Ty.getScalarType();
5663 const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
5664 const LLT S32 = LLT::scalar(32);
5665
5666 VData = fixStoreSourceType(B, VData, IsFormat);
5668 Register RSrc = MI.getOperand(2).getReg();
5669
5670 MachineMemOperand *MMO = *MI.memoperands_begin();
5671 const int MemSize = MMO->getSize().getValue();
5672
5673 unsigned ImmOffset;
5674
5675 // The typed intrinsics add an immediate after the registers.
5676 const unsigned NumVIndexOps = IsTyped ? 8 : 7;
5677
5678 // The struct intrinsic variants add one additional operand over raw.
5679 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
5680 Register VIndex;
5681 int OpOffset = 0;
5682 if (HasVIndex) {
5683 VIndex = MI.getOperand(3).getReg();
5684 OpOffset = 1;
5685 } else {
5686 VIndex = B.buildConstant(S32, 0).getReg(0);
5687 }
5688
5689 Register VOffset = MI.getOperand(3 + OpOffset).getReg();
5690 Register SOffset = MI.getOperand(4 + OpOffset).getReg();
5691
5692 unsigned Format = 0;
5693 if (IsTyped) {
5694 Format = MI.getOperand(5 + OpOffset).getImm();
5695 ++OpOffset;
5696 }
5697
5698 unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
5699
5700 std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset);
5701
5702 unsigned Opc;
5703 if (IsTyped) {
5704 Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16 :
5705 AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT;
5706 } else if (IsFormat) {
5707 Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16 :
5708 AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT;
5709 } else {
5710 switch (MemSize) {
5711 case 1:
5712 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE;
5713 break;
5714 case 2:
5715 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT;
5716 break;
5717 default:
5718 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE;
5719 break;
5720 }
5721 }
5722
5723 auto MIB = B.buildInstr(Opc)
5724 .addUse(VData) // vdata
5725 .addUse(RSrc) // rsrc
5726 .addUse(VIndex) // vindex
5727 .addUse(VOffset) // voffset
5728 .addUse(SOffset) // soffset
5729 .addImm(ImmOffset); // offset(imm)
5730
5731 if (IsTyped)
5732 MIB.addImm(Format);
5733
5734 MIB.addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm)
5735 .addImm(HasVIndex ? -1 : 0) // idxen(imm)
5736 .addMemOperand(MMO);
5737
5738 MI.eraseFromParent();
5739 return true;
5740}
5741
5742static void buildBufferLoad(unsigned Opc, Register LoadDstReg, Register RSrc,
5743 Register VIndex, Register VOffset, Register SOffset,
5744 unsigned ImmOffset, unsigned Format,
5745 unsigned AuxiliaryData, MachineMemOperand *MMO,
5746 bool IsTyped, bool HasVIndex, MachineIRBuilder &B) {
5747 auto MIB = B.buildInstr(Opc)
5748 .addDef(LoadDstReg) // vdata
5749 .addUse(RSrc) // rsrc
5750 .addUse(VIndex) // vindex
5751 .addUse(VOffset) // voffset
5752 .addUse(SOffset) // soffset
5753 .addImm(ImmOffset); // offset(imm)
5754
5755 if (IsTyped)
5756 MIB.addImm(Format);
5757
5758 MIB.addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm)
5759 .addImm(HasVIndex ? -1 : 0) // idxen(imm)
5760 .addMemOperand(MMO);
5761}
5762
5766 bool IsFormat,
5767 bool IsTyped) const {
5768 // FIXME: Verifier should enforce 1 MMO for these intrinsics.
5769 MachineMemOperand *MMO = *MI.memoperands_begin();
5770 const LLT MemTy = MMO->getMemoryType();
5771 const LLT S32 = LLT::scalar(32);
5772
5773 Register Dst = MI.getOperand(0).getReg();
5774
5775 Register StatusDst;
5776 int OpOffset = 0;
5777 assert(MI.getNumExplicitDefs() == 1 || MI.getNumExplicitDefs() == 2);
5778 bool IsTFE = MI.getNumExplicitDefs() == 2;
5779 if (IsTFE) {
5780 StatusDst = MI.getOperand(1).getReg();
5781 ++OpOffset;
5782 }
5783
5784 castBufferRsrcArgToV4I32(MI, B, 2 + OpOffset);
5785 Register RSrc = MI.getOperand(2 + OpOffset).getReg();
5786
5787 // The typed intrinsics add an immediate after the registers.
5788 const unsigned NumVIndexOps = IsTyped ? 8 : 7;
5789
5790 // The struct intrinsic variants add one additional operand over raw.
5791 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps + OpOffset;
5792 Register VIndex;
5793 if (HasVIndex) {
5794 VIndex = MI.getOperand(3 + OpOffset).getReg();
5795 ++OpOffset;
5796 } else {
5797 VIndex = B.buildConstant(S32, 0).getReg(0);
5798 }
5799
5800 Register VOffset = MI.getOperand(3 + OpOffset).getReg();
5801 Register SOffset = MI.getOperand(4 + OpOffset).getReg();
5802
5803 unsigned Format = 0;
5804 if (IsTyped) {
5805 Format = MI.getOperand(5 + OpOffset).getImm();
5806 ++OpOffset;
5807 }
5808
5809 unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
5810 unsigned ImmOffset;
5811
5812 LLT Ty = MRI.getType(Dst);
5813 // Turn addrspace 8 pointer loads into 4xs32 loads here, so the rest of the
5814 // logic doesn't have to handle that case.
5815 if (hasBufferRsrcWorkaround(Ty)) {
5816 Ty = castBufferRsrcFromV4I32(MI, B, MRI, 0);
5817 Dst = MI.getOperand(0).getReg();
5818 }
5819 LLT EltTy = Ty.getScalarType();
5820 const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
5821 const bool Unpacked = ST.hasUnpackedD16VMem();
5822
5823 std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset);
5824
5825 unsigned Opc;
5826
5827 // TODO: Support TFE for typed and narrow loads.
5828 if (IsTyped) {
5829 if (IsTFE)
5830 return false;
5831 Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 :
5832 AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT;
5833 } else if (IsFormat) {
5834 if (IsD16) {
5835 if (IsTFE)
5836 return false;
5837 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16;
5838 } else {
5839 Opc = IsTFE ? AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_TFE
5840 : AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT;
5841 }
5842 } else {
5843 if (IsTFE)
5844 return false;
5845 switch (MemTy.getSizeInBits()) {
5846 case 8:
5847 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE;
5848 break;
5849 case 16:
5850 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT;
5851 break;
5852 default:
5853 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD;
5854 break;
5855 }
5856 }
5857
5858 if (IsTFE) {
5859 unsigned NumValueDWords = divideCeil(Ty.getSizeInBits(), 32);
5860 unsigned NumLoadDWords = NumValueDWords + 1;
5861 LLT LoadTy = LLT::fixed_vector(NumLoadDWords, S32);
5862 Register LoadDstReg = B.getMRI()->createGenericVirtualRegister(LoadTy);
5863 buildBufferLoad(Opc, LoadDstReg, RSrc, VIndex, VOffset, SOffset, ImmOffset,
5864 Format, AuxiliaryData, MMO, IsTyped, HasVIndex, B);
5865 if (NumValueDWords == 1) {
5866 B.buildUnmerge({Dst, StatusDst}, LoadDstReg);
5867 } else {
5868 SmallVector<Register, 5> LoadElts;
5869 for (unsigned I = 0; I != NumValueDWords; ++I)
5870 LoadElts.push_back(B.getMRI()->createGenericVirtualRegister(S32));
5871 LoadElts.push_back(StatusDst);
5872 B.buildUnmerge(LoadElts, LoadDstReg);
5873 LoadElts.truncate(NumValueDWords);
5874 B.buildMergeLikeInstr(Dst, LoadElts);
5875 }
5876 } else if ((!IsD16 && MemTy.getSizeInBits() < 32) ||
5877 (IsD16 && !Ty.isVector())) {
5878 Register LoadDstReg = B.getMRI()->createGenericVirtualRegister(S32);
5879 buildBufferLoad(Opc, LoadDstReg, RSrc, VIndex, VOffset, SOffset, ImmOffset,
5880 Format, AuxiliaryData, MMO, IsTyped, HasVIndex, B);
5881 B.setInsertPt(B.getMBB(), ++B.getInsertPt());
5882 B.buildTrunc(Dst, LoadDstReg);
5883 } else if (Unpacked && IsD16 && Ty.isVector()) {
5884 LLT UnpackedTy = Ty.changeElementSize(32);
5885 Register LoadDstReg = B.getMRI()->createGenericVirtualRegister(UnpackedTy);
5886 buildBufferLoad(Opc, LoadDstReg, RSrc, VIndex, VOffset, SOffset, ImmOffset,
5887 Format, AuxiliaryData, MMO, IsTyped, HasVIndex, B);
5888 B.setInsertPt(B.getMBB(), ++B.getInsertPt());
5889 // FIXME: G_TRUNC should work, but legalization currently fails
5890 auto Unmerge = B.buildUnmerge(S32, LoadDstReg);
5892 for (unsigned I = 0, N = Unmerge->getNumOperands() - 1; I != N; ++I)
5893 Repack.push_back(B.buildTrunc(EltTy, Unmerge.getReg(I)).getReg(0));
5894 B.buildMergeLikeInstr(Dst, Repack);
5895 } else {
5896 buildBufferLoad(Opc, Dst, RSrc, VIndex, VOffset, SOffset, ImmOffset, Format,
5897 AuxiliaryData, MMO, IsTyped, HasVIndex, B);
5898 }
5899
5900 MI.eraseFromParent();
5901 return true;
5902}
5903
5904static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) {
5905 switch (IntrID) {
5906 case Intrinsic::amdgcn_raw_buffer_atomic_swap:
5907 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_swap:
5908 case Intrinsic::amdgcn_struct_buffer_atomic_swap:
5909 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_swap:
5910 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP;
5911 case Intrinsic::amdgcn_raw_buffer_atomic_add:
5912 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_add:
5913 case Intrinsic::amdgcn_struct_buffer_atomic_add:
5914 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_add:
5915 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD;
5916 case Intrinsic::amdgcn_raw_buffer_atomic_sub:
5917 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub:
5918 case Intrinsic::amdgcn_struct_buffer_atomic_sub:
5919 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub:
5920 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB;
5921 case Intrinsic::amdgcn_raw_buffer_atomic_smin:
5922 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smin:
5923 case Intrinsic::amdgcn_struct_buffer_atomic_smin:
5924 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smin:
5925 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN;
5926 case Intrinsic::amdgcn_raw_buffer_atomic_umin:
5927 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umin:
5928 case Intrinsic::amdgcn_struct_buffer_atomic_umin:
5929 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umin:
5930 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN;
5931 case Intrinsic::amdgcn_raw_buffer_atomic_smax:
5932 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smax:
5933 case Intrinsic::amdgcn_struct_buffer_atomic_smax:
5934 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smax:
5935 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX;
5936 case Intrinsic::amdgcn_raw_buffer_atomic_umax:
5937 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umax:
5938 case Intrinsic::amdgcn_struct_buffer_atomic_umax:
5939 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umax:
5940 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX;
5941 case Intrinsic::amdgcn_raw_buffer_atomic_and:
5942 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_and:
5943 case Intrinsic::amdgcn_struct_buffer_atomic_and:
5944 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_and:
5945 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND;
5946 case Intrinsic::amdgcn_raw_buffer_atomic_or:
5947 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_or:
5948 case Intrinsic::amdgcn_struct_buffer_atomic_or:
5949 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_or:
5950 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR;
5951 case Intrinsic::amdgcn_raw_buffer_atomic_xor:
5952 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_xor:
5953 case Intrinsic::amdgcn_struct_buffer_atomic_xor:
5954 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_xor:
5955 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR;
5956 case Intrinsic::amdgcn_raw_buffer_atomic_inc:
5957 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_inc:
5958 case Intrinsic::amdgcn_struct_buffer_atomic_inc:
5959 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_inc:
5960 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC;
5961 case Intrinsic::amdgcn_raw_buffer_atomic_dec:
5962 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_dec:
5963 case Intrinsic::amdgcn_struct_buffer_atomic_dec:
5964 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_dec:
5965 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC;
5966 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
5967 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap:
5968 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
5969 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cmpswap:
5970 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP;
5971 case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
5972 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fadd:
5973 case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
5974 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fadd:
5975 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD;
5976 case Intrinsic::amdgcn_raw_buffer_atomic_fadd_v2bf16:
5977 case Intrinsic::amdgcn_struct_buffer_atomic_fadd_v2bf16:
5978 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD_BF16;
5979 case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
5980 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmin:
5981 case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
5982 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmin:
5983 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN;
5984 case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
5985 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmax:
5986 case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
5987 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmax:
5988 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX;
5989 case Intrinsic::amdgcn_raw_buffer_atomic_cond_sub_u32:
5990 case Intrinsic::amdgcn_struct_buffer_atomic_cond_sub_u32:
5991 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_COND_SUB_U32;
5992 default:
5993 llvm_unreachable("unhandled atomic opcode");
5994 }
5995}
5996
5999 Intrinsic::ID IID) const {
6000 const bool IsCmpSwap =
6001 IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap ||
6002 IID == Intrinsic::amdgcn_struct_buffer_atomic_cmpswap ||
6003 IID == Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap ||
6004 IID == Intrinsic::amdgcn_struct_ptr_buffer_atomic_cmpswap;
6005
6006 Register Dst = MI.getOperand(0).getReg();
6007 // Since we don't have 128-bit atomics, we don't need to handle the case of
6008 // p8 arguments to the atomic itself.
6009 Register VData = MI.getOperand(2).getReg();
6010
6011 Register CmpVal;
6012 int OpOffset = 0;
6013
6014 if (IsCmpSwap) {
6015 CmpVal = MI.getOperand(3).getReg();
6016 ++OpOffset;
6017 }
6018
6019 castBufferRsrcArgToV4I32(MI, B, 3 + OpOffset);
6020 Register RSrc = MI.getOperand(3 + OpOffset).getReg();
6021 const unsigned NumVIndexOps = IsCmpSwap ? 9 : 8;
6022
6023 // The struct intrinsic variants add one additional operand over raw.
6024 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
6025 Register VIndex;
6026 if (HasVIndex) {
6027 VIndex = MI.getOperand(4 + OpOffset).getReg();
6028 ++OpOffset;
6029 } else {
6030 VIndex = B.buildConstant(LLT::scalar(32), 0).getReg(0);
6031 }
6032
6033 Register VOffset = MI.getOperand(4 + OpOffset).getReg();
6034 Register SOffset = MI.getOperand(5 + OpOffset).getReg();
6035 unsigned AuxiliaryData = MI.getOperand(6 + OpOffset).getImm();
6036
6037 MachineMemOperand *MMO = *MI.memoperands_begin();
6038
6039 unsigned ImmOffset;
6040 std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset);
6041
6042 auto MIB = B.buildInstr(getBufferAtomicPseudo(IID))
6043 .addDef(Dst)
6044 .addUse(VData); // vdata
6045
6046 if (IsCmpSwap)
6047 MIB.addReg(CmpVal);
6048
6049 MIB.addUse(RSrc) // rsrc
6050 .addUse(VIndex) // vindex
6051 .addUse(VOffset) // voffset
6052 .addUse(SOffset) // soffset
6053 .addImm(ImmOffset) // offset(imm)
6054 .addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm)
6055 .addImm(HasVIndex ? -1 : 0) // idxen(imm)
6056 .addMemOperand(MMO);
6057
6058 MI.eraseFromParent();
6059 return true;
6060}
6061
6062/// Turn a set of s16 typed registers in \p AddrRegs into a dword sized
6063/// vector with s16 typed elements.
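/// For example, three s16 coordinates (x, y, z) are packed as
///   { <2 x s16>(x, y), <2 x s16>(z, undef) },
/// while operands that must remain 32-bit are bitcast to <2 x s16>, so every
/// resulting vaddr element is dword sized.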
6065 SmallVectorImpl<Register> &PackedAddrs,
6066 unsigned ArgOffset,
6068 bool IsA16, bool IsG16) {
6069 const LLT S16 = LLT::scalar(16);
6070 const LLT V2S16 = LLT::fixed_vector(2, 16);
6071 auto EndIdx = Intr->VAddrEnd;
6072
6073 for (unsigned I = Intr->VAddrStart; I < EndIdx; I++) {
6074 MachineOperand &SrcOp = MI.getOperand(ArgOffset + I);
6075 if (!SrcOp.isReg())
6076 continue; // _L to _LZ may have eliminated this.
6077
6078 Register AddrReg = SrcOp.getReg();
6079
6080 if ((I < Intr->GradientStart) ||
6081 (I >= Intr->GradientStart && I < Intr->CoordStart && !IsG16) ||
6082 (I >= Intr->CoordStart && !IsA16)) {
6083 if ((I < Intr->GradientStart) && IsA16 &&
6084 (B.getMRI()->getType(AddrReg) == S16)) {
6085 assert(I == Intr->BiasIndex && "Got unexpected 16-bit extra argument");
6086 // Special handling of bias when A16 is on. Bias is of type half but
6087 // occupies a full 32 bits.
6088 PackedAddrs.push_back(
6089 B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)})
6090 .getReg(0));
6091 } else {
6092 assert((!IsA16 || Intr->NumBiasArgs == 0 || I != Intr->BiasIndex) &&
6093 "Bias needs to be converted to 16 bit in A16 mode");
6094 // Handle any gradient or coordinate operands that should not be packed
6095 AddrReg = B.buildBitcast(V2S16, AddrReg).getReg(0);
6096 PackedAddrs.push_back(AddrReg);
6097 }
6098 } else {
6099 // Dz/dh, dz/dv and the last odd coord are packed with undef. Also, in 1D,
6100 // derivatives dx/dh and dx/dv are packed with undef.
6101 if (((I + 1) >= EndIdx) ||
6102 ((Intr->NumGradients / 2) % 2 == 1 &&
6103 (I == static_cast<unsigned>(Intr->GradientStart +
6104 (Intr->NumGradients / 2) - 1) ||
6105 I == static_cast<unsigned>(Intr->GradientStart +
6106 Intr->NumGradients - 1))) ||
6107 // Check for _L to _LZ optimization
6108 !MI.getOperand(ArgOffset + I + 1).isReg()) {
6109 PackedAddrs.push_back(
6110 B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)})
6111 .getReg(0));
6112 } else {
6113 PackedAddrs.push_back(
6114 B.buildBuildVector(
6115 V2S16, {AddrReg, MI.getOperand(ArgOffset + I + 1).getReg()})
6116 .getReg(0));
6117 ++I;
6118 }
6119 }
6120 }
6121}
6122
6123/// Convert from separate vaddr components to a single vector address register,
6124/// and replace the remaining operands with $noreg.
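/// For example, vaddr operands (s32 x, s32 y, s32 z) are replaced by a single
/// <3 x s32> build_vector in the first address slot, with the remaining two
/// slots set to $noreg.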
6126 int DimIdx, int NumVAddrs) {
6127 const LLT S32 = LLT::scalar(32);
6128 (void)S32;
6129 SmallVector<Register, 8> AddrRegs;
6130 for (int I = 0; I != NumVAddrs; ++I) {
6131 MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
6132 if (SrcOp.isReg()) {
6133 AddrRegs.push_back(SrcOp.getReg());
6134 assert(B.getMRI()->getType(SrcOp.getReg()) == S32);
6135 }
6136 }
6137
6138 int NumAddrRegs = AddrRegs.size();
6139 if (NumAddrRegs != 1) {
6140 auto VAddr =
6141 B.buildBuildVector(LLT::fixed_vector(NumAddrRegs, 32), AddrRegs);
6142 MI.getOperand(DimIdx).setReg(VAddr.getReg(0));
6143 }
6144
6145 for (int I = 1; I != NumVAddrs; ++I) {
6146 MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
6147 if (SrcOp.isReg())
6148 MI.getOperand(DimIdx + I).setReg(AMDGPU::NoRegister);
6149 }
6150}
6151
6152/// Rewrite image intrinsics to use register layouts expected by the subtarget.
6153///
6154/// Depending on the subtarget, load/store with 16-bit element data need to be
6155/// rewritten to use the low half of 32-bit registers, or directly use a packed
6156/// layout. 16-bit addresses should also sometimes be packed into 32-bit
6157/// registers.
6158///
6159/// We don't want to directly select image instructions just yet, but also want
6160/// to expose all register repacking to the legalizer/combiners. We also don't
6161/// want a selected instruction entering RegBankSelect. In order to avoid
6162/// defining a multitude of intermediate image instructions, directly hack on
6163/// the intrinsic's arguments. In cases like a16 addresses, this requires
6164/// padding now unnecessary arguments with $noreg.
6167 const AMDGPU::ImageDimIntrinsicInfo *Intr) const {
6168
6169 const MachineFunction &MF = *MI.getMF();
6170 const unsigned NumDefs = MI.getNumExplicitDefs();
6171 const unsigned ArgOffset = NumDefs + 1;
6172 bool IsTFE = NumDefs == 2;
6173 // We are only processing the operands of d16 image operations on subtargets
6174 // that use the unpacked register layout, or need to repack the TFE result.
6175
6176 // TODO: Do we need to guard against already legalized intrinsics?
6177 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
6179
6180 MachineRegisterInfo *MRI = B.getMRI();
6181 const LLT S32 = LLT::scalar(32);
6182 const LLT S16 = LLT::scalar(16);
6183 const LLT V2S16 = LLT::fixed_vector(2, 16);
6184
6185 unsigned DMask = 0;
6186 Register VData = MI.getOperand(NumDefs == 0 ? 1 : 0).getReg();
6187 LLT Ty = MRI->getType(VData);
6188
6189 const bool IsAtomicPacked16Bit =
6190 (BaseOpcode->BaseOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_F16 ||
6191 BaseOpcode->BaseOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_BF16);
6192
6193 // Check for 16 bit addresses and pack if true.
6194 LLT GradTy =
6195 MRI->getType(MI.getOperand(ArgOffset + Intr->GradientStart).getReg());
6196 LLT AddrTy =
6197 MRI->getType(MI.getOperand(ArgOffset + Intr->CoordStart).getReg());
6198 const bool IsG16 =
6199 ST.hasG16() ? (BaseOpcode->Gradients && GradTy == S16) : GradTy == S16;
6200 const bool IsA16 = AddrTy == S16;
6201 const bool IsD16 = !IsAtomicPacked16Bit && Ty.getScalarType() == S16;
6202
6203 int DMaskLanes = 0;
6204 if (!BaseOpcode->Atomic) {
6205 DMask = MI.getOperand(ArgOffset + Intr->DMaskIndex).getImm();
6206 if (BaseOpcode->Gather4) {
6207 DMaskLanes = 4;
6208 } else if (DMask != 0) {
6209 DMaskLanes = llvm::popcount(DMask);
6210 } else if (!IsTFE && !BaseOpcode->Store) {
6211 // If dmask is 0, this is a no-op load. This can be eliminated.
6212 B.buildUndef(MI.getOperand(0));
6213 MI.eraseFromParent();
6214 return true;
6215 }
6216 }
6217
6218 Observer.changingInstr(MI);
6219 auto ChangedInstr = make_scope_exit([&] { Observer.changedInstr(MI); });
6220
6221 const unsigned StoreOpcode = IsD16 ? AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE_D16
6222 : AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE;
6223 const unsigned LoadOpcode = IsD16 ? AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_D16
6224 : AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD;
6225 unsigned NewOpcode = NumDefs == 0 ? StoreOpcode : LoadOpcode;
6226
6227 // Track that we legalized this
6228 MI.setDesc(B.getTII().get(NewOpcode));
6229
6230 // Expecting to get an error flag since TFC is on and dmask is 0. Force
6231 // dmask to be at least 1, otherwise the instruction will fail.
6232 if (IsTFE && DMask == 0) {
6233 DMask = 0x1;
6234 DMaskLanes = 1;
6235 MI.getOperand(ArgOffset + Intr->DMaskIndex).setImm(DMask);
6236 }
6237
6238 if (BaseOpcode->Atomic) {
6239 Register VData0 = MI.getOperand(2).getReg();
6240 LLT Ty = MRI->getType(VData0);
6241
6242 // TODO: Allow atomic swap and bit ops for v2s16/v4s16
6243 if (Ty.isVector() && !IsAtomicPacked16Bit)
6244 return false;
6245
6246 if (BaseOpcode->AtomicX2) {
6247 Register VData1 = MI.getOperand(3).getReg();
6248 // The two values are packed in one register.
6249 LLT PackedTy = LLT::fixed_vector(2, Ty);
6250 auto Concat = B.buildBuildVector(PackedTy, {VData0, VData1});
6251 MI.getOperand(2).setReg(Concat.getReg(0));
6252 MI.getOperand(3).setReg(AMDGPU::NoRegister);
6253 }
6254 }
6255
6256 unsigned CorrectedNumVAddrs = Intr->NumVAddrs;
6257
6258 // Rewrite the addressing register layout before doing anything else.
6259 if (BaseOpcode->Gradients && !ST.hasG16() && (IsA16 != IsG16)) {
6260 // 16 bit gradients are supported, but are tied to the A16 control
6261 // so both gradients and addresses must be 16 bit
6262 return false;
6263 }
6264
6265 if (IsA16 && !ST.hasA16()) {
6266 // A16 not supported
6267 return false;
6268 }
6269
6270 const unsigned NSAMaxSize = ST.getNSAMaxSize(BaseOpcode->Sampler);
6271 const unsigned HasPartialNSA = ST.hasPartialNSAEncoding();
6272
6273 if (IsA16 || IsG16) {
6274 // Even if NumVAddrs == 1 we should pack it into a 32-bit value, because the
6275 // instructions expect VGPR_32
6276 SmallVector<Register, 4> PackedRegs;
6277
6278 packImage16bitOpsToDwords(B, MI, PackedRegs, ArgOffset, Intr, IsA16, IsG16);
6279
6280 // See also below in the non-a16 branch
6281 const bool UseNSA = ST.hasNSAEncoding() &&
6282 PackedRegs.size() >= ST.getNSAThreshold(MF) &&
6283 (PackedRegs.size() <= NSAMaxSize || HasPartialNSA);
6284 const bool UsePartialNSA =
6285 UseNSA && HasPartialNSA && PackedRegs.size() > NSAMaxSize;
6286
6287 if (UsePartialNSA) {
6288 // Pack registers that would go over NSAMaxSize into last VAddr register
6289 LLT PackedAddrTy =
6290 LLT::fixed_vector(2 * (PackedRegs.size() - NSAMaxSize + 1), 16);
6291 auto Concat = B.buildConcatVectors(
6292 PackedAddrTy, ArrayRef(PackedRegs).slice(NSAMaxSize - 1));
6293 PackedRegs[NSAMaxSize - 1] = Concat.getReg(0);
6294 PackedRegs.resize(NSAMaxSize);
6295 } else if (!UseNSA && PackedRegs.size() > 1) {
6296 LLT PackedAddrTy = LLT::fixed_vector(2 * PackedRegs.size(), 16);
6297 auto Concat = B.buildConcatVectors(PackedAddrTy, PackedRegs);
6298 PackedRegs[0] = Concat.getReg(0);
6299 PackedRegs.resize(1);
6300 }
6301
6302 const unsigned NumPacked = PackedRegs.size();
6303 for (unsigned I = Intr->VAddrStart; I < Intr->VAddrEnd; I++) {
6304 MachineOperand &SrcOp = MI.getOperand(ArgOffset + I);
6305 if (!SrcOp.isReg()) {
6306 assert(SrcOp.isImm() && SrcOp.getImm() == 0);
6307 continue;
6308 }
6309
6310 assert(SrcOp.getReg() != AMDGPU::NoRegister);
6311
6312 if (I - Intr->VAddrStart < NumPacked)
6313 SrcOp.setReg(PackedRegs[I - Intr->VAddrStart]);
6314 else
6315 SrcOp.setReg(AMDGPU::NoRegister);
6316 }
6317 } else {
6318 // If the register allocator cannot place the address registers contiguously
6319 // without introducing moves, then using the non-sequential address encoding
6320 // is always preferable, since it saves VALU instructions and is usually a
6321 // wash in terms of code size or even better.
6322 //
6323 // However, we currently have no way of hinting to the register allocator
6324 // that MIMG addresses should be placed contiguously when it is possible to
6325 // do so, so force non-NSA for the common 2-address case as a heuristic.
6326 //
6327 // SIShrinkInstructions will convert NSA encodings to non-NSA after register
6328 // allocation when possible.
6329 //
6330 // Partial NSA is allowed on GFX11+ where the final register is a contiguous
6331 // set of the remaining addresses.
6332 const bool UseNSA = ST.hasNSAEncoding() &&
6333 CorrectedNumVAddrs >= ST.getNSAThreshold(MF) &&
6334 (CorrectedNumVAddrs <= NSAMaxSize || HasPartialNSA);
6335 const bool UsePartialNSA =
6336 UseNSA && HasPartialNSA && CorrectedNumVAddrs > NSAMaxSize;
6337
6338 if (UsePartialNSA) {
6340 ArgOffset + Intr->VAddrStart + NSAMaxSize - 1,
6341 Intr->NumVAddrs - NSAMaxSize + 1);
6342 } else if (!UseNSA && Intr->NumVAddrs > 1) {
6343 convertImageAddrToPacked(B, MI, ArgOffset + Intr->VAddrStart,
6344 Intr->NumVAddrs);
6345 }
6346 }
6347
6348 int Flags = 0;
6349 if (IsA16)
6350 Flags |= 1;
6351 if (IsG16)
6352 Flags |= 2;
6353 MI.addOperand(MachineOperand::CreateImm(Flags));
6354
6355 if (BaseOpcode->Store) { // No TFE for stores?
6356 // TODO: Handle dmask trim
6357 if (!Ty.isVector() || !IsD16)
6358 return true;
6359
6360 Register RepackedReg = handleD16VData(B, *MRI, VData, true);
6361 if (RepackedReg != VData) {
6362 MI.getOperand(1).setReg(RepackedReg);
6363 }
6364
6365 return true;
6366 }
6367
6368 Register DstReg = MI.getOperand(0).getReg();
6369 const LLT EltTy = Ty.getScalarType();
6370 const int NumElts = Ty.isVector() ? Ty.getNumElements() : 1;
6371
6372 // Confirm that the return type is large enough for the dmask specified
6373 if (NumElts < DMaskLanes)
6374 return false;
6375
6376 if (NumElts > 4 || DMaskLanes > 4)
6377 return false;
6378
6379 // Image atomic instructions use DMask to specify how many bits the
6380 // input/output data will have: 32 bits (s32, v2s16) or 64 bits (s64, v4s16).
6381 // DMaskLanes for image atomics has a default value of '0'.
6382 // We must be sure that atomic variants (especially packed) will not be
6383 // truncated from v2s16 or v4s16 to s16 type.
6384 //
6385 // ChangeElementCount will be needed for image load where Ty is always scalar.
6386 const unsigned AdjustedNumElts = DMaskLanes == 0 ? 1 : DMaskLanes;
6387 const LLT AdjustedTy =
6388 DMaskLanes == 0
6389 ? Ty
6390 : Ty.changeElementCount(ElementCount::getFixed(AdjustedNumElts));
6391
6392 // The raw dword aligned data component of the load. The only legal cases
6393 // where this matters should be when using the packed D16 format, for
6394 // s16 -> <2 x s16>, and <3 x s16> -> <4 x s16>.
6395 LLT RoundedTy;
6396
6397 // S32 vector to cover all data, plus TFE result element.
6398 LLT TFETy;
6399
6400 // Register type to use for each loaded component. Will be S32 or V2S16.
6401 LLT RegTy;
6402
6403 if (IsD16 && ST.hasUnpackedD16VMem()) {
6404 RoundedTy =
6405 LLT::scalarOrVector(ElementCount::getFixed(AdjustedNumElts), 32);
6406 TFETy = LLT::fixed_vector(AdjustedNumElts + 1, 32);
6407 RegTy = S32;
6408 } else {
6409 unsigned EltSize = EltTy.getSizeInBits();
6410 unsigned RoundedElts = (AdjustedTy.getSizeInBits() + 31) / 32;
6411 unsigned RoundedSize = 32 * RoundedElts;
6412 RoundedTy = LLT::scalarOrVector(
6413 ElementCount::getFixed(RoundedSize / EltSize), EltSize);
6414 TFETy = LLT::fixed_vector(RoundedSize / 32 + 1, S32);
6415 RegTy = !IsTFE && EltSize == 16 ? V2S16 : S32;
6416 }
6417
6418 // The return type does not need adjustment.
6419 // TODO: Should we change s16 case to s32 or <2 x s16>?
6420 if (!IsTFE && (RoundedTy == Ty || !Ty.isVector()))
6421 return true;
6422
6423 Register Dst1Reg;
6424
6425 // Insert after the instruction.
6426 B.setInsertPt(*MI.getParent(), ++MI.getIterator());
6427
6428 // TODO: For TFE with d16, if we used a TFE type that was a multiple of <2 x
6429 // s16> instead of s32, we would only need 1 bitcast instead of multiple.
6430 const LLT LoadResultTy = IsTFE ? TFETy : RoundedTy;
6431 const int ResultNumRegs = LoadResultTy.getSizeInBits() / 32;
6432
6433 Register NewResultReg = MRI->createGenericVirtualRegister(LoadResultTy);
6434
6435 MI.getOperand(0).setReg(NewResultReg);
6436
6437 // In the IR, TFE is supposed to be used with a 2 element struct return
6438 // type. The instruction really returns these two values in one contiguous
6439 // register, with one additional dword beyond the loaded data. Rewrite the
6440 // return type to use a single register result.
6441
6442 if (IsTFE) {
6443 Dst1Reg = MI.getOperand(1).getReg();
6444 if (MRI->getType(Dst1Reg) != S32)
6445 return false;
6446
6447 // TODO: Make sure the TFE operand bit is set.
6448 MI.removeOperand(1);
6449
6450 // Handle the easy case that requires no repack instructions.
6451 if (Ty == S32) {
6452 B.buildUnmerge({DstReg, Dst1Reg}, NewResultReg);
6453 return true;
6454 }
6455 }
6456
6457 // Now figure out how to copy the new result register back into the old
6458 // result.
6459 SmallVector<Register, 5> ResultRegs(ResultNumRegs, Dst1Reg);
6460
6461 const int NumDataRegs = IsTFE ? ResultNumRegs - 1 : ResultNumRegs;
6462
6463 if (ResultNumRegs == 1) {
6464 assert(!IsTFE);
6465 ResultRegs[0] = NewResultReg;
6466 } else {
6467 // We have to repack into a new vector of some kind.
6468 for (int I = 0; I != NumDataRegs; ++I)
6469 ResultRegs[I] = MRI->createGenericVirtualRegister(RegTy);
6470 B.buildUnmerge(ResultRegs, NewResultReg);
6471
6472 // Drop the final TFE element to get the data part. The TFE result is
6473 // directly written to the right place already.
6474 if (IsTFE)
6475 ResultRegs.resize(NumDataRegs);
6476 }
6477
6478 // For an s16 scalar result, we form an s32 result with a truncate regardless
6479 // of packed vs. unpacked.
6480 if (IsD16 && !Ty.isVector()) {
6481 B.buildTrunc(DstReg, ResultRegs[0]);
6482 return true;
6483 }
6484
6485 // Avoid a build/concat_vector of 1 entry.
6486 if (Ty == V2S16 && NumDataRegs == 1 && !ST.hasUnpackedD16VMem()) {
6487 B.buildBitcast(DstReg, ResultRegs[0]);
6488 return true;
6489 }
6490
6491 assert(Ty.isVector());
6492
6493 if (IsD16) {
6494 // For packed D16 results with TFE enabled, all the data components are
6495 // S32. Cast back to the expected type.
6496 //
6497 // TODO: We don't really need to load s32 elements. We would only need one
6498 // cast for the TFE result if a multiple of v2s16 was used.
6499 if (RegTy != V2S16 && !ST.hasUnpackedD16VMem()) {
6500 for (Register &Reg : ResultRegs)
6501 Reg = B.buildBitcast(V2S16, Reg).getReg(0);
6502 } else if (ST.hasUnpackedD16VMem()) {
6503 for (Register &Reg : ResultRegs)
6504 Reg = B.buildTrunc(S16, Reg).getReg(0);
6505 }
6506 }
6507
6508 auto padWithUndef = [&](LLT Ty, int NumElts) {
6509 if (NumElts == 0)
6510 return;
6511 Register Undef = B.buildUndef(Ty).getReg(0);
6512 for (int I = 0; I != NumElts; ++I)
6513 ResultRegs.push_back(Undef);
6514 };
6515
6516 // Pad out any elements eliminated due to the dmask.
6517 LLT ResTy = MRI->getType(ResultRegs[0]);
6518 if (!ResTy.isVector()) {
6519 padWithUndef(ResTy, NumElts - ResultRegs.size());
6520 B.buildBuildVector(DstReg, ResultRegs);
6521 return true;
6522 }
6523
6524 assert(!ST.hasUnpackedD16VMem() && ResTy == V2S16);
6525 const int RegsToCover = (Ty.getSizeInBits() + 31) / 32;
6526
6527 // Deal with the one annoying legal case.
6528 const LLT V3S16 = LLT::fixed_vector(3, 16);
6529 if (Ty == V3S16) {
6530 if (IsTFE) {
6531 if (ResultRegs.size() == 1) {
6532 NewResultReg = ResultRegs[0];
6533 } else if (ResultRegs.size() == 2) {
6534 LLT V4S16 = LLT::fixed_vector(4, 16);
6535 NewResultReg = B.buildConcatVectors(V4S16, ResultRegs).getReg(0);
6536 } else {
6537 return false;
6538 }
6539 }
6540
6541 if (MRI->getType(DstReg).getNumElements() <
6542 MRI->getType(NewResultReg).getNumElements()) {
6543 B.buildDeleteTrailingVectorElements(DstReg, NewResultReg);
6544 } else {
6545 B.buildPadVectorWithUndefElements(DstReg, NewResultReg);
6546 }
6547 return true;
6548 }
6549
6550 padWithUndef(ResTy, RegsToCover - ResultRegs.size());
6551 B.buildConcatVectors(DstReg, ResultRegs);
6552 return true;
6553}
6554
6556 MachineInstr &MI) const {
6557 MachineIRBuilder &B = Helper.MIRBuilder;
6558 GISelChangeObserver &Observer = Helper.Observer;
6559
6560 Register OrigDst = MI.getOperand(0).getReg();
6561 Register Dst;
6562 LLT Ty = B.getMRI()->getType(OrigDst);
6563 unsigned Size = Ty.getSizeInBits();
6564 MachineFunction &MF = B.getMF();
6565 unsigned Opc = 0;
6566 if (Size < 32 && ST.hasScalarSubwordLoads()) {
6567 assert(Size == 8 || Size == 16);
6568 Opc = Size == 8 ? AMDGPU::G_AMDGPU_S_BUFFER_LOAD_UBYTE
6569 : AMDGPU::G_AMDGPU_S_BUFFER_LOAD_USHORT;
6570 // The 8-bit and 16-bit scalar buffer load instructions have a 32-bit
6571 // destination register.
6572 Dst = B.getMRI()->createGenericVirtualRegister(LLT::scalar(32));
6573 } else {
6574 Opc = AMDGPU::G_AMDGPU_S_BUFFER_LOAD;
6575 Dst = OrigDst;
6576 }
6577
6578 Observer.changingInstr(MI);
6579
6580 // Handle needing to s.buffer.load() a p8 value.
6581 if (hasBufferRsrcWorkaround(Ty)) {
6582 Ty = castBufferRsrcFromV4I32(MI, B, *B.getMRI(), 0);
6583 B.setInsertPt(B.getMBB(), MI);
6584 }
6586 Ty = getBitcastRegisterType(Ty);
6587 Helper.bitcastDst(MI, Ty, 0);
6588 B.setInsertPt(B.getMBB(), MI);
6589 }
6590
6591 // FIXME: We don't really need this intermediate instruction. The intrinsic
6592 // should be fixed to have a memory operand. Since it's readnone, we're not
6593 // allowed to add one.
6594 MI.setDesc(B.getTII().get(Opc));
6595 MI.removeOperand(1); // Remove intrinsic ID
6596
6597 // FIXME: When intrinsic definition is fixed, this should have an MMO already.
6598 // TODO: Should this use datalayout alignment?
6599 const unsigned MemSize = (Size + 7) / 8;
6600 const Align MemAlign(std::min(MemSize, 4u));
6605 MemSize, MemAlign);
6606 MI.addMemOperand(MF, MMO);
6607 if (Dst != OrigDst) {
6608 MI.getOperand(0).setReg(Dst);
6609 B.setInsertPt(B.getMBB(), ++B.getInsertPt());
6610 B.buildTrunc(OrigDst, Dst);
6611 }
6612
6613 // If we don't have 96-bit result scalar loads, widening to 128-bit should
6614 // always be legal. We may need to restore this to a 96-bit result if it turns
6615 // out this needs to be converted to a vector load during RegBankSelect.
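  //
  // For example, on a subtarget without scalar dwordx3 loads, a <3 x s32>
  // result is widened to <4 x s32> here (and a 96-bit scalar result to s128).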
6616 if (!isPowerOf2_32(Size) && (Size != 96 || !ST.hasScalarDwordx3Loads())) {
6617 if (Ty.isVector())
6619 else
6620 Helper.widenScalarDst(MI, getPow2ScalarType(Ty), 0);
6621 }
6622
6623 Observer.changedInstr(MI);
6624 return true;
6625}
6626
6627// TODO: Move to selection
6630 MachineIRBuilder &B) const {
6631 if (!ST.isTrapHandlerEnabled() ||
6633 return legalizeTrapEndpgm(MI, MRI, B);
6634
6635 return ST.supportsGetDoorbellID() ?
6637}
6638
6641 const DebugLoc &DL = MI.getDebugLoc();
6642 MachineBasicBlock &BB = B.getMBB();
6643 MachineFunction *MF = BB.getParent();
6644
6645 if (BB.succ_empty() && std::next(MI.getIterator()) == BB.end()) {
6646 BuildMI(BB, BB.end(), DL, B.getTII().get(AMDGPU::S_ENDPGM))
6647 .addImm(0);
6648 MI.eraseFromParent();
6649 return true;
6650 }
6651
6652 // We need a block split to make the real endpgm a terminator. We also don't
6653 // want to break phis in successor blocks, so we can't just delete to the
6654 // end of the block.
6655 BB.splitAt(MI, false /*UpdateLiveIns*/);
6657 MF->push_back(TrapBB);
6658 BuildMI(*TrapBB, TrapBB->end(), DL, B.getTII().get(AMDGPU::S_ENDPGM))
6659 .addImm(0);
6660 BuildMI(BB, &MI, DL, B.getTII().get(AMDGPU::S_CBRANCH_EXECNZ))
6661 .addMBB(TrapBB);
6662
6663 BB.addSuccessor(TrapBB);
6664 MI.eraseFromParent();
6665 return true;
6666}
6667
6670 MachineFunction &MF = B.getMF();
6671 const LLT S64 = LLT::scalar(64);
6672
6673 Register SGPR01(AMDGPU::SGPR0_SGPR1);
6674 // For code object version 5, queue_ptr is passed through implicit kernarg.
6680 ST.getTargetLowering()->getImplicitParameterOffset(B.getMF(), Param);
6681
6682 Register KernargPtrReg = MRI.createGenericVirtualRegister(
6684
6685 if (!loadInputValue(KernargPtrReg, B,
6687 return false;
6688
6689 // TODO: can we be smarter about machine pointer info?
6692 PtrInfo,
6696
6697 // Pointer address
6698 Register LoadAddr = MRI.createGenericVirtualRegister(
6700 B.buildPtrAdd(LoadAddr, KernargPtrReg,
6701 B.buildConstant(LLT::scalar(64), Offset).getReg(0));
6702 // Load address
6703 Register Temp = B.buildLoad(S64, LoadAddr, *MMO).getReg(0);
6704 B.buildCopy(SGPR01, Temp);
6705 B.buildInstr(AMDGPU::S_TRAP)
6706 .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap))
6707 .addReg(SGPR01, RegState::Implicit);
6708 MI.eraseFromParent();
6709 return true;
6710 }
6711
6712 // Pass queue pointer to trap handler as input, and insert trap instruction
6713 // Reference: https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi
6714 Register LiveIn =
6715 MRI.createGenericVirtualRegister(LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
6717 return false;
6718
6719 B.buildCopy(SGPR01, LiveIn);
6720 B.buildInstr(AMDGPU::S_TRAP)
6721 .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap))
6722 .addReg(SGPR01, RegState::Implicit);
6723
6724 MI.eraseFromParent();
6725 return true;
6726}
6727
6730 MachineIRBuilder &B) const {
6731 // We need to simulate the 's_trap 2' instruction on targets that run in
6732 // PRIV=1 (where it is treated as a nop).
6733 if (ST.hasPrivEnabledTrap2NopBug()) {
6734 ST.getInstrInfo()->insertSimulatedTrap(MRI, B.getMBB(), MI,
6735 MI.getDebugLoc());
6736 MI.eraseFromParent();
6737 return true;
6738 }
6739
6740 B.buildInstr(AMDGPU::S_TRAP)
6741 .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap));
6742 MI.eraseFromParent();
6743 return true;
6744}
6745
6748 MachineIRBuilder &B) const {
6749 // If this is a non-HSA path or the trap handler is disabled, report a
6750 // warning accordingly.
6751 if (!ST.isTrapHandlerEnabled() ||
6753 DiagnosticInfoUnsupported NoTrap(B.getMF().getFunction(),
6754 "debugtrap handler not supported",
6755 MI.getDebugLoc(), DS_Warning);
6756 LLVMContext &Ctx = B.getMF().getFunction().getContext();
6757 Ctx.diagnose(NoTrap);
6758 } else {
6759 // Insert debug-trap instruction
6760 B.buildInstr(AMDGPU::S_TRAP)
6761 .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSADebugTrap));
6762 }
6763
6764 MI.eraseFromParent();
6765 return true;
6766}
6767
6769 MachineIRBuilder &B) const {
6770 MachineRegisterInfo &MRI = *B.getMRI();
6771 const LLT S16 = LLT::scalar(16);
6772 const LLT S32 = LLT::scalar(32);
6773 const LLT V2S16 = LLT::fixed_vector(2, 16);
6774 const LLT V3S32 = LLT::fixed_vector(3, 32);
6775
6776 Register DstReg = MI.getOperand(0).getReg();
6777 Register NodePtr = MI.getOperand(2).getReg();
6778 Register RayExtent = MI.getOperand(3).getReg();
6779 Register RayOrigin = MI.getOperand(4).getReg();
6780 Register RayDir = MI.getOperand(5).getReg();
6781 Register RayInvDir = MI.getOperand(6).getReg();
6782 Register TDescr = MI.getOperand(7).getReg();
6783
6784 if (!ST.hasGFX10_AEncoding()) {
6785 DiagnosticInfoUnsupported BadIntrin(B.getMF().getFunction(),
6786 "intrinsic not supported on subtarget",
6787 MI.getDebugLoc());
6788 B.getMF().getFunction().getContext().diagnose(BadIntrin);
6789 return false;
6790 }
6791
6792 const bool IsGFX11 = AMDGPU::isGFX11(ST);
6793 const bool IsGFX11Plus = AMDGPU::isGFX11Plus(ST);
6794 const bool IsGFX12Plus = AMDGPU::isGFX12Plus(ST);
6795 const bool IsA16 = MRI.getType(RayDir).getElementType().getSizeInBits() == 16;
6796 const bool Is64 = MRI.getType(NodePtr).getSizeInBits() == 64;
6797 const unsigned NumVDataDwords = 4;
6798 const unsigned NumVAddrDwords = IsA16 ? (Is64 ? 9 : 8) : (Is64 ? 12 : 11);
6799 const unsigned NumVAddrs = IsGFX11Plus ? (IsA16 ? 4 : 5) : NumVAddrDwords;
6800 const bool UseNSA =
6801 IsGFX12Plus || (ST.hasNSAEncoding() && NumVAddrs <= ST.getNSAMaxSize());
6802
6803 const unsigned BaseOpcodes[2][2] = {
6804 {AMDGPU::IMAGE_BVH_INTERSECT_RAY, AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16},
6805 {AMDGPU::IMAGE_BVH64_INTERSECT_RAY,
6806 AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16}};
6807 int Opcode;
6808 if (UseNSA) {
6809 Opcode = AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16],
6810 IsGFX12Plus ? AMDGPU::MIMGEncGfx12
6811 : IsGFX11 ? AMDGPU::MIMGEncGfx11NSA
6812 : AMDGPU::MIMGEncGfx10NSA,
6813 NumVDataDwords, NumVAddrDwords);
6814 } else {
6815 assert(!IsGFX12Plus);
6816 Opcode = AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16],
6817 IsGFX11 ? AMDGPU::MIMGEncGfx11Default
6818 : AMDGPU::MIMGEncGfx10Default,
6819 NumVDataDwords, NumVAddrDwords);
6820 }
6821 assert(Opcode != -1);
6822
6824 if (UseNSA && IsGFX11Plus) {
6825 auto packLanes = [&Ops, &S32, &V3S32, &B](Register Src) {
6826 auto Unmerge = B.buildUnmerge({S32, S32, S32}, Src);
6827 auto Merged = B.buildMergeLikeInstr(
6828 V3S32, {Unmerge.getReg(0), Unmerge.getReg(1), Unmerge.getReg(2)});
6829 Ops.push_back(Merged.getReg(0));
6830 };
6831
6832 Ops.push_back(NodePtr);
6833 Ops.push_back(RayExtent);
6834 packLanes(RayOrigin);
6835
6836 if (IsA16) {
6837 auto UnmergeRayDir = B.buildUnmerge({S16, S16, S16}, RayDir);
6838 auto UnmergeRayInvDir = B.buildUnmerge({S16, S16, S16}, RayInvDir);
6839 auto MergedDir = B.buildMergeLikeInstr(
6840 V3S32,
6841 {B.buildBitcast(
6842 S32, B.buildMergeLikeInstr(V2S16, {UnmergeRayInvDir.getReg(0),
6843 UnmergeRayDir.getReg(0)}))
6844 .getReg(0),
6845 B.buildBitcast(
6846 S32, B.buildMergeLikeInstr(V2S16, {UnmergeRayInvDir.getReg(1),
6847 UnmergeRayDir.getReg(1)}))
6848 .getReg(0),
6849 B.buildBitcast(
6850 S32, B.buildMergeLikeInstr(V2S16, {UnmergeRayInvDir.getReg(2),
6851 UnmergeRayDir.getReg(2)}))
6852 .getReg(0)});
6853 Ops.push_back(MergedDir.getReg(0));
6854 } else {
6855 packLanes(RayDir);
6856 packLanes(RayInvDir);
6857 }
6858 } else {
6859 if (Is64) {
6860 auto Unmerge = B.buildUnmerge({S32, S32}, NodePtr);
6861 Ops.push_back(Unmerge.getReg(0));
6862 Ops.push_back(Unmerge.getReg(1));
6863 } else {
6864 Ops.push_back(NodePtr);
6865 }
6866 Ops.push_back(RayExtent);
6867
6868 auto packLanes = [&Ops, &S32, &B](Register Src) {
6869 auto Unmerge = B.buildUnmerge({S32, S32, S32}, Src);
6870 Ops.push_back(Unmerge.getReg(0));
6871 Ops.push_back(Unmerge.getReg(1));
6872 Ops.push_back(Unmerge.getReg(2));
6873 };
6874
6875 packLanes(RayOrigin);
6876 if (IsA16) {
6877 auto UnmergeRayDir = B.buildUnmerge({S16, S16, S16}, RayDir);
6878 auto UnmergeRayInvDir = B.buildUnmerge({S16, S16, S16}, RayInvDir);
6879 Register R1 = MRI.createGenericVirtualRegister(S32);
6880 Register R2 = MRI.createGenericVirtualRegister(S32);
6881 Register R3 = MRI.createGenericVirtualRegister(S32);
6882 B.buildMergeLikeInstr(R1,
6883 {UnmergeRayDir.getReg(0), UnmergeRayDir.getReg(1)});
6884 B.buildMergeLikeInstr(
6885 R2, {UnmergeRayDir.getReg(2), UnmergeRayInvDir.getReg(0)});
6886 B.buildMergeLikeInstr(
6887 R3, {UnmergeRayInvDir.getReg(1), UnmergeRayInvDir.getReg(2)});
6888 Ops.push_back(R1);
6889 Ops.push_back(R2);
6890 Ops.push_back(R3);
6891 } else {
6892 packLanes(RayDir);
6893 packLanes(RayInvDir);
6894 }
6895 }
6896
6897 if (!UseNSA) {
6898 // Build a single vector containing all the operands prepared so far.
6899 LLT OpTy = LLT::fixed_vector(Ops.size(), 32);
6900 Register MergedOps = B.buildMergeLikeInstr(OpTy, Ops).getReg(0);
6901 Ops.clear();
6902 Ops.push_back(MergedOps);
6903 }
6904
6905 auto MIB = B.buildInstr(AMDGPU::G_AMDGPU_INTRIN_BVH_INTERSECT_RAY)
6906 .addDef(DstReg)
6907 .addImm(Opcode);
6908
6909 for (Register R : Ops) {
6910 MIB.addUse(R);
6911 }
6912
6913 MIB.addUse(TDescr)
6914 .addImm(IsA16 ? 1 : 0)
6915 .cloneMemRefs(MI);
6916
6917 MI.eraseFromParent();
6918 return true;
6919}
6920
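// Map the rounding-mode immediate (operand 2) onto the target-specific
// G_FPTRUNC_ROUND_UPWARD / G_FPTRUNC_ROUND_DOWNWARD pseudos; any other
// rounding mode is not handled and fails legalization.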
6921 bool AMDGPULegalizerInfo::legalizeFPTruncRound(MachineInstr &MI,
6922 MachineIRBuilder &B) const {
6923 unsigned Opc;
6924 int RoundMode = MI.getOperand(2).getImm();
6925
6926 if (RoundMode == (int)RoundingMode::TowardPositive)
6927 Opc = AMDGPU::G_FPTRUNC_ROUND_UPWARD;
6928 else if (RoundMode == (int)RoundingMode::TowardNegative)
6929 Opc = AMDGPU::G_FPTRUNC_ROUND_DOWNWARD;
6930 else
6931 return false;
6932
6933 B.buildInstr(Opc)
6934 .addDef(MI.getOperand(0).getReg())
6935 .addUse(MI.getOperand(1).getReg());
6936
6937 MI.eraseFromParent();
6938
6939 return true;
6940}
6941
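// Lower stack save by wrapping the stack pointer register chosen by the
// target lowering in a G_AMDGPU_WAVE_ADDRESS pseudo.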
6942 bool AMDGPULegalizerInfo::legalizeStackSave(MachineInstr &MI,
6943 MachineIRBuilder &B) const {
6944 const SITargetLowering *TLI = ST.getTargetLowering();
6945 Register StackPtr = TLI->getStackPointerRegisterToSaveRestore();
6946 Register DstReg = MI.getOperand(0).getReg();
6947 B.buildInstr(AMDGPU::G_AMDGPU_WAVE_ADDRESS, {DstReg}, {StackPtr});
6948 MI.eraseFromParent();
6949 return true;
6950}
6951
6952 bool AMDGPULegalizerInfo::legalizeWaveID(MachineInstr &MI,
6953 MachineIRBuilder &B) const {
6954 // With architected SGPRs, waveIDinGroup is in TTMP8[29:25].
6955 if (!ST.hasArchitectedSGPRs())
6956 return false;
6957 LLT S32 = LLT::scalar(32);
6958 Register DstReg = MI.getOperand(0).getReg();
6959 auto TTMP8 = B.buildCopy(S32, Register(AMDGPU::TTMP8));
6960 auto LSB = B.buildConstant(S32, 25);
6961 auto Width = B.buildConstant(S32, 5);
6962 B.buildUbfx(DstReg, TTMP8, LSB, Width);
6963 MI.eraseFromParent();
6964 return true;
6965}
6966
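// s_getreg / s_setreg field encodings used by the FP-environment
// legalizations below: one selects the floating-point mode bits, the other
// the trap bits.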
6967static constexpr unsigned FPEnvModeBitField =
6969
6970static constexpr unsigned FPEnvTrapBitField =
6972
6973 bool AMDGPULegalizerInfo::legalizeGetFPEnv(MachineInstr &MI,
6974 MachineRegisterInfo &MRI,
6975 MachineIRBuilder &B) const {
6976 Register Src = MI.getOperand(0).getReg();
6977 if (MRI.getType(Src) != S64)
6978 return false;
6979
6980 auto ModeReg =
6981 B.buildIntrinsic(Intrinsic::amdgcn_s_getreg, {S32},
6982 /*HasSideEffects=*/true, /*isConvergent=*/false)
6983 .addImm(FPEnvModeBitField);
6984 auto TrapReg =
6985 B.buildIntrinsic(Intrinsic::amdgcn_s_getreg, {S32},
6986 /*HasSideEffects=*/true, /*isConvergent=*/false)
6987 .addImm(FPEnvTrapBitField);
6988 B.buildMergeLikeInstr(Src, {ModeReg, TrapReg});
6989 MI.eraseFromParent();
6990 return true;
6991}
6992
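// Setting the FP environment splits the 64-bit value into two 32-bit halves
// and writes them back with amdgcn_s_setreg, one per bit field.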
6993 bool AMDGPULegalizerInfo::legalizeSetFPEnv(MachineInstr &MI,
6994 MachineRegisterInfo &MRI,
6995 MachineIRBuilder &B) const {
6996 Register Src = MI.getOperand(0).getReg();
6997 if (MRI.getType(Src) != S64)
6998 return false;
6999
7000 auto Unmerge = B.buildUnmerge({S32, S32}, MI.getOperand(0));
7001 B.buildIntrinsic(Intrinsic::amdgcn_s_setreg, ArrayRef<DstOp>(),
7002 /*HasSideEffects=*/true, /*isConvergent=*/false)
7003 .addImm(static_cast<int16_t>(FPEnvModeBitField))
7004 .addReg(Unmerge.getReg(0));
7005 B.buildIntrinsic(Intrinsic::amdgcn_s_setreg, ArrayRef<DstOp>(),
7006 /*HasSideEffects=*/true, /*isConvergent=*/false)
7007 .addImm(static_cast<int16_t>(FPEnvTrapBitField))
7008 .addReg(Unmerge.getReg(1));
7009 MI.eraseFromParent();
7010 return true;
7011}
7012
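// Entry point for intrinsic-specific legalization; the cases dispatch to the
// dedicated helpers, keyed on the intrinsic ID.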
7013 bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
7014 MachineInstr &MI) const {
7015 MachineIRBuilder &B = Helper.MIRBuilder;
7016 MachineRegisterInfo &MRI = *B.getMRI();
7017
7018 // Replace the G_BRCOND use with the exec-manipulation and branch pseudos.
7019 auto IntrID = cast<GIntrinsic>(MI).getIntrinsicID();
7020 switch (IntrID) {
7021 case Intrinsic::amdgcn_if:
7022 case Intrinsic::amdgcn_else: {
7023 MachineInstr *Br = nullptr;
7024 MachineBasicBlock *UncondBrTarget = nullptr;
7025 bool Negated = false;
7026 if (MachineInstr *BrCond =
7027 verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) {
7028 const SIRegisterInfo *TRI
7029 = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
7030
7031 Register Def = MI.getOperand(1).getReg();
7032 Register Use = MI.getOperand(3).getReg();
7033
7034 MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
7035
7036 if (Negated)
7037 std::swap(CondBrTarget, UncondBrTarget);
7038
7039 B.setInsertPt(B.getMBB(), BrCond->getIterator());
7040 if (IntrID == Intrinsic::amdgcn_if) {
7041 B.buildInstr(AMDGPU::SI_IF)
7042 .addDef(Def)
7043 .addUse(Use)
7044 .addMBB(UncondBrTarget);
7045 } else {
7046 B.buildInstr(AMDGPU::SI_ELSE)
7047 .addDef(Def)
7048 .addUse(Use)
7049 .addMBB(UncondBrTarget);
7050 }
7051
7052 if (Br) {
7053 Br->getOperand(0).setMBB(CondBrTarget);
7054 } else {
7055 // The IRTranslator skips inserting the G_BR for fallthrough cases, but
7056 // since we're swapping branch targets it needs to be reinserted.
7057 // FIXME: IRTranslator should probably not do this
7058 B.buildBr(*CondBrTarget);
7059 }
7060
7061 MRI.setRegClass(Def, TRI->getWaveMaskRegClass());
7062 MRI.setRegClass(Use, TRI->getWaveMaskRegClass());
7063 MI.eraseFromParent();
7064 BrCond->eraseFromParent();
7065 return true;
7066 }
7067
7068 return false;
7069 }
7070 case Intrinsic::amdgcn_loop: {
7071 MachineInstr *Br = nullptr;
7072 MachineBasicBlock *UncondBrTarget = nullptr;
7073 bool Negated = false;
7074 if (MachineInstr *BrCond =
7075 verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) {
7076 const SIRegisterInfo *TRI
7077 = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
7078
7079 MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
7080 Register Reg = MI.getOperand(2).getReg();
7081
7082 if (Negated)
7083 std::swap(CondBrTarget, UncondBrTarget);
7084
7085 B.setInsertPt(B.getMBB(), BrCond->getIterator());
7086 B.buildInstr(AMDGPU::SI_LOOP)
7087 .addUse(Reg)
7088 .addMBB(UncondBrTarget);
7089
7090 if (Br)
7091 Br->getOperand(0).setMBB(CondBrTarget);
7092 else
7093 B.buildBr(*CondBrTarget);
7094
7095 MI.eraseFromParent();
7096 BrCond->eraseFromParent();
7097 MRI.setRegClass(Reg, TRI->getWaveMaskRegClass());
7098 return true;
7099 }
7100
7101 return false;
7102 }
7103 case Intrinsic::amdgcn_addrspacecast_nonnull:
7104 return legalizeAddrSpaceCast(MI, MRI, B);
7105 case Intrinsic::amdgcn_make_buffer_rsrc:
7106 return legalizePointerAsRsrcIntrin(MI, MRI, B);
7107 case Intrinsic::amdgcn_kernarg_segment_ptr:
7108 if (!AMDGPU::isKernel(B.getMF().getFunction().getCallingConv())) {
7109 // This only makes sense to call in a kernel, so just lower to null.
7110 B.buildConstant(MI.getOperand(0).getReg(), 0);
7111 MI.eraseFromParent();
7112 return true;
7113 }
7114
7115 return legalizePreloadedArgIntrin(
7116 MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
7117 case Intrinsic::amdgcn_implicitarg_ptr:
7118 return legalizeImplicitArgPtr(MI, MRI, B);
7119 case Intrinsic::amdgcn_workitem_id_x:
7120 return legalizeWorkitemIDIntrinsic(MI, MRI, B, 0,
7121 AMDGPUFunctionArgInfo::WORKITEM_ID_X);
7122 case Intrinsic::amdgcn_workitem_id_y:
7123 return legalizeWorkitemIDIntrinsic(MI, MRI, B, 1,
7124 AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
7125 case Intrinsic::amdgcn_workitem_id_z:
7126 return legalizeWorkitemIDIntrinsic(MI, MRI, B, 2,
7127 AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
7128 case Intrinsic::amdgcn_workgroup_id_x:
7129 return legalizePreloadedArgIntrin(MI, MRI, B,
7130 AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
7131 case Intrinsic::amdgcn_workgroup_id_y:
7132 return legalizePreloadedArgIntrin(MI, MRI, B,
7133 AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
7134 case Intrinsic::amdgcn_workgroup_id_z:
7135 return legalizePreloadedArgIntrin(MI, MRI, B,
7136 AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
7137 case Intrinsic::amdgcn_wave_id:
7138 return legalizeWaveID(MI, B);
7139 case Intrinsic::amdgcn_lds_kernel_id:
7140 return legalizePreloadedArgIntrin(MI, MRI, B,
7141 AMDGPUFunctionArgInfo::LDS_KERNEL_ID);
7142 case Intrinsic::amdgcn_dispatch_ptr:
7143 return legalizePreloadedArgIntrin(MI, MRI, B,
7144 AMDGPUFunctionArgInfo::DISPATCH_PTR);
7145 case Intrinsic::amdgcn_queue_ptr:
7146 return legalizePreloadedArgIntrin(MI, MRI, B,
7147 AMDGPUFunctionArgInfo::QUEUE_PTR);
7148 case Intrinsic::amdgcn_implicit_buffer_ptr:
7149 return legalizePreloadedArgIntrin(MI, MRI, B,
7150 AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
7151 case Intrinsic::amdgcn_dispatch_id:
7152 return legalizePreloadedArgIntrin(MI, MRI, B,
7153 AMDGPUFunctionArgInfo::DISPATCH_ID);
7154 case Intrinsic::r600_read_ngroups_x:
7155 // TODO: Emit error for hsa
7156 return legalizeKernargMemParameter(MI, B,
7157 SI::KernelInputOffsets::NGROUPS_X);
7158 case Intrinsic::r600_read_ngroups_y:
7159 return legalizeKernargMemParameter(MI, B,
7160 SI::KernelInputOffsets::NGROUPS_Y);
7161 case Intrinsic::r600_read_ngroups_z:
7162 return legalizeKernargMemParameter(MI, B,
7163 SI::KernelInputOffsets::NGROUPS_Z);
7164 case Intrinsic::r600_read_local_size_x:
7165 // TODO: Could insert G_ASSERT_ZEXT from s16
7166 return legalizeKernargMemParameter(MI, B, SI::KernelInputOffsets::LOCAL_SIZE_X);
7167 case Intrinsic::r600_read_local_size_y:
7168 // TODO: Could insert G_ASSERT_ZEXT from s16
7169 return legalizeKernargMemParameter(MI, B, SI::KernelInputOffsets::LOCAL_SIZE_Y);
7170 // TODO: Could insert G_ASSERT_ZEXT from s16
7171 case Intrinsic::r600_read_local_size_z:
7172 return legalizeKernargMemParameter(MI, B, SI::KernelInputOffsets::LOCAL_SIZE_Z);
7173 case Intrinsic::r600_read_global_size_x:
7174 return legalizeKernargMemParameter(MI, B, SI::KernelInputOffsets::GLOBAL_SIZE_X);
7175 case Intrinsic::r600_read_global_size_y:
7176 return legalizeKernargMemParameter(MI, B, SI::KernelInputOffsets::GLOBAL_SIZE_Y);
7177 case Intrinsic::r600_read_global_size_z:
7178 return legalizeKernargMemParameter(MI, B, SI::KernelInputOffsets::GLOBAL_SIZE_Z);
7179 case Intrinsic::amdgcn_fdiv_fast:
7180 return legalizeFDIVFastIntrin(MI, MRI, B);
7181 case Intrinsic::amdgcn_is_shared:
7182 return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS);
7183 case Intrinsic::amdgcn_is_private:
7184 return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS);
7185 case Intrinsic::amdgcn_wavefrontsize: {
7186 B.buildConstant(MI.getOperand(0), ST.getWavefrontSize());
7187 MI.eraseFromParent();
7188 return true;
7189 }
7190 case Intrinsic::amdgcn_s_buffer_load:
7191 return legalizeSBufferLoad(Helper, MI);
7192 case Intrinsic::amdgcn_raw_buffer_store:
7193 case Intrinsic::amdgcn_raw_ptr_buffer_store:
7194 case Intrinsic::amdgcn_struct_buffer_store:
7195 case Intrinsic::amdgcn_struct_ptr_buffer_store:
7196 return legalizeBufferStore(MI, MRI, B, false, false);
7197 case Intrinsic::amdgcn_raw_buffer_store_format:
7198 case Intrinsic::amdgcn_raw_ptr_buffer_store_format:
7199 case Intrinsic::amdgcn_struct_buffer_store_format:
7200 case Intrinsic::amdgcn_struct_ptr_buffer_store_format:
7201 return legalizeBufferStore(MI, MRI, B, false, true);
7202 case Intrinsic::amdgcn_raw_tbuffer_store:
7203 case Intrinsic::amdgcn_raw_ptr_tbuffer_store:
7204 case Intrinsic::amdgcn_struct_tbuffer_store:
7205 case Intrinsic::amdgcn_struct_ptr_tbuffer_store:
7206 return legalizeBufferStore(MI, MRI, B, true, true);
7207 case Intrinsic::amdgcn_raw_buffer_load:
7208 case Intrinsic::amdgcn_raw_ptr_buffer_load:
7209 case Intrinsic::amdgcn_struct_buffer_load:
7210 case Intrinsic::amdgcn_struct_ptr_buffer_load:
7211 return legalizeBufferLoad(MI, MRI, B, false, false);
7212 case Intrinsic::amdgcn_raw_buffer_load_format:
7213 case Intrinsic::amdgcn_raw_ptr_buffer_load_format:
7214 case Intrinsic::amdgcn_struct_buffer_load_format:
7215 case Intrinsic::amdgcn_struct_ptr_buffer_load_format:
7216 return legalizeBufferLoad(MI, MRI, B, true, false);
7217 case Intrinsic::amdgcn_raw_tbuffer_load:
7218 case Intrinsic::amdgcn_raw_ptr_tbuffer_load:
7219 case Intrinsic::amdgcn_struct_tbuffer_load:
7220 case Intrinsic::amdgcn_struct_ptr_tbuffer_load:
7221 return legalizeBufferLoad(MI, MRI, B, true, true);
7222 case Intrinsic::amdgcn_raw_buffer_atomic_swap:
7223 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_swap:
7224 case Intrinsic::amdgcn_struct_buffer_atomic_swap:
7225 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_swap:
7226 case Intrinsic::amdgcn_raw_buffer_atomic_add:
7227 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_add:
7228 case Intrinsic::amdgcn_struct_buffer_atomic_add:
7229 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_add:
7230 case Intrinsic::amdgcn_raw_buffer_atomic_sub:
7231 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub:
7232 case Intrinsic::amdgcn_struct_buffer_atomic_sub:
7233 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub:
7234 case Intrinsic::amdgcn_raw_buffer_atomic_smin:
7235 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smin:
7236 case Intrinsic::amdgcn_struct_buffer_atomic_smin:
7237 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smin:
7238 case Intrinsic::amdgcn_raw_buffer_atomic_umin:
7239 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umin:
7240 case Intrinsic::amdgcn_struct_buffer_atomic_umin:
7241 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umin:
7242 case Intrinsic::amdgcn_raw_buffer_atomic_smax:
7243 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smax:
7244 case Intrinsic::amdgcn_struct_buffer_atomic_smax:
7245 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smax:
7246 case Intrinsic::amdgcn_raw_buffer_atomic_umax:
7247 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umax:
7248 case Intrinsic::amdgcn_struct_buffer_atomic_umax:
7249 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umax:
7250 case Intrinsic::amdgcn_raw_buffer_atomic_and:
7251 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_and:
7252 case Intrinsic::amdgcn_struct_buffer_atomic_and:
7253 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_and:
7254 case Intrinsic::amdgcn_raw_buffer_atomic_or:
7255 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_or:
7256 case Intrinsic::amdgcn_struct_buffer_atomic_or:
7257 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_or:
7258 case Intrinsic::amdgcn_raw_buffer_atomic_xor:
7259 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_xor:
7260 case Intrinsic::amdgcn_struct_buffer_atomic_xor:
7261 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_xor:
7262 case Intrinsic::amdgcn_raw_buffer_atomic_inc:
7263 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_inc:
7264 case Intrinsic::amdgcn_struct_buffer_atomic_inc:
7265 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_inc:
7266 case Intrinsic::amdgcn_raw_buffer_atomic_dec:
7267 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_dec:
7268 case Intrinsic::amdgcn_struct_buffer_atomic_dec:
7269 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_dec:
7270 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
7271 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap:
7272 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
7273 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cmpswap:
7274 case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
7275 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmin:
7276 case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
7277 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmin:
7278 case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
7279 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmax:
7280 case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
7281 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmax:
7282 case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
7283 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fadd:
7284 case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
7285 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fadd:
7286 case Intrinsic::amdgcn_raw_buffer_atomic_fadd_v2bf16:
7287 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fadd_v2bf16:
7288 case Intrinsic::amdgcn_struct_buffer_atomic_fadd_v2bf16:
7289 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fadd_v2bf16:
7290 return legalizeBufferAtomic(MI, B, IntrID);
7291 case Intrinsic::amdgcn_rsq_clamp:
7292 return legalizeRsqClampIntrinsic(MI, MRI, B);
7293 case Intrinsic::amdgcn_ds_fadd:
7294 case Intrinsic::amdgcn_ds_fmin:
7295 case Intrinsic::amdgcn_ds_fmax:
7296 return legalizeDSAtomicFPIntrinsic(Helper, MI, IntrID);
7297 case Intrinsic::amdgcn_image_bvh_intersect_ray:
7298 return legalizeBVHIntrinsic(MI, B);
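// The SWMMAC intrinsics below only need their index operand legalized: if it
// is not already s32, it is widened with G_ANYEXT.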
7299 case Intrinsic::amdgcn_swmmac_f16_16x16x32_f16:
7300 case Intrinsic::amdgcn_swmmac_bf16_16x16x32_bf16:
7301 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf16:
7302 case Intrinsic::amdgcn_swmmac_f32_16x16x32_f16:
7303 case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_fp8:
7304 case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_bf8:
7305 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_fp8:
7306 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_bf8: {
7307 Register Index = MI.getOperand(5).getReg();
7308 LLT S32 = LLT::scalar(32);
7309 if (MRI.getType(Index) != S32)
7310 MI.getOperand(5).setReg(B.buildAnyExt(S32, Index).getReg(0));
7311 return true;
7312 }
7313 case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu4:
7314 case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu8:
7315 case Intrinsic::amdgcn_swmmac_i32_16x16x64_iu4: {
7316 Register Index = MI.getOperand(7).getReg();
7317 LLT S32 = LLT::scalar(32);
7318 if (MRI.getType(Index) != S32)
7319 MI.getOperand(7).setReg(B.buildAnyExt(S32, Index).getReg(0));
7320 return true;
7321 }
7322 case Intrinsic::amdgcn_fmed3: {
7323 GISelChangeObserver &Observer = Helper.Observer;
7324
7325 // FIXME: This is a workaround for the inability of tablegen match combiners
7326 // to match intrinsics in patterns.
7327 Observer.changingInstr(MI);
7328 MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_FMED3));
7329 MI.removeOperand(1);
7330 Observer.changedInstr(MI);
7331 return true;
7332 }
7333 default: {
7334 if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
7335 AMDGPU::getImageDimIntrinsicInfo(IntrID))
7336 return legalizeImageIntrinsic(MI, B, Helper.Observer, ImageDimIntr);
7337 return true;
7338 }
7339 }
7340
7341 return true;
7342}
unsigned const MachineRegisterInfo * MRI
MachineInstrBuilder & UseMI
MachineInstrBuilder MachineInstrBuilder & DefMI
static unsigned getIntrinsicID(const SDNode *N)
unsigned RegSize
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
unsigned Intr
static SDValue extractF64Exponent(SDValue Hi, const SDLoc &SL, SelectionDAG &DAG)
static SDValue getMad(SelectionDAG &DAG, const SDLoc &SL, EVT VT, SDValue X, SDValue Y, SDValue C, SDNodeFlags Flags=SDNodeFlags())
static bool valueIsKnownNeverF32Denorm(SDValue Src)
Return true if it's known that Src can never be an f32 denormal value.
Contains the definition of a TargetInstrInfo class that is common to all AMD GPUs.
static void packImage16bitOpsToDwords(MachineIRBuilder &B, MachineInstr &MI, SmallVectorImpl< Register > &PackedAddrs, unsigned ArgOffset, const AMDGPU::ImageDimIntrinsicInfo *Intr, bool IsA16, bool IsG16)
Turn a set of s16 typed registers in AddrRegs into a dword sized vector with s16 typed elements.
static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID)
static LLT getBufferRsrcScalarType(const LLT Ty)
static cl::opt< bool > EnableNewLegality("amdgpu-global-isel-new-legality", cl::desc("Use GlobalISel desired legality, rather than try to use" "rules compatible with selection patterns"), cl::init(false), cl::ReallyHidden)
static const LLT V3S64
static bool needsDenormHandlingF32(const MachineFunction &MF, Register Src, unsigned Flags)
static const LLT V16S16
static const LLT S128
static LegalizeMutation bitcastToVectorElement32(unsigned TypeIdx)
static const LLT V4S32
static const LLT V2S32
static const LLT V8S64
static LegalityPredicate isSmallOddVector(unsigned TypeIdx)
static LegalizeMutation oneMoreElement(unsigned TypeIdx)
static const LLT V12S32
static const LLT V8S32
static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size)
static bool allowApproxFunc(const MachineFunction &MF, unsigned Flags)
static const LLT V2S16
static bool shouldBitcastLoadStoreType(const GCNSubtarget &ST, const LLT Ty, const LLT MemTy)
Return true if a load or store of the type should be lowered with a bitcast to a different type.
static constexpr unsigned FPEnvModeBitField
static const LLT V4S64
static const LLT S1
static const LLT V3S32
static const LLT S64
static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size)
static bool shouldWidenLoad(const GCNSubtarget &ST, LLT MemoryTy, uint64_t AlignInBits, unsigned AddrSpace, unsigned Opcode)
Return true if we should legalize a load by widening an odd sized memory access up to the alignment.
static bool isRegisterType(LLT Ty)
static bool isRegisterVectorElementType(LLT EltTy)
static const LLT S32
static bool isRegisterSize(unsigned Size)
static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx)
static LegalityPredicate isWideVec16(unsigned TypeIdx)
static const LLT V6S32
static LegalizeMutation moreElementsToNextExistingRegClass(unsigned TypeIdx)
static std::initializer_list< LLT > AllS32Vectors
static const LLT V7S32
static const LLT V5S32
static const LLT V4S16
static const LLT V11S32
static Register castBufferRsrcToV4I32(Register Pointer, MachineIRBuilder &B)
Cast a buffer resource (an address space 8 pointer) into a 4xi32, which is the form in which the valu...
static std::initializer_list< LLT > AllS16Vectors
static const LLT V32S32
static std::pair< Register, Register > emitReciprocalU64(MachineIRBuilder &B, Register Val)
static LLT getBitcastRegisterType(const LLT Ty)
static LLT getBufferRsrcRegisterType(const LLT Ty)
static LegalizeMutation bitcastToRegisterType(unsigned TypeIdx)
static Register stripAnySourceMods(Register OrigSrc, MachineRegisterInfo &MRI)
static const LLT V9S32
static const LLT V10S32
static LLT castBufferRsrcFromV4I32(MachineInstr &MI, MachineIRBuilder &B, MachineRegisterInfo &MRI, unsigned Idx)
Mutates IR (typicaly a load instruction) to use a <4 x s32> as the initial type of the operand idx an...
static bool replaceWithConstant(MachineIRBuilder &B, MachineInstr &MI, int64_t C)
static constexpr unsigned SPDenormModeBitField
static const LLT V12S16
static const LLT V16S64
static const LLT S512
static unsigned maxSizeForAddrSpace(const GCNSubtarget &ST, unsigned AS, bool IsLoad, bool IsAtomic)
static bool isLoadStoreSizeLegal(const GCNSubtarget &ST, const LegalityQuery &Query)
static const LLT V16S32
static const LLT V7S64
static MachineInstr * verifyCFIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI, MachineInstr *&Br, MachineBasicBlock *&UncondBrTarget, bool &Negated)
static bool isRegisterClassType(LLT Ty)
static const LLT V5S64
static const LLT S160
static LegalityPredicate numElementsNotEven(unsigned TypeIdx)
static void castBufferRsrcArgToV4I32(MachineInstr &MI, MachineIRBuilder &B, unsigned Idx)
static const LLT V4S128
static constexpr unsigned FPEnvTrapBitField
static LegalityPredicate isIllegalRegisterType(unsigned TypeIdx)
static const LLT V6S64
static constexpr unsigned MaxRegisterSize
static const LLT V2S8
static LegalityPredicate isWideScalarExtLoadTruncStore(unsigned TypeIdx)
static const LLT MaxScalar
static bool hasBufferRsrcWorkaround(const LLT Ty)
static void toggleSPDenormMode(bool Enable, MachineIRBuilder &B, const GCNSubtarget &ST, SIModeRegisterDefaults Mode)
static std::initializer_list< LLT > AllS64Vectors
static bool loadStoreBitcastWorkaround(const LLT Ty)
static const LLT S96
static const LLT V2S64
static LLT widenToNextPowerOf2(LLT Ty)
static bool isNot(const MachineRegisterInfo &MRI, const MachineInstr &MI)
static const LLT S16
static const LLT V10S16
static void convertImageAddrToPacked(MachineIRBuilder &B, MachineInstr &MI, int DimIdx, int NumVAddrs)
Convert from separate vaddr components to a single vector address register, and replace the remaining...
static bool isLoadStoreLegal(const GCNSubtarget &ST, const LegalityQuery &Query)
static const LLT V2S128
static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx)
static std::initializer_list< LLT > AllScalarTypes
static const LLT S256
static LLT getPow2VectorType(LLT Ty)
static void buildBufferLoad(unsigned Opc, Register LoadDstReg, Register RSrc, Register VIndex, Register VOffset, Register SOffset, unsigned ImmOffset, unsigned Format, unsigned AuxiliaryData, MachineMemOperand *MMO, bool IsTyped, bool HasVIndex, MachineIRBuilder &B)
static unsigned getDSFPAtomicOpcode(Intrinsic::ID IID)
static LLT getPow2ScalarType(LLT Ty)
static LegalityPredicate elementTypeIsLegal(unsigned TypeIdx)
static const LLT S8
static const LLT V6S16
static bool isRegisterVectorType(LLT Ty)
static const LLT S224
static const LLT V8S16
static LegalityPredicate sizeIsMultipleOf32(unsigned TypeIdx)
static bool isKnownNonNull(Register Val, MachineRegisterInfo &MRI, const AMDGPUTargetMachine &TM, unsigned AddrSpace)
Return true if the value is a known valid address, such that a null check is not necessary.
This file declares the targeting of the Machinelegalizer class for AMDGPU.
Provides AMDGPU specific target descriptions.
The AMDGPU TargetMachine interface definition for hw codegen targets.
static Error unsupported(const char *Str, const Triple &T)
Definition: MachO.cpp:71
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
Analysis containing CSE Info
Definition: CSEInfo.cpp:27
Returns the sub type a function will return at a given Idx Should correspond to the result type of an ExtractValue instruction executed with just that one unsigned Idx
uint64_t Size
static GCMetadataPrinterRegistry::Add< ErlangGCPrinter > X("erlang", "erlang-compatible garbage collector")
Declares convenience wrapper classes for interpreting MachineInstr instances as specific generic oper...
IRTranslator LLVM IR MI
static LVOptions Options
Definition: LVOptions.cpp:25
#define F(x, y, z)
Definition: MD5.cpp:55
#define I(x, y, z)
Definition: MD5.cpp:58
Contains matchers for matching SSA Machine Instructions.
This file declares the MachineIRBuilder class.
unsigned const TargetRegisterInfo * TRI
#define R2(n)
static unsigned getReg(const MCDisassembler *D, unsigned RC, unsigned RegNo)
static GCMetadataPrinterRegistry::Add< OcamlGCMetadataPrinter > Y("ocaml", "ocaml 3.10-compatible collector")
ppc ctr loops verify
const char LLVMTargetMachineRef TM
R600 Clause Merge
const SmallVectorImpl< MachineOperand > & Cond
#define CH(x, y, z)
Definition: SHA256.cpp:34
#define FP_DENORM_FLUSH_NONE
Definition: SIDefines.h:1174
Interface definition for SIInstrInfo.
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
Interface definition for SIRegisterInfo.
This file defines the make_scope_exit function, which executes user-defined cleanup logic at scope ex...
static constexpr int Concat[]
Value * RHS
Value * LHS
void buildMultiply(LegalizerHelper &Helper, MutableArrayRef< Register > Accum, ArrayRef< Register > Src0, ArrayRef< Register > Src1, bool UsePartialMad64_32, bool SeparateOddAlignedProducts) const
bool legalizeGlobalValue(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFPTruncRound(MachineInstr &MI, MachineIRBuilder &B) const
bool legalizeFSQRTF16(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeIntrinsicTrunc(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
std::pair< Register, unsigned > splitBufferOffsets(MachineIRBuilder &B, Register OrigOffset) const
bool legalizeIsAddrSpace(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B, unsigned AddrSpace) const
bool legalizeUnsignedDIV_REM(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFSQRTF32(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeAtomicCmpXChg(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeTrapHsa(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeMul(LegalizerHelper &Helper, MachineInstr &MI) const
bool legalizeFFREXP(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
Register getSegmentAperture(unsigned AddrSpace, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFDIV64(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizePointerAsRsrcIntrin(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
To create a buffer resource from a 64-bit pointer, mask off the upper 32 bits of the pointer and repl...
bool legalizeFlogCommon(MachineInstr &MI, MachineIRBuilder &B) const
bool getLDSKernelId(Register DstReg, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFExp2(MachineInstr &MI, MachineIRBuilder &B) const
bool legalizeTrap(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeBufferAtomic(MachineInstr &MI, MachineIRBuilder &B, Intrinsic::ID IID) const
void legalizeUnsignedDIV_REM32Impl(MachineIRBuilder &B, Register DstDivReg, Register DstRemReg, Register Num, Register Den) const
Register handleD16VData(MachineIRBuilder &B, MachineRegisterInfo &MRI, Register Reg, bool ImageStore=false) const
Handle register layout difference for f16 images for some subtargets.
bool legalizeCTLZ_CTTZ(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeBuildVector(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFFloor(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
AMDGPULegalizerInfo(const GCNSubtarget &ST, const GCNTargetMachine &TM)
bool legalizeFDIV32(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFMad(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeDSAtomicFPIntrinsic(LegalizerHelper &Helper, MachineInstr &MI, Intrinsic::ID IID) const
bool legalizeFDIV(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFExp(MachineInstr &MI, MachineIRBuilder &B) const
bool legalizeIntrinsic(LegalizerHelper &Helper, MachineInstr &MI) const override
bool legalizeFrem(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizePreloadedArgIntrin(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B, AMDGPUFunctionArgInfo::PreloadedValue ArgType) const
bool legalizeBufferStore(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B, bool IsTyped, bool IsFormat) const
bool legalizeStore(LegalizerHelper &Helper, MachineInstr &MI) const
bool legalizeCustom(LegalizerHelper &Helper, MachineInstr &MI, LostDebugLocObserver &LocObserver) const override
Called for instructions with the Custom LegalizationAction.
bool buildPCRelGlobalAddress(Register DstReg, LLT PtrTy, MachineIRBuilder &B, const GlobalValue *GV, int64_t Offset, unsigned GAFlags=SIInstrInfo::MO_NONE) const
bool legalizeFDIV16(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeRsqClampIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
std::pair< Register, Register > getScaledLogInput(MachineIRBuilder &B, Register Src, unsigned Flags) const
bool legalizeFDIVFastIntrin(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeInsertVectorElt(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFExpUnsafe(MachineIRBuilder &B, Register Dst, Register Src, unsigned Flags) const
bool legalizeAddrSpaceCast(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
Register fixStoreSourceType(MachineIRBuilder &B, Register VData, bool IsFormat) const
bool legalizeImplicitArgPtr(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeMinNumMaxNum(LegalizerHelper &Helper, MachineInstr &MI) const
void legalizeUnsignedDIV_REM64Impl(MachineIRBuilder &B, Register DstDivReg, Register DstRemReg, Register Num, Register Den) const
bool legalizeBVHIntrinsic(MachineInstr &MI, MachineIRBuilder &B) const
bool legalizeDebugTrap(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFastUnsafeFDIV(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeSinCos(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeWaveID(MachineInstr &MI, MachineIRBuilder &B) const
bool legalizeFroundeven(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeLDSKernelId(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeSignedDIV_REM(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeITOFP(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B, bool Signed) const
bool legalizeBufferLoad(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B, bool IsFormat, bool IsTyped) const
bool legalizeFPow(MachineInstr &MI, MachineIRBuilder &B) const
bool legalizeFastUnsafeFDIV64(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFPTOI(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B, bool Signed) const
bool legalizeStackSave(MachineInstr &MI, MachineIRBuilder &B) const
bool legalizeFlogUnsafe(MachineIRBuilder &B, Register Dst, Register Src, bool IsLog10, unsigned Flags) const
bool legalizeKernargMemParameter(MachineInstr &MI, MachineIRBuilder &B, uint64_t Offset, Align Alignment=Align(4)) const
Legalize a value that's loaded from kernel arguments.
bool legalizeImageIntrinsic(MachineInstr &MI, MachineIRBuilder &B, GISelChangeObserver &Observer, const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr) const
Rewrite image intrinsics to use register layouts expected by the subtarget.
void buildAbsGlobalAddress(Register DstReg, LLT PtrTy, MachineIRBuilder &B, const GlobalValue *GV, MachineRegisterInfo &MRI) const
bool legalizeGetFPEnv(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool getImplicitArgPtr(Register DstReg, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFSQRT(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
Register getKernargParameterPtr(MachineIRBuilder &B, int64_t Offset) const
bool legalizeSBufferLoad(LegalizerHelper &Helper, MachineInstr &MI) const
bool legalizeFceil(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool loadInputValue(Register DstReg, MachineIRBuilder &B, const ArgDescriptor *Arg, const TargetRegisterClass *ArgRC, LLT ArgTy) const
bool legalizeFSQRTF64(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeExtractVectorElt(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeLoad(LegalizerHelper &Helper, MachineInstr &MI) const
bool legalizeSetFPEnv(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeWorkitemIDIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B, unsigned Dim, AMDGPUFunctionArgInfo::PreloadedValue ArgType) const
bool legalizeTrapHsaQueuePtr(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFlog2(MachineInstr &MI, MachineIRBuilder &B) const
bool legalizeTrapEndpgm(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
static std::optional< uint32_t > getLDSKernelIdMetadata(const Function &F)
void setDynLDSAlign(const Function &F, const GlobalVariable &GV)
unsigned allocateLDSGlobal(const DataLayout &DL, const GlobalVariable &GV)
bool hasMadMacF32Insts() const
unsigned getMaxWorkitemID(const Function &Kernel, unsigned Dimension) const
Return the maximum workitem ID value in the function, for the given (0, 1, 2) dimension.
bool has16BitInsts() const
bool hasFastFMAF32() const
bool hasTrigReducedRange() const
unsigned getWavefrontSize() const
bool hasVOP3PInsts() const
uint32_t getImplicitParameterOffset(const MachineFunction &MF, const ImplicitParameter Param) const
Helper function that returns the byte offset of the given type of implicit parameter.
static APFloat getSmallestNormalized(const fltSemantics &Sem, bool Negative=false)
Returns the smallest (by magnitude) normalized finite number in the given semantics.
Definition: APFloat.h:1026
static APFloat getLargest(const fltSemantics &Sem, bool Negative=false)
Returns the largest finite number in the given semantics.
Definition: APFloat.h:1006
static APFloat getInf(const fltSemantics &Sem, bool Negative=false)
Factory for Positive and Negative Infinity.
Definition: APFloat.h:966
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition: ArrayRef.h:41
size_t size() const
size - Get the array size.
Definition: ArrayRef.h:165
@ ICMP_SLT
signed less than
Definition: InstrTypes.h:1022
@ FCMP_OLT
0 1 0 0 True if ordered and less than
Definition: InstrTypes.h:999
@ FCMP_OGT
0 0 1 0 True if ordered and greater than
Definition: InstrTypes.h:997
@ ICMP_UGE
unsigned greater or equal
Definition: InstrTypes.h:1017
@ ICMP_SGT
signed greater than
Definition: InstrTypes.h:1020
@ FCMP_ONE
0 1 1 0 True if ordered and operands are unequal
Definition: InstrTypes.h:1001
@ FCMP_OLE
0 1 0 1 True if ordered and less than or equal
Definition: InstrTypes.h:1000
@ FCMP_ORD
0 1 1 1 True if ordered (no nans)
Definition: InstrTypes.h:1002
@ ICMP_EQ
equal
Definition: InstrTypes.h:1014
@ ICMP_NE
not equal
Definition: InstrTypes.h:1015
This is the shared class of boolean and integer constants.
Definition: Constants.h:80
int64_t getSExtValue() const
Return the constant as a 64-bit integer value after it has been sign extended as appropriate for the ...
Definition: Constants.h:160
This class represents an Operation in the Expression.
A debug info location.
Definition: DebugLoc.h:33
Diagnostic information for unsupported feature in backend.
static constexpr ElementCount getFixed(ScalarTy MinVal)
Definition: TypeSize.h:296
LLVMContext & getContext() const
getContext - Return a reference to the LLVMContext associated with this function.
Definition: Function.cpp:356
bool hasA16() const
bool hasImageStoreD16Bug() const
bool hasUsableDivScaleConditionOutput() const
Condition output from div_scale is usable.
Definition: GCNSubtarget.h:464
bool hasArchitectedSGPRs() const
bool hasPrivEnabledTrap2NopBug() const
const SIInstrInfo * getInstrInfo() const override
Definition: GCNSubtarget.h:253
bool hasScalarSubwordLoads() const
Definition: GCNSubtarget.h:442
bool supportsGetDoorbellID() const
Definition: GCNSubtarget.h:448
bool hasMad64_32() const
Definition: GCNSubtarget.h:731
TrapHandlerAbi getTrapHandlerAbi() const
Definition: GCNSubtarget.h:444
bool hasIntClamp() const
Definition: GCNSubtarget.h:344
bool hasGFX10_AEncoding() const
const SITargetLowering * getTargetLowering() const override
Definition: GCNSubtarget.h:261
bool hasPackedFP32Ops() const
Definition: GCNSubtarget.h:995
bool hasFullRate64Ops() const
Definition: GCNSubtarget.h:364
bool isTrapHandlerEnabled() const
Definition: GCNSubtarget.h:588
unsigned getNSAThreshold(const MachineFunction &MF) const
bool hasScalarSMulU64() const
Definition: GCNSubtarget.h:720
bool hasNSAEncoding() const
bool hasG16() const
unsigned getNSAMaxSize(bool HasSampler=false) const
bool hasScalarDwordx3Loads() const
Definition: GCNSubtarget.h:954
Generation getGeneration() const
Definition: GCNSubtarget.h:304
bool hasScalarAddSub64() const
Definition: GCNSubtarget.h:718
bool hasUnpackedD16VMem() const
Definition: GCNSubtarget.h:722
bool hasAddNoCarry() const
Definition: GCNSubtarget.h:714
bool hasFractBug() const
Definition: GCNSubtarget.h:382
bool hasPartialNSAEncoding() const
Abstract class that contains various methods for clients to notify about changes.
virtual void changingInstr(MachineInstr &MI)=0
This instruction is about to be mutated in some way.
virtual void changedInstr(MachineInstr &MI)=0
This instruction was mutated in some way.
KnownBits getKnownBits(Register R)
Simple wrapper observer that takes several observers, and calls each one for each event.
bool hasExternalLinkage() const
Definition: GlobalValue.h:511
Module * getParent()
Get the module that this global value is contained inside of...
Definition: GlobalValue.h:656
Type * getValueType() const
Definition: GlobalValue.h:296
static constexpr LLT float64()
Get a 64-bit IEEE double value.
Definition: LowLevelType.h:94
constexpr unsigned getScalarSizeInBits() const
Definition: LowLevelType.h:267
constexpr bool isScalar() const
Definition: LowLevelType.h:146
constexpr LLT changeElementType(LLT NewEltTy) const
If this type is a vector, return a vector with the same number of elements but the new element type.
Definition: LowLevelType.h:214
static constexpr LLT vector(ElementCount EC, unsigned ScalarSizeInBits)
Get a low-level vector of some number of elements and element width.
Definition: LowLevelType.h:64
constexpr bool isPointerVector() const
Definition: LowLevelType.h:152
static constexpr LLT scalar(unsigned SizeInBits)
Get a low-level scalar or aggregate "bag of bits".
Definition: LowLevelType.h:42
constexpr uint16_t getNumElements() const
Returns the number of elements in a vector LLT.
Definition: LowLevelType.h:159
constexpr bool isVector() const
Definition: LowLevelType.h:148
static constexpr LLT pointer(unsigned AddressSpace, unsigned SizeInBits)
Get a low-level pointer in the given address space.
Definition: LowLevelType.h:57
constexpr TypeSize getSizeInBits() const
Returns the total size of the type. Must only be called on sized types.
Definition: LowLevelType.h:193
constexpr bool isPointer() const
Definition: LowLevelType.h:149
constexpr LLT getElementType() const
Returns the vector's element type. Only valid for vector types.
Definition: LowLevelType.h:290
constexpr ElementCount getElementCount() const
Definition: LowLevelType.h:184
constexpr LLT changeElementSize(unsigned NewEltSize) const
If this type is a vector, return a vector with the same number of elements but the new element size.
Definition: LowLevelType.h:221
static constexpr LLT float16()
Get a 16-bit IEEE half value.
Definition: LowLevelType.h:84
constexpr unsigned getAddressSpace() const
Definition: LowLevelType.h:280
static constexpr LLT fixed_vector(unsigned NumElements, unsigned ScalarSizeInBits)
Get a low-level fixed-width vector of some number of elements and element width.
Definition: LowLevelType.h:100
constexpr bool isPointerOrPointerVector() const
Definition: LowLevelType.h:153
constexpr LLT changeElementCount(ElementCount EC) const
Return a vector or scalar with the same element type and the new element count.
Definition: LowLevelType.h:230
constexpr LLT getScalarType() const
Definition: LowLevelType.h:208
static constexpr LLT scalarOrVector(ElementCount EC, LLT ScalarTy)
Definition: LowLevelType.h:124
static constexpr LLT float32()
Get a 32-bit IEEE float value.
Definition: LowLevelType.h:89
This is an important class for using LLVM in a threaded context.
Definition: LLVMContext.h:67
void diagnose(const DiagnosticInfo &DI)
Report a message to the currently installed diagnostic handler.
LegalizeRuleSet & minScalar(unsigned TypeIdx, const LLT Ty)
Ensure the scalar is at least as wide as Ty.
LegalizeRuleSet & legalFor(std::initializer_list< LLT > Types)
The instruction is legal when type index 0 is any type in the given list.
LegalizeRuleSet & fewerElementsIf(LegalityPredicate Predicate, LegalizeMutation Mutation)
Remove elements to reach the type selected by the mutation if the predicate is true.
LegalizeRuleSet & clampScalarOrElt(unsigned TypeIdx, const LLT MinTy, const LLT MaxTy)
Limit the range of scalar sizes to MinTy and MaxTy.
LegalizeRuleSet & maxScalar(unsigned TypeIdx, const LLT Ty)
Ensure the scalar is at most as wide as Ty.
LegalizeRuleSet & clampMaxNumElements(unsigned TypeIdx, const LLT EltTy, unsigned MaxElements)
Limit the number of elements in EltTy vectors to at most MaxElements.
LegalizeRuleSet & lower()
The instruction is lowered.
LegalizeRuleSet & moreElementsIf(LegalityPredicate Predicate, LegalizeMutation Mutation)
Add more elements to reach the type selected by the mutation if the predicate is true.
LegalizeRuleSet & clampScalar(unsigned TypeIdx, const LLT MinTy, const LLT MaxTy)
Limit the range of scalar sizes to MinTy and MaxTy.
LegalizeRuleSet & custom()
Unconditionally custom lower.
LegalizeRuleSet & clampMaxNumElementsStrict(unsigned TypeIdx, const LLT EltTy, unsigned NumElts)
Express EltTy vectors strictly using vectors with NumElts elements (or scalars when NumElts equals 1)...
LegalizeRuleSet & alwaysLegal()
LegalizeRuleSet & customIf(LegalityPredicate Predicate)
LegalizeRuleSet & widenScalarToNextPow2(unsigned TypeIdx, unsigned MinSize=0)
Widen the scalar to the next power of two that is at least MinSize.
LegalizeRuleSet & scalarize(unsigned TypeIdx)
LegalizeRuleSet & legalIf(LegalityPredicate Predicate)
The instruction is legal if predicate is true.
LegalizeRuleSet & customFor(std::initializer_list< LLT > Types)
LegalizeRuleSet & widenScalarToNextMultipleOf(unsigned TypeIdx, unsigned Size)
Widen the scalar to the next multiple of Size.
LegalizeResult lowerFMinNumMaxNum(MachineInstr &MI)
void moreElementsVectorDst(MachineInstr &MI, LLT MoreTy, unsigned OpIdx)
Legalize a single operand OpIdx of the machine instruction MI as a Def by performing it with addition...
@ Legalized
Instruction has been legalized and the MachineFunction changed.
GISelChangeObserver & Observer
To keep track of changes made by the LegalizerHelper.
void bitcastDst(MachineInstr &MI, LLT CastTy, unsigned OpIdx)
Legalize a single operand OpIdx of the machine instruction MI as a def by inserting a G_BITCAST from ...
LegalizeResult lowerFMad(MachineInstr &MI)
GISelKnownBits * getKnownBits() const
MachineIRBuilder & MIRBuilder
Expose MIRBuilder so clients can set their own RecordInsertInstruction functions.
void widenScalarDst(MachineInstr &MI, LLT WideTy, unsigned OpIdx=0, unsigned TruncOpcode=TargetOpcode::G_TRUNC)
Legalize a single operand OpIdx of the machine instruction MI as a Def by extending the operand's typ...
LegalizeRuleSet & getActionDefinitionsBuilder(unsigned Opcode)
Get the action definition builder for the given opcode.
TypeSize getValue() const
Wrapper class representing physical registers. Should be passed by value.
Definition: MCRegister.h:33
void addSuccessor(MachineBasicBlock *Succ, BranchProbability Prob=BranchProbability::getUnknown())
Add Succ as a successor of this MachineBasicBlock.
MachineBasicBlock * splitAt(MachineInstr &SplitInst, bool UpdateLiveIns=true, LiveIntervals *LIS=nullptr)
Split a basic block into 2 pieces at SplitPoint.
const MachineFunction * getParent() const
Return the MachineFunction containing this basic block.
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
MachineMemOperand * getMachineMemOperand(MachinePointerInfo PtrInfo, MachineMemOperand::Flags f, LLT MemTy, Align base_alignment, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr, SyncScope::ID SSID=SyncScope::System, AtomicOrdering Ordering=AtomicOrdering::NotAtomic, AtomicOrdering FailureOrdering=AtomicOrdering::NotAtomic)
getMachineMemOperand - Allocate a new MachineMemOperand.
DenormalMode getDenormalMode(const fltSemantics &FPType) const
Returns the denormal handling type for the default rounding mode of the function.
void push_back(MachineBasicBlock *MBB)
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
Function & getFunction()
Return the LLVM function that this machine code represents.
const LLVMTargetMachine & getTarget() const
getTarget - Return the target machine this machine code is compiled with
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do...
MachineBasicBlock * CreateMachineBasicBlock(const BasicBlock *BB=nullptr, std::optional< UniqueBBID > BBID=std::nullopt)
CreateMachineBasicBlock - Allocate a new MachineBasicBlock.
Helper class to build MachineInstr.
MachineFunction & getMF()
Getter for the function we currently build.
Register getReg(unsigned Idx) const
Get the register for the operand index.
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
const MachineInstrBuilder & addGlobalAddress(const GlobalValue *GV, int64_t Offset=0, unsigned TargetFlags=0) const
const MachineInstrBuilder & addMBB(MachineBasicBlock *MBB, unsigned TargetFlags=0) const
Representation of each machine instruction.
Definition: MachineInstr.h:69
unsigned getOpcode() const
Returns the opcode of this MachineInstr.
Definition: MachineInstr.h:558
const MachineBasicBlock * getParent() const
Definition: MachineInstr.h:341
const MachineOperand & getOperand(unsigned i) const
Definition: MachineInstr.h:568
A description of a memory reference used in the backend.
LocationSize getSize() const
Return the size in bytes of the memory reference.
LLT getMemoryType() const
Return the memory type of the memory reference.
@ MODereferenceable
The memory access is dereferenceable (i.e., doesn't trap).
@ MOLoad
The memory access reads data.
@ MOInvariant
The memory access always returns the same value (or traps).
Align getAlign() const
Return the minimum known alignment in bytes of the actual memory reference.
MachineOperand class - Representation of each machine instruction operand.
MachineBasicBlock * getMBB() const
void setReg(Register Reg)
Change the register this operand corresponds to.
void setMBB(MachineBasicBlock *MBB)
static MachineOperand CreateImm(int64_t Val)
Register getReg() const
getReg - Returns the register number.
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
MutableArrayRef - Represent a mutable reference to an array (0 or more elements consecutively in memo...
Definition: ArrayRef.h:307
MutableArrayRef< T > drop_front(size_t N=1) const
Drop the first N elements of the array.
Definition: ArrayRef.h:387
MutableArrayRef< T > take_front(size_t N=1) const
Return a copy of *this with only the first N elements.
Definition: ArrayRef.h:412
Wrapper class representing virtual and physical registers.
Definition: Register.h:19
constexpr bool isValid() const
Definition: Register.h:116
constexpr bool isVirtual() const
Return true if the specified register number is in the virtual register namespace.
Definition: Register.h:91
static constexpr bool isPhysicalRegister(unsigned Reg)
Return true if the specified register number is in the physical register namespace.
Definition: Register.h:65
static unsigned getMaxMUBUFImmOffset(const GCNSubtarget &ST)
MachineBasicBlock * insertSimulatedTrap(MachineRegisterInfo &MRI, MachineBasicBlock &MBB, MachineInstr &MI, const DebugLoc &DL) const
Build instructions that simulate the behavior of a s_trap 2 instructions for hardware (namely,...
This class keeps track of the SPI_SP_INPUT_ADDR config register, which tells the hardware which inter...
SIModeRegisterDefaults getMode() const
std::tuple< const ArgDescriptor *, const TargetRegisterClass *, LLT > getPreloadedValue(AMDGPUFunctionArgInfo::PreloadedValue Value) const
static LLVM_READONLY const TargetRegisterClass * getSGPRClassForBitWidth(unsigned BitWidth)
bool allowsMisalignedMemoryAccessesImpl(unsigned Size, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *IsFast=nullptr) const
bool shouldEmitFixup(const GlobalValue *GV) const
bool shouldUseLDSConstAddress(const GlobalValue *GV) const
bool shouldEmitPCReloc(const GlobalValue *GV) const
size_t size() const
Definition: SmallVector.h:91
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
Definition: SmallVector.h:586
void truncate(size_type N)
Like resize, but requires that N is less than size().
Definition: SmallVector.h:657
void resize(size_type N)
Definition: SmallVector.h:651
void push_back(const T &Elt)
Definition: SmallVector.h:426
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1209
int64_t getImm() const
Register getReg() const
bool equals(StringRef RHS) const
equals - Check for string equality, this is more efficient than compare() when the relative ordering ...
Definition: StringRef.h:164
Register getStackPointerRegisterToSaveRestore() const
If a physical register, this specifies the register that llvm.savestack/llvm.restorestack should save...
TargetOptions Options
unsigned UnsafeFPMath
UnsafeFPMath - This flag is enabled when the -enable-unsafe-fp-math flag is specified on the command ...
The instances of the Type class are immutable: once they are created, they are never changed.
Definition: Type.h:45
A Use represents the edge between a Value definition and its users.
Definition: Use.h:43
StringRef getName() const
Return a constant reference to the value's name.
Definition: Value.cpp:309
constexpr ScalarTy getFixedValue() const
Definition: TypeSize.h:187
self_iterator getIterator()
Definition: ilist_node.h:109
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
@ CONSTANT_ADDRESS_32BIT
Address space for 32-bit constant memory.
@ BUFFER_STRIDED_POINTER
Address space for 192-bit fat buffer pointers with an additional index.
@ REGION_ADDRESS
Address space for region memory. (GDS)
@ LOCAL_ADDRESS
Address space for local memory.
@ CONSTANT_ADDRESS
Address space for constant memory (VTX2).
@ FLAT_ADDRESS
Address space for flat memory.
@ GLOBAL_ADDRESS
Address space for global memory (RAT0, VTX0).
@ BUFFER_FAT_POINTER
Address space for 160-bit buffer fat pointers.
@ PRIVATE_ADDRESS
Address space for private memory.
@ BUFFER_RESOURCE
Address space for 128-bit buffer resources.
int getMIMGOpcode(unsigned BaseOpcode, unsigned MIMGEncoding, unsigned VDataDwords, unsigned VAddrDwords)
bool isFlatGlobalAddrSpace(unsigned AS)
Definition: AMDGPU.h:415
bool isGFX12Plus(const MCSubtargetInfo &STI)
bool isEntryFunctionCC(CallingConv::ID CC)
LLVM_READNONE bool isKernel(CallingConv::ID CC)
bool isGFX11(const MCSubtargetInfo &STI)
bool isCompute(CallingConv::ID cc)
unsigned getAMDHSACodeObjectVersion(const Module &M)
bool isGFX11Plus(const MCSubtargetInfo &STI)
LLVM_READONLY const MIMGBaseOpcodeInfo * getMIMGBaseOpcodeInfo(unsigned BaseOpcode)
const ImageDimIntrinsicInfo * getImageDimIntrinsicInfo(unsigned Intr)
std::pair< Register, unsigned > getBaseWithConstantOffset(MachineRegisterInfo &MRI, Register Reg, GISelKnownBits *KnownBits=nullptr, bool CheckNUW=false)
Returns base register and constant offset.
@ MaxID
The highest possible ID. Must be some 2^k - 1.
Definition: CallingConv.h:271
@ AMDGPU_Gfx
Used for AMD graphics targets.
Definition: CallingConv.h:232
@ Fast
Attempts to make calls as fast as possible (e.g.
Definition: CallingConv.h:41
@ C
The default llvm calling convention, compatible with C.
Definition: CallingConv.h:34
LegalityPredicate scalarOrEltWiderThan(unsigned TypeIdx, unsigned Size)
True iff the specified type index is a scalar or a vector with an element type that's wider than the ...
LegalityPredicate isScalar(unsigned TypeIdx)
True iff the specified type index is a scalar.
LegalityPredicate isPointer(unsigned TypeIdx)
True iff the specified type index is a pointer (with any address space).
LegalityPredicate typeInSet(unsigned TypeIdx, std::initializer_list< LLT > TypesInit)
True iff the given type index is one of the specified types.
LegalityPredicate smallerThan(unsigned TypeIdx0, unsigned TypeIdx1)
True iff the first type index has a smaller total bit size than second type index.
LegalityPredicate largerThan(unsigned TypeIdx0, unsigned TypeIdx1)
True iff the first type index has a larger total bit size than second type index.
LegalityPredicate elementTypeIs(unsigned TypeIdx, LLT EltTy)
True if the type index is a vector with element type EltTy.
LegalityPredicate sameSize(unsigned TypeIdx0, unsigned TypeIdx1)
True iff the specified type indices are both the same bit size.
LegalityPredicate scalarOrEltNarrowerThan(unsigned TypeIdx, unsigned Size)
True iff the specified type index is a scalar or vector with an element type that's narrower than the...
LegalityPredicate sizeIs(unsigned TypeIdx, unsigned Size)
True if the total bitwidth of the specified type index is Size bits.
LegalityPredicate typeIsNot(unsigned TypeIdx, LLT Type)
True iff the given type index is not the specified type.
Predicate all(Predicate P0, Predicate P1)
True iff P0 and P1 are true.
LegalityPredicate typeIs(unsigned TypeIdx, LLT TypesInit)
True iff the given type index is the specified type.
LegalityPredicate scalarNarrowerThan(unsigned TypeIdx, unsigned Size)
True iff the specified type index is a scalar that's narrower than the given size.
LegalizeMutation scalarize(unsigned TypeIdx)
Break up the vector type for the given type index into the element type.
LegalizeMutation widenScalarOrEltToNextPow2(unsigned TypeIdx, unsigned Min=0)
Widen the scalar type or vector element type for the given type index to the next power of 2.
LegalizeMutation changeTo(unsigned TypeIdx, LLT Ty)
Select this specific type for the given type index.
@ Implicit
Not emitted register (e.g. carry, or temporary result).
@ ReallyHidden
Definition: CommandLine.h:139
initializer< Ty > init(const Ty &Val)
Definition: CommandLine.h:450
constexpr double inv_pi
Definition: MathExtras.h:38
constexpr double ln2
Definition: MathExtras.h:33
constexpr double ln10
Definition: MathExtras.h:34
constexpr float log2ef
Definition: MathExtras.h:50
constexpr double log2e
Definition: MathExtras.h:35
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:18
Register getFunctionLiveInPhysReg(MachineFunction &MF, const TargetInstrInfo &TII, MCRegister PhysReg, const TargetRegisterClass &RC, const DebugLoc &DL, LLT RegTy=LLT())
Return a virtual register corresponding to the incoming argument register PhysReg.
Definition: Utils.cpp:882
unsigned Log2_32_Ceil(uint32_t Value)
Return the ceil log base 2 of the specified value, 32 if the value is zero.
Definition: MathExtras.h:337
@ Offset
Definition: DWP.cpp:456
MachineInstr * getOpcodeDef(unsigned Opcode, Register Reg, const MachineRegisterInfo &MRI)
See if Reg is defined by an single def instruction that is Opcode.
Definition: Utils.cpp:625
int popcount(T Value) noexcept
Count the number of set bits in a value.
Definition: bit.h:385
const ConstantFP * getConstantFPVRegVal(Register VReg, const MachineRegisterInfo &MRI)
Definition: Utils.cpp:438
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
uint64_t divideCeil(uint64_t Numerator, uint64_t Denominator)
Returns the integer ceil(Numerator / Denominator).
Definition: MathExtras.h:428
detail::scope_exit< std::decay_t< Callable > > make_scope_exit(Callable &&F)
Definition: ScopeExit.h:59
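
make_scope_exit is straightforward to demonstrate in isolation; a minimal sketch of its intended use (the printed strings are purely illustrative):

#include "llvm/ADT/ScopeExit.h"
#include <cstdio>

int main() {
  // The callable runs when Cleanup is destroyed, i.e. when the enclosing
  // scope exits, on every return path.
  auto Cleanup = llvm::make_scope_exit([] { std::puts("scope left"); });
  std::puts("doing work");
  return 0; // "scope left" is printed after this, during scope unwinding
}
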
const llvm::fltSemantics & getFltSemanticForLLT(LLT Ty)
Get the appropriate floating point arithmetic semantic based on the bit size of the given scalar LLT.
void * PointerTy
Definition: GenericValue.h:21
int bit_width(T Value)
Returns the number of bits needed to represent Value if Value is nonzero.
Definition: bit.h:317
uint64_t PowerOf2Ceil(uint64_t A)
Returns the power of two which is greater than or equal to the given value.
Definition: MathExtras.h:372
std::optional< int64_t > getIConstantVRegSExtVal(Register VReg, const MachineRegisterInfo &MRI)
If VReg is defined by a G_CONSTANT that fits in int64_t, returns it.
Definition: Utils.cpp:305
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
Definition: MathExtras.h:275
std::function< std::pair< unsigned, LLT >(const LegalityQuery &)> LegalizeMutation
@ Mul
Product of integers.
@ FMul
Product of floats.
@ Add
Sum of integers.
@ TowardPositive
roundTowardPositive.
@ TowardNegative
roundTowardNegative.
void eraseInstr(MachineInstr &MI, MachineRegisterInfo &MRI, LostDebugLocObserver *LocObserver=nullptr)
Definition: Utils.cpp:1645
@ DS_Warning
std::optional< ValueAndVReg > getIConstantVRegValWithLookThrough(Register VReg, const MachineRegisterInfo &MRI, bool LookThroughInstrs=true)
If VReg is defined by a statically evaluable chain of instructions rooted on a G_CONSTANT, returns its...
Definition: Utils.cpp:413
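
A hedged sketch of how the constant-lookup helpers above are typically queried. The isConstantEqualTo helper is a hypothetical name introduced here for illustration, not an LLVM API.

// Hypothetical helper: true if Reg is a compile-time constant equal to Imm.
#include "llvm/CodeGen/GlobalISel/Utils.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"

using namespace llvm;

static bool isConstantEqualTo(Register Reg, int64_t Imm,
                              const MachineRegisterInfo &MRI) {
  // Direct G_CONSTANT definition whose value fits in int64_t.
  if (std::optional<int64_t> Val = getIConstantVRegSExtVal(Reg, MRI))
    return *Val == Imm;

  // Otherwise look through copies/extensions to find a root G_CONSTANT.
  if (std::optional<ValueAndVReg> VRegAndVal =
          getIConstantVRegValWithLookThrough(Reg, MRI))
    return VRegAndVal->Value.getSExtValue() == Imm;

  return false;
}
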
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Definition: STLExtras.h:1879
Align commonAlignment(Align A, uint64_t Offset)
Returns the alignment that satisfies both alignments.
Definition: Alignment.h:212
unsigned Log2(Align A)
Returns the log2 of the alignment.
Definition: Alignment.h:208
T bit_floor(T Value)
Returns the largest integral power of two no greater than Value if Value is nonzero.
Definition: bit.h:327
constexpr uint64_t NextPowerOf2(uint64_t A)
Returns the next power of two (in 64-bits) that is strictly greater than A.
Definition: MathExtras.h:360
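
The rounding and bit-counting helpers above differ in small but important ways; for example, PowerOf2Ceil is inclusive while NextPowerOf2 is strictly greater. A small self-contained sketch of their documented behaviour (the values are chosen only for illustration):

#include "llvm/ADT/bit.h"
#include "llvm/Support/Alignment.h"
#include "llvm/Support/MathExtras.h"
#include <cassert>

using namespace llvm;

int main() {
  assert(Log2_32_Ceil(24) == 5);        // ceil(log2(24)) = 5
  assert(divideCeil(10, 4) == 3);       // integer ceiling division
  assert(PowerOf2Ceil(16) == 16);       // power of two >= the input
  assert(NextPowerOf2(16) == 32);       // strictly greater than the input
  assert(llvm::bit_floor(20u) == 16u);  // largest power of two <= value
  assert(llvm::bit_width(20u) == 5);    // bits needed to represent 20
  assert(llvm::popcount(0xF0u) == 4);   // number of set bits
  assert(isPowerOf2_32(64));            // 64 is a power of two > 0

  Align A(8), B(16);
  assert(Log2(A) == 3);                          // log2 of an alignment
  assert(commonAlignment(B, /*Offset=*/8) == A); // alignment valid at B + 8
  return 0;
}
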
@ Enable
Enable colors.
std::function< bool(const LegalityQuery &)> LegalityPredicate
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition: BitVector.h:860
#define N
static constexpr uint64_t encode(Fields... Values)
static const fltSemantics & IEEEsingle() LLVM_READNONE
Definition: APFloat.cpp:249
static const fltSemantics & IEEEdouble() LLVM_READNONE
Definition: APFloat.cpp:250
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition: Alignment.h:39
uint64_t value() const
This is a hole in the type system and should not be abused.
Definition: Alignment.h:85
MCRegister getRegister() const
static ArgDescriptor createRegister(Register Reg, unsigned Mask=~0u)
DenormalModeKind Input
Denormal treatment kind for floating point instruction inputs in the default floating-point environme...
@ PreserveSign
The sign of a flushed-to-zero number is preserved in the sign of 0.
@ Dynamic
Denormals have unknown treatment.
static constexpr DenormalMode getPreserveSign()
static constexpr DenormalMode getIEEE()
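
A minimal sketch of querying the denormal treatment documented above for a function's default floating-point environment. The flushesDenormalInputs helper is a hypothetical name; it relies only on MachineFunction::getDenormalMode and APFloat::IEEEsingle(), and getFltSemanticForLLT (listed earlier) can be used to obtain the fltSemantics for an LLT instead.

// Hypothetical helper: true if denormal inputs are flushed for the given
// floating-point semantics in MF's default FP environment.
#include "llvm/ADT/APFloat.h"
#include "llvm/ADT/FloatingPointMode.h"
#include "llvm/CodeGen/MachineFunction.h"

using namespace llvm;

static bool flushesDenormalInputs(const MachineFunction &MF,
                                  const fltSemantics &Sem) {
  DenormalMode Mode = MF.getDenormalMode(Sem);
  // DenormalMode::IEEE preserves denormal inputs; PreserveSign and
  // PositiveZero flush them. Dynamic means the treatment is unknown at
  // compile time, which a real caller would likely handle separately.
  return Mode.Input != DenormalMode::IEEE;
}

// Example: flushesDenormalInputs(MF, APFloat::IEEEsingle()) for f32 values.
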
bool isZero() const
Returns true if value is all zero.
Definition: KnownBits.h:77
The LegalityQuery object bundles together all the information that's needed to decide whether a given...
ArrayRef< MemDesc > MMODescrs
Operations which require memory can use this to place requirements on the memory type for each MMO.
ArrayRef< LLT > Types
Matching combinators.
This class contains a discriminated union of information about pointers in memory operands,...
static MachinePointerInfo getGOT(MachineFunction &MF)
Return a MachinePointerInfo record that refers to a GOT entry.
DenormalMode FP64FP16Denormals
If this is set, neither input nor output denormals are flushed for both f64 and f16/v2f16 instructions...
bool IEEE
Floating point opcodes that support exception flag gathering, quiet and propagate signaling NaN inputs...
DenormalMode FP32Denormals
If this is set, neither input or output denormals are flushed for most f32 instructions.