1//===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8/// \file
9/// This file implements the targeting of the MachineLegalizer class for
10/// AMDGPU.
11/// \todo This should be generated by TableGen.
12//===----------------------------------------------------------------------===//
13
14#include "AMDGPULegalizerInfo.h"
15
16#include "AMDGPU.h"
18#include "AMDGPUInstrInfo.h"
19#include "AMDGPUMemoryUtils.h"
20#include "AMDGPUTargetMachine.h"
22#include "SIInstrInfo.h"
24#include "SIRegisterInfo.h"
26#include "llvm/ADT/ScopeExit.h"
34#include "llvm/IR/IntrinsicsAMDGPU.h"
35#include "llvm/IR/IntrinsicsR600.h"
36
37#define DEBUG_TYPE "amdgpu-legalinfo"
38
39using namespace llvm;
40using namespace LegalizeActions;
41using namespace LegalizeMutations;
42using namespace LegalityPredicates;
43using namespace MIPatternMatch;
44
45// Hack until load/store selection patterns support any tuple of legal types.
46static cl::opt<bool> EnableNewLegality(
47 "amdgpu-global-isel-new-legality",
48 cl::desc("Use GlobalISel desired legality, rather than try to use "
49 "rules compatible with selection patterns"),
50 cl::init(false),
51 cl::ReallyHidden);
52
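// Being a cl::opt, the flag above can be toggled from the command line when
// experimenting with the new legality rules, e.g. (illustrative invocation):
//   llc -global-isel -amdgpu-global-isel-new-legality < test.ll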
53static constexpr unsigned MaxRegisterSize = 1024;
54
55// Round the number of elements to the next power of two elements
56static LLT getPow2VectorType(const LLT Ty) {
57 unsigned NElts = Ty.getNumElements();
58 unsigned Pow2NElts = 1 << Log2_32_Ceil(NElts);
59 return Ty.changeElementCount(ElementCount::getFixed(Pow2NElts));
60}
61
62// Round the number of bits to the next power of two bits
63static LLT getPow2ScalarType(const LLT Ty) {
64 unsigned Bits = Ty.getSizeInBits();
65 unsigned Pow2Bits = 1 << Log2_32_Ceil(Bits);
66 return LLT::scalar(Pow2Bits);
67}
68
69/// \returns true if this is an odd sized vector which should be widened by
70/// adding an additional element. This is mostly to handle <3 x s16> ->
71/// <4 x s16>. This excludes s1 vectors, which should always be scalarized.
72static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
73 return [=](const LegalityQuery &Query) {
74 const LLT Ty = Query.Types[TypeIdx];
75 if (!Ty.isVector())
76 return false;
77
78 const LLT EltTy = Ty.getElementType();
79 const unsigned EltSize = EltTy.getSizeInBits();
80 return Ty.getNumElements() % 2 != 0 &&
81 EltSize > 1 && EltSize < 32 &&
82 Ty.getSizeInBits() % 32 != 0;
83 };
84}
85
86static LegalityPredicate sizeIsMultipleOf32(unsigned TypeIdx) {
87 return [=](const LegalityQuery &Query) {
88 const LLT Ty = Query.Types[TypeIdx];
89 return Ty.getSizeInBits() % 32 == 0;
90 };
91}
92
93static LegalityPredicate isWideVec16(unsigned TypeIdx) {
94 return [=](const LegalityQuery &Query) {
95 const LLT Ty = Query.Types[TypeIdx];
96 const LLT EltTy = Ty.getScalarType();
97 return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2;
98 };
99}
100
101static LegalizeMutation oneMoreElement(unsigned TypeIdx) {
102 return [=](const LegalityQuery &Query) {
103 const LLT Ty = Query.Types[TypeIdx];
104 const LLT EltTy = Ty.getElementType();
105 return std::pair(TypeIdx,
106 LLT::fixed_vector(Ty.getNumElements() + 1, EltTy));
107 };
108}
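// A predicate/mutation pair such as the two helpers above is meant to be
// plugged into a rule set. A minimal sketch (the opcode here is only for
// illustration) looks like:
//   getActionDefinitionsBuilder(G_AND)
//       .moreElementsIf(isSmallOddVector(0), oneMoreElement(0));
// which pads e.g. a <3 x s16> operand out to <4 x s16> before later rules run.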
109
110static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) {
111 return [=](const LegalityQuery &Query) {
112 const LLT Ty = Query.Types[TypeIdx];
113 const LLT EltTy = Ty.getElementType();
114 unsigned Size = Ty.getSizeInBits();
115 unsigned Pieces = (Size + 63) / 64;
116 unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
117 return std::pair(TypeIdx, LLT::scalarOrVector(
118 ElementCount::getFixed(NewNumElts), EltTy));
119 };
120}
121
122// Increase the number of vector elements to reach the next multiple of 32-bit
123// type.
124static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) {
125 return [=](const LegalityQuery &Query) {
126 const LLT Ty = Query.Types[TypeIdx];
127
128 const LLT EltTy = Ty.getElementType();
129 const int Size = Ty.getSizeInBits();
130 const int EltSize = EltTy.getSizeInBits();
131 const int NextMul32 = (Size + 31) / 32;
132
133 assert(EltSize < 32);
134
135 const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize;
136 return std::pair(TypeIdx, LLT::fixed_vector(NewNumElts, EltTy));
137 };
138}
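// Worked example of the computation above: for <5 x s16>, Size = 80 and
// EltSize = 16, so NextMul32 = 3 and NewNumElts = (96 + 15) / 16 = 6, i.e.
// <5 x s16> is widened to <6 x s16> (96 bits, a multiple of 32).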
139
140// Increase the number of vector elements to reach the next legal RegClass.
141static LegalizeMutation moreElementsToNextExistingRegClass(unsigned TypeIdx) {
142 return [=](const LegalityQuery &Query) {
143 const LLT Ty = Query.Types[TypeIdx];
144 const unsigned NumElts = Ty.getNumElements();
145 const unsigned EltSize = Ty.getElementType().getSizeInBits();
146 const unsigned MaxNumElts = MaxRegisterSize / EltSize;
147
148 assert(EltSize == 32 || EltSize == 64);
150
151 unsigned NewNumElts;
152 // Find the nearest legal RegClass that is larger than the current type.
153 for (NewNumElts = NumElts; NewNumElts < MaxNumElts; ++NewNumElts) {
154 if (SIRegisterInfo::getSGPRClassForBitWidth(NewNumElts * EltSize))
155 break;
156 }
157 return std::pair(TypeIdx,
158 LLT::fixed_vector(NewNumElts, Ty.getElementType()));
159 };
160}
161
162static LLT getBufferRsrcScalarType(const LLT Ty) {
163 if (!Ty.isVector())
164 return LLT::scalar(128);
165 const ElementCount NumElems = Ty.getElementCount();
166 return LLT::vector(NumElems, LLT::scalar(128));
167}
168
169static LLT getBufferRsrcRegisterType(const LLT Ty) {
170 if (!Ty.isVector())
171 return LLT::fixed_vector(4, LLT::scalar(32));
172 const unsigned NumElems = Ty.getElementCount().getFixedValue();
173 return LLT::fixed_vector(NumElems * 4, LLT::scalar(32));
174}
175
176static LLT getBitcastRegisterType(const LLT Ty) {
177 const unsigned Size = Ty.getSizeInBits();
178
179 if (Size <= 32) {
180 // <2 x s8> -> s16
181 // <4 x s8> -> s32
182 return LLT::scalar(Size);
183 }
184
185 return LLT::scalarOrVector(ElementCount::getFixed(Size / 32), 32);
186}
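// Examples of the mapping above: <2 x s8> -> s16 and <4 x s8> -> s32 stay
// scalar, while wider types become 32-bit element vectors, e.g.
// <4 x s16> (64 bits) -> <2 x s32> and <8 x s16> (128 bits) -> <4 x s32>.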
187
188static LegalizeMutation bitcastToRegisterType(unsigned TypeIdx) {
189 return [=](const LegalityQuery &Query) {
190 const LLT Ty = Query.Types[TypeIdx];
191 return std::pair(TypeIdx, getBitcastRegisterType(Ty));
192 };
193}
194
195static LegalizeMutation bitcastToVectorElement32(unsigned TypeIdx) {
196 return [=](const LegalityQuery &Query) {
197 const LLT Ty = Query.Types[TypeIdx];
198 unsigned Size = Ty.getSizeInBits();
199 assert(Size % 32 == 0);
200 return std::pair(
201 TypeIdx, LLT::fixed_vector(Size / 32, 32));
202 };
203}
204
205static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) {
206 return [=](const LegalityQuery &Query) {
207 const LLT QueryTy = Query.Types[TypeIdx];
208 return QueryTy.isVector() && QueryTy.getSizeInBits() < Size;
209 };
210}
211
212static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) {
213 return [=](const LegalityQuery &Query) {
214 const LLT QueryTy = Query.Types[TypeIdx];
215 return QueryTy.isVector() && QueryTy.getSizeInBits() > Size;
216 };
217}
218
219static LegalityPredicate numElementsNotEven(unsigned TypeIdx) {
220 return [=](const LegalityQuery &Query) {
221 const LLT QueryTy = Query.Types[TypeIdx];
222 return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0;
223 };
224}
225
226static bool isRegisterSize(unsigned Size) {
227 return Size % 32 == 0 && Size <= MaxRegisterSize;
228}
229
230static bool isRegisterVectorElementType(LLT EltTy) {
231 const int EltSize = EltTy.getSizeInBits();
232 return EltSize == 16 || EltSize % 32 == 0;
233}
234
235static bool isRegisterVectorType(LLT Ty) {
236 const int EltSize = Ty.getElementType().getSizeInBits();
237 return EltSize == 32 || EltSize == 64 ||
238 (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
239 EltSize == 128 || EltSize == 256;
240}
241
242// TODO: replace all uses of isRegisterType with isRegisterClassType
243static bool isRegisterType(LLT Ty) {
244 if (!isRegisterSize(Ty.getSizeInBits()))
245 return false;
246
247 if (Ty.isVector())
248 return isRegisterVectorType(Ty);
249
250 return true;
251}
252
253// Any combination of 32 or 64-bit elements up to the maximum register size,
254// and multiples of v2s16.
255static LegalityPredicate isRegisterType(unsigned TypeIdx) {
256 return [=](const LegalityQuery &Query) {
257 return isRegisterType(Query.Types[TypeIdx]);
258 };
259}
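// For reference, isRegisterType accepts e.g. s32, s64, v2s16, v4s16 and v2s64,
// but rejects v3s16 (48 bits is not a multiple of 32) and v8s8 (the total size
// is fine, but 8-bit elements are not a register element type).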
260
261// RegisterType that doesn't have a corresponding RegClass.
262// TODO: Once `isRegisterType` is replaced with `isRegisterClassType` this
263// should be removed.
264static LegalityPredicate isIllegalRegisterType(unsigned TypeIdx) {
265 return [=](const LegalityQuery &Query) {
266 LLT Ty = Query.Types[TypeIdx];
267 return isRegisterType(Ty) &&
268 !SIRegisterInfo::getSGPRClassForBitWidth(Ty.getSizeInBits());
269 };
270}
271
272static LegalityPredicate elementTypeIsLegal(unsigned TypeIdx) {
273 return [=](const LegalityQuery &Query) {
274 const LLT QueryTy = Query.Types[TypeIdx];
275 if (!QueryTy.isVector())
276 return false;
277 const LLT EltTy = QueryTy.getElementType();
278 return EltTy == LLT::scalar(16) || EltTy.getSizeInBits() >= 32;
279 };
280}
281
282static const LLT S1 = LLT::scalar(1);
283static const LLT S8 = LLT::scalar(8);
284static const LLT S16 = LLT::scalar(16);
285static const LLT S32 = LLT::scalar(32);
286static const LLT F32 = LLT::float32();
287static const LLT S64 = LLT::scalar(64);
288static const LLT F64 = LLT::float64();
289static const LLT S96 = LLT::scalar(96);
290static const LLT S128 = LLT::scalar(128);
291static const LLT S160 = LLT::scalar(160);
292static const LLT S192 = LLT::scalar(192);
293static const LLT S224 = LLT::scalar(224);
294static const LLT S256 = LLT::scalar(256);
295static const LLT S512 = LLT::scalar(512);
296static const LLT S1024 = LLT::scalar(1024);
298
299static const LLT V2S8 = LLT::fixed_vector(2, 8);
300static const LLT V2S16 = LLT::fixed_vector(2, 16);
301static const LLT V4S16 = LLT::fixed_vector(4, 16);
302static const LLT V6S16 = LLT::fixed_vector(6, 16);
303static const LLT V8S16 = LLT::fixed_vector(8, 16);
304static const LLT V10S16 = LLT::fixed_vector(10, 16);
305static const LLT V12S16 = LLT::fixed_vector(12, 16);
306static const LLT V16S16 = LLT::fixed_vector(16, 16);
307
308static const LLT V2F16 = LLT::fixed_vector(2, LLT::float16());
309static const LLT V2BF16 = V2F16; // FIXME
310
311static const LLT V2S32 = LLT::fixed_vector(2, 32);
312static const LLT V3S32 = LLT::fixed_vector(3, 32);
313static const LLT V4S32 = LLT::fixed_vector(4, 32);
314static const LLT V5S32 = LLT::fixed_vector(5, 32);
315static const LLT V6S32 = LLT::fixed_vector(6, 32);
316static const LLT V7S32 = LLT::fixed_vector(7, 32);
317static const LLT V8S32 = LLT::fixed_vector(8, 32);
318static const LLT V9S32 = LLT::fixed_vector(9, 32);
319static const LLT V10S32 = LLT::fixed_vector(10, 32);
320static const LLT V11S32 = LLT::fixed_vector(11, 32);
321static const LLT V12S32 = LLT::fixed_vector(12, 32);
322static const LLT V16S32 = LLT::fixed_vector(16, 32);
323static const LLT V32S32 = LLT::fixed_vector(32, 32);
324
325static const LLT V2S64 = LLT::fixed_vector(2, 64);
326static const LLT V3S64 = LLT::fixed_vector(3, 64);
327static const LLT V4S64 = LLT::fixed_vector(4, 64);
328static const LLT V5S64 = LLT::fixed_vector(5, 64);
329static const LLT V6S64 = LLT::fixed_vector(6, 64);
330static const LLT V7S64 = LLT::fixed_vector(7, 64);
331static const LLT V8S64 = LLT::fixed_vector(8, 64);
332static const LLT V16S64 = LLT::fixed_vector(16, 64);
333
334static const LLT V2S128 = LLT::fixed_vector(2, 128);
335static const LLT V4S128 = LLT::fixed_vector(4, 128);
336
337static std::initializer_list<LLT> AllScalarTypes = {
338 S32, S64, S96, S128, S160, S192, S224, S256, S512, S1024};
339
340static std::initializer_list<LLT> AllS16Vectors{
341 V2S16, V4S16, V6S16, V8S16, V10S16, V12S16, V16S16, V2S128, V4S128};
342
343static std::initializer_list<LLT> AllS32Vectors = {
344 V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32,
345 V9S32, V10S32, V11S32, V12S32, V16S32, V32S32};
346
347static std::initializer_list<LLT> AllS64Vectors = {V2S64, V3S64, V4S64, V5S64,
348 V6S64, V7S64, V8S64, V16S64};
349
350// Checks whether a type is in the list of legal register types.
351static bool isRegisterClassType(LLT Ty) {
352 if (Ty.isPointerOrPointerVector())
353 Ty = Ty.changeElementType(LLT::scalar(Ty.getScalarSizeInBits()));
354
355 return is_contained(AllS32Vectors, Ty) || is_contained(AllS64Vectors, Ty) ||
356 is_contained(AllScalarTypes, Ty) || is_contained(AllS16Vectors, Ty);
357}
358
359static LegalityPredicate isRegisterClassType(unsigned TypeIdx) {
360 return [TypeIdx](const LegalityQuery &Query) {
361 return isRegisterClassType(Query.Types[TypeIdx]);
362 };
363}
364
365// If we have a truncating store or an extending load with a data size larger
366// than 32-bits, we need to reduce to a 32-bit type.
367static LegalityPredicate isWideScalarExtLoadTruncStore(unsigned TypeIdx) {
368 return [=](const LegalityQuery &Query) {
369 const LLT Ty = Query.Types[TypeIdx];
370 return !Ty.isVector() && Ty.getSizeInBits() > 32 &&
371 Query.MMODescrs[0].MemoryTy.getSizeInBits() < Ty.getSizeInBits();
372 };
373}
374
375// TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we
376// handle some operations by just promoting the register during
377// selection. There are also d16 loads on GFX9+ which preserve the high bits.
378static unsigned maxSizeForAddrSpace(const GCNSubtarget &ST, unsigned AS,
379 bool IsLoad, bool IsAtomic) {
380 switch (AS) {
381 case AMDGPUAS::PRIVATE_ADDRESS:
382 // FIXME: Private element size.
383 return ST.enableFlatScratch() ? 128 : 32;
384 case AMDGPUAS::LOCAL_ADDRESS:
385 return ST.useDS128() ? 128 : 64;
386 case AMDGPUAS::GLOBAL_ADDRESS:
387 case AMDGPUAS::CONSTANT_ADDRESS:
388 case AMDGPUAS::CONSTANT_ADDRESS_32BIT:
389 case AMDGPUAS::BUFFER_RESOURCE:
390 // Treat constant and global as identical. SMRD loads are sometimes usable for
391 // global loads (ideally constant address space should be eliminated)
392 // depending on the context. Legality cannot be context dependent, but
393 // RegBankSelect can split the load as necessary depending on the pointer
394 // register bank/uniformity and if the memory is invariant or not written in a
395 // kernel.
396 return IsLoad ? 512 : 128;
397 default:
398 // FIXME: Flat addresses may contextually need to be split to 32-bit parts
399 // if they may alias scratch depending on the subtarget. This needs to be
400 // moved to custom handling to use addressMayBeAccessedAsPrivate
401 return ST.hasMultiDwordFlatScratchAddressing() || IsAtomic ? 128 : 32;
402 }
403}
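// For example, a non-atomic global load may be up to 512 bits wide while a
// global store is capped at 128 bits, and a private (scratch) access is
// limited to 32 bits unless flat scratch is enabled, which raises it to 128.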
404
405static bool isLoadStoreSizeLegal(const GCNSubtarget &ST,
406 const LegalityQuery &Query) {
407 const LLT Ty = Query.Types[0];
408
409 // Handle G_LOAD, G_ZEXTLOAD, G_SEXTLOAD
410 const bool IsLoad = Query.Opcode != AMDGPU::G_STORE;
411
412 unsigned RegSize = Ty.getSizeInBits();
413 uint64_t MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
414 uint64_t AlignBits = Query.MMODescrs[0].AlignInBits;
415 unsigned AS = Query.Types[1].getAddressSpace();
416
417 // All of these need to be custom lowered to cast the pointer operand.
418 if (AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT)
419 return false;
420
421 // Do not handle extending vector loads.
422 if (Ty.isVector() && MemSize != RegSize)
423 return false;
424
425 // TODO: We should be able to widen loads if the alignment is high enough, but
426 // we also need to modify the memory access size.
427#if 0
428 // Accept widening loads based on alignment.
429 if (IsLoad && MemSize < Size)
430 MemSize = std::max(MemSize, Align);
431#endif
432
433 // Only 1-byte and 2-byte to 32-bit extloads are valid.
434 if (MemSize != RegSize && RegSize != 32)
435 return false;
436
437 if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad,
438 Query.MMODescrs[0].Ordering !=
439 AtomicOrdering::NotAtomic))
440 return false;
441
442 switch (MemSize) {
443 case 8:
444 case 16:
445 case 32:
446 case 64:
447 case 128:
448 break;
449 case 96:
450 if (!ST.hasDwordx3LoadStores())
451 return false;
452 break;
453 case 256:
454 case 512:
455 // These may contextually need to be broken down.
456 break;
457 default:
458 return false;
459 }
460
461 assert(RegSize >= MemSize);
462
463 if (AlignBits < MemSize) {
464 const SITargetLowering *TLI = ST.getTargetLowering();
465 if (!TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS,
466 Align(AlignBits / 8)))
467 return false;
468 }
469
470 return true;
471}
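// Concretely, the checks above accept an extending s8 -> s32 global load
// (1-byte extload into a 32-bit register), reject an s16 -> s64 extload
// (extloads are only allowed into 32-bit registers), and reject a 96-bit
// access on subtargets without dwordx3 load/store support.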
472
473// The newer buffer intrinsic forms take their resource arguments as
474// pointers in address space 8, aka s128 values. However, in order not to break
475// SelectionDAG, the underlying operations have to continue to take v4i32
476// arguments. Therefore, we convert resource pointers - or vectors of them -
477// to integer values here.
478static bool hasBufferRsrcWorkaround(const LLT Ty) {
479 if (Ty.isPointer() && Ty.getAddressSpace() == AMDGPUAS::BUFFER_RESOURCE)
480 return true;
481 if (Ty.isVector()) {
482 const LLT ElemTy = Ty.getElementType();
483 return hasBufferRsrcWorkaround(ElemTy);
484 }
485 return false;
486}
487
488// The current selector can't handle <6 x s16>, <8 x s16>, s96, s128 etc, so
489// workaround this. Eventually it should ignore the type for loads and only care
490// about the size. Return true in cases where we will workaround this for now by
491// bitcasting.
492static bool loadStoreBitcastWorkaround(const LLT Ty) {
493 if (EnableNewLegality)
494 return false;
495
496 const unsigned Size = Ty.getSizeInBits();
497 if (Ty.isPointerVector())
498 return true;
499 if (Size <= 64)
500 return false;
501 // Address space 8 pointers get their own workaround.
502 if (hasBufferRsrcWorkaround(Ty))
503 return false;
504 if (!Ty.isVector())
505 return true;
506
507 unsigned EltSize = Ty.getScalarSizeInBits();
508 return EltSize != 32 && EltSize != 64;
509}
510
511static bool isLoadStoreLegal(const GCNSubtarget &ST, const LegalityQuery &Query) {
512 const LLT Ty = Query.Types[0];
513 return isRegisterType(Ty) && isLoadStoreSizeLegal(ST, Query) &&
514 !hasBufferRsrcWorkaround(Ty) && !loadStoreBitcastWorkaround(Ty);
515}
516
517/// Return true if a load or store of the type should be lowered with a bitcast
518/// to a different type.
519static bool shouldBitcastLoadStoreType(const GCNSubtarget &ST, const LLT Ty,
520 const LLT MemTy) {
521 const unsigned MemSizeInBits = MemTy.getSizeInBits();
522 const unsigned Size = Ty.getSizeInBits();
523 if (Size != MemSizeInBits)
524 return Size <= 32 && Ty.isVector();
525
526 if (loadStoreBitcastWorkaround(Ty) && isRegisterType(Ty))
527 return true;
528
529 // Don't try to handle bitcasting vector ext loads for now.
530 return Ty.isVector() && (!MemTy.isVector() || MemTy == Ty) &&
531 (Size <= 32 || isRegisterSize(Size)) &&
532 !isRegisterVectorElementType(Ty.getElementType());
533}
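// For instance, a non-extending <6 x s16> load (96 bits) is bitcast by the
// rules using this helper to <3 x s32>, while <4 x s16> is left alone since
// its 16-bit elements already form a register-sized, directly usable type.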
534
535/// Return true if we should legalize a load by widening an odd sized memory
536/// access up to the alignment. Note this is the case where the memory access
537/// itself changes, not the size of the result register.
538static bool shouldWidenLoad(const GCNSubtarget &ST, LLT MemoryTy,
539 uint64_t AlignInBits, unsigned AddrSpace,
540 unsigned Opcode) {
541 unsigned SizeInBits = MemoryTy.getSizeInBits();
542 // We don't want to widen cases that are naturally legal.
543 if (isPowerOf2_32(SizeInBits))
544 return false;
545
546 // If we have 96-bit memory operations, we shouldn't touch them. Note we may
547 // end up widening these for a scalar load during RegBankSelect, if we don't
548 // have 96-bit scalar loads.
549 if (SizeInBits == 96 && ST.hasDwordx3LoadStores())
550 return false;
551
552 if (SizeInBits >= maxSizeForAddrSpace(ST, AddrSpace, Opcode, false))
553 return false;
554
555 // A load is known dereferenceable up to the alignment, so it's legal to widen
556 // to it.
557 //
558 // TODO: Could check dereferenceable for less aligned cases.
559 unsigned RoundedSize = NextPowerOf2(SizeInBits);
560 if (AlignInBits < RoundedSize)
561 return false;
562
563 // Do not widen if it would introduce a slow unaligned load.
564 const SITargetLowering *TLI = ST.getTargetLowering();
565 unsigned Fast = 0;
566 return TLI->allowsMisalignedMemoryAccessesImpl(
567 RoundedSize, AddrSpace, Align(AlignInBits / 8),
568 MachineMemOperand::MOLoad, &Fast) &&
569 Fast;
570}
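// Worked example: a 96-bit load that is 128-bit aligned, on a subtarget
// without 96-bit load/store support, is widened here to a 128-bit load
// (subject to the TLI fast-access check at the end); the same load with only
// 32-bit alignment is left to the splitting rules instead.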
571
572static bool shouldWidenLoad(const GCNSubtarget &ST, const LegalityQuery &Query,
573 unsigned Opcode) {
574 if (Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic)
575 return false;
576
577 return shouldWidenLoad(ST, Query.MMODescrs[0].MemoryTy,
578 Query.MMODescrs[0].AlignInBits,
579 Query.Types[1].getAddressSpace(), Opcode);
580}
581
582/// Mutates IR (typically a load instruction) to use a <4 x s32> as the initial
583/// type of the operand `idx` and then to transform it to a `p8` via bitcasts
584/// and inttoptr. In addition, handle vectors of p8. Returns the new type.
585static LLT castBufferRsrcFromV4I32(MachineInstr &MI, MachineIRBuilder &B,
586 MachineRegisterInfo &MRI, unsigned Idx) {
587 MachineOperand &MO = MI.getOperand(Idx);
588
589 const LLT PointerTy = MRI.getType(MO.getReg());
590
591 // Paranoidly prevent us from doing this multiple times.
592 if (!hasBufferRsrcWorkaround(PointerTy))
593 return PointerTy;
594
595 const LLT ScalarTy = getBufferRsrcScalarType(PointerTy);
596 const LLT VectorTy = getBufferRsrcRegisterType(PointerTy);
597 if (!PointerTy.isVector()) {
598 // Happy path: (4 x s32) -> (s32, s32, s32, s32) -> (p8)
599 const unsigned NumParts = PointerTy.getSizeInBits() / 32;
600 const LLT S32 = LLT::scalar(32);
601
602 Register VectorReg = MRI.createGenericVirtualRegister(VectorTy);
603 std::array<Register, 4> VectorElems;
604 B.setInsertPt(B.getMBB(), ++B.getInsertPt());
605 for (unsigned I = 0; I < NumParts; ++I)
606 VectorElems[I] =
607 B.buildExtractVectorElementConstant(S32, VectorReg, I).getReg(0);
608 B.buildMergeValues(MO, VectorElems);
609 MO.setReg(VectorReg);
610 return VectorTy;
611 }
612 Register BitcastReg = MRI.createGenericVirtualRegister(VectorTy);
613 B.setInsertPt(B.getMBB(), ++B.getInsertPt());
614 auto Scalar = B.buildBitcast(ScalarTy, BitcastReg);
615 B.buildIntToPtr(MO, Scalar);
616 MO.setReg(BitcastReg);
617
618 return VectorTy;
619}
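// Rough MIR sketch of the happy (scalar p8) path above, with invented register
// names: a definition like
//   %rsrc:_(p8) = G_LOAD %ptr ...
// is retyped so the instruction produces %vec:_(<4 x s32>) instead, and the
// original p8 value is rebuilt after it from the four lanes via
// G_EXTRACT_VECTOR_ELT of %vec followed by a merge of the four s32 pieces.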
620
621/// Cast a buffer resource (an address space 8 pointer) into a 4xi32, which is
622/// the form in which the value must be in order to be passed to the low-level
623/// representations used for MUBUF/MTBUF intrinsics. This is a hack, which is
624/// needed in order to account for the fact that we can't define a register
625/// class for s128 without breaking SelectionDAG.
626static Register castBufferRsrcToV4I32(Register Pointer, MachineIRBuilder &B) {
627 MachineRegisterInfo &MRI = *B.getMRI();
628 const LLT PointerTy = MRI.getType(Pointer);
629 const LLT ScalarTy = getBufferRsrcScalarType(PointerTy);
630 const LLT VectorTy = getBufferRsrcRegisterType(PointerTy);
631
632 if (!PointerTy.isVector()) {
633 // Special case: p8 -> (s32, s32, s32, s32) -> (4xs32)
634 SmallVector<Register, 4> PointerParts;
635 const unsigned NumParts = PointerTy.getSizeInBits() / 32;
636 auto Unmerged = B.buildUnmerge(LLT::scalar(32), Pointer);
637 for (unsigned I = 0; I < NumParts; ++I)
638 PointerParts.push_back(Unmerged.getReg(I));
639 return B.buildBuildVector(VectorTy, PointerParts).getReg(0);
640 }
641 Register Scalar = B.buildPtrToInt(ScalarTy, Pointer).getReg(0);
642 return B.buildBitcast(VectorTy, Scalar).getReg(0);
643}
644
645static void castBufferRsrcArgToV4I32(MachineInstr &MI, MachineIRBuilder &B,
646 unsigned Idx) {
647 MachineOperand &MO = MI.getOperand(Idx);
648
649 const LLT PointerTy = B.getMRI()->getType(MO.getReg());
650 // Paranoidly prevent us from doing this multiple times.
651 if (!hasBufferRsrcWorkaround(PointerTy))
652 return;
653 MO.setReg(castBufferRsrcToV4I32(MO.getReg(), B));
654}
655
656AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
657 const GCNTargetMachine &TM)
658 : ST(ST_) {
659 using namespace TargetOpcode;
660
661 auto GetAddrSpacePtr = [&TM](unsigned AS) {
662 return LLT::pointer(AS, TM.getPointerSizeInBits(AS));
663 };
664
665 const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS);
666 const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS);
667 const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT);
668 const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS);
669 const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS);
670 const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS);
671 const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS);
672 const LLT BufferFatPtr = GetAddrSpacePtr(AMDGPUAS::BUFFER_FAT_POINTER);
673 const LLT RsrcPtr = GetAddrSpacePtr(AMDGPUAS::BUFFER_RESOURCE);
674 const LLT BufferStridedPtr =
675 GetAddrSpacePtr(AMDGPUAS::BUFFER_STRIDED_POINTER);
676
677 const LLT CodePtr = FlatPtr;
678
679 const std::initializer_list<LLT> AddrSpaces64 = {
680 GlobalPtr, ConstantPtr, FlatPtr
681 };
682
683 const std::initializer_list<LLT> AddrSpaces32 = {
684 LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr
685 };
686
687 const std::initializer_list<LLT> AddrSpaces128 = {RsrcPtr};
688
689 const std::initializer_list<LLT> FPTypesBase = {
690 S32, S64
691 };
692
693 const std::initializer_list<LLT> FPTypes16 = {
694 S32, S64, S16
695 };
696
697 const std::initializer_list<LLT> FPTypesPK16 = {
698 S32, S64, S16, V2S16
699 };
700
701 const LLT MinScalarFPTy = ST.has16BitInsts() ? S16 : S32;
702
703 // s1 for VCC branches, s32 for SCC branches.
704 getActionDefinitionsBuilder(G_BRCOND).legalFor({S1, S32});
705
706 // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more
707 // elements for v3s16
708 getActionDefinitionsBuilder(G_PHI)
709 .legalFor({S32, S64, V2S16, S16, V4S16, S1, S128, S256})
710 .legalFor(AllS32Vectors)
711 .legalFor(AllS64Vectors)
712 .legalFor(AddrSpaces64)
713 .legalFor(AddrSpaces32)
714 .legalFor(AddrSpaces128)
715 .legalIf(isPointer(0))
716 .clampScalar(0, S16, S256)
717 .widenScalarToNextPow2(0, 32)
718 .clampMaxNumElements(0, S32, 16)
719 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
720 .scalarize(0);
721
722 if (ST.hasVOP3PInsts() && ST.hasAddNoCarry() && ST.hasIntClamp()) {
723 // Full set of gfx9 features.
724 if (ST.hasScalarAddSub64()) {
725 getActionDefinitionsBuilder({G_ADD, G_SUB})
726 .legalFor({S64, S32, S16, V2S16})
727 .clampMaxNumElementsStrict(0, S16, 2)
728 .scalarize(0)
729 .minScalar(0, S16)
730 .widenScalarToNextMultipleOf(0, 32)
731 .maxScalar(0, S32);
732 } else {
733 getActionDefinitionsBuilder({G_ADD, G_SUB})
734 .legalFor({S32, S16, V2S16})
735 .clampMaxNumElementsStrict(0, S16, 2)
736 .scalarize(0)
737 .minScalar(0, S16)
738 .widenScalarToNextMultipleOf(0, 32)
739 .maxScalar(0, S32);
740 }
741
742 if (ST.hasScalarSMulU64()) {
743 getActionDefinitionsBuilder(G_MUL)
744 .legalFor({S64, S32, S16, V2S16})
745 .clampMaxNumElementsStrict(0, S16, 2)
746 .scalarize(0)
747 .minScalar(0, S16)
748 .widenScalarToNextMultipleOf(0, 32)
749 .custom();
750 } else {
751 getActionDefinitionsBuilder(G_MUL)
752 .legalFor({S32, S16, V2S16})
753 .clampMaxNumElementsStrict(0, S16, 2)
754 .scalarize(0)
755 .minScalar(0, S16)
756 .widenScalarToNextMultipleOf(0, 32)
757 .custom();
758 }
759 assert(ST.hasMad64_32());
760
761 getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT, G_SADDSAT, G_SSUBSAT})
762 .legalFor({S32, S16, V2S16}) // Clamp modifier
763 .minScalarOrElt(0, S16)
764 .clampMaxNumElements(0, S16, 2)
765 .scalarize(0)
766 .widenScalarToNextPow2(0, 32)
767 .lower();
768 } else if (ST.has16BitInsts()) {
769 getActionDefinitionsBuilder({G_ADD, G_SUB})
770 .legalFor({S32, S16})
771 .minScalar(0, S16)
772 .widenScalarToNextMultipleOf(0, 32)
773 .maxScalar(0, S32)
774 .scalarize(0);
775
776 getActionDefinitionsBuilder(G_MUL)
777 .legalFor({S32, S16})
778 .scalarize(0)
779 .minScalar(0, S16)
780 .widenScalarToNextMultipleOf(0, 32)
781 .custom();
782 assert(ST.hasMad64_32());
783
784 // Technically the saturating operations require clamp bit support, but this
785 // was introduced at the same time as 16-bit operations.
786 getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
787 .legalFor({S32, S16}) // Clamp modifier
788 .minScalar(0, S16)
789 .scalarize(0)
791 .lower();
792
793 // We're just lowering this, but it helps get a better result to try to
794 // coerce to the desired type first.
795 getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
796 .minScalar(0, S16)
797 .scalarize(0)
798 .lower();
799 } else {
800 getActionDefinitionsBuilder({G_ADD, G_SUB})
801 .legalFor({S32})
802 .widenScalarToNextMultipleOf(0, 32)
803 .clampScalar(0, S32, S32)
804 .scalarize(0);
805
806 auto &Mul = getActionDefinitionsBuilder(G_MUL)
807 .legalFor({S32})
808 .scalarize(0)
809 .minScalar(0, S32)
810 .widenScalarToNextMultipleOf(0, 32);
811
812 if (ST.hasMad64_32())
813 Mul.custom();
814 else
815 Mul.maxScalar(0, S32);
816
817 if (ST.hasIntClamp()) {
818 getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
819 .legalFor({S32}) // Clamp modifier.
820 .scalarize(0)
821 .minScalarOrElt(0, S32)
822 .lower();
823 } else {
824 // Clamp bit support was added in VI, along with 16-bit operations.
825 getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
826 .minScalar(0, S32)
827 .scalarize(0)
828 .lower();
829 }
830
831 // FIXME: DAG expansion gets better results. The widening uses the smaller
832 // range values and goes for the min/max lowering directly.
833 getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
834 .minScalar(0, S32)
835 .scalarize(0)
836 .lower();
837 }
838
839 getActionDefinitionsBuilder(
840 {G_SDIV, G_UDIV, G_SREM, G_UREM, G_SDIVREM, G_UDIVREM})
841 .customFor({S32, S64})
842 .clampScalar(0, S32, S64)
843 .widenScalarToNextPow2(0, 32)
844 .scalarize(0);
845
846 auto &Mulh = getActionDefinitionsBuilder({G_UMULH, G_SMULH})
847 .legalFor({S32})
848 .maxScalar(0, S32);
849
850 if (ST.hasVOP3PInsts()) {
851 Mulh
852 .clampMaxNumElements(0, S8, 2)
853 .lowerFor({V2S8});
854 }
855
856 Mulh
857 .scalarize(0)
858 .lower();
859
860 // Report legal for any types we can handle anywhere. For the cases only legal
861 // on the SALU, RegBankSelect will be able to re-legalize.
862 getActionDefinitionsBuilder({G_AND, G_OR, G_XOR})
863 .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16})
864 .clampScalar(0, S32, S64)
865 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
866 .fewerElementsIf(vectorWiderThan(0, 64), fewerEltsToSize64Vector(0))
867 .widenScalarToNextPow2(0)
868 .scalarize(0);
869
870 getActionDefinitionsBuilder(
871 {G_UADDO, G_USUBO, G_UADDE, G_SADDE, G_USUBE, G_SSUBE})
872 .legalFor({{S32, S1}, {S32, S32}})
873 .clampScalar(0, S32, S32)
874 .scalarize(0);
875
876 getActionDefinitionsBuilder(G_BITCAST)
877 // Don't worry about the size constraint.
878 .legalIf(all(isRegisterClassType(0), isRegisterClassType(1)))
879 .lower();
880
881 getActionDefinitionsBuilder(G_CONSTANT)
882 .legalFor({S1, S32, S64, S16, GlobalPtr,
883 LocalPtr, ConstantPtr, PrivatePtr, FlatPtr })
884 .legalIf(isPointer(0))
885 .clampScalar(0, S32, S64)
886 .widenScalarToNextPow2(0);
887
888 getActionDefinitionsBuilder(G_FCONSTANT)
889 .legalFor({S32, S64, S16})
890 .clampScalar(0, S16, S64);
891
892 getActionDefinitionsBuilder({G_IMPLICIT_DEF, G_FREEZE})
893 .legalIf(isRegisterClassType(0))
894 // s1 and s16 are special cases because they have legal operations on
895 // them, but don't really occupy registers in the normal way.
896 .legalFor({S1, S16})
897 .clampNumElements(0, V16S32, V32S32)
901 .clampMaxNumElements(0, S32, 16);
902
903 getActionDefinitionsBuilder(G_FRAME_INDEX).legalFor({PrivatePtr});
904
905 // If the amount is divergent, we have to do a wave reduction to get the
906 // maximum value, so this is expanded during RegBankSelect.
907 getActionDefinitionsBuilder(G_DYN_STACKALLOC)
908 .legalFor({{PrivatePtr, S32}});
909
910 getActionDefinitionsBuilder(G_STACKSAVE)
911 .customFor({PrivatePtr});
912 getActionDefinitionsBuilder(G_STACKRESTORE)
913 .legalFor({PrivatePtr});
914
915 getActionDefinitionsBuilder({G_GET_FPENV, G_SET_FPENV}).customFor({S64});
916
917 getActionDefinitionsBuilder(G_GLOBAL_VALUE)
918 .customIf(typeIsNot(0, PrivatePtr));
919
920 getActionDefinitionsBuilder(G_BLOCK_ADDR).legalFor({CodePtr});
921
922 auto &FPOpActions = getActionDefinitionsBuilder(
923 { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE,
924 G_STRICT_FADD, G_STRICT_FMUL, G_STRICT_FMA})
925 .legalFor({S32, S64});
926 auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS})
927 .customFor({S32, S64});
928 auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV)
929 .customFor({S32, S64});
930
931 if (ST.has16BitInsts()) {
932 if (ST.hasVOP3PInsts())
933 FPOpActions.legalFor({S16, V2S16});
934 else
935 FPOpActions.legalFor({S16});
936
937 TrigActions.customFor({S16});
938 FDIVActions.customFor({S16});
939 }
940
941 if (ST.hasPackedFP32Ops()) {
942 FPOpActions.legalFor({V2S32});
943 FPOpActions.clampMaxNumElementsStrict(0, S32, 2);
944 }
945
946 auto &MinNumMaxNum = getActionDefinitionsBuilder({
947 G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE});
948
949 if (ST.hasVOP3PInsts()) {
950 MinNumMaxNum.customFor(FPTypesPK16)
951 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
952 .clampMaxNumElements(0, S16, 2)
953 .clampScalar(0, S16, S64)
954 .scalarize(0);
955 } else if (ST.has16BitInsts()) {
956 MinNumMaxNum.customFor(FPTypes16)
957 .clampScalar(0, S16, S64)
958 .scalarize(0);
959 } else {
960 MinNumMaxNum.customFor(FPTypesBase)
961 .clampScalar(0, S32, S64)
962 .scalarize(0);
963 }
964
965 if (ST.hasVOP3PInsts())
966 FPOpActions.clampMaxNumElementsStrict(0, S16, 2);
967
968 FPOpActions
969 .scalarize(0)
970 .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
971
972 TrigActions
973 .scalarize(0)
974 .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
975
976 FDIVActions
977 .scalarize(0)
978 .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
979
980 getActionDefinitionsBuilder({G_FNEG, G_FABS})
981 .legalFor(FPTypesPK16)
982 .clampMaxNumElementsStrict(0, S16, 2)
983 .scalarize(0)
984 .clampScalar(0, S16, S64);
985
986 if (ST.has16BitInsts()) {
987 getActionDefinitionsBuilder(G_FSQRT)
988 .legalFor({S16})
989 .customFor({S32, S64})
990 .scalarize(0)
991 .unsupported();
992 getActionDefinitionsBuilder(G_FFLOOR)
993 .legalFor({S32, S64, S16})
994 .scalarize(0)
995 .clampScalar(0, S16, S64);
996
997 getActionDefinitionsBuilder({G_FLDEXP, G_STRICT_FLDEXP})
998 .legalFor({{S32, S32}, {S64, S32}, {S16, S16}})
999 .scalarize(0)
1000 .maxScalarIf(typeIs(0, S16), 1, S16)
1001 .clampScalar(1, S32, S32)
1002 .lower();
1003
1004 getActionDefinitionsBuilder(G_FFREXP)
1005 .customFor({{S32, S32}, {S64, S32}, {S16, S16}, {S16, S32}})
1006 .scalarize(0)
1007 .lower();
1008 } else {
1009 getActionDefinitionsBuilder(G_FSQRT)
1010 .customFor({S32, S64, S16})
1011 .scalarize(0)
1012 .unsupported();
1013
1014
1015 if (ST.hasFractBug()) {
1016 getActionDefinitionsBuilder(G_FFLOOR)
1017 .customFor({S64})
1018 .legalFor({S32, S64})
1019 .scalarize(0)
1020 .clampScalar(0, S32, S64);
1021 } else {
1022 getActionDefinitionsBuilder(G_FFLOOR)
1023 .legalFor({S32, S64})
1024 .scalarize(0)
1025 .clampScalar(0, S32, S64);
1026 }
1027
1028 getActionDefinitionsBuilder({G_FLDEXP, G_STRICT_FLDEXP})
1029 .legalFor({{S32, S32}, {S64, S32}})
1030 .scalarize(0)
1031 .clampScalar(0, S32, S64)
1032 .clampScalar(1, S32, S32)
1033 .lower();
1034
1035 getActionDefinitionsBuilder(G_FFREXP)
1036 .customFor({{S32, S32}, {S64, S32}})
1037 .scalarize(0)
1038 .minScalar(0, S32)
1039 .clampScalar(1, S32, S32)
1040 .lower();
1041 }
1042
1043 auto &FPTruncActions = getActionDefinitionsBuilder(G_FPTRUNC);
1044 if (ST.hasCvtPkF16F32Inst())
1045 FPTruncActions.legalFor(
1046 {{S32, S64}, {S16, S32}, {V2S16, V2S32}, {V2S16, V2S64}});
1047 else
1048 FPTruncActions.legalFor({{S32, S64}, {S16, S32}});
1049 FPTruncActions.scalarize(0).lower();
1050
1051 getActionDefinitionsBuilder(G_FPEXT)
1052 .legalFor({{S64, S32}, {S32, S16}})
1053 .narrowScalarFor({{S64, S16}}, changeTo(0, S32))
1054 .scalarize(0);
1055
1056 auto &FSubActions = getActionDefinitionsBuilder({G_FSUB, G_STRICT_FSUB});
1057 if (ST.has16BitInsts()) {
1058 FSubActions
1059 // Use actual fsub instruction
1060 .legalFor({S32, S16})
1061 // Must use fadd + fneg
1062 .lowerFor({S64, V2S16});
1063 } else {
1064 FSubActions
1065 // Use actual fsub instruction
1066 .legalFor({S32})
1067 // Must use fadd + fneg
1068 .lowerFor({S64, S16, V2S16});
1069 }
1070
1071 FSubActions
1072 .scalarize(0)
1073 .clampScalar(0, S32, S64);
1074
1075 // Whether this is legal depends on the floating point mode for the function.
1076 auto &FMad = getActionDefinitionsBuilder(G_FMAD);
1077 if (ST.hasMadF16() && ST.hasMadMacF32Insts())
1078 FMad.customFor({S32, S16});
1079 else if (ST.hasMadMacF32Insts())
1080 FMad.customFor({S32});
1081 else if (ST.hasMadF16())
1082 FMad.customFor({S16});
1083 FMad.scalarize(0)
1084 .lower();
1085
1086 auto &FRem = getActionDefinitionsBuilder(G_FREM);
1087 if (ST.has16BitInsts()) {
1088 FRem.customFor({S16, S32, S64});
1089 } else {
1090 FRem.minScalar(0, S32)
1091 .customFor({S32, S64});
1092 }
1093 FRem.scalarize(0);
1094
1095 // TODO: Do we need to clamp maximum bitwidth?
1096 getActionDefinitionsBuilder(G_TRUNC)
1097 .legalIf(isScalar(0))
1098 .legalFor({{V2S16, V2S32}})
1099 .clampMaxNumElements(0, S16, 2)
1100 // Avoid scalarizing in cases that should be truly illegal. In unresolvable
1101 // situations (like an invalid implicit use), we don't want to infinite loop
1102 // in the legalizer.
1103 .fewerElementsIf(elementTypeIsLegal(0), LegalizeMutations::scalarize(0))
1104 .alwaysLegal();
1105
1106 getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT})
1107 .legalFor({{S64, S32}, {S32, S16}, {S64, S16},
1108 {S32, S1}, {S64, S1}, {S16, S1}})
1109 .scalarize(0)
1110 .clampScalar(0, S32, S64)
1111 .widenScalarToNextPow2(1, 32);
1112
1113 // TODO: Split s1->s64 during regbankselect for VALU.
1114 auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
1115 .legalFor({{S32, S32}, {S64, S32}, {S16, S32}})
1116 .lowerIf(typeIs(1, S1))
1117 .customFor({{S32, S64}, {S64, S64}});
1118 if (ST.has16BitInsts())
1119 IToFP.legalFor({{S16, S16}});
1120 IToFP.clampScalar(1, S32, S64)
1121 .minScalar(0, S32)
1122 .scalarize(0)
1123 .widenScalarToNextPow2(1);
1124
1125 auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
1126 .legalFor({{S32, S32}, {S32, S64}, {S32, S16}})
1127 .customFor({{S64, S32}, {S64, S64}})
1128 .narrowScalarFor({{S64, S16}}, changeTo(0, S32));
1129 if (ST.has16BitInsts())
1130 FPToI.legalFor({{S16, S16}});
1131 else
1132 FPToI.minScalar(1, S32);
1133
1134 FPToI.minScalar(0, S32)
1135 .widenScalarToNextPow2(0, 32)
1136 .scalarize(0)
1137 .lower();
1138
1139 getActionDefinitionsBuilder({G_LROUND, G_LLROUND})
1140 .clampScalar(0, S16, S64)
1141 .scalarize(0)
1142 .lower();
1143
1144 getActionDefinitionsBuilder(G_INTRINSIC_FPTRUNC_ROUND)
1145 .legalFor({S16, S32})
1146 .scalarize(0)
1147 .lower();
1148
1149 // Lower G_FNEARBYINT and G_FRINT into G_INTRINSIC_ROUNDEVEN
1150 getActionDefinitionsBuilder({G_INTRINSIC_ROUND, G_FRINT, G_FNEARBYINT})
1151 .scalarize(0)
1152 .lower();
1153
1154 getActionDefinitionsBuilder({G_INTRINSIC_LRINT, G_INTRINSIC_LLRINT})
1155 .clampScalar(0, S16, S64)
1156 .scalarize(0)
1157 .lower();
1158
1159 if (ST.has16BitInsts()) {
1160 getActionDefinitionsBuilder(
1161 {G_INTRINSIC_TRUNC, G_FCEIL, G_INTRINSIC_ROUNDEVEN})
1162 .legalFor({S16, S32, S64})
1163 .clampScalar(0, S16, S64)
1164 .scalarize(0);
1165 } else if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
1166 getActionDefinitionsBuilder(
1167 {G_INTRINSIC_TRUNC, G_FCEIL, G_INTRINSIC_ROUNDEVEN})
1168 .legalFor({S32, S64})
1169 .clampScalar(0, S32, S64)
1170 .scalarize(0);
1171 } else {
1172 getActionDefinitionsBuilder(
1173 {G_INTRINSIC_TRUNC, G_FCEIL, G_INTRINSIC_ROUNDEVEN})
1174 .legalFor({S32})
1175 .customFor({S64})
1176 .clampScalar(0, S32, S64)
1177 .scalarize(0);
1178 }
1179
1180 getActionDefinitionsBuilder(G_PTR_ADD)
1181 .unsupportedFor({BufferFatPtr, BufferStridedPtr, RsrcPtr})
1182 .legalIf(all(isPointer(0), sameSize(0, 1)))
1183 .scalarize(0)
1184 .scalarSameSizeAs(1, 0);
1185
1186 getActionDefinitionsBuilder(G_PTRMASK)
1187 .legalIf(all(sameSize(0, 1), typeInSet(1, {S64, S32})))
1188 .scalarSameSizeAs(1, 0)
1189 .scalarize(0);
1190
1191 auto &CmpBuilder =
1192 getActionDefinitionsBuilder(G_ICMP)
1193 // The compare output type differs based on the register bank of the output,
1194 // so make both s1 and s32 legal.
1195 //
1196 // Scalar compares producing output in scc will be promoted to s32, as that
1197 // is the allocatable register type that will be needed for the copy from
1198 // scc. This will be promoted during RegBankSelect, and we assume something
1199 // before that won't try to use s32 result types.
1200 //
1201 // Vector compares producing an output in vcc/SGPR will use s1 in VCC reg
1202 // bank.
1203 .legalForCartesianProduct(
1204 {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
1205 .legalForCartesianProduct(
1206 {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr});
1207 if (ST.has16BitInsts()) {
1208 CmpBuilder.legalFor({{S1, S16}});
1209 }
1210
1211 CmpBuilder
1212 .widenScalarToNextPow2(1)
1213 .clampScalar(1, S32, S64)
1214 .scalarize(0)
1215 .legalIf(all(typeInSet(0, {S1, S32}), isPointer(1)));
1216
1217 auto &FCmpBuilder =
1218 getActionDefinitionsBuilder(G_FCMP).legalForCartesianProduct(
1219 {S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase);
1220
1221 if (ST.hasSALUFloatInsts())
1222 FCmpBuilder.legalForCartesianProduct({S32}, {S16, S32});
1223
1224 FCmpBuilder
1225 .widenScalarToNextPow2(1)
1226 .clampScalar(1, S32, S64)
1227 .scalarize(0);
1228
1229 // FIXME: fpow has a selection pattern that should move to custom lowering.
1230 auto &ExpOps = getActionDefinitionsBuilder(G_FPOW);
1231 if (ST.has16BitInsts())
1232 ExpOps.customFor({{S32}, {S16}});
1233 else
1234 ExpOps.customFor({S32});
1235 ExpOps.clampScalar(0, MinScalarFPTy, S32)
1236 .scalarize(0);
1237
1238 getActionDefinitionsBuilder(G_FPOWI)
1239 .clampScalar(0, MinScalarFPTy, S32)
1240 .lower();
1241
1242 auto &Log2Ops = getActionDefinitionsBuilder({G_FLOG2, G_FEXP2});
1243 Log2Ops.customFor({S32});
1244 if (ST.has16BitInsts())
1245 Log2Ops.legalFor({S16});
1246 else
1247 Log2Ops.customFor({S16});
1248 Log2Ops.scalarize(0)
1249 .lower();
1250
1251 auto &LogOps =
1252 getActionDefinitionsBuilder({G_FLOG, G_FLOG10, G_FEXP, G_FEXP10});
1253 LogOps.customFor({S32, S16});
1254 LogOps.clampScalar(0, MinScalarFPTy, S32)
1255 .scalarize(0);
1256
1257 // The 64-bit versions produce 32-bit results, but only on the SALU.
1258 getActionDefinitionsBuilder(G_CTPOP)
1259 .legalFor({{S32, S32}, {S32, S64}})
1260 .clampScalar(0, S32, S32)
1261 .widenScalarToNextPow2(1, 32)
1262 .clampScalar(1, S32, S64)
1263 .scalarize(0)
1264 .widenScalarToNextPow2(0, 32);
1265
1266 // If no 16 bit instr is available, lower into different instructions.
1267 if (ST.has16BitInsts())
1268 getActionDefinitionsBuilder(G_IS_FPCLASS)
1269 .legalForCartesianProduct({S1}, FPTypes16)
1270 .widenScalarToNextPow2(1)
1271 .scalarize(0)
1272 .lower();
1273 else
1274 getActionDefinitionsBuilder(G_IS_FPCLASS)
1275 .legalForCartesianProduct({S1}, FPTypesBase)
1276 .lowerFor({S1, S16})
1277 .widenScalarToNextPow2(1)
1278 .scalarize(0)
1279 .lower();
1280
1281 // The hardware instructions return a different result on 0 than the generic
1282 // instructions expect. The hardware produces -1, but these produce the
1283 // bitwidth.
1284 getActionDefinitionsBuilder({G_CTLZ, G_CTTZ})
1285 .scalarize(0)
1286 .clampScalar(0, S32, S32)
1287 .clampScalar(1, S32, S64)
1288 .widenScalarToNextPow2(0, 32)
1289 .widenScalarToNextPow2(1, 32)
1290 .custom();
1291
1292 // The 64-bit versions produce 32-bit results, but only on the SALU.
1293 getActionDefinitionsBuilder(G_CTLZ_ZERO_UNDEF)
1294 .legalFor({{S32, S32}, {S32, S64}})
1295 .customIf(scalarNarrowerThan(1, 32))
1296 .clampScalar(0, S32, S32)
1297 .clampScalar(1, S32, S64)
1298 .scalarize(0)
1299 .widenScalarToNextPow2(0, 32)
1300 .widenScalarToNextPow2(1, 32);
1301
1302 getActionDefinitionsBuilder(G_CTTZ_ZERO_UNDEF)
1303 .legalFor({{S32, S32}, {S32, S64}})
1304 .clampScalar(0, S32, S32)
1305 .clampScalar(1, S32, S64)
1306 .scalarize(0)
1307 .widenScalarToNextPow2(0, 32)
1308 .widenScalarToNextPow2(1, 32);
1309
1310 // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
1311 // RegBankSelect.
1312 getActionDefinitionsBuilder(G_BITREVERSE)
1313 .legalFor({S32, S64})
1314 .clampScalar(0, S32, S64)
1315 .scalarize(0)
1316 .widenScalarToNextPow2(0);
1317
1318 if (ST.has16BitInsts()) {
1319 getActionDefinitionsBuilder(G_BSWAP)
1320 .legalFor({S16, S32, V2S16})
1321 .clampMaxNumElementsStrict(0, S16, 2)
1322 // FIXME: Fixing non-power-of-2 before clamp is workaround for
1323 // narrowScalar limitation.
1324 .widenScalarToNextPow2(0)
1325 .clampScalar(0, S16, S32)
1326 .scalarize(0);
1327
1328 if (ST.hasVOP3PInsts()) {
1329 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
1330 .legalFor({S32, S16, V2S16})
1331 .clampMaxNumElements(0, S16, 2)
1332 .minScalar(0, S16)
1333 .widenScalarToNextPow2(0)
1334 .scalarize(0)
1335 .lower();
1336 } else {
1337 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
1338 .legalFor({S32, S16})
1339 .widenScalarToNextPow2(0)
1340 .minScalar(0, S16)
1341 .scalarize(0)
1342 .lower();
1343 }
1344 } else {
1345 // TODO: Should have same legality without v_perm_b32
1346 getActionDefinitionsBuilder(G_BSWAP)
1347 .legalFor({S32})
1348 .lowerIf(scalarNarrowerThan(0, 32))
1349 // FIXME: Fixing non-power-of-2 before clamp is workaround for
1350 // narrowScalar limitation.
1351 .widenScalarToNextPow2(0)
1352 .maxScalar(0, S32)
1353 .scalarize(0)
1354 .lower();
1355
1356 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
1357 .legalFor({S32})
1358 .minScalar(0, S32)
1359 .widenScalarToNextPow2(0)
1360 .scalarize(0)
1361 .lower();
1362 }
1363
1364 getActionDefinitionsBuilder(G_INTTOPTR)
1365 // List the common cases
1366 .legalForCartesianProduct(AddrSpaces64, {S64})
1367 .legalForCartesianProduct(AddrSpaces32, {S32})
1368 .scalarize(0)
1369 // Accept any address space as long as the size matches
1370 .legalIf(sameSize(0, 1))
1371 .widenScalarIf(smallerThan(1, 0),
1372 [](const LegalityQuery &Query) {
1373 return std::pair(
1374 1, LLT::scalar(Query.Types[0].getSizeInBits()));
1375 })
1376 .narrowScalarIf(largerThan(1, 0), [](const LegalityQuery &Query) {
1377 return std::pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
1378 });
1379
1380 getActionDefinitionsBuilder(G_PTRTOINT)
1381 // List the common cases
1382 .legalForCartesianProduct(AddrSpaces64, {S64})
1383 .legalForCartesianProduct(AddrSpaces32, {S32})
1384 .scalarize(0)
1385 // Accept any address space as long as the size matches
1386 .legalIf(sameSize(0, 1))
1387 .widenScalarIf(smallerThan(0, 1),
1388 [](const LegalityQuery &Query) {
1389 return std::pair(
1390 0, LLT::scalar(Query.Types[1].getSizeInBits()));
1391 })
1392 .narrowScalarIf(largerThan(0, 1), [](const LegalityQuery &Query) {
1393 return std::pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
1394 });
1395
1396 getActionDefinitionsBuilder(G_ADDRSPACE_CAST)
1397 .scalarize(0)
1398 .custom();
1399
1400 const auto needToSplitMemOp = [=](const LegalityQuery &Query,
1401 bool IsLoad) -> bool {
1402 const LLT DstTy = Query.Types[0];
1403
1404 // Split vector extloads.
1405 unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
1406
1407 if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize)
1408 return true;
1409
1410 const LLT PtrTy = Query.Types[1];
1411 unsigned AS = PtrTy.getAddressSpace();
1412 if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad,
1413 Query.MMODescrs[0].Ordering !=
1414 AtomicOrdering::NotAtomic))
1415 return true;
1416
1417 // Catch weird sized loads that don't evenly divide into the access sizes
1418 // TODO: May be able to widen depending on alignment etc.
1419 unsigned NumRegs = (MemSize + 31) / 32;
1420 if (NumRegs == 3) {
1421 if (!ST.hasDwordx3LoadStores())
1422 return true;
1423 } else {
1424 // If the alignment allows, these should have been widened.
1425 if (!isPowerOf2_32(NumRegs))
1426 return true;
1427 }
1428
1429 return false;
1430 };
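// As examples of what this callback flags for splitting: a 256-bit global
// store (the global store limit is 128 bits), a 96-bit access on subtargets
// without dwordx3 support, and any access needing a non-power-of-two number
// of dwords such as 160 bits (5 dwords).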
1431
1432 unsigned GlobalAlign32 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 32;
1433 unsigned GlobalAlign16 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 16;
1434 unsigned GlobalAlign8 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 8;
1435
1436 // TODO: Refine based on subtargets which support unaligned access or 128-bit
1437 // LDS
1438 // TODO: Unsupported flat for SI.
1439
1440 for (unsigned Op : {G_LOAD, G_STORE}) {
1441 const bool IsStore = Op == G_STORE;
1442
1443 auto &Actions = getActionDefinitionsBuilder(Op);
1444 // Explicitly list some common cases.
1445 // TODO: Does this help compile time at all?
1446 Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, S32, GlobalAlign32},
1447 {V2S32, GlobalPtr, V2S32, GlobalAlign32},
1448 {V4S32, GlobalPtr, V4S32, GlobalAlign32},
1449 {S64, GlobalPtr, S64, GlobalAlign32},
1450 {V2S64, GlobalPtr, V2S64, GlobalAlign32},
1451 {V2S16, GlobalPtr, V2S16, GlobalAlign32},
1452 {S32, GlobalPtr, S8, GlobalAlign8},
1453 {S32, GlobalPtr, S16, GlobalAlign16},
1454
1455 {S32, LocalPtr, S32, 32},
1456 {S64, LocalPtr, S64, 32},
1457 {V2S32, LocalPtr, V2S32, 32},
1458 {S32, LocalPtr, S8, 8},
1459 {S32, LocalPtr, S16, 16},
1460 {V2S16, LocalPtr, S32, 32},
1461
1462 {S32, PrivatePtr, S32, 32},
1463 {S32, PrivatePtr, S8, 8},
1464 {S32, PrivatePtr, S16, 16},
1465 {V2S16, PrivatePtr, S32, 32},
1466
1467 {S32, ConstantPtr, S32, GlobalAlign32},
1468 {V2S32, ConstantPtr, V2S32, GlobalAlign32},
1469 {V4S32, ConstantPtr, V4S32, GlobalAlign32},
1470 {S64, ConstantPtr, S64, GlobalAlign32},
1471 {V2S32, ConstantPtr, V2S32, GlobalAlign32}});
1472 Actions.legalIf(
1473 [=](const LegalityQuery &Query) -> bool {
1474 return isLoadStoreLegal(ST, Query);
1475 });
1476
1477 // The custom pointers (fat pointers, buffer resources) don't work with load
1478 // and store at this level. Fat pointers should have been lowered to
1479 // intrinsics before the translation to MIR.
1480 Actions.unsupportedIf(
1481 typeInSet(1, {BufferFatPtr, BufferStridedPtr, RsrcPtr}));
1482
1483 // Address space 8 pointers are handled by a 4xs32 load, bitcast, and
1484 // ptrtoint. This is needed to account for the fact that we can't have i128
1485 // as a register class for SelectionDAG reasons.
1486 Actions.customIf([=](const LegalityQuery &Query) -> bool {
1487 return hasBufferRsrcWorkaround(Query.Types[0]);
1488 });
1489
1490 // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to
1491 // 64-bits.
1492 //
1493 // TODO: Should generalize bitcast action into coerce, which will also cover
1494 // inserting addrspacecasts.
1495 Actions.customIf(typeIs(1, Constant32Ptr));
1496
1497 // Turn any illegal element vectors into something easier to deal
1498 // with. These will ultimately produce 32-bit scalar shifts to extract the
1499 // parts anyway.
1500 //
1501 // For odd 16-bit element vectors, prefer to split those into pieces with
1502 // 16-bit vector parts.
1503 Actions.bitcastIf(
1504 [=](const LegalityQuery &Query) -> bool {
1505 return shouldBitcastLoadStoreType(ST, Query.Types[0],
1506 Query.MMODescrs[0].MemoryTy);
1507 }, bitcastToRegisterType(0));
1508
1509 if (!IsStore) {
1510 // Widen suitably aligned loads by loading extra bytes. The standard
1511 // legalization actions can't properly express widening memory operands.
1512 Actions.customIf([=](const LegalityQuery &Query) -> bool {
1513 return shouldWidenLoad(ST, Query, G_LOAD);
1514 });
1515 }
1516
1517 // FIXME: load/store narrowing should be moved to lower action
1518 Actions
1519 .narrowScalarIf(
1520 [=](const LegalityQuery &Query) -> bool {
1521 return !Query.Types[0].isVector() &&
1522 needToSplitMemOp(Query, Op == G_LOAD);
1523 },
1524 [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
1525 const LLT DstTy = Query.Types[0];
1526 const LLT PtrTy = Query.Types[1];
1527
1528 const unsigned DstSize = DstTy.getSizeInBits();
1529 unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
1530
1531 // Split extloads.
1532 if (DstSize > MemSize)
1533 return std::pair(0, LLT::scalar(MemSize));
1534
1535 unsigned MaxSize = maxSizeForAddrSpace(
1536 ST, PtrTy.getAddressSpace(), Op == G_LOAD,
1537 Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic);
1538 if (MemSize > MaxSize)
1539 return std::pair(0, LLT::scalar(MaxSize));
1540
1541 uint64_t Align = Query.MMODescrs[0].AlignInBits;
1542 return std::pair(0, LLT::scalar(Align));
1543 })
1544 .fewerElementsIf(
1545 [=](const LegalityQuery &Query) -> bool {
1546 return Query.Types[0].isVector() &&
1547 needToSplitMemOp(Query, Op == G_LOAD);
1548 },
1549 [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
1550 const LLT DstTy = Query.Types[0];
1551 const LLT PtrTy = Query.Types[1];
1552
1553 LLT EltTy = DstTy.getElementType();
1554 unsigned MaxSize = maxSizeForAddrSpace(
1555 ST, PtrTy.getAddressSpace(), Op == G_LOAD,
1556 Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic);
1557
1558 // FIXME: Handle widened to power of 2 results better. This ends
1559 // up scalarizing.
1560 // FIXME: 3 element stores scalarized on SI
1561
1562 // Split if it's too large for the address space.
1563 unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
1564 if (MemSize > MaxSize) {
1565 unsigned NumElts = DstTy.getNumElements();
1566 unsigned EltSize = EltTy.getSizeInBits();
1567
1568 if (MaxSize % EltSize == 0) {
1569 return std::pair(
1570 0, LLT::scalarOrVector(
1571 ElementCount::getFixed(MaxSize / EltSize), EltTy));
1572 }
1573
1574 unsigned NumPieces = MemSize / MaxSize;
1575
1576 // FIXME: Refine when odd breakdowns handled
1577 // The scalars will need to be re-legalized.
1578 if (NumPieces == 1 || NumPieces >= NumElts ||
1579 NumElts % NumPieces != 0)
1580 return std::pair(0, EltTy);
1581
1582 return std::pair(0,
1583 LLT::fixed_vector(NumElts / NumPieces, EltTy));
1584 }
1585
1586 // FIXME: We could probably handle weird extending loads better.
1587 if (DstTy.getSizeInBits() > MemSize)
1588 return std::pair(0, EltTy);
1589
1590 unsigned EltSize = EltTy.getSizeInBits();
1591 unsigned DstSize = DstTy.getSizeInBits();
1592 if (!isPowerOf2_32(DstSize)) {
1593 // We're probably decomposing an odd sized store. Try to split
1594 // to the widest type. TODO: Account for alignment. As-is it
1595 // should be OK, since the new parts will be further legalized.
1596 unsigned FloorSize = llvm::bit_floor(DstSize);
1597 return std::pair(
1598 0, LLT::scalarOrVector(
1599 ElementCount::getFixed(FloorSize / EltSize), EltTy));
1600 }
1601
1602 // May need relegalization for the scalars.
1603 return std::pair(0, EltTy);
1604 })
1605 .minScalar(0, S32)
1606 .narrowScalarIf(isWideScalarExtLoadTruncStore(0), changeTo(0, S32))
1607 .widenScalarToNextPow2(0)
1608 .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0))
1609 .lower();
1610 }
1611
1612 // FIXME: Unaligned accesses not lowered.
1613 auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD})
1614 .legalForTypesWithMemDesc({{S32, GlobalPtr, S8, 8},
1615 {S32, GlobalPtr, S16, 2 * 8},
1616 {S32, LocalPtr, S8, 8},
1617 {S32, LocalPtr, S16, 16},
1618 {S32, PrivatePtr, S8, 8},
1619 {S32, PrivatePtr, S16, 16},
1620 {S32, ConstantPtr, S8, 8},
1621 {S32, ConstantPtr, S16, 2 * 8}})
1622 .legalIf(
1623 [=](const LegalityQuery &Query) -> bool {
1624 return isLoadStoreLegal(ST, Query);
1625 });
1626
1627 if (ST.hasFlatAddressSpace()) {
1628 ExtLoads.legalForTypesWithMemDesc(
1629 {{S32, FlatPtr, S8, 8}, {S32, FlatPtr, S16, 16}});
1630 }
1631
1632 // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to
1633 // 64-bits.
1634 //
1635 // TODO: Should generalize bitcast action into coerce, which will also cover
1636 // inserting addrspacecasts.
1637 ExtLoads.customIf(typeIs(1, Constant32Ptr));
1638
1639 ExtLoads.clampScalar(0, S32, S32)
1640 .widenScalarToNextPow2(0)
1641 .lower();
1642
1643 auto &Atomics = getActionDefinitionsBuilder(
1644 {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB,
1645 G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
1646 G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
1647 G_ATOMICRMW_UMIN, G_ATOMICRMW_UINC_WRAP, G_ATOMICRMW_UDEC_WRAP})
1648 .legalFor({{S32, GlobalPtr}, {S32, LocalPtr},
1649 {S64, GlobalPtr}, {S64, LocalPtr},
1650 {S32, RegionPtr}, {S64, RegionPtr}});
1651 if (ST.hasFlatAddressSpace()) {
1652 Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}});
1653 }
1654
1655 // TODO: v2bf16 operations, and fat buffer pointer support.
1656 auto &Atomic = getActionDefinitionsBuilder(G_ATOMICRMW_FADD);
1657 if (ST.hasLDSFPAtomicAddF32()) {
1658 Atomic.legalFor({{S32, LocalPtr}, {S32, RegionPtr}});
1659 if (ST.hasLdsAtomicAddF64())
1660 Atomic.legalFor({{S64, LocalPtr}});
1661 if (ST.hasAtomicDsPkAdd16Insts())
1662 Atomic.legalFor({{V2F16, LocalPtr}, {V2BF16, LocalPtr}});
1663 }
1664 if (ST.hasAtomicFaddInsts())
1665 Atomic.legalFor({{S32, GlobalPtr}});
1666 if (ST.hasFlatAtomicFaddF32Inst())
1667 Atomic.legalFor({{S32, FlatPtr}});
1668
1669 if (ST.hasGFX90AInsts()) {
1670 // These are legal with some caveats, and should have undergone expansion in
1671 // the IR in most situations
1672 // TODO: Move atomic expansion into legalizer
1673 Atomic.legalFor({
1674 {S32, GlobalPtr},
1675 {S64, GlobalPtr},
1676 {S64, FlatPtr}
1677 });
1678 }
1679
1680 if (ST.hasAtomicBufferGlobalPkAddF16NoRtnInsts() ||
1681 ST.hasAtomicBufferGlobalPkAddF16Insts())
1682 Atomic.legalFor({{V2F16, GlobalPtr}, {V2F16, BufferFatPtr}});
1683 if (ST.hasAtomicGlobalPkAddBF16Inst())
1684 Atomic.legalFor({{V2BF16, GlobalPtr}});
1685 if (ST.hasAtomicFlatPkAdd16Insts())
1686 Atomic.legalFor({{V2F16, FlatPtr}, {V2BF16, FlatPtr}});
1687
1688
1689 // Most of the legalization work here is done by AtomicExpand. We could
1690 // probably use a simpler legality rule that just assumes anything is OK.
1691 auto &AtomicFMinFMax =
1692 getActionDefinitionsBuilder({G_ATOMICRMW_FMIN, G_ATOMICRMW_FMAX})
1693 .legalFor({{F32, LocalPtr}, {F64, LocalPtr}});
1694
1695 if (ST.hasAtomicFMinFMaxF32GlobalInsts())
1696 AtomicFMinFMax.legalFor({{F32, GlobalPtr},{F32, BufferFatPtr}});
1697 if (ST.hasAtomicFMinFMaxF64GlobalInsts())
1698 AtomicFMinFMax.legalFor({{F64, GlobalPtr}, {F64, BufferFatPtr}});
1699 if (ST.hasAtomicFMinFMaxF32FlatInsts())
1700 AtomicFMinFMax.legalFor({F32, FlatPtr});
1701 if (ST.hasAtomicFMinFMaxF64FlatInsts())
1702 AtomicFMinFMax.legalFor({F64, FlatPtr});
1703
1704 // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling, and output
1705 // demarshalling
1706 getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG)
1707 .customFor({{S32, GlobalPtr}, {S64, GlobalPtr},
1708 {S32, FlatPtr}, {S64, FlatPtr}})
1709 .legalFor({{S32, LocalPtr}, {S64, LocalPtr},
1710 {S32, RegionPtr}, {S64, RegionPtr}});
1711 // TODO: Pointer types, any 32-bit or 64-bit vector
1712
1713 // Condition should be s32 for scalar, s1 for vector.
1714 getActionDefinitionsBuilder(G_SELECT)
1715 .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16, GlobalPtr,
1716 LocalPtr, FlatPtr, PrivatePtr,
1717 LLT::fixed_vector(2, LocalPtr),
1718 LLT::fixed_vector(2, PrivatePtr)},
1719 {S1, S32})
1720 .clampScalar(0, S16, S64)
1721 .scalarize(1)
1722 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
1723 .fewerElementsIf(numElementsNotEven(0), scalarize(0))
1724 .clampMaxNumElements(0, S32, 2)
1725 .clampMaxNumElements(0, LocalPtr, 2)
1726 .clampMaxNumElements(0, PrivatePtr, 2)
1727 .scalarize(0)
1728 .widenScalarToNextPow2(0)
1729 .legalIf(all(isPointer(0), typeInSet(1, {S1, S32})));
1730
1731 // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can
1732 // be more flexible with the shift amount type.
1733 auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR})
1734 .legalFor({{S32, S32}, {S64, S32}});
1735 if (ST.has16BitInsts()) {
1736 if (ST.hasVOP3PInsts()) {
1737 Shifts.legalFor({{S16, S16}, {V2S16, V2S16}})
1738 .clampMaxNumElements(0, S16, 2);
1739 } else
1740 Shifts.legalFor({{S16, S16}});
1741
1742 // TODO: Support 16-bit shift amounts for all types
1743 Shifts.widenScalarIf(
1744 [=](const LegalityQuery &Query) {
1745 // Use 16-bit shift amounts for any 16-bit shift. Otherwise we want a
1746 // 32-bit amount.
1747 const LLT ValTy = Query.Types[0];
1748 const LLT AmountTy = Query.Types[1];
1749 return ValTy.getSizeInBits() <= 16 &&
1750 AmountTy.getSizeInBits() < 16;
1751 }, changeTo(1, S16));
1752 Shifts.maxScalarIf(typeIs(0, S16), 1, S16);
1753 Shifts.clampScalar(1, S32, S32);
1754 Shifts.widenScalarToNextPow2(0, 16);
1755 Shifts.clampScalar(0, S16, S64);
1756
1757 getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
1758 .minScalar(0, S16)
1759 .scalarize(0)
1760 .lower();
1761 } else {
1762 // Make sure we legalize the shift amount type first, as the general
1763 // expansion for the shifted type will produce much worse code if it hasn't
1764 // been truncated already.
1765 Shifts.clampScalar(1, S32, S32);
1766 Shifts.widenScalarToNextPow2(0, 32);
1767 Shifts.clampScalar(0, S32, S64);
1768
1769 getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
1770 .minScalar(0, S32)
1771 .scalarize(0)
1772 .lower();
1773 }
1774 Shifts.scalarize(0);
1775
1776 for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) {
1777 unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0;
1778 unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1;
1779 unsigned IdxTypeIdx = 2;
1780
1781 getActionDefinitionsBuilder(Op)
1782 .customIf([=](const LegalityQuery &Query) {
1783 const LLT EltTy = Query.Types[EltTypeIdx];
1784 const LLT VecTy = Query.Types[VecTypeIdx];
1785 const LLT IdxTy = Query.Types[IdxTypeIdx];
1786 const unsigned EltSize = EltTy.getSizeInBits();
1787 const bool isLegalVecType =
1789 // Address space 8 pointers are 128-bit wide values, but the logic
1790 // below will try to bitcast them to 2N x s64, which will fail.
1791 // Therefore, as an intermediate step, handle extracts/insertions by
1792 // ptrtoint-ing the vector and scalar arguments (or inttoptr-ing the
1793 // extraction result) so that we produce a vector operation that can
1794 // be handled by the logic below.
1795 if (EltTy.isPointer() && EltSize > 64)
1796 return true;
1797 return (EltSize == 32 || EltSize == 64) &&
1798 VecTy.getSizeInBits() % 32 == 0 &&
1799 VecTy.getSizeInBits() <= MaxRegisterSize &&
1800 IdxTy.getSizeInBits() == 32 &&
1801 isLegalVecType;
1802 })
1803 .bitcastIf(all(sizeIsMultipleOf32(VecTypeIdx), scalarOrEltNarrowerThan(VecTypeIdx, 32)),
1804 bitcastToVectorElement32(VecTypeIdx))
1805 //.bitcastIf(vectorSmallerThan(1, 32), bitcastToScalar(1))
1806 .bitcastIf(
1807 all(sizeIsMultipleOf32(VecTypeIdx), scalarOrEltWiderThan(VecTypeIdx, 64)),
1808 [=](const LegalityQuery &Query) {
1809 // For > 64-bit element types, try to turn this into a 64-bit
1810 // element vector since we may be able to do better indexing
1811 // if this is scalar. If not, fall back to 32.
1812 const LLT EltTy = Query.Types[EltTypeIdx];
1813 const LLT VecTy = Query.Types[VecTypeIdx];
1814 const unsigned DstEltSize = EltTy.getSizeInBits();
1815 const unsigned VecSize = VecTy.getSizeInBits();
1816
1817 const unsigned TargetEltSize = DstEltSize % 64 == 0 ? 64 : 32;
1818 return std::pair(
1819 VecTypeIdx,
1820 LLT::fixed_vector(VecSize / TargetEltSize, TargetEltSize));
1821 })
1822 .clampScalar(EltTypeIdx, S32, S64)
1823 .clampScalar(VecTypeIdx, S32, S64)
1824 .clampScalar(IdxTypeIdx, S32, S32)
1825 .clampMaxNumElements(VecTypeIdx, S32, 32)
1826 // TODO: Clamp elements for 64-bit vectors?
1827 .moreElementsIf(
1828 isIllegalRegisterType(VecTypeIdx),
1830 // It should only be necessary with variable indexes.
1831 // As a last resort, lower to the stack
1832 .lower();
1833 }
1834
1835 getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
1836 .unsupportedIf([=](const LegalityQuery &Query) {
1837 const LLT &EltTy = Query.Types[1].getElementType();
1838 return Query.Types[0] != EltTy;
1839 });
1840
1841 for (unsigned Op : {G_EXTRACT, G_INSERT}) {
1842 unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0;
1843 unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1;
1844
1845 // FIXME: Doesn't handle extract of illegal sizes.
1846 getActionDefinitionsBuilder(Op)
1847 .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32)))
1848 .lowerIf([=](const LegalityQuery &Query) {
1849 // Sub-vector (or single element) insert and extract.
1850 // TODO: verify immediate offset here since lower only works with
1851 // whole elements.
1852 const LLT BigTy = Query.Types[BigTyIdx];
1853 return BigTy.isVector();
1854 })
1855 // FIXME: Multiples of 16 should not be legal.
1856 .legalIf([=](const LegalityQuery &Query) {
1857 const LLT BigTy = Query.Types[BigTyIdx];
1858 const LLT LitTy = Query.Types[LitTyIdx];
1859 return (BigTy.getSizeInBits() % 32 == 0) &&
1860 (LitTy.getSizeInBits() % 16 == 0);
1861 })
1862 .widenScalarIf(
1863 [=](const LegalityQuery &Query) {
1864 const LLT BigTy = Query.Types[BigTyIdx];
1865 return (BigTy.getScalarSizeInBits() < 16);
1866 },
1868 .widenScalarIf(
1869 [=](const LegalityQuery &Query) {
1870 const LLT LitTy = Query.Types[LitTyIdx];
1871 return (LitTy.getScalarSizeInBits() < 16);
1872 },
1874 .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
1875 .widenScalarToNextPow2(BigTyIdx, 32);
1876
1877 }
1878
1879 auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR)
1880 .legalForCartesianProduct(AllS32Vectors, {S32})
1881 .legalForCartesianProduct(AllS64Vectors, {S64})
1882 .clampNumElements(0, V16S32, V32S32)
1883 .clampNumElements(0, V2S64, V16S64)
1884 .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16))
1885 .moreElementsIf(
1888
1889 if (ST.hasScalarPackInsts()) {
1890 BuildVector
1891 // FIXME: Should probably widen s1 vectors straight to s32
1892 .minScalarOrElt(0, S16)
1893 .minScalar(1, S16);
1894
1895 getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1896 .legalFor({V2S16, S32})
1897 .lower();
1898 } else {
1899 BuildVector.customFor({V2S16, S16});
1900 BuildVector.minScalarOrElt(0, S32);
1901
1902 getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1903 .customFor({V2S16, S32})
1904 .lower();
1905 }
1906
1907 BuildVector.legalIf(isRegisterType(0));
1908
1909 // FIXME: Clamp maximum size
1910 getActionDefinitionsBuilder(G_CONCAT_VECTORS)
1911 .legalIf(all(isRegisterType(0), isRegisterType(1)))
1912 .clampMaxNumElements(0, S32, 32)
1913 .clampMaxNumElements(1, S16, 2) // TODO: Make 4?
1914 .clampMaxNumElements(0, S16, 64);
1915
1916 getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower();
1917
1918 // Merge/Unmerge
1919 for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
1920 unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
1921 unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;
1922
1923 auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
1924 const LLT Ty = Query.Types[TypeIdx];
1925 if (Ty.isVector()) {
1926 const LLT &EltTy = Ty.getElementType();
1927 if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 512)
1928 return true;
1929 if (!llvm::has_single_bit<uint32_t>(EltTy.getSizeInBits()))
1930 return true;
1931 }
1932 return false;
1933 };
1934
1935 auto &Builder = getActionDefinitionsBuilder(Op)
1936 .legalIf(all(isRegisterType(0), isRegisterType(1)))
1937 .lowerFor({{S16, V2S16}})
1938 .lowerIf([=](const LegalityQuery &Query) {
1939 const LLT BigTy = Query.Types[BigTyIdx];
1940 return BigTy.getSizeInBits() == 32;
1941 })
1942 // Try to widen to s16 first for small types.
1943 // TODO: Only do this on targets with legal s16 shifts
1944 .minScalarOrEltIf(scalarNarrowerThan(LitTyIdx, 16), LitTyIdx, S16)
1945 .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16)
1946 .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
1947 .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32),
1948 elementTypeIs(1, S16)),
1949 changeTo(1, V2S16))
1950 // Clamp the little scalar to s8-s256 and make it a power of 2. It's not
1951 // worth considering the multiples of 64 since 2*192 and 2*384 are not
1952 // valid.
1953 .clampScalar(LitTyIdx, S32, S512)
1954 .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32)
1955 // Break up vectors with weird elements into scalars
1956 .fewerElementsIf(
1957 [=](const LegalityQuery &Query) { return notValidElt(Query, LitTyIdx); },
1958 scalarize(0))
1959 .fewerElementsIf(
1960 [=](const LegalityQuery &Query) { return notValidElt(Query, BigTyIdx); },
1961 scalarize(1))
1962 .clampScalar(BigTyIdx, S32, MaxScalar);
1963
1964 if (Op == G_MERGE_VALUES) {
1965 Builder.widenScalarIf(
1966 // TODO: Use 16-bit shifts if legal for 8-bit values?
1967 [=](const LegalityQuery &Query) {
1968 const LLT Ty = Query.Types[LitTyIdx];
1969 return Ty.getSizeInBits() < 32;
1970 },
1971 changeTo(LitTyIdx, S32));
1972 }
1973
1974 Builder.widenScalarIf(
1975 [=](const LegalityQuery &Query) {
1976 const LLT Ty = Query.Types[BigTyIdx];
1977 return Ty.getSizeInBits() % 16 != 0;
1978 },
1979 [=](const LegalityQuery &Query) {
1980 // Pick the next power of 2, or a multiple of 64 over 128,
1981 // whichever is smaller.
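// For example, the mutation turns s65 into s128 (the next power of 2),
// but s260 into s320 (a multiple of 64) rather than all the way to s512.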
1982 const LLT &Ty = Query.Types[BigTyIdx];
1983 unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1);
1984 if (NewSizeInBits >= 256) {
1985 unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1);
1986 if (RoundedTo < NewSizeInBits)
1987 NewSizeInBits = RoundedTo;
1988 }
1989 return std::pair(BigTyIdx, LLT::scalar(NewSizeInBits));
1990 })
1991 // Any vectors left are the wrong size. Scalarize them.
1992 .scalarize(0)
1993 .scalarize(1);
1994 }
1995
1996 // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
1997 // RegBankSelect.
1998 auto &SextInReg = getActionDefinitionsBuilder(G_SEXT_INREG)
1999 .legalFor({{S32}, {S64}});
2000
2001 if (ST.hasVOP3PInsts()) {
2002 SextInReg.lowerFor({{V2S16}})
2003 // Prefer to reduce vector widths for 16-bit vectors before lowering, to
2004 // get more vector shift opportunities, since we'll get those when
2005 // expanded.
2006 .clampMaxNumElementsStrict(0, S16, 2);
2007 } else if (ST.has16BitInsts()) {
2008 SextInReg.lowerFor({{S32}, {S64}, {S16}});
2009 } else {
2010 // Prefer to promote to s32 before lowering if we don't have 16-bit
2011 // shifts. This avoids a lot of intermediate truncate and extend operations.
2012 SextInReg.lowerFor({{S32}, {S64}});
2013 }
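// Note: the generic lowering of G_SEXT_INREG expands to a shl/ashr pair,
// which is why the VOP3P path above prefers to shrink 16-bit vectors before
// lowering, so those shifts can stay in packed form.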
2014
2015 SextInReg
2016 .scalarize(0)
2017 .clampScalar(0, S32, S64)
2018 .lower();
2019
2020 getActionDefinitionsBuilder({G_ROTR, G_ROTL})
2021 .scalarize(0)
2022 .lower();
2023
2024 // TODO: Only try to form v2s16 with legal packed instructions.
2025 getActionDefinitionsBuilder(G_FSHR)
2026 .legalFor({{S32, S32}})
2027 .lowerFor({{V2S16, V2S16}})
2028 .clampMaxNumElementsStrict(0, S16, 2)
2029 .scalarize(0)
2030 .lower();
2031
2032 if (ST.hasVOP3PInsts()) {
2033 getActionDefinitionsBuilder(G_FSHL)
2034 .lowerFor({{V2S16, V2S16}})
2035 .clampMaxNumElementsStrict(0, S16, 2)
2036 .scalarize(0)
2037 .lower();
2038 } else {
2039 getActionDefinitionsBuilder(G_FSHL)
2040 .scalarize(0)
2041 .lower();
2042 }
2043
2044 getActionDefinitionsBuilder(G_READCYCLECOUNTER)
2045 .legalFor({S64});
2046
2047 getActionDefinitionsBuilder(G_READSTEADYCOUNTER).legalFor({S64});
2048
2049 getActionDefinitionsBuilder(G_FENCE)
2050 .alwaysLegal();
2051
2052 getActionDefinitionsBuilder({G_SMULO, G_UMULO})
2053 .scalarize(0)
2054 .minScalar(0, S32)
2055 .lower();
2056
2057 getActionDefinitionsBuilder({G_SBFX, G_UBFX})
2058 .legalFor({{S32, S32}, {S64, S32}})
2059 .clampScalar(1, S32, S32)
2060 .clampScalar(0, S32, S64)
2061 .widenScalarToNextPow2(0)
2062 .scalarize(0);
2063
2064 getActionDefinitionsBuilder(
2065 {// TODO: Verify V_BFI_B32 is generated from expanded bit ops
2066 G_FCOPYSIGN,
2067
2068 G_ATOMIC_CMPXCHG_WITH_SUCCESS, G_ATOMICRMW_NAND, G_ATOMICRMW_FSUB,
2069 G_READ_REGISTER, G_WRITE_REGISTER,
2070
2071 G_SADDO, G_SSUBO})
2072 .lower();
2073
2074 if (ST.hasIEEEMinMax()) {
2075 getActionDefinitionsBuilder({G_FMINIMUM, G_FMAXIMUM})
2076 .legalFor(FPTypesPK16)
2077 .clampMaxNumElements(0, S16, 2)
2078 .scalarize(0);
2079 } else {
2080 // TODO: Implement
2081 getActionDefinitionsBuilder({G_FMINIMUM, G_FMAXIMUM}).lower();
2082 }
2083
2084 getActionDefinitionsBuilder({G_MEMCPY, G_MEMCPY_INLINE, G_MEMMOVE, G_MEMSET})
2085 .lower();
2086
2087 getActionDefinitionsBuilder({G_TRAP, G_DEBUGTRAP}).custom();
2088
2089 getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE,
2090 G_INDEXED_LOAD, G_INDEXED_SEXTLOAD,
2091 G_INDEXED_ZEXTLOAD, G_INDEXED_STORE})
2092 .unsupported();
2093
2094 getActionDefinitionsBuilder(G_PREFETCH).alwaysLegal();
2095
2096 getLegacyLegalizerInfo().computeTables();
2097 verify(*ST.getInstrInfo());
2098}
2099
2102 LostDebugLocObserver &LocObserver) const {
2103 MachineIRBuilder &B = Helper.MIRBuilder;
2104 MachineRegisterInfo &MRI = *B.getMRI();
2105
2106 switch (MI.getOpcode()) {
2107 case TargetOpcode::G_ADDRSPACE_CAST:
2108 return legalizeAddrSpaceCast(MI, MRI, B);
2109 case TargetOpcode::G_INTRINSIC_ROUNDEVEN:
2110 return legalizeFroundeven(MI, MRI, B);
2111 case TargetOpcode::G_FCEIL:
2112 return legalizeFceil(MI, MRI, B);
2113 case TargetOpcode::G_FREM:
2114 return legalizeFrem(MI, MRI, B);
2115 case TargetOpcode::G_INTRINSIC_TRUNC:
2116 return legalizeIntrinsicTrunc(MI, MRI, B);
2117 case TargetOpcode::G_SITOFP:
2118 return legalizeITOFP(MI, MRI, B, true);
2119 case TargetOpcode::G_UITOFP:
2120 return legalizeITOFP(MI, MRI, B, false);
2121 case TargetOpcode::G_FPTOSI:
2122 return legalizeFPTOI(MI, MRI, B, true);
2123 case TargetOpcode::G_FPTOUI:
2124 return legalizeFPTOI(MI, MRI, B, false);
2125 case TargetOpcode::G_FMINNUM:
2126 case TargetOpcode::G_FMAXNUM:
2127 case TargetOpcode::G_FMINNUM_IEEE:
2128 case TargetOpcode::G_FMAXNUM_IEEE:
2129 return legalizeMinNumMaxNum(Helper, MI);
2130 case TargetOpcode::G_EXTRACT_VECTOR_ELT:
2131 return legalizeExtractVectorElt(MI, MRI, B);
2132 case TargetOpcode::G_INSERT_VECTOR_ELT:
2133 return legalizeInsertVectorElt(MI, MRI, B);
2134 case TargetOpcode::G_FSIN:
2135 case TargetOpcode::G_FCOS:
2136 return legalizeSinCos(MI, MRI, B);
2137 case TargetOpcode::G_GLOBAL_VALUE:
2138 return legalizeGlobalValue(MI, MRI, B);
2139 case TargetOpcode::G_LOAD:
2140 case TargetOpcode::G_SEXTLOAD:
2141 case TargetOpcode::G_ZEXTLOAD:
2142 return legalizeLoad(Helper, MI);
2143 case TargetOpcode::G_STORE:
2144 return legalizeStore(Helper, MI);
2145 case TargetOpcode::G_FMAD:
2146 return legalizeFMad(MI, MRI, B);
2147 case TargetOpcode::G_FDIV:
2148 return legalizeFDIV(MI, MRI, B);
2149 case TargetOpcode::G_FFREXP:
2150 return legalizeFFREXP(MI, MRI, B);
2151 case TargetOpcode::G_FSQRT:
2152 return legalizeFSQRT(MI, MRI, B);
2153 case TargetOpcode::G_UDIV:
2154 case TargetOpcode::G_UREM:
2155 case TargetOpcode::G_UDIVREM:
2156 return legalizeUnsignedDIV_REM(MI, MRI, B);
2157 case TargetOpcode::G_SDIV:
2158 case TargetOpcode::G_SREM:
2159 case TargetOpcode::G_SDIVREM:
2160 return legalizeSignedDIV_REM(MI, MRI, B);
2161 case TargetOpcode::G_ATOMIC_CMPXCHG:
2162 return legalizeAtomicCmpXChg(MI, MRI, B);
2163 case TargetOpcode::G_FLOG2:
2164 return legalizeFlog2(MI, B);
2165 case TargetOpcode::G_FLOG:
2166 case TargetOpcode::G_FLOG10:
2167 return legalizeFlogCommon(MI, B);
2168 case TargetOpcode::G_FEXP2:
2169 return legalizeFExp2(MI, B);
2170 case TargetOpcode::G_FEXP:
2171 case TargetOpcode::G_FEXP10:
2172 return legalizeFExp(MI, B);
2173 case TargetOpcode::G_FPOW:
2174 return legalizeFPow(MI, B);
2175 case TargetOpcode::G_FFLOOR:
2176 return legalizeFFloor(MI, MRI, B);
2177 case TargetOpcode::G_BUILD_VECTOR:
2178 case TargetOpcode::G_BUILD_VECTOR_TRUNC:
2179 return legalizeBuildVector(MI, MRI, B);
2180 case TargetOpcode::G_MUL:
2181 return legalizeMul(Helper, MI);
2182 case TargetOpcode::G_CTLZ:
2183 case TargetOpcode::G_CTTZ:
2184 return legalizeCTLZ_CTTZ(MI, MRI, B);
2185 case TargetOpcode::G_CTLZ_ZERO_UNDEF:
2186 return legalizeCTLZ_ZERO_UNDEF(MI, MRI, B);
2187 case TargetOpcode::G_STACKSAVE:
2188 return legalizeStackSave(MI, B);
2189 case TargetOpcode::G_GET_FPENV:
2190 return legalizeGetFPEnv(MI, MRI, B);
2191 case TargetOpcode::G_SET_FPENV:
2192 return legalizeSetFPEnv(MI, MRI, B);
2193 case TargetOpcode::G_TRAP:
2194 return legalizeTrap(MI, MRI, B);
2195 case TargetOpcode::G_DEBUGTRAP:
2196 return legalizeDebugTrap(MI, MRI, B);
2197 default:
2198 return false;
2199 }
2200
2201 llvm_unreachable("expected switch to return");
2202}
2203
2205 unsigned AS,
2207 MachineIRBuilder &B) const {
2208 MachineFunction &MF = B.getMF();
2209 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
2210 const LLT S32 = LLT::scalar(32);
2211 const LLT S64 = LLT::scalar(64);
2212
2214
2215 if (ST.hasApertureRegs()) {
2216 // Note: this register is somewhat broken. When used as a 32-bit operand,
2217 // it only returns zeroes. The real value is in the upper 32 bits.
2218 // Thus, we must extract the high 32 bits.
2219 const unsigned ApertureRegNo = (AS == AMDGPUAS::LOCAL_ADDRESS)
2220 ? AMDGPU::SRC_SHARED_BASE
2221 : AMDGPU::SRC_PRIVATE_BASE;
2222 // FIXME: It would be more natural to emit a COPY here, but then copy
2223 // coalescing would kick in and it would think it's okay to use the "HI"
2224 // subregister (instead of extracting the HI 32 bits), which is an artificial
2225 // (unusable) register.
2226 // Register TableGen definitions would need an overhaul to get rid of the
2227 // artificial "HI" aperture registers and prevent this kind of issue from
2228 // happening.
2229 Register Dst = MRI.createGenericVirtualRegister(S64);
2230 MRI.setRegClass(Dst, &AMDGPU::SReg_64RegClass);
2231 B.buildInstr(AMDGPU::S_MOV_B64, {Dst}, {Register(ApertureRegNo)});
2232 return B.buildUnmerge(S32, Dst).getReg(1);
2233 }
2234
2235 // TODO: can we be smarter about machine pointer info?
2237 Register LoadAddr = MRI.createGenericVirtualRegister(
2239 // For code object version 5, private_base and shared_base are passed through
2240 // implicit kernargs.
2247 ST.getTargetLowering()->getImplicitParameterOffset(B.getMF(), Param);
2248
2249 Register KernargPtrReg = MRI.createGenericVirtualRegister(
2251
2252 if (!loadInputValue(KernargPtrReg, B,
2254 return Register();
2255
2257 PtrInfo,
2261
2262 // Pointer address
2263 B.buildPtrAdd(LoadAddr, KernargPtrReg,
2264 B.buildConstant(LLT::scalar(64), Offset).getReg(0));
2265 // Load address
2266 return B.buildLoad(S32, LoadAddr, *MMO).getReg(0);
2267 }
2268
2269 Register QueuePtr = MRI.createGenericVirtualRegister(
2271
2273 return Register();
2274
2275 // Offset into amd_queue_t for group_segment_aperture_base_hi /
2276 // private_segment_aperture_base_hi.
2277 uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;
2278
2280 PtrInfo,
2283 LLT::scalar(32), commonAlignment(Align(64), StructOffset));
2284
2285 B.buildPtrAdd(LoadAddr, QueuePtr,
2286 B.buildConstant(LLT::scalar(64), StructOffset).getReg(0));
2287 return B.buildLoad(S32, LoadAddr, *MMO).getReg(0);
2288}
2289
2290/// Return true if the value is a known valid address, such that a null check is
2291/// not necessary.
2293 const AMDGPUTargetMachine &TM, unsigned AddrSpace) {
2294 MachineInstr *Def = MRI.getVRegDef(Val);
2295 switch (Def->getOpcode()) {
2296 case AMDGPU::G_FRAME_INDEX:
2297 case AMDGPU::G_GLOBAL_VALUE:
2298 case AMDGPU::G_BLOCK_ADDR:
2299 return true;
2300 case AMDGPU::G_CONSTANT: {
2301 const ConstantInt *CI = Def->getOperand(1).getCImm();
2302 return CI->getSExtValue() != TM.getNullPointerValue(AddrSpace);
2303 }
2304 default:
2305 return false;
2306 }
2307
2308 return false;
2309}
2310
2313 MachineIRBuilder &B) const {
2314 MachineFunction &MF = B.getMF();
2315
2316 // MI can either be a G_ADDRSPACE_CAST or a
2317 // G_INTRINSIC @llvm.amdgcn.addrspacecast.nonnull
2318 assert(MI.getOpcode() == TargetOpcode::G_ADDRSPACE_CAST ||
2319 (isa<GIntrinsic>(MI) && cast<GIntrinsic>(MI).getIntrinsicID() ==
2320 Intrinsic::amdgcn_addrspacecast_nonnull));
2321
2322 const LLT S32 = LLT::scalar(32);
2323 Register Dst = MI.getOperand(0).getReg();
2324 Register Src = isa<GIntrinsic>(MI) ? MI.getOperand(2).getReg()
2325 : MI.getOperand(1).getReg();
2326 LLT DstTy = MRI.getType(Dst);
2327 LLT SrcTy = MRI.getType(Src);
2328 unsigned DestAS = DstTy.getAddressSpace();
2329 unsigned SrcAS = SrcTy.getAddressSpace();
2330
2331 // TODO: Avoid reloading from the queue ptr for each cast, or at least each
2332 // vector element.
2333 assert(!DstTy.isVector());
2334
2335 const AMDGPUTargetMachine &TM
2336 = static_cast<const AMDGPUTargetMachine &>(MF.getTarget());
2337
2338 if (TM.isNoopAddrSpaceCast(SrcAS, DestAS)) {
2339 MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST));
2340 return true;
2341 }
2342
2343 if (SrcAS == AMDGPUAS::FLAT_ADDRESS &&
2344 (DestAS == AMDGPUAS::LOCAL_ADDRESS ||
2345 DestAS == AMDGPUAS::PRIVATE_ADDRESS)) {
2346 // For llvm.amdgcn.addrspacecast.nonnull we can always assume non-null; for
2347 // G_ADDRSPACE_CAST we need to guess.
2348 if (isa<GIntrinsic>(MI) || isKnownNonNull(Src, MRI, TM, SrcAS)) {
2349 // Extract low 32-bits of the pointer.
2350 B.buildExtract(Dst, Src, 0);
2351 MI.eraseFromParent();
2352 return true;
2353 }
2354
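 // Otherwise guard the truncation against a null input:
 //   dst = (src != flat_null) ? lo32(src) : segment_null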
2355 unsigned NullVal = TM.getNullPointerValue(DestAS);
2356
2357 auto SegmentNull = B.buildConstant(DstTy, NullVal);
2358 auto FlatNull = B.buildConstant(SrcTy, 0);
2359
2360 // Extract low 32-bits of the pointer.
2361 auto PtrLo32 = B.buildExtract(DstTy, Src, 0);
2362
2363 auto CmpRes =
2364 B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, FlatNull.getReg(0));
2365 B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0));
2366
2367 MI.eraseFromParent();
2368 return true;
2369 }
2370
2371 if (DestAS == AMDGPUAS::FLAT_ADDRESS &&
2372 (SrcAS == AMDGPUAS::LOCAL_ADDRESS ||
2373 SrcAS == AMDGPUAS::PRIVATE_ADDRESS)) {
2374 auto castLocalOrPrivateToFlat = [&](const DstOp &Dst) -> Register {
2375 Register ApertureReg = getSegmentAperture(SrcAS, MRI, B);
2376 if (!ApertureReg.isValid())
2377 return Register();
2378
2379 // Coerce the type of the low half of the result so we can use
2380 // merge_values.
2381 Register SrcAsInt = B.buildPtrToInt(S32, Src).getReg(0);
2382
2383 // TODO: Should we allow mismatched types but matching sizes in merges to
2384 // avoid the ptrtoint?
2385 return B.buildMergeLikeInstr(Dst, {SrcAsInt, ApertureReg}).getReg(0);
2386 };
2387
2388 // For llvm.amdgcn.addrspacecast.nonnull we can always assume non-null; for
2389 // G_ADDRSPACE_CAST we need to guess.
2390 if (isa<GIntrinsic>(MI) || isKnownNonNull(Src, MRI, TM, SrcAS)) {
2391 castLocalOrPrivateToFlat(Dst);
2392 MI.eraseFromParent();
2393 return true;
2394 }
2395
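 // Guard the result of a plain G_ADDRSPACE_CAST against a null input:
 //   dst = (src != segment_null) ? {lo = ptrtoint(src), hi = aperture} : flat_null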
2396 Register BuildPtr = castLocalOrPrivateToFlat(DstTy);
2397
2398 auto SegmentNull = B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS));
2399 auto FlatNull = B.buildConstant(DstTy, TM.getNullPointerValue(DestAS));
2400
2401 auto CmpRes = B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src,
2402 SegmentNull.getReg(0));
2403
2404 B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull);
2405
2406 MI.eraseFromParent();
2407 return true;
2408 }
2409
2410 if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
2411 SrcTy.getSizeInBits() == 64) {
2412 // Truncate.
2413 B.buildExtract(Dst, Src, 0);
2414 MI.eraseFromParent();
2415 return true;
2416 }
2417
2418 if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
2419 DstTy.getSizeInBits() == 64) {
2421 uint32_t AddrHiVal = Info->get32BitAddressHighBits();
2422 auto PtrLo = B.buildPtrToInt(S32, Src);
2423 auto HighAddr = B.buildConstant(S32, AddrHiVal);
2424 B.buildMergeLikeInstr(Dst, {PtrLo, HighAddr});
2425 MI.eraseFromParent();
2426 return true;
2427 }
2428
2429 DiagnosticInfoUnsupported InvalidAddrSpaceCast(
2430 MF.getFunction(), "invalid addrspacecast", B.getDebugLoc());
2431
2432 LLVMContext &Ctx = MF.getFunction().getContext();
2433 Ctx.diagnose(InvalidAddrSpaceCast);
2434 B.buildUndef(Dst);
2435 MI.eraseFromParent();
2436 return true;
2437}
2438
2441 MachineIRBuilder &B) const {
2442 Register Src = MI.getOperand(1).getReg();
2443 LLT Ty = MRI.getType(Src);
2444 assert(Ty.isScalar() && Ty.getSizeInBits() == 64);
2445
2446 APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
2447 APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");
2448
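 // Assuming the default round-to-nearest-even mode: C1 = 2^52, so adding and
 // then subtracting copysign(C1, src) rounds src to the nearest integer with
 // ties to even. C2 = 0x1.fffffffffffffp+51 is the largest double below 2^52;
 // any |src| > C2 is already integral, so the select below returns src as-is.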
2449 auto C1 = B.buildFConstant(Ty, C1Val);
2450 auto CopySign = B.buildFCopysign(Ty, C1, Src);
2451
2452 // TODO: Should this propagate fast-math-flags?
2453 auto Tmp1 = B.buildFAdd(Ty, Src, CopySign);
2454 auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign);
2455
2456 auto C2 = B.buildFConstant(Ty, C2Val);
2457 auto Fabs = B.buildFAbs(Ty, Src);
2458
2459 auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2);
2460 B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2);
2461 MI.eraseFromParent();
2462 return true;
2463}
2464
2467 MachineIRBuilder &B) const {
2468
2469 const LLT S1 = LLT::scalar(1);
2470 const LLT S64 = LLT::scalar(64);
2471
2472 Register Src = MI.getOperand(1).getReg();
2473 assert(MRI.getType(Src) == S64);
2474
2475 // result = trunc(src)
2476 // if (src > 0.0 && src != result)
2477 // result += 1.0
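 // For example, ceil(2.5): trunc = 2.0, and 2.5 > 0.0 with 2.5 != 2.0, so the
 // result is 3.0. For ceil(-2.5): trunc = -2.0 and the source is not positive,
 // so the result stays -2.0.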
2478
2479 auto Trunc = B.buildIntrinsicTrunc(S64, Src);
2480
2481 const auto Zero = B.buildFConstant(S64, 0.0);
2482 const auto One = B.buildFConstant(S64, 1.0);
2483 auto Lt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero);
2484 auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc);
2485 auto And = B.buildAnd(S1, Lt0, NeTrunc);
2486 auto Add = B.buildSelect(S64, And, One, Zero);
2487
2488 // TODO: Should this propagate fast-math-flags?
2489 B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add);
2490 MI.eraseFromParent();
2491 return true;
2492}
2493
2496 MachineIRBuilder &B) const {
2497 Register DstReg = MI.getOperand(0).getReg();
2498 Register Src0Reg = MI.getOperand(1).getReg();
2499 Register Src1Reg = MI.getOperand(2).getReg();
2500 auto Flags = MI.getFlags();
2501 LLT Ty = MRI.getType(DstReg);
2502
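 // frem(x, y) == x - trunc(x / y) * y, computed below as
 // fma(-trunc(x / y), y, x).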
2503 auto Div = B.buildFDiv(Ty, Src0Reg, Src1Reg, Flags);
2504 auto Trunc = B.buildIntrinsicTrunc(Ty, Div, Flags);
2505 auto Neg = B.buildFNeg(Ty, Trunc, Flags);
2506 B.buildFMA(DstReg, Neg, Src1Reg, Src0Reg, Flags);
2507 MI.eraseFromParent();
2508 return true;
2509}
2510
2513 const unsigned FractBits = 52;
2514 const unsigned ExpBits = 11;
2515 LLT S32 = LLT::scalar(32);
2516
2517 auto Const0 = B.buildConstant(S32, FractBits - 32);
2518 auto Const1 = B.buildConstant(S32, ExpBits);
2519
2520 auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32})
2521 .addUse(Hi)
2522 .addUse(Const0.getReg(0))
2523 .addUse(Const1.getReg(0));
2524
2525 return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023));
2526}
2527
2530 MachineIRBuilder &B) const {
2531 const LLT S1 = LLT::scalar(1);
2532 const LLT S32 = LLT::scalar(32);
2533 const LLT S64 = LLT::scalar(64);
2534
2535 Register Src = MI.getOperand(1).getReg();
2536 assert(MRI.getType(Src) == S64);
2537
2538 // TODO: Should this use extract since the low half is unused?
2539 auto Unmerge = B.buildUnmerge({S32, S32}, Src);
2540 Register Hi = Unmerge.getReg(1);
2541
2542 // Extract the upper half, since this is where we will find the sign and
2543 // exponent.
2544 auto Exp = extractF64Exponent(Hi, B);
2545
2546 const unsigned FractBits = 52;
2547
2548 // Extract the sign bit.
2549 const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31);
2550 auto SignBit = B.buildAnd(S32, Hi, SignBitMask);
2551
2552 const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1);
2553
2554 const auto Zero32 = B.buildConstant(S32, 0);
2555
2556 // Extend back to 64-bits.
2557 auto SignBit64 = B.buildMergeLikeInstr(S64, {Zero32, SignBit});
2558
2559 auto Shr = B.buildAShr(S64, FractMask, Exp);
2560 auto Not = B.buildNot(S64, Shr);
2561 auto Tmp0 = B.buildAnd(S64, Src, Not);
2562 auto FiftyOne = B.buildConstant(S32, FractBits - 1);
2563
2564 auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32);
2565 auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne);
2566
2567 auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0);
2568 B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1);
2569 MI.eraseFromParent();
2570 return true;
2571}
2572
2575 MachineIRBuilder &B, bool Signed) const {
2576
2577 Register Dst = MI.getOperand(0).getReg();
2578 Register Src = MI.getOperand(1).getReg();
2579
2580 const LLT S64 = LLT::scalar(64);
2581 const LLT S32 = LLT::scalar(32);
2582
2583 assert(MRI.getType(Src) == S64);
2584
2585 auto Unmerge = B.buildUnmerge({S32, S32}, Src);
2586 auto ThirtyTwo = B.buildConstant(S32, 32);
2587
2588 if (MRI.getType(Dst) == S64) {
2589 auto CvtHi = Signed ? B.buildSITOFP(S64, Unmerge.getReg(1))
2590 : B.buildUITOFP(S64, Unmerge.getReg(1));
2591
2592 auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0));
2593 auto LdExp = B.buildFLdexp(S64, CvtHi, ThirtyTwo);
2594
2595 // TODO: Should this propagate fast-math-flags?
2596 B.buildFAdd(Dst, LdExp, CvtLo);
2597 MI.eraseFromParent();
2598 return true;
2599 }
2600
2601 assert(MRI.getType(Dst) == S32);
2602
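 // Rough shape of the 64-bit integer to f32 path below: shift the source left
 // by the number of leading zeros of the high word (clamped for the signed
 // case), convert the high 32 bits of the shifted value, fold any nonzero low
 // bits into a sticky bit so rounding stays correct, and rescale the result
 // with ldexp by (32 - shift).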
2603 auto One = B.buildConstant(S32, 1);
2604
2605 MachineInstrBuilder ShAmt;
2606 if (Signed) {
2607 auto ThirtyOne = B.buildConstant(S32, 31);
2608 auto X = B.buildXor(S32, Unmerge.getReg(0), Unmerge.getReg(1));
2609 auto OppositeSign = B.buildAShr(S32, X, ThirtyOne);
2610 auto MaxShAmt = B.buildAdd(S32, ThirtyTwo, OppositeSign);
2611 auto LS = B.buildIntrinsic(Intrinsic::amdgcn_sffbh, {S32})
2612 .addUse(Unmerge.getReg(1));
2613 auto LS2 = B.buildSub(S32, LS, One);
2614 ShAmt = B.buildUMin(S32, LS2, MaxShAmt);
2615 } else
2616 ShAmt = B.buildCTLZ(S32, Unmerge.getReg(1));
2617 auto Norm = B.buildShl(S64, Src, ShAmt);
2618 auto Unmerge2 = B.buildUnmerge({S32, S32}, Norm);
2619 auto Adjust = B.buildUMin(S32, One, Unmerge2.getReg(0));
2620 auto Norm2 = B.buildOr(S32, Unmerge2.getReg(1), Adjust);
2621 auto FVal = Signed ? B.buildSITOFP(S32, Norm2) : B.buildUITOFP(S32, Norm2);
2622 auto Scale = B.buildSub(S32, ThirtyTwo, ShAmt);
2623 B.buildFLdexp(Dst, FVal, Scale);
2624 MI.eraseFromParent();
2625 return true;
2626}
2627
2628// TODO: Copied from DAG implementation. Verify logic and document how this
2629// actually works.
2633 bool Signed) const {
2634
2635 Register Dst = MI.getOperand(0).getReg();
2636 Register Src = MI.getOperand(1).getReg();
2637
2638 const LLT S64 = LLT::scalar(64);
2639 const LLT S32 = LLT::scalar(32);
2640
2641 const LLT SrcLT = MRI.getType(Src);
2642 assert((SrcLT == S32 || SrcLT == S64) && MRI.getType(Dst) == S64);
2643
2644 unsigned Flags = MI.getFlags();
2645
2646 // The basic idea of converting a floating point number into a pair of 32-bit
2647 // integers is illustrated as follows:
2648 //
2649 // tf := trunc(val);
2650 // hif := floor(tf * 2^-32);
2651 // lof := tf - hif * 2^32; // lof is always positive due to floor.
2652 // hi := fptoi(hif);
2653 // lo := fptoi(lof);
2654 //
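 // For example, with val = 2^40 + 7 (exactly representable in f64):
 //   tf = 2^40 + 7, hif = floor(tf * 2^-32) = 256,
 //   lof = tf - 256 * 2^32 = 7, so hi = 256 and lo = 7.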
2655 auto Trunc = B.buildIntrinsicTrunc(SrcLT, Src, Flags);
2657 if (Signed && SrcLT == S32) {
2658 // However, a 32-bit floating point number has only a 23-bit mantissa, which
2659 // is not enough to hold all the significant bits of `lof` if val is
2660 // negative. To avoid the loss of precision, we need to take the absolute
2661 // value after truncating and flip the result back based on the original
2662 // signedness.
2663 Sign = B.buildAShr(S32, Src, B.buildConstant(S32, 31));
2664 Trunc = B.buildFAbs(S32, Trunc, Flags);
2665 }
2666 MachineInstrBuilder K0, K1;
2667 if (SrcLT == S64) {
2668 K0 = B.buildFConstant(
2669 S64, llvm::bit_cast<double>(UINT64_C(/*2^-32*/ 0x3df0000000000000)));
2670 K1 = B.buildFConstant(
2671 S64, llvm::bit_cast<double>(UINT64_C(/*-2^32*/ 0xc1f0000000000000)));
2672 } else {
2673 K0 = B.buildFConstant(
2674 S32, llvm::bit_cast<float>(UINT32_C(/*2^-32*/ 0x2f800000)));
2675 K1 = B.buildFConstant(
2676 S32, llvm::bit_cast<float>(UINT32_C(/*-2^32*/ 0xcf800000)));
2677 }
2678
2679 auto Mul = B.buildFMul(SrcLT, Trunc, K0, Flags);
2680 auto FloorMul = B.buildFFloor(SrcLT, Mul, Flags);
2681 auto Fma = B.buildFMA(SrcLT, FloorMul, K1, Trunc, Flags);
2682
2683 auto Hi = (Signed && SrcLT == S64) ? B.buildFPTOSI(S32, FloorMul)
2684 : B.buildFPTOUI(S32, FloorMul);
2685 auto Lo = B.buildFPTOUI(S32, Fma);
2686
2687 if (Signed && SrcLT == S32) {
2688 // Flip the result based on the signedness, which is either all 0s or 1s.
2689 Sign = B.buildMergeLikeInstr(S64, {Sign, Sign});
2690 // r := xor({lo, hi}, sign) - sign;
2691 B.buildSub(Dst, B.buildXor(S64, B.buildMergeLikeInstr(S64, {Lo, Hi}), Sign),
2692 Sign);
2693 } else
2694 B.buildMergeLikeInstr(Dst, {Lo, Hi});
2695 MI.eraseFromParent();
2696
2697 return true;
2698}
2699
2701 MachineInstr &MI) const {
2702 MachineFunction &MF = Helper.MIRBuilder.getMF();
2704
2705 const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE ||
2706 MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE;
2707
2708 // With ieee_mode disabled, the instructions have the correct behavior
2709 // already for G_FMINNUM/G_FMAXNUM
2710 if (!MFI->getMode().IEEE)
2711 return !IsIEEEOp;
2712
2713 if (IsIEEEOp)
2714 return true;
2715
2717}
2718
2721 MachineIRBuilder &B) const {
2722 // TODO: Should move some of this into LegalizerHelper.
2723
2724 // TODO: Promote dynamic indexing of s16 to s32
2725
2726 Register Dst = MI.getOperand(0).getReg();
2727 Register Vec = MI.getOperand(1).getReg();
2728
2729 LLT VecTy = MRI.getType(Vec);
2730 LLT EltTy = VecTy.getElementType();
2731 assert(EltTy == MRI.getType(Dst));
2732
2733 // Other legalization maps vector<? x [type bigger than 64 bits]> via bitcasts
2734 // but we can't go directly to that logic because you can't bitcast a vector
2735 // of pointers to a vector of integers. Therefore, introduce an intermediate
2736 // vector of integers using ptrtoint (and inttoptr on the output) in order to
2737 // drive the legalization forward.
2738 if (EltTy.isPointer() && EltTy.getSizeInBits() > 64) {
2739 LLT IntTy = LLT::scalar(EltTy.getSizeInBits());
2740 LLT IntVecTy = VecTy.changeElementType(IntTy);
2741
2742 auto IntVec = B.buildPtrToInt(IntVecTy, Vec);
2743 auto IntElt = B.buildExtractVectorElement(IntTy, IntVec, MI.getOperand(2));
2744 B.buildIntToPtr(Dst, IntElt);
2745
2746 MI.eraseFromParent();
2747 return true;
2748 }
2749
2750 // FIXME: Artifact combiner probably should have replaced the truncated
2751 // constant before this, so we shouldn't need
2752 // getIConstantVRegValWithLookThrough.
2753 std::optional<ValueAndVReg> MaybeIdxVal =
2754 getIConstantVRegValWithLookThrough(MI.getOperand(2).getReg(), MRI);
2755 if (!MaybeIdxVal) // Dynamic case will be selected to register indexing.
2756 return true;
2757 const uint64_t IdxVal = MaybeIdxVal->Value.getZExtValue();
2758
2759 if (IdxVal < VecTy.getNumElements()) {
2760 auto Unmerge = B.buildUnmerge(EltTy, Vec);
2761 B.buildCopy(Dst, Unmerge.getReg(IdxVal));
2762 } else {
2763 B.buildUndef(Dst);
2764 }
2765
2766 MI.eraseFromParent();
2767 return true;
2768}
2769
2772 MachineIRBuilder &B) const {
2773 // TODO: Should move some of this into LegalizerHelper.
2774
2775 // TODO: Promote dynamic indexing of s16 to s32
2776
2777 Register Dst = MI.getOperand(0).getReg();
2778 Register Vec = MI.getOperand(1).getReg();
2779 Register Ins = MI.getOperand(2).getReg();
2780
2781 LLT VecTy = MRI.getType(Vec);
2782 LLT EltTy = VecTy.getElementType();
2783 assert(EltTy == MRI.getType(Ins));
2784
2785 // Other legalization maps vector<? x [type bigger than 64 bits]> via bitcasts
2786 // but we can't go directly to that logic because you can't bitcast a vector
2787 // of pointers to a vector of integers. Therefore, make the pointer vector
2788 // into an equivalent vector of integers with ptrtoint, insert the ptrtoint'd
2789 // new value, and then inttoptr the result vector back. This will then allow
2790 // the rest of legalization to take over.
2791 if (EltTy.isPointer() && EltTy.getSizeInBits() > 64) {
2792 LLT IntTy = LLT::scalar(EltTy.getSizeInBits());
2793 LLT IntVecTy = VecTy.changeElementType(IntTy);
2794
2795 auto IntVecSource = B.buildPtrToInt(IntVecTy, Vec);
2796 auto IntIns = B.buildPtrToInt(IntTy, Ins);
2797 auto IntVecDest = B.buildInsertVectorElement(IntVecTy, IntVecSource, IntIns,
2798 MI.getOperand(3));
2799 B.buildIntToPtr(Dst, IntVecDest);
2800 MI.eraseFromParent();
2801 return true;
2802 }
2803
2804 // FIXME: Artifact combiner probably should have replaced the truncated
2805 // constant before this, so we shouldn't need
2806 // getIConstantVRegValWithLookThrough.
2807 std::optional<ValueAndVReg> MaybeIdxVal =
2808 getIConstantVRegValWithLookThrough(MI.getOperand(3).getReg(), MRI);
2809 if (!MaybeIdxVal) // Dynamic case will be selected to register indexing.
2810 return true;
2811
2812 const uint64_t IdxVal = MaybeIdxVal->Value.getZExtValue();
2813
2814 unsigned NumElts = VecTy.getNumElements();
2815 if (IdxVal < NumElts) {
2817 for (unsigned i = 0; i < NumElts; ++i)
2818 SrcRegs.push_back(MRI.createGenericVirtualRegister(EltTy));
2819 B.buildUnmerge(SrcRegs, Vec);
2820
2821 SrcRegs[IdxVal] = MI.getOperand(2).getReg();
2822 B.buildMergeLikeInstr(Dst, SrcRegs);
2823 } else {
2824 B.buildUndef(Dst);
2825 }
2826
2827 MI.eraseFromParent();
2828 return true;
2829}
2830
2833 MachineIRBuilder &B) const {
2834
2835 Register DstReg = MI.getOperand(0).getReg();
2836 Register SrcReg = MI.getOperand(1).getReg();
2837 LLT Ty = MRI.getType(DstReg);
2838 unsigned Flags = MI.getFlags();
2839
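 // The amdgcn sin/cos intrinsics take their input as a fraction of a full
 // turn, so scale the argument by 1/(2*pi) first; subtargets with a reduced
 // trig input range additionally take the fractional part via amdgcn_fract.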
2840 Register TrigVal;
2841 auto OneOver2Pi = B.buildFConstant(Ty, 0.5 * numbers::inv_pi);
2842 if (ST.hasTrigReducedRange()) {
2843 auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags);
2844 TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty})
2845 .addUse(MulVal.getReg(0))
2846 .setMIFlags(Flags)
2847 .getReg(0);
2848 } else
2849 TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0);
2850
2851 Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ?
2852 Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos;
2853 B.buildIntrinsic(TrigIntrin, ArrayRef<Register>(DstReg))
2854 .addUse(TrigVal)
2855 .setMIFlags(Flags);
2856 MI.eraseFromParent();
2857 return true;
2858}
2859
2862 const GlobalValue *GV,
2863 int64_t Offset,
2864 unsigned GAFlags) const {
2865 assert(isInt<32>(Offset + 4) && "32-bit offset is expected!");
2866 // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered
2867 // to the following code sequence:
2868 //
2869 // For constant address space:
2870 // s_getpc_b64 s[0:1]
2871 // s_add_u32 s0, s0, $symbol
2872 // s_addc_u32 s1, s1, 0
2873 //
2874 // s_getpc_b64 returns the address of the s_add_u32 instruction and then
2875 // a fixup or relocation is emitted to replace $symbol with a literal
2876 // constant, which is a pc-relative offset from the encoding of the $symbol
2877 // operand to the global variable.
2878 //
2879 // For global address space:
2880 // s_getpc_b64 s[0:1]
2881 // s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo
2882 // s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi
2883 //
2884 // s_getpc_b64 returns the address of the s_add_u32 instruction and then
2885 // fixups or relocations are emitted to replace $symbol@*@lo and
2886 // $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant,
2887 // which is a 64-bit pc-relative offset from the encoding of the $symbol
2888 // operand to the global variable.
2889
2891
2892 Register PCReg = PtrTy.getSizeInBits() != 32 ? DstReg :
2893 B.getMRI()->createGenericVirtualRegister(ConstPtrTy);
2894
2895 MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET)
2896 .addDef(PCReg);
2897
2898 MIB.addGlobalAddress(GV, Offset, GAFlags);
2899 if (GAFlags == SIInstrInfo::MO_NONE)
2900 MIB.addImm(0);
2901 else
2902 MIB.addGlobalAddress(GV, Offset, GAFlags + 1);
2903
2904 if (!B.getMRI()->getRegClassOrNull(PCReg))
2905 B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass);
2906
2907 if (PtrTy.getSizeInBits() == 32)
2908 B.buildExtract(DstReg, PCReg, 0);
2909 return true;
2910}
2911
2912// Emit an ABS32_LO / ABS32_HI relocation stub.
2914 Register DstReg, LLT PtrTy, MachineIRBuilder &B, const GlobalValue *GV,
2915 MachineRegisterInfo &MRI) const {
2916 bool RequiresHighHalf = PtrTy.getSizeInBits() != 32;
2917
2918 LLT S32 = LLT::scalar(32);
2919
2920 // Use the destination directly, if and only if we are storing only the lower
2921 // address part and no register class has been set.
2922 Register AddrLo = !RequiresHighHalf && !MRI.getRegClassOrNull(DstReg)
2923 ? DstReg
2924 : MRI.createGenericVirtualRegister(S32);
2925
2926 if (!MRI.getRegClassOrNull(AddrLo))
2927 MRI.setRegClass(AddrLo, &AMDGPU::SReg_32RegClass);
2928
2929 // Write the lower half.
2930 B.buildInstr(AMDGPU::S_MOV_B32)
2931 .addDef(AddrLo)
2932 .addGlobalAddress(GV, 0, SIInstrInfo::MO_ABS32_LO);
2933
2934 // If required, write the upper half as well.
2935 if (RequiresHighHalf) {
2936 assert(PtrTy.getSizeInBits() == 64 &&
2937 "Must provide a 64-bit pointer type!");
2938
2939 Register AddrHi = MRI.createGenericVirtualRegister(S32);
2940 MRI.setRegClass(AddrHi, &AMDGPU::SReg_32RegClass);
2941
2942 B.buildInstr(AMDGPU::S_MOV_B32)
2943 .addDef(AddrHi)
2944 .addGlobalAddress(GV, 0, SIInstrInfo::MO_ABS32_HI);
2945
2946 // Use the destination directly, if and only if we don't have a register
2947 // class being set.
2948 Register AddrDst = !MRI.getRegClassOrNull(DstReg)
2949 ? DstReg
2950 : MRI.createGenericVirtualRegister(LLT::scalar(64));
2951
2952 if (!MRI.getRegClassOrNull(AddrDst))
2953 MRI.setRegClass(AddrDst, &AMDGPU::SReg_64RegClass);
2954
2955 B.buildMergeValues(AddrDst, {AddrLo, AddrHi});
2956
2957 // If we created a new register for the destination, cast the result into
2958 // the final output.
2959 if (AddrDst != DstReg)
2960 B.buildCast(DstReg, AddrDst);
2961 } else if (AddrLo != DstReg) {
2962 // If we created a new register for the destination, cast the result into
2963 // the final output.
2964 B.buildCast(DstReg, AddrLo);
2965 }
2966}
2967
2970 MachineIRBuilder &B) const {
2971 Register DstReg = MI.getOperand(0).getReg();
2972 LLT Ty = MRI.getType(DstReg);
2973 unsigned AS = Ty.getAddressSpace();
2974
2975 const GlobalValue *GV = MI.getOperand(1).getGlobal();
2976 MachineFunction &MF = B.getMF();
2978
2980 if (!MFI->isModuleEntryFunction() &&
2981 GV->getName() != "llvm.amdgcn.module.lds" &&
2982 !AMDGPU::isNamedBarrier(*cast<GlobalVariable>(GV))) {
2983 const Function &Fn = MF.getFunction();
2984 DiagnosticInfoUnsupported BadLDSDecl(
2985 Fn, "local memory global used by non-kernel function", MI.getDebugLoc(),
2986 DS_Warning);
2987 Fn.getContext().diagnose(BadLDSDecl);
2988
2989 // We currently don't have a way to correctly allocate LDS objects that
2990 // aren't directly associated with a kernel. We do force inlining of
2991 // functions that use local objects. However, if these dead functions are
2992 // not eliminated, we don't want a compile time error. Just emit a warning
2993 // and a trap, since there should be no callable path here.
2994 B.buildTrap();
2995 B.buildUndef(DstReg);
2996 MI.eraseFromParent();
2997 return true;
2998 }
2999
3000 // TODO: We could emit code to handle the initialization somewhere.
3001 // We ignore the initializer for now and legalize it to allow selection.
3002 // The initializer will be rejected with an error during assembly emission anyway.
3003 const SITargetLowering *TLI = ST.getTargetLowering();
3004 if (!TLI->shouldUseLDSConstAddress(GV)) {
3005 MI.getOperand(1).setTargetFlags(SIInstrInfo::MO_ABS32_LO);
3006 return true; // Leave in place;
3007 }
3008
3009 if (AS == AMDGPUAS::LOCAL_ADDRESS && GV->hasExternalLinkage()) {
3010 Type *Ty = GV->getValueType();
3011 // HIP uses an unsized array `extern __shared__ T s[]` or similar
3012 // zero-sized type in other languages to declare the dynamic shared
3013 // memory whose size is not known at compile time. They will be
3014 // allocated by the runtime and placed directly after the statically
3015 // allocated ones. They all share the same offset.
3016 if (B.getDataLayout().getTypeAllocSize(Ty).isZero()) {
3017 // Adjust alignment for that dynamic shared memory array.
3018 MFI->setDynLDSAlign(MF.getFunction(), *cast<GlobalVariable>(GV));
3019 LLT S32 = LLT::scalar(32);
3020 auto Sz = B.buildIntrinsic(Intrinsic::amdgcn_groupstaticsize, {S32});
3021 B.buildIntToPtr(DstReg, Sz);
3022 MI.eraseFromParent();
3023 return true;
3024 }
3025 }
3026
3027 B.buildConstant(DstReg, MFI->allocateLDSGlobal(B.getDataLayout(),
3028 *cast<GlobalVariable>(GV)));
3029 MI.eraseFromParent();
3030 return true;
3031 }
3032
3033 if (ST.isAmdPalOS() || ST.isMesa3DOS()) {
3034 buildAbsGlobalAddress(DstReg, Ty, B, GV, MRI);
3035 MI.eraseFromParent();
3036 return true;
3037 }
3038
3039 const SITargetLowering *TLI = ST.getTargetLowering();
3040
3041 if (TLI->shouldEmitFixup(GV)) {
3042 buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0);
3043 MI.eraseFromParent();
3044 return true;
3045 }
3046
3047 if (TLI->shouldEmitPCReloc(GV)) {
3048 buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32);
3049 MI.eraseFromParent();
3050 return true;
3051 }
3052
3054 Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy);
3055
3056 LLT LoadTy = Ty.getSizeInBits() == 32 ? PtrTy : Ty;
3061 LoadTy, Align(8));
3062
3063 buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32);
3064
3065 if (Ty.getSizeInBits() == 32) {
3066 // Truncate if this is a 32-bit constant address.
3067 auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO);
3068 B.buildExtract(DstReg, Load, 0);
3069 } else
3070 B.buildLoad(DstReg, GOTAddr, *GOTMMO);
3071
3072 MI.eraseFromParent();
3073 return true;
3074}
3075
3077 if (Ty.isVector())
3078 return Ty.changeElementCount(
3081}
3082
3084 MachineInstr &MI) const {
3085 MachineIRBuilder &B = Helper.MIRBuilder;
3086 MachineRegisterInfo &MRI = *B.getMRI();
3087 GISelChangeObserver &Observer = Helper.Observer;
3088
3089 Register PtrReg = MI.getOperand(1).getReg();
3090 LLT PtrTy = MRI.getType(PtrReg);
3091 unsigned AddrSpace = PtrTy.getAddressSpace();
3092
3093 if (AddrSpace == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
3095 auto Cast = B.buildAddrSpaceCast(ConstPtr, PtrReg);
3096 Observer.changingInstr(MI);
3097 MI.getOperand(1).setReg(Cast.getReg(0));
3098 Observer.changedInstr(MI);
3099 return true;
3100 }
3101
3102 if (MI.getOpcode() != AMDGPU::G_LOAD)
3103 return false;
3104
3105 Register ValReg = MI.getOperand(0).getReg();
3106 LLT ValTy = MRI.getType(ValReg);
3107
3108 if (hasBufferRsrcWorkaround(ValTy)) {
3109 Observer.changingInstr(MI);
3111 Observer.changedInstr(MI);
3112 return true;
3113 }
3114
3115 MachineMemOperand *MMO = *MI.memoperands_begin();
3116 const unsigned ValSize = ValTy.getSizeInBits();
3117 const LLT MemTy = MMO->getMemoryType();
3118 const Align MemAlign = MMO->getAlign();
3119 const unsigned MemSize = MemTy.getSizeInBits();
3120 const uint64_t AlignInBits = 8 * MemAlign.value();
3121
3122 // Widen non-power-of-2 loads to the alignment if needed
3123 if (shouldWidenLoad(ST, MemTy, AlignInBits, AddrSpace, MI.getOpcode())) {
3124 const unsigned WideMemSize = PowerOf2Ceil(MemSize);
3125
3126 // This was already the correct extending load result type, so just adjust
3127 // the memory type.
3128 if (WideMemSize == ValSize) {
3129 MachineFunction &MF = B.getMF();
3130
3131 MachineMemOperand *WideMMO =
3132 MF.getMachineMemOperand(MMO, 0, WideMemSize / 8);
3133 Observer.changingInstr(MI);
3134 MI.setMemRefs(MF, {WideMMO});
3135 Observer.changedInstr(MI);
3136 return true;
3137 }
3138
3139 // Don't bother handling an edge case that should probably never be produced.
3140 if (ValSize > WideMemSize)
3141 return false;
3142
3143 LLT WideTy = widenToNextPowerOf2(ValTy);
3144
3145 Register WideLoad;
3146 if (!WideTy.isVector()) {
3147 WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
3148 B.buildTrunc(ValReg, WideLoad).getReg(0);
3149 } else {
3150 // Extract the subvector.
3151
3152 if (isRegisterType(ValTy)) {
3153 // If this a case where G_EXTRACT is legal, use it.
3154 // (e.g. <3 x s32> -> <4 x s32>)
3155 WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
3156 B.buildExtract(ValReg, WideLoad, 0);
3157 } else {
3158 // For cases where the widened type isn't a nice register value, unmerge
3159 // from a widened register (e.g. <3 x s16> -> <4 x s16>)
3160 WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
3161 B.buildDeleteTrailingVectorElements(ValReg, WideLoad);
3162 }
3163 }
3164
3165 MI.eraseFromParent();
3166 return true;
3167 }
3168
3169 return false;
3170}
3171
3173 MachineInstr &MI) const {
3174 MachineIRBuilder &B = Helper.MIRBuilder;
3175 MachineRegisterInfo &MRI = *B.getMRI();
3176 GISelChangeObserver &Observer = Helper.Observer;
3177
3178 Register DataReg = MI.getOperand(0).getReg();
3179 LLT DataTy = MRI.getType(DataReg);
3180
3181 if (hasBufferRsrcWorkaround(DataTy)) {
3182 Observer.changingInstr(MI);
3184 Observer.changedInstr(MI);
3185 return true;
3186 }
3187 return false;
3188}
3189
3192 MachineIRBuilder &B) const {
3193 LLT Ty = MRI.getType(MI.getOperand(0).getReg());
3194 assert(Ty.isScalar());
3195
3196 MachineFunction &MF = B.getMF();
3198
3199 // TODO: Always legal with future ftz flag.
3200 // FIXME: Do we need just output?
3201 if (Ty == LLT::float32() &&
3203 return true;
3204 if (Ty == LLT::float16() &&
3206 return true;
3207
3208 MachineIRBuilder HelperBuilder(MI);
3209 GISelObserverWrapper DummyObserver;
3210 LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
3211 return Helper.lowerFMad(MI) == LegalizerHelper::Legalized;
3212}
3213
3216 Register DstReg = MI.getOperand(0).getReg();
3217 Register PtrReg = MI.getOperand(1).getReg();
3218 Register CmpVal = MI.getOperand(2).getReg();
3219 Register NewVal = MI.getOperand(3).getReg();
3220
3221 assert(AMDGPU::isFlatGlobalAddrSpace(MRI.getType(PtrReg).getAddressSpace()) &&
3222 "this should not have been custom lowered");
3223
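 // The target cmpswap instructions take the new value and the compare value
 // as one packed data operand, so build {NewVal, CmpVal} as a two-element
 // vector and hand it to G_AMDGPU_ATOMIC_CMPXCHG.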
3224 LLT ValTy = MRI.getType(CmpVal);
3225 LLT VecTy = LLT::fixed_vector(2, ValTy);
3226
3227 Register PackedVal = B.buildBuildVector(VecTy, { NewVal, CmpVal }).getReg(0);
3228
3229 B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG)
3230 .addDef(DstReg)
3231 .addUse(PtrReg)
3232 .addUse(PackedVal)
3233 .setMemRefs(MI.memoperands());
3234
3235 MI.eraseFromParent();
3236 return true;
3237}
3238
3239/// Return true if it's known that \p Src can never be an f32 denormal value.
3241 Register Src) {
3242 const MachineInstr *DefMI = MRI.getVRegDef(Src);
3243 switch (DefMI->getOpcode()) {
3244 case TargetOpcode::G_INTRINSIC: {
3245 switch (cast<GIntrinsic>(DefMI)->getIntrinsicID()) {
3246 case Intrinsic::amdgcn_frexp_mant:
3247 return true;
3248 default:
3249 break;
3250 }
3251
3252 break;
3253 }
3254 case TargetOpcode::G_FFREXP: {
3255 if (DefMI->getOperand(0).getReg() == Src)
3256 return true;
3257 break;
3258 }
3259 case TargetOpcode::G_FPEXT: {
3260 return MRI.getType(DefMI->getOperand(1).getReg()) == LLT::scalar(16);
3261 }
3262 default:
3263 return false;
3264 }
3265
3266 return false;
3267}
3268
3269static bool allowApproxFunc(const MachineFunction &MF, unsigned Flags) {
3270 if (Flags & MachineInstr::FmAfn)
3271 return true;
3272 const auto &Options = MF.getTarget().Options;
3273 return Options.UnsafeFPMath || Options.ApproxFuncFPMath;
3274}
3275
3277 unsigned Flags) {
3278 return !valueIsKnownNeverF32Denorm(MF.getRegInfo(), Src) &&
3281}
3282
3283std::pair<Register, Register>
3285 unsigned Flags) const {
3286 if (!needsDenormHandlingF32(B.getMF(), Src, Flags))
3287 return {};
3288
3289 const LLT F32 = LLT::scalar(32);
3290 auto SmallestNormal = B.buildFConstant(
3292 auto IsLtSmallestNormal =
3293 B.buildFCmp(CmpInst::FCMP_OLT, LLT::scalar(1), Src, SmallestNormal);
3294
3295 auto Scale32 = B.buildFConstant(F32, 0x1.0p+32);
3296 auto One = B.buildFConstant(F32, 1.0);
3297 auto ScaleFactor =
3298 B.buildSelect(F32, IsLtSmallestNormal, Scale32, One, Flags);
3299 auto ScaledInput = B.buildFMul(F32, Src, ScaleFactor, Flags);
3300
3301 return {ScaledInput.getReg(0), IsLtSmallestNormal.getReg(0)};
3302}
3303
3305 MachineIRBuilder &B) const {
3306 // v_log_f32 is good enough for OpenCL, except it doesn't handle denormals.
3307 // If we have to handle denormals, scale up the input and adjust the result.
3308
3309 // scaled = x * (is_denormal ? 0x1.0p+32 : 1.0)
3310 // log2 = amdgpu_log2 - (is_denormal ? 32.0 : 0.0)
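 // This works because log2(x * 2^32) == log2(x) + 32, so the scaling is
 // undone by subtracting 32 from the intrinsic's result.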
3311
3312 Register Dst = MI.getOperand(0).getReg();
3313 Register Src = MI.getOperand(1).getReg();
3314 LLT Ty = B.getMRI()->getType(Dst);
3315 unsigned Flags = MI.getFlags();
3316
3317 if (Ty == LLT::scalar(16)) {
3318 const LLT F32 = LLT::scalar(32);
3319 // Nothing in half is a denormal when promoted to f32.
3320 auto Ext = B.buildFPExt(F32, Src, Flags);
3321 auto Log2 = B.buildIntrinsic(Intrinsic::amdgcn_log, {F32})
3322 .addUse(Ext.getReg(0))
3323 .setMIFlags(Flags);
3324 B.buildFPTrunc(Dst, Log2, Flags);
3325 MI.eraseFromParent();
3326 return true;
3327 }
3328
3329 assert(Ty == LLT::scalar(32));
3330
3331 auto [ScaledInput, IsLtSmallestNormal] = getScaledLogInput(B, Src, Flags);
3332 if (!ScaledInput) {
3333 B.buildIntrinsic(Intrinsic::amdgcn_log, {MI.getOperand(0)})
3334 .addUse(Src)
3335 .setMIFlags(Flags);
3336 MI.eraseFromParent();
3337 return true;
3338 }
3339
3340 auto Log2 = B.buildIntrinsic(Intrinsic::amdgcn_log, {Ty})
3341 .addUse(ScaledInput)
3342 .setMIFlags(Flags);
3343
3344 auto ThirtyTwo = B.buildFConstant(Ty, 32.0);
3345 auto Zero = B.buildFConstant(Ty, 0.0);
3346 auto ResultOffset =
3347 B.buildSelect(Ty, IsLtSmallestNormal, ThirtyTwo, Zero, Flags);
3348 B.buildFSub(Dst, Log2, ResultOffset, Flags);
3349
3350 MI.eraseFromParent();
3351 return true;
3352}
3353
3355 Register Z, unsigned Flags) {
3356 auto FMul = B.buildFMul(Ty, X, Y, Flags);
3357 return B.buildFAdd(Ty, FMul, Z, Flags).getReg(0);
3358}
3359
3361 MachineIRBuilder &B) const {
3362 const bool IsLog10 = MI.getOpcode() == TargetOpcode::G_FLOG10;
3363 assert(IsLog10 || MI.getOpcode() == TargetOpcode::G_FLOG);
3364
3365 MachineRegisterInfo &MRI = *B.getMRI();
3366 Register Dst = MI.getOperand(0).getReg();
3367 Register X = MI.getOperand(1).getReg();
3368 unsigned Flags = MI.getFlags();
3369 const LLT Ty = MRI.getType(X);
3370 MachineFunction &MF = B.getMF();
3371
3372 const LLT F32 = LLT::scalar(32);
3373 const LLT F16 = LLT::scalar(16);
3374
3375 const AMDGPUTargetMachine &TM =
3376 static_cast<const AMDGPUTargetMachine &>(MF.getTarget());
3377
3378 if (Ty == F16 || MI.getFlag(MachineInstr::FmAfn) ||
3379 TM.Options.ApproxFuncFPMath || TM.Options.UnsafeFPMath) {
3380 if (Ty == F16 && !ST.has16BitInsts()) {
3381 Register LogVal = MRI.createGenericVirtualRegister(F32);
3382 auto PromoteSrc = B.buildFPExt(F32, X);
3383 legalizeFlogUnsafe(B, LogVal, PromoteSrc.getReg(0), IsLog10, Flags);
3384 B.buildFPTrunc(Dst, LogVal);
3385 } else {
3386 legalizeFlogUnsafe(B, Dst, X, IsLog10, Flags);
3387 }
3388
3389 MI.eraseFromParent();
3390 return true;
3391 }
3392
3393 auto [ScaledInput, IsScaled] = getScaledLogInput(B, X, Flags);
3394 if (ScaledInput)
3395 X = ScaledInput;
3396
3397 auto Y =
3398 B.buildIntrinsic(Intrinsic::amdgcn_log, {Ty}).addUse(X).setMIFlags(Flags);
3399
3400 Register R;
3401 if (ST.hasFastFMAF32()) {
3402 // c+cc are ln(2)/ln(10) to more than 49 bits
3403 const float c_log10 = 0x1.344134p-2f;
3404 const float cc_log10 = 0x1.09f79ep-26f;
3405
3406 // c + cc is ln(2) to more than 49 bits
3407 const float c_log = 0x1.62e42ep-1f;
3408 const float cc_log = 0x1.efa39ep-25f;
3409
3410 auto C = B.buildFConstant(Ty, IsLog10 ? c_log10 : c_log);
3411 auto CC = B.buildFConstant(Ty, IsLog10 ? cc_log10 : cc_log);
3412
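 // Compensated product: R = Y*C captures the leading digits, the first FMA
 // recovers the rounding error of that multiply, the second FMA folds in
 // Y*CC (the low part of the constant), and the final add combines them.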
3413 R = B.buildFMul(Ty, Y, C, Flags).getReg(0);
3414 auto NegR = B.buildFNeg(Ty, R, Flags);
3415 auto FMA0 = B.buildFMA(Ty, Y, C, NegR, Flags);
3416 auto FMA1 = B.buildFMA(Ty, Y, CC, FMA0, Flags);
3417 R = B.buildFAdd(Ty, R, FMA1, Flags).getReg(0);
3418 } else {
3419 // ch+ct is ln(2)/ln(10) to more than 36 bits
3420 const float ch_log10 = 0x1.344000p-2f;
3421 const float ct_log10 = 0x1.3509f6p-18f;
3422
3423 // ch + ct is ln(2) to more than 36 bits
3424 const float ch_log = 0x1.62e000p-1f;
3425 const float ct_log = 0x1.0bfbe8p-15f;
3426
3427 auto CH = B.buildFConstant(Ty, IsLog10 ? ch_log10 : ch_log);
3428 auto CT = B.buildFConstant(Ty, IsLog10 ? ct_log10 : ct_log);
3429
3430 auto MaskConst = B.buildConstant(Ty, 0xfffff000);
3431 auto YH = B.buildAnd(Ty, Y, MaskConst);
3432 auto YT = B.buildFSub(Ty, Y, YH, Flags);
3433 auto YTCT = B.buildFMul(Ty, YT, CT, Flags);
3434
3435 Register Mad0 =
3436 getMad(B, Ty, YH.getReg(0), CT.getReg(0), YTCT.getReg(0), Flags);
3437 Register Mad1 = getMad(B, Ty, YT.getReg(0), CH.getReg(0), Mad0, Flags);
3438 R = getMad(B, Ty, YH.getReg(0), CH.getReg(0), Mad1, Flags);
3439 }
3440
3441 const bool IsFiniteOnly =
3442 (MI.getFlag(MachineInstr::FmNoNans) || TM.Options.NoNaNsFPMath) &&
3443 (MI.getFlag(MachineInstr::FmNoInfs) || TM.Options.NoInfsFPMath);
3444
3445 if (!IsFiniteOnly) {
3446 // Expand isfinite(x) => fabs(x) < inf
3447 auto Inf = B.buildFConstant(Ty, APFloat::getInf(APFloat::IEEEsingle()));
3448 auto Fabs = B.buildFAbs(Ty, Y);
3449 auto IsFinite =
3450 B.buildFCmp(CmpInst::FCMP_OLT, LLT::scalar(1), Fabs, Inf, Flags);
3451 R = B.buildSelect(Ty, IsFinite, R, Y, Flags).getReg(0);
3452 }
3453
3454 if (ScaledInput) {
3455 auto Zero = B.buildFConstant(Ty, 0.0);
3456 auto ShiftK =
3457 B.buildFConstant(Ty, IsLog10 ? 0x1.344136p+3f : 0x1.62e430p+4f);
3458 auto Shift = B.buildSelect(Ty, IsScaled, ShiftK, Zero, Flags);
3459 B.buildFSub(Dst, R, Shift, Flags);
3460 } else {
3461 B.buildCopy(Dst, R);
3462 }
3463
3464 MI.eraseFromParent();
3465 return true;
3466}
3467
3469 Register Src, bool IsLog10,
3470 unsigned Flags) const {
3471 const double Log2BaseInverted =
3473
3474 LLT Ty = B.getMRI()->getType(Dst);
3475
3476 if (Ty == LLT::scalar(32)) {
3477 auto [ScaledInput, IsScaled] = getScaledLogInput(B, Src, Flags);
3478 if (ScaledInput) {
3479 auto LogSrc = B.buildIntrinsic(Intrinsic::amdgcn_log, {Ty})
3480 .addUse(Src)
3481 .setMIFlags(Flags);
3482 auto ScaledResultOffset = B.buildFConstant(Ty, -32.0 * Log2BaseInverted);
3483 auto Zero = B.buildFConstant(Ty, 0.0);
3484 auto ResultOffset =
3485 B.buildSelect(Ty, IsScaled, ScaledResultOffset, Zero, Flags);
3486 auto Log2Inv = B.buildFConstant(Ty, Log2BaseInverted);
3487
3488 if (ST.hasFastFMAF32())
3489 B.buildFMA(Dst, LogSrc, Log2Inv, ResultOffset, Flags);
3490 else {
3491 auto Mul = B.buildFMul(Ty, LogSrc, Log2Inv, Flags);
3492 B.buildFAdd(Dst, Mul, ResultOffset, Flags);
3493 }
3494
3495 return true;
3496 }
3497 }
3498
3499 auto Log2Operand = Ty == LLT::scalar(16)
3500 ? B.buildFLog2(Ty, Src, Flags)
3501 : B.buildIntrinsic(Intrinsic::amdgcn_log, {Ty})
3502 .addUse(Src)
3503 .setMIFlags(Flags);
3504 auto Log2BaseInvertedOperand = B.buildFConstant(Ty, Log2BaseInverted);
3505 B.buildFMul(Dst, Log2Operand, Log2BaseInvertedOperand, Flags);
3506 return true;
3507}
3508
3509bool AMDGPULegalizerInfo::legalizeFExp2(MachineInstr &MI,
3510 MachineIRBuilder &B) const {
3511 // v_exp_f32 is good enough for OpenCL, except it doesn't handle denormals.
3512 // If we have to handle denormals, scale up the input and adjust the result.
3513
3514 Register Dst = MI.getOperand(0).getReg();
3515 Register Src = MI.getOperand(1).getReg();
3516 unsigned Flags = MI.getFlags();
3517 LLT Ty = B.getMRI()->getType(Dst);
3518 const LLT F16 = LLT::scalar(16);
3519 const LLT F32 = LLT::scalar(32);
3520
3521 if (Ty == F16) {
3522 // Nothing in half is a denormal when promoted to f32.
3523 auto Ext = B.buildFPExt(F32, Src, Flags);
3524 auto Log2 = B.buildIntrinsic(Intrinsic::amdgcn_exp2, {F32})
3525 .addUse(Ext.getReg(0))
3526 .setMIFlags(Flags);
3527 B.buildFPTrunc(Dst, Log2, Flags);
3528 MI.eraseFromParent();
3529 return true;
3530 }
3531
3532 assert(Ty == F32);
3533
3534 if (!needsDenormHandlingF32(B.getMF(), Src, Flags)) {
3535 B.buildIntrinsic(Intrinsic::amdgcn_exp2, ArrayRef<Register>{Dst})
3536 .addUse(Src)
3537 .setMIFlags(Flags);
3538 MI.eraseFromParent();
3539 return true;
3540 }
3541
3542 // bool needs_scaling = x < -0x1.f80000p+6f;
3543 // v_exp_f32(x + (s ? 0x1.0p+6f : 0.0f)) * (s ? 0x1.0p-64f : 1.0f);
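 // For x below roughly -126, exp2(x) lands in the denormal range and would be
 // flushed by v_exp_f32. Evaluating exp2(x + 64) keeps the intermediate normal,
 // and multiplying by 2^-64 afterwards restores the value: 2^(x+64) * 2^-64 = 2^x.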
3544
3545 // -nextafter(128.0, -1)
3546 auto RangeCheckConst = B.buildFConstant(Ty, -0x1.f80000p+6f);
3547 auto NeedsScaling = B.buildFCmp(CmpInst::FCMP_OLT, LLT::scalar(1), Src,
3548 RangeCheckConst, Flags);
3549
3550 auto SixtyFour = B.buildFConstant(Ty, 0x1.0p+6f);
3551 auto Zero = B.buildFConstant(Ty, 0.0);
3552 auto AddOffset = B.buildSelect(F32, NeedsScaling, SixtyFour, Zero, Flags);
3553 auto AddInput = B.buildFAdd(F32, Src, AddOffset, Flags);
3554
3555 auto Exp2 = B.buildIntrinsic(Intrinsic::amdgcn_exp2, {Ty})
3556 .addUse(AddInput.getReg(0))
3557 .setMIFlags(Flags);
3558
3559 auto TwoExpNeg64 = B.buildFConstant(Ty, 0x1.0p-64f);
3560 auto One = B.buildFConstant(Ty, 1.0);
3561 auto ResultScale = B.buildSelect(F32, NeedsScaling, TwoExpNeg64, One, Flags);
3562 B.buildFMul(Dst, Exp2, ResultScale, Flags);
3563 MI.eraseFromParent();
3564 return true;
3565}
3566
3567bool AMDGPULegalizerInfo::legalizeFExpUnsafe(MachineIRBuilder &B, Register Dst,
3568 Register X, unsigned Flags) const {
3569 LLT Ty = B.getMRI()->getType(Dst);
3570 LLT F32 = LLT::scalar(32);
3571
3572 if (Ty != F32 || !needsDenormHandlingF32(B.getMF(), X, Flags)) {
3573 auto Log2E = B.buildFConstant(Ty, numbers::log2e);
3574 auto Mul = B.buildFMul(Ty, X, Log2E, Flags);
3575
3576 if (Ty == F32) {
3577 B.buildIntrinsic(Intrinsic::amdgcn_exp2, ArrayRef<Register>{Dst})
3578 .addUse(Mul.getReg(0))
3579 .setMIFlags(Flags);
3580 } else {
3581 B.buildFExp2(Dst, Mul.getReg(0), Flags);
3582 }
3583
3584 return true;
3585 }
3586
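 // The threshold -0x1.5d58a0p+6 is roughly ln(2^-126) ~ -87.3, below which e^x
 // is no longer a normal f32. For such inputs the code evaluates e^(x + 64) and
 // then multiplies by 0x1.969d48p-93f ~ e^-64 to compensate.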
3587 auto Threshold = B.buildFConstant(Ty, -0x1.5d58a0p+6f);
3588 auto NeedsScaling =
3589 B.buildFCmp(CmpInst::FCMP_OLT, LLT::scalar(1), X, Threshold, Flags);
3590 auto ScaleOffset = B.buildFConstant(Ty, 0x1.0p+6f);
3591 auto ScaledX = B.buildFAdd(Ty, X, ScaleOffset, Flags);
3592 auto AdjustedX = B.buildSelect(Ty, NeedsScaling, ScaledX, X, Flags);
3593
3594 auto Log2E = B.buildFConstant(Ty, numbers::log2e);
3595 auto ExpInput = B.buildFMul(Ty, AdjustedX, Log2E, Flags);
3596
3597 auto Exp2 = B.buildIntrinsic(Intrinsic::amdgcn_exp2, {Ty})
3598 .addUse(ExpInput.getReg(0))
3599 .setMIFlags(Flags);
3600
3601 auto ResultScaleFactor = B.buildFConstant(Ty, 0x1.969d48p-93f);
3602 auto AdjustedResult = B.buildFMul(Ty, Exp2, ResultScaleFactor, Flags);
3603 B.buildSelect(Dst, NeedsScaling, AdjustedResult, Exp2, Flags);
3604 return true;
3605}
3606
3607bool AMDGPULegalizerInfo::legalizeFExp(MachineInstr &MI,
3608 MachineIRBuilder &B) const {
3609 Register Dst = MI.getOperand(0).getReg();
3610 Register X = MI.getOperand(1).getReg();
3611 const unsigned Flags = MI.getFlags();
3612 MachineFunction &MF = B.getMF();
3613 MachineRegisterInfo &MRI = *B.getMRI();
3614 LLT Ty = MRI.getType(Dst);
3615 const LLT F16 = LLT::scalar(16);
3616 const LLT F32 = LLT::scalar(32);
3617 const bool IsExp10 = MI.getOpcode() == TargetOpcode::G_FEXP10;
3618
3619 if (Ty == F16) {
3620 // v_exp_f16 (fmul x, log2e)
3621 if (allowApproxFunc(MF, Flags)) {
3622 // TODO: Does this really require fast?
3623 legalizeFExpUnsafe(B, Dst, X, Flags);
3624 MI.eraseFromParent();
3625 return true;
3626 }
3627
3628 // exp(f16 x) ->
3629 // fptrunc (v_exp_f32 (fmul (fpext x), log2e))
3630
3631 // Nothing in half is a denormal when promoted to f32.
3632 auto Ext = B.buildFPExt(F32, X, Flags);
3633 Register Lowered = MRI.createGenericVirtualRegister(F32);
3634 legalizeFExpUnsafe(B, Lowered, Ext.getReg(0), Flags);
3635 B.buildFPTrunc(Dst, Lowered, Flags);
3636 MI.eraseFromParent();
3637 return true;
3638 }
3639
3640 assert(Ty == F32);
3641
3642 // TODO: Interpret allowApproxFunc as ignoring DAZ. This is currently copying
3643 // library behavior. Also, is known-not-daz source sufficient?
3644 if (allowApproxFunc(MF, Flags)) {
3645 legalizeFExpUnsafe(B, Dst, X, Flags);
3646 MI.eraseFromParent();
3647 return true;
3648 }
3649
3650 // Algorithm:
3651 //
3652 // e^x = 2^(x/ln(2)) = 2^(x*(64/ln(2))/64)
3653 //
3654 // x*(64/ln(2)) = n + f, |f| <= 0.5, n is integer
3655 // n = 64*m + j, 0 <= j < 64
3656 //
3657 // e^x = 2^((64*m + j + f)/64)
3658 // = (2^m) * (2^(j/64)) * 2^(f/64)
3659 // = (2^m) * (2^(j/64)) * e^(f*(ln(2)/64))
3660 //
3661 // f = x*(64/ln(2)) - n
3662 // r = f*(ln(2)/64) = x - n*(ln(2)/64)
3663 //
3664 // e^x = (2^m) * (2^(j/64)) * e^r
3665 //
3666 // (2^(j/64)) is precomputed
3667 //
3668 // e^r = 1 + r + (r^2)/2! + (r^3)/3! + (r^4)/4! + (r^5)/5!
3669 // e^r = 1 + q
3670 //
3671 // q = r + (r^2)/2! + (r^3)/3! + (r^4)/4! + (r^5)/5!
3672 //
3673 // e^x = (2^m) * ( (2^(j/64)) + q*(2^(j/64)) )
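 //
 // The sequence below performs the same reduction directly in terms of exp2:
 // PH + PL approximates x*log2(e) (or x*log2(10) for exp10) in extended
 // precision, E = roundeven(PH), and
 // e^x = 2^E * exp2((PH - E) + PL), with the final 2^E applied via ldexp.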
3674 const unsigned FlagsNoContract = Flags & ~MachineInstr::FmContract;
3675 Register PH, PL;
3676
3677 if (ST.hasFastFMAF32()) {
3678 const float c_exp = numbers::log2ef;
3679 const float cc_exp = 0x1.4ae0bep-26f; // c+cc are 49 bits
3680 const float c_exp10 = 0x1.a934f0p+1f;
3681 const float cc_exp10 = 0x1.2f346ep-24f;
3682
3683 auto C = B.buildFConstant(Ty, IsExp10 ? c_exp10 : c_exp);
3684 PH = B.buildFMul(Ty, X, C, Flags).getReg(0);
3685 auto NegPH = B.buildFNeg(Ty, PH, Flags);
3686 auto FMA0 = B.buildFMA(Ty, X, C, NegPH, Flags);
3687
3688 auto CC = B.buildFConstant(Ty, IsExp10 ? cc_exp10 : cc_exp);
3689 PL = B.buildFMA(Ty, X, CC, FMA0, Flags).getReg(0);
3690 } else {
3691 const float ch_exp = 0x1.714000p+0f;
3692 const float cl_exp = 0x1.47652ap-12f; // ch + cl are 36 bits
3693
3694 const float ch_exp10 = 0x1.a92000p+1f;
3695 const float cl_exp10 = 0x1.4f0978p-11f;
3696
3697 auto MaskConst = B.buildConstant(Ty, 0xfffff000);
3698 auto XH = B.buildAnd(Ty, X, MaskConst);
3699 auto XL = B.buildFSub(Ty, X, XH, Flags);
3700
3701 auto CH = B.buildFConstant(Ty, IsExp10 ? ch_exp10 : ch_exp);
3702 PH = B.buildFMul(Ty, XH, CH, Flags).getReg(0);
3703
3704 auto CL = B.buildFConstant(Ty, IsExp10 ? cl_exp10 : cl_exp);
3705 auto XLCL = B.buildFMul(Ty, XL, CL, Flags);
3706
3707 Register Mad0 =
3708 getMad(B, Ty, XL.getReg(0), CH.getReg(0), XLCL.getReg(0), Flags);
3709 PL = getMad(B, Ty, XH.getReg(0), CL.getReg(0), Mad0, Flags);
3710 }
3711
3712 auto E = B.buildIntrinsicRoundeven(Ty, PH, Flags);
3713
3714 // It is unsafe to contract this fsub into the PH multiply.
3715 auto PHSubE = B.buildFSub(Ty, PH, E, FlagsNoContract);
3716 auto A = B.buildFAdd(Ty, PHSubE, PL, Flags);
3717 auto IntE = B.buildFPTOSI(LLT::scalar(32), E);
3718
3719 auto Exp2 = B.buildIntrinsic(Intrinsic::amdgcn_exp2, {Ty})
3720 .addUse(A.getReg(0))
3721 .setMIFlags(Flags);
3722 auto R = B.buildFLdexp(Ty, Exp2, IntE, Flags);
3723
3724 auto UnderflowCheckConst =
3725 B.buildFConstant(Ty, IsExp10 ? -0x1.66d3e8p+5f : -0x1.9d1da0p+6f);
3726 auto Zero = B.buildFConstant(Ty, 0.0);
3727 auto Underflow =
3728 B.buildFCmp(CmpInst::FCMP_OLT, LLT::scalar(1), X, UnderflowCheckConst);
3729
3730 R = B.buildSelect(Ty, Underflow, Zero, R);
3731
3732 const auto &Options = MF.getTarget().Options;
3733
3734 if (!(Flags & MachineInstr::FmNoInfs) && !Options.NoInfsFPMath) {
3735 auto OverflowCheckConst =
3736 B.buildFConstant(Ty, IsExp10 ? 0x1.344136p+5f : 0x1.62e430p+6f);
3737
3738 auto Overflow =
3739 B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), X, OverflowCheckConst);
3740 auto Inf = B.buildFConstant(Ty, APFloat::getInf(APFloat::IEEEsingle()));
3741 R = B.buildSelect(Ty, Overflow, Inf, R, Flags);
3742 }
3743
3744 B.buildCopy(Dst, R);
3745 MI.eraseFromParent();
3746 return true;
3747}
3748
3749bool AMDGPULegalizerInfo::legalizeFPow(MachineInstr &MI,
3750 MachineIRBuilder &B) const {
3751 Register Dst = MI.getOperand(0).getReg();
3752 Register Src0 = MI.getOperand(1).getReg();
3753 Register Src1 = MI.getOperand(2).getReg();
3754 unsigned Flags = MI.getFlags();
3755 LLT Ty = B.getMRI()->getType(Dst);
3756 const LLT F16 = LLT::float16();
3757 const LLT F32 = LLT::float32();
3758
3759 if (Ty == F32) {
3760 auto Log = B.buildFLog2(F32, Src0, Flags);
3761 auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {F32})
3762 .addUse(Log.getReg(0))
3763 .addUse(Src1)
3764 .setMIFlags(Flags);
3765 B.buildFExp2(Dst, Mul, Flags);
3766 } else if (Ty == F16) {
3767 // There's no f16 fmul_legacy, so we need to convert for it.
3768 auto Log = B.buildFLog2(F16, Src0, Flags);
3769 auto Ext0 = B.buildFPExt(F32, Log, Flags);
3770 auto Ext1 = B.buildFPExt(F32, Src1, Flags);
3771 auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {F32})
3772 .addUse(Ext0.getReg(0))
3773 .addUse(Ext1.getReg(0))
3774 .setMIFlags(Flags);
3775 B.buildFExp2(Dst, B.buildFPTrunc(F16, Mul), Flags);
3776 } else
3777 return false;
3778
3779 MI.eraseFromParent();
3780 return true;
3781}
3782
3783// Find a source register, ignoring any possible source modifiers.
3784static Register stripAnySourceMods(Register OrigSrc, MachineRegisterInfo &MRI) {
3785 Register ModSrc = OrigSrc;
3786 if (MachineInstr *SrcFNeg = getOpcodeDef(AMDGPU::G_FNEG, ModSrc, MRI)) {
3787 ModSrc = SrcFNeg->getOperand(1).getReg();
3788 if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
3789 ModSrc = SrcFAbs->getOperand(1).getReg();
3790 } else if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
3791 ModSrc = SrcFAbs->getOperand(1).getReg();
3792 return ModSrc;
3793}
3794
3795bool AMDGPULegalizerInfo::legalizeFFloor(MachineInstr &MI,
3796 MachineRegisterInfo &MRI,
3797 MachineIRBuilder &B) const {
3798
3799 const LLT S1 = LLT::scalar(1);
3800 const LLT F64 = LLT::float64();
3801 Register Dst = MI.getOperand(0).getReg();
3802 Register OrigSrc = MI.getOperand(1).getReg();
3803 unsigned Flags = MI.getFlags();
3804 assert(ST.hasFractBug() && MRI.getType(Dst) == F64 &&
3805 "this should not have been custom lowered");
3806
3807 // V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x))
3808 // is used instead. However, SI doesn't have V_FLOOR_F64, so the most
3809 // efficient way to implement it is using V_FRACT_F64. The workaround for the
3810 // V_FRACT bug is:
3811 // fract(x) = isnan(x) ? x : min(V_FRACT(x), 0.99999999999999999)
3812 //
3813 // Convert floor(x) to (x - fract(x))
3814
3815 auto Fract = B.buildIntrinsic(Intrinsic::amdgcn_fract, {F64})
3816 .addUse(OrigSrc)
3817 .setMIFlags(Flags);
3818
3819 // Give source modifier matching some assistance before obscuring a foldable
3820 // pattern.
3821
3822 // TODO: We can avoid the neg on the fract? The input sign to fract
3823 // shouldn't matter?
3824 Register ModSrc = stripAnySourceMods(OrigSrc, MRI);
3825
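 // 0x3fefffffffffffff is the largest f64 strictly less than 1.0 (1.0 - 2^-53),
 // so the min below clamps V_FRACT's result to just under 1.0.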
3826 auto Const =
3827 B.buildFConstant(F64, llvm::bit_cast<double>(0x3fefffffffffffff));
3828
3829 Register Min = MRI.createGenericVirtualRegister(F64);
3830
3831 // We don't need to concern ourselves with the snan handling difference, so
3832 // use the one which will directly select.
3833 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
3834 if (MFI->getMode().IEEE)
3835 B.buildFMinNumIEEE(Min, Fract, Const, Flags);
3836 else
3837 B.buildFMinNum(Min, Fract, Const, Flags);
3838
3839 Register CorrectedFract = Min;
3840 if (!MI.getFlag(MachineInstr::FmNoNans)) {
3841 auto IsNan = B.buildFCmp(CmpInst::FCMP_ORD, S1, ModSrc, ModSrc, Flags);
3842 CorrectedFract = B.buildSelect(F64, IsNan, ModSrc, Min, Flags).getReg(0);
3843 }
3844
3845 auto NegFract = B.buildFNeg(F64, CorrectedFract, Flags);
3846 B.buildFAdd(Dst, OrigSrc, NegFract, Flags);
3847
3848 MI.eraseFromParent();
3849 return true;
3850}
3851
3852// Turn an illegal packed v2s16 build vector into bit operations.
3853// TODO: This should probably be a bitcast action in LegalizerHelper.
3854bool AMDGPULegalizerInfo::legalizeBuildVector(
3855 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
3856 Register Dst = MI.getOperand(0).getReg();
3857 const LLT S32 = LLT::scalar(32);
3858 const LLT S16 = LLT::scalar(16);
3859 assert(MRI.getType(Dst) == LLT::fixed_vector(2, 16));
3860
3861 Register Src0 = MI.getOperand(1).getReg();
3862 Register Src1 = MI.getOperand(2).getReg();
3863
3864 if (MI.getOpcode() == AMDGPU::G_BUILD_VECTOR_TRUNC) {
3865 assert(MRI.getType(Src0) == S32);
3866 Src0 = B.buildTrunc(S16, MI.getOperand(1).getReg()).getReg(0);
3867 Src1 = B.buildTrunc(S16, MI.getOperand(2).getReg()).getReg(0);
3868 }
3869
3870 auto Merge = B.buildMergeLikeInstr(S32, {Src0, Src1});
3871 B.buildBitcast(Dst, Merge);
3872
3873 MI.eraseFromParent();
3874 return true;
3875}
3876
3877// Build a big integer multiply or multiply-add using MAD_64_32 instructions.
3878//
3879// Source and accumulation registers must all be 32-bits.
3880//
3881// TODO: When the multiply is uniform, we should produce a code sequence
3882// that is better suited to instruction selection on the SALU. Instead of
3883// the outer loop going over parts of the result, the outer loop should go
3884// over parts of one of the factors. This should result in instruction
3885// selection that makes full use of S_ADDC_U32 instructions.
3886void AMDGPULegalizerInfo::buildMultiply(LegalizerHelper &Helper,
3887 MutableArrayRef<Register> Accum,
3888 ArrayRef<Register> Src0,
3889 ArrayRef<Register> Src1,
3890 bool UsePartialMad64_32,
3891 bool SeparateOddAlignedProducts) const {
3892 // Use (possibly empty) vectors of S1 registers to represent the set of
3893 // carries from one pair of positions to the next.
3894 using Carry = SmallVector<Register, 2>;
3895
3896 MachineIRBuilder &B = Helper.MIRBuilder;
3897 GISelKnownBits &KB = *Helper.getKnownBits();
3898
3899 const LLT S1 = LLT::scalar(1);
3900 const LLT S32 = LLT::scalar(32);
3901 const LLT S64 = LLT::scalar(64);
3902
3903 Register Zero32;
3904 Register Zero64;
3905
3906 auto getZero32 = [&]() -> Register {
3907 if (!Zero32)
3908 Zero32 = B.buildConstant(S32, 0).getReg(0);
3909 return Zero32;
3910 };
3911 auto getZero64 = [&]() -> Register {
3912 if (!Zero64)
3913 Zero64 = B.buildConstant(S64, 0).getReg(0);
3914 return Zero64;
3915 };
3916
3917 SmallVector<bool, 2> Src0KnownZeros, Src1KnownZeros;
3918 for (unsigned i = 0; i < Src0.size(); ++i) {
3919 Src0KnownZeros.push_back(KB.getKnownBits(Src0[i]).isZero());
3920 Src1KnownZeros.push_back(KB.getKnownBits(Src1[i]).isZero());
3921 }
3922
3923 // Merge the given carries into the 32-bit LocalAccum, which is modified
3924 // in-place.
3925 //
3926 // Returns the carry-out, which is a single S1 register or null.
3927 auto mergeCarry =
3928 [&](Register &LocalAccum, const Carry &CarryIn) -> Register {
3929 if (CarryIn.empty())
3930 return Register();
3931
3932 bool HaveCarryOut = true;
3933 Register CarryAccum;
3934 if (CarryIn.size() == 1) {
3935 if (!LocalAccum) {
3936 LocalAccum = B.buildZExt(S32, CarryIn[0]).getReg(0);
3937 return Register();
3938 }
3939
3940 CarryAccum = getZero32();
3941 } else {
3942 CarryAccum = B.buildZExt(S32, CarryIn[0]).getReg(0);
3943 for (unsigned i = 1; i + 1 < CarryIn.size(); ++i) {
3944 CarryAccum =
3945 B.buildUAdde(S32, S1, CarryAccum, getZero32(), CarryIn[i])
3946 .getReg(0);
3947 }
3948
3949 if (!LocalAccum) {
3950 LocalAccum = getZero32();
3951 HaveCarryOut = false;
3952 }
3953 }
3954
3955 auto Add =
3956 B.buildUAdde(S32, S1, CarryAccum, LocalAccum, CarryIn.back());
3957 LocalAccum = Add.getReg(0);
3958 return HaveCarryOut ? Add.getReg(1) : Register();
3959 };
3960
3961 // Build a multiply-add chain to compute
3962 //
3963 // LocalAccum + (partial products at DstIndex)
3964 // + (opportunistic subset of CarryIn)
3965 //
3966 // LocalAccum is an array of one or two 32-bit registers that are updated
3967 // in-place. The incoming registers may be null.
3968 //
3969 // In some edge cases, carry-ins can be consumed "for free". In that case,
3970 // the consumed carry bits are removed from CarryIn in-place.
3971 auto buildMadChain =
3972 [&](MutableArrayRef<Register> LocalAccum, unsigned DstIndex, Carry &CarryIn)
3973 -> Carry {
3974 assert((DstIndex + 1 < Accum.size() && LocalAccum.size() == 2) ||
3975 (DstIndex + 1 >= Accum.size() && LocalAccum.size() == 1));
3976
3977 Carry CarryOut;
3978 unsigned j0 = 0;
3979
3980 // Use plain 32-bit multiplication for the most significant part of the
3981 // result by default.
3982 if (LocalAccum.size() == 1 &&
3983 (!UsePartialMad64_32 || !CarryIn.empty())) {
3984 do {
3985 // Skip multiplication if one of the operands is 0
3986 unsigned j1 = DstIndex - j0;
3987 if (Src0KnownZeros[j0] || Src1KnownZeros[j1]) {
3988 ++j0;
3989 continue;
3990 }
3991 auto Mul = B.buildMul(S32, Src0[j0], Src1[j1]);
3992 if (!LocalAccum[0] || KB.getKnownBits(LocalAccum[0]).isZero()) {
3993 LocalAccum[0] = Mul.getReg(0);
3994 } else {
3995 if (CarryIn.empty()) {
3996 LocalAccum[0] = B.buildAdd(S32, LocalAccum[0], Mul).getReg(0);
3997 } else {
3998 LocalAccum[0] =
3999 B.buildUAdde(S32, S1, LocalAccum[0], Mul, CarryIn.back())
4000 .getReg(0);
4001 CarryIn.pop_back();
4002 }
4003 }
4004 ++j0;
4005 } while (j0 <= DstIndex && (!UsePartialMad64_32 || !CarryIn.empty()));
4006 }
4007
4008 // Build full 64-bit multiplies.
4009 if (j0 <= DstIndex) {
4010 bool HaveSmallAccum = false;
4011 Register Tmp;
4012
4013 if (LocalAccum[0]) {
4014 if (LocalAccum.size() == 1) {
4015 Tmp = B.buildAnyExt(S64, LocalAccum[0]).getReg(0);
4016 HaveSmallAccum = true;
4017 } else if (LocalAccum[1]) {
4018 Tmp = B.buildMergeLikeInstr(S64, LocalAccum).getReg(0);
4019 HaveSmallAccum = false;
4020 } else {
4021 Tmp = B.buildZExt(S64, LocalAccum[0]).getReg(0);
4022 HaveSmallAccum = true;
4023 }
4024 } else {
4025 assert(LocalAccum.size() == 1 || !LocalAccum[1]);
4026 Tmp = getZero64();
4027 HaveSmallAccum = true;
4028 }
4029
4030 do {
4031 unsigned j1 = DstIndex - j0;
4032 if (Src0KnownZeros[j0] || Src1KnownZeros[j1]) {
4033 ++j0;
4034 continue;
4035 }
4036 auto Mad = B.buildInstr(AMDGPU::G_AMDGPU_MAD_U64_U32, {S64, S1},
4037 {Src0[j0], Src1[j1], Tmp});
4038 Tmp = Mad.getReg(0);
4039 if (!HaveSmallAccum)
4040 CarryOut.push_back(Mad.getReg(1));
4041 HaveSmallAccum = false;
4042
4043 ++j0;
4044 } while (j0 <= DstIndex);
4045
4046 auto Unmerge = B.buildUnmerge(S32, Tmp);
4047 LocalAccum[0] = Unmerge.getReg(0);
4048 if (LocalAccum.size() > 1)
4049 LocalAccum[1] = Unmerge.getReg(1);
4050 }
4051
4052 return CarryOut;
4053 };
4054
4055 // Outer multiply loop, iterating over destination parts from least
4056 // significant to most significant parts.
4057 //
4058 // The columns of the following diagram correspond to the destination parts
4059 // affected by one iteration of the outer loop (ignoring boundary
4060 // conditions).
4061 //
4062 // Dest index relative to 2 * i: 1 0 -1
4063 // ------
4064 // Carries from previous iteration: . e o
4065 // Even-aligned partial product sum: E E .
4066 // Odd-aligned partial product sum: . O O
4067 //
4068 // 'o' is OddCarry, 'e' is EvenCarry.
4069 // EE and OO are computed from partial products via buildMadChain and use
4070 // accumulation where possible and appropriate.
4071 //
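 // As in schoolbook multiplication in base 2^32, destination word k accumulates
 // the products Src0[j] * Src1[k - j]. For example, with Accum.size() == 4,
 // iteration i == 1 sums Src0[j]*Src1[2-j] for j = 0..2 into Accum[2..3]
 // (even-aligned) and Src0[j]*Src1[1-j] for j = 0..1 into Accum[1..2]
 // (odd-aligned), ignoring carries.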
4072 Register SeparateOddCarry;
4073 Carry EvenCarry;
4074 Carry OddCarry;
4075
4076 for (unsigned i = 0; i <= Accum.size() / 2; ++i) {
4077 Carry OddCarryIn = std::move(OddCarry);
4078 Carry EvenCarryIn = std::move(EvenCarry);
4079 OddCarry.clear();
4080 EvenCarry.clear();
4081
4082 // Partial products at offset 2 * i.
4083 if (2 * i < Accum.size()) {
4084 auto LocalAccum = Accum.drop_front(2 * i).take_front(2);
4085 EvenCarry = buildMadChain(LocalAccum, 2 * i, EvenCarryIn);
4086 }
4087
4088 // Partial products at offset 2 * i - 1.
4089 if (i > 0) {
4090 if (!SeparateOddAlignedProducts) {
4091 auto LocalAccum = Accum.drop_front(2 * i - 1).take_front(2);
4092 OddCarry = buildMadChain(LocalAccum, 2 * i - 1, OddCarryIn);
4093 } else {
4094 bool IsHighest = 2 * i >= Accum.size();
4095 Register SeparateOddOut[2];
4096 auto LocalAccum = MutableArrayRef(SeparateOddOut)
4097 .take_front(IsHighest ? 1 : 2);
4098 OddCarry = buildMadChain(LocalAccum, 2 * i - 1, OddCarryIn);
4099
4100 MachineInstr *Lo;
4101
4102 if (i == 1) {
4103 if (!IsHighest)
4104 Lo = B.buildUAddo(S32, S1, Accum[2 * i - 1], SeparateOddOut[0]);
4105 else
4106 Lo = B.buildAdd(S32, Accum[2 * i - 1], SeparateOddOut[0]);
4107 } else {
4108 Lo = B.buildUAdde(S32, S1, Accum[2 * i - 1], SeparateOddOut[0],
4109 SeparateOddCarry);
4110 }
4111 Accum[2 * i - 1] = Lo->getOperand(0).getReg();
4112
4113 if (!IsHighest) {
4114 auto Hi = B.buildUAdde(S32, S1, Accum[2 * i], SeparateOddOut[1],
4115 Lo->getOperand(1).getReg());
4116 Accum[2 * i] = Hi.getReg(0);
4117 SeparateOddCarry = Hi.getReg(1);
4118 }
4119 }
4120 }
4121
4122 // Add in the carries from the previous iteration
4123 if (i > 0) {
4124 if (Register CarryOut = mergeCarry(Accum[2 * i - 1], OddCarryIn))
4125 EvenCarryIn.push_back(CarryOut);
4126
4127 if (2 * i < Accum.size()) {
4128 if (Register CarryOut = mergeCarry(Accum[2 * i], EvenCarryIn))
4129 OddCarry.push_back(CarryOut);
4130 }
4131 }
4132 }
4133}
4134
4135// Custom narrowing of wide multiplies using wide multiply-add instructions.
4136//
4137// TODO: If the multiply is followed by an addition, we should attempt to
4138// integrate it to make better use of V_MAD_U64_U32's multiply-add capabilities.
4139bool AMDGPULegalizerInfo::legalizeMul(LegalizerHelper &Helper,
4140 MachineInstr &MI) const {
4141 assert(ST.hasMad64_32());
4142 assert(MI.getOpcode() == TargetOpcode::G_MUL);
4143
4144 MachineIRBuilder &B = Helper.MIRBuilder;
4145 MachineRegisterInfo &MRI = *B.getMRI();
4146
4147 Register DstReg = MI.getOperand(0).getReg();
4148 Register Src0 = MI.getOperand(1).getReg();
4149 Register Src1 = MI.getOperand(2).getReg();
4150
4151 LLT Ty = MRI.getType(DstReg);
4152 assert(Ty.isScalar());
4153
4154 unsigned Size = Ty.getSizeInBits();
4155 unsigned NumParts = Size / 32;
4156 assert((Size % 32) == 0);
4157 assert(NumParts >= 2);
4158
4159 // Whether to use MAD_64_32 for partial products whose high half is
4160 // discarded. This avoids some ADD instructions but risks false dependency
4161 // stalls on some subtargets in some cases.
4162 const bool UsePartialMad64_32 = ST.getGeneration() < AMDGPUSubtarget::GFX10;
4163
4164 // Whether to compute odd-aligned partial products separately. This is
4165 // advisable on subtargets where the accumulator of MAD_64_32 must be placed
4166 // in an even-aligned VGPR.
4167 const bool SeparateOddAlignedProducts = ST.hasFullRate64Ops();
4168
4169 LLT S32 = LLT::scalar(32);
4170 SmallVector<Register, 2> Src0Parts, Src1Parts;
4171 for (unsigned i = 0; i < NumParts; ++i) {
4172 Src0Parts.push_back(MRI.createGenericVirtualRegister(S32));
4173 Src1Parts.push_back(MRI.createGenericVirtualRegister(S32));
4174 }
4175 B.buildUnmerge(Src0Parts, Src0);
4176 B.buildUnmerge(Src1Parts, Src1);
4177
4178 SmallVector<Register, 2> AccumRegs(NumParts);
4179 buildMultiply(Helper, AccumRegs, Src0Parts, Src1Parts, UsePartialMad64_32,
4180 SeparateOddAlignedProducts);
4181
4182 B.buildMergeLikeInstr(DstReg, AccumRegs);
4183 MI.eraseFromParent();
4184 return true;
4185}
4186
4187// Legalize ctlz/cttz to ffbh/ffbl instead of the default legalization to
4188// ctlz/cttz_zero_undef. This allows us to fix up the result for the zero input
4189// case with a single min instruction instead of a compare+select.
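// ffbh/ffbl return -1 (all ones) for a zero input, so the unsigned min with the
// source bit width yields the well-defined ctlz/cttz result for zero.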
4190bool AMDGPULegalizerInfo::legalizeCTLZ_CTTZ(MachineInstr &MI,
4191 MachineRegisterInfo &MRI,
4192 MachineIRBuilder &B) const {
4193 Register Dst = MI.getOperand(0).getReg();
4194 Register Src = MI.getOperand(1).getReg();
4195 LLT DstTy = MRI.getType(Dst);
4196 LLT SrcTy = MRI.getType(Src);
4197
4198 unsigned NewOpc = MI.getOpcode() == AMDGPU::G_CTLZ
4199 ? AMDGPU::G_AMDGPU_FFBH_U32
4200 : AMDGPU::G_AMDGPU_FFBL_B32;
4201 auto Tmp = B.buildInstr(NewOpc, {DstTy}, {Src});
4202 B.buildUMin(Dst, Tmp, B.buildConstant(DstTy, SrcTy.getSizeInBits()));
4203
4204 MI.eraseFromParent();
4205 return true;
4206}
4207
4208bool AMDGPULegalizerInfo::legalizeCTLZ_ZERO_UNDEF(MachineInstr &MI,
4209 MachineRegisterInfo &MRI,
4210 MachineIRBuilder &B) const {
4211 Register Dst = MI.getOperand(0).getReg();
4212 Register Src = MI.getOperand(1).getReg();
4213 LLT SrcTy = MRI.getType(Src);
4214 TypeSize NumBits = SrcTy.getSizeInBits();
4215
4216 assert(NumBits < 32u);
4217
4218 auto ShiftAmt = B.buildConstant(S32, 32u - NumBits);
4219 auto Extend = B.buildAnyExt(S32, {Src}).getReg(0u);
4220 auto Shift = B.buildShl(S32, Extend, ShiftAmt);
4221 auto Ctlz = B.buildInstr(AMDGPU::G_AMDGPU_FFBH_U32, {S32}, {Shift});
4222 B.buildTrunc(Dst, Ctlz);
4223 MI.eraseFromParent();
4224 return true;
4225}
4226
4227// Check that this is a G_XOR x, -1
4228static bool isNot(const MachineRegisterInfo &MRI, const MachineInstr &MI) {
4229 if (MI.getOpcode() != TargetOpcode::G_XOR)
4230 return false;
4231 auto ConstVal = getIConstantVRegSExtVal(MI.getOperand(2).getReg(), MRI);
4232 return ConstVal && *ConstVal == -1;
4233}
4234
4235// Return the use branch instruction, otherwise null if the usage is invalid.
4236static MachineInstr *
4237verifyCFIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI, MachineInstr *&Br,
4238 MachineBasicBlock *&UncondBrTarget, bool &Negated) {
4239 Register CondDef = MI.getOperand(0).getReg();
4240 if (!MRI.hasOneNonDBGUse(CondDef))
4241 return nullptr;
4242
4243 MachineBasicBlock *Parent = MI.getParent();
4244 MachineInstr *UseMI = &*MRI.use_instr_nodbg_begin(CondDef);
4245
4246 if (isNot(MRI, *UseMI)) {
4247 Register NegatedCond = UseMI->getOperand(0).getReg();
4248 if (!MRI.hasOneNonDBGUse(NegatedCond))
4249 return nullptr;
4250
4251 // We're deleting the def of this value, so we need to remove it.
4252 eraseInstr(*UseMI, MRI);
4253
4254 UseMI = &*MRI.use_instr_nodbg_begin(NegatedCond);
4255 Negated = true;
4256 }
4257
4258 if (UseMI->getParent() != Parent || UseMI->getOpcode() != AMDGPU::G_BRCOND)
4259 return nullptr;
4260
4261 // Make sure the cond br is followed by a G_BR, or is the last instruction.
4262 MachineBasicBlock::iterator Next = std::next(UseMI->getIterator());
4263 if (Next == Parent->end()) {
4264 MachineFunction::iterator NextMBB = std::next(Parent->getIterator());
4265 if (NextMBB == Parent->getParent()->end()) // Illegal intrinsic use.
4266 return nullptr;
4267 UncondBrTarget = &*NextMBB;
4268 } else {
4269 if (Next->getOpcode() != AMDGPU::G_BR)
4270 return nullptr;
4271 Br = &*Next;
4272 UncondBrTarget = Br->getOperand(0).getMBB();
4273 }
4274
4275 return UseMI;
4276}
4277
4278bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B,
4279 const ArgDescriptor *Arg,
4280 const TargetRegisterClass *ArgRC,
4281 LLT ArgTy) const {
4282 MCRegister SrcReg = Arg->getRegister();
4283 assert(Register::isPhysicalRegister(SrcReg) && "Physical register expected");
4284 assert(DstReg.isVirtual() && "Virtual register expected");
4285
4286 Register LiveIn = getFunctionLiveInPhysReg(B.getMF(), B.getTII(), SrcReg,
4287 *ArgRC, B.getDebugLoc(), ArgTy);
4288 if (Arg->isMasked()) {
4289 // TODO: Should we try to emit this once in the entry block?
4290 const LLT S32 = LLT::scalar(32);
4291 const unsigned Mask = Arg->getMask();
4292 const unsigned Shift = llvm::countr_zero<unsigned>(Mask);
4293
4294 Register AndMaskSrc = LiveIn;
4295
4296 // TODO: Avoid clearing the high bits if we know workitem id y/z are always
4297 // 0.
4298 if (Shift != 0) {
4299 auto ShiftAmt = B.buildConstant(S32, Shift);
4300 AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0);
4301 }
4302
4303 B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift));
4304 } else {
4305 B.buildCopy(DstReg, LiveIn);
4306 }
4307
4308 return true;
4309}
4310
4311bool AMDGPULegalizerInfo::loadInputValue(
4312 Register DstReg, MachineIRBuilder &B,
4313 AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
4314 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
4315 const ArgDescriptor *Arg = nullptr;
4316 const TargetRegisterClass *ArgRC;
4317 LLT ArgTy;
4318
4319 CallingConv::ID CC = B.getMF().getFunction().getCallingConv();
4320 const ArgDescriptor WorkGroupIDX =
4321 ArgDescriptor::createRegister(AMDGPU::TTMP9);
4322 // If GridZ is not programmed in an entry function then the hardware will set
4323 // it to all zeros, so there is no need to mask the GridY value in the low
4324 // order bits.
4325 const ArgDescriptor WorkGroupIDY = ArgDescriptor::createRegister(
4326 AMDGPU::TTMP7,
4327 AMDGPU::isEntryFunctionCC(CC) && !MFI->hasWorkGroupIDZ() ? ~0u : 0xFFFFu);
4328 const ArgDescriptor WorkGroupIDZ =
4329 ArgDescriptor::createRegister(AMDGPU::TTMP7, 0xFFFF0000u);
4330 if (ST.hasArchitectedSGPRs() &&
4332 switch (ArgType) {
4333 case AMDGPUFunctionArgInfo::WORKGROUP_ID_X:
4334 Arg = &WorkGroupIDX;
4335 ArgRC = &AMDGPU::SReg_32RegClass;
4336 ArgTy = LLT::scalar(32);
4337 break;
4338 case AMDGPUFunctionArgInfo::WORKGROUP_ID_Y:
4339 Arg = &WorkGroupIDY;
4340 ArgRC = &AMDGPU::SReg_32RegClass;
4341 ArgTy = LLT::scalar(32);
4342 break;
4343 case AMDGPUFunctionArgInfo::WORKGROUP_ID_Z:
4344 Arg = &WorkGroupIDZ;
4345 ArgRC = &AMDGPU::SReg_32RegClass;
4346 ArgTy = LLT::scalar(32);
4347 break;
4348 default:
4349 break;
4350 }
4351 }
4352
4353 if (!Arg)
4354 std::tie(Arg, ArgRC, ArgTy) = MFI->getPreloadedValue(ArgType);
4355
4356 if (!Arg) {
4357 if (ArgType == AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR) {
4358 // The intrinsic may appear when we have a 0 sized kernarg segment, in which
4359 // case the pointer argument may be missing and we use null.
4360 B.buildConstant(DstReg, 0);
4361 return true;
4362 }
4363
4364 // It's undefined behavior if a function marked with the amdgpu-no-*
4365 // attributes uses the corresponding intrinsic.
4366 B.buildUndef(DstReg);
4367 return true;
4368 }
4369
4370 if (!Arg->isRegister() || !Arg->getRegister().isValid())
4371 return false; // TODO: Handle these
4372 return loadInputValue(DstReg, B, Arg, ArgRC, ArgTy);
4373}
4374
4375bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin(
4376 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B,
4377 AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
4378 if (!loadInputValue(MI.getOperand(0).getReg(), B, ArgType))
4379 return false;
4380
4381 MI.eraseFromParent();
4382 return true;
4383}
4384
4385static bool replaceWithConstant(MachineIRBuilder &B, MachineInstr &MI,
4386 int64_t C) {
4387 B.buildConstant(MI.getOperand(0).getReg(), C);
4388 MI.eraseFromParent();
4389 return true;
4390}
4391
4392bool AMDGPULegalizerInfo::legalizeWorkitemIDIntrinsic(
4393 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B,
4394 unsigned Dim, AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
4395 unsigned MaxID = ST.getMaxWorkitemID(B.getMF().getFunction(), Dim);
4396 if (MaxID == 0)
4397 return replaceWithConstant(B, MI, 0);
4398
4399 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
4400 const ArgDescriptor *Arg;
4401 const TargetRegisterClass *ArgRC;
4402 LLT ArgTy;
4403 std::tie(Arg, ArgRC, ArgTy) = MFI->getPreloadedValue(ArgType);
4404
4405 Register DstReg = MI.getOperand(0).getReg();
4406 if (!Arg) {
4407 // It's undefined behavior if a function marked with the amdgpu-no-*
4408 // attributes uses the corresponding intrinsic.
4409 B.buildUndef(DstReg);
4410 MI.eraseFromParent();
4411 return true;
4412 }
4413
4414 if (Arg->isMasked()) {
4415 // Don't bother inserting AssertZext for packed IDs since we're emitting the
4416 // masking operations anyway.
4417 //
4418 // TODO: We could assert the top bit is 0 for the source copy.
4419 if (!loadInputValue(DstReg, B, ArgType))
4420 return false;
4421 } else {
4422 Register TmpReg = MRI.createGenericVirtualRegister(LLT::scalar(32));
4423 if (!loadInputValue(TmpReg, B, ArgType))
4424 return false;
4425 B.buildAssertZExt(DstReg, TmpReg, llvm::bit_width(MaxID));
4426 }
4427
4428 MI.eraseFromParent();
4429 return true;
4430}
4431
4432Register AMDGPULegalizerInfo::getKernargParameterPtr(MachineIRBuilder &B,
4433 int64_t Offset) const {
4434 LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
4435 Register KernArgReg = B.getMRI()->createGenericVirtualRegister(PtrTy);
4436
4437 // TODO: If we passed in the base kernel offset we could have a better
4438 // alignment than 4, but we don't really need it.
4439 if (!loadInputValue(KernArgReg, B,
4440 AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR))
4441 llvm_unreachable("failed to find kernarg segment ptr");
4442
4443 auto COffset = B.buildConstant(LLT::scalar(64), Offset);
4444 // TODO: Should get nuw
4445 return B.buildPtrAdd(PtrTy, KernArgReg, COffset).getReg(0);
4446}
4447
4448/// Legalize a value that's loaded from kernel arguments. This is only used by
4449/// legacy intrinsics.
4450bool AMDGPULegalizerInfo::legalizeKernargMemParameter(MachineInstr &MI,
4451 MachineIRBuilder &B,
4452 uint64_t Offset,
4453 Align Alignment) const {
4454 Register DstReg = MI.getOperand(0).getReg();
4455
4456 assert(B.getMRI()->getType(DstReg) == LLT::scalar(32) &&
4457 "unexpected kernarg parameter type");
4458
4459 Register Ptr = getKernargParameterPtr(B, Offset);
4460 MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
4461 B.buildLoad(DstReg, Ptr, PtrInfo, Align(4),
4462 MachineMemOperand::MODereferenceable |
4463 MachineMemOperand::MOInvariant);
4464 MI.eraseFromParent();
4465 return true;
4466}
4467
4468bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI,
4469 MachineRegisterInfo &MRI,
4470 MachineIRBuilder &B) const {
4471 Register Dst = MI.getOperand(0).getReg();
4472 LLT DstTy = MRI.getType(Dst);
4473 LLT S16 = LLT::scalar(16);
4474 LLT S32 = LLT::scalar(32);
4475 LLT S64 = LLT::scalar(64);
4476
4477 if (DstTy == S16)
4478 return legalizeFDIV16(MI, MRI, B);
4479 if (DstTy == S32)
4480 return legalizeFDIV32(MI, MRI, B);
4481 if (DstTy == S64)
4482 return legalizeFDIV64(MI, MRI, B);
4483
4484 return false;
4485}
4486
4487void AMDGPULegalizerInfo::legalizeUnsignedDIV_REM32Impl(MachineIRBuilder &B,
4488 Register DstDivReg,
4489 Register DstRemReg,
4490 Register X,
4491 Register Y) const {
4492 const LLT S1 = LLT::scalar(1);
4493 const LLT S32 = LLT::scalar(32);
4494
4495 // See AMDGPUCodeGenPrepare::expandDivRem32 for a description of the
4496 // algorithm used here.
4497
4498 // Initial estimate of inv(y).
4499 auto FloatY = B.buildUITOFP(S32, Y);
4500 auto RcpIFlag = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {FloatY});
4501 auto Scale = B.buildFConstant(S32, llvm::bit_cast<float>(0x4f7ffffe));
4502 auto ScaledY = B.buildFMul(S32, RcpIFlag, Scale);
4503 auto Z = B.buildFPTOUI(S32, ScaledY);
4504
4505 // One round of UNR.
4506 auto NegY = B.buildSub(S32, B.buildConstant(S32, 0), Y);
4507 auto NegYZ = B.buildMul(S32, NegY, Z);
4508 Z = B.buildAdd(S32, Z, B.buildUMulH(S32, Z, NegYZ));
4509
4510 // Quotient/remainder estimate.
4511 auto Q = B.buildUMulH(S32, X, Z);
4512 auto R = B.buildSub(S32, X, B.buildMul(S32, Q, Y));
4513
4514 // First quotient/remainder refinement.
4515 auto One = B.buildConstant(S32, 1);
4516 auto Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y);
4517 if (DstDivReg)
4518 Q = B.buildSelect(S32, Cond, B.buildAdd(S32, Q, One), Q);
4519 R = B.buildSelect(S32, Cond, B.buildSub(S32, R, Y), R);
4520
4521 // Second quotient/remainder refinement.
4522 Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y);
4523 if (DstDivReg)
4524 B.buildSelect(DstDivReg, Cond, B.buildAdd(S32, Q, One), Q);
4525
4526 if (DstRemReg)
4527 B.buildSelect(DstRemReg, Cond, B.buildSub(S32, R, Y), R);
4528}
4529
4530// Build integer reciprocal sequence around V_RCP_IFLAG_F32
4531//
4532// Return lo, hi of result
4533//
4534// %cvt.lo = G_UITOFP Val.lo
4535// %cvt.hi = G_UITOFP Val.hi
4536// %mad = G_FMAD %cvt.hi, 2**32, %cvt.lo
4537// %rcp = G_AMDGPU_RCP_IFLAG %mad
4538// %mul1 = G_FMUL %rcp, 0x5f7ffffc
4539// %mul2 = G_FMUL %mul1, 2**(-32)
4540// %trunc = G_INTRINSIC_TRUNC %mul2
4541// %mad2 = G_FMAD %trunc, -(2**32), %mul1
4542// return {G_FPTOUI %mad2, G_FPTOUI %trunc}
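// The float constants used below are: 0x4f800000 = 2^32, 0x5f7ffffc is just
// under 2^64 (biasing the estimate low), 0x2f800000 = 2^-32, and
// 0xcf800000 = -2^32. The net result approximates 2^64 / Val, returned as
// {low 32 bits, high 32 bits}.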
4543static std::pair<Register, Register> emitReciprocalU64(MachineIRBuilder &B,
4544 Register Val) {
4545 const LLT S32 = LLT::scalar(32);
4546 auto Unmerge = B.buildUnmerge(S32, Val);
4547
4548 auto CvtLo = B.buildUITOFP(S32, Unmerge.getReg(0));
4549 auto CvtHi = B.buildUITOFP(S32, Unmerge.getReg(1));
4550
4551 auto Mad = B.buildFMAD(
4552 S32, CvtHi, // 2**32
4553 B.buildFConstant(S32, llvm::bit_cast<float>(0x4f800000)), CvtLo);
4554
4555 auto Rcp = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {Mad});
4556 auto Mul1 = B.buildFMul(
4557 S32, Rcp, B.buildFConstant(S32, llvm::bit_cast<float>(0x5f7ffffc)));
4558
4559 // 2**(-32)
4560 auto Mul2 = B.buildFMul(
4561 S32, Mul1, B.buildFConstant(S32, llvm::bit_cast<float>(0x2f800000)));
4562 auto Trunc = B.buildIntrinsicTrunc(S32, Mul2);
4563
4564 // -(2**32)
4565 auto Mad2 = B.buildFMAD(
4566 S32, Trunc, B.buildFConstant(S32, llvm::bit_cast<float>(0xcf800000)),
4567 Mul1);
4568
4569 auto ResultLo = B.buildFPTOUI(S32, Mad2);
4570 auto ResultHi = B.buildFPTOUI(S32, Trunc);
4571
4572 return {ResultLo.getReg(0), ResultHi.getReg(0)};
4573}
4574
4575void AMDGPULegalizerInfo::legalizeUnsignedDIV_REM64Impl(MachineIRBuilder &B,
4576 Register DstDivReg,
4577 Register DstRemReg,
4578 Register Numer,
4579 Register Denom) const {
4580 const LLT S32 = LLT::scalar(32);
4581 const LLT S64 = LLT::scalar(64);
4582 const LLT S1 = LLT::scalar(1);
4583 Register RcpLo, RcpHi;
4584
4585 std::tie(RcpLo, RcpHi) = emitReciprocalU64(B, Denom);
4586
4587 auto Rcp = B.buildMergeLikeInstr(S64, {RcpLo, RcpHi});
4588
4589 auto Zero64 = B.buildConstant(S64, 0);
4590 auto NegDenom = B.buildSub(S64, Zero64, Denom);
4591
4592 auto MulLo1 = B.buildMul(S64, NegDenom, Rcp);
4593 auto MulHi1 = B.buildUMulH(S64, Rcp, MulLo1);
4594
4595 auto UnmergeMulHi1 = B.buildUnmerge(S32, MulHi1);
4596 Register MulHi1_Lo = UnmergeMulHi1.getReg(0);
4597 Register MulHi1_Hi = UnmergeMulHi1.getReg(1);
4598
4599 auto Add1_Lo = B.buildUAddo(S32, S1, RcpLo, MulHi1_Lo);
4600 auto Add1_Hi = B.buildUAdde(S32, S1, RcpHi, MulHi1_Hi, Add1_Lo.getReg(1));
4601 auto Add1 = B.buildMergeLikeInstr(S64, {Add1_Lo, Add1_Hi});
4602
4603 auto MulLo2 = B.buildMul(S64, NegDenom, Add1);
4604 auto MulHi2 = B.buildUMulH(S64, Add1, MulLo2);
4605 auto UnmergeMulHi2 = B.buildUnmerge(S32, MulHi2);
4606 Register MulHi2_Lo = UnmergeMulHi2.getReg(0);
4607 Register MulHi2_Hi = UnmergeMulHi2.getReg(1);
4608
4609 auto Zero32 = B.buildConstant(S32, 0);
4610 auto Add2_Lo = B.buildUAddo(S32, S1, Add1_Lo, MulHi2_Lo);
4611 auto Add2_Hi = B.buildUAdde(S32, S1, Add1_Hi, MulHi2_Hi, Add2_Lo.getReg(1));
4612 auto Add2 = B.buildMergeLikeInstr(S64, {Add2_Lo, Add2_Hi});
4613
4614 auto UnmergeNumer = B.buildUnmerge(S32, Numer);
4615 Register NumerLo = UnmergeNumer.getReg(0);
4616 Register NumerHi = UnmergeNumer.getReg(1);
4617
4618 auto MulHi3 = B.buildUMulH(S64, Numer, Add2);
4619 auto Mul3 = B.buildMul(S64, Denom, MulHi3);
4620 auto UnmergeMul3 = B.buildUnmerge(S32, Mul3);
4621 Register Mul3_Lo = UnmergeMul3.getReg(0);
4622 Register Mul3_Hi = UnmergeMul3.getReg(1);
4623 auto Sub1_Lo = B.buildUSubo(S32, S1, NumerLo, Mul3_Lo);
4624 auto Sub1_Hi = B.buildUSube(S32, S1, NumerHi, Mul3_Hi, Sub1_Lo.getReg(1));
4625 auto Sub1_Mi = B.buildSub(S32, NumerHi, Mul3_Hi);
4626 auto Sub1 = B.buildMergeLikeInstr(S64, {Sub1_Lo, Sub1_Hi});
4627
4628 auto UnmergeDenom = B.buildUnmerge(S32, Denom);
4629 Register DenomLo = UnmergeDenom.getReg(0);
4630 Register DenomHi = UnmergeDenom.getReg(1);
4631
4632 auto CmpHi = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Hi, DenomHi);
4633 auto C1 = B.buildSExt(S32, CmpHi);
4634
4635 auto CmpLo = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Lo, DenomLo);
4636 auto C2 = B.buildSExt(S32, CmpLo);
4637
4638 auto CmpEq = B.buildICmp(CmpInst::ICMP_EQ, S1, Sub1_Hi, DenomHi);
4639 auto C3 = B.buildSelect(S32, CmpEq, C2, C1);
4640
4641 // TODO: Here and below, portions of the code can be enclosed in if/endif.
4642 // Currently the control flow is unconditional and we have 4 selects after
4643 // the potential endif to substitute for PHIs.
4644
4645 // if C3 != 0 ...
4646 auto Sub2_Lo = B.buildUSubo(S32, S1, Sub1_Lo, DenomLo);
4647 auto Sub2_Mi = B.buildUSube(S32, S1, Sub1_Mi, DenomHi, Sub1_Lo.getReg(1));
4648 auto Sub2_Hi = B.buildUSube(S32, S1, Sub2_Mi, Zero32, Sub2_Lo.getReg(1));
4649 auto Sub2 = B.buildMergeLikeInstr(S64, {Sub2_Lo, Sub2_Hi});
4650
4651 auto One64 = B.buildConstant(S64, 1);
4652 auto Add3 = B.buildAdd(S64, MulHi3, One64);
4653
4654 auto C4 =
4655 B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Hi, DenomHi));
4656 auto C5 =
4657 B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Lo, DenomLo));
4658 auto C6 = B.buildSelect(
4659 S32, B.buildICmp(CmpInst::ICMP_EQ, S1, Sub2_Hi, DenomHi), C5, C4);
4660
4661 // if (C6 != 0)
4662 auto Add4 = B.buildAdd(S64, Add3, One64);
4663 auto Sub3_Lo = B.buildUSubo(S32, S1, Sub2_Lo, DenomLo);
4664
4665 auto Sub3_Mi = B.buildUSube(S32, S1, Sub2_Mi, DenomHi, Sub2_Lo.getReg(1));
4666 auto Sub3_Hi = B.buildUSube(S32, S1, Sub3_Mi, Zero32, Sub3_Lo.getReg(1));
4667 auto Sub3 = B.buildMergeLikeInstr(S64, {Sub3_Lo, Sub3_Hi});
4668
4669 // endif C6
4670 // endif C3
4671
4672 if (DstDivReg) {
4673 auto Sel1 = B.buildSelect(
4674 S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Add4, Add3);
4675 B.buildSelect(DstDivReg, B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32),
4676 Sel1, MulHi3);
4677 }
4678
4679 if (DstRemReg) {
4680 auto Sel2 = B.buildSelect(
4681 S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Sub3, Sub2);
4682 B.buildSelect(DstRemReg, B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32),
4683 Sel2, Sub1);
4684 }
4685}
4686
4687bool AMDGPULegalizerInfo::legalizeUnsignedDIV_REM(MachineInstr &MI,
4688 MachineRegisterInfo &MRI,
4689 MachineIRBuilder &B) const {
4690 Register DstDivReg, DstRemReg;
4691 switch (MI.getOpcode()) {
4692 default:
4693 llvm_unreachable("Unexpected opcode!");
4694 case AMDGPU::G_UDIV: {
4695 DstDivReg = MI.getOperand(0).getReg();
4696 break;
4697 }
4698 case AMDGPU::G_UREM: {
4699 DstRemReg = MI.getOperand(0).getReg();
4700 break;
4701 }
4702 case AMDGPU::G_UDIVREM: {
4703 DstDivReg = MI.getOperand(0).getReg();
4704 DstRemReg = MI.getOperand(1).getReg();
4705 break;
4706 }
4707 }
4708
4709 const LLT S64 = LLT::scalar(64);
4710 const LLT S32 = LLT::scalar(32);
4711 const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs();
4712 Register Num = MI.getOperand(FirstSrcOpIdx).getReg();
4713 Register Den = MI.getOperand(FirstSrcOpIdx + 1).getReg();
4714 LLT Ty = MRI.getType(MI.getOperand(0).getReg());
4715
4716 if (Ty == S32)
4717 legalizeUnsignedDIV_REM32Impl(B, DstDivReg, DstRemReg, Num, Den);
4718 else if (Ty == S64)
4719 legalizeUnsignedDIV_REM64Impl(B, DstDivReg, DstRemReg, Num, Den);
4720 else
4721 return false;
4722
4723 MI.eraseFromParent();
4724 return true;
4725}
4726
4727bool AMDGPULegalizerInfo::legalizeSignedDIV_REM(MachineInstr &MI,
4728 MachineRegisterInfo &MRI,
4729 MachineIRBuilder &B) const {
4730 const LLT S64 = LLT::scalar(64);
4731 const LLT S32 = LLT::scalar(32);
4732
4733 LLT Ty = MRI.getType(MI.getOperand(0).getReg());
4734 if (Ty != S32 && Ty != S64)
4735 return false;
4736
4737 const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs();
4738 Register LHS = MI.getOperand(FirstSrcOpIdx).getReg();
4739 Register RHS = MI.getOperand(FirstSrcOpIdx + 1).getReg();
4740
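 // Take absolute values branchlessly: with sign = x >> (bitwidth - 1), which is
 // all ones for negative x and zero otherwise, (x + sign) ^ sign equals -x for
 // negative x and x otherwise. The quotient/remainder signs are restored below
 // the same way with (tmp ^ sign) - sign.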
4741 auto SignBitOffset = B.buildConstant(S32, Ty.getSizeInBits() - 1);
4742 auto LHSign = B.buildAShr(Ty, LHS, SignBitOffset);
4743 auto RHSign = B.buildAShr(Ty, RHS, SignBitOffset);
4744
4745 LHS = B.buildAdd(Ty, LHS, LHSign).getReg(0);
4746 RHS = B.buildAdd(Ty, RHS, RHSign).getReg(0);
4747
4748 LHS = B.buildXor(Ty, LHS, LHSign).getReg(0);
4749 RHS = B.buildXor(Ty, RHS, RHSign).getReg(0);
4750
4751 Register DstDivReg, DstRemReg, TmpDivReg, TmpRemReg;
4752 switch (MI.getOpcode()) {
4753 default:
4754 llvm_unreachable("Unexpected opcode!");
4755 case AMDGPU::G_SDIV: {
4756 DstDivReg = MI.getOperand(0).getReg();
4757 TmpDivReg = MRI.createGenericVirtualRegister(Ty);
4758 break;
4759 }
4760 case AMDGPU::G_SREM: {
4761 DstRemReg = MI.getOperand(0).getReg();
4762 TmpRemReg = MRI.createGenericVirtualRegister(Ty);
4763 break;
4764 }
4765 case AMDGPU::G_SDIVREM: {
4766 DstDivReg = MI.getOperand(0).getReg();
4767 DstRemReg = MI.getOperand(1).getReg();
4768 TmpDivReg = MRI.createGenericVirtualRegister(Ty);
4769 TmpRemReg = MRI.createGenericVirtualRegister(Ty);
4770 break;
4771 }
4772 }
4773
4774 if (Ty == S32)
4775 legalizeUnsignedDIV_REM32Impl(B, TmpDivReg, TmpRemReg, LHS, RHS);
4776 else
4777 legalizeUnsignedDIV_REM64Impl(B, TmpDivReg, TmpRemReg, LHS, RHS);
4778
4779 if (DstDivReg) {
4780 auto Sign = B.buildXor(Ty, LHSign, RHSign).getReg(0);
4781 auto SignXor = B.buildXor(Ty, TmpDivReg, Sign).getReg(0);
4782 B.buildSub(DstDivReg, SignXor, Sign);
4783 }
4784
4785 if (DstRemReg) {
4786 auto Sign = LHSign.getReg(0); // Remainder sign is the same as LHS
4787 auto SignXor = B.buildXor(Ty, TmpRemReg, Sign).getReg(0);
4788 B.buildSub(DstRemReg, SignXor, Sign);
4789 }
4790
4791 MI.eraseFromParent();
4792 return true;
4793}
4794
4795bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI,
4796 MachineRegisterInfo &MRI,
4797 MachineIRBuilder &B) const {
4798 Register Res = MI.getOperand(0).getReg();
4799 Register LHS = MI.getOperand(1).getReg();
4800 Register RHS = MI.getOperand(2).getReg();
4801 uint16_t Flags = MI.getFlags();
4802 LLT ResTy = MRI.getType(Res);
4803
4804 const MachineFunction &MF = B.getMF();
4805 bool AllowInaccurateRcp = MI.getFlag(MachineInstr::FmAfn) ||
4806 MF.getTarget().Options.UnsafeFPMath;
4807
4808 if (const auto *CLHS = getConstantFPVRegVal(LHS, MRI)) {
4809 if (!AllowInaccurateRcp && ResTy != LLT::scalar(16))
4810 return false;
4811
4812 // v_rcp_f32 and v_rsq_f32 do not support denormals and, according to
4813 // the CI documentation, have a worst-case error of 1 ulp.
4814 // OpenCL requires <= 2.5 ulp for 1.0 / x, so it should always be OK to
4815 // use it as long as we aren't trying to use denormals.
4816 //
4817 // v_rcp_f16 and v_rsq_f16 DO support denormals and have a 0.51 ulp error.
4818
4819 // 1 / x -> RCP(x)
4820 if (CLHS->isExactlyValue(1.0)) {
4821 B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res)
4822 .addUse(RHS)
4823 .setMIFlags(Flags);
4824
4825 MI.eraseFromParent();
4826 return true;
4827 }
4828
4829 // -1 / x -> RCP( FNEG(x) )
4830 if (CLHS->isExactlyValue(-1.0)) {
4831 auto FNeg = B.buildFNeg(ResTy, RHS, Flags);
4832 B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res)
4833 .addUse(FNeg.getReg(0))
4834 .setMIFlags(Flags);
4835
4836 MI.eraseFromParent();
4837 return true;
4838 }
4839 }
4840
4841 // For f16 require afn or arcp.
4842 // For f32 require afn.
4843 if (!AllowInaccurateRcp && (ResTy != LLT::scalar(16) ||
4844 !MI.getFlag(MachineInstr::FmArcp)))
4845 return false;
4846
4847 // x / y -> x * (1.0 / y)
4848 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy})
4849 .addUse(RHS)
4850 .setMIFlags(Flags);
4851 B.buildFMul(Res, LHS, RCP, Flags);
4852
4853 MI.eraseFromParent();
4854 return true;
4855}
4856
4857bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV64(MachineInstr &MI,
4858 MachineRegisterInfo &MRI,
4859 MachineIRBuilder &B) const {
4860 Register Res = MI.getOperand(0).getReg();
4861 Register X = MI.getOperand(1).getReg();
4862 Register Y = MI.getOperand(2).getReg();
4863 uint16_t Flags = MI.getFlags();
4864 LLT ResTy = MRI.getType(Res);
4865
4866 const MachineFunction &MF = B.getMF();
4867 bool AllowInaccurateRcp = MF.getTarget().Options.UnsafeFPMath ||
4868 MI.getFlag(MachineInstr::FmAfn);
4869
4870 if (!AllowInaccurateRcp)
4871 return false;
4872
4873 auto NegY = B.buildFNeg(ResTy, Y);
4874 auto One = B.buildFConstant(ResTy, 1.0);
4875
4876 auto R = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy})
4877 .addUse(Y)
4878 .setMIFlags(Flags);
4879
4880 auto Tmp0 = B.buildFMA(ResTy, NegY, R, One);
4881 R = B.buildFMA(ResTy, Tmp0, R, R);
4882
4883 auto Tmp1 = B.buildFMA(ResTy, NegY, R, One);
4884 R = B.buildFMA(ResTy, Tmp1, R, R);
4885
4886 auto Ret = B.buildFMul(ResTy, X, R);
4887 auto Tmp2 = B.buildFMA(ResTy, NegY, Ret, X);
4888
4889 B.buildFMA(Res, Tmp2, R, Ret);
4890 MI.eraseFromParent();
4891 return true;
4892}
4893
4894bool AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI,
4895 MachineRegisterInfo &MRI,
4896 MachineIRBuilder &B) const {
4897 if (legalizeFastUnsafeFDIV(MI, MRI, B))
4898 return true;
4899
4900 Register Res = MI.getOperand(0).getReg();
4901 Register LHS = MI.getOperand(1).getReg();
4902 Register RHS = MI.getOperand(2).getReg();
4903
4904 uint16_t Flags = MI.getFlags();
4905
4906 LLT S16 = LLT::scalar(16);
4907 LLT S32 = LLT::scalar(32);
4908
4909 // a32.u = opx(V_CVT_F32_F16, a.u); // CVT to F32
4910 // b32.u = opx(V_CVT_F32_F16, b.u); // CVT to F32
4911 // r32.u = opx(V_RCP_F32, b32.u); // rcp = 1 / d
4912 // q32.u = opx(V_MUL_F32, a32.u, r32.u); // q = n * rcp
4913 // e32.u = opx(V_MAD_F32, (b32.u^_neg32), q32.u, a32.u); // err = -d * q + n
4914 // q32.u = opx(V_MAD_F32, e32.u, r32.u, q32.u); // q = n * rcp
4915 // e32.u = opx(V_MAD_F32, (b32.u^_neg32), q32.u, a32.u); // err = -d * q + n
4916 // tmp.u = opx(V_MUL_F32, e32.u, r32.u);
4917 // tmp.u = opx(V_AND_B32, tmp.u, 0xff800000)
4918 // q32.u = opx(V_ADD_F32, tmp.u, q32.u);
4919 // q16.u = opx(V_CVT_F16_F32, q32.u);
4920 // q16.u = opx(V_DIV_FIXUP_F16, q16.u, b.u, a.u); // q = touchup(q, d, n)
4921
4922 auto LHSExt = B.buildFPExt(S32, LHS, Flags);
4923 auto RHSExt = B.buildFPExt(S32, RHS, Flags);
4924 auto NegRHSExt = B.buildFNeg(S32, RHSExt);
4925 auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32})
4926 .addUse(RHSExt.getReg(0))
4927 .setMIFlags(Flags);
4928 auto Quot = B.buildFMul(S32, LHSExt, Rcp, Flags);
4929 MachineInstrBuilder Err;
4930 if (ST.hasMadMacF32Insts()) {
4931 Err = B.buildFMAD(S32, NegRHSExt, Quot, LHSExt, Flags);
4932 Quot = B.buildFMAD(S32, Err, Rcp, Quot, Flags);
4933 Err = B.buildFMAD(S32, NegRHSExt, Quot, LHSExt, Flags);
4934 } else {
4935 Err = B.buildFMA(S32, NegRHSExt, Quot, LHSExt, Flags);
4936 Quot = B.buildFMA(S32, Err, Rcp, Quot, Flags);
4937 Err = B.buildFMA(S32, NegRHSExt, Quot, LHSExt, Flags);
4938 }
4939 auto Tmp = B.buildFMul(S32, Err, Rcp, Flags);
4940 Tmp = B.buildAnd(S32, Tmp, B.buildConstant(S32, 0xff800000));
4941 Quot = B.buildFAdd(S32, Tmp, Quot, Flags);
4942 auto RDst = B.buildFPTrunc(S16, Quot, Flags);
4943 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res)
4944 .addUse(RDst.getReg(0))
4945 .addUse(RHS)
4946 .addUse(LHS)
4947 .setMIFlags(Flags);
4948
4949 MI.eraseFromParent();
4950 return true;
4951}
4952
4953static constexpr unsigned SPDenormModeBitField =
4955
4956// Enable or disable FP32 denorm mode. When 'Enable' is true, emit instructions
4957// to enable denorm mode. When 'Enable' is false, disable denorm mode.
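// The 4-bit denorm field packs the FP32 mode in bits [1:0] and the shared
// FP64/FP16 mode in bits [3:2], which is why the DP default is shifted left
// by 2 below.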
4958static void toggleSPDenormMode(bool Enable, MachineIRBuilder &B,
4959 const GCNSubtarget &ST,
4960 SIModeRegisterDefaults Mode) {
4961 // Set SP denorm mode to this value.
4962 unsigned SPDenormMode =
4963 Enable ? FP_DENORM_FLUSH_NONE : Mode.fpDenormModeSPValue();
4964
4965 if (ST.hasDenormModeInst()) {
4966 // Preserve default FP64FP16 denorm mode while updating FP32 mode.
4967 uint32_t DPDenormModeDefault = Mode.fpDenormModeDPValue();
4968
4969 uint32_t NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2);
4970 B.buildInstr(AMDGPU::S_DENORM_MODE)
4971 .addImm(NewDenormModeValue);
4972
4973 } else {
4974 B.buildInstr(AMDGPU::S_SETREG_IMM32_B32)
4975 .addImm(SPDenormMode)
4976 .addImm(SPDenormModeBitField);
4977 }
4978}
4979
4980bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI,
4981 MachineRegisterInfo &MRI,
4982 MachineIRBuilder &B) const {
4983 if (legalizeFastUnsafeFDIV(MI, MRI, B))
4984 return true;
4985
4986 Register Res = MI.getOperand(0).getReg();
4987 Register LHS = MI.getOperand(1).getReg();
4988 Register RHS = MI.getOperand(2).getReg();
4989 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
4990 SIModeRegisterDefaults Mode = MFI->getMode();
4991
4992 uint16_t Flags = MI.getFlags();
4993
4994 LLT S32 = LLT::scalar(32);
4995 LLT S1 = LLT::scalar(1);
4996
4997 auto One = B.buildFConstant(S32, 1.0f);
4998
4999 auto DenominatorScaled =
5000 B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1})
5001 .addUse(LHS)
5002 .addUse(RHS)
5003 .addImm(0)
5004 .setMIFlags(Flags);
5005 auto NumeratorScaled =
5006 B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1})
5007 .addUse(LHS)
5008 .addUse(RHS)
5009 .addImm(1)
5010 .setMIFlags(Flags);
5011
5012 auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32})
5013 .addUse(DenominatorScaled.getReg(0))
5014 .setMIFlags(Flags);
5015 auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags);
5016
5017 const bool PreservesDenormals = Mode.FP32Denormals == DenormalMode::getIEEE();
5018 const bool HasDynamicDenormals =
5019 (Mode.FP32Denormals.Input == DenormalMode::Dynamic) ||
5020 (Mode.FP32Denormals.Output == DenormalMode::Dynamic);
5021
5022 Register SavedSPDenormMode;
5023 if (!PreservesDenormals) {
5024 if (HasDynamicDenormals) {
5025 SavedSPDenormMode = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5026 B.buildInstr(AMDGPU::S_GETREG_B32)
5027 .addDef(SavedSPDenormMode)
5028 .addImm(SPDenormModeBitField);
5029 }
5030 toggleSPDenormMode(true, B, ST, Mode);
5031 }
5032
5033 auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags);
5034 auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags);
5035 auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags);
5036 auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags);
5037 auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags);
5038 auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags);
5039
5040 if (!PreservesDenormals) {
5041 if (HasDynamicDenormals) {
5042 assert(SavedSPDenormMode);
5043 B.buildInstr(AMDGPU::S_SETREG_B32)
5044 .addReg(SavedSPDenormMode)
5045 .addImm(SPDenormModeBitField);
5046 } else
5047 toggleSPDenormMode(false, B, ST, Mode);
5048 }
5049
5050 auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32})
5051 .addUse(Fma4.getReg(0))
5052 .addUse(Fma1.getReg(0))
5053 .addUse(Fma3.getReg(0))
5054 .addUse(NumeratorScaled.getReg(1))
5055 .setMIFlags(Flags);
5056
5057 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res)
5058 .addUse(Fmas.getReg(0))
5059 .addUse(RHS)
5060 .addUse(LHS)
5061 .setMIFlags(Flags);
5062
5063 MI.eraseFromParent();
5064 return true;
5065}
5066
5067bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI,
5068 MachineRegisterInfo &MRI,
5069 MachineIRBuilder &B) const {
5070 if (legalizeFastUnsafeFDIV64(MI, MRI, B))
5071 return true;
5072
5073 Register Res = MI.getOperand(0).getReg();
5074 Register LHS = MI.getOperand(1).getReg();
5075 Register RHS = MI.getOperand(2).getReg();
5076
5077 uint16_t Flags = MI.getFlags();
5078
5079 LLT S64 = LLT::scalar(64);
5080 LLT S1 = LLT::scalar(1);
5081
5082 auto One = B.buildFConstant(S64, 1.0);
5083
5084 auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1})
5085 .addUse(LHS)
5086 .addUse(RHS)
5087 .addImm(0)
5088 .setMIFlags(Flags);
5089
5090 auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags);
5091
5092 auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64})
5093 .addUse(DivScale0.getReg(0))
5094 .setMIFlags(Flags);
5095
5096 auto Fma0 = B.buildFMA(S64, NegDivScale0, Rcp, One, Flags);
5097 auto Fma1 = B.buildFMA(S64, Rcp, Fma0, Rcp, Flags);
5098 auto Fma2 = B.buildFMA(S64, NegDivScale0, Fma1, One, Flags);
5099
5100 auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1})
5101 .addUse(LHS)
5102 .addUse(RHS)
5103 .addImm(1)
5104 .setMIFlags(Flags);
5105
5106 auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags);
5107 auto Mul = B.buildFMul(S64, DivScale1.getReg(0), Fma3, Flags);
5108 auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags);
5109
5110 Register Scale;
5111 if (!ST.hasUsableDivScaleConditionOutput()) {
5112 // Workaround a hardware bug on SI where the condition output from div_scale
5113 // is not usable.
5114
5115 LLT S32 = LLT::scalar(32);
5116
5117 auto NumUnmerge = B.buildUnmerge(S32, LHS);
5118 auto DenUnmerge = B.buildUnmerge(S32, RHS);
5119 auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0);
5120 auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1);
5121
5122 auto CmpNum = B.buildICmp(ICmpInst::ICMP_EQ, S1, NumUnmerge.getReg(1),
5123 Scale1Unmerge.getReg(1));
5124 auto CmpDen = B.buildICmp(ICmpInst::ICMP_EQ, S1, DenUnmerge.getReg(1),
5125 Scale0Unmerge.getReg(1));
5126 Scale = B.buildXor(S1, CmpNum, CmpDen).getReg(0);
5127 } else {
5128 Scale = DivScale1.getReg(1);
5129 }
5130
5131 auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64})
5132 .addUse(Fma4.getReg(0))
5133 .addUse(Fma3.getReg(0))
5134 .addUse(Mul.getReg(0))
5135 .addUse(Scale)
5136 .setMIFlags(Flags);
5137
5138 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, ArrayRef(Res))
5139 .addUse(Fmas.getReg(0))
5140 .addUse(RHS)
5141 .addUse(LHS)
5142 .setMIFlags(Flags);
5143
5144 MI.eraseFromParent();
5145 return true;
5146}
5147
5148bool AMDGPULegalizerInfo::legalizeFFREXP(MachineInstr &MI,
5149 MachineRegisterInfo &MRI,
5150 MachineIRBuilder &B) const {
5151 Register Res0 = MI.getOperand(0).getReg();
5152 Register Res1 = MI.getOperand(1).getReg();
5153 Register Val = MI.getOperand(2).getReg();
5154 uint16_t Flags = MI.getFlags();
5155
5156 LLT Ty = MRI.getType(Res0);
5157 LLT InstrExpTy = Ty == LLT::scalar(16) ? LLT::scalar(16) : LLT::scalar(32);
5158
5159 auto Mant = B.buildIntrinsic(Intrinsic::amdgcn_frexp_mant, {Ty})
5160 .addUse(Val)
5161 .setMIFlags(Flags);
5162 auto Exp = B.buildIntrinsic(Intrinsic::amdgcn_frexp_exp, {InstrExpTy})
5163 .addUse(Val)
5164 .setMIFlags(Flags);
5165
5166 if (ST.hasFractBug()) {
5167 auto Fabs = B.buildFAbs(Ty, Val);
5168 auto Inf = B.buildFConstant(Ty, APFloat::getInf(getFltSemanticForLLT(Ty)));
5169 auto IsFinite =
5170 B.buildFCmp(CmpInst::FCMP_OLT, LLT::scalar(1), Fabs, Inf, Flags);
5171 auto Zero = B.buildConstant(InstrExpTy, 0);
5172 Exp = B.buildSelect(InstrExpTy, IsFinite, Exp, Zero);
5173 Mant = B.buildSelect(Ty, IsFinite, Mant, Val);
5174 }
5175
5176 B.buildCopy(Res0, Mant);
5177 B.buildSExtOrTrunc(Res1, Exp);
5178
5179 MI.eraseFromParent();
5180 return true;
5181}
5182
5183bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI,
5184 MachineRegisterInfo &MRI,
5185 MachineIRBuilder &B) const {
5186 Register Res = MI.getOperand(0).getReg();
5187 Register LHS = MI.getOperand(2).getReg();
5188 Register RHS = MI.getOperand(3).getReg();
5189 uint16_t Flags = MI.getFlags();
5190
5191 LLT S32 = LLT::scalar(32);
5192 LLT S1 = LLT::scalar(1);
5193
5194 auto Abs = B.buildFAbs(S32, RHS, Flags);
5195 const APFloat C0Val(1.0f);
5196
5197 auto C0 = B.buildFConstant(S32, 0x1p+96f);
5198 auto C1 = B.buildFConstant(S32, 0x1p-32f);
5199 auto C2 = B.buildFConstant(S32, 1.0f);
5200
5201 auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags);
5202 auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags);
5203
5204 auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags);
5205
5206 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32})
5207 .addUse(Mul0.getReg(0))
5208 .setMIFlags(Flags);
5209
5210 auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags);
5211
5212 B.buildFMul(Res, Sel, Mul1, Flags);
5213
5214 MI.eraseFromParent();
5215 return true;
5216}
5217
5220 MachineIRBuilder &B) const {
5221 // Bypass the correct expansion that a standard promotion through G_FSQRT
5222 // would get. The f32 op is accurate enough for the f16 case.
5223 unsigned Flags = MI.getFlags();
5224 assert(!ST.has16BitInsts());
5225 const LLT F32 = LLT::scalar(32);
5226 auto Ext = B.buildFPExt(F32, MI.getOperand(1), Flags);
5227 auto Log2 = B.buildIntrinsic(Intrinsic::amdgcn_sqrt, {F32})
5228 .addUse(Ext.getReg(0))
5229 .setMIFlags(Flags);
5230 B.buildFPTrunc(MI.getOperand(0), Log2, Flags);
5231 MI.eraseFromParent();
5232 return true;
5233}
5234
5237 MachineIRBuilder &B) const {
5238 MachineFunction &MF = B.getMF();
5239 Register Dst = MI.getOperand(0).getReg();
5240 Register X = MI.getOperand(1).getReg();
5241 const unsigned Flags = MI.getFlags();
5242 const LLT S1 = LLT::scalar(1);
5243 const LLT F32 = LLT::scalar(32);
5244 const LLT I32 = LLT::scalar(32);
5245
5246 if (allowApproxFunc(MF, Flags)) {
5247 B.buildIntrinsic(Intrinsic::amdgcn_sqrt, ArrayRef<Register>({Dst}))
5248 .addUse(X)
5249 .setMIFlags(Flags);
5250 MI.eraseFromParent();
5251 return true;
5252 }
5253
5254 auto ScaleThreshold = B.buildFConstant(F32, 0x1.0p-96f);
5255 auto NeedScale = B.buildFCmp(CmpInst::FCMP_OGT, S1, ScaleThreshold, X, Flags);
5256 auto ScaleUpFactor = B.buildFConstant(F32, 0x1.0p+32f);
5257 auto ScaledX = B.buildFMul(F32, X, ScaleUpFactor, Flags);
5258 auto SqrtX = B.buildSelect(F32, NeedScale, ScaledX, X, Flags);
5259
5260 Register SqrtS = MRI.createGenericVirtualRegister(F32);
5261 if (needsDenormHandlingF32(MF, X, Flags)) {
5262 B.buildIntrinsic(Intrinsic::amdgcn_sqrt, ArrayRef<Register>({SqrtS}))
5263 .addUse(SqrtX.getReg(0))
5264 .setMIFlags(Flags);
5265
5266 auto NegOne = B.buildConstant(I32, -1);
5267 auto SqrtSNextDown = B.buildAdd(I32, SqrtS, NegOne);
5268
5269 auto NegSqrtSNextDown = B.buildFNeg(F32, SqrtSNextDown, Flags);
5270 auto SqrtVP = B.buildFMA(F32, NegSqrtSNextDown, SqrtS, SqrtX, Flags);
5271
5272 auto PosOne = B.buildConstant(I32, 1);
5273 auto SqrtSNextUp = B.buildAdd(I32, SqrtS, PosOne);
5274
5275 auto NegSqrtSNextUp = B.buildFNeg(F32, SqrtSNextUp, Flags);
5276 auto SqrtVS = B.buildFMA(F32, NegSqrtSNextUp, SqrtS, SqrtX, Flags);
5277
5278 auto Zero = B.buildFConstant(F32, 0.0f);
5279 auto SqrtVPLE0 = B.buildFCmp(CmpInst::FCMP_OLE, S1, SqrtVP, Zero, Flags);
5280
5281 SqrtS =
5282 B.buildSelect(F32, SqrtVPLE0, SqrtSNextDown, SqrtS, Flags).getReg(0);
5283
5284 auto SqrtVPVSGT0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, SqrtVS, Zero, Flags);
5285 SqrtS =
5286 B.buildSelect(F32, SqrtVPVSGT0, SqrtSNextUp, SqrtS, Flags).getReg(0);
5287 } else {
5288 auto SqrtR =
5289 B.buildIntrinsic(Intrinsic::amdgcn_rsq, {F32}).addReg(SqrtX.getReg(0));
5290 B.buildFMul(SqrtS, SqrtX, SqrtR, Flags);
5291
5292 auto Half = B.buildFConstant(F32, 0.5f);
5293 auto SqrtH = B.buildFMul(F32, SqrtR, Half, Flags);
5294 auto NegSqrtH = B.buildFNeg(F32, SqrtH, Flags);
5295 auto SqrtE = B.buildFMA(F32, NegSqrtH, SqrtS, Half, Flags);
5296 SqrtH = B.buildFMA(F32, SqrtH, SqrtE, SqrtH, Flags);
5297 SqrtS = B.buildFMA(F32, SqrtS, SqrtE, SqrtS, Flags).getReg(0);
5298 auto NegSqrtS = B.buildFNeg(F32, SqrtS, Flags);
5299 auto SqrtD = B.buildFMA(F32, NegSqrtS, SqrtS, SqrtX, Flags);
5300 SqrtS = B.buildFMA(F32, SqrtD, SqrtH, SqrtS, Flags).getReg(0);
5301 }
5302
5303 auto ScaleDownFactor = B.buildFConstant(F32, 0x1.0p-16f);
5304
5305 auto ScaledDown = B.buildFMul(F32, SqrtS, ScaleDownFactor, Flags);
5306
5307 SqrtS = B.buildSelect(F32, NeedScale, ScaledDown, SqrtS, Flags).getReg(0);
5308
5309 auto IsZeroOrInf = B.buildIsFPClass(LLT::scalar(1), SqrtX, fcZero | fcPosInf);
5310 B.buildSelect(Dst, IsZeroOrInf, SqrtX, SqrtS, Flags);
5311
5312 MI.eraseFromParent();
5313 return true;
5314}
5315
5318 MachineIRBuilder &B) const {
5319 // For the double type, the SQRT and RSQ instructions don't have the required
5320 // precision, so we apply Goldschmidt's algorithm to improve the result:
5321 //
5322 // y0 = rsq(x)
5323 // g0 = x * y0
5324 // h0 = 0.5 * y0
5325 //
5326 // r0 = 0.5 - h0 * g0
5327 // g1 = g0 * r0 + g0
5328 // h1 = h0 * r0 + h0
5329 //
5330 // r1 = 0.5 - h1 * g1 => d0 = x - g1 * g1
5331 // g2 = g1 * r1 + g1 g2 = d0 * h1 + g1
5332 // h2 = h1 * r1 + h1
5333 //
5334 // r2 = 0.5 - h2 * g2 => d1 = x - g2 * g2
5335 // g3 = g2 * r2 + g2 g3 = d1 * h1 + g2
5336 //
5337 // sqrt(x) = g3
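//
// As a rough cross-check, the same iteration written as plain scalar C
// (illustrative sketch only; assumes an IEEE-conformant fma() from <math.h>,
// and uses hypothetical variable names, not the registers built below):
//
//   double y0 = 1.0 / sqrt(x);            // stands in for rsq(x)
//   double g = x * y0, h = 0.5 * y0;      // g0, h0
//   double r = fma(-h, g, 0.5);           // r0
//   g = fma(g, r, g); h = fma(h, r, h);   // g1, h1
//   double d = fma(-g, g, x);             // d0 = x - g1*g1
//   g = fma(d, h, g);                     // g2
//   d = fma(-g, g, x);                    // d1 = x - g2*g2
//   g = fma(d, h, g);                     // g3 ~= sqrt(x)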
5338
5339 const LLT S1 = LLT::scalar(1);
5340 const LLT S32 = LLT::scalar(32);
5341 const LLT F64 = LLT::scalar(64);
5342
5343 Register Dst = MI.getOperand(0).getReg();
5344 assert(MRI.getType(Dst) == F64 && "only expect to lower f64 sqrt");
5345
5346 Register X = MI.getOperand(1).getReg();
5347 unsigned Flags = MI.getFlags();
5348
5349 auto ScaleConstant = B.buildFConstant(F64, 0x1.0p-767);
5350
5351 auto ZeroInt = B.buildConstant(S32, 0);
5352 auto Scaling = B.buildFCmp(FCmpInst::FCMP_OLT, S1, X, ScaleConstant);
5353
5354 // Scale up input if it is too small.
5355 auto ScaleUpFactor = B.buildConstant(S32, 256);
5356 auto ScaleUp = B.buildSelect(S32, Scaling, ScaleUpFactor, ZeroInt);
5357 auto SqrtX = B.buildFLdexp(F64, X, ScaleUp, Flags);
5358
5359 auto SqrtY =
5360 B.buildIntrinsic(Intrinsic::amdgcn_rsq, {F64}).addReg(SqrtX.getReg(0));
5361
5362 auto Half = B.buildFConstant(F64, 0.5);
5363 auto SqrtH0 = B.buildFMul(F64, SqrtY, Half);
5364 auto SqrtS0 = B.buildFMul(F64, SqrtX, SqrtY);
5365
5366 auto NegSqrtH0 = B.buildFNeg(F64, SqrtH0);
5367 auto SqrtR0 = B.buildFMA(F64, NegSqrtH0, SqrtS0, Half);
5368
5369 auto SqrtS1 = B.buildFMA(F64, SqrtS0, SqrtR0, SqrtS0);
5370 auto SqrtH1 = B.buildFMA(F64, SqrtH0, SqrtR0, SqrtH0);
5371
5372 auto NegSqrtS1 = B.buildFNeg(F64, SqrtS1);
5373 auto SqrtD0 = B.buildFMA(F64, NegSqrtS1, SqrtS1, SqrtX);
5374
5375 auto SqrtS2 = B.buildFMA(F64, SqrtD0, SqrtH1, SqrtS1);
5376
5377 auto NegSqrtS2 = B.buildFNeg(F64, SqrtS2);
5378 auto SqrtD1 = B.buildFMA(F64, NegSqrtS2, SqrtS2, SqrtX);
5379
5380 auto SqrtRet = B.buildFMA(F64, SqrtD1, SqrtH1, SqrtS2);
5381
5382 // Scale down the result.
5383 auto ScaleDownFactor = B.buildConstant(S32, -128);
5384 auto ScaleDown = B.buildSelect(S32, Scaling, ScaleDownFactor, ZeroInt);
5385 SqrtRet = B.buildFLdexp(F64, SqrtRet, ScaleDown, Flags);
5386
5387 // TODO: Switch to fcmp oeq 0 for finite only. Can't fully remove this check
5388 // with finite only or nsz because rsq(+/-0) = +/-inf
5389
5390 // TODO: Check for DAZ and expand to subnormals
5391 auto IsZeroOrInf = B.buildIsFPClass(LLT::scalar(1), SqrtX, fcZero | fcPosInf);
5392
5393 // If x is +INF, +0, or -0, use its original value
5394 B.buildSelect(Dst, IsZeroOrInf, SqrtX, SqrtRet, Flags);
5395
5396 MI.eraseFromParent();
5397 return true;
5398}
5399
5402 MachineIRBuilder &B) const {
5403 LLT Ty = MRI.getType(MI.getOperand(0).getReg());
5404 if (Ty == LLT::scalar(32))
5405 return legalizeFSQRTF32(MI, MRI, B);
5406 if (Ty == LLT::scalar(64))
5407 return legalizeFSQRTF64(MI, MRI, B);
5408 if (Ty == LLT::scalar(16))
5409 return legalizeFSQRTF16(MI, MRI, B);
5410 return false;
5411}
5412
5413// Expand llvm.amdgcn.rsq.clamp on targets that don't support the instruction.
5414// FIXME: Why do we handle this one but not other removed instructions?
5415//
5416// Reciprocal square root. The clamp prevents infinite results, clamping
5417// infinities to max_float. D.f = 1.0 / sqrt(S0.f), result clamped to
5418// +-max_float.
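//
// In scalar terms, the expansion below is roughly the following sketch,
// assuming f32 and the non-IEEE min/max variants (the IEEE forms are used
// instead when MODE.IEEE is set):
//   float r = rsq(x);
//   r = fminf(r, FLT_MAX);    // clamp rsq(+0) = +inf down to +max_float
//   r = fmaxf(r, -FLT_MAX);   // clamp rsq(-0) = -inf up to -max_float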
5421 MachineIRBuilder &B) const {
5423 return true;
5424
5425 Register Dst = MI.getOperand(0).getReg();
5426 Register Src = MI.getOperand(2).getReg();
5427 auto Flags = MI.getFlags();
5428
5429 LLT Ty = MRI.getType(Dst);
5430
5431 const fltSemantics *FltSemantics;
5432 if (Ty == LLT::scalar(32))
5433 FltSemantics = &APFloat::IEEEsingle();
5434 else if (Ty == LLT::scalar(64))
5435 FltSemantics = &APFloat::IEEEdouble();
5436 else
5437 return false;
5438
5439 auto Rsq = B.buildIntrinsic(Intrinsic::amdgcn_rsq, {Ty})
5440 .addUse(Src)
5441 .setMIFlags(Flags);
5442
5443 // We don't need to concern ourselves with the snan handling difference, since
5444 // the rsq has already quieted it (or not); use the variant that will directly select.
5445 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
5446 const bool UseIEEE = MFI->getMode().IEEE;
5447
5448 auto MaxFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics));
5449 auto ClampMax = UseIEEE ? B.buildFMinNumIEEE(Ty, Rsq, MaxFlt, Flags) :
5450 B.buildFMinNum(Ty, Rsq, MaxFlt, Flags);
5451
5452 auto MinFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics, true));
5453
5454 if (UseIEEE)
5455 B.buildFMaxNumIEEE(Dst, ClampMax, MinFlt, Flags);
5456 else
5457 B.buildFMaxNum(Dst, ClampMax, MinFlt, Flags);
5458 MI.eraseFromParent();
5459 return true;
5460}
5461
5462// TODO: Fix pointer type handling
5465 Intrinsic::ID IID) const {
5466
5467 MachineIRBuilder &B = Helper.MIRBuilder;
5468 MachineRegisterInfo &MRI = *B.getMRI();
5469
5470 bool IsPermLane16 = IID == Intrinsic::amdgcn_permlane16 ||
5471 IID == Intrinsic::amdgcn_permlanex16;
5472 bool IsSetInactive = IID == Intrinsic::amdgcn_set_inactive ||
5473 IID == Intrinsic::amdgcn_set_inactive_chain_arg;
5474
5475 auto createLaneOp = [&IID, &B, &MI](Register Src0, Register Src1,
5476 Register Src2, LLT VT) -> Register {
5477 auto LaneOp = B.buildIntrinsic(IID, {VT}).addUse(Src0);
5478 switch (IID) {
5479 case Intrinsic::amdgcn_readfirstlane:
5480 case Intrinsic::amdgcn_permlane64:
5481 return LaneOp.getReg(0);
5482 case Intrinsic::amdgcn_readlane:
5483 case Intrinsic::amdgcn_set_inactive:
5484 case Intrinsic::amdgcn_set_inactive_chain_arg:
5485 return LaneOp.addUse(Src1).getReg(0);
5486 case Intrinsic::amdgcn_writelane:
5487 return LaneOp.addUse(Src1).addUse(Src2).getReg(0);
5488 case Intrinsic::amdgcn_permlane16:
5489 case Intrinsic::amdgcn_permlanex16: {
5490 Register Src3 = MI.getOperand(5).getReg();
5491 int64_t Src4 = MI.getOperand(6).getImm();
5492 int64_t Src5 = MI.getOperand(7).getImm();
5493 return LaneOp.addUse(Src1)
5494 .addUse(Src2)
5495 .addUse(Src3)
5496 .addImm(Src4)
5497 .addImm(Src5)
5498 .getReg(0);
5499 }
5500 case Intrinsic::amdgcn_mov_dpp8:
5501 return LaneOp.addImm(MI.getOperand(3).getImm()).getReg(0);
5502 case Intrinsic::amdgcn_update_dpp:
5503 return LaneOp.addUse(Src1)
5504 .addImm(MI.getOperand(4).getImm())
5505 .addImm(MI.getOperand(5).getImm())
5506 .addImm(MI.getOperand(6).getImm())
5507 .addImm(MI.getOperand(7).getImm())
5508 .getReg(0);
5509 default:
5510 llvm_unreachable("unhandled lane op");
5511 }
5512 };
5513
5514 Register DstReg = MI.getOperand(0).getReg();
5515 Register Src0 = MI.getOperand(2).getReg();
5516 Register Src1, Src2;
5517 if (IID == Intrinsic::amdgcn_readlane || IID == Intrinsic::amdgcn_writelane ||
5518 IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16) {
5519 Src1 = MI.getOperand(3).getReg();
5520 if (IID == Intrinsic::amdgcn_writelane || IsPermLane16) {
5521 Src2 = MI.getOperand(4).getReg();
5522 }
5523 }
5524
5525 LLT Ty = MRI.getType(DstReg);
5526 unsigned Size = Ty.getSizeInBits();
5527
5528 unsigned SplitSize = 32;
5529 if (IID == Intrinsic::amdgcn_update_dpp && (Size % 64 == 0) &&
5530 ST.hasDPALU_DPP() &&
5531 AMDGPU::isLegalDPALU_DPPControl(MI.getOperand(4).getImm()))
5532 SplitSize = 64;
5533
5534 if (Size == SplitSize) {
5535 // Already legal
5536 return true;
5537 }
5538
5539 if (Size < 32) {
5540 Src0 = B.buildAnyExt(S32, Src0).getReg(0);
5541
5542 if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16)
5543 Src1 = B.buildAnyExt(LLT::scalar(32), Src1).getReg(0);
5544
5545 if (IID == Intrinsic::amdgcn_writelane)
5546 Src2 = B.buildAnyExt(LLT::scalar(32), Src2).getReg(0);
5547
5548 Register LaneOpDst = createLaneOp(Src0, Src1, Src2, S32);
5549 B.buildTrunc(DstReg, LaneOpDst);
5550 MI.eraseFromParent();
5551 return true;
5552 }
5553
5554 if (Size % SplitSize != 0)
5555 return false;
5556
5557 LLT PartialResTy = LLT::scalar(SplitSize);
5558 if (Ty.isVector()) {
5559 LLT EltTy = Ty.getElementType();
5560 unsigned EltSize = EltTy.getSizeInBits();
5561 if (EltSize == SplitSize) {
5562 PartialResTy = EltTy;
5563 } else if (EltSize == 16 || EltSize == 32) {
5564 unsigned NElem = SplitSize / EltSize;
5565 PartialResTy = Ty.changeElementCount(ElementCount::getFixed(NElem));
5566 }
5567 // Handle all other cases via S32/S64 pieces.
5568 }
5569
5570 SmallVector<Register, 4> PartialRes;
5571 unsigned NumParts = Size / SplitSize;
5572 MachineInstrBuilder Src0Parts = B.buildUnmerge(PartialResTy, Src0);
5573 MachineInstrBuilder Src1Parts, Src2Parts;
5574
5575 if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16)
5576 Src1Parts = B.buildUnmerge(PartialResTy, Src1);
5577
5578 if (IID == Intrinsic::amdgcn_writelane)
5579 Src2Parts = B.buildUnmerge(PartialResTy, Src2);
5580
5581 for (unsigned i = 0; i < NumParts; ++i) {
5582 Src0 = Src0Parts.getReg(i);
5583
5584 if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16)
5585 Src1 = Src1Parts.getReg(i);
5586
5587 if (IID == Intrinsic::amdgcn_writelane)
5588 Src2 = Src2Parts.getReg(i);
5589
5590 PartialRes.push_back(createLaneOp(Src0, Src1, Src2, PartialResTy));
5591 }
5592
5593 B.buildMergeLikeInstr(DstReg, PartialRes);
5594 MI.eraseFromParent();
5595 return true;
5596}
5597
5600 MachineIRBuilder &B) const {
5604 LLT DstTy = MRI.getType(DstReg);
5605 LLT IdxTy = LLT::scalar(DstTy.getSizeInBits());
5606
5607 Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy);
5608 if (!loadInputValue(KernargPtrReg, B,
5610 return false;
5611
5612 // FIXME: This should be nuw
5613 B.buildPtrAdd(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0));
5614 return true;
5615}
5616
5617/// To create a buffer resource from a 64-bit pointer, mask off the upper 32
5618/// bits of the pointer and replace them with the stride argument, then
5619/// merge_values everything together. In the common case of a raw buffer (the
5620/// stride component is 0), we can just AND off the upper half.
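///
/// Conceptually (an illustrative sketch only, not the exact field encoding),
/// the resulting v4i32 descriptor is:
///   word0 = lo32(pointer)
///   word1 = (hi32(pointer) & 0xffff) | (stride << 16)
///   word2 = numRecords
///   word3 = flags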
5623 Register Result = MI.getOperand(0).getReg();
5624 Register Pointer = MI.getOperand(2).getReg();
5625 Register Stride = MI.getOperand(3).getReg();
5626 Register NumRecords = MI.getOperand(4).getReg();
5627 Register Flags = MI.getOperand(5).getReg();
5628
5629 LLT S32 = LLT::scalar(32);
5630
5631 B.setInsertPt(B.getMBB(), ++B.getInsertPt());
5632 auto Unmerge = B.buildUnmerge(S32, Pointer);
5633 Register LowHalf = Unmerge.getReg(0);
5634 Register HighHalf = Unmerge.getReg(1);
5635
5636 auto AndMask = B.buildConstant(S32, 0x0000ffff);
5637 auto Masked = B.buildAnd(S32, HighHalf, AndMask);
5638
5639 MachineInstrBuilder NewHighHalf = Masked;
5640 std::optional<ValueAndVReg> StrideConst =
5641 getIConstantVRegValWithLookThrough(Stride, MRI);
5642 if (!StrideConst || !StrideConst->Value.isZero()) {
5643 MachineInstrBuilder ShiftedStride;
5644 if (StrideConst) {
5645 uint32_t StrideVal = StrideConst->Value.getZExtValue();
5646 uint32_t ShiftedStrideVal = StrideVal << 16;
5647 ShiftedStride = B.buildConstant(S32, ShiftedStrideVal);
5648 } else {
5649 auto ExtStride = B.buildAnyExt(S32, Stride);
5650 auto ShiftConst = B.buildConstant(S32, 16);
5651 ShiftedStride = B.buildShl(S32, ExtStride, ShiftConst);
5652 }
5653 NewHighHalf = B.buildOr(S32, Masked, ShiftedStride);
5654 }
5655 Register NewHighHalfReg = NewHighHalf.getReg(0);
5656 B.buildMergeValues(Result, {LowHalf, NewHighHalfReg, NumRecords, Flags});
5657 MI.eraseFromParent();
5658 return true;
5659}
5660
5663 MachineIRBuilder &B) const {
5664 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
5665 if (!MFI->isEntryFunction()) {
5668 }
5669
5670 Register DstReg = MI.getOperand(0).getReg();
5671 if (!getImplicitArgPtr(DstReg, MRI, B))
5672 return false;
5673
5674 MI.eraseFromParent();
5675 return true;
5676}
5677
5680 MachineIRBuilder &B) const {
5681 Function &F = B.getMF().getFunction();
5682 std::optional<uint32_t> KnownSize =
5684 if (KnownSize.has_value())
5685 B.buildConstant(DstReg, *KnownSize);
5686 return false;
5687}
5688
5691 MachineIRBuilder &B) const {
5692
5693 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
5694 if (!MFI->isEntryFunction()) {
5697 }
5698
5699 Register DstReg = MI.getOperand(0).getReg();
5700 if (!getLDSKernelId(DstReg, MRI, B))
5701 return false;
5702
5703 MI.eraseFromParent();
5704 return true;
5705}
5706
5710 unsigned AddrSpace) const {
5711 Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B);
5712 auto Unmerge = B.buildUnmerge(LLT::scalar(32), MI.getOperand(2).getReg());
5713 Register Hi32 = Unmerge.getReg(1);
5714
5715 B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg);
5716 MI.eraseFromParent();
5717 return true;
5718}
5719
5720// The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args:
5721// offset (the offset that is included in bounds checking and swizzling, to be
5722// split between the instruction's voffset and immoffset fields) and soffset
5723// (the offset that is excluded from bounds checking and swizzling, to go in
5724// the instruction's soffset field). This function takes the first kind of
5725// offset and figures out how to split it between voffset and immoffset.
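//
// Worked example (assuming a 4095-byte maximum immediate, i.e. MaxImm ==
// 0xfff): a constant total offset of 4104 is split into
//   Overflow  = 4104 & ~0xfff = 4096  -> added to the voffset register
//   ImmOffset = 4104 - 4096   = 8     -> placed in the instruction's offset field
// The 4096 added to voffset is a power of two, so it is likely to CSE with the
// add emitted for a neighbouring access.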
5726std::pair<Register, unsigned>
5728 Register OrigOffset) const {
5729 const unsigned MaxImm = SIInstrInfo::getMaxMUBUFImmOffset(ST);
5730 Register BaseReg;
5731 unsigned ImmOffset;
5732 const LLT S32 = LLT::scalar(32);
5733 MachineRegisterInfo &MRI = *B.getMRI();
5734
5735 std::tie(BaseReg, ImmOffset) =
5737
5738 // If BaseReg is a pointer, convert it to int.
5739 if (MRI.getType(BaseReg).isPointer())
5740 BaseReg = B.buildPtrToInt(MRI.getType(OrigOffset), BaseReg).getReg(0);
5741
5742 // If the immediate value is too big for the immoffset field, put only bits
5743 // that would normally fit in the immoffset field. The remaining value that
5744 // is copied/added for the voffset field is a large power of 2, and it
5745 // stands a better chance of being CSEd with the copy/add for another similar
5746 // load/store.
5747 // However, do not do that rounding down if the offset is a negative
5748 // number, as it appears to be illegal to have a negative offset in the
5749 // vgpr, even if adding the immediate offset makes it positive.
5750 unsigned Overflow = ImmOffset & ~MaxImm;
5751 ImmOffset -= Overflow;
5752 if ((int32_t)Overflow < 0) {
5753 Overflow += ImmOffset;
5754 ImmOffset = 0;
5755 }
5756
5757 if (Overflow != 0) {
5758 if (!BaseReg) {
5759 BaseReg = B.buildConstant(S32, Overflow).getReg(0);
5760 } else {
5761 auto OverflowVal = B.buildConstant(S32, Overflow);
5762 BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0);
5763 }
5764 }
5765
5766 if (!BaseReg)
5767 BaseReg = B.buildConstant(S32, 0).getReg(0);
5768
5769 return std::pair(BaseReg, ImmOffset);
5770}
5771
5772/// Handle register layout difference for f16 images for some subtargets.
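/// For example (a summary of the cases handled below), a <4 x s16> source is
/// widened to <4 x s32> on subtargets with unpacked D16 VMem, while packed
/// subtargets keep it as two dwords of <2 x s16>; a <3 x s16> source is padded
/// with an undef element to <4 x s16>.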
5775 Register Reg,
5776 bool ImageStore) const {
5777 const LLT S16 = LLT::scalar(16);
5778 const LLT S32 = LLT::scalar(32);
5779 LLT StoreVT = MRI.getType(Reg);
5780 assert(StoreVT.isVector() && StoreVT.getElementType() == S16);
5781
5782 if (ST.hasUnpackedD16VMem()) {
5783 auto Unmerge = B.buildUnmerge(S16, Reg);
5784
5785 SmallVector<Register, 4> WideRegs;
5786 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
5787 WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0));
5788
5789 int NumElts = StoreVT.getNumElements();
5790
5791 return B.buildBuildVector(LLT::fixed_vector(NumElts, S32), WideRegs)
5792 .getReg(0);
5793 }
5794
5795 if (ImageStore && ST.hasImageStoreD16Bug()) {
5796 if (StoreVT.getNumElements() == 2) {
5797 SmallVector<Register, 4> PackedRegs;
5798 Reg = B.buildBitcast(S32, Reg).getReg(0);
5799 PackedRegs.push_back(Reg);
5800 PackedRegs.resize(2, B.buildUndef(S32).getReg(0));
5801 return B.buildBuildVector(LLT::fixed_vector(2, S32), PackedRegs)
5802 .getReg(0);
5803 }
5804
5805 if (StoreVT.getNumElements() == 3) {
5806 SmallVector<Register, 4> PackedRegs;
5807 auto Unmerge = B.buildUnmerge(S16, Reg);
5808 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
5809 PackedRegs.push_back(Unmerge.getReg(I));
5810 PackedRegs.resize(6, B.buildUndef(S16).getReg(0));
5811 Reg = B.buildBuildVector(LLT::fixed_vector(6, S16), PackedRegs).getReg(0);
5812 return B.buildBitcast(LLT::fixed_vector(3, S32), Reg).getReg(0);
5813 }
5814
5815 if (StoreVT.getNumElements() == 4) {
5816 SmallVector<Register, 4> PackedRegs;
5817 Reg = B.buildBitcast(LLT::fixed_vector(2, S32), Reg).getReg(0);
5818 auto Unmerge = B.buildUnmerge(S32, Reg);
5819 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
5820 PackedRegs.push_back(Unmerge.getReg(I));
5821 PackedRegs.resize(4, B.buildUndef(S32).getReg(0));
5822 return B.buildBuildVector(LLT::fixed_vector(4, S32), PackedRegs)
5823 .getReg(0);
5824 }
5825
5826 llvm_unreachable("invalid data type");
5827 }
5828
5829 if (StoreVT == LLT::fixed_vector(3, S16)) {
5830 Reg = B.buildPadVectorWithUndefElements(LLT::fixed_vector(4, S16), Reg)
5831 .getReg(0);
5832 }
5833 return Reg;
5834}
5835
5837 Register VData, LLT MemTy,
5838 bool IsFormat) const {
5839 MachineRegisterInfo *MRI = B.getMRI();
5840 LLT Ty = MRI->getType(VData);
5841
5842 const LLT S16 = LLT::scalar(16);
5843
5844 // Fix up buffer resources themselves, which need to be cast to v4i32.
5846 return castBufferRsrcToV4I32(VData, B);
5847
5848 if (shouldBitcastLoadStoreType(ST, Ty, MemTy)) {
5849 Ty = getBitcastRegisterType(Ty);
5850 VData = B.buildBitcast(Ty, VData).getReg(0);
5851 }
5852 // Fix up illegal register types for i8 and i16 stores.
5853 if (Ty == LLT::scalar(8) || Ty == S16) {
5854 Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0);
5855 return AnyExt;
5856 }
5857
5858 if (Ty.isVector()) {
5859 if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) {
5860 if (IsFormat)
5861 return handleD16VData(B, *MRI, VData);
5862 }
5863 }
5864
5865 return VData;
5866}
5867
5869 LegalizerHelper &Helper,
5870 bool IsTyped,
5871 bool IsFormat) const {
5872 MachineIRBuilder &B = Helper.MIRBuilder;
5873 MachineRegisterInfo &MRI = *B.getMRI();
5874
5875 Register VData = MI.getOperand(1).getReg();
5876 LLT Ty = MRI.getType(VData);
5877 LLT EltTy = Ty.getScalarType();
5878 const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
5879 const LLT S32 = LLT::scalar(32);
5880
5881 MachineMemOperand *MMO = *MI.memoperands_begin();
5882 const int MemSize = MMO->getSize().getValue();
5883 LLT MemTy = MMO->getMemoryType();
5884
5885 VData = fixStoreSourceType(B, VData, MemTy, IsFormat);
5886
5887 castBufferRsrcArgToV4I32(MI, B, 2);
5888 Register RSrc = MI.getOperand(2).getReg();
5889
5890 unsigned ImmOffset;
5891
5892 // The typed intrinsics add an immediate after the registers.
5893 const unsigned NumVIndexOps = IsTyped ? 8 : 7;
5894
5895 // The struct intrinsic variants add one additional operand over raw.
5896 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
5897 Register VIndex;
5898 int OpOffset = 0;
5899 if (HasVIndex) {
5900 VIndex = MI.getOperand(3).getReg();
5901 OpOffset = 1;
5902 } else {
5903 VIndex = B.buildConstant(S32, 0).getReg(0);
5904 }
5905
5906 Register VOffset = MI.getOperand(3 + OpOffset).getReg();
5907 Register SOffset = MI.getOperand(4 + OpOffset).getReg();
5908
5909 unsigned Format = 0;
5910 if (IsTyped) {
5911 Format = MI.getOperand(5 + OpOffset).getImm();
5912 ++OpOffset;
5913 }
5914
5915 unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
5916
5917 std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset);
5918
5919 unsigned Opc;
5920 if (IsTyped) {
5921 Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16 :
5922 AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT;
5923 } else if (IsFormat) {
5924 Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16 :
5925 AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT;
5926 } else {
5927 switch (MemSize) {
5928 case 1:
5929 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE;
5930 break;
5931 case 2:
5932 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT;
5933 break;
5934 default:
5935 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE;
5936 break;
5937 }
5938 }
5939
5940 auto MIB = B.buildInstr(Opc)
5941 .addUse(VData) // vdata
5942 .addUse(RSrc) // rsrc
5943 .addUse(VIndex) // vindex
5944 .addUse(VOffset) // voffset
5945 .addUse(SOffset) // soffset
5946 .addImm(ImmOffset); // offset(imm)
5947
5948 if (IsTyped)
5949 MIB.addImm(Format);
5950
5951 MIB.addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm)
5952 .addImm(HasVIndex ? -1 : 0) // idxen(imm)
5953 .addMemOperand(MMO);
5954
5955 MI.eraseFromParent();
5956 return true;
5957}
5958
5959static void buildBufferLoad(unsigned Opc, Register LoadDstReg, Register RSrc,
5960 Register VIndex, Register VOffset, Register SOffset,
5961 unsigned ImmOffset, unsigned Format,
5962 unsigned AuxiliaryData, MachineMemOperand *MMO,
5963 bool IsTyped, bool HasVIndex, MachineIRBuilder &B) {
5964 auto MIB = B.buildInstr(Opc)
5965 .addDef(LoadDstReg) // vdata
5966 .addUse(RSrc) // rsrc
5967 .addUse(VIndex) // vindex
5968 .addUse(VOffset) // voffset
5969 .addUse(SOffset) // soffset
5970 .addImm(ImmOffset); // offset(imm)
5971
5972 if (IsTyped)
5973 MIB.addImm(Format);
5974
5975 MIB.addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm)
5976 .addImm(HasVIndex ? -1 : 0) // idxen(imm)
5977 .addMemOperand(MMO);
5978}
5979
5981 LegalizerHelper &Helper,
5982 bool IsFormat,
5983 bool IsTyped) const {
5984 MachineIRBuilder &B = Helper.MIRBuilder;
5985 MachineRegisterInfo &MRI = *B.getMRI();
5986 GISelChangeObserver &Observer = Helper.Observer;
5987
5988 // FIXME: Verifier should enforce 1 MMO for these intrinsics.
5989 MachineMemOperand *MMO = *MI.memoperands_begin();
5990 const LLT MemTy = MMO->getMemoryType();
5991 const LLT S32 = LLT::scalar(32);
5992
5993 Register Dst = MI.getOperand(0).getReg();
5994
5995 Register StatusDst;
5996 int OpOffset = 0;
5997 assert(MI.getNumExplicitDefs() == 1 || MI.getNumExplicitDefs() == 2);
5998 bool IsTFE = MI.getNumExplicitDefs() == 2;
5999 if (IsTFE) {
6000 StatusDst = MI.getOperand(1).getReg();
6001 ++OpOffset;
6002 }
6003
6004 castBufferRsrcArgToV4I32(MI, B, 2 + OpOffset);
6005 Register RSrc = MI.getOperand(2 + OpOffset).getReg();
6006
6007 // The typed intrinsics add an immediate after the registers.
6008 const unsigned NumVIndexOps = IsTyped ? 8 : 7;
6009
6010 // The struct intrinsic variants add one additional operand over raw.
6011 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps + OpOffset;
6012 Register VIndex;
6013 if (HasVIndex) {
6014 VIndex = MI.getOperand(3 + OpOffset).getReg();
6015 ++OpOffset;
6016 } else {
6017 VIndex = B.buildConstant(S32, 0).getReg(0);
6018 }
6019
6020 Register VOffset = MI.getOperand(3 + OpOffset).getReg();
6021 Register SOffset = MI.getOperand(4 + OpOffset).getReg();
6022
6023 unsigned Format = 0;
6024 if (IsTyped) {
6025 Format = MI.getOperand(5 + OpOffset).getImm();
6026 ++OpOffset;
6027 }
6028
6029 unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
6030 unsigned ImmOffset;
6031
6032 LLT Ty = MRI.getType(Dst);
6033 // Make addrspace 8 pointers loads into 4xs32 loads here, so the rest of the
6034 // logic doesn't have to handle that case.
6035 if (hasBufferRsrcWorkaround(Ty)) {
6036 Observer.changingInstr(MI);
6037 Ty = castBufferRsrcFromV4I32(MI, B, MRI, 0);
6038 Observer.changedInstr(MI);
6039 Dst = MI.getOperand(0).getReg();
6040 B.setInsertPt(B.getMBB(), MI);
6041 }
6042 if (shouldBitcastLoadStoreType(ST, Ty, MemTy)) {
6043 Ty = getBitcastRegisterType(Ty);
6044 Observer.changingInstr(MI);
6045 Helper.bitcastDst(MI, Ty, 0);
6046 Observer.changedInstr(MI);
6047 Dst = MI.getOperand(0).getReg();
6048 B.setInsertPt(B.getMBB(), MI);
6049 }
6050
6051 LLT EltTy = Ty.getScalarType();
6052 const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
6053 const bool Unpacked = ST.hasUnpackedD16VMem();
6054
6055 std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset);
6056
6057 unsigned Opc;
6058
6059 // TODO: Support TFE for typed and narrow loads.
6060 if (IsTyped) {
6061 if (IsTFE)
6062 return false;
6063 Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 :
6064 AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT;
6065 } else if (IsFormat) {
6066 if (IsD16) {
6067 if (IsTFE)
6068 return false;
6069 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16;
6070 } else {
6071 Opc = IsTFE ? AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_TFE
6072 : AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT;
6073 }
6074 } else {
6075 switch (MemTy.getSizeInBits()) {
6076 case 8:
6077 Opc = IsTFE ? AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE_TFE
6078 : AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE;
6079 break;
6080 case 16:
6081 Opc = IsTFE ? AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT_TFE
6082 : AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT;
6083 break;
6084 default:
6085 Opc = IsTFE ? AMDGPU::G_AMDGPU_BUFFER_LOAD_TFE
6086 : AMDGPU::G_AMDGPU_BUFFER_LOAD;
6087 break;
6088 }
6089 }
6090
6091 if (IsTFE) {
6092 unsigned NumValueDWords = divideCeil(Ty.getSizeInBits(), 32);
6093 unsigned NumLoadDWords = NumValueDWords + 1;
6094 LLT LoadTy = LLT::fixed_vector(NumLoadDWords, S32);
6095 Register LoadDstReg = B.getMRI()->createGenericVirtualRegister(LoadTy);
6096 buildBufferLoad(Opc, LoadDstReg, RSrc, VIndex, VOffset, SOffset, ImmOffset,
6097 Format, AuxiliaryData, MMO, IsTyped, HasVIndex, B);
6098 if (MemTy.getSizeInBits() < 32) {
6099 Register ExtDst = B.getMRI()->createGenericVirtualRegister(S32);
6100 B.buildUnmerge({ExtDst, StatusDst}, LoadDstReg);
6101 B.buildTrunc(Dst, ExtDst);
6102 } else if (NumValueDWords == 1) {
6103 B.buildUnmerge({Dst, StatusDst}, LoadDstReg);
6104 } else {
6105 SmallVector<Register, 5> LoadElts;
6106 for (unsigned I = 0; I != NumValueDWords; ++I)
6107 LoadElts.push_back(B.getMRI()->createGenericVirtualRegister(S32));
6108 LoadElts.push_back(StatusDst);
6109 B.buildUnmerge(LoadElts, LoadDstReg);
6110 LoadElts.truncate(NumValueDWords);
6111 B.buildMergeLikeInstr(Dst, LoadElts);
6112 }
6113 } else if ((!IsD16 && MemTy.getSizeInBits() < 32) ||
6114 (IsD16 && !Ty.isVector())) {
6115 Register LoadDstReg = B.getMRI()->createGenericVirtualRegister(S32);
6116 buildBufferLoad(Opc, LoadDstReg, RSrc, VIndex, VOffset, SOffset, ImmOffset,
6117 Format, AuxiliaryData, MMO, IsTyped, HasVIndex, B);
6118 B.setInsertPt(B.getMBB(), ++B.getInsertPt());
6119 B.buildTrunc(Dst, LoadDstReg);
6120 } else if (Unpacked && IsD16 && Ty.isVector()) {
6121 LLT UnpackedTy = Ty.changeElementSize(32);
6122 Register LoadDstReg = B.getMRI()->createGenericVirtualRegister(UnpackedTy);
6123 buildBufferLoad(Opc, LoadDstReg, RSrc, VIndex, VOffset, SOffset, ImmOffset,
6124 Format, AuxiliaryData, MMO, IsTyped, HasVIndex, B);
6125 B.setInsertPt(B.getMBB(), ++B.getInsertPt());
6126 // FIXME: G_TRUNC should work, but legalization currently fails
6127 auto Unmerge = B.buildUnmerge(S32, LoadDstReg);
6128 SmallVector<Register, 4> Repack;
6129 for (unsigned I = 0, N = Unmerge->getNumOperands() - 1; I != N; ++I)
6130 Repack.push_back(B.buildTrunc(EltTy, Unmerge.getReg(I)).getReg(0));
6131 B.buildMergeLikeInstr(Dst, Repack);
6132 } else {
6133 buildBufferLoad(Opc, Dst, RSrc, VIndex, VOffset, SOffset, ImmOffset, Format,
6134 AuxiliaryData, MMO, IsTyped, HasVIndex, B);
6135 }
6136
6137 MI.eraseFromParent();
6138 return true;
6139}
6140
6141static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) {
6142 switch (IntrID) {
6143 case Intrinsic::amdgcn_raw_buffer_atomic_swap:
6144 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_swap:
6145 case Intrinsic::amdgcn_struct_buffer_atomic_swap:
6146 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_swap:
6147 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP;
6148 case Intrinsic::amdgcn_raw_buffer_atomic_add:
6149 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_add:
6150 case Intrinsic::amdgcn_struct_buffer_atomic_add:
6151 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_add:
6152 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD;
6153 case Intrinsic::amdgcn_raw_buffer_atomic_sub:
6154 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub:
6155 case Intrinsic::amdgcn_struct_buffer_atomic_sub:
6156 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub:
6157 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB;
6158 case Intrinsic::amdgcn_raw_buffer_atomic_smin:
6159 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smin:
6160 case Intrinsic::amdgcn_struct_buffer_atomic_smin:
6161 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smin:
6162 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN;
6163 case Intrinsic::amdgcn_raw_buffer_atomic_umin:
6164 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umin:
6165 case Intrinsic::amdgcn_struct_buffer_atomic_umin:
6166 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umin:
6167 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN;
6168 case Intrinsic::amdgcn_raw_buffer_atomic_smax:
6169 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smax:
6170 case Intrinsic::amdgcn_struct_buffer_atomic_smax:
6171 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smax:
6172 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX;
6173 case Intrinsic::amdgcn_raw_buffer_atomic_umax:
6174 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umax:
6175 case Intrinsic::amdgcn_struct_buffer_atomic_umax:
6176 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umax:
6177 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX;
6178 case Intrinsic::amdgcn_raw_buffer_atomic_and:
6179 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_and:
6180 case Intrinsic::amdgcn_struct_buffer_atomic_and:
6181 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_and:
6182 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND;
6183 case Intrinsic::amdgcn_raw_buffer_atomic_or:
6184 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_or:
6185 case Intrinsic::amdgcn_struct_buffer_atomic_or:
6186 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_or:
6187 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR;
6188 case Intrinsic::amdgcn_raw_buffer_atomic_xor:
6189 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_xor:
6190 case Intrinsic::amdgcn_struct_buffer_atomic_xor:
6191 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_xor:
6192 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR;
6193 case Intrinsic::amdgcn_raw_buffer_atomic_inc:
6194 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_inc:
6195 case Intrinsic::amdgcn_struct_buffer_atomic_inc:
6196 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_inc:
6197 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC;
6198 case Intrinsic::amdgcn_raw_buffer_atomic_dec:
6199 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_dec:
6200 case Intrinsic::amdgcn_struct_buffer_atomic_dec:
6201 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_dec:
6202 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC;
6203 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
6204 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap:
6205 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
6206 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cmpswap:
6207 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP;
6208 case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
6209 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fadd:
6210 case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
6211 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fadd:
6212 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD;
6213 case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
6214 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmin:
6215 case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
6216 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmin:
6217 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN;
6218 case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
6219 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmax:
6220 case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
6221 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmax:
6222 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX;
6223 case Intrinsic::amdgcn_raw_buffer_atomic_cond_sub_u32:
6224 case Intrinsic::amdgcn_struct_buffer_atomic_cond_sub_u32:
6225 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_COND_SUB_U32;
6226 default:
6227 llvm_unreachable("unhandled atomic opcode");
6228 }
6229}
6230
6233 Intrinsic::ID IID) const {
6234 const bool IsCmpSwap =
6235 IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap ||
6236 IID == Intrinsic::amdgcn_struct_buffer_atomic_cmpswap ||
6237 IID == Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap ||
6238 IID == Intrinsic::amdgcn_struct_ptr_buffer_atomic_cmpswap;
6239
6240 Register Dst = MI.getOperand(0).getReg();
6241 // Since we don't have 128-bit atomics, we don't need to handle the case of
6242 // p8 arguments to the atomic itself.
6243 Register VData = MI.getOperand(2).getReg();
6244
6245 Register CmpVal;
6246 int OpOffset = 0;
6247
6248 if (IsCmpSwap) {
6249 CmpVal = MI.getOperand(3).getReg();
6250 ++OpOffset;
6251 }
6252
6253 castBufferRsrcArgToV4I32(MI, B, 3 + OpOffset);
6254 Register RSrc = MI.getOperand(3 + OpOffset).getReg();
6255 const unsigned NumVIndexOps = IsCmpSwap ? 9 : 8;
6256
6257 // The struct intrinsic variants add one additional operand over raw.
6258 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
6259 Register VIndex;
6260 if (HasVIndex) {
6261 VIndex = MI.getOperand(4 + OpOffset).getReg();
6262 ++OpOffset;
6263 } else {
6264 VIndex = B.buildConstant(LLT::scalar(32), 0).getReg(0);
6265 }
6266
6267 Register VOffset = MI.getOperand(4 + OpOffset).getReg();
6268 Register SOffset = MI.getOperand(5 + OpOffset).getReg();
6269 unsigned AuxiliaryData = MI.getOperand(6 + OpOffset).getImm();
6270
6271 MachineMemOperand *MMO = *MI.memoperands_begin();
6272
6273 unsigned ImmOffset;
6274 std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset);
6275
6276 auto MIB = B.buildInstr(getBufferAtomicPseudo(IID))
6277 .addDef(Dst)
6278 .addUse(VData); // vdata
6279
6280 if (IsCmpSwap)
6281 MIB.addReg(CmpVal);
6282
6283 MIB.addUse(RSrc) // rsrc
6284 .addUse(VIndex) // vindex
6285 .addUse(VOffset) // voffset
6286 .addUse(SOffset) // soffset
6287 .addImm(ImmOffset) // offset(imm)
6288 .addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm)
6289 .addImm(HasVIndex ? -1 : 0) // idxen(imm)
6290 .addMemOperand(MMO);
6291
6292 MI.eraseFromParent();
6293 return true;
6294}
6295
6296/// Turn a set of s16 typed registers in \p AddrRegs into a dword sized
6297/// vector with s16 typed elements.
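/// For example, three s16 coordinates (x, y, z) are repacked as
///   <2 x s16>{x, y}, <2 x s16>{z, undef}
/// while 32-bit operands (and a 16-bit bias when A16 is enabled) each keep a
/// full dword.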
6299 SmallVectorImpl<Register> &PackedAddrs,
6300 unsigned ArgOffset,
6302 bool IsA16, bool IsG16) {
6303 const LLT S16 = LLT::scalar(16);
6304 const LLT V2S16 = LLT::fixed_vector(2, 16);
6305 auto EndIdx = Intr->VAddrEnd;
6306
6307 for (unsigned I = Intr->VAddrStart; I < EndIdx; I++) {
6308 MachineOperand &SrcOp = MI.getOperand(ArgOffset + I);
6309 if (!SrcOp.isReg())
6310 continue; // _L to _LZ may have eliminated this.
6311
6312 Register AddrReg = SrcOp.getReg();
6313
6314 if ((I < Intr->GradientStart) ||
6315 (I >= Intr->GradientStart && I < Intr->CoordStart && !IsG16) ||
6316 (I >= Intr->CoordStart && !IsA16)) {
6317 if ((I < Intr->GradientStart) && IsA16 &&
6318 (B.getMRI()->getType(AddrReg) == S16)) {
6319 assert(I == Intr->BiasIndex && "Got unexpected 16-bit extra argument");
6320 // Special handling of bias when A16 is on. Bias is of type half but
6321 // occupies full 32-bit.
6322 PackedAddrs.push_back(
6323 B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)})
6324 .getReg(0));
6325 } else {
6326 assert((!IsA16 || Intr->NumBiasArgs == 0 || I != Intr->BiasIndex) &&
6327 "Bias needs to be converted to 16 bit in A16 mode");
6328 // Handle any gradient or coordinate operands that should not be packed
6329 AddrReg = B.buildBitcast(V2S16, AddrReg).getReg(0);
6330 PackedAddrs.push_back(AddrReg);
6331 }
6332 } else {
6333 // Dz/dh, dz/dv and the last odd coord are packed with undef. Also, in 1D,
6334 // derivatives dx/dh and dx/dv are packed with undef.
6335 if (((I + 1) >= EndIdx) ||
6336 ((Intr->NumGradients / 2) % 2 == 1 &&
6337 (I == static_cast<unsigned>(Intr->GradientStart +
6338 (Intr->NumGradients / 2) - 1) ||
6339 I == static_cast<unsigned>(Intr->GradientStart +
6340 Intr->NumGradients - 1))) ||
6341 // Check for _L to _LZ optimization
6342 !MI.getOperand(ArgOffset + I + 1).isReg()) {
6343 PackedAddrs.push_back(
6344 B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)})
6345 .getReg(0));
6346 } else {
6347 PackedAddrs.push_back(
6348 B.buildBuildVector(
6349 V2S16, {AddrReg, MI.getOperand(ArgOffset + I + 1).getReg()})
6350 .getReg(0));
6351 ++I;
6352 }
6353 }
6354 }
6355}
6356
6357/// Convert from separate vaddr components to a single vector address register,
6358/// and replace the remaining operands with $noreg.
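/// For example, three s32 vaddr operands (u, v, r) become a single
/// <3 x s32> G_BUILD_VECTOR in the first vaddr slot, with the two remaining
/// vaddr operands rewritten to $noreg.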
6360 int DimIdx, int NumVAddrs) {
6361 const LLT S32 = LLT::scalar(32);
6362 (void)S32;
6363 SmallVector<Register, 8> AddrRegs;
6364 for (int I = 0; I != NumVAddrs; ++I) {
6365 MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
6366 if (SrcOp.isReg()) {
6367 AddrRegs.push_back(SrcOp.getReg());
6368 assert(B.getMRI()->getType(SrcOp.getReg()) == S32);
6369 }
6370 }
6371
6372 int NumAddrRegs = AddrRegs.size();
6373 if (NumAddrRegs != 1) {
6374 auto VAddr =
6375 B.buildBuildVector(LLT::fixed_vector(NumAddrRegs, 32), AddrRegs);
6376 MI.getOperand(DimIdx).setReg(VAddr.getReg(0));
6377 }
6378
6379 for (int I = 1; I != NumVAddrs; ++I) {
6380 MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
6381 if (SrcOp.isReg())
6382 MI.getOperand(DimIdx + I).setReg(AMDGPU::NoRegister);
6383 }
6384}
6385
6386/// Rewrite image intrinsics to use register layouts expected by the subtarget.
6387///
6388/// Depending on the subtarget, load/store with 16-bit element data need to be
6389/// rewritten to use the low half of 32-bit registers, or directly use a packed
6390/// layout. 16-bit addresses should also sometimes be packed into 32-bit
6391/// registers.
6392///
6393/// We don't want to directly select image instructions just yet, but also want
6394 /// to expose all register repacking to the legalizer/combiners. We also don't
6395/// want a selected instruction entering RegBankSelect. In order to avoid
6396/// defining a multitude of intermediate image instructions, directly hack on
6397/// the intrinsic's arguments. In cases like a16 addresses, this requires
6398/// padding now unnecessary arguments with $noreg.
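///
/// For example, with A16 an image sample whose two s16 coordinates occupied
/// two vaddr operands ends up with a single <2 x s16> operand plus a trailing
/// $noreg, and a final immediate operand recording the A16/G16 flags (a
/// simplified summary of the rewrites performed below).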
6401 const AMDGPU::ImageDimIntrinsicInfo *Intr) const {
6402
6403 const MachineFunction &MF = *MI.getMF();
6404 const unsigned NumDefs = MI.getNumExplicitDefs();
6405 const unsigned ArgOffset = NumDefs + 1;
6406 bool IsTFE = NumDefs == 2;
6407 // We are only processing the operands of d16 image operations on subtargets
6408 // that use the unpacked register layout, or need to repack the TFE result.
6409
6410 // TODO: Do we need to guard against already legalized intrinsics?
6411 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
6413
6414 MachineRegisterInfo *MRI = B.getMRI();
6415 const LLT S32 = LLT::scalar(32);
6416 const LLT S16 = LLT::scalar(16);
6417 const LLT V2S16 = LLT::fixed_vector(2, 16);
6418
6419 unsigned DMask = 0;
6420 Register VData;
6421 LLT Ty;
6422
6423 if (!BaseOpcode->NoReturn || BaseOpcode->Store) {
6424 VData = MI.getOperand(NumDefs == 0 ? 1 : 0).getReg();
6425 Ty = MRI->getType(VData);
6426 }
6427
6428 const bool IsAtomicPacked16Bit =
6429 (BaseOpcode->BaseOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_F16 ||
6430 BaseOpcode->BaseOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_BF16);
6431
6432 // Check for 16 bit addresses and pack if true.
6433 LLT GradTy =
6434 MRI->getType(MI.getOperand(ArgOffset + Intr->GradientStart).getReg());
6435 LLT AddrTy =
6436 MRI->getType(MI.getOperand(ArgOffset + Intr->CoordStart).getReg());
6437 const bool IsG16 =
6438 ST.hasG16() ? (BaseOpcode->Gradients && GradTy == S16) : GradTy == S16;
6439 const bool IsA16 = AddrTy == S16;
6440 const bool IsD16 = !IsAtomicPacked16Bit && Ty.getScalarType() == S16;
6441
6442 int DMaskLanes = 0;
6443 if (!BaseOpcode->Atomic) {
6444 DMask = MI.getOperand(ArgOffset + Intr->DMaskIndex).getImm();
6445 if (BaseOpcode->Gather4) {
6446 DMaskLanes = 4;
6447 } else if (DMask != 0) {
6448 DMaskLanes = llvm::popcount(DMask);
6449 } else if (!IsTFE && !BaseOpcode->Store) {
6450 // If dmask is 0, this is a no-op load. This can be eliminated.
6451 B.buildUndef(MI.getOperand(0));
6452 MI.eraseFromParent();
6453 return true;
6454 }
6455 }
6456
6457 Observer.changingInstr(MI);
6458 auto ChangedInstr = make_scope_exit([&] { Observer.changedInstr(MI); });
6459
6460 const unsigned StoreOpcode = IsD16 ? AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE_D16
6461 : AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE;
6462 const unsigned LoadOpcode = IsD16 ? AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_D16
6463 : AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD;
6464 unsigned NewOpcode = LoadOpcode;
6465 if (BaseOpcode->Store)
6466 NewOpcode = StoreOpcode;
6467 else if (BaseOpcode->NoReturn)
6468 NewOpcode = AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_NORET;
6469
6470 // Track that we legalized this
6471 MI.setDesc(B.getTII().get(NewOpcode));
6472
6473 // Expecting to get an error flag since TFC is on and dmask is 0. Force
6474 // dmask to be at least 1, otherwise the instruction will fail.
6475 if (IsTFE && DMask == 0) {
6476 DMask = 0x1;
6477 DMaskLanes = 1;
6478 MI.getOperand(ArgOffset + Intr->DMaskIndex).setImm(DMask);
6479 }
6480
6481 if (BaseOpcode->Atomic) {
6482 Register VData0 = MI.getOperand(2).getReg();
6483 LLT Ty = MRI->getType(VData0);
6484
6485 // TODO: Allow atomic swap and bit ops for v2s16/v4s16
6486 if (Ty.isVector() && !IsAtomicPacked16Bit)
6487 return false;
6488
6489 if (BaseOpcode->AtomicX2) {
6490 Register VData1 = MI.getOperand(3).getReg();
6491 // The two values are packed in one register.
6492 LLT PackedTy = LLT::fixed_vector(2, Ty);
6493 auto Concat = B.buildBuildVector(PackedTy, {VData0, VData1});
6494 MI.getOperand(2).setReg(Concat.getReg(0));
6495 MI.getOperand(3).setReg(AMDGPU::NoRegister);
6496 }
6497 }
6498
6499 unsigned CorrectedNumVAddrs = Intr->NumVAddrs;
6500
6501 // Rewrite the addressing register layout before doing anything else.
6502 if (BaseOpcode->Gradients && !ST.hasG16() && (IsA16 != IsG16)) {
6503 // 16-bit gradients are supported, but are tied to the A16 control,
6504 // so both gradients and addresses must be 16-bit.
6505 return false;
6506 }
6507
6508 if (IsA16 && !ST.hasA16()) {
6509 // A16 not supported
6510 return false;
6511 }
6512
6513 const unsigned NSAMaxSize = ST.getNSAMaxSize(BaseOpcode->Sampler);
6514 const unsigned HasPartialNSA = ST.hasPartialNSAEncoding();
6515
6516 if (IsA16 || IsG16) {
6517 // Even if NumVAddrs == 1 we should pack it into a 32-bit value, because the
6518 // instructions expect VGPR_32
6519 SmallVector<Register, 4> PackedRegs;
6520
6521 packImage16bitOpsToDwords(B, MI, PackedRegs, ArgOffset, Intr, IsA16, IsG16);
6522
6523 // See also below in the non-a16 branch
6524 const bool UseNSA = ST.hasNSAEncoding() &&
6525 PackedRegs.size() >= ST.getNSAThreshold(MF) &&
6526 (PackedRegs.size() <= NSAMaxSize || HasPartialNSA);
6527 const bool UsePartialNSA =
6528 UseNSA && HasPartialNSA && PackedRegs.size() > NSAMaxSize;
6529
6530 if (UsePartialNSA) {
6531 // Pack registers that would go over NSAMaxSize into last VAddr register
6532 LLT PackedAddrTy =
6533 LLT::fixed_vector(2 * (PackedRegs.size() - NSAMaxSize + 1), 16);
6534 auto Concat = B.buildConcatVectors(
6535 PackedAddrTy, ArrayRef(PackedRegs).slice(NSAMaxSize - 1));
6536 PackedRegs[NSAMaxSize - 1] = Concat.getReg(0);
6537 PackedRegs.resize(NSAMaxSize);
6538 } else if (!UseNSA && PackedRegs.size() > 1) {
6539 LLT PackedAddrTy = LLT::fixed_vector(2 * PackedRegs.size(), 16);
6540 auto Concat = B.buildConcatVectors(PackedAddrTy, PackedRegs);
6541 PackedRegs[0] = Concat.getReg(0);
6542 PackedRegs.resize(1);
6543 }
6544
6545 const unsigned NumPacked = PackedRegs.size();
6546 for (unsigned I = Intr->VAddrStart; I < Intr->VAddrEnd; I++) {
6547 MachineOperand &SrcOp = MI.getOperand(ArgOffset + I);
6548 if (!SrcOp.isReg()) {
6549 assert(SrcOp.isImm() && SrcOp.getImm() == 0);
6550 continue;
6551 }
6552
6553 assert(SrcOp.getReg() != AMDGPU::NoRegister);
6554
6555 if (I - Intr->VAddrStart < NumPacked)
6556 SrcOp.setReg(PackedRegs[I - Intr->VAddrStart]);
6557 else
6558 SrcOp.setReg(AMDGPU::NoRegister);
6559 }
6560 } else {
6561 // If the register allocator cannot place the address registers contiguously
6562 // without introducing moves, then using the non-sequential address encoding
6563 // is always preferable, since it saves VALU instructions and is usually a
6564 // wash in terms of code size or even better.
6565 //
6566 // However, we currently have no way of hinting to the register allocator
6567 // that MIMG addresses should be placed contiguously when it is possible to
6568 // do so, so force non-NSA for the common 2-address case as a heuristic.
6569 //
6570 // SIShrinkInstructions will convert NSA encodings to non-NSA after register
6571 // allocation when possible.
6572 //
6573 // Partial NSA is allowed on GFX11+ where the final register is a contiguous
6574 // set of the remaining addresses.
6575 const bool UseNSA = ST.hasNSAEncoding() &&
6576 CorrectedNumVAddrs >= ST.getNSAThreshold(MF) &&
6577 (CorrectedNumVAddrs <= NSAMaxSize || HasPartialNSA);
6578 const bool UsePartialNSA =
6579 UseNSA && HasPartialNSA && CorrectedNumVAddrs > NSAMaxSize;
6580
6581 if (UsePartialNSA) {
6582 convertImageAddrToPacked(B, MI,
6583 ArgOffset + Intr->VAddrStart + NSAMaxSize - 1,
6584 Intr->NumVAddrs - NSAMaxSize + 1);
6585 } else if (!UseNSA && Intr->NumVAddrs > 1) {
6586 convertImageAddrToPacked(B, MI, ArgOffset + Intr->VAddrStart,
6587 Intr->NumVAddrs);
6588 }
6589 }
6590
6591 int Flags = 0;
6592 if (IsA16)
6593 Flags |= 1;
6594 if (IsG16)
6595 Flags |= 2;
6596 MI.addOperand(MachineOperand::CreateImm(Flags));
6597
6598 if (BaseOpcode->NoReturn) { // No TFE for stores?
6599 // TODO: Handle dmask trim
6600 if (!Ty.isVector() || !IsD16)
6601 return true;
6602
6603 Register RepackedReg = handleD16VData(B, *MRI, VData, true);
6604 if (RepackedReg != VData) {
6605 MI.getOperand(1).setReg(RepackedReg);
6606 }
6607
6608 return true;
6609 }
6610
6611 Register DstReg = MI.getOperand(0).getReg();
6612 const LLT EltTy = Ty.getScalarType();
6613 const int NumElts = Ty.isVector() ? Ty.getNumElements() : 1;
6614
6615 // Confirm that the return type is large enough for the dmask specified
6616 if (NumElts < DMaskLanes)
6617 return false;
6618
6619 if (NumElts > 4 || DMaskLanes > 4)
6620 return false;
6621
6622 // Image atomic instructions use DMask to specify how many bits the
6623 // input/output data will have. 32-bits (s32, v2s16) or 64-bits (s64, v4s16).
6624 // DMaskLanes for image atomic has default value '0'.
6625 // We must be sure that atomic variants (especially packed) will not be
6626 // truncated from v2s16 or v4s16 to s16 type.
6627 //
6628 // ChangeElementCount will be needed for image load where Ty is always scalar.
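// For example, a dmask of 0b0101 gives DMaskLanes == 2, so a d16 load of
// nominal type <4 x s16> is adjusted to <2 x s16> (one packed dword) for the
// rounding computed below.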
6629 const unsigned AdjustedNumElts = DMaskLanes == 0 ? 1 : DMaskLanes;
6630 const LLT AdjustedTy =
6631 DMaskLanes == 0
6632 ? Ty
6633 : Ty.changeElementCount(ElementCount::getFixed(AdjustedNumElts));
6634
6635 // The raw dword aligned data component of the load. The only legal cases
6636 // where this matters should be when using the packed D16 format, for
6637 // s16 -> <2 x s16>, and <3 x s16> -> <4 x s16>.
6638 LLT RoundedTy;
6639
6640 // S32 vector to cover all data, plus TFE result element.
6641 LLT TFETy;
6642
6643 // Register type to use for each loaded component. Will be S32 or V2S16.
6644 LLT RegTy;
6645
6646 if (IsD16 && ST.hasUnpackedD16VMem()) {
6647 RoundedTy =
6648 LLT::scalarOrVector(ElementCount::getFixed(AdjustedNumElts), 32);
6649 TFETy = LLT::fixed_vector(AdjustedNumElts + 1, 32);
6650 RegTy = S32;
6651 } else {
6652 unsigned EltSize = EltTy.getSizeInBits();
6653 unsigned RoundedElts = (AdjustedTy.getSizeInBits() + 31) / 32;
6654 unsigned RoundedSize = 32 * RoundedElts;
6655 RoundedTy = LLT::scalarOrVector(
6656 ElementCount::getFixed(RoundedSize / EltSize), EltSize);
6657 TFETy = LLT::fixed_vector(RoundedSize / 32 + 1, S32);
6658 RegTy = !IsTFE && EltSize == 16 ? V2S16 : S32;
6659 }
6660
6661 // The return type does not need adjustment.
6662 // TODO: Should we change s16 case to s32 or <2 x s16>?
6663 if (!IsTFE && (RoundedTy == Ty || !Ty.isVector()))
6664 return true;
6665
6666 Register Dst1Reg;
6667
6668 // Insert after the instruction.
6669 B.setInsertPt(*MI.getParent(), ++MI.getIterator());
6670
6671 // TODO: For TFE with d16, if we used a TFE type that was a multiple of <2 x
6672 // s16> instead of s32, we would only need 1 bitcast instead of multiple.
6673 const LLT LoadResultTy = IsTFE ? TFETy : RoundedTy;
6674 const int ResultNumRegs = LoadResultTy.getSizeInBits() / 32;
6675
6676 Register NewResultReg = MRI->createGenericVirtualRegister(LoadResultTy);
6677
6678 MI.getOperand(0).setReg(NewResultReg);
6679
6680 // In the IR, TFE is supposed to be used with a 2 element struct return
6681 // type. The instruction really returns these two values in one contiguous
6682 // register, with one additional dword beyond the loaded data. Rewrite the
6683 // return type to use a single register result.
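// For example, a TFE load whose IR result is {<4 x s32>, i32} is rewritten to
// produce a single <5 x s32> register; the four data dwords and the trailing
// status dword are then split back out with an unmerge below.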
6684
6685 if (IsTFE) {
6686 Dst1Reg = MI.getOperand(1).getReg();
6687 if (MRI->getType(Dst1Reg) != S32)
6688 return false;
6689
6690 // TODO: Make sure the TFE operand bit is set.
6691 MI.removeOperand(1);
6692
6693 // Handle the easy case that requires no repack instructions.
6694 if (Ty == S32) {
6695 B.buildUnmerge({DstReg, Dst1Reg}, NewResultReg);
6696 return true;
6697 }
6698 }
6699
6700 // Now figure out how to copy the new result register back into the old
6701 // result.
6702 SmallVector<Register, 5> ResultRegs(ResultNumRegs, Dst1Reg);
6703
6704 const int NumDataRegs = IsTFE ? ResultNumRegs - 1 : ResultNumRegs;
6705
6706 if (ResultNumRegs == 1) {
6707 assert(!IsTFE);
6708 ResultRegs[0] = NewResultReg;
6709 } else {
6710 // We have to repack into a new vector of some kind.
6711 for (int I = 0; I != NumDataRegs; ++I)
6712 ResultRegs[I] = MRI->createGenericVirtualRegister(RegTy);
6713 B.buildUnmerge(ResultRegs, NewResultReg);
6714
6715 // Drop the final TFE element to get the data part. The TFE result is
6716 // directly written to the right place already.
6717 if (IsTFE)
6718 ResultRegs.resize(NumDataRegs);
6719 }
6720
6721 // For an s16 scalar result, we form an s32 result with a truncate regardless
6722 // of packed vs. unpacked.
6723 if (IsD16 && !Ty.isVector()) {
6724 B.buildTrunc(DstReg, ResultRegs[0]);
6725 return true;
6726 }
6727
6728 // Avoid a build/concat_vector of 1 entry.
6729 if (Ty == V2S16 && NumDataRegs == 1 && !ST.hasUnpackedD16VMem()) {
6730 B.buildBitcast(DstReg, ResultRegs[0]);
6731 return true;
6732 }
6733
6734 assert(Ty.isVector());
6735
6736 if (IsD16) {
6737 // For packed D16 results with TFE enabled, all the data components are
6738 // S32. Cast back to the expected type.
6739 //
6740 // TODO: We don't really need to load s32 elements. We would only need one
6741 // cast for the TFE result if a multiple of v2s16 was used.
6742 if (RegTy != V2S16 && !ST.hasUnpackedD16VMem()) {
6743 for (Register &Reg : ResultRegs)
6744 Reg = B.buildBitcast(V2S16, Reg).getReg(0);
6745 } else if (ST.hasUnpackedD16VMem()) {
6746 for (Register &Reg : ResultRegs)
6747 Reg = B.buildTrunc(S16, Reg).getReg(0);
6748 }
6749 }
6750
6751 auto padWithUndef = [&](LLT Ty, int NumElts) {
6752 if (NumElts == 0)
6753 return;
6754 Register Undef = B.buildUndef(Ty).getReg(0);
6755 for (int I = 0; I != NumElts; ++I)
6756 ResultRegs.push_back(Undef);
6757 };
6758
6759 // Pad out any elements eliminated due to the dmask.
6760 LLT ResTy = MRI->getType(ResultRegs[0]);
6761 if (!ResTy.isVector()) {
6762 padWithUndef(ResTy, NumElts - ResultRegs.size());
6763 B.buildBuildVector(DstReg, ResultRegs);
6764 return true;
6765 }
6766
6767 assert(!ST.hasUnpackedD16VMem() && ResTy == V2S16);
6768 const int RegsToCover = (Ty.getSizeInBits() + 31) / 32;
6769
6770 // Deal with the one annoying legal case.
6771 const LLT V3S16 = LLT::fixed_vector(3, 16);
6772 if (Ty == V3S16) {
6773 if (IsTFE) {
6774 if (ResultRegs.size() == 1) {
6775 NewResultReg = ResultRegs[0];
6776 } else if (ResultRegs.size() == 2) {
6777 LLT V4S16 = LLT::fixed_vector(4, 16);
6778 NewResultReg = B.buildConcatVectors(V4S16, ResultRegs).getReg(0);
6779 } else {
6780 return false;
6781 }
6782 }
6783
6784 if (MRI->getType(DstReg).getNumElements() <
6785 MRI->getType(NewResultReg).getNumElements()) {
6786 B.buildDeleteTrailingVectorElements(DstReg, NewResultReg);
6787 } else {
6788 B.buildPadVectorWithUndefElements(DstReg, NewResultReg);
6789 }
6790 return true;
6791 }
6792
6793 padWithUndef(ResTy, RegsToCover - ResultRegs.size());
6794 B.buildConcatVectors(DstReg, ResultRegs);
6795 return true;
6796}
6797
6799 MachineInstr &MI) const {
6800 MachineIRBuilder &B = Helper.MIRBuilder;
6801 GISelChangeObserver &Observer = Helper.Observer;
6802
6803 Register OrigDst = MI.getOperand(0).getReg();
6804 Register Dst;
6805 LLT Ty = B.getMRI()->getType(OrigDst);
6806 unsigned Size = Ty.getSizeInBits();
6807 MachineFunction &MF = B.getMF();
6808 unsigned Opc = 0;
6809 if (Size < 32 && ST.hasScalarSubwordLoads()) {
6810 assert(Size == 8 || Size == 16);
6811 Opc = Size == 8 ? AMDGPU::G_AMDGPU_S_BUFFER_LOAD_UBYTE
6812 : AMDGPU::G_AMDGPU_S_BUFFER_LOAD_USHORT;
6813 // The 8-bit and 16-bit scalar buffer load instructions have 32-bit
6814 // destination register.
6815 Dst = B.getMRI()->createGenericVirtualRegister(LLT::scalar(32));
6816 } else {
6817 Opc = AMDGPU::G_AMDGPU_S_BUFFER_LOAD;
6818 Dst = OrigDst;
6819 }
6820
6821 Observer.changingInstr(MI);
6822
6823 // Handle needing to s.buffer.load() a p8 value.
6824 if (hasBufferRsrcWorkaround(Ty)) {
6825 Ty = castBufferRsrcFromV4I32(MI, B, *B.getMRI(), 0);
6826 B.setInsertPt(B.getMBB(), MI);
6827 }
6828 if (shouldBitcastLoadStoreType(ST, Ty, LLT::scalar(Size))) {
6829 Ty = getBitcastRegisterType(Ty);
6830 Helper.bitcastDst(MI, Ty, 0);
6831 B.setInsertPt(B.getMBB(), MI);
6832 }
6833
6834 // FIXME: We don't really need this intermediate instruction. The intrinsic
6835 // should be fixed to have a memory operand. Since it's readnone, we're not
6836 // allowed to add one.
6837 MI.setDesc(B.getTII().get(Opc));
6838 MI.removeOperand(1); // Remove intrinsic ID
6839
6840 // FIXME: When intrinsic definition is fixed, this should have an MMO already.
6841 const unsigned MemSize = (Size + 7) / 8;
6842 const Align MemAlign = B.getDataLayout().getABITypeAlign(
6843 getTypeForLLT(Ty, MF.getFunction().getContext()));
6844 MachineMemOperand *MMO = MF.getMachineMemOperand(
6845 MachinePointerInfo(),
6846 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
6847 MachineMemOperand::MOInvariant,
6848 MemSize, MemAlign);
6849 MI.addMemOperand(MF, MMO);
6850 if (Dst != OrigDst) {
6851 MI.getOperand(0).setReg(Dst);
6852 B.setInsertPt(B.getMBB(), ++B.getInsertPt());
6853 B.buildTrunc(OrigDst, Dst);
6854 }
6855
6856 // If we don't have 96-bit result scalar loads, widening to 128-bit should
6857 // always be legal. We may need to restore this to a 96-bit result if it turns
6858 // out this needs to be converted to a vector load during RegBankSelect.
6859 if (!isPowerOf2_32(Size) && (Size != 96 || !ST.hasScalarDwordx3Loads())) {
6860 if (Ty.isVector())
6861 Helper.moreElementsVectorDst(MI, getPow2VectorType(Ty), 0);
6862 else
6863 Helper.widenScalarDst(MI, getPow2ScalarType(Ty), 0);
6864 }
6865
6866 Observer.changedInstr(MI);
6867 return true;
6868}
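// [Editorial note, not in the original source] Illustrative IR-level view of
// what the function above legalizes, assuming the overloaded
// llvm.amdgcn.s.buffer.load intrinsic form:
//   %v = call i16 @llvm.amdgcn.s.buffer.load.i16(<4 x i32> %rsrc, i32 %off, i32 0)
// On subtargets with scalar subword loads this becomes
// G_AMDGPU_S_BUFFER_LOAD_USHORT with a 32-bit destination plus a G_TRUNC back
// to s16; other sizes use G_AMDGPU_S_BUFFER_LOAD directly, and non-power-of-2
// results (except s96 when scalar dwordx3 loads exist) are widened to the next
// power of two.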
6869
6870 bool AMDGPULegalizerInfo::legalizeSBufferPrefetch(LegalizerHelper &Helper,
6871 MachineInstr &MI) const {
6872 MachineIRBuilder &B = Helper.MIRBuilder;
6873 GISelChangeObserver &Observer = Helper.Observer;
6874 Observer.changingInstr(MI);
6875 MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_S_BUFFER_PREFETCH));
6876 MI.removeOperand(0); // Remove intrinsic ID
6877 castBufferRsrcArgToV4I32(MI, B, 0);
6878 Observer.changedInstr(MI);
6879 return true;
6880}
6881
6882// TODO: Move to selection
6883 bool AMDGPULegalizerInfo::legalizeTrap(MachineInstr &MI,
6884 MachineRegisterInfo &MRI,
6885 MachineIRBuilder &B) const {
6886 if (!ST.isTrapHandlerEnabled() ||
6887 ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA)
6888 return legalizeTrapEndpgm(MI, MRI, B);
6889
6890 return ST.supportsGetDoorbellID() ?
6891 legalizeTrapHsa(MI, MRI, B) : legalizeTrapHsaQueuePtr(MI, MRI, B);
6892}
6893
6894 bool AMDGPULegalizerInfo::legalizeTrapEndpgm(
6895 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
6896 const DebugLoc &DL = MI.getDebugLoc();
6897 MachineBasicBlock &BB = B.getMBB();
6898 MachineFunction *MF = BB.getParent();
6899
6900 if (BB.succ_empty() && std::next(MI.getIterator()) == BB.end()) {
6901 BuildMI(BB, BB.end(), DL, B.getTII().get(AMDGPU::S_ENDPGM))
6902 .addImm(0);
6903 MI.eraseFromParent();
6904 return true;
6905 }
6906
6907 // We need a block split to make the real endpgm a terminator. We also don't
6908 // want to break phis in successor blocks, so we can't just delete to the
6909 // end of the block.
6910 BB.splitAt(MI, false /*UpdateLiveIns*/);
6911 MachineBasicBlock *TrapBB = MF->CreateMachineBasicBlock();
6912 MF->push_back(TrapBB);
6913 BuildMI(*TrapBB, TrapBB->end(), DL, B.getTII().get(AMDGPU::S_ENDPGM))
6914 .addImm(0);
6915 BuildMI(BB, &MI, DL, B.getTII().get(AMDGPU::S_CBRANCH_EXECNZ))
6916 .addMBB(TrapBB);
6917
6918 BB.addSuccessor(TrapBB);
6919 MI.eraseFromParent();
6920 return true;
6921}
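// [Editorial note, not in the original source] Rough shape of the split above
// when the trap is not already at the end of its block (block names are
// illustrative, not taken from the source):
//   bb.cur:
//     ...
//     S_CBRANCH_EXECNZ %bb.trap    ; replaces the trap instruction
//   (split-off remainder of bb.cur keeps the original successors)
//   bb.trap:                       ; appended to the function
//     S_ENDPGM 0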
6922
6923 bool AMDGPULegalizerInfo::legalizeTrapHsaQueuePtr(
6924 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
6925 MachineFunction &MF = B.getMF();
6926 const LLT S64 = LLT::scalar(64);
6927
6928 Register SGPR01(AMDGPU::SGPR0_SGPR1);
6929 // For code object version 5, queue_ptr is passed through implicit kernarg.
6930 if (AMDGPU::getAMDHSACodeObjectVersion(*MF.getFunction().getParent()) >=
6931 AMDGPU::AMDHSA_COV5) {
6932 AMDGPUTargetLowering::ImplicitParameter Param =
6933 AMDGPUTargetLowering::QUEUE_PTR;
6934 uint64_t Offset =
6935 ST.getTargetLowering()->getImplicitParameterOffset(B.getMF(), Param);
6936
6937 Register KernargPtrReg = MRI.createGenericVirtualRegister(
6938 LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
6939
6940 if (!loadInputValue(KernargPtrReg, B,
6941 AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR))
6942 return false;
6943
6944 // TODO: can we be smarter about machine pointer info?
6945 MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
6946 MachineMemOperand *MMO = MF.getMachineMemOperand(
6947 PtrInfo,
6948 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
6949 MachineMemOperand::MOInvariant,
6950 LLT::scalar(64), commonAlignment(Align(64), Offset));
6951
6952 // Pointer address
6953 Register LoadAddr = MRI.createGenericVirtualRegister(
6954 LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
6955 B.buildPtrAdd(LoadAddr, KernargPtrReg,
6956 B.buildConstant(LLT::scalar(64), Offset).getReg(0));
6957 // Load address
6958 Register Temp = B.buildLoad(S64, LoadAddr, *MMO).getReg(0);
6959 B.buildCopy(SGPR01, Temp);
6960 B.buildInstr(AMDGPU::S_TRAP)
6961 .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap))
6962 .addReg(SGPR01, RegState::Implicit);
6963 MI.eraseFromParent();
6964 return true;
6965 }
6966
6967 // Pass queue pointer to trap handler as input, and insert trap instruction
6968 // Reference: https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi
6969 Register LiveIn =
6970 MRI.createGenericVirtualRegister(LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
6971 if (!loadInputValue(LiveIn, B, AMDGPUFunctionArgInfo::QUEUE_PTR))
6972 return false;
6973
6974 B.buildCopy(SGPR01, LiveIn);
6975 B.buildInstr(AMDGPU::S_TRAP)
6976 .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap))
6977 .addReg(SGPR01, RegState::Implicit);
6978
6979 MI.eraseFromParent();
6980 return true;
6981}
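// [Editorial note, not in the original source] Summary of the two paths above:
// for code object v5+ the queue pointer is read as an s64 from the implicit
// kernarg segment at kernarg_ptr + getImplicitParameterOffset(QUEUE_PTR) (the
// offset is ABI/target dependent), while older code objects copy the preloaded
// QUEUE_PTR input register. Either way the value ends up in SGPR0_SGPR1, which
// S_TRAP reads implicitly per the AMDGPU trap-handler ABI.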
6982
6983 bool AMDGPULegalizerInfo::legalizeTrapHsa(MachineInstr &MI,
6984 MachineRegisterInfo &MRI,
6985 MachineIRBuilder &B) const {
6986 // We need to simulate the 's_trap 2' instruction on targets that run in
6987 // PRIV=1 (where it is treated as a nop).
6988 if (ST.hasPrivEnabledTrap2NopBug()) {
6989 ST.getInstrInfo()->insertSimulatedTrap(MRI, B.getMBB(), MI,
6990 MI.getDebugLoc());
6991 MI.eraseFromParent();
6992 return true;
6993 }
6994
6995 B.buildInstr(AMDGPU::S_TRAP)
6996 .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap));
6997 MI.eraseFromParent();
6998 return true;
6999}
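// [Editorial note, not in the original source] The path above emits S_TRAP
// with TrapID::LLVMAMDHSATrap; on subtargets where 's_trap 2' is treated as a
// nop under PRIV=1, SIInstrInfo::insertSimulatedTrap expands an equivalent
// software sequence instead of emitting the raw instruction.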
7000
7001 bool AMDGPULegalizerInfo::legalizeDebugTrap(MachineInstr &MI,
7002 MachineRegisterInfo &MRI,
7003 MachineIRBuilder &B) const {
7004 // Is non-HSA path or trap-handler disabled? Then, report a warning
7005 // accordingly
7006 if (!ST.isTrapHandlerEnabled() ||
7007 ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA) {
7008 DiagnosticInfoUnsupported NoTrap(B.getMF().getFunction(),
7009 "debugtrap handler not supported",
7010 MI.getDebugLoc(), DS_Warning);
7011 LLVMContext &Ctx = B.getMF().getFunction().getContext();
7012 Ctx.diagnose(NoTrap);
7013 } else {
7014 // Insert debug-trap instruction
7015 B.buildInstr(AMDGPU::S_TRAP)
7016 .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSADebugTrap));
7017 }
7018
7019 MI.eraseFromParent();
7020 return true;
7021}
7022
7023 bool AMDGPULegalizerInfo::legalizeBVHIntrinsic(MachineInstr &MI,
7024 MachineIRBuilder &B) const {
7025 MachineRegisterInfo &MRI = *B.getMRI();
7026 const LLT S16 = LLT::scalar(16);
7027 const LLT S32 = LLT::scalar(32);
7028 const LLT V2S16 = LLT::fixed_vector(2, 16);
7029 const LLT V3S32 = LLT::fixed_vector(3, 32);
7030
7031 Register DstReg = MI.getOperand(0).getReg();
7032 Register NodePtr = MI.getOperand(2).getReg();
7033 Register RayExtent = MI.getOperand(3).getReg();
7034 Register RayOrigin = MI.getOperand(4).getReg();
7035 Register RayDir = MI.getOperand(5).getReg();
7036 Register RayInvDir = MI.getOperand(6).getReg();
7037 Register TDescr = MI.getOperand(7).getReg();
7038
7039 if (!ST.hasGFX10_AEncoding()) {
7040 DiagnosticInfoUnsupported BadIntrin(B.getMF().getFunction(),
7041 "intrinsic not supported on subtarget",
7042 MI.getDebugLoc());
7043 B.getMF().getFunction().getContext().diagnose(BadIntrin);
7044 return false;
7045 }
7046
7047 const bool IsGFX11 = AMDGPU::isGFX11(ST);
7048 const bool IsGFX11Plus = AMDGPU::isGFX11Plus(ST);
7049 const bool IsGFX12Plus = AMDGPU::isGFX12Plus(ST);
7050 const bool IsA16 = MRI.getType(RayDir).getElementType().getSizeInBits() == 16;
7051 const bool Is64 = MRI.getType(NodePtr).getSizeInBits() == 64;
7052 const unsigned NumVDataDwords = 4;
7053 const unsigned NumVAddrDwords = IsA16 ? (Is64 ? 9 : 8) : (Is64 ? 12 : 11);
7054 const unsigned NumVAddrs = IsGFX11Plus ? (IsA16 ? 4 : 5) : NumVAddrDwords;
7055 const bool UseNSA =
7056 IsGFX12Plus || (ST.hasNSAEncoding() && NumVAddrs <= ST.getNSAMaxSize());
7057
7058 const unsigned BaseOpcodes[2][2] = {
7059 {AMDGPU::IMAGE_BVH_INTERSECT_RAY, AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16},
7060 {AMDGPU::IMAGE_BVH64_INTERSECT_RAY,
7061 AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16}};
7062 int Opcode;
7063 if (UseNSA) {
7064 Opcode = AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16],
7065 IsGFX12Plus ? AMDGPU::MIMGEncGfx12
7066 : IsGFX11 ? AMDGPU::MIMGEncGfx11NSA
7067 : AMDGPU::MIMGEncGfx10NSA,
7068 NumVDataDwords, NumVAddrDwords);
7069 } else {
7070 assert(!IsGFX12Plus);
7071 Opcode = AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16],
7072 IsGFX11 ? AMDGPU::MIMGEncGfx11Default
7073 : AMDGPU::MIMGEncGfx10Default,
7074 NumVDataDwords, NumVAddrDwords);
7075 }
7076 assert(Opcode != -1);
7077
7078 SmallVector<Register, 12> Ops;
7079 if (UseNSA && IsGFX11Plus) {
7080 auto packLanes = [&Ops, &S32, &V3S32, &B](Register Src) {
7081 auto Unmerge = B.buildUnmerge({S32, S32, S32}, Src);
7082 auto Merged = B.buildMergeLikeInstr(
7083 V3S32, {Unmerge.getReg(0), Unmerge.getReg(1), Unmerge.getReg(2)});
7084 Ops.push_back(Merged.getReg(0));
7085 };
7086
7087 Ops.push_back(NodePtr);
7088 Ops.push_back(RayExtent);
7089 packLanes(RayOrigin);
7090
7091 if (IsA16) {
7092 auto UnmergeRayDir = B.buildUnmerge({S16, S16, S16}, RayDir);
7093 auto UnmergeRayInvDir = B.buildUnmerge({S16, S16, S16}, RayInvDir);
7094 auto MergedDir = B.buildMergeLikeInstr(
7095 V3S32,
7096 {B.buildBitcast(
7097 S32, B.buildMergeLikeInstr(V2S16, {UnmergeRayInvDir.getReg(0),
7098 UnmergeRayDir.getReg(0)}))
7099 .getReg(0),
7100 B.buildBitcast(
7101 S32, B.buildMergeLikeInstr(V2S16, {UnmergeRayInvDir.getReg(1),
7102 UnmergeRayDir.getReg(1)}))
7103 .getReg(0),
7104 B.buildBitcast(
7105 S32, B.buildMergeLikeInstr(V2S16, {UnmergeRayInvDir.getReg(2),
7106 UnmergeRayDir.getReg(2)}))
7107 .getReg(0)});
7108 Ops.push_back(MergedDir.getReg(0));
7109 } else {
7110 packLanes(RayDir);
7111 packLanes(RayInvDir);
7112 }
7113 } else {
7114 if (Is64) {
7115 auto Unmerge = B.buildUnmerge({S32, S32}, NodePtr);
7116 Ops.push_back(Unmerge.getReg(0));
7117 Ops.push_back(Unmerge.getReg(1));
7118 } else {
7119 Ops.push_back(NodePtr);
7120 }
7121 Ops.push_back(RayExtent);
7122
7123 auto packLanes = [&Ops, &S32, &B](Register Src) {
7124 auto Unmerge = B.buildUnmerge({S32, S32, S32}, Src);
7125 Ops.push_back(Unmerge.getReg(0));
7126 Ops.push_back(Unmerge.getReg(1));
7127 Ops.push_back(Unmerge.getReg(2));
7128 };
7129
7130 packLanes(RayOrigin);
7131 if (IsA16) {
7132 auto UnmergeRayDir = B.buildUnmerge({S16, S16, S16}, RayDir);
7133 auto UnmergeRayInvDir = B.buildUnmerge({S16, S16, S16}, RayInvDir);
7134 Register R1 = MRI.createGenericVirtualRegister(S32);
7135 Register R2 = MRI.createGenericVirtualRegister(S32);
7136 Register R3 = MRI.createGenericVirtualRegister(S32);
7137 B.buildMergeLikeInstr(R1,
7138 {UnmergeRayDir.getReg(0), UnmergeRayDir.getReg(1)});
7139 B.buildMergeLikeInstr(
7140 R2, {UnmergeRayDir.getReg(2), UnmergeRayInvDir.getReg(0)});
7141 B.buildMergeLikeInstr(
7142 R3, {UnmergeRayInvDir.getReg(1), UnmergeRayInvDir.getReg(2)});
7143 Ops.push_back(R1);
7144 Ops.push_back(R2);
7145 Ops.push_back(R3);
7146 } else {
7147 packLanes(RayDir);
7148 packLanes(RayInvDir);
7149 }
7150 }
7151
7152 if (!UseNSA) {
7153 // Build a single vector containing all the operands so far prepared.
7154 LLT OpTy = LLT::fixed_vector(Ops.size(), 32);
7155 Register MergedOps = B.buildMergeLikeInstr(OpTy, Ops).getReg(0);
7156 Ops.clear();
7157 Ops.push_back(MergedOps);
7158 }
7159
7160 auto MIB = B.buildInstr(AMDGPU::G_AMDGPU_INTRIN_BVH_INTERSECT_RAY)
7161 .addDef(DstReg)
7162 .addImm(Opcode);
7163
7164 for (Register R : Ops) {
7165 MIB.addUse(R);
7166 }
7167
7168 MIB.addUse(TDescr)
7169 .addImm(IsA16 ? 1 : 0)
7170 .cloneMemRefs(MI);
7171
7172 MI.eraseFromParent();
7173 return true;
7174}
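// [Editorial note, not in the original source] Worked example of the operand
// counts used above: with a 64-bit node pointer and 32-bit ray components the
// intrinsic needs 12 VADDR dwords; with 16-bit components (A16) the direction
// and inverse-direction vectors share packed v2s16 halves, shrinking this to 9.
// On GFX11+ NSA encodings the operands are regrouped into whole registers
// (node_ptr, ray_extent, then one v3s32 per ray vector); when NSA cannot be
// used, everything is merged into a single wide vector register operand.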
7175
7176 bool AMDGPULegalizerInfo::legalizeStackSave(MachineInstr &MI,
7177 MachineIRBuilder &B) const {
7178 const SITargetLowering *TLI = ST.getTargetLowering();
7179 Register StackPtr = TLI->getStackPointerRegisterToSaveRestore();
7180 Register DstReg = MI.getOperand(0).getReg();
7181 B.buildInstr(AMDGPU::G_AMDGPU_WAVE_ADDRESS, {DstReg}, {StackPtr});
7182 MI.eraseFromParent();
7183 return true;
7184}
7185
7186 bool AMDGPULegalizerInfo::legalizeWaveID(MachineInstr &MI,
7187 MachineIRBuilder &B) const {
7188 // With architected SGPRs, waveIDinGroup is in TTMP8[29:25].
7189 if (!ST.hasArchitectedSGPRs())
7190 return false;
7191 LLT S32 = LLT::scalar(32);
7192 Register DstReg = MI.getOperand(0).getReg();
7193 auto TTMP8 = B.buildCopy(S32, Register(AMDGPU::TTMP8));
7194 auto LSB = B.buildConstant(S32, 25);
7195 auto Width = B.buildConstant(S32, 5);
7196 B.buildUbfx(DstReg, TTMP8, LSB, Width);
7197 MI.eraseFromParent();
7198 return true;
7199}
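// [Editorial note, not in the original source] The extraction above is
// equivalent to:
//   wave_id = (ttmp8 >> 25) & 0x1f;   // G_UBFX with lsb = 25, width = 5
// i.e. the 5-bit wave-in-workgroup ID held in TTMP8[29:25] on subtargets with
// architected SGPRs.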
7200
7201static constexpr unsigned FPEnvModeBitField =
7203
7204static constexpr unsigned FPEnvTrapBitField =
7206
7207 bool AMDGPULegalizerInfo::legalizeGetFPEnv(MachineInstr &MI,
7208 MachineRegisterInfo &MRI,
7209 MachineIRBuilder &B) const {
7210 Register Src = MI.getOperand(0).getReg();
7211 if (MRI.getType(Src) != S64)
7212 return false;
7213
7214 auto ModeReg =
7215 B.buildIntrinsic(Intrinsic::amdgcn_s_getreg, {S32},
7216 /*HasSideEffects=*/true, /*isConvergent=*/false)
7217 .addImm(FPEnvModeBitField);
7218 auto TrapReg =
7219 B.buildIntrinsic(Intrinsic::amdgcn_s_getreg, {S32},
7220 /*HasSideEffects=*/true, /*isConvergent=*/false)
7221 .addImm(FPEnvTrapBitField);
7222 B.buildMergeLikeInstr(Src, {ModeReg, TrapReg});
7223 MI.eraseFromParent();
7224 return true;
7225}
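// [Editorial note, not in the original source] G_MERGE_VALUES places its first
// source in the low bits, so the s64 FP environment built above is
//   fpenv = ((uint64_t)trapsts_bits << 32) | mode_bits
// and legalizeSetFPEnv below is the exact inverse: unmerge into two s32 halves
// and write them back with s_setreg using the same bit-field encodings.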
7226
7227 bool AMDGPULegalizerInfo::legalizeSetFPEnv(MachineInstr &MI,
7228 MachineRegisterInfo &MRI,
7229 MachineIRBuilder &B) const {
7230 Register Src = MI.getOperand(0).getReg();
7231 if (MRI.getType(Src) != S64)
7232 return false;
7233
7234 auto Unmerge = B.buildUnmerge({S32, S32}, MI.getOperand(0));
7235 B.buildIntrinsic(Intrinsic::amdgcn_s_setreg, ArrayRef<DstOp>(),
7236 /*HasSideEffects=*/true, /*isConvergent=*/false)
7237 .addImm(static_cast<int16_t>(FPEnvModeBitField))
7238 .addReg(Unmerge.getReg(0));
7239 B.buildIntrinsic(Intrinsic::amdgcn_s_setreg, ArrayRef<DstOp>(),
7240 /*HasSideEffects=*/true, /*isConvergent=*/false)
7241 .addImm(static_cast<int16_t>(FPEnvTrapBitField))
7242 .addReg(Unmerge.getReg(1));
7243 MI.eraseFromParent();
7244 return true;
7245}
7246
7247 bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
7248 MachineInstr &MI) const {
7249 MachineIRBuilder &B = Helper.MIRBuilder;
7250 MachineRegisterInfo &MRI = *B.getMRI();
7251
7252 // Replace the use G_BRCOND with the exec manipulate and branch pseudos.
7253 auto IntrID = cast<GIntrinsic>(MI).getIntrinsicID();
7254 switch (IntrID) {
7255 case Intrinsic::amdgcn_if:
7256 case Intrinsic::amdgcn_else: {
7257 MachineInstr *Br = nullptr;
7258 MachineBasicBlock *UncondBrTarget = nullptr;
7259 bool Negated = false;
7260 if (MachineInstr *BrCond =
7261 verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) {
7262 const SIRegisterInfo *TRI
7263 = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
7264
7265 Register Def = MI.getOperand(1).getReg();
7266 Register Use = MI.getOperand(3).getReg();
7267
7268 MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
7269
7270 if (Negated)
7271 std::swap(CondBrTarget, UncondBrTarget);
7272
7273 B.setInsertPt(B.getMBB(), BrCond->getIterator());
7274 if (IntrID == Intrinsic::amdgcn_if) {
7275 B.buildInstr(AMDGPU::SI_IF)
7276 .addDef(Def)
7277 .addUse(Use)
7278 .addMBB(UncondBrTarget);
7279 } else {
7280 B.buildInstr(AMDGPU::SI_ELSE)
7281 .addDef(Def)
7282 .addUse(Use)
7283 .addMBB(UncondBrTarget);
7284 }
7285
7286 if (Br) {
7287 Br->getOperand(0).setMBB(CondBrTarget);
7288 } else {
7289 // The IRTranslator skips inserting the G_BR for fallthrough cases, but
7290 // since we're swapping branch targets it needs to be reinserted.
7291 // FIXME: IRTranslator should probably not do this
7292 B.buildBr(*CondBrTarget);
7293 }
7294
7295 MRI.setRegClass(Def, TRI->getWaveMaskRegClass());
7296 MRI.setRegClass(Use, TRI->getWaveMaskRegClass());
7297 MI.eraseFromParent();
7298 BrCond->eraseFromParent();
7299 return true;
7300 }
7301
7302 return false;
7303 }
7304 case Intrinsic::amdgcn_loop: {
7305 MachineInstr *Br = nullptr;
7306 MachineBasicBlock *UncondBrTarget = nullptr;
7307 bool Negated = false;
7308 if (MachineInstr *BrCond =
7309 verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) {
7310 const SIRegisterInfo *TRI
7311 = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
7312
7313 MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
7314 Register Reg = MI.getOperand(2).getReg();
7315
7316 if (Negated)
7317 std::swap(CondBrTarget, UncondBrTarget);
7318
7319 B.setInsertPt(B.getMBB(), BrCond->getIterator());
7320 B.buildInstr(AMDGPU::SI_LOOP)
7321 .addUse(Reg)
7322 .addMBB(UncondBrTarget);
7323
7324 if (Br)
7325 Br->getOperand(0).setMBB(CondBrTarget);
7326 else
7327 B.buildBr(*CondBrTarget);
7328
7329 MI.eraseFromParent();
7330 BrCond->eraseFromParent();
7331 MRI.setRegClass(Reg, TRI->getWaveMaskRegClass());
7332 return true;
7333 }
7334
7335 return false;
7336 }
7337 case Intrinsic::amdgcn_addrspacecast_nonnull:
7338 return legalizeAddrSpaceCast(MI, MRI, B);
7339 case Intrinsic::amdgcn_make_buffer_rsrc:
7340 return legalizePointerAsRsrcIntrin(MI, MRI, B);
7341 case Intrinsic::amdgcn_kernarg_segment_ptr:
7342 if (!AMDGPU::isKernel(B.getMF().getFunction().getCallingConv())) {
7343 // This only makes sense to call in a kernel, so just lower to null.
7344 B.buildConstant(MI.getOperand(0).getReg(), 0);
7345 MI.eraseFromParent();
7346 return true;
7347 }
7348
7349 return legalizePreloadedArgIntrin(
7350 MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
7351 case Intrinsic::amdgcn_implicitarg_ptr:
7352 return legalizeImplicitArgPtr(MI, MRI, B);
7353 case Intrinsic::amdgcn_workitem_id_x:
7354 return legalizeWorkitemIDIntrinsic(MI, MRI, B, 0,
7355 AMDGPUFunctionArgInfo::WORKITEM_ID_X);
7356 case Intrinsic::amdgcn_workitem_id_y:
7357 return legalizeWorkitemIDIntrinsic(MI, MRI, B, 1,
7358 AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
7359 case Intrinsic::amdgcn_workitem_id_z:
7360 return legalizeWorkitemIDIntrinsic(MI, MRI, B, 2,
7361 AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
7362 case Intrinsic::amdgcn_workgroup_id_x:
7363 return legalizePreloadedArgIntrin(MI, MRI, B,
7364 AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
7365 case Intrinsic::amdgcn_workgroup_id_y:
7366 return legalizePreloadedArgIntrin(MI, MRI, B,
7367 AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
7368 case Intrinsic::amdgcn_workgroup_id_z:
7369 return legalizePreloadedArgIntrin(MI, MRI, B,
7370 AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
7371 case Intrinsic::amdgcn_wave_id:
7372 return legalizeWaveID(MI, B);
7373 case Intrinsic::amdgcn_lds_kernel_id:
7374 return legalizePreloadedArgIntrin(MI, MRI, B,
7375 AMDGPUFunctionArgInfo::LDS_KERNEL_ID);
7376 case Intrinsic::amdgcn_dispatch_ptr:
7377 return legalizePreloadedArgIntrin(MI, MRI, B,
7378 AMDGPUFunctionArgInfo::DISPATCH_PTR);
7379 case Intrinsic::amdgcn_queue_ptr:
7380 return legalizePreloadedArgIntrin(MI, MRI, B,
7381 AMDGPUFunctionArgInfo::QUEUE_PTR);
7382 case Intrinsic::amdgcn_implicit_buffer_ptr:
7383 return legalizePreloadedArgIntrin(
7384 MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
7385 case Intrinsic::amdgcn_dispatch_id:
7386 return legalizePreloadedArgIntrin(MI, MRI, B,
7387 AMDGPUFunctionArgInfo::DISPATCH_ID);
7388 case Intrinsic::r600_read_ngroups_x:
7389 // TODO: Emit error for hsa
7390 return legalizeKernargMemParameter(MI, B,
7391 SI::KernelInputOffsets::NGROUPS_X);
7392 case Intrinsic::r600_read_ngroups_y:
7393 return legalizeKernargMemParameter(MI, B,
7394 SI::KernelInputOffsets::NGROUPS_Y);
7395 case Intrinsic::r600_read_ngroups_z:
7396 return legalizeKernargMemParameter(MI, B,
7397 SI::KernelInputOffsets::NGROUPS_Z);
7398 case Intrinsic::r600_read_local_size_x:
7399 // TODO: Could insert G_ASSERT_ZEXT from s16
7400 return legalizeKernargMemParameter(MI, B, SI::KernelInputOffsets::LOCAL_SIZE_X);
7401 case Intrinsic::r600_read_local_size_y:
7402 // TODO: Could insert G_ASSERT_ZEXT from s16
7403 return legalizeKernargMemParameter(MI, B, SI::KernelInputOffsets::LOCAL_SIZE_Y);
7404 // TODO: Could insert G_ASSERT_ZEXT from s16
7405 case Intrinsic::r600_read_local_size_z:
7406 return legalizeKernargMemParameter(MI, B, SI::KernelInputOffsets::LOCAL_SIZE_Z);
7407 case Intrinsic::r600_read_global_size_x:
7408 return legalizeKernargMemParameter(MI, B, SI::KernelInputOffsets::GLOBAL_SIZE_X);
7409 case Intrinsic::r600_read_global_size_y:
7410 return legalizeKernargMemParameter(MI, B, SI::KernelInputOffsets::GLOBAL_SIZE_Y);
7411 case Intrinsic::r600_read_global_size_z:
7412 return legalizeKernargMemParameter(MI, B, SI::KernelInputOffsets::GLOBAL_SIZE_Z);
7413 case Intrinsic::amdgcn_fdiv_fast:
7414 return legalizeFDIVFastIntrin(MI, MRI, B);
7415 case Intrinsic::amdgcn_is_shared:
7416 return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS);
7417 case Intrinsic::amdgcn_is_private:
7418 return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS);
7419 case Intrinsic::amdgcn_wavefrontsize: {
7420 B.buildConstant(MI.getOperand(0), ST.getWavefrontSize());
7421 MI.eraseFromParent();
7422 return true;
7423 }
7424 case Intrinsic::amdgcn_s_buffer_load:
7425 return legalizeSBufferLoad(Helper, MI);
7426 case Intrinsic::amdgcn_raw_buffer_store:
7427 case Intrinsic::amdgcn_raw_ptr_buffer_store:
7428 case Intrinsic::amdgcn_struct_buffer_store:
7429 case Intrinsic::amdgcn_struct_ptr_buffer_store:
7430 return legalizeBufferStore(MI, Helper, false, false);
7431 case Intrinsic::amdgcn_raw_buffer_store_format:
7432 case Intrinsic::amdgcn_raw_ptr_buffer_store_format:
7433 case Intrinsic::amdgcn_struct_buffer_store_format:
7434 case Intrinsic::amdgcn_struct_ptr_buffer_store_format:
7435 return legalizeBufferStore(MI, Helper, false, true);
7436 case Intrinsic::amdgcn_raw_tbuffer_store:
7437 case Intrinsic::amdgcn_raw_ptr_tbuffer_store:
7438 case Intrinsic::amdgcn_struct_tbuffer_store:
7439 case Intrinsic::amdgcn_struct_ptr_tbuffer_store:
7440 return legalizeBufferStore(MI, Helper, true, true);
7441 case Intrinsic::amdgcn_raw_buffer_load:
7442 case Intrinsic::amdgcn_raw_ptr_buffer_load:
7443 case Intrinsic::amdgcn_raw_atomic_buffer_load:
7444 case Intrinsic::amdgcn_raw_ptr_atomic_buffer_load:
7445 case Intrinsic::amdgcn_struct_buffer_load:
7446 case Intrinsic::amdgcn_struct_ptr_buffer_load:
7447 case Intrinsic::amdgcn_struct_atomic_buffer_load:
7448 case Intrinsic::amdgcn_struct_ptr_atomic_buffer_load:
7449 return legalizeBufferLoad(MI, Helper, false, false);
7450 case Intrinsic::amdgcn_raw_buffer_load_format:
7451 case Intrinsic::amdgcn_raw_ptr_buffer_load_format:
7452 case Intrinsic::amdgcn_struct_buffer_load_format:
7453 case Intrinsic::amdgcn_struct_ptr_buffer_load_format:
7454 return legalizeBufferLoad(MI, Helper, true, false);
7455 case Intrinsic::amdgcn_raw_tbuffer_load:
7456 case Intrinsic::amdgcn_raw_ptr_tbuffer_load:
7457 case Intrinsic::amdgcn_struct_tbuffer_load:
7458 case Intrinsic::amdgcn_struct_ptr_tbuffer_load:
7459 return legalizeBufferLoad(MI, Helper, true, true);
7460 case Intrinsic::amdgcn_raw_buffer_atomic_swap:
7461 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_swap:
7462 case Intrinsic::amdgcn_struct_buffer_atomic_swap:
7463 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_swap:
7464 case Intrinsic::amdgcn_raw_buffer_atomic_add:
7465 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_add:
7466 case Intrinsic::amdgcn_struct_buffer_atomic_add:
7467 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_add:
7468 case Intrinsic::amdgcn_raw_buffer_atomic_sub:
7469 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub:
7470 case Intrinsic::amdgcn_struct_buffer_atomic_sub:
7471 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub:
7472 case Intrinsic::amdgcn_raw_buffer_atomic_smin:
7473 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smin:
7474 case Intrinsic::amdgcn_struct_buffer_atomic_smin:
7475 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smin:
7476 case Intrinsic::amdgcn_raw_buffer_atomic_umin:
7477 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umin:
7478 case Intrinsic::amdgcn_struct_buffer_atomic_umin:
7479 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umin:
7480 case Intrinsic::amdgcn_raw_buffer_atomic_smax:
7481 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smax:
7482 case Intrinsic::amdgcn_struct_buffer_atomic_smax:
7483 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smax:
7484 case Intrinsic::amdgcn_raw_buffer_atomic_umax:
7485 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umax:
7486 case Intrinsic::amdgcn_struct_buffer_atomic_umax:
7487 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umax:
7488 case Intrinsic::amdgcn_raw_buffer_atomic_and:
7489 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_and:
7490 case Intrinsic::amdgcn_struct_buffer_atomic_and:
7491 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_and:
7492 case Intrinsic::amdgcn_raw_buffer_atomic_or:
7493 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_or:
7494 case Intrinsic::amdgcn_struct_buffer_atomic_or:
7495 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_or:
7496 case Intrinsic::amdgcn_raw_buffer_atomic_xor:
7497 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_xor:
7498 case Intrinsic::amdgcn_struct_buffer_atomic_xor:
7499 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_xor:
7500 case Intrinsic::amdgcn_raw_buffer_atomic_inc:
7501 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_inc:
7502 case Intrinsic::amdgcn_struct_buffer_atomic_inc:
7503 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_inc:
7504 case Intrinsic::amdgcn_raw_buffer_atomic_dec:
7505 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_dec:
7506 case Intrinsic::amdgcn_struct_buffer_atomic_dec:
7507 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_dec:
7508 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
7509 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap:
7510 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
7511 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cmpswap:
7512 case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
7513 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmin:
7514 case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
7515 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmin:
7516 case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
7517 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmax:
7518 case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
7519 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmax:
7520 case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
7521 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fadd:
7522 case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
7523 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fadd:
7524 return legalizeBufferAtomic(MI, B, IntrID);
7525 case Intrinsic::amdgcn_rsq_clamp:
7526 return legalizeRsqClampIntrinsic(MI, MRI, B);
7527 case Intrinsic::amdgcn_image_bvh_intersect_ray:
7528 return legalizeBVHIntrinsic(MI, B);
7529 case Intrinsic::amdgcn_swmmac_f16_16x16x32_f16:
7530 case Intrinsic::amdgcn_swmmac_bf16_16x16x32_bf16:
7531 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf16:
7532 case Intrinsic::amdgcn_swmmac_f32_16x16x32_f16:
7533 case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_fp8:
7534 case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_bf8:
7535 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_fp8:
7536 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_bf8: {
7537 Register Index = MI.getOperand(5).getReg();
7538 LLT S32 = LLT::scalar(32);
7539 if (MRI.getType(Index) != S32)
7540 MI.getOperand(5).setReg(B.buildAnyExt(S32, Index).getReg(0));
7541 return true;
7542 }
7543 case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu4:
7544 case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu8:
7545 case Intrinsic::amdgcn_swmmac_i32_16x16x64_iu4: {
7546 Register Index = MI.getOperand(7).getReg();
7547 LLT S32 = LLT::scalar(32);
7548 if (MRI.getType(Index) != S32)
7549 MI.getOperand(7).setReg(B.buildAnyExt(S32, Index).getReg(0));
7550 return true;
7551 }
7552 case Intrinsic::amdgcn_fmed3: {
7553 GISelChangeObserver &Observer = Helper.Observer;
7554
7555 // FIXME: This is to workaround the inability of tablegen match combiners to
7556 // match intrinsics in patterns.
7557 Observer.changingInstr(MI);
7558 MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_FMED3));
7559 MI.removeOperand(1);
7560 Observer.changedInstr(MI);
7561 return true;
7562 }
7563 case Intrinsic::amdgcn_readlane:
7564 case Intrinsic::amdgcn_writelane:
7565 case Intrinsic::amdgcn_readfirstlane:
7566 case Intrinsic::amdgcn_permlane16:
7567 case Intrinsic::amdgcn_permlanex16:
7568 case Intrinsic::amdgcn_permlane64:
7569 case Intrinsic::amdgcn_set_inactive:
7570 case Intrinsic::amdgcn_set_inactive_chain_arg:
7571 case Intrinsic::amdgcn_mov_dpp8:
7572 case Intrinsic::amdgcn_update_dpp:
7573 return legalizeLaneOp(Helper, MI, IntrID);
7574 case Intrinsic::amdgcn_s_buffer_prefetch_data:
7575 return legalizeSBufferPrefetch(Helper, MI);
7576 default: {
7577 if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
7578 AMDGPU::getImageDimIntrinsicInfo(IntrID))
7579 return legalizeImageIntrinsic(MI, B, Helper.Observer, ImageDimIntr);
7580 return true;
7581 }
7582 }
7583
7584 return true;
7585}
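// [Editorial note, not in the original source] Roughly speaking,
// legalizeIntrinsic is the target hook the GlobalISel LegalizerHelper invokes
// for AMDGPU intrinsics it cannot handle generically; returning true marks the
// intrinsic as handled (or already legal), while returning false causes the
// Legalizer pass to report the instruction as unable to legalize.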
unsigned const MachineRegisterInfo * MRI
MachineInstrBuilder & UseMI
MachineInstrBuilder MachineInstrBuilder & DefMI
static unsigned getIntrinsicID(const SDNode *N)
unsigned RegSize
unsigned Intr
static SDValue extractF64Exponent(SDValue Hi, const SDLoc &SL, SelectionDAG &DAG)
static SDValue getMad(SelectionDAG &DAG, const SDLoc &SL, EVT VT, SDValue X, SDValue Y, SDValue C, SDNodeFlags Flags=SDNodeFlags())
static bool valueIsKnownNeverF32Denorm(SDValue Src)
Return true if it's known that Src can never be an f32 denormal value.
Contains the definition of a TargetInstrInfo class that is common to all AMD GPUs.
static void packImage16bitOpsToDwords(MachineIRBuilder &B, MachineInstr &MI, SmallVectorImpl< Register > &PackedAddrs, unsigned ArgOffset, const AMDGPU::ImageDimIntrinsicInfo *Intr, bool IsA16, bool IsG16)
Turn a set of s16 typed registers in AddrRegs into a dword sized vector with s16 typed elements.
static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID)
static LLT getBufferRsrcScalarType(const LLT Ty)
static cl::opt< bool > EnableNewLegality("amdgpu-global-isel-new-legality", cl::desc("Use GlobalISel desired legality, rather than try to use" "rules compatible with selection patterns"), cl::init(false), cl::ReallyHidden)
static const LLT V3S64
static bool needsDenormHandlingF32(const MachineFunction &MF, Register Src, unsigned Flags)
static const LLT V16S16
static const LLT S128
static LegalizeMutation bitcastToVectorElement32(unsigned TypeIdx)
static const LLT V4S32
static const LLT V2S32
static const LLT V8S64
static LegalityPredicate isSmallOddVector(unsigned TypeIdx)
static LegalizeMutation oneMoreElement(unsigned TypeIdx)
static const LLT V12S32
static const LLT V8S32
static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size)
static bool allowApproxFunc(const MachineFunction &MF, unsigned Flags)
static const LLT V2S16
static bool shouldBitcastLoadStoreType(const GCNSubtarget &ST, const LLT Ty, const LLT MemTy)
Return true if a load or store of the type should be lowered with a bitcast to a different type.
static constexpr unsigned FPEnvModeBitField
static const LLT V4S64
static const LLT S1
static const LLT V3S32
static const LLT S64
static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size)
static bool shouldWidenLoad(const GCNSubtarget &ST, LLT MemoryTy, uint64_t AlignInBits, unsigned AddrSpace, unsigned Opcode)
Return true if we should legalize a load by widening an odd sized memory access up to the alignment.
static bool isRegisterType(LLT Ty)
static bool isRegisterVectorElementType(LLT EltTy)
static const LLT S32
static bool isRegisterSize(unsigned Size)
static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx)
static LegalityPredicate isWideVec16(unsigned TypeIdx)
static const LLT S1024
static const LLT V6S32
static LegalizeMutation moreElementsToNextExistingRegClass(unsigned TypeIdx)
static std::initializer_list< LLT > AllS32Vectors
static const LLT V7S32
static const LLT V5S32
static const LLT V4S16
static const LLT V11S32
static const LLT F64
static Register castBufferRsrcToV4I32(Register Pointer, MachineIRBuilder &B)
Cast a buffer resource (an address space 8 pointer) into a 4xi32, which is the form in which the valu...
static std::initializer_list< LLT > AllS16Vectors
static const LLT V32S32
static std::pair< Register, Register > emitReciprocalU64(MachineIRBuilder &B, Register Val)
static LLT getBitcastRegisterType(const LLT Ty)
static LLT getBufferRsrcRegisterType(const LLT Ty)
static LegalizeMutation bitcastToRegisterType(unsigned TypeIdx)
static Register stripAnySourceMods(Register OrigSrc, MachineRegisterInfo &MRI)
static const LLT V9S32
static const LLT V10S32
static const LLT S192
static LLT castBufferRsrcFromV4I32(MachineInstr &MI, MachineIRBuilder &B, MachineRegisterInfo &MRI, unsigned Idx)
Mutates IR (typicaly a load instruction) to use a <4 x s32> as the initial type of the operand idx an...
static bool replaceWithConstant(MachineIRBuilder &B, MachineInstr &MI, int64_t C)
static constexpr unsigned SPDenormModeBitField
static const LLT V12S16
static const LLT V16S64
static const LLT S512
static unsigned maxSizeForAddrSpace(const GCNSubtarget &ST, unsigned AS, bool IsLoad, bool IsAtomic)
static bool isLoadStoreSizeLegal(const GCNSubtarget &ST, const LegalityQuery &Query)
static const LLT V16S32
static const LLT V7S64
static MachineInstr * verifyCFIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI, MachineInstr *&Br, MachineBasicBlock *&UncondBrTarget, bool &Negated)
static bool isRegisterClassType(LLT Ty)
static const LLT V5S64
static const LLT S160
static LegalityPredicate numElementsNotEven(unsigned TypeIdx)
static void castBufferRsrcArgToV4I32(MachineInstr &MI, MachineIRBuilder &B, unsigned Idx)
static const LLT V4S128
static constexpr unsigned FPEnvTrapBitField
static LegalityPredicate isIllegalRegisterType(unsigned TypeIdx)
static const LLT V2BF16
static const LLT V6S64
static constexpr unsigned MaxRegisterSize
static const LLT V2S8
static LegalityPredicate isWideScalarExtLoadTruncStore(unsigned TypeIdx)
static const LLT MaxScalar
static bool hasBufferRsrcWorkaround(const LLT Ty)
static void toggleSPDenormMode(bool Enable, MachineIRBuilder &B, const GCNSubtarget &ST, SIModeRegisterDefaults Mode)
static std::initializer_list< LLT > AllS64Vectors
static bool loadStoreBitcastWorkaround(const LLT Ty)
static const LLT S96
static const LLT V2S64
static LLT widenToNextPowerOf2(LLT Ty)
static bool isNot(const MachineRegisterInfo &MRI, const MachineInstr &MI)
static const LLT S16
static const LLT V10S16
static void convertImageAddrToPacked(MachineIRBuilder &B, MachineInstr &MI, int DimIdx, int NumVAddrs)
Convert from separate vaddr components to a single vector address register, and replace the remaining...
static bool isLoadStoreLegal(const GCNSubtarget &ST, const LegalityQuery &Query)
static const LLT V2S128
static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx)
static std::initializer_list< LLT > AllScalarTypes
static const LLT V2F16
static const LLT S256
static LLT getPow2VectorType(LLT Ty)
static void buildBufferLoad(unsigned Opc, Register LoadDstReg, Register RSrc, Register VIndex, Register VOffset, Register SOffset, unsigned ImmOffset, unsigned Format, unsigned AuxiliaryData, MachineMemOperand *MMO, bool IsTyped, bool HasVIndex, MachineIRBuilder &B)
static LLT getPow2ScalarType(LLT Ty)
static LegalityPredicate elementTypeIsLegal(unsigned TypeIdx)
static const LLT S8
static const LLT V6S16
static bool isRegisterVectorType(LLT Ty)
static const LLT S224
static const LLT V8S16
static LegalityPredicate sizeIsMultipleOf32(unsigned TypeIdx)
static bool isKnownNonNull(Register Val, MachineRegisterInfo &MRI, const AMDGPUTargetMachine &TM, unsigned AddrSpace)
Return true if the value is a known valid address, such that a null check is not necessary.
static const LLT F32
This file declares the targeting of the Machinelegalizer class for AMDGPU.
Provides AMDGPU specific target descriptions.
The AMDGPU TargetMachine interface definition for hw codegen targets.
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
static Error unsupported(const char *Str, const Triple &T)
Definition: MachO.cpp:71
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
Analysis containing CSE Info
Definition: CSEInfo.cpp:27
Returns the sub type a function will return at a given Idx Should correspond to the result type of an ExtractValue instruction executed with just that one unsigned Idx
uint64_t Size
static GCMetadataPrinterRegistry::Add< ErlangGCPrinter > X("erlang", "erlang-compatible garbage collector")
Declares convenience wrapper classes for interpreting MachineInstr instances as specific generic oper...
IRTranslator LLVM IR MI
static LVOptions Options
Definition: LVOptions.cpp:25
#define F(x, y, z)
Definition: MD5.cpp:55
#define I(x, y, z)
Definition: MD5.cpp:58
Contains matchers for matching SSA Machine Instructions.
This file declares the MachineIRBuilder class.
unsigned const TargetRegisterInfo * TRI
#define R2(n)
static unsigned getReg(const MCDisassembler *D, unsigned RC, unsigned RegNo)
static GCMetadataPrinterRegistry::Add< OcamlGCMetadataPrinter > Y("ocaml", "ocaml 3.10-compatible collector")
ppc ctr loops verify
R600 Clause Merge
const SmallVectorImpl< MachineOperand > & Cond
#define CH(x, y, z)
Definition: SHA256.cpp:34
#define FP_DENORM_FLUSH_NONE
Definition: SIDefines.h:1214
Interface definition for SIInstrInfo.
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
Interface definition for SIRegisterInfo.
This file defines the make_scope_exit function, which executes user-defined cleanup logic at scope ex...
static constexpr int Concat[]
Value * RHS
Value * LHS
void buildMultiply(LegalizerHelper &Helper, MutableArrayRef< Register > Accum, ArrayRef< Register > Src0, ArrayRef< Register > Src1, bool UsePartialMad64_32, bool SeparateOddAlignedProducts) const
bool legalizeGlobalValue(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFSQRTF16(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeIntrinsicTrunc(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
std::pair< Register, unsigned > splitBufferOffsets(MachineIRBuilder &B, Register OrigOffset) const
bool legalizeIsAddrSpace(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B, unsigned AddrSpace) const
bool legalizeUnsignedDIV_REM(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFSQRTF32(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeAtomicCmpXChg(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeTrapHsa(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeBufferStore(MachineInstr &MI, LegalizerHelper &Helper, bool IsTyped, bool IsFormat) const
bool legalizeMul(LegalizerHelper &Helper, MachineInstr &MI) const
bool legalizeFFREXP(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
Register getSegmentAperture(unsigned AddrSpace, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFDIV64(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizePointerAsRsrcIntrin(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
To create a buffer resource from a 64-bit pointer, mask off the upper 32 bits of the pointer and repl...
bool legalizeFlogCommon(MachineInstr &MI, MachineIRBuilder &B) const
bool getLDSKernelId(Register DstReg, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFExp2(MachineInstr &MI, MachineIRBuilder &B) const
bool legalizeTrap(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeBufferAtomic(MachineInstr &MI, MachineIRBuilder &B, Intrinsic::ID IID) const
void legalizeUnsignedDIV_REM32Impl(MachineIRBuilder &B, Register DstDivReg, Register DstRemReg, Register Num, Register Den) const
Register handleD16VData(MachineIRBuilder &B, MachineRegisterInfo &MRI, Register Reg, bool ImageStore=false) const
Handle register layout difference for f16 images for some subtargets.
bool legalizeCTLZ_CTTZ(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeBuildVector(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFFloor(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
AMDGPULegalizerInfo(const GCNSubtarget &ST, const GCNTargetMachine &TM)
bool legalizeFDIV32(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFMad(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFDIV(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeSBufferPrefetch(LegalizerHelper &Helper, MachineInstr &MI) const
bool legalizeFExp(MachineInstr &MI, MachineIRBuilder &B) const
bool legalizeIntrinsic(LegalizerHelper &Helper, MachineInstr &MI) const override
bool legalizeFrem(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizePreloadedArgIntrin(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B, AMDGPUFunctionArgInfo::PreloadedValue ArgType) const
bool legalizeStore(LegalizerHelper &Helper, MachineInstr &MI) const
bool legalizeCustom(LegalizerHelper &Helper, MachineInstr &MI, LostDebugLocObserver &LocObserver) const override
Called for instructions with the Custom LegalizationAction.
bool buildPCRelGlobalAddress(Register DstReg, LLT PtrTy, MachineIRBuilder &B, const GlobalValue *GV, int64_t Offset, unsigned GAFlags=SIInstrInfo::MO_NONE) const
bool legalizeFDIV16(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeRsqClampIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
std::pair< Register, Register > getScaledLogInput(MachineIRBuilder &B, Register Src, unsigned Flags) const
bool legalizeFDIVFastIntrin(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeInsertVectorElt(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFExpUnsafe(MachineIRBuilder &B, Register Dst, Register Src, unsigned Flags) const
bool legalizeAddrSpaceCast(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeBufferLoad(MachineInstr &MI, LegalizerHelper &Helper, bool IsFormat, bool IsTyped) const
bool legalizeImplicitArgPtr(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeMinNumMaxNum(LegalizerHelper &Helper, MachineInstr &MI) const
void legalizeUnsignedDIV_REM64Impl(MachineIRBuilder &B, Register DstDivReg, Register DstRemReg, Register Num, Register Den) const
bool legalizeBVHIntrinsic(MachineInstr &MI, MachineIRBuilder &B) const
bool legalizeDebugTrap(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFastUnsafeFDIV(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeSinCos(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeWaveID(MachineInstr &MI, MachineIRBuilder &B) const
bool legalizeFroundeven(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeLDSKernelId(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeSignedDIV_REM(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeITOFP(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B, bool Signed) const
bool legalizeFPow(MachineInstr &MI, MachineIRBuilder &B) const
bool legalizeFastUnsafeFDIV64(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFPTOI(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B, bool Signed) const
bool legalizeStackSave(MachineInstr &MI, MachineIRBuilder &B) const
bool legalizeFlogUnsafe(MachineIRBuilder &B, Register Dst, Register Src, bool IsLog10, unsigned Flags) const
bool legalizeKernargMemParameter(MachineInstr &MI, MachineIRBuilder &B, uint64_t Offset, Align Alignment=Align(4)) const
Legalize a value that's loaded from kernel arguments.
bool legalizeImageIntrinsic(MachineInstr &MI, MachineIRBuilder &B, GISelChangeObserver &Observer, const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr) const
Rewrite image intrinsics to use register layouts expected by the subtarget.
void buildAbsGlobalAddress(Register DstReg, LLT PtrTy, MachineIRBuilder &B, const GlobalValue *GV, MachineRegisterInfo &MRI) const
bool legalizeGetFPEnv(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool getImplicitArgPtr(Register DstReg, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFSQRT(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
Register getKernargParameterPtr(MachineIRBuilder &B, int64_t Offset) const
bool legalizeSBufferLoad(LegalizerHelper &Helper, MachineInstr &MI) const
bool legalizeFceil(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool loadInputValue(Register DstReg, MachineIRBuilder &B, const ArgDescriptor *Arg, const TargetRegisterClass *ArgRC, LLT ArgTy) const
bool legalizeFSQRTF64(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeExtractVectorElt(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeLoad(LegalizerHelper &Helper, MachineInstr &MI) const
bool legalizeCTLZ_ZERO_UNDEF(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
Register fixStoreSourceType(MachineIRBuilder &B, Register VData, LLT MemTy, bool IsFormat) const
bool legalizeLaneOp(LegalizerHelper &Helper, MachineInstr &MI, Intrinsic::ID IID) const
bool legalizeSetFPEnv(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeWorkitemIDIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B, unsigned Dim, AMDGPUFunctionArgInfo::PreloadedValue ArgType) const
bool legalizeTrapHsaQueuePtr(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFlog2(MachineInstr &MI, MachineIRBuilder &B) const
bool legalizeTrapEndpgm(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
static std::optional< uint32_t > getLDSKernelIdMetadata(const Function &F)
void setDynLDSAlign(const Function &F, const GlobalVariable &GV)
unsigned allocateLDSGlobal(const DataLayout &DL, const GlobalVariable &GV)
bool hasMadMacF32Insts() const
bool hasCvtPkF16F32Inst() const
unsigned getMaxWorkitemID(const Function &Kernel, unsigned Dimension) const
Return the maximum workitem ID value in the function, for the given (0, 1, 2) dimension.
bool has16BitInsts() const
bool hasFastFMAF32() const
bool hasTrigReducedRange() const
unsigned getWavefrontSize() const
bool hasVOP3PInsts() const
uint32_t getImplicitParameterOffset(const MachineFunction &MF, const ImplicitParameter Param) const
Helper function that returns the byte offset of the given type of implicit parameter.
static APFloat getSmallestNormalized(const fltSemantics &Sem, bool Negative=false)
Returns the smallest (by magnitude) normalized finite number in the given semantics.
Definition: APFloat.h:1155
static APFloat getLargest(const fltSemantics &Sem, bool Negative=false)
Returns the largest finite number in the given semantics.
Definition: APFloat.h:1135
static APFloat getInf(const fltSemantics &Sem, bool Negative=false)
Factory for Positive and Negative Infinity.
Definition: APFloat.h:1095
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition: ArrayRef.h:41
size_t size() const
size - Get the array size.
Definition: ArrayRef.h:168
@ ICMP_SLT
signed less than
Definition: InstrTypes.h:702
@ FCMP_OLT
0 1 0 0 True if ordered and less than
Definition: InstrTypes.h:679
@ FCMP_OGT
0 0 1 0 True if ordered and greater than
Definition: InstrTypes.h:677
@ ICMP_UGE
unsigned greater or equal
Definition: InstrTypes.h:697
@ ICMP_SGT
signed greater than
Definition: InstrTypes.h:700
@ FCMP_ONE
0 1 1 0 True if ordered and operands are unequal
Definition: InstrTypes.h:681
@ FCMP_OLE
0 1 0 1 True if ordered and less than or equal
Definition: InstrTypes.h:680
@ FCMP_ORD
0 1 1 1 True if ordered (no nans)
Definition: InstrTypes.h:682
@ ICMP_EQ
equal
Definition: InstrTypes.h:694
@ ICMP_NE
not equal
Definition: InstrTypes.h:695
This is the shared class of boolean and integer constants.
Definition: Constants.h:83
int64_t getSExtValue() const
Return the constant as a 64-bit integer value after it has been sign extended as appropriate for the ...
Definition: Constants.h:163
This class represents an Operation in the Expression.
A debug info location.
Definition: DebugLoc.h:33
Diagnostic information for unsupported feature in backend.
static constexpr ElementCount getFixed(ScalarTy MinVal)
Definition: TypeSize.h:311
LLVMContext & getContext() const
getContext - Return a reference to the LLVMContext associated with this function.
Definition: Function.cpp:369
bool hasA16() const
bool hasImageStoreD16Bug() const
bool hasUsableDivScaleConditionOutput() const
Condition output from div_scale is usable.
Definition: GCNSubtarget.h:487
bool hasArchitectedSGPRs() const
bool hasPrivEnabledTrap2NopBug() const
const SIInstrInfo * getInstrInfo() const override
Definition: GCNSubtarget.h:279
bool hasScalarSubwordLoads() const
Definition: GCNSubtarget.h:465
bool hasMadF16() const
bool supportsGetDoorbellID() const
Definition: GCNSubtarget.h:471
bool hasMad64_32() const
Definition: GCNSubtarget.h:755
TrapHandlerAbi getTrapHandlerAbi() const
Definition: GCNSubtarget.h:467
bool hasIntClamp() const
Definition: GCNSubtarget.h:367
bool hasGFX10_AEncoding() const
const SITargetLowering * getTargetLowering() const override
Definition: GCNSubtarget.h:287
bool hasPackedFP32Ops() const
bool hasFullRate64Ops() const
Definition: GCNSubtarget.h:387
bool isTrapHandlerEnabled() const
Definition: GCNSubtarget.h:615
unsigned getNSAThreshold(const MachineFunction &MF) const
bool hasScalarSMulU64() const
Definition: GCNSubtarget.h:744
bool hasNSAEncoding() const
bool hasG16() const
unsigned getNSAMaxSize(bool HasSampler=false) const
bool hasScalarDwordx3Loads() const
Generation getGeneration() const
Definition: GCNSubtarget.h:327
bool hasScalarAddSub64() const
Definition: GCNSubtarget.h:742
bool hasUnpackedD16VMem() const
Definition: GCNSubtarget.h:746
bool hasDPALU_DPP() const
bool hasAddNoCarry() const
Definition: GCNSubtarget.h:738
bool hasFractBug() const
Definition: GCNSubtarget.h:405
bool hasPartialNSAEncoding() const
Abstract class that contains various methods for clients to notify about changes.
virtual void changingInstr(MachineInstr &MI)=0
This instruction is about to be mutated in some way.
virtual void changedInstr(MachineInstr &MI)=0
This instruction was mutated in some way.
KnownBits getKnownBits(Register R)
Simple wrapper observer that takes several observers, and calls each one for each event.
bool hasExternalLinkage() const
Definition: GlobalValue.h:511
Module * getParent()
Get the module that this global value is contained inside of...
Definition: GlobalValue.h:656
Type * getValueType() const
Definition: GlobalValue.h:296
static constexpr LLT float64()
Get a 64-bit IEEE double value.
Definition: LowLevelType.h:94
constexpr unsigned getScalarSizeInBits() const
Definition: LowLevelType.h:264
constexpr bool isScalar() const
Definition: LowLevelType.h:146
constexpr LLT changeElementType(LLT NewEltTy) const
If this type is a vector, return a vector with the same number of elements but the new element type.
Definition: LowLevelType.h:211
static constexpr LLT vector(ElementCount EC, unsigned ScalarSizeInBits)
Get a low-level vector of some number of elements and element width.
Definition: LowLevelType.h:64
constexpr bool isPointerVector() const
Definition: LowLevelType.h:152
static constexpr LLT scalar(unsigned SizeInBits)
Get a low-level scalar or aggregate "bag of bits".
Definition: LowLevelType.h:42
constexpr uint16_t getNumElements() const
Returns the number of elements in a vector LLT.
Definition: LowLevelType.h:159
constexpr bool isVector() const
Definition: LowLevelType.h:148
static constexpr LLT pointer(unsigned AddressSpace, unsigned SizeInBits)
Get a low-level pointer in the given address space.
Definition: LowLevelType.h:57
constexpr TypeSize getSizeInBits() const
Returns the total size of the type. Must only be called on sized types.
Definition: LowLevelType.h:190
constexpr bool isPointer() const
Definition: LowLevelType.h:149
constexpr LLT getElementType() const
Returns the vector's element type. Only valid for vector types.
Definition: LowLevelType.h:277
constexpr ElementCount getElementCount() const
Definition: LowLevelType.h:183
constexpr LLT changeElementSize(unsigned NewEltSize) const
If this type is a vector, return a vector with the same number of elements but the new element size.
Definition: LowLevelType.h:218
static constexpr LLT float16()
Get a 16-bit IEEE half value.
Definition: LowLevelType.h:84
constexpr unsigned getAddressSpace() const
Definition: LowLevelType.h:270
static constexpr LLT fixed_vector(unsigned NumElements, unsigned ScalarSizeInBits)
Get a low-level fixed-width vector of some number of elements and element width.
Definition: LowLevelType.h:100
constexpr bool isPointerOrPointerVector() const
Definition: LowLevelType.h:153
constexpr LLT changeElementCount(ElementCount EC) const
Return a vector or scalar with the same element type and the new element count.
Definition: LowLevelType.h:227
constexpr LLT getScalarType() const
Definition: LowLevelType.h:205
static constexpr LLT scalarOrVector(ElementCount EC, LLT ScalarTy)
Definition: LowLevelType.h:124
static constexpr LLT float32()
Get a 32-bit IEEE float value.
Definition: LowLevelType.h:89
This is an important class for using LLVM in a threaded context.
Definition: LLVMContext.h:67
void diagnose(const DiagnosticInfo &DI)
Report a message to the currently installed diagnostic handler.
LegalizeRuleSet & minScalar(unsigned TypeIdx, const LLT Ty)
Ensure the scalar is at least as wide as Ty.
LegalizeRuleSet & legalFor(std::initializer_list< LLT > Types)
The instruction is legal when type index 0 is any type in the given list.
LegalizeRuleSet & fewerElementsIf(LegalityPredicate Predicate, LegalizeMutation Mutation)
Remove elements to reach the type selected by the mutation if the predicate is true.
LegalizeRuleSet & clampScalarOrElt(unsigned TypeIdx, const LLT MinTy, const LLT MaxTy)
Limit the range of scalar sizes to MinTy and MaxTy.
LegalizeRuleSet & maxScalar(unsigned TypeIdx, const LLT Ty)
Ensure the scalar is at most as wide as Ty.
LegalizeRuleSet & clampMaxNumElements(unsigned TypeIdx, const LLT EltTy, unsigned MaxElements)
Limit the number of elements in EltTy vectors to at most MaxElements.
LegalizeRuleSet & lower()
The instruction is lowered.
LegalizeRuleSet & moreElementsIf(LegalityPredicate Predicate, LegalizeMutation Mutation)
Add more elements to reach the type selected by the mutation if the predicate is true.
LegalizeRuleSet & clampScalar(unsigned TypeIdx, const LLT MinTy, const LLT MaxTy)
Limit the range of scalar sizes to MinTy and MaxTy.
LegalizeRuleSet & custom()
Unconditionally custom lower.
LegalizeRuleSet & clampMaxNumElementsStrict(unsigned TypeIdx, const LLT EltTy, unsigned NumElts)
Express EltTy vectors strictly using vectors with NumElts elements (or scalars when NumElts equals 1)...
LegalizeRuleSet & alwaysLegal()
LegalizeRuleSet & customIf(LegalityPredicate Predicate)
LegalizeRuleSet & widenScalarToNextPow2(unsigned TypeIdx, unsigned MinSize=0)
Widen the scalar to the next power of two that is at least MinSize.
LegalizeRuleSet & scalarize(unsigned TypeIdx)
LegalizeRuleSet & legalIf(LegalityPredicate Predicate)
The instruction is legal if predicate is true.
LegalizeRuleSet & customFor(std::initializer_list< LLT > Types)
LegalizeRuleSet & widenScalarToNextMultipleOf(unsigned TypeIdx, unsigned Size)
Widen the scalar to the next multiple of Size.
LegalizeResult lowerFMinNumMaxNum(MachineInstr &MI)
void moreElementsVectorDst(MachineInstr &MI, LLT MoreTy, unsigned OpIdx)
Legalize a single operand OpIdx of the machine instruction MI as a Def by performing it with addition...
@ Legalized
Instruction has been legalized and the MachineFunction changed.
GISelChangeObserver & Observer
To keep track of changes made by the LegalizerHelper.
void bitcastDst(MachineInstr &MI, LLT CastTy, unsigned OpIdx)
Legalize a single operand OpIdx of the machine instruction MI as a def by inserting a G_BITCAST from ...
LegalizeResult lowerFMad(MachineInstr &MI)
GISelKnownBits * getKnownBits() const
MachineIRBuilder & MIRBuilder
Expose MIRBuilder so clients can set their own RecordInsertInstruction functions.
void widenScalarDst(MachineInstr &MI, LLT WideTy, unsigned OpIdx=0, unsigned TruncOpcode=TargetOpcode::G_TRUNC)
Legalize a single operand OpIdx of the machine instruction MI as a Def by extending the operand's typ...
LegalizeRuleSet & getActionDefinitionsBuilder(unsigned Opcode)
Get the action definition builder for the given opcode.
TypeSize getValue() const
Wrapper class representing physical registers. Should be passed by value.
Definition: MCRegister.h:33
void addSuccessor(MachineBasicBlock *Succ, BranchProbability Prob=BranchProbability::getUnknown())
Add Succ as a successor of this MachineBasicBlock.
MachineBasicBlock * splitAt(MachineInstr &SplitInst, bool UpdateLiveIns=true, LiveIntervals *LIS=nullptr)
Split a basic block into 2 pieces at SplitPoint.
const MachineFunction * getParent() const
Return the MachineFunction containing this basic block.
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
MachineMemOperand * getMachineMemOperand(MachinePointerInfo PtrInfo, MachineMemOperand::Flags f, LLT MemTy, Align base_alignment, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr, SyncScope::ID SSID=SyncScope::System, AtomicOrdering Ordering=AtomicOrdering::NotAtomic, AtomicOrdering FailureOrdering=AtomicOrdering::NotAtomic)
getMachineMemOperand - Allocate a new MachineMemOperand.
DenormalMode getDenormalMode(const fltSemantics &FPType) const
Returns the denormal handling type for the default rounding mode of the function.
void push_back(MachineBasicBlock *MBB)
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
Function & getFunction()
Return the LLVM function that this machine code represents.
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do...
MachineBasicBlock * CreateMachineBasicBlock(const BasicBlock *BB=nullptr, std::optional< UniqueBBID > BBID=std::nullopt)
CreateMachineBasicBlock - Allocate a new MachineBasicBlock.
const TargetMachine & getTarget() const
getTarget - Return the target machine this machine code is compiled with
Helper class to build MachineInstr.
MachineFunction & getMF()
Getter for the function we currently build.
Register getReg(unsigned Idx) const
Get the register for the operand index.
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
const MachineInstrBuilder & addGlobalAddress(const GlobalValue *GV, int64_t Offset=0, unsigned TargetFlags=0) const
const MachineInstrBuilder & addMBB(MachineBasicBlock *MBB, unsigned TargetFlags=0) const
Representation of each machine instruction.
Definition: MachineInstr.h:69
unsigned getOpcode() const
Returns the opcode of this MachineInstr.
Definition: MachineInstr.h:575
const MachineBasicBlock * getParent() const
Definition: MachineInstr.h:347
const MachineOperand & getOperand(unsigned i) const
Definition: MachineInstr.h:585
A description of a memory reference used in the backend.
LocationSize getSize() const
Return the size in bytes of the memory reference.
LLT getMemoryType() const
Return the memory type of the memory reference.
@ MODereferenceable
The memory access is dereferenceable (i.e., doesn't trap).
@ MOLoad
The memory access reads data.
@ MOInvariant
The memory access always returns the same value (or traps).
Align getAlign() const
Return the minimum known alignment in bytes of the actual memory reference.
MachineOperand class - Representation of each machine instruction operand.
MachineBasicBlock * getMBB() const
void setReg(Register Reg)
Change the register this operand corresponds to.
void setMBB(MachineBasicBlock *MBB)
static MachineOperand CreateImm(int64_t Val)
Register getReg() const
getReg - Returns the register number.
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
MutableArrayRef - Represent a mutable reference to an array (0 or more elements consecutively in memo...
Definition: ArrayRef.h:310
MutableArrayRef< T > drop_front(size_t N=1) const
Drop the first N elements of the array.
Definition: ArrayRef.h:390
MutableArrayRef< T > take_front(size_t N=1) const
Return a copy of *this with only the first N elements.
Definition: ArrayRef.h:415
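For example (a hypothetical helper, not from this file), splitting an operand list in place without copying:
  // Peel the first NumAddr registers off Ops; both views alias Ops' storage.
  void splitRegList(MutableArrayRef<Register> Ops, unsigned NumAddr,
                    MutableArrayRef<Register> &AddrOps,
                    MutableArrayRef<Register> &DataOps) {
    AddrOps = Ops.take_front(NumAddr);
    DataOps = Ops.drop_front(NumAddr);
  }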
Wrapper class representing virtual and physical registers.
Definition: Register.h:19
constexpr bool isValid() const
Definition: Register.h:116
constexpr bool isVirtual() const
Return true if the specified register number is in the virtual register namespace.
Definition: Register.h:91
static constexpr bool isPhysicalRegister(unsigned Reg)
Return true if the specified register number is in the physical register namespace.
Definition: Register.h:65
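A short sketch of the usual virtual-vs-physical check (MRI is the current MachineRegisterInfo):
  // Only generic virtual registers have an LLT recorded in MachineRegisterInfo.
  static LLT getTypeIfVirtual(Register Reg, const MachineRegisterInfo &MRI) {
    if (Reg.isValid() && Reg.isVirtual())
      return MRI.getType(Reg);
    return LLT(); // invalid LLT for physical or missing registers
  }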
static unsigned getMaxMUBUFImmOffset(const GCNSubtarget &ST)
MachineBasicBlock * insertSimulatedTrap(MachineRegisterInfo &MRI, MachineBasicBlock &MBB, MachineInstr &MI, const DebugLoc &DL) const
Build instructions that simulate the behavior of an s_trap 2 instruction for hardware (namely,...
This class keeps track of the SPI_SP_INPUT_ADDR config register, which tells the hardware which inter...
SIModeRegisterDefaults getMode() const
std::tuple< const ArgDescriptor *, const TargetRegisterClass *, LLT > getPreloadedValue(AMDGPUFunctionArgInfo::PreloadedValue Value) const
static LLVM_READONLY const TargetRegisterClass * getSGPRClassForBitWidth(unsigned BitWidth)
bool allowsMisalignedMemoryAccessesImpl(unsigned Size, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *IsFast=nullptr) const
bool shouldEmitFixup(const GlobalValue *GV) const
bool shouldUseLDSConstAddress(const GlobalValue *GV) const
bool shouldEmitPCReloc(const GlobalValue *GV) const
size_t size() const
Definition: SmallVector.h:78
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
Definition: SmallVector.h:573
void truncate(size_type N)
Like resize, but requires that N is less than size().
Definition: SmallVector.h:644
void resize(size_type N)
Definition: SmallVector.h:638
void push_back(const T &Elt)
Definition: SmallVector.h:413
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1196
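A hedged sketch of the push_back/truncate pattern; NumParts and NumNeeded are hypothetical counts and MRI is assumed to be in scope:
  SmallVector<Register, 8> Parts;
  for (unsigned I = 0; I != NumParts; ++I)
    Parts.push_back(MRI.createGenericVirtualRegister(LLT::scalar(32)));
  if (Parts.size() > NumNeeded)
    Parts.truncate(NumNeeded); // drop the padding registers collected above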
int64_t getImm() const
Register getReg() const
Register getStackPointerRegisterToSaveRestore() const
If a physical register, this specifies the register that llvm.savestack/llvm.restorestack should save...
TargetOptions Options
unsigned UnsafeFPMath
UnsafeFPMath - This flag is enabled when the -enable-unsafe-fp-math flag is specified on the command ...
The instances of the Type class are immutable: once they are created, they are never changed.
Definition: Type.h:45
A Use represents the edge between a Value definition and its users.
Definition: Use.h:43
StringRef getName() const
Return a constant reference to the value's name.
Definition: Value.cpp:309
constexpr ScalarTy getFixedValue() const
Definition: TypeSize.h:202
self_iterator getIterator()
Definition: ilist_node.h:132
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
@ CONSTANT_ADDRESS_32BIT
Address space for 32-bit constant memory.
@ BUFFER_STRIDED_POINTER
Address space for 192-bit fat buffer pointers with an additional index.
@ REGION_ADDRESS
Address space for region memory. (GDS)
@ LOCAL_ADDRESS
Address space for local memory.
@ CONSTANT_ADDRESS
Address space for constant memory (VTX2).
@ FLAT_ADDRESS
Address space for flat memory.
@ GLOBAL_ADDRESS
Address space for global memory (RAT0, VTX0).
@ BUFFER_FAT_POINTER
Address space for 160-bit buffer fat pointers.
@ PRIVATE_ADDRESS
Address space for private memory.
@ BUFFER_RESOURCE
Address space for 128-bit buffer resources.
int getMIMGOpcode(unsigned BaseOpcode, unsigned MIMGEncoding, unsigned VDataDwords, unsigned VAddrDwords)
LLVM_READNONE bool isLegalDPALU_DPPControl(unsigned DC)
bool isFlatGlobalAddrSpace(unsigned AS)
bool isGFX12Plus(const MCSubtargetInfo &STI)
bool isEntryFunctionCC(CallingConv::ID CC)
LLVM_READNONE bool isKernel(CallingConv::ID CC)
bool isGFX11(const MCSubtargetInfo &STI)
bool isCompute(CallingConv::ID cc)
unsigned getAMDHSACodeObjectVersion(const Module &M)
TargetExtType * isNamedBarrier(const GlobalVariable &GV)
bool isGFX11Plus(const MCSubtargetInfo &STI)
LLVM_READONLY const MIMGBaseOpcodeInfo * getMIMGBaseOpcodeInfo(unsigned BaseOpcode)
const ImageDimIntrinsicInfo * getImageDimIntrinsicInfo(unsigned Intr)
std::pair< Register, unsigned > getBaseWithConstantOffset(MachineRegisterInfo &MRI, Register Reg, GISelKnownBits *KnownBits=nullptr, bool CheckNUW=false)
Returns base register and constant offset.
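For example, folding a constant offset out of an address before forming an addressing mode (AddrReg and MRI are assumed to be in scope):
  auto [BaseReg, ImmOffset] =
      AMDGPU::getBaseWithConstantOffset(MRI, AddrReg);
  // BaseReg is the stripped base; ImmOffset is the folded constant (0 if none).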
@ MaxID
The highest possible ID. Must be some 2^k - 1.
Definition: CallingConv.h:274
@ AMDGPU_Gfx
Used for AMD graphics targets.
Definition: CallingConv.h:232
@ Fast
Attempts to make calls as fast as possible (e.g.
Definition: CallingConv.h:41
@ C
The default llvm calling convention, compatible with C.
Definition: CallingConv.h:34
LegalityPredicate scalarOrEltWiderThan(unsigned TypeIdx, unsigned Size)
True iff the specified type index is a scalar or a vector with an element type that's wider than the ...
LegalityPredicate isScalar(unsigned TypeIdx)
True iff the specified type index is a scalar.
LegalityPredicate isPointer(unsigned TypeIdx)
True iff the specified type index is a pointer (with any address space).
LegalityPredicate typeInSet(unsigned TypeIdx, std::initializer_list< LLT > TypesInit)
True iff the given type index is one of the specified types.
LegalityPredicate smallerThan(unsigned TypeIdx0, unsigned TypeIdx1)
True iff the first type index has a smaller total bit size than second type index.
LegalityPredicate largerThan(unsigned TypeIdx0, unsigned TypeIdx1)
True iff the first type index has a larger total bit size than second type index.
LegalityPredicate elementTypeIs(unsigned TypeIdx, LLT EltTy)
True if the type index is a vector with element type EltTy.
LegalityPredicate sameSize(unsigned TypeIdx0, unsigned TypeIdx1)
True iff the specified type indices are both the same bit size.
LegalityPredicate scalarOrEltNarrowerThan(unsigned TypeIdx, unsigned Size)
True iff the specified type index is a scalar or vector with an element type that's narrower than the...
LegalityPredicate sizeIs(unsigned TypeIdx, unsigned Size)
True if the total bitwidth of the specified type index is Size bits.
LegalityPredicate typeIsNot(unsigned TypeIdx, LLT Type)
True iff the given type index is not the specified type.
Predicate all(Predicate P0, Predicate P1)
True iff P0 and P1 are true.
LegalityPredicate typeIs(unsigned TypeIdx, LLT TypesInit)
True iff the given type index is the specified type.
LegalityPredicate scalarNarrowerThan(unsigned TypeIdx, unsigned Size)
True iff the specified type index is a scalar that's narrower than the given size.
LegalizeMutation scalarize(unsigned TypeIdx)
Break up the vector type for the given type index into the element type.
LegalizeMutation widenScalarOrEltToNextPow2(unsigned TypeIdx, unsigned Min=0)
Widen the scalar type or vector element type for the given type index to the next power of 2.
LegalizeMutation changeTo(unsigned TypeIdx, LLT Ty)
Select this specific type for the given type index.
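A sketch of how these predicate and mutation combinators compose inside a ruleset; the opcode and thresholds are hypothetical, and the LegalityPredicates and LegalizeMutations namespaces are assumed to be imported:
  getActionDefinitionsBuilder(TargetOpcode::G_XOR)
      .legalIf(typeInSet(0, {LLT::scalar(32), LLT::scalar(64)}))
      .widenScalarIf(scalarNarrowerThan(0, 32), changeTo(0, LLT::scalar(32)))
      .scalarize(0);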
@ Implicit
Not emitted register (e.g. carry, or temporary result).
@ ReallyHidden
Definition: CommandLine.h:138
initializer< Ty > init(const Ty &Val)
Definition: CommandLine.h:443
constexpr double inv_pi
Definition: MathExtras.h:54
constexpr double ln2
Definition: MathExtras.h:49
constexpr double ln10
Definition: MathExtras.h:50
constexpr float log2ef
Definition: MathExtras.h:66
constexpr double log2e
Definition: MathExtras.h:51
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:18
Register getFunctionLiveInPhysReg(MachineFunction &MF, const TargetInstrInfo &TII, MCRegister PhysReg, const TargetRegisterClass &RC, const DebugLoc &DL, LLT RegTy=LLT())
Return a virtual register corresponding to the incoming argument register PhysReg.
Definition: Utils.cpp:895
unsigned Log2_32_Ceil(uint32_t Value)
Return the ceil log base 2 of the specified value, 32 if the value is zero.
Definition: MathExtras.h:353
@ Offset
Definition: DWP.cpp:480
Type * getTypeForLLT(LLT Ty, LLVMContext &C)
Get the type back from LLT.
Definition: Utils.cpp:1960
MachineInstr * getOpcodeDef(unsigned Opcode, Register Reg, const MachineRegisterInfo &MRI)
See if Reg is defined by a single def instruction that is Opcode.
Definition: Utils.cpp:630
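For instance, a hypothetical helper that checks whether a value is simply an implicit def:
  static bool isImplicitDef(Register Reg, const MachineRegisterInfo &MRI) {
    return getOpcodeDef(TargetOpcode::G_IMPLICIT_DEF, Reg, MRI) != nullptr;
  }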
int popcount(T Value) noexcept
Count the number of set bits in a value.
Definition: bit.h:385
const ConstantFP * getConstantFPVRegVal(Register VReg, const MachineRegisterInfo &MRI)
Definition: Utils.cpp:444
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
detail::scope_exit< std::decay_t< Callable > > make_scope_exit(Callable &&F)
Definition: ScopeExit.h:59
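A minimal usage sketch: run cleanup on every exit path of the enclosing scope (InsideLegalizer is a hypothetical flag):
  InsideLegalizer = true;
  auto Restore = make_scope_exit([&] { InsideLegalizer = false; });
  // ... any early return below still resets InsideLegalizer ...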
const llvm::fltSemantics & getFltSemanticForLLT(LLT Ty)
Get the appropriate floating point arithmetic semantic based on the bit size of the given scalar LLT.
void * PointerTy
Definition: GenericValue.h:21
int bit_width(T Value)
Returns the number of bits needed to represent Value if Value is nonzero.
Definition: bit.h:317
uint64_t PowerOf2Ceil(uint64_t A)
Returns the power of two which is greater than or equal to the given value.
Definition: MathExtras.h:394
std::optional< int64_t > getIConstantVRegSExtVal(Register VReg, const MachineRegisterInfo &MRI)
If VReg is defined by a G_CONSTANT that fits in int64_t, returns it.
Definition: Utils.cpp:299
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
Definition: MathExtras.h:291
std::function< std::pair< unsigned, LLT >(const LegalityQuery &)> LegalizeMutation
constexpr T divideCeil(U Numerator, V Denominator)
Returns the integer ceil(Numerator / Denominator).
Definition: MathExtras.h:403
@ Mul
Product of integers.
@ FMul
Product of floats.
@ Add
Sum of integers.
void eraseInstr(MachineInstr &MI, MachineRegisterInfo &MRI, LostDebugLocObserver *LocObserver=nullptr)
Definition: Utils.cpp:1651
@ DS_Warning
std::optional< ValueAndVReg > getIConstantVRegValWithLookThrough(Register VReg, const MachineRegisterInfo &MRI, bool LookThroughInstrs=true)
If VReg is defined by a statically evaluable chain of instructions rooted on a G_CONSTANT returns its...
Definition: Utils.cpp:418
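A hedged helper sketch showing the usual constant-folding check (not from this file):
  // True if Reg is a constant zero, looking through copies/extensions to the
  // defining G_CONSTANT.
  static bool isConstantZero(Register Reg, const MachineRegisterInfo &MRI) {
    if (auto C = getIConstantVRegValWithLookThrough(Reg, MRI))
      return C->Value.isZero(); // C->VReg is the G_CONSTANT's def register
    return false;
  }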
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Definition: STLExtras.h:1903
Align commonAlignment(Align A, uint64_t Offset)
Returns the alignment that satisfies both alignments.
Definition: Alignment.h:212
unsigned Log2(Align A)
Returns the log2 of the alignment.
Definition: Alignment.h:208
T bit_floor(T Value)
Returns the largest integral power of two no greater than Value if Value is nonzero.
Definition: bit.h:327
constexpr uint64_t NextPowerOf2(uint64_t A)
Returns the next power of two (in 64 bits) that is strictly greater than A.
Definition: MathExtras.h:382
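Quick sanity checks for the math helpers listed above (values chosen purely for illustration):
  assert(Log2_32_Ceil(24u) == 5);  // ceil(log2(24)); returns 32 for an input of 0
  assert(PowerOf2Ceil(24) == 32);  // smallest power of two >= 24
  assert(NextPowerOf2(32) == 64);  // strictly greater than the input
  assert(isPowerOf2_32(32u));
  assert(divideCeil(70, 32) == 3);
  assert(bit_width(24u) == 5);     // 24 is 0b11000
  assert(bit_floor(24u) == 16u);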
@ Enable
Enable colors.
std::function< bool(const LegalityQuery &)> LegalityPredicate
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition: BitVector.h:860
#define N
static constexpr uint64_t encode(Fields... Values)
static const fltSemantics & IEEEsingle() LLVM_READNONE
Definition: APFloat.cpp:265
static const fltSemantics & IEEEdouble() LLVM_READNONE
Definition: APFloat.cpp:266
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition: Alignment.h:39
uint64_t value() const
This is a hole in the type system and should not be abused.
Definition: Alignment.h:85
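For example, the alignment actually known for an access at byte offset 20 from a 16-byte aligned base:
  Align BaseAlign(16);
  Align KnownAlign = commonAlignment(BaseAlign, /*Offset=*/20); // Align(4)
  unsigned LogAlign = Log2(KnownAlign);                         // 2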
MCRegister getRegister() const
static ArgDescriptor createRegister(Register Reg, unsigned Mask=~0u)
DenormalModeKind Input
Denormal treatment kind for floating point instruction inputs in the default floating-point environme...
@ PreserveSign
The sign of a flushed-to-zero number is preserved in the sign of 0.
@ Dynamic
Denormals have unknown treatment.
static constexpr DenormalMode getPreserveSign()
static constexpr DenormalMode getIEEE()
bool isZero() const
Returns true if value is all zero.
Definition: KnownBits.h:79
The LegalityQuery object bundles together all the information that's needed to decide whether a given...
ArrayRef< MemDesc > MMODescrs
Operations which require memory can use this to place requirements on the memory type for each MMO.
ArrayRef< LLT > Types
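A sketch of a custom predicate written directly against LegalityQuery (hypothetical, not one of this file's predicates): true when the given type index is a vector whose first memory operand is at least naturally aligned.
  static LegalityPredicate vectorWithNaturalAlign(unsigned TypeIdx) {
    return [=](const LegalityQuery &Query) {
      const LLT Ty = Query.Types[TypeIdx];
      if (!Ty.isVector() || Query.MMODescrs.empty())
        return false;
      const LegalityQuery::MemDesc &MMO = Query.MMODescrs[0];
      return MMO.AlignInBits >= MMO.MemoryTy.getSizeInBits();
    };
  }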
Matching combinators.
This class contains a discriminated union of information about pointers in memory operands,...
static MachinePointerInfo getGOT(MachineFunction &MF)
Return a MachinePointerInfo record that refers to a GOT entry.
DenormalMode FP64FP16Denormals
If this is set, neither input nor output denormals are flushed for both f64 and f16/v2f16 instructions...
bool IEEE
Floating point opcodes that support exception flag gathering, quiet and propagate signaling NaN inputs...
DenormalMode FP32Denormals
If this is set, neither input nor output denormals are flushed for most f32 instructions.