1//===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8/// \file
9/// This file implements the targeting of the MachineLegalizer class for
10/// AMDGPU.
11/// \todo This should be generated by TableGen.
12//===----------------------------------------------------------------------===//
13
14#include "AMDGPULegalizerInfo.h"
15
16#include "AMDGPU.h"
18#include "AMDGPUInstrInfo.h"
19#include "AMDGPUMemoryUtils.h"
20#include "AMDGPUTargetMachine.h"
22#include "SIInstrInfo.h"
24#include "SIRegisterInfo.h"
26#include "llvm/ADT/ScopeExit.h"
37#include "llvm/IR/IntrinsicsAMDGPU.h"
38#include "llvm/IR/IntrinsicsR600.h"
39
40#define DEBUG_TYPE "amdgpu-legalinfo"
41
42using namespace llvm;
43using namespace LegalizeActions;
44using namespace LegalizeMutations;
45using namespace LegalityPredicates;
46using namespace MIPatternMatch;
47
48// Hack until load/store selection patterns support any tuple of legal types.
49static cl::opt<bool> EnableNewLegality(
50 "amdgpu-global-isel-new-legality",
51 cl::desc("Use GlobalISel desired legality, rather than try to use "
52 "rules compatible with selection patterns"),
53 cl::init(false),
54 cl::ReallyHidden);
55
56static constexpr unsigned MaxRegisterSize = 1024;
57
58// Round the number of elements up to the next power of two.
59static LLT getPow2VectorType(LLT Ty) {
60 unsigned NElts = Ty.getNumElements();
61 unsigned Pow2NElts = 1 << Log2_32_Ceil(NElts);
62 return Ty.changeElementCount(ElementCount::getFixed(Pow2NElts));
63}
64
65// Round the number of bits to the next power of two bits
66static LLT getPow2ScalarType(LLT Ty) {
67 unsigned Bits = Ty.getSizeInBits();
68 unsigned Pow2Bits = 1 << Log2_32_Ceil(Bits);
69 return LLT::scalar(Pow2Bits);
70}
71
72/// \returns true if this is an odd sized vector which should widen by adding an
73/// additional element. This is mostly to handle <3 x s16> -> <4 x s16>. This
74/// excludes s1 vectors, which should always be scalarized.
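/// For illustration: <3 x s16> is 48 bits, so it is widened to <4 x s16>;
/// <3 x s32> is already a multiple of 32 bits and is left alone, and s1
/// vectors such as <3 x s1> fall through to scalarization instead.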
75static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
76 return [=](const LegalityQuery &Query) {
77 const LLT Ty = Query.Types[TypeIdx];
78 if (!Ty.isVector())
79 return false;
80
81 const LLT EltTy = Ty.getElementType();
82 const unsigned EltSize = EltTy.getSizeInBits();
83 return Ty.getNumElements() % 2 != 0 &&
84 EltSize > 1 && EltSize < 32 &&
85 Ty.getSizeInBits() % 32 != 0;
86 };
87}
88
89static LegalityPredicate sizeIsMultipleOf32(unsigned TypeIdx) {
90 return [=](const LegalityQuery &Query) {
91 const LLT Ty = Query.Types[TypeIdx];
92 return Ty.getSizeInBits() % 32 == 0;
93 };
94}
95
96static LegalityPredicate isWideVec16(unsigned TypeIdx) {
97 return [=](const LegalityQuery &Query) {
98 const LLT Ty = Query.Types[TypeIdx];
99 const LLT EltTy = Ty.getScalarType();
100 return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2;
101 };
102}
103
104static LegalizeMutation oneMoreElement(unsigned TypeIdx) {
105 return [=](const LegalityQuery &Query) {
106 const LLT Ty = Query.Types[TypeIdx];
107 const LLT EltTy = Ty.getElementType();
108 return std::pair(TypeIdx,
109 LLT::fixed_vector(Ty.getNumElements() + 1, EltTy));
110 };
111}
112
113static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) {
114 return [=](const LegalityQuery &Query) {
115 const LLT Ty = Query.Types[TypeIdx];
116 const LLT EltTy = Ty.getElementType();
117 unsigned Size = Ty.getSizeInBits();
118 unsigned Pieces = (Size + 63) / 64;
119 unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
120 return std::pair(TypeIdx, LLT::scalarOrVector(
121 ElementCount::getFixed(NewNumElts), EltTy));
122 };
123}
124
125// Increase the number of vector elements so the total size reaches the next
126// multiple of 32 bits.
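// For illustration: <3 x s8> (24 bits) becomes <4 x s8> (32 bits), and
// <5 x s16> (80 bits) becomes <6 x s16> (96 bits).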
127static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) {
128 return [=](const LegalityQuery &Query) {
129 const LLT Ty = Query.Types[TypeIdx];
130
131 const LLT EltTy = Ty.getElementType();
132 const int Size = Ty.getSizeInBits();
133 const int EltSize = EltTy.getSizeInBits();
134 const int NextMul32 = (Size + 31) / 32;
135
136 assert(EltSize < 32);
137
138 const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize;
139 return std::pair(TypeIdx, LLT::fixed_vector(NewNumElts, EltTy));
140 };
141}
142
143// Retrieves the scalar type that's the same size as the mem desc
145 return [=](const LegalityQuery &Query) {
146 unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
147 return std::make_pair(TypeIdx, LLT::scalar(MemSize));
148 };
149}
150
151// Increase the number of vector elements to reach the next legal RegClass.
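// For illustration (assuming the SGPR classes currently defined, which stop at
// 384 bits before jumping to 512): a <13 x s32> value (416 bits) would be
// widened to <16 x s32> (512 bits), the next width with a matching class.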
152static LegalizeMutation moreElementsToNextExistingRegClass(unsigned TypeIdx) {
153 return [=](const LegalityQuery &Query) {
154 const LLT Ty = Query.Types[TypeIdx];
155 const unsigned NumElts = Ty.getNumElements();
156 const unsigned EltSize = Ty.getElementType().getSizeInBits();
157 const unsigned MaxNumElts = MaxRegisterSize / EltSize;
158
159 assert(EltSize == 32 || EltSize == 64);
160 assert(Ty.getSizeInBits() < MaxRegisterSize);
161
162 unsigned NewNumElts;
163 // Find the nearest legal RegClass that is larger than the current type.
164 for (NewNumElts = NumElts; NewNumElts < MaxNumElts; ++NewNumElts) {
165 if (SIRegisterInfo::getSGPRClassForBitWidth(NewNumElts * EltSize))
166 break;
167 }
168 return std::pair(TypeIdx,
169 LLT::fixed_vector(NewNumElts, Ty.getElementType()));
170 };
171}
172
173static LLT getBufferRsrcScalarType(const LLT Ty) {
174 if (!Ty.isVector())
175 return LLT::scalar(128);
176 const ElementCount NumElems = Ty.getElementCount();
177 return LLT::vector(NumElems, LLT::scalar(128));
178}
179
180static LLT getBufferRsrcRegisterType(const LLT Ty) {
181 if (!Ty.isVector())
182 return LLT::fixed_vector(4, LLT::scalar(32));
183 const unsigned NumElems = Ty.getElementCount().getFixedValue();
184 return LLT::fixed_vector(NumElems * 4, LLT::scalar(32));
185}
186
187static LLT getBitcastRegisterType(const LLT Ty) {
188 const unsigned Size = Ty.getSizeInBits();
189
190 if (Size <= 32) {
191 // <2 x s8> -> s16
192 // <4 x s8> -> s32
193 return LLT::scalar(Size);
194 }
195
196 return LLT::scalarOrVector(ElementCount::getFixed(Size / 32), 32);
197}
198
199static LegalizeMutation bitcastToRegisterType(unsigned TypeIdx) {
200 return [=](const LegalityQuery &Query) {
201 const LLT Ty = Query.Types[TypeIdx];
202 return std::pair(TypeIdx, getBitcastRegisterType(Ty));
203 };
204}
205
207 return [=](const LegalityQuery &Query) {
208 const LLT Ty = Query.Types[TypeIdx];
209 unsigned Size = Ty.getSizeInBits();
210 assert(Size % 32 == 0);
211 return std::pair(
212 TypeIdx, LLT::scalarOrVector(ElementCount::getFixed(Size / 32), 32));
213 };
214}
215
216static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) {
217 return [=](const LegalityQuery &Query) {
218 const LLT QueryTy = Query.Types[TypeIdx];
219 return QueryTy.isVector() && QueryTy.getSizeInBits() < Size;
220 };
221}
222
223static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) {
224 return [=](const LegalityQuery &Query) {
225 const LLT QueryTy = Query.Types[TypeIdx];
226 return QueryTy.isVector() && QueryTy.getSizeInBits() > Size;
227 };
228}
229
230static LegalityPredicate numElementsNotEven(unsigned TypeIdx) {
231 return [=](const LegalityQuery &Query) {
232 const LLT QueryTy = Query.Types[TypeIdx];
233 return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0;
234 };
235}
236
237static bool isRegisterSize(const GCNSubtarget &ST, unsigned Size) {
238 return ((ST.useRealTrue16Insts() && Size == 16) || Size % 32 == 0) &&
239 Size <= MaxRegisterSize;
240}
241
242static bool isRegisterVectorElementType(LLT EltTy) {
243 const int EltSize = EltTy.getSizeInBits();
244 return EltSize == 16 || EltSize % 32 == 0;
245}
246
247static bool isRegisterVectorType(LLT Ty) {
248 const int EltSize = Ty.getElementType().getSizeInBits();
249 return EltSize == 32 || EltSize == 64 ||
250 (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
251 EltSize == 128 || EltSize == 256;
252}
253
254// TODO: replace all uses of isRegisterType with isRegisterClassType
255static bool isRegisterType(const GCNSubtarget &ST, LLT Ty) {
256 if (!isRegisterSize(ST, Ty.getSizeInBits()))
257 return false;
258
259 if (Ty.isVector())
260 return isRegisterVectorType(Ty);
261
262 return true;
263}
264
265// Any combination of 32 or 64-bit elements up to the maximum register size, and
266// multiples of v2s16.
267static LegalityPredicate isRegisterType(const GCNSubtarget &ST,
268 unsigned TypeIdx) {
269 return [=, &ST](const LegalityQuery &Query) {
270 return isRegisterType(ST, Query.Types[TypeIdx]);
271 };
272}
273
274// RegisterType that doesn't have a corresponding RegClass.
275// TODO: Once `isRegisterType` is replaced with `isRegisterClassType` this
276// should be removed.
277static LegalityPredicate isIllegalRegisterType(const GCNSubtarget &ST,
278 unsigned TypeIdx) {
279 return [=, &ST](const LegalityQuery &Query) {
280 LLT Ty = Query.Types[TypeIdx];
281 return isRegisterType(ST, Ty) &&
282 !SIRegisterInfo::getSGPRClassForBitWidth(Ty.getSizeInBits());
283 };
284}
285
286static LegalityPredicate elementTypeIsLegal(unsigned TypeIdx) {
287 return [=](const LegalityQuery &Query) {
288 const LLT QueryTy = Query.Types[TypeIdx];
289 if (!QueryTy.isVector())
290 return false;
291 const LLT EltTy = QueryTy.getElementType();
292 return EltTy == LLT::scalar(16) || EltTy.getSizeInBits() >= 32;
293 };
294}
295
296constexpr LLT S1 = LLT::scalar(1);
297constexpr LLT S8 = LLT::scalar(8);
298constexpr LLT S16 = LLT::scalar(16);
299constexpr LLT S32 = LLT::scalar(32);
300constexpr LLT F32 = LLT::scalar(32); // TODO: Expected float32
301constexpr LLT S64 = LLT::scalar(64);
302constexpr LLT F64 = LLT::scalar(64); // TODO: Expected float64
303constexpr LLT S96 = LLT::scalar(96);
304constexpr LLT S128 = LLT::scalar(128);
305constexpr LLT S160 = LLT::scalar(160);
306constexpr LLT S192 = LLT::scalar(192);
307constexpr LLT S224 = LLT::scalar(224);
308constexpr LLT S256 = LLT::scalar(256);
309constexpr LLT S512 = LLT::scalar(512);
310constexpr LLT S1024 = LLT::scalar(1024);
312
313constexpr LLT V2S8 = LLT::fixed_vector(2, 8);
314constexpr LLT V2S16 = LLT::fixed_vector(2, 16);
315constexpr LLT V4S16 = LLT::fixed_vector(4, 16);
316constexpr LLT V6S16 = LLT::fixed_vector(6, 16);
317constexpr LLT V8S16 = LLT::fixed_vector(8, 16);
318constexpr LLT V10S16 = LLT::fixed_vector(10, 16);
319constexpr LLT V12S16 = LLT::fixed_vector(12, 16);
320constexpr LLT V16S16 = LLT::fixed_vector(16, 16);
321
322// TODO: Expected LLT::fixed_vector(2, LLT::float16())
323constexpr LLT V2F16 = LLT::fixed_vector(2, 16);
324constexpr LLT V2BF16 = V2F16; // FIXME
325
326constexpr LLT V2S32 = LLT::fixed_vector(2, 32);
327constexpr LLT V3S32 = LLT::fixed_vector(3, 32);
328constexpr LLT V4S32 = LLT::fixed_vector(4, 32);
329constexpr LLT V5S32 = LLT::fixed_vector(5, 32);
330constexpr LLT V6S32 = LLT::fixed_vector(6, 32);
331constexpr LLT V7S32 = LLT::fixed_vector(7, 32);
332constexpr LLT V8S32 = LLT::fixed_vector(8, 32);
333constexpr LLT V9S32 = LLT::fixed_vector(9, 32);
334constexpr LLT V10S32 = LLT::fixed_vector(10, 32);
335constexpr LLT V11S32 = LLT::fixed_vector(11, 32);
336constexpr LLT V12S32 = LLT::fixed_vector(12, 32);
337constexpr LLT V16S32 = LLT::fixed_vector(16, 32);
338constexpr LLT V32S32 = LLT::fixed_vector(32, 32);
339
340constexpr LLT V2S64 = LLT::fixed_vector(2, 64);
341constexpr LLT V3S64 = LLT::fixed_vector(3, 64);
342constexpr LLT V4S64 = LLT::fixed_vector(4, 64);
343constexpr LLT V5S64 = LLT::fixed_vector(5, 64);
344constexpr LLT V6S64 = LLT::fixed_vector(6, 64);
345constexpr LLT V7S64 = LLT::fixed_vector(7, 64);
346constexpr LLT V8S64 = LLT::fixed_vector(8, 64);
347constexpr LLT V16S64 = LLT::fixed_vector(16, 64);
348
349constexpr LLT V2S128 = LLT::fixed_vector(2, 128);
350constexpr LLT V4S128 = LLT::fixed_vector(4, 128);
351
352constexpr std::initializer_list<LLT> AllScalarTypes = {
354
355constexpr std::initializer_list<LLT> AllS16Vectors{
357
358constexpr std::initializer_list<LLT> AllS32Vectors = {
361
362constexpr std::initializer_list<LLT> AllS64Vectors = {
364
370
371// Checks whether a type is in the list of legal register types.
372static bool isRegisterClassType(const GCNSubtarget &ST, LLT Ty) {
373 if (Ty.isPointerOrPointerVector())
374 Ty = Ty.changeElementType(LLT::scalar(Ty.getScalarSizeInBits()));
375
378 (ST.useRealTrue16Insts() && Ty == S16) ||
380}
381
382static LegalityPredicate isRegisterClassType(const GCNSubtarget &ST,
383 unsigned TypeIdx) {
384 return [&ST, TypeIdx](const LegalityQuery &Query) {
385 return isRegisterClassType(ST, Query.Types[TypeIdx]);
386 };
387}
388
389// If we have a truncating store or an extending load with a data size larger
390// than 32 bits, we need to reduce to a 32-bit type.
391static LegalityPredicate isWideScalarExtLoadTruncStore(unsigned TypeIdx) {
392 return [=](const LegalityQuery &Query) {
393 const LLT Ty = Query.Types[TypeIdx];
394 return !Ty.isVector() && Ty.getSizeInBits() > 32 &&
395 Query.MMODescrs[0].MemoryTy.getSizeInBits() < Ty.getSizeInBits();
396 };
397}
398
399// If we have a truncating store or an extending load with a data size larger
400// than 32 bits and the memory size is a power of two.
401static LegalityPredicate isTruncStoreToSizePowerOf2(unsigned TypeIdx) {
402 return [=](const LegalityQuery &Query) {
403 unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
404 return isWideScalarExtLoadTruncStore(TypeIdx)(Query) &&
405 isPowerOf2_64(MemSize);
406 };
407}
408
409// TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we
410// handle some operations by just promoting the register during
411// selection. There are also d16 loads on GFX9+ which preserve the high bits.
412static unsigned maxSizeForAddrSpace(const GCNSubtarget &ST, unsigned AS,
413 bool IsLoad, bool IsAtomic) {
414 switch (AS) {
416 // FIXME: Private element size.
417 return ST.hasFlatScratchEnabled() ? 128 : 32;
419 return ST.useDS128() ? 128 : 64;
424 // Treat constant and global as identical. SMRD loads are sometimes usable for
425 // global loads (ideally constant address space should be eliminated)
426 // depending on the context. Legality cannot be context dependent, but
427 // RegBankSelect can split the load as necessary depending on the pointer
428 // register bank/uniformity and if the memory is invariant or not written in a
429 // kernel.
430 return IsLoad ? 512 : 128;
431 default:
432 // FIXME: Flat addresses may contextually need to be split to 32-bit parts
433 // if they may alias scratch depending on the subtarget. This needs to be
434 // moved to custom handling to use addressMayBeAccessedAsPrivate
435 return ST.hasMultiDwordFlatScratchAddressing() || IsAtomic ? 128 : 32;
436 }
437}
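// For illustration: under these limits a 512-bit load from the constant or
// global address space can stay a single access, while a store in those
// spaces is capped at 128 bits per access.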
438
439static bool isLoadStoreSizeLegal(const GCNSubtarget &ST,
440 const LegalityQuery &Query) {
441 const LLT Ty = Query.Types[0];
442
443 // Handle G_LOAD, G_ZEXTLOAD, G_SEXTLOAD
444 const bool IsLoad = Query.Opcode != AMDGPU::G_STORE;
445
446 unsigned RegSize = Ty.getSizeInBits();
447 uint64_t MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
448 uint64_t AlignBits = Query.MMODescrs[0].AlignInBits;
449 unsigned AS = Query.Types[1].getAddressSpace();
450
451 // All of these need to be custom lowered to cast the pointer operand.
453 return false;
454
455 // Do not handle extending vector loads.
456 if (Ty.isVector() && MemSize != RegSize)
457 return false;
458
459 // TODO: We should be able to widen loads if the alignment is high enough, but
460 // we also need to modify the memory access size.
461#if 0
462 // Accept widening loads based on alignment.
463 if (IsLoad && MemSize < Size)
464 MemSize = std::max(MemSize, Align);
465#endif
466
467 // Only 1-byte and 2-byte to 32-bit extloads are valid.
468 if (MemSize != RegSize && RegSize != 32)
469 return false;
470
471 if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad,
472 Query.MMODescrs[0].Ordering !=
474 return false;
475
476 switch (MemSize) {
477 case 8:
478 case 16:
479 case 32:
480 case 64:
481 case 128:
482 break;
483 case 96:
484 if (!ST.hasDwordx3LoadStores())
485 return false;
486 break;
487 case 256:
488 case 512:
489 // These may contextually need to be broken down.
490 break;
491 default:
492 return false;
493 }
494
495 assert(RegSize >= MemSize);
496
497 if (AlignBits < MemSize) {
498 const SITargetLowering *TLI = ST.getTargetLowering();
499 if (!TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS,
500 Align(AlignBits / 8)))
501 return false;
502 }
503
504 return true;
505}
506
507// The newer buffer intrinsic forms take their resource arguments as
508// pointers in address space 8, aka s128 values. However, in order to not break
509// SelectionDAG, the underlying operations have to continue to take v4i32
510// arguments. Therefore, we convert resource pointers - or vectors of them -
511// to integer values here.
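// For illustration: a scalar p8 resource is rewritten as a <4 x s32>, and a
// <2 x p8> vector of resources as an <8 x s32>.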
512static bool hasBufferRsrcWorkaround(const LLT Ty) {
513 if (Ty.isPointer() && Ty.getAddressSpace() == AMDGPUAS::BUFFER_RESOURCE)
514 return true;
515 if (Ty.isVector()) {
516 const LLT ElemTy = Ty.getElementType();
517 return hasBufferRsrcWorkaround(ElemTy);
518 }
519 return false;
520}
521
522// The current selector can't handle <6 x s16>, <8 x s16>, s96, s128, etc., so
523// work around this. Eventually it should ignore the type for loads and only care
524// about the size. Return true in cases where we will work around this for now by
525// bitcasting.
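// For illustration: an <8 x s16> load would be bitcast to <4 x s32> and an s96
// load to <3 x s32> (see getBitcastRegisterType above), so existing selection
// patterns can handle them.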
526static bool loadStoreBitcastWorkaround(const LLT Ty) {
527 if (EnableNewLegality)
528 return false;
529
530 const unsigned Size = Ty.getSizeInBits();
531 if (Ty.isPointerVector())
532 return true;
533 if (Size <= 64)
534 return false;
535 // Address space 8 pointers get their own workaround.
536 if (hasBufferRsrcWorkaround(Ty))
537 return false;
538 if (!Ty.isVector())
539 return true;
540
541 unsigned EltSize = Ty.getScalarSizeInBits();
542 return EltSize != 32 && EltSize != 64;
543}
544
545static bool isLoadStoreLegal(const GCNSubtarget &ST, const LegalityQuery &Query) {
546 const LLT Ty = Query.Types[0];
547 return isRegisterType(ST, Ty) && isLoadStoreSizeLegal(ST, Query) &&
548 !hasBufferRsrcWorkaround(Ty) && !loadStoreBitcastWorkaround(Ty);
549}
550
551/// Return true if a load or store of the type should be lowered with a bitcast
552/// to a different type.
553static bool shouldBitcastLoadStoreType(const GCNSubtarget &ST, const LLT Ty,
554 const LLT MemTy) {
555 const unsigned MemSizeInBits = MemTy.getSizeInBits();
556 const unsigned Size = Ty.getSizeInBits();
557 if (Size != MemSizeInBits)
558 return Size <= 32 && Ty.isVector();
559
560 if (loadStoreBitcastWorkaround(Ty) && isRegisterType(ST, Ty))
561 return true;
562
563 // Don't try to handle bitcasting vector ext loads for now.
564 return Ty.isVector() && (!MemTy.isVector() || MemTy == Ty) &&
565 (Size <= 32 || isRegisterSize(ST, Size)) &&
566 !isRegisterVectorElementType(Ty.getElementType());
567}
568
569/// Return true if we should legalize a load by widening an odd-sized memory
570/// access up to the alignment. Note that in this case the memory access itself
571/// changes, not the size of the result register.
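/// For illustration: a 96-bit load known to be 128-bit aligned may be widened
/// to a 128-bit load on subtargets without dwordx3 load/store, since the extra
/// bytes are dereferenceable up to the alignment.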
572static bool shouldWidenLoad(const GCNSubtarget &ST, LLT MemoryTy,
573 uint64_t AlignInBits, unsigned AddrSpace,
574 unsigned Opcode) {
575 unsigned SizeInBits = MemoryTy.getSizeInBits();
576 // We don't want to widen cases that are naturally legal.
577 if (isPowerOf2_32(SizeInBits))
578 return false;
579
580 // If we have 96-bit memory operations, we shouldn't touch them. Note we may
581 // end up widening these for a scalar load during RegBankSelect, if we don't
582 // have 96-bit scalar loads.
583 if (SizeInBits == 96 && ST.hasDwordx3LoadStores())
584 return false;
585
586 if (SizeInBits >= maxSizeForAddrSpace(ST, AddrSpace, Opcode, false))
587 return false;
588
589 // A load is known dereferenceable up to the alignment, so it's legal to widen
590 // to it.
591 //
592 // TODO: Could check dereferenceable for less aligned cases.
593 unsigned RoundedSize = NextPowerOf2(SizeInBits);
594 if (AlignInBits < RoundedSize)
595 return false;
596
597 // Do not widen if it would introduce a slow unaligned load.
598 const SITargetLowering *TLI = ST.getTargetLowering();
599 unsigned Fast = 0;
600 return TLI->allowsMisalignedMemoryAccessesImpl(
601 RoundedSize, AddrSpace, Align(AlignInBits / 8),
602 MachineMemOperand::MOLoad, &Fast) &&
603 Fast;
604}
605
606static bool shouldWidenLoad(const GCNSubtarget &ST, const LegalityQuery &Query,
607 unsigned Opcode) {
608 if (Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic)
609 return false;
610
611 return shouldWidenLoad(ST, Query.MMODescrs[0].MemoryTy,
612 Query.MMODescrs[0].AlignInBits,
613 Query.Types[1].getAddressSpace(), Opcode);
614}
615
616/// Mutates IR (typically a load instruction) to use a <4 x s32> as the initial
617/// type of the operand `idx` and then to transform it to a `p8` via bitcasts
618/// and inttoptr. In addition, handle vectors of p8. Returns the new type.
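/// For illustration: for a p8 def this makes the instruction produce a
/// <4 x s32> instead, then rebuilds the original p8 (or vector of p8) value
/// from that register immediately after the instruction.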
619static LLT castBufferRsrcFromV4I32(MachineInstr &MI, MachineIRBuilder &B,
620 MachineRegisterInfo &MRI, unsigned Idx) {
621 MachineOperand &MO = MI.getOperand(Idx);
622
623 const LLT PointerTy = MRI.getType(MO.getReg());
624
625 // Paranoidly prevent us from doing this multiple times.
626 if (!hasBufferRsrcWorkaround(PointerTy))
627 return PointerTy;
628
629 const LLT ScalarTy = getBufferRsrcScalarType(PointerTy);
630 const LLT VectorTy = getBufferRsrcRegisterType(PointerTy);
631 if (!PointerTy.isVector()) {
632 // Happy path: (4 x s32) -> (s32, s32, s32, s32) -> (p8)
633 const unsigned NumParts = PointerTy.getSizeInBits() / 32;
634 const LLT S32 = LLT::scalar(32);
635
636 Register VectorReg = MRI.createGenericVirtualRegister(VectorTy);
637 std::array<Register, 4> VectorElems;
638 B.setInsertPt(B.getMBB(), ++B.getInsertPt());
639 for (unsigned I = 0; I < NumParts; ++I)
640 VectorElems[I] =
641 B.buildExtractVectorElementConstant(S32, VectorReg, I).getReg(0);
642 B.buildMergeValues(MO, VectorElems);
643 MO.setReg(VectorReg);
644 return VectorTy;
645 }
646 Register BitcastReg = MRI.createGenericVirtualRegister(VectorTy);
647 B.setInsertPt(B.getMBB(), ++B.getInsertPt());
648 auto Scalar = B.buildBitcast(ScalarTy, BitcastReg);
649 B.buildIntToPtr(MO, Scalar);
650 MO.setReg(BitcastReg);
651
652 return VectorTy;
653}
654
655/// Cast a buffer resource (an address space 8 pointer) into a 4xi32, which is
656/// the form in which the value must be in order to be passed to the low-level
657/// representations used for MUBUF/MTBUF intrinsics. This is a hack, which is
658/// needed in order to account for the fact that we can't define a register
659/// class for s128 without breaking SelectionDAG.
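/// For illustration: a scalar p8 is unmerged into four s32 values and
/// reassembled with G_BUILD_VECTOR, while vectors of p8 go through G_PTRTOINT
/// followed by a bitcast to the corresponding <4N x s32> type.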
660static Register castBufferRsrcToV4I32(Register Pointer, MachineIRBuilder &B) {
661 MachineRegisterInfo &MRI = *B.getMRI();
662 const LLT PointerTy = MRI.getType(Pointer);
663 const LLT ScalarTy = getBufferRsrcScalarType(PointerTy);
664 const LLT VectorTy = getBufferRsrcRegisterType(PointerTy);
665
666 if (!PointerTy.isVector()) {
667 // Special case: p8 -> (s32, s32, s32, s32) -> (4xs32)
668 SmallVector<Register, 4> PointerParts;
669 const unsigned NumParts = PointerTy.getSizeInBits() / 32;
670 auto Unmerged = B.buildUnmerge(LLT::scalar(32), Pointer);
671 for (unsigned I = 0; I < NumParts; ++I)
672 PointerParts.push_back(Unmerged.getReg(I));
673 return B.buildBuildVector(VectorTy, PointerParts).getReg(0);
674 }
675 Register Scalar = B.buildPtrToInt(ScalarTy, Pointer).getReg(0);
676 return B.buildBitcast(VectorTy, Scalar).getReg(0);
677}
678
679static void castBufferRsrcArgToV4I32(MachineInstr &MI, MachineIRBuilder &B,
680 unsigned Idx) {
681 MachineOperand &MO = MI.getOperand(Idx);
682
683 const LLT PointerTy = B.getMRI()->getType(MO.getReg());
684 // Paranoidly prevent us from doing this multiple times.
685 if (!hasBufferRsrcWorkaround(PointerTy))
686 return;
687 MO.setReg(castBufferRsrcToV4I32(MO.getReg(), B));
688}
689
690AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
691 const GCNTargetMachine &TM)
692 : ST(ST_) {
693 using namespace TargetOpcode;
694
695 auto GetAddrSpacePtr = [&TM](unsigned AS) {
696 return LLT::pointer(AS, TM.getPointerSizeInBits(AS));
697 };
698
699 const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS);
700 const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS);
701 const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT);
702 const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS);
703 const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS);
704 const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS);
705 const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS);
706 const LLT BufferFatPtr = GetAddrSpacePtr(AMDGPUAS::BUFFER_FAT_POINTER);
707 const LLT RsrcPtr = GetAddrSpacePtr(AMDGPUAS::BUFFER_RESOURCE);
708 const LLT BufferStridedPtr =
709 GetAddrSpacePtr(AMDGPUAS::BUFFER_STRIDED_POINTER);
710
711 const LLT CodePtr = FlatPtr;
712
713 const std::initializer_list<LLT> AddrSpaces64 = {
714 GlobalPtr, ConstantPtr, FlatPtr
715 };
716
717 const std::initializer_list<LLT> AddrSpaces32 = {
718 LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr
719 };
720
721 const std::initializer_list<LLT> AddrSpaces128 = {RsrcPtr};
722
723 const std::initializer_list<LLT> FPTypesBase = {
724 S32, S64
725 };
726
727 const std::initializer_list<LLT> FPTypes16 = {
728 S32, S64, S16
729 };
730
731 const std::initializer_list<LLT> FPTypesPK16 = {
732 S32, S64, S16, V2S16
733 };
734
735 const LLT MinScalarFPTy = ST.has16BitInsts() ? S16 : S32;
736
737 // s1 for VCC branches, s32 for SCC branches.
739
740 // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more
741 // elements for v3s16
744 .legalFor(AllS32Vectors)
746 .legalFor(AddrSpaces64)
747 .legalFor(AddrSpaces32)
748 .legalFor(AddrSpaces128)
749 .legalIf(isPointer(0))
750 .clampScalar(0, S16, S256)
752 .clampMaxNumElements(0, S32, 16)
754 .scalarize(0);
755
756 if (ST.hasVOP3PInsts() && ST.hasAddNoCarryInsts() && ST.hasIntClamp()) {
757 // Full set of gfx9 features.
758 if (ST.hasScalarAddSub64()) {
759 getActionDefinitionsBuilder({G_ADD, G_SUB})
760 .legalFor({S64, S32, S16, V2S16})
761 .clampMaxNumElementsStrict(0, S16, 2)
762 .scalarize(0)
763 .minScalar(0, S16)
765 .maxScalar(0, S32);
766 } else {
767 getActionDefinitionsBuilder({G_ADD, G_SUB})
768 .legalFor({S32, S16, V2S16})
769 .clampMaxNumElementsStrict(0, S16, 2)
770 .scalarize(0)
771 .minScalar(0, S16)
773 .maxScalar(0, S32);
774 }
775
776 if (ST.hasScalarSMulU64()) {
778 .legalFor({S64, S32, S16, V2S16})
779 .clampMaxNumElementsStrict(0, S16, 2)
780 .scalarize(0)
781 .minScalar(0, S16)
783 .custom();
784 } else {
786 .legalFor({S32, S16, V2S16})
787 .clampMaxNumElementsStrict(0, S16, 2)
788 .scalarize(0)
789 .minScalar(0, S16)
791 .custom();
792 }
793 assert(ST.hasMad64_32());
794
795 getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT, G_SADDSAT, G_SSUBSAT})
796 .legalFor({S32, S16, V2S16}) // Clamp modifier
797 .minScalarOrElt(0, S16)
799 .scalarize(0)
801 .lower();
802 } else if (ST.has16BitInsts()) {
803 getActionDefinitionsBuilder({G_ADD, G_SUB})
804 .legalFor({S32, S16})
805 .minScalar(0, S16)
807 .maxScalar(0, S32)
808 .scalarize(0);
809
811 .legalFor({S32, S16})
812 .scalarize(0)
813 .minScalar(0, S16)
815 .custom();
816 assert(ST.hasMad64_32());
817
818 // Technically the saturating operations require clamp bit support, but this
819 // was introduced at the same time as 16-bit operations.
820 getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
821 .legalFor({S32, S16}) // Clamp modifier
822 .minScalar(0, S16)
823 .scalarize(0)
825 .lower();
826
827 // We're just lowering this, but it helps get a better result to try to
828 // coerce to the desired type first.
829 getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
830 .minScalar(0, S16)
831 .scalarize(0)
832 .lower();
833 } else {
834 getActionDefinitionsBuilder({G_ADD, G_SUB})
835 .legalFor({S32})
836 .widenScalarToNextMultipleOf(0, 32)
837 .clampScalar(0, S32, S32)
838 .scalarize(0);
839
840 auto &Mul = getActionDefinitionsBuilder(G_MUL)
841 .legalFor({S32})
842 .scalarize(0)
843 .minScalar(0, S32)
845
846 if (ST.hasMad64_32())
847 Mul.custom();
848 else
849 Mul.maxScalar(0, S32);
850
851 if (ST.hasIntClamp()) {
852 getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
853 .legalFor({S32}) // Clamp modifier.
854 .scalarize(0)
856 .lower();
857 } else {
858 // Clamp bit support was added in VI, along with 16-bit operations.
859 getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
860 .minScalar(0, S32)
861 .scalarize(0)
862 .lower();
863 }
864
865 // FIXME: DAG expansion gets better results. The widening uses the smaller
866 // range values and goes for the min/max lowering directly.
867 getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
868 .minScalar(0, S32)
869 .scalarize(0)
870 .lower();
871 }
872
874 {G_SDIV, G_UDIV, G_SREM, G_UREM, G_SDIVREM, G_UDIVREM})
875 .customFor({S32, S64})
876 .clampScalar(0, S32, S64)
878 .scalarize(0);
879
880 auto &Mulh = getActionDefinitionsBuilder({G_UMULH, G_SMULH})
881 .legalFor({S32})
882 .maxScalar(0, S32);
883
884 if (ST.hasVOP3PInsts()) {
885 Mulh
886 .clampMaxNumElements(0, S8, 2)
887 .lowerFor({V2S8});
888 }
889
890 Mulh
891 .scalarize(0)
892 .lower();
893
894 // Report legal for any types we can handle anywhere. For the cases only legal
895 // on the SALU, RegBankSelect will be able to re-legalize.
896 getActionDefinitionsBuilder({G_AND, G_OR, G_XOR})
897 .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16})
898 .clampScalar(0, S32, S64)
904 .scalarize(0);
905
907 {G_UADDO, G_USUBO, G_UADDE, G_SADDE, G_USUBE, G_SSUBE})
908 .legalFor({{S32, S1}, {S32, S32}})
909 .clampScalar(0, S32, S32)
910 .scalarize(0);
911
913 // Don't worry about the size constraint.
915 .lower();
916
918 .legalFor({S1, S32, S64, S16, GlobalPtr,
919 LocalPtr, ConstantPtr, PrivatePtr, FlatPtr })
920 .legalIf(isPointer(0))
921 .clampScalar(0, S32, S64)
923
924 getActionDefinitionsBuilder(G_FCONSTANT)
925 .legalFor({S32, S64, S16})
926 .clampScalar(0, S16, S64);
927
928 getActionDefinitionsBuilder({G_IMPLICIT_DEF, G_FREEZE})
929 .legalIf(isRegisterClassType(ST, 0))
930 // s1 and s16 are special cases because they have legal operations on
931 // them, but don't really occupy registers in the normal way.
932 .legalFor({S1, S16})
933 .clampNumElements(0, V16S32, V32S32)
937 .clampMaxNumElements(0, S32, 16);
938
939 getActionDefinitionsBuilder(G_FRAME_INDEX).legalFor({PrivatePtr});
940
941 // If the amount is divergent, we have to do a wave reduction to get the
942 // maximum value, so this is expanded during RegBankSelect.
943 getActionDefinitionsBuilder(G_DYN_STACKALLOC)
944 .legalFor({{PrivatePtr, S32}});
945
946 getActionDefinitionsBuilder(G_STACKSAVE)
947 .customFor({PrivatePtr});
948 getActionDefinitionsBuilder(G_STACKRESTORE)
949 .legalFor({PrivatePtr});
950
951 getActionDefinitionsBuilder({G_GET_FPENV, G_SET_FPENV}).customFor({S64});
952
953 getActionDefinitionsBuilder(G_GLOBAL_VALUE)
954 .customIf(typeIsNot(0, PrivatePtr));
955
956 getActionDefinitionsBuilder(G_BLOCK_ADDR).legalFor({CodePtr});
957
958 auto &FPOpActions = getActionDefinitionsBuilder(
959 { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE,
960 G_STRICT_FADD, G_STRICT_FMUL, G_STRICT_FMA})
961 .legalFor({S32, S64});
962 auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS})
963 .customFor({S32, S64});
964 auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV)
965 .customFor({S32, S64});
966
967 if (ST.has16BitInsts()) {
968 if (ST.hasVOP3PInsts())
969 FPOpActions.legalFor({S16, V2S16});
970 else
971 FPOpActions.legalFor({S16});
972
973 TrigActions.customFor({S16});
974 FDIVActions.customFor({S16});
975 }
976
977 if (ST.hasPackedFP32Ops()) {
978 FPOpActions.legalFor({V2S32});
979 FPOpActions.clampMaxNumElementsStrict(0, S32, 2);
980 }
981
982 auto &MinNumMaxNumIeee =
983 getActionDefinitionsBuilder({G_FMINNUM_IEEE, G_FMAXNUM_IEEE});
984
985 if (ST.hasVOP3PInsts()) {
986 MinNumMaxNumIeee.legalFor(FPTypesPK16)
987 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
988 .clampMaxNumElements(0, S16, 2)
989 .clampScalar(0, S16, S64)
990 .scalarize(0);
991 } else if (ST.has16BitInsts()) {
992 MinNumMaxNumIeee.legalFor(FPTypes16).clampScalar(0, S16, S64).scalarize(0);
993 } else {
994 MinNumMaxNumIeee.legalFor(FPTypesBase)
995 .clampScalar(0, S32, S64)
996 .scalarize(0);
997 }
998
999 auto &MinNumMaxNum = getActionDefinitionsBuilder(
1000 {G_FMINNUM, G_FMAXNUM, G_FMINIMUMNUM, G_FMAXIMUMNUM});
1001
1002 if (ST.hasVOP3PInsts()) {
1003 MinNumMaxNum.customFor(FPTypesPK16)
1004 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
1005 .clampMaxNumElements(0, S16, 2)
1006 .clampScalar(0, S16, S64)
1007 .scalarize(0);
1008 } else if (ST.has16BitInsts()) {
1009 MinNumMaxNum.customFor(FPTypes16)
1010 .clampScalar(0, S16, S64)
1011 .scalarize(0);
1012 } else {
1013 MinNumMaxNum.customFor(FPTypesBase)
1014 .clampScalar(0, S32, S64)
1015 .scalarize(0);
1016 }
1017
1018 if (ST.hasVOP3PInsts())
1019 FPOpActions.clampMaxNumElementsStrict(0, S16, 2);
1020
1021 FPOpActions
1022 .scalarize(0)
1023 .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
1024
1025 TrigActions
1026 .scalarize(0)
1027 .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
1028
1029 FDIVActions
1030 .scalarize(0)
1031 .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
1032
1033 auto &FNegAbs = getActionDefinitionsBuilder({G_FNEG, G_FABS});
1034 FNegAbs.legalFor(FPTypesPK16)
1035 .legalFor(ST.hasPackedFP32Ops(), {V2S32})
1037 if (ST.hasPackedFP32Ops())
1038 FNegAbs.clampMaxNumElementsStrict(0, S32, 2);
1039 FNegAbs.scalarize(0).clampScalar(0, S16, S64);
1040
1041 if (ST.has16BitInsts()) {
1043 .legalFor({S16})
1044 .customFor({S32, S64})
1045 .scalarize(0)
1046 .unsupported();
1048 .legalFor({S32, S64, S16})
1049 .scalarize(0)
1050 .clampScalar(0, S16, S64);
1051
1052 getActionDefinitionsBuilder({G_FLDEXP, G_STRICT_FLDEXP})
1053 .legalFor({{S32, S32}, {S64, S32}, {S16, S16}})
1054 .scalarize(0)
1055 .maxScalarIf(typeIs(0, S16), 1, S16)
1056 .clampScalar(1, S32, S32)
1057 .lower();
1058
1060 .customFor({{S32, S32}, {S64, S32}, {S16, S16}, {S16, S32}})
1061 .scalarize(0)
1062 .lower();
1063
1065 .lowerFor({S16, S32, S64})
1066 .scalarize(0)
1067 .lower();
1068 } else {
1070 .customFor({S32, S64, S16})
1071 .scalarize(0)
1072 .unsupported();
1073
1074
1075 if (ST.hasFractBug()) {
1077 .customFor({S64})
1078 .legalFor({S32, S64})
1079 .scalarize(0)
1080 .clampScalar(0, S32, S64);
1081 } else {
1083 .legalFor({S32, S64})
1084 .scalarize(0)
1085 .clampScalar(0, S32, S64);
1086 }
1087
1088 getActionDefinitionsBuilder({G_FLDEXP, G_STRICT_FLDEXP})
1089 .legalFor({{S32, S32}, {S64, S32}})
1090 .scalarize(0)
1091 .clampScalar(0, S32, S64)
1092 .clampScalar(1, S32, S32)
1093 .lower();
1094
1096 .customFor({{S32, S32}, {S64, S32}})
1097 .scalarize(0)
1098 .minScalar(0, S32)
1099 .clampScalar(1, S32, S32)
1100 .lower();
1101
1103 .lowerFor({S32, S64})
1104 .scalarize(0)
1105 .lower();
1106 }
1107
1108 auto &FPTruncActions = getActionDefinitionsBuilder(G_FPTRUNC);
1109 if (ST.hasCvtPkF16F32Inst()) {
1110 FPTruncActions.legalFor({{S32, S64}, {S16, S32}, {V2S16, V2S32}})
1111 .clampMaxNumElements(0, S16, 2);
1112 } else {
1113 FPTruncActions.legalFor({{S32, S64}, {S16, S32}});
1114 }
1115 FPTruncActions.scalarize(0).lower();
1116
1118 .legalFor({{S64, S32}, {S32, S16}})
1119 .narrowScalarFor({{S64, S16}}, changeTo(0, S32))
1120 .scalarize(0);
1121
1122 auto &FSubActions = getActionDefinitionsBuilder({G_FSUB, G_STRICT_FSUB});
1123 if (ST.has16BitInsts()) {
1124 FSubActions
1125 // Use actual fsub instruction
1126 .legalFor({S32, S16})
1127 // Must use fadd + fneg
1128 .lowerFor({S64, V2S16});
1129 } else {
1130 FSubActions
1131 // Use actual fsub instruction
1132 .legalFor({S32})
1133 // Must use fadd + fneg
1134 .lowerFor({S64, S16, V2S16});
1135 }
1136
1137 FSubActions
1138 .scalarize(0)
1139 .clampScalar(0, S32, S64);
1140
1141 // Whether this is legal depends on the floating point mode for the function.
1142 auto &FMad = getActionDefinitionsBuilder(G_FMAD);
1143 if (ST.hasMadF16() && ST.hasMadMacF32Insts())
1144 FMad.customFor({S32, S16});
1145 else if (ST.hasMadMacF32Insts())
1146 FMad.customFor({S32});
1147 else if (ST.hasMadF16())
1148 FMad.customFor({S16});
1149 FMad.scalarize(0)
1150 .lower();
1151
1152 auto &FRem = getActionDefinitionsBuilder(G_FREM);
1153 if (ST.has16BitInsts()) {
1154 FRem.customFor({S16, S32, S64});
1155 } else {
1156 FRem.minScalar(0, S32)
1157 .customFor({S32, S64});
1158 }
1159 FRem.scalarize(0);
1160
1161 // TODO: Do we need to clamp maximum bitwidth?
1162 getActionDefinitionsBuilder(G_TRUNC)
1163 .legalIf(isScalar(0))
1164 .legalFor({{V2S16, V2S32}})
1165 .clampMaxNumElements(0, S16, 2)
1166 // Avoid scalarizing in cases that should be truly illegal. In unresolvable
1167 // situations (like an invalid implicit use), we don't want to infinite loop
1168 // in the legalizer.
1170 .alwaysLegal();
1171
1172 getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT})
1173 .legalFor({{S64, S32}, {S32, S16}, {S64, S16},
1174 {S32, S1}, {S64, S1}, {S16, S1}})
1175 .scalarize(0)
1176 .clampScalar(0, S32, S64)
1177 .widenScalarToNextPow2(1, 32);
1178
1179 // TODO: Split s1->s64 during regbankselect for VALU.
1180 auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
1181 .legalFor({{S32, S32}, {S64, S32}, {S16, S32}})
1182 .lowerIf(typeIs(1, S1))
1183 .customFor({{S32, S64}, {S64, S64}});
1184 if (ST.has16BitInsts())
1185 IToFP.legalFor({{S16, S16}});
1186 IToFP.clampScalar(1, S32, S64)
1187 .minScalar(0, S32)
1188 .scalarize(0)
1190
1191 auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
1192 .legalFor({{S32, S32}, {S32, S64}, {S32, S16}})
1193 .customFor({{S64, S32}, {S64, S64}})
1194 .narrowScalarFor({{S64, S16}}, changeTo(0, S32));
1195 if (ST.has16BitInsts())
1196 FPToI.legalFor({{S16, S16}});
1197 else
1198 FPToI.minScalar(1, S32);
1199
1200 FPToI.minScalar(0, S32)
1201 .widenScalarToNextPow2(0, 32)
1202 .scalarize(0)
1203 .lower();
1204
1205 // clang-format off
1206 auto &FPToISat = getActionDefinitionsBuilder({G_FPTOSI_SAT, G_FPTOUI_SAT})
1207 .legalFor({{S32, S32}, {S32, S64}})
1208 .narrowScalarFor({{S64, S16}}, changeTo(0, S32));
1209 if (ST.has16BitInsts())
1210 FPToISat.legalFor({{S16, S16}});
1211
1212 FPToISat.minScalar(1, S32);
1213 FPToISat.minScalar(0, S32)
1214 .widenScalarToNextPow2(0, 32)
1215 .scalarize(0)
1216 .lower();
1217 // clang-format on
1218
1219 getActionDefinitionsBuilder({G_LROUND, G_LLROUND})
1220 .clampScalar(0, S16, S64)
1221 .scalarize(0)
1222 .lower();
1223
1224 getActionDefinitionsBuilder(G_INTRINSIC_FPTRUNC_ROUND)
1225 .legalFor({S16, S32})
1226 .scalarize(0)
1227 .lower();
1228
1229 // Lower G_FNEARBYINT and G_FRINT into G_INTRINSIC_ROUNDEVEN
1230 getActionDefinitionsBuilder({G_INTRINSIC_ROUND, G_FRINT, G_FNEARBYINT})
1231 .scalarize(0)
1232 .lower();
1233
1234 getActionDefinitionsBuilder({G_INTRINSIC_LRINT, G_INTRINSIC_LLRINT})
1235 .clampScalar(0, S16, S64)
1236 .scalarize(0)
1237 .lower();
1238
1239 if (ST.has16BitInsts()) {
1241 {G_INTRINSIC_TRUNC, G_FCEIL, G_INTRINSIC_ROUNDEVEN})
1242 .legalFor({S16, S32, S64})
1243 .clampScalar(0, S16, S64)
1244 .scalarize(0);
1245 } else if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
1247 {G_INTRINSIC_TRUNC, G_FCEIL, G_INTRINSIC_ROUNDEVEN})
1248 .legalFor({S32, S64})
1249 .clampScalar(0, S32, S64)
1250 .scalarize(0);
1251 } else {
1253 {G_INTRINSIC_TRUNC, G_FCEIL, G_INTRINSIC_ROUNDEVEN})
1254 .legalFor({S32})
1255 .customFor({S64})
1256 .clampScalar(0, S32, S64)
1257 .scalarize(0);
1258 }
1259
1261 .unsupportedFor({BufferFatPtr, BufferStridedPtr, RsrcPtr})
1262 .legalIf(all(isPointer(0), sameSize(0, 1)))
1263 .scalarize(0)
1264 .scalarSameSizeAs(1, 0);
1265
1267 .legalIf(all(sameSize(0, 1), typeInSet(1, {S64, S32})))
1268 .scalarSameSizeAs(1, 0)
1269 .scalarize(0);
1270
1271 auto &CmpBuilder =
1273 // The compare output type differs based on the register bank of the output,
1274 // so make both s1 and s32 legal.
1275 //
1276 // Scalar compares producing output in scc will be promoted to s32, as that
1277 // is the allocatable register type that will be needed for the copy from
1278 // scc. This will be promoted during RegBankSelect, and we assume something
1279 // before that won't try to use s32 result types.
1280 //
1281 // Vector compares producing an output in vcc/SGPR will use s1 in VCC reg
1282 // bank.
1284 {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
1285 .legalForCartesianProduct(
1286 {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr});
1287 if (ST.has16BitInsts()) {
1288 CmpBuilder.legalFor({{S1, S16}});
1289 }
1290
1291 CmpBuilder
1293 .clampScalar(1, S32, S64)
1294 .scalarize(0)
1295 .legalIf(all(typeInSet(0, {S1, S32}), isPointer(1)));
1296
1297 auto &FCmpBuilder =
1299 {S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase);
1300
1301 if (ST.hasSALUFloatInsts())
1302 FCmpBuilder.legalForCartesianProduct({S32}, {S16, S32});
1303
1304 FCmpBuilder
1306 .clampScalar(1, S32, S64)
1307 .scalarize(0);
1308
1309 // FIXME: fpow has a selection pattern that should move to custom lowering.
1310 auto &ExpOps = getActionDefinitionsBuilder(G_FPOW);
1311 if (ST.has16BitInsts())
1312 ExpOps.customFor({{S32}, {S16}});
1313 else
1314 ExpOps.customFor({S32});
1315 ExpOps.clampScalar(0, MinScalarFPTy, S32)
1316 .scalarize(0);
1317
1319 .clampScalar(0, MinScalarFPTy, S32)
1320 .lower();
1321
1323 .legalFor(ST.has16BitInsts(), {S16})
1324 .customFor({S32, S16})
1325 .scalarize(0)
1326 .lower();
1327
1329 .legalFor(ST.has16BitInsts(), {S16})
1330 .customFor({S32, S64, S16})
1331 .scalarize(0)
1332 .lower();
1333
1334 auto &LogOps =
1335 getActionDefinitionsBuilder({G_FLOG, G_FLOG10, G_FEXP, G_FEXP10});
1336 LogOps.customFor({S32, S16, S64});
1337 LogOps.clampScalar(0, MinScalarFPTy, S32)
1338 .scalarize(0);
1339
1340 // The 64-bit versions produce 32-bit results, but only on the SALU.
1342 .legalFor({{S32, S32}, {S32, S64}})
1343 .clampScalar(0, S32, S32)
1344 .widenScalarToNextPow2(1, 32)
1345 .clampScalar(1, S32, S64)
1346 .scalarize(0)
1347 .widenScalarToNextPow2(0, 32);
1348
1349 // If no 16 bit instr is available, lower into different instructions.
1350 if (ST.has16BitInsts())
1351 getActionDefinitionsBuilder(G_IS_FPCLASS)
1352 .legalForCartesianProduct({S1}, FPTypes16)
1353 .widenScalarToNextPow2(1)
1354 .scalarize(0)
1355 .lower();
1356 else
1357 getActionDefinitionsBuilder(G_IS_FPCLASS)
1358 .legalForCartesianProduct({S1}, FPTypesBase)
1359 .lowerFor({S1, S16})
1360 .widenScalarToNextPow2(1)
1361 .scalarize(0)
1362 .lower();
1363
1364 // The hardware instructions return a different result on 0 than the generic
1365 // instructions expect. The hardware produces -1, but these produce the
1366 // bitwidth.
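// For illustration: G_CTLZ of an s32 zero must return 32, whereas the
// hardware's find-first-bit instructions return -1 for a zero input, so the
// custom lowering below has to reconcile the two.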
1367 getActionDefinitionsBuilder({G_CTLZ, G_CTTZ})
1368 .scalarize(0)
1369 .clampScalar(0, S32, S32)
1370 .clampScalar(1, S32, S64)
1371 .widenScalarToNextPow2(0, 32)
1372 .widenScalarToNextPow2(1, 32)
1373 .custom();
1374
1375 // The 64-bit versions produce 32-bit results, but only on the SALU.
1376 getActionDefinitionsBuilder(G_CTLZ_ZERO_UNDEF)
1377 .legalFor({{S32, S32}, {S32, S64}})
1378 .customIf(scalarNarrowerThan(1, 32))
1379 .clampScalar(0, S32, S32)
1380 .clampScalar(1, S32, S64)
1381 .scalarize(0)
1382 .widenScalarToNextPow2(0, 32)
1383 .widenScalarToNextPow2(1, 32);
1384
1385 getActionDefinitionsBuilder(G_CTTZ_ZERO_UNDEF)
1386 .legalFor({{S32, S32}, {S32, S64}})
1387 .clampScalar(0, S32, S32)
1388 .clampScalar(1, S32, S64)
1389 .scalarize(0)
1390 .widenScalarToNextPow2(0, 32)
1391 .widenScalarToNextPow2(1, 32);
1392
1394 .customFor({{S32, S32}})
1395 .scalarize(0)
1396 .clampScalar(0, S32, S32)
1397 .clampScalar(1, S32, S32);
1398
1399 // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
1400 // RegBankSelect.
1401 getActionDefinitionsBuilder(G_BITREVERSE)
1402 .legalFor({S32, S64})
1403 .clampScalar(0, S32, S64)
1404 .scalarize(0)
1406
1407 if (ST.has16BitInsts()) {
1409 .legalFor({S16, S32, V2S16})
1410 .clampMaxNumElementsStrict(0, S16, 2)
1411 // FIXME: Fixing non-power-of-2 before clamp is workaround for
1412 // narrowScalar limitation.
1414 .clampScalar(0, S16, S32)
1415 .scalarize(0);
1416
1417 if (ST.hasVOP3PInsts()) {
1419 .legalFor({S32, S16, V2S16})
1420 .clampMaxNumElements(0, S16, 2)
1421 .minScalar(0, S16)
1423 .scalarize(0)
1424 .lower();
1425 if (ST.hasIntMinMax64()) {
1426 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
1427 .legalFor({S32, S16, S64, V2S16})
1428 .clampMaxNumElements(0, S16, 2)
1429 .minScalar(0, S16)
1431 .scalarize(0)
1432 .lower();
1433 } else {
1434 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
1435 .legalFor({S32, S16, V2S16})
1436 .clampMaxNumElements(0, S16, 2)
1437 .minScalar(0, S16)
1439 .scalarize(0)
1440 .lower();
1441 }
1442 } else {
1443 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
1444 .legalFor({S32, S16})
1445 .widenScalarToNextPow2(0)
1446 .minScalar(0, S16)
1447 .scalarize(0)
1448 .lower();
1449 }
1450 } else {
1451 // TODO: Should have same legality without v_perm_b32
1453 .legalFor({S32})
1454 .lowerIf(scalarNarrowerThan(0, 32))
1455 // FIXME: Fixing non-power-of-2 before clamp is workaround for
1456 // narrowScalar limitation.
1458 .maxScalar(0, S32)
1459 .scalarize(0)
1460 .lower();
1461
1462 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
1463 .legalFor({S32})
1464 .minScalar(0, S32)
1466 .scalarize(0)
1467 .lower();
1468 }
1469
1470 getActionDefinitionsBuilder(G_INTTOPTR)
1471 // List the common cases
1472 .legalForCartesianProduct(AddrSpaces64, {S64})
1473 .legalForCartesianProduct(AddrSpaces32, {S32})
1474 .scalarize(0)
1475 // Accept any address space as long as the size matches
1476 .legalIf(sameSize(0, 1))
1478 [](const LegalityQuery &Query) {
1479 return std::pair(
1480 1, LLT::scalar(Query.Types[0].getSizeInBits()));
1481 })
1482 .narrowScalarIf(largerThan(1, 0), [](const LegalityQuery &Query) {
1483 return std::pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
1484 });
1485
1486 getActionDefinitionsBuilder(G_PTRTOINT)
1487 // List the common cases
1488 .legalForCartesianProduct(AddrSpaces64, {S64})
1489 .legalForCartesianProduct(AddrSpaces32, {S32})
1490 .scalarize(0)
1491 // Accept any address space as long as the size matches
1492 .legalIf(sameSize(0, 1))
1494 [](const LegalityQuery &Query) {
1495 return std::pair(
1496 0, LLT::scalar(Query.Types[1].getSizeInBits()));
1497 })
1498 .narrowScalarIf(largerThan(0, 1), [](const LegalityQuery &Query) {
1499 return std::pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
1500 });
1501
1502 getActionDefinitionsBuilder(G_ADDRSPACE_CAST)
1503 .scalarize(0)
1504 .custom();
1505
1506 const auto needToSplitMemOp = [=](const LegalityQuery &Query,
1507 bool IsLoad) -> bool {
1508 const LLT DstTy = Query.Types[0];
1509
1510 // Split vector extloads.
1511 unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
1512
1513 if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize)
1514 return true;
1515
1516 const LLT PtrTy = Query.Types[1];
1517 unsigned AS = PtrTy.getAddressSpace();
1518 if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad,
1519 Query.MMODescrs[0].Ordering !=
1521 return true;
1522
1523 // Catch weird sized loads that don't evenly divide into the access sizes
1524 // TODO: May be able to widen depending on alignment etc.
1525 unsigned NumRegs = (MemSize + 31) / 32;
1526 if (NumRegs == 3) {
1527 if (!ST.hasDwordx3LoadStores())
1528 return true;
1529 } else {
1530 // If the alignment allows, these should have been widened.
1531 if (!isPowerOf2_32(NumRegs))
1532 return true;
1533 }
1534
1535 return false;
1536 };
1537
1538 unsigned GlobalAlign32 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 32;
1539 unsigned GlobalAlign16 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 16;
1540 unsigned GlobalAlign8 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 8;
1541
1542 // TODO: Refine based on subtargets which support unaligned access or 128-bit
1543 // LDS
1544 // TODO: Unsupported flat for SI.
1545
1546 for (unsigned Op : {G_LOAD, G_STORE}) {
1547 const bool IsStore = Op == G_STORE;
1548
1549 auto &Actions = getActionDefinitionsBuilder(Op);
1550 // Explicitly list some common cases.
1551 // TODO: Does this help compile time at all?
1552 Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, S32, GlobalAlign32},
1553 {V2S32, GlobalPtr, V2S32, GlobalAlign32},
1554 {V4S32, GlobalPtr, V4S32, GlobalAlign32},
1555 {S64, GlobalPtr, S64, GlobalAlign32},
1556 {V2S64, GlobalPtr, V2S64, GlobalAlign32},
1557 {V2S16, GlobalPtr, V2S16, GlobalAlign32},
1558 {S32, GlobalPtr, S8, GlobalAlign8},
1559 {S32, GlobalPtr, S16, GlobalAlign16},
1560
1561 {S32, LocalPtr, S32, 32},
1562 {S64, LocalPtr, S64, 32},
1563 {V2S32, LocalPtr, V2S32, 32},
1564 {S32, LocalPtr, S8, 8},
1565 {S32, LocalPtr, S16, 16},
1566 {V2S16, LocalPtr, S32, 32},
1567
1568 {S32, PrivatePtr, S32, 32},
1569 {S32, PrivatePtr, S8, 8},
1570 {S32, PrivatePtr, S16, 16},
1571 {V2S16, PrivatePtr, S32, 32},
1572
1573 {S32, ConstantPtr, S32, GlobalAlign32},
1574 {V2S32, ConstantPtr, V2S32, GlobalAlign32},
1575 {V4S32, ConstantPtr, V4S32, GlobalAlign32},
1576 {S64, ConstantPtr, S64, GlobalAlign32},
1577 {V2S32, ConstantPtr, V2S32, GlobalAlign32}});
1578 Actions.legalIf(
1579 [=](const LegalityQuery &Query) -> bool {
1580 return isLoadStoreLegal(ST, Query);
1581 });
1582
1583 // The custom pointers (fat pointers, buffer resources) don't work with load
1584 // and store at this level. Fat pointers should have been lowered to
1585 // intrinsics before the translation to MIR.
1586 Actions.unsupportedIf(
1587 typeInSet(1, {BufferFatPtr, BufferStridedPtr, RsrcPtr}));
1588
1589 // Address space 8 pointers are handled by a 4xs32 load, bitcast, and
1590 // ptrtoint. This is needed to account for the fact that we can't have i128
1591 // as a register class for SelectionDAG reasons.
1592 Actions.customIf([=](const LegalityQuery &Query) -> bool {
1593 return hasBufferRsrcWorkaround(Query.Types[0]);
1594 });
1595
1596 // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to
1597 // 64-bits.
1598 //
1599 // TODO: Should generalize bitcast action into coerce, which will also cover
1600 // inserting addrspacecasts.
1601 Actions.customIf(typeIs(1, Constant32Ptr));
1602
1603 // Turn any illegal element vectors into something easier to deal
1604 // with. These will ultimately produce 32-bit scalar shifts to extract the
1605 // parts anyway.
1606 //
1607 // For odd 16-bit element vectors, prefer to split those into pieces with
1608 // 16-bit vector parts.
1609 Actions.bitcastIf(
1610 [=](const LegalityQuery &Query) -> bool {
1611 return shouldBitcastLoadStoreType(ST, Query.Types[0],
1612 Query.MMODescrs[0].MemoryTy);
1613 }, bitcastToRegisterType(0));
1614
1615 if (!IsStore) {
1616 // Widen suitably aligned loads by loading extra bytes. The standard
1617 // legalization actions can't properly express widening memory operands.
1618 Actions.customIf([=](const LegalityQuery &Query) -> bool {
1619 return shouldWidenLoad(ST, Query, G_LOAD);
1620 });
1621 }
1622
1623 // FIXME: load/store narrowing should be moved to lower action
1624 Actions
1625 .narrowScalarIf(
1626 [=](const LegalityQuery &Query) -> bool {
1627 return !Query.Types[0].isVector() &&
1628 needToSplitMemOp(Query, Op == G_LOAD);
1629 },
1630 [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
1631 const LLT DstTy = Query.Types[0];
1632 const LLT PtrTy = Query.Types[1];
1633
1634 const unsigned DstSize = DstTy.getSizeInBits();
1635 unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
1636
1637 // Split extloads.
1638 if (DstSize > MemSize)
1639 return std::pair(0, LLT::scalar(MemSize));
1640
1641 unsigned MaxSize = maxSizeForAddrSpace(
1642 ST, PtrTy.getAddressSpace(), Op == G_LOAD,
1643 Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic);
1644 if (MemSize > MaxSize)
1645 return std::pair(0, LLT::scalar(MaxSize));
1646
1647 uint64_t Align = Query.MMODescrs[0].AlignInBits;
1648 return std::pair(0, LLT::scalar(Align));
1649 })
1650 .fewerElementsIf(
1651 [=](const LegalityQuery &Query) -> bool {
1652 return Query.Types[0].isVector() &&
1653 needToSplitMemOp(Query, Op == G_LOAD);
1654 },
1655 [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
1656 const LLT DstTy = Query.Types[0];
1657 const LLT PtrTy = Query.Types[1];
1658
1659 LLT EltTy = DstTy.getElementType();
1660 unsigned MaxSize = maxSizeForAddrSpace(
1661 ST, PtrTy.getAddressSpace(), Op == G_LOAD,
1662 Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic);
1663
1664 // FIXME: Handle widened to power of 2 results better. This ends
1665 // up scalarizing.
1666 // FIXME: 3 element stores scalarized on SI
1667
1668 // Split if it's too large for the address space.
1669 unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
1670 if (MemSize > MaxSize) {
1671 unsigned NumElts = DstTy.getNumElements();
1672 unsigned EltSize = EltTy.getSizeInBits();
1673
1674 if (MaxSize % EltSize == 0) {
1675 return std::pair(
1677 ElementCount::getFixed(MaxSize / EltSize), EltTy));
1678 }
1679
1680 unsigned NumPieces = MemSize / MaxSize;
1681
1682 // FIXME: Refine when odd breakdowns handled
1683 // The scalars will need to be re-legalized.
1684 if (NumPieces == 1 || NumPieces >= NumElts ||
1685 NumElts % NumPieces != 0)
1686 return std::pair(0, EltTy);
1687
1688 return std::pair(0,
1689 LLT::fixed_vector(NumElts / NumPieces, EltTy));
1690 }
1691
1692 // FIXME: We could probably handle weird extending loads better.
1693 if (DstTy.getSizeInBits() > MemSize)
1694 return std::pair(0, EltTy);
1695
1696 unsigned EltSize = EltTy.getSizeInBits();
1697 unsigned DstSize = DstTy.getSizeInBits();
1698 if (!isPowerOf2_32(DstSize)) {
1699 // We're probably decomposing an odd sized store. Try to split
1700 // to the widest type. TODO: Account for alignment. As-is it
1701 // should be OK, since the new parts will be further legalized.
1702 unsigned FloorSize = llvm::bit_floor(DstSize);
1703 return std::pair(
1705 ElementCount::getFixed(FloorSize / EltSize), EltTy));
1706 }
1707
1708 // May need relegalization for the scalars.
1709 return std::pair(0, EltTy);
1710 })
1711 .minScalar(0, S32)
1712 .narrowScalarIf(isTruncStoreToSizePowerOf2(0),
1714 .widenScalarToNextPow2(0)
1715 .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0))
1716 .lower();
1717 }
1718
1719 // FIXME: Unaligned accesses not lowered.
1720 auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD})
1721 .legalForTypesWithMemDesc({{S32, GlobalPtr, S8, 8},
1722 {S32, GlobalPtr, S16, 2 * 8},
1723 {S32, LocalPtr, S8, 8},
1724 {S32, LocalPtr, S16, 16},
1725 {S32, PrivatePtr, S8, 8},
1726 {S32, PrivatePtr, S16, 16},
1727 {S32, ConstantPtr, S8, 8},
1728 {S32, ConstantPtr, S16, 2 * 8}})
1729 .legalIf(
1730 [=](const LegalityQuery &Query) -> bool {
1731 return isLoadStoreLegal(ST, Query);
1732 });
1733
1734 if (ST.hasFlatAddressSpace()) {
1735 ExtLoads.legalForTypesWithMemDesc(
1736 {{S32, FlatPtr, S8, 8}, {S32, FlatPtr, S16, 16}});
1737 }
1738
1739 // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to
1740 // 64-bits.
1741 //
1742 // TODO: Should generalize bitcast action into coerce, which will also cover
1743 // inserting addrspacecasts.
1744 ExtLoads.customIf(typeIs(1, Constant32Ptr));
1745
1746 ExtLoads.clampScalar(0, S32, S32)
1748 .lower();
1749
1750 auto &Atomics = getActionDefinitionsBuilder(
1751 {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB,
1752 G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
1753 G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
1754 G_ATOMICRMW_UMIN, G_ATOMICRMW_UINC_WRAP, G_ATOMICRMW_UDEC_WRAP})
1755 .legalFor({{S32, GlobalPtr}, {S32, LocalPtr},
1756 {S64, GlobalPtr}, {S64, LocalPtr},
1757 {S32, RegionPtr}, {S64, RegionPtr}});
1758 if (ST.hasFlatAddressSpace()) {
1759 Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}});
1760 }
1761
1762 auto &Atomics32 =
1763 getActionDefinitionsBuilder({G_ATOMICRMW_USUB_COND, G_ATOMICRMW_USUB_SAT})
1764 .legalFor({{S32, GlobalPtr}, {S32, LocalPtr}, {S32, RegionPtr}});
1765 if (ST.hasFlatAddressSpace()) {
1766 Atomics32.legalFor({{S32, FlatPtr}});
1767 }
1768
1769 // TODO: v2bf16 operations, and fat buffer pointer support.
1770 auto &Atomic = getActionDefinitionsBuilder(G_ATOMICRMW_FADD);
1771 if (ST.hasLDSFPAtomicAddF32()) {
1772 Atomic.legalFor({{S32, LocalPtr}, {S32, RegionPtr}});
1773 if (ST.hasLdsAtomicAddF64())
1774 Atomic.legalFor({{S64, LocalPtr}});
1775 if (ST.hasAtomicDsPkAdd16Insts())
1776 Atomic.legalFor({{V2F16, LocalPtr}, {V2BF16, LocalPtr}});
1777 }
1778 if (ST.hasAtomicFaddInsts())
1779 Atomic.legalFor({{S32, GlobalPtr}});
1780 if (ST.hasFlatAtomicFaddF32Inst())
1781 Atomic.legalFor({{S32, FlatPtr}});
1782
1783 if (ST.hasGFX90AInsts() || ST.hasGFX1250Insts()) {
1784 // These are legal with some caveats, and should have undergone expansion in
1785 // the IR in most situations
1786 // TODO: Move atomic expansion into legalizer
1787 Atomic.legalFor({
1788 {S32, GlobalPtr},
1789 {S64, GlobalPtr},
1790 {S64, FlatPtr}
1791 });
1792 }
1793
1794 if (ST.hasAtomicBufferGlobalPkAddF16NoRtnInsts() ||
1795 ST.hasAtomicBufferGlobalPkAddF16Insts())
1796 Atomic.legalFor({{V2F16, GlobalPtr}, {V2F16, BufferFatPtr}});
1797 if (ST.hasAtomicGlobalPkAddBF16Inst())
1798 Atomic.legalFor({{V2BF16, GlobalPtr}});
1799 if (ST.hasAtomicFlatPkAdd16Insts())
1800 Atomic.legalFor({{V2F16, FlatPtr}, {V2BF16, FlatPtr}});
1801
1802
1803 // Most of the legalization work here is done by AtomicExpand. We could
1804 // probably use a simpler legality rule that just assumes anything is OK.
1805 auto &AtomicFMinFMax =
1806 getActionDefinitionsBuilder({G_ATOMICRMW_FMIN, G_ATOMICRMW_FMAX})
1807 .legalFor({{F32, LocalPtr}, {F64, LocalPtr}});
1808
1809 if (ST.hasAtomicFMinFMaxF32GlobalInsts())
1810 AtomicFMinFMax.legalFor({{F32, GlobalPtr},{F32, BufferFatPtr}});
1811 if (ST.hasAtomicFMinFMaxF64GlobalInsts())
1812 AtomicFMinFMax.legalFor({{F64, GlobalPtr}, {F64, BufferFatPtr}});
1813 if (ST.hasAtomicFMinFMaxF32FlatInsts())
1814 AtomicFMinFMax.legalFor({F32, FlatPtr});
1815 if (ST.hasAtomicFMinFMaxF64FlatInsts())
1816 AtomicFMinFMax.legalFor({F64, FlatPtr});
1817
1818// BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling and output
1819// demarshalling.
1820 getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG)
1821 .customFor({{S32, GlobalPtr}, {S64, GlobalPtr},
1822 {S32, FlatPtr}, {S64, FlatPtr}})
1823 .legalFor({{S32, LocalPtr}, {S64, LocalPtr},
1824 {S32, RegionPtr}, {S64, RegionPtr}});
1825 // TODO: Pointer types, any 32-bit or 64-bit vector
1826
1827 // Condition should be s32 for scalar, s1 for vector.
1830 LocalPtr, FlatPtr, PrivatePtr,
1831 LLT::fixed_vector(2, LocalPtr),
1832 LLT::fixed_vector(2, PrivatePtr)},
1833 {S1, S32})
1834 .clampScalar(0, S16, S64)
1835 .scalarize(1)
1838 .clampMaxNumElements(0, S32, 2)
1839 .clampMaxNumElements(0, LocalPtr, 2)
1840 .clampMaxNumElements(0, PrivatePtr, 2)
1841 .scalarize(0)
1843 .legalIf(all(isPointer(0), typeInSet(1, {S1, S32})));
1844
1845 // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can
1846 // be more flexible with the shift amount type.
1847 auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR})
1848 .legalFor({{S32, S32}, {S64, S32}});
1849 if (ST.has16BitInsts()) {
1850 if (ST.hasVOP3PInsts()) {
1851 Shifts.legalFor({{S16, S16}, {V2S16, V2S16}})
1852 .clampMaxNumElements(0, S16, 2);
1853 } else
1854 Shifts.legalFor({{S16, S16}});
1855
1856 // TODO: Support 16-bit shift amounts for all types
1857 Shifts.widenScalarIf(
1858 [=](const LegalityQuery &Query) {
1859 // Use 16-bit shift amounts for any 16-bit shift. Otherwise we want a
1860 // 32-bit amount.
1861 const LLT ValTy = Query.Types[0];
1862 const LLT AmountTy = Query.Types[1];
1863 return ValTy.isScalar() && ValTy.getSizeInBits() <= 16 &&
1864 AmountTy.getSizeInBits() < 16;
1865 }, changeTo(1, S16));
1866 Shifts.maxScalarIf(typeIs(0, S16), 1, S16);
1867 Shifts.clampScalar(1, S32, S32);
1868 Shifts.widenScalarToNextPow2(0, 16);
1869 Shifts.clampScalar(0, S16, S64);
1870
1871 getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
1872 .minScalar(0, S16)
1873 .scalarize(0)
1874 .lower();
1875 } else {
1876 // Make sure we legalize the shift amount type first, as the general
1877 // expansion for the shifted type will produce much worse code if it hasn't
1878 // been truncated already.
1879 Shifts.clampScalar(1, S32, S32);
1880 Shifts.widenScalarToNextPow2(0, 32);
1881 Shifts.clampScalar(0, S32, S64);
1882
1883 getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
1884 .minScalar(0, S32)
1885 .scalarize(0)
1886 .lower();
1887 }
1888 Shifts.scalarize(0);
1889
1890 for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) {
1891 unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0;
1892 unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1;
1893 unsigned IdxTypeIdx = 2;
1894
1895 getActionDefinitionsBuilder(Op)
1896 .customIf([=](const LegalityQuery &Query) {
1897 const LLT EltTy = Query.Types[EltTypeIdx];
1898 const LLT VecTy = Query.Types[VecTypeIdx];
1899 const LLT IdxTy = Query.Types[IdxTypeIdx];
1900 const unsigned EltSize = EltTy.getSizeInBits();
1901 const bool isLegalVecType =
1903 // Address space 8 pointers are 128-bit wide values, but the logic
1904 // below will try to bitcast them to 2N x s64, which will fail.
1905 // Therefore, as an intermediate step, handle extracts/insertions from
1906 // such a vector by ptrtoint-ing the vector and scalar arguments (or
1907 // inttoptr-ing the extraction result) so that the operation becomes a
1908 // vector operation the logic below can handle.
1909 if (EltTy.isPointer() && EltSize > 64)
1910 return true;
1911 return (EltSize == 32 || EltSize == 64) &&
1912 VecTy.getSizeInBits() % 32 == 0 &&
1913 VecTy.getSizeInBits() <= MaxRegisterSize &&
1914 IdxTy.getSizeInBits() == 32 &&
1915 isLegalVecType;
1916 })
1917 .bitcastIf(all(sizeIsMultipleOf32(VecTypeIdx),
1918 scalarOrEltNarrowerThan(VecTypeIdx, 32)),
1919 bitcastToVectorElement32(VecTypeIdx))
1920 //.bitcastIf(vectorSmallerThan(1, 32), bitcastToScalar(1))
1921 .bitcastIf(all(sizeIsMultipleOf32(VecTypeIdx),
1922 scalarOrEltWiderThan(VecTypeIdx, 64)),
1923 [=](const LegalityQuery &Query) {
1924 // For > 64-bit element types, try to turn this into a
1925 // 64-bit element vector since we may be able to do better
1926 // indexing if this is scalar. If not, fall back to 32.
1927 const LLT EltTy = Query.Types[EltTypeIdx];
1928 const LLT VecTy = Query.Types[VecTypeIdx];
1929 const unsigned DstEltSize = EltTy.getSizeInBits();
1930 const unsigned VecSize = VecTy.getSizeInBits();
1931
1932 const unsigned TargetEltSize =
1933 DstEltSize % 64 == 0 ? 64 : 32;
1934 return std::pair(VecTypeIdx,
1935 LLT::fixed_vector(VecSize / TargetEltSize,
1936 TargetEltSize));
1937 })
1938 .clampScalar(EltTypeIdx, S32, S64)
1939 .clampScalar(VecTypeIdx, S32, S64)
1940 .clampScalar(IdxTypeIdx, S32, S32)
1941 .clampMaxNumElements(VecTypeIdx, S32, 32)
1942 // TODO: Clamp elements for 64-bit vectors?
1943 .moreElementsIf(isIllegalRegisterType(ST, VecTypeIdx),
1945 // It should only be necessary with variable indexes.
1946 // As a last resort, lower to the stack
1947 .lower();
1948 }
1949
1950 getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
1951 .unsupportedIf([=](const LegalityQuery &Query) {
1952 const LLT &EltTy = Query.Types[1].getElementType();
1953 return Query.Types[0] != EltTy;
1954 });
1955
1956 for (unsigned Op : {G_EXTRACT, G_INSERT}) {
1957 unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0;
1958 unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1;
1961 [=](const LegalityQuery &Query) {
1962 const LLT BigTy = Query.Types[BigTyIdx];
1963 return (BigTy.getScalarSizeInBits() < 16);
1964 },
1966 .widenScalarIf(
1967 [=](const LegalityQuery &Query) {
1968 const LLT LitTy = Query.Types[LitTyIdx];
1969 return (LitTy.getScalarSizeInBits() < 16);
1970 },
1972 .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
1973 .widenScalarToNextPow2(BigTyIdx, 32)
1974 .customIf([=](const LegalityQuery &Query) {
1975 // Generic lower operates on the full-width value, producing
1976 // shift+trunc/mask sequences. For simple cases where extract/insert
1977 // values are 32-bit aligned, we can instead unmerge/merge and work on
1978 // the 32-bit components. However, we can't check the offset here, so the
1979 // custom lowering function will have to fall back to generic lowering if
1980 // the offset is not 32-bit aligned.
1981 const LLT BigTy = Query.Types[BigTyIdx];
1982 const LLT LitTy = Query.Types[LitTyIdx];
1983 return !BigTy.isVector() && BigTy.getSizeInBits() % 32 == 0 &&
1984 LitTy.getSizeInBits() % 32 == 0;
1985 })
1986 .lower();
1987 }
1988
1989 auto &BuildVector =
1990 getActionDefinitionsBuilder(G_BUILD_VECTOR)
1992 .legalForCartesianProduct(AllS64Vectors, {S64})
1993 .clampNumElements(0, V16S32, V32S32)
1998
1999 if (ST.hasScalarPackInsts()) {
2000 BuildVector
2001 // FIXME: Should probably widen s1 vectors straight to s32
2002 .minScalarOrElt(0, S16)
2003 .minScalar(1, S16);
2004
2005 getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
2006 .legalFor({V2S16, S32})
2007 .lower();
2008 } else {
2009 BuildVector.customFor({V2S16, S16});
2010 BuildVector.minScalarOrElt(0, S32);
2011
2012 getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
2013 .customFor({V2S16, S32})
2014 .lower();
2015 }
2016
2017 BuildVector.legalIf(isRegisterType(ST, 0));
2018
2019 // FIXME: Clamp maximum size
2020 getActionDefinitionsBuilder(G_CONCAT_VECTORS)
2021 .legalIf(all(isRegisterType(ST, 0), isRegisterType(ST, 1)))
2022 .clampMaxNumElements(0, S32, 32)
2023 .clampMaxNumElements(1, S16, 2) // TODO: Make 4?
2024 .clampMaxNumElements(0, S16, 64);
2025
2026 getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower();
2027
2028 // Merge/Unmerge
2029 for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
2030 unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
2031 unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;
2032
2033 auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
2034 const LLT Ty = Query.Types[TypeIdx];
2035 if (Ty.isVector()) {
2036 const LLT &EltTy = Ty.getElementType();
2037 if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 512)
2038 return true;
2040 return true;
2041 }
2042 return false;
2043 };
2044
2045 auto &Builder =
2047 .legalIf(all(isRegisterType(ST, 0), isRegisterType(ST, 1)))
2048 .lowerFor({{S16, V2S16}})
2049 .lowerIf([=](const LegalityQuery &Query) {
2050 const LLT BigTy = Query.Types[BigTyIdx];
2051 return BigTy.getSizeInBits() == 32;
2052 })
2053 // Try to widen to s16 first for small types.
2054 // TODO: Only do this on targets with legal s16 shifts
2055 .minScalarOrEltIf(scalarNarrowerThan(LitTyIdx, 16), LitTyIdx, S16)
2056 .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16)
2058 oneMoreElement(BigTyIdx))
2060 elementTypeIs(1, S16)),
2061 changeTo(1, V2S16))
2062 // Clamp the little scalar to s8-s256 and make it a power of 2. It's
2063 // not worth considering the multiples of 64 since 2*192 and 2*384
2064 // are not valid.
2065 .clampScalar(LitTyIdx, S32, S512)
2066 .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32)
2067 // Break up vectors with weird elements into scalars
2069 [=](const LegalityQuery &Query) {
2070 return notValidElt(Query, LitTyIdx);
2071 },
2072 scalarize(0))
2073 .fewerElementsIf(
2074 [=](const LegalityQuery &Query) {
2075 return notValidElt(Query, BigTyIdx);
2076 },
2077 scalarize(1))
2078 .clampScalar(BigTyIdx, S32, MaxScalar);
2079
2080 if (Op == G_MERGE_VALUES) {
2081 Builder.widenScalarIf(
2082 // TODO: Use 16-bit shifts if legal for 8-bit values?
2083 [=](const LegalityQuery &Query) {
2084 const LLT Ty = Query.Types[LitTyIdx];
2085 return Ty.getSizeInBits() < 32;
2086 },
2087 changeTo(LitTyIdx, S32));
2088 }
2089
2090 Builder.widenScalarIf(
2091 [=](const LegalityQuery &Query) {
2092 const LLT Ty = Query.Types[BigTyIdx];
2093 return Ty.getSizeInBits() % 16 != 0;
2094 },
2095 [=](const LegalityQuery &Query) {
2096 // Pick the next power of 2, or a multiple of 64 above 128 bits,
2097 // whichever is smaller.
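// Worked example (added for illustration): a 280-bit type first rounds up
// to the next power of two, 512; since that is >= 256, the multiple-of-64
// value 320 is smaller and is chosen instead.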
2098 const LLT &Ty = Query.Types[BigTyIdx];
2099 unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1);
2100 if (NewSizeInBits >= 256) {
2101 unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1);
2102 if (RoundedTo < NewSizeInBits)
2103 NewSizeInBits = RoundedTo;
2104 }
2105 return std::pair(BigTyIdx, LLT::scalar(NewSizeInBits));
2106 })
2107 // Any vectors left are the wrong size. Scalarize them.
2108 .scalarize(0)
2109 .scalarize(1);
2110 }
2111
2112 // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
2113 // RegBankSelect.
2114 auto &SextInReg = getActionDefinitionsBuilder(G_SEXT_INREG)
2115 .legalFor({{S32}, {S64}})
2116 .clampScalar(0, S32, S64);
2117
2118 if (ST.hasVOP3PInsts()) {
2119 SextInReg.lowerFor({{V2S16}})
2120 // Prefer to reduce vector widths for 16-bit vectors before lowering, to
2121 // get more vector shift opportunities, since we'll get those when
2122 // expanded.
2123 .clampMaxNumElementsStrict(0, S16, 2);
2124 } else if (ST.has16BitInsts()) {
2125 SextInReg.lowerFor({{S32}, {S64}, {S16}});
2126 } else {
2127 // Prefer to promote to s32 before lowering if we don't have 16-bit
2128 // shifts. This avoids a lot of intermediate truncate and extend operations.
2129 SextInReg.lowerFor({{S32}, {S64}});
2130 }
2131
2132 SextInReg
2133 .scalarize(0)
2134 .clampScalar(0, S32, S64)
2135 .lower();
2136
2137 getActionDefinitionsBuilder({G_ROTR, G_ROTL})
2138 .scalarize(0)
2139 .lower();
2140
2141 auto &FSHRActionDefs = getActionDefinitionsBuilder(G_FSHR);
2142 FSHRActionDefs.legalFor({{S32, S32}})
2143 .clampMaxNumElementsStrict(0, S16, 2);
2144 if (ST.hasVOP3PInsts())
2145 FSHRActionDefs.lowerFor({{V2S16, V2S16}});
2146 FSHRActionDefs.scalarize(0).lower();
2147
2148 if (ST.hasVOP3PInsts()) {
2150 .lowerFor({{V2S16, V2S16}})
2151 .clampMaxNumElementsStrict(0, S16, 2)
2152 .scalarize(0)
2153 .lower();
2154 } else {
2156 .scalarize(0)
2157 .lower();
2158 }
2159
2160 getActionDefinitionsBuilder(G_READCYCLECOUNTER)
2161 .legalFor({S64});
2162
2163 getActionDefinitionsBuilder(G_READSTEADYCOUNTER).legalFor({S64});
2164
2166 .alwaysLegal();
2167
2168 getActionDefinitionsBuilder({G_SMULO, G_UMULO})
2169 .scalarize(0)
2170 .minScalar(0, S32)
2171 .lower();
2172
2173 getActionDefinitionsBuilder({G_SBFX, G_UBFX})
2174 .legalFor({{S32, S32}, {S64, S32}})
2175 .clampScalar(1, S32, S32)
2176 .clampScalar(0, S32, S64)
2178 .scalarize(0);
2179
2181 {// TODO: Verify V_BFI_B32 is generated from expanded bit ops
2182 G_FCOPYSIGN,
2183
2184 G_ATOMIC_CMPXCHG_WITH_SUCCESS, G_ATOMICRMW_NAND, G_ATOMICRMW_FSUB,
2185 G_READ_REGISTER, G_WRITE_REGISTER,
2186
2187 G_SADDO, G_SSUBO})
2188 .lower();
2189
2190 if (ST.hasIEEEMinimumMaximumInsts()) {
2191 getActionDefinitionsBuilder({G_FMINIMUM, G_FMAXIMUM})
2192 .legalFor(FPTypesPK16)
2193 .clampMaxNumElements(0, S16, 2)
2194 .scalarize(0);
2195 } else if (ST.hasVOP3PInsts()) {
2196 getActionDefinitionsBuilder({G_FMINIMUM, G_FMAXIMUM})
2197 .lowerFor({V2S16})
2198 .clampMaxNumElementsStrict(0, S16, 2)
2199 .scalarize(0)
2200 .lower();
2201 } else {
2202 getActionDefinitionsBuilder({G_FMINIMUM, G_FMAXIMUM})
2203 .scalarize(0)
2204 .clampScalar(0, S32, S64)
2205 .lower();
2206 }
2207
2208 getActionDefinitionsBuilder({G_MEMCPY, G_MEMCPY_INLINE, G_MEMMOVE, G_MEMSET})
2209 .lower();
2210
2211 getActionDefinitionsBuilder({G_TRAP, G_DEBUGTRAP}).custom();
2212
2213 getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE,
2214 G_INDEXED_LOAD, G_INDEXED_SEXTLOAD,
2215 G_INDEXED_ZEXTLOAD, G_INDEXED_STORE})
2216 .unsupported();
2217
2219
2221 {G_VECREDUCE_SMIN, G_VECREDUCE_SMAX, G_VECREDUCE_UMIN, G_VECREDUCE_UMAX,
2222 G_VECREDUCE_ADD, G_VECREDUCE_MUL, G_VECREDUCE_FMUL, G_VECREDUCE_FMIN,
2223 G_VECREDUCE_FMAX, G_VECREDUCE_FMINIMUM, G_VECREDUCE_FMAXIMUM,
2224 G_VECREDUCE_OR, G_VECREDUCE_AND, G_VECREDUCE_XOR})
2225 .legalFor(AllVectors)
2226 .scalarize(1)
2227 .lower();
2228
2230 verify(*ST.getInstrInfo());
2231}
2232
2235 LostDebugLocObserver &LocObserver) const {
2236 MachineIRBuilder &B = Helper.MIRBuilder;
2237 MachineRegisterInfo &MRI = *B.getMRI();
2238
2239 switch (MI.getOpcode()) {
2240 case TargetOpcode::G_ADDRSPACE_CAST:
2241 return legalizeAddrSpaceCast(MI, MRI, B);
2242 case TargetOpcode::G_INTRINSIC_ROUNDEVEN:
2243 return legalizeFroundeven(MI, MRI, B);
2244 case TargetOpcode::G_FCEIL:
2245 return legalizeFceil(MI, MRI, B);
2246 case TargetOpcode::G_FREM:
2247 return legalizeFrem(MI, MRI, B);
2248 case TargetOpcode::G_INTRINSIC_TRUNC:
2249 return legalizeIntrinsicTrunc(MI, MRI, B);
2250 case TargetOpcode::G_SITOFP:
2251 return legalizeITOFP(MI, MRI, B, true);
2252 case TargetOpcode::G_UITOFP:
2253 return legalizeITOFP(MI, MRI, B, false);
2254 case TargetOpcode::G_FPTOSI:
2255 return legalizeFPTOI(MI, MRI, B, true);
2256 case TargetOpcode::G_FPTOUI:
2257 return legalizeFPTOI(MI, MRI, B, false);
2258 case TargetOpcode::G_FMINNUM:
2259 case TargetOpcode::G_FMAXNUM:
2260 case TargetOpcode::G_FMINIMUMNUM:
2261 case TargetOpcode::G_FMAXIMUMNUM:
2262 return legalizeMinNumMaxNum(Helper, MI);
2263 case TargetOpcode::G_EXTRACT:
2264 return legalizeExtract(Helper, MI);
2265 case TargetOpcode::G_INSERT:
2266 return legalizeInsert(Helper, MI);
2267 case TargetOpcode::G_EXTRACT_VECTOR_ELT:
2268 return legalizeExtractVectorElt(MI, MRI, B);
2269 case TargetOpcode::G_INSERT_VECTOR_ELT:
2270 return legalizeInsertVectorElt(MI, MRI, B);
2271 case TargetOpcode::G_FSIN:
2272 case TargetOpcode::G_FCOS:
2273 return legalizeSinCos(MI, MRI, B);
2274 case TargetOpcode::G_GLOBAL_VALUE:
2275 return legalizeGlobalValue(MI, MRI, B);
2276 case TargetOpcode::G_LOAD:
2277 case TargetOpcode::G_SEXTLOAD:
2278 case TargetOpcode::G_ZEXTLOAD:
2279 return legalizeLoad(Helper, MI);
2280 case TargetOpcode::G_STORE:
2281 return legalizeStore(Helper, MI);
2282 case TargetOpcode::G_FMAD:
2283 return legalizeFMad(MI, MRI, B);
2284 case TargetOpcode::G_FDIV:
2285 return legalizeFDIV(MI, MRI, B);
2286 case TargetOpcode::G_FFREXP:
2287 return legalizeFFREXP(MI, MRI, B);
2288 case TargetOpcode::G_FSQRT:
2289 return legalizeFSQRT(MI, MRI, B);
2290 case TargetOpcode::G_UDIV:
2291 case TargetOpcode::G_UREM:
2292 case TargetOpcode::G_UDIVREM:
2293 return legalizeUnsignedDIV_REM(MI, MRI, B);
2294 case TargetOpcode::G_SDIV:
2295 case TargetOpcode::G_SREM:
2296 case TargetOpcode::G_SDIVREM:
2297 return legalizeSignedDIV_REM(MI, MRI, B);
2298 case TargetOpcode::G_ATOMIC_CMPXCHG:
2299 return legalizeAtomicCmpXChg(MI, MRI, B);
2300 case TargetOpcode::G_FLOG2:
2301 return legalizeFlog2(MI, B);
2302 case TargetOpcode::G_FLOG:
2303 case TargetOpcode::G_FLOG10:
2304 return legalizeFlogCommon(MI, B);
2305 case TargetOpcode::G_FEXP2:
2306 return legalizeFExp2(MI, B);
2307 case TargetOpcode::G_FEXP:
2308 case TargetOpcode::G_FEXP10:
2309 return legalizeFExp(MI, B);
2310 case TargetOpcode::G_FPOW:
2311 return legalizeFPow(MI, B);
2312 case TargetOpcode::G_FFLOOR:
2313 return legalizeFFloor(MI, MRI, B);
2314 case TargetOpcode::G_BUILD_VECTOR:
2315 case TargetOpcode::G_BUILD_VECTOR_TRUNC:
2316 return legalizeBuildVector(MI, MRI, B);
2317 case TargetOpcode::G_MUL:
2318 return legalizeMul(Helper, MI);
2319 case TargetOpcode::G_CTLZ:
2320 case TargetOpcode::G_CTTZ:
2321 return legalizeCTLZ_CTTZ(MI, MRI, B);
2322 case TargetOpcode::G_CTLS:
2323 return legalizeCTLS(MI, MRI, B);
2324 case TargetOpcode::G_CTLZ_ZERO_UNDEF:
2325 return legalizeCTLZ_ZERO_UNDEF(MI, MRI, B);
2326 case TargetOpcode::G_STACKSAVE:
2327 return legalizeStackSave(MI, B);
2328 case TargetOpcode::G_GET_FPENV:
2329 return legalizeGetFPEnv(MI, MRI, B);
2330 case TargetOpcode::G_SET_FPENV:
2331 return legalizeSetFPEnv(MI, MRI, B);
2332 case TargetOpcode::G_TRAP:
2333 return legalizeTrap(MI, MRI, B);
2334 case TargetOpcode::G_DEBUGTRAP:
2335 return legalizeDebugTrap(MI, MRI, B);
2336 default:
2337 return false;
2338 }
2339
2340 llvm_unreachable("expected switch to return");
2341}
2342
2344 unsigned AS,
2346 MachineIRBuilder &B) const {
2347 MachineFunction &MF = B.getMF();
2348 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
2349 const LLT S32 = LLT::scalar(32);
2350 const LLT S64 = LLT::scalar(64);
2351
2353
2354 if (ST.hasApertureRegs()) {
2355 // Note: this register is somewhat broken. When used as a 32-bit operand,
2356 // it only returns zeroes. The real value is in the upper 32 bits.
2357 // Thus, we must emit an extract of the high 32 bits.
2358 const unsigned ApertureRegNo = (AS == AMDGPUAS::LOCAL_ADDRESS)
2359 ? AMDGPU::SRC_SHARED_BASE
2360 : AMDGPU::SRC_PRIVATE_BASE;
2361 assert((ApertureRegNo != AMDGPU::SRC_PRIVATE_BASE ||
2362 !ST.hasGloballyAddressableScratch()) &&
2363 "Cannot use src_private_base with globally addressable scratch!");
2365 MRI.setRegClass(Dst, &AMDGPU::SReg_64RegClass);
2366 B.buildCopy({Dst}, {Register(ApertureRegNo)});
2367 return B.buildUnmerge(S32, Dst).getReg(1);
2368 }
2369
2372 // For code object version 5, private_base and shared_base are passed through
2373 // implicit kernargs.
2377
2382 ST.getTargetLowering()->getImplicitParameterOffset(B.getMF(), Param);
2383
2384 Register KernargPtrReg = MRI.createGenericVirtualRegister(
2386
2387 if (!loadInputValue(KernargPtrReg, B,
2389 return Register();
2390
2392 PtrInfo.getWithOffset(Offset),
2396
2397 // Pointer address
2398 B.buildObjectPtrOffset(LoadAddr, KernargPtrReg,
2399 B.buildConstant(LLT::scalar(64), Offset).getReg(0));
2400 // Load address
2401 return B.buildLoad(S32, LoadAddr, *MMO).getReg(0);
2402 }
2403
2406
2408 return Register();
2409
2410 // TODO: Use custom PseudoSourceValue
2412
2413 // Offset into amd_queue_t for group_segment_aperture_base_hi /
2414 // private_segment_aperture_base_hi.
2415 uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;
2416
2418 PtrInfo,
2421 LLT::scalar(32), commonAlignment(Align(64), StructOffset));
2422
2423 B.buildObjectPtrOffset(
2424 LoadAddr, QueuePtr,
2425 B.buildConstant(LLT::scalar(64), StructOffset).getReg(0));
2426 return B.buildLoad(S32, LoadAddr, *MMO).getReg(0);
2427}
2428
2429/// Return true if the value is a known valid address, such that a null check is
2430/// not necessary.
2432 const AMDGPUTargetMachine &TM, unsigned AddrSpace) {
2433 MachineInstr *Def = MRI.getVRegDef(Val);
2434 switch (Def->getOpcode()) {
2435 case AMDGPU::G_FRAME_INDEX:
2436 case AMDGPU::G_GLOBAL_VALUE:
2437 case AMDGPU::G_BLOCK_ADDR:
2438 return true;
2439 case AMDGPU::G_CONSTANT: {
2440 const ConstantInt *CI = Def->getOperand(1).getCImm();
2441 return CI->getSExtValue() != AMDGPU::getNullPointerValue(AddrSpace);
2442 }
2443 default:
2444 return false;
2445 }
2446
2447 return false;
2448}
2449
2452 MachineIRBuilder &B) const {
2453 MachineFunction &MF = B.getMF();
2454
2455 // MI can either be a G_ADDRSPACE_CAST or a
2456 // G_INTRINSIC @llvm.amdgcn.addrspacecast.nonnull
2457 assert(MI.getOpcode() == TargetOpcode::G_ADDRSPACE_CAST ||
2458 (isa<GIntrinsic>(MI) && cast<GIntrinsic>(MI).getIntrinsicID() ==
2459 Intrinsic::amdgcn_addrspacecast_nonnull));
2460
2461 const LLT S32 = LLT::scalar(32);
2462 Register Dst = MI.getOperand(0).getReg();
2463 Register Src = isa<GIntrinsic>(MI) ? MI.getOperand(2).getReg()
2464 : MI.getOperand(1).getReg();
2465 LLT DstTy = MRI.getType(Dst);
2466 LLT SrcTy = MRI.getType(Src);
2467 unsigned DestAS = DstTy.getAddressSpace();
2468 unsigned SrcAS = SrcTy.getAddressSpace();
2469
2470 // TODO: Avoid reloading from the queue ptr for each cast, or at least each
2471 // vector element.
2472 assert(!DstTy.isVector());
2473
2474 const AMDGPUTargetMachine &TM
2475 = static_cast<const AMDGPUTargetMachine &>(MF.getTarget());
2476
2477 if (TM.isNoopAddrSpaceCast(SrcAS, DestAS)) {
2478 MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST));
2479 return true;
2480 }
2481
2482 if (SrcAS == AMDGPUAS::FLAT_ADDRESS &&
2483 (DestAS == AMDGPUAS::LOCAL_ADDRESS ||
2484 DestAS == AMDGPUAS::PRIVATE_ADDRESS)) {
2485 auto castFlatToLocalOrPrivate = [&](const DstOp &Dst) -> Register {
2486 if (DestAS == AMDGPUAS::PRIVATE_ADDRESS &&
2487 ST.hasGloballyAddressableScratch()) {
2488 // flat -> private with globally addressable scratch: subtract
2489 // src_flat_scratch_base_lo.
2490 const LLT S32 = LLT::scalar(32);
2491 Register SrcLo = B.buildExtract(S32, Src, 0).getReg(0);
2492 Register FlatScratchBaseLo =
2493 B.buildInstr(AMDGPU::S_MOV_B32, {S32},
2494 {Register(AMDGPU::SRC_FLAT_SCRATCH_BASE_LO)})
2495 .getReg(0);
2496 MRI.setRegClass(FlatScratchBaseLo, &AMDGPU::SReg_32RegClass);
2497 Register Sub = B.buildSub(S32, SrcLo, FlatScratchBaseLo).getReg(0);
2498 return B.buildIntToPtr(Dst, Sub).getReg(0);
2499 }
2500
2501 // Extract low 32-bits of the pointer.
2502 return B.buildExtract(Dst, Src, 0).getReg(0);
2503 };
2504
2505 // For llvm.amdgcn.addrspacecast.nonnull we can always assume non-null, for
2506 // G_ADDRSPACE_CAST we need to guess.
2507 if (isa<GIntrinsic>(MI) || isKnownNonNull(Src, MRI, TM, SrcAS)) {
2508 castFlatToLocalOrPrivate(Dst);
2509 MI.eraseFromParent();
2510 return true;
2511 }
2512
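// Illustrative note: the segment null value differs from the flat null
// value (getNullPointerValue returns -1 for local/private, while flat null
// is 0), so the cast selects between the truncated pointer and the segment
// null constant based on a compare against flat null.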
2513 unsigned NullVal = AMDGPU::getNullPointerValue(DestAS);
2514
2515 auto SegmentNull = B.buildConstant(DstTy, NullVal);
2516 auto FlatNull = B.buildConstant(SrcTy, 0);
2517
2518 // Extract low 32-bits of the pointer.
2519 auto PtrLo32 = castFlatToLocalOrPrivate(DstTy);
2520
2521 auto CmpRes =
2522 B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, FlatNull.getReg(0));
2523 B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0));
2524
2525 MI.eraseFromParent();
2526 return true;
2527 }
2528
2529 if (DestAS == AMDGPUAS::FLAT_ADDRESS &&
2530 (SrcAS == AMDGPUAS::LOCAL_ADDRESS ||
2531 SrcAS == AMDGPUAS::PRIVATE_ADDRESS)) {
2532 auto castLocalOrPrivateToFlat = [&](const DstOp &Dst) -> Register {
2533 // Coerce the type of the low half of the result so we can use
2534 // merge_values.
2535 Register SrcAsInt = B.buildPtrToInt(S32, Src).getReg(0);
2536
2537 if (SrcAS == AMDGPUAS::PRIVATE_ADDRESS &&
2538 ST.hasGloballyAddressableScratch()) {
2539 // For wave32: Addr = (TID[4:0] << 52) + FLAT_SCRATCH_BASE + privateAddr
2540 // For wave64: Addr = (TID[5:0] << 51) + FLAT_SCRATCH_BASE + privateAddr
2541 Register AllOnes = B.buildConstant(S32, -1).getReg(0);
2542 Register ThreadID = B.buildConstant(S32, 0).getReg(0);
2543 ThreadID = B.buildIntrinsic(Intrinsic::amdgcn_mbcnt_lo, {S32})
2544 .addUse(AllOnes)
2545 .addUse(ThreadID)
2546 .getReg(0);
2547 if (ST.isWave64()) {
2548 ThreadID = B.buildIntrinsic(Intrinsic::amdgcn_mbcnt_hi, {S32})
2549 .addUse(AllOnes)
2550 .addUse(ThreadID)
2551 .getReg(0);
2552 }
2553 Register ShAmt =
2554 B.buildConstant(S32, 57 - 32 - ST.getWavefrontSizeLog2()).getReg(0);
2555 Register SrcHi = B.buildShl(S32, ThreadID, ShAmt).getReg(0);
2556 Register CvtPtr =
2557 B.buildMergeLikeInstr(DstTy, {SrcAsInt, SrcHi}).getReg(0);
2558 // Accessing src_flat_scratch_base_lo as a 64-bit operand gives the full
2559 // 64-bit hi:lo value.
2560 Register FlatScratchBase =
2561 B.buildInstr(AMDGPU::S_MOV_B64, {S64},
2562 {Register(AMDGPU::SRC_FLAT_SCRATCH_BASE)})
2563 .getReg(0);
2564 MRI.setRegClass(FlatScratchBase, &AMDGPU::SReg_64RegClass);
2565 return B.buildPtrAdd(Dst, CvtPtr, FlatScratchBase).getReg(0);
2566 }
2567
2568 Register ApertureReg = getSegmentAperture(SrcAS, MRI, B);
2569 if (!ApertureReg.isValid())
2570 return false;
2571
2572 // TODO: Should we allow mismatched types but matching sizes in merges to
2573 // avoid the ptrtoint?
2574 return B.buildMergeLikeInstr(Dst, {SrcAsInt, ApertureReg}).getReg(0);
2575 };
2576
2577 // For llvm.amdgcn.addrspacecast.nonnull we can always assume non-null, for
2578 // G_ADDRSPACE_CAST we need to guess.
2579 if (isa<GIntrinsic>(MI) || isKnownNonNull(Src, MRI, TM, SrcAS)) {
2580 castLocalOrPrivateToFlat(Dst);
2581 MI.eraseFromParent();
2582 return true;
2583 }
2584
2585 Register BuildPtr = castLocalOrPrivateToFlat(DstTy);
2586
2587 auto SegmentNull =
2588 B.buildConstant(SrcTy, AMDGPU::getNullPointerValue(SrcAS));
2589 auto FlatNull = B.buildConstant(DstTy, AMDGPU::getNullPointerValue(DestAS));
2590
2591 auto CmpRes = B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src,
2592 SegmentNull.getReg(0));
2593
2594 B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull);
2595
2596 MI.eraseFromParent();
2597 return true;
2598 }
2599
2600 if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
2601 SrcTy.getSizeInBits() == 64) {
2602 // Truncate.
2603 B.buildExtract(Dst, Src, 0);
2604 MI.eraseFromParent();
2605 return true;
2606 }
2607
2608 if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
2609 DstTy.getSizeInBits() == 64) {
2611 uint32_t AddrHiVal = Info->get32BitAddressHighBits();
2612 auto PtrLo = B.buildPtrToInt(S32, Src);
2613 if (AddrHiVal == 0) {
2614 auto Zext = B.buildZExt(LLT::scalar(64), PtrLo);
2615 B.buildIntToPtr(Dst, Zext);
2616 } else {
2617 auto HighAddr = B.buildConstant(S32, AddrHiVal);
2618 B.buildMergeLikeInstr(Dst, {PtrLo, HighAddr});
2619 }
2620
2621 MI.eraseFromParent();
2622 return true;
2623 }
2624
2625 // Invalid casts are poison.
2626 // TODO: Should return poison
2627 B.buildUndef(Dst);
2628 MI.eraseFromParent();
2629 return true;
2630}
2631
2634 MachineIRBuilder &B) const {
2635 Register Src = MI.getOperand(1).getReg();
2636 LLT Ty = MRI.getType(Src);
2637 assert(Ty.isScalar() && Ty.getSizeInBits() == 64);
2638
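// The constants below implement the usual add-and-subtract-2^52 rounding
// trick: in the default round-to-nearest-even mode, adding copysign(2^52, x)
// forces x to round to an integer, because the spacing of doubles at that
// magnitude is exactly 1.0, and subtracting it back recovers the rounded
// value. Worked example (added for illustration): 2.5 + 2^52 lands exactly
// halfway between two representable doubles, ties-to-even picks the even
// one, and subtracting 2^52 yields 2.0 = roundeven(2.5). Inputs with
// |x| > 0x1.fffffffffffffp+51 are already integral and are passed through
// by the final select.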
2639 APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
2640 APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");
2641
2642 auto C1 = B.buildFConstant(Ty, C1Val);
2643 auto CopySign = B.buildFCopysign(Ty, C1, Src);
2644
2645 // TODO: Should this propagate fast-math-flags?
2646 auto Tmp1 = B.buildFAdd(Ty, Src, CopySign);
2647 auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign);
2648
2649 auto C2 = B.buildFConstant(Ty, C2Val);
2650 auto Fabs = B.buildFAbs(Ty, Src);
2651
2652 auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2);
2653 B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2);
2654 MI.eraseFromParent();
2655 return true;
2656}
2657
2660 MachineIRBuilder &B) const {
2661
2662 const LLT S1 = LLT::scalar(1);
2663 const LLT S64 = LLT::scalar(64);
2664
2665 Register Src = MI.getOperand(1).getReg();
2666 assert(MRI.getType(Src) == S64);
2667
2668 // result = trunc(src)
2669 // if (src > 0.0 && src != result)
2670 // result += 1.0
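//
// Worked example (added for illustration): ceil(1.25) -> trunc = 1.0, and
// since 1.25 > 0.0 and 1.25 != 1.0 the select adds 1.0, giving 2.0. For
// -1.25 the first compare fails and the result stays at trunc(-1.25) = -1.0.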
2671
2672 auto Trunc = B.buildIntrinsicTrunc(S64, Src);
2673
2674 const auto Zero = B.buildFConstant(S64, 0.0);
2675 const auto One = B.buildFConstant(S64, 1.0);
2676 auto Lt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero);
2677 auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc);
2678 auto And = B.buildAnd(S1, Lt0, NeTrunc);
2679 auto Add = B.buildSelect(S64, And, One, Zero);
2680
2681 // TODO: Should this propagate fast-math-flags?
2682 B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add);
2683 MI.eraseFromParent();
2684 return true;
2685}
2686
2689 MachineIRBuilder &B) const {
2690 Register DstReg = MI.getOperand(0).getReg();
2691 Register Src0Reg = MI.getOperand(1).getReg();
2692 Register Src1Reg = MI.getOperand(2).getReg();
2693 auto Flags = MI.getFlags();
2694 LLT Ty = MRI.getType(DstReg);
2695
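// frem is expanded as x - trunc(x / y) * y, using an FMA with the negated
// truncated quotient. Worked example (added for illustration):
// frem(5.5, 2.0) -> div = 2.75, trunc = 2.0, fma(-2.0, 2.0, 5.5) = 1.5,
// which carries the sign of the dividend as required.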
2696 auto Div = B.buildFDiv(Ty, Src0Reg, Src1Reg, Flags);
2697 auto Trunc = B.buildIntrinsicTrunc(Ty, Div, Flags);
2698 auto Neg = B.buildFNeg(Ty, Trunc, Flags);
2699 B.buildFMA(DstReg, Neg, Src1Reg, Src0Reg, Flags);
2700 MI.eraseFromParent();
2701 return true;
2702}
2703
2706 const unsigned FractBits = 52;
2707 const unsigned ExpBits = 11;
2708 LLT S32 = LLT::scalar(32);
2709
2710 auto Const0 = B.buildConstant(S32, FractBits - 32);
2711 auto Const1 = B.buildConstant(S32, ExpBits);
2712
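// The ubfe below extracts the 11 exponent bits starting at bit 20 of the
// high word, and the final sub removes the IEEE-754 bias. Worked example
// (added for illustration): for 8.0 the high word is 0x40200000, the
// extracted field is 1026, and 1026 - 1023 = 3, the unbiased exponent.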
2713 auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32})
2714 .addUse(Hi)
2715 .addUse(Const0.getReg(0))
2716 .addUse(Const1.getReg(0));
2717
2718 return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023));
2719}
2720
2723 MachineIRBuilder &B) const {
2724 const LLT S1 = LLT::scalar(1);
2725 const LLT S32 = LLT::scalar(32);
2726 const LLT S64 = LLT::scalar(64);
2727
2728 Register Src = MI.getOperand(1).getReg();
2729 assert(MRI.getType(Src) == S64);
2730
2731 // TODO: Should this use extract since the low half is unused?
2732 auto Unmerge = B.buildUnmerge({S32, S32}, Src);
2733 Register Hi = Unmerge.getReg(1);
2734
2735 // Extract the upper half, since this is where we will find the sign and
2736 // exponent.
2737 auto Exp = extractF64Exponent(Hi, B);
2738
2739 const unsigned FractBits = 52;
2740
2741 // Extract the sign bit.
2742 const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31);
2743 auto SignBit = B.buildAnd(S32, Hi, SignBitMask);
2744
2745 const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1);
2746
2747 const auto Zero32 = B.buildConstant(S32, 0);
2748
2749 // Extend back to 64-bits.
2750 auto SignBit64 = B.buildMergeLikeInstr(S64, {Zero32, SignBit});
2751
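// Shifting the fraction mask right by the exponent leaves bits set only in
// the fractional positions, so ANDing the source with the complement clears
// everything below the binary point. Worked example (added for
// illustration): for 3.5 (unbiased exponent 1) the masked value is 3.0.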
2752 auto Shr = B.buildAShr(S64, FractMask, Exp);
2753 auto Not = B.buildNot(S64, Shr);
2754 auto Tmp0 = B.buildAnd(S64, Src, Not);
2755 auto FiftyOne = B.buildConstant(S32, FractBits - 1);
2756
2757 auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32);
2758 auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne);
2759
2760 auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0);
2761 B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1);
2762 MI.eraseFromParent();
2763 return true;
2764}
2765
2768 MachineIRBuilder &B, bool Signed) const {
2769
2770 Register Dst = MI.getOperand(0).getReg();
2771 Register Src = MI.getOperand(1).getReg();
2772
2773 const LLT S64 = LLT::scalar(64);
2774 const LLT S32 = LLT::scalar(32);
2775
2776 assert(MRI.getType(Src) == S64);
2777
2778 auto Unmerge = B.buildUnmerge({S32, S32}, Src);
2779 auto ThirtyTwo = B.buildConstant(S32, 32);
2780
2781 if (MRI.getType(Dst) == S64) {
2782 auto CvtHi = Signed ? B.buildSITOFP(S64, Unmerge.getReg(1))
2783 : B.buildUITOFP(S64, Unmerge.getReg(1));
2784
2785 auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0));
2786 auto LdExp = B.buildFLdexp(S64, CvtHi, ThirtyTwo);
2787
2788 // TODO: Should this propagate fast-math-flags?
2789 B.buildFAdd(Dst, LdExp, CvtLo);
2790 MI.eraseFromParent();
2791 return true;
2792 }
2793
2794 assert(MRI.getType(Dst) == S32);
2795
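// Editorial sketch of the s32-result path: shift the 64-bit source left so
// its leading significant bit reaches the top (the signed path keeps the
// sign bit in place), convert the high 32 bits, and rescale with ldexp by
// (32 - shift amount). The umin below folds a sticky bit from the discarded
// low half into the converted value so the final rounding still observes
// whether any low bits were nonzero.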
2796 auto One = B.buildConstant(S32, 1);
2797
2798 MachineInstrBuilder ShAmt;
2799 if (Signed) {
2800 auto ThirtyOne = B.buildConstant(S32, 31);
2801 auto X = B.buildXor(S32, Unmerge.getReg(0), Unmerge.getReg(1));
2802 auto OppositeSign = B.buildAShr(S32, X, ThirtyOne);
2803 auto MaxShAmt = B.buildAdd(S32, ThirtyTwo, OppositeSign);
2804 auto LS = B.buildIntrinsic(Intrinsic::amdgcn_sffbh, {S32})
2805 .addUse(Unmerge.getReg(1));
2806 auto LS2 = B.buildSub(S32, LS, One);
2807 ShAmt = B.buildUMin(S32, LS2, MaxShAmt);
2808 } else
2809 ShAmt = B.buildCTLZ(S32, Unmerge.getReg(1));
2810 auto Norm = B.buildShl(S64, Src, ShAmt);
2811 auto Unmerge2 = B.buildUnmerge({S32, S32}, Norm);
2812 auto Adjust = B.buildUMin(S32, One, Unmerge2.getReg(0));
2813 auto Norm2 = B.buildOr(S32, Unmerge2.getReg(1), Adjust);
2814 auto FVal = Signed ? B.buildSITOFP(S32, Norm2) : B.buildUITOFP(S32, Norm2);
2815 auto Scale = B.buildSub(S32, ThirtyTwo, ShAmt);
2816 B.buildFLdexp(Dst, FVal, Scale);
2817 MI.eraseFromParent();
2818 return true;
2819}
2820
2821// TODO: Copied from DAG implementation. Verify logic and document how this
2822// actually works.
2826 bool Signed) const {
2827
2828 Register Dst = MI.getOperand(0).getReg();
2829 Register Src = MI.getOperand(1).getReg();
2830
2831 const LLT S64 = LLT::scalar(64);
2832 const LLT S32 = LLT::scalar(32);
2833
2834 const LLT SrcLT = MRI.getType(Src);
2835 assert((SrcLT == S32 || SrcLT == S64) && MRI.getType(Dst) == S64);
2836
2837 unsigned Flags = MI.getFlags();
2838
2839 // The basic idea of converting a floating point number into a pair of 32-bit
2840 // integers is illustrated as follows:
2841 //
2842 // tf := trunc(val);
2843 // hif := floor(tf * 2^-32);
2844 // lof := tf - hif * 2^32; // lof is always positive due to floor.
2845 // hi := fptoi(hif);
2846 // lo := fptoi(lof);
2847 //
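// Worked example (added for illustration, f64 source): for val = 2^33 + 7,
// tf = val, hif = floor(tf * 2^-32) = 2, lof = tf - 2 * 2^32 = 7, so
// hi = 2 and lo = 7 reassemble the original 64-bit value.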
2848 auto Trunc = B.buildIntrinsicTrunc(SrcLT, Src, Flags);
2849 MachineInstrBuilder Sign;
2850 if (Signed && SrcLT == S32) {
2851 // However, a 32-bit floating point number has only 23 bits mantissa and
2852 // it's not enough to hold all the significant bits of `lof` if val is
2853 // negative. To avoid the loss of precision, we need to take the absolute
2854 // value after truncating and flip the result back based on the original
2855 // signedness.
2856 Sign = B.buildAShr(S32, Src, B.buildConstant(S32, 31));
2857 Trunc = B.buildFAbs(S32, Trunc, Flags);
2858 }
2859 MachineInstrBuilder K0, K1;
2860 if (SrcLT == S64) {
2861 K0 = B.buildFConstant(
2862 S64, llvm::bit_cast<double>(UINT64_C(/*2^-32*/ 0x3df0000000000000)));
2863 K1 = B.buildFConstant(
2864 S64, llvm::bit_cast<double>(UINT64_C(/*-2^32*/ 0xc1f0000000000000)));
2865 } else {
2866 K0 = B.buildFConstant(
2867 S32, llvm::bit_cast<float>(UINT32_C(/*2^-32*/ 0x2f800000)));
2868 K1 = B.buildFConstant(
2869 S32, llvm::bit_cast<float>(UINT32_C(/*-2^32*/ 0xcf800000)));
2870 }
2871
2872 auto Mul = B.buildFMul(SrcLT, Trunc, K0, Flags);
2873 auto FloorMul = B.buildFFloor(SrcLT, Mul, Flags);
2874 auto Fma = B.buildFMA(SrcLT, FloorMul, K1, Trunc, Flags);
2875
2876 auto Hi = (Signed && SrcLT == S64) ? B.buildFPTOSI(S32, FloorMul)
2877 : B.buildFPTOUI(S32, FloorMul);
2878 auto Lo = B.buildFPTOUI(S32, Fma);
2879
2880 if (Signed && SrcLT == S32) {
2881 // Flip the result based on the signedness, which is either all 0s or 1s.
2882 Sign = B.buildMergeLikeInstr(S64, {Sign, Sign});
2883 // r := xor({lo, hi}, sign) - sign;
2884 B.buildSub(Dst, B.buildXor(S64, B.buildMergeLikeInstr(S64, {Lo, Hi}), Sign),
2885 Sign);
2886 } else
2887 B.buildMergeLikeInstr(Dst, {Lo, Hi});
2888 MI.eraseFromParent();
2889
2890 return true;
2891}
2892
2894 MachineInstr &MI) const {
2895 MachineFunction &MF = Helper.MIRBuilder.getMF();
2897
2898 // With ieee_mode disabled, the instructions have the correct behavior.
2899 if (!MFI->getMode().IEEE)
2900 return true;
2901
2903}
2904
2906 MachineInstr &MI) const {
2907 MachineIRBuilder &B = Helper.MIRBuilder;
2908 MachineRegisterInfo &MRI = *B.getMRI();
2909 Register DstReg = MI.getOperand(0).getReg();
2910 Register SrcReg = MI.getOperand(1).getReg();
2911 uint64_t Offset = MI.getOperand(2).getImm();
2912
2913 // Fall back to generic lowering for offset 0 (trivial trunc) and
2914 // non-32-bit-aligned cases which require shift+trunc sequences
2915 // that generic code handles correctly.
2916 if (Offset == 0 || Offset % 32 != 0)
2917 return Helper.lowerExtract(MI) == LegalizerHelper::Legalized;
2918
2919 const LLT DstTy = MRI.getType(DstReg);
2920 unsigned StartIdx = Offset / 32;
2921 unsigned DstCount = DstTy.getSizeInBits() / 32;
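// Worked example (added for illustration): extracting an s64 at bit offset
// 64 from an s128 source gives StartIdx = 2 and DstCount = 2; the unmerge
// below produces four s32 pieces and pieces 2 and 3 are re-merged into the
// result.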
2922 auto Unmerge = B.buildUnmerge(LLT::scalar(32), SrcReg);
2923
2924 if (DstCount == 1) {
2925 if (DstTy.isPointer())
2926 B.buildIntToPtr(DstReg, Unmerge.getReg(StartIdx));
2927 else
2928 MRI.replaceRegWith(DstReg, Unmerge.getReg(StartIdx));
2929 } else {
2930 SmallVector<Register, 8> MergeVec;
2931 for (unsigned I = 0; I < DstCount; ++I)
2932 MergeVec.push_back(Unmerge.getReg(StartIdx + I));
2933 B.buildMergeLikeInstr(DstReg, MergeVec);
2934 }
2935
2936 MI.eraseFromParent();
2937 return true;
2938}
2939
2941 MachineInstr &MI) const {
2942 MachineIRBuilder &B = Helper.MIRBuilder;
2943 MachineRegisterInfo &MRI = *B.getMRI();
2944 Register DstReg = MI.getOperand(0).getReg();
2945 Register SrcReg = MI.getOperand(1).getReg();
2946 Register InsertSrc = MI.getOperand(2).getReg();
2947 uint64_t Offset = MI.getOperand(3).getImm();
2948
2949 unsigned DstSize = MRI.getType(DstReg).getSizeInBits();
2950 const LLT InsertTy = MRI.getType(InsertSrc);
2951 unsigned InsertSize = InsertTy.getSizeInBits();
2952
2953 // Fall back to generic lowering for non-32-bit-aligned cases which
2954 // require shift+mask sequences that generic code handles correctly.
2955 if (Offset % 32 != 0 || DstSize % 32 != 0 || InsertSize % 32 != 0)
2956 return Helper.lowerInsert(MI) == LegalizerHelper::Legalized;
2957
2958 const LLT S32 = LLT::scalar(32);
2959 unsigned DstCount = DstSize / 32;
2960 unsigned InsertCount = InsertSize / 32;
2961 unsigned StartIdx = Offset / 32;
2962
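// Worked example (added for illustration): inserting an s32 at bit offset
// 32 into an s96 destination gives StartIdx = 1 and InsertCount = 1, so
// the result is rebuilt as {src piece 0, inserted value, src piece 2}.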
2963 auto SrcUnmerge = B.buildUnmerge(S32, SrcReg);
2964
2965 SmallVector<Register, 8> MergeVec;
2966 for (unsigned I = 0; I < StartIdx; ++I)
2967 MergeVec.push_back(SrcUnmerge.getReg(I));
2968
2969 if (InsertCount == 1) {
2970 // Merge-like instructions require the same source types. Convert the
2971 // pointer to a scalar when inserting a pointer value into a scalar.
2972 if (InsertTy.isPointer())
2973 InsertSrc = B.buildPtrToInt(S32, InsertSrc).getReg(0);
2974 MergeVec.push_back(InsertSrc);
2975 } else {
2976 auto InsertUnmerge = B.buildUnmerge(S32, InsertSrc);
2977 for (unsigned I = 0; I < InsertCount; ++I)
2978 MergeVec.push_back(InsertUnmerge.getReg(I));
2979 }
2980
2981 for (unsigned I = StartIdx + InsertCount; I < DstCount; ++I)
2982 MergeVec.push_back(SrcUnmerge.getReg(I));
2983
2984 B.buildMergeLikeInstr(DstReg, MergeVec);
2985
2986 MI.eraseFromParent();
2987 return true;
2988}
2989
2992 MachineIRBuilder &B) const {
2993 // TODO: Should move some of this into LegalizerHelper.
2994
2995 // TODO: Promote dynamic indexing of s16 to s32
2996
2997 Register Dst = MI.getOperand(0).getReg();
2998 Register Vec = MI.getOperand(1).getReg();
2999
3000 LLT VecTy = MRI.getType(Vec);
3001 LLT EltTy = VecTy.getElementType();
3002 assert(EltTy == MRI.getType(Dst));
3003
3004 // Other legalization maps vector<? x [type bigger than 64 bits]> via bitcasts
3005 // but we can't go directly to that logic because you can't bitcast a vector
3006 // of pointers to a vector of integers. Therefore, introduce an intermediate
3007 // vector of integers using ptrtoint (and inttoptr on the output) in order to
3008 // drive the legalization forward.
3009 if (EltTy.isPointer() && EltTy.getSizeInBits() > 64) {
3010 LLT IntTy = LLT::scalar(EltTy.getSizeInBits());
3011 LLT IntVecTy = VecTy.changeElementType(IntTy);
3012
3013 auto IntVec = B.buildPtrToInt(IntVecTy, Vec);
3014 auto IntElt = B.buildExtractVectorElement(IntTy, IntVec, MI.getOperand(2));
3015 B.buildIntToPtr(Dst, IntElt);
3016
3017 MI.eraseFromParent();
3018 return true;
3019 }
3020
3021 // FIXME: Artifact combiner probably should have replaced the truncated
3022 // constant before this, so we shouldn't need
3023 // getIConstantVRegValWithLookThrough.
3024 std::optional<ValueAndVReg> MaybeIdxVal =
3025 getIConstantVRegValWithLookThrough(MI.getOperand(2).getReg(), MRI);
3026 if (!MaybeIdxVal) // Dynamic case will be selected to register indexing.
3027 return true;
3028 const uint64_t IdxVal = MaybeIdxVal->Value.getZExtValue();
3029
3030 if (IdxVal < VecTy.getNumElements()) {
3031 auto Unmerge = B.buildUnmerge(EltTy, Vec);
3032 B.buildCopy(Dst, Unmerge.getReg(IdxVal));
3033 } else {
3034 B.buildUndef(Dst);
3035 }
3036
3037 MI.eraseFromParent();
3038 return true;
3039}
3040
3043 MachineIRBuilder &B) const {
3044 // TODO: Should move some of this into LegalizerHelper.
3045
3046 // TODO: Promote dynamic indexing of s16 to s32
3047
3048 Register Dst = MI.getOperand(0).getReg();
3049 Register Vec = MI.getOperand(1).getReg();
3050 Register Ins = MI.getOperand(2).getReg();
3051
3052 LLT VecTy = MRI.getType(Vec);
3053 LLT EltTy = VecTy.getElementType();
3054 assert(EltTy == MRI.getType(Ins));
3055
3056 // Other legalization maps vector<? x [type bigger than 64 bits]> via bitcasts
3057 // but we can't go directly to that logic because you can't bitcast a vector
3058 // of pointers to a vector of integers. Therefore, make the pointer vector
3059 // into an equivalent vector of integers with ptrtoint, insert the ptrtoint'd
3060 // new value, and then inttoptr the result vector back. This will then allow
3061 // the rest of legalization to take over.
3062 if (EltTy.isPointer() && EltTy.getSizeInBits() > 64) {
3063 LLT IntTy = LLT::scalar(EltTy.getSizeInBits());
3064 LLT IntVecTy = VecTy.changeElementType(IntTy);
3065
3066 auto IntVecSource = B.buildPtrToInt(IntVecTy, Vec);
3067 auto IntIns = B.buildPtrToInt(IntTy, Ins);
3068 auto IntVecDest = B.buildInsertVectorElement(IntVecTy, IntVecSource, IntIns,
3069 MI.getOperand(3));
3070 B.buildIntToPtr(Dst, IntVecDest);
3071 MI.eraseFromParent();
3072 return true;
3073 }
3074
3075 // FIXME: Artifact combiner probably should have replaced the truncated
3076 // constant before this, so we shouldn't need
3077 // getIConstantVRegValWithLookThrough.
3078 std::optional<ValueAndVReg> MaybeIdxVal =
3079 getIConstantVRegValWithLookThrough(MI.getOperand(3).getReg(), MRI);
3080 if (!MaybeIdxVal) // Dynamic case will be selected to register indexing.
3081 return true;
3082
3083 const uint64_t IdxVal = MaybeIdxVal->Value.getZExtValue();
3084
3085 unsigned NumElts = VecTy.getNumElements();
3086 if (IdxVal < NumElts) {
3087 SmallVector<Register, 8> SrcRegs;
3088 for (unsigned i = 0; i < NumElts; ++i)
3089 SrcRegs.push_back(MRI.createGenericVirtualRegister(EltTy));
3090 B.buildUnmerge(SrcRegs, Vec);
3091
3092 SrcRegs[IdxVal] = MI.getOperand(2).getReg();
3093 B.buildMergeLikeInstr(Dst, SrcRegs);
3094 } else {
3095 B.buildUndef(Dst);
3096 }
3097
3098 MI.eraseFromParent();
3099 return true;
3100}
3101
3104 MachineIRBuilder &B) const {
3105
3106 Register DstReg = MI.getOperand(0).getReg();
3107 Register SrcReg = MI.getOperand(1).getReg();
3108 LLT Ty = MRI.getType(DstReg);
3109 unsigned Flags = MI.getFlags();
3110
3111 Register TrigVal;
3112 auto OneOver2Pi = B.buildFConstant(Ty, 0.5 * numbers::inv_pi);
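// The hardware sin/cos intrinsics take their operand pre-scaled by
// 1/(2*pi), i.e. in turns rather than radians, hence the multiply by
// OneOver2Pi. On subtargets that report a reduced trig input range, the
// amdgcn.fract call first wraps the scaled value into [0, 1).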
3113 if (ST.hasTrigReducedRange()) {
3114 auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags);
3115 TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty})
3116 .addUse(MulVal.getReg(0))
3117 .setMIFlags(Flags)
3118 .getReg(0);
3119 } else
3120 TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0);
3121
3122 Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ?
3123 Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos;
3124 B.buildIntrinsic(TrigIntrin, ArrayRef<Register>(DstReg))
3125 .addUse(TrigVal)
3126 .setMIFlags(Flags);
3127 MI.eraseFromParent();
3128 return true;
3129}
3130
3133 const GlobalValue *GV,
3134 int64_t Offset,
3135 unsigned GAFlags) const {
3136 assert(isInt<32>(Offset + 4) && "32-bit offset is expected!");
3137 // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered
3138 // to the following code sequence:
3139 //
3140 // For constant address space:
3141 // s_getpc_b64 s[0:1]
3142 // s_add_u32 s0, s0, $symbol
3143 // s_addc_u32 s1, s1, 0
3144 //
3145 // s_getpc_b64 returns the address of the s_add_u32 instruction and then
3146 // a fixup or relocation is emitted to replace $symbol with a literal
3147 // constant, which is a pc-relative offset from the encoding of the $symbol
3148 // operand to the global variable.
3149 //
3150 // For global address space:
3151 // s_getpc_b64 s[0:1]
3152 // s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo
3153 // s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi
3154 //
3155 // s_getpc_b64 returns the address of the s_add_u32 instruction and then
3156 // fixups or relocations are emitted to replace $symbol@*@lo and
3157 // $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant,
3158 // which is a 64-bit pc-relative offset from the encoding of the $symbol
3159 // operand to the global variable.
3160
3162
3163 Register PCReg = PtrTy.getSizeInBits() != 32 ? DstReg :
3164 B.getMRI()->createGenericVirtualRegister(ConstPtrTy);
3165
3166 if (ST.has64BitLiterals()) {
3167 assert(GAFlags != SIInstrInfo::MO_NONE);
3168
3170 B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET64).addDef(PCReg);
3171 MIB.addGlobalAddress(GV, Offset, GAFlags + 2);
3172 } else {
3174 B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET).addDef(PCReg);
3175
3176 MIB.addGlobalAddress(GV, Offset, GAFlags);
3177 if (GAFlags == SIInstrInfo::MO_NONE)
3178 MIB.addImm(0);
3179 else
3180 MIB.addGlobalAddress(GV, Offset, GAFlags + 1);
3181 }
3182
3183 if (!B.getMRI()->getRegClassOrNull(PCReg))
3184 B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass);
3185
3186 if (PtrTy.getSizeInBits() == 32)
3187 B.buildExtract(DstReg, PCReg, 0);
3188 return true;
3189}
3190
3191 // Emit an ABS32_LO / ABS32_HI relocation stub.
3193 Register DstReg, LLT PtrTy, MachineIRBuilder &B, const GlobalValue *GV,
3194 MachineRegisterInfo &MRI) const {
3195 bool RequiresHighHalf = PtrTy.getSizeInBits() != 32;
3196
3197 if (RequiresHighHalf && ST.has64BitLiterals()) {
3198 if (!MRI.getRegClassOrNull(DstReg))
3199 MRI.setRegClass(DstReg, &AMDGPU::SReg_64RegClass);
3200 B.buildInstr(AMDGPU::S_MOV_B64)
3201 .addDef(DstReg)
3202 .addGlobalAddress(GV, 0, SIInstrInfo::MO_ABS64);
3203 return;
3204 }
3205
3206 LLT S32 = LLT::scalar(32);
3207
3208 // Use the destination directly if and only if we only store the lower
3209 // address part and no register class has been set on it.
3210 Register AddrLo = !RequiresHighHalf && !MRI.getRegClassOrNull(DstReg)
3211 ? DstReg
3213
3214 if (!MRI.getRegClassOrNull(AddrLo))
3215 MRI.setRegClass(AddrLo, &AMDGPU::SReg_32RegClass);
3216
3217 // Write the lower half.
3218 B.buildInstr(AMDGPU::S_MOV_B32)
3219 .addDef(AddrLo)
3220 .addGlobalAddress(GV, 0, SIInstrInfo::MO_ABS32_LO);
3221
3222 // If required, write the upper half as well.
3223 if (RequiresHighHalf) {
3224 assert(PtrTy.getSizeInBits() == 64 &&
3225 "Must provide a 64-bit pointer type!");
3226
3228 MRI.setRegClass(AddrHi, &AMDGPU::SReg_32RegClass);
3229
3230 B.buildInstr(AMDGPU::S_MOV_B32)
3231 .addDef(AddrHi)
3232 .addGlobalAddress(GV, 0, SIInstrInfo::MO_ABS32_HI);
3233
3234 // Use the destination directly, if and only if we don't have a register
3235 // class being set.
3236 Register AddrDst = !MRI.getRegClassOrNull(DstReg)
3237 ? DstReg
3239
3240 if (!MRI.getRegClassOrNull(AddrDst))
3241 MRI.setRegClass(AddrDst, &AMDGPU::SReg_64RegClass);
3242
3243 B.buildMergeValues(AddrDst, {AddrLo, AddrHi});
3244
3245 // If we created a new register for the destination, cast the result into
3246 // the final output.
3247 if (AddrDst != DstReg)
3248 B.buildCast(DstReg, AddrDst);
3249 } else if (AddrLo != DstReg) {
3250 // If we created a new register for the destination, cast the result into
3251 // the final output.
3252 B.buildCast(DstReg, AddrLo);
3253 }
3254}
3255
3258 MachineIRBuilder &B) const {
3259 Register DstReg = MI.getOperand(0).getReg();
3260 LLT Ty = MRI.getType(DstReg);
3261 unsigned AS = Ty.getAddressSpace();
3262
3263 const GlobalValue *GV = MI.getOperand(1).getGlobal();
3264 MachineFunction &MF = B.getMF();
3266
3268 if (!MFI->isModuleEntryFunction() &&
3269 GV->getName() != "llvm.amdgcn.module.lds" &&
3271 const Function &Fn = MF.getFunction();
3273 Fn, "local memory global used by non-kernel function",
3274 MI.getDebugLoc(), DS_Warning));
3275
3276 // We currently don't have a way to correctly allocate LDS objects that
3277 // aren't directly associated with a kernel. We do force inlining of
3278 // functions that use local objects. However, if these dead functions are
3279 // not eliminated, we don't want a compile time error. Just emit a warning
3280 // and a trap, since there should be no callable path here.
3281 B.buildTrap();
3282 B.buildUndef(DstReg);
3283 MI.eraseFromParent();
3284 return true;
3285 }
3286
3287 // TODO: We could emit code to handle the initialization somewhere.
3288 // We ignore the initializer for now and legalize it to allow selection.
3289 // The initializer will be rejected during assembly emission anyway.
3290 const SITargetLowering *TLI = ST.getTargetLowering();
3291 if (!TLI->shouldUseLDSConstAddress(GV)) {
3292 MI.getOperand(1).setTargetFlags(SIInstrInfo::MO_ABS32_LO);
3293 return true; // Leave in place.
3294 }
3295
3296 const GlobalVariable &GVar = *cast<GlobalVariable>(GV);
3297 if (AS == AMDGPUAS::LOCAL_ADDRESS && GV->hasExternalLinkage()) {
3298 // HIP uses an unsized array `extern __shared__ T s[]` or similar
3299 // zero-sized type in other languages to declare the dynamic shared
3300 // memory whose size is not known at compile time. They will be
3301 // allocated by the runtime and placed directly after the static
3302 // allocated ones. They all share the same offset.
3303 if (GVar.getGlobalSize(GVar.getDataLayout()) == 0) {
3304 // Adjust alignment for that dynamic shared memory array.
3305 MFI->setDynLDSAlign(MF.getFunction(), GVar);
3306 LLT S32 = LLT::scalar(32);
3307 auto Sz = B.buildIntrinsic(Intrinsic::amdgcn_groupstaticsize, {S32});
3308 B.buildIntToPtr(DstReg, Sz);
3309 MI.eraseFromParent();
3310 return true;
3311 }
3312 }
3313
3314 B.buildConstant(DstReg, MFI->allocateLDSGlobal(B.getDataLayout(), GVar));
3315 MI.eraseFromParent();
3316 return true;
3317 }
3318
3319 if (ST.isAmdPalOS() || ST.isMesa3DOS()) {
3320 buildAbsGlobalAddress(DstReg, Ty, B, GV, MRI);
3321 MI.eraseFromParent();
3322 return true;
3323 }
3324
3325 const SITargetLowering *TLI = ST.getTargetLowering();
3326
3327 if (TLI->shouldEmitFixup(GV)) {
3328 buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0);
3329 MI.eraseFromParent();
3330 return true;
3331 }
3332
3333 if (TLI->shouldEmitPCReloc(GV)) {
3334 buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32);
3335 MI.eraseFromParent();
3336 return true;
3337 }
3338
3340 Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy);
3341
3342 LLT LoadTy = Ty.getSizeInBits() == 32 ? PtrTy : Ty;
3347 LoadTy, Align(8));
3348
3349 buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32);
3350
3351 if (Ty.getSizeInBits() == 32) {
3352 // Truncate if this is a 32-bit constant address.
3353 auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO);
3354 B.buildExtract(DstReg, Load, 0);
3355 } else
3356 B.buildLoad(DstReg, GOTAddr, *GOTMMO);
3357
3358 MI.eraseFromParent();
3359 return true;
3360}
3361
3363 if (Ty.isVector())
3364 return Ty.changeElementCount(
3365 ElementCount::getFixed(PowerOf2Ceil(Ty.getNumElements())));
3366 return LLT::scalar(PowerOf2Ceil(Ty.getSizeInBits()));
3367}
3368
3370 MachineInstr &MI) const {
3371 MachineIRBuilder &B = Helper.MIRBuilder;
3372 MachineRegisterInfo &MRI = *B.getMRI();
3373 GISelChangeObserver &Observer = Helper.Observer;
3374
3375 Register PtrReg = MI.getOperand(1).getReg();
3376 LLT PtrTy = MRI.getType(PtrReg);
3377 unsigned AddrSpace = PtrTy.getAddressSpace();
3378
3379 if (AddrSpace == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
3381 auto Cast = B.buildAddrSpaceCast(ConstPtr, PtrReg);
3382 Observer.changingInstr(MI);
3383 MI.getOperand(1).setReg(Cast.getReg(0));
3384 Observer.changedInstr(MI);
3385 return true;
3386 }
3387
3388 if (MI.getOpcode() != AMDGPU::G_LOAD)
3389 return false;
3390
3391 Register ValReg = MI.getOperand(0).getReg();
3392 LLT ValTy = MRI.getType(ValReg);
3393
3394 if (hasBufferRsrcWorkaround(ValTy)) {
3395 Observer.changingInstr(MI);
3396 castBufferRsrcFromV4I32(MI, B, MRI, 0);
3397 Observer.changedInstr(MI);
3398 return true;
3399 }
3400
3401 MachineMemOperand *MMO = *MI.memoperands_begin();
3402 const unsigned ValSize = ValTy.getSizeInBits();
3403 const LLT MemTy = MMO->getMemoryType();
3404 const Align MemAlign = MMO->getAlign();
3405 const unsigned MemSize = MemTy.getSizeInBits();
3406 const uint64_t AlignInBits = 8 * MemAlign.value();
3407
3408 // Widen non-power-of-2 loads to the alignment if needed
3409 if (shouldWidenLoad(ST, MemTy, AlignInBits, AddrSpace, MI.getOpcode())) {
3410 const unsigned WideMemSize = PowerOf2Ceil(MemSize);
3411
3412 // This was already the correct extending load result type, so just adjust
3413 // the memory type.
3414 if (WideMemSize == ValSize) {
3415 MachineFunction &MF = B.getMF();
3416
3417 MachineMemOperand *WideMMO =
3418 MF.getMachineMemOperand(MMO, 0, WideMemSize / 8);
3419 Observer.changingInstr(MI);
3420 MI.setMemRefs(MF, {WideMMO});
3421 Observer.changedInstr(MI);
3422 return true;
3423 }
3424
3425 // Don't bother handling an edge case that should probably never be produced.
3426 if (ValSize > WideMemSize)
3427 return false;
3428
3429 LLT WideTy = widenToNextPowerOf2(ValTy);
3430
3431 Register WideLoad;
3432 if (!WideTy.isVector()) {
3433 WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
3434 B.buildTrunc(ValReg, WideLoad).getReg(0);
3435 } else {
3436 // Extract the subvector.
3437
3438 if (isRegisterType(ST, ValTy)) {
3439 // If this is a case where G_EXTRACT is legal, use it.
3440 // (e.g. <3 x s32> -> <4 x s32>)
3441 WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
3442 B.buildExtract(ValReg, WideLoad, 0);
3443 } else {
3444 // For cases where the widened type isn't a nice register value, unmerge
3445 // from a widened register (e.g. <3 x s16> -> <4 x s16>)
3446 WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
3447 B.buildDeleteTrailingVectorElements(ValReg, WideLoad);
3448 }
3449 }
3450
3451 MI.eraseFromParent();
3452 return true;
3453 }
3454
3455 return false;
3456}
3457
3459 MachineInstr &MI) const {
3460 MachineIRBuilder &B = Helper.MIRBuilder;
3461 MachineRegisterInfo &MRI = *B.getMRI();
3462 GISelChangeObserver &Observer = Helper.Observer;
3463
3464 Register DataReg = MI.getOperand(0).getReg();
3465 LLT DataTy = MRI.getType(DataReg);
3466
3467 if (hasBufferRsrcWorkaround(DataTy)) {
3468 Observer.changingInstr(MI);
3470 Observer.changedInstr(MI);
3471 return true;
3472 }
3473 return false;
3474}
3475
3478 MachineIRBuilder &B) const {
3479 LLT Ty = MRI.getType(MI.getOperand(0).getReg());
3480 assert(Ty.isScalar());
3481
3482 MachineFunction &MF = B.getMF();
3484
3485 // TODO: Always legal with future ftz flag.
3486 // TODO: Type is expected to be LLT::float32()/LLT::float16()
3487 // FIXME: Do we need just output?
3488 if (Ty == LLT::scalar(32) &&
3490 return true;
3491 if (Ty == LLT::scalar(16) &&
3493 return true;
3494
3495 MachineIRBuilder HelperBuilder(MI);
3496 GISelObserverWrapper DummyObserver;
3497 LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
3498 return Helper.lowerFMad(MI) == LegalizerHelper::Legalized;
3499}
3500
3503 Register DstReg = MI.getOperand(0).getReg();
3504 Register PtrReg = MI.getOperand(1).getReg();
3505 Register CmpVal = MI.getOperand(2).getReg();
3506 Register NewVal = MI.getOperand(3).getReg();
3507
3509 "this should not have been custom lowered");
3510
3511 LLT ValTy = MRI.getType(CmpVal);
3512 LLT VecTy = LLT::fixed_vector(2, ValTy);
3513
3514 Register PackedVal = B.buildBuildVector(VecTy, { NewVal, CmpVal }).getReg(0);
3515
3516 B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG)
3517 .addDef(DstReg)
3518 .addUse(PtrReg)
3519 .addUse(PackedVal)
3520 .setMemRefs(MI.memoperands());
3521
3522 MI.eraseFromParent();
3523 return true;
3524}
3525
3526/// Return true if it's known that \p Src can never be an f32 denormal value.
3528 Register Src) {
3529 const MachineInstr *DefMI = MRI.getVRegDef(Src);
3530 switch (DefMI->getOpcode()) {
3531 case TargetOpcode::G_INTRINSIC: {
3533 case Intrinsic::amdgcn_frexp_mant:
3534 case Intrinsic::amdgcn_log:
3535 case Intrinsic::amdgcn_log_clamp:
3536 case Intrinsic::amdgcn_exp2:
3537 case Intrinsic::amdgcn_sqrt:
3538 return true;
3539 default:
3540 break;
3541 }
3542
3543 break;
3544 }
3545 case TargetOpcode::G_FSQRT:
3546 return true;
3547 case TargetOpcode::G_FFREXP: {
3548 if (DefMI->getOperand(0).getReg() == Src)
3549 return true;
3550 break;
3551 }
3552 case TargetOpcode::G_FPEXT: {
3553 return MRI.getType(DefMI->getOperand(1).getReg()) == LLT::scalar(16);
3554 }
3555 default:
3556 return false;
3557 }
3558
3559 return false;
3560}
3561
3562static bool allowApproxFunc(const MachineFunction &MF, unsigned Flags) {
3563 return Flags & MachineInstr::FmAfn;
3564}
3565
3567 unsigned Flags) {
3568 return !valueIsKnownNeverF32Denorm(MF.getRegInfo(), Src) &&
3571}
3572
3573std::pair<Register, Register>
3575 unsigned Flags) const {
3576 if (!needsDenormHandlingF32(B.getMF(), Src, Flags))
3577 return {};
3578
3579 const LLT F32 = LLT::scalar(32);
3580 auto SmallestNormal = B.buildFConstant(
3582 auto IsLtSmallestNormal =
3583 B.buildFCmp(CmpInst::FCMP_OLT, LLT::scalar(1), Src, SmallestNormal);
3584
3585 auto Scale32 = B.buildFConstant(F32, 0x1.0p+32);
3586 auto One = B.buildFConstant(F32, 1.0);
3587 auto ScaleFactor =
3588 B.buildSelect(F32, IsLtSmallestNormal, Scale32, One, Flags);
3589 auto ScaledInput = B.buildFMul(F32, Src, ScaleFactor, Flags);
3590
3591 return {ScaledInput.getReg(0), IsLtSmallestNormal.getReg(0)};
3592}
3593
3595 MachineIRBuilder &B) const {
3596 // v_log_f32 is good enough for OpenCL, except it doesn't handle denormals.
3597 // If we have to handle denormals, scale up the input and adjust the result.
3598
3599 // scaled = x * (is_denormal ? 0x1.0p+32 : 1.0)
3600 // log2 = amdgpu_log2 - (is_denormal ? 32.0 : 0.0)
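// Rationale: if the input was scaled by 2^32, then
// log2(x * 2^32) = log2(x) + 32, so subtracting the selected 32.0 below
// recovers log2 of the original (denormal) input.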
3601
3602 Register Dst = MI.getOperand(0).getReg();
3603 Register Src = MI.getOperand(1).getReg();
3604 LLT Ty = B.getMRI()->getType(Dst);
3605 unsigned Flags = MI.getFlags();
3606
3607 if (Ty == LLT::scalar(16)) {
3608 const LLT F32 = LLT::scalar(32);
3609 // Nothing in half is a denormal when promoted to f32.
3610 auto Ext = B.buildFPExt(F32, Src, Flags);
3611 auto Log2 = B.buildIntrinsic(Intrinsic::amdgcn_log, {F32})
3612 .addUse(Ext.getReg(0))
3613 .setMIFlags(Flags);
3614 B.buildFPTrunc(Dst, Log2, Flags);
3615 MI.eraseFromParent();
3616 return true;
3617 }
3618
3619 assert(Ty == LLT::scalar(32));
3620
3621 auto [ScaledInput, IsLtSmallestNormal] = getScaledLogInput(B, Src, Flags);
3622 if (!ScaledInput) {
3623 B.buildIntrinsic(Intrinsic::amdgcn_log, {MI.getOperand(0)})
3624 .addUse(Src)
3625 .setMIFlags(Flags);
3626 MI.eraseFromParent();
3627 return true;
3628 }
3629
3630 auto Log2 = B.buildIntrinsic(Intrinsic::amdgcn_log, {Ty})
3631 .addUse(ScaledInput)
3632 .setMIFlags(Flags);
3633
3634 auto ThirtyTwo = B.buildFConstant(Ty, 32.0);
3635 auto Zero = B.buildFConstant(Ty, 0.0);
3636 auto ResultOffset =
3637 B.buildSelect(Ty, IsLtSmallestNormal, ThirtyTwo, Zero, Flags);
3638 B.buildFSub(Dst, Log2, ResultOffset, Flags);
3639
3640 MI.eraseFromParent();
3641 return true;
3642}
3643
3645 Register Z, unsigned Flags) {
3646 auto FMul = B.buildFMul(Ty, X, Y, Flags);
3647 return B.buildFAdd(Ty, FMul, Z, Flags).getReg(0);
3648}
3649
3651 MachineIRBuilder &B) const {
3652 const bool IsLog10 = MI.getOpcode() == TargetOpcode::G_FLOG10;
3653 assert(IsLog10 || MI.getOpcode() == TargetOpcode::G_FLOG);
3654
3655 MachineRegisterInfo &MRI = *B.getMRI();
3656 Register Dst = MI.getOperand(0).getReg();
3657 Register X = MI.getOperand(1).getReg();
3658 unsigned Flags = MI.getFlags();
3659 const LLT Ty = MRI.getType(X);
3660
3661 const LLT F32 = LLT::scalar(32);
3662 const LLT F16 = LLT::scalar(16);
3663
3664 if (Ty == F16 || MI.getFlag(MachineInstr::FmAfn)) {
3665 // TODO: The direct f16 path is 1.79 ulp for f16. This should be used
3666 // depending on !fpmath metadata.
3667 bool PromoteToF32 =
3668 Ty == F16 && (!MI.getFlag(MachineInstr::FmAfn) || !ST.has16BitInsts());
3669 if (PromoteToF32) {
3671 auto PromoteSrc = B.buildFPExt(F32, X);
3672 legalizeFlogUnsafe(B, LogVal, PromoteSrc.getReg(0), IsLog10, Flags);
3673 B.buildFPTrunc(Dst, LogVal);
3674 } else {
3675 legalizeFlogUnsafe(B, Dst, X, IsLog10, Flags);
3676 }
3677
3678 MI.eraseFromParent();
3679 return true;
3680 }
3681
3682 auto [ScaledInput, IsScaled] = getScaledLogInput(B, X, Flags);
3683 if (ScaledInput)
3684 X = ScaledInput;
3685
3686 auto Y =
3687 B.buildIntrinsic(Intrinsic::amdgcn_log, {Ty}).addUse(X).setMIFlags(Flags);
3688
3689 Register R;
3690 if (ST.hasFastFMAF32()) {
3691 // c+cc are ln(2)/ln(10) to more than 49 bits
3692 const float c_log10 = 0x1.344134p-2f;
3693 const float cc_log10 = 0x1.09f79ep-26f;
3694
3695 // c + cc is ln(2) to more than 49 bits
3696 const float c_log = 0x1.62e42ep-1f;
3697 const float cc_log = 0x1.efa39ep-25f;
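// In both pairs the first constant carries the leading bits of the value and
// the second the residual. The FMA sequence below recovers the rounding error
// of Y*c and folds in Y*cc, yielding Y*(c + cc) to nearly double the working
// precision.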
3698
3699 auto C = B.buildFConstant(Ty, IsLog10 ? c_log10 : c_log);
3700 auto CC = B.buildFConstant(Ty, IsLog10 ? cc_log10 : cc_log);
3701 // The following adds correction terms; contracting them may increase the
3702 // error of the approximation, so disable contraction.
3703 auto NewFlags = Flags & ~(MachineInstr::FmContract);
3704 R = B.buildFMul(Ty, Y, C, NewFlags).getReg(0);
3705 auto NegR = B.buildFNeg(Ty, R, NewFlags);
3706 auto FMA0 = B.buildFMA(Ty, Y, C, NegR, NewFlags);
3707 auto FMA1 = B.buildFMA(Ty, Y, CC, FMA0, NewFlags);
3708 R = B.buildFAdd(Ty, R, FMA1, NewFlags).getReg(0);
3709 } else {
3710 // ch+ct is ln(2)/ln(10) to more than 36 bits
3711 const float ch_log10 = 0x1.344000p-2f;
3712 const float ct_log10 = 0x1.3509f6p-18f;
3713
3714 // ch + ct is ln(2) to more than 36 bits
3715 const float ch_log = 0x1.62e000p-1f;
3716 const float ct_log = 0x1.0bfbe8p-15f;
3717
3718 auto CH = B.buildFConstant(Ty, IsLog10 ? ch_log10 : ch_log);
3719 auto CT = B.buildFConstant(Ty, IsLog10 ? ct_log10 : ct_log);
3720
3721 auto MaskConst = B.buildConstant(Ty, 0xfffff000);
3722 auto YH = B.buildAnd(Ty, Y, MaskConst);
3723 auto YT = B.buildFSub(Ty, Y, YH, Flags);
3724 // The following adds correction terms; contracting them may increase the
3725 // error of the approximation, so disable contraction.
3726 auto NewFlags = Flags & ~(MachineInstr::FmContract);
3727 auto YTCT = B.buildFMul(Ty, YT, CT, NewFlags);
3728
3729 Register Mad0 =
3730 getMad(B, Ty, YH.getReg(0), CT.getReg(0), YTCT.getReg(0), NewFlags);
3731 Register Mad1 = getMad(B, Ty, YT.getReg(0), CH.getReg(0), Mad0, NewFlags);
3732 R = getMad(B, Ty, YH.getReg(0), CH.getReg(0), Mad1, NewFlags);
3733 }
3734
3735 const bool IsFiniteOnly =
3737
3738 if (!IsFiniteOnly) {
3739 // Expand isfinite(x) => fabs(x) < inf
3740 auto Inf = B.buildFConstant(Ty, APFloat::getInf(APFloat::IEEEsingle()));
3741 auto Fabs = B.buildFAbs(Ty, Y);
3742 auto IsFinite =
3743 B.buildFCmp(CmpInst::FCMP_OLT, LLT::scalar(1), Fabs, Inf, Flags);
3744 R = B.buildSelect(Ty, IsFinite, R, Y, Flags).getReg(0);
3745 }
3746
3747 if (ScaledInput) {
3748 auto Zero = B.buildFConstant(Ty, 0.0);
3749 auto ShiftK =
3750 B.buildFConstant(Ty, IsLog10 ? 0x1.344136p+3f : 0x1.62e430p+4f);
3751 auto Shift = B.buildSelect(Ty, IsScaled, ShiftK, Zero, Flags);
3752 B.buildFSub(Dst, R, Shift, Flags);
3753 } else {
3754 B.buildCopy(Dst, R);
3755 }
3756
3757 MI.eraseFromParent();
3758 return true;
3759}
3760
3762 Register Src, bool IsLog10,
3763 unsigned Flags) const {
3764 const double Log2BaseInverted =
3766
3767 LLT Ty = B.getMRI()->getType(Dst);
3768
3769 if (Ty == LLT::scalar(32)) {
3770 auto [ScaledInput, IsScaled] = getScaledLogInput(B, Src, Flags);
3771 if (ScaledInput) {
3772 auto LogSrc = B.buildIntrinsic(Intrinsic::amdgcn_log, {Ty})
3773 .addUse(Src)
3774 .setMIFlags(Flags);
3775 auto ScaledResultOffset = B.buildFConstant(Ty, -32.0 * Log2BaseInverted);
3776 auto Zero = B.buildFConstant(Ty, 0.0);
3777 auto ResultOffset =
3778 B.buildSelect(Ty, IsScaled, ScaledResultOffset, Zero, Flags);
3779 auto Log2Inv = B.buildFConstant(Ty, Log2BaseInverted);
3780
3781 if (ST.hasFastFMAF32())
3782 B.buildFMA(Dst, LogSrc, Log2Inv, ResultOffset, Flags);
3783 else {
3784 auto Mul = B.buildFMul(Ty, LogSrc, Log2Inv, Flags);
3785 B.buildFAdd(Dst, Mul, ResultOffset, Flags);
3786 }
3787
3788 return true;
3789 }
3790 }
3791
3792 auto Log2Operand = Ty == LLT::scalar(16)
3793 ? B.buildFLog2(Ty, Src, Flags)
3794 : B.buildIntrinsic(Intrinsic::amdgcn_log, {Ty})
3795 .addUse(Src)
3796 .setMIFlags(Flags);
3797 auto Log2BaseInvertedOperand = B.buildFConstant(Ty, Log2BaseInverted);
3798 B.buildFMul(Dst, Log2Operand, Log2BaseInvertedOperand, Flags);
3799 return true;
3800}
3801
3803 MachineIRBuilder &B) const {
3804 // v_exp_f32 is good enough for OpenCL, except it doesn't handle denormals.
3805 // If we have to handle denormals, scale up the input and adjust the result.
3806
3807 Register Dst = MI.getOperand(0).getReg();
3808 Register Src = MI.getOperand(1).getReg();
3809 unsigned Flags = MI.getFlags();
3810 LLT Ty = B.getMRI()->getType(Dst);
3811 const LLT F16 = LLT::scalar(16);
3812 const LLT F32 = LLT::scalar(32);
3813 const LLT F64 = LLT::scalar(64);
3814
3815 if (Ty == F64)
3816 return legalizeFEXPF64(MI, B);
3817
3818 if (Ty == F16) {
3819 // Nothing in half is a denormal when promoted to f32.
3820 auto Ext = B.buildFPExt(F32, Src, Flags);
3821 auto Log2 = B.buildIntrinsic(Intrinsic::amdgcn_exp2, {F32})
3822 .addUse(Ext.getReg(0))
3823 .setMIFlags(Flags);
3824 B.buildFPTrunc(Dst, Log2, Flags);
3825 MI.eraseFromParent();
3826 return true;
3827 }
3828
3829 assert(Ty == F32);
3830
3831 if (!needsDenormHandlingF32(B.getMF(), Src, Flags)) {
3832 B.buildIntrinsic(Intrinsic::amdgcn_exp2, ArrayRef<Register>{Dst})
3833 .addUse(Src)
3834 .setMIFlags(Flags);
3835 MI.eraseFromParent();
3836 return true;
3837 }
3838
3839 // bool needs_scaling = x < -0x1.f80000p+6f;
3840 // v_exp_f32(x + (s ? 0x1.0p+6f : 0.0f)) * (s ? 0x1.0p-64f : 1.0f);
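// The scaling is exact: exp2(x + 64) * 2^-64 == exp2(x). Adding 64 first
// keeps the hardware exp2 result out of the denormal range, and the final
// multiply by 2^-64 then produces the possibly-denormal result with an
// ordinary f32 multiply.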
3841
3842 // -nextafter(128.0, -1)
3843 auto RangeCheckConst = B.buildFConstant(Ty, -0x1.f80000p+6f);
3844 auto NeedsScaling = B.buildFCmp(CmpInst::FCMP_OLT, LLT::scalar(1), Src,
3845 RangeCheckConst, Flags);
3846
3847 auto SixtyFour = B.buildFConstant(Ty, 0x1.0p+6f);
3848 auto Zero = B.buildFConstant(Ty, 0.0);
3849 auto AddOffset = B.buildSelect(F32, NeedsScaling, SixtyFour, Zero, Flags);
3850 auto AddInput = B.buildFAdd(F32, Src, AddOffset, Flags);
3851
3852 auto Exp2 = B.buildIntrinsic(Intrinsic::amdgcn_exp2, {Ty})
3853 .addUse(AddInput.getReg(0))
3854 .setMIFlags(Flags);
3855
3856 auto TwoExpNeg64 = B.buildFConstant(Ty, 0x1.0p-64f);
3857 auto One = B.buildFConstant(Ty, 1.0);
3858 auto ResultScale = B.buildSelect(F32, NeedsScaling, TwoExpNeg64, One, Flags);
3859 B.buildFMul(Dst, Exp2, ResultScale, Flags);
3860 MI.eraseFromParent();
3861 return true;
3862}
3863
3865 const SrcOp &Src, unsigned Flags) {
3866 LLT Ty = Dst.getLLTTy(*B.getMRI());
3867
3868 if (Ty == LLT::scalar(32)) {
3869 return B.buildIntrinsic(Intrinsic::amdgcn_exp2, {Dst})
3870 .addUse(Src.getReg())
3871 .setMIFlags(Flags);
3872 }
3873 return B.buildFExp2(Dst, Src, Flags);
3874}
3875
3877 Register Dst, Register X,
3878 unsigned Flags,
3879 bool IsExp10) const {
3880 LLT Ty = B.getMRI()->getType(X);
3881
3882 // exp(x) -> exp2(M_LOG2E_F * x);
3883 // exp10(x) -> exp2(log2(10) * x);
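// 0x1.a934f0p+1f is log2(10) rounded to f32 (~3.3219281); numbers::log2e
// plays the same role for the natural exponential.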
3884 auto Const = B.buildFConstant(Ty, IsExp10 ? 0x1.a934f0p+1f : numbers::log2e);
3885 auto Mul = B.buildFMul(Ty, X, Const, Flags);
3886 buildExp(B, Dst, Mul, Flags);
3887 return true;
3888}
3889
3891 Register X, unsigned Flags) const {
3892 LLT Ty = B.getMRI()->getType(Dst);
3893 LLT F32 = LLT::scalar(32);
3894
3895 if (Ty != F32 || !needsDenormHandlingF32(B.getMF(), X, Flags)) {
3896 return legalizeFExpUnsafeImpl(B, Dst, X, Flags, /*IsExp10=*/false);
3897 }
3898
3899 auto Threshold = B.buildFConstant(Ty, -0x1.5d58a0p+6f);
3900 auto NeedsScaling =
3901 B.buildFCmp(CmpInst::FCMP_OLT, LLT::scalar(1), X, Threshold, Flags);
3902 auto ScaleOffset = B.buildFConstant(Ty, 0x1.0p+6f);
3903 auto ScaledX = B.buildFAdd(Ty, X, ScaleOffset, Flags);
3904 auto AdjustedX = B.buildSelect(Ty, NeedsScaling, ScaledX, X, Flags);
3905
3906 auto Log2E = B.buildFConstant(Ty, numbers::log2e);
3907 auto ExpInput = B.buildFMul(Ty, AdjustedX, Log2E, Flags);
3908
3909 auto Exp2 = B.buildIntrinsic(Intrinsic::amdgcn_exp2, {Ty})
3910 .addUse(ExpInput.getReg(0))
3911 .setMIFlags(Flags);
3912
3913 auto ResultScaleFactor = B.buildFConstant(Ty, 0x1.969d48p-93f);
3914 auto AdjustedResult = B.buildFMul(Ty, Exp2, ResultScaleFactor, Flags);
3915 B.buildSelect(Dst, NeedsScaling, AdjustedResult, Exp2, Flags);
3916 return true;
3917}
3918
3920 Register Dst, Register X,
3921 unsigned Flags) const {
3922 LLT Ty = B.getMRI()->getType(Dst);
3923 LLT F32 = LLT::scalar(32);
3924
3925 if (Ty != F32 || !needsDenormHandlingF32(B.getMF(), X, Flags)) {
3926 // exp2(x * 0x1.a92000p+1f) * exp2(x * 0x1.4f0978p-11f);
3927 auto K0 = B.buildFConstant(Ty, 0x1.a92000p+1f);
3928 auto K1 = B.buildFConstant(Ty, 0x1.4f0978p-11f);
3929
3930 auto Mul1 = B.buildFMul(Ty, X, K1, Flags);
3931 auto Exp2_1 = buildExp(B, Ty, Mul1, Flags);
3932 auto Mul0 = B.buildFMul(Ty, X, K0, Flags);
3933 auto Exp2_0 = buildExp(B, Ty, Mul0, Flags);
3934 B.buildFMul(Dst, Exp2_0, Exp2_1, Flags);
3935 return true;
3936 }
3937
3938 // bool s = x < -0x1.2f7030p+5f;
3939 // x += s ? 0x1.0p+5f : 0.0f;
3940 // exp10 = exp2(x * 0x1.a92000p+1f) *
3941 // exp2(x * 0x1.4f0978p-11f) *
3942 // (s ? 0x1.9f623ep-107f : 1.0f);
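// K0 + K1 is a two-term split of log2(10): 0x1.a92000p+1f + 0x1.4f0978p-11f
// ~= 3.3219280, so exp2(x*K0) * exp2(x*K1) == 2^(x*log2(10)) == 10^x while
// avoiding rounding x * log2(10) to a single f32 product.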
3943
3944 auto Threshold = B.buildFConstant(Ty, -0x1.2f7030p+5f);
3945 auto NeedsScaling =
3946 B.buildFCmp(CmpInst::FCMP_OLT, LLT::scalar(1), X, Threshold);
3947
3948 auto ScaleOffset = B.buildFConstant(Ty, 0x1.0p+5f);
3949 auto ScaledX = B.buildFAdd(Ty, X, ScaleOffset, Flags);
3950 auto AdjustedX = B.buildSelect(Ty, NeedsScaling, ScaledX, X);
3951
3952 auto K0 = B.buildFConstant(Ty, 0x1.a92000p+1f);
3953 auto K1 = B.buildFConstant(Ty, 0x1.4f0978p-11f);
3954
3955 auto Mul1 = B.buildFMul(Ty, AdjustedX, K1, Flags);
3956 auto Exp2_1 = buildExp(B, Ty, Mul1, Flags);
3957 auto Mul0 = B.buildFMul(Ty, AdjustedX, K0, Flags);
3958 auto Exp2_0 = buildExp(B, Ty, Mul0, Flags);
3959
3960 auto MulExps = B.buildFMul(Ty, Exp2_0, Exp2_1, Flags);
3961 auto ResultScaleFactor = B.buildFConstant(Ty, 0x1.9f623ep-107f);
3962 auto AdjustedResult = B.buildFMul(Ty, MulExps, ResultScaleFactor, Flags);
3963
3964 B.buildSelect(Dst, NeedsScaling, AdjustedResult, MulExps);
3965 return true;
3966}
3967
3968// This expansion gives a result accurate to slightly better than 1 ulp.
3970 MachineIRBuilder &B) const {
3971
3972 Register X = MI.getOperand(1).getReg();
3973 LLT S64 = LLT::scalar(64);
3974 LLT S32 = LLT::scalar(32);
3975 LLT S1 = LLT::scalar(1);
3976
3977 // TODO: Check whether reassociation is safe. Allowing it changes the output
3978 // of exp2 and exp10, slightly increasing the ulp error.
3979 unsigned Flags = MI.getFlags() & ~MachineInstr::FmReassoc;
3980
3981 Register Dn, F, T;
3982
3983 if (MI.getOpcode() == TargetOpcode::G_FEXP2) {
3984 // Dn = rint(X)
3985 Dn = B.buildFRint(S64, X, Flags).getReg(0);
3986 // F = X - Dn
3987 F = B.buildFSub(S64, X, Dn, Flags).getReg(0);
3988 // T = F*C1 + F*C2
3989 auto C1 = B.buildFConstant(S64, APFloat(0x1.62e42fefa39efp-1));
3990 auto C2 = B.buildFConstant(S64, APFloat(0x1.abc9e3b39803fp-56));
3991 auto Mul2 = B.buildFMul(S64, F, C2, Flags).getReg(0);
3992 T = B.buildFMA(S64, F, C1, Mul2, Flags).getReg(0);
3993
3994 } else if (MI.getOpcode() == TargetOpcode::G_FEXP10) {
3995 auto C1 = B.buildFConstant(S64, APFloat(0x1.a934f0979a371p+1));
3996 auto Mul = B.buildFMul(S64, X, C1, Flags).getReg(0);
3997 Dn = B.buildFRint(S64, Mul, Flags).getReg(0);
3998
3999 auto NegDn = B.buildFNeg(S64, Dn, Flags).getReg(0);
4000 auto C2 = B.buildFConstant(S64, APFloat(-0x1.9dc1da994fd21p-59));
4001 auto C3 = B.buildFConstant(S64, APFloat(0x1.34413509f79ffp-2));
4002 auto Inner = B.buildFMA(S64, NegDn, C3, X, Flags).getReg(0);
4003 F = B.buildFMA(S64, NegDn, C2, Inner, Flags).getReg(0);
4004
4005 auto C4 = B.buildFConstant(S64, APFloat(0x1.26bb1bbb55516p+1));
4006 auto C5 = B.buildFConstant(S64, APFloat(-0x1.f48ad494ea3e9p-53));
4007 auto MulF = B.buildFMul(S64, F, C5, Flags).getReg(0);
4008 T = B.buildFMA(S64, F, C4, MulF, Flags).getReg(0);
4009
4010 } else { // G_FEXP
4011 auto C1 = B.buildFConstant(S64, APFloat(0x1.71547652b82fep+0));
4012 auto Mul = B.buildFMul(S64, X, C1, Flags).getReg(0);
4013 Dn = B.buildFRint(S64, Mul, Flags).getReg(0);
4014
4015 auto NegDn = B.buildFNeg(S64, Dn, Flags).getReg(0);
4016 auto C2 = B.buildFConstant(S64, APFloat(0x1.abc9e3b39803fp-56));
4017 auto C3 = B.buildFConstant(S64, APFloat(0x1.62e42fefa39efp-1));
4018 auto Inner = B.buildFMA(S64, NegDn, C3, X, Flags).getReg(0);
4019 T = B.buildFMA(S64, NegDn, C2, Inner, Flags).getReg(0);
4020 }
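// In all three cases the result factors as 2^Dn * e^T, where Dn is the
// rounded integer exponent and T is the reduced argument with
// |T| <= ~ln(2)/2. The FMA chain below evaluates the Taylor series of e^T
// (coefficients ~1/k!) in Horner form, and the final ldexp applies the 2^Dn
// scaling.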
4021
4022 // Polynomial chain for P
4023 auto P = B.buildFConstant(S64, 0x1.ade156a5dcb37p-26);
4024 P = B.buildFMA(S64, T, P, B.buildFConstant(S64, 0x1.28af3fca7ab0cp-22),
4025 Flags);
4026 P = B.buildFMA(S64, T, P, B.buildFConstant(S64, 0x1.71dee623fde64p-19),
4027 Flags);
4028 P = B.buildFMA(S64, T, P, B.buildFConstant(S64, 0x1.a01997c89e6b0p-16),
4029 Flags);
4030 P = B.buildFMA(S64, T, P, B.buildFConstant(S64, 0x1.a01a014761f6ep-13),
4031 Flags);
4032 P = B.buildFMA(S64, T, P, B.buildFConstant(S64, 0x1.6c16c1852b7b0p-10),
4033 Flags);
4034 P = B.buildFMA(S64, T, P, B.buildFConstant(S64, 0x1.1111111122322p-7), Flags);
4035 P = B.buildFMA(S64, T, P, B.buildFConstant(S64, 0x1.55555555502a1p-5), Flags);
4036 P = B.buildFMA(S64, T, P, B.buildFConstant(S64, 0x1.5555555555511p-3), Flags);
4037 P = B.buildFMA(S64, T, P, B.buildFConstant(S64, 0x1.000000000000bp-1), Flags);
4038
4039 auto One = B.buildFConstant(S64, 1.0);
4040 P = B.buildFMA(S64, T, P, One, Flags);
4041 P = B.buildFMA(S64, T, P, One, Flags);
4042
4043 // Z = FLDEXP(P, (int)Dn)
4044 auto DnInt = B.buildFPTOSI(S32, Dn);
4045 auto Z = B.buildFLdexp(S64, P, DnInt, Flags);
4046
4047 if (!(Flags & MachineInstr::FmNoInfs)) {
4048 // Overflow guard: if X <= 1024.0 then Z else +inf
4049 auto CondHi = B.buildFCmp(CmpInst::FCMP_ULE, S1, X,
4050 B.buildFConstant(S64, APFloat(1024.0)));
4051 auto PInf = B.buildFConstant(S64, APFloat::getInf(APFloat::IEEEdouble()));
4052 Z = B.buildSelect(S64, CondHi, Z, PInf, Flags);
4053 }
4054
4055 // Underflow guard: if X >= -1075.0 then Z else 0.0
4056 auto CondLo = B.buildFCmp(CmpInst::FCMP_UGE, S1, X,
4057 B.buildFConstant(S64, APFloat(-1075.0)));
4058 auto Zero = B.buildFConstant(S64, APFloat(0.0));
4059 B.buildSelect(MI.getOperand(0).getReg(), CondLo, Z, Zero, Flags);
4060
4061 MI.eraseFromParent();
4062 return true;
4063}
4064
4066 MachineIRBuilder &B) const {
4067 Register Dst = MI.getOperand(0).getReg();
4068 Register X = MI.getOperand(1).getReg();
4069 const unsigned Flags = MI.getFlags();
4070 MachineFunction &MF = B.getMF();
4071 MachineRegisterInfo &MRI = *B.getMRI();
4072 LLT Ty = MRI.getType(Dst);
4073
4074 const LLT F64 = LLT::scalar(64);
4075
4076 if (Ty == F64)
4077 return legalizeFEXPF64(MI, B);
4078
4079 const LLT F16 = LLT::scalar(16);
4080 const LLT F32 = LLT::scalar(32);
4081 const bool IsExp10 = MI.getOpcode() == TargetOpcode::G_FEXP10;
4082
4083 if (Ty == F16) {
4084 // v_exp_f16 (fmul x, log2e)
4085 if (allowApproxFunc(MF, Flags)) {
4086 // TODO: Does this really require fast?
4087 IsExp10 ? legalizeFExp10Unsafe(B, Dst, X, Flags)
4088 : legalizeFExpUnsafe(B, Dst, X, Flags);
4089 MI.eraseFromParent();
4090 return true;
4091 }
4092
4093 // Nothing in half is a denormal when promoted to f32.
4094 //
4095 // exp(f16 x) ->
4096 // fptrunc (v_exp_f32 (fmul (fpext x), log2e))
4097 //
4098 // exp10(f16 x) ->
4099 // fptrunc (v_exp_f32 (fmul (fpext x), log2(10)))
4100 auto Ext = B.buildFPExt(F32, X, Flags);
4102 legalizeFExpUnsafeImpl(B, Lowered, Ext.getReg(0), Flags, IsExp10);
4103 B.buildFPTrunc(Dst, Lowered, Flags);
4104 MI.eraseFromParent();
4105 return true;
4106 }
4107
4108 assert(Ty == F32);
4109
4110 // TODO: Interpret allowApproxFunc as ignoring DAZ. This is currently copying
4111 // library behavior. Also, is known-not-daz source sufficient?
4112 if (allowApproxFunc(MF, Flags)) {
4113 IsExp10 ? legalizeFExp10Unsafe(B, Dst, X, Flags)
4114 : legalizeFExpUnsafe(B, Dst, X, Flags);
4115 MI.eraseFromParent();
4116 return true;
4117 }
4118
4119 // Algorithm:
4120 //
4121 // e^x = 2^(x/ln(2)) = 2^(x*(64/ln(2))/64)
4122 //
4123 // x*(64/ln(2)) = n + f, |f| <= 0.5, n is integer
4124 // n = 64*m + j, 0 <= j < 64
4125 //
4126 // e^x = 2^((64*m + j + f)/64)
4127 // = (2^m) * (2^(j/64)) * 2^(f/64)
4128 // = (2^m) * (2^(j/64)) * e^(f*(ln(2)/64))
4129 //
4130 // f = x*(64/ln(2)) - n
4131 // r = f*(ln(2)/64) = x - n*(ln(2)/64)
4132 //
4133 // e^x = (2^m) * (2^(j/64)) * e^r
4134 //
4135 // (2^(j/64)) is precomputed
4136 //
4137 // e^r = 1 + r + (r^2)/2! + (r^3)/3! + (r^4)/4! + (r^5)/5!
4138 // e^r = 1 + q
4139 //
4140 // q = r + (r^2)/2! + (r^3)/3! + (r^4)/4! + (r^5)/5!
4141 //
4142 // e^x = (2^m) * ( (2^(j/64)) + q*(2^(j/64)) )
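// Rather than the precomputed 2^(j/64) table sketched above, the lowering
// below keeps the exponent whole: it computes PH + PL ~= x * log2(e) (or
// x * log2(10)) in extended precision, takes E = roundeven(PH), and forms
// ldexp(exp2(PH - E + PL), E), with explicit underflow/overflow clamping
// afterwards.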
4143 const unsigned FlagsNoContract = Flags & ~MachineInstr::FmContract;
4144 Register PH, PL;
4145
4146 if (ST.hasFastFMAF32()) {
4147 const float c_exp = numbers::log2ef;
4148 const float cc_exp = 0x1.4ae0bep-26f; // c+cc are 49 bits
4149 const float c_exp10 = 0x1.a934f0p+1f;
4150 const float cc_exp10 = 0x1.2f346ep-24f;
4151
4152 auto C = B.buildFConstant(Ty, IsExp10 ? c_exp10 : c_exp);
4153 PH = B.buildFMul(Ty, X, C, Flags).getReg(0);
4154 auto NegPH = B.buildFNeg(Ty, PH, Flags);
4155 auto FMA0 = B.buildFMA(Ty, X, C, NegPH, Flags);
4156
4157 auto CC = B.buildFConstant(Ty, IsExp10 ? cc_exp10 : cc_exp);
4158 PL = B.buildFMA(Ty, X, CC, FMA0, Flags).getReg(0);
4159 } else {
4160 const float ch_exp = 0x1.714000p+0f;
4161 const float cl_exp = 0x1.47652ap-12f; // ch + cl are 36 bits
4162
4163 const float ch_exp10 = 0x1.a92000p+1f;
4164 const float cl_exp10 = 0x1.4f0978p-11f;
4165
4166 auto MaskConst = B.buildConstant(Ty, 0xfffff000);
4167 auto XH = B.buildAnd(Ty, X, MaskConst);
4168 auto XL = B.buildFSub(Ty, X, XH, Flags);
4169
4170 auto CH = B.buildFConstant(Ty, IsExp10 ? ch_exp10 : ch_exp);
4171 PH = B.buildFMul(Ty, XH, CH, Flags).getReg(0);
4172
4173 auto CL = B.buildFConstant(Ty, IsExp10 ? cl_exp10 : cl_exp);
4174 auto XLCL = B.buildFMul(Ty, XL, CL, Flags);
4175
4176 Register Mad0 =
4177 getMad(B, Ty, XL.getReg(0), CH.getReg(0), XLCL.getReg(0), Flags);
4178 PL = getMad(B, Ty, XH.getReg(0), CL.getReg(0), Mad0, Flags);
4179 }
4180
4181 auto E = B.buildIntrinsicRoundeven(Ty, PH, Flags);
4182
4183 // It is unsafe to contract this fsub into the PH multiply.
4184 auto PHSubE = B.buildFSub(Ty, PH, E, FlagsNoContract);
4185 auto A = B.buildFAdd(Ty, PHSubE, PL, Flags);
4186 auto IntE = B.buildFPTOSI(LLT::scalar(32), E);
4187
4188 auto Exp2 = B.buildIntrinsic(Intrinsic::amdgcn_exp2, {Ty})
4189 .addUse(A.getReg(0))
4190 .setMIFlags(Flags);
4191 auto R = B.buildFLdexp(Ty, Exp2, IntE, Flags);
4192
4193 auto UnderflowCheckConst =
4194 B.buildFConstant(Ty, IsExp10 ? -0x1.66d3e8p+5f : -0x1.9d1da0p+6f);
4195 auto Zero = B.buildFConstant(Ty, 0.0);
4196 auto Underflow =
4197 B.buildFCmp(CmpInst::FCMP_OLT, LLT::scalar(1), X, UnderflowCheckConst);
4198
4199 R = B.buildSelect(Ty, Underflow, Zero, R);
4200
4201 if (!(Flags & MachineInstr::FmNoInfs)) {
4202 auto OverflowCheckConst =
4203 B.buildFConstant(Ty, IsExp10 ? 0x1.344136p+5f : 0x1.62e430p+6f);
4204
4205 auto Overflow =
4206 B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), X, OverflowCheckConst);
4207 auto Inf = B.buildFConstant(Ty, APFloat::getInf(APFloat::IEEEsingle()));
4208 R = B.buildSelect(Ty, Overflow, Inf, R, Flags);
4209 }
4210
4211 B.buildCopy(Dst, R);
4212 MI.eraseFromParent();
4213 return true;
4214}
4215
4217 MachineIRBuilder &B) const {
4218 Register Dst = MI.getOperand(0).getReg();
4219 Register Src0 = MI.getOperand(1).getReg();
4220 Register Src1 = MI.getOperand(2).getReg();
4221 unsigned Flags = MI.getFlags();
4222 LLT Ty = B.getMRI()->getType(Dst);
4223 const LLT F16 = LLT::scalar(16); // TODO: Expected LLT::float16()
4224 const LLT F32 = LLT::scalar(32); // TODO: Expected LLT::float32()
4225
4226 if (Ty == F32) {
4227 auto Log = B.buildFLog2(F32, Src0, Flags);
4228 auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {F32})
4229 .addUse(Log.getReg(0))
4230 .addUse(Src1)
4231 .setMIFlags(Flags);
4232 B.buildFExp2(Dst, Mul, Flags);
4233 } else if (Ty == F16) {
4234 // There's no f16 fmul_legacy, so we need to convert to f32 for it.
4235 auto Log = B.buildFLog2(F16, Src0, Flags);
4236 auto Ext0 = B.buildFPExt(F32, Log, Flags);
4237 auto Ext1 = B.buildFPExt(F32, Src1, Flags);
4238 auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {F32})
4239 .addUse(Ext0.getReg(0))
4240 .addUse(Ext1.getReg(0))
4241 .setMIFlags(Flags);
4242 B.buildFExp2(Dst, B.buildFPTrunc(F16, Mul), Flags);
4243 } else
4244 return false;
4245
4246 MI.eraseFromParent();
4247 return true;
4248}
4249
4250// Find a source register, ignoring any possible source modifiers.
4252 Register ModSrc = OrigSrc;
4253 if (MachineInstr *SrcFNeg = getOpcodeDef(AMDGPU::G_FNEG, ModSrc, MRI)) {
4254 ModSrc = SrcFNeg->getOperand(1).getReg();
4255 if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
4256 ModSrc = SrcFAbs->getOperand(1).getReg();
4257 } else if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
4258 ModSrc = SrcFAbs->getOperand(1).getReg();
4259 return ModSrc;
4260}
4261
4264 MachineIRBuilder &B) const {
4265
4266 const LLT S1 = LLT::scalar(1);
4267 const LLT F64 = LLT::scalar(64); // TODO: Expected float64
4268 Register Dst = MI.getOperand(0).getReg();
4269 Register OrigSrc = MI.getOperand(1).getReg();
4270 unsigned Flags = MI.getFlags();
4271 assert(ST.hasFractBug() && MRI.getType(Dst) == F64 &&
4272 "this should not have been custom lowered");
4273
4274 // V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x))
4275 // is used instead. However, SI doesn't have V_FLOOR_F64, so the most
4276 // efficient way to implement it is using V_FRACT_F64. The workaround for the
4277 // V_FRACT bug is:
4278 // fract(x) = isnan(x) ? x : min(V_FRACT(x), 0.99999999999999999)
4279 //
4280 // Convert floor(x) to (x - fract(x))
4281
4282 auto Fract = B.buildIntrinsic(Intrinsic::amdgcn_fract, {F64})
4283 .addUse(OrigSrc)
4284 .setMIFlags(Flags);
4285
4286 // Give source modifier matching some assistance before obscuring a foldable
4287 // pattern.
4288
4289 // TODO: Can we avoid the neg on the fract? The input sign to fract
4290 // shouldn't matter.
4291 Register ModSrc = stripAnySourceMods(OrigSrc, MRI);
4292
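// 0x3fefffffffffffff is the largest f64 strictly less than 1.0 (1.0 - 2^-53),
// i.e. the clamp value mentioned in the workaround description above.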
4293 auto Const =
4294 B.buildFConstant(F64, llvm::bit_cast<double>(0x3fefffffffffffff));
4295
4297
4298 // We don't need to concern ourselves with the snan handling difference, so
4299 // use the one which will directly select.
4300 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
4301 if (MFI->getMode().IEEE)
4302 B.buildFMinNumIEEE(Min, Fract, Const, Flags);
4303 else
4304 B.buildFMinNum(Min, Fract, Const, Flags);
4305
4306 Register CorrectedFract = Min;
4307 if (!MI.getFlag(MachineInstr::FmNoNans)) {
4308 auto IsNan = B.buildFCmp(CmpInst::FCMP_ORD, S1, ModSrc, ModSrc, Flags);
4309 CorrectedFract = B.buildSelect(F64, IsNan, ModSrc, Min, Flags).getReg(0);
4310 }
4311
4312 auto NegFract = B.buildFNeg(F64, CorrectedFract, Flags);
4313 B.buildFAdd(Dst, OrigSrc, NegFract, Flags);
4314
4315 MI.eraseFromParent();
4316 return true;
4317}
4318
4319// Turn an illegal packed v2s16 build vector into bit operations.
4320// TODO: This should probably be a bitcast action in LegalizerHelper.
4323 Register Dst = MI.getOperand(0).getReg();
4324 const LLT S32 = LLT::scalar(32);
4325 const LLT S16 = LLT::scalar(16);
4326 assert(MRI.getType(Dst) == LLT::fixed_vector(2, 16));
4327
4328 Register Src0 = MI.getOperand(1).getReg();
4329 Register Src1 = MI.getOperand(2).getReg();
4330
4331 if (MI.getOpcode() == AMDGPU::G_BUILD_VECTOR_TRUNC) {
4332 assert(MRI.getType(Src0) == S32);
4333 Src0 = B.buildTrunc(S16, MI.getOperand(1).getReg()).getReg(0);
4334 Src1 = B.buildTrunc(S16, MI.getOperand(2).getReg()).getReg(0);
4335 }
4336
4337 auto Merge = B.buildMergeLikeInstr(S32, {Src0, Src1});
4338 B.buildBitcast(Dst, Merge);
4339
4340 MI.eraseFromParent();
4341 return true;
4342}
4343
4344// Build a big integer multiply or multiply-add using MAD_64_32 instructions.
4345//
4346// Source and accumulation registers must all be 32-bits.
4347//
4348// TODO: When the multiply is uniform, we should produce a code sequence
4349// that is better suited to instruction selection on the SALU. Instead of
4350// the outer loop going over parts of the result, the outer loop should go
4351// over parts of one of the factors. This should result in instruction
4352// selection that makes full use of S_ADDC_U32 instructions.
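// Conceptually this is schoolbook long multiplication on 32-bit digits: each
// Accum[k] gathers the partial products Src0[j] * Src1[k - j] plus carries
// from lower positions, with G_AMDGPU_MAD_U64_U32 providing the
// 32 x 32 -> 64 multiply-add building block.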
4355 ArrayRef<Register> Src0,
4356 ArrayRef<Register> Src1,
4357 bool UsePartialMad64_32,
4358 bool SeparateOddAlignedProducts) const {
4359 // Use (possibly empty) vectors of S1 registers to represent the set of
4360 // carries from one pair of positions to the next.
4361 using Carry = SmallVector<Register, 2>;
4362
4363 MachineIRBuilder &B = Helper.MIRBuilder;
4364 GISelValueTracking &VT = *Helper.getValueTracking();
4365
4366 const LLT S1 = LLT::scalar(1);
4367 const LLT S32 = LLT::scalar(32);
4368 const LLT S64 = LLT::scalar(64);
4369
4370 Register Zero32;
4371 Register Zero64;
4372
4373 auto getZero32 = [&]() -> Register {
4374 if (!Zero32)
4375 Zero32 = B.buildConstant(S32, 0).getReg(0);
4376 return Zero32;
4377 };
4378 auto getZero64 = [&]() -> Register {
4379 if (!Zero64)
4380 Zero64 = B.buildConstant(S64, 0).getReg(0);
4381 return Zero64;
4382 };
4383
4384 SmallVector<bool, 2> Src0KnownZeros, Src1KnownZeros;
4385 for (unsigned i = 0; i < Src0.size(); ++i) {
4386 Src0KnownZeros.push_back(VT.getKnownBits(Src0[i]).isZero());
4387 Src1KnownZeros.push_back(VT.getKnownBits(Src1[i]).isZero());
4388 }
4389
4390 // Merge the given carries into the 32-bit LocalAccum, which is modified
4391 // in-place.
4392 //
4393 // Returns the carry-out, which is a single S1 register or null.
4394 auto mergeCarry =
4395 [&](Register &LocalAccum, const Carry &CarryIn) -> Register {
4396 if (CarryIn.empty())
4397 return Register();
4398
4399 bool HaveCarryOut = true;
4400 Register CarryAccum;
4401 if (CarryIn.size() == 1) {
4402 if (!LocalAccum) {
4403 LocalAccum = B.buildZExt(S32, CarryIn[0]).getReg(0);
4404 return Register();
4405 }
4406
4407 CarryAccum = getZero32();
4408 } else {
4409 CarryAccum = B.buildZExt(S32, CarryIn[0]).getReg(0);
4410 for (unsigned i = 1; i + 1 < CarryIn.size(); ++i) {
4411 CarryAccum =
4412 B.buildUAdde(S32, S1, CarryAccum, getZero32(), CarryIn[i])
4413 .getReg(0);
4414 }
4415
4416 if (!LocalAccum) {
4417 LocalAccum = getZero32();
4418 HaveCarryOut = false;
4419 }
4420 }
4421
4422 auto Add =
4423 B.buildUAdde(S32, S1, CarryAccum, LocalAccum, CarryIn.back());
4424 LocalAccum = Add.getReg(0);
4425 return HaveCarryOut ? Add.getReg(1) : Register();
4426 };
4427
4428 // Build a multiply-add chain to compute
4429 //
4430 // LocalAccum + (partial products at DstIndex)
4431 // + (opportunistic subset of CarryIn)
4432 //
4433 // LocalAccum is an array of one or two 32-bit registers that are updated
4434 // in-place. The incoming registers may be null.
4435 //
4436 // In some edge cases, carry-ins can be consumed "for free". In that case,
4437 // the consumed carry bits are removed from CarryIn in-place.
4438 auto buildMadChain =
4439 [&](MutableArrayRef<Register> LocalAccum, unsigned DstIndex, Carry &CarryIn)
4440 -> Carry {
4441 assert((DstIndex + 1 < Accum.size() && LocalAccum.size() == 2) ||
4442 (DstIndex + 1 >= Accum.size() && LocalAccum.size() == 1));
4443
4444 Carry CarryOut;
4445 unsigned j0 = 0;
4446
4447 // Use plain 32-bit multiplication for the most significant part of the
4448 // result by default.
4449 if (LocalAccum.size() == 1 &&
4450 (!UsePartialMad64_32 || !CarryIn.empty())) {
4451 do {
4452 // Skip multiplication if one of the operands is 0
4453 unsigned j1 = DstIndex - j0;
4454 if (Src0KnownZeros[j0] || Src1KnownZeros[j1]) {
4455 ++j0;
4456 continue;
4457 }
4458 auto Mul = B.buildMul(S32, Src0[j0], Src1[j1]);
4459 if (!LocalAccum[0] || VT.getKnownBits(LocalAccum[0]).isZero()) {
4460 LocalAccum[0] = Mul.getReg(0);
4461 } else {
4462 if (CarryIn.empty()) {
4463 LocalAccum[0] = B.buildAdd(S32, LocalAccum[0], Mul).getReg(0);
4464 } else {
4465 LocalAccum[0] =
4466 B.buildUAdde(S32, S1, LocalAccum[0], Mul, CarryIn.back())
4467 .getReg(0);
4468 CarryIn.pop_back();
4469 }
4470 }
4471 ++j0;
4472 } while (j0 <= DstIndex && (!UsePartialMad64_32 || !CarryIn.empty()));
4473 }
4474
4475 // Build full 64-bit multiplies.
4476 if (j0 <= DstIndex) {
4477 bool HaveSmallAccum = false;
4478 Register Tmp;
4479
4480 if (LocalAccum[0]) {
4481 if (LocalAccum.size() == 1) {
4482 Tmp = B.buildAnyExt(S64, LocalAccum[0]).getReg(0);
4483 HaveSmallAccum = true;
4484 } else if (LocalAccum[1]) {
4485 Tmp = B.buildMergeLikeInstr(S64, LocalAccum).getReg(0);
4486 HaveSmallAccum = false;
4487 } else {
4488 Tmp = B.buildZExt(S64, LocalAccum[0]).getReg(0);
4489 HaveSmallAccum = true;
4490 }
4491 } else {
4492 assert(LocalAccum.size() == 1 || !LocalAccum[1]);
4493 Tmp = getZero64();
4494 HaveSmallAccum = true;
4495 }
4496
4497 do {
4498 unsigned j1 = DstIndex - j0;
4499 if (Src0KnownZeros[j0] || Src1KnownZeros[j1]) {
4500 ++j0;
4501 continue;
4502 }
4503 auto Mad = B.buildInstr(AMDGPU::G_AMDGPU_MAD_U64_U32, {S64, S1},
4504 {Src0[j0], Src1[j1], Tmp});
4505 Tmp = Mad.getReg(0);
4506 if (!HaveSmallAccum)
4507 CarryOut.push_back(Mad.getReg(1));
4508 HaveSmallAccum = false;
4509
4510 ++j0;
4511 } while (j0 <= DstIndex);
4512
4513 auto Unmerge = B.buildUnmerge(S32, Tmp);
4514 LocalAccum[0] = Unmerge.getReg(0);
4515 if (LocalAccum.size() > 1)
4516 LocalAccum[1] = Unmerge.getReg(1);
4517 }
4518
4519 return CarryOut;
4520 };
4521
4522 // Outer multiply loop, iterating over destination parts from least
4523 // significant to most significant parts.
4524 //
4525 // The columns of the following diagram correspond to the destination parts
4526 // affected by one iteration of the outer loop (ignoring boundary
4527 // conditions).
4528 //
4529 // Dest index relative to 2 * i: 1 0 -1
4530 // ------
4531 // Carries from previous iteration: e o
4532 // Even-aligned partial product sum: E E .
4533 // Odd-aligned partial product sum: O O
4534 //
4535 // 'o' is OddCarry, 'e' is EvenCarry.
4536 // EE and OO are computed from partial products via buildMadChain and use
4537 // accumulation where possible and appropriate.
4538 //
4539 Register SeparateOddCarry;
4540 Carry EvenCarry;
4541 Carry OddCarry;
4542
4543 for (unsigned i = 0; i <= Accum.size() / 2; ++i) {
4544 Carry OddCarryIn = std::move(OddCarry);
4545 Carry EvenCarryIn = std::move(EvenCarry);
4546 OddCarry.clear();
4547 EvenCarry.clear();
4548
4549 // Partial products at offset 2 * i.
4550 if (2 * i < Accum.size()) {
4551 auto LocalAccum = Accum.drop_front(2 * i).take_front(2);
4552 EvenCarry = buildMadChain(LocalAccum, 2 * i, EvenCarryIn);
4553 }
4554
4555 // Partial products at offset 2 * i - 1.
4556 if (i > 0) {
4557 if (!SeparateOddAlignedProducts) {
4558 auto LocalAccum = Accum.drop_front(2 * i - 1).take_front(2);
4559 OddCarry = buildMadChain(LocalAccum, 2 * i - 1, OddCarryIn);
4560 } else {
4561 bool IsHighest = 2 * i >= Accum.size();
4562 Register SeparateOddOut[2];
4563 auto LocalAccum = MutableArrayRef(SeparateOddOut)
4564 .take_front(IsHighest ? 1 : 2);
4565 OddCarry = buildMadChain(LocalAccum, 2 * i - 1, OddCarryIn);
4566
4568
4569 if (i == 1) {
4570 if (!IsHighest)
4571 Lo = B.buildUAddo(S32, S1, Accum[2 * i - 1], SeparateOddOut[0]);
4572 else
4573 Lo = B.buildAdd(S32, Accum[2 * i - 1], SeparateOddOut[0]);
4574 } else {
4575 Lo = B.buildUAdde(S32, S1, Accum[2 * i - 1], SeparateOddOut[0],
4576 SeparateOddCarry);
4577 }
4578 Accum[2 * i - 1] = Lo->getOperand(0).getReg();
4579
4580 if (!IsHighest) {
4581 auto Hi = B.buildUAdde(S32, S1, Accum[2 * i], SeparateOddOut[1],
4582 Lo->getOperand(1).getReg());
4583 Accum[2 * i] = Hi.getReg(0);
4584 SeparateOddCarry = Hi.getReg(1);
4585 }
4586 }
4587 }
4588
4589 // Add in the carries from the previous iteration
4590 if (i > 0) {
4591 if (Register CarryOut = mergeCarry(Accum[2 * i - 1], OddCarryIn))
4592 EvenCarryIn.push_back(CarryOut);
4593
4594 if (2 * i < Accum.size()) {
4595 if (Register CarryOut = mergeCarry(Accum[2 * i], EvenCarryIn))
4596 OddCarry.push_back(CarryOut);
4597 }
4598 }
4599 }
4600}
4601
4602// Custom narrowing of wide multiplies using wide multiply-add instructions.
4603//
4604// TODO: If the multiply is followed by an addition, we should attempt to
4605// integrate it to make better use of V_MAD_U64_U32's multiply-add capabilities.
4607 MachineInstr &MI) const {
4608 assert(ST.hasMad64_32());
4609 assert(MI.getOpcode() == TargetOpcode::G_MUL);
4610
4611 MachineIRBuilder &B = Helper.MIRBuilder;
4612 MachineRegisterInfo &MRI = *B.getMRI();
4613
4614 Register DstReg = MI.getOperand(0).getReg();
4615 Register Src0 = MI.getOperand(1).getReg();
4616 Register Src1 = MI.getOperand(2).getReg();
4617
4618 LLT Ty = MRI.getType(DstReg);
4619 assert(Ty.isScalar());
4620
4621 unsigned Size = Ty.getSizeInBits();
4622 if (ST.hasVectorMulU64() && Size == 64)
4623 return true;
4624
4625 unsigned NumParts = Size / 32;
4626 assert((Size % 32) == 0);
4627 assert(NumParts >= 2);
4628
4629 // Whether to use MAD_64_32 for partial products whose high half is
4630 // discarded. This avoids some ADD instructions but risks false dependency
4631 // stalls on some subtargets in some cases.
4632 const bool UsePartialMad64_32 = ST.getGeneration() < AMDGPUSubtarget::GFX10;
4633
4634 // Whether to compute odd-aligned partial products separately. This is
4635 // advisable on subtargets where the accumulator of MAD_64_32 must be placed
4636 // in an even-aligned VGPR.
4637 const bool SeparateOddAlignedProducts = ST.hasFullRate64Ops();
4638
4639 LLT S32 = LLT::scalar(32);
4640 SmallVector<Register, 2> Src0Parts, Src1Parts;
4641 for (unsigned i = 0; i < NumParts; ++i) {
4644 }
4645 B.buildUnmerge(Src0Parts, Src0);
4646 B.buildUnmerge(Src1Parts, Src1);
4647
4648 SmallVector<Register, 2> AccumRegs(NumParts);
4649 buildMultiply(Helper, AccumRegs, Src0Parts, Src1Parts, UsePartialMad64_32,
4650 SeparateOddAlignedProducts);
4651
4652 B.buildMergeLikeInstr(DstReg, AccumRegs);
4653 MI.eraseFromParent();
4654 return true;
4655}
4656
4657// Legalize ctlz/cttz to ffbh/ffbl instead of the default legalization to
4658// ctlz/cttz_zero_undef. This allows us to fix up the result for the zero input
4659// case with a single min instruction instead of a compare+select.
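// ffbh/ffbl return -1 (all ones) when no bit is set, so the unsigned min
// against the source bit width below maps the zero-input case to the expected
// ctlz/cttz result of SrcTy.getSizeInBits().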
4662 MachineIRBuilder &B) const {
4663 Register Dst = MI.getOperand(0).getReg();
4664 Register Src = MI.getOperand(1).getReg();
4665 LLT DstTy = MRI.getType(Dst);
4666 LLT SrcTy = MRI.getType(Src);
4667
4668 unsigned NewOpc = MI.getOpcode() == AMDGPU::G_CTLZ
4669 ? AMDGPU::G_AMDGPU_FFBH_U32
4670 : AMDGPU::G_AMDGPU_FFBL_B32;
4671 auto Tmp = B.buildInstr(NewOpc, {DstTy}, {Src});
4672 B.buildUMin(Dst, Tmp, B.buildConstant(DstTy, SrcTy.getSizeInBits()));
4673
4674 MI.eraseFromParent();
4675 return true;
4676}
4677
4680 MachineIRBuilder &B) const {
4681 Register Dst = MI.getOperand(0).getReg();
4682 Register Src = MI.getOperand(1).getReg();
4683 LLT SrcTy = MRI.getType(Src);
4684 TypeSize NumBits = SrcTy.getSizeInBits();
4685
4686 assert(NumBits < 32u);
4687
4688 auto ShiftAmt = B.buildConstant(S32, 32u - NumBits);
4689 auto Extend = B.buildAnyExt(S32, {Src}).getReg(0u);
4690 auto Shift = B.buildShl(S32, Extend, ShiftAmt);
4691 auto Ctlz = B.buildInstr(AMDGPU::G_AMDGPU_FFBH_U32, {S32}, {Shift});
4692 B.buildTrunc(Dst, Ctlz);
4693 MI.eraseFromParent();
4694 return true;
4695}
4696
4699 MachineIRBuilder &B) const {
4700 Register Dst = MI.getOperand(0).getReg();
4701 Register Src = MI.getOperand(1).getReg();
4702 LLT SrcTy = MRI.getType(Src);
4703 const LLT S32 = LLT::scalar(32);
4704 assert(SrcTy == S32 && "legalizeCTLS only supports s32");
4705 unsigned BitWidth = SrcTy.getSizeInBits();
4706
4707 auto Sffbh = B.buildIntrinsic(Intrinsic::amdgcn_sffbh, {S32}).addUse(Src);
4708 auto Clamped = B.buildUMin(S32, Sffbh, B.buildConstant(S32, BitWidth));
4709 B.buildSub(Dst, Clamped, B.buildConstant(S32, 1));
4710 MI.eraseFromParent();
4711 return true;
4712}
4713
4714// Check that this is a G_XOR x, -1
4715static bool isNot(const MachineRegisterInfo &MRI, const MachineInstr &MI) {
4716 if (MI.getOpcode() != TargetOpcode::G_XOR)
4717 return false;
4718 auto ConstVal = getIConstantVRegSExtVal(MI.getOperand(2).getReg(), MRI);
4719 return ConstVal == -1;
4720}
4721
4722// Return the branch instruction that uses the condition, or null if the usage is invalid.
4723static MachineInstr *
4725 MachineBasicBlock *&UncondBrTarget, bool &Negated) {
4726 Register CondDef = MI.getOperand(0).getReg();
4727 if (!MRI.hasOneNonDBGUse(CondDef))
4728 return nullptr;
4729
4730 MachineBasicBlock *Parent = MI.getParent();
4731 MachineInstr *UseMI = &*MRI.use_instr_nodbg_begin(CondDef);
4732
4733 if (isNot(MRI, *UseMI)) {
4734 Register NegatedCond = UseMI->getOperand(0).getReg();
4735 if (!MRI.hasOneNonDBGUse(NegatedCond))
4736 return nullptr;
4737
4738 // We're going to fold the negation into the branch, so delete its defining G_XOR.
4739 eraseInstr(*UseMI, MRI);
4740
4741 UseMI = &*MRI.use_instr_nodbg_begin(NegatedCond);
4742 Negated = true;
4743 }
4744
4745 if (UseMI->getParent() != Parent || UseMI->getOpcode() != AMDGPU::G_BRCOND)
4746 return nullptr;
4747
4748 // Make sure the cond br is followed by a G_BR, or is the last instruction.
4749 MachineBasicBlock::iterator Next = std::next(UseMI->getIterator());
4750 if (Next == Parent->end()) {
4751 MachineFunction::iterator NextMBB = std::next(Parent->getIterator());
4752 if (NextMBB == Parent->getParent()->end()) // Illegal intrinsic use.
4753 return nullptr;
4754 UncondBrTarget = &*NextMBB;
4755 } else {
4756 if (Next->getOpcode() != AMDGPU::G_BR)
4757 return nullptr;
4758 Br = &*Next;
4759 UncondBrTarget = Br->getOperand(0).getMBB();
4760 }
4761
4762 return UseMI;
4763}
4764
4767 const ArgDescriptor *Arg,
4768 const TargetRegisterClass *ArgRC,
4769 LLT ArgTy) const {
4770 MCRegister SrcReg = Arg->getRegister();
4771 assert(SrcReg.isPhysical() && "Physical register expected");
4772 assert(DstReg.isVirtual() && "Virtual register expected");
4773
4774 Register LiveIn = getFunctionLiveInPhysReg(B.getMF(), B.getTII(), SrcReg,
4775 *ArgRC, B.getDebugLoc(), ArgTy);
4776 if (Arg->isMasked()) {
4777 // TODO: Should we try to emit this once in the entry block?
4778 const LLT S32 = LLT::scalar(32);
4779 const unsigned Mask = Arg->getMask();
4780 const unsigned Shift = llvm::countr_zero<unsigned>(Mask);
4781
4782 Register AndMaskSrc = LiveIn;
4783
4784 // TODO: Avoid clearing the high bits if we know workitem id y/z are always
4785 // 0.
4786 if (Shift != 0) {
4787 auto ShiftAmt = B.buildConstant(S32, Shift);
4788 AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0);
4789 }
4790
4791 B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift));
4792 } else {
4793 B.buildCopy(DstReg, LiveIn);
4794 }
4795}
4796
4801 AMDGPUFunctionArgInfo::PreloadedValue ClusterWorkGroupIdPV) const {
4802 Register DstReg = MI.getOperand(0).getReg();
4803 if (!ST.hasClusters()) {
4804 if (!loadInputValue(DstReg, B, WorkGroupIdPV))
4805 return false;
4806 MI.eraseFromParent();
4807 return true;
4808 }
4809
4810 // Clusters are supported. Return the global position in the grid. If clusters
4811 // are enabled, WorkGroupIdPV returns the cluster ID, not the workgroup ID.
4812
4813 // WorkGroupIdXYZ = ClusterId == 0 ?
4814 // ClusterIdXYZ :
4815 // ClusterIdXYZ * (ClusterMaxIdXYZ + 1) + ClusterWorkGroupIdXYZ
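// ClusterMaxIdXYZ is the largest per-cluster workgroup index, so
// (ClusterMaxIdXYZ + 1) is the number of workgroups per cluster in each
// dimension; scaling the cluster index by it and adding the intra-cluster
// workgroup index gives the global workgroup index.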
4816 MachineRegisterInfo &MRI = *B.getMRI();
4817 const LLT S32 = LLT::scalar(32);
4818 Register ClusterIdXYZ = MRI.createGenericVirtualRegister(S32);
4819 Register ClusterMaxIdXYZ = MRI.createGenericVirtualRegister(S32);
4820 Register ClusterWorkGroupIdXYZ = MRI.createGenericVirtualRegister(S32);
4821 if (!loadInputValue(ClusterIdXYZ, B, WorkGroupIdPV) ||
4822 !loadInputValue(ClusterWorkGroupIdXYZ, B, ClusterWorkGroupIdPV) ||
4823 !loadInputValue(ClusterMaxIdXYZ, B, ClusterMaxIdPV))
4824 return false;
4825
4826 auto One = B.buildConstant(S32, 1);
4827 auto ClusterSizeXYZ = B.buildAdd(S32, ClusterMaxIdXYZ, One);
4828 auto GlobalIdXYZ = B.buildAdd(S32, ClusterWorkGroupIdXYZ,
4829 B.buildMul(S32, ClusterIdXYZ, ClusterSizeXYZ));
4830
4831 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
4832
4833 switch (MFI->getClusterDims().getKind()) {
4836 B.buildCopy(DstReg, GlobalIdXYZ);
4837 MI.eraseFromParent();
4838 return true;
4839 }
4841 B.buildCopy(DstReg, ClusterIdXYZ);
4842 MI.eraseFromParent();
4843 return true;
4844 }
4846 using namespace AMDGPU::Hwreg;
4847 unsigned ClusterIdField = HwregEncoding::encode(ID_IB_STS2, 6, 4);
4848 Register ClusterId = MRI.createGenericVirtualRegister(S32);
4849 MRI.setRegClass(ClusterId, &AMDGPU::SReg_32RegClass);
4850 B.buildInstr(AMDGPU::S_GETREG_B32_const)
4851 .addDef(ClusterId)
4852 .addImm(ClusterIdField);
4853 auto Zero = B.buildConstant(S32, 0);
4854 auto NoClusters =
4855 B.buildICmp(CmpInst::ICMP_EQ, LLT::scalar(1), ClusterId, Zero);
4856 B.buildSelect(DstReg, NoClusters, ClusterIdXYZ, GlobalIdXYZ);
4857 MI.eraseFromParent();
4858 return true;
4859 }
4860 }
4861
4862 llvm_unreachable("nothing should reach here");
4863}
4864
4866 Register DstReg, MachineIRBuilder &B,
4868 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
4869 const ArgDescriptor *Arg = nullptr;
4870 const TargetRegisterClass *ArgRC;
4871 LLT ArgTy;
4872
4873 CallingConv::ID CC = B.getMF().getFunction().getCallingConv();
4874 const ArgDescriptor WorkGroupIDX =
4875 ArgDescriptor::createRegister(AMDGPU::TTMP9);
4876 // If GridZ is not programmed in an entry function then the hardware will set
4877 // it to all zeros, so there is no need to mask the GridY value in the low
4878 // order bits.
4879 const ArgDescriptor WorkGroupIDY = ArgDescriptor::createRegister(
4880 AMDGPU::TTMP7,
4881 AMDGPU::isEntryFunctionCC(CC) && !MFI->hasWorkGroupIDZ() ? ~0u : 0xFFFFu);
4882 const ArgDescriptor WorkGroupIDZ =
4883 ArgDescriptor::createRegister(AMDGPU::TTMP7, 0xFFFF0000u);
4884 const ArgDescriptor ClusterWorkGroupIDX =
4885 ArgDescriptor::createRegister(AMDGPU::TTMP6, 0x0000000Fu);
4886 const ArgDescriptor ClusterWorkGroupIDY =
4887 ArgDescriptor::createRegister(AMDGPU::TTMP6, 0x000000F0u);
4888 const ArgDescriptor ClusterWorkGroupIDZ =
4889 ArgDescriptor::createRegister(AMDGPU::TTMP6, 0x00000F00u);
4890 const ArgDescriptor ClusterWorkGroupMaxIDX =
4891 ArgDescriptor::createRegister(AMDGPU::TTMP6, 0x0000F000u);
4892 const ArgDescriptor ClusterWorkGroupMaxIDY =
4893 ArgDescriptor::createRegister(AMDGPU::TTMP6, 0x000F0000u);
4894 const ArgDescriptor ClusterWorkGroupMaxIDZ =
4895 ArgDescriptor::createRegister(AMDGPU::TTMP6, 0x00F00000u);
4896 const ArgDescriptor ClusterWorkGroupMaxFlatID =
4897 ArgDescriptor::createRegister(AMDGPU::TTMP6, 0x0F000000u);
4898
4899 auto LoadConstant = [&](unsigned N) {
4900 B.buildConstant(DstReg, N);
4901 return true;
4902 };
4903
4904 if (ST.hasArchitectedSGPRs() &&
4906 AMDGPU::ClusterDimsAttr ClusterDims = MFI->getClusterDims();
4907 bool HasFixedDims = ClusterDims.isFixedDims();
4908
4909 switch (ArgType) {
4911 Arg = &WorkGroupIDX;
4912 ArgRC = &AMDGPU::SReg_32RegClass;
4913 ArgTy = LLT::scalar(32);
4914 break;
4916 Arg = &WorkGroupIDY;
4917 ArgRC = &AMDGPU::SReg_32RegClass;
4918 ArgTy = LLT::scalar(32);
4919 break;
4921 Arg = &WorkGroupIDZ;
4922 ArgRC = &AMDGPU::SReg_32RegClass;
4923 ArgTy = LLT::scalar(32);
4924 break;
4926 if (HasFixedDims && ClusterDims.getDims()[0] == 1)
4927 return LoadConstant(0);
4928 Arg = &ClusterWorkGroupIDX;
4929 ArgRC = &AMDGPU::SReg_32RegClass;
4930 ArgTy = LLT::scalar(32);
4931 break;
4933 if (HasFixedDims && ClusterDims.getDims()[1] == 1)
4934 return LoadConstant(0);
4935 Arg = &ClusterWorkGroupIDY;
4936 ArgRC = &AMDGPU::SReg_32RegClass;
4937 ArgTy = LLT::scalar(32);
4938 break;
4940 if (HasFixedDims && ClusterDims.getDims()[2] == 1)
4941 return LoadConstant(0);
4942 Arg = &ClusterWorkGroupIDZ;
4943 ArgRC = &AMDGPU::SReg_32RegClass;
4944 ArgTy = LLT::scalar(32);
4945 break;
4947 if (HasFixedDims)
4948 return LoadConstant(ClusterDims.getDims()[0] - 1);
4949 Arg = &ClusterWorkGroupMaxIDX;
4950 ArgRC = &AMDGPU::SReg_32RegClass;
4951 ArgTy = LLT::scalar(32);
4952 break;
4954 if (HasFixedDims)
4955 return LoadConstant(ClusterDims.getDims()[1] - 1);
4956 Arg = &ClusterWorkGroupMaxIDY;
4957 ArgRC = &AMDGPU::SReg_32RegClass;
4958 ArgTy = LLT::scalar(32);
4959 break;
4961 if (HasFixedDims)
4962 return LoadConstant(ClusterDims.getDims()[2] - 1);
4963 Arg = &ClusterWorkGroupMaxIDZ;
4964 ArgRC = &AMDGPU::SReg_32RegClass;
4965 ArgTy = LLT::scalar(32);
4966 break;
4968 Arg = &ClusterWorkGroupMaxFlatID;
4969 ArgRC = &AMDGPU::SReg_32RegClass;
4970 ArgTy = LLT::scalar(32);
4971 break;
4972 default:
4973 break;
4974 }
4975 }
4976
4977 if (!Arg)
4978 std::tie(Arg, ArgRC, ArgTy) = MFI->getPreloadedValue(ArgType);
4979
4980 if (!Arg) {
4982 // The intrinsic may appear when we have a zero-sized kernarg segment, in
4983 // which case the pointer argument may be missing and we use null.
4984 return LoadConstant(0);
4985 }
4986
4987 // It's undefined behavior if a function marked with the amdgpu-no-*
4988 // attributes uses the corresponding intrinsic.
4989 B.buildUndef(DstReg);
4990 return true;
4991 }
4992
4993 if (!Arg->isRegister() || !Arg->getRegister().isValid())
4994 return false; // TODO: Handle these
4995 buildLoadInputValue(DstReg, B, Arg, ArgRC, ArgTy);
4996 return true;
4997}
4998
5002 if (!loadInputValue(MI.getOperand(0).getReg(), B, ArgType))
5003 return false;
5004
5005 MI.eraseFromParent();
5006 return true;
5007}
5008
5010 int64_t C) {
5011 B.buildConstant(MI.getOperand(0).getReg(), C);
5012 MI.eraseFromParent();
5013 return true;
5014}
5015
5018 unsigned Dim, AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
5019 unsigned MaxID = ST.getMaxWorkitemID(B.getMF().getFunction(), Dim);
5020 if (MaxID == 0)
5021 return replaceWithConstant(B, MI, 0);
5022
5023 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
5024 const ArgDescriptor *Arg;
5025 const TargetRegisterClass *ArgRC;
5026 LLT ArgTy;
5027 std::tie(Arg, ArgRC, ArgTy) = MFI->getPreloadedValue(ArgType);
5028
5029 Register DstReg = MI.getOperand(0).getReg();
5030 if (!Arg) {
5031 // It's undefined behavior if a function marked with the amdgpu-no-*
5032 // attributes uses the corresponding intrinsic.
5033 B.buildUndef(DstReg);
5034 MI.eraseFromParent();
5035 return true;
5036 }
5037
5038 if (Arg->isMasked()) {
5039 // Don't bother inserting AssertZext for packed IDs since we're emitting the
5040 // masking operations anyway.
5041 //
5042 // TODO: We could assert the top bit is 0 for the source copy.
5043 if (!loadInputValue(DstReg, B, ArgType))
5044 return false;
5045 } else {
5047 if (!loadInputValue(TmpReg, B, ArgType))
5048 return false;
5049 B.buildAssertZExt(DstReg, TmpReg, llvm::bit_width(MaxID));
5050 }
5051
5052 MI.eraseFromParent();
5053 return true;
5054}
5055
5058 // This isn't really a constant pool but close enough.
5061 return PtrInfo;
5062}
5063
5065 int64_t Offset) const {
5067 Register KernArgReg = B.getMRI()->createGenericVirtualRegister(PtrTy);
5068
5069 // TODO: If we passed in the base kernel offset we could have a better
5070 // alignment than 4, but we don't really need it.
5071 if (!loadInputValue(KernArgReg, B,
5073 llvm_unreachable("failed to find kernarg segment ptr");
5074
5075 auto COffset = B.buildConstant(LLT::scalar(64), Offset);
5076 return B.buildObjectPtrOffset(PtrTy, KernArgReg, COffset).getReg(0);
5077}
5078
5079/// Legalize a value that's loaded from kernel arguments. This is only used by
5080/// legacy intrinsics.
5084 Align Alignment) const {
5085 Register DstReg = MI.getOperand(0).getReg();
5086
5087 assert(B.getMRI()->getType(DstReg) == LLT::scalar(32) &&
5088 "unexpected kernarg parameter type");
5089
5092 B.buildLoad(DstReg, Ptr, PtrInfo.getWithOffset(Offset), Align(4),
5095 MI.eraseFromParent();
5096 return true;
5097}
5098
5101 MachineIRBuilder &B) const {
5102 Register Dst = MI.getOperand(0).getReg();
5103 LLT DstTy = MRI.getType(Dst);
5104 LLT S16 = LLT::scalar(16);
5105 LLT S32 = LLT::scalar(32);
5106 LLT S64 = LLT::scalar(64);
5107
5108 if (DstTy == S16)
5109 return legalizeFDIV16(MI, MRI, B);
5110 if (DstTy == S32)
5111 return legalizeFDIV32(MI, MRI, B);
5112 if (DstTy == S64)
5113 return legalizeFDIV64(MI, MRI, B);
5114
5115 return false;
5116}
5117
5119 Register DstDivReg,
5120 Register DstRemReg,
5121 Register X,
5122 Register Y) const {
5123 const LLT S1 = LLT::scalar(1);
5124 const LLT S32 = LLT::scalar(32);
5125
5126 // See AMDGPUCodeGenPrepare::expandDivRem32 for a description of the
5127 // algorithm used here.
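// In outline: approximate 2^32 / y with the hardware reciprocal, refine the
// estimate with one Newton-Raphson style step in integer arithmetic, form
// quotient/remainder estimates with a high multiply, then apply at most two
// conditional corrections.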
5128
5129 // Initial estimate of inv(y).
5130 auto FloatY = B.buildUITOFP(S32, Y);
5131 auto RcpIFlag = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {FloatY});
5132 auto Scale = B.buildFConstant(S32, llvm::bit_cast<float>(0x4f7ffffe));
5133 auto ScaledY = B.buildFMul(S32, RcpIFlag, Scale);
5134 auto Z = B.buildFPTOUI(S32, ScaledY);
5135
5136 // One round of UNR (unsigned Newton-Raphson refinement).
5137 auto NegY = B.buildSub(S32, B.buildConstant(S32, 0), Y);
5138 auto NegYZ = B.buildMul(S32, NegY, Z);
5139 Z = B.buildAdd(S32, Z, B.buildUMulH(S32, Z, NegYZ));
5140
5141 // Quotient/remainder estimate.
5142 auto Q = B.buildUMulH(S32, X, Z);
5143 auto R = B.buildSub(S32, X, B.buildMul(S32, Q, Y));
5144
5145 // First quotient/remainder refinement.
5146 auto One = B.buildConstant(S32, 1);
5147 auto Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y);
5148 if (DstDivReg)
5149 Q = B.buildSelect(S32, Cond, B.buildAdd(S32, Q, One), Q);
5150 R = B.buildSelect(S32, Cond, B.buildSub(S32, R, Y), R);
5151
5152 // Second quotient/remainder refinement.
5153 Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y);
5154 if (DstDivReg)
5155 B.buildSelect(DstDivReg, Cond, B.buildAdd(S32, Q, One), Q);
5156
5157 if (DstRemReg)
5158 B.buildSelect(DstRemReg, Cond, B.buildSub(S32, R, Y), R);
5159}
5160
5161// Build integer reciprocal sequence around V_RCP_IFLAG_F32
5162//
5163// Return lo, hi of result
5164//
5165// %cvt.lo = G_UITOFP Val.lo
5166// %cvt.hi = G_UITOFP Val.hi
5167// %mad = G_FMAD %cvt.hi, 2**32, %cvt.lo
5168// %rcp = G_AMDGPU_RCP_IFLAG %mad
5169// %mul1 = G_FMUL %rcp, 0x5f7ffffc
5170// %mul2 = G_FMUL %mul1, 2**(-32)
5171// %trunc = G_INTRINSIC_TRUNC %mul2
5172// %mad2 = G_FMAD %trunc, -(2**32), %mul1
5173// return {G_FPTOUI %mad2, G_FPTOUI %trunc}
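// The {lo, hi} pair approximates 2^64 / Val (0x5f7ffffc is just below 2^64 as
// an f32); it seeds the Newton-Raphson refinement in the 64-bit
// division/remainder expansion below.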
5174static std::pair<Register, Register> emitReciprocalU64(MachineIRBuilder &B,
5175 Register Val) {
5176 const LLT S32 = LLT::scalar(32);
5177 auto Unmerge = B.buildUnmerge(S32, Val);
5178
5179 auto CvtLo = B.buildUITOFP(S32, Unmerge.getReg(0));
5180 auto CvtHi = B.buildUITOFP(S32, Unmerge.getReg(1));
5181
5182 auto Mad = B.buildFMAD(
5183 S32, CvtHi, // 2**32
5184 B.buildFConstant(S32, llvm::bit_cast<float>(0x4f800000)), CvtLo);
5185
5186 auto Rcp = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {Mad});
5187 auto Mul1 = B.buildFMul(
5188 S32, Rcp, B.buildFConstant(S32, llvm::bit_cast<float>(0x5f7ffffc)));
5189
5190 // 2**(-32)
5191 auto Mul2 = B.buildFMul(
5192 S32, Mul1, B.buildFConstant(S32, llvm::bit_cast<float>(0x2f800000)));
5193 auto Trunc = B.buildIntrinsicTrunc(S32, Mul2);
5194
5195 // -(2**32)
5196 auto Mad2 = B.buildFMAD(
5197 S32, Trunc, B.buildFConstant(S32, llvm::bit_cast<float>(0xcf800000)),
5198 Mul1);
5199
5200 auto ResultLo = B.buildFPTOUI(S32, Mad2);
5201 auto ResultHi = B.buildFPTOUI(S32, Trunc);
5202
5203 return {ResultLo.getReg(0), ResultHi.getReg(0)};
5204}
5205
5207 Register DstDivReg,
5208 Register DstRemReg,
5209 Register Numer,
5210 Register Denom) const {
5211 const LLT S32 = LLT::scalar(32);
5212 const LLT S64 = LLT::scalar(64);
5213 const LLT S1 = LLT::scalar(1);
5214 Register RcpLo, RcpHi;
5215
5216 std::tie(RcpLo, RcpHi) = emitReciprocalU64(B, Denom);
5217
5218 auto Rcp = B.buildMergeLikeInstr(S64, {RcpLo, RcpHi});
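// Rcp ~= 2^64 / Denom. The two rounds below are Newton-Raphson refinements,
// Rcp += umulh(Rcp, -Denom * Rcp), carried out on split 32-bit halves; the
// refined reciprocal then yields a quotient estimate via umulh(Numer, Rcp)
// that is fixed up by at most two conditional corrections further down.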
5219
5220 auto Zero64 = B.buildConstant(S64, 0);
5221 auto NegDenom = B.buildSub(S64, Zero64, Denom);
5222
5223 auto MulLo1 = B.buildMul(S64, NegDenom, Rcp);
5224 auto MulHi1 = B.buildUMulH(S64, Rcp, MulLo1);
5225
5226 auto UnmergeMulHi1 = B.buildUnmerge(S32, MulHi1);
5227 Register MulHi1_Lo = UnmergeMulHi1.getReg(0);
5228 Register MulHi1_Hi = UnmergeMulHi1.getReg(1);
5229
5230 auto Add1_Lo = B.buildUAddo(S32, S1, RcpLo, MulHi1_Lo);
5231 auto Add1_Hi = B.buildUAdde(S32, S1, RcpHi, MulHi1_Hi, Add1_Lo.getReg(1));
5232 auto Add1 = B.buildMergeLikeInstr(S64, {Add1_Lo, Add1_Hi});
5233
5234 auto MulLo2 = B.buildMul(S64, NegDenom, Add1);
5235 auto MulHi2 = B.buildUMulH(S64, Add1, MulLo2);
5236 auto UnmergeMulHi2 = B.buildUnmerge(S32, MulHi2);
5237 Register MulHi2_Lo = UnmergeMulHi2.getReg(0);
5238 Register MulHi2_Hi = UnmergeMulHi2.getReg(1);
5239
5240 auto Zero32 = B.buildConstant(S32, 0);
5241 auto Add2_Lo = B.buildUAddo(S32, S1, Add1_Lo, MulHi2_Lo);
5242 auto Add2_Hi = B.buildUAdde(S32, S1, Add1_Hi, MulHi2_Hi, Add2_Lo.getReg(1));
5243 auto Add2 = B.buildMergeLikeInstr(S64, {Add2_Lo, Add2_Hi});
5244
5245 auto UnmergeNumer = B.buildUnmerge(S32, Numer);
5246 Register NumerLo = UnmergeNumer.getReg(0);
5247 Register NumerHi = UnmergeNumer.getReg(1);
5248
5249 auto MulHi3 = B.buildUMulH(S64, Numer, Add2);
5250 auto Mul3 = B.buildMul(S64, Denom, MulHi3);
5251 auto UnmergeMul3 = B.buildUnmerge(S32, Mul3);
5252 Register Mul3_Lo = UnmergeMul3.getReg(0);
5253 Register Mul3_Hi = UnmergeMul3.getReg(1);
5254 auto Sub1_Lo = B.buildUSubo(S32, S1, NumerLo, Mul3_Lo);
5255 auto Sub1_Hi = B.buildUSube(S32, S1, NumerHi, Mul3_Hi, Sub1_Lo.getReg(1));
5256 auto Sub1_Mi = B.buildSub(S32, NumerHi, Mul3_Hi);
5257 auto Sub1 = B.buildMergeLikeInstr(S64, {Sub1_Lo, Sub1_Hi});
5258
5259 auto UnmergeDenom = B.buildUnmerge(S32, Denom);
5260 Register DenomLo = UnmergeDenom.getReg(0);
5261 Register DenomHi = UnmergeDenom.getReg(1);
5262
5263 auto CmpHi = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Hi, DenomHi);
5264 auto C1 = B.buildSExt(S32, CmpHi);
5265
5266 auto CmpLo = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Lo, DenomLo);
5267 auto C2 = B.buildSExt(S32, CmpLo);
5268
5269 auto CmpEq = B.buildICmp(CmpInst::ICMP_EQ, S1, Sub1_Hi, DenomHi);
5270 auto C3 = B.buildSelect(S32, CmpEq, C2, C1);
5271
5272 // TODO: Here and below, portions of the code could be enclosed in if/endif
5273 // blocks. Currently the control flow is unconditional and we have 4 selects
5274 // after the potential endif to substitute for PHIs.
5275
5276 // if C3 != 0 ...
5277 auto Sub2_Lo = B.buildUSubo(S32, S1, Sub1_Lo, DenomLo);
5278 auto Sub2_Mi = B.buildUSube(S32, S1, Sub1_Mi, DenomHi, Sub1_Lo.getReg(1));
5279 auto Sub2_Hi = B.buildUSube(S32, S1, Sub2_Mi, Zero32, Sub2_Lo.getReg(1));
5280 auto Sub2 = B.buildMergeLikeInstr(S64, {Sub2_Lo, Sub2_Hi});
5281
5282 auto One64 = B.buildConstant(S64, 1);
5283 auto Add3 = B.buildAdd(S64, MulHi3, One64);
5284
5285 auto C4 =
5286 B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Hi, DenomHi));
5287 auto C5 =
5288 B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Lo, DenomLo));
5289 auto C6 = B.buildSelect(
5290 S32, B.buildICmp(CmpInst::ICMP_EQ, S1, Sub2_Hi, DenomHi), C5, C4);
5291
5292 // if (C6 != 0)
5293 auto Add4 = B.buildAdd(S64, Add3, One64);
5294 auto Sub3_Lo = B.buildUSubo(S32, S1, Sub2_Lo, DenomLo);
5295
5296 auto Sub3_Mi = B.buildUSube(S32, S1, Sub2_Mi, DenomHi, Sub2_Lo.getReg(1));
5297 auto Sub3_Hi = B.buildUSube(S32, S1, Sub3_Mi, Zero32, Sub3_Lo.getReg(1));
5298 auto Sub3 = B.buildMergeLikeInstr(S64, {Sub3_Lo, Sub3_Hi});
5299
5300 // endif C6
5301 // endif C3
5302
5303 if (DstDivReg) {
5304 auto Sel1 = B.buildSelect(
5305 S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Add4, Add3);
5306 B.buildSelect(DstDivReg, B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32),
5307 Sel1, MulHi3);
5308 }
5309
5310 if (DstRemReg) {
5311 auto Sel2 = B.buildSelect(
5312 S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Sub3, Sub2);
5313 B.buildSelect(DstRemReg, B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32),
5314 Sel2, Sub1);
5315 }
5316}
5317
5318bool AMDGPULegalizerInfo::legalizeUnsignedDIV_REM(MachineInstr &MI,
5319 MachineRegisterInfo &MRI,
5320 MachineIRBuilder &B) const {
5321 Register DstDivReg, DstRemReg;
5322 switch (MI.getOpcode()) {
5323 default:
5324 llvm_unreachable("Unexpected opcode!");
5325 case AMDGPU::G_UDIV: {
5326 DstDivReg = MI.getOperand(0).getReg();
5327 break;
5328 }
5329 case AMDGPU::G_UREM: {
5330 DstRemReg = MI.getOperand(0).getReg();
5331 break;
5332 }
5333 case AMDGPU::G_UDIVREM: {
5334 DstDivReg = MI.getOperand(0).getReg();
5335 DstRemReg = MI.getOperand(1).getReg();
5336 break;
5337 }
5338 }
5339
5340 const LLT S64 = LLT::scalar(64);
5341 const LLT S32 = LLT::scalar(32);
5342 const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs();
5343 Register Num = MI.getOperand(FirstSrcOpIdx).getReg();
5344 Register Den = MI.getOperand(FirstSrcOpIdx + 1).getReg();
5345 LLT Ty = MRI.getType(MI.getOperand(0).getReg());
5346
5347 if (Ty == S32)
5348 legalizeUnsignedDIV_REM32Impl(B, DstDivReg, DstRemReg, Num, Den);
5349 else if (Ty == S64)
5350 legalizeUnsignedDIV_REM64Impl(B, DstDivReg, DstRemReg, Num, Den);
5351 else
5352 return false;
5353
5354 MI.eraseFromParent();
5355 return true;
5356}
5357
5358bool AMDGPULegalizerInfo::legalizeSignedDIV_REM(MachineInstr &MI,
5359 MachineRegisterInfo &MRI,
5360 MachineIRBuilder &B) const {
5361 const LLT S64 = LLT::scalar(64);
5362 const LLT S32 = LLT::scalar(32);
5363
5364 LLT Ty = MRI.getType(MI.getOperand(0).getReg());
5365 if (Ty != S32 && Ty != S64)
5366 return false;
5367
5368 const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs();
5369 Register LHS = MI.getOperand(FirstSrcOpIdx).getReg();
5370 Register RHS = MI.getOperand(FirstSrcOpIdx + 1).getReg();
5371
5372 auto SignBitOffset = B.buildConstant(S32, Ty.getSizeInBits() - 1);
5373 auto LHSign = B.buildAShr(Ty, LHS, SignBitOffset);
5374 auto RHSign = B.buildAShr(Ty, RHS, SignBitOffset);
5375
5376 LHS = B.buildAdd(Ty, LHS, LHSign).getReg(0);
5377 RHS = B.buildAdd(Ty, RHS, RHSign).getReg(0);
5378
5379 LHS = B.buildXor(Ty, LHS, LHSign).getReg(0);
5380 RHS = B.buildXor(Ty, RHS, RHSign).getReg(0);
5381
5382 Register DstDivReg, DstRemReg, TmpDivReg, TmpRemReg;
5383 switch (MI.getOpcode()) {
5384 default:
5385 llvm_unreachable("Unexpected opcode!");
5386 case AMDGPU::G_SDIV: {
5387 DstDivReg = MI.getOperand(0).getReg();
5388 TmpDivReg = MRI.createGenericVirtualRegister(Ty);
5389 break;
5390 }
5391 case AMDGPU::G_SREM: {
5392 DstRemReg = MI.getOperand(0).getReg();
5393 TmpRemReg = MRI.createGenericVirtualRegister(Ty);
5394 break;
5395 }
5396 case AMDGPU::G_SDIVREM: {
5397 DstDivReg = MI.getOperand(0).getReg();
5398 DstRemReg = MI.getOperand(1).getReg();
5399 TmpDivReg = MRI.createGenericVirtualRegister(Ty);
5400 TmpRemReg = MRI.createGenericVirtualRegister(Ty);
5401 break;
5402 }
5403 }
5404
5405 if (Ty == S32)
5406 legalizeUnsignedDIV_REM32Impl(B, TmpDivReg, TmpRemReg, LHS, RHS);
5407 else
5408 legalizeUnsignedDIV_REM64Impl(B, TmpDivReg, TmpRemReg, LHS, RHS);
5409
5410 if (DstDivReg) {
5411 auto Sign = B.buildXor(Ty, LHSign, RHSign).getReg(0);
5412 auto SignXor = B.buildXor(Ty, TmpDivReg, Sign).getReg(0);
5413 B.buildSub(DstDivReg, SignXor, Sign);
5414 }
5415
5416 if (DstRemReg) {
5417 auto Sign = LHSign.getReg(0); // Remainder sign is the same as LHS
5418 auto SignXor = B.buildXor(Ty, TmpRemReg, Sign).getReg(0);
5419 B.buildSub(DstRemReg, SignXor, Sign);
5420 }
5421
5422 MI.eraseFromParent();
5423 return true;
5424}
5425
5426bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI,
5427 MachineRegisterInfo &MRI,
5428 MachineIRBuilder &B) const {
5429 Register Res = MI.getOperand(0).getReg();
5430 Register LHS = MI.getOperand(1).getReg();
5431 Register RHS = MI.getOperand(2).getReg();
5432 uint16_t Flags = MI.getFlags();
5433 LLT ResTy = MRI.getType(Res);
5434
5435 bool AllowInaccurateRcp = MI.getFlag(MachineInstr::FmAfn);
5436
5437 if (const auto *CLHS = getConstantFPVRegVal(LHS, MRI)) {
5438 if (!AllowInaccurateRcp && ResTy != LLT::scalar(16))
5439 return false;
5440
5441 // v_rcp_f32 and v_rsq_f32 do not support denormals, and according to
5442 // the CI documentation have a worst case error of 1 ulp.
5443 // OpenCL requires <= 2.5 ulp for 1.0 / x, so it should always be OK to
5444 // use it as long as we aren't trying to use denormals.
5445 //
5446 // v_rcp_f16 and v_rsq_f16 DO support denormals and have a worst case error of 0.51 ulp.
5447
5448 // 1 / x -> RCP(x)
5449 if (CLHS->isExactlyValue(1.0)) {
5450 B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res)
5451 .addUse(RHS)
5452 .setMIFlags(Flags);
5453
5454 MI.eraseFromParent();
5455 return true;
5456 }
5457
5458 // -1 / x -> RCP( FNEG(x) )
5459 if (CLHS->isExactlyValue(-1.0)) {
5460 auto FNeg = B.buildFNeg(ResTy, RHS, Flags);
5461 B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res)
5462 .addUse(FNeg.getReg(0))
5463 .setMIFlags(Flags);
5464
5465 MI.eraseFromParent();
5466 return true;
5467 }
5468 }
5469
5470 // For f16 require afn or arcp.
5471 // For f32 require afn.
5472 if (!AllowInaccurateRcp && (ResTy != LLT::scalar(16) ||
5473 !MI.getFlag(MachineInstr::FmArcp)))
5474 return false;
5475
5476 // x / y -> x * (1.0 / y)
5477 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy})
5478 .addUse(RHS)
5479 .setMIFlags(Flags);
5480 B.buildFMul(Res, LHS, RCP, Flags);
5481
5482 MI.eraseFromParent();
5483 return true;
5484}
5485
5486bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV64(MachineInstr &MI,
5487 MachineRegisterInfo &MRI,
5488 MachineIRBuilder &B) const {
5489 Register Res = MI.getOperand(0).getReg();
5490 Register X = MI.getOperand(1).getReg();
5491 Register Y = MI.getOperand(2).getReg();
5492 uint16_t Flags = MI.getFlags();
5493 LLT ResTy = MRI.getType(Res);
5494
5495 bool AllowInaccurateRcp = MI.getFlag(MachineInstr::FmAfn);
5496
5497 if (!AllowInaccurateRcp)
5498 return false;
5499
5500 const ConstantFP *CLHS = getConstantFPVRegVal(X, MRI);
5501 bool IsNegRcp = CLHS && CLHS->isExactlyValue(-1.0);
5502
5503 // Pull out the negation so it folds for free into the source modifiers.
5504 if (IsNegRcp)
5505 X = B.buildFConstant(ResTy, 1.0).getReg(0);
5506
5507 Register NegY = IsNegRcp ? Y : B.buildFNeg(ResTy, Y).getReg(0);
5508 auto One = B.buildFConstant(ResTy, 1.0);
5509
5510 auto R = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy})
5511 .addUse(Y)
5512 .setMIFlags(Flags);
5513 if (IsNegRcp)
5514 R = B.buildFNeg(ResTy, R);
5515
5516 auto Tmp0 = B.buildFMA(ResTy, NegY, R, One);
5517 R = B.buildFMA(ResTy, Tmp0, R, R);
5518
5519 auto Tmp1 = B.buildFMA(ResTy, NegY, R, One);
5520 R = B.buildFMA(ResTy, Tmp1, R, R);
5521
5522 // Skip the last 2 correction terms for reciprocal.
5523 if (IsNegRcp || (CLHS && CLHS->isExactlyValue(1.0))) {
5524 B.buildCopy(Res, R);
5525 MI.eraseFromParent();
5526 return true;
5527 }
5528
5529 auto Ret = B.buildFMul(ResTy, X, R);
5530 auto Tmp2 = B.buildFMA(ResTy, NegY, Ret, X);
5531
5532 B.buildFMA(Res, Tmp2, R, Ret);
5533 MI.eraseFromParent();
5534 return true;
5535}
5536
5537bool AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI,
5538 MachineRegisterInfo &MRI,
5539 MachineIRBuilder &B) const {
5540 if (legalizeFastUnsafeFDIV(MI, MRI, B))
5541 return true;
5542
5543 Register Res = MI.getOperand(0).getReg();
5544 Register LHS = MI.getOperand(1).getReg();
5545 Register RHS = MI.getOperand(2).getReg();
5546
5547 uint16_t Flags = MI.getFlags();
5548
5549 LLT S16 = LLT::scalar(16);
5550 LLT S32 = LLT::scalar(32);
5551
5552 // a32.u = opx(V_CVT_F32_F16, a.u); // CVT to F32
5553 // b32.u = opx(V_CVT_F32_F16, b.u); // CVT to F32
5554 // r32.u = opx(V_RCP_F32, b32.u); // rcp = 1 / d
5555 // q32.u = opx(V_MUL_F32, a32.u, r32.u); // q = n * rcp
5556 // e32.u = opx(V_MAD_F32, (b32.u^_neg32), q32.u, a32.u); // err = -d * q + n
5557 // q32.u = opx(V_MAD_F32, e32.u, r32.u, q32.u); // q = n * rcp
5558 // e32.u = opx(V_MAD_F32, (b32.u^_neg32), q32.u, a32.u); // err = -d * q + n
5559 // tmp.u = opx(V_MUL_F32, e32.u, r32.u);
5560 // tmp.u = opx(V_AND_B32, tmp.u, 0xff800000)
5561 // q32.u = opx(V_ADD_F32, tmp.u, q32.u);
5562 // q16.u = opx(V_CVT_F16_F32, q32.u);
5563 // q16.u = opx(V_DIV_FIXUP_F16, q16.u, b.u, a.u); // q = touchup(q, d, n)
5564
5565 auto LHSExt = B.buildFPExt(S32, LHS, Flags);
5566 auto RHSExt = B.buildFPExt(S32, RHS, Flags);
5567 auto NegRHSExt = B.buildFNeg(S32, RHSExt);
5568 auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32})
5569 .addUse(RHSExt.getReg(0))
5570 .setMIFlags(Flags);
5571 auto Quot = B.buildFMul(S32, LHSExt, Rcp, Flags);
5572 MachineInstrBuilder Err;
5573 if (ST.hasMadMacF32Insts()) {
5574 Err = B.buildFMAD(S32, NegRHSExt, Quot, LHSExt, Flags);
5575 Quot = B.buildFMAD(S32, Err, Rcp, Quot, Flags);
5576 Err = B.buildFMAD(S32, NegRHSExt, Quot, LHSExt, Flags);
5577 } else {
5578 Err = B.buildFMA(S32, NegRHSExt, Quot, LHSExt, Flags);
5579 Quot = B.buildFMA(S32, Err, Rcp, Quot, Flags);
5580 Err = B.buildFMA(S32, NegRHSExt, Quot, LHSExt, Flags);
5581 }
5582 auto Tmp = B.buildFMul(S32, Err, Rcp, Flags);
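// The 0xff800000 mask below keeps only the sign and exponent bits of the
// err * rcp term, i.e. it rounds the magnitude of the correction down to a
// power of two before adding it back into the quotient.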
5583 Tmp = B.buildAnd(S32, Tmp, B.buildConstant(S32, 0xff800000));
5584 Quot = B.buildFAdd(S32, Tmp, Quot, Flags);
5585 auto RDst = B.buildFPTrunc(S16, Quot, Flags);
5586 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res)
5587 .addUse(RDst.getReg(0))
5588 .addUse(RHS)
5589 .addUse(LHS)
5590 .setMIFlags(Flags);
5591
5592 MI.eraseFromParent();
5593 return true;
5594}
5595
5596static constexpr unsigned SPDenormModeBitField =
5597 AMDGPU::Hwreg::HwregEncoding::encode(AMDGPU::Hwreg::ID_MODE, 4, 2);
5598
5599// Enable or disable FP32 denorm mode. When 'Enable' is true, emit instructions
5600// to enable denorm mode. When 'Enable' is false, disable denorm mode.
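// On subtargets with S_DENORM_MODE this writes the full denorm field (the new
// FP32 bits plus the preserved FP64/FP16 bits) as one immediate; otherwise it
// falls back to S_SETREG_IMM32_B32 on just the FP32 denorm bits of MODE.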
5601static void toggleSPDenormMode(bool Enable, MachineIRBuilder &B,
5602 const GCNSubtarget &ST,
5603 SIModeRegisterDefaults Mode) {
5604 // Set SP denorm mode to this value.
5605 unsigned SPDenormMode =
5606 Enable ? FP_DENORM_FLUSH_NONE : Mode.fpDenormModeSPValue();
5607
5608 if (ST.hasDenormModeInst()) {
5609 // Preserve default FP64FP16 denorm mode while updating FP32 mode.
5610 uint32_t DPDenormModeDefault = Mode.fpDenormModeDPValue();
5611
5612 uint32_t NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2);
5613 B.buildInstr(AMDGPU::S_DENORM_MODE)
5614 .addImm(NewDenormModeValue);
5615
5616 } else {
5617 B.buildInstr(AMDGPU::S_SETREG_IMM32_B32)
5618 .addImm(SPDenormMode)
5619 .addImm(SPDenormModeBitField);
5620 }
5621}
5622
5623bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI,
5624 MachineRegisterInfo &MRI,
5625 MachineIRBuilder &B) const {
5626 if (legalizeFastUnsafeFDIV(MI, MRI, B))
5627 return true;
5628
5629 Register Res = MI.getOperand(0).getReg();
5630 Register LHS = MI.getOperand(1).getReg();
5631 Register RHS = MI.getOperand(2).getReg();
5632 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
5633 SIModeRegisterDefaults Mode = MFI->getMode();
5634
5635 uint16_t Flags = MI.getFlags();
5636
5637 LLT S32 = LLT::scalar(32);
5638 LLT S1 = LLT::scalar(1);
5639
5640 auto One = B.buildFConstant(S32, 1.0f);
5641
5642 auto DenominatorScaled =
5643 B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1})
5644 .addUse(LHS)
5645 .addUse(RHS)
5646 .addImm(0)
5647 .setMIFlags(Flags);
5648 auto NumeratorScaled =
5649 B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1})
5650 .addUse(LHS)
5651 .addUse(RHS)
5652 .addImm(1)
5653 .setMIFlags(Flags);
5654
5655 auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32})
5656 .addUse(DenominatorScaled.getReg(0))
5657 .setMIFlags(Flags);
5658 auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags);
5659
5660 const bool PreservesDenormals = Mode.FP32Denormals == DenormalMode::getIEEE();
5661 const bool HasDynamicDenormals =
5662 (Mode.FP32Denormals.Input == DenormalMode::Dynamic) ||
5663 (Mode.FP32Denormals.Output == DenormalMode::Dynamic);
5664
5665 Register SavedSPDenormMode;
5666 if (!PreservesDenormals) {
5667 if (HasDynamicDenormals) {
5668 SavedSPDenormMode = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5669 B.buildInstr(AMDGPU::S_GETREG_B32)
5670 .addDef(SavedSPDenormMode)
5671 .addImm(SPDenormModeBitField);
5672 }
5673 toggleSPDenormMode(true, B, ST, Mode);
5674 }
5675
5676 auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags);
5677 auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags);
5678 auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags);
5679 auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags);
5680 auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags);
5681 auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags);
5682
5683 if (!PreservesDenormals) {
5684 if (HasDynamicDenormals) {
5685 assert(SavedSPDenormMode);
5686 B.buildInstr(AMDGPU::S_SETREG_B32)
5687 .addReg(SavedSPDenormMode)
5688 .addImm(SPDenormModeBitField);
5689 } else
5690 toggleSPDenormMode(false, B, ST, Mode);
5691 }
5692
5693 auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32})
5694 .addUse(Fma4.getReg(0))
5695 .addUse(Fma1.getReg(0))
5696 .addUse(Fma3.getReg(0))
5697 .addUse(NumeratorScaled.getReg(1))
5698 .setMIFlags(Flags);
5699
5700 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res)
5701 .addUse(Fmas.getReg(0))
5702 .addUse(RHS)
5703 .addUse(LHS)
5704 .setMIFlags(Flags);
5705
5706 MI.eraseFromParent();
5707 return true;
5708}
5709
5710bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI,
5711 MachineRegisterInfo &MRI,
5712 MachineIRBuilder &B) const {
5713 if (legalizeFastUnsafeFDIV64(MI, MRI, B))
5714 return true;
5715
5716 Register Res = MI.getOperand(0).getReg();
5717 Register LHS = MI.getOperand(1).getReg();
5718 Register RHS = MI.getOperand(2).getReg();
5719
5720 uint16_t Flags = MI.getFlags();
5721
5722 LLT S64 = LLT::scalar(64);
5723 LLT S1 = LLT::scalar(1);
5724
5725 auto One = B.buildFConstant(S64, 1.0);
5726
5727 auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1})
5728 .addUse(LHS)
5729 .addUse(RHS)
5730 .addImm(0)
5731 .setMIFlags(Flags);
5732
5733 auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags);
5734
5735 auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64})
5736 .addUse(DivScale0.getReg(0))
5737 .setMIFlags(Flags);
5738
5739 auto Fma0 = B.buildFMA(S64, NegDivScale0, Rcp, One, Flags);
5740 auto Fma1 = B.buildFMA(S64, Rcp, Fma0, Rcp, Flags);
5741 auto Fma2 = B.buildFMA(S64, NegDivScale0, Fma1, One, Flags);
5742
5743 auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1})
5744 .addUse(LHS)
5745 .addUse(RHS)
5746 .addImm(1)
5747 .setMIFlags(Flags);
5748
5749 auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags);
5750 auto Mul = B.buildFMul(S64, DivScale1.getReg(0), Fma3, Flags);
5751 auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags);
5752
5753 Register Scale;
5754 if (!ST.hasUsableDivScaleConditionOutput()) {
5755 // Work around a hardware bug on SI where the condition output from div_scale
5756 // is not usable.
5757
5758 LLT S32 = LLT::scalar(32);
5759
5760 auto NumUnmerge = B.buildUnmerge(S32, LHS);
5761 auto DenUnmerge = B.buildUnmerge(S32, RHS);
5762 auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0);
5763 auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1);
5764
5765 auto CmpNum = B.buildICmp(ICmpInst::ICMP_EQ, S1, NumUnmerge.getReg(1),
5766 Scale1Unmerge.getReg(1));
5767 auto CmpDen = B.buildICmp(ICmpInst::ICMP_EQ, S1, DenUnmerge.getReg(1),
5768 Scale0Unmerge.getReg(1));
5769 Scale = B.buildXor(S1, CmpNum, CmpDen).getReg(0);
5770 } else {
5771 Scale = DivScale1.getReg(1);
5772 }
5773
5774 auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64})
5775 .addUse(Fma4.getReg(0))
5776 .addUse(Fma3.getReg(0))
5777 .addUse(Mul.getReg(0))
5778 .addUse(Scale)
5779 .setMIFlags(Flags);
5780
5781 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, ArrayRef(Res))
5782 .addUse(Fmas.getReg(0))
5783 .addUse(RHS)
5784 .addUse(LHS)
5785 .setMIFlags(Flags);
5786
5787 MI.eraseFromParent();
5788 return true;
5789}
5790
5791bool AMDGPULegalizerInfo::legalizeFFREXP(MachineInstr &MI,
5792 MachineRegisterInfo &MRI,
5793 MachineIRBuilder &B) const {
5794 Register Res0 = MI.getOperand(0).getReg();
5795 Register Res1 = MI.getOperand(1).getReg();
5796 Register Val = MI.getOperand(2).getReg();
5797 uint16_t Flags = MI.getFlags();
5798
5799 LLT Ty = MRI.getType(Res0);
5800 LLT InstrExpTy = Ty == LLT::scalar(16) ? LLT::scalar(16) : LLT::scalar(32);
5801
5802 auto Mant = B.buildIntrinsic(Intrinsic::amdgcn_frexp_mant, {Ty})
5803 .addUse(Val)
5804 .setMIFlags(Flags);
5805 auto Exp = B.buildIntrinsic(Intrinsic::amdgcn_frexp_exp, {InstrExpTy})
5806 .addUse(Val)
5807 .setMIFlags(Flags);
5808
5809 if (ST.hasFractBug()) {
5810 auto Fabs = B.buildFAbs(Ty, Val);
5811 auto Inf = B.buildFConstant(Ty, APFloat::getInf(getFltSemanticForLLT(Ty)));
5812 auto IsFinite =
5813 B.buildFCmp(CmpInst::FCMP_OLT, LLT::scalar(1), Fabs, Inf, Flags);
5814 auto Zero = B.buildConstant(InstrExpTy, 0);
5815 Exp = B.buildSelect(InstrExpTy, IsFinite, Exp, Zero);
5816 Mant = B.buildSelect(Ty, IsFinite, Mant, Val);
5817 }
5818
5819 B.buildCopy(Res0, Mant);
5820 B.buildSExtOrTrunc(Res1, Exp);
5821
5822 MI.eraseFromParent();
5823 return true;
5824}
5825
5826bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI,
5827 MachineRegisterInfo &MRI,
5828 MachineIRBuilder &B) const {
5829 Register Res = MI.getOperand(0).getReg();
5830 Register LHS = MI.getOperand(2).getReg();
5831 Register RHS = MI.getOperand(3).getReg();
5832 uint16_t Flags = MI.getFlags();
5833
5834 LLT S32 = LLT::scalar(32);
5835 LLT S1 = LLT::scalar(1);
5836
5837 auto Abs = B.buildFAbs(S32, RHS, Flags);
5838 const APFloat C0Val(1.0f);
5839
5840 auto C0 = B.buildFConstant(S32, 0x1p+96f);
5841 auto C1 = B.buildFConstant(S32, 0x1p-32f);
5842 auto C2 = B.buildFConstant(S32, 1.0f);
5843
5844 auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags);
5845 auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags);
5846
5847 auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags);
5848
5849 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32})
5850 .addUse(Mul0.getReg(0))
5851 .setMIFlags(Flags);
5852
5853 auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags);
5854
5855 B.buildFMul(Res, Sel, Mul1, Flags);
5856
5857 MI.eraseFromParent();
5858 return true;
5859}
5860
5861bool AMDGPULegalizerInfo::legalizeFSQRTF16(MachineInstr &MI,
5862 MachineRegisterInfo &MRI,
5863 MachineIRBuilder &B) const {
5864 // Bypass the correct expansion that a standard promotion through G_FSQRT
5865 // would get. The f32 op is accurate enough for the f16 case.
5866 unsigned Flags = MI.getFlags();
5867 assert(!ST.has16BitInsts());
5868 const LLT F32 = LLT::scalar(32);
5869 auto Ext = B.buildFPExt(F32, MI.getOperand(1), Flags);
5870 auto Sqrt = B.buildIntrinsic(Intrinsic::amdgcn_sqrt, {F32})
5871 .addUse(Ext.getReg(0))
5872 .setMIFlags(Flags);
5873 B.buildFPTrunc(MI.getOperand(0), Sqrt, Flags);
5874 MI.eraseFromParent();
5875 return true;
5876}
5877
5878bool AMDGPULegalizerInfo::legalizeFSQRTF32(MachineInstr &MI,
5879 MachineRegisterInfo &MRI,
5880 MachineIRBuilder &B) const {
5881 MachineFunction &MF = B.getMF();
5882 Register Dst = MI.getOperand(0).getReg();
5883 Register X = MI.getOperand(1).getReg();
5884 const unsigned Flags = MI.getFlags();
5885 const LLT S1 = LLT::scalar(1);
5886 const LLT F32 = LLT::scalar(32);
5887 const LLT I32 = LLT::scalar(32);
5888
5889 if (allowApproxFunc(MF, Flags)) {
5890 B.buildIntrinsic(Intrinsic::amdgcn_sqrt, ArrayRef<Register>({Dst}))
5891 .addUse(X)
5892 .setMIFlags(Flags);
5893 MI.eraseFromParent();
5894 return true;
5895 }
5896
5897 auto ScaleThreshold = B.buildFConstant(F32, 0x1.0p-96f);
5898 auto NeedScale = B.buildFCmp(CmpInst::FCMP_OGT, S1, ScaleThreshold, X, Flags);
5899 auto ScaleUpFactor = B.buildFConstant(F32, 0x1.0p+32f);
5900 auto ScaledX = B.buildFMul(F32, X, ScaleUpFactor, Flags);
5901 auto SqrtX = B.buildSelect(F32, NeedScale, ScaledX, X, Flags);
5902
5903 Register SqrtS = MRI.createGenericVirtualRegister(F32);
5904 if (needsDenormHandlingF32(MF, X, Flags)) {
5905 B.buildIntrinsic(Intrinsic::amdgcn_sqrt, ArrayRef<Register>({SqrtS}))
5906 .addUse(SqrtX.getReg(0))
5907 .setMIFlags(Flags);
5908
5909 auto NegOne = B.buildConstant(I32, -1);
5910 auto SqrtSNextDown = B.buildAdd(I32, SqrtS, NegOne);
5911
5912 auto NegSqrtSNextDown = B.buildFNeg(F32, SqrtSNextDown, Flags);
5913 auto SqrtVP = B.buildFMA(F32, NegSqrtSNextDown, SqrtS, SqrtX, Flags);
5914
5915 auto PosOne = B.buildConstant(I32, 1);
5916 auto SqrtSNextUp = B.buildAdd(I32, SqrtS, PosOne);
5917
5918 auto NegSqrtSNextUp = B.buildFNeg(F32, SqrtSNextUp, Flags);
5919 auto SqrtVS = B.buildFMA(F32, NegSqrtSNextUp, SqrtS, SqrtX, Flags);
5920
5921 auto Zero = B.buildFConstant(F32, 0.0f);
5922 auto SqrtVPLE0 = B.buildFCmp(CmpInst::FCMP_OLE, S1, SqrtVP, Zero, Flags);
5923
5924 SqrtS =
5925 B.buildSelect(F32, SqrtVPLE0, SqrtSNextDown, SqrtS, Flags).getReg(0);
5926
5927 auto SqrtVPVSGT0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, SqrtVS, Zero, Flags);
5928 SqrtS =
5929 B.buildSelect(F32, SqrtVPVSGT0, SqrtSNextUp, SqrtS, Flags).getReg(0);
5930 } else {
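// No denorm handling needed: seed with v_rsq and refine both the sqrt
// estimate (SqrtS) and the half-reciprocal estimate (SqrtH) with fused
// multiply-add correction steps.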
5931 auto SqrtR =
5932 B.buildIntrinsic(Intrinsic::amdgcn_rsq, {F32}).addReg(SqrtX.getReg(0));
5933 B.buildFMul(SqrtS, SqrtX, SqrtR, Flags);
5934
5935 auto Half = B.buildFConstant(F32, 0.5f);
5936 auto SqrtH = B.buildFMul(F32, SqrtR, Half, Flags);
5937 auto NegSqrtH = B.buildFNeg(F32, SqrtH, Flags);
5938 auto SqrtE = B.buildFMA(F32, NegSqrtH, SqrtS, Half, Flags);
5939 SqrtH = B.buildFMA(F32, SqrtH, SqrtE, SqrtH, Flags);
5940 SqrtS = B.buildFMA(F32, SqrtS, SqrtE, SqrtS, Flags).getReg(0);
5941 auto NegSqrtS = B.buildFNeg(F32, SqrtS, Flags);
5942 auto SqrtD = B.buildFMA(F32, NegSqrtS, SqrtS, SqrtX, Flags);
5943 SqrtS = B.buildFMA(F32, SqrtD, SqrtH, SqrtS, Flags).getReg(0);
5944 }
5945
5946 auto ScaleDownFactor = B.buildFConstant(F32, 0x1.0p-16f);
5947
5948 auto ScaledDown = B.buildFMul(F32, SqrtS, ScaleDownFactor, Flags);
5949
5950 SqrtS = B.buildSelect(F32, NeedScale, ScaledDown, SqrtS, Flags).getReg(0);
5951
5952 auto IsZeroOrInf = B.buildIsFPClass(LLT::scalar(1), SqrtX, fcZero | fcPosInf);
5953 B.buildSelect(Dst, IsZeroOrInf, SqrtX, SqrtS, Flags);
5954
5955 MI.eraseFromParent();
5956 return true;
5957}
5958
5959bool AMDGPULegalizerInfo::legalizeFSQRTF64(MachineInstr &MI,
5960 MachineRegisterInfo &MRI,
5961 MachineIRBuilder &B) const {
5962 // For double type, the SQRT and RSQ instructions don't have the required
5963 // precision, so we apply Goldschmidt's algorithm to improve the result:
5964 //
5965 // y0 = rsq(x)
5966 // g0 = x * y0
5967 // h0 = 0.5 * y0
5968 //
5969 // r0 = 0.5 - h0 * g0
5970 // g1 = g0 * r0 + g0
5971 // h1 = h0 * r0 + h0
5972 //
5973 // r1 = 0.5 - h1 * g1 => d0 = x - g1 * g1
5974 // g2 = g1 * r1 + g1 g2 = d0 * h1 + g1
5975 // h2 = h1 * r1 + h1
5976 //
5977 // r2 = 0.5 - h2 * g2 => d1 = x - g2 * g2
5978 // g3 = g2 * r2 + g2 g3 = d1 * h1 + g2
5979 //
5980 // sqrt(x) = g3
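//
// Throughout the iteration, g_i converges to sqrt(x) and h_i converges to
// 1 / (2 * sqrt(x)); r_i = 0.5 - h_i * g_i and d_i = x - g_i * g_i are the
// residual terms being driven toward zero.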
5981
5982 const LLT S1 = LLT::scalar(1);
5983 const LLT S32 = LLT::scalar(32);
5984 const LLT F64 = LLT::scalar(64);
5985
5986 Register Dst = MI.getOperand(0).getReg();
5987 assert(MRI.getType(Dst) == F64 && "only expect to lower f64 sqrt");
5988
5989 Register X = MI.getOperand(1).getReg();
5990 unsigned Flags = MI.getFlags();
5991
5992 Register SqrtX = X;
5993 Register Scaling, ZeroInt;
5994 if (!MI.getFlag(MachineInstr::FmAfn)) {
5995 auto ScaleConstant = B.buildFConstant(F64, 0x1.0p-767);
5996
5997 ZeroInt = B.buildConstant(S32, 0).getReg(0);
5998 Scaling = B.buildFCmp(FCmpInst::FCMP_OLT, S1, X, ScaleConstant).getReg(0);
5999
6000 // Scale up input if it is too small.
6001 auto ScaleUpFactor = B.buildConstant(S32, 256);
6002 auto ScaleUp = B.buildSelect(S32, Scaling, ScaleUpFactor, ZeroInt);
6003 SqrtX = B.buildFLdexp(F64, X, ScaleUp, Flags).getReg(0);
6004 }
6005
6006 auto SqrtY = B.buildIntrinsic(Intrinsic::amdgcn_rsq, {F64}).addReg(SqrtX);
6007
6008 auto Half = B.buildFConstant(F64, 0.5);
6009 auto SqrtH0 = B.buildFMul(F64, SqrtY, Half);
6010 auto SqrtS0 = B.buildFMul(F64, SqrtX, SqrtY);
6011
6012 auto NegSqrtH0 = B.buildFNeg(F64, SqrtH0);
6013 auto SqrtR0 = B.buildFMA(F64, NegSqrtH0, SqrtS0, Half);
6014
6015 auto SqrtS1 = B.buildFMA(F64, SqrtS0, SqrtR0, SqrtS0);
6016 auto SqrtH1 = B.buildFMA(F64, SqrtH0, SqrtR0, SqrtH0);
6017
6018 auto NegSqrtS1 = B.buildFNeg(F64, SqrtS1);
6019 auto SqrtD0 = B.buildFMA(F64, NegSqrtS1, SqrtS1, SqrtX);
6020
6021 auto SqrtS2 = B.buildFMA(F64, SqrtD0, SqrtH1, SqrtS1);
6022
6023 Register SqrtRet = SqrtS2.getReg(0);
6024 if (!MI.getFlag(MachineInstr::FmAfn)) {
6025 auto NegSqrtS2 = B.buildFNeg(F64, SqrtS2);
6026 auto SqrtD1 = B.buildFMA(F64, NegSqrtS2, SqrtS2, SqrtX);
6027 auto SqrtD2 = B.buildFMA(F64, SqrtD1, SqrtH1, SqrtS2);
6028
6029 // Scale down the result.
6030 auto ScaleDownFactor = B.buildConstant(S32, -128);
6031 auto ScaleDown = B.buildSelect(S32, Scaling, ScaleDownFactor, ZeroInt);
6032 SqrtRet = B.buildFLdexp(F64, SqrtD2, ScaleDown, Flags).getReg(0);
6033 }
6034
6035 Register IsZeroOrInf;
6036 if (MI.getFlag(MachineInstr::FmNoInfs)) {
6037 auto ZeroFP = B.buildFConstant(F64, 0.0);
6038 IsZeroOrInf = B.buildFCmp(FCmpInst::FCMP_OEQ, S1, SqrtX, ZeroFP).getReg(0);
6039 } else {
6040 IsZeroOrInf = B.buildIsFPClass(S1, SqrtX, fcZero | fcPosInf).getReg(0);
6041 }
6042
6043 // TODO: Check for DAZ and expand to subnormals
6044
6045 // If x is +INF, +0, or -0, use its original value
6046 B.buildSelect(Dst, IsZeroOrInf, SqrtX, SqrtRet, Flags);
6047
6048 MI.eraseFromParent();
6049 return true;
6050}
6051
6052bool AMDGPULegalizerInfo::legalizeFSQRT(MachineInstr &MI,
6053 MachineRegisterInfo &MRI,
6054 MachineIRBuilder &B) const {
6055 LLT Ty = MRI.getType(MI.getOperand(0).getReg());
6056 if (Ty == LLT::scalar(32))
6057 return legalizeFSQRTF32(MI, MRI, B);
6058 if (Ty == LLT::scalar(64))
6059 return legalizeFSQRTF64(MI, MRI, B);
6060 if (Ty == LLT::scalar(16))
6061 return legalizeFSQRTF16(MI, MRI, B);
6062 return false;
6063}
6064
6065// Expand llvm.amdgcn.rsq.clamp on targets that don't support the instruction.
6066// FIXME: Why do we handle this one but not other removed instructions?
6067//
6068// Reciprocal square root. The clamp prevents infinite results, clamping
6069// infinities to max_float. D.f = 1.0 / sqrt(S0.f), result clamped to
6070// +-max_float.
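// In other words: result = fmax(fmin(rsq(x), +max_float), -max_float), using
// the IEEE min/max variants when the function's IEEE mode bit is set.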
6071bool AMDGPULegalizerInfo::legalizeRsqClampIntrinsic(MachineInstr &MI,
6072 MachineRegisterInfo &MRI,
6073 MachineIRBuilder &B) const {
6074 if (ST.getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS)
6075 return true;
6076
6077 Register Dst = MI.getOperand(0).getReg();
6078 Register Src = MI.getOperand(2).getReg();
6079 auto Flags = MI.getFlags();
6080
6081 LLT Ty = MRI.getType(Dst);
6082
6083 const fltSemantics *FltSemantics;
6084 if (Ty == LLT::scalar(32))
6085 FltSemantics = &APFloat::IEEEsingle();
6086 else if (Ty == LLT::scalar(64))
6087 FltSemantics = &APFloat::IEEEdouble();
6088 else
6089 return false;
6090
6091 auto Rsq = B.buildIntrinsic(Intrinsic::amdgcn_rsq, {Ty})
6092 .addUse(Src)
6093 .setMIFlags(Flags);
6094
6095 // We don't need to concern ourselves with the snan handling difference here,
6096 // since the rsq already quieted it (or not); use the variant that selects directly.
6097 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
6098 const bool UseIEEE = MFI->getMode().IEEE;
6099
6100 auto MaxFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics));
6101 auto ClampMax = UseIEEE ? B.buildFMinNumIEEE(Ty, Rsq, MaxFlt, Flags) :
6102 B.buildFMinNum(Ty, Rsq, MaxFlt, Flags);
6103
6104 auto MinFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics, true));
6105
6106 if (UseIEEE)
6107 B.buildFMaxNumIEEE(Dst, ClampMax, MinFlt, Flags);
6108 else
6109 B.buildFMaxNum(Dst, ClampMax, MinFlt, Flags);
6110 MI.eraseFromParent();
6111 return true;
6112}
6113
6114// TODO: Fix pointer type handling
6115bool AMDGPULegalizerInfo::legalizeLaneOp(LegalizerHelper &Helper,
6116 MachineInstr &MI,
6117 Intrinsic::ID IID) const {
6118
6119 MachineIRBuilder &B = Helper.MIRBuilder;
6120 MachineRegisterInfo &MRI = *B.getMRI();
6121
6122 bool IsPermLane16 = IID == Intrinsic::amdgcn_permlane16 ||
6123 IID == Intrinsic::amdgcn_permlanex16;
6124 bool IsSetInactive = IID == Intrinsic::amdgcn_set_inactive ||
6125 IID == Intrinsic::amdgcn_set_inactive_chain_arg;
6126
6127 auto createLaneOp = [&IID, &B, &MI](Register Src0, Register Src1,
6128 Register Src2, LLT VT) -> Register {
6129 auto LaneOp = B.buildIntrinsic(IID, {VT}).addUse(Src0);
6130 switch (IID) {
6131 case Intrinsic::amdgcn_readfirstlane:
6132 case Intrinsic::amdgcn_permlane64:
6133 return LaneOp.getReg(0);
6134 case Intrinsic::amdgcn_readlane:
6135 case Intrinsic::amdgcn_set_inactive:
6136 case Intrinsic::amdgcn_set_inactive_chain_arg:
6137 return LaneOp.addUse(Src1).getReg(0);
6138 case Intrinsic::amdgcn_writelane:
6139 return LaneOp.addUse(Src1).addUse(Src2).getReg(0);
6140 case Intrinsic::amdgcn_permlane16:
6141 case Intrinsic::amdgcn_permlanex16: {
6142 Register Src3 = MI.getOperand(5).getReg();
6143 int64_t Src4 = MI.getOperand(6).getImm();
6144 int64_t Src5 = MI.getOperand(7).getImm();
6145 return LaneOp.addUse(Src1)
6146 .addUse(Src2)
6147 .addUse(Src3)
6148 .addImm(Src4)
6149 .addImm(Src5)
6150 .getReg(0);
6151 }
6152 case Intrinsic::amdgcn_mov_dpp8:
6153 return LaneOp.addImm(MI.getOperand(3).getImm()).getReg(0);
6154 case Intrinsic::amdgcn_update_dpp:
6155 return LaneOp.addUse(Src1)
6156 .addImm(MI.getOperand(4).getImm())
6157 .addImm(MI.getOperand(5).getImm())
6158 .addImm(MI.getOperand(6).getImm())
6159 .addImm(MI.getOperand(7).getImm())
6160 .getReg(0);
6161 default:
6162 llvm_unreachable("unhandled lane op");
6163 }
6164 };
6165
6166 Register DstReg = MI.getOperand(0).getReg();
6167 Register Src0 = MI.getOperand(2).getReg();
6168 Register Src1, Src2;
6169 if (IID == Intrinsic::amdgcn_readlane || IID == Intrinsic::amdgcn_writelane ||
6170 IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16) {
6171 Src1 = MI.getOperand(3).getReg();
6172 if (IID == Intrinsic::amdgcn_writelane || IsPermLane16) {
6173 Src2 = MI.getOperand(4).getReg();
6174 }
6175 }
6176
6177 LLT Ty = MRI.getType(DstReg);
6178 unsigned Size = Ty.getSizeInBits();
6179
6180 unsigned SplitSize = 32;
6181 if (IID == Intrinsic::amdgcn_update_dpp && (Size % 64 == 0) &&
6182 ST.hasDPALU_DPP() &&
6183 AMDGPU::isLegalDPALU_DPPControl(ST, MI.getOperand(4).getImm()))
6184 SplitSize = 64;
6185
6186 if (Size == SplitSize) {
6187 // Already legal
6188 return true;
6189 }
6190
6191 if (Size < 32) {
6192 Src0 = B.buildAnyExt(S32, Src0).getReg(0);
6193
6194 if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16)
6195 Src1 = B.buildAnyExt(LLT::scalar(32), Src1).getReg(0);
6196
6197 if (IID == Intrinsic::amdgcn_writelane)
6198 Src2 = B.buildAnyExt(LLT::scalar(32), Src2).getReg(0);
6199
6200 Register LaneOpDst = createLaneOp(Src0, Src1, Src2, S32);
6201 B.buildTrunc(DstReg, LaneOpDst);
6202 MI.eraseFromParent();
6203 return true;
6204 }
6205
6206 if (Size % SplitSize != 0)
6207 return false;
6208
6209 LLT PartialResTy = LLT::scalar(SplitSize);
6210 bool NeedsBitcast = false;
6211 if (Ty.isVector()) {
6212 LLT EltTy = Ty.getElementType();
6213 unsigned EltSize = EltTy.getSizeInBits();
6214 if (EltSize == SplitSize) {
6215 PartialResTy = EltTy;
6216 } else if (EltSize == 16 || EltSize == 32) {
6217 unsigned NElem = SplitSize / EltSize;
6218 PartialResTy = Ty.changeElementCount(ElementCount::getFixed(NElem));
6219 } else {
6220 // Handle all other cases via S32/S64 pieces
6221 NeedsBitcast = true;
6222 }
6223 }
6224
6225 SmallVector<Register, 4> PartialRes;
6226 unsigned NumParts = Size / SplitSize;
6227 MachineInstrBuilder Src0Parts = B.buildUnmerge(PartialResTy, Src0);
6228 MachineInstrBuilder Src1Parts, Src2Parts;
6229
6230 if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16)
6231 Src1Parts = B.buildUnmerge(PartialResTy, Src1);
6232
6233 if (IID == Intrinsic::amdgcn_writelane)
6234 Src2Parts = B.buildUnmerge(PartialResTy, Src2);
6235
6236 for (unsigned i = 0; i < NumParts; ++i) {
6237 Src0 = Src0Parts.getReg(i);
6238
6239 if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16)
6240 Src1 = Src1Parts.getReg(i);
6241
6242 if (IID == Intrinsic::amdgcn_writelane)
6243 Src2 = Src2Parts.getReg(i);
6244
6245 PartialRes.push_back(createLaneOp(Src0, Src1, Src2, PartialResTy));
6246 }
6247
6248 if (NeedsBitcast)
6249 B.buildBitcast(DstReg, B.buildMergeLikeInstr(
6250 LLT::scalar(Ty.getSizeInBits()), PartialRes));
6251 else
6252 B.buildMergeLikeInstr(DstReg, PartialRes);
6253
6254 MI.eraseFromParent();
6255 return true;
6256}
6257
6258bool AMDGPULegalizerInfo::getImplicitArgPtr(Register DstReg,
6259 MachineRegisterInfo &MRI,
6260 MachineIRBuilder &B) const {
6261 uint64_t Offset =
6262 ST.getTargetLowering()->getImplicitParameterOffset(
6263 B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT);
6264 LLT DstTy = MRI.getType(DstReg);
6265 LLT IdxTy = LLT::scalar(DstTy.getSizeInBits());
6266
6267 Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy);
6268 if (!loadInputValue(KernargPtrReg, B,
6269 AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR))
6270 return false;
6271
6272 B.buildObjectPtrOffset(DstReg, KernargPtrReg,
6273 B.buildConstant(IdxTy, Offset).getReg(0));
6274 return true;
6275}
6276
6277/// To create a buffer resource from a 64-bit pointer, mask off the upper 32
6278/// bits of the pointer and replace them with the stride argument, then
6279/// merge_values everything together. In the common case of a raw buffer (the
6280/// stride component is 0), we can just AND off the upper half.
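/// For the common case the resulting descriptor is, roughly:
/// word0 = base[31:0], word1 = (stride << 16) | base[47:32],
/// word2 = num_records, word3 = flags.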
6281bool AMDGPULegalizerInfo::legalizePointerAsRsrcIntrin(
6282 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
6283 Register Result = MI.getOperand(0).getReg();
6284 Register Pointer = MI.getOperand(2).getReg();
6285 Register Stride = MI.getOperand(3).getReg();
6286 Register NumRecords = MI.getOperand(4).getReg();
6287 Register Flags = MI.getOperand(5).getReg();
6288
6289 LLT S32 = LLT::scalar(32);
6290 LLT S64 = LLT::scalar(64);
6291
6292 B.setInsertPt(B.getMBB(), ++B.getInsertPt());
6293
6294 auto ExtStride = B.buildAnyExt(S32, Stride);
6295
6296 if (ST.has45BitNumRecordsBufferResource()) {
6297 Register Zero = B.buildConstant(S32, 0).getReg(0);
6298 // Build the lower 64-bit value, which has a 57-bit base and the lower 7-bit
6299 // num_records.
6300 LLT PtrIntTy = LLT::scalar(MRI.getType(Pointer).getSizeInBits());
6301 auto PointerInt = B.buildPtrToInt(PtrIntTy, Pointer);
6302 auto ExtPointer = B.buildAnyExtOrTrunc(S64, PointerInt);
6303 auto NumRecordsLHS = B.buildShl(S64, NumRecords, B.buildConstant(S32, 57));
6304 Register LowHalf = B.buildOr(S64, ExtPointer, NumRecordsLHS).getReg(0);
6305
6306 // Build the higher 64-bit value, which has the higher 38-bit num_records,
6307 // 6-bit zero (omit), 16-bit stride and scale and 4-bit flag.
6308 auto NumRecordsRHS = B.buildLShr(S64, NumRecords, B.buildConstant(S32, 7));
6309 auto ShiftedStride = B.buildShl(S32, ExtStride, B.buildConstant(S32, 12));
6310 auto ExtShiftedStride =
6311 B.buildMergeValues(S64, {Zero, ShiftedStride.getReg(0)});
6312 auto ShiftedFlags = B.buildShl(S32, Flags, B.buildConstant(S32, 28));
6313 auto ExtShiftedFlags =
6314 B.buildMergeValues(S64, {Zero, ShiftedFlags.getReg(0)});
6315 auto CombinedFields = B.buildOr(S64, NumRecordsRHS, ExtShiftedStride);
6316 Register HighHalf =
6317 B.buildOr(S64, CombinedFields, ExtShiftedFlags).getReg(0);
6318 B.buildMergeValues(Result, {LowHalf, HighHalf});
6319 } else {
6320 NumRecords = B.buildTrunc(S32, NumRecords).getReg(0);
6321 auto Unmerge = B.buildUnmerge(S32, Pointer);
6322 auto LowHalf = Unmerge.getReg(0);
6323 auto HighHalf = Unmerge.getReg(1);
6324
6325 auto AndMask = B.buildConstant(S32, 0x0000ffff);
6326 auto Masked = B.buildAnd(S32, HighHalf, AndMask);
6327 auto ShiftConst = B.buildConstant(S32, 16);
6328 auto ShiftedStride = B.buildShl(S32, ExtStride, ShiftConst);
6329 auto NewHighHalf = B.buildOr(S32, Masked, ShiftedStride);
6330 Register NewHighHalfReg = NewHighHalf.getReg(0);
6331 B.buildMergeValues(Result, {LowHalf, NewHighHalfReg, NumRecords, Flags});
6332 }
6333
6334 MI.eraseFromParent();
6335 return true;
6336}
6337
6338bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI,
6339 MachineRegisterInfo &MRI,
6340 MachineIRBuilder &B) const {
6341 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
6342 if (!MFI->isEntryFunction()) {
6343 return legalizePreloadedArgIntrin(MI, MRI, B,
6344 AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
6345 }
6346
6347 Register DstReg = MI.getOperand(0).getReg();
6348 if (!getImplicitArgPtr(DstReg, MRI, B))
6349 return false;
6350
6351 MI.eraseFromParent();
6352 return true;
6353}
6354
6355bool AMDGPULegalizerInfo::getLDSKernelId(Register DstReg,
6356 MachineRegisterInfo &MRI,
6357 MachineIRBuilder &B) const {
6358 Function &F = B.getMF().getFunction();
6359 std::optional<uint32_t> KnownSize =
6360 AMDGPUMachineFunction::getLDSKernelIdMetadata(F);
6361 if (KnownSize.has_value())
6362 B.buildConstant(DstReg, *KnownSize);
6363 return false;
6364}
6365
6366bool AMDGPULegalizerInfo::legalizeLDSKernelId(MachineInstr &MI,
6367 MachineRegisterInfo &MRI,
6368 MachineIRBuilder &B) const {
6369
6370 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
6371 if (!MFI->isEntryFunction()) {
6372 return legalizePreloadedArgIntrin(MI, MRI, B,
6373 AMDGPUFunctionArgInfo::LDS_KERNEL_ID);
6374 }
6375
6376 Register DstReg = MI.getOperand(0).getReg();
6377 if (!getLDSKernelId(DstReg, MRI, B))
6378 return false;
6379
6380 MI.eraseFromParent();
6381 return true;
6382}
6383
6384bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI,
6385 MachineRegisterInfo &MRI,
6386 MachineIRBuilder &B,
6387 unsigned AddrSpace) const {
6388 const LLT S32 = LLT::scalar(32);
6389 auto Unmerge = B.buildUnmerge(S32, MI.getOperand(2).getReg());
6390 Register Hi32 = Unmerge.getReg(1);
6391
6392 if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS &&
6393 ST.hasGloballyAddressableScratch()) {
6394 Register FlatScratchBaseHi =
6395 B.buildInstr(AMDGPU::S_MOV_B32, {S32},
6396 {Register(AMDGPU::SRC_FLAT_SCRATCH_BASE_HI)})
6397 .getReg(0);
6398 MRI.setRegClass(FlatScratchBaseHi, &AMDGPU::SReg_32RegClass);
6399 // Test bits 63..58 against the aperture address.
6400 Register XOR = B.buildXor(S32, Hi32, FlatScratchBaseHi).getReg(0);
6401 B.buildICmp(ICmpInst::ICMP_ULT, MI.getOperand(0), XOR,
6402 B.buildConstant(S32, 1u << 26));
6403 } else {
6404 Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B);
6405 B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg);
6406 }
6407 MI.eraseFromParent();
6408 return true;
6409}
6410
6411// The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args:
6412// offset (the offset that is included in bounds checking and swizzling, to be
6413// split between the instruction's voffset and immoffset fields) and soffset
6414// (the offset that is excluded from bounds checking and swizzling, to go in
6415// the instruction's soffset field). This function takes the first kind of
6416// offset and figures out how to split it between voffset and immoffset.
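// For example, assuming a maximum immediate of 4095, a combined offset of
// (vgpr + 4100) is typically split into voffset = vgpr + 4096 and
// immoffset = 4.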
6417std::pair<Register, unsigned>
6418AMDGPULegalizerInfo::splitBufferOffsets(MachineIRBuilder &B,
6419 Register OrigOffset) const {
6420 const unsigned MaxImm = SIInstrInfo::getMaxMUBUFImmOffset(ST);
6421 Register BaseReg;
6422 unsigned ImmOffset;
6423 const LLT S32 = LLT::scalar(32);
6424 MachineRegisterInfo &MRI = *B.getMRI();
6425
6426 // On GFX1250+, voffset and immoffset are zero-extended from 32 bits before
6427 // being added, so we can only safely match a 32-bit addition with no unsigned
6428 // overflow.
6429 bool CheckNUW = ST.hasGFX1250Insts();
6430 std::tie(BaseReg, ImmOffset) = AMDGPU::getBaseWithConstantOffset(
6431 MRI, OrigOffset, /*KnownBits=*/nullptr, CheckNUW);
6432
6433 // If BaseReg is a pointer, convert it to int.
6434 if (MRI.getType(BaseReg).isPointer())
6435 BaseReg = B.buildPtrToInt(MRI.getType(OrigOffset), BaseReg).getReg(0);
6436
6437 // If the immediate value is too big for the immoffset field, put only bits
6438 // that would normally fit in the immoffset field. The remaining value that
6439 // is copied/added for the voffset field is a large power of 2, and it
6440 // stands more chance of being CSEd with the copy/add for another similar
6441 // load/store.
6442 // However, do not do that rounding down if that is a negative
6443 // number, as it appears to be illegal to have a negative offset in the
6444 // vgpr, even if adding the immediate offset makes it positive.
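// For example, a constant offset of -16 is not rounded down to a -4096
// voffset with a 4080 immoffset; the whole -16 stays in the voffset add and
// immoffset remains 0.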
6445 unsigned Overflow = ImmOffset & ~MaxImm;
6446 ImmOffset -= Overflow;
6447 if ((int32_t)Overflow < 0) {
6448 Overflow += ImmOffset;
6449 ImmOffset = 0;
6450 }
6451
6452 if (Overflow != 0) {
6453 if (!BaseReg) {
6454 BaseReg = B.buildConstant(S32, Overflow).getReg(0);
6455 } else {
6456 auto OverflowVal = B.buildConstant(S32, Overflow);
6457 BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0);
6458 }
6459 }
6460
6461 if (!BaseReg)
6462 BaseReg = B.buildConstant(S32, 0).getReg(0);
6463
6464 return std::pair(BaseReg, ImmOffset);
6465}
6466
6467/// Handle register layout difference for f16 images for some subtargets.
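/// With unpacked D16, each 16-bit element is any-extended into its own 32-bit
/// register (e.g. <4 x s16> data becomes <4 x s32>); other subtargets instead
/// need the data repacked or padded as handled below.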
6468Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B,
6469 MachineRegisterInfo &MRI,
6470 Register Reg,
6471 bool ImageStore) const {
6472 const LLT S16 = LLT::scalar(16);
6473 const LLT S32 = LLT::scalar(32);
6474 LLT StoreVT = MRI.getType(Reg);
6475 assert(StoreVT.isVector() && StoreVT.getElementType() == S16);
6476
6477 if (ST.hasUnpackedD16VMem()) {
6478 auto Unmerge = B.buildUnmerge(S16, Reg);
6479
6480 SmallVector<Register, 4> WideRegs;
6481 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
6482 WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0));
6483
6484 int NumElts = StoreVT.getNumElements();
6485
6486 return B.buildBuildVector(LLT::fixed_vector(NumElts, S32), WideRegs)
6487 .getReg(0);
6488 }
6489
6490 if (ImageStore && ST.hasImageStoreD16Bug()) {
6491 if (StoreVT.getNumElements() == 2) {
6492 SmallVector<Register, 4> PackedRegs;
6493 Reg = B.buildBitcast(S32, Reg).getReg(0);
6494 PackedRegs.push_back(Reg);
6495 PackedRegs.resize(2, B.buildUndef(S32).getReg(0));
6496 return B.buildBuildVector(LLT::fixed_vector(2, S32), PackedRegs)
6497 .getReg(0);
6498 }
6499
6500 if (StoreVT.getNumElements() == 3) {
6501 SmallVector<Register, 4> PackedRegs;
6502 auto Unmerge = B.buildUnmerge(S16, Reg);
6503 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
6504 PackedRegs.push_back(Unmerge.getReg(I));
6505 PackedRegs.resize(6, B.buildUndef(S16).getReg(0));
6506 Reg = B.buildBuildVector(LLT::fixed_vector(6, S16), PackedRegs).getReg(0);
6507 return B.buildBitcast(LLT::fixed_vector(3, S32), Reg).getReg(0);
6508 }
6509
6510 if (StoreVT.getNumElements() == 4) {
6511 SmallVector<Register, 4> PackedRegs;
6512 Reg = B.buildBitcast(LLT::fixed_vector(2, S32), Reg).getReg(0);
6513 auto Unmerge = B.buildUnmerge(S32, Reg);
6514 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
6515 PackedRegs.push_back(Unmerge.getReg(I));
6516 PackedRegs.resize(4, B.buildUndef(S32).getReg(0));
6517 return B.buildBuildVector(LLT::fixed_vector(4, S32), PackedRegs)
6518 .getReg(0);
6519 }
6520
6521 llvm_unreachable("invalid data type");
6522 }
6523
6524 if (StoreVT == LLT::fixed_vector(3, S16)) {
6525 Reg = B.buildPadVectorWithUndefElements(LLT::fixed_vector(4, S16), Reg)
6526 .getReg(0);
6527 }
6528 return Reg;
6529}
6530
6531Register AMDGPULegalizerInfo::fixStoreSourceType(MachineIRBuilder &B,
6532 Register VData, LLT MemTy,
6533 bool IsFormat) const {
6534 MachineRegisterInfo *MRI = B.getMRI();
6535 LLT Ty = MRI->getType(VData);
6536
6537 const LLT S16 = LLT::scalar(16);
6538
6539 // Fixup buffer resources themselves needing to be v4i32.
6540 if (hasBufferRsrcWorkaround(Ty))
6541 return castBufferRsrcToV4I32(VData, B);
6542
6543 if (shouldBitcastLoadStoreType(ST, Ty, MemTy)) {
6544 Ty = getBitcastRegisterType(Ty);
6545 VData = B.buildBitcast(Ty, VData).getReg(0);
6546 }
6547 // Fixup illegal register types for i8 stores.
6548 if (Ty == LLT::scalar(8) || Ty == S16) {
6549 Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0);
6550 return AnyExt;
6551 }
6552
6553 if (Ty.isVector()) {
6554 if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) {
6555 if (IsFormat)
6556 return handleD16VData(B, *MRI, VData);
6557 }
6558 }
6559
6560 return VData;
6561}
6562
6563bool AMDGPULegalizerInfo::legalizeBufferStore(MachineInstr &MI,
6564 LegalizerHelper &Helper,
6565 bool IsTyped,
6566 bool IsFormat) const {
6567 MachineIRBuilder &B = Helper.MIRBuilder;
6568 MachineRegisterInfo &MRI = *B.getMRI();
6569
6570 Register VData = MI.getOperand(1).getReg();
6571 LLT Ty = MRI.getType(VData);
6572 LLT EltTy = Ty.getScalarType();
6573 const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
6574 const LLT S32 = LLT::scalar(32);
6575
6576 MachineMemOperand *MMO = *MI.memoperands_begin();
6577 const int MemSize = MMO->getSize().getValue();
6578 LLT MemTy = MMO->getMemoryType();
6579
6580 VData = fixStoreSourceType(B, VData, MemTy, IsFormat);
6581
6582 castBufferRsrcArgToV4I32(MI, B, 2);
6583 Register RSrc = MI.getOperand(2).getReg();
6584
6585 unsigned ImmOffset;
6586
6587 // The typed intrinsics add an immediate after the registers.
6588 const unsigned NumVIndexOps = IsTyped ? 8 : 7;
6589
6590 // The struct intrinsic variants add one additional operand over raw.
6591 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
6592 Register VIndex;
6593 int OpOffset = 0;
6594 if (HasVIndex) {
6595 VIndex = MI.getOperand(3).getReg();
6596 OpOffset = 1;
6597 } else {
6598 VIndex = B.buildConstant(S32, 0).getReg(0);
6599 }
6600
6601 Register VOffset = MI.getOperand(3 + OpOffset).getReg();
6602 Register SOffset = MI.getOperand(4 + OpOffset).getReg();
6603
6604 unsigned Format = 0;
6605 if (IsTyped) {
6606 Format = MI.getOperand(5 + OpOffset).getImm();
6607 ++OpOffset;
6608 }
6609
6610 unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
6611
6612 std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset);
6613
6614 unsigned Opc;
6615 if (IsTyped) {
6616 Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16 :
6617 AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT;
6618 } else if (IsFormat) {
6619 Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16 :
6620 AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT;
6621 } else {
6622 switch (MemSize) {
6623 case 1:
6624 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE;
6625 break;
6626 case 2:
6627 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT;
6628 break;
6629 default:
6630 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE;
6631 break;
6632 }
6633 }
6634
6635 auto MIB = B.buildInstr(Opc)
6636 .addUse(VData) // vdata
6637 .addUse(RSrc) // rsrc
6638 .addUse(VIndex) // vindex
6639 .addUse(VOffset) // voffset
6640 .addUse(SOffset) // soffset
6641 .addImm(ImmOffset); // offset(imm)
6642
6643 if (IsTyped)
6644 MIB.addImm(Format);
6645
6646 MIB.addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm)
6647 .addImm(HasVIndex ? -1 : 0) // idxen(imm)
6648 .addMemOperand(MMO);
6649
6650 MI.eraseFromParent();
6651 return true;
6652}
6653
6654static void buildBufferLoad(unsigned Opc, Register LoadDstReg, Register RSrc,
6655 Register VIndex, Register VOffset, Register SOffset,
6656 unsigned ImmOffset, unsigned Format,
6657 unsigned AuxiliaryData, MachineMemOperand *MMO,
6658 bool IsTyped, bool HasVIndex, MachineIRBuilder &B) {
6659 auto MIB = B.buildInstr(Opc)
6660 .addDef(LoadDstReg) // vdata
6661 .addUse(RSrc) // rsrc
6662 .addUse(VIndex) // vindex
6663 .addUse(VOffset) // voffset
6664 .addUse(SOffset) // soffset
6665 .addImm(ImmOffset); // offset(imm)
6666
6667 if (IsTyped)
6668 MIB.addImm(Format);
6669
6670 MIB.addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm)
6671 .addImm(HasVIndex ? -1 : 0) // idxen(imm)
6672 .addMemOperand(MMO);
6673}
6674
6675bool AMDGPULegalizerInfo::legalizeBufferLoad(MachineInstr &MI,
6676 LegalizerHelper &Helper,
6677 bool IsFormat,
6678 bool IsTyped) const {
6679 MachineIRBuilder &B = Helper.MIRBuilder;
6680 MachineRegisterInfo &MRI = *B.getMRI();
6681 GISelChangeObserver &Observer = Helper.Observer;
6682
6683 // FIXME: Verifier should enforce 1 MMO for these intrinsics.
6684 MachineMemOperand *MMO = *MI.memoperands_begin();
6685 const LLT MemTy = MMO->getMemoryType();
6686 const LLT S32 = LLT::scalar(32);
6687
6688 Register Dst = MI.getOperand(0).getReg();
6689
6690 Register StatusDst;
6691 int OpOffset = 0;
6692 assert(MI.getNumExplicitDefs() == 1 || MI.getNumExplicitDefs() == 2);
6693 bool IsTFE = MI.getNumExplicitDefs() == 2;
6694 if (IsTFE) {
6695 StatusDst = MI.getOperand(1).getReg();
6696 ++OpOffset;
6697 }
6698
6699 castBufferRsrcArgToV4I32(MI, B, 2 + OpOffset);
6700 Register RSrc = MI.getOperand(2 + OpOffset).getReg();
6701
6702 // The typed intrinsics add an immediate after the registers.
6703 const unsigned NumVIndexOps = IsTyped ? 8 : 7;
6704
6705 // The struct intrinsic variants add one additional operand over raw.
6706 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps + OpOffset;
6707 Register VIndex;
6708 if (HasVIndex) {
6709 VIndex = MI.getOperand(3 + OpOffset).getReg();
6710 ++OpOffset;
6711 } else {
6712 VIndex = B.buildConstant(S32, 0).getReg(0);
6713 }
6714
6715 Register VOffset = MI.getOperand(3 + OpOffset).getReg();
6716 Register SOffset = MI.getOperand(4 + OpOffset).getReg();
6717
6718 unsigned Format = 0;
6719 if (IsTyped) {
6720 Format = MI.getOperand(5 + OpOffset).getImm();
6721 ++OpOffset;
6722 }
6723
6724 unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
6725 unsigned ImmOffset;
6726
6727 LLT Ty = MRI.getType(Dst);
6728 // Turn loads of addrspace-8 pointers into 4xs32 loads here, so the rest of
6729 // the logic doesn't have to handle that case.
6730 if (hasBufferRsrcWorkaround(Ty)) {
6731 Observer.changingInstr(MI);
6732 Ty = castBufferRsrcFromV4I32(MI, B, MRI, 0);
6733 Observer.changedInstr(MI);
6734 Dst = MI.getOperand(0).getReg();
6735 B.setInsertPt(B.getMBB(), MI);
6736 }
6737 if (shouldBitcastLoadStoreType(ST, Ty, MemTy)) {
6738 Ty = getBitcastRegisterType(Ty);
6739 Observer.changingInstr(MI);
6740 Helper.bitcastDst(MI, Ty, 0);
6741 Observer.changedInstr(MI);
6742 Dst = MI.getOperand(0).getReg();
6743 B.setInsertPt(B.getMBB(), MI);
6744 }
6745
6746 LLT EltTy = Ty.getScalarType();
6747 const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
6748 const bool Unpacked = ST.hasUnpackedD16VMem();
6749
6750 std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset);
6751
6752 unsigned Opc;
6753
6754 // TODO: Support TFE for typed and narrow loads.
6755 if (IsTyped) {
6756 if (IsTFE)
6757 return false;
6758 Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 :
6759 AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT;
6760 } else if (IsFormat) {
6761 if (IsD16) {
6762 if (IsTFE)
6763 return false;
6764 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16;
6765 } else {
6766 Opc = IsTFE ? AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_TFE
6767 : AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT;
6768 }
6769 } else {
6770 switch (MemTy.getSizeInBits()) {
6771 case 8:
6772 Opc = IsTFE ? AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE_TFE
6773 : AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE;
6774 break;
6775 case 16:
6776 Opc = IsTFE ? AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT_TFE
6777 : AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT;
6778 break;
6779 default:
6780 Opc = IsTFE ? AMDGPU::G_AMDGPU_BUFFER_LOAD_TFE
6781 : AMDGPU::G_AMDGPU_BUFFER_LOAD;
6782 break;
6783 }
6784 }
6785
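// TFE loads return an extra status dword, so load NumValueDWords + 1 dwords
// and unmerge the status value off the end.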
6786 if (IsTFE) {
6787 unsigned NumValueDWords = divideCeil(Ty.getSizeInBits(), 32);
6788 unsigned NumLoadDWords = NumValueDWords + 1;
6789 LLT LoadTy = LLT::fixed_vector(NumLoadDWords, S32);
6790 Register LoadDstReg = B.getMRI()->createGenericVirtualRegister(LoadTy);
6791 buildBufferLoad(Opc, LoadDstReg, RSrc, VIndex, VOffset, SOffset, ImmOffset,
6792 Format, AuxiliaryData, MMO, IsTyped, HasVIndex, B);
6793 if (MemTy.getSizeInBits() < 32) {
6794 Register ExtDst = B.getMRI()->createGenericVirtualRegister(S32);
6795 B.buildUnmerge({ExtDst, StatusDst}, LoadDstReg);
6796 B.buildTrunc(Dst, ExtDst);
6797 } else if (NumValueDWords == 1) {
6798 B.buildUnmerge({Dst, StatusDst}, LoadDstReg);
6799 } else {
6800 SmallVector<Register, 5> LoadElts;
6801 for (unsigned I = 0; I != NumValueDWords; ++I)
6802 LoadElts.push_back(B.getMRI()->createGenericVirtualRegister(S32));
6803 LoadElts.push_back(StatusDst);
6804 B.buildUnmerge(LoadElts, LoadDstReg);
6805 LoadElts.truncate(NumValueDWords);
6806 B.buildMergeLikeInstr(Dst, LoadElts);
6807 }
6808 } else if ((!IsD16 && MemTy.getSizeInBits() < 32) ||
6809 (IsD16 && !Ty.isVector())) {
6810 Register LoadDstReg = B.getMRI()->createGenericVirtualRegister(S32);
6811 buildBufferLoad(Opc, LoadDstReg, RSrc, VIndex, VOffset, SOffset, ImmOffset,
6812 Format, AuxiliaryData, MMO, IsTyped, HasVIndex, B);
6813 B.setInsertPt(B.getMBB(), ++B.getInsertPt());
6814 B.buildTrunc(Dst, LoadDstReg);
6815 } else if (Unpacked && IsD16 && Ty.isVector()) {
6816 LLT UnpackedTy = Ty.changeElementSize(32);
6817 Register LoadDstReg = B.getMRI()->createGenericVirtualRegister(UnpackedTy);
6818 buildBufferLoad(Opc, LoadDstReg, RSrc, VIndex, VOffset, SOffset, ImmOffset,
6819 Format, AuxiliaryData, MMO, IsTyped, HasVIndex, B);
6820 B.setInsertPt(B.getMBB(), ++B.getInsertPt());
6821 // FIXME: G_TRUNC should work, but legalization currently fails
6822 auto Unmerge = B.buildUnmerge(S32, LoadDstReg);
6823 SmallVector<Register, 4> Repack;
6824 for (unsigned I = 0, N = Unmerge->getNumOperands() - 1; I != N; ++I)
6825 Repack.push_back(B.buildTrunc(EltTy, Unmerge.getReg(I)).getReg(0));
6826 B.buildMergeLikeInstr(Dst, Repack);
6827 } else {
6828 buildBufferLoad(Opc, Dst, RSrc, VIndex, VOffset, SOffset, ImmOffset, Format,
6829 AuxiliaryData, MMO, IsTyped, HasVIndex, B);
6830 }
6831
6832 MI.eraseFromParent();
6833 return true;
6834}
6835
6836static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) {
6837 switch (IntrID) {
6838 case Intrinsic::amdgcn_raw_buffer_atomic_swap:
6839 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_swap:
6840 case Intrinsic::amdgcn_struct_buffer_atomic_swap:
6841 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_swap:
6842 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP;
6843 case Intrinsic::amdgcn_raw_buffer_atomic_add:
6844 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_add:
6845 case Intrinsic::amdgcn_struct_buffer_atomic_add:
6846 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_add:
6847 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD;
6848 case Intrinsic::amdgcn_raw_buffer_atomic_sub:
6849 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub:
6850 case Intrinsic::amdgcn_struct_buffer_atomic_sub:
6851 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub:
6852 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB;
6853 case Intrinsic::amdgcn_raw_buffer_atomic_smin:
6854 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smin:
6855 case Intrinsic::amdgcn_struct_buffer_atomic_smin:
6856 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smin:
6857 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN;
6858 case Intrinsic::amdgcn_raw_buffer_atomic_umin:
6859 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umin:
6860 case Intrinsic::amdgcn_struct_buffer_atomic_umin:
6861 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umin:
6862 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN;
6863 case Intrinsic::amdgcn_raw_buffer_atomic_smax:
6864 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smax:
6865 case Intrinsic::amdgcn_struct_buffer_atomic_smax:
6866 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smax:
6867 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX;
6868 case Intrinsic::amdgcn_raw_buffer_atomic_umax:
6869 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umax:
6870 case Intrinsic::amdgcn_struct_buffer_atomic_umax:
6871 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umax:
6872 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX;
6873 case Intrinsic::amdgcn_raw_buffer_atomic_and:
6874 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_and:
6875 case Intrinsic::amdgcn_struct_buffer_atomic_and:
6876 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_and:
6877 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND;
6878 case Intrinsic::amdgcn_raw_buffer_atomic_or:
6879 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_or:
6880 case Intrinsic::amdgcn_struct_buffer_atomic_or:
6881 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_or:
6882 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR;
6883 case Intrinsic::amdgcn_raw_buffer_atomic_xor:
6884 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_xor:
6885 case Intrinsic::amdgcn_struct_buffer_atomic_xor:
6886 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_xor:
6887 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR;
6888 case Intrinsic::amdgcn_raw_buffer_atomic_inc:
6889 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_inc:
6890 case Intrinsic::amdgcn_struct_buffer_atomic_inc:
6891 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_inc:
6892 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC;
6893 case Intrinsic::amdgcn_raw_buffer_atomic_dec:
6894 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_dec:
6895 case Intrinsic::amdgcn_struct_buffer_atomic_dec:
6896 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_dec:
6897 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC;
6898 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
6899 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap:
6900 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
6901 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cmpswap:
6902 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP;
6903 case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
6904 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fadd:
6905 case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
6906 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fadd:
6907 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD;
6908 case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
6909 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmin:
6910 case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
6911 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmin:
6912 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN;
6913 case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
6914 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmax:
6915 case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
6916 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmax:
6917 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX;
6918 case Intrinsic::amdgcn_raw_buffer_atomic_sub_clamp_u32:
6919 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub_clamp_u32:
6920 case Intrinsic::amdgcn_struct_buffer_atomic_sub_clamp_u32:
6921 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub_clamp_u32:
6922 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB_CLAMP_U32;
6923 case Intrinsic::amdgcn_raw_buffer_atomic_cond_sub_u32:
6924 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cond_sub_u32:
6925 case Intrinsic::amdgcn_struct_buffer_atomic_cond_sub_u32:
6926 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cond_sub_u32:
6927 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_COND_SUB_U32;
6928 default:
6929 llvm_unreachable("unhandled atomic opcode");
6930 }
6931}
6932
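// Lower a raw/struct buffer atomic intrinsic to the matching
// G_AMDGPU_BUFFER_ATOMIC_* pseudo. The pseudo's operand order, built below, is:
// vdata, [cmp for cmpswap], rsrc, vindex, voffset, soffset, imm offset,
// cachepolicy, idxen.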
6933bool AMDGPULegalizerInfo::legalizeBufferAtomic(MachineInstr &MI,
6934                                               MachineIRBuilder &B,
6935                                               Intrinsic::ID IID) const {
6936 const bool IsCmpSwap =
6937 IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap ||
6938 IID == Intrinsic::amdgcn_struct_buffer_atomic_cmpswap ||
6939 IID == Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap ||
6940 IID == Intrinsic::amdgcn_struct_ptr_buffer_atomic_cmpswap;
6941
6942 Register Dst = MI.getOperand(0).getReg();
6943  // Since we don't have 128-bit atomics, we don't need to handle the case of
6944  // p8 arguments to the atomic itself.
6945 Register VData = MI.getOperand(2).getReg();
6946
6947 Register CmpVal;
6948 int OpOffset = 0;
6949
6950 if (IsCmpSwap) {
6951 CmpVal = MI.getOperand(3).getReg();
6952 ++OpOffset;
6953 }
6954
6955 castBufferRsrcArgToV4I32(MI, B, 3 + OpOffset);
6956 Register RSrc = MI.getOperand(3 + OpOffset).getReg();
6957 const unsigned NumVIndexOps = IsCmpSwap ? 9 : 8;
6958
6959 // The struct intrinsic variants add one additional operand over raw.
6960 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
6961 Register VIndex;
6962 if (HasVIndex) {
6963 VIndex = MI.getOperand(4 + OpOffset).getReg();
6964 ++OpOffset;
6965 } else {
6966 VIndex = B.buildConstant(LLT::scalar(32), 0).getReg(0);
6967 }
6968
6969 Register VOffset = MI.getOperand(4 + OpOffset).getReg();
6970 Register SOffset = MI.getOperand(5 + OpOffset).getReg();
6971 unsigned AuxiliaryData = MI.getOperand(6 + OpOffset).getImm();
6972
6973 MachineMemOperand *MMO = *MI.memoperands_begin();
6974
6975 unsigned ImmOffset;
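// splitBufferOffsets splits the 32-bit voffset into a variable register part
// and a constant part small enough to fit the instruction's immediate offset
// field, so known-constant offset bits do not consume a VGPR.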
6976 std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset);
6977
6978 auto MIB = B.buildInstr(getBufferAtomicPseudo(IID))
6979 .addDef(Dst)
6980 .addUse(VData); // vdata
6981
6982 if (IsCmpSwap)
6983 MIB.addReg(CmpVal);
6984
6985 MIB.addUse(RSrc) // rsrc
6986 .addUse(VIndex) // vindex
6987 .addUse(VOffset) // voffset
6988 .addUse(SOffset) // soffset
6989 .addImm(ImmOffset) // offset(imm)
6990 .addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm)
6991 .addImm(HasVIndex ? -1 : 0) // idxen(imm)
6992 .addMemOperand(MMO);
6993
6994 MI.eraseFromParent();
6995 return true;
6996}
6997
6998/// Turn a set of s16 typed registers in \p AddrRegs into a dword sized
6999/// vector with s16 typed elements.
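/// For example, a pair of s16 coordinates (x, y) becomes one <2 x s16>
/// register; a trailing odd coordinate, or the dz/dh and dz/dv of an odd
/// gradient count, is padded with an undef half.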
7000static void packImage16bitOpsToDwords(MachineIRBuilder &B, MachineInstr &MI,
7001                                      SmallVectorImpl<Register> &PackedAddrs,
7002                                      unsigned ArgOffset,
7003                                      const AMDGPU::ImageDimIntrinsicInfo *Intr,
7004                                      bool IsA16, bool IsG16) {
7005 const LLT S16 = LLT::scalar(16);
7006 const LLT V2S16 = LLT::fixed_vector(2, 16);
7007 auto EndIdx = Intr->VAddrEnd;
7008
7009 for (unsigned I = Intr->VAddrStart; I < EndIdx; I++) {
7010 MachineOperand &SrcOp = MI.getOperand(ArgOffset + I);
7011 if (!SrcOp.isReg())
7012 continue; // _L to _LZ may have eliminated this.
7013
7014 Register AddrReg = SrcOp.getReg();
7015
7016 if ((I < Intr->GradientStart) ||
7017 (I >= Intr->GradientStart && I < Intr->CoordStart && !IsG16) ||
7018 (I >= Intr->CoordStart && !IsA16)) {
7019 if ((I < Intr->GradientStart) && IsA16 &&
7020 (B.getMRI()->getType(AddrReg) == S16)) {
7021 assert(I == Intr->BiasIndex && "Got unexpected 16-bit extra argument");
7022        // Special handling of bias when A16 is on. Bias is of type half but
7023        // occupies a full 32 bits.
7024 PackedAddrs.push_back(
7025 B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)})
7026 .getReg(0));
7027 } else {
7028 assert((!IsA16 || Intr->NumBiasArgs == 0 || I != Intr->BiasIndex) &&
7029 "Bias needs to be converted to 16 bit in A16 mode");
7030 // Handle any gradient or coordinate operands that should not be packed
7031 AddrReg = B.buildBitcast(V2S16, AddrReg).getReg(0);
7032 PackedAddrs.push_back(AddrReg);
7033 }
7034 } else {
7035 // Dz/dh, dz/dv and the last odd coord are packed with undef. Also, in 1D,
7036 // derivatives dx/dh and dx/dv are packed with undef.
7037 if (((I + 1) >= EndIdx) ||
7038 ((Intr->NumGradients / 2) % 2 == 1 &&
7039 (I == static_cast<unsigned>(Intr->GradientStart +
7040 (Intr->NumGradients / 2) - 1) ||
7041 I == static_cast<unsigned>(Intr->GradientStart +
7042 Intr->NumGradients - 1))) ||
7043 // Check for _L to _LZ optimization
7044 !MI.getOperand(ArgOffset + I + 1).isReg()) {
7045 PackedAddrs.push_back(
7046 B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)})
7047 .getReg(0));
7048 } else {
7049 PackedAddrs.push_back(
7050 B.buildBuildVector(
7051 V2S16, {AddrReg, MI.getOperand(ArgOffset + I + 1).getReg()})
7052 .getReg(0));
7053 ++I;
7054 }
7055 }
7056 }
7057}
7058
7059/// Convert from separate vaddr components to a single vector address register,
7060/// and replace the remaining operands with $noreg.
7061static void convertImageAddrToPacked(MachineIRBuilder &B, MachineInstr &MI,
7062                                     int DimIdx, int NumVAddrs) {
7063 const LLT S32 = LLT::scalar(32);
7064 (void)S32;
7065 SmallVector<Register, 8> AddrRegs;
7066 for (int I = 0; I != NumVAddrs; ++I) {
7067 MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
7068 if (SrcOp.isReg()) {
7069 AddrRegs.push_back(SrcOp.getReg());
7070 assert(B.getMRI()->getType(SrcOp.getReg()) == S32);
7071 }
7072 }
7073
7074 int NumAddrRegs = AddrRegs.size();
7075 if (NumAddrRegs != 1) {
7076 auto VAddr =
7077 B.buildBuildVector(LLT::fixed_vector(NumAddrRegs, 32), AddrRegs);
7078 MI.getOperand(DimIdx).setReg(VAddr.getReg(0));
7079 }
7080
7081 for (int I = 1; I != NumVAddrs; ++I) {
7082 MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
7083 if (SrcOp.isReg())
7084 MI.getOperand(DimIdx + I).setReg(AMDGPU::NoRegister);
7085 }
7086}
7087
7088/// Rewrite image intrinsics to use register layouts expected by the subtarget.
7089///
7090/// Depending on the subtarget, loads/stores with 16-bit element data need to be
7091/// rewritten to use the low half of 32-bit registers, or directly use a packed
7092/// layout. 16-bit addresses should also sometimes be packed into 32-bit
7093/// registers.
7094///
7095/// We don't want to directly select image instructions just yet, but also want
7096/// to expose all register repacking to the legalizer/combiners. We also don't
7097/// want a selected instruction entering RegBankSelect. In order to avoid
7098/// defining a multitude of intermediate image instructions, directly hack on
7099/// the intrinsic's arguments. In cases like a16 addresses, this requires
7100/// padding now unnecessary arguments with $noreg.
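/// For example, when A16 addressing is used the 16-bit coordinates are packed
/// two to a 32-bit register, and any address operands made redundant by the
/// packing are replaced with $noreg instead of shifting later operands down.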
7101bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
7102    MachineInstr &MI, MachineIRBuilder &B, GISelChangeObserver &Observer,
7103    const AMDGPU::ImageDimIntrinsicInfo *Intr) const {
7104
7105 const MachineFunction &MF = *MI.getMF();
7106 const unsigned NumDefs = MI.getNumExplicitDefs();
7107 const unsigned ArgOffset = NumDefs + 1;
7108 bool IsTFE = NumDefs == 2;
7109 // We are only processing the operands of d16 image operations on subtargets
7110 // that use the unpacked register layout, or need to repack the TFE result.
7111
7112 // TODO: Do we need to guard against already legalized intrinsics?
7113 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
7115
7116 MachineRegisterInfo *MRI = B.getMRI();
7117 const LLT S32 = LLT::scalar(32);
7118 const LLT S16 = LLT::scalar(16);
7119 const LLT V2S16 = LLT::fixed_vector(2, 16);
7120
7121 unsigned DMask = 0;
7122 Register VData;
7123 LLT Ty;
7124
7125 if (!BaseOpcode->NoReturn || BaseOpcode->Store) {
7126 VData = MI.getOperand(NumDefs == 0 ? 1 : 0).getReg();
7127 Ty = MRI->getType(VData);
7128 }
7129
7130 const bool IsAtomicPacked16Bit =
7131 (BaseOpcode->BaseOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_F16 ||
7132 BaseOpcode->BaseOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_BF16);
7133
7134  // Check for 16-bit addresses and pack them if so.
7135 LLT GradTy =
7136 MRI->getType(MI.getOperand(ArgOffset + Intr->GradientStart).getReg());
7137 LLT AddrTy =
7138 MRI->getType(MI.getOperand(ArgOffset + Intr->CoordStart).getReg());
7139 const bool IsG16 =
7140 ST.hasG16() ? (BaseOpcode->Gradients && GradTy == S16) : GradTy == S16;
7141 const bool IsA16 = AddrTy == S16;
7142 const bool IsD16 = !IsAtomicPacked16Bit && Ty.getScalarType() == S16;
7143
7144 int DMaskLanes = 0;
7145 if (!BaseOpcode->Atomic) {
7146 DMask = MI.getOperand(ArgOffset + Intr->DMaskIndex).getImm();
7147 if (BaseOpcode->Gather4) {
7148 DMaskLanes = 4;
7149 } else if (DMask != 0) {
7150 DMaskLanes = llvm::popcount(DMask);
7151 } else if (!IsTFE && !BaseOpcode->Store) {
7152 // If dmask is 0, this is a no-op load. This can be eliminated.
7153 B.buildUndef(MI.getOperand(0));
7154 MI.eraseFromParent();
7155 return true;
7156 }
7157 }
7158
7159 Observer.changingInstr(MI);
7160 scope_exit ChangedInstr([&] { Observer.changedInstr(MI); });
7161
7162 const unsigned StoreOpcode = IsD16 ? AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE_D16
7163 : AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE;
7164 const unsigned LoadOpcode = IsD16 ? AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_D16
7165 : AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD;
7166 unsigned NewOpcode = LoadOpcode;
7167 if (BaseOpcode->Store)
7168 NewOpcode = StoreOpcode;
7169 else if (BaseOpcode->NoReturn)
7170 NewOpcode = AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_NORET;
7171
7172 // Track that we legalized this
7173 MI.setDesc(B.getTII().get(NewOpcode));
7174
7175  // Expecting to get an error flag since TFC is on and dmask is 0. Force
7176  // dmask to be at least 1, otherwise the instruction will fail.
7177 if (IsTFE && DMask == 0) {
7178 DMask = 0x1;
7179 DMaskLanes = 1;
7180 MI.getOperand(ArgOffset + Intr->DMaskIndex).setImm(DMask);
7181 }
7182
7183 if (BaseOpcode->Atomic) {
7184 Register VData0 = MI.getOperand(2).getReg();
7185 LLT Ty = MRI->getType(VData0);
7186
7187 // TODO: Allow atomic swap and bit ops for v2s16/v4s16
7188 if (Ty.isVector() && !IsAtomicPacked16Bit)
7189 return false;
7190
7191 if (BaseOpcode->AtomicX2) {
7192 Register VData1 = MI.getOperand(3).getReg();
7193 // The two values are packed in one register.
7194 LLT PackedTy = LLT::fixed_vector(2, Ty);
7195 auto Concat = B.buildBuildVector(PackedTy, {VData0, VData1});
7196 MI.getOperand(2).setReg(Concat.getReg(0));
7197 MI.getOperand(3).setReg(AMDGPU::NoRegister);
7198 }
7199 }
7200
7201 unsigned CorrectedNumVAddrs = Intr->NumVAddrs;
7202
7203 // Rewrite the addressing register layout before doing anything else.
7204 if (BaseOpcode->Gradients && !ST.hasG16() && (IsA16 != IsG16)) {
7205 // 16 bit gradients are supported, but are tied to the A16 control
7206 // so both gradients and addresses must be 16 bit
7207 return false;
7208 }
7209
7210 if (IsA16 && !ST.hasA16()) {
7211 // A16 not supported
7212 return false;
7213 }
7214
7215 const unsigned NSAMaxSize = ST.getNSAMaxSize(BaseOpcode->Sampler);
7216 const unsigned HasPartialNSA = ST.hasPartialNSAEncoding();
7217
7218 if (IsA16 || IsG16) {
7219 // Even if NumVAddrs == 1 we should pack it into a 32-bit value, because the
7220 // instructions expect VGPR_32
7221 SmallVector<Register, 4> PackedRegs;
7222
7223 packImage16bitOpsToDwords(B, MI, PackedRegs, ArgOffset, Intr, IsA16, IsG16);
7224
7225 // See also below in the non-a16 branch
7226 const bool UseNSA = ST.hasNSAEncoding() &&
7227 PackedRegs.size() >= ST.getNSAThreshold(MF) &&
7228 (PackedRegs.size() <= NSAMaxSize || HasPartialNSA);
7229 const bool UsePartialNSA =
7230 UseNSA && HasPartialNSA && PackedRegs.size() > NSAMaxSize;
7231
7232 if (UsePartialNSA) {
7233 // Pack registers that would go over NSAMaxSize into last VAddr register
7234 LLT PackedAddrTy =
7235 LLT::fixed_vector(2 * (PackedRegs.size() - NSAMaxSize + 1), 16);
7236 auto Concat = B.buildConcatVectors(
7237 PackedAddrTy, ArrayRef(PackedRegs).slice(NSAMaxSize - 1));
7238 PackedRegs[NSAMaxSize - 1] = Concat.getReg(0);
7239 PackedRegs.resize(NSAMaxSize);
7240 } else if (!UseNSA && PackedRegs.size() > 1) {
7241 LLT PackedAddrTy = LLT::fixed_vector(2 * PackedRegs.size(), 16);
7242 auto Concat = B.buildConcatVectors(PackedAddrTy, PackedRegs);
7243 PackedRegs[0] = Concat.getReg(0);
7244 PackedRegs.resize(1);
7245 }
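// Worked example: with NSAMaxSize = 5 and 7 packed dword registers, the last
// three packed registers are concatenated into one <6 x s16> value so only 5
// VAddr operands remain (partial NSA). Without NSA, everything collapses into
// a single concatenated vector address.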
7246
7247 const unsigned NumPacked = PackedRegs.size();
7248 for (unsigned I = Intr->VAddrStart; I < Intr->VAddrEnd; I++) {
7249 MachineOperand &SrcOp = MI.getOperand(ArgOffset + I);
7250 if (!SrcOp.isReg()) {
7251 assert(SrcOp.isImm() && SrcOp.getImm() == 0);
7252 continue;
7253 }
7254
7255 assert(SrcOp.getReg() != AMDGPU::NoRegister);
7256
7257 if (I - Intr->VAddrStart < NumPacked)
7258 SrcOp.setReg(PackedRegs[I - Intr->VAddrStart]);
7259 else
7260 SrcOp.setReg(AMDGPU::NoRegister);
7261 }
7262 } else {
7263 // If the register allocator cannot place the address registers contiguously
7264 // without introducing moves, then using the non-sequential address encoding
7265 // is always preferable, since it saves VALU instructions and is usually a
7266 // wash in terms of code size or even better.
7267 //
7268 // However, we currently have no way of hinting to the register allocator
7269 // that MIMG addresses should be placed contiguously when it is possible to
7270 // do so, so force non-NSA for the common 2-address case as a heuristic.
7271 //
7272 // SIShrinkInstructions will convert NSA encodings to non-NSA after register
7273 // allocation when possible.
7274 //
7275 // Partial NSA is allowed on GFX11+ where the final register is a contiguous
7276 // set of the remaining addresses.
7277 const bool UseNSA = ST.hasNSAEncoding() &&
7278 CorrectedNumVAddrs >= ST.getNSAThreshold(MF) &&
7279 (CorrectedNumVAddrs <= NSAMaxSize || HasPartialNSA);
7280 const bool UsePartialNSA =
7281 UseNSA && HasPartialNSA && CorrectedNumVAddrs > NSAMaxSize;
7282
7283 if (UsePartialNSA) {
7284      convertImageAddrToPacked(B, MI,
7285                               ArgOffset + Intr->VAddrStart + NSAMaxSize - 1,
7286                               Intr->NumVAddrs - NSAMaxSize + 1);
7287 } else if (!UseNSA && Intr->NumVAddrs > 1) {
7288 convertImageAddrToPacked(B, MI, ArgOffset + Intr->VAddrStart,
7289 Intr->NumVAddrs);
7290 }
7291 }
7292
7293 int Flags = 0;
7294 if (IsA16)
7295 Flags |= 1;
7296 if (IsG16)
7297 Flags |= 2;
7298 MI.addOperand(MachineOperand::CreateImm(Flags));
7299
7300 if (BaseOpcode->NoReturn) { // No TFE for stores?
7301 // TODO: Handle dmask trim
7302 if (!Ty.isVector() || !IsD16)
7303 return true;
7304
7305 Register RepackedReg = handleD16VData(B, *MRI, VData, true);
7306 if (RepackedReg != VData) {
7307 MI.getOperand(1).setReg(RepackedReg);
7308 }
7309
7310 return true;
7311 }
7312
7313 Register DstReg = MI.getOperand(0).getReg();
7314 const LLT EltTy = Ty.getScalarType();
7315 const int NumElts = Ty.isVector() ? Ty.getNumElements() : 1;
7316
7317 // Confirm that the return type is large enough for the dmask specified
7318 if (NumElts < DMaskLanes)
7319 return false;
7320
7321 if (NumElts > 4 || DMaskLanes > 4)
7322 return false;
7323
7324  // Image atomic instructions use DMask to specify how many bits the
7325  // input/output data will have: 32 bits (s32, v2s16) or 64 bits (s64, v4s16).
7326  // DMaskLanes for image atomics has a default value of '0'.
7327 // We must be sure that atomic variants (especially packed) will not be
7328 // truncated from v2s16 or v4s16 to s16 type.
7329 //
7330 // ChangeElementCount will be needed for image load where Ty is always scalar.
7331 const unsigned AdjustedNumElts = DMaskLanes == 0 ? 1 : DMaskLanes;
7332 const LLT AdjustedTy =
7333 DMaskLanes == 0
7334 ? Ty
7335 : Ty.changeElementCount(ElementCount::getFixed(AdjustedNumElts));
7336
7337 // The raw dword aligned data component of the load. The only legal cases
7338 // where this matters should be when using the packed D16 format, for
7339 // s16 -> <2 x s16>, and <3 x s16> -> <4 x s16>,
7340 LLT RoundedTy;
7341
7342 // S32 vector to cover all data, plus TFE result element.
7343 LLT TFETy;
7344
7345 // Register type to use for each loaded component. Will be S32 or V2S16.
7346 LLT RegTy;
7347
7348 if (IsD16 && ST.hasUnpackedD16VMem()) {
7349 RoundedTy =
7350 LLT::scalarOrVector(ElementCount::getFixed(AdjustedNumElts), 32);
7351 TFETy = LLT::fixed_vector(AdjustedNumElts + 1, 32);
7352 RegTy = S32;
7353 } else {
7354 unsigned EltSize = EltTy.getSizeInBits();
7355 unsigned RoundedElts = (AdjustedTy.getSizeInBits() + 31) / 32;
7356 unsigned RoundedSize = 32 * RoundedElts;
7357 RoundedTy = LLT::scalarOrVector(
7358 ElementCount::getFixed(RoundedSize / EltSize), EltSize);
7359 TFETy = LLT::fixed_vector(RoundedSize / 32 + 1, S32);
7360 RegTy = !IsTFE && EltSize == 16 ? V2S16 : S32;
7361 }
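// Worked example for a packed-d16 subtarget: a <3 x s16> result with 3 dmask
// lanes gives RoundedTy = <4 x s16>, TFETy = <3 x s32> (two data dwords plus
// the TFE dword), and RegTy = v2s16 without TFE or s32 with TFE.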
7362
7363 // The return type does not need adjustment.
7364 // TODO: Should we change s16 case to s32 or <2 x s16>?
7365 if (!IsTFE && (RoundedTy == Ty || !Ty.isVector()))
7366 return true;
7367
7368 Register Dst1Reg;
7369
7370 // Insert after the instruction.
7371 B.setInsertPt(*MI.getParent(), ++MI.getIterator());
7372
7373 // TODO: For TFE with d16, if we used a TFE type that was a multiple of <2 x
7374 // s16> instead of s32, we would only need 1 bitcast instead of multiple.
7375 const LLT LoadResultTy = IsTFE ? TFETy : RoundedTy;
7376 const int ResultNumRegs = LoadResultTy.getSizeInBits() / 32;
7377
7378 Register NewResultReg = MRI->createGenericVirtualRegister(LoadResultTy);
7379
7380 MI.getOperand(0).setReg(NewResultReg);
7381
7382 // In the IR, TFE is supposed to be used with a 2 element struct return
7383 // type. The instruction really returns these two values in one contiguous
7384 // register, with one additional dword beyond the loaded data. Rewrite the
7385 // return type to use a single register result.
7386
7387 if (IsTFE) {
7388 Dst1Reg = MI.getOperand(1).getReg();
7389 if (MRI->getType(Dst1Reg) != S32)
7390 return false;
7391
7392 // TODO: Make sure the TFE operand bit is set.
7393 MI.removeOperand(1);
7394
7395 // Handle the easy case that requires no repack instructions.
7396 if (Ty == S32) {
7397 B.buildUnmerge({DstReg, Dst1Reg}, NewResultReg);
7398 return true;
7399 }
7400 }
7401
7402 // Now figure out how to copy the new result register back into the old
7403 // result.
7404 SmallVector<Register, 5> ResultRegs(ResultNumRegs, Dst1Reg);
7405
7406 const int NumDataRegs = IsTFE ? ResultNumRegs - 1 : ResultNumRegs;
7407
7408 if (ResultNumRegs == 1) {
7409 assert(!IsTFE);
7410 ResultRegs[0] = NewResultReg;
7411 } else {
7412 // We have to repack into a new vector of some kind.
7413 for (int I = 0; I != NumDataRegs; ++I)
7414 ResultRegs[I] = MRI->createGenericVirtualRegister(RegTy);
7415 B.buildUnmerge(ResultRegs, NewResultReg);
7416
7417 // Drop the final TFE element to get the data part. The TFE result is
7418 // directly written to the right place already.
7419 if (IsTFE)
7420 ResultRegs.resize(NumDataRegs);
7421 }
7422
7423 // For an s16 scalar result, we form an s32 result with a truncate regardless
7424 // of packed vs. unpacked.
7425 if (IsD16 && !Ty.isVector()) {
7426 B.buildTrunc(DstReg, ResultRegs[0]);
7427 return true;
7428 }
7429
7430 // Avoid a build/concat_vector of 1 entry.
7431 if (Ty == V2S16 && NumDataRegs == 1 && !ST.hasUnpackedD16VMem()) {
7432 B.buildBitcast(DstReg, ResultRegs[0]);
7433 return true;
7434 }
7435
7436 assert(Ty.isVector());
7437
7438 if (IsD16) {
7439 // For packed D16 results with TFE enabled, all the data components are
7440 // S32. Cast back to the expected type.
7441 //
7442    // TODO: We don't really need to load s32 elements. We would only need one
7443    // cast for the TFE result if a multiple of v2s16 was used.
7444 if (RegTy != V2S16 && !ST.hasUnpackedD16VMem()) {
7445 for (Register &Reg : ResultRegs)
7446 Reg = B.buildBitcast(V2S16, Reg).getReg(0);
7447 } else if (ST.hasUnpackedD16VMem()) {
7448 for (Register &Reg : ResultRegs)
7449 Reg = B.buildTrunc(S16, Reg).getReg(0);
7450 }
7451 }
7452
7453 auto padWithUndef = [&](LLT Ty, int NumElts) {
7454 if (NumElts == 0)
7455 return;
7456 Register Undef = B.buildUndef(Ty).getReg(0);
7457 for (int I = 0; I != NumElts; ++I)
7458 ResultRegs.push_back(Undef);
7459 };
7460
7461 // Pad out any elements eliminated due to the dmask.
7462 LLT ResTy = MRI->getType(ResultRegs[0]);
7463 if (!ResTy.isVector()) {
7464 padWithUndef(ResTy, NumElts - ResultRegs.size());
7465 B.buildBuildVector(DstReg, ResultRegs);
7466 return true;
7467 }
7468
7469 assert(!ST.hasUnpackedD16VMem() && ResTy == V2S16);
7470 const int RegsToCover = (Ty.getSizeInBits() + 31) / 32;
7471
7472 // Deal with the one annoying legal case.
7473 const LLT V3S16 = LLT::fixed_vector(3, 16);
7474 if (Ty == V3S16) {
7475 if (IsTFE) {
7476 if (ResultRegs.size() == 1) {
7477 NewResultReg = ResultRegs[0];
7478 } else if (ResultRegs.size() == 2) {
7479 LLT V4S16 = LLT::fixed_vector(4, 16);
7480 NewResultReg = B.buildConcatVectors(V4S16, ResultRegs).getReg(0);
7481 } else {
7482 return false;
7483 }
7484 }
7485
7486 if (MRI->getType(DstReg).getNumElements() <
7487 MRI->getType(NewResultReg).getNumElements()) {
7488 B.buildDeleteTrailingVectorElements(DstReg, NewResultReg);
7489 } else {
7490 B.buildPadVectorWithUndefElements(DstReg, NewResultReg);
7491 }
7492 return true;
7493 }
7494
7495 padWithUndef(ResTy, RegsToCover - ResultRegs.size());
7496 B.buildConcatVectors(DstReg, ResultRegs);
7497 return true;
7498}
7499
7500bool AMDGPULegalizerInfo::legalizeSBufferLoad(LegalizerHelper &Helper,
7501                                              MachineInstr &MI) const {
7502 MachineIRBuilder &B = Helper.MIRBuilder;
7503 GISelChangeObserver &Observer = Helper.Observer;
7504
7505 Register OrigDst = MI.getOperand(0).getReg();
7506 Register Dst;
7507 LLT Ty = B.getMRI()->getType(OrigDst);
7508 unsigned Size = Ty.getSizeInBits();
7509 MachineFunction &MF = B.getMF();
7510 unsigned Opc = 0;
7511 if (Size < 32 && ST.hasScalarSubwordLoads()) {
7512 assert(Size == 8 || Size == 16);
7513 Opc = Size == 8 ? AMDGPU::G_AMDGPU_S_BUFFER_LOAD_UBYTE
7514 : AMDGPU::G_AMDGPU_S_BUFFER_LOAD_USHORT;
7515    // The 8-bit and 16-bit scalar buffer load instructions have a 32-bit
7516    // destination register.
7517 Dst = B.getMRI()->createGenericVirtualRegister(LLT::scalar(32));
7518 } else {
7519 Opc = AMDGPU::G_AMDGPU_S_BUFFER_LOAD;
7520 Dst = OrigDst;
7521 }
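// e.g. an s16 s.buffer.load becomes G_AMDGPU_S_BUFFER_LOAD_USHORT with an s32
// destination, which is truncated back to the original narrow type below.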
7522
7523 Observer.changingInstr(MI);
7524
7525 // Handle needing to s.buffer.load() a p8 value.
7526 if (hasBufferRsrcWorkaround(Ty)) {
7527 Ty = castBufferRsrcFromV4I32(MI, B, *B.getMRI(), 0);
7528 B.setInsertPt(B.getMBB(), MI);
7529 }
7530  if (shouldBitcastLoadStoreType(ST, Ty, LLT::scalar(Size))) {
7531    Ty = getBitcastRegisterType(Ty);
7532 Helper.bitcastDst(MI, Ty, 0);
7533 B.setInsertPt(B.getMBB(), MI);
7534 }
7535
7536 // FIXME: We don't really need this intermediate instruction. The intrinsic
7537 // should be fixed to have a memory operand. Since it's readnone, we're not
7538 // allowed to add one.
7539 MI.setDesc(B.getTII().get(Opc));
7540 MI.removeOperand(1); // Remove intrinsic ID
7541
7542 // FIXME: When intrinsic definition is fixed, this should have an MMO already.
7543 const unsigned MemSize = (Size + 7) / 8;
7544 const Align MemAlign = B.getDataLayout().getABITypeAlign(
7550 MemSize, MemAlign);
7551 MI.addMemOperand(MF, MMO);
7552 if (Dst != OrigDst) {
7553 MI.getOperand(0).setReg(Dst);
7554 B.setInsertPt(B.getMBB(), ++B.getInsertPt());
7555 B.buildTrunc(OrigDst, Dst);
7556 }
7557
7558 // If we don't have 96-bit result scalar loads, widening to 128-bit should
7559 // always be legal. We may need to restore this to a 96-bit result if it turns
7560 // out this needs to be converted to a vector load during RegBankSelect.
7561 if (!isPowerOf2_32(Size) && (Size != 96 || !ST.hasScalarDwordx3Loads())) {
7562 if (Ty.isVector())
7563      Helper.moreElementsVectorDst(MI, getPow2VectorType(Ty), 0);
7564    else
7565 Helper.widenScalarDst(MI, getPow2ScalarType(Ty), 0);
7566 }
7567
7568 Observer.changedInstr(MI);
7569 return true;
7570}
7571
7573 MachineInstr &MI) const {
7574 MachineIRBuilder &B = Helper.MIRBuilder;
7575 GISelChangeObserver &Observer = Helper.Observer;
7576 Observer.changingInstr(MI);
7577 MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_S_BUFFER_PREFETCH));
7578 MI.removeOperand(0); // Remove intrinsic ID
7579  castBufferRsrcArgToV4I32(MI, B, 0);
7580  Observer.changedInstr(MI);
7581 return true;
7582}
7583
7584// TODO: Move to selection
7585bool AMDGPULegalizerInfo::legalizeTrap(MachineInstr &MI,
7586                                       MachineRegisterInfo &MRI,
7587                                       MachineIRBuilder &B) const {
7588 if (!ST.hasTrapHandler() ||
7589 ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA)
7590 return legalizeTrapEndpgm(MI, MRI, B);
7591
7592  return ST.supportsGetDoorbellID() ?
7593             legalizeTrapHsa(MI, MRI, B) : legalizeTrapHsaQueuePtr(MI, MRI, B);
7594}
7595
7596bool AMDGPULegalizerInfo::legalizeTrapEndpgm(
7597    MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
7598  const DebugLoc &DL = MI.getDebugLoc();
7599 MachineBasicBlock &BB = B.getMBB();
7600 MachineFunction *MF = BB.getParent();
7601
7602 if (BB.succ_empty() && std::next(MI.getIterator()) == BB.end()) {
7603 BuildMI(BB, BB.end(), DL, B.getTII().get(AMDGPU::S_ENDPGM))
7604 .addImm(0);
7605 MI.eraseFromParent();
7606 return true;
7607 }
7608
7609 // We need a block split to make the real endpgm a terminator. We also don't
7610 // want to break phis in successor blocks, so we can't just delete to the
7611 // end of the block.
7612 BB.splitAt(MI, false /*UpdateLiveIns*/);
7613  MachineBasicBlock *TrapBB = MF->CreateMachineBasicBlock();
7614  MF->push_back(TrapBB);
7615 BuildMI(*TrapBB, TrapBB->end(), DL, B.getTII().get(AMDGPU::S_ENDPGM))
7616 .addImm(0);
7617 BuildMI(BB, &MI, DL, B.getTII().get(AMDGPU::S_CBRANCH_EXECNZ))
7618 .addMBB(TrapBB);
7619
7620 BB.addSuccessor(TrapBB);
7621 MI.eraseFromParent();
7622 return true;
7623}
7624
7625bool AMDGPULegalizerInfo::legalizeTrapHsaQueuePtr(
7626    MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
7627  MachineFunction &MF = B.getMF();
7628 const LLT S64 = LLT::scalar(64);
7629
7630 Register SGPR01(AMDGPU::SGPR0_SGPR1);
7631 // For code object version 5, queue_ptr is passed through implicit kernarg.
7637 ST.getTargetLowering()->getImplicitParameterOffset(B.getMF(), Param);
7638
7639 Register KernargPtrReg = MRI.createGenericVirtualRegister(
7641
7642 if (!loadInputValue(KernargPtrReg, B,
7644 return false;
7645
7646 // TODO: can we be smarter about machine pointer info?
7649 PtrInfo.getWithOffset(Offset),
7653
7654 // Pointer address
7657 B.buildObjectPtrOffset(LoadAddr, KernargPtrReg,
7658 B.buildConstant(LLT::scalar(64), Offset).getReg(0));
7659 // Load address
7660 Register Temp = B.buildLoad(S64, LoadAddr, *MMO).getReg(0);
7661 B.buildCopy(SGPR01, Temp);
7662 B.buildInstr(AMDGPU::S_TRAP)
7663 .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap))
7664 .addReg(SGPR01, RegState::Implicit);
7665 MI.eraseFromParent();
7666 return true;
7667 }
7668
7669 // Pass queue pointer to trap handler as input, and insert trap instruction
7670 // Reference: https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi
7671 Register LiveIn =
7674 return false;
7675
7676 B.buildCopy(SGPR01, LiveIn);
7677 B.buildInstr(AMDGPU::S_TRAP)
7678 .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap))
7679 .addReg(SGPR01, RegState::Implicit);
7680
7681 MI.eraseFromParent();
7682 return true;
7683}
7684
7685bool AMDGPULegalizerInfo::legalizeTrapHsa(MachineInstr &MI,
7686                                          MachineRegisterInfo &MRI,
7687                                          MachineIRBuilder &B) const {
7688 // We need to simulate the 's_trap 2' instruction on targets that run in
7689 // PRIV=1 (where it is treated as a nop).
7690 if (ST.hasPrivEnabledTrap2NopBug()) {
7691 ST.getInstrInfo()->insertSimulatedTrap(MRI, B.getMBB(), MI,
7692 MI.getDebugLoc());
7693 MI.eraseFromParent();
7694 return true;
7695 }
7696
7697 B.buildInstr(AMDGPU::S_TRAP)
7698 .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap));
7699 MI.eraseFromParent();
7700 return true;
7701}
7702
7703bool AMDGPULegalizerInfo::legalizeDebugTrap(MachineInstr &MI,
7704                                            MachineRegisterInfo &MRI,
7705                                            MachineIRBuilder &B) const {
7706  // On a non-HSA path, or if the trap handler is disabled, report a warning
7707  // accordingly.
7708 if (!ST.hasTrapHandler() ||
7709 ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA) {
7710 Function &Fn = B.getMF().getFunction();
7712 Fn, "debugtrap handler not supported", MI.getDebugLoc(), DS_Warning));
7713 } else {
7714 // Insert debug-trap instruction
7715 B.buildInstr(AMDGPU::S_TRAP)
7716 .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSADebugTrap));
7717 }
7718
7719 MI.eraseFromParent();
7720 return true;
7721}
7722
7723bool AMDGPULegalizerInfo::legalizeBVHIntersectRayIntrinsic(
7724    MachineInstr &MI, MachineIRBuilder &B) const {
7725 MachineRegisterInfo &MRI = *B.getMRI();
7726 const LLT S16 = LLT::scalar(16);
7727 const LLT S32 = LLT::scalar(32);
7728 const LLT V2S16 = LLT::fixed_vector(2, 16);
7729 const LLT V3S32 = LLT::fixed_vector(3, 32);
7730
7731 Register DstReg = MI.getOperand(0).getReg();
7732 Register NodePtr = MI.getOperand(2).getReg();
7733 Register RayExtent = MI.getOperand(3).getReg();
7734 Register RayOrigin = MI.getOperand(4).getReg();
7735 Register RayDir = MI.getOperand(5).getReg();
7736 Register RayInvDir = MI.getOperand(6).getReg();
7737 Register TDescr = MI.getOperand(7).getReg();
7738
7739 if (!ST.hasGFX10_AEncoding()) {
7740 Function &Fn = B.getMF().getFunction();
7742 Fn, "intrinsic not supported on subtarget", MI.getDebugLoc()));
7743 return false;
7744 }
7745
7746 const bool IsGFX11 = AMDGPU::isGFX11(ST);
7747 const bool IsGFX11Plus = AMDGPU::isGFX11Plus(ST);
7748 const bool IsGFX12Plus = AMDGPU::isGFX12Plus(ST);
7749 const bool IsA16 = MRI.getType(RayDir).getElementType().getSizeInBits() == 16;
7750 const bool Is64 = MRI.getType(NodePtr).getSizeInBits() == 64;
7751 const unsigned NumVDataDwords = 4;
7752 const unsigned NumVAddrDwords = IsA16 ? (Is64 ? 9 : 8) : (Is64 ? 12 : 11);
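// Address dword count: node_ptr (1 or 2) + ray_extent (1) + ray_origin (3) +
// ray_dir and ray_inv_dir (3 + 3, or 3 total when packed as a16 halves),
// i.e. 8/9 dwords for a16 and 11/12 otherwise.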
7753 const unsigned NumVAddrs = IsGFX11Plus ? (IsA16 ? 4 : 5) : NumVAddrDwords;
7754 const bool UseNSA =
7755 IsGFX12Plus || (ST.hasNSAEncoding() && NumVAddrs <= ST.getNSAMaxSize());
7756
7757 const unsigned BaseOpcodes[2][2] = {
7758 {AMDGPU::IMAGE_BVH_INTERSECT_RAY, AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16},
7759 {AMDGPU::IMAGE_BVH64_INTERSECT_RAY,
7760 AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16}};
7761 int Opcode;
7762 if (UseNSA) {
7763 Opcode = AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16],
7764 IsGFX12Plus ? AMDGPU::MIMGEncGfx12
7765 : IsGFX11 ? AMDGPU::MIMGEncGfx11NSA
7766 : AMDGPU::MIMGEncGfx10NSA,
7767 NumVDataDwords, NumVAddrDwords);
7768 } else {
7769 assert(!IsGFX12Plus);
7770 Opcode = AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16],
7771 IsGFX11 ? AMDGPU::MIMGEncGfx11Default
7772 : AMDGPU::MIMGEncGfx10Default,
7773 NumVDataDwords, NumVAddrDwords);
7774 }
7775 assert(Opcode != -1);
7776
7777  SmallVector<Register, 12> Ops;
7778  if (UseNSA && IsGFX11Plus) {
7779 auto packLanes = [&Ops, &S32, &V3S32, &B](Register Src) {
7780 auto Unmerge = B.buildUnmerge({S32, S32, S32}, Src);
7781 auto Merged = B.buildMergeLikeInstr(
7782 V3S32, {Unmerge.getReg(0), Unmerge.getReg(1), Unmerge.getReg(2)});
7783 Ops.push_back(Merged.getReg(0));
7784 };
7785
7786 Ops.push_back(NodePtr);
7787 Ops.push_back(RayExtent);
7788 packLanes(RayOrigin);
7789
7790 if (IsA16) {
7791 auto UnmergeRayDir = B.buildUnmerge({S16, S16, S16}, RayDir);
7792 auto UnmergeRayInvDir = B.buildUnmerge({S16, S16, S16}, RayInvDir);
7793 auto MergedDir = B.buildMergeLikeInstr(
7794 V3S32,
7795 {B.buildBitcast(
7796 S32, B.buildMergeLikeInstr(V2S16, {UnmergeRayInvDir.getReg(0),
7797 UnmergeRayDir.getReg(0)}))
7798 .getReg(0),
7799 B.buildBitcast(
7800 S32, B.buildMergeLikeInstr(V2S16, {UnmergeRayInvDir.getReg(1),
7801 UnmergeRayDir.getReg(1)}))
7802 .getReg(0),
7803 B.buildBitcast(
7804 S32, B.buildMergeLikeInstr(V2S16, {UnmergeRayInvDir.getReg(2),
7805 UnmergeRayDir.getReg(2)}))
7806 .getReg(0)});
7807 Ops.push_back(MergedDir.getReg(0));
7808 } else {
7809 packLanes(RayDir);
7810 packLanes(RayInvDir);
7811 }
7812 } else {
7813 if (Is64) {
7814 auto Unmerge = B.buildUnmerge({S32, S32}, NodePtr);
7815 Ops.push_back(Unmerge.getReg(0));
7816 Ops.push_back(Unmerge.getReg(1));
7817 } else {
7818 Ops.push_back(NodePtr);
7819 }
7820 Ops.push_back(RayExtent);
7821
7822 auto packLanes = [&Ops, &S32, &B](Register Src) {
7823 auto Unmerge = B.buildUnmerge({S32, S32, S32}, Src);
7824 Ops.push_back(Unmerge.getReg(0));
7825 Ops.push_back(Unmerge.getReg(1));
7826 Ops.push_back(Unmerge.getReg(2));
7827 };
7828
7829 packLanes(RayOrigin);
7830 if (IsA16) {
7831 auto UnmergeRayDir = B.buildUnmerge({S16, S16, S16}, RayDir);
7832 auto UnmergeRayInvDir = B.buildUnmerge({S16, S16, S16}, RayInvDir);
7833      Register R1 = MRI.createGenericVirtualRegister(S32);
7834      Register R2 = MRI.createGenericVirtualRegister(S32);
7835      Register R3 = MRI.createGenericVirtualRegister(S32);
7836      B.buildMergeLikeInstr(R1,
7837 {UnmergeRayDir.getReg(0), UnmergeRayDir.getReg(1)});
7838 B.buildMergeLikeInstr(
7839 R2, {UnmergeRayDir.getReg(2), UnmergeRayInvDir.getReg(0)});
7840 B.buildMergeLikeInstr(
7841 R3, {UnmergeRayInvDir.getReg(1), UnmergeRayInvDir.getReg(2)});
7842 Ops.push_back(R1);
7843 Ops.push_back(R2);
7844 Ops.push_back(R3);
7845 } else {
7846 packLanes(RayDir);
7847 packLanes(RayInvDir);
7848 }
7849 }
7850
7851 if (!UseNSA) {
7852 // Build a single vector containing all the operands so far prepared.
7853 LLT OpTy = LLT::fixed_vector(Ops.size(), 32);
7854 Register MergedOps = B.buildMergeLikeInstr(OpTy, Ops).getReg(0);
7855 Ops.clear();
7856 Ops.push_back(MergedOps);
7857 }
7858
7859 auto MIB = B.buildInstr(AMDGPU::G_AMDGPU_BVH_INTERSECT_RAY)
7860 .addDef(DstReg)
7861 .addImm(Opcode);
7862
7863 for (Register R : Ops) {
7864 MIB.addUse(R);
7865 }
7866
7867 MIB.addUse(TDescr)
7868 .addImm(IsA16 ? 1 : 0)
7869 .cloneMemRefs(MI);
7870
7871 MI.eraseFromParent();
7872 return true;
7873}
7874
7875bool AMDGPULegalizerInfo::legalizeBVHDualOrBVH8IntersectRayIntrinsic(
7876    MachineInstr &MI, MachineIRBuilder &B) const {
7877 const LLT S32 = LLT::scalar(32);
7878 const LLT V2S32 = LLT::fixed_vector(2, 32);
7879
7880 Register DstReg = MI.getOperand(0).getReg();
7881 Register DstOrigin = MI.getOperand(1).getReg();
7882 Register DstDir = MI.getOperand(2).getReg();
7883 Register NodePtr = MI.getOperand(4).getReg();
7884 Register RayExtent = MI.getOperand(5).getReg();
7885 Register InstanceMask = MI.getOperand(6).getReg();
7886 Register RayOrigin = MI.getOperand(7).getReg();
7887 Register RayDir = MI.getOperand(8).getReg();
7888 Register Offsets = MI.getOperand(9).getReg();
7889 Register TDescr = MI.getOperand(10).getReg();
7890
7891 if (!ST.hasBVHDualAndBVH8Insts()) {
7892 Function &Fn = B.getMF().getFunction();
7894 Fn, "intrinsic not supported on subtarget", MI.getDebugLoc()));
7895 return false;
7896 }
7897
7898 bool IsBVH8 = cast<GIntrinsic>(MI).getIntrinsicID() ==
7899 Intrinsic::amdgcn_image_bvh8_intersect_ray;
7900 const unsigned NumVDataDwords = 10;
7901 const unsigned NumVAddrDwords = IsBVH8 ? 11 : 12;
7902 int Opcode = AMDGPU::getMIMGOpcode(
7903 IsBVH8 ? AMDGPU::IMAGE_BVH8_INTERSECT_RAY
7904 : AMDGPU::IMAGE_BVH_DUAL_INTERSECT_RAY,
7905 AMDGPU::MIMGEncGfx12, NumVDataDwords, NumVAddrDwords);
7906 assert(Opcode != -1);
7907
7908 auto RayExtentInstanceMaskVec = B.buildMergeLikeInstr(
7909 V2S32, {RayExtent, B.buildAnyExt(S32, InstanceMask)});
7910
7911 B.buildInstr(IsBVH8 ? AMDGPU::G_AMDGPU_BVH8_INTERSECT_RAY
7912 : AMDGPU::G_AMDGPU_BVH_DUAL_INTERSECT_RAY)
7913 .addDef(DstReg)
7914 .addDef(DstOrigin)
7915 .addDef(DstDir)
7916 .addImm(Opcode)
7917 .addUse(NodePtr)
7918 .addUse(RayExtentInstanceMaskVec.getReg(0))
7919 .addUse(RayOrigin)
7920 .addUse(RayDir)
7921 .addUse(Offsets)
7922 .addUse(TDescr)
7923 .cloneMemRefs(MI);
7924
7925 MI.eraseFromParent();
7926 return true;
7927}
7928
7929bool AMDGPULegalizerInfo::legalizeStackSave(MachineInstr &MI,
7930                                            MachineIRBuilder &B) const {
7931  const SITargetLowering *TLI = ST.getTargetLowering();
7932  Register StackPtr = TLI->getStackPointerRegisterToSaveRestore();
7933  Register DstReg = MI.getOperand(0).getReg();
7934 B.buildInstr(AMDGPU::G_AMDGPU_WAVE_ADDRESS, {DstReg}, {StackPtr});
7935 MI.eraseFromParent();
7936 return true;
7937}
7938
7939bool AMDGPULegalizerInfo::legalizeWaveID(MachineInstr &MI,
7940                                         MachineIRBuilder &B) const {
7941 // With architected SGPRs, waveIDinGroup is in TTMP8[29:25].
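  // i.e. wave ID = (TTMP8 >> 25) & 0x1f, extracted below with a UBFX of width 5
  // starting at bit 25.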
7942 if (!ST.hasArchitectedSGPRs())
7943 return false;
7944 LLT S32 = LLT::scalar(32);
7945 Register DstReg = MI.getOperand(0).getReg();
7946 auto TTMP8 = B.buildCopy(S32, Register(AMDGPU::TTMP8));
7947 auto LSB = B.buildConstant(S32, 25);
7948 auto Width = B.buildConstant(S32, 5);
7949 B.buildUbfx(DstReg, TTMP8, LSB, Width);
7950 MI.eraseFromParent();
7951 return true;
7952}
7953
7956 AMDGPU::Hwreg::Id HwReg,
7957 unsigned LowBit,
7958 unsigned Width) const {
7959 MachineRegisterInfo &MRI = *B.getMRI();
7960 Register DstReg = MI.getOperand(0).getReg();
7961 if (!MRI.getRegClassOrNull(DstReg))
7962 MRI.setRegClass(DstReg, &AMDGPU::SReg_32RegClass);
7963 B.buildInstr(AMDGPU::S_GETREG_B32_const)
7964 .addDef(DstReg)
7965 .addImm(AMDGPU::Hwreg::HwregEncoding::encode(HwReg, LowBit, Width));
7966 MI.eraseFromParent();
7967 return true;
7968}
7969
7970static constexpr unsigned FPEnvModeBitField =
7972
7973static constexpr unsigned FPEnvTrapBitField =
7975
7976bool AMDGPULegalizerInfo::legalizeGetFPEnv(MachineInstr &MI,
7977                                           MachineRegisterInfo &MRI,
7978                                           MachineIRBuilder &B) const {
7979 Register Src = MI.getOperand(0).getReg();
7980 if (MRI.getType(Src) != S64)
7981 return false;
7982
7983 auto ModeReg =
7984 B.buildIntrinsic(Intrinsic::amdgcn_s_getreg, {S32},
7985 /*HasSideEffects=*/true, /*isConvergent=*/false)
7986 .addImm(FPEnvModeBitField);
7987 auto TrapReg =
7988 B.buildIntrinsic(Intrinsic::amdgcn_s_getreg, {S32},
7989 /*HasSideEffects=*/true, /*isConvergent=*/false)
7990 .addImm(FPEnvTrapBitField);
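  // Pack the MODE bits into the low half and the trap-status bits into the high
  // half of the 64-bit FP environment value.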
7991 B.buildMergeLikeInstr(Src, {ModeReg, TrapReg});
7992 MI.eraseFromParent();
7993 return true;
7994}
7995
7996bool AMDGPULegalizerInfo::legalizeSetFPEnv(MachineInstr &MI,
7997                                           MachineRegisterInfo &MRI,
7998                                           MachineIRBuilder &B) const {
7999 Register Src = MI.getOperand(0).getReg();
8000 if (MRI.getType(Src) != S64)
8001 return false;
8002
8003 auto Unmerge = B.buildUnmerge({S32, S32}, MI.getOperand(0));
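  // The low half of the 64-bit value holds the MODE bits and the high half the
  // trap controls; write each back through s_setreg.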
8004 B.buildIntrinsic(Intrinsic::amdgcn_s_setreg, ArrayRef<DstOp>(),
8005 /*HasSideEffects=*/true, /*isConvergent=*/false)
8006 .addImm(static_cast<int16_t>(FPEnvModeBitField))
8007 .addReg(Unmerge.getReg(0));
8008 B.buildIntrinsic(Intrinsic::amdgcn_s_setreg, ArrayRef<DstOp>(),
8009 /*HasSideEffects=*/true, /*isConvergent=*/false)
8010 .addImm(static_cast<int16_t>(FPEnvTrapBitField))
8011 .addReg(Unmerge.getReg(1));
8012 MI.eraseFromParent();
8013 return true;
8014}
8015
8016bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
8017                                            MachineInstr &MI) const {
8018 MachineIRBuilder &B = Helper.MIRBuilder;
8019 MachineRegisterInfo &MRI = *B.getMRI();
8020
8021  // Replace the G_BRCOND use with the exec-manipulating branch pseudos.
8022 auto IntrID = cast<GIntrinsic>(MI).getIntrinsicID();
8023 switch (IntrID) {
8024 case Intrinsic::sponentry:
8025 if (B.getMF().getInfo<SIMachineFunctionInfo>()->isBottomOfStack()) {
8026 // FIXME: The imported pattern checks for i32 instead of p5; if we fix
8027 // that we can remove this cast.
8028 const LLT S32 = LLT::scalar(32);
8029      Register TmpReg = MRI.createGenericVirtualRegister(S32);
8030      B.buildInstr(AMDGPU::G_AMDGPU_SPONENTRY).addDef(TmpReg);
8031
8032 Register DstReg = MI.getOperand(0).getReg();
8033 B.buildIntToPtr(DstReg, TmpReg);
8034 MI.eraseFromParent();
8035 } else {
8036 int FI = B.getMF().getFrameInfo().CreateFixedObject(
8037 1, 0, /*IsImmutable=*/false);
8038 B.buildFrameIndex(MI.getOperand(0), FI);
8039 MI.eraseFromParent();
8040 }
8041 return true;
8042 case Intrinsic::amdgcn_if:
8043 case Intrinsic::amdgcn_else: {
8044 MachineInstr *Br = nullptr;
8045 MachineBasicBlock *UncondBrTarget = nullptr;
8046 bool Negated = false;
8047 if (MachineInstr *BrCond =
8048 verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) {
8049 const SIRegisterInfo *TRI
8050 = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
8051
8052 Register Def = MI.getOperand(1).getReg();
8053 Register Use = MI.getOperand(3).getReg();
8054
8055 MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
8056
8057 if (Negated)
8058 std::swap(CondBrTarget, UncondBrTarget);
8059
8060 B.setInsertPt(B.getMBB(), BrCond->getIterator());
8061 if (IntrID == Intrinsic::amdgcn_if) {
8062 B.buildInstr(AMDGPU::SI_IF)
8063 .addDef(Def)
8064 .addUse(Use)
8065 .addMBB(UncondBrTarget);
8066 } else {
8067 B.buildInstr(AMDGPU::SI_ELSE)
8068 .addDef(Def)
8069 .addUse(Use)
8070 .addMBB(UncondBrTarget);
8071 }
8072
8073 if (Br) {
8074 Br->getOperand(0).setMBB(CondBrTarget);
8075 } else {
8076 // The IRTranslator skips inserting the G_BR for fallthrough cases, but
8077 // since we're swapping branch targets it needs to be reinserted.
8078 // FIXME: IRTranslator should probably not do this
8079 B.buildBr(*CondBrTarget);
8080 }
8081
8082 MRI.setRegClass(Def, TRI->getWaveMaskRegClass());
8083 MRI.setRegClass(Use, TRI->getWaveMaskRegClass());
8084 MI.eraseFromParent();
8085 BrCond->eraseFromParent();
8086 return true;
8087 }
8088
8089 return false;
8090 }
8091 case Intrinsic::amdgcn_loop: {
8092 MachineInstr *Br = nullptr;
8093 MachineBasicBlock *UncondBrTarget = nullptr;
8094 bool Negated = false;
8095 if (MachineInstr *BrCond =
8096 verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) {
8097 const SIRegisterInfo *TRI
8098 = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
8099
8100 MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
8101 Register Reg = MI.getOperand(2).getReg();
8102
8103 if (Negated)
8104 std::swap(CondBrTarget, UncondBrTarget);
8105
8106 B.setInsertPt(B.getMBB(), BrCond->getIterator());
8107 B.buildInstr(AMDGPU::SI_LOOP)
8108 .addUse(Reg)
8109 .addMBB(UncondBrTarget);
8110
8111 if (Br)
8112 Br->getOperand(0).setMBB(CondBrTarget);
8113 else
8114 B.buildBr(*CondBrTarget);
8115
8116 MI.eraseFromParent();
8117 BrCond->eraseFromParent();
8118 MRI.setRegClass(Reg, TRI->getWaveMaskRegClass());
8119 return true;
8120 }
8121
8122 return false;
8123 }
8124 case Intrinsic::amdgcn_addrspacecast_nonnull:
8125 return legalizeAddrSpaceCast(MI, MRI, B);
8126 case Intrinsic::amdgcn_make_buffer_rsrc:
8127 return legalizePointerAsRsrcIntrin(MI, MRI, B);
8128 case Intrinsic::amdgcn_kernarg_segment_ptr:
8129 if (!AMDGPU::isKernel(B.getMF().getFunction())) {
8130 // This only makes sense to call in a kernel, so just lower to null.
8131 B.buildConstant(MI.getOperand(0).getReg(), 0);
8132 MI.eraseFromParent();
8133 return true;
8134 }
8135
8136    return legalizePreloadedArgIntrin(
8137        MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
8138 case Intrinsic::amdgcn_implicitarg_ptr:
8139 return legalizeImplicitArgPtr(MI, MRI, B);
8140 case Intrinsic::amdgcn_workitem_id_x:
8141    return legalizeWorkitemIDIntrinsic(MI, MRI, B, 0,
8142                                       AMDGPUFunctionArgInfo::WORKITEM_ID_X);
8143  case Intrinsic::amdgcn_workitem_id_y:
8144    return legalizeWorkitemIDIntrinsic(MI, MRI, B, 1,
8145                                       AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
8146  case Intrinsic::amdgcn_workitem_id_z:
8147    return legalizeWorkitemIDIntrinsic(MI, MRI, B, 2,
8148                                       AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
8149 case Intrinsic::amdgcn_workgroup_id_x:
8150 return legalizeWorkGroupId(
8154 case Intrinsic::amdgcn_workgroup_id_y:
8155 return legalizeWorkGroupId(
8159 case Intrinsic::amdgcn_workgroup_id_z:
8160 return legalizeWorkGroupId(
8164 case Intrinsic::amdgcn_cluster_id_x:
8165 return ST.hasClusters() &&
8168 case Intrinsic::amdgcn_cluster_id_y:
8169 return ST.hasClusters() &&
8172 case Intrinsic::amdgcn_cluster_id_z:
8173 return ST.hasClusters() &&
8176 case Intrinsic::amdgcn_cluster_workgroup_id_x:
8177 return ST.hasClusters() &&
8180 case Intrinsic::amdgcn_cluster_workgroup_id_y:
8181 return ST.hasClusters() &&
8184 case Intrinsic::amdgcn_cluster_workgroup_id_z:
8185 return ST.hasClusters() &&
8188 case Intrinsic::amdgcn_cluster_workgroup_flat_id:
8189 return ST.hasClusters() &&
8191 case Intrinsic::amdgcn_cluster_workgroup_max_id_x:
8192 return ST.hasClusters() &&
8195 case Intrinsic::amdgcn_cluster_workgroup_max_id_y:
8196 return ST.hasClusters() &&
8199 case Intrinsic::amdgcn_cluster_workgroup_max_id_z:
8200 return ST.hasClusters() &&
8203 case Intrinsic::amdgcn_cluster_workgroup_max_flat_id:
8204 return ST.hasClusters() &&
8206 MI, MRI, B,
8208 case Intrinsic::amdgcn_wave_id:
8209 return legalizeWaveID(MI, B);
8210  case Intrinsic::amdgcn_lds_kernel_id:
8211    return legalizePreloadedArgIntrin(MI, MRI, B,
8212                                      AMDGPUFunctionArgInfo::LDS_KERNEL_ID);
8213  case Intrinsic::amdgcn_dispatch_ptr:
8214    return legalizePreloadedArgIntrin(MI, MRI, B,
8215                                      AMDGPUFunctionArgInfo::DISPATCH_PTR);
8216  case Intrinsic::amdgcn_queue_ptr:
8217    return legalizePreloadedArgIntrin(MI, MRI, B,
8218                                      AMDGPUFunctionArgInfo::QUEUE_PTR);
8219  case Intrinsic::amdgcn_implicit_buffer_ptr:
8220    return legalizePreloadedArgIntrin(
8221        MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
8222  case Intrinsic::amdgcn_dispatch_id:
8223    return legalizePreloadedArgIntrin(MI, MRI, B,
8224                                      AMDGPUFunctionArgInfo::DISPATCH_ID);
8225 case Intrinsic::r600_read_ngroups_x:
8226 // TODO: Emit error for hsa
8229 case Intrinsic::r600_read_ngroups_y:
8232 case Intrinsic::r600_read_ngroups_z:
8235 case Intrinsic::r600_read_local_size_x:
8236 // TODO: Could insert G_ASSERT_ZEXT from s16
8238 case Intrinsic::r600_read_local_size_y:
8239 // TODO: Could insert G_ASSERT_ZEXT from s16
8241 // TODO: Could insert G_ASSERT_ZEXT from s16
8242 case Intrinsic::r600_read_local_size_z:
8245 case Intrinsic::amdgcn_fdiv_fast:
8246 return legalizeFDIVFastIntrin(MI, MRI, B);
8247  case Intrinsic::amdgcn_is_shared:
8248    return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS);
8249  case Intrinsic::amdgcn_is_private:
8250    return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS);
8251 case Intrinsic::amdgcn_wavefrontsize: {
8252 B.buildConstant(MI.getOperand(0), ST.getWavefrontSize());
8253 MI.eraseFromParent();
8254 return true;
8255 }
8256 case Intrinsic::amdgcn_s_buffer_load:
8257 return legalizeSBufferLoad(Helper, MI);
8258 case Intrinsic::amdgcn_raw_buffer_store:
8259 case Intrinsic::amdgcn_raw_ptr_buffer_store:
8260 case Intrinsic::amdgcn_struct_buffer_store:
8261 case Intrinsic::amdgcn_struct_ptr_buffer_store:
8262 return legalizeBufferStore(MI, Helper, false, false);
8263 case Intrinsic::amdgcn_raw_buffer_store_format:
8264 case Intrinsic::amdgcn_raw_ptr_buffer_store_format:
8265 case Intrinsic::amdgcn_struct_buffer_store_format:
8266 case Intrinsic::amdgcn_struct_ptr_buffer_store_format:
8267 return legalizeBufferStore(MI, Helper, false, true);
8268 case Intrinsic::amdgcn_raw_tbuffer_store:
8269 case Intrinsic::amdgcn_raw_ptr_tbuffer_store:
8270 case Intrinsic::amdgcn_struct_tbuffer_store:
8271 case Intrinsic::amdgcn_struct_ptr_tbuffer_store:
8272 return legalizeBufferStore(MI, Helper, true, true);
8273 case Intrinsic::amdgcn_raw_buffer_load:
8274 case Intrinsic::amdgcn_raw_ptr_buffer_load:
8275 case Intrinsic::amdgcn_raw_atomic_buffer_load:
8276 case Intrinsic::amdgcn_raw_ptr_atomic_buffer_load:
8277 case Intrinsic::amdgcn_struct_buffer_load:
8278 case Intrinsic::amdgcn_struct_ptr_buffer_load:
8279 case Intrinsic::amdgcn_struct_atomic_buffer_load:
8280 case Intrinsic::amdgcn_struct_ptr_atomic_buffer_load:
8281 return legalizeBufferLoad(MI, Helper, false, false);
8282 case Intrinsic::amdgcn_raw_buffer_load_format:
8283 case Intrinsic::amdgcn_raw_ptr_buffer_load_format:
8284 case Intrinsic::amdgcn_struct_buffer_load_format:
8285 case Intrinsic::amdgcn_struct_ptr_buffer_load_format:
8286 return legalizeBufferLoad(MI, Helper, true, false);
8287 case Intrinsic::amdgcn_raw_tbuffer_load:
8288 case Intrinsic::amdgcn_raw_ptr_tbuffer_load:
8289 case Intrinsic::amdgcn_struct_tbuffer_load:
8290 case Intrinsic::amdgcn_struct_ptr_tbuffer_load:
8291 return legalizeBufferLoad(MI, Helper, true, true);
8292 case Intrinsic::amdgcn_raw_buffer_atomic_swap:
8293 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_swap:
8294 case Intrinsic::amdgcn_struct_buffer_atomic_swap:
8295 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_swap:
8296 case Intrinsic::amdgcn_raw_buffer_atomic_add:
8297 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_add:
8298 case Intrinsic::amdgcn_struct_buffer_atomic_add:
8299 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_add:
8300 case Intrinsic::amdgcn_raw_buffer_atomic_sub:
8301 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub:
8302 case Intrinsic::amdgcn_struct_buffer_atomic_sub:
8303 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub:
8304 case Intrinsic::amdgcn_raw_buffer_atomic_smin:
8305 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smin:
8306 case Intrinsic::amdgcn_struct_buffer_atomic_smin:
8307 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smin:
8308 case Intrinsic::amdgcn_raw_buffer_atomic_umin:
8309 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umin:
8310 case Intrinsic::amdgcn_struct_buffer_atomic_umin:
8311 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umin:
8312 case Intrinsic::amdgcn_raw_buffer_atomic_smax:
8313 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smax:
8314 case Intrinsic::amdgcn_struct_buffer_atomic_smax:
8315 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smax:
8316 case Intrinsic::amdgcn_raw_buffer_atomic_umax:
8317 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umax:
8318 case Intrinsic::amdgcn_struct_buffer_atomic_umax:
8319 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umax:
8320 case Intrinsic::amdgcn_raw_buffer_atomic_and:
8321 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_and:
8322 case Intrinsic::amdgcn_struct_buffer_atomic_and:
8323 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_and:
8324 case Intrinsic::amdgcn_raw_buffer_atomic_or:
8325 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_or:
8326 case Intrinsic::amdgcn_struct_buffer_atomic_or:
8327 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_or:
8328 case Intrinsic::amdgcn_raw_buffer_atomic_xor:
8329 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_xor:
8330 case Intrinsic::amdgcn_struct_buffer_atomic_xor:
8331 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_xor:
8332 case Intrinsic::amdgcn_raw_buffer_atomic_inc:
8333 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_inc:
8334 case Intrinsic::amdgcn_struct_buffer_atomic_inc:
8335 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_inc:
8336 case Intrinsic::amdgcn_raw_buffer_atomic_dec:
8337 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_dec:
8338 case Intrinsic::amdgcn_struct_buffer_atomic_dec:
8339 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_dec:
8340 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
8341 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap:
8342 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
8343 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cmpswap:
8344 case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
8345 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmin:
8346 case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
8347 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmin:
8348 case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
8349 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmax:
8350 case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
8351 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmax:
8352 case Intrinsic::amdgcn_raw_buffer_atomic_sub_clamp_u32:
8353 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub_clamp_u32:
8354 case Intrinsic::amdgcn_struct_buffer_atomic_sub_clamp_u32:
8355 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub_clamp_u32:
8356 case Intrinsic::amdgcn_raw_buffer_atomic_cond_sub_u32:
8357 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cond_sub_u32:
8358 case Intrinsic::amdgcn_struct_buffer_atomic_cond_sub_u32:
8359 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cond_sub_u32:
8360 case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
8361 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fadd:
8362 case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
8363 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fadd:
8364 return legalizeBufferAtomic(MI, B, IntrID);
8365 case Intrinsic::amdgcn_rsq_clamp:
8366 return legalizeRsqClampIntrinsic(MI, MRI, B);
8367  case Intrinsic::amdgcn_image_bvh_intersect_ray:
8368    return legalizeBVHIntersectRayIntrinsic(MI, B);
8369 case Intrinsic::amdgcn_image_bvh_dual_intersect_ray:
8370  case Intrinsic::amdgcn_image_bvh8_intersect_ray:
8371    return legalizeBVHDualOrBVH8IntersectRayIntrinsic(MI, B);
8372 case Intrinsic::amdgcn_swmmac_f32_16x16x128_fp8_fp8:
8373 case Intrinsic::amdgcn_swmmac_f32_16x16x128_fp8_bf8:
8374 case Intrinsic::amdgcn_swmmac_f32_16x16x128_bf8_fp8:
8375 case Intrinsic::amdgcn_swmmac_f32_16x16x128_bf8_bf8:
8376 case Intrinsic::amdgcn_swmmac_f16_16x16x128_fp8_fp8:
8377 case Intrinsic::amdgcn_swmmac_f16_16x16x128_fp8_bf8:
8378 case Intrinsic::amdgcn_swmmac_f16_16x16x128_bf8_fp8:
8379 case Intrinsic::amdgcn_swmmac_f16_16x16x128_bf8_bf8: {
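    // These swmmac variants take a 64-bit sparsity index in operand 5; small
    // vector indices are bitcast and narrower scalars any-extended to s64 below.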
8380 Register Index = MI.getOperand(5).getReg();
8381 LLT S64 = LLT::scalar(64);
8382 LLT IndexArgTy = MRI.getType(Index);
8383 if (IndexArgTy != S64) {
8384 auto NewIndex = IndexArgTy.isVector() ? B.buildBitcast(S64, Index)
8385 : B.buildAnyExt(S64, Index);
8386 MI.getOperand(5).setReg(NewIndex.getReg(0));
8387 }
8388 return true;
8389 }
8390 case Intrinsic::amdgcn_swmmac_f16_16x16x32_f16:
8391 case Intrinsic::amdgcn_swmmac_bf16_16x16x32_bf16:
8392 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf16:
8393 case Intrinsic::amdgcn_swmmac_f32_16x16x32_f16:
8394 case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_fp8:
8395 case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_bf8:
8396 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_fp8:
8397 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_bf8: {
8398 Register Index = MI.getOperand(5).getReg();
8399 LLT S32 = LLT::scalar(32);
8400 if (MRI.getType(Index) != S32)
8401 MI.getOperand(5).setReg(B.buildAnyExt(S32, Index).getReg(0));
8402 return true;
8403 }
8404 case Intrinsic::amdgcn_swmmac_f16_16x16x64_f16:
8405 case Intrinsic::amdgcn_swmmac_bf16_16x16x64_bf16:
8406 case Intrinsic::amdgcn_swmmac_f32_16x16x64_bf16:
8407 case Intrinsic::amdgcn_swmmac_bf16f32_16x16x64_bf16:
8408 case Intrinsic::amdgcn_swmmac_f32_16x16x64_f16:
8409 case Intrinsic::amdgcn_swmmac_i32_16x16x128_iu8:
8410 case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu4:
8411 case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu8:
8412 case Intrinsic::amdgcn_swmmac_i32_16x16x64_iu4: {
8413 Register Index = MI.getOperand(7).getReg();
8414 LLT IdxTy = IntrID == Intrinsic::amdgcn_swmmac_i32_16x16x128_iu8
8415 ? LLT::scalar(64)
8416 : LLT::scalar(32);
8417 LLT IndexArgTy = MRI.getType(Index);
8418 if (IndexArgTy != IdxTy) {
8419 auto NewIndex = IndexArgTy.isVector() ? B.buildBitcast(IdxTy, Index)
8420 : B.buildAnyExt(IdxTy, Index);
8421 MI.getOperand(7).setReg(NewIndex.getReg(0));
8422 }
8423 return true;
8424 }
8425
8426 case Intrinsic::amdgcn_fmed3: {
8427 GISelChangeObserver &Observer = Helper.Observer;
8428
8429    // FIXME: This is to work around the inability of tablegen match combiners to
8430 // match intrinsics in patterns.
8431 Observer.changingInstr(MI);
8432 MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_FMED3));
8433 MI.removeOperand(1);
8434 Observer.changedInstr(MI);
8435 return true;
8436 }
8437 case Intrinsic::amdgcn_readlane:
8438 case Intrinsic::amdgcn_writelane:
8439 case Intrinsic::amdgcn_readfirstlane:
8440 case Intrinsic::amdgcn_permlane16:
8441 case Intrinsic::amdgcn_permlanex16:
8442 case Intrinsic::amdgcn_permlane64:
8443 case Intrinsic::amdgcn_set_inactive:
8444 case Intrinsic::amdgcn_set_inactive_chain_arg:
8445 case Intrinsic::amdgcn_mov_dpp8:
8446 case Intrinsic::amdgcn_update_dpp:
8447 return legalizeLaneOp(Helper, MI, IntrID);
8448 case Intrinsic::amdgcn_s_buffer_prefetch_data:
8449 return legalizeSBufferPrefetch(Helper, MI);
8450 case Intrinsic::amdgcn_dead: {
8451 // TODO: Use poison instead of undef
8452 for (const MachineOperand &Def : MI.defs())
8453 B.buildUndef(Def);
8454 MI.eraseFromParent();
8455 return true;
8456 }
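  // The cooperative atomic load/store intrinsics lower to plain G_LOAD /
  // G_STORE instructions that keep the memory operand attached by the
  // IRTranslator.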
8457 case Intrinsic::amdgcn_cooperative_atomic_load_32x4B:
8458 case Intrinsic::amdgcn_cooperative_atomic_load_16x8B:
8459 case Intrinsic::amdgcn_cooperative_atomic_load_8x16B:
8460 assert(MI.hasOneMemOperand() && "Expected IRTranslator to set MemOp!");
8461 B.buildLoad(MI.getOperand(0), MI.getOperand(2), **MI.memoperands_begin());
8462 MI.eraseFromParent();
8463 return true;
8464 case Intrinsic::amdgcn_cooperative_atomic_store_32x4B:
8465 case Intrinsic::amdgcn_cooperative_atomic_store_16x8B:
8466 case Intrinsic::amdgcn_cooperative_atomic_store_8x16B:
8467 assert(MI.hasOneMemOperand() && "Expected IRTranslator to set MemOp!");
8468 B.buildStore(MI.getOperand(2), MI.getOperand(1), **MI.memoperands_begin());
8469 MI.eraseFromParent();
8470 return true;
8471 case Intrinsic::amdgcn_flat_load_monitor_b32:
8472 case Intrinsic::amdgcn_flat_load_monitor_b64:
8473 case Intrinsic::amdgcn_flat_load_monitor_b128:
8474 assert(MI.hasOneMemOperand() && "Expected IRTranslator to set MemOp!");
8475 B.buildInstr(AMDGPU::G_AMDGPU_FLAT_LOAD_MONITOR)
8476 .add(MI.getOperand(0))
8477 .add(MI.getOperand(2))
8478 .addMemOperand(*MI.memoperands_begin());
8479 MI.eraseFromParent();
8480 return true;
8481 case Intrinsic::amdgcn_global_load_monitor_b32:
8482 case Intrinsic::amdgcn_global_load_monitor_b64:
8483 case Intrinsic::amdgcn_global_load_monitor_b128:
8484 assert(MI.hasOneMemOperand() && "Expected IRTranslator to set MemOp!");
8485 B.buildInstr(AMDGPU::G_AMDGPU_GLOBAL_LOAD_MONITOR)
8486 .add(MI.getOperand(0))
8487 .add(MI.getOperand(2))
8488 .addMemOperand(*MI.memoperands_begin());
8489 MI.eraseFromParent();
8490 return true;
8491 default: {
8492    if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
8493            AMDGPU::getImageDimIntrinsicInfo(IntrID))
8494 return legalizeImageIntrinsic(MI, B, Helper.Observer, ImageDimIntr);
8495 return true;
8496 }
8497 }
8498
8499 return true;
8500}
constexpr LLT V6S32
static bool isLoadStoreSizeLegal(const GCNSubtarget &ST, const LegalityQuery &Query)
constexpr LLT S160
static MachineInstr * verifyCFIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI, MachineInstr *&Br, MachineBasicBlock *&UncondBrTarget, bool &Negated)
constexpr LLT V4S16
constexpr LLT V2S128
constexpr LLT V10S16
static LegalityPredicate numElementsNotEven(unsigned TypeIdx)
constexpr LLT V4S32
constexpr LLT V3S32
constexpr LLT V6S16
constexpr std::initializer_list< LLT > AllS64Vectors
constexpr LLT S256
static void castBufferRsrcArgToV4I32(MachineInstr &MI, MachineIRBuilder &B, unsigned Idx)
constexpr LLT V4S64
static constexpr unsigned FPEnvTrapBitField
constexpr LLT V10S32
constexpr LLT V16S32
static constexpr unsigned MaxRegisterSize
constexpr LLT V7S32
constexpr LLT S96
constexpr LLT V12S16
constexpr LLT V16S64
static bool isRegisterSize(const GCNSubtarget &ST, unsigned Size)
static LegalityPredicate isWideScalarExtLoadTruncStore(unsigned TypeIdx)
static bool hasBufferRsrcWorkaround(const LLT Ty)
constexpr LLT V32S32
static void toggleSPDenormMode(bool Enable, MachineIRBuilder &B, const GCNSubtarget &ST, SIModeRegisterDefaults Mode)
constexpr LLT S64
constexpr std::initializer_list< LLT > AllS16Vectors
static bool loadStoreBitcastWorkaround(const LLT Ty)
static LLT widenToNextPowerOf2(LLT Ty)
static bool isNot(const MachineRegisterInfo &MRI, const MachineInstr &MI)
constexpr LLT V16S16
static void convertImageAddrToPacked(MachineIRBuilder &B, MachineInstr &MI, int DimIdx, int NumVAddrs)
Convert from separate vaddr components to a single vector address register, and replace the remaining...
static bool isLoadStoreLegal(const GCNSubtarget &ST, const LegalityQuery &Query)
static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx)
constexpr LLT V5S32
constexpr LLT V5S64
constexpr LLT V3S64
static LLT getPow2VectorType(LLT Ty)
static void buildBufferLoad(unsigned Opc, Register LoadDstReg, Register RSrc, Register VIndex, Register VOffset, Register SOffset, unsigned ImmOffset, unsigned Format, unsigned AuxiliaryData, MachineMemOperand *MMO, bool IsTyped, bool HasVIndex, MachineIRBuilder &B)
constexpr LLT V8S64
static LLT getPow2ScalarType(LLT Ty)
static LegalityPredicate elementTypeIsLegal(unsigned TypeIdx)
constexpr LLT V2S32
static bool isRegisterVectorType(LLT Ty)
constexpr LLT V12S32
constexpr LLT S128
static LegalityPredicate sizeIsMultipleOf32(unsigned TypeIdx)
constexpr LLT S8
static bool isRegisterType(const GCNSubtarget &ST, LLT Ty)
static bool isKnownNonNull(Register Val, MachineRegisterInfo &MRI, const AMDGPUTargetMachine &TM, unsigned AddrSpace)
Return true if the value is a known valid address, such that a null check is not necessary.
This file declares the targeting of the Machinelegalizer class for AMDGPU.
Provides AMDGPU specific target descriptions.
The AMDGPU TargetMachine interface definition for hw codegen targets.
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
#define X(NUM, ENUM, NAME)
Definition ELF.h:851
static Error unsupported(const char *Str, const Triple &T)
Definition MachO.cpp:77
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
@ Enable
Declares convenience wrapper classes for interpreting MachineInstr instances as specific generic oper...
IRTranslator LLVM IR MI
const AbstractManglingParser< Derived, Alloc >::OperatorInfo AbstractManglingParser< Derived, Alloc >::Ops[]
Interface for Targets to specify which operations they can successfully select and how the others sho...
#define F(x, y, z)
Definition MD5.cpp:54
#define I(x, y, z)
Definition MD5.cpp:57
Contains matchers for matching SSA Machine Instructions.
This file declares the MachineIRBuilder class.
Register const TargetRegisterInfo * TRI
#define R2(n)
Promote Memory to Register
Definition Mem2Reg.cpp:110
#define T
static MCRegister getReg(const MCDisassembler *D, unsigned RC, unsigned RegNo)
#define P(N)
ppc ctr loops verify
R600 Clause Merge
const SmallVectorImpl< MachineOperand > & Cond
static cl::opt< RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode > Mode("regalloc-enable-advisor", cl::Hidden, cl::init(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Default), cl::desc("Enable regalloc advisor mode"), cl::values(clEnumValN(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Default, "default", "Default"), clEnumValN(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Release, "release", "precompiled"), clEnumValN(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Development, "development", "for training")))
#define CH(x, y, z)
Definition SHA256.cpp:34
#define FP_DENORM_FLUSH_NONE
Definition SIDefines.h:1273
Interface definition for SIInstrInfo.
Interface definition for SIRegisterInfo.
This file defines the make_scope_exit function, which executes user-defined cleanup logic at scope ex...
static TableGen::Emitter::Opt Y("gen-skeleton-entry", EmitSkeleton, "Generate example skeleton entry")
static constexpr int Concat[]
bool legalizeConstHwRegRead(MachineInstr &MI, MachineIRBuilder &B, AMDGPU::Hwreg::Id HwReg, unsigned LowBit, unsigned Width) const
void buildMultiply(LegalizerHelper &Helper, MutableArrayRef< Register > Accum, ArrayRef< Register > Src0, ArrayRef< Register > Src1, bool UsePartialMad64_32, bool SeparateOddAlignedProducts) const
bool legalizeGlobalValue(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFSQRTF16(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeIntrinsicTrunc(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeInsert(LegalizerHelper &Helper, MachineInstr &MI) const
std::pair< Register, unsigned > splitBufferOffsets(MachineIRBuilder &B, Register OrigOffset) const
bool legalizeBVHIntersectRayIntrinsic(MachineInstr &MI, MachineIRBuilder &B) const
bool legalizeIsAddrSpace(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B, unsigned AddrSpace) const
bool legalizeUnsignedDIV_REM(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFSQRTF32(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeAtomicCmpXChg(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeTrapHsa(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeBufferStore(MachineInstr &MI, LegalizerHelper &Helper, bool IsTyped, bool IsFormat) const
bool legalizeMul(LegalizerHelper &Helper, MachineInstr &MI) const
bool legalizeFFREXP(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
Register getSegmentAperture(unsigned AddrSpace, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFDIV64(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizePointerAsRsrcIntrin(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
To create a buffer resource from a 64-bit pointer, mask off the upper 32 bits of the pointer and repl...
bool legalizeFlogCommon(MachineInstr &MI, MachineIRBuilder &B) const
bool getLDSKernelId(Register DstReg, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFExp2(MachineInstr &MI, MachineIRBuilder &B) const
bool legalizeTrap(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeBufferAtomic(MachineInstr &MI, MachineIRBuilder &B, Intrinsic::ID IID) const
void legalizeUnsignedDIV_REM32Impl(MachineIRBuilder &B, Register DstDivReg, Register DstRemReg, Register Num, Register Den) const
Register handleD16VData(MachineIRBuilder &B, MachineRegisterInfo &MRI, Register Reg, bool ImageStore=false) const
Handle register layout difference for f16 images for some subtargets.
bool legalizeCTLZ_CTTZ(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeBuildVector(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFFloor(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
AMDGPULegalizerInfo(const GCNSubtarget &ST, const GCNTargetMachine &TM)
bool legalizeFDIV32(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFMad(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFDIV(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeSBufferPrefetch(LegalizerHelper &Helper, MachineInstr &MI) const
bool legalizeFExp10Unsafe(MachineIRBuilder &B, Register Dst, Register Src, unsigned Flags) const
bool legalizeFExp(MachineInstr &MI, MachineIRBuilder &B) const
bool legalizeIntrinsic(LegalizerHelper &Helper, MachineInstr &MI) const override
bool legalizeFrem(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizePreloadedArgIntrin(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B, AMDGPUFunctionArgInfo::PreloadedValue ArgType) const
bool legalizeStore(LegalizerHelper &Helper, MachineInstr &MI) const
bool legalizeCustom(LegalizerHelper &Helper, MachineInstr &MI, LostDebugLocObserver &LocObserver) const override
Called for instructions with the Custom LegalizationAction.
bool buildPCRelGlobalAddress(Register DstReg, LLT PtrTy, MachineIRBuilder &B, const GlobalValue *GV, int64_t Offset, unsigned GAFlags=SIInstrInfo::MO_NONE) const
MachinePointerInfo getKernargSegmentPtrInfo(MachineFunction &MF) const
bool legalizeFDIV16(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeRsqClampIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFExpUnsafeImpl(MachineIRBuilder &B, Register Dst, Register Src, unsigned Flags, bool IsExp10) const
std::pair< Register, Register > getScaledLogInput(MachineIRBuilder &B, Register Src, unsigned Flags) const
bool legalizeFDIVFastIntrin(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool loadInputValue(Register DstReg, MachineIRBuilder &B, AMDGPUFunctionArgInfo::PreloadedValue ArgType) const
bool legalizeBVHDualOrBVH8IntersectRayIntrinsic(MachineInstr &MI, MachineIRBuilder &B) const
bool legalizeInsertVectorElt(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFExpUnsafe(MachineIRBuilder &B, Register Dst, Register Src, unsigned Flags) const
bool legalizeFEXPF64(MachineInstr &MI, MachineIRBuilder &B) const
bool legalizeAddrSpaceCast(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeExtract(LegalizerHelper &Helper, MachineInstr &MI) const
bool legalizeBufferLoad(MachineInstr &MI, LegalizerHelper &Helper, bool IsFormat, bool IsTyped) const
bool legalizeImplicitArgPtr(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeMinNumMaxNum(LegalizerHelper &Helper, MachineInstr &MI) const
void legalizeUnsignedDIV_REM64Impl(MachineIRBuilder &B, Register DstDivReg, Register DstRemReg, Register Num, Register Den) const
bool legalizeDebugTrap(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFastUnsafeFDIV(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeSinCos(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeCTLS(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeWaveID(MachineInstr &MI, MachineIRBuilder &B) const
bool legalizeFroundeven(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeLDSKernelId(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeWorkGroupId(MachineInstr &MI, MachineIRBuilder &B, AMDGPUFunctionArgInfo::PreloadedValue ClusterIdPV, AMDGPUFunctionArgInfo::PreloadedValue ClusterMaxIdPV, AMDGPUFunctionArgInfo::PreloadedValue ClusterWorkGroupIdPV) const
bool legalizeSignedDIV_REM(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeITOFP(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B, bool Signed) const
bool legalizeFPow(MachineInstr &MI, MachineIRBuilder &B) const
bool legalizeFastUnsafeFDIV64(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFPTOI(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B, bool Signed) const
bool legalizeStackSave(MachineInstr &MI, MachineIRBuilder &B) const
bool legalizeFlogUnsafe(MachineIRBuilder &B, Register Dst, Register Src, bool IsLog10, unsigned Flags) const
bool legalizeKernargMemParameter(MachineInstr &MI, MachineIRBuilder &B, uint64_t Offset, Align Alignment=Align(4)) const
Legalize a value that's loaded from kernel arguments.
bool legalizeImageIntrinsic(MachineInstr &MI, MachineIRBuilder &B, GISelChangeObserver &Observer, const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr) const
Rewrite image intrinsics to use register layouts expected by the subtarget.
void buildAbsGlobalAddress(Register DstReg, LLT PtrTy, MachineIRBuilder &B, const GlobalValue *GV, MachineRegisterInfo &MRI) const
bool legalizeGetFPEnv(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool getImplicitArgPtr(Register DstReg, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFSQRT(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
Register getKernargParameterPtr(MachineIRBuilder &B, int64_t Offset) const
bool legalizeSBufferLoad(LegalizerHelper &Helper, MachineInstr &MI) const
bool legalizeFceil(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFSQRTF64(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeExtractVectorElt(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeLoad(LegalizerHelper &Helper, MachineInstr &MI) const
bool legalizeCTLZ_ZERO_UNDEF(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
Register fixStoreSourceType(MachineIRBuilder &B, Register VData, LLT MemTy, bool IsFormat) const
bool legalizeLaneOp(LegalizerHelper &Helper, MachineInstr &MI, Intrinsic::ID IID) const
bool legalizeSetFPEnv(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeWorkitemIDIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B, unsigned Dim, AMDGPUFunctionArgInfo::PreloadedValue ArgType) const
void buildLoadInputValue(Register DstReg, MachineIRBuilder &B, const ArgDescriptor *Arg, const TargetRegisterClass *ArgRC, LLT ArgTy) const
bool legalizeTrapHsaQueuePtr(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFlog2(MachineInstr &MI, MachineIRBuilder &B) const
bool legalizeTrapEndpgm(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
static std::optional< uint32_t > getLDSKernelIdMetadata(const Function &F)
void setDynLDSAlign(const Function &F, const GlobalVariable &GV)
unsigned allocateLDSGlobal(const DataLayout &DL, const GlobalVariable &GV)
bool isNoopAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const override
Returns true if a cast between SrcAS and DestAS is a noop.
const std::array< unsigned, 3 > & getDims() const
static const fltSemantics & IEEEsingle()
Definition APFloat.h:296
static const fltSemantics & IEEEdouble()
Definition APFloat.h:297
static APFloat getSmallestNormalized(const fltSemantics &Sem, bool Negative=false)
Returns the smallest (by magnitude) normalized finite number in the given semantics.
Definition APFloat.h:1213
static APFloat getLargest(const fltSemantics &Sem, bool Negative=false)
Returns the largest finite number in the given semantics.
Definition APFloat.h:1193
static APFloat getInf(const fltSemantics &Sem, bool Negative=false)
Factory for Positive and Negative Infinity.
Definition APFloat.h:1153
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:40
size_t size() const
size - Get the array size.
Definition ArrayRef.h:142
@ FCMP_OEQ
0 0 0 1 True if ordered and equal
Definition InstrTypes.h:679
@ ICMP_SLT
signed less than
Definition InstrTypes.h:705
@ FCMP_OLT
0 1 0 0 True if ordered and less than
Definition InstrTypes.h:682
@ FCMP_ULE
1 1 0 1 True if unordered, less than, or equal
Definition InstrTypes.h:691
@ FCMP_OGT
0 0 1 0 True if ordered and greater than
Definition InstrTypes.h:680
@ ICMP_UGE
unsigned greater or equal
Definition InstrTypes.h:700
@ ICMP_SGT
signed greater than
Definition InstrTypes.h:703
@ FCMP_ONE
0 1 1 0 True if ordered and operands are unequal
Definition InstrTypes.h:684
@ ICMP_ULT
unsigned less than
Definition InstrTypes.h:701
@ FCMP_OLE
0 1 0 1 True if ordered and less than or equal
Definition InstrTypes.h:683
@ FCMP_ORD
0 1 1 1 True if ordered (no nans)
Definition InstrTypes.h:685
@ ICMP_NE
not equal
Definition InstrTypes.h:698
@ FCMP_UGE
1 0 1 1 True if unordered, greater than, or equal
Definition InstrTypes.h:689
ConstantFP - Floating Point Values [float, double].
Definition Constants.h:420
LLVM_ABI bool isExactlyValue(const APFloat &V) const
We don't rely on operator== working on double values, as it returns true for things that are clearly ...
This is the shared class of boolean and integer constants.
Definition Constants.h:87
int64_t getSExtValue() const
Return the constant as a 64-bit integer value after it has been sign extended as appropriate for the ...
Definition Constants.h:174
A debug info location.
Definition DebugLoc.h:123
Diagnostic information for unsupported feature in backend.
static constexpr ElementCount getFixed(ScalarTy MinVal)
Definition TypeSize.h:309
LLVMContext & getContext() const
getContext - Return a reference to the LLVMContext associated with this function.
Definition Function.cpp:358
Abstract class that contains various methods for clients to notify about changes.
virtual void changingInstr(MachineInstr &MI)=0
This instruction is about to be mutated in some way.
virtual void changedInstr(MachineInstr &MI)=0
This instruction was mutated in some way.
Simple wrapper observer that takes several observers, and calls each one for each event.
KnownBits getKnownBits(Register R)
bool hasExternalLinkage() const
Module * getParent()
Get the module that this global value is contained inside of...
LLVM_ABI const DataLayout & getDataLayout() const
Get the data layout of the module this global belongs to.
Definition Globals.cpp:141
LLVM_ABI uint64_t getGlobalSize(const DataLayout &DL) const
Get the size of this global variable in bytes.
Definition Globals.cpp:569
LLT changeElementCount(ElementCount EC) const
Return a vector or scalar with the same element type and the new element count.
constexpr unsigned getScalarSizeInBits() const
constexpr bool isScalar() const
constexpr LLT changeElementType(LLT NewEltTy) const
If this type is a vector, return a vector with the same number of elements but the new element type.
static constexpr LLT vector(ElementCount EC, unsigned ScalarSizeInBits)
Get a low-level vector of some number of elements and element width.
LLT getScalarType() const
static constexpr LLT scalar(unsigned SizeInBits)
Get a low-level scalar or aggregate "bag of bits".
constexpr uint16_t getNumElements() const
Returns the number of elements in a vector LLT.
constexpr bool isVector() const
static constexpr LLT pointer(unsigned AddressSpace, unsigned SizeInBits)
Get a low-level pointer in the given address space.
constexpr TypeSize getSizeInBits() const
Returns the total size of the type. Must only be called on sized types.
constexpr bool isPointer() const
constexpr unsigned getAddressSpace() const
static constexpr LLT fixed_vector(unsigned NumElements, unsigned ScalarSizeInBits)
Get a low-level fixed-width vector of some number of elements and element width.
LLT getElementType() const
Returns the vector's element type. Only valid for vector types.
static constexpr LLT scalarOrVector(ElementCount EC, LLT ScalarTy)
LLT changeElementSize(unsigned NewEltSize) const
If this type is a vector, return a vector with the same number of elements but the new element size.
LLVM_ABI void diagnose(const DiagnosticInfo &DI)
Report a message to the currently installed diagnostic handler.
LLVM_ABI void computeTables()
Compute any ancillary tables needed to quickly decide how an operation should be handled.
LegalizeRuleSet & minScalar(unsigned TypeIdx, const LLT Ty)
Ensure the scalar is at least as wide as Ty.
LegalizeRuleSet & legalFor(std::initializer_list< LLT > Types)
The instruction is legal when type index 0 is any type in the given list.
LegalizeRuleSet & unsupported()
The instruction is unsupported.
LegalizeRuleSet & scalarSameSizeAs(unsigned TypeIdx, unsigned SameSizeIdx)
Change the type TypeIdx to have the same scalar size as type SameSizeIdx.
LegalizeRuleSet & fewerElementsIf(LegalityPredicate Predicate, LegalizeMutation Mutation)
Remove elements to reach the type selected by the mutation if the predicate is true.
LegalizeRuleSet & clampScalarOrElt(unsigned TypeIdx, const LLT MinTy, const LLT MaxTy)
Limit the range of scalar sizes to MinTy and MaxTy.
LegalizeRuleSet & bitcastIf(LegalityPredicate Predicate, LegalizeMutation Mutation)
The specified type index is coerced if predicate is true.
LegalizeRuleSet & maxScalar(unsigned TypeIdx, const LLT Ty)
Ensure the scalar is at most as wide as Ty.
LegalizeRuleSet & minScalarOrElt(unsigned TypeIdx, const LLT Ty)
Ensure the scalar or element is at least as wide as Ty.
LegalizeRuleSet & clampMaxNumElements(unsigned TypeIdx, const LLT EltTy, unsigned MaxElements)
Limit the number of elements in EltTy vectors to at most MaxElements.
LegalizeRuleSet & unsupportedFor(std::initializer_list< LLT > Types)
LegalizeRuleSet & lower()
The instruction is lowered.
LegalizeRuleSet & moreElementsIf(LegalityPredicate Predicate, LegalizeMutation Mutation)
Add more elements to reach the type selected by the mutation if the predicate is true.
LegalizeRuleSet & lowerFor(std::initializer_list< LLT > Types)
The instruction is lowered when type index 0 is any type in the given list.
LegalizeRuleSet & clampScalar(unsigned TypeIdx, const LLT MinTy, const LLT MaxTy)
Limit the range of scalar sizes to MinTy and MaxTy.
LegalizeRuleSet & custom()
Unconditionally custom lower.
LegalizeRuleSet & clampMaxNumElementsStrict(unsigned TypeIdx, const LLT EltTy, unsigned NumElts)
Express EltTy vectors strictly using vectors with NumElts elements (or scalars when NumElts equals 1)...
LegalizeRuleSet & unsupportedIf(LegalityPredicate Predicate)
LegalizeRuleSet & widenScalarIf(LegalityPredicate Predicate, LegalizeMutation Mutation)
Widen the scalar to the one selected by the mutation if the predicate is true.
LegalizeRuleSet & alwaysLegal()
LegalizeRuleSet & clampNumElements(unsigned TypeIdx, const LLT MinTy, const LLT MaxTy)
Limit the number of elements for the given vectors to at least MinTy's number of elements and at most...
LegalizeRuleSet & maxScalarIf(LegalityPredicate Predicate, unsigned TypeIdx, const LLT Ty)
Conditionally limit the maximum size of the scalar.
LegalizeRuleSet & customIf(LegalityPredicate Predicate)
LegalizeRuleSet & widenScalarToNextPow2(unsigned TypeIdx, unsigned MinSize=0)
Widen the scalar to the next power of two that is at least MinSize.
LegalizeRuleSet & scalarize(unsigned TypeIdx)
LegalizeRuleSet & legalForCartesianProduct(std::initializer_list< LLT > Types)
The instruction is legal when type indexes 0 and 1 are both in the given list.
LegalizeRuleSet & legalIf(LegalityPredicate Predicate)
The instruction is legal if predicate is true.
LegalizeRuleSet & customFor(std::initializer_list< LLT > Types)
LegalizeRuleSet & widenScalarToNextMultipleOf(unsigned TypeIdx, unsigned Size)
Widen the scalar to the next multiple of Size.
LLVM_ABI LegalizeResult lowerFMinNumMaxNum(MachineInstr &MI)
LLVM_ABI void moreElementsVectorDst(MachineInstr &MI, LLT MoreTy, unsigned OpIdx)
Legalize a single operand OpIdx of the machine instruction MI as a Def by performing it with addition...
LLVM_ABI LegalizeResult lowerInsert(MachineInstr &MI)
LLVM_ABI LegalizeResult lowerExtract(MachineInstr &MI)
GISelValueTracking * getValueTracking() const
@ Legalized
Instruction has been legalized and the MachineFunction changed.
GISelChangeObserver & Observer
To keep track of changes made by the LegalizerHelper.
LLVM_ABI void bitcastDst(MachineInstr &MI, LLT CastTy, unsigned OpIdx)
Legalize a single operand OpIdx of the machine instruction MI as a def by inserting a G_BITCAST from ...
LLVM_ABI LegalizeResult lowerFMad(MachineInstr &MI)
MachineIRBuilder & MIRBuilder
Expose MIRBuilder so clients can set their own RecordInsertInstruction functions.
LLVM_ABI void widenScalarDst(MachineInstr &MI, LLT WideTy, unsigned OpIdx=0, unsigned TruncOpcode=TargetOpcode::G_TRUNC)
Legalize a single operand OpIdx of the machine instruction MI as a Def by extending the operand's typ...
LegalizeRuleSet & getActionDefinitionsBuilder(unsigned Opcode)
Get the action definition builder for the given opcode.
const LegacyLegalizerInfo & getLegacyLegalizerInfo() const
TypeSize getValue() const
Wrapper class representing physical registers. Should be passed by value.
Definition MCRegister.h:41
constexpr bool isPhysical() const
Return true if the specified register number is in the physical register namespace.
Definition MCRegister.h:72
LLVM_ABI void addSuccessor(MachineBasicBlock *Succ, BranchProbability Prob=BranchProbability::getUnknown())
Add Succ as a successor of this MachineBasicBlock.
LLVM_ABI MachineBasicBlock * splitAt(MachineInstr &SplitInst, bool UpdateLiveIns=true, LiveIntervals *LIS=nullptr)
Split a basic block into 2 pieces at SplitPoint.
const MachineFunction * getParent() const
Return the MachineFunction containing this basic block.
MachineInstrBundleIterator< MachineInstr > iterator
PseudoSourceValueManager & getPSVManager() const
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
MachineMemOperand * getMachineMemOperand(MachinePointerInfo PtrInfo, MachineMemOperand::Flags f, LLT MemTy, Align base_alignment, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr, SyncScope::ID SSID=SyncScope::System, AtomicOrdering Ordering=AtomicOrdering::NotAtomic, AtomicOrdering FailureOrdering=AtomicOrdering::NotAtomic)
getMachineMemOperand - Allocate a new MachineMemOperand.
DenormalMode getDenormalMode(const fltSemantics &FPType) const
Returns the denormal handling type for the default rounding mode of the function.
void push_back(MachineBasicBlock *MBB)
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
Function & getFunction()
Return the LLVM function that this machine code represents.
BasicBlockListType::iterator iterator
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do...
MachineBasicBlock * CreateMachineBasicBlock(const BasicBlock *BB=nullptr, std::optional< UniqueBBID > BBID=std::nullopt)
CreateMachineInstr - Allocate a new MachineInstr.
const TargetMachine & getTarget() const
getTarget - Return the target machine this machine code is compiled with
Helper class to build MachineInstr.
MachineFunction & getMF()
Getter for the function we currently build.
Register getReg(unsigned Idx) const
Get the register for the operand index.
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
const MachineInstrBuilder & addGlobalAddress(const GlobalValue *GV, int64_t Offset=0, unsigned TargetFlags=0) const
const MachineInstrBuilder & addMBB(MachineBasicBlock *MBB, unsigned TargetFlags=0) const
Representation of each machine instruction.
const MachineOperand & getOperand(unsigned i) const
A description of a memory reference used in the backend.
LocationSize getSize() const
Return the size in bytes of the memory reference.
LLT getMemoryType() const
Return the memory type of the memory reference.
@ MODereferenceable
The memory access is dereferenceable (i.e., doesn't trap).
@ MOLoad
The memory access reads data.
@ MOInvariant
The memory access always returns the same value (or traps).
LLVM_ABI Align getAlign() const
Return the minimum known alignment in bytes of the actual memory reference.
MachineOperand class - Representation of each machine instruction operand.
MachineBasicBlock * getMBB() const
LLVM_ABI void setReg(Register Reg)
Change the register this operand corresponds to.
void setMBB(MachineBasicBlock *MBB)
static MachineOperand CreateImm(int64_t Val)
Register getReg() const
getReg - Returns the register number.
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
LLVM_ABI bool hasOneNonDBGUse(Register RegNo) const
hasOneNonDBGUse - Return true if there is exactly one non-Debug use of the specified register.
LLVM_ABI MachineInstr * getVRegDef(Register Reg) const
getVRegDef - Return the machine instr that defines the specified virtual register or null if none is ...
LLVM_ABI Register createVirtualRegister(const TargetRegisterClass *RegClass, StringRef Name="")
createVirtualRegister - Create and return a new virtual register in the function with the specified r...
LLT getType(Register Reg) const
Get the low-level type of Reg or LLT{} if Reg is not a generic (target independent) virtual register.
use_instr_nodbg_iterator use_instr_nodbg_begin(Register RegNo) const
LLVM_ABI void setRegClass(Register Reg, const TargetRegisterClass *RC)
setRegClass - Set the register class of the specified virtual register.
LLVM_ABI Register createGenericVirtualRegister(LLT Ty, StringRef Name="")
Create and return a new generic virtual register with low-level type Ty.
const TargetRegisterClass * getRegClassOrNull(Register Reg) const
Return the register class of Reg, or null if Reg has not been assigned a register class yet.
const TargetRegisterInfo * getTargetRegisterInfo() const
LLVM_ABI void replaceRegWith(Register FromReg, Register ToReg)
replaceRegWith - Replace all instances of FromReg with ToReg in the machine function.
MutableArrayRef - Represent a mutable reference to an array (0 or more elements consecutively in memo...
Definition ArrayRef.h:298
MutableArrayRef< T > drop_front(size_t N=1) const
Drop the first N elements of the array.
Definition ArrayRef.h:387
LLVM_ABI const PseudoSourceValue * getConstantPool()
Return a pseudo source value referencing the constant pool.
Wrapper class representing virtual and physical registers.
Definition Register.h:20
constexpr bool isValid() const
Definition Register.h:112
constexpr bool isVirtual() const
Return true if the specified register number is in the virtual register namespace.
Definition Register.h:79
static unsigned getMaxMUBUFImmOffset(const GCNSubtarget &ST)
This class keeps track of the SPI_SP_INPUT_ADDR config register, which tells the hardware which inter...
AMDGPU::ClusterDimsAttr getClusterDims() const
SIModeRegisterDefaults getMode() const
std::tuple< const ArgDescriptor *, const TargetRegisterClass *, LLT > getPreloadedValue(AMDGPUFunctionArgInfo::PreloadedValue Value) const
static LLVM_READONLY const TargetRegisterClass * getSGPRClassForBitWidth(unsigned BitWidth)
bool allowsMisalignedMemoryAccessesImpl(unsigned Size, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *IsFast=nullptr) const
bool shouldEmitFixup(const GlobalValue *GV) const
bool shouldUseLDSConstAddress(const GlobalValue *GV) const
bool shouldEmitPCReloc(const GlobalValue *GV) const
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
void truncate(size_type N)
Like resize, but requires that N is less than size().
void resize(size_type N)
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
int64_t getImm() const
Register getReg() const
Register getStackPointerRegisterToSaveRestore() const
If a physical register, this specifies the register that llvm.savestack/llvm.restorestack should save...
unsigned getPointerSizeInBits(unsigned AS) const
A Use represents the edge between a Value definition and its users.
Definition Use.h:35
LLVM_ABI StringRef getName() const
Return a constant reference to the value's name.
Definition Value.cpp:318
self_iterator getIterator()
Definition ilist_node.h:123
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
@ CONSTANT_ADDRESS_32BIT
Address space for 32-bit constant memory.
@ BUFFER_STRIDED_POINTER
Address space for 192-bit fat buffer pointers with an additional index.
@ REGION_ADDRESS
Address space for region memory. (GDS)
@ LOCAL_ADDRESS
Address space for local memory.
@ CONSTANT_ADDRESS
Address space for constant memory (VTX2).
@ FLAT_ADDRESS
Address space for flat memory.
@ GLOBAL_ADDRESS
Address space for global memory (RAT0, VTX0).
@ BUFFER_FAT_POINTER
Address space for 160-bit buffer fat pointers.
@ PRIVATE_ADDRESS
Address space for private memory.
@ BUFFER_RESOURCE
Address space for 128-bit buffer resources.
int getMIMGOpcode(unsigned BaseOpcode, unsigned MIMGEncoding, unsigned VDataDwords, unsigned VAddrDwords)
bool isFlatGlobalAddrSpace(unsigned AS)
bool isGFX12Plus(const MCSubtargetInfo &STI)
constexpr int64_t getNullPointerValue(unsigned AS)
Get the null pointer value for the given address space.
bool isGFX11(const MCSubtargetInfo &STI)
LLVM_READNONE bool isLegalDPALU_DPPControl(const MCSubtargetInfo &ST, unsigned DC)
unsigned getAMDHSACodeObjectVersion(const Module &M)
LLVM_READNONE constexpr bool isKernel(CallingConv::ID CC)
LLVM_READNONE constexpr bool isEntryFunctionCC(CallingConv::ID CC)
LLVM_READNONE constexpr bool isCompute(CallingConv::ID CC)
TargetExtType * isNamedBarrier(const GlobalVariable &GV)
bool isGFX11Plus(const MCSubtargetInfo &STI)
LLVM_READONLY const MIMGBaseOpcodeInfo * getMIMGBaseOpcodeInfo(unsigned BaseOpcode)
std::pair< Register, unsigned > getBaseWithConstantOffset(MachineRegisterInfo &MRI, Register Reg, GISelValueTracking *ValueTracking=nullptr, bool CheckNUW=false)
Returns base register and constant offset.
const ImageDimIntrinsicInfo * getImageDimIntrinsicInfo(unsigned Intr)
unsigned ID
LLVM IR allows to use arbitrary numbers as calling convention identifiers.
Definition CallingConv.h:24
@ MaxID
The highest possible ID. Must be some 2^k - 1.
@ AMDGPU_Gfx
Used for AMD graphics targets.
@ Fast
Attempts to make calls as fast as possible (e.g.
Definition CallingConv.h:41
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
LLVM_ABI LegalityPredicate scalarOrEltWiderThan(unsigned TypeIdx, unsigned Size)
True iff the specified type index is a scalar or a vector with an element type that's wider than the ...
LLVM_ABI LegalityPredicate isScalar(unsigned TypeIdx)
True iff the specified type index is a scalar.
LLVM_ABI LegalityPredicate isPointer(unsigned TypeIdx)
True iff the specified type index is a pointer (with any address space).
LLVM_ABI LegalityPredicate typeInSet(unsigned TypeIdx, std::initializer_list< LLT > TypesInit)
True iff the given type index is one of the specified types.
LLVM_ABI LegalityPredicate smallerThan(unsigned TypeIdx0, unsigned TypeIdx1)
True iff the first type index has a smaller total bit size than second type index.
LLVM_ABI LegalityPredicate largerThan(unsigned TypeIdx0, unsigned TypeIdx1)
True iff the first type index has a larger total bit size than second type index.
LLVM_ABI LegalityPredicate elementTypeIs(unsigned TypeIdx, LLT EltTy)
True if the type index is a vector with element type EltTy.
LLVM_ABI LegalityPredicate sameSize(unsigned TypeIdx0, unsigned TypeIdx1)
True iff the specified type indices are both the same bit size.
LLVM_ABI LegalityPredicate scalarOrEltNarrowerThan(unsigned TypeIdx, unsigned Size)
True iff the specified type index is a scalar or vector with an element type that's narrower than the...
LegalityPredicate typeIsNot(unsigned TypeIdx, LLT Type)
True iff the given type index is not the specified type.
Predicate all(Predicate P0, Predicate P1)
True iff P0 and P1 are true.
LLVM_ABI LegalityPredicate typeIs(unsigned TypeIdx, LLT TypesInit)
True iff the given type index is the specified type.
LLVM_ABI LegalityPredicate scalarNarrowerThan(unsigned TypeIdx, unsigned Size)
True iff the specified type index is a scalar that's narrower than the given size.
LLVM_ABI LegalizeMutation scalarize(unsigned TypeIdx)
Break up the vector type for the given type index into the element type.
LLVM_ABI LegalizeMutation widenScalarOrEltToNextPow2(unsigned TypeIdx, unsigned Min=0)
Widen the scalar type or vector element type for the given type index to the next power of 2.
LLVM_ABI LegalizeMutation changeTo(unsigned TypeIdx, LLT Ty)
Select this specific type for the given type index.
Invariant opcodes: All instruction sets have these as their low opcodes.
initializer< Ty > init(const Ty &Val)
constexpr double inv_pi
constexpr double ln2
constexpr double ln10
constexpr float log2ef
Definition MathExtras.h:51
constexpr double log2e
This is an optimization pass for GlobalISel generic memory operations.
LLVM_ABI Register getFunctionLiveInPhysReg(MachineFunction &MF, const TargetInstrInfo &TII, MCRegister PhysReg, const TargetRegisterClass &RC, const DebugLoc &DL, LLT RegTy=LLT())
Return a virtual register corresponding to the incoming argument register PhysReg.
Definition Utils.cpp:857
unsigned Log2_32_Ceil(uint32_t Value)
Return the ceil log base 2 of the specified value, 32 if the value is zero.
Definition MathExtras.h:344
@ Offset
Definition DWP.cpp:557
LLVM_ABI Type * getTypeForLLT(LLT Ty, LLVMContext &C)
Get the type back from LLT.
Definition Utils.cpp:1995
LLVM_ABI MachineInstr * getOpcodeDef(unsigned Opcode, Register Reg, const MachineRegisterInfo &MRI)
See if Reg is defined by an single def instruction that is Opcode.
Definition Utils.cpp:652
LLVM_ABI const ConstantFP * getConstantFPVRegVal(Register VReg, const MachineRegisterInfo &MRI)
Definition Utils.cpp:460
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
constexpr bool isInt(int64_t x)
Checks if an integer fits into the given bit width.
Definition MathExtras.h:165
@ Implicit
Not emitted register (e.g. carry, or temporary result).
@ Undef
Value of the register doesn't matter.
LLVM_ABI const llvm::fltSemantics & getFltSemanticForLLT(LLT Ty)
Get the appropriate floating point arithmetic semantic based on the bit size of the given scalar LLT.
std::function< std::pair< unsigned, LLT >(const LegalityQuery &)> LegalizeMutation
int bit_width(T Value)
Returns the number of bits needed to represent Value if Value is nonzero.
Definition bit.h:325
void * PointerTy
constexpr bool isPowerOf2_64(uint64_t Value)
Return true if the argument is a power of two > 0 (64 bit edition.)
Definition MathExtras.h:284
constexpr int popcount(T Value) noexcept
Count the number of set bits in a value.
Definition bit.h:156
uint64_t PowerOf2Ceil(uint64_t A)
Returns the power of two which is greater than or equal to the given value.
Definition MathExtras.h:385
LLVM_ABI std::optional< int64_t > getIConstantVRegSExtVal(Register VReg, const MachineRegisterInfo &MRI)
If VReg is defined by a G_CONSTANT fits in int64_t returns it.
Definition Utils.cpp:313
int countr_zero(T Val)
Count number of 0's from the least significant bit to the most stopping at the first 1.
Definition bit.h:204
constexpr bool has_single_bit(T Value) noexcept
Definition bit.h:149
std::function< bool(const LegalityQuery &)> LegalityPredicate
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
Definition MathExtras.h:279
constexpr uint64_t alignTo(uint64_t Size, Align A)
Returns a multiple of A needed to store Size bytes.
Definition Alignment.h:144
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition Casting.h:547
MutableArrayRef(T &OneElt) -> MutableArrayRef< T >
constexpr T divideCeil(U Numerator, V Denominator)
Returns the integer ceil(Numerator / Denominator).
Definition MathExtras.h:394
To bit_cast(const From &from) noexcept
Definition bit.h:90
@ Mul
Product of integers.
@ FMul
Product of floats.
@ Sub
Subtraction of integers.
@ Add
Sum of integers.
FunctionAddr VTableAddr Next
Definition InstrProf.h:141
DWARFExpression::Operation Op
ArrayRef(const T &OneElt) -> ArrayRef< T >
constexpr unsigned BitWidth
LLVM_ABI void eraseInstr(MachineInstr &MI, MachineRegisterInfo &MRI, LostDebugLocObserver *LocObserver=nullptr)
Definition Utils.cpp:1677
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:559
LLVM_ABI std::optional< ValueAndVReg > getIConstantVRegValWithLookThrough(Register VReg, const MachineRegisterInfo &MRI, bool LookThroughInstrs=true)
If VReg is defined by a statically evaluable chain of instructions rooted on a G_CONSTANT returns its...
Definition Utils.cpp:432
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Definition STLExtras.h:1946
Align commonAlignment(Align A, uint64_t Offset)
Returns the alignment that satisfies both alignments.
Definition Alignment.h:201
unsigned Log2(Align A)
Returns the log2 of the alignment.
Definition Alignment.h:197
T bit_floor(T Value)
Returns the largest integral power of two no greater than Value if Value is nonzero.
Definition bit.h:347
constexpr uint64_t NextPowerOf2(uint64_t A)
Returns the next power of two (in 64-bits) that is strictly greater than A.
Definition MathExtras.h:373
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition BitVector.h:872
#define N
static constexpr uint64_t encode(Fields... Values)
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition Alignment.h:39
constexpr uint64_t value() const
This is a hole in the type system and should not be abused.
Definition Alignment.h:77
MCRegister getRegister() const
static ArgDescriptor createRegister(Register Reg, unsigned Mask=~0u)
DenormalModeKind Input
Denormal treatment kind for floating point instruction inputs in the default floating-point environme...
@ PreserveSign
The sign of a flushed-to-zero number is preserved in the sign of 0.
@ Dynamic
Denormals have unknown treatment.
static constexpr DenormalMode getPreserveSign()
static constexpr DenormalMode getIEEE()
bool isZero() const
Returns true if value is all zero.
Definition KnownBits.h:78
The LegalityQuery object bundles together all the information that's needed to decide whether a given...
ArrayRef< MemDesc > MMODescrs
Operations which require memory can use this to place requirements on the memory type for each MMO.
ArrayRef< LLT > Types
Matching combinators.
This class contains a discriminated union of information about pointers in memory operands,...
MachinePointerInfo getWithOffset(int64_t O) const
static LLVM_ABI MachinePointerInfo getGOT(MachineFunction &MF)
Return a MachinePointerInfo record that refers to a GOT entry.
DenormalMode FP64FP16Denormals
If this is set, neither input or output denormals are flushed for both f64 and f16/v2f16 instructions...
bool IEEE
Floating point opcodes that support exception flag gathering quiet and propagate signaling NaN inputs...
DenormalMode FP32Denormals
If this is set, neither input or output denormals are flushed for most f32 instructions.