LLVM 23.0.0git
AMDGPULegalizerInfo.cpp
Go to the documentation of this file.
1//===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8/// \file
9/// This file implements the targeting of the MachineLegalizer class for
10/// AMDGPU.
11/// \todo This should be generated by TableGen.
12//===----------------------------------------------------------------------===//
13
14#include "AMDGPULegalizerInfo.h"
15
16#include "AMDGPU.h"
18#include "AMDGPUInstrInfo.h"
19#include "AMDGPUMemoryUtils.h"
20#include "AMDGPUTargetMachine.h"
22#include "SIInstrInfo.h"
24#include "SIRegisterInfo.h"
26#include "llvm/ADT/ScopeExit.h"
37#include "llvm/IR/IntrinsicsAMDGPU.h"
38#include "llvm/IR/IntrinsicsR600.h"
39
40#define DEBUG_TYPE "amdgpu-legalinfo"
41
42using namespace llvm;
43using namespace LegalizeActions;
44using namespace LegalizeMutations;
45using namespace LegalityPredicates;
46using namespace MIPatternMatch;
47
48// Hack until load/store selection patterns support any tuple of legal types.
50 "amdgpu-global-isel-new-legality",
51 cl::desc("Use GlobalISel desired legality, rather than try to use"
52 "rules compatible with selection patterns"),
53 cl::init(false),
55
56static constexpr unsigned MaxRegisterSize = 1024;
57
58// Round the number of elements to the next power of two elements
60 unsigned NElts = Ty.getNumElements();
61 unsigned Pow2NElts = 1 << Log2_32_Ceil(NElts);
62 return Ty.changeElementCount(ElementCount::getFixed(Pow2NElts));
63}
64
65// Round the number of bits to the next power of two bits
67 unsigned Bits = Ty.getSizeInBits();
68 unsigned Pow2Bits = 1 << Log2_32_Ceil(Bits);
69 return LLT::scalar(Pow2Bits);
70}
71
72/// \returns true if this is an odd sized vector which should widen by adding an
73/// additional element. This is mostly to handle <3 x s16> -> <4 x s16>. This
74/// excludes s1 vectors, which should always be scalarized.
75static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
76 return [=](const LegalityQuery &Query) {
77 const LLT Ty = Query.Types[TypeIdx];
78 if (!Ty.isVector())
79 return false;
80
81 const LLT EltTy = Ty.getElementType();
82 const unsigned EltSize = EltTy.getSizeInBits();
83 return Ty.getNumElements() % 2 != 0 &&
84 EltSize > 1 && EltSize < 32 &&
85 Ty.getSizeInBits() % 32 != 0;
86 };
87}
88
89static LegalityPredicate sizeIsMultipleOf32(unsigned TypeIdx) {
90 return [=](const LegalityQuery &Query) {
91 const LLT Ty = Query.Types[TypeIdx];
92 return Ty.getSizeInBits() % 32 == 0;
93 };
94}
95
96static LegalityPredicate isWideVec16(unsigned TypeIdx) {
97 return [=](const LegalityQuery &Query) {
98 const LLT Ty = Query.Types[TypeIdx];
99 const LLT EltTy = Ty.getScalarType();
100 return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2;
101 };
102}
103
104static LegalizeMutation oneMoreElement(unsigned TypeIdx) {
105 return [=](const LegalityQuery &Query) {
106 const LLT Ty = Query.Types[TypeIdx];
107 const LLT EltTy = Ty.getElementType();
108 return std::pair(TypeIdx,
109 LLT::fixed_vector(Ty.getNumElements() + 1, EltTy));
110 };
111}
112
114 return [=](const LegalityQuery &Query) {
115 const LLT Ty = Query.Types[TypeIdx];
116 const LLT EltTy = Ty.getElementType();
117 unsigned Size = Ty.getSizeInBits();
118 unsigned Pieces = (Size + 63) / 64;
119 unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
120 return std::pair(TypeIdx, LLT::scalarOrVector(
121 ElementCount::getFixed(NewNumElts), EltTy));
122 };
123}
124
125// Increase the number of vector elements to reach the next multiple of 32-bit
126// type.
127static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) {
128 return [=](const LegalityQuery &Query) {
129 const LLT Ty = Query.Types[TypeIdx];
130
131 const LLT EltTy = Ty.getElementType();
132 const int Size = Ty.getSizeInBits();
133 const int EltSize = EltTy.getSizeInBits();
134 const int NextMul32 = (Size + 31) / 32;
135
136 assert(EltSize < 32);
137
138 const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize;
139 return std::pair(TypeIdx, LLT::fixed_vector(NewNumElts, EltTy));
140 };
141}
142
143// Retrieves the scalar type that's the same size as the mem desc
145 return [=](const LegalityQuery &Query) {
146 unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
147 return std::make_pair(TypeIdx, LLT::scalar(MemSize));
148 };
149}
150
151// Increase the number of vector elements to reach the next legal RegClass.
153 return [=](const LegalityQuery &Query) {
154 const LLT Ty = Query.Types[TypeIdx];
155 const unsigned NumElts = Ty.getNumElements();
156 const unsigned EltSize = Ty.getElementType().getSizeInBits();
157 const unsigned MaxNumElts = MaxRegisterSize / EltSize;
158
159 assert(EltSize == 32 || EltSize == 64);
160 assert(Ty.getSizeInBits() < MaxRegisterSize);
161
162 unsigned NewNumElts;
163 // Find the nearest legal RegClass that is larger than the current type.
164 for (NewNumElts = NumElts; NewNumElts < MaxNumElts; ++NewNumElts) {
165 if (SIRegisterInfo::getSGPRClassForBitWidth(NewNumElts * EltSize))
166 break;
167 }
168 return std::pair(TypeIdx,
169 LLT::fixed_vector(NewNumElts, Ty.getElementType()));
170 };
171}
172
174 if (!Ty.isVector())
175 return LLT::scalar(128);
176 const ElementCount NumElems = Ty.getElementCount();
177 return LLT::vector(NumElems, LLT::scalar(128));
178}
179
181 if (!Ty.isVector())
182 return LLT::fixed_vector(4, LLT::scalar(32));
183 const unsigned NumElems = Ty.getElementCount().getFixedValue();
184 return LLT::fixed_vector(NumElems * 4, LLT::scalar(32));
185}
186
188 const unsigned Size = Ty.getSizeInBits();
189
190 if (Size <= 32) {
191 // <2 x s8> -> s16
192 // <4 x s8> -> s32
193 return LLT::scalar(Size);
194 }
195
197}
198
199static LegalizeMutation bitcastToRegisterType(unsigned TypeIdx) {
200 return [=](const LegalityQuery &Query) {
201 const LLT Ty = Query.Types[TypeIdx];
202 return std::pair(TypeIdx, getBitcastRegisterType(Ty));
203 };
204}
205
207 return [=](const LegalityQuery &Query) {
208 const LLT Ty = Query.Types[TypeIdx];
209 unsigned Size = Ty.getSizeInBits();
210 assert(Size % 32 == 0);
211 return std::pair(
213 };
214}
215
216static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) {
217 return [=](const LegalityQuery &Query) {
218 const LLT QueryTy = Query.Types[TypeIdx];
219 return QueryTy.isVector() && QueryTy.getSizeInBits() < Size;
220 };
221}
222
223static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) {
224 return [=](const LegalityQuery &Query) {
225 const LLT QueryTy = Query.Types[TypeIdx];
226 return QueryTy.isVector() && QueryTy.getSizeInBits() > Size;
227 };
228}
229
230static LegalityPredicate numElementsNotEven(unsigned TypeIdx) {
231 return [=](const LegalityQuery &Query) {
232 const LLT QueryTy = Query.Types[TypeIdx];
233 return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0;
234 };
235}
236
237static bool isRegisterSize(const GCNSubtarget &ST, unsigned Size) {
238 return ((ST.useRealTrue16Insts() && Size == 16) || Size % 32 == 0) &&
240}
241
243 const int EltSize = EltTy.getSizeInBits();
244 return EltSize == 16 || EltSize % 32 == 0;
245}
246
247static bool isRegisterVectorType(LLT Ty) {
248 const int EltSize = Ty.getElementType().getSizeInBits();
249 return EltSize == 32 || EltSize == 64 ||
250 (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
251 EltSize == 128 || EltSize == 256;
252}
253
254// TODO: replace all uses of isRegisterType with isRegisterClassType
255static bool isRegisterType(const GCNSubtarget &ST, LLT Ty) {
256 if (!isRegisterSize(ST, Ty.getSizeInBits()))
257 return false;
258
259 if (Ty.isVector())
260 return isRegisterVectorType(Ty);
261
262 return true;
263}
264
265// Any combination of 32 or 64-bit elements up to the maximum register size,
266// and multiples of v2s16.
268 unsigned TypeIdx) {
269 return [=, &ST](const LegalityQuery &Query) {
270 return isRegisterType(ST, Query.Types[TypeIdx]);
271 };
272}
273
274// RegisterType that doesn't have a corresponding RegClass.
275// TODO: Once `isRegisterType` is replaced with `isRegisterClassType` this
276// should be removed.
278 unsigned TypeIdx) {
279 return [=, &ST](const LegalityQuery &Query) {
280 LLT Ty = Query.Types[TypeIdx];
281 return isRegisterType(ST, Ty) &&
282 !SIRegisterInfo::getSGPRClassForBitWidth(Ty.getSizeInBits());
283 };
284}
285
286static LegalityPredicate elementTypeIsLegal(unsigned TypeIdx) {
287 return [=](const LegalityQuery &Query) {
288 const LLT QueryTy = Query.Types[TypeIdx];
289 if (!QueryTy.isVector())
290 return false;
291 const LLT EltTy = QueryTy.getElementType();
292 return EltTy == LLT::scalar(16) || EltTy.getSizeInBits() >= 32;
293 };
294}
295
296constexpr LLT S1 = LLT::scalar(1);
297constexpr LLT S8 = LLT::scalar(8);
298constexpr LLT S16 = LLT::scalar(16);
299constexpr LLT S32 = LLT::scalar(32);
300constexpr LLT F32 = LLT::scalar(32); // TODO: Expected float32
301constexpr LLT S64 = LLT::scalar(64);
302constexpr LLT F64 = LLT::scalar(64); // TODO: Expected float64
303constexpr LLT S96 = LLT::scalar(96);
304constexpr LLT S128 = LLT::scalar(128);
305constexpr LLT S160 = LLT::scalar(160);
306constexpr LLT S192 = LLT::scalar(192);
307constexpr LLT S224 = LLT::scalar(224);
308constexpr LLT S256 = LLT::scalar(256);
309constexpr LLT S512 = LLT::scalar(512);
310constexpr LLT S1024 = LLT::scalar(1024);
312
313constexpr LLT V2S8 = LLT::fixed_vector(2, 8);
314constexpr LLT V2S16 = LLT::fixed_vector(2, 16);
315constexpr LLT V4S16 = LLT::fixed_vector(4, 16);
316constexpr LLT V6S16 = LLT::fixed_vector(6, 16);
317constexpr LLT V8S16 = LLT::fixed_vector(8, 16);
318constexpr LLT V10S16 = LLT::fixed_vector(10, 16);
319constexpr LLT V12S16 = LLT::fixed_vector(12, 16);
320constexpr LLT V16S16 = LLT::fixed_vector(16, 16);
321
322// TODO: Expected LLT::fixed_vector(2, LLT::float16())
324constexpr LLT V2BF16 = V2F16; // FIXME
325
326constexpr LLT V2S32 = LLT::fixed_vector(2, 32);
327constexpr LLT V3S32 = LLT::fixed_vector(3, 32);
328constexpr LLT V4S32 = LLT::fixed_vector(4, 32);
329constexpr LLT V5S32 = LLT::fixed_vector(5, 32);
330constexpr LLT V6S32 = LLT::fixed_vector(6, 32);
331constexpr LLT V7S32 = LLT::fixed_vector(7, 32);
332constexpr LLT V8S32 = LLT::fixed_vector(8, 32);
333constexpr LLT V9S32 = LLT::fixed_vector(9, 32);
334constexpr LLT V10S32 = LLT::fixed_vector(10, 32);
335constexpr LLT V11S32 = LLT::fixed_vector(11, 32);
336constexpr LLT V12S32 = LLT::fixed_vector(12, 32);
337constexpr LLT V16S32 = LLT::fixed_vector(16, 32);
338constexpr LLT V32S32 = LLT::fixed_vector(32, 32);
339
340constexpr LLT V2S64 = LLT::fixed_vector(2, 64);
341constexpr LLT V3S64 = LLT::fixed_vector(3, 64);
342constexpr LLT V4S64 = LLT::fixed_vector(4, 64);
343constexpr LLT V5S64 = LLT::fixed_vector(5, 64);
344constexpr LLT V6S64 = LLT::fixed_vector(6, 64);
345constexpr LLT V7S64 = LLT::fixed_vector(7, 64);
346constexpr LLT V8S64 = LLT::fixed_vector(8, 64);
347constexpr LLT V16S64 = LLT::fixed_vector(16, 64);
348
349constexpr LLT V2S128 = LLT::fixed_vector(2, 128);
350constexpr LLT V4S128 = LLT::fixed_vector(4, 128);
351
352constexpr std::initializer_list<LLT> AllScalarTypes = {
354
355constexpr std::initializer_list<LLT> AllS16Vectors{
357
358constexpr std::initializer_list<LLT> AllS32Vectors = {
361
362constexpr std::initializer_list<LLT> AllS64Vectors = {
364
370
371// Checks whether a type is in the list of legal register types.
372static bool isRegisterClassType(const GCNSubtarget &ST, LLT Ty) {
373 if (Ty.isPointerOrPointerVector())
374 Ty = Ty.changeElementType(LLT::scalar(Ty.getScalarSizeInBits()));
375
378 (ST.useRealTrue16Insts() && Ty == S16) ||
380}
381
383 unsigned TypeIdx) {
384 return [&ST, TypeIdx](const LegalityQuery &Query) {
385 return isRegisterClassType(ST, Query.Types[TypeIdx]);
386 };
387}
388
389// If we have a truncating store or an extending load with a data size larger
390// than 32-bits, we need to reduce to a 32-bit type.
392 return [=](const LegalityQuery &Query) {
393 const LLT Ty = Query.Types[TypeIdx];
394 return !Ty.isVector() && Ty.getSizeInBits() > 32 &&
395 Query.MMODescrs[0].MemoryTy.getSizeInBits() < Ty.getSizeInBits();
396 };
397}
398
399// If we have a truncating store or an extending load with a data size larger
400// than 32-bits and mem location is a power of 2
402 return [=](const LegalityQuery &Query) {
403 unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
404 return isWideScalarExtLoadTruncStore(TypeIdx)(Query) &&
405 isPowerOf2_64(MemSize);
406 };
407}
408
409// TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we
410// handle some operations by just promoting the register during
411// selection. There are also d16 loads on GFX9+ which preserve the high bits.
412static unsigned maxSizeForAddrSpace(const GCNSubtarget &ST, unsigned AS,
413 bool IsLoad, bool IsAtomic) {
414 switch (AS) {
416 // FIXME: Private element size.
417 return ST.hasFlatScratchEnabled() ? 128 : 32;
419 return ST.useDS128() ? 128 : 64;
424 // Treat constant and global as identical. SMRD loads are sometimes usable for
425 // global loads (ideally constant address space should be eliminated)
426 // depending on the context. Legality cannot be context dependent, but
427 // RegBankSelect can split the load as necessary depending on the pointer
428 // register bank/uniformity and if the memory is invariant or not written in a
429 // kernel.
430 return IsLoad ? 512 : 128;
431 default:
432 // FIXME: Flat addresses may contextually need to be split to 32-bit parts
433 // if they may alias scratch depending on the subtarget. This needs to be
434 // moved to custom handling to use addressMayBeAccessedAsPrivate
435 return ST.hasMultiDwordFlatScratchAddressing() || IsAtomic ? 128 : 32;
436 }
437}
438
439static bool isLoadStoreSizeLegal(const GCNSubtarget &ST,
440 const LegalityQuery &Query) {
441 const LLT Ty = Query.Types[0];
442
443 // Handle G_LOAD, G_ZEXTLOAD, G_SEXTLOAD
444 const bool IsLoad = Query.Opcode != AMDGPU::G_STORE;
445
446 unsigned RegSize = Ty.getSizeInBits();
447 uint64_t MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
448 uint64_t AlignBits = Query.MMODescrs[0].AlignInBits;
449 unsigned AS = Query.Types[1].getAddressSpace();
450
451 // All of these need to be custom lowered to cast the pointer operand.
453 return false;
454
455 // Do not handle extending vector loads.
456 if (Ty.isVector() && MemSize != RegSize)
457 return false;
458
459 // TODO: We should be able to widen loads if the alignment is high enough, but
460 // we also need to modify the memory access size.
461#if 0
462 // Accept widening loads based on alignment.
463 if (IsLoad && MemSize < Size)
464 MemSize = std::max(MemSize, Align);
465#endif
466
467 // Only 1-byte and 2-byte to 32-bit extloads are valid.
468 if (MemSize != RegSize && RegSize != 32)
469 return false;
470
471 if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad,
472 Query.MMODescrs[0].Ordering !=
474 return false;
475
476 switch (MemSize) {
477 case 8:
478 case 16:
479 case 32:
480 case 64:
481 case 128:
482 break;
483 case 96:
484 if (!ST.hasDwordx3LoadStores())
485 return false;
486 break;
487 case 256:
488 case 512:
489 // These may contextually need to be broken down.
490 break;
491 default:
492 return false;
493 }
494
495 assert(RegSize >= MemSize);
496
497 if (AlignBits < MemSize) {
498 const SITargetLowering *TLI = ST.getTargetLowering();
499 if (!TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS,
500 Align(AlignBits / 8)))
501 return false;
502 }
503
504 return true;
505}
506
507// The newer buffer intrinsic forms take their resource arguments as
508// pointers in address space 8, aka s128 values. However, in order to not break
509// SelectionDAG, the underlying operations have to continue to take v4i32
510// arguments. Therefore, we convert resource pointers - or vectors of them
511// to integer values here.
512static bool hasBufferRsrcWorkaround(const LLT Ty) {
513 if (Ty.isPointer() && Ty.getAddressSpace() == AMDGPUAS::BUFFER_RESOURCE)
514 return true;
515 if (Ty.isVector()) {
516 const LLT ElemTy = Ty.getElementType();
517 return hasBufferRsrcWorkaround(ElemTy);
518 }
519 return false;
520}
521
522// The current selector can't handle <6 x s16>, <8 x s16>, s96, s128 etc, so
523// workaround this. Eventually it should ignore the type for loads and only care
524// about the size. Return true in cases where we will workaround this for now by
525// bitcasting.
526static bool loadStoreBitcastWorkaround(const LLT Ty) {
528 return false;
529
530 const unsigned Size = Ty.getSizeInBits();
531 if (Ty.isPointerVector())
532 return true;
533 if (Size <= 64)
534 return false;
535 // Address space 8 pointers get their own workaround.
537 return false;
538 if (!Ty.isVector())
539 return true;
540
541 unsigned EltSize = Ty.getScalarSizeInBits();
542 return EltSize != 32 && EltSize != 64;
543}
544
545static bool isLoadStoreLegal(const GCNSubtarget &ST, const LegalityQuery &Query) {
546 const LLT Ty = Query.Types[0];
547 return isRegisterType(ST, Ty) && isLoadStoreSizeLegal(ST, Query) &&
549}
550
551/// Return true if a load or store of the type should be lowered with a bitcast
552/// to a different type.
553static bool shouldBitcastLoadStoreType(const GCNSubtarget &ST, const LLT Ty,
554 const LLT MemTy) {
555 const unsigned MemSizeInBits = MemTy.getSizeInBits();
556 const unsigned Size = Ty.getSizeInBits();
557 if (Size != MemSizeInBits)
558 return Size <= 32 && Ty.isVector();
559
561 return true;
562
563 // Don't try to handle bitcasting vector ext loads for now.
564 return Ty.isVector() && (!MemTy.isVector() || MemTy == Ty) &&
565 (Size <= 32 || isRegisterSize(ST, Size)) &&
566 !isRegisterVectorElementType(Ty.getElementType());
567}
568
569/// Return true if we should legalize a load by widening an odd sized memory
570/// access up to the alignment. Note that this is the case where the memory
571/// access itself changes, not the size of the result register.
572static bool shouldWidenLoad(const GCNSubtarget &ST, LLT MemoryTy,
573 uint64_t AlignInBits, unsigned AddrSpace,
574 unsigned Opcode) {
575 unsigned SizeInBits = MemoryTy.getSizeInBits();
576 // We don't want to widen cases that are naturally legal.
577 if (isPowerOf2_32(SizeInBits))
578 return false;
579
580 // If we have 96-bit memory operations, we shouldn't touch them. Note we may
581 // end up widening these for a scalar load during RegBankSelect, if we don't
582 // have 96-bit scalar loads.
583 if (SizeInBits == 96 && ST.hasDwordx3LoadStores())
584 return false;
585
586 if (SizeInBits >= maxSizeForAddrSpace(ST, AddrSpace, Opcode, false))
587 return false;
588
589 // A load is known dereferenceable up to the alignment, so it's legal to widen
590 // to it.
591 //
592 // TODO: Could check dereferenceable for less aligned cases.
593 unsigned RoundedSize = NextPowerOf2(SizeInBits);
594 if (AlignInBits < RoundedSize)
595 return false;
596
597 // Do not widen if it would introduce a slow unaligned load.
598 const SITargetLowering *TLI = ST.getTargetLowering();
599 unsigned Fast = 0;
601 RoundedSize, AddrSpace, Align(AlignInBits / 8),
603 Fast;
604}
605
606static bool shouldWidenLoad(const GCNSubtarget &ST, const LegalityQuery &Query,
607 unsigned Opcode) {
608 if (Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic)
609 return false;
610
611 return shouldWidenLoad(ST, Query.MMODescrs[0].MemoryTy,
612 Query.MMODescrs[0].AlignInBits,
613 Query.Types[1].getAddressSpace(), Opcode);
614}
615
616/// Mutates IR (typically a load instruction) to use a <4 x s32> as the initial
617/// type of the operand `idx` and then to transform it to a `p8` via bitcasts
618/// and inttoptr. In addition, handle vectors of p8. Returns the new type.
620 MachineRegisterInfo &MRI, unsigned Idx) {
621 MachineOperand &MO = MI.getOperand(Idx);
622
623 const LLT PointerTy = MRI.getType(MO.getReg());
624
625 // Paranoidly prevent us from doing this multiple times.
627 return PointerTy;
628
629 const LLT ScalarTy = getBufferRsrcScalarType(PointerTy);
630 const LLT VectorTy = getBufferRsrcRegisterType(PointerTy);
631 if (!PointerTy.isVector()) {
632 // Happy path: (4 x s32) -> (s32, s32, s32, s32) -> (p8)
633 const unsigned NumParts = PointerTy.getSizeInBits() / 32;
634 const LLT S32 = LLT::scalar(32);
635
636 Register VectorReg = MRI.createGenericVirtualRegister(VectorTy);
637 std::array<Register, 4> VectorElems;
638 B.setInsertPt(B.getMBB(), ++B.getInsertPt());
639 for (unsigned I = 0; I < NumParts; ++I)
640 VectorElems[I] =
641 B.buildExtractVectorElementConstant(S32, VectorReg, I).getReg(0);
642 B.buildMergeValues(MO, VectorElems);
643 MO.setReg(VectorReg);
644 return VectorTy;
645 }
646 Register BitcastReg = MRI.createGenericVirtualRegister(VectorTy);
647 B.setInsertPt(B.getMBB(), ++B.getInsertPt());
648 auto Scalar = B.buildBitcast(ScalarTy, BitcastReg);
649 B.buildIntToPtr(MO, Scalar);
650 MO.setReg(BitcastReg);
651
652 return VectorTy;
653}
654
655/// Cast a buffer resource (an address space 8 pointer) into a 4xi32, which is
656/// the form in which the value must be in order to be passed to the low-level
657/// representations used for MUBUF/MTBUF intrinsics. This is a hack, which is
658/// needed in order to account for the fact that we can't define a register
659/// class for s128 without breaking SelectionDAG.
661 MachineRegisterInfo &MRI = *B.getMRI();
662 const LLT PointerTy = MRI.getType(Pointer);
663 const LLT ScalarTy = getBufferRsrcScalarType(PointerTy);
664 const LLT VectorTy = getBufferRsrcRegisterType(PointerTy);
665
666 if (!PointerTy.isVector()) {
667 // Special case: p8 -> (s32, s32, s32, s32) -> (4xs32)
668 SmallVector<Register, 4> PointerParts;
669 const unsigned NumParts = PointerTy.getSizeInBits() / 32;
670 auto Unmerged = B.buildUnmerge(LLT::scalar(32), Pointer);
671 for (unsigned I = 0; I < NumParts; ++I)
672 PointerParts.push_back(Unmerged.getReg(I));
673 return B.buildBuildVector(VectorTy, PointerParts).getReg(0);
674 }
675 Register Scalar = B.buildPtrToInt(ScalarTy, Pointer).getReg(0);
676 return B.buildBitcast(VectorTy, Scalar).getReg(0);
677}
678
680 unsigned Idx) {
681 MachineOperand &MO = MI.getOperand(Idx);
682
683 const LLT PointerTy = B.getMRI()->getType(MO.getReg());
684 // Paranoidly prevent us from doing this multiple times.
686 return;
688}
689
691 const GCNTargetMachine &TM)
692 : ST(ST_) {
693 using namespace TargetOpcode;
694
695 auto GetAddrSpacePtr = [&TM](unsigned AS) {
696 return LLT::pointer(AS, TM.getPointerSizeInBits(AS));
697 };
698
699 const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS);
700 const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS);
701 const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT);
702 const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS);
703 const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS);
704 const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS);
705 const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS);
706 const LLT BufferFatPtr = GetAddrSpacePtr(AMDGPUAS::BUFFER_FAT_POINTER);
707 const LLT RsrcPtr = GetAddrSpacePtr(AMDGPUAS::BUFFER_RESOURCE);
708 const LLT BufferStridedPtr =
709 GetAddrSpacePtr(AMDGPUAS::BUFFER_STRIDED_POINTER);
710
711 const LLT CodePtr = FlatPtr;
712
713 const std::initializer_list<LLT> AddrSpaces64 = {
714 GlobalPtr, ConstantPtr, FlatPtr
715 };
716
717 const std::initializer_list<LLT> AddrSpaces32 = {
718 LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr
719 };
720
721 const std::initializer_list<LLT> AddrSpaces128 = {RsrcPtr};
722
723 const std::initializer_list<LLT> FPTypesBase = {
724 S32, S64
725 };
726
727 const std::initializer_list<LLT> FPTypes16 = {
728 S32, S64, S16
729 };
730
731 const std::initializer_list<LLT> FPTypesPK16 = {
732 S32, S64, S16, V2S16
733 };
734
735 const std::initializer_list<LLT> FPTypesPK16_64 = {S32, S64, S16, V2S16,
736 V2S64};
737
738 const LLT MinScalarFPTy = ST.has16BitInsts() ? S16 : S32;
739
741
742 // s1 for VCC branches, s32 for SCC branches.
744
745 // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more
746 // elements for v3s16
749 .legalFor(AllS32Vectors)
751 .legalFor(AddrSpaces64)
752 .legalFor(AddrSpaces32)
753 .legalFor(AddrSpaces128)
754 .legalIf(isPointer(0))
755 .clampScalar(0, S16, S256)
757 .clampMaxNumElements(0, S32, 16)
759 .scalarize(0);
760
761 if (ST.hasVOP3PInsts() && ST.hasAddNoCarryInsts() && ST.hasIntClamp()) {
762 // Full set of gfx9 features.
763 if (ST.hasPackedU64Ops()) {
764 getActionDefinitionsBuilder({G_ADD, G_SUB})
765 .legalFor({S64, S32, S16, V2S16, V2S64})
766 .clampMaxNumElementsStrict(0, S16, 2)
768 .scalarize(0)
769 .minScalar(0, S16)
771 .maxScalar(0, S32);
772 } else if (ST.hasScalarAddSub64()) {
773 getActionDefinitionsBuilder({G_ADD, G_SUB})
774 .legalFor({S64, S32, S16, V2S16})
775 .clampMaxNumElementsStrict(0, S16, 2)
776 .scalarize(0)
777 .minScalar(0, S16)
779 .maxScalar(0, S32);
780 } else {
781 getActionDefinitionsBuilder({G_ADD, G_SUB})
782 .legalFor({S32, S16, V2S16})
783 .clampMaxNumElementsStrict(0, S16, 2)
784 .scalarize(0)
785 .minScalar(0, S16)
787 .maxScalar(0, S32);
788 }
789
790 if (ST.hasScalarSMulU64()) {
792 .legalFor({S64, S32, S16, V2S16})
793 .clampMaxNumElementsStrict(0, S16, 2)
794 .scalarize(0)
795 .minScalar(0, S16)
797 .custom();
798 } else {
800 .legalFor({S32, S16, V2S16})
801 .clampMaxNumElementsStrict(0, S16, 2)
802 .scalarize(0)
803 .minScalar(0, S16)
805 .custom();
806 }
807 assert(ST.hasMad64_32());
808
809 getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT, G_SADDSAT, G_SSUBSAT})
810 .legalFor({S32, S16, V2S16}) // Clamp modifier
811 .minScalarOrElt(0, S16)
813 .scalarize(0)
815 .lower();
816 } else if (ST.has16BitInsts()) {
817 getActionDefinitionsBuilder({G_ADD, G_SUB})
818 .legalFor({S32, S16})
819 .minScalar(0, S16)
821 .maxScalar(0, S32)
822 .scalarize(0);
823
825 .legalFor({S32, S16})
826 .scalarize(0)
827 .minScalar(0, S16)
829 .custom();
830 assert(ST.hasMad64_32());
831
832 // Technically the saturating operations require clamp bit support, but this
833 // was introduced at the same time as 16-bit operations.
834 getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
835 .legalFor({S32, S16}) // Clamp modifier
836 .minScalar(0, S16)
837 .scalarize(0)
839 .lower();
840
841 // We're just lowering this, but it helps get a better result to try to
842 // coerce to the desired type first.
843 getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
844 .minScalar(0, S16)
845 .scalarize(0)
846 .lower();
847 } else {
848 getActionDefinitionsBuilder({G_ADD, G_SUB})
849 .legalFor({S32})
850 .widenScalarToNextMultipleOf(0, 32)
851 .clampScalar(0, S32, S32)
852 .scalarize(0);
853
854 auto &Mul = getActionDefinitionsBuilder(G_MUL)
855 .legalFor({S32})
856 .scalarize(0)
857 .minScalar(0, S32)
859
860 if (ST.hasMad64_32())
861 Mul.custom();
862 else
863 Mul.maxScalar(0, S32);
864
865 if (ST.hasIntClamp()) {
866 getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
867 .legalFor({S32}) // Clamp modifier.
868 .scalarize(0)
870 .lower();
871 } else {
872 // Clamp bit support was added in VI, along with 16-bit operations.
873 getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
874 .minScalar(0, S32)
875 .scalarize(0)
876 .lower();
877 }
878
879 // FIXME: DAG expansion gets better results. The widening uses the smaller
880 // range values and goes for the min/max lowering directly.
881 getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
882 .minScalar(0, S32)
883 .scalarize(0)
884 .lower();
885 }
886
888 {G_SDIV, G_UDIV, G_SREM, G_UREM, G_SDIVREM, G_UDIVREM})
889 .customFor({S32, S64})
890 .clampScalar(0, S32, S64)
892 .scalarize(0);
893
894 auto &Mulh = getActionDefinitionsBuilder({G_UMULH, G_SMULH})
895 .legalFor({S32})
896 .maxScalar(0, S32);
897
898 if (ST.hasVOP3PInsts()) {
899 Mulh
900 .clampMaxNumElements(0, S8, 2)
901 .lowerFor({V2S8});
902 }
903
904 Mulh
905 .scalarize(0)
906 .lower();
907
908 // Report legal for any types we can handle anywhere. For the cases only legal
909 // on the SALU, RegBankSelect will be able to re-legalize.
910 getActionDefinitionsBuilder({G_AND, G_OR, G_XOR})
911 .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16})
912 .clampScalar(0, S32, S64)
918 .scalarize(0);
919
921 {G_UADDO, G_USUBO, G_UADDE, G_SADDE, G_USUBE, G_SSUBE})
922 .legalFor({{S32, S1}, {S32, S32}})
923 .clampScalar(0, S32, S32)
924 .scalarize(0);
925
927 // Don't worry about the size constraint.
929 .lower();
930
932 .legalFor({S1, S32, S64, S16, GlobalPtr,
933 LocalPtr, ConstantPtr, PrivatePtr, FlatPtr })
934 .legalIf(isPointer(0))
935 .clampScalar(0, S32, S64)
937
938 getActionDefinitionsBuilder(G_FCONSTANT)
939 .legalFor({S32, S64, S16})
940 .clampScalar(0, S16, S64);
941
942 getActionDefinitionsBuilder({G_IMPLICIT_DEF, G_FREEZE})
943 .legalIf(isRegisterClassType(ST, 0))
944 // s1 and s16 are special cases because they have legal operations on
945 // them, but don't really occupy registers in the normal way.
946 .legalFor({S1, S16})
947 .clampNumElements(0, V16S32, V32S32)
951 .clampMaxNumElements(0, S32, 16);
952
953 getActionDefinitionsBuilder(G_FRAME_INDEX).legalFor({PrivatePtr});
954
955 // If the amount is divergent, we have to do a wave reduction to get the
956 // maximum value, so this is expanded during RegBankSelect.
957 getActionDefinitionsBuilder(G_DYN_STACKALLOC)
958 .legalFor({{PrivatePtr, S32}});
959
960 getActionDefinitionsBuilder(G_STACKSAVE)
961 .customFor({PrivatePtr});
962 getActionDefinitionsBuilder(G_STACKRESTORE)
963 .legalFor({PrivatePtr});
964
965 getActionDefinitionsBuilder({G_GET_FPENV, G_SET_FPENV}).customFor({S64});
966
967 getActionDefinitionsBuilder(G_GLOBAL_VALUE)
968 .customIf(typeIsNot(0, PrivatePtr));
969
970 getActionDefinitionsBuilder(G_BLOCK_ADDR).legalFor({CodePtr});
971
972 auto &FPOpActions = getActionDefinitionsBuilder(
973 { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE,
974 G_STRICT_FADD, G_STRICT_FMUL, G_STRICT_FMA})
975 .legalFor({S32, S64});
976 auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS})
977 .customFor({S32, S64});
978 auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV)
979 .customFor({S32, S64});
980
981 if (ST.has16BitInsts()) {
982 if (ST.hasVOP3PInsts())
983 FPOpActions.legalFor({S16, V2S16});
984 else
985 FPOpActions.legalFor({S16});
986
987 TrigActions.customFor({S16});
988 FDIVActions.customFor({S16});
989 }
990
991 if (ST.hasPackedFP32Ops()) {
992 FPOpActions.legalFor({V2S32});
993 FPOpActions.clampMaxNumElementsStrict(0, S32, 2);
994 }
995
996 if (ST.hasPackedFP64Ops()) {
997 FPOpActions.legalFor({V2S64});
998 FPOpActions.clampMaxNumElementsStrict(0, S64, 2);
999 }
1000
1001 if (ST.hasPackedFP64Ops()) {
1002 FPOpActions.legalFor({V2S64});
1003 FPOpActions.clampMaxNumElementsStrict(0, S64, 2);
1004 }
1005
1006 auto &MinNumMaxNumIeee =
1007 getActionDefinitionsBuilder({G_FMINNUM_IEEE, G_FMAXNUM_IEEE});
1008
1009 if (ST.hasVOP3PInsts()) {
1010 MinNumMaxNumIeee.legalFor(FPTypesPK16)
1011 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
1012 .clampMaxNumElements(0, S16, 2)
1013 .clampScalar(0, S16, S64)
1014 .scalarize(0);
1015 } else if (ST.has16BitInsts()) {
1016 MinNumMaxNumIeee.legalFor(FPTypes16).clampScalar(0, S16, S64).scalarize(0);
1017 } else {
1018 MinNumMaxNumIeee.legalFor(FPTypesBase)
1019 .clampScalar(0, S32, S64)
1020 .scalarize(0);
1021 }
1022
1023 auto &MinNumMaxNum = getActionDefinitionsBuilder(
1024 {G_FMINNUM, G_FMAXNUM, G_FMINIMUMNUM, G_FMAXIMUMNUM});
1025
1026 if (ST.hasPackedFP64Ops()) {
1027 MinNumMaxNum.customFor(FPTypesPK16_64)
1028 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
1029 .clampMaxNumElements(0, S16, 2)
1030 .clampMaxNumElements(0, S64, 2)
1031 .clampScalar(0, S16, S64)
1032 .scalarize(0);
1033 } else if (ST.hasVOP3PInsts()) {
1034 MinNumMaxNum.customFor(FPTypesPK16)
1035 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
1036 .clampMaxNumElements(0, S16, 2)
1037 .clampScalar(0, S16, S64)
1038 .scalarize(0);
1039 } else if (ST.has16BitInsts()) {
1040 MinNumMaxNum.customFor(FPTypes16)
1041 .clampScalar(0, S16, S64)
1042 .scalarize(0);
1043 } else {
1044 MinNumMaxNum.customFor(FPTypesBase)
1045 .clampScalar(0, S32, S64)
1046 .scalarize(0);
1047 }
1048
1049 if (ST.hasVOP3PInsts())
1050 FPOpActions.clampMaxNumElementsStrict(0, S16, 2);
1051
1052 FPOpActions
1053 .scalarize(0)
1054 .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
1055
1056 TrigActions
1057 .scalarize(0)
1058 .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
1059
1060 FDIVActions
1061 .scalarize(0)
1062 .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
1063
1064 auto &FNegAbs = getActionDefinitionsBuilder({G_FNEG, G_FABS});
1065 FNegAbs.legalFor(FPTypesPK16)
1066 .legalFor(ST.hasPackedFP32Ops(), {V2S32})
1068 if (ST.hasPackedFP32Ops())
1069 FNegAbs.clampMaxNumElementsStrict(0, S32, 2);
1070 FNegAbs.scalarize(0).clampScalar(0, S16, S64);
1071
1072 if (ST.has16BitInsts()) {
1074 .legalFor({S16})
1075 .customFor({S32, S64})
1076 .scalarize(0)
1077 .unsupported();
1079 .legalFor({S32, S64, S16})
1080 .scalarize(0)
1081 .clampScalar(0, S16, S64);
1082
1083 getActionDefinitionsBuilder({G_FLDEXP, G_STRICT_FLDEXP})
1084 .legalFor({{S32, S32}, {S64, S32}, {S16, S16}})
1085 .scalarize(0)
1086 .maxScalarIf(typeIs(0, S16), 1, S16)
1087 .clampScalar(1, S32, S32)
1088 .lower();
1089
1091 .customFor({{S32, S32}, {S64, S32}, {S16, S16}, {S16, S32}})
1092 .scalarize(0)
1093 .lower();
1094
1096 .lowerFor({S16, S32, S64})
1097 .scalarize(0)
1098 .lower();
1099 } else {
1101 .customFor({S32, S64, S16})
1102 .scalarize(0)
1103 .unsupported();
1104
1105
1106 if (ST.hasFractBug()) {
1108 .customFor({S64})
1109 .legalFor({S32, S64})
1110 .scalarize(0)
1111 .clampScalar(0, S32, S64);
1112 } else {
1114 .legalFor({S32, S64})
1115 .scalarize(0)
1116 .clampScalar(0, S32, S64);
1117 }
1118
1119 getActionDefinitionsBuilder({G_FLDEXP, G_STRICT_FLDEXP})
1120 .legalFor({{S32, S32}, {S64, S32}})
1121 .scalarize(0)
1122 .clampScalar(0, S32, S64)
1123 .clampScalar(1, S32, S32)
1124 .lower();
1125
1127 .customFor({{S32, S32}, {S64, S32}})
1128 .scalarize(0)
1129 .minScalar(0, S32)
1130 .clampScalar(1, S32, S32)
1131 .lower();
1132
1134 .lowerFor({S32, S64})
1135 .scalarize(0)
1136 .lower();
1137 }
1138
1139 auto &FPTruncActions = getActionDefinitionsBuilder(G_FPTRUNC);
1140 if (ST.hasCvtPkF16F32Inst()) {
1141 FPTruncActions.legalFor({{S32, S64}, {S16, S32}, {V2S16, V2S32}})
1142 .clampMaxNumElements(0, S16, 2);
1143 } else {
1144 FPTruncActions.legalFor({{S32, S64}, {S16, S32}});
1145 }
1146 FPTruncActions.scalarize(0).lower();
1147
1149 .legalFor({{S64, S32}, {S32, S16}})
1150 .narrowScalarFor({{S64, S16}}, changeTo(0, S32))
1151 .scalarize(0);
1152
1153 auto &FSubActions = getActionDefinitionsBuilder({G_FSUB, G_STRICT_FSUB});
1154 if (ST.has16BitInsts()) {
1155 FSubActions
1156 // Use actual fsub instruction
1157 .legalFor({S32, S16})
1158 // Must use fadd + fneg
1159 .lowerFor({S64, V2S16});
1160 } else {
1161 FSubActions
1162 // Use actual fsub instruction
1163 .legalFor({S32})
1164 // Must use fadd + fneg
1165 .lowerFor({S64, S16, V2S16});
1166 }
1167
1168 if (ST.hasPackedFP32Ops())
1169 FSubActions.lowerFor({V2S32}).clampMaxNumElements(0, S32, 2);
1170
1171 FSubActions
1172 .clampMaxNumElements(0, S16, 2)
1173 .scalarize(0)
1174 .clampScalar(0, S32, S64);
1175
1176 // Whether this is legal depends on the floating point mode for the function.
1177 auto &FMad = getActionDefinitionsBuilder(G_FMAD);
1178 if (ST.hasMadF16() && ST.hasMadMacF32Insts())
1179 FMad.customFor({S32, S16});
1180 else if (ST.hasMadMacF32Insts())
1181 FMad.customFor({S32});
1182 else if (ST.hasMadF16())
1183 FMad.customFor({S16});
1184 FMad.scalarize(0)
1185 .lower();
1186
1187 auto &FRem = getActionDefinitionsBuilder(G_FREM);
1188 if (ST.has16BitInsts()) {
1189 FRem.customFor({S16, S32, S64});
1190 } else {
1191 FRem.minScalar(0, S32)
1192 .customFor({S32, S64});
1193 }
1194 FRem.scalarize(0);
1195
1196 // TODO: Do we need to clamp maximum bitwidth?
1198 .legalIf(isScalar(0))
1199 .legalFor({{V2S16, V2S32}})
1200 .clampMaxNumElements(0, S16, 2)
1201 // Avoid scalarizing in cases that should be truly illegal. In unresolvable
1202 // situations (like an invalid implicit use), we don't want to infinite loop
1203 // in the legalizer.
1205 .alwaysLegal();
1206
1207 getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT})
1208 .legalFor({{S64, S32}, {S32, S16}, {S64, S16},
1209 {S32, S1}, {S64, S1}, {S16, S1}})
1210 .scalarize(0)
1211 .clampScalar(0, S32, S64)
1212 .widenScalarToNextPow2(1, 32);
1213
1214 // TODO: Split s1->s64 during regbankselect for VALU.
1215 auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
1216 .legalFor({{S32, S32}, {S64, S32}, {S16, S32}})
1217 .lowerIf(typeIs(1, S1))
1218 .customFor({{S32, S64}, {S64, S64}});
1219 if (ST.has16BitInsts())
1220 IToFP.legalFor({{S16, S16}});
1221 IToFP.clampScalar(1, S32, S64)
1222 .minScalar(0, S32)
1223 .scalarize(0)
1225
1226 auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
1227 .legalFor({{S32, S32}, {S32, S64}, {S32, S16}})
1228 .customFor({{S64, S32}, {S64, S64}})
1229 .narrowScalarFor({{S64, S16}}, changeTo(0, S32));
1230 if (ST.has16BitInsts())
1231 FPToI.legalFor({{S16, S16}});
1232 else
1233 FPToI.minScalar(1, S32);
1234
1235 FPToI.minScalar(0, S32)
1236 .widenScalarToNextPow2(0, 32)
1237 .scalarize(0)
1238 .lower();
1239
1240 // clang-format off
1241 auto &FPToISat = getActionDefinitionsBuilder({G_FPTOSI_SAT, G_FPTOUI_SAT})
1242 .legalFor({{S32, S32}, {S32, S64}})
1243 .legalFor(ST.has16BitInsts(),{{S16, S16}})
1244 .narrowScalarFor({{S64, S16}}, changeTo(0, S32));
1245
1246 // If available, widen width <16 to i16, instead of i32 so v_cvt_i16/u16_f16 can be used.
1247 if (ST.has16BitInsts())
1248 FPToISat.minScalarIf(typeIs(1, S16), 0, S16);
1249
1250 FPToISat.minScalar(1, S32);
1251 FPToISat.minScalar(0, S32)
1252 .widenScalarToNextPow2(0, 32)
1253 .scalarize(0)
1254 .lower();
1255 // clang-format on
1256
1257 getActionDefinitionsBuilder({G_LROUND, G_LLROUND})
1258 .clampScalar(0, S16, S64)
1259 .scalarize(0)
1260 .lower();
1261
1262 getActionDefinitionsBuilder(G_INTRINSIC_FPTRUNC_ROUND)
1263 .legalFor({S16, S32})
1264 .scalarize(0)
1265 .lower();
1266
1267 // Lower G_FNEARBYINT and G_FRINT into G_INTRINSIC_ROUNDEVEN
1268 getActionDefinitionsBuilder({G_INTRINSIC_ROUND, G_FRINT, G_FNEARBYINT})
1269 .scalarize(0)
1270 .lower();
1271
1272 getActionDefinitionsBuilder({G_INTRINSIC_LRINT, G_INTRINSIC_LLRINT})
1273 .clampScalar(0, S16, S64)
1274 .scalarize(0)
1275 .lower();
1276
1277 if (ST.has16BitInsts()) {
1278 getActionDefinitionsBuilder(
1279 {G_INTRINSIC_TRUNC, G_FCEIL, G_INTRINSIC_ROUNDEVEN})
1280 .legalFor({S16, S32, S64})
1281 .clampScalar(0, S16, S64)
1282 .scalarize(0);
1283 } else if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
1284 getActionDefinitionsBuilder(
1285 {G_INTRINSIC_TRUNC, G_FCEIL, G_INTRINSIC_ROUNDEVEN})
1286 .legalFor({S32, S64})
1287 .clampScalar(0, S32, S64)
1288 .scalarize(0);
1289 } else {
1290 getActionDefinitionsBuilder(
1291 {G_INTRINSIC_TRUNC, G_FCEIL, G_INTRINSIC_ROUNDEVEN})
1292 .legalFor({S32})
1293 .customFor({S64})
1294 .clampScalar(0, S32, S64)
1295 .scalarize(0);
1296 }
1297
1298 getActionDefinitionsBuilder(G_PTR_ADD)
1299 .unsupportedFor({BufferFatPtr, BufferStridedPtr, RsrcPtr})
1300 .legalIf(all(isPointer(0), sameSize(0, 1)))
1301 .scalarize(0)
1302 .scalarSameSizeAs(1, 0);
1303
1304 getActionDefinitionsBuilder(G_PTRMASK)
1305 .legalIf(all(sameSize(0, 1), typeInSet(1, {S64, S32})))
1306 .scalarSameSizeAs(1, 0)
1307 .scalarize(0);
1308
1309 auto &CmpBuilder =
1310 getActionDefinitionsBuilder(G_ICMP)
1311 // The compare output type differs based on the register bank of the output,
1312 // so make both s1 and s32 legal.
1313 //
1314 // Scalar compares producing output in scc will be promoted to s32, as that
1315 // is the allocatable register type that will be needed for the copy from
1316 // scc. This will be promoted during RegBankSelect, and we assume something
1317 // before that won't try to use s32 result types.
1318 //
1319 // Vector compares producing an output in vcc/SGPR will use s1 in VCC reg
1320 // bank.
1322 {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
1323 .legalForCartesianProduct(
1324 {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr});
1325 if (ST.has16BitInsts()) {
1326 CmpBuilder.legalFor({{S1, S16}});
1327 }
1328
1329 CmpBuilder
1331 .clampScalar(1, S32, S64)
1332 .scalarize(0)
1333 .legalIf(all(typeInSet(0, {S1, S32}), isPointer(1)));
1334
1335 auto &FCmpBuilder =
1336 getActionDefinitionsBuilder(G_FCMP).legalForCartesianProduct(
1337 {S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase);
1338
1339 if (ST.hasSALUFloatInsts())
1340 FCmpBuilder.legalForCartesianProduct({S32}, {S16, S32});
1341
1342 FCmpBuilder
1344 .clampScalar(1, S32, S64)
1345 .scalarize(0);
1346
1347 // FIXME: fpow has a selection pattern that should move to custom lowering.
1348 auto &ExpOps = getActionDefinitionsBuilder(G_FPOW);
1349 if (ST.has16BitInsts())
1350 ExpOps.customFor({{S32}, {S16}});
1351 else
1352 ExpOps.customFor({S32});
1353 ExpOps.clampScalar(0, MinScalarFPTy, S32)
1354 .scalarize(0);
1355
1356 getActionDefinitionsBuilder(G_FPOWI)
1357 .clampScalar(0, MinScalarFPTy, S32)
1358 .lower();
1359
1360 getActionDefinitionsBuilder(G_FLOG2)
1361 .legalFor(ST.has16BitInsts(), {S16})
1362 .customFor({S32, S16})
1363 .scalarize(0)
1364 .lower();
1365
1366 getActionDefinitionsBuilder(G_FEXP2)
1367 .legalFor(ST.has16BitInsts(), {S16})
1368 .customFor({S32, S64, S16})
1369 .scalarize(0)
1370 .lower();
1371
1372 auto &LogOps =
1373 getActionDefinitionsBuilder({G_FLOG, G_FLOG10, G_FEXP, G_FEXP10});
1374 LogOps.customFor({S32, S16, S64});
1375 LogOps.clampScalar(0, MinScalarFPTy, S32)
1376 .scalarize(0);
1377
1378 // The 64-bit versions produce 32-bit results, but only on the SALU.
1379 getActionDefinitionsBuilder(G_CTPOP)
1380 .legalFor({{S32, S32}, {S32, S64}})
1381 .clampScalar(0, S32, S32)
1382 .widenScalarToNextPow2(1, 32)
1383 .clampScalar(1, S32, S64)
1384 .scalarize(0)
1385 .widenScalarToNextPow2(0, 32);
1386
1387 // If no 16 bit instr is available, lower into different instructions.
1388 if (ST.has16BitInsts())
1389 getActionDefinitionsBuilder(G_IS_FPCLASS)
1390 .legalForCartesianProduct({S1}, FPTypes16)
1391 .widenScalarToNextPow2(1)
1392 .scalarize(0)
1393 .lower();
1394 else
1395 getActionDefinitionsBuilder(G_IS_FPCLASS)
1396 .legalForCartesianProduct({S1}, FPTypesBase)
1397 .lowerFor({S1, S16})
1398 .widenScalarToNextPow2(1)
1399 .scalarize(0)
1400 .lower();
1401
1402 // The hardware instructions return a different result on 0 than the generic
1403 // instructions expect. The hardware produces -1, but these produce the
1404 // bitwidth.
1405 getActionDefinitionsBuilder({G_CTLZ, G_CTTZ})
1406 .scalarize(0)
1407 .clampScalar(0, S32, S32)
1408 .clampScalar(1, S32, S64)
1409 .widenScalarToNextPow2(0, 32)
1410 .widenScalarToNextPow2(1, 32)
1411 .custom();
1412
1413 // The 64-bit versions produce 32-bit results, but only on the SALU.
1414 getActionDefinitionsBuilder(G_CTLZ_ZERO_POISON)
1415 .legalFor({{S32, S32}, {S32, S64}})
1416 .customIf(scalarNarrowerThan(1, 32))
1417 .clampScalar(0, S32, S32)
1418 .clampScalar(1, S32, S64)
1419 .scalarize(0)
1420 .widenScalarToNextPow2(0, 32)
1421 .widenScalarToNextPow2(1, 32);
1422
1423 getActionDefinitionsBuilder(G_CTTZ_ZERO_POISON)
1424 .legalFor({{S32, S32}, {S32, S64}})
1425 .clampScalar(0, S32, S32)
1426 .clampScalar(1, S32, S64)
1427 .scalarize(0)
1428 .widenScalarToNextPow2(0, 32)
1429 .widenScalarToNextPow2(1, 32);
1430
1431 getActionDefinitionsBuilder(G_CTLS)
1432 .customFor({{S32, S32}})
1433 .scalarize(0)
1434 .clampScalar(0, S32, S32)
1435 .clampScalar(1, S32, S32);
1436
1437 // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
1438 // RegBankSelect.
1439 getActionDefinitionsBuilder(G_BITREVERSE)
1440 .legalFor({S32, S64})
1441 .clampScalar(0, S32, S64)
1442 .scalarize(0)
1443 .widenScalarToNextPow2(0);
1444
1445 if (ST.has16BitInsts()) {
1446 getActionDefinitionsBuilder(G_BSWAP)
1447 .legalFor({S16, S32, V2S16})
1448 .clampMaxNumElementsStrict(0, S16, 2)
1449 // FIXME: Fixing non-power-of-2 before clamp is workaround for
1450 // narrowScalar limitation.
1451 .widenScalarToNextPow2(0)
1452 .clampScalar(0, S16, S32)
1453 .scalarize(0);
1454
1455 if (ST.hasVOP3PInsts()) {
1456 getActionDefinitionsBuilder(G_ABS)
1457 .legalFor({S32, S16, V2S16})
1458 .clampMaxNumElements(0, S16, 2)
1459 .minScalar(0, S16)
1460 .widenScalarToNextPow2(0)
1461 .scalarize(0)
1462 .lower();
1463 if (ST.hasMinMaxI64Insts()) {
1464 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
1465 .legalFor({S32, S16, S64, V2S16})
1466 .clampMaxNumElements(0, S16, 2)
1467 .minScalar(0, S16)
1468 .widenScalarToNextPow2(0)
1469 .scalarize(0)
1470 .lower();
1471 } else {
1472 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
1473 .legalFor({S32, S16, V2S16})
1474 .clampMaxNumElements(0, S16, 2)
1475 .minScalar(0, S16)
1476 .widenScalarToNextPow2(0)
1477 .scalarize(0)
1478 .lower();
1479 }
1480 } else {
1481 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
1482 .legalFor({S32, S16})
1483 .widenScalarToNextPow2(0)
1484 .minScalar(0, S16)
1485 .scalarize(0)
1486 .lower();
1487 }
1488 } else {
1489 // TODO: Should have same legality without v_perm_b32
1490 getActionDefinitionsBuilder(G_BSWAP)
1491 .legalFor({S32})
1492 .lowerIf(scalarNarrowerThan(0, 32))
1493 // FIXME: Fixing non-power-of-2 before clamp is workaround for
1494 // narrowScalar limitation.
1495 .widenScalarToNextPow2(0)
1496 .maxScalar(0, S32)
1497 .scalarize(0)
1498 .lower();
1499
1500 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
1501 .legalFor({S32})
1502 .minScalar(0, S32)
1503 .widenScalarToNextPow2(0)
1504 .scalarize(0)
1505 .lower();
1506 }
1507
1508 getActionDefinitionsBuilder(G_INTTOPTR)
1509 // List the common cases
1510 .legalForCartesianProduct(AddrSpaces64, {S64})
1511 .legalForCartesianProduct(AddrSpaces32, {S32})
1512 .scalarize(0)
1513 // Accept any address space as long as the size matches
1514 .legalIf(sameSize(0, 1))
1515 .widenScalarIf(smallerThan(1, 0),
1516 [](const LegalityQuery &Query) {
1517 return std::pair(
1518 1, LLT::scalar(Query.Types[0].getSizeInBits()));
1519 })
1520 .narrowScalarIf(largerThan(1, 0), [](const LegalityQuery &Query) {
1521 return std::pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
1522 });
1523
1524 getActionDefinitionsBuilder(G_PTRTOINT)
1525 // List the common cases
1526 .legalForCartesianProduct(AddrSpaces64, {S64})
1527 .legalForCartesianProduct(AddrSpaces32, {S32})
1528 .scalarize(0)
1529 // Accept any address space as long as the size matches
1530 .legalIf(sameSize(0, 1))
1531 .widenScalarIf(smallerThan(0, 1),
1532 [](const LegalityQuery &Query) {
1533 return std::pair(
1534 0, LLT::scalar(Query.Types[1].getSizeInBits()));
1535 })
1536 .narrowScalarIf(largerThan(0, 1), [](const LegalityQuery &Query) {
1537 return std::pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
1538 });
1539
1540 getActionDefinitionsBuilder(G_ADDRSPACE_CAST)
1541 .scalarize(0)
1542 .custom();
1543
1544 const auto needToSplitMemOp = [=](const LegalityQuery &Query,
1545 bool IsLoad) -> bool {
1546 const LLT DstTy = Query.Types[0];
1547
1548 // Split vector extloads.
1549 unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
1550
1551 if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize)
1552 return true;
1553
1554 const LLT PtrTy = Query.Types[1];
1555 unsigned AS = PtrTy.getAddressSpace();
1556 if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad,
1557 Query.MMODescrs[0].Ordering !=
1559 return true;
1560
1561 // Catch weird sized loads that don't evenly divide into the access sizes
1562 // TODO: May be able to widen depending on alignment etc.
1563 unsigned NumRegs = (MemSize + 31) / 32;
1564 if (NumRegs == 3) {
1565 if (!ST.hasDwordx3LoadStores())
1566 return true;
1567 } else {
1568 // If the alignment allows, these should have been widened.
1569 if (!isPowerOf2_32(NumRegs))
1570 return true;
1571 }
1572
1573 return false;
1574 };
1575
1576 unsigned GlobalAlign32 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 32;
1577 unsigned GlobalAlign16 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 16;
1578 unsigned GlobalAlign8 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 8;
1579
1580 // TODO: Refine based on subtargets which support unaligned access or 128-bit
1581 // LDS
1582 // TODO: Unsupported flat for SI.
1583
1584 for (unsigned Op : {G_LOAD, G_STORE}) {
1585 const bool IsStore = Op == G_STORE;
1586
1587 auto &Actions = getActionDefinitionsBuilder(Op);
1588 // Explicitly list some common cases.
1589 // TODO: Does this help compile time at all?
1590 Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, S32, GlobalAlign32},
1591 {V2S32, GlobalPtr, V2S32, GlobalAlign32},
1592 {V4S32, GlobalPtr, V4S32, GlobalAlign32},
1593 {S64, GlobalPtr, S64, GlobalAlign32},
1594 {V2S64, GlobalPtr, V2S64, GlobalAlign32},
1595 {V2S16, GlobalPtr, V2S16, GlobalAlign32},
1596 {S32, GlobalPtr, S8, GlobalAlign8},
1597 {S32, GlobalPtr, S16, GlobalAlign16},
1598
1599 {S32, LocalPtr, S32, 32},
1600 {S64, LocalPtr, S64, 32},
1601 {V2S32, LocalPtr, V2S32, 32},
1602 {S32, LocalPtr, S8, 8},
1603 {S32, LocalPtr, S16, 16},
1604 {V2S16, LocalPtr, S32, 32},
1605
1606 {S32, PrivatePtr, S32, 32},
1607 {S32, PrivatePtr, S8, 8},
1608 {S32, PrivatePtr, S16, 16},
1609 {V2S16, PrivatePtr, S32, 32},
1610
1611 {S32, ConstantPtr, S32, GlobalAlign32},
1612 {V2S32, ConstantPtr, V2S32, GlobalAlign32},
1613 {V4S32, ConstantPtr, V4S32, GlobalAlign32},
1614 {S64, ConstantPtr, S64, GlobalAlign32},
1615 {V2S32, ConstantPtr, V2S32, GlobalAlign32}});
1616
1617 Actions.legalForTypesWithMemDesc(ST.useRealTrue16Insts(), /* Pred */
1618 {{S16, GlobalPtr, S8, GlobalAlign8},
1619 {S16, GlobalPtr, S16, GlobalAlign16},
1620 {S16, LocalPtr, S8, 8},
1621 {S16, LocalPtr, S16, 16},
1622 {S16, PrivatePtr, S8, 8},
1623 {S16, PrivatePtr, S16, 16}});
1624
1625 Actions.legalIf(
1626 [=](const LegalityQuery &Query) -> bool {
1627 return isLoadStoreLegal(ST, Query);
1628 });
1629
1630 // The custom pointers (fat pointers, buffer resources) don't work with load
1631 // and store at this level. Fat pointers should have been lowered to
1632 // intrinsics before the translation to MIR.
1633 Actions.unsupportedIf(
1634 typeInSet(1, {BufferFatPtr, BufferStridedPtr, RsrcPtr}));
1635
1636 // Address space 8 pointers are handled by a 4xs32 load, bitcast, and
1637 // ptrtoint. This is needed to account for the fact that we can't have i128
1638 // as a register class for SelectionDAG reasons.
1639 Actions.customIf([=](const LegalityQuery &Query) -> bool {
1640 return hasBufferRsrcWorkaround(Query.Types[0]);
1641 });
1642
1643 // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to
1644 // 64-bits.
1645 //
1646 // TODO: Should generalize bitcast action into coerce, which will also cover
1647 // inserting addrspacecasts.
1648 Actions.customIf(typeIs(1, Constant32Ptr));
1649
1650 // Turn any illegal element vectors into something easier to deal
1651 // with. These will ultimately produce 32-bit scalar shifts to extract the
1652 // parts anyway.
1653 //
1654 // For odd 16-bit element vectors, prefer to split those into pieces with
1655 // 16-bit vector parts.
1656 Actions.bitcastIf(
1657 [=](const LegalityQuery &Query) -> bool {
1658 return shouldBitcastLoadStoreType(ST, Query.Types[0],
1659 Query.MMODescrs[0].MemoryTy);
1660 }, bitcastToRegisterType(0));
1661
1662 if (!IsStore) {
1663 // Widen suitably aligned loads by loading extra bytes. The standard
1664 // legalization actions can't properly express widening memory operands.
1665 Actions.customIf([=](const LegalityQuery &Query) -> bool {
1666 return shouldWidenLoad(ST, Query, G_LOAD);
1667 });
1668 }
1669
1670 // FIXME: load/store narrowing should be moved to lower action
1671 Actions
1672 .narrowScalarIf(
1673 [=](const LegalityQuery &Query) -> bool {
1674 return !Query.Types[0].isVector() &&
1675 needToSplitMemOp(Query, Op == G_LOAD);
1676 },
1677 [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
1678 const LLT DstTy = Query.Types[0];
1679 const LLT PtrTy = Query.Types[1];
1680
1681 const unsigned DstSize = DstTy.getSizeInBits();
1682 unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
1683
1684 // Split extloads.
1685 if (DstSize > MemSize)
1686 return std::pair(0, LLT::scalar(MemSize));
1687
1688 unsigned MaxSize = maxSizeForAddrSpace(
1689 ST, PtrTy.getAddressSpace(), Op == G_LOAD,
1690 Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic);
1691 if (MemSize > MaxSize)
1692 return std::pair(0, LLT::scalar(MaxSize));
1693
1694 uint64_t Align = Query.MMODescrs[0].AlignInBits;
1695 return std::pair(0, LLT::scalar(Align));
1696 })
1697 .fewerElementsIf(
1698 [=](const LegalityQuery &Query) -> bool {
1699 return Query.Types[0].isVector() &&
1700 needToSplitMemOp(Query, Op == G_LOAD);
1701 },
1702 [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
1703 const LLT DstTy = Query.Types[0];
1704 const LLT PtrTy = Query.Types[1];
1705
1706 LLT EltTy = DstTy.getElementType();
1707 unsigned MaxSize = maxSizeForAddrSpace(
1708 ST, PtrTy.getAddressSpace(), Op == G_LOAD,
1709 Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic);
1710
1711 // FIXME: Handle widened to power of 2 results better. This ends
1712 // up scalarizing.
1713 // FIXME: 3 element stores scalarized on SI
1714
1715 // Split if it's too large for the address space.
1716 unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
1717 if (MemSize > MaxSize) {
1718 unsigned NumElts = DstTy.getNumElements();
1719 unsigned EltSize = EltTy.getSizeInBits();
1720
1721 if (MaxSize % EltSize == 0) {
1722 return std::pair(
1724 ElementCount::getFixed(MaxSize / EltSize), EltTy));
1725 }
1726
1727 unsigned NumPieces = MemSize / MaxSize;
1728
1729 // FIXME: Refine when odd breakdowns handled
1730 // The scalars will need to be re-legalized.
1731 if (NumPieces == 1 || NumPieces >= NumElts ||
1732 NumElts % NumPieces != 0)
1733 return std::pair(0, EltTy);
1734
1735 return std::pair(0,
1736 LLT::fixed_vector(NumElts / NumPieces, EltTy));
1737 }
1738
1739 // FIXME: We could probably handle weird extending loads better.
1740 if (DstTy.getSizeInBits() > MemSize)
1741 return std::pair(0, EltTy);
1742
1743 unsigned EltSize = EltTy.getSizeInBits();
1744 unsigned DstSize = DstTy.getSizeInBits();
1745 if (!isPowerOf2_32(DstSize)) {
1746 // We're probably decomposing an odd sized store. Try to split
1747 // to the widest type. TODO: Account for alignment. As-is it
1748 // should be OK, since the new parts will be further legalized.
1749 unsigned FloorSize = llvm::bit_floor(DstSize);
1750 return std::pair(
1752 ElementCount::getFixed(FloorSize / EltSize), EltTy));
1753 }
1754
1755 // May need relegalization for the scalars.
1756 return std::pair(0, EltTy);
1757 })
1758 .minScalar(0, S32)
1759 .narrowScalarIf(isTruncStoreToSizePowerOf2(0),
1761 .widenScalarToNextPow2(0)
1762 .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0))
1763 .lower();
1764 }
1765
1766 // FIXME: Unaligned accesses not lowered.
1767 auto &ExtLoads =
1768 getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD})
1769 .legalForTypesWithMemDesc({{S32, GlobalPtr, S8, 8},
1770 {S32, GlobalPtr, S16, 2 * 8},
1771 {S32, LocalPtr, S8, 8},
1772 {S32, LocalPtr, S16, 16},
1773 {S32, PrivatePtr, S8, 8},
1774 {S32, PrivatePtr, S16, 16},
1775 {S32, ConstantPtr, S8, 8},
1776 {S32, ConstantPtr, S16, 2 * 8}})
1777 .legalForTypesWithMemDesc(ST.useRealTrue16Insts(),
1778 {{S16, GlobalPtr, S8, GlobalAlign8},
1779 {S16, LocalPtr, S8, GlobalAlign8},
1780 {S16, PrivatePtr, S8, GlobalAlign8},
1781 {S16, ConstantPtr, S8, GlobalAlign8}})
1782 .legalIf([=](const LegalityQuery &Query) -> bool {
1783 return isLoadStoreLegal(ST, Query);
1784 });
1785
1786 if (ST.hasFlatAddressSpace()) {
1787 ExtLoads.legalForTypesWithMemDesc(
1788 {{S32, FlatPtr, S8, 8}, {S32, FlatPtr, S16, 16}});
1789
1790 ExtLoads.legalForTypesWithMemDesc(ST.useRealTrue16Insts(),
1791 {{S16, FlatPtr, S8, GlobalAlign8}});
1792 }
1793
1794 // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to
1795 // 64-bits.
1796 //
1797 // TODO: Should generalize bitcast action into coerce, which will also cover
1798 // inserting addrspacecasts.
1799 ExtLoads.customIf(typeIs(1, Constant32Ptr));
1800
1801 ExtLoads.narrowScalarIf(
1802 [](const LegalityQuery &Query) {
1803 LLT MemTy = Query.MMODescrs[0].MemoryTy;
1804 return MemTy.isAnyScalar() && MemTy.getSizeInBits() > 32 &&
1805 Query.Types[0].getSizeInBits() > MemTy.getSizeInBits();
1806 }, // For large MemSize, narrowscalar to MemSize (load MemSize + ext)
1808 ExtLoads.clampScalar(0, S32, S32)
1809 .widenScalarToNextPow2(0)
1810 .lower();
1811
1812 auto &Atomics = getActionDefinitionsBuilder(
1813 {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB,
1814 G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
1815 G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
1816 G_ATOMICRMW_UMIN, G_ATOMICRMW_UINC_WRAP, G_ATOMICRMW_UDEC_WRAP})
1817 .legalFor({{S32, GlobalPtr}, {S32, LocalPtr},
1818 {S64, GlobalPtr}, {S64, LocalPtr},
1819 {S32, RegionPtr}, {S64, RegionPtr}});
1820 if (ST.hasFlatAddressSpace()) {
1821 Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}});
1822 }
1823
1824 auto &Atomics32 =
1825 getActionDefinitionsBuilder({G_ATOMICRMW_USUB_COND, G_ATOMICRMW_USUB_SAT})
1826 .legalFor({{S32, GlobalPtr}, {S32, LocalPtr}, {S32, RegionPtr}});
1827 if (ST.hasFlatAddressSpace()) {
1828 Atomics32.legalFor({{S32, FlatPtr}});
1829 }
1830
1831 // TODO: v2bf16 operations, and fat buffer pointer support.
1832 auto &Atomic = getActionDefinitionsBuilder(G_ATOMICRMW_FADD);
1833 if (ST.hasLDSFPAtomicAddF32()) {
1834 Atomic.legalFor({{S32, LocalPtr}, {S32, RegionPtr}});
1835 if (ST.hasLdsAtomicAddF64())
1836 Atomic.legalFor({{S64, LocalPtr}});
1837 if (ST.hasAtomicDsPkAdd16Insts())
1838 Atomic.legalFor({{V2F16, LocalPtr}, {V2BF16, LocalPtr}});
1839 }
1840 if (ST.hasAtomicFaddInsts())
1841 Atomic.legalFor({{S32, GlobalPtr}});
1842 if (ST.hasFlatAtomicFaddF32Inst())
1843 Atomic.legalFor({{S32, FlatPtr}});
1844
1845 if (ST.hasGFX90AInsts() || ST.hasGFX1250Insts()) {
1846 // These are legal with some caveats, and should have undergone expansion in
1847 // the IR in most situations
1848 // TODO: Move atomic expansion into legalizer
1849 Atomic.legalFor({
1850 {S32, GlobalPtr},
1851 {S64, GlobalPtr},
1852 {S64, FlatPtr}
1853 });
1854 }
1855
1856 if (ST.hasAtomicBufferGlobalPkAddF16NoRtnInsts() ||
1857 ST.hasAtomicBufferGlobalPkAddF16Insts())
1858 Atomic.legalFor({{V2F16, GlobalPtr}, {V2F16, BufferFatPtr}});
1859 if (ST.hasAtomicGlobalPkAddBF16Inst())
1860 Atomic.legalFor({{V2BF16, GlobalPtr}});
1861 if (ST.hasAtomicFlatPkAdd16Insts())
1862 Atomic.legalFor({{V2F16, FlatPtr}, {V2BF16, FlatPtr}});
1863
1864
1865 // Most of the legalization work here is done by AtomicExpand. We could
1866 // probably use a simpler legality rule that just assumes anything is OK.
1867 auto &AtomicFMinFMax =
1868 getActionDefinitionsBuilder({G_ATOMICRMW_FMIN, G_ATOMICRMW_FMAX})
1869 .legalFor({{F32, LocalPtr}, {F64, LocalPtr}});
1870
 // Every entry is a {value type, pointer type} pair and must sit inside the
 // outer list braces. The flat cases used single braces, i.e. a flat list of
 // two types rather than one pair, which presumably matched on the value type
 // alone instead of requiring a flat pointer.
 // NOTE(review): this tightens legality; confirm no target relied on the
 // looser rule.
1871 if (ST.hasAtomicFMinFMaxF32GlobalInsts())
1872 AtomicFMinFMax.legalFor({{F32, GlobalPtr}, {F32, BufferFatPtr}});
1873 if (ST.hasAtomicFMinFMaxF64GlobalInsts())
1874 AtomicFMinFMax.legalFor({{F64, GlobalPtr}, {F64, BufferFatPtr}});
1875 if (ST.hasAtomicFMinFMaxF32FlatInsts())
1876 AtomicFMinFMax.legalFor({{F32, FlatPtr}});
1877 if (ST.hasAtomicFMinFMaxF64FlatInsts())
1878 AtomicFMinFMax.legalFor({{F64, FlatPtr}});
1879
1880 // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling, and output
1881 // demarshalling
1882 getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG)
1883 .customFor({{S32, GlobalPtr}, {S64, GlobalPtr},
1884 {S32, FlatPtr}, {S64, FlatPtr}})
1885 .legalFor({{S32, LocalPtr}, {S64, LocalPtr},
1886 {S32, RegionPtr}, {S64, RegionPtr}});
1887 // TODO: Pointer types, any 32-bit or 64-bit vector
1888
1889 // Condition should be s32 for scalar, s1 for vector.
1890 getActionDefinitionsBuilder(G_SELECT)
1891 .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16, GlobalPtr,
1892 LocalPtr, FlatPtr, PrivatePtr,
1893 LLT::fixed_vector(2, LocalPtr),
1894 LLT::fixed_vector(2, PrivatePtr)},
1895 {S1, S32})
1896 .clampScalar(0, S16, S64)
1897 .scalarize(1)
1898 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
1899 .fewerElementsIf(numElementsNotEven(0), scalarize(0))
1900 .clampMaxNumElements(0, S32, 2)
1901 .clampMaxNumElements(0, LocalPtr, 2)
1902 .clampMaxNumElements(0, PrivatePtr, 2)
1903 .scalarize(0)
1904 .widenScalarToNextPow2(0)
1905 .legalIf(all(isPointer(0), typeInSet(1, {S1, S32})));
1906
1907 // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can
1908 // be more flexible with the shift amount type.
1909 auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR})
1910 .legalFor({{S32, S32}, {S64, S32}});
1911 if (ST.has16BitInsts()) {
1912 if (ST.hasVOP3PInsts()) {
1913 Shifts.legalFor({{S16, S16}, {V2S16, V2S16}})
1914 .clampMaxNumElements(0, S16, 2);
1915 } else
1916 Shifts.legalFor({{S16, S16}});
1917
1918 // TODO: Support 16-bit shift amounts for all types
1919 Shifts.widenScalarIf(
1920 [=](const LegalityQuery &Query) {
1921 // Use 16-bit shift amounts for any 16-bit shift. Otherwise we want a
1922 // 32-bit amount.
1923 const LLT ValTy = Query.Types[0];
1924 const LLT AmountTy = Query.Types[1];
1925 return ValTy.isScalar() && ValTy.getSizeInBits() <= 16 &&
1926 AmountTy.getSizeInBits() < 16;
1927 }, changeTo(1, S16));
1928 Shifts.maxScalarIf(typeIs(0, S16), 1, S16);
1929 Shifts.clampScalar(1, S32, S32);
1930 Shifts.widenScalarToNextPow2(0, 16);
1931 Shifts.clampScalar(0, S16, S64);
1932
1933 getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
1934 .minScalar(0, S16)
1935 .scalarize(0)
1936 .lower();
1937 } else {
1938 // Make sure we legalize the shift amount type first, as the general
1939 // expansion for the shifted type will produce much worse code if it hasn't
1940 // been truncated already.
1941 Shifts.clampScalar(1, S32, S32);
1942 Shifts.widenScalarToNextPow2(0, 32);
1943 Shifts.clampScalar(0, S32, S64);
1944
1945 getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
1946 .minScalar(0, S32)
1947 .scalarize(0)
1948 .lower();
1949 }
1950 Shifts.scalarize(0);
1951
1952 for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) {
1953 unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0;
1954 unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1;
1955 unsigned IdxTypeIdx = 2;
1956
1957 getActionDefinitionsBuilder(Op)
1958 .customIf([=](const LegalityQuery &Query) {
1959 const LLT EltTy = Query.Types[EltTypeIdx];
1960 const LLT VecTy = Query.Types[VecTypeIdx];
1961 const LLT IdxTy = Query.Types[IdxTypeIdx];
1962 const unsigned EltSize = EltTy.getSizeInBits();
1963 const bool isLegalVecType =
1965 // Address space 8 pointers are 128-bit wide values, but the logic
1966 // below will try to bitcast them to 2N x s64, which will fail.
1967 // Therefore, as an intermediate step, wrap extracts/insertions by
1968 // ptrtoint-ing the vector and scalar arguments (or inttoptr-ing the
1969 // extraction result) in order to produce a vector operation that can
1970 // be handled by the logic below.
1971 if (EltTy.isPointer() && EltSize > 64)
1972 return true;
1973 return (EltSize == 32 || EltSize == 64) &&
1974 VecTy.getSizeInBits() % 32 == 0 &&
1975 VecTy.getSizeInBits() <= MaxRegisterSize &&
1976 IdxTy.getSizeInBits() == 32 &&
1977 isLegalVecType;
1978 })
1979 .bitcastIf(all(sizeIsMultipleOf32(VecTypeIdx),
1980 scalarOrEltNarrowerThan(VecTypeIdx, 32)),
1981 bitcastToVectorElement32(VecTypeIdx))
1982 //.bitcastIf(vectorSmallerThan(1, 32), bitcastToScalar(1))
1983 .bitcastIf(all(sizeIsMultipleOf32(VecTypeIdx),
1984 scalarOrEltWiderThan(VecTypeIdx, 64)),
1985 [=](const LegalityQuery &Query) {
1986 // For > 64-bit element types, try to turn this into a
1987 // 64-bit element vector since we may be able to do better
1988 // indexing if this is scalar. If not, fall back to 32.
1989 const LLT EltTy = Query.Types[EltTypeIdx];
1990 const LLT VecTy = Query.Types[VecTypeIdx];
1991 const unsigned DstEltSize = EltTy.getSizeInBits();
1992 const unsigned VecSize = VecTy.getSizeInBits();
1993
1994 const unsigned TargetEltSize =
1995 DstEltSize % 64 == 0 ? 64 : 32;
1996 return std::pair(VecTypeIdx,
1997 LLT::fixed_vector(VecSize / TargetEltSize,
1998 TargetEltSize));
1999 })
2000 .clampScalar(EltTypeIdx, S32, S64)
2001 .clampScalar(VecTypeIdx, S32, S64)
2002 .clampScalar(IdxTypeIdx, S32, S32)
2003 .clampMaxNumElements(VecTypeIdx, S32, 32)
2004 // TODO: Clamp elements for 64-bit vectors?
2005 .moreElementsIf(isIllegalRegisterType(ST, VecTypeIdx),
2007 // It should only be necessary with variable indexes.
2008 // As a last resort, lower to the stack
2009 .lower();
2010 }
2011
2012 getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
2013 .unsupportedIf([=](const LegalityQuery &Query) {
2014 const LLT &EltTy = Query.Types[1].getElementType();
2015 return Query.Types[0] != EltTy;
2016 });
2017
2018 for (unsigned Op : {G_EXTRACT, G_INSERT}) {
2019 unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0;
2020 unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1;
2021 getActionDefinitionsBuilder(Op)
2022 .widenScalarIf(
2023 [=](const LegalityQuery &Query) {
2024 const LLT BigTy = Query.Types[BigTyIdx];
2025 return (BigTy.getScalarSizeInBits() < 16);
2026 },
2028 .widenScalarIf(
2029 [=](const LegalityQuery &Query) {
2030 const LLT LitTy = Query.Types[LitTyIdx];
2031 return (LitTy.getScalarSizeInBits() < 16);
2032 },
2034 .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
2035 .widenScalarToNextPow2(BigTyIdx, 32)
2036 .customIf([=](const LegalityQuery &Query) {
2037 // Generic lower operates on the full-width value, producing
2038 // shift+trunc/mask sequences. For simple cases where extract/insert
2039 // values are 32-bit aligned, we can instead unmerge/merge and work on
2040 // the 32-bit components. However, we can't check the offset here so
2041 // custom lower function will have to call generic lowering if offset
2042 // is not 32-bit aligned.
2043 const LLT BigTy = Query.Types[BigTyIdx];
2044 const LLT LitTy = Query.Types[LitTyIdx];
2045 return !BigTy.isVector() && BigTy.getSizeInBits() % 32 == 0 &&
2046 LitTy.getSizeInBits() % 32 == 0;
2047 })
2048 .lower();
2049 }
2050
2051 auto &BuildVector =
2052 getActionDefinitionsBuilder(G_BUILD_VECTOR)
2053 .legalForCartesianProduct(AllS32Vectors, {S32})
2054 .legalForCartesianProduct(AllS64Vectors, {S64})
2055 .clampNumElements(0, V16S32, V32S32)
2056 .clampNumElements(0, V2S64, V16S64)
2057 .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16))
2058 .moreElementsIf(isIllegalRegisterType(ST, 0),
2060
2061 if (ST.hasScalarPackInsts()) {
2062 BuildVector
2063 // FIXME: Should probably widen s1 vectors straight to s32
2064 .minScalarOrElt(0, S16)
2065 .minScalar(1, S16);
2066
2067 getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
2068 .legalFor({V2S16, S32})
2069 .lower();
2070 } else {
2071 BuildVector.customFor({V2S16, S16});
2072 BuildVector.minScalarOrElt(0, S32);
2073
2074 getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
2075 .customFor({V2S16, S32})
2076 .lower();
2077 }
2078
2079 BuildVector.legalIf(isRegisterType(ST, 0));
2080
2081 // FIXME: Clamp maximum size
2082 getActionDefinitionsBuilder(G_CONCAT_VECTORS)
2083 .legalIf(all(isRegisterType(ST, 0), isRegisterType(ST, 1)))
2084 .clampMaxNumElements(0, S32, 32)
2085 .clampMaxNumElements(1, S16, 2) // TODO: Make 4?
2086 .clampMaxNumElements(0, S16, 64);
2087
2088 getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower();
2089
2090 // Merge/Unmerge
2091 for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
2092 unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
2093 unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;
2094
2095 auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
2096 const LLT Ty = Query.Types[TypeIdx];
2097 if (Ty.isVector()) {
2098 const LLT &EltTy = Ty.getElementType();
2099 if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 512)
2100 return true;
2102 return true;
2103 }
2104 return false;
2105 };
2106
2107 auto &Builder =
2108 getActionDefinitionsBuilder(Op)
2109 .legalIf(all(isRegisterType(ST, 0), isRegisterType(ST, 1)))
2110 .lowerFor({{S16, V2S16}})
2111 .lowerIf([=](const LegalityQuery &Query) {
2112 const LLT BigTy = Query.Types[BigTyIdx];
2113 return BigTy.getSizeInBits() == 32;
2114 })
2115 // Try to widen to s16 first for small types.
2116 // TODO: Only do this on targets with legal s16 shifts
2117 .minScalarOrEltIf(scalarNarrowerThan(LitTyIdx, 16), LitTyIdx, S16)
2118 .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16)
2119 .moreElementsIf(isSmallOddVector(BigTyIdx),
2120 oneMoreElement(BigTyIdx))
2121 .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32),
2122 elementTypeIs(1, S16)),
2123 changeTo(1, V2S16))
2124 // Clamp the little scalar to s32-s512 and make it a power of 2. It's
2125 // not worth considering the multiples of 64 since 2*192 and 2*384
2126 // are not valid.
2127 .clampScalar(LitTyIdx, S32, S512)
2128 .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32)
2129 // Break up vectors with weird elements into scalars
2130 .fewerElementsIf(
2131 [=](const LegalityQuery &Query) {
2132 return notValidElt(Query, LitTyIdx);
2133 },
2134 scalarize(0))
2135 .fewerElementsIf(
2136 [=](const LegalityQuery &Query) {
2137 return notValidElt(Query, BigTyIdx);
2138 },
2139 scalarize(1))
2140 .clampScalar(BigTyIdx, S32, MaxScalar);
2141
2142 if (Op == G_MERGE_VALUES) {
2143 Builder.widenScalarIf(
2144 // TODO: Use 16-bit shifts if legal for 8-bit values?
2145 [=](const LegalityQuery &Query) {
2146 const LLT Ty = Query.Types[LitTyIdx];
2147 return Ty.getSizeInBits() < 32;
2148 },
2149 changeTo(LitTyIdx, S32));
2150 }
2151
2152 Builder.widenScalarIf(
2153 [=](const LegalityQuery &Query) {
2154 const LLT Ty = Query.Types[BigTyIdx];
2155 return Ty.getSizeInBits() % 16 != 0;
2156 },
2157 [=](const LegalityQuery &Query) {
2158 // Pick the next power of 2, or a multiple of 64 over 128.
2159 // Whichever is smaller.
2160 const LLT &Ty = Query.Types[BigTyIdx];
2161 unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1);
2162 if (NewSizeInBits >= 256) {
2163 unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1);
2164 if (RoundedTo < NewSizeInBits)
2165 NewSizeInBits = RoundedTo;
2166 }
2167 return std::pair(BigTyIdx, LLT::scalar(NewSizeInBits));
2168 })
2169 // Any vectors left are the wrong size. Scalarize them.
2170 .scalarize(0)
2171 .scalarize(1);
2172 }
2173
2174 // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
2175 // RegBankSelect.
2176 auto &SextInReg = getActionDefinitionsBuilder(G_SEXT_INREG)
2177 .legalFor({{S32}, {S64}})
2178 .clampScalar(0, S32, S64);
2179
2180 if (ST.hasVOP3PInsts()) {
2181 SextInReg.lowerFor({{V2S16}})
2182 // Prefer to reduce vector widths for 16-bit vectors before lowering, to
2183 // get more vector shift opportunities, since we'll get those when
2184 // expanded.
2185 .clampMaxNumElementsStrict(0, S16, 2);
2186 } else if (ST.has16BitInsts()) {
2187 SextInReg.lowerFor({{S32}, {S64}, {S16}});
2188 } else {
2189 // Prefer to promote to s32 before lowering if we don't have 16-bit
2190 // shifts. This avoids a lot of intermediate truncate and extend operations.
2191 SextInReg.lowerFor({{S32}, {S64}});
2192 }
2193
2194 SextInReg
2195 .scalarize(0)
2196 .clampScalar(0, S32, S64)
2197 .lower();
2198
2199 getActionDefinitionsBuilder({G_ROTR, G_ROTL})
2200 .scalarize(0)
2201 .lower();
2202
2203 auto &FSHRActionDefs = getActionDefinitionsBuilder(G_FSHR);
2204 FSHRActionDefs.legalFor({{S32, S32}})
2205 .clampMaxNumElementsStrict(0, S16, 2);
2206 if (ST.hasVOP3PInsts())
2207 FSHRActionDefs.lowerFor({{V2S16, V2S16}});
2208 FSHRActionDefs.scalarize(0).lower();
2209
2210 if (ST.hasVOP3PInsts()) {
2211 getActionDefinitionsBuilder(G_FSHL)
2212 .lowerFor({{V2S16, V2S16}})
2213 .clampMaxNumElementsStrict(0, S16, 2)
2214 .scalarize(0)
2215 .lower();
2216 } else {
2217 getActionDefinitionsBuilder(G_FSHL)
2218 .scalarize(0)
2219 .lower();
2220 }
2221
2222 getActionDefinitionsBuilder(G_READCYCLECOUNTER)
2223 .legalFor({S64});
2224
2225 getActionDefinitionsBuilder(G_READSTEADYCOUNTER).legalFor({S64});
2226
2227 getActionDefinitionsBuilder(G_FENCE)
2228 .alwaysLegal();
2229
2230 getActionDefinitionsBuilder({G_SMULO, G_UMULO})
2231 .scalarize(0)
2232 .minScalar(0, S32)
2233 .lower();
2234
2235 getActionDefinitionsBuilder({G_SBFX, G_UBFX})
2236 .legalFor({{S32, S32}, {S64, S32}})
2237 .clampScalar(1, S32, S32)
2238 .clampScalar(0, S32, S64)
2239 .widenScalarToNextPow2(0)
2240 .scalarize(0);
2241
2242 getActionDefinitionsBuilder(
2243 {// TODO: Verify V_BFI_B32 is generated from expanded bit ops
2244 G_FCOPYSIGN,
2245
2246 G_ATOMIC_CMPXCHG_WITH_SUCCESS, G_ATOMICRMW_NAND, G_ATOMICRMW_FSUB,
2247 G_READ_REGISTER, G_WRITE_REGISTER,
2248
2249 G_SADDO, G_SSUBO})
2250 .lower();
2251
2252 if (ST.hasIEEEMinimumMaximumInsts()) {
2253 getActionDefinitionsBuilder({G_FMINIMUM, G_FMAXIMUM})
2254 .legalFor(FPTypesPK16)
2255 .clampMaxNumElements(0, S16, 2)
2256 .scalarize(0);
2257 } else if (ST.hasVOP3PInsts()) {
2258 getActionDefinitionsBuilder({G_FMINIMUM, G_FMAXIMUM})
2259 .lowerFor({V2S16})
2260 .clampMaxNumElementsStrict(0, S16, 2)
2261 .scalarize(0)
2262 .lower();
2263 } else {
2264 getActionDefinitionsBuilder({G_FMINIMUM, G_FMAXIMUM})
2265 .scalarize(0)
2266 .clampScalar(0, S32, S64)
2267 .lower();
2268 }
2269
2270 getActionDefinitionsBuilder({G_MEMCPY, G_MEMCPY_INLINE, G_MEMMOVE, G_MEMSET})
2271 .lower();
2272
2273 getActionDefinitionsBuilder({G_TRAP, G_DEBUGTRAP}).custom();
2274
2275 getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE,
2276 G_INDEXED_LOAD, G_INDEXED_SEXTLOAD,
2277 G_INDEXED_ZEXTLOAD, G_INDEXED_STORE})
2278 .unsupported();
2279
2280 getActionDefinitionsBuilder(G_PREFETCH).alwaysLegal();
2281
2282 getActionDefinitionsBuilder(
2283 {G_VECREDUCE_SMIN, G_VECREDUCE_SMAX, G_VECREDUCE_UMIN, G_VECREDUCE_UMAX,
2284 G_VECREDUCE_ADD, G_VECREDUCE_MUL, G_VECREDUCE_FMUL, G_VECREDUCE_FMIN,
2285 G_VECREDUCE_FMAX, G_VECREDUCE_FMINIMUM, G_VECREDUCE_FMAXIMUM,
2286 G_VECREDUCE_OR, G_VECREDUCE_AND, G_VECREDUCE_XOR})
2287 .legalFor(AllVectors)
2288 .scalarize(1)
2289 .lower();
2290
2291 getActionDefinitionsBuilder({G_INTRINSIC, G_INTRINSIC_W_SIDE_EFFECTS,
2292 G_INTRINSIC_CONVERGENT,
2293 G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS})
2294 .alwaysLegal();
2295
2296 getLegacyLegalizerInfo().computeTables();
2297 verify(*ST.getInstrInfo());
2298}
2299
2302 LostDebugLocObserver &LocObserver) const {
     // Custom-legalization dispatcher: switches on MI's generic opcode and
     // forwards to the matching legalize* routine, returning that routine's
     // result unchanged. Opcodes without a case reach `default` and return
     // false.
2303 MachineIRBuilder &B = Helper.MIRBuilder;
2304 MachineRegisterInfo &MRI = *B.getMRI();
2305
2306 switch (MI.getOpcode()) {
2307 case TargetOpcode::G_ADDRSPACE_CAST:
2308 return legalizeAddrSpaceCast(MI, MRI, B);
2309 case TargetOpcode::G_INTRINSIC_ROUNDEVEN:
2310 return legalizeFroundeven(MI, MRI, B);
2311 case TargetOpcode::G_FCEIL:
2312 return legalizeFceil(MI, MRI, B);
2313 case TargetOpcode::G_FREM:
2314 return legalizeFrem(MI, MRI, B);
2315 case TargetOpcode::G_INTRINSIC_TRUNC:
2316 return legalizeIntrinsicTrunc(MI, MRI, B);
     // For the int<->fp conversions the trailing bool selects the signed
     // (true) or unsigned (false) variant of the shared routine.
2317 case TargetOpcode::G_SITOFP:
2318 return legalizeITOFP(MI, MRI, B, true);
2319 case TargetOpcode::G_UITOFP:
2320 return legalizeITOFP(MI, MRI, B, false);
2321 case TargetOpcode::G_FPTOSI:
2322 return legalizeFPTOI(MI, MRI, B, true);
2323 case TargetOpcode::G_FPTOUI:
2324 return legalizeFPTOI(MI, MRI, B, false);
2325 case TargetOpcode::G_FMINNUM:
2326 case TargetOpcode::G_FMAXNUM:
2327 case TargetOpcode::G_FMINIMUMNUM:
2328 case TargetOpcode::G_FMAXIMUMNUM:
2329 return legalizeMinNumMaxNum(Helper, MI);
2330 case TargetOpcode::G_EXTRACT:
2331 return legalizeExtract(Helper, MI);
2332 case TargetOpcode::G_INSERT:
2333 return legalizeInsert(Helper, MI);
2334 case TargetOpcode::G_EXTRACT_VECTOR_ELT:
2335 return legalizeExtractVectorElt(MI, MRI, B);
2336 case TargetOpcode::G_INSERT_VECTOR_ELT:
2337 return legalizeInsertVectorElt(MI, MRI, B);
2338 case TargetOpcode::G_FSIN:
2339 case TargetOpcode::G_FCOS:
2340 return legalizeSinCos(MI, MRI, B);
2341 case TargetOpcode::G_GLOBAL_VALUE:
2342 return legalizeGlobalValue(MI, MRI, B);
     // All three load flavours share one routine.
2343 case TargetOpcode::G_LOAD:
2344 case TargetOpcode::G_SEXTLOAD:
2345 case TargetOpcode::G_ZEXTLOAD:
2346 return legalizeLoad(Helper, MI);
2347 case TargetOpcode::G_STORE:
2348 return legalizeStore(Helper, MI);
2349 case TargetOpcode::G_FMAD:
2350 return legalizeFMad(MI, MRI, B);
2351 case TargetOpcode::G_FDIV:
2352 return legalizeFDIV(MI, MRI, B);
2353 case TargetOpcode::G_FFREXP:
2354 return legalizeFFREXP(MI, MRI, B);
2355 case TargetOpcode::G_FSQRT:
2356 return legalizeFSQRT(MI, MRI, B);
     // Division, remainder and the combined div+rem opcode are grouped per
     // signedness.
2357 case TargetOpcode::G_UDIV:
2358 case TargetOpcode::G_UREM:
2359 case TargetOpcode::G_UDIVREM:
2360 return legalizeUnsignedDIV_REM(MI, MRI, B);
2361 case TargetOpcode::G_SDIV:
2362 case TargetOpcode::G_SREM:
2363 case TargetOpcode::G_SDIVREM:
2364 return legalizeSignedDIV_REM(MI, MRI, B);
2365 case TargetOpcode::G_ATOMIC_CMPXCHG:
2366 return legalizeAtomicCmpXChg(MI, MRI, B);
2367 case TargetOpcode::G_FLOG2:
2368 return legalizeFlog2(MI, B);
2369 case TargetOpcode::G_FLOG:
2370 case TargetOpcode::G_FLOG10:
2371 return legalizeFlogCommon(MI, B);
2372 case TargetOpcode::G_FEXP2:
2373 return legalizeFExp2(MI, B);
2374 case TargetOpcode::G_FEXP:
2375 case TargetOpcode::G_FEXP10:
2376 return legalizeFExp(MI, B);
2377 case TargetOpcode::G_FPOW:
2378 return legalizeFPow(MI, B);
2379 case TargetOpcode::G_FFLOOR:
2380 return legalizeFFloor(MI, MRI, B);
2381 case TargetOpcode::G_BUILD_VECTOR:
2382 case TargetOpcode::G_BUILD_VECTOR_TRUNC:
2383 return legalizeBuildVector(MI, MRI, B);
2384 case TargetOpcode::G_MUL:
2385 return legalizeMul(Helper, MI);
2386 case TargetOpcode::G_CTLZ:
2387 case TargetOpcode::G_CTTZ:
2388 return legalizeCTLZ_CTTZ(MI, MRI, B);
2389 case TargetOpcode::G_CTLS:
2390 return legalizeCTLS(MI, MRI, B);
2391 case TargetOpcode::G_CTLZ_ZERO_POISON:
2392 return legalizeCTLZ_ZERO_POISON(MI, MRI, B);
2393 case TargetOpcode::G_STACKSAVE:
2394 return legalizeStackSave(MI, B);
2395 case TargetOpcode::G_GET_FPENV:
2396 return legalizeGetFPEnv(MI, MRI, B);
2397 case TargetOpcode::G_SET_FPENV:
2398 return legalizeSetFPEnv(MI, MRI, B);
2399 case TargetOpcode::G_TRAP:
2400 return legalizeTrap(MI, MRI, B);
2401 case TargetOpcode::G_DEBUGTRAP:
2402 return legalizeDebugTrap(MI, MRI, B);
     // No custom handling exists for any other opcode.
2403 default:
2404 return false;
2405 }
2406
     // Every path through the switch above returns, so control cannot get here.
2407 llvm_unreachable("expected switch to return");
2408}
2409
2411 unsigned AS,
2413 MachineIRBuilder &B) const {
     // Produces a 32-bit register holding the aperture value for address space
     // AS (local vs. private is distinguished below). The visible paths either
     // read an aperture register, or load the value from memory; failure paths
     // return a default-constructed Register().
2414 MachineFunction &MF = B.getMF();
2415 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
2416 const LLT S32 = LLT::scalar(32);
2417 const LLT S64 = LLT::scalar(64);
2418
2420
2421 if (ST.hasApertureRegs()) {
2422 // Note: this register is somewhat broken. When used as a 32-bit operand,
2423 // it only returns zeroes. The real value is in the upper 32 bits.
2424 // Thus, we must extract the high 32 bits.
2425 const unsigned ApertureRegNo = (AS == AMDGPUAS::LOCAL_ADDRESS)
2426 ? AMDGPU::SRC_SHARED_BASE
2427 : AMDGPU::SRC_PRIVATE_BASE;
2428 assert((ApertureRegNo != AMDGPU::SRC_PRIVATE_BASE ||
2429 !ST.hasGloballyAddressableScratch()) &&
2430 "Cannot use src_private_base with globally addressable scratch!");
     // Copy the physical aperture register into a 64-bit SGPR-class vreg, then
     // split it into two s32 halves and return result #1 (the upper half).
2432 MRI.setRegClass(Dst, &AMDGPU::SReg_64RegClass);
2433 B.buildCopy({Dst}, {Register(ApertureRegNo)});
2434 return B.buildUnmerge(S32, Dst).getReg(1);
2435 }
2436
2439 // For code object version 5, private_base and shared_base are passed through
2440 // implicit kernargs.
2444
2449 ST.getTargetLowering()->getImplicitParameterOffset(B.getMF(), Param);
2450
2451 Register KernargPtrReg = MRI.createGenericVirtualRegister(
2453
     // Give up with an invalid Register if the input value cannot be loaded.
2454 if (!loadInputValue(KernargPtrReg, B,
2456 return Register();
2457
2459 PtrInfo.getWithOffset(Offset),
2463
2464 // Pointer address
2465 B.buildObjectPtrOffset(LoadAddr, KernargPtrReg,
2466 B.buildConstant(LLT::scalar(64), Offset).getReg(0));
2467 // Load the 32-bit value stored at that address
2468 return B.buildLoad(S32, LoadAddr, *MMO).getReg(0);
2469 }
2470
2473
2475 return Register();
2476
2477 // TODO: Use custom PseudoSourceValue
2479
2480 // Offset into amd_queue_t for group_segment_aperture_base_hi /
2481 // private_segment_aperture_base_hi.
2482 uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;
2483
2485 PtrInfo,
2488 LLT::scalar(32), commonAlignment(Align(64), StructOffset));
2489
     // Address = QueuePtr + StructOffset; the s32 loaded from it is the result.
2490 B.buildObjectPtrOffset(
2491 LoadAddr, QueuePtr,
2492 B.buildConstant(LLT::scalar(64), StructOffset).getReg(0));
2493 return B.buildLoad(S32, LoadAddr, *MMO).getReg(0);
2494}
2495
2496/// Return true if the value is a known valid address, such that a null check is
2497/// not necessary.
///
/// Only the instruction that directly defines \p Val is inspected; any
/// defining opcode not listed in the switch yields false.
2499 const AMDGPUTargetMachine &TM, unsigned AddrSpace) {
2500 MachineInstr *Def = MRI.getVRegDef(Val);
2501 switch (Def->getOpcode()) {
     // Frame indices, global values and block addresses are unconditionally
     // treated as non-null.
2502 case AMDGPU::G_FRAME_INDEX:
2503 case AMDGPU::G_GLOBAL_VALUE:
2504 case AMDGPU::G_BLOCK_ADDR:
2505 return true;
2506 case AMDGPU::G_CONSTANT: {
     // A constant is non-null when its sign-extended value differs from the
     // null pointer value of AddrSpace.
2507 const ConstantInt *CI = Def->getOperand(1).getCImm();
2508 return CI->getSExtValue() != AMDGPU::getNullPointerValue(AddrSpace);
2509 }
2510 default:
2511 return false;
2512 }
2513
     // Not reached: every switch path above returns.
2514 return false;
2515}
2516
2519 MachineIRBuilder &B) const {
     // Expands an address-space cast (G_ADDRSPACE_CAST or the
     // llvm.amdgcn.addrspacecast.nonnull intrinsic) into explicit pointer
     // arithmetic. Handled in order: no-op casts, flat -> local/private,
     // local/private -> flat, 64-bit -> 32-bit constant, 32-bit constant ->
     // 64-bit. Anything else is replaced by an undef value. Every path visible
     // here returns true.
2520 MachineFunction &MF = B.getMF();
2521
2522 // MI can either be a G_ADDRSPACE_CAST or a
2523 // G_INTRINSIC @llvm.amdgcn.addrspacecast.nonnull
2524 assert(MI.getOpcode() == TargetOpcode::G_ADDRSPACE_CAST ||
2525 (isa<GIntrinsic>(MI) && cast<GIntrinsic>(MI).getIntrinsicID() ==
2526 Intrinsic::amdgcn_addrspacecast_nonnull));
2527
2528 const LLT S32 = LLT::scalar(32);
2529 Register Dst = MI.getOperand(0).getReg();
     // The intrinsic form carries its source pointer in operand 2 rather than
     // operand 1.
2530 Register Src = isa<GIntrinsic>(MI) ? MI.getOperand(2).getReg()
2531 : MI.getOperand(1).getReg();
2532 LLT DstTy = MRI.getType(Dst);
2533 LLT SrcTy = MRI.getType(Src);
2534 unsigned DestAS = DstTy.getAddressSpace();
2535 unsigned SrcAS = SrcTy.getAddressSpace();
2536
2537 // TODO: Avoid reloading from the queue ptr for each cast, or at least each
2538 // vector element.
2539 assert(!DstTy.isVector());
2540
2541 const AMDGPUTargetMachine &TM
2542 = static_cast<const AMDGPUTargetMachine &>(MF.getTarget());
2543
     // A cast the target considers a no-op is rewritten in place as G_BITCAST.
2544 if (TM.isNoopAddrSpaceCast(SrcAS, DestAS)) {
2545 MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST));
2546 return true;
2547 }
2548
2549 if (SrcAS == AMDGPUAS::FLAT_ADDRESS &&
2550 (DestAS == AMDGPUAS::LOCAL_ADDRESS ||
2551 DestAS == AMDGPUAS::PRIVATE_ADDRESS)) {
     // Emits the conversion of Src into the given destination (a register or
     // a type) and returns the register holding the result. No null handling
     // is done here.
2552 auto castFlatToLocalOrPrivate = [&](const DstOp &Dst) -> Register {
2553 if (DestAS == AMDGPUAS::PRIVATE_ADDRESS &&
2554 ST.hasGloballyAddressableScratch()) {
2555 // flat -> private with globally addressable scratch: subtract
2556 // src_flat_scratch_base_lo.
2557 const LLT S32 = LLT::scalar(32);
2558 Register SrcLo = B.buildExtract(S32, Src, 0).getReg(0);
2559 Register FlatScratchBaseLo =
2560 B.buildInstr(AMDGPU::S_MOV_B32, {S32},
2561 {Register(AMDGPU::SRC_FLAT_SCRATCH_BASE_LO)})
2562 .getReg(0);
2563 MRI.setRegClass(FlatScratchBaseLo, &AMDGPU::SReg_32RegClass);
2564 Register Sub = B.buildSub(S32, SrcLo, FlatScratchBaseLo).getReg(0);
2565 return B.buildIntToPtr(Dst, Sub).getReg(0);
2566 }
2567
2568 // Extract low 32-bits of the pointer.
2569 return B.buildExtract(Dst, Src, 0).getReg(0);
2570 };
2571
2572 // For llvm.amdgcn.addrspacecast.nonnull we can always assume non-null, for
2573 // G_ADDRSPACE_CAST we need to guess.
2574 if (isa<GIntrinsic>(MI) || isKnownNonNull(Src, MRI, TM, SrcAS)) {
2575 castFlatToLocalOrPrivate(Dst);
2576 MI.eraseFromParent();
2577 return true;
2578 }
2579
     // Possibly-null source: a flat pointer equal to 0 must map to the
     // destination address space's null value instead of the converted bits.
2580 unsigned NullVal = AMDGPU::getNullPointerValue(DestAS);
2581
2582 auto SegmentNull = B.buildConstant(DstTy, NullVal);
2583 auto FlatNull = B.buildConstant(SrcTy, 0);
2584
2585 // Extract low 32-bits of the pointer.
2586 auto PtrLo32 = castFlatToLocalOrPrivate(DstTy);
2587
     // Dst = (Src != flat null) ? converted pointer : segment null.
2588 auto CmpRes =
2589 B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, FlatNull.getReg(0));
2590 B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0));
2591
2592 MI.eraseFromParent();
2593 return true;
2594 }
2595
2596 if (DestAS == AMDGPUAS::FLAT_ADDRESS &&
2597 (SrcAS == AMDGPUAS::LOCAL_ADDRESS ||
2598 SrcAS == AMDGPUAS::PRIVATE_ADDRESS)) {
     // Emits the conversion of the 32-bit Src into a flat pointer written to
     // the given destination (a register or a type) and returns the result
     // register. No null handling is done here.
2599 auto castLocalOrPrivateToFlat = [&](const DstOp &Dst) -> Register {
2600 // Coerce the type of the low half of the result so we can use
2601 // merge_values.
2602 Register SrcAsInt = B.buildPtrToInt(S32, Src).getReg(0);
2603
2604 if (SrcAS == AMDGPUAS::PRIVATE_ADDRESS &&
2605 ST.hasGloballyAddressableScratch()) {
2606 // For wave32: Addr = (TID[4:0] << 52) + FLAT_SCRATCH_BASE + privateAddr
2607 // For wave64: Addr = (TID[5:0] << 51) + FLAT_SCRATCH_BASE + privateAddr
     // The thread id is obtained with mbcnt_lo over an all-ones mask starting
     // from 0, extended with mbcnt_hi on wave64.
2608 Register AllOnes = B.buildConstant(S32, -1).getReg(0);
2609 Register ThreadID = B.buildConstant(S32, 0).getReg(0);
2610 ThreadID = B.buildIntrinsic(Intrinsic::amdgcn_mbcnt_lo, {S32})
2611 .addUse(AllOnes)
2612 .addUse(ThreadID)
2613 .getReg(0);
2614 if (ST.isWave64()) {
2615 ThreadID = B.buildIntrinsic(Intrinsic::amdgcn_mbcnt_hi, {S32})
2616 .addUse(AllOnes)
2617 .addUse(ThreadID)
2618 .getReg(0);
2619 }
     // The shift is applied to the value that becomes the high 32-bit word,
     // hence the "- 32". Assuming getWavefrontSizeLog2() is 5 / 6 for
     // wave32 / wave64, this gives 20 / 19, i.e. bit 52 / 51 of the 64-bit
     // address as described above.
2620 Register ShAmt =
2621 B.buildConstant(S32, 57 - 32 - ST.getWavefrontSizeLog2()).getReg(0);
2622 Register SrcHi = B.buildShl(S32, ThreadID, ShAmt).getReg(0);
2623 Register CvtPtr =
2624 B.buildMergeLikeInstr(DstTy, {SrcAsInt, SrcHi}).getReg(0);
2625 // Accessing src_flat_scratch_base_lo as a 64-bit operand gives the full
2626 // 64-bit hi:lo value.
2627 Register FlatScratchBase =
2628 B.buildInstr(AMDGPU::S_MOV_B64, {S64},
2629 {Register(AMDGPU::SRC_FLAT_SCRATCH_BASE)})
2630 .getReg(0);
2631 MRI.setRegClass(FlatScratchBase, &AMDGPU::SReg_64RegClass);
2632 return B.buildPtrAdd(Dst, CvtPtr, FlatScratchBase).getReg(0);
2633 }
2634
     // NOTE(review): this lambda returns Register, so `return false` below
     // presumably converts to an invalid Register; neither call site checks
     // the returned value before erasing MI -- confirm this is intended.
2635 Register ApertureReg = getSegmentAperture(SrcAS, MRI, B);
2636 if (!ApertureReg.isValid())
2637 return false;
2638
2639 // TODO: Should we allow mismatched types but matching sizes in merges to
2640 // avoid the ptrtoint?
2641 return B.buildMergeLikeInstr(Dst, {SrcAsInt, ApertureReg}).getReg(0);
2642 };
2643
2644 // For llvm.amdgcn.addrspacecast.nonnull we can always assume non-null, for
2645 // G_ADDRSPACE_CAST we need to guess.
2646 if (isa<GIntrinsic>(MI) || isKnownNonNull(Src, MRI, TM, SrcAS)) {
2647 castLocalOrPrivateToFlat(Dst);
2648 MI.eraseFromParent();
2649 return true;
2650 }
2651
     // Possibly-null source: build the flat pointer into a fresh register,
     // then select between it and the flat null value.
2652 Register BuildPtr = castLocalOrPrivateToFlat(DstTy);
2653
2654 auto SegmentNull =
2655 B.buildConstant(SrcTy, AMDGPU::getNullPointerValue(SrcAS));
2656 auto FlatNull = B.buildConstant(DstTy, AMDGPU::getNullPointerValue(DestAS));
2657
2658 auto CmpRes = B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src,
2659 SegmentNull.getReg(0));
2660
     // Dst = (Src != segment null) ? converted pointer : flat null.
2661 B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull);
2662
2663 MI.eraseFromParent();
2664 return true;
2665 }
2666
2667 if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
2668 SrcTy.getSizeInBits() == 64) {
2669 // Truncate.
2670 B.buildExtract(Dst, Src, 0);
2671 MI.eraseFromParent();
2672 return true;
2673 }
2674
2675 if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
2676 DstTy.getSizeInBits() == 64) {
     // Widen a 32-bit constant-address pointer: the high word is the value
     // returned by get32BitAddressHighBits(). A zero high word is expressed as
     // zext + inttoptr, otherwise the two halves are merged.
2678 uint32_t AddrHiVal = Info->get32BitAddressHighBits();
2679 auto PtrLo = B.buildPtrToInt(S32, Src);
2680 if (AddrHiVal == 0) {
2681 auto Zext = B.buildZExt(LLT::scalar(64), PtrLo);
2682 B.buildIntToPtr(Dst, Zext);
2683 } else {
2684 auto HighAddr = B.buildConstant(S32, AddrHiVal);
2685 B.buildMergeLikeInstr(Dst, {PtrLo, HighAddr});
2686 }
2687
2688 MI.eraseFromParent();
2689 return true;
2690 }
2691
2692 // Invalid casts are poison.
2693 // TODO: Should return poison
2694 B.buildUndef(Dst);
2695 MI.eraseFromParent();
2696 return true;
2697}
2698
2701 MachineIRBuilder &B) const {
     // Expands a 64-bit scalar rounding operation (dispatched for
     // G_INTRINSIC_ROUNDEVEN) into: (src + copysign(2^52, src)) -
     // copysign(2^52, src), keeping src itself when |src| exceeds the largest
     // double below 2^52. Erases MI and returns true.
2702 Register Src = MI.getOperand(1).getReg();
2703 LLT Ty = MRI.getType(Src);
2704 assert(Ty.isScalar() && Ty.getSizeInBits() == 64);
2705
     // C1 = 2^52; C2 = 0x1.fffffffffffffp+51, the largest double below 2^52.
2706 APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
2707 APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");
2708
2709 auto C1 = B.buildFConstant(Ty, C1Val);
2710 auto CopySign = B.buildFCopysign(Ty, C1, Src);
2711
     // Adding and then subtracting the signed 2^52 drops the fractional bits
     // of src; the rounding used is whatever the FP add applies
     // (presumably round-to-nearest-even -- TODO confirm).
2712 // TODO: Should this propagate fast-math-flags?
2713 auto Tmp1 = B.buildFAdd(Ty, Src, CopySign);
2714 auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign);
2715
2716 auto C2 = B.buildFConstant(Ty, C2Val);
2717 auto Fabs = B.buildFAbs(Ty, Src);
2718
     // |src| > C2 (ordered compare): such doubles have no fractional bits, so
     // src is passed through unchanged. A NaN src fails the ordered compare
     // and takes the Tmp2 path.
2719 auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2);
2720 B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2);
2721 MI.eraseFromParent();
2722 return true;
2723}
2724
2727 MachineIRBuilder &B) const {
     // Expands a 64-bit ceil (dispatched for G_FCEIL) as trunc(src) plus 1.0
     // when src is positive and not already equal to its truncation. Erases MI
     // and returns true.
2728
2729 const LLT S1 = LLT::scalar(1);
2730 const LLT S64 = LLT::scalar(64);
2731
2732 Register Src = MI.getOperand(1).getReg();
2733 assert(MRI.getType(Src) == S64);
2734
2735 // result = trunc(src)
2736 // if (src > 0.0 && src != result)
2737 // result += 1.0
2738
2739 auto Trunc = B.buildIntrinsicTrunc(S64, Src);
2740
2741 const auto Zero = B.buildFConstant(S64, 0.0);
2742 const auto One = B.buildFConstant(S64, 1.0);
     // Note: despite its name, Lt0 holds "src > 0.0" (FCMP_OGT), matching the
     // pseudo-code above.
2743 auto Lt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero);
2744 auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc);
2745 auto And = B.buildAnd(S1, Lt0, NeTrunc);
     // The adjustment is materialised as a select between 1.0 and 0.0 so the
     // final add is unconditional.
2746 auto Add = B.buildSelect(S64, And, One, Zero);
2747
2748 // TODO: Should this propagate fast-math-flags?
2749 B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add);
2750 MI.eraseFromParent();
2751 return true;
2752}
2753
2756 MachineIRBuilder &B) const {
     // Expands a floating-point remainder (dispatched for G_FREM) as
     //   dst = fma(-trunc(src0 / src1), src1, src0)
     // i.e. src0 - trunc(src0 / src1) * src1. MI's flags are forwarded to each
     // generated instruction. Erases MI and returns true.
2757 Register DstReg = MI.getOperand(0).getReg();
2758 Register Src0Reg = MI.getOperand(1).getReg();
2759 Register Src1Reg = MI.getOperand(2).getReg();
2760 auto Flags = MI.getFlags();
2761 LLT Ty = MRI.getType(DstReg);
2762
2763 auto Div = B.buildFDiv(Ty, Src0Reg, Src1Reg, Flags);
2764 auto Trunc = B.buildIntrinsicTrunc(Ty, Div, Flags);
2765 auto Neg = B.buildFNeg(Ty, Trunc, Flags);
2766 B.buildFMA(DstReg, Neg, Src1Reg, Src0Reg, Flags);
2767 MI.eraseFromParent();
2768 return true;
2769}
2770
2773 const unsigned FractBits = 52;
2774 const unsigned ExpBits = 11;
2775 LLT S32 = LLT::scalar(32);
2776
     // Hi is used as the upper 32 bits of a double (see the caller's unmerge):
     // the exponent field then starts at bit FractBits - 32 = 20 of Hi and is
     // ExpBits = 11 bits wide.
2777 auto Const0 = B.buildConstant(S32, FractBits - 32);
2778 auto Const1 = B.buildConstant(S32, ExpBits);
2779
     // Unsigned bit-field extract with operands (Hi, offset 20, width 11).
2780 auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32})
2781 .addUse(Hi)
2782 .addUse(Const0.getReg(0))
2783 .addUse(Const1.getReg(0));
2784
     // Subtract 1023 (the IEEE double exponent bias) to return the unbiased
     // exponent.
2785 return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023));
2786}
2787
2790 MachineIRBuilder &B) const {
     // Expands a 64-bit truncation toward zero (dispatched for
     // G_INTRINSIC_TRUNC) with integer bit operations: fraction bits below the
     // binary point are masked off according to the unbiased exponent. Erases
     // MI and returns true.
2791 const LLT S1 = LLT::scalar(1);
2792 const LLT S32 = LLT::scalar(32);
2793 const LLT S64 = LLT::scalar(64);
2794
2795 Register Src = MI.getOperand(1).getReg();
2796 assert(MRI.getType(Src) == S64);
2797
2798 // TODO: Should this use extract since the low half is unused?
2799 auto Unmerge = B.buildUnmerge({S32, S32}, Src);
2800 Register Hi = Unmerge.getReg(1);
2801
2802 // Extract the upper half, since this is where we will find the sign and
2803 // exponent.
2804 auto Exp = extractF64Exponent(Hi, B);
2805
2806 const unsigned FractBits = 52;
2807
2808 // Extract the sign bit.
2809 const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31);
2810 auto SignBit = B.buildAnd(S32, Hi, SignBitMask);
2811
     // Mask covering all 52 fraction bits.
2812 const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1);
2813
2814 const auto Zero32 = B.buildConstant(S32, 0);
2815
2816 // Extend back to 64-bits.
     // Low word zero, high word = sign bit: a zero of the same sign as Src.
2817 auto SignBit64 = B.buildMergeLikeInstr(S64, {Zero32, SignBit});
2818
     // Shifting the fraction mask right by Exp leaves set exactly the fraction
     // bits that lie below the binary point; clearing them in Src (Tmp0)
     // truncates the value.
2819 auto Shr = B.buildAShr(S64, FractMask, Exp);
2820 auto Not = B.buildNot(S64, Shr);
2821 auto Tmp0 = B.buildAnd(S64, Src, Not);
2822 auto FiftyOne = B.buildConstant(S32, FractBits - 1);
2823
2824 auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32);
2825 auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne);
2826
     // Exp < 0  -> |Src| < 1, result is the signed zero.
     // Exp > 51 -> no fraction bits remain, result is Src unchanged.
     // otherwise -> the masked value Tmp0.
2827 auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0);
2828 B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1);
2829 MI.eraseFromParent();
2830 return true;
2831}
2832
2835 MachineIRBuilder &B, bool Signed) const {
     // Expands a conversion from a 64-bit integer to a 64-bit or 32-bit float
     // using 32-bit conversions. Signed selects signed vs. unsigned handling of
     // the upper half. Erases MI and returns true.
2836
2837 Register Dst = MI.getOperand(0).getReg();
2838 Register Src = MI.getOperand(1).getReg();
2839
2840 const LLT S64 = LLT::scalar(64);
2841 const LLT S32 = LLT::scalar(32);
2842
2843 assert(MRI.getType(Src) == S64);
2844
     // Unmerge result 0 is the low 32 bits, result 1 the high 32 bits.
2845 auto Unmerge = B.buildUnmerge({S32, S32}, Src);
2846 auto ThirtyTwo = B.buildConstant(S32, 32);
2847
2848 if (MRI.getType(Dst) == S64) {
     // 64-bit result: dst = ldexp(cvt(hi), 32) + uitofp(lo). Only the high
     // half carries the sign, so the low half is always converted unsigned.
2849 auto CvtHi = Signed ? B.buildSITOFP(S64, Unmerge.getReg(1))
2850 : B.buildUITOFP(S64, Unmerge.getReg(1));
2851
2852 auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0));
2853 auto LdExp = B.buildFLdexp(S64, CvtHi, ThirtyTwo);
2854
2855 // TODO: Should this propagate fast-math-flags?
2856 B.buildFAdd(Dst, LdExp, CvtLo);
2857 MI.eraseFromParent();
2858 return true;
2859 }
2860
2861 assert(MRI.getType(Dst) == S32);
2862
2863 auto One = B.buildConstant(S32, 1);
2864
     // 32-bit result: shift Src left by ShAmt so the significant bits land in
     // the high word, convert that word, then rescale with ldexp.
2865 MachineInstrBuilder ShAmt;
2866 if (Signed) {
     // OppositeSign = (lo ^ hi) >> 31 (arithmetic) is 0 or -1, so MaxShAmt is
     // 32 or 31. ShAmt = umin(sffbh(hi) - 1, MaxShAmt).
2867 auto ThirtyOne = B.buildConstant(S32, 31);
2868 auto X = B.buildXor(S32, Unmerge.getReg(0), Unmerge.getReg(1));
2869 auto OppositeSign = B.buildAShr(S32, X, ThirtyOne);
2870 auto MaxShAmt = B.buildAdd(S32, ThirtyTwo, OppositeSign);
2871 auto LS = B.buildIntrinsic(Intrinsic::amdgcn_sffbh, {S32})
2872 .addUse(Unmerge.getReg(1));
2873 auto LS2 = B.buildSub(S32, LS, One);
2874 ShAmt = B.buildUMin(S32, LS2, MaxShAmt);
2875 } else
     // Unsigned: shift by the number of leading zeros of the high word.
2876 ShAmt = B.buildCTLZ(S32, Unmerge.getReg(1));
2877 auto Norm = B.buildShl(S64, Src, ShAmt);
2878 auto Unmerge2 = B.buildUnmerge({S32, S32}, Norm);
     // Adjust = umin(1, lo) is 1 iff any bit remains in the low word; OR-ing it
     // into the high word keeps a record of those discarded bits for the
     // conversion's rounding.
2879 auto Adjust = B.buildUMin(S32, One, Unmerge2.getReg(0));
2880 auto Norm2 = B.buildOr(S32, Unmerge2.getReg(1), Adjust);
2881 auto FVal = Signed ? B.buildSITOFP(S32, Norm2) : B.buildUITOFP(S32, Norm2);
     // Undo the normalisation: the converted word stands for bits [63:32]
     // shifted by ShAmt, so scale by 2^(32 - ShAmt).
2882 auto Scale = B.buildSub(S32, ThirtyTwo, ShAmt);
2883 B.buildFLdexp(Dst, FVal, Scale);
2884 MI.eraseFromParent();
2885 return true;
2886}
2887
2888// TODO: Copied from DAG implementation. Verify logic and document how this
2889// actually works.
2893 bool Signed) const {
     // Expands a conversion from a 32-bit or 64-bit float to a 64-bit integer
     // by computing the high and low 32-bit halves separately. Signed selects
     // the signed variant. Erases MI and returns true.
2894
2895 Register Dst = MI.getOperand(0).getReg();
2896 Register Src = MI.getOperand(1).getReg();
2897
2898 const LLT S64 = LLT::scalar(64);
2899 const LLT S32 = LLT::scalar(32);
2900
2901 const LLT SrcLT = MRI.getType(Src);
2902 assert((SrcLT == S32 || SrcLT == S64) && MRI.getType(Dst) == S64);
2903
2904 unsigned Flags = MI.getFlags();
2905
2906 // The basic idea of converting a floating point number into a pair of 32-bit
2907 // integers is illustrated as follows:
2908 //
2909 // tf := trunc(val);
2910 // hif := floor(tf * 2^-32);
2911 // lof := tf - hif * 2^32; // lof is always positive due to floor.
2912 // hi := fptoi(hif);
2913 // lo := fptoi(lof);
2914 //
2915 auto Trunc = B.buildIntrinsicTrunc(SrcLT, Src, Flags);
2917 if (Signed && SrcLT == S32) {
2918 // However, a 32-bit floating point number has only 23 bits mantissa and
2919 // it's not enough to hold all the significant bits of `lof` if val is
2920 // negative. To avoid the loss of precision, we need to take the absolute
2921 // value after truncating and flip the result back based on the original
2922 // signedness.
     // Sign = Src >> 31 (arithmetic): all zeros or all ones.
2923 Sign = B.buildAShr(S32, Src, B.buildConstant(S32, 31));
2924 Trunc = B.buildFAbs(S32, Trunc, Flags);
2925 }
     // K0 = 2^-32 and K1 = -2^32, given as raw bit patterns in the source
     // float type.
2926 MachineInstrBuilder K0, K1;
2927 if (SrcLT == S64) {
2928 K0 = B.buildFConstant(
2929 S64, llvm::bit_cast<double>(UINT64_C(/*2^-32*/ 0x3df0000000000000)));
2930 K1 = B.buildFConstant(
2931 S64, llvm::bit_cast<double>(UINT64_C(/*-2^32*/ 0xc1f0000000000000)));
2932 } else {
2933 K0 = B.buildFConstant(
2934 S32, llvm::bit_cast<float>(UINT32_C(/*2^-32*/ 0x2f800000)));
2935 K1 = B.buildFConstant(
2936 S32, llvm::bit_cast<float>(UINT32_C(/*-2^32*/ 0xcf800000)));
2937 }
2938
     // FloorMul is `hif`; Fma = hif * (-2^32) + tf is `lof` from the sketch
     // above.
2939 auto Mul = B.buildFMul(SrcLT, Trunc, K0, Flags);
2940 auto FloorMul = B.buildFFloor(SrcLT, Mul, Flags);
2941 auto Fma = B.buildFMA(SrcLT, FloorMul, K1, Trunc, Flags);
2942
     // The high half is converted signed only for a signed 64-bit source; the
     // signed 32-bit source works on the absolute value and restores the sign
     // below. The low half is always unsigned.
2943 auto Hi = (Signed && SrcLT == S64) ? B.buildFPTOSI(S32, FloorMul)
2944 : B.buildFPTOUI(S32, FloorMul);
2945 auto Lo = B.buildFPTOUI(S32, Fma);
2946
2947 if (Signed && SrcLT == S32) {
2948 // Flip the result based on the signedness, which is either all 0s or 1s.
2949 Sign = B.buildMergeLikeInstr(S64, {Sign, Sign});
2950 // r := xor({lo, hi}, sign) - sign;
2951 B.buildSub(Dst, B.buildXor(S64, B.buildMergeLikeInstr(S64, {Lo, Hi}), Sign),
2952 Sign);
2953 } else
2954 B.buildMergeLikeInstr(Dst, {Lo, Hi});
2955 MI.eraseFromParent();
2956
2957 return true;
2958}
2959
2961 MachineInstr &MI) const {
     // Leaves MI untouched and reports success when the function's mode has
     // IEEE disabled.
     // NOTE(review): the definition of MFI and the statement taken when IEEE is
     // enabled are not visible in this view; confirm against the full file.
2962 MachineFunction &MF = Helper.MIRBuilder.getMF();
2964
2965 // With ieee_mode disabled, the instructions have the correct behavior.
2966 if (!MFI->getMode().IEEE)
2967 return true;
2968
2970}
2971
2973 MachineInstr &MI) const {
     // Custom lowering of an extract at an immediate bit offset (operand 2).
     // Non-zero offsets that are multiples of 32 are rewritten as an unmerge of
     // the source into s32 pieces followed by a re-merge of the selected
     // pieces; every other offset is delegated to Helper.lowerExtract.
2974 MachineIRBuilder &B = Helper.MIRBuilder;
2975 MachineRegisterInfo &MRI = *B.getMRI();
2976 Register DstReg = MI.getOperand(0).getReg();
2977 Register SrcReg = MI.getOperand(1).getReg();
2978 uint64_t Offset = MI.getOperand(2).getImm();
2979
2980 // Fall back to generic lowering for offset 0 (trivial trunc) and
2981 // non-32-bit-aligned cases which require shift+trunc sequences
2982 // that generic code handles correctly.
2983 if (Offset == 0 || Offset % 32 != 0)
2984 return Helper.lowerExtract(MI) == LegalizerHelper::Legalized;
2985
2986 const LLT DstTy = MRI.getType(DstReg);
     // NOTE(review): DstCount truncates, so the destination size is presumably
     // guaranteed to be a multiple of 32 by the legalization rules; confirm.
2987 unsigned StartIdx = Offset / 32;
2988 unsigned DstCount = DstTy.getSizeInBits() / 32;
2989 auto Unmerge = B.buildUnmerge(LLT::scalar(32), SrcReg);
2990
2991 if (DstCount == 1) {
     // A single piece needs no merge: a pointer result is rebuilt with
     // inttoptr, otherwise DstReg is simply replaced by the unmerged piece.
2992 if (DstTy.isPointer())
2993 B.buildIntToPtr(DstReg, Unmerge.getReg(StartIdx));
2994 else
2995 MRI.replaceRegWith(DstReg, Unmerge.getReg(StartIdx));
2996 } else {
     // Gather DstCount consecutive pieces starting at StartIdx.
2997 SmallVector<Register, 8> MergeVec;
2998 for (unsigned I = 0; I < DstCount; ++I)
2999 MergeVec.push_back(Unmerge.getReg(StartIdx + I));
3000 B.buildMergeLikeInstr(DstReg, MergeVec);
3001 }
3002
3003 MI.eraseFromParent();
3004 return true;
3005}
3006
3008 MachineInstr &MI) const {
     // Custom lowering of an insert of InsertSrc (operand 2) into SrcReg
     // (operand 1) at an immediate bit offset (operand 3). When the offset and
     // both sizes are multiples of 32, the result is rebuilt from s32 pieces:
     // leading source pieces, the inserted pieces, then trailing source
     // pieces. Anything else is delegated to Helper.lowerInsert.
3009 MachineIRBuilder &B = Helper.MIRBuilder;
3010 MachineRegisterInfo &MRI = *B.getMRI();
3011 Register DstReg = MI.getOperand(0).getReg();
3012 Register SrcReg = MI.getOperand(1).getReg();
3013 Register InsertSrc = MI.getOperand(2).getReg();
3014 uint64_t Offset = MI.getOperand(3).getImm();
3015
3016 unsigned DstSize = MRI.getType(DstReg).getSizeInBits();
3017 const LLT InsertTy = MRI.getType(InsertSrc);
3018 unsigned InsertSize = InsertTy.getSizeInBits();
3019
3020 // Fall back to generic lowering for non-32-bit-aligned cases which
3021 // require shift+mask sequences that generic code handles correctly.
3022 if (Offset % 32 != 0 || DstSize % 32 != 0 || InsertSize % 32 != 0)
3023 return Helper.lowerInsert(MI) == LegalizerHelper::Legalized;
3024
3025 const LLT S32 = LLT::scalar(32);
3026 unsigned DstCount = DstSize / 32;
3027 unsigned InsertCount = InsertSize / 32;
3028 unsigned StartIdx = Offset / 32;
3029
3030 auto SrcUnmerge = B.buildUnmerge(S32, SrcReg);
3031
     // Pieces of the source that precede the insertion point.
3032 SmallVector<Register, 8> MergeVec;
3033 for (unsigned I = 0; I < StartIdx; ++I)
3034 MergeVec.push_back(SrcUnmerge.getReg(I));
3035
3036 if (InsertCount == 1) {
3037 // Merge-like instructions require same source types. Convert pointer
3038 // to scalar when inserting a pointer value into a scalar.
3039 if (InsertTy.isPointer())
3040 InsertSrc = B.buildPtrToInt(S32, InsertSrc).getReg(0);
3041 MergeVec.push_back(InsertSrc);
3042 } else {
3043 auto InsertUnmerge = B.buildUnmerge(S32, InsertSrc);
3044 for (unsigned I = 0; I < InsertCount; ++I)
3045 MergeVec.push_back(InsertUnmerge.getReg(I));
3046 }
3047
     // Pieces of the source that follow the inserted range.
3048 for (unsigned I = StartIdx + InsertCount; I < DstCount; ++I)
3049 MergeVec.push_back(SrcUnmerge.getReg(I));
3050
3051 B.buildMergeLikeInstr(DstReg, MergeVec);
3052
3053 MI.eraseFromParent();
3054 return true;
3055}
3056
3059 MachineIRBuilder &B) const {
     // Custom legalization of a vector element extract: operand 1 is the
     // vector, operand 2 the index. Constant indices are resolved here; a
     // non-constant index leaves MI in place.
3060 // TODO: Should move some of this into LegalizerHelper.
3061
3062 // TODO: Promote dynamic indexing of s16 to s32
3063
3064 Register Dst = MI.getOperand(0).getReg();
3065 Register Vec = MI.getOperand(1).getReg();
3066
3067 LLT VecTy = MRI.getType(Vec);
3068 LLT EltTy = VecTy.getElementType();
3069 assert(EltTy == MRI.getType(Dst));
3070
3071 // Other legalization maps vector<? x [type bigger than 64 bits]> via bitcasts
3072 // but we can't go directly to that logic because you can't bitcast a vector
3073 // of pointers to a vector of integers. Therefore, introduce an intermediate
3074 // vector of integers using ptrtoint (and inttoptr on the output) in order to
3075 // drive the legalization forward.
3076 if (EltTy.isPointer() && EltTy.getSizeInBits() > 64) {
3077 LLT IntTy = LLT::scalar(EltTy.getSizeInBits());
3078 LLT IntVecTy = VecTy.changeElementType(IntTy);
3079
3080 auto IntVec = B.buildPtrToInt(IntVecTy, Vec);
3081 auto IntElt = B.buildExtractVectorElement(IntTy, IntVec, MI.getOperand(2));
3082 B.buildIntToPtr(Dst, IntElt);
3083
3084 MI.eraseFromParent();
3085 return true;
3086 }
3087
3088 // FIXME: Artifact combiner probably should have replaced the truncated
3089 // constant before this, so we shouldn't need
3090 // getIConstantVRegValWithLookThrough.
3091 std::optional<ValueAndVReg> MaybeIdxVal =
3092 getIConstantVRegValWithLookThrough(MI.getOperand(2).getReg(), MRI);
3093 if (!MaybeIdxVal) // Dynamic case will be selected to register indexing.
3094 return true;
3095 const uint64_t IdxVal = MaybeIdxVal->Value.getZExtValue();
3096
     // An in-range constant index becomes an unmerge plus a copy of the chosen
     // element; an out-of-range constant index produces an undef result.
3097 if (IdxVal < VecTy.getNumElements()) {
3098 auto Unmerge = B.buildUnmerge(EltTy, Vec);
3099 B.buildCopy(Dst, Unmerge.getReg(IdxVal));
3100 } else {
3101 B.buildUndef(Dst);
3102 }
3103
3104 MI.eraseFromParent();
3105 return true;
3106}
3107
3110 MachineIRBuilder &B) const {
     // Custom legalization of a vector element insert: operand 1 is the
     // vector, operand 2 the new element, operand 3 the index. Constant indices
     // are resolved here; a non-constant index leaves MI in place.
3111 // TODO: Should move some of this into LegalizerHelper.
3112
3113 // TODO: Promote dynamic indexing of s16 to s32
3114
3115 Register Dst = MI.getOperand(0).getReg();
3116 Register Vec = MI.getOperand(1).getReg();
3117 Register Ins = MI.getOperand(2).getReg();
3118
3119 LLT VecTy = MRI.getType(Vec);
3120 LLT EltTy = VecTy.getElementType();
3121 assert(EltTy == MRI.getType(Ins));
3122
3123 // Other legalization maps vector<? x [type bigger than 64 bits]> via bitcasts
3124 // but we can't go directly to that logic because you can't bitcast a vector
3125 // of pointers to a vector of integers. Therefore, make the pointer vector
3126 // into an equivalent vector of integers with ptrtoint, insert the ptrtoint'd
3127 // new value, and then inttoptr the result vector back. This will then allow
3128 // the rest of legalization to take over.
3129 if (EltTy.isPointer() && EltTy.getSizeInBits() > 64) {
3130 LLT IntTy = LLT::scalar(EltTy.getSizeInBits());
3131 LLT IntVecTy = VecTy.changeElementType(IntTy);
3132
3133 auto IntVecSource = B.buildPtrToInt(IntVecTy, Vec);
3134 auto IntIns = B.buildPtrToInt(IntTy, Ins);
3135 auto IntVecDest = B.buildInsertVectorElement(IntVecTy, IntVecSource, IntIns,
3136 MI.getOperand(3));
3137 B.buildIntToPtr(Dst, IntVecDest);
3138 MI.eraseFromParent();
3139 return true;
3140 }
3141
3142 // FIXME: Artifact combiner probably should have replaced the truncated
3143 // constant before this, so we shouldn't need
3144 // getIConstantVRegValWithLookThrough.
3145 std::optional<ValueAndVReg> MaybeIdxVal =
3146 getIConstantVRegValWithLookThrough(MI.getOperand(3).getReg(), MRI);
3147 if (!MaybeIdxVal) // Dynamic case will be selected to register indexing.
3148 return true;
3149
3150 const uint64_t IdxVal = MaybeIdxVal->Value.getZExtValue();
3151
3152 unsigned NumElts = VecTy.getNumElements();
     // In range: split the vector into elements, swap in the new element at
     // IdxVal and merge the elements back. Out of range: the result is undef.
3153 if (IdxVal < NumElts) {
3155 for (unsigned i = 0; i < NumElts; ++i)
3156 SrcRegs.push_back(MRI.createGenericVirtualRegister(EltTy));
3157 B.buildUnmerge(SrcRegs, Vec);
3158
3159 SrcRegs[IdxVal] = MI.getOperand(2).getReg();
3160 B.buildMergeLikeInstr(Dst, SrcRegs);
3161 } else {
3162 B.buildUndef(Dst);
3163 }
3164
3165 MI.eraseFromParent();
3166 return true;
3167}
3168
3171 MachineIRBuilder &B) const {
     // Replaces MI with the amdgcn_sin intrinsic when MI is G_FSIN and with
     // amdgcn_cos otherwise. The intrinsic operand is the source multiplied by
     // 1/(2*pi); MI's flags are propagated to every new instruction.
3172
3173 Register DstReg = MI.getOperand(0).getReg();
3174 Register SrcReg = MI.getOperand(1).getReg();
3175 LLT Ty = MRI.getType(DstReg);
3176 unsigned Flags = MI.getFlags();
3177
3178 Register TrigVal;
     // 0.5 * inv_pi == 1 / (2 * pi).
3179 auto OneOver2Pi = B.buildFConstant(Ty, 0.5 * numbers::inv_pi);
3180 if (ST.hasTrigReducedRange()) {
     // On these subtargets the scaled input is additionally passed through
     // amdgcn_fract before it reaches the trig intrinsic.
3181 auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags);
3182 TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty})
3183 .addUse(MulVal.getReg(0))
3184 .setMIFlags(Flags)
3185 .getReg(0);
3186 } else
3187 TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0);
3188
3189 Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ?
3190 Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos;
3191 B.buildIntrinsic(TrigIntrin, ArrayRef<Register>(DstReg))
3192 .addUse(TrigVal)
3193 .setMIFlags(Flags);
3194 MI.eraseFromParent();
3195 return true;
3196}
3197
3200 const GlobalValue *GV,
3201 int64_t Offset,
3202 unsigned GAFlags) const {
     // Emits a pc-relative address computation for \p GV plus \p Offset into
     // DstReg, using SI_PC_ADD_REL_OFFSET64 on subtargets with 64-bit literals
     // and SI_PC_ADD_REL_OFFSET otherwise. Always returns true.
3203 assert(isInt<32>(Offset + 4) && "32-bit offset is expected!");
3204 // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered
3205 // to the following code sequence:
3206 //
3207 // For constant address space:
3208 // s_getpc_b64 s[0:1]
3209 // s_add_u32 s0, s0, $symbol
3210 // s_addc_u32 s1, s1, 0
3211 //
3212 // s_getpc_b64 returns the address of the s_add_u32 instruction and then
3213 // a fixup or relocation is emitted to replace $symbol with a literal
3214 // constant, which is a pc-relative offset from the encoding of the $symbol
3215 // operand to the global variable.
3216 //
3217 // For global address space:
3218 // s_getpc_b64 s[0:1]
3219 // s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo
3220 // s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi
3221 //
3222 // s_getpc_b64 returns the address of the s_add_u32 instruction and then
3223 // fixups or relocations are emitted to replace $symbol@*@lo and
3224 // $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant,
3225 // which is a 64-bit pc-relative offset from the encoding of the $symbol
3226 // operand to the global variable.
3227
3229
     // A 32-bit PtrTy cannot hold the result directly, so it is produced in a
     // fresh ConstPtrTy register and narrowed at the end of the function.
3230 Register PCReg = PtrTy.getSizeInBits() != 32 ? DstReg :
3231 B.getMRI()->createGenericVirtualRegister(ConstPtrTy);
3232
3233 if (ST.has64BitLiterals()) {
3234 assert(GAFlags != SIInstrInfo::MO_NONE);
3235
3237 B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET64).addDef(PCReg);
     // NOTE(review): GAFlags + 2 presumably selects the 64-bit variant of the
     // relocation flag; confirm against the SIInstrInfo MO_* enum ordering.
3238 MIB.addGlobalAddress(GV, Offset, GAFlags + 2);
3239 } else {
3241 B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET).addDef(PCReg);
3242
     // First operand: the symbol with GAFlags. Second operand: immediate 0
     // when no flag is requested, otherwise the symbol with GAFlags + 1.
     // NOTE(review): GAFlags + 1 presumably is the matching "@hi" flag;
     // confirm against the SIInstrInfo MO_* enum ordering.
3243 MIB.addGlobalAddress(GV, Offset, GAFlags);
3244 if (GAFlags == SIInstrInfo::MO_NONE)
3245 MIB.addImm(0);
3246 else
3247 MIB.addGlobalAddress(GV, Offset, GAFlags + 1);
3248 }
3249
     // Constrain the result to SReg_64 unless a register class is already set.
3250 if (!B.getMRI()->getRegClassOrNull(PCReg))
3251 B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass);
3252
     // For 32-bit pointers, take the bits at offset 0 of the 64-bit result.
3253 if (PtrTy.getSizeInBits() == 32)
3254 B.buildExtract(DstReg, PCReg, 0);
3255 return true;
3256}
3257
3258// Emit an ABS32_LO / ABS32_HI relocation stub.
3260 Register DstReg, LLT PtrTy, MachineIRBuilder &B, const GlobalValue *GV,
3261 MachineRegisterInfo &MRI) const {
     // Materializes the absolute address of \p GV into DstReg. A pointer type
     // that is not 32 bits wide also needs the high half: that is a single
     // S_MOV_B64 with an MO_ABS64 operand when the subtarget has 64-bit
     // literals, otherwise two S_MOV_B32 (MO_ABS32_LO / MO_ABS32_HI) merged
     // together. A 32-bit pointer type only needs the MO_ABS32_LO move.
3262 bool RequiresHighHalf = PtrTy.getSizeInBits() != 32;
3263
3264 if (RequiresHighHalf && ST.has64BitLiterals()) {
3265 if (!MRI.getRegClassOrNull(DstReg))
3266 MRI.setRegClass(DstReg, &AMDGPU::SReg_64RegClass);
3267 B.buildInstr(AMDGPU::S_MOV_B64)
3268 .addDef(DstReg)
3269 .addGlobalAddress(GV, 0, SIInstrInfo::MO_ABS64);
3270 return;
3271 }
3272
3273 LLT S32 = LLT::scalar(32);
3274
3275 // Use the destination directly, if and only if we store the lower address
3276 // part only and we don't have a register class being set.
3277 Register AddrLo = !RequiresHighHalf && !MRI.getRegClassOrNull(DstReg)
3278 ? DstReg
3280
3281 if (!MRI.getRegClassOrNull(AddrLo))
3282 MRI.setRegClass(AddrLo, &AMDGPU::SReg_32RegClass);
3283
3284 // Write the lower half.
3285 B.buildInstr(AMDGPU::S_MOV_B32)
3286 .addDef(AddrLo)
3287 .addGlobalAddress(GV, 0, SIInstrInfo::MO_ABS32_LO);
3288
3289 // If required, write the upper half as well.
3290 if (RequiresHighHalf) {
3291 assert(PtrTy.getSizeInBits() == 64 &&
3292 "Must provide a 64-bit pointer type!");
3293
3295 MRI.setRegClass(AddrHi, &AMDGPU::SReg_32RegClass);
3296
3297 B.buildInstr(AMDGPU::S_MOV_B32)
3298 .addDef(AddrHi)
3299 .addGlobalAddress(GV, 0, SIInstrInfo::MO_ABS32_HI);
3300
3301 // Use the destination directly, if and only if we don't have a register
3302 // class being set.
3303 Register AddrDst = !MRI.getRegClassOrNull(DstReg)
3304 ? DstReg
3306
3307 if (!MRI.getRegClassOrNull(AddrDst))
3308 MRI.setRegClass(AddrDst, &AMDGPU::SReg_64RegClass);
3309
3310 B.buildMergeValues(AddrDst, {AddrLo, AddrHi});
3311
3312 // If we created a new register for the destination, cast the result into
3313 // the final output.
3314 if (AddrDst != DstReg)
3315 B.buildCast(DstReg, AddrDst);
3316 } else if (AddrLo != DstReg) {
3317 // If we created a new register for the destination, cast the result into
3318 // the final output.
3319 B.buildCast(DstReg, AddrLo);
3320 }
3321}
3322
3325 MachineIRBuilder &B) const {
     // Custom legalization of a global-address instruction (operand 1 is the
     // GlobalValue). The visible paths are: LDS-style globals (constant offset,
     // dynamic shared memory, or trap for non-kernel users), absolute
     // addresses on AMDPAL / Mesa3D, pc-relative fixups and relocations, and
     // finally a load of the address through the GOT.
     // NOTE(review): several lines of this function (including the condition
     // that opens the first branch) are not visible in this view.
3326 Register DstReg = MI.getOperand(0).getReg();
3327 LLT Ty = MRI.getType(DstReg);
3328 unsigned AS = Ty.getAddressSpace();
3329
3330 const GlobalValue *GV = MI.getOperand(1).getGlobal();
3331 MachineFunction &MF = B.getMF();
3333
3335 if (!MFI->isModuleEntryFunction() &&
3336 GV->getName() != "llvm.amdgcn.module.lds" &&
3338 const Function &Fn = MF.getFunction();
3340 Fn, "local memory global used by non-kernel function",
3341 MI.getDebugLoc(), DS_Warning));
3342
3343 // We currently don't have a way to correctly allocate LDS objects that
3344 // aren't directly associated with a kernel. We do force inlining of
3345 // functions that use local objects. However, if these dead functions are
3346 // not eliminated, we don't want a compile time error. Just emit a warning
3347 // and a trap, since there should be no callable path here.
3348 B.buildTrap();
3349 B.buildUndef(DstReg);
3350 MI.eraseFromParent();
3351 return true;
3352 }
3353
3354 // TODO: We could emit code to handle the initialization somewhere.
3355 // We ignore the initializer for now and legalize it to allow selection.
3356 // The initializer will anyway get errored out during assembly emission.
3357 const SITargetLowering *TLI = ST.getTargetLowering();
3358 if (!TLI->shouldUseLDSConstAddress(GV)) {
3359 MI.getOperand(1).setTargetFlags(SIInstrInfo::MO_ABS32_LO);
3360 return true; // Leave in place;
3361 }
3362
3363 const GlobalVariable &GVar = *cast<GlobalVariable>(GV);
3364 if (AS == AMDGPUAS::LOCAL_ADDRESS && GV->hasExternalLinkage()) {
3365 // HIP uses an unsized array `extern __shared__ T s[]` or similar
3366 // zero-sized type in other languages to declare the dynamic shared
3367 // memory whose size is not known at compile time. They will be
3368 // allocated by the runtime and placed directly after the statically
3369 // allocated ones. They all share the same offset.
3370 if (GVar.getGlobalSize(GVar.getDataLayout()) == 0) {
3371 // Adjust alignment for that dynamic shared memory array.
3372 MFI->setDynLDSAlign(MF.getFunction(), GVar);
3373 LLT S32 = LLT::scalar(32);
3374 auto Sz = B.buildIntrinsic(Intrinsic::amdgcn_groupstaticsize, {S32});
3375 B.buildIntToPtr(DstReg, Sz);
3376 MI.eraseFromParent();
3377 return true;
3378 }
3379 }
3380
     // Otherwise the address is the constant returned by allocateLDSGlobal.
3381 B.buildConstant(DstReg, MFI->allocateLDSGlobal(B.getDataLayout(), GVar));
3382 MI.eraseFromParent();
3383 return true;
3384 }
3385
3386 if (ST.isAmdPalOS() || ST.isMesa3DOS()) {
3387 buildAbsGlobalAddress(DstReg, Ty, B, GV, MRI);
3388 MI.eraseFromParent();
3389 return true;
3390 }
3391
3392 const SITargetLowering *TLI = ST.getTargetLowering();
3393
3394 if (TLI->shouldEmitFixup(GV)) {
3395 buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0);
3396 MI.eraseFromParent();
3397 return true;
3398 }
3399
3400 if (TLI->shouldEmitPCReloc(GV)) {
3401 buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32);
3402 MI.eraseFromParent();
3403 return true;
3404 }
3405
     // Remaining case: compute the pc-relative address of the GOT entry and
     // load the global's address from it.
3407 Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy);
3408
3409 LLT LoadTy = Ty.getSizeInBits() == 32 ? PtrTy : Ty;
3414 LoadTy, Align(8));
3415
3416 buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32);
3417
3418 if (Ty.getSizeInBits() == 32) {
3419 // Truncate if this is a 32-bit constant address.
3420 auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO);
3421 B.buildExtract(DstReg, Load, 0);
3422 } else
3423 B.buildLoad(DstReg, GOTAddr, *GOTMMO);
3424
3425 MI.eraseFromParent();
3426 return true;
3427}
3428
     // Vectors keep their element type and get their element count rounded up
     // to a power of two; any other type becomes a scalar whose bit width is
     // rounded up to a power of two.
3430 if (Ty.isVector())
3431 return Ty.changeElementCount(
3432 ElementCount::getFixed(PowerOf2Ceil(Ty.getNumElements())));
3433 return LLT::scalar(PowerOf2Ceil(Ty.getSizeInBits()));
3434}
3435
3437 MachineInstr &MI) const {
     // Custom legalization for memory instructions whose pointer is operand 1.
     // Pointers in the 32-bit constant address space are address-space cast
     // first (for any opcode). For G_LOAD only, the buffer-resource workaround
     // is applied or a non-power-of-2 load is widened. Returns false when
     // nothing here applies.
3438 MachineIRBuilder &B = Helper.MIRBuilder;
3439 MachineRegisterInfo &MRI = *B.getMRI();
3440 GISelChangeObserver &Observer = Helper.Observer;
3441
3442 Register PtrReg = MI.getOperand(1).getReg();
3443 LLT PtrTy = MRI.getType(PtrReg);
3444 unsigned AddrSpace = PtrTy.getAddressSpace();
3445
3446 if (AddrSpace == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
     // Rewrite the pointer operand in place to the cast result.
3448 auto Cast = B.buildAddrSpaceCast(ConstPtr, PtrReg);
3449 Observer.changingInstr(MI);
3450 MI.getOperand(1).setReg(Cast.getReg(0));
3451 Observer.changedInstr(MI);
3452 return true;
3453 }
3454
3455 if (MI.getOpcode() != AMDGPU::G_LOAD)
3456 return false;
3457
3458 Register ValReg = MI.getOperand(0).getReg();
3459 LLT ValTy = MRI.getType(ValReg);
3460
3461 if (hasBufferRsrcWorkaround(ValTy)) {
3462 Observer.changingInstr(MI);
3463 castBufferRsrcFromV4I32(MI, B, MRI, 0);
3464 Observer.changedInstr(MI);
3465 return true;
3466 }
3467
3468 MachineMemOperand *MMO = *MI.memoperands_begin();
3469 const unsigned ValSize = ValTy.getSizeInBits();
3470 const LLT MemTy = MMO->getMemoryType();
3471 const Align MemAlign = MMO->getAlign();
3472 const unsigned MemSize = MemTy.getSizeInBits();
3473 const uint64_t AlignInBits = 8 * MemAlign.value();
3474
3475 // Widen non-power-of-2 loads to the alignment if needed
3476 if (shouldWidenLoad(ST, MemTy, AlignInBits, AddrSpace, MI.getOpcode())) {
3477 const unsigned WideMemSize = PowerOf2Ceil(MemSize);
3478
3479 // This was already the correct extending load result type, so just adjust
3480 // the memory type.
3481 if (WideMemSize == ValSize) {
3482 MachineFunction &MF = B.getMF();
3483
3484 MachineMemOperand *WideMMO =
3485 MF.getMachineMemOperand(MMO, 0, WideMemSize / 8);
3486 Observer.changingInstr(MI);
3487 MI.setMemRefs(MF, {WideMMO});
3488 Observer.changedInstr(MI);
3489 return true;
3490 }
3491
3492 // Don't bother handling edge case that should probably never be produced.
3493 if (ValSize > WideMemSize)
3494 return false;
3495
3496 LLT WideTy = widenToNextPowerOf2(ValTy);
3497
     // Load the widened type, then narrow back to the original value type:
     // trunc for scalars, extract or trailing-element removal for vectors.
3498 Register WideLoad;
3499 if (!WideTy.isVector()) {
3500 WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
3501 B.buildTrunc(ValReg, WideLoad).getReg(0);
3502 } else {
3503 // Extract the subvector.
3504
3505 if (isRegisterType(ST, ValTy)) {
3506 // If this is a case where G_EXTRACT is legal, use it.
3507 // (e.g. <3 x s32> -> <4 x s32>)
3508 WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
3509 B.buildExtract(ValReg, WideLoad, 0);
3510 } else {
3511 // For cases where the widened type isn't a nice register value, unmerge
3512 // from a widened register (e.g. <3 x s16> -> <4 x s16>)
3513 WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
3514 B.buildDeleteTrailingVectorElements(ValReg, WideLoad);
3515 }
3516 }
3517
3518 MI.eraseFromParent();
3519 return true;
3520 }
3521
3522 return false;
3523}
3524
3526 MachineInstr &MI) const {
     // Custom legalization for a store whose data is operand 0: only data
     // types that need the buffer-resource workaround are handled (returns
     // true); every other type returns false.
     // NOTE(review): the statement between the observer calls is not visible
     // in this view.
3527 MachineIRBuilder &B = Helper.MIRBuilder;
3528 MachineRegisterInfo &MRI = *B.getMRI();
3529 GISelChangeObserver &Observer = Helper.Observer;
3530
3531 Register DataReg = MI.getOperand(0).getReg();
3532 LLT DataTy = MRI.getType(DataReg);
3533
3534 if (hasBufferRsrcWorkaround(DataTy)) {
3535 Observer.changingInstr(MI);
3537 Observer.changedInstr(MI);
3538 return true;
3539 }
3540 return false;
3541}
3542
3545 MachineIRBuilder &B) const {
     // Keeps a scalar 32- or 16-bit MI as-is when a per-type condition holds,
     // and otherwise expands it with LegalizerHelper::lowerFMad.
     // NOTE(review): the second half of each per-type condition is not visible
     // in this view; confirm what is being tested.
3546 LLT Ty = MRI.getType(MI.getOperand(0).getReg());
3547 assert(Ty.isScalar());
3548
3549 MachineFunction &MF = B.getMF();
3551
3552 // TODO: Always legal with future ftz flag.
3553 // TODO: Type is expected to be LLT::float32()/LLT::float16()
3554 // FIXME: Do we need just output?
3555 if (Ty == LLT::scalar(32) &&
3557 return true;
3558 if (Ty == LLT::scalar(16) &&
3560 return true;
3561
     // The generic lowering runs through a locally constructed helper with its
     // own builder positioned at MI and a dummy observer.
3562 MachineIRBuilder HelperBuilder(MI);
3563 GISelObserverWrapper DummyObserver;
3564 LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
3565 return Helper.lowerFMad(MI) == LegalizerHelper::Legalized;
3566}
3567
     // Rewrites a compare-and-swap (operands: result, pointer, compare value,
     // new value) into G_AMDGPU_ATOMIC_CMPXCHG, which takes the new value and
     // the compare value packed into one 2-element vector.
3570 Register DstReg = MI.getOperand(0).getReg();
3571 Register PtrReg = MI.getOperand(1).getReg();
3572 Register CmpVal = MI.getOperand(2).getReg();
3573 Register NewVal = MI.getOperand(3).getReg();
3574
3576 "this should not have been custom lowered");
3577
3578 LLT ValTy = MRI.getType(CmpVal);
3579 LLT VecTy = LLT::fixed_vector(2, ValTy);
3580
     // Element 0 is the new value, element 1 the expected (compare) value.
3581 Register PackedVal = B.buildBuildVector(VecTy, { NewVal, CmpVal }).getReg(0);
3582
     // The original memory operands are carried over to the new instruction.
3583 B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG)
3584 .addDef(DstReg)
3585 .addUse(PtrReg)
3586 .addUse(PackedVal)
3587 .setMemRefs(MI.memoperands());
3588
3589 MI.eraseFromParent();
3590 return true;
3591}
3592
3593/// Return true if it's known that \p Src can never be an f32 denormal value.
     /// The decision is based only on the opcode (and, for intrinsics, the
     /// intrinsic ID) of the instruction that defines \p Src.
3595 Register Src) {
3596 const MachineInstr *DefMI = MRI.getVRegDef(Src);
3597 switch (DefMI->getOpcode()) {
3598 case TargetOpcode::G_INTRINSIC: {
3600 case Intrinsic::amdgcn_frexp_mant:
3601 case Intrinsic::amdgcn_log:
3602 case Intrinsic::amdgcn_log_clamp:
3603 case Intrinsic::amdgcn_exp2:
3604 case Intrinsic::amdgcn_sqrt:
3605 return true;
3606 default:
3607 break;
3608 }
3609
3610 break;
3611 }
3612 case TargetOpcode::G_FSQRT:
3613 return true;
3614 case TargetOpcode::G_FFREXP: {
     // Only the first result (operand 0) of G_FFREXP qualifies.
3615 if (DefMI->getOperand(0).getReg() == Src)
3616 return true;
3617 break;
3618 }
3619 case TargetOpcode::G_FPEXT: {
     // An extension qualifies only when its source is 16 bits wide.
3620 return MRI.getType(DefMI->getOperand(1).getReg()) == LLT::scalar(16);
3621 }
3622 default:
3623 return false;
3624 }
3625
3626 return false;
3627}
3628
     /// Returns true when \p Flags contains MachineInstr::FmAfn. \p MF is not
     /// used by the current implementation.
3629static bool allowApproxFunc(const MachineFunction &MF, unsigned Flags) {
3630 return Flags & MachineInstr::FmAfn;
3631}
3632
3634 unsigned Flags) {
     // Never true when \p Src is known not to be an f32 denormal.
     // NOTE(review): the remaining conjuncts of this condition are not visible
     // in this view; confirm what else is required.
3635 return !valueIsKnownNeverF32Denorm(MF.getRegInfo(), Src) &&
3638}
3639
     /// Returns an empty pair when \p Src needs no denormal handling. Otherwise
     /// returns {Src * (Src < SmallestNormal ? 2^32 : 1.0), the i1 result of
     /// that comparison} so callers can undo the scaling on the log result.
3640std::pair<Register, Register>
3642 unsigned Flags) const {
3643 if (!needsDenormHandlingF32(B.getMF(), Src, Flags))
3644 return {};
3645
3646 const LLT F32 = LLT::scalar(32);
3647 auto SmallestNormal = B.buildFConstant(
3649 auto IsLtSmallestNormal =
3650 B.buildFCmp(CmpInst::FCMP_OLT, LLT::scalar(1), Src, SmallestNormal);
3651
     // Multiply by 2^32 only when the input is below the smallest normal;
     // otherwise multiply by 1.0 so the value is unchanged.
3652 auto Scale32 = B.buildFConstant(F32, 0x1.0p+32);
3653 auto One = B.buildFConstant(F32, 1.0);
3654 auto ScaleFactor =
3655 B.buildSelect(F32, IsLtSmallestNormal, Scale32, One, Flags);
3656 auto ScaledInput = B.buildFMul(F32, Src, ScaleFactor, Flags);
3657
3658 return {ScaledInput.getReg(0), IsLtSmallestNormal.getReg(0)};
3659}
3660
3662 MachineIRBuilder &B) const {
     // Lowers a base-2 logarithm (operand 1 -> operand 0) to the amdgcn_log
     // intrinsic for 16- and 32-bit types.
3663 // v_log_f32 is good enough for OpenCL, except it doesn't handle denormals.
3664 // If we have to handle denormals, scale up the input and adjust the result.
3665
3666 // scaled = x * (is_denormal ? 0x1.0p+32 : 1.0)
3667 // log2 = amdgpu_log2 - (is_denormal ? 32.0 : 0.0)
3668
3669 Register Dst = MI.getOperand(0).getReg();
3670 Register Src = MI.getOperand(1).getReg();
3671 LLT Ty = B.getMRI()->getType(Dst);
3672 unsigned Flags = MI.getFlags();
3673
3674 if (Ty == LLT::scalar(16)) {
     // 16-bit path: extend to 32 bits, take the log there, truncate back.
3675 const LLT F32 = LLT::scalar(32);
3676 // Nothing in half is a denormal when promoted to f32.
3677 auto Ext = B.buildFPExt(F32, Src, Flags);
3678 auto Log2 = B.buildIntrinsic(Intrinsic::amdgcn_log, {F32})
3679 .addUse(Ext.getReg(0))
3680 .setMIFlags(Flags);
3681 B.buildFPTrunc(Dst, Log2, Flags);
3682 MI.eraseFromParent();
3683 return true;
3684 }
3685
3686 assert(Ty == LLT::scalar(32));
3687
3688 auto [ScaledInput, IsLtSmallestNormal] = getScaledLogInput(B, Src, Flags);
3689 if (!ScaledInput) {
     // No denormal handling required: use the intrinsic on the raw input.
3690 B.buildIntrinsic(Intrinsic::amdgcn_log, {MI.getOperand(0)})
3691 .addUse(Src)
3692 .setMIFlags(Flags);
3693 MI.eraseFromParent();
3694 return true;
3695 }
3696
3697 auto Log2 = B.buildIntrinsic(Intrinsic::amdgcn_log, {Ty})
3698 .addUse(ScaledInput)
3699 .setMIFlags(Flags);
3700
     // Scaling the input by 2^32 adds 32 to its log2, so subtract 32 again
     // whenever the input was actually scaled.
3701 auto ThirtyTwo = B.buildFConstant(Ty, 32.0);
3702 auto Zero = B.buildFConstant(Ty, 0.0);
3703 auto ResultOffset =
3704 B.buildSelect(Ty, IsLtSmallestNormal, ThirtyTwo, Zero, Flags);
3705 B.buildFSub(Dst, Log2, ResultOffset, Flags);
3706
3707 MI.eraseFromParent();
3708 return true;
3709}
3710
3712 Register Z, unsigned Flags) {
     // Builds X * Y + Z as a separate multiply and add (no FMA instruction is
     // emitted here) and returns the register holding the sum.
3713 auto FMul = B.buildFMul(Ty, X, Y, Flags);
3714 return B.buildFAdd(Ty, FMul, Z, Flags).getReg(0);
3715}
3716
3718 MachineIRBuilder &B) const {
     // Lowers G_FLOG / G_FLOG10 (operand 1 -> operand 0). 16-bit inputs and
     // instructions carrying the afn flag take the fast path through
     // legalizeFlogUnsafe; everything else computes log2 with amdgcn_log and
     // multiplies by a constant split into two parts for extra precision.
3719 const bool IsLog10 = MI.getOpcode() == TargetOpcode::G_FLOG10;
3720 assert(IsLog10 || MI.getOpcode() == TargetOpcode::G_FLOG);
3721
3722 MachineRegisterInfo &MRI = *B.getMRI();
3723 Register Dst = MI.getOperand(0).getReg();
3724 Register X = MI.getOperand(1).getReg();
3725 unsigned Flags = MI.getFlags();
3726 const LLT Ty = MRI.getType(X);
3727
3728 const LLT F32 = LLT::scalar(32);
3729 const LLT F16 = LLT::scalar(16);
3730
3731 if (Ty == F16 || MI.getFlag(MachineInstr::FmAfn)) {
3732 // TODO: The direct f16 path is 1.79 ulp for f16. This should be used
3733 // depending on !fpmath metadata.
3734 bool PromoteToF32 =
3735 Ty == F16 && (!MI.getFlag(MachineInstr::FmAfn) || !ST.has16BitInsts());
3736 if (PromoteToF32) {
     // Compute in 32 bits and truncate the result back to 16 bits.
3738 auto PromoteSrc = B.buildFPExt(F32, X);
3739 legalizeFlogUnsafe(B, LogVal, PromoteSrc.getReg(0), IsLog10, Flags);
3740 B.buildFPTrunc(Dst, LogVal);
3741 } else {
3742 legalizeFlogUnsafe(B, Dst, X, IsLog10, Flags);
3743 }
3744
3745 MI.eraseFromParent();
3746 return true;
3747 }
3748
     // Pre-scale inputs that may be denormal; IsScaled records whether the
     // scaling actually applied so the result can be corrected at the end.
3749 auto [ScaledInput, IsScaled] = getScaledLogInput(B, X, Flags);
3750 if (ScaledInput)
3751 X = ScaledInput;
3752
3753 auto Y =
3754 B.buildIntrinsic(Intrinsic::amdgcn_log, {Ty}).addUse(X).setMIFlags(Flags);
3755
3756 Register R;
3757 if (ST.hasFastFMAF32()) {
3758 // c+cc are ln(2)/ln(10) to more than 49 bits
3759 const float c_log10 = 0x1.344134p-2f;
3760 const float cc_log10 = 0x1.09f79ep-26f;
3761
3762 // c + cc is ln(2) to more than 49 bits
3763 const float c_log = 0x1.62e42ep-1f;
3764 const float cc_log = 0x1.efa39ep-25f;
3765
3766 auto C = B.buildFConstant(Ty, IsLog10 ? c_log10 : c_log);
3767 auto CC = B.buildFConstant(Ty, IsLog10 ? cc_log10 : cc_log);
3768 // This adds correction terms for which contraction may lead to an increase
3769 // in the error of the approximation, so disable it.
3770 auto NewFlags = Flags & ~(MachineInstr::FmContract);
     // R = Y*C; the FMAs recover the rounding error of that product
     // (fma(Y, C, -R)) and add the Y*CC tail before folding both into R.
3771 R = B.buildFMul(Ty, Y, C, NewFlags).getReg(0);
3772 auto NegR = B.buildFNeg(Ty, R, NewFlags);
3773 auto FMA0 = B.buildFMA(Ty, Y, C, NegR, NewFlags);
3774 auto FMA1 = B.buildFMA(Ty, Y, CC, FMA0, NewFlags);
3775 R = B.buildFAdd(Ty, R, FMA1, NewFlags).getReg(0);
3776 } else {
3777 // ch+ct is ln(2)/ln(10) to more than 36 bits
3778 const float ch_log10 = 0x1.344000p-2f;
3779 const float ct_log10 = 0x1.3509f6p-18f;
3780
3781 // ch + ct is ln(2) to more than 36 bits
3782 const float ch_log = 0x1.62e000p-1f;
3783 const float ct_log = 0x1.0bfbe8p-15f;
3784
3785 auto CH = B.buildFConstant(Ty, IsLog10 ? ch_log10 : ch_log);
3786 auto CT = B.buildFConstant(Ty, IsLog10 ? ct_log10 : ct_log);
3787
     // Split Y into a head YH (low 12 bits cleared by the mask) and a tail
     // YT = Y - YH, then sum the partial products smallest-first.
3788 auto MaskConst = B.buildConstant(Ty, 0xfffff000);
3789 auto YH = B.buildAnd(Ty, Y, MaskConst);
3790 auto YT = B.buildFSub(Ty, Y, YH, Flags);
3791 // This adds correction terms for which contraction may lead to an increase
3792 // in the error of the approximation, so disable it.
3793 auto NewFlags = Flags & ~(MachineInstr::FmContract);
3794 auto YTCT = B.buildFMul(Ty, YT, CT, NewFlags);
3795
3796 Register Mad0 =
3797 getMad(B, Ty, YH.getReg(0), CT.getReg(0), YTCT.getReg(0), NewFlags);
3798 Register Mad1 = getMad(B, Ty, YT.getReg(0), CH.getReg(0), Mad0, NewFlags);
3799 R = getMad(B, Ty, YH.getReg(0), CH.getReg(0), Mad1, NewFlags);
3800 }
3801
3802 const bool IsFiniteOnly =
3804
3805 if (!IsFiniteOnly) {
3806 // Expand isfinite(x) => fabs(x) < inf
     // When Y is not finite, Y itself is selected as the result instead of R.
3807 auto Inf = B.buildFConstant(Ty, APFloat::getInf(APFloat::IEEEsingle()));
3808 auto Fabs = B.buildFAbs(Ty, Y);
3809 auto IsFinite =
3810 B.buildFCmp(CmpInst::FCMP_OLT, LLT::scalar(1), Fabs, Inf, Flags);
3811 R = B.buildSelect(Ty, IsFinite, R, Y, Flags).getReg(0);
3812 }
3813
3814 if (ScaledInput) {
     // Undo the 2^32 input scaling: the constants are approximately
     // 32 * log10(2) and 32 * ln(2) respectively.
3815 auto Zero = B.buildFConstant(Ty, 0.0);
3816 auto ShiftK =
3817 B.buildFConstant(Ty, IsLog10 ? 0x1.344136p+3f : 0x1.62e430p+4f);
3818 auto Shift = B.buildSelect(Ty, IsScaled, ShiftK, Zero, Flags);
3819 B.buildFSub(Dst, R, Shift, Flags);
3820 } else {
3821 B.buildCopy(Dst, R);
3822 }
3823
3824 MI.eraseFromParent();
3825 return true;
3826}
3827
3829 Register Src, bool IsLog10,
3830 unsigned Flags) const {
     // Fast (approximate) log / log10: computes log2 of \p Src with the
     // hardware log and multiplies by Log2BaseInverted to change the base,
     // writing the result to \p Dst. Always returns true.
3831 const double Log2BaseInverted =
3833
3834 LLT Ty = B.getMRI()->getType(Dst);
3835
3836 if (Ty == LLT::scalar(32)) {
3837 auto [ScaledInput, IsScaled] = getScaledLogInput(B, Src, Flags);
3838 if (ScaledInput) {
     // The log must be taken of the pre-scaled input, not of the raw Src:
     // the offset selected below removes exactly the 32 * Log2BaseInverted
     // that scaling by 2^32 adds, so it is only correct for ScaledInput.
3839 auto LogSrc = B.buildIntrinsic(Intrinsic::amdgcn_log, {Ty})
3840 .addUse(ScaledInput)
3841 .setMIFlags(Flags);
3842 auto ScaledResultOffset = B.buildFConstant(Ty, -32.0 * Log2BaseInverted);
3843 auto Zero = B.buildFConstant(Ty, 0.0);
3844 auto ResultOffset =
3845 B.buildSelect(Ty, IsScaled, ScaledResultOffset, Zero, Flags);
3846 auto Log2Inv = B.buildFConstant(Ty, Log2BaseInverted);
3847
     // result = log2(scaled) * Log2Inv + offset, fused when FMA is fast.
3848 if (ST.hasFastFMAF32())
3849 B.buildFMA(Dst, LogSrc, Log2Inv, ResultOffset, Flags);
3850 else {
3851 auto Mul = B.buildFMul(Ty, LogSrc, Log2Inv, Flags);
3852 B.buildFAdd(Dst, Mul, ResultOffset, Flags);
3853 }
3854
3855 return true;
3856 }
3857 }
3858
     // No scaling needed: 16-bit types use the generic log2 opcode, other
     // types use the amdgcn_log intrinsic directly.
3859 auto Log2Operand = Ty == LLT::scalar(16)
3860 ? B.buildFLog2(Ty, Src, Flags)
3861 : B.buildIntrinsic(Intrinsic::amdgcn_log, {Ty})
3862 .addUse(Src)
3863 .setMIFlags(Flags);
3864 auto Log2BaseInvertedOperand = B.buildFConstant(Ty, Log2BaseInverted);
3865 B.buildFMul(Dst, Log2Operand, Log2BaseInvertedOperand, Flags);
3866 return true;
3867}
3868
3870 MachineIRBuilder &B) const {
     // Lowers a base-2 exponential (operand 1 -> operand 0). 64-bit types are
     // forwarded to legalizeFEXPF64; 16- and 32-bit types use the amdgcn_exp2
     // intrinsic.
3871 // v_exp_f32 is good enough for OpenCL, except it doesn't handle denormals.
3872 // If we have to handle denormals, scale up the input and adjust the result.
3873
3874 Register Dst = MI.getOperand(0).getReg();
3875 Register Src = MI.getOperand(1).getReg();
3876 unsigned Flags = MI.getFlags();
3877 LLT Ty = B.getMRI()->getType(Dst);
3878 const LLT F16 = LLT::scalar(16);
3879 const LLT F32 = LLT::scalar(32);
3880 const LLT F64 = LLT::scalar(64);
3881
3882 if (Ty == F64)
3883 return legalizeFEXPF64(MI, B);
3884
3885 if (Ty == F16) {
3886 // Nothing in half is a denormal when promoted to f32.
     // (Despite its name, `Log2` below holds the exp2 result.)
3887 auto Ext = B.buildFPExt(F32, Src, Flags);
3888 auto Log2 = B.buildIntrinsic(Intrinsic::amdgcn_exp2, {F32})
3889 .addUse(Ext.getReg(0))
3890 .setMIFlags(Flags);
3891 B.buildFPTrunc(Dst, Log2, Flags);
3892 MI.eraseFromParent();
3893 return true;
3894 }
3895
3896 assert(Ty == F32);
3897
3898 if (!needsDenormHandlingF32(B.getMF(), Src, Flags)) {
3899 B.buildIntrinsic(Intrinsic::amdgcn_exp2, ArrayRef<Register>{Dst})
3900 .addUse(Src)
3901 .setMIFlags(Flags);
3902 MI.eraseFromParent();
3903 return true;
3904 }
3905
3906 // bool needs_scaling = x < -0x1.f80000p+6f;
3907 // v_exp_f32(x + (s ? 0x1.0p+6f : 0.0f)) * (s ? 0x1.0p-64f : 1.0f);
3908
3909 // -nextafter(128.0, -1)
     // NOTE(review): the constant below evaluates to -126.0f, which does not
     // match the nextafter description above; confirm which one is intended.
3910 auto RangeCheckConst = B.buildFConstant(Ty, -0x1.f80000p+6f);
3911 auto NeedsScaling = B.buildFCmp(CmpInst::FCMP_OLT, LLT::scalar(1), Src,
3912 RangeCheckConst, Flags);
3913
     // Add 64 to small inputs before the exp2 ...
3914 auto SixtyFour = B.buildFConstant(Ty, 0x1.0p+6f);
3915 auto Zero = B.buildFConstant(Ty, 0.0);
3916 auto AddOffset = B.buildSelect(F32, NeedsScaling, SixtyFour, Zero, Flags);
3917 auto AddInput = B.buildFAdd(F32, Src, AddOffset, Flags);
3918
3919 auto Exp2 = B.buildIntrinsic(Intrinsic::amdgcn_exp2, {Ty})
3920 .addUse(AddInput.getReg(0))
3921 .setMIFlags(Flags);
3922
     // ... and multiply the result by 2^-64 to compensate.
3923 auto TwoExpNeg64 = B.buildFConstant(Ty, 0x1.0p-64f);
3924 auto One = B.buildFConstant(Ty, 1.0);
3925 auto ResultScale = B.buildSelect(F32, NeedsScaling, TwoExpNeg64, One, Flags);
3926 B.buildFMul(Dst, Exp2, ResultScale, Flags);
3927 MI.eraseFromParent();
3928 return true;
3929}
3930
3932 const SrcOp &Src, unsigned Flags) {
3933 LLT Ty = Dst.getLLTTy(*B.getMRI());
3934
3935 if (Ty == LLT::scalar(32)) {
3936 return B.buildIntrinsic(Intrinsic::amdgcn_exp2, {Dst})
3937 .addUse(Src.getReg())
3938 .setMIFlags(Flags);
3939 }
3940 return B.buildFExp2(Dst, Src, Flags);
3941}
3942
3944 Register Dst, Register X,
3945 unsigned Flags,
3946 bool IsExp10) const {
3947 LLT Ty = B.getMRI()->getType(X);
3948
3949 // exp(x) -> exp2(M_LOG2E_F * x);
3950 // exp10(x) -> exp2(log2(10) * x);
3951 auto Const = B.buildFConstant(Ty, IsExp10 ? 0x1.a934f0p+1f : numbers::log2e);
3952 auto Mul = B.buildFMul(Ty, X, Const, Flags);
3953 buildExp(B, Dst, Mul, Flags);
3954 return true;
3955}
3956
3958 Register X, unsigned Flags) const {
3959 LLT Ty = B.getMRI()->getType(Dst);
3960 LLT F32 = LLT::scalar(32);
3961
3962 if (Ty != F32 || !needsDenormHandlingF32(B.getMF(), X, Flags)) {
3963 return legalizeFExpUnsafeImpl(B, Dst, X, Flags, /*IsExp10=*/false);
3964 }
3965
3966 auto Threshold = B.buildFConstant(Ty, -0x1.5d58a0p+6f);
3967 auto NeedsScaling =
3968 B.buildFCmp(CmpInst::FCMP_OLT, LLT::scalar(1), X, Threshold, Flags);
3969 auto ScaleOffset = B.buildFConstant(Ty, 0x1.0p+6f);
3970 auto ScaledX = B.buildFAdd(Ty, X, ScaleOffset, Flags);
3971 auto AdjustedX = B.buildSelect(Ty, NeedsScaling, ScaledX, X, Flags);
3972
3973 auto Log2E = B.buildFConstant(Ty, numbers::log2e);
3974 auto ExpInput = B.buildFMul(Ty, AdjustedX, Log2E, Flags);
3975
3976 auto Exp2 = B.buildIntrinsic(Intrinsic::amdgcn_exp2, {Ty})
3977 .addUse(ExpInput.getReg(0))
3978 .setMIFlags(Flags);
3979
3980 auto ResultScaleFactor = B.buildFConstant(Ty, 0x1.969d48p-93f);
3981 auto AdjustedResult = B.buildFMul(Ty, Exp2, ResultScaleFactor, Flags);
3982 B.buildSelect(Dst, NeedsScaling, AdjustedResult, Exp2, Flags);
3983 return true;
3984}
3985
3987 Register Dst, Register X,
3988 unsigned Flags) const {
3989 LLT Ty = B.getMRI()->getType(Dst);
3990 LLT F32 = LLT::scalar(32);
3991
3992 if (Ty != F32 || !needsDenormHandlingF32(B.getMF(), X, Flags)) {
3993 // exp2(x * 0x1.a92000p+1f) * exp2(x * 0x1.4f0978p-11f);
3994 auto K0 = B.buildFConstant(Ty, 0x1.a92000p+1f);
3995 auto K1 = B.buildFConstant(Ty, 0x1.4f0978p-11f);
3996
3997 auto Mul1 = B.buildFMul(Ty, X, K1, Flags);
3998 auto Exp2_1 = buildExp(B, Ty, Mul1, Flags);
3999 auto Mul0 = B.buildFMul(Ty, X, K0, Flags);
4000 auto Exp2_0 = buildExp(B, Ty, Mul0, Flags);
4001 B.buildFMul(Dst, Exp2_0, Exp2_1, Flags);
4002 return true;
4003 }
4004
4005 // bool s = x < -0x1.2f7030p+5f;
4006 // x += s ? 0x1.0p+5f : 0.0f;
4007 // exp10 = exp2(x * 0x1.a92000p+1f) *
4008 // exp2(x * 0x1.4f0978p-11f) *
4009 // (s ? 0x1.9f623ep-107f : 1.0f);
4010
4011 auto Threshold = B.buildFConstant(Ty, -0x1.2f7030p+5f);
4012 auto NeedsScaling =
4013 B.buildFCmp(CmpInst::FCMP_OLT, LLT::scalar(1), X, Threshold);
4014
4015 auto ScaleOffset = B.buildFConstant(Ty, 0x1.0p+5f);
4016 auto ScaledX = B.buildFAdd(Ty, X, ScaleOffset, Flags);
4017 auto AdjustedX = B.buildSelect(Ty, NeedsScaling, ScaledX, X);
4018
4019 auto K0 = B.buildFConstant(Ty, 0x1.a92000p+1f);
4020 auto K1 = B.buildFConstant(Ty, 0x1.4f0978p-11f);
4021
4022 auto Mul1 = B.buildFMul(Ty, AdjustedX, K1, Flags);
4023 auto Exp2_1 = buildExp(B, Ty, Mul1, Flags);
4024 auto Mul0 = B.buildFMul(Ty, AdjustedX, K0, Flags);
4025 auto Exp2_0 = buildExp(B, Ty, Mul0, Flags);
4026
4027 auto MulExps = B.buildFMul(Ty, Exp2_0, Exp2_1, Flags);
4028 auto ResultScaleFactor = B.buildFConstant(Ty, 0x1.9f623ep-107f);
4029 auto AdjustedResult = B.buildFMul(Ty, MulExps, ResultScaleFactor, Flags);
4030
4031 B.buildSelect(Dst, NeedsScaling, AdjustedResult, MulExps);
4032 return true;
4033}
4034
4035// This expansion gives a result slightly better than 1ulp.
4037 MachineIRBuilder &B) const {
4038
4039 Register X = MI.getOperand(1).getReg();
4040 LLT S64 = LLT::scalar(64);
4041 LLT S32 = LLT::scalar(32);
4042 LLT S1 = LLT::scalar(1);
4043
4044 // TODO: Check if reassoc is safe. There is an output change in exp2 and
4045 // exp10, which slightly increases ulp.
4046 unsigned Flags = MI.getFlags() & ~MachineInstr::FmReassoc;
4047
4048 Register Dn, F, T;
4049
4050 if (MI.getOpcode() == TargetOpcode::G_FEXP2) {
4051 // Dn = rint(X)
4052 Dn = B.buildFRint(S64, X, Flags).getReg(0);
4053 // F = X - Dn
4054 F = B.buildFSub(S64, X, Dn, Flags).getReg(0);
4055 // T = F*C1 + F*C2
4056 auto C1 = B.buildFConstant(S64, APFloat(0x1.62e42fefa39efp-1));
4057 auto C2 = B.buildFConstant(S64, APFloat(0x1.abc9e3b39803fp-56));
4058 auto Mul2 = B.buildFMul(S64, F, C2, Flags).getReg(0);
4059 T = B.buildFMA(S64, F, C1, Mul2, Flags).getReg(0);
4060
4061 } else if (MI.getOpcode() == TargetOpcode::G_FEXP10) {
4062 auto C1 = B.buildFConstant(S64, APFloat(0x1.a934f0979a371p+1));
4063 auto Mul = B.buildFMul(S64, X, C1, Flags).getReg(0);
4064 Dn = B.buildFRint(S64, Mul, Flags).getReg(0);
4065
4066 auto NegDn = B.buildFNeg(S64, Dn, Flags).getReg(0);
4067 auto C2 = B.buildFConstant(S64, APFloat(-0x1.9dc1da994fd21p-59));
4068 auto C3 = B.buildFConstant(S64, APFloat(0x1.34413509f79ffp-2));
4069 auto Inner = B.buildFMA(S64, NegDn, C3, X, Flags).getReg(0);
4070 F = B.buildFMA(S64, NegDn, C2, Inner, Flags).getReg(0);
4071
4072 auto C4 = B.buildFConstant(S64, APFloat(0x1.26bb1bbb55516p+1));
4073 auto C5 = B.buildFConstant(S64, APFloat(-0x1.f48ad494ea3e9p-53));
4074 auto MulF = B.buildFMul(S64, F, C5, Flags).getReg(0);
4075 T = B.buildFMA(S64, F, C4, MulF, Flags).getReg(0);
4076
4077 } else { // G_FEXP
4078 auto C1 = B.buildFConstant(S64, APFloat(0x1.71547652b82fep+0));
4079 auto Mul = B.buildFMul(S64, X, C1, Flags).getReg(0);
4080 Dn = B.buildFRint(S64, Mul, Flags).getReg(0);
4081
4082 auto NegDn = B.buildFNeg(S64, Dn, Flags).getReg(0);
4083 auto C2 = B.buildFConstant(S64, APFloat(0x1.abc9e3b39803fp-56));
4084 auto C3 = B.buildFConstant(S64, APFloat(0x1.62e42fefa39efp-1));
4085 auto Inner = B.buildFMA(S64, NegDn, C3, X, Flags).getReg(0);
4086 T = B.buildFMA(S64, NegDn, C2, Inner, Flags).getReg(0);
4087 }
4088
4089 // Polynomial chain for P
4090 auto P = B.buildFConstant(S64, 0x1.ade156a5dcb37p-26);
4091 P = B.buildFMA(S64, T, P, B.buildFConstant(S64, 0x1.28af3fca7ab0cp-22),
4092 Flags);
4093 P = B.buildFMA(S64, T, P, B.buildFConstant(S64, 0x1.71dee623fde64p-19),
4094 Flags);
4095 P = B.buildFMA(S64, T, P, B.buildFConstant(S64, 0x1.a01997c89e6b0p-16),
4096 Flags);
4097 P = B.buildFMA(S64, T, P, B.buildFConstant(S64, 0x1.a01a014761f6ep-13),
4098 Flags);
4099 P = B.buildFMA(S64, T, P, B.buildFConstant(S64, 0x1.6c16c1852b7b0p-10),
4100 Flags);
4101 P = B.buildFMA(S64, T, P, B.buildFConstant(S64, 0x1.1111111122322p-7), Flags);
4102 P = B.buildFMA(S64, T, P, B.buildFConstant(S64, 0x1.55555555502a1p-5), Flags);
4103 P = B.buildFMA(S64, T, P, B.buildFConstant(S64, 0x1.5555555555511p-3), Flags);
4104 P = B.buildFMA(S64, T, P, B.buildFConstant(S64, 0x1.000000000000bp-1), Flags);
4105
4106 auto One = B.buildFConstant(S64, 1.0);
4107 P = B.buildFMA(S64, T, P, One, Flags);
4108 P = B.buildFMA(S64, T, P, One, Flags);
4109
4110 // Z = FLDEXP(P, (int)Dn)
4111 auto DnInt = B.buildFPTOSI(S32, Dn);
4112 auto Z = B.buildFLdexp(S64, P, DnInt, Flags);
4113
4114 if (!(Flags & MachineInstr::FmNoInfs)) {
4115 // Overflow guard: if X <= 1024.0 then Z else +inf
4116 auto CondHi = B.buildFCmp(CmpInst::FCMP_ULE, S1, X,
4117 B.buildFConstant(S64, APFloat(1024.0)));
4118 auto PInf = B.buildFConstant(S64, APFloat::getInf(APFloat::IEEEdouble()));
4119 Z = B.buildSelect(S64, CondHi, Z, PInf, Flags);
4120 }
4121
4122 // Underflow guard: if X >= -1075.0 then Z else 0.0
4123 auto CondLo = B.buildFCmp(CmpInst::FCMP_UGE, S1, X,
4124 B.buildFConstant(S64, APFloat(-1075.0)));
4125 auto Zero = B.buildFConstant(S64, APFloat(0.0));
4126 B.buildSelect(MI.getOperand(0).getReg(), CondLo, Z, Zero, Flags);
4127
4128 MI.eraseFromParent();
4129 return true;
4130}
4131
4133 MachineIRBuilder &B) const {
4134 Register Dst = MI.getOperand(0).getReg();
4135 Register X = MI.getOperand(1).getReg();
4136 const unsigned Flags = MI.getFlags();
4137 MachineFunction &MF = B.getMF();
4138 MachineRegisterInfo &MRI = *B.getMRI();
4139 LLT Ty = MRI.getType(Dst);
4140
4141 const LLT F64 = LLT::scalar(64);
4142
4143 if (Ty == F64)
4144 return legalizeFEXPF64(MI, B);
4145
4146 const LLT F16 = LLT::scalar(16);
4147 const LLT F32 = LLT::scalar(32);
4148 const bool IsExp10 = MI.getOpcode() == TargetOpcode::G_FEXP10;
4149
4150 if (Ty == F16) {
4151 // v_exp_f16 (fmul x, log2e)
4152 if (allowApproxFunc(MF, Flags)) {
4153 // TODO: Does this really require fast?
4154 IsExp10 ? legalizeFExp10Unsafe(B, Dst, X, Flags)
4155 : legalizeFExpUnsafe(B, Dst, X, Flags);
4156 MI.eraseFromParent();
4157 return true;
4158 }
4159
4160 // Nothing in half is a denormal when promoted to f32.
4161 //
4162 // exp(f16 x) ->
4163 // fptrunc (v_exp_f32 (fmul (fpext x), log2e))
4164 //
4165 // exp10(f16 x) ->
4166 // fptrunc (v_exp_f32 (fmul (fpext x), log2(10)))
4167 auto Ext = B.buildFPExt(F32, X, Flags);
4169 legalizeFExpUnsafeImpl(B, Lowered, Ext.getReg(0), Flags, IsExp10);
4170 B.buildFPTrunc(Dst, Lowered, Flags);
4171 MI.eraseFromParent();
4172 return true;
4173 }
4174
4175 assert(Ty == F32);
4176
4177 // TODO: Interpret allowApproxFunc as ignoring DAZ. This is currently copying
4178 // library behavior. Also, is known-not-daz source sufficient?
4179 if (allowApproxFunc(MF, Flags)) {
4180 IsExp10 ? legalizeFExp10Unsafe(B, Dst, X, Flags)
4181 : legalizeFExpUnsafe(B, Dst, X, Flags);
4182 MI.eraseFromParent();
4183 return true;
4184 }
4185
4186 // Algorithm:
4187 //
4188 // e^x = 2^(x/ln(2)) = 2^(x*(64/ln(2))/64)
4189 //
4190 // x*(64/ln(2)) = n + f, |f| <= 0.5, n is integer
4191 // n = 64*m + j, 0 <= j < 64
4192 //
4193 // e^x = 2^((64*m + j + f)/64)
4194 // = (2^m) * (2^(j/64)) * 2^(f/64)
4195 // = (2^m) * (2^(j/64)) * e^(f*(ln(2)/64))
4196 //
4197 // f = x*(64/ln(2)) - n
4198 // r = f*(ln(2)/64) = x - n*(ln(2)/64)
4199 //
4200 // e^x = (2^m) * (2^(j/64)) * e^r
4201 //
4202 // (2^(j/64)) is precomputed
4203 //
4204 // e^r = 1 + r + (r^2)/2! + (r^3)/3! + (r^4)/4! + (r^5)/5!
4205 // e^r = 1 + q
4206 //
4207 // q = r + (r^2)/2! + (r^3)/3! + (r^4)/4! + (r^5)/5!
4208 //
4209 // e^x = (2^m) * ( (2^(j/64)) + q*(2^(j/64)) )
4210 const unsigned FlagsNoContract = Flags & ~MachineInstr::FmContract;
4211 Register PH, PL;
4212
4213 if (ST.hasFastFMAF32()) {
4214 const float c_exp = numbers::log2ef;
4215 const float cc_exp = 0x1.4ae0bep-26f; // c+cc are 49 bits
4216 const float c_exp10 = 0x1.a934f0p+1f;
4217 const float cc_exp10 = 0x1.2f346ep-24f;
4218
4219 auto C = B.buildFConstant(Ty, IsExp10 ? c_exp10 : c_exp);
4220 PH = B.buildFMul(Ty, X, C, Flags).getReg(0);
4221 auto NegPH = B.buildFNeg(Ty, PH, Flags);
4222 auto FMA0 = B.buildFMA(Ty, X, C, NegPH, Flags);
4223
4224 auto CC = B.buildFConstant(Ty, IsExp10 ? cc_exp10 : cc_exp);
4225 PL = B.buildFMA(Ty, X, CC, FMA0, Flags).getReg(0);
4226 } else {
4227 const float ch_exp = 0x1.714000p+0f;
4228 const float cl_exp = 0x1.47652ap-12f; // ch + cl are 36 bits
4229
4230 const float ch_exp10 = 0x1.a92000p+1f;
4231 const float cl_exp10 = 0x1.4f0978p-11f;
4232
4233 auto MaskConst = B.buildConstant(Ty, 0xfffff000);
4234 auto XH = B.buildAnd(Ty, X, MaskConst);
4235 auto XL = B.buildFSub(Ty, X, XH, Flags);
4236
4237 auto CH = B.buildFConstant(Ty, IsExp10 ? ch_exp10 : ch_exp);
4238 PH = B.buildFMul(Ty, XH, CH, Flags).getReg(0);
4239
4240 auto CL = B.buildFConstant(Ty, IsExp10 ? cl_exp10 : cl_exp);
4241 auto XLCL = B.buildFMul(Ty, XL, CL, Flags);
4242
4243 Register Mad0 =
4244 getMad(B, Ty, XL.getReg(0), CH.getReg(0), XLCL.getReg(0), Flags);
4245 PL = getMad(B, Ty, XH.getReg(0), CL.getReg(0), Mad0, Flags);
4246 }
4247
4248 auto E = B.buildIntrinsicRoundeven(Ty, PH, Flags);
4249
4250 // It is unsafe to contract this fsub into the PH multiply.
4251 auto PHSubE = B.buildFSub(Ty, PH, E, FlagsNoContract);
4252 auto A = B.buildFAdd(Ty, PHSubE, PL, Flags);
4253 auto IntE = B.buildFPTOSI(LLT::scalar(32), E);
4254
4255 auto Exp2 = B.buildIntrinsic(Intrinsic::amdgcn_exp2, {Ty})
4256 .addUse(A.getReg(0))
4257 .setMIFlags(Flags);
4258 auto R = B.buildFLdexp(Ty, Exp2, IntE, Flags);
4259
4260 auto UnderflowCheckConst =
4261 B.buildFConstant(Ty, IsExp10 ? -0x1.66d3e8p+5f : -0x1.9d1da0p+6f);
4262 auto Zero = B.buildFConstant(Ty, 0.0);
4263 auto Underflow =
4264 B.buildFCmp(CmpInst::FCMP_OLT, LLT::scalar(1), X, UnderflowCheckConst);
4265
4266 R = B.buildSelect(Ty, Underflow, Zero, R);
4267
4268 if (!(Flags & MachineInstr::FmNoInfs)) {
4269 auto OverflowCheckConst =
4270 B.buildFConstant(Ty, IsExp10 ? 0x1.344136p+5f : 0x1.62e430p+6f);
4271
4272 auto Overflow =
4273 B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), X, OverflowCheckConst);
4274 auto Inf = B.buildFConstant(Ty, APFloat::getInf(APFloat::IEEEsingle()));
4275 R = B.buildSelect(Ty, Overflow, Inf, R, Flags);
4276 }
4277
4278 B.buildCopy(Dst, R);
4279 MI.eraseFromParent();
4280 return true;
4281}
4282
4284 MachineIRBuilder &B) const {
4285 Register Dst = MI.getOperand(0).getReg();
4286 Register Src0 = MI.getOperand(1).getReg();
4287 Register Src1 = MI.getOperand(2).getReg();
4288 unsigned Flags = MI.getFlags();
4289 LLT Ty = B.getMRI()->getType(Dst);
4290 const LLT F16 = LLT::scalar(16); // TODO: Expected LLT::float16()
4291 const LLT F32 = LLT::scalar(32); // TODO: Expected LLT::float32()
4292
4293 if (Ty == F32) {
4294 auto Log = B.buildFLog2(F32, Src0, Flags);
4295 auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {F32})
4296 .addUse(Log.getReg(0))
4297 .addUse(Src1)
4298 .setMIFlags(Flags);
4299 B.buildFExp2(Dst, Mul, Flags);
4300 } else if (Ty == F16) {
4301 // There's no f16 fmul_legacy, so we need to convert for it.
4302 auto Log = B.buildFLog2(F16, Src0, Flags);
4303 auto Ext0 = B.buildFPExt(F32, Log, Flags);
4304 auto Ext1 = B.buildFPExt(F32, Src1, Flags);
4305 auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {F32})
4306 .addUse(Ext0.getReg(0))
4307 .addUse(Ext1.getReg(0))
4308 .setMIFlags(Flags);
4309 B.buildFExp2(Dst, B.buildFPTrunc(F16, Mul), Flags);
4310 } else
4311 return false;
4312
4313 MI.eraseFromParent();
4314 return true;
4315}
4316
4317// Find a source register, ignoring any possible source modifiers.
4319 Register ModSrc = OrigSrc;
4320 if (MachineInstr *SrcFNeg = getOpcodeDef(AMDGPU::G_FNEG, ModSrc, MRI)) {
4321 ModSrc = SrcFNeg->getOperand(1).getReg();
4322 if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
4323 ModSrc = SrcFAbs->getOperand(1).getReg();
4324 } else if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
4325 ModSrc = SrcFAbs->getOperand(1).getReg();
4326 return ModSrc;
4327}
4328
4331 MachineIRBuilder &B) const {
4332
4333 const LLT S1 = LLT::scalar(1);
4334 const LLT F64 = LLT::scalar(64); // TODO: Expected float64
4335 Register Dst = MI.getOperand(0).getReg();
4336 Register OrigSrc = MI.getOperand(1).getReg();
4337 unsigned Flags = MI.getFlags();
4338 assert(ST.hasFractBug() && MRI.getType(Dst) == F64 &&
4339 "this should not have been custom lowered");
4340
4341 // V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x))
4342 // is used instead. However, SI doesn't have V_FLOOR_F64, so the most
4343 // efficient way to implement it is using V_FRACT_F64. The workaround for the
4344 // V_FRACT bug is:
4345 // fract(x) = isnan(x) ? x : min(V_FRACT(x), 0.99999999999999999)
4346 //
4347 // Convert floor(x) to (x - fract(x))
4348
4349 auto Fract = B.buildIntrinsic(Intrinsic::amdgcn_fract, {F64})
4350 .addUse(OrigSrc)
4351 .setMIFlags(Flags);
4352
4353 // Give source modifier matching some assistance before obscuring a foldable
4354 // pattern.
4355
4356 // TODO: We can avoid the neg on the fract? The input sign to fract
4357 // shouldn't matter?
4358 Register ModSrc = stripAnySourceMods(OrigSrc, MRI);
4359
4360 auto Const =
4361 B.buildFConstant(F64, llvm::bit_cast<double>(0x3fefffffffffffff));
4362
4364
4365 // We don't need to concern ourselves with the snan handling difference, so
4366 // use the one which will directly select.
4367 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
4368 if (MFI->getMode().IEEE)
4369 B.buildFMinNumIEEE(Min, Fract, Const, Flags);
4370 else
4371 B.buildFMinNum(Min, Fract, Const, Flags);
4372
4373 Register CorrectedFract = Min;
4374 if (!MI.getFlag(MachineInstr::FmNoNans)) {
4375 auto IsNan = B.buildFCmp(CmpInst::FCMP_ORD, S1, ModSrc, ModSrc, Flags);
4376 CorrectedFract = B.buildSelect(F64, IsNan, ModSrc, Min, Flags).getReg(0);
4377 }
4378
4379 auto NegFract = B.buildFNeg(F64, CorrectedFract, Flags);
4380 B.buildFAdd(Dst, OrigSrc, NegFract, Flags);
4381
4382 MI.eraseFromParent();
4383 return true;
4384}
4385
4386// Turn an illegal packed v2s16 build vector into bit operations.
4387// TODO: This should probably be a bitcast action in LegalizerHelper.
4390 Register Dst = MI.getOperand(0).getReg();
4391 const LLT S32 = LLT::scalar(32);
4392 const LLT S16 = LLT::scalar(16);
4393 assert(MRI.getType(Dst) == LLT::fixed_vector(2, 16));
4394
4395 Register Src0 = MI.getOperand(1).getReg();
4396 Register Src1 = MI.getOperand(2).getReg();
4397
4398 if (MI.getOpcode() == AMDGPU::G_BUILD_VECTOR_TRUNC) {
4399 assert(MRI.getType(Src0) == S32);
4400 Src0 = B.buildTrunc(S16, MI.getOperand(1).getReg()).getReg(0);
4401 Src1 = B.buildTrunc(S16, MI.getOperand(2).getReg()).getReg(0);
4402 }
4403
4404 auto Merge = B.buildMergeLikeInstr(S32, {Src0, Src1});
4405 B.buildBitcast(Dst, Merge);
4406
4407 MI.eraseFromParent();
4408 return true;
4409}
4410
4411// Build a big integer multiply or multiply-add using MAD_64_32 instructions.
4412//
4413// Source and accumulation registers must all be 32-bits.
4414//
4415// TODO: When the multiply is uniform, we should produce a code sequence
4416// that is better suited to instruction selection on the SALU. Instead of
4417// the outer loop going over parts of the result, the outer loop should go
4418// over parts of one of the factors. This should result in instruction
4419// selection that makes full use of S_ADDC_U32 instructions.
4422 ArrayRef<Register> Src0,
4423 ArrayRef<Register> Src1,
4424 bool UsePartialMad64_32,
4425 bool SeparateOddAlignedProducts) const {
4426 // Use (possibly empty) vectors of S1 registers to represent the set of
4427 // carries from one pair of positions to the next.
4428 using Carry = SmallVector<Register, 2>;
4429
4430 MachineIRBuilder &B = Helper.MIRBuilder;
4431 GISelValueTracking &VT = *Helper.getValueTracking();
4432
4433 const LLT S1 = LLT::scalar(1);
4434 const LLT S32 = LLT::scalar(32);
4435 const LLT S64 = LLT::scalar(64);
4436
4437 Register Zero32;
4438 Register Zero64;
4439
4440 auto getZero32 = [&]() -> Register {
4441 if (!Zero32)
4442 Zero32 = B.buildConstant(S32, 0).getReg(0);
4443 return Zero32;
4444 };
4445 auto getZero64 = [&]() -> Register {
4446 if (!Zero64)
4447 Zero64 = B.buildConstant(S64, 0).getReg(0);
4448 return Zero64;
4449 };
4450
4451 SmallVector<bool, 2> Src0KnownZeros, Src1KnownZeros;
4452 for (unsigned i = 0; i < Src0.size(); ++i) {
4453 Src0KnownZeros.push_back(VT.getKnownBits(Src0[i]).isZero());
4454 Src1KnownZeros.push_back(VT.getKnownBits(Src1[i]).isZero());
4455 }
4456
4457 // Merge the given carries into the 32-bit LocalAccum, which is modified
4458 // in-place.
4459 //
4460 // Returns the carry-out, which is a single S1 register or null.
4461 auto mergeCarry =
4462 [&](Register &LocalAccum, const Carry &CarryIn) -> Register {
4463 if (CarryIn.empty())
4464 return Register();
4465
4466 bool HaveCarryOut = true;
4467 Register CarryAccum;
4468 if (CarryIn.size() == 1) {
4469 if (!LocalAccum) {
4470 LocalAccum = B.buildZExt(S32, CarryIn[0]).getReg(0);
4471 return Register();
4472 }
4473
4474 CarryAccum = getZero32();
4475 } else {
4476 CarryAccum = B.buildZExt(S32, CarryIn[0]).getReg(0);
4477 for (unsigned i = 1; i + 1 < CarryIn.size(); ++i) {
4478 CarryAccum =
4479 B.buildUAdde(S32, S1, CarryAccum, getZero32(), CarryIn[i])
4480 .getReg(0);
4481 }
4482
4483 if (!LocalAccum) {
4484 LocalAccum = getZero32();
4485 HaveCarryOut = false;
4486 }
4487 }
4488
4489 auto Add =
4490 B.buildUAdde(S32, S1, CarryAccum, LocalAccum, CarryIn.back());
4491 LocalAccum = Add.getReg(0);
4492 return HaveCarryOut ? Add.getReg(1) : Register();
4493 };
4494
4495 // Build a multiply-add chain to compute
4496 //
4497 // LocalAccum + (partial products at DstIndex)
4498 // + (opportunistic subset of CarryIn)
4499 //
4500 // LocalAccum is an array of one or two 32-bit registers that are updated
4501 // in-place. The incoming registers may be null.
4502 //
4503 // In some edge cases, carry-ins can be consumed "for free". In that case,
4504 // the consumed carry bits are removed from CarryIn in-place.
4505 auto buildMadChain =
4506 [&](MutableArrayRef<Register> LocalAccum, unsigned DstIndex, Carry &CarryIn)
4507 -> Carry {
4508 assert((DstIndex + 1 < Accum.size() && LocalAccum.size() == 2) ||
4509 (DstIndex + 1 >= Accum.size() && LocalAccum.size() == 1));
4510
4511 Carry CarryOut;
4512 unsigned j0 = 0;
4513
4514 // Use plain 32-bit multiplication for the most significant part of the
4515 // result by default.
4516 if (LocalAccum.size() == 1 &&
4517 (!UsePartialMad64_32 || !CarryIn.empty())) {
4518 do {
4519 // Skip multiplication if one of the operands is 0
4520 unsigned j1 = DstIndex - j0;
4521 if (Src0KnownZeros[j0] || Src1KnownZeros[j1]) {
4522 ++j0;
4523 continue;
4524 }
4525 auto Mul = B.buildMul(S32, Src0[j0], Src1[j1]);
4526 if (!LocalAccum[0] || VT.getKnownBits(LocalAccum[0]).isZero()) {
4527 LocalAccum[0] = Mul.getReg(0);
4528 } else {
4529 if (CarryIn.empty()) {
4530 LocalAccum[0] = B.buildAdd(S32, LocalAccum[0], Mul).getReg(0);
4531 } else {
4532 LocalAccum[0] =
4533 B.buildUAdde(S32, S1, LocalAccum[0], Mul, CarryIn.back())
4534 .getReg(0);
4535 CarryIn.pop_back();
4536 }
4537 }
4538 ++j0;
4539 } while (j0 <= DstIndex && (!UsePartialMad64_32 || !CarryIn.empty()));
4540 }
4541
4542 // Build full 64-bit multiplies.
4543 if (j0 <= DstIndex) {
4544 bool HaveSmallAccum = false;
4545 Register Tmp;
4546
4547 if (LocalAccum[0]) {
4548 if (LocalAccum.size() == 1) {
4549 Tmp = B.buildAnyExt(S64, LocalAccum[0]).getReg(0);
4550 HaveSmallAccum = true;
4551 } else if (LocalAccum[1]) {
4552 Tmp = B.buildMergeLikeInstr(S64, LocalAccum).getReg(0);
4553 HaveSmallAccum = false;
4554 } else {
4555 Tmp = B.buildZExt(S64, LocalAccum[0]).getReg(0);
4556 HaveSmallAccum = true;
4557 }
4558 } else {
4559 assert(LocalAccum.size() == 1 || !LocalAccum[1]);
4560 Tmp = getZero64();
4561 HaveSmallAccum = true;
4562 }
4563
4564 do {
4565 unsigned j1 = DstIndex - j0;
4566 if (Src0KnownZeros[j0] || Src1KnownZeros[j1]) {
4567 ++j0;
4568 continue;
4569 }
4570 auto Mad = B.buildInstr(AMDGPU::G_AMDGPU_MAD_U64_U32, {S64, S1},
4571 {Src0[j0], Src1[j1], Tmp});
4572 Tmp = Mad.getReg(0);
4573 if (!HaveSmallAccum)
4574 CarryOut.push_back(Mad.getReg(1));
4575 HaveSmallAccum = false;
4576
4577 ++j0;
4578 } while (j0 <= DstIndex);
4579
4580 auto Unmerge = B.buildUnmerge(S32, Tmp);
4581 LocalAccum[0] = Unmerge.getReg(0);
4582 if (LocalAccum.size() > 1)
4583 LocalAccum[1] = Unmerge.getReg(1);
4584 }
4585
4586 return CarryOut;
4587 };
4588
4589 // Outer multiply loop, iterating over destination parts from least
4590 // significant to most significant parts.
4591 //
4592 // The columns of the following diagram correspond to the destination parts
4593 // affected by one iteration of the outer loop (ignoring boundary
4594 // conditions).
4595 //
4596 // Dest index relative to 2 * i: 1 0 -1
4597 // ------
4598 // Carries from previous iteration: e o
4599 // Even-aligned partial product sum: E E .
4600 // Odd-aligned partial product sum: O O
4601 //
4602 // 'o' is OddCarry, 'e' is EvenCarry.
4603 // EE and OO are computed from partial products via buildMadChain and use
4604 // accumulation where possible and appropriate.
4605 //
4606 Register SeparateOddCarry;
4607 Carry EvenCarry;
4608 Carry OddCarry;
4609
4610 for (unsigned i = 0; i <= Accum.size() / 2; ++i) {
4611 Carry OddCarryIn = std::move(OddCarry);
4612 Carry EvenCarryIn = std::move(EvenCarry);
4613 OddCarry.clear();
4614 EvenCarry.clear();
4615
4616 // Partial products at offset 2 * i.
4617 if (2 * i < Accum.size()) {
4618 auto LocalAccum = Accum.drop_front(2 * i).take_front(2);
4619 EvenCarry = buildMadChain(LocalAccum, 2 * i, EvenCarryIn);
4620 }
4621
4622 // Partial products at offset 2 * i - 1.
4623 if (i > 0) {
4624 if (!SeparateOddAlignedProducts) {
4625 auto LocalAccum = Accum.drop_front(2 * i - 1).take_front(2);
4626 OddCarry = buildMadChain(LocalAccum, 2 * i - 1, OddCarryIn);
4627 } else {
4628 bool IsHighest = 2 * i >= Accum.size();
4629 Register SeparateOddOut[2];
4630 auto LocalAccum = MutableArrayRef(SeparateOddOut)
4631 .take_front(IsHighest ? 1 : 2);
4632 OddCarry = buildMadChain(LocalAccum, 2 * i - 1, OddCarryIn);
4633
4635
4636 if (i == 1) {
4637 if (!IsHighest)
4638 Lo = B.buildUAddo(S32, S1, Accum[2 * i - 1], SeparateOddOut[0]);
4639 else
4640 Lo = B.buildAdd(S32, Accum[2 * i - 1], SeparateOddOut[0]);
4641 } else {
4642 Lo = B.buildUAdde(S32, S1, Accum[2 * i - 1], SeparateOddOut[0],
4643 SeparateOddCarry);
4644 }
4645 Accum[2 * i - 1] = Lo->getOperand(0).getReg();
4646
4647 if (!IsHighest) {
4648 auto Hi = B.buildUAdde(S32, S1, Accum[2 * i], SeparateOddOut[1],
4649 Lo->getOperand(1).getReg());
4650 Accum[2 * i] = Hi.getReg(0);
4651 SeparateOddCarry = Hi.getReg(1);
4652 }
4653 }
4654 }
4655
4656 // Add in the carries from the previous iteration
4657 if (i > 0) {
4658 if (Register CarryOut = mergeCarry(Accum[2 * i - 1], OddCarryIn))
4659 EvenCarryIn.push_back(CarryOut);
4660
4661 if (2 * i < Accum.size()) {
4662 if (Register CarryOut = mergeCarry(Accum[2 * i], EvenCarryIn))
4663 OddCarry.push_back(CarryOut);
4664 }
4665 }
4666 }
4667}
4668
4669// Custom narrowing of wide multiplies using wide multiply-add instructions.
4670//
4671// TODO: If the multiply is followed by an addition, we should attempt to
4672// integrate it to make better use of V_MAD_U64_U32's multiply-add capabilities.
4674 MachineInstr &MI) const {
4675 assert(ST.hasMad64_32());
4676 assert(MI.getOpcode() == TargetOpcode::G_MUL);
4677
4678 MachineIRBuilder &B = Helper.MIRBuilder;
4679 MachineRegisterInfo &MRI = *B.getMRI();
4680
4681 Register DstReg = MI.getOperand(0).getReg();
4682 Register Src0 = MI.getOperand(1).getReg();
4683 Register Src1 = MI.getOperand(2).getReg();
4684
4685 LLT Ty = MRI.getType(DstReg);
4686 assert(Ty.isScalar());
4687
4688 unsigned Size = Ty.getSizeInBits();
4689 if (ST.hasVMulU64Inst() && Size == 64)
4690 return true;
4691
4692 unsigned NumParts = Size / 32;
4693 assert((Size % 32) == 0);
4694 assert(NumParts >= 2);
4695
4696 // Whether to use MAD_64_32 for partial products whose high half is
4697 // discarded. This avoids some ADD instructions but risks false dependency
4698 // stalls on some subtargets in some cases.
4699 const bool UsePartialMad64_32 = ST.getGeneration() < AMDGPUSubtarget::GFX10;
4700
4701 // Whether to compute odd-aligned partial products separately. This is
4702 // advisable on subtargets where the accumulator of MAD_64_32 must be placed
4703 // in an even-aligned VGPR.
4704 const bool SeparateOddAlignedProducts = ST.hasFullRate64Ops();
4705
4706 LLT S32 = LLT::scalar(32);
4707 SmallVector<Register, 2> Src0Parts, Src1Parts;
4708 for (unsigned i = 0; i < NumParts; ++i) {
4711 }
4712 B.buildUnmerge(Src0Parts, Src0);
4713 B.buildUnmerge(Src1Parts, Src1);
4714
4715 SmallVector<Register, 2> AccumRegs(NumParts);
4716 buildMultiply(Helper, AccumRegs, Src0Parts, Src1Parts, UsePartialMad64_32,
4717 SeparateOddAlignedProducts);
4718
4719 B.buildMergeLikeInstr(DstReg, AccumRegs);
4720 MI.eraseFromParent();
4721 return true;
4722}
4723
4724// Legalize ctlz/cttz to ffbh/ffbl instead of the default legalization to
4725// ctlz/cttz_zero_poison. This allows us to fix up the result for the zero input
4726// case with a single min instruction instead of a compare+select.
4729 MachineIRBuilder &B) const {
4730 Register Dst = MI.getOperand(0).getReg();
4731 Register Src = MI.getOperand(1).getReg();
4732 LLT DstTy = MRI.getType(Dst);
4733 LLT SrcTy = MRI.getType(Src);
4734
4735 unsigned NewOpc = MI.getOpcode() == AMDGPU::G_CTLZ
4736 ? AMDGPU::G_AMDGPU_FFBH_U32
4737 : AMDGPU::G_AMDGPU_FFBL_B32;
4738 auto Tmp = B.buildInstr(NewOpc, {DstTy}, {Src});
4739 B.buildUMin(Dst, Tmp, B.buildConstant(DstTy, SrcTy.getSizeInBits()));
4740
4741 MI.eraseFromParent();
4742 return true;
4743}
4744
4747 MachineIRBuilder &B) const {
// NOTE(review): the first lines of this signature are missing from this
// listing; the function name and leading parameters must be confirmed against
// the header before relying on this block.
//
// Expands a leading-zero count on a source narrower than 32 bits: the source
// is any-extended to 32 bits, shifted left so its top bit lands in bit 31, and
// counted with G_AMDGPU_FFBH_U32; the count is then truncated into Dst.
4748 Register Dst = MI.getOperand(0).getReg();
4749 Register Src = MI.getOperand(1).getReg();
4750 LLT SrcTy = MRI.getType(Src);
4751 TypeSize NumBits = SrcTy.getSizeInBits();
4752
4753 assert(NumBits < 32u);
4754
// Shift amount that moves the source's most significant bit up to bit 31.
4755 auto ShiftAmt = B.buildConstant(S32, 32u - NumBits);
// The extended high bits are unspecified, but the shift below pushes them out.
4756 auto Extend = B.buildAnyExt(S32, {Src}).getReg(0u);
4757 auto Shift = B.buildShl(S32, Extend, ShiftAmt);
4758 auto Ctlz = B.buildInstr(AMDGPU::G_AMDGPU_FFBH_U32, {S32}, {Shift});
4759 B.buildTrunc(Dst, Ctlz);
4760 MI.eraseFromParent();
4761 return true;
4762}
4763
4766 MachineIRBuilder &B) const {
4767 Register Dst = MI.getOperand(0).getReg();
4768 Register Src = MI.getOperand(1).getReg();
4769 LLT SrcTy = MRI.getType(Src);
4770 const LLT S32 = LLT::scalar(32);
4771 assert(SrcTy == S32 && "legalizeCTLS only supports s32");
4772 unsigned BitWidth = SrcTy.getSizeInBits();
4773
4774 auto Sffbh = B.buildIntrinsic(Intrinsic::amdgcn_sffbh, {S32}).addUse(Src);
4775 auto Clamped = B.buildUMin(S32, Sffbh, B.buildConstant(S32, BitWidth));
4776 B.buildSub(Dst, Clamped, B.buildConstant(S32, 1));
4777 MI.eraseFromParent();
4778 return true;
4779}
4780
4781// Check that this is a G_XOR x, -1
4782static bool isNot(const MachineRegisterInfo &MRI, const MachineInstr &MI) {
4783 if (MI.getOpcode() != TargetOpcode::G_XOR)
4784 return false;
4785 auto ConstVal = getIConstantVRegSExtVal(MI.getOperand(2).getReg(), MRI);
4786 return ConstVal == -1;
4787}
4788
4789// Return the use branch instruction, otherwise null if the usage is invalid.
4790static MachineInstr *
4792 MachineBasicBlock *&UncondBrTarget, bool &Negated) {
4793 Register CondDef = MI.getOperand(0).getReg();
4794 if (!MRI.hasOneNonDBGUse(CondDef))
4795 return nullptr;
4796
4797 MachineBasicBlock *Parent = MI.getParent();
4798 MachineInstr *UseMI = &*MRI.use_instr_nodbg_begin(CondDef);
4799
4800 if (isNot(MRI, *UseMI)) {
4801 Register NegatedCond = UseMI->getOperand(0).getReg();
4802 if (!MRI.hasOneNonDBGUse(NegatedCond))
4803 return nullptr;
4804
4805 // We're deleting the def of this value, so we need to remove it.
4806 eraseInstr(*UseMI, MRI);
4807
4808 UseMI = &*MRI.use_instr_nodbg_begin(NegatedCond);
4809 Negated = true;
4810 }
4811
4812 if (UseMI->getParent() != Parent || UseMI->getOpcode() != AMDGPU::G_BRCOND)
4813 return nullptr;
4814
4815 // Make sure the cond br is followed by a G_BR, or is the last instruction.
4816 MachineBasicBlock::iterator Next = std::next(UseMI->getIterator());
4817 if (Next == Parent->end()) {
4818 MachineFunction::iterator NextMBB = std::next(Parent->getIterator());
4819 if (NextMBB == Parent->getParent()->end()) // Illegal intrinsic use.
4820 return nullptr;
4821 UncondBrTarget = &*NextMBB;
4822 } else {
4823 if (Next->getOpcode() != AMDGPU::G_BR)
4824 return nullptr;
4825 Br = &*Next;
4826 UncondBrTarget = Br->getOperand(0).getMBB();
4827 }
4828
4829 return UseMI;
4830}
4831
4834 const ArgDescriptor *Arg,
4835 const TargetRegisterClass *ArgRC,
4836 LLT ArgTy) const {
     // Materialize the physical-register argument described by Arg into the
     // virtual register DstReg. An unmasked descriptor is a plain copy of the
     // function live-in; a masked descriptor selects a bit field of it, which
     // is shifted down to bit 0 and ANDed with the shifted mask.
4837 MCRegister SrcReg = Arg->getRegister();
4838 assert(SrcReg.isPhysical() && "Physical register expected");
4839 assert(DstReg.isVirtual() && "Virtual register expected");
4840
4841 Register LiveIn = getFunctionLiveInPhysReg(B.getMF(), B.getTII(), SrcReg,
4842 *ArgRC, B.getDebugLoc(), ArgTy);
4843 if (Arg->isMasked()) {
4844 // TODO: Should we try to emit this once in the entry block?
4845 const LLT S32 = LLT::scalar(32);
4846 const unsigned Mask = Arg->getMask();
     // Position of the lowest set mask bit, i.e. where the field starts.
4847 const unsigned Shift = llvm::countr_zero<unsigned>(Mask);
4848
4849 Register AndMaskSrc = LiveIn;
4850
4851 // TODO: Avoid clearing the high bits if we know workitem id y/z are always
4852 // 0.
     // A field that already starts at bit 0 needs no shift.
4853 if (Shift != 0) {
4854 auto ShiftAmt = B.buildConstant(S32, Shift);
4855 AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0);
4856 }
4857
4858 B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift));
4859 } else {
4860 B.buildCopy(DstReg, LiveIn);
4861 }
4862}
4863
4868 AMDGPUFunctionArgInfo::PreloadedValue ClusterWorkGroupIdPV) const {
     // Lower a workgroup-ID query whose result is MI's operand 0. Returns false
     // (leaving MI in place) if any required input value cannot be loaded;
     // otherwise MI is erased and true is returned.
4869 Register DstReg = MI.getOperand(0).getReg();
     // Without cluster support the preloaded value is the workgroup ID itself.
4870 if (!ST.hasClusters()) {
4871 if (!loadInputValue(DstReg, B, WorkGroupIdPV))
4872 return false;
4873 MI.eraseFromParent();
4874 return true;
4875 }
4876
4877 // Clusters are supported. Return the global position in the grid. If clusters
4878 // are enabled, WorkGroupIdPV returns the cluster ID not the workgroup ID.
4879
4880 // WorkGroupIdXYZ = ClusterId == 0 ?
4881 // ClusterIdXYZ :
4882 // ClusterIdXYZ * (ClusterMaxIdXYZ + 1) + ClusterWorkGroupIdXYZ
4883 MachineRegisterInfo &MRI = *B.getMRI();
4884 const LLT S32 = LLT::scalar(32);
4885 Register ClusterIdXYZ = MRI.createGenericVirtualRegister(S32);
4886 Register ClusterMaxIdXYZ = MRI.createGenericVirtualRegister(S32);
4887 Register ClusterWorkGroupIdXYZ = MRI.createGenericVirtualRegister(S32);
4888 if (!loadInputValue(ClusterIdXYZ, B, WorkGroupIdPV) ||
4889 !loadInputValue(ClusterWorkGroupIdXYZ, B, ClusterWorkGroupIdPV) ||
4890 !loadInputValue(ClusterMaxIdXYZ, B, ClusterMaxIdPV))
4891 return false;
4892
     // Cluster size along this dimension is the maximum in-cluster ID plus one;
     // the global ID is ClusterId * ClusterSize + in-cluster ID.
4893 auto One = B.buildConstant(S32, 1);
4894 auto ClusterSizeXYZ = B.buildAdd(S32, ClusterMaxIdXYZ, One);
4895 auto GlobalIdXYZ = B.buildAdd(S32, ClusterWorkGroupIdXYZ,
4896 B.buildMul(S32, ClusterIdXYZ, ClusterSizeXYZ));
4897
4898 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
4899
     // Three result forms follow, selected by the kind of cluster-dimension
     // information attached to the function.
     // NOTE(review): the case labels are absent from this listing, so which
     // kind selects which arm cannot be confirmed from here.
4900 switch (MFI->getClusterDims().getKind()) {
     // Arm 1: always use the computed global ID.
4903 B.buildCopy(DstReg, GlobalIdXYZ);
4904 MI.eraseFromParent();
4905 return true;
4906 }
     // Arm 2: the preloaded value is used unchanged.
4908 B.buildCopy(DstReg, ClusterIdXYZ);
4909 MI.eraseFromParent();
4910 return true;
4911 }
     // Arm 3: decide at run time. A field of a hardware register is read with
     // S_GETREG and compared against zero; zero selects the preloaded value,
     // anything else selects the computed global ID.
4913 using namespace AMDGPU::Hwreg;
     // presumably encode(register id, bit offset, width) — TODO confirm
4914 unsigned ClusterIdField = HwregEncoding::encode(ID_IB_STS2, 6, 4);
4915 Register ClusterId = MRI.createGenericVirtualRegister(S32);
     // The S_GETREG result is given an SGPR class directly since the
     // instruction built below is a target instruction, not a generic one.
4916 MRI.setRegClass(ClusterId, &AMDGPU::SReg_32RegClass);
4917 B.buildInstr(AMDGPU::S_GETREG_B32_const)
4918 .addDef(ClusterId)
4919 .addImm(ClusterIdField);
4920 auto Zero = B.buildConstant(S32, 0);
4921 auto NoClusters =
4922 B.buildICmp(CmpInst::ICMP_EQ, LLT::scalar(1), ClusterId, Zero);
4923 B.buildSelect(DstReg, NoClusters, ClusterIdXYZ, GlobalIdXYZ);
4924 MI.eraseFromParent();
4925 return true;
4926 }
4927 }
4928
4929 llvm_unreachable("nothing should reach here");
4930}
4931
4933 Register DstReg, MachineIRBuilder &B,
     // Load the preloaded function input identified by ArgType into DstReg.
     // Returns true when DstReg has been defined (from a register, as a
     // constant, or as undef) and false when the input's location is not a
     // valid register and is not handled here.
4935 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
4936 const ArgDescriptor *Arg = nullptr;
4937 const TargetRegisterClass *ArgRC;
4938 LLT ArgTy;
4939
     // Fixed register locations used by the architected-SGPR path below.
     // Workgroup IDs: X in TTMP9, Y in the low half of TTMP7, Z in its high
     // half.
4940 CallingConv::ID CC = B.getMF().getFunction().getCallingConv();
4941 const ArgDescriptor WorkGroupIDX =
4942 ArgDescriptor::createRegister(AMDGPU::TTMP9);
4943 // If GridZ is not programmed in an entry function then the hardware will set
4944 // it to all zeros, so there is no need to mask the GridY value in the low
4945 // order bits.
4946 const ArgDescriptor WorkGroupIDY = ArgDescriptor::createRegister(
4947 AMDGPU::TTMP7,
4948 AMDGPU::isEntryFunctionCC(CC) && !MFI->hasWorkGroupIDZ() ? ~0u : 0xFFFFu);
4949 const ArgDescriptor WorkGroupIDZ =
4950 ArgDescriptor::createRegister(AMDGPU::TTMP7, 0xFFFF0000u);
     // Cluster values: seven consecutive 4-bit fields of TTMP6, starting at
     // bit 0 (in-cluster ID X/Y/Z, max ID X/Y/Z, max flat ID).
4951 const ArgDescriptor ClusterWorkGroupIDX =
4952 ArgDescriptor::createRegister(AMDGPU::TTMP6, 0x0000000Fu);
4953 const ArgDescriptor ClusterWorkGroupIDY =
4954 ArgDescriptor::createRegister(AMDGPU::TTMP6, 0x000000F0u);
4955 const ArgDescriptor ClusterWorkGroupIDZ =
4956 ArgDescriptor::createRegister(AMDGPU::TTMP6, 0x00000F00u);
4957 const ArgDescriptor ClusterWorkGroupMaxIDX =
4958 ArgDescriptor::createRegister(AMDGPU::TTMP6, 0x0000F000u);
4959 const ArgDescriptor ClusterWorkGroupMaxIDY =
4960 ArgDescriptor::createRegister(AMDGPU::TTMP6, 0x000F0000u);
4961 const ArgDescriptor ClusterWorkGroupMaxIDZ =
4962 ArgDescriptor::createRegister(AMDGPU::TTMP6, 0x00F00000u);
4963 const ArgDescriptor ClusterWorkGroupMaxFlatID =
4964 ArgDescriptor::createRegister(AMDGPU::TTMP6, 0x0F000000u);
4965
     // Defines DstReg as the constant N and reports success.
4966 auto LoadConstant = [&](unsigned N) {
4967 B.buildConstant(DstReg, N);
4968 return true;
4969 };
4970
     // NOTE(review): the second half of this condition and every case label in
     // the switch are absent from this listing; the arms are identified below
     // only by the descriptor each one selects.
4971 if (ST.hasArchitectedSGPRs() &&
4973 AMDGPU::ClusterDimsAttr ClusterDims = MFI->getClusterDims();
4974 bool HasFixedDims = ClusterDims.isFixedDims();
4975
4976 switch (ArgType) {
4978 Arg = &WorkGroupIDX;
4979 ArgRC = &AMDGPU::SReg_32RegClass;
4980 ArgTy = LLT::scalar(32);
4981 break;
4983 Arg = &WorkGroupIDY;
4984 ArgRC = &AMDGPU::SReg_32RegClass;
4985 ArgTy = LLT::scalar(32);
4986 break;
4988 Arg = &WorkGroupIDZ;
4989 ArgRC = &AMDGPU::SReg_32RegClass;
4990 ArgTy = LLT::scalar(32);
4991 break;
     // In-cluster workgroup IDs: when the cluster has a fixed size of 1 along
     // the dimension, the constant 0 is emitted instead of a register read.
4993 if (HasFixedDims && ClusterDims.getDims()[0] == 1)
4994 return LoadConstant(0);
4995 Arg = &ClusterWorkGroupIDX;
4996 ArgRC = &AMDGPU::SReg_32RegClass;
4997 ArgTy = LLT::scalar(32);
4998 break;
5000 if (HasFixedDims && ClusterDims.getDims()[1] == 1)
5001 return LoadConstant(0);
5002 Arg = &ClusterWorkGroupIDY;
5003 ArgRC = &AMDGPU::SReg_32RegClass;
5004 ArgTy = LLT::scalar(32);
5005 break;
5007 if (HasFixedDims && ClusterDims.getDims()[2] == 1)
5008 return LoadConstant(0);
5009 Arg = &ClusterWorkGroupIDZ;
5010 ArgRC = &AMDGPU::SReg_32RegClass;
5011 ArgTy = LLT::scalar(32);
5012 break;
     // Maximum in-cluster IDs: with fixed dimensions this is the compile-time
     // constant (size - 1).
5014 if (HasFixedDims)
5015 return LoadConstant(ClusterDims.getDims()[0] - 1);
5016 Arg = &ClusterWorkGroupMaxIDX;
5017 ArgRC = &AMDGPU::SReg_32RegClass;
5018 ArgTy = LLT::scalar(32);
5019 break;
5021 if (HasFixedDims)
5022 return LoadConstant(ClusterDims.getDims()[1] - 1);
5023 Arg = &ClusterWorkGroupMaxIDY;
5024 ArgRC = &AMDGPU::SReg_32RegClass;
5025 ArgTy = LLT::scalar(32);
5026 break;
5028 if (HasFixedDims)
5029 return LoadConstant(ClusterDims.getDims()[2] - 1);
5030 Arg = &ClusterWorkGroupMaxIDZ;
5031 ArgRC = &AMDGPU::SReg_32RegClass;
5032 ArgTy = LLT::scalar(32);
5033 break;
5035 Arg = &ClusterWorkGroupMaxFlatID;
5036 ArgRC = &AMDGPU::SReg_32RegClass;
5037 ArgTy = LLT::scalar(32);
5038 break;
5039 default:
5040 break;
5041 }
5042 }
5043
     // Nothing selected above: fall back to the function's preloaded-value
     // table.
5044 if (!Arg)
5045 std::tie(Arg, ArgRC, ArgTy) = MFI->getPreloadedValue(ArgType);
5046
5047 if (!Arg) {
     // NOTE(review): the condition guarding the constant-0 return below is
     // absent from this listing.
5049 // The intrinsic may appear when we have a 0 sized kernarg segment, in
5050 // which case the pointer argument may be missing and we use null.
5051 return LoadConstant(0);
5052 }
5053
5054 // It's undefined behavior if a function marked with the amdgpu-no-*
5055 // attributes uses the corresponding intrinsic.
5056 B.buildUndef(DstReg);
5057 return true;
5058 }
5059
5060 if (!Arg->isRegister() || !Arg->getRegister().isValid())
5061 return false; // TODO: Handle these
5062 buildLoadInputValue(DstReg, B, Arg, ArgRC, ArgTy);
5063 return true;
5064}
5065
// Load the preloaded value selected by ArgType into MI's result register
// (operand 0). On failure MI is left untouched and false is returned;
// otherwise MI is erased.
5069 if (!loadInputValue(MI.getOperand(0).getReg(), B, ArgType))
5070 return false;
5071
5072 MI.eraseFromParent();
5073 return true;
5074}
5075
5077 int64_t C) {
     // Define MI's result register (operand 0) as the constant C, then delete
     // MI. Always reports success.
5078 B.buildConstant(MI.getOperand(0).getReg(), C);
5079 MI.eraseFromParent();
5080 return true;
5081}
5082
5085 unsigned Dim, AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
     // Lower a workitem-ID query for dimension Dim whose result is MI's
     // operand 0. Returns false only when the input value cannot be loaded.
     //
     // If the largest possible ID in this dimension is 0 the result is folded
     // to the constant 0.
5086 unsigned MaxID = ST.getMaxWorkitemID(B.getMF().getFunction(), Dim);
5087 if (MaxID == 0)
5088 return replaceWithConstant(B, MI, 0);
5089
5090 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
5091 const ArgDescriptor *Arg;
5092 const TargetRegisterClass *ArgRC;
5093 LLT ArgTy;
5094 std::tie(Arg, ArgRC, ArgTy) = MFI->getPreloadedValue(ArgType);
5095
5096 Register DstReg = MI.getOperand(0).getReg();
5097 if (!Arg) {
5098 // It's undefined behavior if a function marked with the amdgpu-no-*
5099 // attributes uses the corresponding intrinsic.
5100 B.buildUndef(DstReg);
5101 MI.eraseFromParent();
5102 return true;
5103 }
5104
5105 if (Arg->isMasked()) {
5106 // Don't bother inserting AssertZext for packed IDs since we're emitting the
5107 // masking operations anyway.
5108 //
5109 // TODO: We could assert the top bit is 0 for the source copy.
5110 if (!loadInputValue(DstReg, B, ArgType))
5111 return false;
5112 } else {
     // Unpacked ID: load into a temporary and record, via G_ASSERT_ZEXT, that
     // only the low bit_width(MaxID) bits can be set.
     // NOTE(review): the declaration of TmpReg is absent from this listing.
5114 if (!loadInputValue(TmpReg, B, ArgType))
5115 return false;
5116 B.buildAssertZExt(DstReg, TmpReg, llvm::bit_width(MaxID));
5117 }
5118
5119 MI.eraseFromParent();
5120 return true;
5121}
5122
// NOTE(review): the declaration line and the statements that construct
// PtrInfo are absent from this listing; only the final return is visible.
5125 // This isn't really a constant pool but close enough.
5128 return PtrInfo;
5129}
5130
5132 int64_t Offset) const {
     // Build and return a pointer that is Offset bytes past the kernarg
     // segment base pointer. The base pointer is obtained through
     // loadInputValue; failing to find it is treated as unreachable.
     // NOTE(review): the definition of PtrTy and part of the loadInputValue
     // call are absent from this listing.
5134 Register KernArgReg = B.getMRI()->createGenericVirtualRegister(PtrTy);
5135
5136 // TODO: If we passed in the base kernel offset we could have a better
5137 // alignment than 4, but we don't really need it.
5138 if (!loadInputValue(KernArgReg, B,
5140 llvm_unreachable("failed to find kernarg segment ptr");
5141
     // The offset is materialized as a 64-bit constant.
5142 auto COffset = B.buildConstant(LLT::scalar(64), Offset);
5143 return B.buildObjectPtrOffset(PtrTy, KernArgReg, COffset).getReg(0);
5144}
5145
5146/// Legalize a value that's loaded from kernel arguments. This is only used by
5147/// legacy intrinsics.
///
/// MI's result (operand 0) must be a 32-bit scalar; it is replaced by a load
/// from the kernarg segment and MI is erased. Always returns true.
5151 Align Alignment) const {
5152 Register DstReg = MI.getOperand(0).getReg();
5153
5154 assert(B.getMRI()->getType(DstReg) == LLT::scalar(32) &&
5155 "unexpected kernarg parameter type");
5156
     // NOTE(review): the lines defining Ptr and PtrInfo, and the tail of the
     // buildLoad call, are absent from this listing. The visible part of the
     // call passes Align(4) rather than the Alignment parameter.
5159 B.buildLoad(DstReg, Ptr, PtrInfo.getWithOffset(Offset), Align(4),
5162 MI.eraseFromParent();
5163 return true;
5164}
5165
5168 MachineIRBuilder &B) const {
     // Dispatch on the type of MI's result (operand 0) to the 16-, 32- or
     // 64-bit scalar lowering. Any other type is reported as not legalized.
5169 Register Dst = MI.getOperand(0).getReg();
5170 LLT DstTy = MRI.getType(Dst);
5171 LLT S16 = LLT::scalar(16);
5172 LLT S32 = LLT::scalar(32);
5173 LLT S64 = LLT::scalar(64);
5174
5175 if (DstTy == S16)
5176 return legalizeFDIV16(MI, MRI, B);
5177 if (DstTy == S32)
5178 return legalizeFDIV32(MI, MRI, B);
5179 if (DstTy == S64)
5180 return legalizeFDIV64(MI, MRI, B);
5181
5182 return false;
5183}
5184
5186 Register DstDivReg,
5187 Register DstRemReg,
5188 Register X,
5189 Register Y) const {
     // Emit a 32-bit unsigned X / Y and/or X % Y. DstDivReg and DstRemReg are
     // each optional: an invalid (default) Register suppresses that result.
5190 const LLT S1 = LLT::scalar(1);
5191 const LLT S32 = LLT::scalar(32);
5192
5193 // See AMDGPUCodeGenPrepare::expandDivRem32 for a description of the
5194 // algorithm used here.
5195
5196 // Initial estimate of inv(y).
     // Z approximates 2**32 / Y: the float reciprocal of Y is scaled by a
     // constant just below 2**32 (bit pattern 0x4f7ffffe) and converted back
     // to an unsigned integer.
5197 auto FloatY = B.buildUITOFP(S32, Y);
5198 auto RcpIFlag = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {FloatY});
5199 auto Scale = B.buildFConstant(S32, llvm::bit_cast<float>(0x4f7ffffe));
5200 auto ScaledY = B.buildFMul(S32, RcpIFlag, Scale);
5201 auto Z = B.buildFPTOUI(S32, ScaledY);
5202
5203 // One round of UNR.
     // Z += umulh(Z, -Y * Z)
5204 auto NegY = B.buildSub(S32, B.buildConstant(S32, 0), Y);
5205 auto NegYZ = B.buildMul(S32, NegY, Z);
5206 Z = B.buildAdd(S32, Z, B.buildUMulH(S32, Z, NegYZ));
5207
5208 // Quotient/remainder estimate.
5209 auto Q = B.buildUMulH(S32, X, Z);
5210 auto R = B.buildSub(S32, X, B.buildMul(S32, Q, Y));
5211
5212 // First quotient/remainder refinement.
     // If R >= Y the estimate is one too low: bump Q and subtract Y from R.
     // R is always updated because the second refinement's comparison needs
     // it even when only the quotient is requested.
5213 auto One = B.buildConstant(S32, 1);
5214 auto Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y);
5215 if (DstDivReg)
5216 Q = B.buildSelect(S32, Cond, B.buildAdd(S32, Q, One), Q);
5217 R = B.buildSelect(S32, Cond, B.buildSub(S32, R, Y), R);
5218
5219 // Second quotient/remainder refinement.
     // Same correction once more, written directly into the requested
     // destination registers.
5220 Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y);
5221 if (DstDivReg)
5222 B.buildSelect(DstDivReg, Cond, B.buildAdd(S32, Q, One), Q);
5223
5224 if (DstRemReg)
5225 B.buildSelect(DstRemReg, Cond, B.buildSub(S32, R, Y), R);
5226}
5227
5228// Build integer reciprocal sequence around V_RCP_IFLAG_F32
5229//
5230// Return lo, hi of result
5231//
5232// %cvt.lo = G_UITOFP Val.lo
5233// %cvt.hi = G_UITOFP Val.hi
5234// %mad = G_FMAD %cvt.hi, 2**32, %cvt.lo
5235// %rcp = G_AMDGPU_RCP_IFLAG %mad
5236// %mul1 = G_FMUL %rcp, 0x5f7ffffc
5237// %mul2 = G_FMUL %mul1, 2**(-32)
5238// %trunc = G_INTRINSIC_TRUNC %mul2
5239// %mad2 = G_FMAD %trunc, -(2**32), %mul1
5240// return {G_FPTOUI %mad2, G_FPTOUI %trunc}
5241static std::pair<Register, Register> emitReciprocalU64(MachineIRBuilder &B,
5242 Register Val) {
5243 const LLT S32 = LLT::scalar(32);
5244 auto Unmerge = B.buildUnmerge(S32, Val);
5245
5246 auto CvtLo = B.buildUITOFP(S32, Unmerge.getReg(0));
5247 auto CvtHi = B.buildUITOFP(S32, Unmerge.getReg(1));
5248
5249 auto Mad = B.buildFMAD(
5250 S32, CvtHi, // 2**32
5251 B.buildFConstant(S32, llvm::bit_cast<float>(0x4f800000)), CvtLo);
5252
5253 auto Rcp = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {Mad});
5254 auto Mul1 = B.buildFMul(
5255 S32, Rcp, B.buildFConstant(S32, llvm::bit_cast<float>(0x5f7ffffc)));
5256
5257 // 2**(-32)
5258 auto Mul2 = B.buildFMul(
5259 S32, Mul1, B.buildFConstant(S32, llvm::bit_cast<float>(0x2f800000)));
5260 auto Trunc = B.buildIntrinsicTrunc(S32, Mul2);
5261
5262 // -(2**32)
5263 auto Mad2 = B.buildFMAD(
5264 S32, Trunc, B.buildFConstant(S32, llvm::bit_cast<float>(0xcf800000)),
5265 Mul1);
5266
5267 auto ResultLo = B.buildFPTOUI(S32, Mad2);
5268 auto ResultHi = B.buildFPTOUI(S32, Trunc);
5269
5270 return {ResultLo.getReg(0), ResultHi.getReg(0)};
5271}
5272
5274 Register DstDivReg,
5275 Register DstRemReg,
5276 Register Numer,
5277 Register Denom) const {
     // Emit a 64-bit unsigned Numer / Denom and/or Numer % Denom. DstDivReg
     // and DstRemReg are each optional: an invalid (default) Register
     // suppresses that result. The sequence is straight-line code: a
     // reciprocal estimate refined twice, a quotient/remainder estimate, and
     // two conditional corrections applied with selects.
5278 const LLT S32 = LLT::scalar(32);
5279 const LLT S64 = LLT::scalar(64);
5280 const LLT S1 = LLT::scalar(1);
5281 Register RcpLo, RcpHi;
5282
5283 std::tie(RcpLo, RcpHi) = emitReciprocalU64(B, Denom);
5284
5285 auto Rcp = B.buildMergeLikeInstr(S64, {RcpLo, RcpHi});
5286
5287 auto Zero64 = B.buildConstant(S64, 0);
5288 auto NegDenom = B.buildSub(S64, Zero64, Denom);
5289
     // First refinement: Add1 = Rcp + umulh(Rcp, -Denom * Rcp), with the
     // 64-bit add done as a 32-bit add/add-with-carry pair.
5290 auto MulLo1 = B.buildMul(S64, NegDenom, Rcp);
5291 auto MulHi1 = B.buildUMulH(S64, Rcp, MulLo1);
5292
5293 auto UnmergeMulHi1 = B.buildUnmerge(S32, MulHi1);
5294 Register MulHi1_Lo = UnmergeMulHi1.getReg(0);
5295 Register MulHi1_Hi = UnmergeMulHi1.getReg(1);
5296
5297 auto Add1_Lo = B.buildUAddo(S32, S1, RcpLo, MulHi1_Lo);
5298 auto Add1_Hi = B.buildUAdde(S32, S1, RcpHi, MulHi1_Hi, Add1_Lo.getReg(1));
5299 auto Add1 = B.buildMergeLikeInstr(S64, {Add1_Lo, Add1_Hi});
5300
     // Second refinement: Add2 = Add1 + umulh(Add1, -Denom * Add1).
5301 auto MulLo2 = B.buildMul(S64, NegDenom, Add1);
5302 auto MulHi2 = B.buildUMulH(S64, Add1, MulLo2);
5303 auto UnmergeMulHi2 = B.buildUnmerge(S32, MulHi2);
5304 Register MulHi2_Lo = UnmergeMulHi2.getReg(0);
5305 Register MulHi2_Hi = UnmergeMulHi2.getReg(1);
5306
5307 auto Zero32 = B.buildConstant(S32, 0);
5308 auto Add2_Lo = B.buildUAddo(S32, S1, Add1_Lo, MulHi2_Lo);
5309 auto Add2_Hi = B.buildUAdde(S32, S1, Add1_Hi, MulHi2_Hi, Add2_Lo.getReg(1));
5310 auto Add2 = B.buildMergeLikeInstr(S64, {Add2_Lo, Add2_Hi});
5311
5312 auto UnmergeNumer = B.buildUnmerge(S32, Numer);
5313 Register NumerLo = UnmergeNumer.getReg(0);
5314 Register NumerHi = UnmergeNumer.getReg(1);
5315
     // Quotient estimate MulHi3 = umulh(Numer, Add2), and remainder estimate
     // Sub1 = Numer - Denom * MulHi3 computed in 32-bit halves. Sub1_Mi is the
     // high-half difference without the borrow; it feeds the next
     // subtraction chain.
5316 auto MulHi3 = B.buildUMulH(S64, Numer, Add2);
5317 auto Mul3 = B.buildMul(S64, Denom, MulHi3);
5318 auto UnmergeMul3 = B.buildUnmerge(S32, Mul3);
5319 Register Mul3_Lo = UnmergeMul3.getReg(0);
5320 Register Mul3_Hi = UnmergeMul3.getReg(1);
5321 auto Sub1_Lo = B.buildUSubo(S32, S1, NumerLo, Mul3_Lo);
5322 auto Sub1_Hi = B.buildUSube(S32, S1, NumerHi, Mul3_Hi, Sub1_Lo.getReg(1));
5323 auto Sub1_Mi = B.buildSub(S32, NumerHi, Mul3_Hi);
5324 auto Sub1 = B.buildMergeLikeInstr(S64, {Sub1_Lo, Sub1_Hi});
5325
5326 auto UnmergeDenom = B.buildUnmerge(S32, Denom);
5327 Register DenomLo = UnmergeDenom.getReg(0);
5328 Register DenomHi = UnmergeDenom.getReg(1);
5329
     // C3 is all-ones when Sub1 >= Denom as a 64-bit unsigned compare: the
     // high halves decide unless they are equal, in which case the low halves
     // do.
5330 auto CmpHi = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Hi, DenomHi);
5331 auto C1 = B.buildSExt(S32, CmpHi);
5332
5333 auto CmpLo = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Lo, DenomLo);
5334 auto C2 = B.buildSExt(S32, CmpLo);
5335
5336 auto CmpEq = B.buildICmp(CmpInst::ICMP_EQ, S1, Sub1_Hi, DenomHi);
5337 auto C3 = B.buildSelect(S32, CmpEq, C2, C1);
5338
5339 // TODO: Here and below portions of the code can be enclosed into if/endif.
5340 // Currently control flow is unconditional and we have 4 selects after
5341 // potential endif to substitute PHIs.
5342
5343 // if C3 != 0 ...
     // First correction candidate: Sub2 = Sub1 - Denom, Add3 = MulHi3 + 1.
5344 auto Sub2_Lo = B.buildUSubo(S32, S1, Sub1_Lo, DenomLo);
5345 auto Sub2_Mi = B.buildUSube(S32, S1, Sub1_Mi, DenomHi, Sub1_Lo.getReg(1));
5346 auto Sub2_Hi = B.buildUSube(S32, S1, Sub2_Mi, Zero32, Sub2_Lo.getReg(1));
5347 auto Sub2 = B.buildMergeLikeInstr(S64, {Sub2_Lo, Sub2_Hi});
5348
5349 auto One64 = B.buildConstant(S64, 1);
5350 auto Add3 = B.buildAdd(S64, MulHi3, One64);
5351
     // C6 is the same 64-bit "still >= Denom" test applied to Sub2.
5352 auto C4 =
5353 B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Hi, DenomHi));
5354 auto C5 =
5355 B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Lo, DenomLo));
5356 auto C6 = B.buildSelect(
5357 S32, B.buildICmp(CmpInst::ICMP_EQ, S1, Sub2_Hi, DenomHi), C5, C4);
5358
5359 // if (C6 != 0)
     // Second correction candidate: Sub3 = Sub2 - Denom, Add4 = Add3 + 1.
5360 auto Add4 = B.buildAdd(S64, Add3, One64);
5361 auto Sub3_Lo = B.buildUSubo(S32, S1, Sub2_Lo, DenomLo);
5362
5363 auto Sub3_Mi = B.buildUSube(S32, S1, Sub2_Mi, DenomHi, Sub2_Lo.getReg(1));
5364 auto Sub3_Hi = B.buildUSube(S32, S1, Sub3_Mi, Zero32, Sub3_Lo.getReg(1));
5365 auto Sub3 = B.buildMergeLikeInstr(S64, {Sub3_Lo, Sub3_Hi});
5366
5367 // endif C6
5368 // endif C3
5369
     // Pick the uncorrected, once-corrected or twice-corrected value for each
     // requested result.
5370 if (DstDivReg) {
5371 auto Sel1 = B.buildSelect(
5372 S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Add4, Add3);
5373 B.buildSelect(DstDivReg, B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32),
5374 Sel1, MulHi3);
5375 }
5376
5377 if (DstRemReg) {
5378 auto Sel2 = B.buildSelect(
5379 S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Sub3, Sub2);
5380 B.buildSelect(DstRemReg, B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32),
5381 Sel2, Sub1);
5382 }
5383}
5384
5387 MachineIRBuilder &B) const {
     // Lower G_UDIV, G_UREM or G_UDIVREM on 32- or 64-bit scalars. Only the
     // destination(s) the opcode defines are filled in; the other stays an
     // invalid Register, which tells the Impl helpers to skip that result.
     // Returns false (MI untouched) for any other type.
5388 Register DstDivReg, DstRemReg;
5389 switch (MI.getOpcode()) {
5390 default:
5391 llvm_unreachable("Unexpected opcode!");
5392 case AMDGPU::G_UDIV: {
5393 DstDivReg = MI.getOperand(0).getReg();
5394 break;
5395 }
5396 case AMDGPU::G_UREM: {
5397 DstRemReg = MI.getOperand(0).getReg();
5398 break;
5399 }
5400 case AMDGPU::G_UDIVREM: {
5401 DstDivReg = MI.getOperand(0).getReg();
5402 DstRemReg = MI.getOperand(1).getReg();
5403 break;
5404 }
5405 }
5406
5407 const LLT S64 = LLT::scalar(64);
5408 const LLT S32 = LLT::scalar(32);
     // The source operands follow the one or two explicit defs.
5409 const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs();
5410 Register Num = MI.getOperand(FirstSrcOpIdx).getReg();
5411 Register Den = MI.getOperand(FirstSrcOpIdx + 1).getReg();
5412 LLT Ty = MRI.getType(MI.getOperand(0).getReg());
5413
5414 if (Ty == S32)
5415 legalizeUnsignedDIV_REM32Impl(B, DstDivReg, DstRemReg, Num, Den);
5416 else if (Ty == S64)
5417 legalizeUnsignedDIV_REM64Impl(B, DstDivReg, DstRemReg, Num, Den);
5418 else
5419 return false;
5420
5421 MI.eraseFromParent();
5422 return true;
5423}
5424
5427 MachineIRBuilder &B) const {
     // Lower G_SDIV, G_SREM or G_SDIVREM on 32- or 64-bit scalars by taking
     // absolute values, running the unsigned lowering, and re-applying signs.
     // Returns false (MI untouched) for any other type.
5428 const LLT S64 = LLT::scalar(64);
5429 const LLT S32 = LLT::scalar(32);
5430
5431 LLT Ty = MRI.getType(MI.getOperand(0).getReg());
5432 if (Ty != S32 && Ty != S64)
5433 return false;
5434
     // The source operands follow the one or two explicit defs.
5435 const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs();
5436 Register LHS = MI.getOperand(FirstSrcOpIdx).getReg();
5437 Register RHS = MI.getOperand(FirstSrcOpIdx + 1).getReg();
5438
     // An arithmetic shift by (bits - 1) yields 0 for non-negative inputs and
     // all-ones for negative ones.
5439 auto SignBitOffset = B.buildConstant(S32, Ty.getSizeInBits() - 1);
5440 auto LHSign = B.buildAShr(Ty, LHS, SignBitOffset);
5441 auto RHSign = B.buildAShr(Ty, RHS, SignBitOffset);
5442
     // abs(x) = (x + sign) ^ sign
5443 LHS = B.buildAdd(Ty, LHS, LHSign).getReg(0);
5444 RHS = B.buildAdd(Ty, RHS, RHSign).getReg(0);
5445
5446 LHS = B.buildXor(Ty, LHS, LHSign).getReg(0);
5447 RHS = B.buildXor(Ty, RHS, RHSign).getReg(0);
5448
     // The unsigned results go to temporaries; only those matching the opcode
     // are created, so the Impl helpers skip the result that is not needed.
5449 Register DstDivReg, DstRemReg, TmpDivReg, TmpRemReg;
5450 switch (MI.getOpcode()) {
5451 default:
5452 llvm_unreachable("Unexpected opcode!");
5453 case AMDGPU::G_SDIV: {
5454 DstDivReg = MI.getOperand(0).getReg();
5455 TmpDivReg = MRI.createGenericVirtualRegister(Ty);
5456 break;
5457 }
5458 case AMDGPU::G_SREM: {
5459 DstRemReg = MI.getOperand(0).getReg();
5460 TmpRemReg = MRI.createGenericVirtualRegister(Ty);
5461 break;
5462 }
5463 case AMDGPU::G_SDIVREM: {
5464 DstDivReg = MI.getOperand(0).getReg();
5465 DstRemReg = MI.getOperand(1).getReg();
5466 TmpDivReg = MRI.createGenericVirtualRegister(Ty);
5467 TmpRemReg = MRI.createGenericVirtualRegister(Ty);
5468 break;
5469 }
5470 }
5471
5472 if (Ty == S32)
5473 legalizeUnsignedDIV_REM32Impl(B, TmpDivReg, TmpRemReg, LHS, RHS);
5474 else
5475 legalizeUnsignedDIV_REM64Impl(B, TmpDivReg, TmpRemReg, LHS, RHS);
5476
     // Conditional negate: (v ^ sign) - sign. The quotient is negative when
     // the operand signs differ.
5477 if (DstDivReg) {
5478 auto Sign = B.buildXor(Ty, LHSign, RHSign).getReg(0);
5479 auto SignXor = B.buildXor(Ty, TmpDivReg, Sign).getReg(0);
5480 B.buildSub(DstDivReg, SignXor, Sign);
5481 }
5482
5483 if (DstRemReg) {
5484 auto Sign = LHSign.getReg(0); // Remainder sign is the same as LHS
5485 auto SignXor = B.buildXor(Ty, TmpRemReg, Sign).getReg(0);
5486 B.buildSub(DstRemReg, SignXor, Sign);
5487 }
5488
5489 MI.eraseFromParent();
5490 return true;
5491}
5492
5495 MachineIRBuilder &B) const {
     // Try to lower an FP division to a reciprocal-based form when the
     // instruction's fast-math flags allow the loss of accuracy. Returns true
     // (MI erased) if a replacement was emitted, false if MI was left alone.
5496 Register Res = MI.getOperand(0).getReg();
5497 Register LHS = MI.getOperand(1).getReg();
5498 Register RHS = MI.getOperand(2).getReg();
5499 uint16_t Flags = MI.getFlags();
5500 LLT ResTy = MRI.getType(Res);
5501
5502 bool AllowInaccurateRcp = MI.getFlag(MachineInstr::FmAfn);
5503
     // Constant numerator: 1.0 and -1.0 map straight onto the rcp intrinsic.
     // Without afn this is only done for 16-bit results.
5504 if (const auto *CLHS = getConstantFPVRegVal(LHS, MRI)) {
5505 if (!AllowInaccurateRcp && ResTy != LLT::scalar(16))
5506 return false;
5507
5508 // v_rcp_f32 and v_rsq_f32 do not support denormals, and according to
5509 // the CI documentation has a worst case error of 1 ulp.
5510 // OpenCL requires <= 2.5 ulp for 1.0 / x, so it should always be OK to
5511 // use it as long as we aren't trying to use denormals.
5512 //
5513 // v_rcp_f16 and v_rsq_f16 DO support denormals and 0.51ulp.
5514
5515 // 1 / x -> RCP(x)
5516 if (CLHS->isExactlyValue(1.0)) {
5517 B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res)
5518 .addUse(RHS)
5519 .setMIFlags(Flags);
5520
5521 MI.eraseFromParent();
5522 return true;
5523 }
5524
5525 // -1 / x -> RCP( FNEG(x) )
5526 if (CLHS->isExactlyValue(-1.0)) {
5527 auto FNeg = B.buildFNeg(ResTy, RHS, Flags);
5528 B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res)
5529 .addUse(FNeg.getReg(0))
5530 .setMIFlags(Flags);
5531
5532 MI.eraseFromParent();
5533 return true;
5534 }
     // Any other constant numerator falls through to the general case.
5535 }
5536
5537 // For f16 require afn or arcp.
5538 // For f32 require afn.
5539 if (!AllowInaccurateRcp && (ResTy != LLT::scalar(16) ||
5540 !MI.getFlag(MachineInstr::FmArcp)))
5541 return false;
5542
5543 // x / y -> x * (1.0 / y)
5544 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy})
5545 .addUse(RHS)
5546 .setMIFlags(Flags);
5547 B.buildFMul(Res, LHS, RCP, Flags);
5548
5549 MI.eraseFromParent();
5550 return true;
5551}
5552
5555 MachineIRBuilder &B) const {
     // Fast lowering of X / Y used only when the instruction carries the afn
     // flag: a hardware reciprocal of Y refined by two FMA-based iterations,
     // followed (for a general numerator) by one correction of the quotient.
     // Returns false and leaves MI alone when afn is absent.
5556 Register Res = MI.getOperand(0).getReg();
5557 Register X = MI.getOperand(1).getReg();
5558 Register Y = MI.getOperand(2).getReg();
5559 uint16_t Flags = MI.getFlags();
5560 LLT ResTy = MRI.getType(Res);
5561
5562 bool AllowInaccurateRcp = MI.getFlag(MachineInstr::FmAfn);
5563
5564 if (!AllowInaccurateRcp)
5565 return false;
5566
5567 const ConstantFP *CLHS = getConstantFPVRegVal(X, MRI);
5568 bool IsNegRcp = CLHS && CLHS->isExactlyValue(-1.0);
5569
5570 // Pull out the negation so it folds for free into the source modifiers.
5571 if (IsNegRcp)
5572 X = B.buildFConstant(ResTy, 1.0).getReg(0);
5573
     // For -1 / Y the estimate R is negated below, so the refinement needs
     // +Y here (Y * -R == -Y * R) and no separate negation of Y is built.
5574 Register NegY = IsNegRcp ? Y : B.buildFNeg(ResTy, Y).getReg(0);
5575 auto One = B.buildFConstant(ResTy, 1.0);
5576
5577 auto R = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy})
5578 .addUse(Y)
5579 .setMIFlags(Flags);
5580 if (IsNegRcp)
5581 R = B.buildFNeg(ResTy, R);
5582
     // Two refinement steps: e = fma(NegY, R, 1); R = fma(e, R, R).
5583 auto Tmp0 = B.buildFMA(ResTy, NegY, R, One);
5584 R = B.buildFMA(ResTy, Tmp0, R, R);
5585
5586 auto Tmp1 = B.buildFMA(ResTy, NegY, R, One);
5587 R = B.buildFMA(ResTy, Tmp1, R, R);
5588
5589 // Skip the last 2 correction terms for reciprocal.
5590 if (IsNegRcp || (CLHS && CLHS->isExactlyValue(1.0))) {
5591 B.buildCopy(Res, R);
5592 MI.eraseFromParent();
5593 return true;
5594 }
5595
     // General numerator: q = X * R, then q += (X - Y * q) * R.
5596 auto Ret = B.buildFMul(ResTy, X, R);
5597 auto Tmp2 = B.buildFMA(ResTy, NegY, Ret, X);
5598
5599 B.buildFMA(Res, Tmp2, R, Ret);
5600 MI.eraseFromParent();
5601 return true;
5602}
5603
5606 MachineIRBuilder &B) const {
     // Lower a 16-bit FP division. The fast reciprocal path is tried first;
     // otherwise the division is carried out in 32-bit precision and the
     // result is truncated and passed through div_fixup. Always returns true.
5607 if (legalizeFastUnsafeFDIV(MI, MRI, B))
5608 return true;
5609
5610 Register Res = MI.getOperand(0).getReg();
5611 Register LHS = MI.getOperand(1).getReg();
5612 Register RHS = MI.getOperand(2).getReg();
5613
5614 uint16_t Flags = MI.getFlags();
5615
5616 LLT S16 = LLT::scalar(16);
5617 LLT S32 = LLT::scalar(32);
5618
5619 // a32.u = opx(V_CVT_F32_F16, a.u); // CVT to F32
5620 // b32.u = opx(V_CVT_F32_F16, b.u); // CVT to F32
5621 // r32.u = opx(V_RCP_F32, b32.u); // rcp = 1 / d
5622 // q32.u = opx(V_MUL_F32, a32.u, r32.u); // q = n * rcp
5623 // e32.u = opx(V_MAD_F32, (b32.u^_neg32), q32.u, a32.u); // err = -d * q + n
5624 // q32.u = opx(V_MAD_F32, e32.u, r32.u, q32.u); // q = n * rcp
5625 // e32.u = opx(V_MAD_F32, (b32.u^_neg32), q32.u, a32.u); // err = -d * q + n
5626 // tmp.u = opx(V_MUL_F32, e32.u, r32.u);
5627 // tmp.u = opx(V_AND_B32, tmp.u, 0xff800000)
5628 // q32.u = opx(V_ADD_F32, tmp.u, q32.u);
5629 // q16.u = opx(V_CVT_F16_F32, q32.u);
5630 // q16.u = opx(V_DIV_FIXUP_F16, q16.u, b.u, a.u); // q = touchup(q, d, n)
5631
5632 auto LHSExt = B.buildFPExt(S32, LHS, Flags);
5633 auto RHSExt = B.buildFPExt(S32, RHS, Flags);
5634 auto NegRHSExt = B.buildFNeg(S32, RHSExt);
5635 auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32})
5636 .addUse(RHSExt.getReg(0))
5637 .setMIFlags(Flags);
5638 auto Quot = B.buildFMul(S32, LHSExt, Rcp, Flags);
     // The same three-step error/quotient update is built with G_FMAD when
     // the subtarget has f32 mad/mac instructions, and with G_FMA otherwise.
     // NOTE(review): the declaration of Err is absent from this listing.
5640 if (ST.hasMadMacF32Insts()) {
5641 Err = B.buildFMAD(S32, NegRHSExt, Quot, LHSExt, Flags);
5642 Quot = B.buildFMAD(S32, Err, Rcp, Quot, Flags);
5643 Err = B.buildFMAD(S32, NegRHSExt, Quot, LHSExt, Flags);
5644 } else {
5645 Err = B.buildFMA(S32, NegRHSExt, Quot, LHSExt, Flags);
5646 Quot = B.buildFMA(S32, Err, Rcp, Quot, Flags);
5647 Err = B.buildFMA(S32, NegRHSExt, Quot, LHSExt, Flags);
5648 }
5649 auto Tmp = B.buildFMul(S32, Err, Rcp, Flags);
     // 0xff800000 keeps only the sign and exponent bits of the f32 correction
     // term (its mantissa is cleared) before it is added to the quotient.
5650 Tmp = B.buildAnd(S32, Tmp, B.buildConstant(S32, 0xff800000));
5651 Quot = B.buildFAdd(S32, Tmp, Quot, Flags);
5652 auto RDst = B.buildFPTrunc(S16, Quot, Flags);
     // div_fixup takes (quotient, denominator, numerator).
5653 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res)
5654 .addUse(RDst.getReg(0))
5655 .addUse(RHS)
5656 .addUse(LHS)
5657 .setMIFlags(Flags);
5658
5659 MI.eraseFromParent();
5660 return true;
5661}
5662
// Hardware-register field immediate passed to S_GETREG_B32 / S_SETREG_B32 /
// S_SETREG_IMM32_B32 when saving, restoring or setting the FP32 denormal mode.
// NOTE(review): the initializer line is absent from this listing.
5663static constexpr unsigned SPDenormModeBitField =
5665
5666// Enable or disable FP32 denorm mode. When 'Enable' is true, emit instructions
5667// to enable denorm mode. When 'Enable' is false, disable denorm mode.
//
// "Disable" restores the function's default FP32 denormal setting taken from
// Mode, rather than forcing a flush mode.
5669 const GCNSubtarget &ST,
5671 // Set SP denorm mode to this value.
5672 unsigned SPDenormMode =
5673 Enable ? FP_DENORM_FLUSH_NONE : Mode.fpDenormModeSPValue();
5674
5675 if (ST.hasDenormModeInst()) {
5676 // Preserve default FP64FP16 denorm mode while updating FP32 mode.
5677 uint32_t DPDenormModeDefault = Mode.fpDenormModeDPValue();
5678
     // S_DENORM_MODE takes both settings in one immediate: the FP32 value in
     // the low two bits and the FP64/FP16 value shifted above it.
5679 uint32_t NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2);
5680 B.buildInstr(AMDGPU::S_DENORM_MODE)
5681 .addImm(NewDenormModeValue);
5682
5683 } else {
     // No dedicated instruction: write only the FP32 denorm field of the mode
     // register.
5684 B.buildInstr(AMDGPU::S_SETREG_IMM32_B32)
5685 .addImm(SPDenormMode)
5686 .addImm(SPDenormModeBitField);
5687 }
5688}
5689
5692 MachineIRBuilder &B) const {
     // Lower a 32-bit FP division. The fast reciprocal path is tried first;
     // otherwise the div_scale / rcp / FMA refinement / div_fmas / div_fixup
     // sequence is emitted. Always returns true.
5693 if (legalizeFastUnsafeFDIV(MI, MRI, B))
5694 return true;
5695
5696 Register Res = MI.getOperand(0).getReg();
5697 Register LHS = MI.getOperand(1).getReg();
5698 Register RHS = MI.getOperand(2).getReg();
5699 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
5700 SIModeRegisterDefaults Mode = MFI->getMode();
5701
5702 uint16_t Flags = MI.getFlags();
5703
5704 LLT S32 = LLT::scalar(32);
5705 LLT S1 = LLT::scalar(1);
5706
5707 auto One = B.buildFConstant(S32, 1.0f);
5708
     // The two div_scale calls differ only in the immediate (0 vs 1), which
     // selects whether the scaled denominator or numerator is produced. Each
     // also defines an s1 result; the numerator's is consumed by div_fmas.
5709 auto DenominatorScaled =
5710 B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1})
5711 .addUse(LHS)
5712 .addUse(RHS)
5713 .addImm(0)
5714 .setMIFlags(Flags);
5715 auto NumeratorScaled =
5716 B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1})
5717 .addUse(LHS)
5718 .addUse(RHS)
5719 .addImm(1)
5720 .setMIFlags(Flags);
5721
5722 auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32})
5723 .addUse(DenominatorScaled.getReg(0))
5724 .setMIFlags(Flags);
5725 auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags);
5726
     // The FMA chain below runs with FP32 denormals enabled. If the function's
     // mode is not already IEEE, the mode is switched around the chain; when
     // the mode is dynamic its current value is saved first and restored
     // afterwards instead of being reset to a static default.
5727 const bool PreservesDenormals = Mode.FP32Denormals == DenormalMode::getIEEE();
5728 const bool HasDynamicDenormals =
5729 (Mode.FP32Denormals.Input == DenormalMode::Dynamic) ||
5730 (Mode.FP32Denormals.Output == DenormalMode::Dynamic);
5731
5732 Register SavedSPDenormMode;
5733 if (!PreservesDenormals) {
5734 if (HasDynamicDenormals) {
5735 SavedSPDenormMode = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5736 B.buildInstr(AMDGPU::S_GETREG_B32)
5737 .addDef(SavedSPDenormMode)
5738 .addImm(SPDenormModeBitField);
5739 }
5740 toggleSPDenormMode(true, B, ST, Mode);
5741 }
5742
     // Fma0/Fma1 refine the reciprocal; Mul is the first quotient estimate;
     // Fma2/Fma3 refine the quotient; Fma4 is the final residual.
5743 auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags);
5744 auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags);
5745 auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags);
5746 auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags);
5747 auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags);
5748 auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags);
5749
5750 if (!PreservesDenormals) {
5751 if (HasDynamicDenormals) {
5752 assert(SavedSPDenormMode);
5753 B.buildInstr(AMDGPU::S_SETREG_B32)
5754 .addReg(SavedSPDenormMode)
5755 .addImm(SPDenormModeBitField);
5756 } else
5757 toggleSPDenormMode(false, B, ST, Mode);
5758 }
5759
5760 auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32})
5761 .addUse(Fma4.getReg(0))
5762 .addUse(Fma1.getReg(0))
5763 .addUse(Fma3.getReg(0))
5764 .addUse(NumeratorScaled.getReg(1))
5765 .setMIFlags(Flags);
5766
     // div_fixup takes (quotient, denominator, numerator).
5767 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res)
5768 .addUse(Fmas.getReg(0))
5769 .addUse(RHS)
5770 .addUse(LHS)
5771 .setMIFlags(Flags);
5772
5773 MI.eraseFromParent();
5774 return true;
5775}
5776
5779 MachineIRBuilder &B) const {
     // Lower a 64-bit FP division. The fast afn-only path is tried first;
     // otherwise the div_scale / rcp / FMA refinement / div_fmas / div_fixup
     // sequence is emitted. Always returns true.
5780 if (legalizeFastUnsafeFDIV64(MI, MRI, B))
5781 return true;
5782
5783 Register Res = MI.getOperand(0).getReg();
5784 Register LHS = MI.getOperand(1).getReg();
5785 Register RHS = MI.getOperand(2).getReg();
5786
5787 uint16_t Flags = MI.getFlags();
5788
5789 LLT S64 = LLT::scalar(64);
5790 LLT S1 = LLT::scalar(1);
5791
5792 auto One = B.buildFConstant(S64, 1.0);
5793
     // div_scale with immediate 0; its value result feeds the reciprocal.
5794 auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1})
5795 .addUse(LHS)
5796 .addUse(RHS)
5797 .addImm(0)
5798 .setMIFlags(Flags);
5799
5800 auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags);
5801
5802 auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64})
5803 .addUse(DivScale0.getReg(0))
5804 .setMIFlags(Flags);
5805
     // Two refinement rounds of the reciprocal (Fma0..Fma3).
5806 auto Fma0 = B.buildFMA(S64, NegDivScale0, Rcp, One, Flags);
5807 auto Fma1 = B.buildFMA(S64, Rcp, Fma0, Rcp, Flags);
5808 auto Fma2 = B.buildFMA(S64, NegDivScale0, Fma1, One, Flags);
5809
     // div_scale with immediate 1; its value result is multiplied by the
     // refined reciprocal to form the quotient estimate.
5810 auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1})
5811 .addUse(LHS)
5812 .addUse(RHS)
5813 .addImm(1)
5814 .setMIFlags(Flags);
5815
5816 auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags);
5817 auto Mul = B.buildFMul(S64, DivScale1.getReg(0), Fma3, Flags);
     // Residual of the quotient estimate, consumed by div_fmas.
5818 auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags);
5819
5820 Register Scale;
5821 if (!ST.hasUsableDivScaleConditionOutput()) {
5822 // Workaround a hardware bug on SI where the condition output from div_scale
5823 // is not usable.
5824
5825 LLT S32 = LLT::scalar(32);
5826
     // Rebuild the condition by hand: compare the high 32 bits of each
     // operand with the high 32 bits of the corresponding div_scale result,
     // and XOR the two equality results.
5827 auto NumUnmerge = B.buildUnmerge(S32, LHS);
5828 auto DenUnmerge = B.buildUnmerge(S32, RHS);
5829 auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0);
5830 auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1);
5831
5832 auto CmpNum = B.buildICmp(ICmpInst::ICMP_EQ, S1, NumUnmerge.getReg(1),
5833 Scale1Unmerge.getReg(1));
5834 auto CmpDen = B.buildICmp(ICmpInst::ICMP_EQ, S1, DenUnmerge.getReg(1),
5835 Scale0Unmerge.getReg(1));
5836 Scale = B.buildXor(S1, CmpNum, CmpDen).getReg(0);
5837 } else {
5838 Scale = DivScale1.getReg(1);
5839 }
5840
5841 auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64})
5842 .addUse(Fma4.getReg(0))
5843 .addUse(Fma3.getReg(0))
5844 .addUse(Mul.getReg(0))
5845 .addUse(Scale)
5846 .setMIFlags(Flags);
5847
     // div_fixup takes (quotient, denominator, numerator).
5848 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, ArrayRef(Res))
5849 .addUse(Fmas.getReg(0))
5850 .addUse(RHS)
5851 .addUse(LHS)
5852 .setMIFlags(Flags);
5853
5854 MI.eraseFromParent();
5855 return true;
5856}
5857
5860 MachineIRBuilder &B) const {
  // Lowers a two-result instruction (Res0 = mantissa, Res1 = exponent of Val)
  // to the amdgcn frexp_mant / frexp_exp intrinsics, then erases MI.
5861 Register Res0 = MI.getOperand(0).getReg();
5862 Register Res1 = MI.getOperand(1).getReg();
5863 Register Val = MI.getOperand(2).getReg();
5864 uint16_t Flags = MI.getFlags();
5865
5866 LLT Ty = MRI.getType(Res0);
  // The exponent intrinsic is built as s16 for s16 values and s32 otherwise;
  // it is converted to the type of Res1 at the end.
5867 LLT InstrExpTy = Ty == LLT::scalar(16) ? LLT::scalar(16) : LLT::scalar(32);
5868
5869 auto Mant = B.buildIntrinsic(Intrinsic::amdgcn_frexp_mant, {Ty})
5870 .addUse(Val)
5871 .setMIFlags(Flags);
5872 auto Exp = B.buildIntrinsic(Intrinsic::amdgcn_frexp_exp, {InstrExpTy})
5873 .addUse(Val)
5874 .setMIFlags(Flags);
5875
5876 if (ST.hasFractBug()) {
  // On subtargets reporting the fract bug, override the intrinsic results
  // when |Val| is not ordered-less-than +inf (i.e. Val is infinite or NaN):
  // the exponent becomes 0 and the mantissa becomes Val itself.
5877 auto Fabs = B.buildFAbs(Ty, Val);
5878 auto Inf = B.buildFConstant(Ty, APFloat::getInf(getFltSemanticForLLT(Ty)));
5879 auto IsFinite =
5880 B.buildFCmp(CmpInst::FCMP_OLT, LLT::scalar(1), Fabs, Inf, Flags);
5881 auto Zero = B.buildConstant(InstrExpTy, 0);
5882 Exp = B.buildSelect(InstrExpTy, IsFinite, Exp, Zero);
5883 Mant = B.buildSelect(Ty, IsFinite, Mant, Val);
5884 }
5885
5886 B.buildCopy(Res0, Mant);
  // Sign-extend or truncate the exponent to whatever type Res1 has.
5887 B.buildSExtOrTrunc(Res1, Exp);
5888
5889 MI.eraseFromParent();
5890 return true;
5891}
5892
5895 MachineIRBuilder &B) const {
  // Expands an intrinsic (inputs are operands 2 and 3) into
  //   Res = Sel * (LHS * rcp(RHS * Sel))
  // where Sel = (|RHS| > 2^96) ? 2^-32 : 1.0, all in s32, then erases MI.
5896 Register Res = MI.getOperand(0).getReg();
5897 Register LHS = MI.getOperand(2).getReg();
5898 Register RHS = MI.getOperand(3).getReg();
5899 uint16_t Flags = MI.getFlags();
5900
5901 LLT S32 = LLT::scalar(32);
5902 LLT S1 = LLT::scalar(1);
5903
5904 auto Abs = B.buildFAbs(S32, RHS, Flags);
  // NOTE(review): C0Val is not referenced anywhere else in this body.
5905 const APFloat C0Val(1.0f);
5906
  // C0: magnitude threshold; C1: scale applied above it; C2: identity scale.
5907 auto C0 = B.buildFConstant(S32, 0x1p+96f);
5908 auto C1 = B.buildFConstant(S32, 0x1p-32f);
5909 auto C2 = B.buildFConstant(S32, 1.0f);
5910
5911 auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags);
5912 auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags);
5913
  // Pre-scale the denominator before taking its reciprocal.
5914 auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags);
5915
5916 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32})
5917 .addUse(Mul0.getReg(0))
5918 .setMIFlags(Flags);
5919
5920 auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags);
5921
  // Multiply by Sel again to compensate for the pre-scaled denominator.
5922 B.buildFMul(Res, Sel, Mul1, Flags);
5923
5924 MI.eraseFromParent();
5925 return true;
5926}
5927
5930 MachineIRBuilder &B) const {
5931 // Bypass the correct expansion a standard promotion through G_FSQRT would
5932 // get. The f32 op is accurate enough for the f16 case.
5933 unsigned Flags = MI.getFlags();
  // This path is only expected on subtargets without 16-bit instructions.
5934 assert(!ST.has16BitInsts());
5935 const LLT F32 = LLT::scalar(32);
  // Extend the source (operand 1) to 32 bits, take the amdgcn sqrt, and
  // truncate the result back into operand 0.
5936 auto Ext = B.buildFPExt(F32, MI.getOperand(1), Flags);
  // NOTE(review): despite its name, Log2 holds the amdgcn_sqrt result.
5937 auto Log2 = B.buildIntrinsic(Intrinsic::amdgcn_sqrt, {F32})
5938 .addUse(Ext.getReg(0))
5939 .setMIFlags(Flags);
5940 B.buildFPTrunc(MI.getOperand(0), Log2, Flags);
5941 MI.eraseFromParent();
5942 return true;
5943}
5944
5947 MachineIRBuilder &B) const {
  // Expands a 32-bit square root of X (operand 1) into Dst (operand 0).
  // With approximate functions allowed this is a single amdgcn_sqrt;
  // otherwise small inputs are scaled up, one of two refinement sequences is
  // emitted, and the result is scaled back down.
5948 MachineFunction &MF = B.getMF();
5949 Register Dst = MI.getOperand(0).getReg();
5950 Register X = MI.getOperand(1).getReg();
5951 const unsigned Flags = MI.getFlags();
5952 const LLT S1 = LLT::scalar(1);
  // F32 and I32 are the same 32-bit scalar LLT; the two names only document
  // whether a value is being treated as float or as integer.
5953 const LLT F32 = LLT::scalar(32);
5954 const LLT I32 = LLT::scalar(32);
5955
5956 if (allowApproxFunc(MF, Flags)) {
5957 B.buildIntrinsic(Intrinsic::amdgcn_sqrt, ArrayRef<Register>({Dst}))
5958 .addUse(X)
5959 .setMIFlags(Flags);
5960 MI.eraseFromParent();
5961 return true;
5962 }
5963
  // Inputs with 2^-96 > X are multiplied by 2^32 before the square root; the
  // result is multiplied by 2^-16 (the square root of that factor) below.
5964 auto ScaleThreshold = B.buildFConstant(F32, 0x1.0p-96f);
5965 auto NeedScale = B.buildFCmp(CmpInst::FCMP_OGT, S1, ScaleThreshold, X, Flags);
5966 auto ScaleUpFactor = B.buildFConstant(F32, 0x1.0p+32f);
5967 auto ScaledX = B.buildFMul(F32, X, ScaleUpFactor, Flags);
5968 auto SqrtX = B.buildSelect(F32, NeedScale, ScaledX, X, Flags);
5969
5971 if (needsDenormHandlingF32(MF, X, Flags)) {
  // Start from the amdgcn_sqrt result, then test the two neighbouring values
  // obtained by integer-adding -1 / +1 to its 32-bit representation.
5972 B.buildIntrinsic(Intrinsic::amdgcn_sqrt, ArrayRef<Register>({SqrtS}))
5973 .addUse(SqrtX.getReg(0))
5974 .setMIFlags(Flags);
5975
5976 auto NegOne = B.buildConstant(I32, -1);
5977 auto SqrtSNextDown = B.buildAdd(I32, SqrtS, NegOne);
5978
  // SqrtVP = SqrtX - SqrtSNextDown * SqrtS.
5979 auto NegSqrtSNextDown = B.buildFNeg(F32, SqrtSNextDown, Flags);
5980 auto SqrtVP = B.buildFMA(F32, NegSqrtSNextDown, SqrtS, SqrtX, Flags);
5981
5982 auto PosOne = B.buildConstant(I32, 1);
5983 auto SqrtSNextUp = B.buildAdd(I32, SqrtS, PosOne);
5984
  // SqrtVS = SqrtX - SqrtSNextUp * SqrtS.
5985 auto NegSqrtSNextUp = B.buildFNeg(F32, SqrtSNextUp, Flags);
5986 auto SqrtVS = B.buildFMA(F32, NegSqrtSNextUp, SqrtS, SqrtX, Flags);
5987
5988 auto Zero = B.buildFConstant(F32, 0.0f);
5989 auto SqrtVPLE0 = B.buildFCmp(CmpInst::FCMP_OLE, S1, SqrtVP, Zero, Flags);
5990
  // Step down when SqrtVP <= 0 ...
5991 SqrtS =
5992 B.buildSelect(F32, SqrtVPLE0, SqrtSNextDown, SqrtS, Flags).getReg(0);
5993
  // ... and step up when SqrtVS > 0 (this select is applied second).
5994 auto SqrtVPVSGT0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, SqrtVS, Zero, Flags);
5995 SqrtS =
5996 B.buildSelect(F32, SqrtVPVSGT0, SqrtSNextUp, SqrtS, Flags).getReg(0);
5997 } else {
  // Start from rsq: SqrtS = SqrtX * rsq(SqrtX), SqrtH = 0.5 * rsq(SqrtX),
  // then refine both with FMAs.
5998 auto SqrtR =
5999 B.buildIntrinsic(Intrinsic::amdgcn_rsq, {F32}).addReg(SqrtX.getReg(0));
6000 B.buildFMul(SqrtS, SqrtX, SqrtR, Flags);
6001
6002 auto Half = B.buildFConstant(F32, 0.5f);
6003 auto SqrtH = B.buildFMul(F32, SqrtR, Half, Flags);
6004 auto NegSqrtH = B.buildFNeg(F32, SqrtH, Flags);
  // SqrtE = 0.5 - SqrtH * SqrtS.
6005 auto SqrtE = B.buildFMA(F32, NegSqrtH, SqrtS, Half, Flags);
6006 SqrtH = B.buildFMA(F32, SqrtH, SqrtE, SqrtH, Flags);
6007 SqrtS = B.buildFMA(F32, SqrtS, SqrtE, SqrtS, Flags).getReg(0);
6008 auto NegSqrtS = B.buildFNeg(F32, SqrtS, Flags);
  // SqrtD = SqrtX - SqrtS * SqrtS; final correction SqrtS += SqrtD * SqrtH.
6009 auto SqrtD = B.buildFMA(F32, NegSqrtS, SqrtS, SqrtX, Flags);
6010 SqrtS = B.buildFMA(F32, SqrtD, SqrtH, SqrtS, Flags).getReg(0);
6011 }
6012
6013 auto ScaleDownFactor = B.buildFConstant(F32, 0x1.0p-16f);
6014
6015 auto ScaledDown = B.buildFMul(F32, SqrtS, ScaleDownFactor, Flags);
6016
  // Only undo the scaling for inputs that were scaled up.
6017 SqrtS = B.buildSelect(F32, NeedScale, ScaledDown, SqrtS, Flags).getReg(0);
6018
  // Zero and +inf inputs produce the (possibly scaled) input itself.
6019 auto IsZeroOrInf = B.buildIsFPClass(LLT::scalar(1), SqrtX, fcZero | fcPosInf);
6020 B.buildSelect(Dst, IsZeroOrInf, SqrtX, SqrtS, Flags);
6021
6022 MI.eraseFromParent();
6023 return true;
6024}
6025
6028 MachineIRBuilder &B) const {
6029 // For double type, the SQRT and RSQ instructions don't have required
6030 // precision, we apply Goldschmidt's algorithm to improve the result:
6031 //
6032 // y0 = rsq(x)
6033 // g0 = x * y0
6034 // h0 = 0.5 * y0
6035 //
6036 // r0 = 0.5 - h0 * g0
6037 // g1 = g0 * r0 + g0
6038 // h1 = h0 * r0 + h0
6039 //
6040 // r1 = 0.5 - h1 * g1 => d0 = x - g1 * g1
6041 // g2 = g1 * r1 + g1 g2 = d0 * h1 + g1
6042 // h2 = h1 * r1 + h1
6043 //
6044 // r2 = 0.5 - h2 * g2 => d1 = x - g2 * g2
6045 // g3 = g2 * r2 + g2 g3 = d1 * h1 + g2
6046 //
6047 // sqrt(x) = g3
6048
6049 const LLT S1 = LLT::scalar(1);
6050 const LLT S32 = LLT::scalar(32);
6051 const LLT F64 = LLT::scalar(64);
6052
6053 Register Dst = MI.getOperand(0).getReg();
6054 assert(MRI.getType(Dst) == F64 && "only expect to lower f64 sqrt");
6055
6056 Register X = MI.getOperand(1).getReg();
6057 unsigned Flags = MI.getFlags();
6058
  // SqrtX is the value actually fed to rsq: X itself when the afn flag is
  // set, otherwise X scaled by ldexp when it is below 2^-767.
6059 Register SqrtX = X;
6060 Register Scaling, ZeroInt;
6061 if (!MI.getFlag(MachineInstr::FmAfn)) {
6062 auto ScaleConstant = B.buildFConstant(F64, 0x1.0p-767);
6063
6064 ZeroInt = B.buildConstant(S32, 0).getReg(0);
6065 Scaling = B.buildFCmp(FCmpInst::FCMP_OLT, S1, X, ScaleConstant).getReg(0);
6066
6067 // Scale up input if it is too small.
6068 auto ScaleUpFactor = B.buildConstant(S32, 256);
6069 auto ScaleUp = B.buildSelect(S32, Scaling, ScaleUpFactor, ZeroInt);
6070 SqrtX = B.buildFLdexp(F64, X, ScaleUp, Flags).getReg(0);
6071 }
6072
  // y0 = rsq(x)
6073 auto SqrtY = B.buildIntrinsic(Intrinsic::amdgcn_rsq, {F64}).addReg(SqrtX);
6074
  // h0 = 0.5 * y0, g0 = x * y0
6075 auto Half = B.buildFConstant(F64, 0.5);
6076 auto SqrtH0 = B.buildFMul(F64, SqrtY, Half);
6077 auto SqrtS0 = B.buildFMul(F64, SqrtX, SqrtY);
6078
  // r0 = 0.5 - h0 * g0
6079 auto NegSqrtH0 = B.buildFNeg(F64, SqrtH0);
6080 auto SqrtR0 = B.buildFMA(F64, NegSqrtH0, SqrtS0, Half);
6081
  // g1 = g0 * r0 + g0, h1 = h0 * r0 + h0
6082 auto SqrtS1 = B.buildFMA(F64, SqrtS0, SqrtR0, SqrtS0);
6083 auto SqrtH1 = B.buildFMA(F64, SqrtH0, SqrtR0, SqrtH0);
6084
  // d0 = x - g1 * g1
6085 auto NegSqrtS1 = B.buildFNeg(F64, SqrtS1);
6086 auto SqrtD0 = B.buildFMA(F64, NegSqrtS1, SqrtS1, SqrtX);
6087
  // g2 = d0 * h1 + g1
6088 auto SqrtS2 = B.buildFMA(F64, SqrtD0, SqrtH1, SqrtS1);
6089
  // With the afn flag the result is g2; otherwise one more correction step
  // is applied and the input scaling is undone.
6090 Register SqrtRet = SqrtS2.getReg(0);
6091 if (!MI.getFlag(MachineInstr::FmAfn)) {
  // d1 = x - g2 * g2, g3 = d1 * h1 + g2 (held in SqrtD2).
6092 auto NegSqrtS2 = B.buildFNeg(F64, SqrtS2);
6093 auto SqrtD1 = B.buildFMA(F64, NegSqrtS2, SqrtS2, SqrtX);
6094 auto SqrtD2 = B.buildFMA(F64, SqrtD1, SqrtH1, SqrtS2);
6095
6096 // Scale down the result.
  // -128 is half of the +256 exponent applied to the input above.
6097 auto ScaleDownFactor = B.buildConstant(S32, -128);
6098 auto ScaleDown = B.buildSelect(S32, Scaling, ScaleDownFactor, ZeroInt);
6099 SqrtRet = B.buildFLdexp(F64, SqrtD2, ScaleDown, Flags).getReg(0);
6100 }
6101
6102 Register IsZeroOrInf;
6103 if (MI.getFlag(MachineInstr::FmNoInfs)) {
  // With the no-infs flag only a compare against 0.0 is emitted.
6104 auto ZeroFP = B.buildFConstant(F64, 0.0);
6105 IsZeroOrInf = B.buildFCmp(FCmpInst::FCMP_OEQ, S1, SqrtX, ZeroFP).getReg(0);
6106 } else {
6107 IsZeroOrInf = B.buildIsFPClass(S1, SqrtX, fcZero | fcPosInf).getReg(0);
6108 }
6109
6110 // TODO: Check for DAZ and expand to subnormals
6111
6112 // If x is +INF, +0, or -0, use its original value
6113 B.buildSelect(Dst, IsZeroOrInf, SqrtX, SqrtRet, Flags);
6114
6115 MI.eraseFromParent();
6116 return true;
6117}
6118
6121 MachineIRBuilder &B) const {
  // Dispatches on the result type of MI (operand 0) to the s32, s64 or s16
  // square-root expansion. Any other type is reported as not legalized.
6122 LLT Ty = MRI.getType(MI.getOperand(0).getReg());
6123 if (Ty == LLT::scalar(32))
6124 return legalizeFSQRTF32(MI, MRI, B);
6125 if (Ty == LLT::scalar(64))
6126 return legalizeFSQRTF64(MI, MRI, B);
6127 if (Ty == LLT::scalar(16))
6128 return legalizeFSQRTF16(MI, MRI, B);
6129 return false;
6130}
6131
6132// Expand llvm.amdgcn.rsq.clamp on targets that don't support the instruction.
6133// FIXME: Why do we handle this one but not other removed instructions?
6134//
6135// Reciprocal square root. The clamp prevents infinite results, clamping
6136// infinities to max_float. D.f = 1.0 / sqrt(S0.f), result clamped to
6137// +-max_float.
6140 MachineIRBuilder &B) const {
  // Subtargets older than VOLCANIC_ISLANDS keep the intrinsic untouched.
6141 if (ST.getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS)
6142 return true;
6143
6144 Register Dst = MI.getOperand(0).getReg();
6145 Register Src = MI.getOperand(2).getReg();
6146 auto Flags = MI.getFlags();
6147
6148 LLT Ty = MRI.getType(Dst);
6149
  // Only s32 and s64 are expanded; any other type fails legalization.
6150 const fltSemantics *FltSemantics;
6151 if (Ty == LLT::scalar(32))
6152 FltSemantics = &APFloat::IEEEsingle();
6153 else if (Ty == LLT::scalar(64))
6154 FltSemantics = &APFloat::IEEEdouble();
6155 else
6156 return false;
6157
6158 auto Rsq = B.buildIntrinsic(Intrinsic::amdgcn_rsq, {Ty})
6159 .addUse(Src)
6160 .setMIFlags(Flags);
6161
6162 // We don't need to concern ourselves with the snan handling difference, since
6163 // the rsq quieted (or not) so use the one which will directly select.
6164 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
6165 const bool UseIEEE = MFI->getMode().IEEE;
6166
  // Clamp from above: min(rsq, +largest finite value).
6167 auto MaxFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics));
6168 auto ClampMax = UseIEEE ? B.buildFMinNumIEEE(Ty, Rsq, MaxFlt, Flags) :
6169 B.buildFMinNum(Ty, Rsq, MaxFlt, Flags);
6170
  // Clamp from below: max(ClampMax, -largest finite value), written to Dst.
6171 auto MinFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics, true));
6172
6173 if (UseIEEE)
6174 B.buildFMaxNumIEEE(Dst, ClampMax, MinFlt, Flags);
6175 else
6176 B.buildFMaxNum(Dst, ClampMax, MinFlt, Flags);
6177 MI.eraseFromParent();
6178 return true;
6179}
6180
6181// TODO: Fix pointer type handling
6184 Intrinsic::ID IID) const {
  // Legalizes the lane intrinsics handled by createLaneOp below when the
  // result type is not exactly SplitSize bits wide:
  //  * results narrower than 32 bits are any-extended to s32, the intrinsic
  //    is rebuilt on s32 and the result truncated back;
  //  * wider results whose size is a multiple of SplitSize are unmerged into
  //    pieces, one intrinsic is emitted per piece, and the pieces are merged.
  // Returns false when the size is not a multiple of SplitSize.
6185
6186 MachineIRBuilder &B = Helper.MIRBuilder;
6187 MachineRegisterInfo &MRI = *B.getMRI();
6188
6189 bool IsPermLane16 = IID == Intrinsic::amdgcn_permlane16 ||
6190 IID == Intrinsic::amdgcn_permlanex16;
6191 bool IsSetInactive = IID == Intrinsic::amdgcn_set_inactive ||
6192 IID == Intrinsic::amdgcn_set_inactive_chain_arg;
6193 bool IsPermlaneShuffle = IID == Intrinsic::amdgcn_permlane_bcast ||
6194 IID == Intrinsic::amdgcn_permlane_up ||
6195 IID == Intrinsic::amdgcn_permlane_down ||
6196 IID == Intrinsic::amdgcn_permlane_xor;
6197
  // Builds one IID intrinsic with result type VT and first source Src0, then
  // appends the remaining operands that the particular intrinsic carries.
  // Register operands come from the arguments (or, for permlane16 Src3, from
  // MI); immediate operands are always re-read from the original MI.
6198 auto createLaneOp = [&IID, &B, &MI](Register Src0, Register Src1,
6199 Register Src2, LLT VT) -> Register {
6200 auto LaneOp = B.buildIntrinsic(IID, {VT}).addUse(Src0);
6201 switch (IID) {
6202 case Intrinsic::amdgcn_readfirstlane:
6203 case Intrinsic::amdgcn_permlane64:
6204 return LaneOp.getReg(0);
6205 case Intrinsic::amdgcn_readlane:
6206 case Intrinsic::amdgcn_set_inactive:
6207 case Intrinsic::amdgcn_set_inactive_chain_arg:
6208 return LaneOp.addUse(Src1).getReg(0);
6209 case Intrinsic::amdgcn_writelane:
6210 case Intrinsic::amdgcn_permlane_bcast:
6211 case Intrinsic::amdgcn_permlane_up:
6212 case Intrinsic::amdgcn_permlane_down:
6213 case Intrinsic::amdgcn_permlane_xor:
6214 return LaneOp.addUse(Src1).addUse(Src2).getReg(0);
6215 case Intrinsic::amdgcn_permlane16:
6216 case Intrinsic::amdgcn_permlanex16: {
6217 Register Src3 = MI.getOperand(5).getReg();
6218 int64_t Src4 = MI.getOperand(6).getImm();
6219 int64_t Src5 = MI.getOperand(7).getImm();
6220 return LaneOp.addUse(Src1)
6221 .addUse(Src2)
6222 .addUse(Src3)
6223 .addImm(Src4)
6224 .addImm(Src5)
6225 .getReg(0);
6226 }
6227 case Intrinsic::amdgcn_mov_dpp8:
6228 return LaneOp.addImm(MI.getOperand(3).getImm()).getReg(0);
6229 case Intrinsic::amdgcn_update_dpp:
6230 return LaneOp.addUse(Src1)
6231 .addImm(MI.getOperand(4).getImm())
6232 .addImm(MI.getOperand(5).getImm())
6233 .addImm(MI.getOperand(6).getImm())
6234 .addImm(MI.getOperand(7).getImm())
6235 .getReg(0);
6236 default:
6237 llvm_unreachable("unhandled lane op");
6238 }
6239 };
6240
  // Operand 0 is the result and operand 2 the first source. Src1 (operand 3)
  // and Src2 (operand 4) are only read for the intrinsics that have them.
6241 Register DstReg = MI.getOperand(0).getReg();
6242 Register Src0 = MI.getOperand(2).getReg();
6243 Register Src1, Src2;
6244 if (IID == Intrinsic::amdgcn_readlane || IID == Intrinsic::amdgcn_writelane ||
6245 IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16 ||
6246 IsPermlaneShuffle) {
6247 Src1 = MI.getOperand(3).getReg();
6248 if (IID == Intrinsic::amdgcn_writelane || IsPermLane16 ||
6249 IsPermlaneShuffle) {
6250 Src2 = MI.getOperand(4).getReg();
6251 }
6252 }
6253
6254 LLT Ty = MRI.getType(DstReg);
6255 unsigned Size = Ty.getSizeInBits();
6256
  // Pieces are 32 bits wide, except that update_dpp uses 64-bit pieces when
  // the size is a multiple of 64, the subtarget has DPALU DPP, and the DPP
  // control immediate (operand 4) is legal for it.
6257 unsigned SplitSize = 32;
6258 if (IID == Intrinsic::amdgcn_update_dpp && (Size % 64 == 0) &&
6259 ST.hasDPALU_DPP() &&
6260 AMDGPU::isLegalDPALU_DPPControl(ST, MI.getOperand(4).getImm()))
6261 SplitSize = 64;
6262
6263 if (Size == SplitSize) {
6264 // Already legal
6265 return true;
6266 }
6267
6268 if (Size < 32) {
  // Narrow case: widen the value operands to 32 bits. Src1 is widened only
  // for update_dpp / set_inactive / permlane16 and Src2 only for writelane;
  // in all other cases they are passed through unchanged.
6269 Src0 = B.buildAnyExt(S32, Src0).getReg(0);
6270
6271 if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16)
6272 Src1 = B.buildAnyExt(LLT::scalar(32), Src1).getReg(0);
6273
6274 if (IID == Intrinsic::amdgcn_writelane)
6275 Src2 = B.buildAnyExt(LLT::scalar(32), Src2).getReg(0);
6276
6277 Register LaneOpDst = createLaneOp(Src0, Src1, Src2, S32);
6278 B.buildTrunc(DstReg, LaneOpDst);
6279 MI.eraseFromParent();
6280 return true;
6281 }
6282
6283 if (Size % SplitSize != 0)
6284 return false;
6285
  // Choose the type of each piece. Scalars use a SplitSize-bit scalar.
  // Vectors use the element type when it is exactly SplitSize bits, or a
  // sub-vector of SplitSize / EltSize elements for 16- and 32-bit elements.
6286 LLT PartialResTy = LLT::scalar(SplitSize);
6287 bool NeedsBitcast = false;
6288 if (Ty.isVector()) {
6289 LLT EltTy = Ty.getElementType();
6290 unsigned EltSize = EltTy.getSizeInBits();
6291 if (EltSize == SplitSize) {
6292 PartialResTy = EltTy;
6293 } else if (EltSize == 16 || EltSize == 32) {
6294 unsigned NElem = SplitSize / EltSize;
6295 PartialResTy = Ty.changeElementCount(ElementCount::getFixed(NElem));
6296 } else {
6297 // Handle all other cases via S32/S64 pieces
6298 NeedsBitcast = true;
6299 }
6300 }
6301
6302 SmallVector<Register, 4> PartialRes;
6303 unsigned NumParts = Size / SplitSize;
6304 MachineInstrBuilder Src0Parts = B.buildUnmerge(PartialResTy, Src0);
6305 MachineInstrBuilder Src1Parts, Src2Parts;
6306
  // The same per-intrinsic conditions as in the narrow case decide which of
  // Src1 / Src2 are split alongside Src0.
6307 if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16)
6308 Src1Parts = B.buildUnmerge(PartialResTy, Src1);
6309
6310 if (IID == Intrinsic::amdgcn_writelane)
6311 Src2Parts = B.buildUnmerge(PartialResTy, Src2);
6312
  // Emit one lane op per piece, in piece order.
6313 for (unsigned i = 0; i < NumParts; ++i) {
6314 Src0 = Src0Parts.getReg(i);
6315
6316 if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16)
6317 Src1 = Src1Parts.getReg(i);
6318
6319 if (IID == Intrinsic::amdgcn_writelane)
6320 Src2 = Src2Parts.getReg(i);
6321
6322 PartialRes.push_back(createLaneOp(Src0, Src1, Src2, PartialResTy));
6323 }
6324
  // Reassemble the result; the bitcast path first merges into a scalar of
  // the full width and then casts to the destination type.
6325 if (NeedsBitcast)
6326 B.buildBitcast(DstReg, B.buildMergeLikeInstr(
6327 LLT::scalar(Ty.getSizeInBits()), PartialRes));
6328 else
6329 B.buildMergeLikeInstr(DstReg, PartialRes);
6330
6331 MI.eraseFromParent();
6332 return true;
6333}
6334
6337 MachineIRBuilder &B) const {
  // Writes into DstReg a pointer formed from a loaded input pointer plus the
  // implicit-parameter offset obtained from the target lowering. Returns
  // false when loadInputValue fails. This helper does not erase any
  // instruction.
  // NOTE(review): several lines of this listing (the offset variable's
  // declaration and the trailing call arguments) are not visible here.
6339 ST.getTargetLowering()->getImplicitParameterOffset(
6341 LLT DstTy = MRI.getType(DstReg);
  // The offset constant is an integer with the same bit width as DstTy.
6342 LLT IdxTy = LLT::scalar(DstTy.getSizeInBits());
6343
6344 Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy);
6345 if (!loadInputValue(KernargPtrReg, B,
6347 return false;
6348
6349 B.buildObjectPtrOffset(DstReg, KernargPtrReg,
6350 B.buildConstant(IdxTy, Offset).getReg(0));
6351 return true;
6352}
6353
6354/// To create a buffer resource from a 64-bit pointer, mask off the upper 32
6355/// bits of the pointer and replace them with the stride argument, then
6356/// merge_values everything together. In the common case of a raw buffer (the
6357/// stride component is 0), we can just AND off the upper half.
6360 Register Result = MI.getOperand(0).getReg();
6361 Register Pointer = MI.getOperand(2).getReg();
6362 Register Stride = MI.getOperand(3).getReg();
6363 Register NumRecords = MI.getOperand(4).getReg();
6364 Register Flags = MI.getOperand(5).getReg();
6365
6366 LLT S32 = LLT::scalar(32);
6367 LLT S64 = LLT::scalar(64);
6368
  // Advance the builder's insertion point by one position before emitting.
  // NOTE(review): presumably so the new instructions land after MI — confirm.
6369 B.setInsertPt(B.getMBB(), ++B.getInsertPt());
6370
  // The stride is widened to 32 bits so it can be shifted into place.
6371 auto ExtStride = B.buildAnyExt(S32, Stride);
6372
6373 if (ST.has45BitNumRecordsBufferResource()) {
  // Layout for subtargets with 45-bit num_records: the result is built from
  // two 64-bit halves.
6374 Register Zero = B.buildConstant(S32, 0).getReg(0);
6375 // Build the lower 64-bit value, which has a 57-bit base and the lower 7-bit
6376 // num_records.
6377 LLT PtrIntTy = LLT::scalar(MRI.getType(Pointer).getSizeInBits());
6378 auto PointerInt = B.buildPtrToInt(PtrIntTy, Pointer);
6379 auto ExtPointer = B.buildAnyExtOrTrunc(S64, PointerInt);
6380 auto NumRecordsLHS = B.buildShl(S64, NumRecords, B.buildConstant(S32, 57));
6381 Register LowHalf = B.buildOr(S64, ExtPointer, NumRecordsLHS).getReg(0);
6382
6383 // Build the higher 64-bit value, which has the higher 38-bit num_records,
6384 // 6-bit zero (omit), 16-bit stride and scale and 4-bit flag.
6385 auto NumRecordsRHS = B.buildLShr(S64, NumRecords, B.buildConstant(S32, 7));
  // Stride and flags are shifted within 32 bits (by 12 and 28) and then
  // placed in the second 32-bit piece of a 64-bit value whose first piece
  // is zero.
6386 auto ShiftedStride = B.buildShl(S32, ExtStride, B.buildConstant(S32, 12));
6387 auto ExtShiftedStride =
6388 B.buildMergeValues(S64, {Zero, ShiftedStride.getReg(0)});
6389 auto ShiftedFlags = B.buildShl(S32, Flags, B.buildConstant(S32, 28));
6390 auto ExtShiftedFlags =
6391 B.buildMergeValues(S64, {Zero, ShiftedFlags.getReg(0)});
6392 auto CombinedFields = B.buildOr(S64, NumRecordsRHS, ExtShiftedStride);
6393 Register HighHalf =
6394 B.buildOr(S64, CombinedFields, ExtShiftedFlags).getReg(0);
6395 B.buildMergeValues(Result, {LowHalf, HighHalf});
6396 } else {
  // Other subtargets: the result is four 32-bit pieces
  // {pointer piece 0, masked pointer piece 1 | stride << 16, num_records,
  // flags}.
6397 NumRecords = B.buildTrunc(S32, NumRecords).getReg(0);
6398 auto Unmerge = B.buildUnmerge(S32, Pointer);
6399 auto LowHalf = Unmerge.getReg(0);
6400 auto HighHalf = Unmerge.getReg(1);
6401
  // Keep only the low 16 bits of the second pointer piece; the stride goes
  // into the upper 16 bits.
6402 auto AndMask = B.buildConstant(S32, 0x0000ffff);
6403 auto Masked = B.buildAnd(S32, HighHalf, AndMask);
6404 auto ShiftConst = B.buildConstant(S32, 16);
6405 auto ShiftedStride = B.buildShl(S32, ExtStride, ShiftConst);
6406 auto NewHighHalf = B.buildOr(S32, Masked, ShiftedStride);
6407 Register NewHighHalfReg = NewHighHalf.getReg(0);
6408 B.buildMergeValues(Result, {LowHalf, NewHighHalfReg, NumRecords, Flags});
6409 }
6410
6411 MI.eraseFromParent();
6412 return true;
6413}
6414
6417 MachineIRBuilder &B) const {
  // Non-entry functions defer to legalizePreloadedArgIntrin; entry functions
  // compute the pointer with getImplicitArgPtr and then erase MI.
6418 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
6419 if (!MFI->isEntryFunction()) {
6420 return legalizePreloadedArgIntrin(MI, MRI, B,
6422 }
6423
6424 Register DstReg = MI.getOperand(0).getReg();
  // MI is left in place when the helper fails.
6425 if (!getImplicitArgPtr(DstReg, MRI, B))
6426 return false;
6427
6428 MI.eraseFromParent();
6429 return true;
6430}
6431
6434 MachineIRBuilder &B) const {
  // Looks up an optional 32-bit value for the current function and, when it
  // is present, materializes it as a constant in DstReg.
  // NOTE(review): the initializer of KnownSize is not visible in this
  // listing.
  // NOTE(review): this returns false unconditionally, even after the
  // constant has been built; the caller treats false as failure and does not
  // erase its instruction. Confirm whether the has_value() path should
  // return true.
6435 Function &F = B.getMF().getFunction();
6436 std::optional<uint32_t> KnownSize =
6438 if (KnownSize.has_value())
6439 B.buildConstant(DstReg, *KnownSize);
6440 return false;
6441}
6442
6445 MachineIRBuilder &B) const {
  // Non-entry functions defer to legalizePreloadedArgIntrin; entry functions
  // ask getLDSKernelId to define operand 0 and erase MI on success.
6446
6447 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
6448 if (!MFI->isEntryFunction()) {
6449 return legalizePreloadedArgIntrin(MI, MRI, B,
6451 }
6452
6453 Register DstReg = MI.getOperand(0).getReg();
  // MI is left in place when the helper reports failure.
6454 if (!getLDSKernelId(DstReg, MRI, B))
6455 return false;
6456
6457 MI.eraseFromParent();
6458 return true;
6459}
6460
6464 unsigned AddrSpace) const {
  // Replaces MI with a compare on the second 32-bit piece (index 1) of the
  // pointer in operand 2; the compare result is written to operand 0.
6465 const LLT S32 = LLT::scalar(32);
6466 auto Unmerge = B.buildUnmerge(S32, MI.getOperand(2).getReg());
6467 Register Hi32 = Unmerge.getReg(1);
6468
6469 if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS &&
6470 ST.hasGloballyAddressableScratch()) {
  // Private address space with globally addressable scratch: copy
  // SRC_FLAT_SCRATCH_BASE_HI into an SReg_32 virtual register and compare
  // against it.
6471 Register FlatScratchBaseHi =
6472 B.buildInstr(AMDGPU::S_MOV_B32, {S32},
6473 {Register(AMDGPU::SRC_FLAT_SCRATCH_BASE_HI)})
6474 .getReg(0);
6475 MRI.setRegClass(FlatScratchBaseHi, &AMDGPU::SReg_32RegClass);
6476 // Test bits 63..58 against the aperture address.
  // After the XOR, the top 6 bits of Hi32 match exactly when the XOR result
  // is below 1 << 26.
6477 Register XOR = B.buildXor(S32, Hi32, FlatScratchBaseHi).getReg(0);
6478 B.buildICmp(ICmpInst::ICMP_ULT, MI.getOperand(0), XOR,
6479 B.buildConstant(S32, 1u << 26));
6480 } else {
  // Otherwise test Hi32 for equality with the segment aperture register.
6481 Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B);
6482 B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg);
6483 }
6484 MI.eraseFromParent();
6485 return true;
6486}
6487
6488// The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args:
6489// offset (the offset that is included in bounds checking and swizzling, to be
6490// split between the instruction's voffset and immoffset fields) and soffset
6491// (the offset that is excluded from bounds checking and swizzling, to go in
6492// the instruction's soffset field). This function takes the first kind of
6493// offset and figures out how to split it between voffset and immoffset.
6494std::pair<Register, unsigned>
6496 Register OrigOffset) const {
  // Returns {voffset register, immediate offset}. The register is always
  // valid on return: a zero constant is built when there is no base.
6497 const unsigned MaxImm = SIInstrInfo::getMaxMUBUFImmOffset(ST);
6498 Register BaseReg;
6499 unsigned ImmOffset;
6500 const LLT S32 = LLT::scalar(32);
6501 MachineRegisterInfo &MRI = *B.getMRI();
6502
6503 // On GFX1250+, voffset and immoffset are zero-extended from 32 bits before
6504 // being added, so we can only safely match a 32-bit addition with no unsigned
6505 // overflow.
6506 bool CheckNUW = ST.hasGFX1250Insts();
6507 std::tie(BaseReg, ImmOffset) = AMDGPU::getBaseWithConstantOffset(
6508 MRI, OrigOffset, /*KnownBits=*/nullptr, CheckNUW);
6509
6510 // If BaseReg is a pointer, convert it to int.
6511 if (MRI.getType(BaseReg).isPointer())
6512 BaseReg = B.buildPtrToInt(MRI.getType(OrigOffset), BaseReg).getReg(0);
6513
6514 // If the immediate value is too big for the immoffset field, put only bits
6515 // that would normally fit in the immoffset field. The remaining value that
6516 // is copied/added for the voffset field is a large power of 2, and it
6517 // stands more chance of being CSEd with the copy/add for another similar
6518 // load/store.
6519 // However, do not do that rounding down if that is a negative
6520 // number, as it appears to be illegal to have a negative offset in the
6521 // vgpr, even if adding the immediate offset makes it positive.
  // Overflow holds the bits of ImmOffset outside the MaxImm mask; ImmOffset
  // keeps only the bits inside it.
6522 unsigned Overflow = ImmOffset & ~MaxImm;
6523 ImmOffset -= Overflow;
6524 if ((int32_t)Overflow < 0) {
  // Negative overflow: move the whole constant to the register side.
6525 Overflow += ImmOffset;
6526 ImmOffset = 0;
6527 }
6528
  // Fold the overflow into the register operand, creating it if needed.
6529 if (Overflow != 0) {
6530 if (!BaseReg) {
6531 BaseReg = B.buildConstant(S32, Overflow).getReg(0);
6532 } else {
6533 auto OverflowVal = B.buildConstant(S32, Overflow);
6534 BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0);
6535 }
6536 }
6537
6538 if (!BaseReg)
6539 BaseReg = B.buildConstant(S32, 0).getReg(0);
6540
6541 return std::pair(BaseReg, ImmOffset);
6542}
6543
6544/// Handle register layout difference for f16 images for some subtargets.
6547 Register Reg,
6548 bool ImageStore) const {
  // Reg must be a vector of s16 elements (asserted below). Returns the
  // register to use in its place, which is Reg itself when no repacking is
  // required.
6549 const LLT S16 = LLT::scalar(16);
6550 const LLT S32 = LLT::scalar(32);
6551 LLT StoreVT = MRI.getType(Reg);
6552 assert(StoreVT.isVector() && StoreVT.getElementType() == S16);
6553
6554 if (ST.hasUnpackedD16VMem()) {
  // Unpacked layout: any-extend every s16 element to s32 and rebuild a
  // vector with the same number of elements.
6555 auto Unmerge = B.buildUnmerge(S16, Reg);
6556
6557 SmallVector<Register, 4> WideRegs;
  // The unmerge's last operand is its source, hence getNumOperands() - 1.
6558 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
6559 WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0));
6560
6561 int NumElts = StoreVT.getNumElements();
6562
6563 return B.buildBuildVector(LLT::fixed_vector(NumElts, S32), WideRegs)
6564 .getReg(0);
6565 }
6566
6567 if (ImageStore && ST.hasImageStoreD16Bug()) {
  // Image stores on subtargets with the D16 store bug: pad the packed data
  // with undef so the register has as many 32-bit pieces as there are s16
  // elements.
6568 if (StoreVT.getNumElements() == 2) {
  // 2 x s16 -> {bitcast to s32, undef} as 2 x s32.
6569 SmallVector<Register, 4> PackedRegs;
6570 Reg = B.buildBitcast(S32, Reg).getReg(0);
6571 PackedRegs.push_back(Reg);
6572 PackedRegs.resize(2, B.buildUndef(S32).getReg(0));
6573 return B.buildBuildVector(LLT::fixed_vector(2, S32), PackedRegs)
6574 .getReg(0);
6575 }
6576
6577 if (StoreVT.getNumElements() == 3) {
  // 3 x s16 -> pad to 6 x s16 with undef, then bitcast to 3 x s32.
6578 SmallVector<Register, 4> PackedRegs;
6579 auto Unmerge = B.buildUnmerge(S16, Reg);
6580 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
6581 PackedRegs.push_back(Unmerge.getReg(I));
6582 PackedRegs.resize(6, B.buildUndef(S16).getReg(0));
6583 Reg = B.buildBuildVector(LLT::fixed_vector(6, S16), PackedRegs).getReg(0);
6584 return B.buildBitcast(LLT::fixed_vector(3, S32), Reg).getReg(0);
6585 }
6586
6587 if (StoreVT.getNumElements() == 4) {
  // 4 x s16 -> bitcast to 2 x s32, then pad to 4 x s32 with undef.
6588 SmallVector<Register, 4> PackedRegs;
6589 Reg = B.buildBitcast(LLT::fixed_vector(2, S32), Reg).getReg(0);
6590 auto Unmerge = B.buildUnmerge(S32, Reg);
6591 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
6592 PackedRegs.push_back(Unmerge.getReg(I));
6593 PackedRegs.resize(4, B.buildUndef(S32).getReg(0));
6594 return B.buildBuildVector(LLT::fixed_vector(4, S32), PackedRegs)
6595 .getReg(0);
6596 }
6597
6598 llvm_unreachable("invalid data type");
6599 }
6600
  // Packed layout without the bug: only 3 x s16 is changed, by padding it to
  // 4 x s16 with undef elements.
6601 if (StoreVT == LLT::fixed_vector(3, S16)) {
6602 Reg = B.buildPadVectorWithUndefElements(LLT::fixed_vector(4, S16), Reg)
6603 .getReg(0);
6604 }
6605 return Reg;
6606}
6607
6609 Register VData, LLT MemTy,
6610 bool IsFormat) const {
  // Returns the register to store in place of VData, after converting its
  // type to one the buffer store pseudos accept. New instructions are only
  // built when a conversion is required.
6611 MachineRegisterInfo *MRI = B.getMRI();
6612 LLT Ty = MRI->getType(VData);
6613
6614 const LLT S16 = LLT::scalar(16);
6615
6616 // Fixup buffer resources themselves needing to be v4i32.
6618 return castBufferRsrcToV4I32(VData, B);
6619
6620 if (shouldBitcastLoadStoreType(ST, Ty, MemTy)) {
6621 Ty = getBitcastRegisterType(Ty);
6622 VData = B.buildBitcast(Ty, VData).getReg(0);
6623 }
6624 // Fixup illegal register types for i8 and i16 stores.
6625 if (Ty == LLT::scalar(8) || Ty == S16) {
6626 Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0);
6627 return AnyExt;
6628 }
6629
  // Format stores of s16 vectors with at most 4 elements go through the D16
  // repacking helper. It is called without the ImageStore argument.
6630 if (Ty.isVector()) {
6631 if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) {
6632 if (IsFormat)
6633 return handleD16VData(B, *MRI, VData);
6634 }
6635 }
6636
6637 return VData;
6638}
6639
6641 LegalizerHelper &Helper,
6642 bool IsTyped,
6643 bool IsFormat) const {
  // Rewrites a buffer store intrinsic into the matching G_AMDGPU_*BUFFER_STORE*
  // pseudo with a uniform operand list, then erases MI.
6644 MachineIRBuilder &B = Helper.MIRBuilder;
6645 MachineRegisterInfo &MRI = *B.getMRI();
6646
6647 Register VData = MI.getOperand(1).getReg();
6648 LLT Ty = MRI.getType(VData);
6649 LLT EltTy = Ty.getScalarType();
  // D16 is decided from the original data type, before any type fix-up.
6650 const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
6651 const LLT S32 = LLT::scalar(32);
6652
  // Only the first memory operand is consulted.
6653 MachineMemOperand *MMO = *MI.memoperands_begin();
6654 const int MemSize = MMO->getSize().getValue();
6655 LLT MemTy = MMO->getMemoryType();
6656
6657 VData = fixStoreSourceType(B, VData, MemTy, IsFormat);
6658
6660 Register RSrc = MI.getOperand(2).getReg();
6661
6662 unsigned ImmOffset;
6663
6664 // The typed intrinsics add an immediate after the registers.
6665 const unsigned NumVIndexOps = IsTyped ? 8 : 7;
6666
6667 // The struct intrinsic variants add one additional operand over raw.
6668 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
6669 Register VIndex;
  // OpOffset counts the optional operands consumed so far, so the indices
  // below work for every variant.
6670 int OpOffset = 0;
6671 if (HasVIndex) {
6672 VIndex = MI.getOperand(3).getReg();
6673 OpOffset = 1;
6674 } else {
  // Variants without a vindex operand get a zero constant instead.
6675 VIndex = B.buildConstant(S32, 0).getReg(0);
6676 }
6677
6678 Register VOffset = MI.getOperand(3 + OpOffset).getReg();
6679 Register SOffset = MI.getOperand(4 + OpOffset).getReg();
6680
6681 unsigned Format = 0;
6682 if (IsTyped) {
6683 Format = MI.getOperand(5 + OpOffset).getImm();
6684 ++OpOffset;
6685 }
6686
6687 unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
6688
  // Split the offset into a register part and an immediate part.
6689 std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset);
6690
  // Pick the opcode: typed and format stores choose by D16-ness, plain
  // stores by memory size in bytes (1, 2, or anything else).
6691 unsigned Opc;
6692 if (IsTyped) {
6693 Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16 :
6694 AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT;
6695 } else if (IsFormat) {
6696 Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16 :
6697 AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT;
6698 } else {
6699 switch (MemSize) {
6700 case 1:
6701 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE;
6702 break;
6703 case 2:
6704 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT;
6705 break;
6706 default:
6707 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE;
6708 break;
6709 }
6710 }
6711
6712 auto MIB = B.buildInstr(Opc)
6713 .addUse(VData) // vdata
6714 .addUse(RSrc) // rsrc
6715 .addUse(VIndex) // vindex
6716 .addUse(VOffset) // voffset
6717 .addUse(SOffset) // soffset
6718 .addImm(ImmOffset); // offset(imm)
6719
  // The format immediate is only present on the typed pseudos.
6720 if (IsTyped)
6721 MIB.addImm(Format);
6722
6723 MIB.addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm)
6724 .addImm(HasVIndex ? -1 : 0) // idxen(imm)
6725 .addMemOperand(MMO);
6726
6727 MI.eraseFromParent();
6728 return true;
6729}
6730
6731static void buildBufferLoad(unsigned Opc, Register LoadDstReg, Register RSrc,
6732 Register VIndex, Register VOffset, Register SOffset,
6733 unsigned ImmOffset, unsigned Format,
6734 unsigned AuxiliaryData, MachineMemOperand *MMO,
6735 bool IsTyped, bool HasVIndex, MachineIRBuilder &B) {
6736 auto MIB = B.buildInstr(Opc)
6737 .addDef(LoadDstReg) // vdata
6738 .addUse(RSrc) // rsrc
6739 .addUse(VIndex) // vindex
6740 .addUse(VOffset) // voffset
6741 .addUse(SOffset) // soffset
6742 .addImm(ImmOffset); // offset(imm)
6743
6744 if (IsTyped)
6745 MIB.addImm(Format);
6746
6747 MIB.addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm)
6748 .addImm(HasVIndex ? -1 : 0) // idxen(imm)
6749 .addMemOperand(MMO);
6750}
6751
/// Rewrite a buffer-load intrinsic \p MI into one of the
/// G_AMDGPU_(T)BUFFER_LOAD* pseudos chosen below, then erase \p MI.
///
/// \p IsFormat and \p IsTyped select the format / typed opcode families.
/// Returns false when a TFE (two-def) load is requested for the typed or the
/// D16 format variants, which are not handled here; returns true otherwise.
///
/// NOTE(review): listing line 6752 (the head of the signature, which
/// presumably names the function and declares the `MI` parameter) is absent
/// from this rendering - confirm against the real source.
6753 LegalizerHelper &Helper,
6754 bool IsFormat,
6755 bool IsTyped) const {
6756 MachineIRBuilder &B = Helper.MIRBuilder;
6757 MachineRegisterInfo &MRI = *B.getMRI();
6758 GISelChangeObserver &Observer = Helper.Observer;
6759
6760 // FIXME: Verifier should enforce 1 MMO for these intrinsics.
6761 MachineMemOperand *MMO = *MI.memoperands_begin();
6762 const LLT MemTy = MMO->getMemoryType();
6763 const LLT S32 = LLT::scalar(32);
6764
6765 Register Dst = MI.getOperand(0).getReg();
6766
// A second explicit def means the TFE form: operand 1 is the status result
// and every later operand index shifts by one (tracked in OpOffset).
6767 Register StatusDst;
6768 int OpOffset = 0;
6769 assert(MI.getNumExplicitDefs() == 1 || MI.getNumExplicitDefs() == 2);
6770 bool IsTFE = MI.getNumExplicitDefs() == 2;
6771 if (IsTFE) {
6772 StatusDst = MI.getOperand(1).getReg();
6773 ++OpOffset;
6774 }
6775
6776 castBufferRsrcArgToV4I32(MI, B, 2 + OpOffset);
6777 Register RSrc = MI.getOperand(2 + OpOffset).getReg();
6778
6779 // The typed intrinsics add an immediate after the registers.
6780 const unsigned NumVIndexOps = IsTyped ? 8 : 7;
6781
6782 // The struct intrinsic variants add one additional operand over raw.
6783 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps + OpOffset;
6784 Register VIndex;
6785 if (HasVIndex) {
6786 VIndex = MI.getOperand(3 + OpOffset).getReg();
6787 ++OpOffset;
6788 } else {
// No vindex operand: use a constant-zero s32 register instead.
6789 VIndex = B.buildConstant(S32, 0).getReg(0);
6790 }
6791
6792 Register VOffset = MI.getOperand(3 + OpOffset).getReg();
6793 Register SOffset = MI.getOperand(4 + OpOffset).getReg();
6794
6795 unsigned Format = 0;
6796 if (IsTyped) {
6797 Format = MI.getOperand(5 + OpOffset).getImm();
6798 ++OpOffset;
6799 }
6800
6801 unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
6802 unsigned ImmOffset;
6803
6804 LLT Ty = MRI.getType(Dst);
6805 // Make addrspace 8 pointers loads into 4xs32 loads here, so the rest of the
6806 // logic doesn't have to handle that case.
6807 if (hasBufferRsrcWorkaround(Ty)) {
6808 Observer.changingInstr(MI);
6809 Ty = castBufferRsrcFromV4I32(MI, B, MRI, 0);
6810 Observer.changedInstr(MI);
// The cast replaced operand 0, so re-read Dst and move the insertion point
// back to MI.
6811 Dst = MI.getOperand(0).getReg();
6812 B.setInsertPt(B.getMBB(), MI);
6813 }
6814 if (shouldBitcastLoadStoreType(ST, Ty, MemTy)) {
6815 Ty = getBitcastRegisterType(Ty);
6816 Observer.changingInstr(MI);
6817 Helper.bitcastDst(MI, Ty, 0);
6818 Observer.changedInstr(MI);
6819 Dst = MI.getOperand(0).getReg();
6820 B.setInsertPt(B.getMBB(), MI);
6821 }
6822
6823 LLT EltTy = Ty.getScalarType();
6824 const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
6825 const bool Unpacked = ST.hasUnpackedD16VMem();
6826
// splitBufferOffsets is defined elsewhere; it yields the replacement VOffset
// register and the value later emitted as the offset(imm) operand.
6827 std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset);
6828
6829 unsigned Opc;
6830
6831 // TODO: Support TFE for typed and narrow loads.
6832 if (IsTyped) {
6833 if (IsTFE)
6834 return false;
6835 Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 :
6836 AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT;
6837 } else if (IsFormat) {
6838 if (IsD16) {
6839 if (IsTFE)
6840 return false;
6841 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16;
6842 } else {
6843 Opc = IsTFE ? AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_TFE
6844 : AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT;
6845 }
6846 } else {
// Plain loads pick the opcode from the memory size: 8 and 16 bits have
// dedicated UBYTE / USHORT forms, everything else uses the generic load.
6847 switch (MemTy.getSizeInBits()) {
6848 case 8:
6849 Opc = IsTFE ? AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE_TFE
6850 : AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE;
6851 break;
6852 case 16:
6853 Opc = IsTFE ? AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT_TFE
6854 : AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT;
6855 break;
6856 default:
6857 Opc = IsTFE ? AMDGPU::G_AMDGPU_BUFFER_LOAD_TFE
6858 : AMDGPU::G_AMDGPU_BUFFER_LOAD;
6859 break;
6860 }
6861 }
6862
6863 if (IsTFE) {
// TFE: load into a vector of s32 with one element more than the value
// needs; the trailing element is unmerged into StatusDst.
6864 unsigned NumValueDWords = divideCeil(Ty.getSizeInBits(), 32);
6865 unsigned NumLoadDWords = NumValueDWords + 1;
6866 LLT LoadTy = LLT::fixed_vector(NumLoadDWords, S32);
6867 Register LoadDstReg = B.getMRI()->createGenericVirtualRegister(LoadTy);
6868 buildBufferLoad(Opc, LoadDstReg, RSrc, VIndex, VOffset, SOffset, ImmOffset,
6869 Format, AuxiliaryData, MMO, IsTyped, HasVIndex, B);
6870 if (MemTy.getSizeInBits() < 32) {
// Sub-dword memory type: the value arrives in an s32 and is truncated.
6871 Register ExtDst = B.getMRI()->createGenericVirtualRegister(S32);
6872 B.buildUnmerge({ExtDst, StatusDst}, LoadDstReg);
6873 B.buildTrunc(Dst, ExtDst);
6874 } else if (NumValueDWords == 1) {
6875 B.buildUnmerge({Dst, StatusDst}, LoadDstReg);
6876 } else {
// Multi-dword value: unmerge into s32 pieces plus the status, drop the
// status from the list, and merge the remaining pieces into Dst.
6877 SmallVector<Register, 5> LoadElts;
6878 for (unsigned I = 0; I != NumValueDWords; ++I)
6879 LoadElts.push_back(B.getMRI()->createGenericVirtualRegister(S32));
6880 LoadElts.push_back(StatusDst);
6881 B.buildUnmerge(LoadElts, LoadDstReg);
6882 LoadElts.truncate(NumValueDWords);
6883 B.buildMergeLikeInstr(Dst, LoadElts);
6884 }
6885 } else if ((!IsD16 && MemTy.getSizeInBits() < 32) ||
6886 (IsD16 && !Ty.isVector())) {
// Narrow non-D16 load or scalar D16 load: load an s32 and truncate to Dst
// right after the new load instruction.
6887 Register LoadDstReg = B.getMRI()->createGenericVirtualRegister(S32);
6888 buildBufferLoad(Opc, LoadDstReg, RSrc, VIndex, VOffset, SOffset, ImmOffset,
6889 Format, AuxiliaryData, MMO, IsTyped, HasVIndex, B);
6890 B.setInsertPt(B.getMBB(), ++B.getInsertPt());
6891 B.buildTrunc(Dst, LoadDstReg);
6892 } else if (Unpacked && IsD16 && Ty.isVector()) {
// Unpacked D16 vector: load with 32-bit elements, then truncate each
// element to EltTy and merge the results into Dst.
6893 LLT UnpackedTy = Ty.changeElementSize(32);
6894 Register LoadDstReg = B.getMRI()->createGenericVirtualRegister(UnpackedTy);
6895 buildBufferLoad(Opc, LoadDstReg, RSrc, VIndex, VOffset, SOffset, ImmOffset,
6896 Format, AuxiliaryData, MMO, IsTyped, HasVIndex, B);
6897 B.setInsertPt(B.getMBB(), ++B.getInsertPt());
6898 // FIXME: G_TRUNC should work, but legalization currently fails
6899 auto Unmerge = B.buildUnmerge(S32, LoadDstReg);
// NOTE(review): listing line 6900 is absent from this rendering; `Repack`,
// used below, is presumably declared there - confirm against the real source.
6901 for (unsigned I = 0, N = Unmerge->getNumOperands() - 1; I != N; ++I)
6902 Repack.push_back(B.buildTrunc(EltTy, Unmerge.getReg(I)).getReg(0));
6903 B.buildMergeLikeInstr(Dst, Repack);
6904 } else {
// No repacking needed: load straight into Dst.
6905 buildBufferLoad(Opc, Dst, RSrc, VIndex, VOffset, SOffset, ImmOffset, Format,
6906 AuxiliaryData, MMO, IsTyped, HasVIndex, B);
6907 }
6908
6909 MI.eraseFromParent();
6910 return true;
6911}
6912
/// Map a buffer-atomic intrinsic ID to the matching
/// AMDGPU::G_AMDGPU_BUFFER_ATOMIC_* opcode.
///
/// The raw, raw_ptr, struct and struct_ptr variants of one operation all
/// return the same opcode. Any \p IntrID not listed below reaches
/// llvm_unreachable.
6913 static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) {
6914 switch (IntrID) {
6915 case Intrinsic::amdgcn_raw_buffer_atomic_swap:
6916 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_swap:
6917 case Intrinsic::amdgcn_struct_buffer_atomic_swap:
6918 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_swap:
6919 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP;
6920 case Intrinsic::amdgcn_raw_buffer_atomic_add:
6921 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_add:
6922 case Intrinsic::amdgcn_struct_buffer_atomic_add:
6923 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_add:
6924 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD;
6925 case Intrinsic::amdgcn_raw_buffer_atomic_sub:
6926 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub:
6927 case Intrinsic::amdgcn_struct_buffer_atomic_sub:
6928 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub:
6929 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB;
6930 case Intrinsic::amdgcn_raw_buffer_atomic_smin:
6931 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smin:
6932 case Intrinsic::amdgcn_struct_buffer_atomic_smin:
6933 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smin:
6934 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN;
6935 case Intrinsic::amdgcn_raw_buffer_atomic_umin:
6936 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umin:
6937 case Intrinsic::amdgcn_struct_buffer_atomic_umin:
6938 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umin:
6939 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN;
6940 case Intrinsic::amdgcn_raw_buffer_atomic_smax:
6941 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smax:
6942 case Intrinsic::amdgcn_struct_buffer_atomic_smax:
6943 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smax:
6944 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX;
6945 case Intrinsic::amdgcn_raw_buffer_atomic_umax:
6946 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umax:
6947 case Intrinsic::amdgcn_struct_buffer_atomic_umax:
6948 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umax:
6949 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX;
6950 case Intrinsic::amdgcn_raw_buffer_atomic_and:
6951 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_and:
6952 case Intrinsic::amdgcn_struct_buffer_atomic_and:
6953 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_and:
6954 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND;
6955 case Intrinsic::amdgcn_raw_buffer_atomic_or:
6956 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_or:
6957 case Intrinsic::amdgcn_struct_buffer_atomic_or:
6958 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_or:
6959 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR;
6960 case Intrinsic::amdgcn_raw_buffer_atomic_xor:
6961 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_xor:
6962 case Intrinsic::amdgcn_struct_buffer_atomic_xor:
6963 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_xor:
6964 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR;
6965 case Intrinsic::amdgcn_raw_buffer_atomic_inc:
6966 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_inc:
6967 case Intrinsic::amdgcn_struct_buffer_atomic_inc:
6968 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_inc:
6969 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC;
6970 case Intrinsic::amdgcn_raw_buffer_atomic_dec:
6971 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_dec:
6972 case Intrinsic::amdgcn_struct_buffer_atomic_dec:
6973 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_dec:
6974 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC;
6975 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
6976 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap:
6977 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
6978 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cmpswap:
6979 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP;
6980 case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
6981 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fadd:
6982 case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
6983 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fadd:
6984 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD;
6985 case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
6986 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmin:
6987 case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
6988 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmin:
6989 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN;
6990 case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
6991 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmax:
6992 case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
6993 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmax:
6994 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX;
6995 case Intrinsic::amdgcn_raw_buffer_atomic_sub_clamp_u32:
6996 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub_clamp_u32:
6997 case Intrinsic::amdgcn_struct_buffer_atomic_sub_clamp_u32:
6998 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub_clamp_u32:
6999 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB_CLAMP_U32;
7000 case Intrinsic::amdgcn_raw_buffer_atomic_cond_sub_u32:
7001 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cond_sub_u32:
7002 case Intrinsic::amdgcn_struct_buffer_atomic_cond_sub_u32:
7003 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cond_sub_u32:
7004 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_COND_SUB_U32;
7005 default:
// Every supported intrinsic is listed above; anything else is a caller bug.
7006 llvm_unreachable("unhandled atomic opcode");
7007 }
7008}
7009
/// Rewrite a buffer-atomic intrinsic into the pseudo returned by
/// getBufferAtomicPseudo(\p IID), then erase the original instruction.
///
/// The cmpswap variants carry one extra compare-value operand, so every later
/// operand index is shifted by one for them. Always returns true.
///
/// NOTE(review): listing lines 7010-7011 (the head of the signature) are
/// absent from this rendering; `MI` and `B` used below are presumably
/// parameters declared there - confirm against the real source.
7012 Intrinsic::ID IID) const {
7013 const bool IsCmpSwap =
7014 IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap ||
7015 IID == Intrinsic::amdgcn_struct_buffer_atomic_cmpswap ||
7016 IID == Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap ||
7017 IID == Intrinsic::amdgcn_struct_ptr_buffer_atomic_cmpswap;
7018
7019 Register Dst = MI.getOperand(0).getReg();
7020 // Since we don't have 128-bit atomics, we don't need to handle the case of
7021 // p8 arguments to the atomic itself
7022 Register VData = MI.getOperand(2).getReg();
7023
7024 Register CmpVal;
7025 int OpOffset = 0;
7026
7027 if (IsCmpSwap) {
7028 CmpVal = MI.getOperand(3).getReg();
7029 ++OpOffset;
7030 }
7031
7032 castBufferRsrcArgToV4I32(MI, B, 3 + OpOffset);
7033 Register RSrc = MI.getOperand(3 + OpOffset).getReg();
// Operand count that indicates a vindex operand is present; cmpswap has one
// more operand than the other atomics.
7034 const unsigned NumVIndexOps = IsCmpSwap ? 9 : 8;
7035
7036 // The struct intrinsic variants add one additional operand over raw.
7037 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
7038 Register VIndex;
7039 if (HasVIndex) {
7040 VIndex = MI.getOperand(4 + OpOffset).getReg();
7041 ++OpOffset;
7042 } else {
// No vindex operand: use a constant-zero s32 register instead.
7043 VIndex = B.buildConstant(LLT::scalar(32), 0).getReg(0);
7044 }
7045
7046 Register VOffset = MI.getOperand(4 + OpOffset).getReg();
7047 Register SOffset = MI.getOperand(5 + OpOffset).getReg();
7048 unsigned AuxiliaryData = MI.getOperand(6 + OpOffset).getImm();
7049
7050 MachineMemOperand *MMO = *MI.memoperands_begin();
7051
// splitBufferOffsets is defined elsewhere; it yields the replacement VOffset
// register and the value emitted below as the offset(imm) operand.
7052 unsigned ImmOffset;
7053 std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset);
7054
7055 auto MIB = B.buildInstr(getBufferAtomicPseudo(IID))
7056 .addDef(Dst)
7057 .addUse(VData); // vdata
7058
// The compare value, when present, goes directly after vdata.
7059 if (IsCmpSwap)
7060 MIB.addReg(CmpVal);
7061
7062 MIB.addUse(RSrc) // rsrc
7063 .addUse(VIndex) // vindex
7064 .addUse(VOffset) // voffset
7065 .addUse(SOffset) // soffset
7066 .addImm(ImmOffset) // offset(imm)
7067 .addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm)
7068 .addImm(HasVIndex ? -1 : 0) // idxen(imm)
7069 .addMemOperand(MMO);
7070
7071 MI.eraseFromParent();
7072 return true;
7073}
7074
7075 /// Turn a set of s16 typed registers in \p AddrRegs into a dword sized
7076 /// vector with s16 typed elements.
///
/// Walks the operands of \p MI from Intr->VAddrStart up to Intr->VAddrEnd
/// (offset by \p ArgOffset) and appends one <2 x s16> register per produced
/// dword to \p PackedAddrs. Non-register operands are skipped.
///
/// NOTE(review): listing lines 7077 and 7080 (parts of the signature) are
/// absent from this rendering; `B`, `MI` and `Intr` used below are presumably
/// parameters declared there - confirm against the real source.
7078 SmallVectorImpl<Register> &PackedAddrs,
7079 unsigned ArgOffset,
7081 bool IsA16, bool IsG16) {
7082 const LLT S16 = LLT::scalar(16);
7083 const LLT V2S16 = LLT::fixed_vector(2, 16);
7084 auto EndIdx = Intr->VAddrEnd;
7085
7086 for (unsigned I = Intr->VAddrStart; I < EndIdx; I++) {
7087 MachineOperand &SrcOp = MI.getOperand(ArgOffset + I);
7088 if (!SrcOp.isReg())
7089 continue; // _L to _LZ may have eliminated this.
7090
7091 Register AddrReg = SrcOp.getReg();
7092
// Operands that are not paired with a neighbour: everything before the
// gradients, gradients when G16 is off, and coordinates when A16 is off.
7093 if ((I < Intr->GradientStart) ||
7094 (I >= Intr->GradientStart && I < Intr->CoordStart && !IsG16) ||
7095 (I >= Intr->CoordStart && !IsA16)) {
7096 if ((I < Intr->GradientStart) && IsA16 &&
7097 (B.getMRI()->getType(AddrReg) == S16)) {
7098 assert(I == Intr->BiasIndex && "Got unexpected 16-bit extra argument");
7099 // Special handling of bias when A16 is on. Bias is of type half but
7100 // occupies full 32-bit.
7101 PackedAddrs.push_back(
7102 B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)})
7103 .getReg(0));
7104 } else {
7105 assert((!IsA16 || Intr->NumBiasArgs == 0 || I != Intr->BiasIndex) &&
7106 "Bias needs to be converted to 16 bit in A16 mode");
7107 // Handle any gradient or coordinate operands that should not be packed
7108 AddrReg = B.buildBitcast(V2S16, AddrReg).getReg(0);
7109 PackedAddrs.push_back(AddrReg);
7110 }
7111 } else {
7112 // Dz/dh, dz/dv and the last odd coord are packed with undef. Also, in 1D,
7113 // derivatives dx/dh and dx/dv are packed with undef.
7114 if (((I + 1) >= EndIdx) ||
7115 ((Intr->NumGradients / 2) % 2 == 1 &&
7116 (I == static_cast<unsigned>(Intr->GradientStart +
7117 (Intr->NumGradients / 2) - 1) ||
7118 I == static_cast<unsigned>(Intr->GradientStart +
7119 Intr->NumGradients - 1))) ||
7120 // Check for _L to _LZ optimization
7121 !MI.getOperand(ArgOffset + I + 1).isReg()) {
7122 PackedAddrs.push_back(
7123 B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)})
7124 .getReg(0));
7125 } else {
// Pair this operand with the next one and skip over the partner.
7126 PackedAddrs.push_back(
7127 B.buildBuildVector(
7128 V2S16, {AddrReg, MI.getOperand(ArgOffset + I + 1).getReg()})
7129 .getReg(0));
7130 ++I;
7131 }
7132 }
7133 }
7134}
7135
7136 /// Convert from separate vaddr components to a single vector address register,
7137 /// and replace the remaining operands with $noreg.
///
/// Considers the \p NumVAddrs operands of \p MI starting at \p DimIdx. Only
/// register operands take part; each is asserted to be s32.
///
/// NOTE(review): listing line 7138 (the head of the signature) is absent
/// from this rendering; `B` and `MI` used below are presumably parameters
/// declared there - confirm against the real source.
7139 int DimIdx, int NumVAddrs) {
7140 const LLT S32 = LLT::scalar(32);
// S32 is only read inside the assert below; the cast keeps it referenced
// when asserts are compiled out.
7141 (void)S32;
7142 SmallVector<Register, 8> AddrRegs;
7143 for (int I = 0; I != NumVAddrs; ++I) {
7144 MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
7145 if (SrcOp.isReg()) {
7146 AddrRegs.push_back(SrcOp.getReg());
7147 assert(B.getMRI()->getType(SrcOp.getReg()) == S32);
7148 }
7149 }
7150
// With exactly one address register the first operand is left untouched;
// otherwise it is replaced by a build_vector of all collected registers.
7151 int NumAddrRegs = AddrRegs.size();
7152 if (NumAddrRegs != 1) {
7153 auto VAddr =
7154 B.buildBuildVector(LLT::fixed_vector(NumAddrRegs, 32), AddrRegs);
7155 MI.getOperand(DimIdx).setReg(VAddr.getReg(0));
7156 }
7157
// Clear every register operand after the first one.
7158 for (int I = 1; I != NumVAddrs; ++I) {
7159 MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
7160 if (SrcOp.isReg())
7161 MI.getOperand(DimIdx + I).setReg(AMDGPU::NoRegister);
7162 }
7163}
7164
7165 /// Rewrite image intrinsics to use register layouts expected by the subtarget.
7166 ///
7167 /// Depending on the subtarget, load/store with 16-bit element data need to be
7168 /// rewritten to use the low half of 32-bit registers, or directly use a packed
7169 /// layout. 16-bit addresses should also sometimes be packed into 32-bit
7170 /// registers.
7171 ///
7172 /// We don't want to directly select image instructions just yet, but also want
7173 /// to expose all register repacking to the legalizer/combiners. We also don't
7174 /// want a selected instruction entering RegBankSelect. In order to avoid
7175 /// defining a multitude of intermediate image instructions, directly hack on
7176 /// the intrinsic's arguments. In cases like a16 addresses, this requires
7177 /// padding now unnecessary arguments with $noreg.
///
/// Returns false for combinations the code below rejects (for example vector
/// atomics that are not the packed 16-bit kind, A16 without subtarget
/// support, a result type too small for the dmask, or a TFE status register
/// that is not s32); returns true otherwise.
///
/// NOTE(review): listing lines 7178-7179 (the head of the signature) are
/// absent from this rendering; `MI`, `B` and `Observer` used below are
/// presumably parameters declared there - confirm against the real source.
7180 const AMDGPU::ImageDimIntrinsicInfo *Intr) const {
7181
7182 const MachineFunction &MF = *MI.getMF();
7183 const unsigned NumDefs = MI.getNumExplicitDefs();
7184 const unsigned ArgOffset = NumDefs + 1;
7185 bool IsTFE = NumDefs == 2;
7186 // We are only processing the operands of d16 image operations on subtargets
7187 // that use the unpacked register layout, or need to repack the TFE result.
7188
7189 // TODO: Do we need to guard against already legalized intrinsics?
7190 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
// NOTE(review): listing line 7191 (the initializer of BaseOpcode) is absent
// from this rendering - confirm against the real source.
7192
7193 MachineRegisterInfo *MRI = B.getMRI();
7194 const LLT S32 = LLT::scalar(32);
7195 const LLT S16 = LLT::scalar(16);
7196 const LLT V2S16 = LLT::fixed_vector(2, 16);
7197
7198 unsigned DMask = 0;
7199 Register VData;
7200 LLT Ty;
7201
// The data register is operand 0 when there is a def, operand 1 otherwise.
7202 if (!BaseOpcode->NoReturn || BaseOpcode->Store) {
7203 VData = MI.getOperand(NumDefs == 0 ? 1 : 0).getReg();
7204 Ty = MRI->getType(VData);
7205 }
7206
7207 const bool IsAtomicPacked16Bit =
7208 (BaseOpcode->BaseOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_F16 ||
7209 BaseOpcode->BaseOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_BF16);
7210
7211 // Check for 16 bit addresses and pack if true.
7212 LLT GradTy =
7213 MRI->getType(MI.getOperand(ArgOffset + Intr->GradientStart).getReg());
7214 LLT AddrTy =
7215 MRI->getType(MI.getOperand(ArgOffset + Intr->CoordStart).getReg());
7216 const bool IsG16 =
7217 ST.hasG16() ? (BaseOpcode->Gradients && GradTy == S16) : GradTy == S16;
7218 const bool IsA16 = AddrTy == S16;
7219 const bool IsD16 = !IsAtomicPacked16Bit && Ty.getScalarType() == S16;
7220
7221 int DMaskLanes = 0;
7222 if (!BaseOpcode->Atomic) {
7223 DMask = MI.getOperand(ArgOffset + Intr->DMaskIndex).getImm();
7224 if (BaseOpcode->Gather4) {
7225 DMaskLanes = 4;
7226 } else if (DMask != 0) {
7227 DMaskLanes = llvm::popcount(DMask);
7228 } else if (!IsTFE && !BaseOpcode->Store) {
7229 // If dmask is 0, this is a no-op load. This can be eliminated.
7230 B.buildUndef(MI.getOperand(0));
7231 MI.eraseFromParent();
7232 return true;
7233 }
7234 }
7235
// From here on MI is mutated in place; the scope_exit object reports the
// change to the observer when it goes out of scope.
7236 Observer.changingInstr(MI);
7237 scope_exit ChangedInstr([&] { Observer.changedInstr(MI); });
7238
7239 const unsigned StoreOpcode = IsD16 ? AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE_D16
7240 : AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE;
7241 const unsigned LoadOpcode = IsD16 ? AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_D16
7242 : AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD;
7243 unsigned NewOpcode = LoadOpcode;
7244 if (BaseOpcode->Store)
7245 NewOpcode = StoreOpcode;
7246 else if (BaseOpcode->NoReturn)
7247 NewOpcode = AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_NORET;
7248
7249 // Track that we legalized this
7250 MI.setDesc(B.getTII().get(NewOpcode));
7251
7252 // Expecting to get an error flag since TFC is on - and dmask is 0. Force
7253 // dmask to be at least 1 otherwise the instruction will fail
7254 if (IsTFE && DMask == 0) {
7255 DMask = 0x1;
7256 DMaskLanes = 1;
7257 MI.getOperand(ArgOffset + Intr->DMaskIndex).setImm(DMask);
7258 }
7259
7260 if (BaseOpcode->Atomic) {
7261 Register VData0 = MI.getOperand(2).getReg();
7262 LLT Ty = MRI->getType(VData0);
7263
7264 // TODO: Allow atomic swap and bit ops for v2s16/v4s16
7265 if (Ty.isVector() && !IsAtomicPacked16Bit)
7266 return false;
7267
7268 if (BaseOpcode->AtomicX2) {
7269 Register VData1 = MI.getOperand(3).getReg();
7270 // The two values are packed in one register.
7271 LLT PackedTy = LLT::fixed_vector(2, Ty);
7272 auto Concat = B.buildBuildVector(PackedTy, {VData0, VData1});
7273 MI.getOperand(2).setReg(Concat.getReg(0));
7274 MI.getOperand(3).setReg(AMDGPU::NoRegister);
7275 }
7276 }
7277
7278 unsigned CorrectedNumVAddrs = Intr->NumVAddrs;
7279
7280 // Rewrite the addressing register layout before doing anything else.
7281 if (BaseOpcode->Gradients && !ST.hasG16() && (IsA16 != IsG16)) {
7282 // 16 bit gradients are supported, but are tied to the A16 control
7283 // so both gradients and addresses must be 16 bit
7284 return false;
7285 }
7286
7287 if (IsA16 && !ST.hasA16()) {
7288 // A16 not supported
7289 return false;
7290 }
7291
7292 const unsigned NSAMaxSize = ST.getNSAMaxSize(BaseOpcode->Sampler);
7293 const unsigned HasPartialNSA = ST.hasPartialNSAEncoding();
7294
7295 if (IsA16 || IsG16) {
7296 // Even if NumVAddrs == 1 we should pack it into a 32-bit value, because the
7297 // instructions expect VGPR_32
7298 SmallVector<Register, 4> PackedRegs;
7299
7300 packImage16bitOpsToDwords(B, MI, PackedRegs, ArgOffset, Intr, IsA16, IsG16);
7301
7302 // See also below in the non-a16 branch
7303 const bool UseNSA = ST.hasNSAEncoding() &&
7304 PackedRegs.size() >= ST.getNSAThreshold(MF) &&
7305 (PackedRegs.size() <= NSAMaxSize || HasPartialNSA);
7306 const bool UsePartialNSA =
7307 UseNSA && HasPartialNSA && PackedRegs.size() > NSAMaxSize;
7308
7309 if (UsePartialNSA) {
7310 // Pack registers that would go over NSAMaxSize into last VAddr register
7311 LLT PackedAddrTy =
7312 LLT::fixed_vector(2 * (PackedRegs.size() - NSAMaxSize + 1), 16);
7313 auto Concat = B.buildConcatVectors(
7314 PackedAddrTy, ArrayRef(PackedRegs).slice(NSAMaxSize - 1));
7315 PackedRegs[NSAMaxSize - 1] = Concat.getReg(0);
7316 PackedRegs.resize(NSAMaxSize);
7317 } else if (!UseNSA && PackedRegs.size() > 1) {
// Non-NSA: concatenate all packed dwords into a single address register.
7318 LLT PackedAddrTy = LLT::fixed_vector(2 * PackedRegs.size(), 16);
7319 auto Concat = B.buildConcatVectors(PackedAddrTy, PackedRegs);
7320 PackedRegs[0] = Concat.getReg(0);
7321 PackedRegs.resize(1);
7322 }
7323
// Write the packed registers back over the leading vaddr operands and set
// the remaining register operands to $noreg.
7324 const unsigned NumPacked = PackedRegs.size();
7325 for (unsigned I = Intr->VAddrStart; I < Intr->VAddrEnd; I++) {
7326 MachineOperand &SrcOp = MI.getOperand(ArgOffset + I);
7327 if (!SrcOp.isReg()) {
7328 assert(SrcOp.isImm() && SrcOp.getImm() == 0);
7329 continue;
7330 }
7331
7332 assert(SrcOp.getReg() != AMDGPU::NoRegister);
7333
7334 if (I - Intr->VAddrStart < NumPacked)
7335 SrcOp.setReg(PackedRegs[I - Intr->VAddrStart]);
7336 else
7337 SrcOp.setReg(AMDGPU::NoRegister);
7338 }
7339 } else {
7340 // If the register allocator cannot place the address registers contiguously
7341 // without introducing moves, then using the non-sequential address encoding
7342 // is always preferable, since it saves VALU instructions and is usually a
7343 // wash in terms of code size or even better.
7344 //
7345 // However, we currently have no way of hinting to the register allocator
7346 // that MIMG addresses should be placed contiguously when it is possible to
7347 // do so, so force non-NSA for the common 2-address case as a heuristic.
7348 //
7349 // SIShrinkInstructions will convert NSA encodings to non-NSA after register
7350 // allocation when possible.
7351 //
7352 // Partial NSA is allowed on GFX11+ where the final register is a contiguous
7353 // set of the remaining addresses.
7354 const bool UseNSA = ST.hasNSAEncoding() &&
7355 CorrectedNumVAddrs >= ST.getNSAThreshold(MF) &&
7356 (CorrectedNumVAddrs <= NSAMaxSize || HasPartialNSA);
7357 const bool UsePartialNSA =
7358 UseNSA && HasPartialNSA && CorrectedNumVAddrs > NSAMaxSize;
7359
7360 if (UsePartialNSA) {
// NOTE(review): listing line 7361 (the start of the call whose trailing
// arguments follow) is absent from this rendering - confirm against the
// real source.
7362 ArgOffset + Intr->VAddrStart + NSAMaxSize - 1,
7363 Intr->NumVAddrs - NSAMaxSize + 1);
7364 } else if (!UseNSA && Intr->NumVAddrs > 1) {
7365 convertImageAddrToPacked(B, MI, ArgOffset + Intr->VAddrStart,
7366 Intr->NumVAddrs);
7367 }
7368 }
7369
// Append a trailing immediate that records the modes: bit 0 = A16, bit 1 = G16.
7370 int Flags = 0;
7371 if (IsA16)
7372 Flags |= 1;
7373 if (IsG16)
7374 Flags |= 2;
7375 MI.addOperand(MachineOperand::CreateImm(Flags));
7376
7377 if (BaseOpcode->NoReturn) { // No TFE for stores?
7378 // TODO: Handle dmask trim
7379 if (!Ty.isVector() || !IsD16)
7380 return true;
7381
7382 Register RepackedReg = handleD16VData(B, *MRI, VData, true);
7383 if (RepackedReg != VData) {
7384 MI.getOperand(1).setReg(RepackedReg);
7385 }
7386
7387 return true;
7388 }
7389
7390 Register DstReg = MI.getOperand(0).getReg();
7391 const LLT EltTy = Ty.getScalarType();
7392 const int NumElts = Ty.isVector() ? Ty.getNumElements() : 1;
7393
7394 // Confirm that the return type is large enough for the dmask specified
7395 if (NumElts < DMaskLanes)
7396 return false;
7397
7398 if (NumElts > 4 || DMaskLanes > 4)
7399 return false;
7400
7401 // Image atomic instructions are using DMask to specify how many bits
7402 // input/output data will have. 32-bits (s32, v2s16) or 64-bits (s64, v4s16).
7403 // DMaskLanes for image atomic has default value '0'.
7404 // We must be sure that atomic variants (especially packed) will not be
7405 // truncated from v2s16 or v4s16 to s16 type.
7406 //
7407 // ChangeElementCount will be needed for image load where Ty is always scalar.
7408 const unsigned AdjustedNumElts = DMaskLanes == 0 ? 1 : DMaskLanes;
7409 const LLT AdjustedTy =
7410 DMaskLanes == 0
7411 ? Ty
7412 : Ty.changeElementCount(ElementCount::getFixed(AdjustedNumElts));
7413
7414 // The raw dword aligned data component of the load. The only legal cases
7415 // where this matters should be when using the packed D16 format, for
7416 // s16 -> <2 x s16>, and <3 x s16> -> <4 x s16>,
7417 LLT RoundedTy;
7418
7419 // S32 vector to cover all data, plus TFE result element.
7420 LLT TFETy;
7421
7422 // Register type to use for each loaded component. Will be S32 or V2S16.
7423 LLT RegTy;
7424
7425 if (IsD16 && ST.hasUnpackedD16VMem()) {
7426 RoundedTy =
7427 LLT::scalarOrVector(ElementCount::getFixed(AdjustedNumElts), 32);
7428 TFETy = LLT::fixed_vector(AdjustedNumElts + 1, 32);
7429 RegTy = S32;
7430 } else {
7431 unsigned EltSize = EltTy.getSizeInBits();
7432 unsigned RoundedElts = (AdjustedTy.getSizeInBits() + 31) / 32;
7433 unsigned RoundedSize = 32 * RoundedElts;
7434 RoundedTy = LLT::scalarOrVector(
7435 ElementCount::getFixed(RoundedSize / EltSize), EltSize);
7436 TFETy = LLT::fixed_vector(RoundedSize / 32 + 1, S32);
7437 RegTy = !IsTFE && EltSize == 16 ? V2S16 : S32;
7438 }
7439
7440 // The return type does not need adjustment.
7441 // TODO: Should we change s16 case to s32 or <2 x s16>?
7442 if (!IsTFE && (RoundedTy == Ty || !Ty.isVector()))
7443 return true;
7444
7445 Register Dst1Reg;
7446
7447 // Insert after the instruction.
7448 B.setInsertPt(*MI.getParent(), ++MI.getIterator());
7449
7450 // TODO: For TFE with d16, if we used a TFE type that was a multiple of <2 x
7451 // s16> instead of s32, we would only need 1 bitcast instead of multiple.
7452 const LLT LoadResultTy = IsTFE ? TFETy : RoundedTy;
7453 const int ResultNumRegs = LoadResultTy.getSizeInBits() / 32;
7454
7455 Register NewResultReg = MRI->createGenericVirtualRegister(LoadResultTy);
7456
7457 MI.getOperand(0).setReg(NewResultReg);
7458
7459 // In the IR, TFE is supposed to be used with a 2 element struct return
7460 // type. The instruction really returns these two values in one contiguous
7461 // register, with one additional dword beyond the loaded data. Rewrite the
7462 // return type to use a single register result.
7463
7464 if (IsTFE) {
7465 Dst1Reg = MI.getOperand(1).getReg();
7466 if (MRI->getType(Dst1Reg) != S32)
7467 return false;
7468
7469 // TODO: Make sure the TFE operand bit is set.
7470 MI.removeOperand(1);
7471
7472 // Handle the easy case that requires no repack instructions.
7473 if (Ty == S32) {
7474 B.buildUnmerge({DstReg, Dst1Reg}, NewResultReg);
7475 return true;
7476 }
7477 }
7478
7479 // Now figure out how to copy the new result register back into the old
7480 // result.
// Every slot starts as Dst1Reg; the data slots are overwritten below, so
// only the last (TFE) slot keeps it.
7481 SmallVector<Register, 5> ResultRegs(ResultNumRegs, Dst1Reg);
7482
7483 const int NumDataRegs = IsTFE ? ResultNumRegs - 1 : ResultNumRegs;
7484
7485 if (ResultNumRegs == 1) {
7486 assert(!IsTFE);
7487 ResultRegs[0] = NewResultReg;
7488 } else {
7489 // We have to repack into a new vector of some kind.
7490 for (int I = 0; I != NumDataRegs; ++I)
7491 ResultRegs[I] = MRI->createGenericVirtualRegister(RegTy);
7492 B.buildUnmerge(ResultRegs, NewResultReg);
7493
7494 // Drop the final TFE element to get the data part. The TFE result is
7495 // directly written to the right place already.
7496 if (IsTFE)
7497 ResultRegs.resize(NumDataRegs);
7498 }
7499
7500 // For an s16 scalar result, we form an s32 result with a truncate regardless
7501 // of packed vs. unpacked.
7502 if (IsD16 && !Ty.isVector()) {
7503 B.buildTrunc(DstReg, ResultRegs[0]);
7504 return true;
7505 }
7506
7507 // Avoid a build/concat_vector of 1 entry.
7508 if (Ty == V2S16 && NumDataRegs == 1 && !ST.hasUnpackedD16VMem()) {
7509 B.buildBitcast(DstReg, ResultRegs[0]);
7510 return true;
7511 }
7512
7513 assert(Ty.isVector());
7514
7515 if (IsD16) {
7516 // For packed D16 results with TFE enabled, all the data components are
7517 // S32. Cast back to the expected type.
7518 //
7519 // TODO: We don't really need to use load s32 elements. We would only need one
7520 // cast for the TFE result if a multiple of v2s16 was used.
7521 if (RegTy != V2S16 && !ST.hasUnpackedD16VMem()) {
7522 for (Register &Reg : ResultRegs)
7523 Reg = B.buildBitcast(V2S16, Reg).getReg(0);
7524 } else if (ST.hasUnpackedD16VMem()) {
7525 for (Register &Reg : ResultRegs)
7526 Reg = B.buildTrunc(S16, Reg).getReg(0);
7527 }
7528 }
7529
// Appends NumElts copies of one undef register of type Ty to ResultRegs.
7530 auto padWithUndef = [&](LLT Ty, int NumElts) {
7531 if (NumElts == 0)
7532 return;
7533 Register Undef = B.buildUndef(Ty).getReg(0);
7534 for (int I = 0; I != NumElts; ++I)
7535 ResultRegs.push_back(Undef);
7536 };
7537
7538 // Pad out any elements eliminated due to the dmask.
7539 LLT ResTy = MRI->getType(ResultRegs[0]);
7540 if (!ResTy.isVector()) {
7541 padWithUndef(ResTy, NumElts - ResultRegs.size());
7542 B.buildBuildVector(DstReg, ResultRegs);
7543 return true;
7544 }
7545
7546 assert(!ST.hasUnpackedD16VMem() && ResTy == V2S16);
7547 const int RegsToCover = (Ty.getSizeInBits() + 31) / 32;
7548
7549 // Deal with the one annoying legal case.
7550 const LLT V3S16 = LLT::fixed_vector(3, 16);
7551 if (Ty == V3S16) {
7552 if (IsTFE) {
7553 if (ResultRegs.size() == 1) {
7554 NewResultReg = ResultRegs[0];
7555 } else if (ResultRegs.size() == 2) {
7556 LLT V4S16 = LLT::fixed_vector(4, 16);
7557 NewResultReg = B.buildConcatVectors(V4S16, ResultRegs).getReg(0);
7558 } else {
7559 return false;
7560 }
7561 }
7562
// Shrink or pad NewResultReg to the element count of DstReg.
7563 if (MRI->getType(DstReg).getNumElements() <
7564 MRI->getType(NewResultReg).getNumElements()) {
7565 B.buildDeleteTrailingVectorElements(DstReg, NewResultReg);
7566 } else {
7567 B.buildPadVectorWithUndefElements(DstReg, NewResultReg);
7568 }
7569 return true;
7570 }
7571
7572 padWithUndef(ResTy, RegsToCover - ResultRegs.size());
7573 B.buildConcatVectors(DstReg, ResultRegs);
7574 return true;
7575}
7576
/// Rewrite a scalar buffer-load intrinsic \p MI in place into one of the
/// G_AMDGPU_S_BUFFER_LOAD* pseudos: the intrinsic-ID operand is removed, a
/// memory operand is attached, and the result is truncated or widened where
/// the code below requires it. Always returns true.
///
/// NOTE(review): several listing lines are absent from this rendering: 7577
/// (the head of the signature; `Helper` is presumably declared there), 7607,
/// 7622-7626 and 7640. Notes below mark each gap - confirm against the real
/// source.
7578 MachineInstr &MI) const {
7579 MachineIRBuilder &B = Helper.MIRBuilder;
7580 GISelChangeObserver &Observer = Helper.Observer;
7581
7582 Register OrigDst = MI.getOperand(0).getReg();
7583 Register Dst;
7584 LLT Ty = B.getMRI()->getType(OrigDst);
7585 unsigned Size = Ty.getSizeInBits();
7586 MachineFunction &MF = B.getMF();
7587 unsigned Opc = 0;
7588 if (Size < 32 && ST.hasScalarSubwordLoads()) {
7589 assert(Size == 8 || Size == 16);
7590 Opc = Size == 8 ? AMDGPU::G_AMDGPU_S_BUFFER_LOAD_UBYTE
7591 : AMDGPU::G_AMDGPU_S_BUFFER_LOAD_USHORT;
7592 // The 8-bit and 16-bit scalar buffer load instructions have 32-bit
7593 // destination register.
7594 Dst = B.getMRI()->createGenericVirtualRegister(LLT::scalar(32));
7595 } else {
7596 Opc = AMDGPU::G_AMDGPU_S_BUFFER_LOAD;
7597 Dst = OrigDst;
7598 }
7599
7600 Observer.changingInstr(MI);
7601
7602 // Handle needing to s.buffer.load() a p8 value.
7603 if (hasBufferRsrcWorkaround(Ty)) {
7604 Ty = castBufferRsrcFromV4I32(MI, B, *B.getMRI(), 0);
7605 B.setInsertPt(B.getMBB(), MI);
7606 }
// NOTE(review): listing line 7607 is absent; it presumably opens the
// conditional that the closing brace on line 7611 ends.
7608 Ty = getBitcastRegisterType(Ty);
7609 Helper.bitcastDst(MI, Ty, 0);
7610 B.setInsertPt(B.getMBB(), MI);
7611 }
7612
7613 // FIXME: We don't really need this intermediate instruction. The intrinsic
7614 // should be fixed to have a memory operand. Since it's readnone, we're not
7615 // allowed to add one.
7616 MI.setDesc(B.getTII().get(Opc));
7617 MI.removeOperand(1); // Remove intrinsic ID
7618
7619 // FIXME: When intrinsic definition is fixed, this should have an MMO already.
// Memory size in bytes, rounded up from the result size in bits.
7620 const unsigned MemSize = (Size + 7) / 8;
7621 const Align MemAlign = B.getDataLayout().getABITypeAlign(
// NOTE(review): listing lines 7622-7626 are absent; they presumably finish
// the getABITypeAlign call and begin the creation of `MMO`, whose trailing
// arguments follow.
7627 MemSize, MemAlign);
7628 MI.addMemOperand(MF, MMO);
// Sub-dword case: the instruction now defines the s32 Dst, and a truncate
// placed after it produces the original destination.
7629 if (Dst != OrigDst) {
7630 MI.getOperand(0).setReg(Dst);
7631 B.setInsertPt(B.getMBB(), ++B.getInsertPt());
7632 B.buildTrunc(OrigDst, Dst);
7633 }
7634
7635 // If we don't have 96-bit result scalar loads, widening to 128-bit should
7636 // always be legal. We may need to restore this to a 96-bit result if it turns
7637 // out this needs to be converted to a vector load during RegBankSelect.
7638 if (!isPowerOf2_32(Size) && (Size != 96 || !ST.hasScalarDwordx3Loads())) {
7639 if (Ty.isVector())
// NOTE(review): listing line 7640 (the statement for the vector case) is
// absent from this rendering.
7641 else
7642 Helper.widenScalarDst(MI, getPow2ScalarType(Ty), 0);
7643 }
7644
7645 Observer.changedInstr(MI);
7646 return true;
7647}
7648
7650 MachineInstr &MI) const {
// Converts MI in place to G_AMDGPU_S_BUFFER_PREFETCH by swapping its
// descriptor and dropping operand 0 (the intrinsic ID), bracketed by observer
// change notifications. Always returns true.
// NOTE(review): one original statement between removeOperand() and
// changedInstr() is absent from this extract.
7651 MachineIRBuilder &B = Helper.MIRBuilder;
7652 GISelChangeObserver &Observer = Helper.Observer;
7653 Observer.changingInstr(MI);
7654 MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_S_BUFFER_PREFETCH));
7655 MI.removeOperand(0); // Remove intrinsic ID
7657 Observer.changedInstr(MI);
7658 return true;
7659}
7660
7661// TODO: Move to selection
// Chooses how a trap is lowered. Without a trap handler, or with a trap
// handler ABI other than AMDHSA, the trap is lowered via legalizeTrapEndpgm().
// Otherwise the choice depends on ST.supportsGetDoorbellID().
// NOTE(review): the start of the signature and both arms of the final
// conditional expression are absent from this extract.
7664 MachineIRBuilder &B) const {
7665 if (!ST.hasTrapHandler() ||
7666 ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA)
7667 return legalizeTrapEndpgm(MI, MRI, B);
7668
7669 return ST.supportsGetDoorbellID() ?
7671}
7672
// Lowers a trap to S_ENDPGM 0 and erases MI; always returns true.
//  * If MI is the last instruction of a block with no successors, S_ENDPGM is
//    appended to that block directly.
//  * Otherwise the block is split at MI, S_ENDPGM is placed in a separate
//    block (TrapBB) appended to the function, and an S_CBRANCH_EXECNZ to
//    TrapBB is inserted before MI.
// NOTE(review): the signature and the statement that creates TrapBB are
// absent from this extract.
7675 const DebugLoc &DL = MI.getDebugLoc();
7676 MachineBasicBlock &BB = B.getMBB();
7677 MachineFunction *MF = BB.getParent();
7678
7679 if (BB.succ_empty() && std::next(MI.getIterator()) == BB.end()) {
7680 BuildMI(BB, BB.end(), DL, B.getTII().get(AMDGPU::S_ENDPGM))
7681 .addImm(0);
7682 MI.eraseFromParent();
7683 return true;
7684 }
7685
7686 // We need a block split to make the real endpgm a terminator. We also don't
7687 // want to break phis in successor blocks, so we can't just delete to the
7688 // end of the block.
7689 BB.splitAt(MI, false /*UpdateLiveIns*/);
7691 MF->push_back(TrapBB);
7692 BuildMI(*TrapBB, TrapBB->end(), DL, B.getTII().get(AMDGPU::S_ENDPGM))
7693 .addImm(0);
7694 BuildMI(BB, &MI, DL, B.getTII().get(AMDGPU::S_CBRANCH_EXECNZ))
7695 .addMBB(TrapBB);
7696
// Record the new CFG edge before removing the original trap instruction.
7697 BB.addSuccessor(TrapBB);
7698 MI.eraseFromParent();
7699 return true;
7700}
7701
// Lowers a trap by copying a 64-bit value into SGPR0_SGPR1 and emitting
// S_TRAP with the LLVMAMDHSATrap ID and SGPR0_SGPR1 as an implicit use, then
// erasing MI. Two paths are visible:
//  * the value is loaded from the kernarg segment at an implicit-parameter
//    offset (per the original comment, the code object version 5 case);
//  * otherwise the value comes from the register LiveIn.
// Returns false when the required input value cannot be obtained.
// NOTE(review): many original lines are absent from this extract, including
// the signature, the `if` that opens the first path (its closing brace is
// still present), and the definitions of Param, Offset, PtrInfo, MMO,
// LoadAddr and LiveIn.
7704 MachineFunction &MF = B.getMF();
7705 const LLT S64 = LLT::scalar(64);
7706
7707 Register SGPR01(AMDGPU::SGPR0_SGPR1);
7708 // For code object version 5, queue_ptr is passed through implicit kernarg.
7714 ST.getTargetLowering()->getImplicitParameterOffset(B.getMF(), Param);
7715
7716 Register KernargPtrReg = MRI.createGenericVirtualRegister(
7718
7719 if (!loadInputValue(KernargPtrReg, B,
7721 return false;
7722
7723 // TODO: can we be smarter about machine pointer info?
7726 PtrInfo.getWithOffset(Offset),
7730
7731 // Pointer address
7734 B.buildObjectPtrOffset(LoadAddr, KernargPtrReg,
7735 B.buildConstant(LLT::scalar(64), Offset).getReg(0));
7736 // Load address
7737 Register Temp = B.buildLoad(S64, LoadAddr, *MMO).getReg(0);
7738 B.buildCopy(SGPR01, Temp);
7739 B.buildInstr(AMDGPU::S_TRAP)
7740 .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap))
7741 .addReg(SGPR01, RegState::Implicit);
7742 MI.eraseFromParent();
7743 return true;
7744 }
7745
7746 // Pass queue pointer to trap handler as input, and insert trap instruction
7747 // Reference: https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi
7748 Register LiveIn =
7751 return false;
7752
7753 B.buildCopy(SGPR01, LiveIn);
7754 B.buildInstr(AMDGPU::S_TRAP)
7755 .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap))
7756 .addReg(SGPR01, RegState::Implicit);
7757
7758 MI.eraseFromParent();
7759 return true;
7760}
7761
7764 MachineIRBuilder &B) const {
// Lowers a trap to S_TRAP with the LLVMAMDHSATrap ID and no register inputs,
// or to a simulated trap sequence on subtargets with the
// PrivEnabledTrap2Nop bug. MI is erased in both cases; always returns true.
// NOTE(review): the start of the signature is absent from this extract.
7765 // We need to simulate the 's_trap 2' instruction on targets that run in
7766 // PRIV=1 (where it is treated as a nop).
7767 if (ST.hasPrivEnabledTrap2NopBug()) {
7768 ST.getInstrInfo()->insertSimulatedTrap(MRI, B.getMBB(), MI,
7769 MI.getDebugLoc());
7770 MI.eraseFromParent();
7771 return true;
7772 }
7773
7774 B.buildInstr(AMDGPU::S_TRAP)
7775 .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap));
7776 MI.eraseFromParent();
7777 return true;
7778}
7779
7782 MachineIRBuilder &B) const {
// Lowers a debug trap. With an AMDHSA trap handler, S_TRAP with the
// LLVMAMDHSADebugTrap ID is emitted; otherwise only a warning diagnostic is
// reported and no instruction is emitted. MI is erased in both cases and the
// function always returns true.
// NOTE(review): the start of the signature and the first line of the
// diagnostic call are absent from this extract.
7783 // Is non-HSA path or trap-handler disabled? Then, report a warning
7784 // accordingly
7785 if (!ST.hasTrapHandler() ||
7786 ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA) {
7787 Function &Fn = B.getMF().getFunction();
7789 Fn, "debugtrap handler not supported", MI.getDebugLoc(), DS_Warning));
7790 } else {
7791 // Insert debug-trap instruction
7792 B.buildInstr(AMDGPU::S_TRAP)
7793 .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSADebugTrap));
7794 }
7795
7796 MI.eraseFromParent();
7797 return true;
7798}
7799
7801 MachineInstr &MI, MachineIRBuilder &B) const {
// Lowers a BVH intersect-ray intrinsic to G_AMDGPU_BVH_INTERSECT_RAY.
// The new instruction carries: the destination, the selected MIMG opcode as
// an immediate, the address operands prepared below, the texture descriptor,
// an A16 flag immediate, and MI's memory references. Returns false (after
// reporting a diagnostic) when the subtarget lacks the GFX10 A encoding;
// otherwise erases MI and returns true.
// NOTE(review): the start of the signature, the first line of the diagnostic
// call, and the declarations of Ops and R1..R3 are absent from this extract.
7802 MachineRegisterInfo &MRI = *B.getMRI();
7803 const LLT S16 = LLT::scalar(16);
7804 const LLT S32 = LLT::scalar(32);
7805 const LLT V2S16 = LLT::fixed_vector(2, 16);
7806 const LLT V3S32 = LLT::fixed_vector(3, 32);
7807
// Operand 0 is the result; operands 2..7 are the ray inputs and descriptor.
7808 Register DstReg = MI.getOperand(0).getReg();
7809 Register NodePtr = MI.getOperand(2).getReg();
7810 Register RayExtent = MI.getOperand(3).getReg();
7811 Register RayOrigin = MI.getOperand(4).getReg();
7812 Register RayDir = MI.getOperand(5).getReg();
7813 Register RayInvDir = MI.getOperand(6).getReg();
7814 Register TDescr = MI.getOperand(7).getReg();
7815
7816 if (!ST.hasGFX10_AEncoding()) {
7817 Function &Fn = B.getMF().getFunction();
7819 Fn, "intrinsic not supported on subtarget", MI.getDebugLoc()));
7820 return false;
7821 }
7822
7823 const bool IsGFX11 = AMDGPU::isGFX11(ST);
7824 const bool IsGFX11Plus = AMDGPU::isGFX11Plus(ST);
7825 const bool IsGFX12Plus = AMDGPU::isGFX12Plus(ST);
// A16 means the ray direction has 16-bit elements; Is64 means a 64-bit node
// pointer. Both feed the address dword count and the opcode table index.
7826 const bool IsA16 = MRI.getType(RayDir).getElementType().getSizeInBits() == 16;
7827 const bool Is64 = MRI.getType(NodePtr).getSizeInBits() == 64;
7828 const unsigned NumVDataDwords = 4;
7829 const unsigned NumVAddrDwords = IsA16 ? (Is64 ? 9 : 8) : (Is64 ? 12 : 11);
7830 const unsigned NumVAddrs = IsGFX11Plus ? (IsA16 ? 4 : 5) : NumVAddrDwords;
// NSA is always used on GFX12+; otherwise only when the subtarget supports it
// and the address count fits within its NSA limit.
7831 const bool UseNSA =
7832 IsGFX12Plus || (ST.hasNSAEncoding() && NumVAddrs <= ST.getNSAMaxSize());
7833
// Indexed as BaseOpcodes[Is64][IsA16].
7834 const unsigned BaseOpcodes[2][2] = {
7835 {AMDGPU::IMAGE_BVH_INTERSECT_RAY, AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16},
7836 {AMDGPU::IMAGE_BVH64_INTERSECT_RAY,
7837 AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16}};
7838 int Opcode;
7839 if (UseNSA) {
7840 Opcode = AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16],
7841 IsGFX12Plus ? AMDGPU::MIMGEncGfx12
7842 : IsGFX11 ? AMDGPU::MIMGEncGfx11NSA
7843 : AMDGPU::MIMGEncGfx10NSA,
7844 NumVDataDwords, NumVAddrDwords);
7845 } else {
7846 assert(!IsGFX12Plus);
7847 Opcode = AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16],
7848 IsGFX11 ? AMDGPU::MIMGEncGfx11Default
7849 : AMDGPU::MIMGEncGfx10Default,
7850 NumVDataDwords, NumVAddrDwords);
7851 }
7852 assert(Opcode != -1);
7853
// GFX11+ NSA form: operands are kept grouped (node pointer, extent, and
// 3 x s32 vectors) rather than flattened into individual dwords.
7855 if (UseNSA && IsGFX11Plus) {
7856 auto packLanes = [&Ops, &S32, &V3S32, &B](Register Src) {
7857 auto Unmerge = B.buildUnmerge({S32, S32, S32}, Src);
7858 auto Merged = B.buildMergeLikeInstr(
7859 V3S32, {Unmerge.getReg(0), Unmerge.getReg(1), Unmerge.getReg(2)});
7860 Ops.push_back(Merged.getReg(0));
7861 };
7862
7863 Ops.push_back(NodePtr);
7864 Ops.push_back(RayExtent);
7865 packLanes(RayOrigin);
7866
7867 if (IsA16) {
// Each of the three dwords pairs component i of RayInvDir with component i
// of RayDir.
7868 auto UnmergeRayDir = B.buildUnmerge({S16, S16, S16}, RayDir);
7869 auto UnmergeRayInvDir = B.buildUnmerge({S16, S16, S16}, RayInvDir);
7870 auto MergedDir = B.buildMergeLikeInstr(
7871 V3S32,
7872 {B.buildBitcast(
7873 S32, B.buildMergeLikeInstr(V2S16, {UnmergeRayInvDir.getReg(0),
7874 UnmergeRayDir.getReg(0)}))
7875 .getReg(0),
7876 B.buildBitcast(
7877 S32, B.buildMergeLikeInstr(V2S16, {UnmergeRayInvDir.getReg(1),
7878 UnmergeRayDir.getReg(1)}))
7879 .getReg(0),
7880 B.buildBitcast(
7881 S32, B.buildMergeLikeInstr(V2S16, {UnmergeRayInvDir.getReg(2),
7882 UnmergeRayDir.getReg(2)}))
7883 .getReg(0)});
7884 Ops.push_back(MergedDir.getReg(0));
7885 } else {
7886 packLanes(RayDir);
7887 packLanes(RayInvDir);
7888 }
7889 } else {
// All other cases: every operand is flattened into individual 32-bit values.
7890 if (Is64) {
7891 auto Unmerge = B.buildUnmerge({S32, S32}, NodePtr);
7892 Ops.push_back(Unmerge.getReg(0));
7893 Ops.push_back(Unmerge.getReg(1));
7894 } else {
7895 Ops.push_back(NodePtr);
7896 }
7897 Ops.push_back(RayExtent);
7898
7899 auto packLanes = [&Ops, &S32, &B](Register Src) {
7900 auto Unmerge = B.buildUnmerge({S32, S32, S32}, Src);
7901 Ops.push_back(Unmerge.getReg(0));
7902 Ops.push_back(Unmerge.getReg(1));
7903 Ops.push_back(Unmerge.getReg(2));
7904 };
7905
7906 packLanes(RayOrigin);
7907 if (IsA16) {
// The six 16-bit components are packed consecutively into three registers:
// (dir0, dir1), (dir2, invdir0), (invdir1, invdir2).
7908 auto UnmergeRayDir = B.buildUnmerge({S16, S16, S16}, RayDir);
7909 auto UnmergeRayInvDir = B.buildUnmerge({S16, S16, S16}, RayInvDir);
7913 B.buildMergeLikeInstr(R1,
7914 {UnmergeRayDir.getReg(0), UnmergeRayDir.getReg(1)});
7915 B.buildMergeLikeInstr(
7916 R2, {UnmergeRayDir.getReg(2), UnmergeRayInvDir.getReg(0)});
7917 B.buildMergeLikeInstr(
7918 R3, {UnmergeRayInvDir.getReg(1), UnmergeRayInvDir.getReg(2)});
7919 Ops.push_back(R1);
7920 Ops.push_back(R2);
7921 Ops.push_back(R3);
7922 } else {
7923 packLanes(RayDir);
7924 packLanes(RayInvDir);
7925 }
7926 }
7927
7928 if (!UseNSA) {
7929 // Build a single vector containing all the operands so far prepared.
7930 LLT OpTy = LLT::fixed_vector(Ops.size(), 32);
7931 Register MergedOps = B.buildMergeLikeInstr(OpTy, Ops).getReg(0);
7932 Ops.clear();
7933 Ops.push_back(MergedOps);
7934 }
7935
7936 auto MIB = B.buildInstr(AMDGPU::G_AMDGPU_BVH_INTERSECT_RAY)
7937 .addDef(DstReg)
7938 .addImm(Opcode);
7939
7940 for (Register R : Ops) {
7941 MIB.addUse(R);
7942 }
7943
7944 MIB.addUse(TDescr)
7945 .addImm(IsA16 ? 1 : 0)
7946 .cloneMemRefs(MI);
7947
7948 MI.eraseFromParent();
7949 return true;
7950}
7951
7953 MachineInstr &MI, MachineIRBuilder &B) const {
// Lowers the BVH8 / BVH-dual intersect-ray intrinsics to
// G_AMDGPU_BVH8_INTERSECT_RAY or G_AMDGPU_BVH_DUAL_INTERSECT_RAY.
// The new instruction has three defs (result, origin, direction), the
// selected MIMG opcode as an immediate, and the ray operands. The ray extent
// and the any-extended instance mask are merged into one 2 x s32 operand.
// Returns false (after reporting a diagnostic) when the subtarget lacks these
// instructions; otherwise erases MI and returns true.
// NOTE(review): the start of the signature and the first line of the
// diagnostic call are absent from this extract.
7954 const LLT S32 = LLT::scalar(32);
7955 const LLT V2S32 = LLT::fixed_vector(2, 32);
7956
// Operands 0..2 are the three results; operand 3 is skipped here (the
// intrinsic ID, given the GIntrinsic cast below).
7957 Register DstReg = MI.getOperand(0).getReg();
7958 Register DstOrigin = MI.getOperand(1).getReg();
7959 Register DstDir = MI.getOperand(2).getReg();
7960 Register NodePtr = MI.getOperand(4).getReg();
7961 Register RayExtent = MI.getOperand(5).getReg();
7962 Register InstanceMask = MI.getOperand(6).getReg();
7963 Register RayOrigin = MI.getOperand(7).getReg();
7964 Register RayDir = MI.getOperand(8).getReg();
7965 Register Offsets = MI.getOperand(9).getReg();
7966 Register TDescr = MI.getOperand(10).getReg();
7967
7968 if (!ST.hasBVHDualAndBVH8Insts()) {
7969 Function &Fn = B.getMF().getFunction();
7971 Fn, "intrinsic not supported on subtarget", MI.getDebugLoc()));
7972 return false;
7973 }
7974
7975 bool IsBVH8 = cast<GIntrinsic>(MI).getIntrinsicID() ==
7976 Intrinsic::amdgcn_image_bvh8_intersect_ray;
7977 const unsigned NumVDataDwords = 10;
7978 const unsigned NumVAddrDwords = IsBVH8 ? 11 : 12;
// Only the GFX12 MIMG encoding is queried for these opcodes.
7979 int Opcode = AMDGPU::getMIMGOpcode(
7980 IsBVH8 ? AMDGPU::IMAGE_BVH8_INTERSECT_RAY
7981 : AMDGPU::IMAGE_BVH_DUAL_INTERSECT_RAY,
7982 AMDGPU::MIMGEncGfx12, NumVDataDwords, NumVAddrDwords);
7983 assert(Opcode != -1);
7984
7985 auto RayExtentInstanceMaskVec = B.buildMergeLikeInstr(
7986 V2S32, {RayExtent, B.buildAnyExt(S32, InstanceMask)});
7987
7988 B.buildInstr(IsBVH8 ? AMDGPU::G_AMDGPU_BVH8_INTERSECT_RAY
7989 : AMDGPU::G_AMDGPU_BVH_DUAL_INTERSECT_RAY)
7990 .addDef(DstReg)
7991 .addDef(DstOrigin)
7992 .addDef(DstDir)
7993 .addImm(Opcode)
7994 .addUse(NodePtr)
7995 .addUse(RayExtentInstanceMaskVec.getReg(0))
7996 .addUse(RayOrigin)
7997 .addUse(RayDir)
7998 .addUse(Offsets)
7999 .addUse(TDescr)
8000 .cloneMemRefs(MI);
8001
8002 MI.eraseFromParent();
8003 return true;
8004}
8005
8007 MachineIRBuilder &B) const {
// Replaces MI with G_AMDGPU_WAVE_ADDRESS, defining MI's operand 0 from
// StackPtr, then erases MI. Always returns true.
// NOTE(review): the start of the signature and the definition of StackPtr are
// absent from this extract; StackPtr is presumably obtained from TLI, which
// is otherwise unused in the visible lines (confirm).
8008 const SITargetLowering *TLI = ST.getTargetLowering();
8010 Register DstReg = MI.getOperand(0).getReg();
8011 B.buildInstr(AMDGPU::G_AMDGPU_WAVE_ADDRESS, {DstReg}, {StackPtr});
8012 MI.eraseFromParent();
8013 return true;
8014}
8015
8017 MachineIRBuilder &B) const {
// Lowers the wave ID query to an unsigned bitfield extract of 5 bits starting
// at bit 25 of a copy of TTMP8, written to MI's operand 0. Returns false
// without changing anything when the subtarget has no architected SGPRs;
// otherwise erases MI and returns true.
// NOTE(review): the start of the signature is absent from this extract.
8018 // With architected SGPRs, waveIDinGroup is in TTMP8[29:25].
8019 if (!ST.hasArchitectedSGPRs())
8020 return false;
8021 LLT S32 = LLT::scalar(32);
8022 Register DstReg = MI.getOperand(0).getReg();
8023 auto TTMP8 = B.buildCopy(S32, Register(AMDGPU::TTMP8));
8024 auto LSB = B.buildConstant(S32, 25);
8025 auto Width = B.buildConstant(S32, 5);
8026 B.buildUbfx(DstReg, TTMP8, LSB, Width);
8027 MI.eraseFromParent();
8028 return true;
8029}
8030
// Replaces MI with S_GETREG_B32_const, which reads Width bits starting at
// LowBit of hardware register HwReg into MI's operand 0. The three values are
// packed into one immediate by HwregEncoding::encode. Always returns true.
// NOTE(review): the start of the signature is absent from this extract.
8033 AMDGPU::Hwreg::Id HwReg,
8034 unsigned LowBit,
8035 unsigned Width) const {
8036 MachineRegisterInfo &MRI = *B.getMRI();
8037 Register DstReg = MI.getOperand(0).getReg();
// Assign a 32-bit SGPR class only if the destination has no class yet.
8038 if (!MRI.getRegClassOrNull(DstReg))
8039 MRI.setRegClass(DstReg, &AMDGPU::SReg_32RegClass);
8040 B.buildInstr(AMDGPU::S_GETREG_B32_const)
8041 .addDef(DstReg)
8042 .addImm(AMDGPU::Hwreg::HwregEncoding::encode(HwReg, LowBit, Width));
8043 MI.eraseFromParent();
8044 return true;
8045}
8046
// Immediate operand used with the s_getreg/s_setreg intrinsics for the first
// (mode) half of the 64-bit FP environment value.
// NOTE(review): the initializer is absent from this extract.
8047static constexpr unsigned FPEnvModeBitField =
8049
// Immediate operand used with the s_getreg/s_setreg intrinsics for the second
// (trap) half of the 64-bit FP environment value.
// NOTE(review): the initializer is absent from this extract.
8050static constexpr unsigned FPEnvTrapBitField =
8052
8055 MachineIRBuilder &B) const {
// Builds the 64-bit FP environment value in MI's operand 0 by merging two
// 32-bit amdgcn_s_getreg reads, selected by FPEnvModeBitField and
// FPEnvTrapBitField in that order. Returns false, leaving MI untouched, when
// the operand is not S64; otherwise erases MI and returns true.
// NOTE(review): the start of the signature is absent from this extract.
// Operand 0 is the value being defined here, despite the variable's name.
8056 Register Src = MI.getOperand(0).getReg();
8057 if (MRI.getType(Src) != S64)
8058 return false;
8059
8060 auto ModeReg =
8061 B.buildIntrinsic(Intrinsic::amdgcn_s_getreg, {S32},
8062 /*HasSideEffects=*/true, /*isConvergent=*/false)
8063 .addImm(FPEnvModeBitField);
8064 auto TrapReg =
8065 B.buildIntrinsic(Intrinsic::amdgcn_s_getreg, {S32},
8066 /*HasSideEffects=*/true, /*isConvergent=*/false)
8067 .addImm(FPEnvTrapBitField);
8068 B.buildMergeLikeInstr(Src, {ModeReg, TrapReg});
8069 MI.eraseFromParent();
8070 return true;
8071}
8072
8075 MachineIRBuilder &B) const {
// Writes the 64-bit FP environment value in MI's operand 0 by splitting it
// into two 32-bit pieces and issuing one amdgcn_s_setreg per piece: piece 0
// with FPEnvModeBitField, piece 1 with FPEnvTrapBitField. Returns false,
// leaving MI untouched, when the operand is not S64; otherwise erases MI and
// returns true.
// NOTE(review): the start of the signature is absent from this extract.
8076 Register Src = MI.getOperand(0).getReg();
8077 if (MRI.getType(Src) != S64)
8078 return false;
8079
8080 auto Unmerge = B.buildUnmerge({S32, S32}, MI.getOperand(0));
// The bitfield selectors are narrowed to int16_t before being added as
// immediates.
8081 B.buildIntrinsic(Intrinsic::amdgcn_s_setreg, ArrayRef<DstOp>(),
8082 /*HasSideEffects=*/true, /*isConvergent=*/false)
8083 .addImm(static_cast<int16_t>(FPEnvModeBitField))
8084 .addReg(Unmerge.getReg(0));
8085 B.buildIntrinsic(Intrinsic::amdgcn_s_setreg, ArrayRef<DstOp>(),
8086 /*HasSideEffects=*/true, /*isConvergent=*/false)
8087 .addImm(static_cast<int16_t>(FPEnvTrapBitField))
8088 .addReg(Unmerge.getReg(1));
8089 MI.eraseFromParent();
8090 return true;
8091}
8092
8094 MachineInstr &MI) const {
// Central dispatch for intrinsic legalization: switches on MI's intrinsic ID
// and either rewrites MI inline or forwards to a dedicated helper. In the
// visible code, `true` is returned when the intrinsic was handled or needs no
// change, and `false` when it cannot be legalized (an invalid form or an
// unsupported subtarget).
// NOTE(review): many original lines are absent from this extract, including
// the start of the signature and numerous `return` statements or call
// arguments. Case labels that appear to have no body below had one in the
// original file.
8095 MachineIRBuilder &B = Helper.MIRBuilder;
8096 MachineRegisterInfo &MRI = *B.getMRI();
8097
8098 // Replace the use G_BRCOND with the exec manipulate and branch pseudos.
8099 auto IntrID = cast<GIntrinsic>(MI).getIntrinsicID();
8100 switch (IntrID) {
8101 case Intrinsic::amdgcn_icmp: {
8102 // amdgcn.icmp(i1 src0, i1 0, NE) -> ballot(src0)
8103 // This is the only valid form of amdgcn.icmp with i1 inputs.
8104 Register Src0 = MI.getOperand(2).getReg();
8105 LLT SrcTy = MRI.getType(Src0);
8106 if (SrcTy != LLT::scalar(1))
8107 return true; // Not i1, leave for default handling.
8108
8109 // Check that src1 is constant 0.
8110 Register Src1 = MI.getOperand(3).getReg();
8111 auto Src1Const = getIConstantVRegValWithLookThrough(Src1, MRI);
8112 if (!Src1Const || Src1Const->Value != 0)
8113 return false; // Invalid i1 icmp form.
8114
8115 // Check that predicate is ICMP_NE.
8116 int64_t Pred = MI.getOperand(4).getImm();
8117 if (Pred != CmpInst::ICMP_NE)
8118 return false; // Invalid i1 icmp form.
8119
8120 // Convert to ballot.
8121 Register Dst = MI.getOperand(0).getReg();
8122 B.buildIntrinsic(Intrinsic::amdgcn_ballot, Dst).addUse(Src0);
8123 MI.eraseFromParent();
8124 return true;
8125 }
// sponentry: at the bottom of the stack, G_AMDGPU_SPONENTRY is emitted and
// its result converted to a pointer; otherwise the result is the frame index
// of a newly created fixed stack object.
8126 case Intrinsic::sponentry:
8127 if (B.getMF().getInfo<SIMachineFunctionInfo>()->isBottomOfStack()) {
8128 // FIXME: The imported pattern checks for i32 instead of p5; if we fix
8129 // that we can remove this cast.
8130 const LLT S32 = LLT::scalar(32);
8132 B.buildInstr(AMDGPU::G_AMDGPU_SPONENTRY).addDef(TmpReg);
8133
8134 Register DstReg = MI.getOperand(0).getReg();
8135 B.buildIntToPtr(DstReg, TmpReg);
8136 MI.eraseFromParent();
8137 } else {
8138 int FI = B.getMF().getFrameInfo().CreateFixedObject(
8139 1, 0, /*IsImmutable=*/false);
8140 B.buildFrameIndex(MI.getOperand(0), FI);
8141 MI.eraseFromParent();
8142 }
8143 return true;
// amdgcn_if / amdgcn_else: the intrinsic and the conditional branch found by
// verifyCFIntrinsic() are replaced by a single SI_IF / SI_ELSE pseudo that is
// inserted at the branch's position. Returns false if verification fails.
8144 case Intrinsic::amdgcn_if:
8145 case Intrinsic::amdgcn_else: {
8146 MachineInstr *Br = nullptr;
8147 MachineBasicBlock *UncondBrTarget = nullptr;
8148 bool Negated = false;
8149 if (MachineInstr *BrCond =
8150 verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) {
8151 const SIRegisterInfo *TRI
8152 = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
8153
8154 Register Def = MI.getOperand(1).getReg();
8155 Register Use = MI.getOperand(3).getReg();
8156
8157 MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
8158
8159 if (Negated)
8160 std::swap(CondBrTarget, UncondBrTarget);
8161
8162 B.setInsertPt(B.getMBB(), BrCond->getIterator());
8163 if (IntrID == Intrinsic::amdgcn_if) {
8164 B.buildInstr(AMDGPU::SI_IF)
8165 .addDef(Def)
8166 .addUse(Use)
8167 .addMBB(UncondBrTarget);
8168 } else {
8169 B.buildInstr(AMDGPU::SI_ELSE)
8170 .addDef(Def)
8171 .addUse(Use)
8172 .addMBB(UncondBrTarget);
8173 }
8174
8175 if (Br) {
8176 Br->getOperand(0).setMBB(CondBrTarget);
8177 } else {
8178 // The IRTranslator skips inserting the G_BR for fallthrough cases, but
8179 // since we're swapping branch targets it needs to be reinserted.
8180 // FIXME: IRTranslator should probably not do this
8181 B.buildBr(*CondBrTarget);
8182 }
8183
// Both mask registers are constrained to the wave-mask register class.
8184 MRI.setRegClass(Def, TRI->getWaveMaskRegClass());
8185 MRI.setRegClass(Use, TRI->getWaveMaskRegClass());
8186 MI.eraseFromParent();
8187 BrCond->eraseFromParent();
8188 return true;
8189 }
8190
8191 return false;
8192 }
// amdgcn_loop: same scheme as amdgcn_if/else, using SI_LOOP with a single
// mask register use.
8193 case Intrinsic::amdgcn_loop: {
8194 MachineInstr *Br = nullptr;
8195 MachineBasicBlock *UncondBrTarget = nullptr;
8196 bool Negated = false;
8197 if (MachineInstr *BrCond =
8198 verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) {
8199 const SIRegisterInfo *TRI
8200 = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
8201
8202 MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
8203 Register Reg = MI.getOperand(2).getReg();
8204
8205 if (Negated)
8206 std::swap(CondBrTarget, UncondBrTarget);
8207
8208 B.setInsertPt(B.getMBB(), BrCond->getIterator());
8209 B.buildInstr(AMDGPU::SI_LOOP)
8210 .addUse(Reg)
8211 .addMBB(UncondBrTarget);
8212
8213 if (Br)
8214 Br->getOperand(0).setMBB(CondBrTarget);
8215 else
8216 B.buildBr(*CondBrTarget);
8217
8218 MI.eraseFromParent();
8219 BrCond->eraseFromParent();
8220 MRI.setRegClass(Reg, TRI->getWaveMaskRegClass());
8221 return true;
8222 }
8223
8224 return false;
8225 }
// Wave reductions: only 16-bit sources are rewritten. The source is extended
// to 32 bits (sign-extended for min/max/add/sub, zero-extended otherwise),
// the same intrinsic is re-emitted at 32 bits, and the result is truncated
// back into the original destination. Other source types return true
// unchanged.
8226 case Intrinsic::amdgcn_wave_reduce_min:
8227 case Intrinsic::amdgcn_wave_reduce_umin:
8228 case Intrinsic::amdgcn_wave_reduce_max:
8229 case Intrinsic::amdgcn_wave_reduce_umax:
8230 case Intrinsic::amdgcn_wave_reduce_add:
8231 case Intrinsic::amdgcn_wave_reduce_sub:
8232 case Intrinsic::amdgcn_wave_reduce_and:
8233 case Intrinsic::amdgcn_wave_reduce_or:
8234 case Intrinsic::amdgcn_wave_reduce_xor: {
8235 Register SrcReg = MI.getOperand(2).getReg();
8236 if (MRI.getType(SrcReg) != LLT::scalar(16))
8237 return true;
8238 Register DstReg = MI.getOperand(0).getReg();
8239 bool NeedsSignExt = IntrID == Intrinsic::amdgcn_wave_reduce_min ||
8240 IntrID == Intrinsic::amdgcn_wave_reduce_max ||
8241 IntrID == Intrinsic::amdgcn_wave_reduce_add ||
8242 IntrID == Intrinsic::amdgcn_wave_reduce_sub;
8243 auto Ext = NeedsSignExt ? B.buildSExt(LLT::scalar(32), SrcReg)
8244 : B.buildZExt(LLT::scalar(32), SrcReg);
8245 auto NewDst = MRI.createGenericVirtualRegister(LLT::scalar(32));
8246 B.buildIntrinsic(IntrID, ArrayRef<Register>{NewDst},
8247 /*hasSideEffects=*/false, /*isConvergent=*/true)
8248 .addUse(Ext.getReg(0))
8249 .addImm(MI.getOperand(3).getImm()); // strategy
8250 B.buildTrunc(DstReg, NewDst);
8251 MI.eraseFromParent();
8252 return true;
8253 }
8254 case Intrinsic::amdgcn_addrspacecast_nonnull:
8255 return legalizeAddrSpaceCast(MI, MRI, B);
8256 case Intrinsic::amdgcn_make_buffer_rsrc:
8257 return legalizePointerAsRsrcIntrin(MI, MRI, B);
8258 case Intrinsic::amdgcn_kernarg_segment_ptr:
8259 if (!AMDGPU::isKernel(B.getMF().getFunction())) {
8260 // This only makes sense to call in a kernel, so just lower to null.
8261 B.buildConstant(MI.getOperand(0).getReg(), 0);
8262 MI.eraseFromParent();
8263 return true;
8264 }
8265
// NOTE(review): the kernel-case return for kernarg_segment_ptr, and the
// trailing arguments or whole calls of many cases from here down to
// amdgcn_is_private, are absent from this extract.
8268 case Intrinsic::amdgcn_implicitarg_ptr:
8269 return legalizeImplicitArgPtr(MI, MRI, B);
8270 case Intrinsic::amdgcn_workitem_id_x:
8271 return legalizeWorkitemIDIntrinsic(MI, MRI, B, 0,
8273 case Intrinsic::amdgcn_workitem_id_y:
8274 return legalizeWorkitemIDIntrinsic(MI, MRI, B, 1,
8276 case Intrinsic::amdgcn_workitem_id_z:
8277 return legalizeWorkitemIDIntrinsic(MI, MRI, B, 2,
8279 case Intrinsic::amdgcn_workgroup_id_x:
8280 return legalizeWorkGroupId(
8284 case Intrinsic::amdgcn_workgroup_id_y:
8285 return legalizeWorkGroupId(
8289 case Intrinsic::amdgcn_workgroup_id_z:
8290 return legalizeWorkGroupId(
// Cluster queries are gated on ST.hasClusters(); without cluster support the
// `&&` short-circuits and the case returns false.
8294 case Intrinsic::amdgcn_cluster_id_x:
8295 return ST.hasClusters() &&
8298 case Intrinsic::amdgcn_cluster_id_y:
8299 return ST.hasClusters() &&
8302 case Intrinsic::amdgcn_cluster_id_z:
8303 return ST.hasClusters() &&
8306 case Intrinsic::amdgcn_cluster_workgroup_id_x:
8307 return ST.hasClusters() &&
8310 case Intrinsic::amdgcn_cluster_workgroup_id_y:
8311 return ST.hasClusters() &&
8314 case Intrinsic::amdgcn_cluster_workgroup_id_z:
8315 return ST.hasClusters() &&
8318 case Intrinsic::amdgcn_cluster_workgroup_flat_id:
8319 return ST.hasClusters() &&
8321 case Intrinsic::amdgcn_cluster_workgroup_max_id_x:
8322 return ST.hasClusters() &&
8325 case Intrinsic::amdgcn_cluster_workgroup_max_id_y:
8326 return ST.hasClusters() &&
8329 case Intrinsic::amdgcn_cluster_workgroup_max_id_z:
8330 return ST.hasClusters() &&
8333 case Intrinsic::amdgcn_cluster_workgroup_max_flat_id:
8334 return ST.hasClusters() &&
8336 MI, MRI, B,
8338 case Intrinsic::amdgcn_wave_id:
8339 return legalizeWaveID(MI, B);
8340 case Intrinsic::amdgcn_lds_kernel_id:
8341 return legalizePreloadedArgIntrin(MI, MRI, B,
8343 case Intrinsic::amdgcn_dispatch_ptr:
8344 return legalizePreloadedArgIntrin(MI, MRI, B,
8346 case Intrinsic::amdgcn_queue_ptr:
8347 return legalizePreloadedArgIntrin(MI, MRI, B,
8349 case Intrinsic::amdgcn_implicit_buffer_ptr:
8352 case Intrinsic::amdgcn_dispatch_id:
8353 return legalizePreloadedArgIntrin(MI, MRI, B,
8355 case Intrinsic::r600_read_ngroups_x:
8356 // TODO: Emit error for hsa
8359 case Intrinsic::r600_read_ngroups_y:
8362 case Intrinsic::r600_read_ngroups_z:
8365 case Intrinsic::r600_read_local_size_x:
8366 // TODO: Could insert G_ASSERT_ZEXT from s16
8368 case Intrinsic::r600_read_local_size_y:
8369 // TODO: Could insert G_ASSERT_ZEXT from s16
8371 // TODO: Could insert G_ASSERT_ZEXT from s16
8372 case Intrinsic::r600_read_local_size_z:
8375 case Intrinsic::amdgcn_fdiv_fast:
8376 return legalizeFDIVFastIntrin(MI, MRI, B);
8377 case Intrinsic::amdgcn_is_shared:
8379 case Intrinsic::amdgcn_is_private:
// The wavefront size is a subtarget constant, so it is folded to G_CONSTANT.
8381 case Intrinsic::amdgcn_wavefrontsize: {
8382 B.buildConstant(MI.getOperand(0), ST.getWavefrontSize());
8383 MI.eraseFromParent();
8384 return true;
8385 }
8386 case Intrinsic::amdgcn_s_buffer_load:
8387 return legalizeSBufferLoad(Helper, MI);
// Buffer stores and loads: the plain, *_format and tbuffer groups call the
// same helper and differ only in its two trailing boolean arguments.
8388 case Intrinsic::amdgcn_raw_buffer_store:
8389 case Intrinsic::amdgcn_raw_ptr_buffer_store:
8390 case Intrinsic::amdgcn_struct_buffer_store:
8391 case Intrinsic::amdgcn_struct_ptr_buffer_store:
8392 return legalizeBufferStore(MI, Helper, false, false);
8393 case Intrinsic::amdgcn_raw_buffer_store_format:
8394 case Intrinsic::amdgcn_raw_ptr_buffer_store_format:
8395 case Intrinsic::amdgcn_struct_buffer_store_format:
8396 case Intrinsic::amdgcn_struct_ptr_buffer_store_format:
8397 return legalizeBufferStore(MI, Helper, false, true);
8398 case Intrinsic::amdgcn_raw_tbuffer_store:
8399 case Intrinsic::amdgcn_raw_ptr_tbuffer_store:
8400 case Intrinsic::amdgcn_struct_tbuffer_store:
8401 case Intrinsic::amdgcn_struct_ptr_tbuffer_store:
8402 return legalizeBufferStore(MI, Helper, true, true);
8403 case Intrinsic::amdgcn_raw_buffer_load:
8404 case Intrinsic::amdgcn_raw_ptr_buffer_load:
8405 case Intrinsic::amdgcn_raw_atomic_buffer_load:
8406 case Intrinsic::amdgcn_raw_ptr_atomic_buffer_load:
8407 case Intrinsic::amdgcn_struct_buffer_load:
8408 case Intrinsic::amdgcn_struct_ptr_buffer_load:
8409 case Intrinsic::amdgcn_struct_atomic_buffer_load:
8410 case Intrinsic::amdgcn_struct_ptr_atomic_buffer_load:
8411 return legalizeBufferLoad(MI, Helper, false, false);
8412 case Intrinsic::amdgcn_raw_buffer_load_format:
8413 case Intrinsic::amdgcn_raw_ptr_buffer_load_format:
8414 case Intrinsic::amdgcn_struct_buffer_load_format:
8415 case Intrinsic::amdgcn_struct_ptr_buffer_load_format:
8416 return legalizeBufferLoad(MI, Helper, true, false);
8417 case Intrinsic::amdgcn_raw_tbuffer_load:
8418 case Intrinsic::amdgcn_raw_ptr_tbuffer_load:
8419 case Intrinsic::amdgcn_struct_tbuffer_load:
8420 case Intrinsic::amdgcn_struct_ptr_tbuffer_load:
8421 return legalizeBufferLoad(MI, Helper, true, true);
// All buffer atomics share one helper, which receives the intrinsic ID.
8422 case Intrinsic::amdgcn_raw_buffer_atomic_swap:
8423 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_swap:
8424 case Intrinsic::amdgcn_struct_buffer_atomic_swap:
8425 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_swap:
8426 case Intrinsic::amdgcn_raw_buffer_atomic_add:
8427 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_add:
8428 case Intrinsic::amdgcn_struct_buffer_atomic_add:
8429 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_add:
8430 case Intrinsic::amdgcn_raw_buffer_atomic_sub:
8431 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub:
8432 case Intrinsic::amdgcn_struct_buffer_atomic_sub:
8433 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub:
8434 case Intrinsic::amdgcn_raw_buffer_atomic_smin:
8435 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smin:
8436 case Intrinsic::amdgcn_struct_buffer_atomic_smin:
8437 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smin:
8438 case Intrinsic::amdgcn_raw_buffer_atomic_umin:
8439 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umin:
8440 case Intrinsic::amdgcn_struct_buffer_atomic_umin:
8441 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umin:
8442 case Intrinsic::amdgcn_raw_buffer_atomic_smax:
8443 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smax:
8444 case Intrinsic::amdgcn_struct_buffer_atomic_smax:
8445 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smax:
8446 case Intrinsic::amdgcn_raw_buffer_atomic_umax:
8447 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umax:
8448 case Intrinsic::amdgcn_struct_buffer_atomic_umax:
8449 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umax:
8450 case Intrinsic::amdgcn_raw_buffer_atomic_and:
8451 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_and:
8452 case Intrinsic::amdgcn_struct_buffer_atomic_and:
8453 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_and:
8454 case Intrinsic::amdgcn_raw_buffer_atomic_or:
8455 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_or:
8456 case Intrinsic::amdgcn_struct_buffer_atomic_or:
8457 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_or:
8458 case Intrinsic::amdgcn_raw_buffer_atomic_xor:
8459 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_xor:
8460 case Intrinsic::amdgcn_struct_buffer_atomic_xor:
8461 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_xor:
8462 case Intrinsic::amdgcn_raw_buffer_atomic_inc:
8463 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_inc:
8464 case Intrinsic::amdgcn_struct_buffer_atomic_inc:
8465 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_inc:
8466 case Intrinsic::amdgcn_raw_buffer_atomic_dec:
8467 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_dec:
8468 case Intrinsic::amdgcn_struct_buffer_atomic_dec:
8469 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_dec:
8470 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
8471 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap:
8472 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
8473 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cmpswap:
8474 case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
8475 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmin:
8476 case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
8477 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmin:
8478 case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
8479 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmax:
8480 case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
8481 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmax:
8482 case Intrinsic::amdgcn_raw_buffer_atomic_sub_clamp_u32:
8483 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub_clamp_u32:
8484 case Intrinsic::amdgcn_struct_buffer_atomic_sub_clamp_u32:
8485 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub_clamp_u32:
8486 case Intrinsic::amdgcn_raw_buffer_atomic_cond_sub_u32:
8487 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cond_sub_u32:
8488 case Intrinsic::amdgcn_struct_buffer_atomic_cond_sub_u32:
8489 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cond_sub_u32:
8490 case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
8491 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fadd:
8492 case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
8493 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fadd:
8494 return legalizeBufferAtomic(MI, B, IntrID);
8495 case Intrinsic::amdgcn_rsq_clamp:
8496 return legalizeRsqClampIntrinsic(MI, MRI, B);
// NOTE(review): the return statements of the BVH cases are absent from this
// extract.
8497 case Intrinsic::amdgcn_image_bvh_intersect_ray:
8499 case Intrinsic::amdgcn_image_bvh_dual_intersect_ray:
8500 case Intrinsic::amdgcn_image_bvh8_intersect_ray:
// swmmac groups: the index operand is normalized to one scalar type, using a
// bitcast for vector inputs and an any-extend for scalars (the 16x16x32 group
// below only any-extends). MI itself is kept.
8502 case Intrinsic::amdgcn_swmmac_f32_16x16x128_fp8_fp8:
8503 case Intrinsic::amdgcn_swmmac_f32_16x16x128_fp8_bf8:
8504 case Intrinsic::amdgcn_swmmac_f32_16x16x128_bf8_fp8:
8505 case Intrinsic::amdgcn_swmmac_f32_16x16x128_bf8_bf8:
8506 case Intrinsic::amdgcn_swmmac_f16_16x16x128_fp8_fp8:
8507 case Intrinsic::amdgcn_swmmac_f16_16x16x128_fp8_bf8:
8508 case Intrinsic::amdgcn_swmmac_f16_16x16x128_bf8_fp8:
8509 case Intrinsic::amdgcn_swmmac_f16_16x16x128_bf8_bf8: {
8510 Register Index = MI.getOperand(5).getReg();
8511 LLT S64 = LLT::scalar(64);
8512 LLT IndexArgTy = MRI.getType(Index);
8513 if (IndexArgTy != S64) {
8514 auto NewIndex = IndexArgTy.isVector() ? B.buildBitcast(S64, Index)
8515 : B.buildAnyExt(S64, Index);
8516 MI.getOperand(5).setReg(NewIndex.getReg(0));
8517 }
8518 return true;
8519 }
8520 case Intrinsic::amdgcn_swmmac_f16_16x16x32_f16:
8521 case Intrinsic::amdgcn_swmmac_bf16_16x16x32_bf16:
8522 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf16:
8523 case Intrinsic::amdgcn_swmmac_f32_16x16x32_f16:
8524 case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_fp8:
8525 case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_bf8:
8526 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_fp8:
8527 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_bf8: {
8528 Register Index = MI.getOperand(5).getReg();
8529 LLT S32 = LLT::scalar(32);
8530 if (MRI.getType(Index) != S32)
8531 MI.getOperand(5).setReg(B.buildAnyExt(S32, Index).getReg(0));
8532 return true;
8533 }
8534 case Intrinsic::amdgcn_swmmac_f16_16x16x64_f16:
8535 case Intrinsic::amdgcn_swmmac_bf16_16x16x64_bf16:
8536 case Intrinsic::amdgcn_swmmac_f32_16x16x64_bf16:
8537 case Intrinsic::amdgcn_swmmac_bf16f32_16x16x64_bf16:
8538 case Intrinsic::amdgcn_swmmac_f32_16x16x64_f16:
8539 case Intrinsic::amdgcn_swmmac_i32_16x16x128_iu8:
8540 case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu4:
8541 case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu8:
8542 case Intrinsic::amdgcn_swmmac_i32_16x16x64_iu4: {
// The index is operand 7 in this group; it is 64-bit only for the
// i32_16x16x128_iu8 variant and 32-bit otherwise.
8543 Register Index = MI.getOperand(7).getReg();
8544 LLT IdxTy = IntrID == Intrinsic::amdgcn_swmmac_i32_16x16x128_iu8
8545 ? LLT::scalar(64)
8546 : LLT::scalar(32);
8547 LLT IndexArgTy = MRI.getType(Index);
8548 if (IndexArgTy != IdxTy) {
8549 auto NewIndex = IndexArgTy.isVector() ? B.buildBitcast(IdxTy, Index)
8550 : B.buildAnyExt(IdxTy, Index);
8551 MI.getOperand(7).setReg(NewIndex.getReg(0));
8552 }
8553 return true;
8554 }
8555
8556 case Intrinsic::amdgcn_fmed3: {
8557 GISelChangeObserver &Observer = Helper.Observer;
8558
8559 // FIXME: This is to workaround the inability of tablegen match combiners to
8560 // match intrinsics in patterns.
8561 Observer.changingInstr(MI);
8562 MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_FMED3));
8563 MI.removeOperand(1);
8564 Observer.changedInstr(MI);
8565 return true;
8566 }
8567 case Intrinsic::amdgcn_readlane:
8568 case Intrinsic::amdgcn_writelane:
8569 case Intrinsic::amdgcn_readfirstlane:
8570 case Intrinsic::amdgcn_permlane16:
8571 case Intrinsic::amdgcn_permlanex16:
8572 case Intrinsic::amdgcn_permlane64:
8573 case Intrinsic::amdgcn_set_inactive:
8574 case Intrinsic::amdgcn_set_inactive_chain_arg:
8575 case Intrinsic::amdgcn_mov_dpp8:
8576 case Intrinsic::amdgcn_update_dpp:
8577 case Intrinsic::amdgcn_permlane_bcast:
8578 case Intrinsic::amdgcn_permlane_up:
8579 case Intrinsic::amdgcn_permlane_down:
8580 case Intrinsic::amdgcn_permlane_xor:
8581 return legalizeLaneOp(Helper, MI, IntrID);
8582 case Intrinsic::amdgcn_s_buffer_prefetch_data:
8583 return legalizeSBufferPrefetch(Helper, MI);
8584 case Intrinsic::amdgcn_dead: {
8585 // TODO: Use poison instead of undef
8586 for (const MachineOperand &Def : MI.defs())
8587 B.buildUndef(Def);
8588 MI.eraseFromParent();
8589 return true;
8590 }
// Cooperative atomics become plain G_LOAD / G_STORE that reuse MI's single
// memory operand.
8591 case Intrinsic::amdgcn_cooperative_atomic_load_32x4B:
8592 case Intrinsic::amdgcn_cooperative_atomic_load_16x8B:
8593 case Intrinsic::amdgcn_cooperative_atomic_load_8x16B:
8594 assert(MI.hasOneMemOperand() && "Expected IRTranslator to set MemOp!");
8595 B.buildLoad(MI.getOperand(0), MI.getOperand(2), **MI.memoperands_begin());
8596 MI.eraseFromParent();
8597 return true;
8598 case Intrinsic::amdgcn_cooperative_atomic_store_32x4B:
8599 case Intrinsic::amdgcn_cooperative_atomic_store_16x8B:
8600 case Intrinsic::amdgcn_cooperative_atomic_store_8x16B:
8601 assert(MI.hasOneMemOperand() && "Expected IRTranslator to set MemOp!");
8602 B.buildStore(MI.getOperand(2), MI.getOperand(1), **MI.memoperands_begin());
8603 MI.eraseFromParent();
8604 return true;
8605 case Intrinsic::amdgcn_av_load_b128:
8606 case Intrinsic::amdgcn_av_store_b128: {
// NOTE(review): this local shadows the `ST` used elsewhere in this function
// (presumably a class member; confirm).
8607 const GCNSubtarget &ST = B.getMF().getSubtarget<GCNSubtarget>();
8608 if (!ST.hasFlatGlobalInsts()) {
8609 const char *Name = IntrID == Intrinsic::amdgcn_av_load_b128
8610 ? "llvm.amdgcn.av.load.b128"
8611 : "llvm.amdgcn.av.store.b128";
8612 Function &Fn = B.getMF().getFunction();
8614 Fn, Twine(Name) + " not supported on subtarget", MI.getDebugLoc()));
8615 return false;
8616 }
8617 assert(MI.hasOneMemOperand() && "Expected IRTranslator to set MemOp!");
8618 if (IntrID == Intrinsic::amdgcn_av_load_b128)
8619 B.buildLoad(MI.getOperand(0), MI.getOperand(2), **MI.memoperands_begin());
8620 else
8621 B.buildStore(MI.getOperand(2), MI.getOperand(1),
8622 **MI.memoperands_begin());
8623 MI.eraseFromParent();
8624 return true;
8625 }
// Monitor loads are re-emitted as target pseudos that keep the destination,
// the address operand and MI's memory operand.
8626 case Intrinsic::amdgcn_flat_load_monitor_b32:
8627 case Intrinsic::amdgcn_flat_load_monitor_b64:
8628 case Intrinsic::amdgcn_flat_load_monitor_b128:
8629 assert(MI.hasOneMemOperand() && "Expected IRTranslator to set MemOp!");
8630 B.buildInstr(AMDGPU::G_AMDGPU_FLAT_LOAD_MONITOR)
8631 .add(MI.getOperand(0))
8632 .add(MI.getOperand(2))
8633 .addMemOperand(*MI.memoperands_begin());
8634 MI.eraseFromParent();
8635 return true;
8636 case Intrinsic::amdgcn_global_load_monitor_b32:
8637 case Intrinsic::amdgcn_global_load_monitor_b64:
8638 case Intrinsic::amdgcn_global_load_monitor_b128:
8639 assert(MI.hasOneMemOperand() && "Expected IRTranslator to set MemOp!");
8640 B.buildInstr(AMDGPU::G_AMDGPU_GLOBAL_LOAD_MONITOR)
8641 .add(MI.getOperand(0))
8642 .add(MI.getOperand(2))
8643 .addMemOperand(*MI.memoperands_begin());
8644 MI.eraseFromParent();
8645 return true;
// Anything not listed above is either an image-dimension intrinsic (the
// lookup line is absent from this extract) or needs no custom handling.
8646 default: {
8647 if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
8649 return legalizeImageIntrinsic(MI, B, Helper.Observer, ImageDimIntr);
8650 return true;
8651 }
8652 }
8653
8654 return true;
8655}
MachineInstrBuilder & UseMI
MachineInstrBuilder MachineInstrBuilder & DefMI
static unsigned getIntrinsicID(const SDNode *N)
unsigned RegSize
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
static SDValue extractF64Exponent(SDValue Hi, const SDLoc &SL, SelectionDAG &DAG)
static SDValue getMad(SelectionDAG &DAG, const SDLoc &SL, EVT VT, SDValue X, SDValue Y, SDValue C, SDNodeFlags Flags=SDNodeFlags())
static bool valueIsKnownNeverF32Denorm(SDValue Src)
Return true if it's known that Src can never be an f32 denormal value.
Contains the definition of a TargetInstrInfo class that is common to all AMD GPUs.
static void packImage16bitOpsToDwords(MachineIRBuilder &B, MachineInstr &MI, SmallVectorImpl< Register > &PackedAddrs, unsigned ArgOffset, const AMDGPU::ImageDimIntrinsicInfo *Intr, bool IsA16, bool IsG16)
Turn a set of s16 typed registers in AddrRegs into a dword sized vector with s16 typed elements.
static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID)
static LLT getBufferRsrcScalarType(const LLT Ty)
static LegalityPredicate isIllegalRegisterType(const GCNSubtarget &ST, unsigned TypeIdx)
static cl::opt< bool > EnableNewLegality("amdgpu-global-isel-new-legality", cl::desc("Use GlobalISel desired legality, rather than try to use" "rules compatible with selection patterns"), cl::init(false), cl::ReallyHidden)
static MachineInstrBuilder buildExp(MachineIRBuilder &B, const DstOp &Dst, const SrcOp &Src, unsigned Flags)
static bool needsDenormHandlingF32(const MachineFunction &MF, Register Src, unsigned Flags)
constexpr std::initializer_list< LLT > AllVectors
static LegalizeMutation bitcastToVectorElement32(unsigned TypeIdx)
static LegalityPredicate isSmallOddVector(unsigned TypeIdx)
static LegalizeMutation oneMoreElement(unsigned TypeIdx)
constexpr LLT F64
static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size)
constexpr LLT V2S8
static bool allowApproxFunc(const MachineFunction &MF, unsigned Flags)
constexpr LLT V4S128
constexpr LLT S16
constexpr LLT S1
static bool shouldBitcastLoadStoreType(const GCNSubtarget &ST, const LLT Ty, const LLT MemTy)
Return true if a load or store of the type should be lowered with a bitcast to a different type.
constexpr LLT S1024
static constexpr unsigned FPEnvModeBitField
constexpr LLT V7S64
static LegalizeMutation getScalarTypeFromMemDesc(unsigned TypeIdx)
static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size)
static bool shouldWidenLoad(const GCNSubtarget &ST, LLT MemoryTy, uint64_t AlignInBits, unsigned AddrSpace, unsigned Opcode)
Return true if we should legalize a load by widening an odd sized memory access up to the alignment.
static bool isRegisterVectorElementType(LLT EltTy)
static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx)
static LegalityPredicate isWideVec16(unsigned TypeIdx)
constexpr std::initializer_list< LLT > AllScalarTypes
static LegalityPredicate isTruncStoreToSizePowerOf2(unsigned TypeIdx)
constexpr LLT V2S16
constexpr LLT V8S16
constexpr LLT V9S32
constexpr std::initializer_list< LLT > AllS32Vectors
constexpr LLT S224
static LegalizeMutation moreElementsToNextExistingRegClass(unsigned TypeIdx)
constexpr LLT S512
constexpr LLT MaxScalar
static Register castBufferRsrcToV4I32(Register Pointer, MachineIRBuilder &B)
Cast a buffer resource (an address space 8 pointer) into a 4xi32, which is the form in which the valu...
constexpr LLT V11S32
static bool isRegisterClassType(const GCNSubtarget &ST, LLT Ty)
constexpr LLT V6S64
constexpr LLT V2S64
static std::pair< Register, Register > emitReciprocalU64(MachineIRBuilder &B, Register Val)
static LLT getBitcastRegisterType(const LLT Ty)
static LLT getBufferRsrcRegisterType(const LLT Ty)
constexpr LLT S32
constexpr LLT V2F16
static LegalizeMutation bitcastToRegisterType(unsigned TypeIdx)
static Register stripAnySourceMods(Register OrigSrc, MachineRegisterInfo &MRI)
constexpr LLT V8S32
constexpr LLT V2BF16
constexpr LLT S192
static LLT castBufferRsrcFromV4I32(MachineInstr &MI, MachineIRBuilder &B, MachineRegisterInfo &MRI, unsigned Idx)
Mutates IR (typically a load instruction) to use a <4 x s32> as the initial type of the operand idx an...
static bool replaceWithConstant(MachineIRBuilder &B, MachineInstr &MI, int64_t C)
static constexpr unsigned SPDenormModeBitField
constexpr LLT F32
static unsigned maxSizeForAddrSpace(const GCNSubtarget &ST, unsigned AS, bool IsLoad, bool IsAtomic)
constexpr LLT V6S32
static bool isLoadStoreSizeLegal(const GCNSubtarget &ST, const LegalityQuery &Query)
constexpr LLT S160
static MachineInstr * verifyCFIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI, MachineInstr *&Br, MachineBasicBlock *&UncondBrTarget, bool &Negated)
constexpr LLT V4S16
constexpr LLT V2S128
constexpr LLT V10S16
static LegalityPredicate numElementsNotEven(unsigned TypeIdx)
constexpr LLT V4S32
constexpr LLT V3S32
constexpr LLT V6S16
constexpr std::initializer_list< LLT > AllS64Vectors
constexpr LLT S256
static void castBufferRsrcArgToV4I32(MachineInstr &MI, MachineIRBuilder &B, unsigned Idx)
constexpr LLT V4S64
static constexpr unsigned FPEnvTrapBitField
constexpr LLT V10S32
constexpr LLT V16S32
static constexpr unsigned MaxRegisterSize
constexpr LLT V7S32
constexpr LLT S96
constexpr LLT V12S16
constexpr LLT V16S64
static bool isRegisterSize(const GCNSubtarget &ST, unsigned Size)
static LegalityPredicate isWideScalarExtLoadTruncStore(unsigned TypeIdx)
static bool hasBufferRsrcWorkaround(const LLT Ty)
constexpr LLT V32S32
static void toggleSPDenormMode(bool Enable, MachineIRBuilder &B, const GCNSubtarget &ST, SIModeRegisterDefaults Mode)
constexpr LLT S64
constexpr std::initializer_list< LLT > AllS16Vectors
static bool loadStoreBitcastWorkaround(const LLT Ty)
static LLT widenToNextPowerOf2(LLT Ty)
static bool isNot(const MachineRegisterInfo &MRI, const MachineInstr &MI)
constexpr LLT V16S16
static void convertImageAddrToPacked(MachineIRBuilder &B, MachineInstr &MI, int DimIdx, int NumVAddrs)
Convert from separate vaddr components to a single vector address register, and replace the remaining...
static bool isLoadStoreLegal(const GCNSubtarget &ST, const LegalityQuery &Query)
static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx)
constexpr LLT V5S32
constexpr LLT V5S64
constexpr LLT V3S64
static LLT getPow2VectorType(LLT Ty)
static void buildBufferLoad(unsigned Opc, Register LoadDstReg, Register RSrc, Register VIndex, Register VOffset, Register SOffset, unsigned ImmOffset, unsigned Format, unsigned AuxiliaryData, MachineMemOperand *MMO, bool IsTyped, bool HasVIndex, MachineIRBuilder &B)
constexpr LLT V8S64
static LLT getPow2ScalarType(LLT Ty)
static LegalityPredicate elementTypeIsLegal(unsigned TypeIdx)
constexpr LLT V2S32
static bool isRegisterVectorType(LLT Ty)
constexpr LLT V12S32
constexpr LLT S128
static LegalityPredicate sizeIsMultipleOf32(unsigned TypeIdx)
constexpr LLT S8
static bool isRegisterType(const GCNSubtarget &ST, LLT Ty)
static bool isKnownNonNull(Register Val, MachineRegisterInfo &MRI, const AMDGPUTargetMachine &TM, unsigned AddrSpace)
Return true if the value is a known valid address, such that a null check is not necessary.
This file declares the targeting of the Machinelegalizer class for AMDGPU.
Provides AMDGPU specific target descriptions.
The AMDGPU TargetMachine interface definition for hw codegen targets.
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
#define X(NUM, ENUM, NAME)
Definition ELF.h:853
static Error unsupported(const char *Str, const Triple &T)
Definition MachO.cpp:77
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
@ Enable
Declares convenience wrapper classes for interpreting MachineInstr instances as specific generic oper...
IRTranslator LLVM IR MI
const AbstractManglingParser< Derived, Alloc >::OperatorInfo AbstractManglingParser< Derived, Alloc >::Ops[]
Interface for Targets to specify which operations they can successfully select and how the others sho...
#define F(x, y, z)
Definition MD5.cpp:54
#define I(x, y, z)
Definition MD5.cpp:57
Contains matchers for matching SSA Machine Instructions.
This file declares the MachineIRBuilder class.
Register const TargetRegisterInfo * TRI
#define R2(n)
Promote Memory to Register
Definition Mem2Reg.cpp:110
#define T
static MCRegister getReg(const MCDisassembler *D, unsigned RC, unsigned RegNo)
#define P(N)
ppc ctr loops verify
R600 Clause Merge
const SmallVectorImpl< MachineOperand > & Cond
static cl::opt< RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode > Mode("regalloc-enable-advisor", cl::Hidden, cl::init(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Default), cl::desc("Enable regalloc advisor mode"), cl::values(clEnumValN(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Default, "default", "Default"), clEnumValN(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Release, "release", "precompiled"), clEnumValN(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Development, "development", "for training")))
#define CH(x, y, z)
Definition SHA256.cpp:34
#define FP_DENORM_FLUSH_NONE
Definition SIDefines.h:1277
Interface definition for SIInstrInfo.
Interface definition for SIRegisterInfo.
This file defines the make_scope_exit function, which executes user-defined cleanup logic at scope ex...
static TableGen::Emitter::Opt Y("gen-skeleton-entry", EmitSkeleton, "Generate example skeleton entry")
static constexpr int Concat[]
bool legalizeConstHwRegRead(MachineInstr &MI, MachineIRBuilder &B, AMDGPU::Hwreg::Id HwReg, unsigned LowBit, unsigned Width) const
void buildMultiply(LegalizerHelper &Helper, MutableArrayRef< Register > Accum, ArrayRef< Register > Src0, ArrayRef< Register > Src1, bool UsePartialMad64_32, bool SeparateOddAlignedProducts) const
bool legalizeGlobalValue(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFSQRTF16(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeIntrinsicTrunc(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeInsert(LegalizerHelper &Helper, MachineInstr &MI) const
std::pair< Register, unsigned > splitBufferOffsets(MachineIRBuilder &B, Register OrigOffset) const
bool legalizeBVHIntersectRayIntrinsic(MachineInstr &MI, MachineIRBuilder &B) const
bool legalizeIsAddrSpace(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B, unsigned AddrSpace) const
bool legalizeUnsignedDIV_REM(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFSQRTF32(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeCTLZ_ZERO_POISON(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeAtomicCmpXChg(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeTrapHsa(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeBufferStore(MachineInstr &MI, LegalizerHelper &Helper, bool IsTyped, bool IsFormat) const
bool legalizeMul(LegalizerHelper &Helper, MachineInstr &MI) const
bool legalizeFFREXP(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
Register getSegmentAperture(unsigned AddrSpace, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFDIV64(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizePointerAsRsrcIntrin(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
To create a buffer resource from a 64-bit pointer, mask off the upper 32 bits of the pointer and repl...
bool legalizeFlogCommon(MachineInstr &MI, MachineIRBuilder &B) const
bool getLDSKernelId(Register DstReg, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFExp2(MachineInstr &MI, MachineIRBuilder &B) const
bool legalizeTrap(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeBufferAtomic(MachineInstr &MI, MachineIRBuilder &B, Intrinsic::ID IID) const
void legalizeUnsignedDIV_REM32Impl(MachineIRBuilder &B, Register DstDivReg, Register DstRemReg, Register Num, Register Den) const
Register handleD16VData(MachineIRBuilder &B, MachineRegisterInfo &MRI, Register Reg, bool ImageStore=false) const
Handle register layout difference for f16 images for some subtargets.
bool legalizeCTLZ_CTTZ(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeBuildVector(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFFloor(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
AMDGPULegalizerInfo(const GCNSubtarget &ST, const GCNTargetMachine &TM)
bool legalizeFDIV32(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFMad(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFDIV(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeSBufferPrefetch(LegalizerHelper &Helper, MachineInstr &MI) const
bool legalizeFExp10Unsafe(MachineIRBuilder &B, Register Dst, Register Src, unsigned Flags) const
bool legalizeFExp(MachineInstr &MI, MachineIRBuilder &B) const
bool legalizeIntrinsic(LegalizerHelper &Helper, MachineInstr &MI) const override
bool legalizeFrem(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizePreloadedArgIntrin(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B, AMDGPUFunctionArgInfo::PreloadedValue ArgType) const
bool legalizeStore(LegalizerHelper &Helper, MachineInstr &MI) const
bool legalizeCustom(LegalizerHelper &Helper, MachineInstr &MI, LostDebugLocObserver &LocObserver) const override
Called for instructions with the Custom LegalizationAction.
bool buildPCRelGlobalAddress(Register DstReg, LLT PtrTy, MachineIRBuilder &B, const GlobalValue *GV, int64_t Offset, unsigned GAFlags=SIInstrInfo::MO_NONE) const
MachinePointerInfo getKernargSegmentPtrInfo(MachineFunction &MF) const
bool legalizeFDIV16(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeRsqClampIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFExpUnsafeImpl(MachineIRBuilder &B, Register Dst, Register Src, unsigned Flags, bool IsExp10) const
std::pair< Register, Register > getScaledLogInput(MachineIRBuilder &B, Register Src, unsigned Flags) const
bool legalizeFDIVFastIntrin(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool loadInputValue(Register DstReg, MachineIRBuilder &B, AMDGPUFunctionArgInfo::PreloadedValue ArgType) const
bool legalizeBVHDualOrBVH8IntersectRayIntrinsic(MachineInstr &MI, MachineIRBuilder &B) const
bool legalizeInsertVectorElt(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFExpUnsafe(MachineIRBuilder &B, Register Dst, Register Src, unsigned Flags) const
bool legalizeFEXPF64(MachineInstr &MI, MachineIRBuilder &B) const
bool legalizeAddrSpaceCast(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeExtract(LegalizerHelper &Helper, MachineInstr &MI) const
bool legalizeBufferLoad(MachineInstr &MI, LegalizerHelper &Helper, bool IsFormat, bool IsTyped) const
bool legalizeImplicitArgPtr(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeMinNumMaxNum(LegalizerHelper &Helper, MachineInstr &MI) const
void legalizeUnsignedDIV_REM64Impl(MachineIRBuilder &B, Register DstDivReg, Register DstRemReg, Register Num, Register Den) const
bool legalizeDebugTrap(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFastUnsafeFDIV(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeSinCos(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeCTLS(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeWaveID(MachineInstr &MI, MachineIRBuilder &B) const
bool legalizeFroundeven(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeLDSKernelId(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeWorkGroupId(MachineInstr &MI, MachineIRBuilder &B, AMDGPUFunctionArgInfo::PreloadedValue ClusterIdPV, AMDGPUFunctionArgInfo::PreloadedValue ClusterMaxIdPV, AMDGPUFunctionArgInfo::PreloadedValue ClusterWorkGroupIdPV) const
bool legalizeSignedDIV_REM(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeITOFP(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B, bool Signed) const
bool legalizeFPow(MachineInstr &MI, MachineIRBuilder &B) const
bool legalizeFastUnsafeFDIV64(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFPTOI(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B, bool Signed) const
bool legalizeStackSave(MachineInstr &MI, MachineIRBuilder &B) const
bool legalizeFlogUnsafe(MachineIRBuilder &B, Register Dst, Register Src, bool IsLog10, unsigned Flags) const
bool legalizeKernargMemParameter(MachineInstr &MI, MachineIRBuilder &B, uint64_t Offset, Align Alignment=Align(4)) const
Legalize a value that's loaded from kernel arguments.
bool legalizeImageIntrinsic(MachineInstr &MI, MachineIRBuilder &B, GISelChangeObserver &Observer, const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr) const
Rewrite image intrinsics to use register layouts expected by the subtarget.
void buildAbsGlobalAddress(Register DstReg, LLT PtrTy, MachineIRBuilder &B, const GlobalValue *GV, MachineRegisterInfo &MRI) const
bool legalizeGetFPEnv(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool getImplicitArgPtr(Register DstReg, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFSQRT(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
Register getKernargParameterPtr(MachineIRBuilder &B, int64_t Offset) const
bool legalizeSBufferLoad(LegalizerHelper &Helper, MachineInstr &MI) const
bool legalizeFceil(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFSQRTF64(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeExtractVectorElt(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeLoad(LegalizerHelper &Helper, MachineInstr &MI) const
Register fixStoreSourceType(MachineIRBuilder &B, Register VData, LLT MemTy, bool IsFormat) const
bool legalizeLaneOp(LegalizerHelper &Helper, MachineInstr &MI, Intrinsic::ID IID) const
bool legalizeSetFPEnv(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeWorkitemIDIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B, unsigned Dim, AMDGPUFunctionArgInfo::PreloadedValue ArgType) const
void buildLoadInputValue(Register DstReg, MachineIRBuilder &B, const ArgDescriptor *Arg, const TargetRegisterClass *ArgRC, LLT ArgTy) const
bool legalizeTrapHsaQueuePtr(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFlog2(MachineInstr &MI, MachineIRBuilder &B) const
bool legalizeTrapEndpgm(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
static std::optional< uint32_t > getLDSKernelIdMetadata(const Function &F)
void setDynLDSAlign(const Function &F, const GlobalVariable &GV)
unsigned allocateLDSGlobal(const DataLayout &DL, const GlobalVariable &GV)
bool isNoopAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const override
Returns true if a cast between SrcAS and DestAS is a noop.
const std::array< unsigned, 3 > & getDims() const
static const fltSemantics & IEEEsingle()
Definition APFloat.h:296
static const fltSemantics & IEEEdouble()
Definition APFloat.h:297
static APFloat getSmallestNormalized(const fltSemantics &Sem, bool Negative=false)
Returns the smallest (by magnitude) normalized finite number in the given semantics.
Definition APFloat.h:1217
static APFloat getLargest(const fltSemantics &Sem, bool Negative=false)
Returns the largest finite number in the given semantics.
Definition APFloat.h:1197
static APFloat getInf(const fltSemantics &Sem, bool Negative=false)
Factory for Positive and Negative Infinity.
Definition APFloat.h:1157
Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:40
size_t size() const
Get the array size.
Definition ArrayRef.h:141
@ FCMP_OEQ
0 0 0 1 True if ordered and equal
Definition InstrTypes.h:743
@ ICMP_SLT
signed less than
Definition InstrTypes.h:769
@ FCMP_OLT
0 1 0 0 True if ordered and less than
Definition InstrTypes.h:746
@ FCMP_ULE
1 1 0 1 True if unordered, less than, or equal
Definition InstrTypes.h:755
@ FCMP_OGT
0 0 1 0 True if ordered and greater than
Definition InstrTypes.h:744
@ ICMP_UGE
unsigned greater or equal
Definition InstrTypes.h:764
@ ICMP_SGT
signed greater than
Definition InstrTypes.h:767
@ FCMP_ONE
0 1 1 0 True if ordered and operands are unequal
Definition InstrTypes.h:748
@ ICMP_ULT
unsigned less than
Definition InstrTypes.h:765
@ FCMP_OLE
0 1 0 1 True if ordered and less than or equal
Definition InstrTypes.h:747
@ FCMP_ORD
0 1 1 1 True if ordered (no nans)
Definition InstrTypes.h:749
@ ICMP_NE
not equal
Definition InstrTypes.h:762
@ FCMP_UGE
1 0 1 1 True if unordered, greater than, or equal
Definition InstrTypes.h:753
ConstantFP - Floating Point Values [float, double].
Definition Constants.h:420
LLVM_ABI bool isExactlyValue(const APFloat &V) const
We don't rely on operator== working on double values, as it returns true for things that are clearly ...
This is the shared class of boolean and integer constants.
Definition Constants.h:87
int64_t getSExtValue() const
Return the constant as a 64-bit integer value after it has been sign extended as appropriate for the ...
Definition Constants.h:174
A debug info location.
Definition DebugLoc.h:124
Diagnostic information for unsupported feature in backend.
static constexpr ElementCount getFixed(ScalarTy MinVal)
Definition TypeSize.h:309
LLVMContext & getContext() const
getContext - Return a reference to the LLVMContext associated with this function.
Definition Function.cpp:354
Abstract class that contains various methods for clients to notify about changes.
virtual void changingInstr(MachineInstr &MI)=0
This instruction is about to be mutated in some way.
virtual void changedInstr(MachineInstr &MI)=0
This instruction was mutated in some way.
Simple wrapper observer that takes several observers, and calls each one for each event.
KnownBits getKnownBits(Register R)
bool hasExternalLinkage() const
Module * getParent()
Get the module that this global value is contained inside of...
LLVM_ABI const DataLayout & getDataLayout() const
Get the data layout of the module this global belongs to.
Definition Globals.cpp:141
LLVM_ABI uint64_t getGlobalSize(const DataLayout &DL) const
Get the size of this global variable in bytes.
Definition Globals.cpp:569
LLT changeElementCount(ElementCount EC) const
Return a vector or scalar with the same element type and the new element count.
constexpr unsigned getScalarSizeInBits() const
constexpr bool isScalar() const
constexpr LLT changeElementType(LLT NewEltTy) const
If this type is a vector, return a vector with the same number of elements but the new element type.
static constexpr LLT vector(ElementCount EC, unsigned ScalarSizeInBits)
Get a low-level vector of some number of elements and element width.
LLT getScalarType() const
static constexpr LLT scalar(unsigned SizeInBits)
Get a low-level scalar or aggregate "bag of bits".
constexpr uint16_t getNumElements() const
Returns the number of elements in a vector LLT.
constexpr bool isVector() const
static constexpr LLT pointer(unsigned AddressSpace, unsigned SizeInBits)
Get a low-level pointer in the given address space.
constexpr TypeSize getSizeInBits() const
Returns the total size of the type. Must only be called on sized types.
constexpr bool isPointer() const
constexpr bool isAnyScalar() const
constexpr unsigned getAddressSpace() const
static constexpr LLT fixed_vector(unsigned NumElements, unsigned ScalarSizeInBits)
Get a low-level fixed-width vector of some number of elements and element width.
LLT getElementType() const
Returns the vector's element type. Only valid for vector types.
static constexpr LLT scalarOrVector(ElementCount EC, LLT ScalarTy)
LLT changeElementSize(unsigned NewEltSize) const
If this type is a vector, return a vector with the same number of elements but the new element size.
LLVM_ABI void diagnose(const DiagnosticInfo &DI)
Report a message to the currently installed diagnostic handler.
LegalizeRuleSet & minScalar(unsigned TypeIdx, const LLT Ty)
Ensure the scalar is at least as wide as Ty.
LegalizeRuleSet & legalFor(std::initializer_list< LLT > Types)
The instruction is legal when type index 0 is any type in the given list.
LegalizeRuleSet & unsupported()
The instruction is unsupported.
LegalizeRuleSet & scalarSameSizeAs(unsigned TypeIdx, unsigned SameSizeIdx)
Change the type TypeIdx to have the same scalar size as type SameSizeIdx.
LegalizeRuleSet & fewerElementsIf(LegalityPredicate Predicate, LegalizeMutation Mutation)
Remove elements to reach the type selected by the mutation if the predicate is true.
LegalizeRuleSet & clampScalarOrElt(unsigned TypeIdx, const LLT MinTy, const LLT MaxTy)
Limit the range of scalar sizes to MinTy and MaxTy.
LegalizeRuleSet & maxScalar(unsigned TypeIdx, const LLT Ty)
Ensure the scalar is at most as wide as Ty.
LegalizeRuleSet & minScalarOrElt(unsigned TypeIdx, const LLT Ty)
Ensure the scalar or element is at least as wide as Ty.
LegalizeRuleSet & clampMaxNumElements(unsigned TypeIdx, const LLT EltTy, unsigned MaxElements)
Limit the number of elements in EltTy vectors to at most MaxElements.
LegalizeRuleSet & unsupportedFor(std::initializer_list< LLT > Types)
LegalizeRuleSet & lower()
The instruction is lowered.
LegalizeRuleSet & moreElementsIf(LegalityPredicate Predicate, LegalizeMutation Mutation)
Add more elements to reach the type selected by the mutation if the predicate is true.
LegalizeRuleSet & lowerFor(std::initializer_list< LLT > Types)
The instruction is lowered when type index 0 is any type in the given list.
LegalizeRuleSet & clampScalar(unsigned TypeIdx, const LLT MinTy, const LLT MaxTy)
Limit the range of scalar sizes to MinTy and MaxTy.
LegalizeRuleSet & custom()
Unconditionally custom lower.
LegalizeRuleSet & clampMaxNumElementsStrict(unsigned TypeIdx, const LLT EltTy, unsigned NumElts)
Express EltTy vectors strictly using vectors with NumElts elements (or scalars when NumElts equals 1)...
LegalizeRuleSet & alwaysLegal()
LegalizeRuleSet & maxScalarIf(LegalityPredicate Predicate, unsigned TypeIdx, const LLT Ty)
Conditionally limit the maximum size of the scalar.
LegalizeRuleSet & customIf(LegalityPredicate Predicate)
LegalizeRuleSet & widenScalarToNextPow2(unsigned TypeIdx, unsigned MinSize=0)
Widen the scalar to the next power of two that is at least MinSize.
LegalizeRuleSet & scalarize(unsigned TypeIdx)
LegalizeRuleSet & legalForCartesianProduct(std::initializer_list< LLT > Types)
The instruction is legal when type indexes 0 and 1 are both in the given list.
LegalizeRuleSet & minScalarIf(LegalityPredicate Predicate, unsigned TypeIdx, const LLT Ty)
Ensure the scalar is at least as wide as Ty if condition is met.
LegalizeRuleSet & legalIf(LegalityPredicate Predicate)
The instruction is legal if predicate is true.
LegalizeRuleSet & customFor(std::initializer_list< LLT > Types)
LegalizeRuleSet & widenScalarToNextMultipleOf(unsigned TypeIdx, unsigned Size)
Widen the scalar to the next multiple of Size.
LLVM_ABI LegalizeResult lowerFMinNumMaxNum(MachineInstr &MI)
LLVM_ABI void moreElementsVectorDst(MachineInstr &MI, LLT MoreTy, unsigned OpIdx)
Legalize a single operand OpIdx of the machine instruction MI as a Def by performing it with addition...
LLVM_ABI LegalizeResult lowerInsert(MachineInstr &MI)
LLVM_ABI LegalizeResult lowerExtract(MachineInstr &MI)
GISelValueTracking * getValueTracking() const
@ Legalized
Instruction has been legalized and the MachineFunction changed.
GISelChangeObserver & Observer
To keep track of changes made by the LegalizerHelper.
LLVM_ABI void bitcastDst(MachineInstr &MI, LLT CastTy, unsigned OpIdx)
Legalize a single operand OpIdx of the machine instruction MI as a def by inserting a G_BITCAST from ...
LLVM_ABI LegalizeResult lowerFMad(MachineInstr &MI)
MachineIRBuilder & MIRBuilder
Expose MIRBuilder so clients can set their own RecordInsertInstruction functions.
LLVM_ABI void widenScalarDst(MachineInstr &MI, LLT WideTy, unsigned OpIdx=0, unsigned TruncOpcode=TargetOpcode::G_TRUNC)
Legalize a single operand OpIdx of the machine instruction MI as a Def by extending the operand's typ...
LegalizeRuleSet & getActionDefinitionsBuilder(unsigned Opcode)
Get the action definition builder for the given opcode.
TypeSize getValue() const
Wrapper class representing physical registers. Should be passed by value.
Definition MCRegister.h:41
constexpr bool isPhysical() const
Return true if the specified register number is in the physical register namespace.
Definition MCRegister.h:72
LLVM_ABI void addSuccessor(MachineBasicBlock *Succ, BranchProbability Prob=BranchProbability::getUnknown())
Add Succ as a successor of this MachineBasicBlock.
LLVM_ABI MachineBasicBlock * splitAt(MachineInstr &SplitInst, bool UpdateLiveIns=true, LiveIntervals *LIS=nullptr)
Split a basic block into 2 pieces at SplitPoint.
const MachineFunction * getParent() const
Return the MachineFunction containing this basic block.
MachineInstrBundleIterator< MachineInstr > iterator
PseudoSourceValueManager & getPSVManager() const
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
MachineMemOperand * getMachineMemOperand(MachinePointerInfo PtrInfo, MachineMemOperand::Flags f, LLT MemTy, Align base_alignment, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr, SyncScope::ID SSID=SyncScope::System, AtomicOrdering Ordering=AtomicOrdering::NotAtomic, AtomicOrdering FailureOrdering=AtomicOrdering::NotAtomic)
getMachineMemOperand - Allocate a new MachineMemOperand.
DenormalMode getDenormalMode(const fltSemantics &FPType) const
Returns the denormal handling type for the default rounding mode of the function.
void push_back(MachineBasicBlock *MBB)
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
Function & getFunction()
Return the LLVM function that this machine code represents.
BasicBlockListType::iterator iterator
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do...
MachineBasicBlock * CreateMachineBasicBlock(const BasicBlock *BB=nullptr, std::optional< UniqueBBID > BBID=std::nullopt)
CreateMachineBasicBlock - Allocate a new MachineBasicBlock.
const TargetMachine & getTarget() const
getTarget - Return the target machine this machine code is compiled with
Helper class to build MachineInstr.
MachineFunction & getMF()
Getter for the function we currently build.
Register getReg(unsigned Idx) const
Get the register for the operand index.
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
const MachineInstrBuilder & addGlobalAddress(const GlobalValue *GV, int64_t Offset=0, unsigned TargetFlags=0) const
const MachineInstrBuilder & addMBB(MachineBasicBlock *MBB, unsigned TargetFlags=0) const
Representation of each machine instruction.
const MachineOperand & getOperand(unsigned i) const
A description of a memory reference used in the backend.
LocationSize getSize() const
Return the size in bytes of the memory reference.
LLT getMemoryType() const
Return the memory type of the memory reference.
@ MODereferenceable
The memory access is dereferenceable (i.e., doesn't trap).
@ MOLoad
The memory access reads data.
@ MOInvariant
The memory access always returns the same value (or traps).
LLVM_ABI Align getAlign() const
Return the minimum known alignment in bytes of the actual memory reference.
MachineOperand class - Representation of each machine instruction operand.
MachineBasicBlock * getMBB() const
LLVM_ABI void setReg(Register Reg)
Change the register this operand corresponds to.
void setMBB(MachineBasicBlock *MBB)
static MachineOperand CreateImm(int64_t Val)
Register getReg() const
getReg - Returns the register number.
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
LLVM_ABI bool hasOneNonDBGUse(Register RegNo) const
hasOneNonDBGUse - Return true if there is exactly one non-Debug use of the specified register.
LLVM_ABI MachineInstr * getVRegDef(Register Reg) const
getVRegDef - Return the machine instr that defines the specified virtual register or null if none is ...
LLVM_ABI Register createVirtualRegister(const TargetRegisterClass *RegClass, StringRef Name="")
createVirtualRegister - Create and return a new virtual register in the function with the specified r...
LLT getType(Register Reg) const
Get the low-level type of Reg or LLT{} if Reg is not a generic (target independent) virtual register.
use_instr_nodbg_iterator use_instr_nodbg_begin(Register RegNo) const
LLVM_ABI void setRegClass(Register Reg, const TargetRegisterClass *RC)
setRegClass - Set the register class of the specified virtual register.
LLVM_ABI Register createGenericVirtualRegister(LLT Ty, StringRef Name="")
Create and return a new generic virtual register with low-level type Ty.
const TargetRegisterClass * getRegClassOrNull(Register Reg) const
Return the register class of Reg, or null if Reg has not been assigned a register class yet.
const TargetRegisterInfo * getTargetRegisterInfo() const
LLVM_ABI void replaceRegWith(Register FromReg, Register ToReg)
replaceRegWith - Replace all instances of FromReg with ToReg in the machine function.
Represent a mutable reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:294
MutableArrayRef< T > drop_front(size_t N=1) const
Drop the first N elements of the array.
Definition ArrayRef.h:383
LLVM_ABI const PseudoSourceValue * getConstantPool()
Return a pseudo source value referencing the constant pool.
Wrapper class representing virtual and physical registers.
Definition Register.h:20
constexpr bool isValid() const
Definition Register.h:112
constexpr bool isVirtual() const
Return true if the specified register number is in the virtual register namespace.
Definition Register.h:79
static unsigned getMaxMUBUFImmOffset(const GCNSubtarget &ST)
This class keeps track of the SPI_SP_INPUT_ADDR config register, which tells the hardware which inter...
AMDGPU::ClusterDimsAttr getClusterDims() const
SIModeRegisterDefaults getMode() const
std::tuple< const ArgDescriptor *, const TargetRegisterClass *, LLT > getPreloadedValue(AMDGPUFunctionArgInfo::PreloadedValue Value) const
static LLVM_READONLY const TargetRegisterClass * getSGPRClassForBitWidth(unsigned BitWidth)
bool allowsMisalignedMemoryAccessesImpl(unsigned Size, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *IsFast=nullptr) const
bool shouldEmitFixup(const GlobalValue *GV) const
bool shouldUseLDSConstAddress(const GlobalValue *GV) const
bool shouldEmitPCReloc(const GlobalValue *GV) const
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
void truncate(size_type N)
Like resize, but requires that N is less than size().
void resize(size_type N)
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
int64_t getImm() const
Register getReg() const
Register getStackPointerRegisterToSaveRestore() const
If a physical register, this specifies the register that llvm.savestack/llvm.restorestack should save...
unsigned getPointerSizeInBits(unsigned AS) const
Twine - A lightweight data structure for efficiently representing the concatenation of temporary valu...
Definition Twine.h:82
A Use represents the edge between a Value definition and its users.
Definition Use.h:35
LLVM_ABI StringRef getName() const
Return a constant reference to the value's name.
Definition Value.cpp:319
self_iterator getIterator()
Definition ilist_node.h:123
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
@ CONSTANT_ADDRESS_32BIT
Address space for 32-bit constant memory.
@ BUFFER_STRIDED_POINTER
Address space for 192-bit fat buffer pointers with an additional index.
@ REGION_ADDRESS
Address space for region memory. (GDS)
@ LOCAL_ADDRESS
Address space for local memory.
@ CONSTANT_ADDRESS
Address space for constant memory (VTX2).
@ FLAT_ADDRESS
Address space for flat memory.
@ GLOBAL_ADDRESS
Address space for global memory (RAT0, VTX0).
@ BUFFER_FAT_POINTER
Address space for 160-bit buffer fat pointers.
@ PRIVATE_ADDRESS
Address space for private memory.
@ BUFFER_RESOURCE
Address space for 128-bit buffer resources.
int getMIMGOpcode(unsigned BaseOpcode, unsigned MIMGEncoding, unsigned VDataDwords, unsigned VAddrDwords)
bool isFlatGlobalAddrSpace(unsigned AS)
bool isGFX12Plus(const MCSubtargetInfo &STI)
constexpr int64_t getNullPointerValue(unsigned AS)
Get the null pointer value for the given address space.
bool isGFX11(const MCSubtargetInfo &STI)
LLVM_READNONE bool isLegalDPALU_DPPControl(const MCSubtargetInfo &ST, unsigned DC)
unsigned getAMDHSACodeObjectVersion(const Module &M)
LLVM_READNONE constexpr bool isKernel(CallingConv::ID CC)
LLVM_READNONE constexpr bool isEntryFunctionCC(CallingConv::ID CC)
LLVM_READNONE constexpr bool isCompute(CallingConv::ID CC)
TargetExtType * isNamedBarrier(const GlobalVariable &GV)
bool isGFX11Plus(const MCSubtargetInfo &STI)
LLVM_READONLY const MIMGBaseOpcodeInfo * getMIMGBaseOpcodeInfo(unsigned BaseOpcode)
std::pair< Register, unsigned > getBaseWithConstantOffset(MachineRegisterInfo &MRI, Register Reg, GISelValueTracking *ValueTracking=nullptr, bool CheckNUW=false)
Returns base register and constant offset.
const ImageDimIntrinsicInfo * getImageDimIntrinsicInfo(unsigned Intr)
unsigned ID
LLVM IR allows arbitrary numbers to be used as calling convention identifiers.
Definition CallingConv.h:24
@ MaxID
The highest possible ID. Must be some 2^k - 1.
@ AMDGPU_Gfx
Used for AMD graphics targets.
@ Fast
Attempts to make calls as fast as possible (e.g.
Definition CallingConv.h:41
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
LLVM_ABI LegalityPredicate scalarOrEltWiderThan(unsigned TypeIdx, unsigned Size)
True iff the specified type index is a scalar or a vector with an element type that's wider than the ...
LLVM_ABI LegalityPredicate isScalar(unsigned TypeIdx)
True iff the specified type index is a scalar.
LLVM_ABI LegalityPredicate isPointer(unsigned TypeIdx)
True iff the specified type index is a pointer (with any address space).
LLVM_ABI LegalityPredicate typeInSet(unsigned TypeIdx, std::initializer_list< LLT > TypesInit)
True iff the given type index is one of the specified types.
LLVM_ABI LegalityPredicate smallerThan(unsigned TypeIdx0, unsigned TypeIdx1)
True iff the first type index has a smaller total bit size than second type index.
LLVM_ABI LegalityPredicate largerThan(unsigned TypeIdx0, unsigned TypeIdx1)
True iff the first type index has a larger total bit size than second type index.
LLVM_ABI LegalityPredicate elementTypeIs(unsigned TypeIdx, LLT EltTy)
True if the type index is a vector with element type EltTy.
LLVM_ABI LegalityPredicate sameSize(unsigned TypeIdx0, unsigned TypeIdx1)
True iff the specified type indices are both the same bit size.
LLVM_ABI LegalityPredicate scalarOrEltNarrowerThan(unsigned TypeIdx, unsigned Size)
True iff the specified type index is a scalar or vector with an element type that's narrower than the...
LegalityPredicate typeIsNot(unsigned TypeIdx, LLT Type)
True iff the given type index is not the specified type.
Predicate all(Predicate P0, Predicate P1)
True iff P0 and P1 are true.
LLVM_ABI LegalityPredicate typeIs(unsigned TypeIdx, LLT TypesInit)
True iff the given type index is the specified type.
LLVM_ABI LegalityPredicate scalarNarrowerThan(unsigned TypeIdx, unsigned Size)
True iff the specified type index is a scalar that's narrower than the given size.
LLVM_ABI LegalizeMutation scalarize(unsigned TypeIdx)
Break up the vector type for the given type index into the element type.
LLVM_ABI LegalizeMutation widenScalarOrEltToNextPow2(unsigned TypeIdx, unsigned Min=0)
Widen the scalar type or vector element type for the given type index to the next power of 2.
LLVM_ABI LegalizeMutation changeTo(unsigned TypeIdx, LLT Ty)
Select this specific type for the given type index.
Invariant opcodes: All instruction sets have these as their low opcodes.
initializer< Ty > init(const Ty &Val)
constexpr double inv_pi
constexpr double ln2
constexpr double ln10
constexpr float log2ef
Definition MathExtras.h:51
constexpr double log2e
This is an optimization pass for GlobalISel generic memory operations.
LLVM_ABI Register getFunctionLiveInPhysReg(MachineFunction &MF, const TargetInstrInfo &TII, MCRegister PhysReg, const TargetRegisterClass &RC, const DebugLoc &DL, LLT RegTy=LLT())
Return a virtual register corresponding to the incoming argument register PhysReg.
Definition Utils.cpp:861
unsigned Log2_32_Ceil(uint32_t Value)
Return the ceil log base 2 of the specified value, 32 if the value is zero.
Definition MathExtras.h:344
@ Offset
Definition DWP.cpp:558
LLVM_ABI Type * getTypeForLLT(LLT Ty, LLVMContext &C)
Get the type back from LLT.
Definition Utils.cpp:1987
LLVM_ABI MachineInstr * getOpcodeDef(unsigned Opcode, Register Reg, const MachineRegisterInfo &MRI)
See if Reg is defined by a single def instruction that is Opcode.
Definition Utils.cpp:656
LLVM_ABI const ConstantFP * getConstantFPVRegVal(Register VReg, const MachineRegisterInfo &MRI)
Definition Utils.cpp:464
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
constexpr bool isInt(int64_t x)
Checks if an integer fits into the given bit width.
Definition MathExtras.h:165
@ Implicit
Not emitted register (e.g. carry, or temporary result).
@ Undef
Value of the register doesn't matter.
LLVM_ABI const llvm::fltSemantics & getFltSemanticForLLT(LLT Ty)
Get the appropriate floating point arithmetic semantic based on the bit size of the given scalar LLT.
std::function< std::pair< unsigned, LLT >(const LegalityQuery &)> LegalizeMutation
int bit_width(T Value)
Returns the number of bits needed to represent Value if Value is nonzero.
Definition bit.h:325
void * PointerTy
constexpr bool isPowerOf2_64(uint64_t Value)
Return true if the argument is a power of two > 0 (64 bit edition.)
Definition MathExtras.h:284
constexpr int popcount(T Value) noexcept
Count the number of set bits in a value.
Definition bit.h:156
uint64_t PowerOf2Ceil(uint64_t A)
Returns the power of two which is greater than or equal to the given value.
Definition MathExtras.h:385
LLVM_ABI std::optional< int64_t > getIConstantVRegSExtVal(Register VReg, const MachineRegisterInfo &MRI)
If VReg is defined by a G_CONSTANT whose value fits in int64_t, returns it.
Definition Utils.cpp:317
int countr_zero(T Val)
Count the number of 0's from the least significant bit to the most significant, stopping at the first 1.
Definition bit.h:204
constexpr bool has_single_bit(T Value) noexcept
Definition bit.h:149
std::function< bool(const LegalityQuery &)> LegalityPredicate
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
Definition MathExtras.h:279
constexpr uint64_t alignTo(uint64_t Size, Align A)
Returns a multiple of A needed to store Size bytes.
Definition Alignment.h:144
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition Casting.h:547
MutableArrayRef(T &OneElt) -> MutableArrayRef< T >
constexpr T divideCeil(U Numerator, V Denominator)
Returns the integer ceil(Numerator / Denominator).
Definition MathExtras.h:394
To bit_cast(const From &from) noexcept
Definition bit.h:90
@ Mul
Product of integers.
@ FMul
Product of floats.
@ Sub
Subtraction of integers.
@ Add
Sum of integers.
FunctionAddr VTableAddr Next
Definition InstrProf.h:141
DWARFExpression::Operation Op
ArrayRef(const T &OneElt) -> ArrayRef< T >
constexpr unsigned BitWidth
LLVM_ABI void eraseInstr(MachineInstr &MI, MachineRegisterInfo &MRI, LostDebugLocObserver *LocObserver=nullptr)
Definition Utils.cpp:1685
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:559
LLVM_ABI std::optional< ValueAndVReg > getIConstantVRegValWithLookThrough(Register VReg, const MachineRegisterInfo &MRI, bool LookThroughInstrs=true)
If VReg is defined by a statically evaluable chain of instructions rooted on a G_CONSTANT returns its...
Definition Utils.cpp:436
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Definition STLExtras.h:1946
Align commonAlignment(Align A, uint64_t Offset)
Returns the alignment that satisfies both alignments.
Definition Alignment.h:201
unsigned Log2(Align A)
Returns the log2 of the alignment.
Definition Alignment.h:197
T bit_floor(T Value)
Returns the largest integral power of two no greater than Value if Value is nonzero.
Definition bit.h:347
constexpr uint64_t NextPowerOf2(uint64_t A)
Returns the next power of two (in 64-bits) that is strictly greater than A.
Definition MathExtras.h:373
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition BitVector.h:862
#define N
static constexpr uint64_t encode(Fields... Values)
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition Alignment.h:39
constexpr uint64_t value() const
This is a hole in the type system and should not be abused.
Definition Alignment.h:77
MCRegister getRegister() const
static ArgDescriptor createRegister(Register Reg, unsigned Mask=~0u)
DenormalModeKind Input
Denormal treatment kind for floating point instruction inputs in the default floating-point environme...
@ PreserveSign
The sign of a flushed-to-zero number is preserved in the sign of 0.
@ Dynamic
Denormals have unknown treatment.
static constexpr DenormalMode getPreserveSign()
static constexpr DenormalMode getIEEE()
bool isZero() const
Returns true if value is all zero.
Definition KnownBits.h:78
The LegalityQuery object bundles together all the information that's needed to decide whether a given...
ArrayRef< MemDesc > MMODescrs
Operations which require memory can use this to place requirements on the memory type for each MMO.
ArrayRef< LLT > Types
Matching combinators.
This class contains a discriminated union of information about pointers in memory operands,...
MachinePointerInfo getWithOffset(int64_t O) const
static LLVM_ABI MachinePointerInfo getGOT(MachineFunction &MF)
Return a MachinePointerInfo record that refers to a GOT entry.
DenormalMode FP64FP16Denormals
If this is set, neither input nor output denormals are flushed for both f64 and f16/v2f16 instructions...
bool IEEE
Floating point opcodes that support exception flag gathering quiet and propagate signaling NaN inputs...
DenormalMode FP32Denormals
If this is set, neither input nor output denormals are flushed for most f32 instructions.