LLVM 23.0.0git
AMDGPULegalizerInfo.cpp
Go to the documentation of this file.
1//===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8/// \file
9/// This file implements the targeting of the MachineLegalizer class for
10/// AMDGPU.
11/// \todo This should be generated by TableGen.
12//===----------------------------------------------------------------------===//
13
14#include "AMDGPULegalizerInfo.h"
15
16#include "AMDGPU.h"
18#include "AMDGPUInstrInfo.h"
19#include "AMDGPUMemoryUtils.h"
20#include "AMDGPUTargetMachine.h"
22#include "SIInstrInfo.h"
24#include "SIRegisterInfo.h"
26#include "llvm/ADT/ScopeExit.h"
37#include "llvm/IR/IntrinsicsAMDGPU.h"
38#include "llvm/IR/IntrinsicsR600.h"
39
40#define DEBUG_TYPE "amdgpu-legalinfo"
41
42using namespace llvm;
43using namespace LegalizeActions;
44using namespace LegalizeMutations;
45using namespace LegalityPredicates;
46using namespace MIPatternMatch;
47
48// Hack until load/store selection patterns support any tuple of legal types.
50 "amdgpu-global-isel-new-legality",
51 cl::desc("Use GlobalISel desired legality, rather than try to use"
52 "rules compatible with selection patterns"),
53 cl::init(false),
55
56static constexpr unsigned MaxRegisterSize = 1024;
57
58// Round the number of elements to the next power of two elements
60 unsigned NElts = Ty.getNumElements();
61 unsigned Pow2NElts = 1 << Log2_32_Ceil(NElts);
62 return Ty.changeElementCount(ElementCount::getFixed(Pow2NElts));
63}
64
65// Round the number of bits to the next power of two bits
67 unsigned Bits = Ty.getSizeInBits();
68 unsigned Pow2Bits = 1 << Log2_32_Ceil(Bits);
69 return LLT::scalar(Pow2Bits);
70}
71
72/// \returns true if this is an odd sized vector which should widen by adding an
73/// additional element. This is mostly to handle <3 x s16> -> <4 x s16>. This
74/// excludes s1 vectors, which should always be scalarized.
75static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
76 return [=](const LegalityQuery &Query) {
77 const LLT Ty = Query.Types[TypeIdx];
78 if (!Ty.isVector())
79 return false;
80
81 const LLT EltTy = Ty.getElementType();
82 const unsigned EltSize = EltTy.getSizeInBits();
83 return Ty.getNumElements() % 2 != 0 &&
84 EltSize > 1 && EltSize < 32 &&
85 Ty.getSizeInBits() % 32 != 0;
86 };
87}
88
89static LegalityPredicate sizeIsMultipleOf32(unsigned TypeIdx) {
90 return [=](const LegalityQuery &Query) {
91 const LLT Ty = Query.Types[TypeIdx];
92 return Ty.getSizeInBits() % 32 == 0;
93 };
94}
95
96static LegalityPredicate isWideVec16(unsigned TypeIdx) {
97 return [=](const LegalityQuery &Query) {
98 const LLT Ty = Query.Types[TypeIdx];
99 const LLT EltTy = Ty.getScalarType();
100 return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2;
101 };
102}
103
104static LegalizeMutation oneMoreElement(unsigned TypeIdx) {
105 return [=](const LegalityQuery &Query) {
106 const LLT Ty = Query.Types[TypeIdx];
107 const LLT EltTy = Ty.getElementType();
108 return std::pair(TypeIdx,
109 LLT::fixed_vector(Ty.getNumElements() + 1, EltTy));
110 };
111}
112
114 return [=](const LegalityQuery &Query) {
115 const LLT Ty = Query.Types[TypeIdx];
116 const LLT EltTy = Ty.getElementType();
117 unsigned Size = Ty.getSizeInBits();
118 unsigned Pieces = (Size + 63) / 64;
119 unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
120 return std::pair(TypeIdx, LLT::scalarOrVector(
121 ElementCount::getFixed(NewNumElts), EltTy));
122 };
123}
124
125// Increase the number of vector elements to reach the next multiple of 32-bit
126// type.
127static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) {
128 return [=](const LegalityQuery &Query) {
129 const LLT Ty = Query.Types[TypeIdx];
130
131 const LLT EltTy = Ty.getElementType();
132 const int Size = Ty.getSizeInBits();
133 const int EltSize = EltTy.getSizeInBits();
134 const int NextMul32 = (Size + 31) / 32;
135
136 assert(EltSize < 32);
137
138 const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize;
139 return std::pair(TypeIdx, LLT::fixed_vector(NewNumElts, EltTy));
140 };
141}
142
143// Retrieves the scalar type that's the same size as the mem desc
145 return [=](const LegalityQuery &Query) {
146 unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
147 return std::make_pair(TypeIdx, LLT::scalar(MemSize));
148 };
149}
150
151// Increase the number of vector elements to reach the next legal RegClass.
153 return [=](const LegalityQuery &Query) {
154 const LLT Ty = Query.Types[TypeIdx];
155 const unsigned NumElts = Ty.getNumElements();
156 const unsigned EltSize = Ty.getElementType().getSizeInBits();
157 const unsigned MaxNumElts = MaxRegisterSize / EltSize;
158
159 assert(EltSize == 32 || EltSize == 64);
160 assert(Ty.getSizeInBits() < MaxRegisterSize);
161
162 unsigned NewNumElts;
163 // Find the nearest legal RegClass that is larger than the current type.
164 for (NewNumElts = NumElts; NewNumElts < MaxNumElts; ++NewNumElts) {
165 if (SIRegisterInfo::getSGPRClassForBitWidth(NewNumElts * EltSize))
166 break;
167 }
168 return std::pair(TypeIdx,
169 LLT::fixed_vector(NewNumElts, Ty.getElementType()));
170 };
171}
172
174 if (!Ty.isVector())
175 return LLT::scalar(128);
176 const ElementCount NumElems = Ty.getElementCount();
177 return LLT::vector(NumElems, LLT::scalar(128));
178}
179
181 if (!Ty.isVector())
182 return LLT::fixed_vector(4, LLT::scalar(32));
183 const unsigned NumElems = Ty.getElementCount().getFixedValue();
184 return LLT::fixed_vector(NumElems * 4, LLT::scalar(32));
185}
186
188 const unsigned Size = Ty.getSizeInBits();
189
190 if (Size <= 32) {
191 // <2 x s8> -> s16
192 // <4 x s8> -> s32
193 return LLT::scalar(Size);
194 }
195
197}
198
199static LegalizeMutation bitcastToRegisterType(unsigned TypeIdx) {
200 return [=](const LegalityQuery &Query) {
201 const LLT Ty = Query.Types[TypeIdx];
202 return std::pair(TypeIdx, getBitcastRegisterType(Ty));
203 };
204}
205
207 return [=](const LegalityQuery &Query) {
208 const LLT Ty = Query.Types[TypeIdx];
209 unsigned Size = Ty.getSizeInBits();
210 assert(Size % 32 == 0);
211 return std::pair(
213 };
214}
215
216static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) {
217 return [=](const LegalityQuery &Query) {
218 const LLT QueryTy = Query.Types[TypeIdx];
219 return QueryTy.isVector() && QueryTy.getSizeInBits() < Size;
220 };
221}
222
223static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) {
224 return [=](const LegalityQuery &Query) {
225 const LLT QueryTy = Query.Types[TypeIdx];
226 return QueryTy.isVector() && QueryTy.getSizeInBits() > Size;
227 };
228}
229
230static LegalityPredicate numElementsNotEven(unsigned TypeIdx) {
231 return [=](const LegalityQuery &Query) {
232 const LLT QueryTy = Query.Types[TypeIdx];
233 return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0;
234 };
235}
236
237static bool isRegisterSize(const GCNSubtarget &ST, unsigned Size) {
238 return ((ST.useRealTrue16Insts() && Size == 16) || Size % 32 == 0) &&
240}
241
243 const int EltSize = EltTy.getSizeInBits();
244 return EltSize == 16 || EltSize % 32 == 0;
245}
246
247static bool isRegisterVectorType(LLT Ty) {
248 const int EltSize = Ty.getElementType().getSizeInBits();
249 return EltSize == 32 || EltSize == 64 ||
250 (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
251 EltSize == 128 || EltSize == 256;
252}
253
254// TODO: replace all uses of isRegisterType with isRegisterClassType
255static bool isRegisterType(const GCNSubtarget &ST, LLT Ty) {
256 if (!isRegisterSize(ST, Ty.getSizeInBits()))
257 return false;
258
259 if (Ty.isVector())
260 return isRegisterVectorType(Ty);
261
262 return true;
263}
264
265// Any combination of 32 or 64-bit elements up to the maximum register size, and
266// multiples of v2s16.
268 unsigned TypeIdx) {
269 return [=, &ST](const LegalityQuery &Query) {
270 return isRegisterType(ST, Query.Types[TypeIdx]);
271 };
272}
273
274// RegisterType that doesn't have a corresponding RegClass.
275// TODO: Once `isRegisterType` is replaced with `isRegisterClassType` this
276// should be removed.
278 unsigned TypeIdx) {
279 return [=, &ST](const LegalityQuery &Query) {
280 LLT Ty = Query.Types[TypeIdx];
281 return isRegisterType(ST, Ty) &&
282 !SIRegisterInfo::getSGPRClassForBitWidth(Ty.getSizeInBits());
283 };
284}
285
286static LegalityPredicate elementTypeIsLegal(unsigned TypeIdx) {
287 return [=](const LegalityQuery &Query) {
288 const LLT QueryTy = Query.Types[TypeIdx];
289 if (!QueryTy.isVector())
290 return false;
291 const LLT EltTy = QueryTy.getElementType();
292 return EltTy == LLT::scalar(16) || EltTy.getSizeInBits() >= 32;
293 };
294}
295
296constexpr LLT S1 = LLT::scalar(1);
297constexpr LLT S8 = LLT::scalar(8);
298constexpr LLT S16 = LLT::scalar(16);
299constexpr LLT S32 = LLT::scalar(32);
300constexpr LLT F32 = LLT::scalar(32); // TODO: Expected float32
301constexpr LLT S64 = LLT::scalar(64);
302constexpr LLT F64 = LLT::scalar(64); // TODO: Expected float64
303constexpr LLT S96 = LLT::scalar(96);
304constexpr LLT S128 = LLT::scalar(128);
305constexpr LLT S160 = LLT::scalar(160);
306constexpr LLT S192 = LLT::scalar(192);
307constexpr LLT S224 = LLT::scalar(224);
308constexpr LLT S256 = LLT::scalar(256);
309constexpr LLT S512 = LLT::scalar(512);
310constexpr LLT S1024 = LLT::scalar(1024);
312
313constexpr LLT V2S8 = LLT::fixed_vector(2, 8);
314constexpr LLT V2S16 = LLT::fixed_vector(2, 16);
315constexpr LLT V4S16 = LLT::fixed_vector(4, 16);
316constexpr LLT V6S16 = LLT::fixed_vector(6, 16);
317constexpr LLT V8S16 = LLT::fixed_vector(8, 16);
318constexpr LLT V10S16 = LLT::fixed_vector(10, 16);
319constexpr LLT V12S16 = LLT::fixed_vector(12, 16);
320constexpr LLT V16S16 = LLT::fixed_vector(16, 16);
321
322// TODO: Expected LLT::fixed_vector(2, LLT::float16())
324constexpr LLT V2BF16 = V2F16; // FIXME
325
326constexpr LLT V2S32 = LLT::fixed_vector(2, 32);
327constexpr LLT V3S32 = LLT::fixed_vector(3, 32);
328constexpr LLT V4S32 = LLT::fixed_vector(4, 32);
329constexpr LLT V5S32 = LLT::fixed_vector(5, 32);
330constexpr LLT V6S32 = LLT::fixed_vector(6, 32);
331constexpr LLT V7S32 = LLT::fixed_vector(7, 32);
332constexpr LLT V8S32 = LLT::fixed_vector(8, 32);
333constexpr LLT V9S32 = LLT::fixed_vector(9, 32);
334constexpr LLT V10S32 = LLT::fixed_vector(10, 32);
335constexpr LLT V11S32 = LLT::fixed_vector(11, 32);
336constexpr LLT V12S32 = LLT::fixed_vector(12, 32);
337constexpr LLT V16S32 = LLT::fixed_vector(16, 32);
338constexpr LLT V32S32 = LLT::fixed_vector(32, 32);
339
340constexpr LLT V2S64 = LLT::fixed_vector(2, 64);
341constexpr LLT V3S64 = LLT::fixed_vector(3, 64);
342constexpr LLT V4S64 = LLT::fixed_vector(4, 64);
343constexpr LLT V5S64 = LLT::fixed_vector(5, 64);
344constexpr LLT V6S64 = LLT::fixed_vector(6, 64);
345constexpr LLT V7S64 = LLT::fixed_vector(7, 64);
346constexpr LLT V8S64 = LLT::fixed_vector(8, 64);
347constexpr LLT V16S64 = LLT::fixed_vector(16, 64);
348
349constexpr LLT V2S128 = LLT::fixed_vector(2, 128);
350constexpr LLT V4S128 = LLT::fixed_vector(4, 128);
351
352constexpr std::initializer_list<LLT> AllScalarTypes = {
354
355constexpr std::initializer_list<LLT> AllS16Vectors{
357
358constexpr std::initializer_list<LLT> AllS32Vectors = {
361
362constexpr std::initializer_list<LLT> AllS64Vectors = {
364
370
371// Checks whether a type is in the list of legal register types.
372static bool isRegisterClassType(const GCNSubtarget &ST, LLT Ty) {
373 if (Ty.isPointerOrPointerVector())
374 Ty = Ty.changeElementType(LLT::scalar(Ty.getScalarSizeInBits()));
375
378 (ST.useRealTrue16Insts() && Ty == S16) ||
380}
381
383 unsigned TypeIdx) {
384 return [&ST, TypeIdx](const LegalityQuery &Query) {
385 return isRegisterClassType(ST, Query.Types[TypeIdx]);
386 };
387}
388
389// If we have a truncating store or an extending load with a data size larger
390// than 32-bits, we need to reduce to a 32-bit type.
392 return [=](const LegalityQuery &Query) {
393 const LLT Ty = Query.Types[TypeIdx];
394 return !Ty.isVector() && Ty.getSizeInBits() > 32 &&
395 Query.MMODescrs[0].MemoryTy.getSizeInBits() < Ty.getSizeInBits();
396 };
397}
398
399// If we have a truncating store or an extending load with a data size larger
400// than 32-bits and mem location is a power of 2
402 return [=](const LegalityQuery &Query) {
403 unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
404 return isWideScalarExtLoadTruncStore(TypeIdx)(Query) &&
405 isPowerOf2_64(MemSize);
406 };
407}
408
409// TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we
410// handle some operations by just promoting the register during
411// selection. There are also d16 loads on GFX9+ which preserve the high bits.
412static unsigned maxSizeForAddrSpace(const GCNSubtarget &ST, unsigned AS,
413 bool IsLoad, bool IsAtomic) {
414 switch (AS) {
416 // FIXME: Private element size.
417 return ST.hasFlatScratchEnabled() ? 128 : 32;
419 return ST.useDS128() ? 128 : 64;
424 // Treat constant and global as identical. SMRD loads are sometimes usable for
425 // global loads (ideally constant address space should be eliminated)
426 // depending on the context. Legality cannot be context dependent, but
427 // RegBankSelect can split the load as necessary depending on the pointer
428 // register bank/uniformity and if the memory is invariant or not written in a
429 // kernel.
430 return IsLoad ? 512 : 128;
431 default:
432 // FIXME: Flat addresses may contextually need to be split to 32-bit parts
433 // if they may alias scratch depending on the subtarget. This needs to be
434 // moved to custom handling to use addressMayBeAccessedAsPrivate
435 return ST.hasMultiDwordFlatScratchAddressing() || IsAtomic ? 128 : 32;
436 }
437}
438
439static bool isLoadStoreSizeLegal(const GCNSubtarget &ST,
440 const LegalityQuery &Query) {
441 const LLT Ty = Query.Types[0];
442
443 // Handle G_LOAD, G_ZEXTLOAD, G_SEXTLOAD
444 const bool IsLoad = Query.Opcode != AMDGPU::G_STORE;
445
446 unsigned RegSize = Ty.getSizeInBits();
447 uint64_t MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
448 uint64_t AlignBits = Query.MMODescrs[0].AlignInBits;
449 unsigned AS = Query.Types[1].getAddressSpace();
450
451 // All of these need to be custom lowered to cast the pointer operand.
453 return false;
454
455 // Do not handle extending vector loads.
456 if (Ty.isVector() && MemSize != RegSize)
457 return false;
458
459 // TODO: We should be able to widen loads if the alignment is high enough, but
460 // we also need to modify the memory access size.
461#if 0
462 // Accept widening loads based on alignment.
463 if (IsLoad && MemSize < Size)
464 MemSize = std::max(MemSize, Align);
465#endif
466
467 // Only 1-byte and 2-byte to 32-bit extloads are valid.
468 if (MemSize != RegSize && RegSize != 32)
469 return false;
470
471 if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad,
472 Query.MMODescrs[0].Ordering !=
474 return false;
475
476 switch (MemSize) {
477 case 8:
478 case 16:
479 case 32:
480 case 64:
481 case 128:
482 break;
483 case 96:
484 if (!ST.hasDwordx3LoadStores())
485 return false;
486 break;
487 case 256:
488 case 512:
489 // These may contextually need to be broken down.
490 break;
491 default:
492 return false;
493 }
494
495 assert(RegSize >= MemSize);
496
497 if (AlignBits < MemSize) {
498 const SITargetLowering *TLI = ST.getTargetLowering();
499 if (!TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS,
500 Align(AlignBits / 8)))
501 return false;
502 }
503
504 return true;
505}
506
507// The newer buffer intrinsic forms take their resource arguments as
508// pointers in address space 8, aka s128 values. However, in order to not break
509// SelectionDAG, the underlying operations have to continue to take v4i32
510// arguments. Therefore, we convert resource pointers - or vectors of them
511// to integer values here.
512static bool hasBufferRsrcWorkaround(const LLT Ty) {
513 if (Ty.isPointer() && Ty.getAddressSpace() == AMDGPUAS::BUFFER_RESOURCE)
514 return true;
515 if (Ty.isVector()) {
516 const LLT ElemTy = Ty.getElementType();
517 return hasBufferRsrcWorkaround(ElemTy);
518 }
519 return false;
520}
521
522// The current selector can't handle <6 x s16>, <8 x s16>, s96, s128 etc, so
523// workaround this. Eventually it should ignore the type for loads and only care
524// about the size. Return true in cases where we will workaround this for now by
525// bitcasting.
526static bool loadStoreBitcastWorkaround(const LLT Ty) {
528 return false;
529
530 const unsigned Size = Ty.getSizeInBits();
531 if (Ty.isPointerVector())
532 return true;
533 if (Size <= 64)
534 return false;
535 // Address space 8 pointers get their own workaround.
537 return false;
538 if (!Ty.isVector())
539 return true;
540
541 unsigned EltSize = Ty.getScalarSizeInBits();
542 return EltSize != 32 && EltSize != 64;
543}
544
545static bool isLoadStoreLegal(const GCNSubtarget &ST, const LegalityQuery &Query) {
546 const LLT Ty = Query.Types[0];
547 return isRegisterType(ST, Ty) && isLoadStoreSizeLegal(ST, Query) &&
549}
550
551/// Return true if a load or store of the type should be lowered with a bitcast
552/// to a different type.
553static bool shouldBitcastLoadStoreType(const GCNSubtarget &ST, const LLT Ty,
554 const LLT MemTy) {
555 const unsigned MemSizeInBits = MemTy.getSizeInBits();
556 const unsigned Size = Ty.getSizeInBits();
557 if (Size != MemSizeInBits)
558 return Size <= 32 && Ty.isVector();
559
561 return true;
562
563 // Don't try to handle bitcasting vector ext loads for now.
564 return Ty.isVector() && (!MemTy.isVector() || MemTy == Ty) &&
565 (Size <= 32 || isRegisterSize(ST, Size)) &&
566 !isRegisterVectorElementType(Ty.getElementType());
567}
568
569/// Return true if we should legalize a load by widening an odd sized memory
570/// access up to the alignment. Note this is the case where the memory access
571/// itself changes, not the size of the result register.
572static bool shouldWidenLoad(const GCNSubtarget &ST, LLT MemoryTy,
573 uint64_t AlignInBits, unsigned AddrSpace,
574 unsigned Opcode) {
575 unsigned SizeInBits = MemoryTy.getSizeInBits();
576 // We don't want to widen cases that are naturally legal.
577 if (isPowerOf2_32(SizeInBits))
578 return false;
579
580 // If we have 96-bit memory operations, we shouldn't touch them. Note we may
581 // end up widening these for a scalar load during RegBankSelect, if we don't
582 // have 96-bit scalar loads.
583 if (SizeInBits == 96 && ST.hasDwordx3LoadStores())
584 return false;
585
586 if (SizeInBits >= maxSizeForAddrSpace(ST, AddrSpace, Opcode, false))
587 return false;
588
589 // A load is known dereferenceable up to the alignment, so it's legal to widen
590 // to it.
591 //
592 // TODO: Could check dereferenceable for less aligned cases.
593 unsigned RoundedSize = NextPowerOf2(SizeInBits);
594 if (AlignInBits < RoundedSize)
595 return false;
596
597 // Do not widen if it would introduce a slow unaligned load.
598 const SITargetLowering *TLI = ST.getTargetLowering();
599 unsigned Fast = 0;
601 RoundedSize, AddrSpace, Align(AlignInBits / 8),
603 Fast;
604}
605
606static bool shouldWidenLoad(const GCNSubtarget &ST, const LegalityQuery &Query,
607 unsigned Opcode) {
608 if (Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic)
609 return false;
610
611 return shouldWidenLoad(ST, Query.MMODescrs[0].MemoryTy,
612 Query.MMODescrs[0].AlignInBits,
613 Query.Types[1].getAddressSpace(), Opcode);
614}
615
616/// Mutates IR (typically a load instruction) to use a <4 x s32> as the initial
617/// type of the operand `idx` and then to transform it to a `p8` via bitcasts
618/// and inttoptr. In addition, handle vectors of p8. Returns the new type.
620 MachineRegisterInfo &MRI, unsigned Idx) {
621 MachineOperand &MO = MI.getOperand(Idx);
622
623 const LLT PointerTy = MRI.getType(MO.getReg());
624
625 // Paranoidly prevent us from doing this multiple times.
627 return PointerTy;
628
629 const LLT ScalarTy = getBufferRsrcScalarType(PointerTy);
630 const LLT VectorTy = getBufferRsrcRegisterType(PointerTy);
631 if (!PointerTy.isVector()) {
632 // Happy path: (4 x s32) -> (s32, s32, s32, s32) -> (p8)
633 const unsigned NumParts = PointerTy.getSizeInBits() / 32;
634 const LLT S32 = LLT::scalar(32);
635
636 Register VectorReg = MRI.createGenericVirtualRegister(VectorTy);
637 std::array<Register, 4> VectorElems;
638 B.setInsertPt(B.getMBB(), ++B.getInsertPt());
639 for (unsigned I = 0; I < NumParts; ++I)
640 VectorElems[I] =
641 B.buildExtractVectorElementConstant(S32, VectorReg, I).getReg(0);
642 B.buildMergeValues(MO, VectorElems);
643 MO.setReg(VectorReg);
644 return VectorTy;
645 }
646 Register BitcastReg = MRI.createGenericVirtualRegister(VectorTy);
647 B.setInsertPt(B.getMBB(), ++B.getInsertPt());
648 auto Scalar = B.buildBitcast(ScalarTy, BitcastReg);
649 B.buildIntToPtr(MO, Scalar);
650 MO.setReg(BitcastReg);
651
652 return VectorTy;
653}
654
655/// Cast a buffer resource (an address space 8 pointer) into a 4xi32, which is
656/// the form in which the value must be in order to be passed to the low-level
657/// representations used for MUBUF/MTBUF intrinsics. This is a hack, which is
658/// needed in order to account for the fact that we can't define a register
659/// class for s128 without breaking SelectionDAG.
661 MachineRegisterInfo &MRI = *B.getMRI();
662 const LLT PointerTy = MRI.getType(Pointer);
663 const LLT ScalarTy = getBufferRsrcScalarType(PointerTy);
664 const LLT VectorTy = getBufferRsrcRegisterType(PointerTy);
665
666 if (!PointerTy.isVector()) {
667 // Special case: p8 -> (s32, s32, s32, s32) -> (4xs32)
668 SmallVector<Register, 4> PointerParts;
669 const unsigned NumParts = PointerTy.getSizeInBits() / 32;
670 auto Unmerged = B.buildUnmerge(LLT::scalar(32), Pointer);
671 for (unsigned I = 0; I < NumParts; ++I)
672 PointerParts.push_back(Unmerged.getReg(I));
673 return B.buildBuildVector(VectorTy, PointerParts).getReg(0);
674 }
675 Register Scalar = B.buildPtrToInt(ScalarTy, Pointer).getReg(0);
676 return B.buildBitcast(VectorTy, Scalar).getReg(0);
677}
678
680 unsigned Idx) {
681 MachineOperand &MO = MI.getOperand(Idx);
682
683 const LLT PointerTy = B.getMRI()->getType(MO.getReg());
684 // Paranoidly prevent us from doing this multiple times.
686 return;
688}
689
691 const GCNTargetMachine &TM)
692 : ST(ST_) {
693 using namespace TargetOpcode;
694
695 auto GetAddrSpacePtr = [&TM](unsigned AS) {
696 return LLT::pointer(AS, TM.getPointerSizeInBits(AS));
697 };
698
699 const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS);
700 const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS);
701 const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT);
702 const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS);
703 const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS);
704 const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS);
705 const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS);
706 const LLT BufferFatPtr = GetAddrSpacePtr(AMDGPUAS::BUFFER_FAT_POINTER);
707 const LLT RsrcPtr = GetAddrSpacePtr(AMDGPUAS::BUFFER_RESOURCE);
708 const LLT BufferStridedPtr =
709 GetAddrSpacePtr(AMDGPUAS::BUFFER_STRIDED_POINTER);
710
711 const LLT CodePtr = FlatPtr;
712
713 const std::initializer_list<LLT> AddrSpaces64 = {
714 GlobalPtr, ConstantPtr, FlatPtr
715 };
716
717 const std::initializer_list<LLT> AddrSpaces32 = {
718 LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr
719 };
720
721 const std::initializer_list<LLT> AddrSpaces128 = {RsrcPtr};
722
723 const std::initializer_list<LLT> FPTypesBase = {
724 S32, S64
725 };
726
727 const std::initializer_list<LLT> FPTypes16 = {
728 S32, S64, S16
729 };
730
731 const std::initializer_list<LLT> FPTypesPK16 = {
732 S32, S64, S16, V2S16
733 };
734
735 const LLT MinScalarFPTy = ST.has16BitInsts() ? S16 : S32;
736
738
739 // s1 for VCC branches, s32 for SCC branches.
741
742 // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more
743 // elements for v3s16
746 .legalFor(AllS32Vectors)
748 .legalFor(AddrSpaces64)
749 .legalFor(AddrSpaces32)
750 .legalFor(AddrSpaces128)
751 .legalIf(isPointer(0))
752 .clampScalar(0, S16, S256)
754 .clampMaxNumElements(0, S32, 16)
756 .scalarize(0);
757
758 if (ST.hasVOP3PInsts() && ST.hasAddNoCarryInsts() && ST.hasIntClamp()) {
759 // Full set of gfx9 features.
760 if (ST.hasScalarAddSub64()) {
761 getActionDefinitionsBuilder({G_ADD, G_SUB})
762 .legalFor({S64, S32, S16, V2S16})
763 .clampMaxNumElementsStrict(0, S16, 2)
764 .scalarize(0)
765 .minScalar(0, S16)
767 .maxScalar(0, S32);
768 } else {
769 getActionDefinitionsBuilder({G_ADD, G_SUB})
770 .legalFor({S32, S16, V2S16})
771 .clampMaxNumElementsStrict(0, S16, 2)
772 .scalarize(0)
773 .minScalar(0, S16)
775 .maxScalar(0, S32);
776 }
777
778 if (ST.hasScalarSMulU64()) {
780 .legalFor({S64, S32, S16, V2S16})
781 .clampMaxNumElementsStrict(0, S16, 2)
782 .scalarize(0)
783 .minScalar(0, S16)
785 .custom();
786 } else {
788 .legalFor({S32, S16, V2S16})
789 .clampMaxNumElementsStrict(0, S16, 2)
790 .scalarize(0)
791 .minScalar(0, S16)
793 .custom();
794 }
795 assert(ST.hasMad64_32());
796
797 getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT, G_SADDSAT, G_SSUBSAT})
798 .legalFor({S32, S16, V2S16}) // Clamp modifier
799 .minScalarOrElt(0, S16)
801 .scalarize(0)
803 .lower();
804 } else if (ST.has16BitInsts()) {
805 getActionDefinitionsBuilder({G_ADD, G_SUB})
806 .legalFor({S32, S16})
807 .minScalar(0, S16)
809 .maxScalar(0, S32)
810 .scalarize(0);
811
813 .legalFor({S32, S16})
814 .scalarize(0)
815 .minScalar(0, S16)
817 .custom();
818 assert(ST.hasMad64_32());
819
820 // Technically the saturating operations require clamp bit support, but this
821 // was introduced at the same time as 16-bit operations.
822 getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
823 .legalFor({S32, S16}) // Clamp modifier
824 .minScalar(0, S16)
825 .scalarize(0)
827 .lower();
828
829 // We're just lowering this, but it helps get a better result to try to
830 // coerce to the desired type first.
831 getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
832 .minScalar(0, S16)
833 .scalarize(0)
834 .lower();
835 } else {
836 getActionDefinitionsBuilder({G_ADD, G_SUB})
837 .legalFor({S32})
838 .widenScalarToNextMultipleOf(0, 32)
839 .clampScalar(0, S32, S32)
840 .scalarize(0);
841
842 auto &Mul = getActionDefinitionsBuilder(G_MUL)
843 .legalFor({S32})
844 .scalarize(0)
845 .minScalar(0, S32)
847
848 if (ST.hasMad64_32())
849 Mul.custom();
850 else
851 Mul.maxScalar(0, S32);
852
853 if (ST.hasIntClamp()) {
854 getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
855 .legalFor({S32}) // Clamp modifier.
856 .scalarize(0)
858 .lower();
859 } else {
860 // Clamp bit support was added in VI, along with 16-bit operations.
861 getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
862 .minScalar(0, S32)
863 .scalarize(0)
864 .lower();
865 }
866
867 // FIXME: DAG expansion gets better results. The widening uses the smaller
868 // range values and goes for the min/max lowering directly.
869 getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
870 .minScalar(0, S32)
871 .scalarize(0)
872 .lower();
873 }
874
876 {G_SDIV, G_UDIV, G_SREM, G_UREM, G_SDIVREM, G_UDIVREM})
877 .customFor({S32, S64})
878 .clampScalar(0, S32, S64)
880 .scalarize(0);
881
882 auto &Mulh = getActionDefinitionsBuilder({G_UMULH, G_SMULH})
883 .legalFor({S32})
884 .maxScalar(0, S32);
885
886 if (ST.hasVOP3PInsts()) {
887 Mulh
888 .clampMaxNumElements(0, S8, 2)
889 .lowerFor({V2S8});
890 }
891
892 Mulh
893 .scalarize(0)
894 .lower();
895
896 // Report legal for any types we can handle anywhere. For the cases only legal
897 // on the SALU, RegBankSelect will be able to re-legalize.
898 getActionDefinitionsBuilder({G_AND, G_OR, G_XOR})
899 .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16})
900 .clampScalar(0, S32, S64)
906 .scalarize(0);
907
909 {G_UADDO, G_USUBO, G_UADDE, G_SADDE, G_USUBE, G_SSUBE})
910 .legalFor({{S32, S1}, {S32, S32}})
911 .clampScalar(0, S32, S32)
912 .scalarize(0);
913
915 // Don't worry about the size constraint.
917 .lower();
918
920 .legalFor({S1, S32, S64, S16, GlobalPtr,
921 LocalPtr, ConstantPtr, PrivatePtr, FlatPtr })
922 .legalIf(isPointer(0))
923 .clampScalar(0, S32, S64)
925
926 getActionDefinitionsBuilder(G_FCONSTANT)
927 .legalFor({S32, S64, S16})
928 .clampScalar(0, S16, S64);
929
930 getActionDefinitionsBuilder({G_IMPLICIT_DEF, G_FREEZE})
931 .legalIf(isRegisterClassType(ST, 0))
932 // s1 and s16 are special cases because they have legal operations on
933 // them, but don't really occupy registers in the normal way.
934 .legalFor({S1, S16})
935 .clampNumElements(0, V16S32, V32S32)
939 .clampMaxNumElements(0, S32, 16);
940
941 getActionDefinitionsBuilder(G_FRAME_INDEX).legalFor({PrivatePtr});
942
943 // If the amount is divergent, we have to do a wave reduction to get the
944 // maximum value, so this is expanded during RegBankSelect.
945 getActionDefinitionsBuilder(G_DYN_STACKALLOC)
946 .legalFor({{PrivatePtr, S32}});
947
948 getActionDefinitionsBuilder(G_STACKSAVE)
949 .customFor({PrivatePtr});
950 getActionDefinitionsBuilder(G_STACKRESTORE)
951 .legalFor({PrivatePtr});
952
953 getActionDefinitionsBuilder({G_GET_FPENV, G_SET_FPENV}).customFor({S64});
954
955 getActionDefinitionsBuilder(G_GLOBAL_VALUE)
956 .customIf(typeIsNot(0, PrivatePtr));
957
958 getActionDefinitionsBuilder(G_BLOCK_ADDR).legalFor({CodePtr});
959
960 auto &FPOpActions = getActionDefinitionsBuilder(
961 { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE,
962 G_STRICT_FADD, G_STRICT_FMUL, G_STRICT_FMA})
963 .legalFor({S32, S64});
964 auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS})
965 .customFor({S32, S64});
966 auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV)
967 .customFor({S32, S64});
968
969 if (ST.has16BitInsts()) {
970 if (ST.hasVOP3PInsts())
971 FPOpActions.legalFor({S16, V2S16});
972 else
973 FPOpActions.legalFor({S16});
974
975 TrigActions.customFor({S16});
976 FDIVActions.customFor({S16});
977 }
978
979 if (ST.hasPackedFP32Ops()) {
980 FPOpActions.legalFor({V2S32});
981 FPOpActions.clampMaxNumElementsStrict(0, S32, 2);
982 }
983
984 if (ST.hasPackedFP64Ops()) {
985 FPOpActions.legalFor({V2S64});
986 FPOpActions.clampMaxNumElementsStrict(0, S64, 2);
987 }
988
989 auto &MinNumMaxNumIeee =
990 getActionDefinitionsBuilder({G_FMINNUM_IEEE, G_FMAXNUM_IEEE});
991
992 if (ST.hasVOP3PInsts()) {
993 MinNumMaxNumIeee.legalFor(FPTypesPK16)
994 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
995 .clampMaxNumElements(0, S16, 2)
996 .clampScalar(0, S16, S64)
997 .scalarize(0);
998 } else if (ST.has16BitInsts()) {
999 MinNumMaxNumIeee.legalFor(FPTypes16).clampScalar(0, S16, S64).scalarize(0);
1000 } else {
1001 MinNumMaxNumIeee.legalFor(FPTypesBase)
1002 .clampScalar(0, S32, S64)
1003 .scalarize(0);
1004 }
1005
1006 auto &MinNumMaxNum = getActionDefinitionsBuilder(
1007 {G_FMINNUM, G_FMAXNUM, G_FMINIMUMNUM, G_FMAXIMUMNUM});
1008
1009 if (ST.hasVOP3PInsts()) {
1010 MinNumMaxNum.customFor(FPTypesPK16)
1011 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
1012 .clampMaxNumElements(0, S16, 2)
1013 .clampScalar(0, S16, S64)
1014 .scalarize(0);
1015 } else if (ST.has16BitInsts()) {
1016 MinNumMaxNum.customFor(FPTypes16)
1017 .clampScalar(0, S16, S64)
1018 .scalarize(0);
1019 } else {
1020 MinNumMaxNum.customFor(FPTypesBase)
1021 .clampScalar(0, S32, S64)
1022 .scalarize(0);
1023 }
1024
1025 if (ST.hasVOP3PInsts())
1026 FPOpActions.clampMaxNumElementsStrict(0, S16, 2);
1027
1028 FPOpActions
1029 .scalarize(0)
1030 .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
1031
1032 TrigActions
1033 .scalarize(0)
1034 .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
1035
1036 FDIVActions
1037 .scalarize(0)
1038 .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
1039
1040 auto &FNegAbs = getActionDefinitionsBuilder({G_FNEG, G_FABS});
1041 FNegAbs.legalFor(FPTypesPK16)
1042 .legalFor(ST.hasPackedFP32Ops(), {V2S32})
1044 if (ST.hasPackedFP32Ops())
1045 FNegAbs.clampMaxNumElementsStrict(0, S32, 2);
1046 FNegAbs.scalarize(0).clampScalar(0, S16, S64);
1047
1048 if (ST.has16BitInsts()) {
1050 .legalFor({S16})
1051 .customFor({S32, S64})
1052 .scalarize(0)
1053 .unsupported();
1055 .legalFor({S32, S64, S16})
1056 .scalarize(0)
1057 .clampScalar(0, S16, S64);
1058
1059 getActionDefinitionsBuilder({G_FLDEXP, G_STRICT_FLDEXP})
1060 .legalFor({{S32, S32}, {S64, S32}, {S16, S16}})
1061 .scalarize(0)
1062 .maxScalarIf(typeIs(0, S16), 1, S16)
1063 .clampScalar(1, S32, S32)
1064 .lower();
1065
1067 .customFor({{S32, S32}, {S64, S32}, {S16, S16}, {S16, S32}})
1068 .scalarize(0)
1069 .lower();
1070
1072 .lowerFor({S16, S32, S64})
1073 .scalarize(0)
1074 .lower();
1075 } else {
1077 .customFor({S32, S64, S16})
1078 .scalarize(0)
1079 .unsupported();
1080
1081
1082 if (ST.hasFractBug()) {
1084 .customFor({S64})
1085 .legalFor({S32, S64})
1086 .scalarize(0)
1087 .clampScalar(0, S32, S64);
1088 } else {
1090 .legalFor({S32, S64})
1091 .scalarize(0)
1092 .clampScalar(0, S32, S64);
1093 }
1094
1095 getActionDefinitionsBuilder({G_FLDEXP, G_STRICT_FLDEXP})
1096 .legalFor({{S32, S32}, {S64, S32}})
1097 .scalarize(0)
1098 .clampScalar(0, S32, S64)
1099 .clampScalar(1, S32, S32)
1100 .lower();
1101
1103 .customFor({{S32, S32}, {S64, S32}})
1104 .scalarize(0)
1105 .minScalar(0, S32)
1106 .clampScalar(1, S32, S32)
1107 .lower();
1108
1110 .lowerFor({S32, S64})
1111 .scalarize(0)
1112 .lower();
1113 }
1114
1115 auto &FPTruncActions = getActionDefinitionsBuilder(G_FPTRUNC);
1116 if (ST.hasCvtPkF16F32Inst()) {
1117 FPTruncActions.legalFor({{S32, S64}, {S16, S32}, {V2S16, V2S32}})
1118 .clampMaxNumElements(0, S16, 2);
1119 } else {
1120 FPTruncActions.legalFor({{S32, S64}, {S16, S32}});
1121 }
1122 FPTruncActions.scalarize(0).lower();
1123
1125 .legalFor({{S64, S32}, {S32, S16}})
1126 .narrowScalarFor({{S64, S16}}, changeTo(0, S32))
1127 .scalarize(0);
1128
1129 auto &FSubActions = getActionDefinitionsBuilder({G_FSUB, G_STRICT_FSUB});
1130 if (ST.has16BitInsts()) {
1131 FSubActions
1132 // Use actual fsub instruction
1133 .legalFor({S32, S16})
1134 // Must use fadd + fneg
1135 .lowerFor({S64, V2S16});
1136 } else {
1137 FSubActions
1138 // Use actual fsub instruction
1139 .legalFor({S32})
1140 // Must use fadd + fneg
1141 .lowerFor({S64, S16, V2S16});
1142 }
1143
1144 if (ST.hasPackedFP32Ops())
1145 FSubActions.lowerFor({V2S32}).clampMaxNumElements(0, S32, 2);
1146
1147 FSubActions
1148 .clampMaxNumElements(0, S16, 2)
1149 .scalarize(0)
1150 .clampScalar(0, S32, S64);
1151
1152 // Whether this is legal depends on the floating point mode for the function.
1153 auto &FMad = getActionDefinitionsBuilder(G_FMAD);
1154 if (ST.hasMadF16() && ST.hasMadMacF32Insts())
1155 FMad.customFor({S32, S16});
1156 else if (ST.hasMadMacF32Insts())
1157 FMad.customFor({S32});
1158 else if (ST.hasMadF16())
1159 FMad.customFor({S16});
1160 FMad.scalarize(0)
1161 .lower();
1162
1163 auto &FRem = getActionDefinitionsBuilder(G_FREM);
1164 if (ST.has16BitInsts()) {
1165 FRem.customFor({S16, S32, S64});
1166 } else {
1167 FRem.minScalar(0, S32)
1168 .customFor({S32, S64});
1169 }
1170 FRem.scalarize(0);
1171
1172 // TODO: Do we need to clamp maximum bitwidth?
1174 .legalIf(isScalar(0))
1175 .legalFor({{V2S16, V2S32}})
1176 .clampMaxNumElements(0, S16, 2)
1177 // Avoid scalarizing in cases that should be truly illegal. In unresolvable
1178 // situations (like an invalid implicit use), we don't want to infinite loop
1179 // in the legalizer.
1181 .alwaysLegal();
1182
1183 getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT})
1184 .legalFor({{S64, S32}, {S32, S16}, {S64, S16},
1185 {S32, S1}, {S64, S1}, {S16, S1}})
1186 .scalarize(0)
1187 .clampScalar(0, S32, S64)
1188 .widenScalarToNextPow2(1, 32);
1189
1190 // TODO: Split s1->s64 during regbankselect for VALU.
1191 auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
1192 .legalFor({{S32, S32}, {S64, S32}, {S16, S32}})
1193 .lowerIf(typeIs(1, S1))
1194 .customFor({{S32, S64}, {S64, S64}});
1195 if (ST.has16BitInsts())
1196 IToFP.legalFor({{S16, S16}});
1197 IToFP.clampScalar(1, S32, S64)
1198 .minScalar(0, S32)
1199 .scalarize(0)
1201
1202 auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
1203 .legalFor({{S32, S32}, {S32, S64}, {S32, S16}})
1204 .customFor({{S64, S32}, {S64, S64}})
1205 .narrowScalarFor({{S64, S16}}, changeTo(0, S32));
1206 if (ST.has16BitInsts())
1207 FPToI.legalFor({{S16, S16}});
1208 else
1209 FPToI.minScalar(1, S32);
1210
1211 FPToI.minScalar(0, S32)
1212 .widenScalarToNextPow2(0, 32)
1213 .scalarize(0)
1214 .lower();
1215
1216 // clang-format off
1217 auto &FPToISat = getActionDefinitionsBuilder({G_FPTOSI_SAT, G_FPTOUI_SAT})
1218 .legalFor({{S32, S32}, {S32, S64}})
1219 .legalFor(ST.has16BitInsts(),{{S16, S16}})
1220 .narrowScalarFor({{S64, S16}}, changeTo(0, S32));
1221
1222 // If available, widen width <16 to i16, instead of i32 so v_cvt_i16/u16_f16 can be used.
1223 if (ST.has16BitInsts())
1224 FPToISat.minScalarIf(typeIs(1, S16), 0, S16);
1225
1226 FPToISat.minScalar(1, S32);
1227 FPToISat.minScalar(0, S32)
1228 .widenScalarToNextPow2(0, 32)
1229 .scalarize(0)
1230 .lower();
1231 // clang-format on
1232
1233 getActionDefinitionsBuilder({G_LROUND, G_LLROUND})
1234 .clampScalar(0, S16, S64)
1235 .scalarize(0)
1236 .lower();
1237
1238 getActionDefinitionsBuilder(G_INTRINSIC_FPTRUNC_ROUND)
1239 .legalFor({S16, S32})
1240 .scalarize(0)
1241 .lower();
1242
1243 // Lower G_FNEARBYINT and G_FRINT into G_INTRINSIC_ROUNDEVEN
1244 getActionDefinitionsBuilder({G_INTRINSIC_ROUND, G_FRINT, G_FNEARBYINT})
1245 .scalarize(0)
1246 .lower();
1247
1248 getActionDefinitionsBuilder({G_INTRINSIC_LRINT, G_INTRINSIC_LLRINT})
1249 .clampScalar(0, S16, S64)
1250 .scalarize(0)
1251 .lower();
1252
1253 if (ST.has16BitInsts()) {
1254 getActionDefinitionsBuilder(
1255 {G_INTRINSIC_TRUNC, G_FCEIL, G_INTRINSIC_ROUNDEVEN})
1256 .legalFor({S16, S32, S64})
1257 .clampScalar(0, S16, S64)
1258 .scalarize(0);
1259 } else if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
1260 getActionDefinitionsBuilder(
1261 {G_INTRINSIC_TRUNC, G_FCEIL, G_INTRINSIC_ROUNDEVEN})
1262 .legalFor({S32, S64})
1263 .clampScalar(0, S32, S64)
1264 .scalarize(0);
1265 } else {
1266 getActionDefinitionsBuilder(
1267 {G_INTRINSIC_TRUNC, G_FCEIL, G_INTRINSIC_ROUNDEVEN})
1268 .legalFor({S32})
1269 .customFor({S64})
1270 .clampScalar(0, S32, S64)
1271 .scalarize(0);
1272 }
1273
1274 getActionDefinitionsBuilder(G_PTR_ADD)
1275 .unsupportedFor({BufferFatPtr, BufferStridedPtr, RsrcPtr})
1276 .legalIf(all(isPointer(0), sameSize(0, 1)))
1277 .scalarize(0)
1278 .scalarSameSizeAs(1, 0);
1279
1280 getActionDefinitionsBuilder(G_PTRMASK)
1281 .legalIf(all(sameSize(0, 1), typeInSet(1, {S64, S32})))
1282 .scalarSameSizeAs(1, 0)
1283 .scalarize(0);
1284
1285 auto &CmpBuilder =
1286 getActionDefinitionsBuilder(G_ICMP)
1287 // The compare output type differs based on the register bank of the output,
1288 // so make both s1 and s32 legal.
1289 //
1290 // Scalar compares producing output in scc will be promoted to s32, as that
1291 // is the allocatable register type that will be needed for the copy from
1292 // scc. This will be promoted during RegBankSelect, and we assume something
1293 // before that won't try to use s32 result types.
1294 //
1295 // Vector compares producing an output in vcc/SGPR will use s1 in VCC reg
1296 // bank.
1298 {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
1299 .legalForCartesianProduct(
1300 {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr});
1301 if (ST.has16BitInsts()) {
1302 CmpBuilder.legalFor({{S1, S16}});
1303 }
1304
1305 CmpBuilder
1307 .clampScalar(1, S32, S64)
1308 .scalarize(0)
1309 .legalIf(all(typeInSet(0, {S1, S32}), isPointer(1)));
1310
1311 auto &FCmpBuilder =
1312 getActionDefinitionsBuilder(G_FCMP).legalForCartesianProduct(
1313 {S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase);
1314
1315 if (ST.hasSALUFloatInsts())
1316 FCmpBuilder.legalForCartesianProduct({S32}, {S16, S32});
1317
1318 FCmpBuilder
1320 .clampScalar(1, S32, S64)
1321 .scalarize(0);
1322
1323 // FIXME: fpow has a selection pattern that should move to custom lowering.
1324 auto &ExpOps = getActionDefinitionsBuilder(G_FPOW);
1325 if (ST.has16BitInsts())
1326 ExpOps.customFor({{S32}, {S16}});
1327 else
1328 ExpOps.customFor({S32});
1329 ExpOps.clampScalar(0, MinScalarFPTy, S32)
1330 .scalarize(0);
1331
1332 getActionDefinitionsBuilder(G_FPOWI)
1333 .clampScalar(0, MinScalarFPTy, S32)
1334 .lower();
1335
1336 getActionDefinitionsBuilder(G_FLOG2)
1337 .legalFor(ST.has16BitInsts(), {S16})
1338 .customFor({S32, S16})
1339 .scalarize(0)
1340 .lower();
1341
1342 getActionDefinitionsBuilder(G_FEXP2)
1343 .legalFor(ST.has16BitInsts(), {S16})
1344 .customFor({S32, S64, S16})
1345 .scalarize(0)
1346 .lower();
1347
1348 auto &LogOps =
1349 getActionDefinitionsBuilder({G_FLOG, G_FLOG10, G_FEXP, G_FEXP10});
1350 LogOps.customFor({S32, S16, S64});
1351 LogOps.clampScalar(0, MinScalarFPTy, S32)
1352 .scalarize(0);
1353
1354 // The 64-bit versions produce 32-bit results, but only on the SALU.
1355 getActionDefinitionsBuilder(G_CTPOP)
1356 .legalFor({{S32, S32}, {S32, S64}})
1357 .clampScalar(0, S32, S32)
1358 .widenScalarToNextPow2(1, 32)
1359 .clampScalar(1, S32, S64)
1360 .scalarize(0)
1361 .widenScalarToNextPow2(0, 32);
1362
1363 // If no 16 bit instr is available, lower into different instructions.
1364 if (ST.has16BitInsts())
1365 getActionDefinitionsBuilder(G_IS_FPCLASS)
1366 .legalForCartesianProduct({S1}, FPTypes16)
1367 .widenScalarToNextPow2(1)
1368 .scalarize(0)
1369 .lower();
1370 else
1371 getActionDefinitionsBuilder(G_IS_FPCLASS)
1372 .legalForCartesianProduct({S1}, FPTypesBase)
1373 .lowerFor({S1, S16})
1374 .widenScalarToNextPow2(1)
1375 .scalarize(0)
1376 .lower();
1377
1378 // The hardware instructions return a different result on 0 than the generic
1379 // instructions expect. The hardware produces -1, but these produce the
1380 // bitwidth.
1381 getActionDefinitionsBuilder({G_CTLZ, G_CTTZ})
1382 .scalarize(0)
1383 .clampScalar(0, S32, S32)
1384 .clampScalar(1, S32, S64)
1385 .widenScalarToNextPow2(0, 32)
1386 .widenScalarToNextPow2(1, 32)
1387 .custom();
1388
1389 // The 64-bit versions produce 32-bit results, but only on the SALU.
1390 getActionDefinitionsBuilder(G_CTLZ_ZERO_POISON)
1391 .legalFor({{S32, S32}, {S32, S64}})
1392 .customIf(scalarNarrowerThan(1, 32))
1393 .clampScalar(0, S32, S32)
1394 .clampScalar(1, S32, S64)
1395 .scalarize(0)
1396 .widenScalarToNextPow2(0, 32)
1397 .widenScalarToNextPow2(1, 32);
1398
1399 getActionDefinitionsBuilder(G_CTTZ_ZERO_POISON)
1400 .legalFor({{S32, S32}, {S32, S64}})
1401 .clampScalar(0, S32, S32)
1402 .clampScalar(1, S32, S64)
1403 .scalarize(0)
1404 .widenScalarToNextPow2(0, 32)
1405 .widenScalarToNextPow2(1, 32);
1406
1407 getActionDefinitionsBuilder(G_CTLS)
1408 .customFor({{S32, S32}})
1409 .scalarize(0)
1410 .clampScalar(0, S32, S32)
1411 .clampScalar(1, S32, S32);
1412
1413 // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
1414 // RegBankSelect.
1415 getActionDefinitionsBuilder(G_BITREVERSE)
1416 .legalFor({S32, S64})
1417 .clampScalar(0, S32, S64)
1418 .scalarize(0)
1419 .widenScalarToNextPow2(0);
1420
1421 if (ST.has16BitInsts()) {
1422 getActionDefinitionsBuilder(G_BSWAP)
1423 .legalFor({S16, S32, V2S16})
1424 .clampMaxNumElementsStrict(0, S16, 2)
1425 // FIXME: Fixing non-power-of-2 before clamp is workaround for
1426 // narrowScalar limitation.
1427 .widenScalarToNextPow2(0)
1428 .clampScalar(0, S16, S32)
1429 .scalarize(0);
1430
1431 if (ST.hasVOP3PInsts()) {
1432 getActionDefinitionsBuilder(G_ABS)
1433 .legalFor({S32, S16, V2S16})
1434 .clampMaxNumElements(0, S16, 2)
1435 .minScalar(0, S16)
1436 .widenScalarToNextPow2(0)
1437 .scalarize(0)
1438 .lower();
1439 if (ST.hasMinMaxI64Insts()) {
1440 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
1441 .legalFor({S32, S16, S64, V2S16})
1442 .clampMaxNumElements(0, S16, 2)
1443 .minScalar(0, S16)
1444 .widenScalarToNextPow2(0)
1445 .scalarize(0)
1446 .lower();
1447 } else {
1448 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
1449 .legalFor({S32, S16, V2S16})
1450 .clampMaxNumElements(0, S16, 2)
1451 .minScalar(0, S16)
1452 .widenScalarToNextPow2(0)
1453 .scalarize(0)
1454 .lower();
1455 }
1456 } else {
1457 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
1458 .legalFor({S32, S16})
1459 .widenScalarToNextPow2(0)
1460 .minScalar(0, S16)
1461 .scalarize(0)
1462 .lower();
1463 }
1464 } else {
1465 // TODO: Should have same legality without v_perm_b32
1466 getActionDefinitionsBuilder(G_BSWAP)
1467 .legalFor({S32})
1468 .lowerIf(scalarNarrowerThan(0, 32))
1469 // FIXME: Fixing non-power-of-2 before clamp is workaround for
1470 // narrowScalar limitation.
1471 .widenScalarToNextPow2(0)
1472 .maxScalar(0, S32)
1473 .scalarize(0)
1474 .lower();
1475
1476 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
1477 .legalFor({S32})
1478 .minScalar(0, S32)
1479 .widenScalarToNextPow2(0)
1480 .scalarize(0)
1481 .lower();
1482 }
1483
1484 getActionDefinitionsBuilder(G_INTTOPTR)
1485 // List the common cases
1486 .legalForCartesianProduct(AddrSpaces64, {S64})
1487 .legalForCartesianProduct(AddrSpaces32, {S32})
1488 .scalarize(0)
1489 // Accept any address space as long as the size matches
1490 .legalIf(sameSize(0, 1))
1491 .widenScalarIf(smallerThan(1, 0),
1492 [](const LegalityQuery &Query) {
1493 return std::pair(
1494 1, LLT::scalar(Query.Types[0].getSizeInBits()));
1495 })
1496 .narrowScalarIf(largerThan(1, 0), [](const LegalityQuery &Query) {
1497 return std::pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
1498 });
1499
1500 getActionDefinitionsBuilder(G_PTRTOINT)
1501 // List the common cases
1502 .legalForCartesianProduct(AddrSpaces64, {S64})
1503 .legalForCartesianProduct(AddrSpaces32, {S32})
1504 .scalarize(0)
1505 // Accept any address space as long as the size matches
1506 .legalIf(sameSize(0, 1))
1507 .widenScalarIf(smallerThan(0, 1),
1508 [](const LegalityQuery &Query) {
1509 return std::pair(
1510 0, LLT::scalar(Query.Types[1].getSizeInBits()));
1511 })
1512 .narrowScalarIf(largerThan(0, 1), [](const LegalityQuery &Query) {
1513 return std::pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
1514 });
1515
1516 getActionDefinitionsBuilder(G_ADDRSPACE_CAST)
1517 .scalarize(0)
1518 .custom();
1519
1520 const auto needToSplitMemOp = [=](const LegalityQuery &Query,
1521 bool IsLoad) -> bool {
1522 const LLT DstTy = Query.Types[0];
1523
1524 // Split vector extloads.
1525 unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
1526
1527 if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize)
1528 return true;
1529
1530 const LLT PtrTy = Query.Types[1];
1531 unsigned AS = PtrTy.getAddressSpace();
1532 if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad,
1533 Query.MMODescrs[0].Ordering !=
1535 return true;
1536
1537 // Catch weird sized loads that don't evenly divide into the access sizes
1538 // TODO: May be able to widen depending on alignment etc.
1539 unsigned NumRegs = (MemSize + 31) / 32;
1540 if (NumRegs == 3) {
1541 if (!ST.hasDwordx3LoadStores())
1542 return true;
1543 } else {
1544 // If the alignment allows, these should have been widened.
1545 if (!isPowerOf2_32(NumRegs))
1546 return true;
1547 }
1548
1549 return false;
1550 };
1551
1552 unsigned GlobalAlign32 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 32;
1553 unsigned GlobalAlign16 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 16;
1554 unsigned GlobalAlign8 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 8;
1555
1556 // TODO: Refine based on subtargets which support unaligned access or 128-bit
1557 // LDS
1558 // TODO: Unsupported flat for SI.
1559
1560 for (unsigned Op : {G_LOAD, G_STORE}) {
1561 const bool IsStore = Op == G_STORE;
1562
1563 auto &Actions = getActionDefinitionsBuilder(Op);
1564 // Explicitly list some common cases.
1565 // TODO: Does this help compile time at all?
1566 Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, S32, GlobalAlign32},
1567 {V2S32, GlobalPtr, V2S32, GlobalAlign32},
1568 {V4S32, GlobalPtr, V4S32, GlobalAlign32},
1569 {S64, GlobalPtr, S64, GlobalAlign32},
1570 {V2S64, GlobalPtr, V2S64, GlobalAlign32},
1571 {V2S16, GlobalPtr, V2S16, GlobalAlign32},
1572 {S32, GlobalPtr, S8, GlobalAlign8},
1573 {S32, GlobalPtr, S16, GlobalAlign16},
1574
1575 {S32, LocalPtr, S32, 32},
1576 {S64, LocalPtr, S64, 32},
1577 {V2S32, LocalPtr, V2S32, 32},
1578 {S32, LocalPtr, S8, 8},
1579 {S32, LocalPtr, S16, 16},
1580 {V2S16, LocalPtr, S32, 32},
1581
1582 {S32, PrivatePtr, S32, 32},
1583 {S32, PrivatePtr, S8, 8},
1584 {S32, PrivatePtr, S16, 16},
1585 {V2S16, PrivatePtr, S32, 32},
1586
1587 {S32, ConstantPtr, S32, GlobalAlign32},
1588 {V2S32, ConstantPtr, V2S32, GlobalAlign32},
1589 {V4S32, ConstantPtr, V4S32, GlobalAlign32},
1590 {S64, ConstantPtr, S64, GlobalAlign32},
1591 {V2S32, ConstantPtr, V2S32, GlobalAlign32}});
1592
1593 Actions.legalForTypesWithMemDesc(ST.useRealTrue16Insts(), /* Pred */
1594 {{S16, GlobalPtr, S8, GlobalAlign8},
1595 {S16, GlobalPtr, S16, GlobalAlign16},
1596 {S16, LocalPtr, S8, 8},
1597 {S16, LocalPtr, S16, 16},
1598 {S16, PrivatePtr, S8, 8},
1599 {S16, PrivatePtr, S16, 16}});
1600
1601 Actions.legalIf(
1602 [=](const LegalityQuery &Query) -> bool {
1603 return isLoadStoreLegal(ST, Query);
1604 });
1605
1606 // The custom pointers (fat pointers, buffer resources) don't work with load
1607 // and store at this level. Fat pointers should have been lowered to
1608 // intrinsics before the translation to MIR.
1609 Actions.unsupportedIf(
1610 typeInSet(1, {BufferFatPtr, BufferStridedPtr, RsrcPtr}));
1611
1612 // Address space 8 pointers are handled by a 4xs32 load, bitcast, and
1613 // ptrtoint. This is needed to account for the fact that we can't have i128
1614 // as a register class for SelectionDAG reasons.
1615 Actions.customIf([=](const LegalityQuery &Query) -> bool {
1616 return hasBufferRsrcWorkaround(Query.Types[0]);
1617 });
1618
1619 // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to
1620 // 64-bits.
1621 //
1622 // TODO: Should generalize bitcast action into coerce, which will also cover
1623 // inserting addrspacecasts.
1624 Actions.customIf(typeIs(1, Constant32Ptr));
1625
1626 // Turn any illegal element vectors into something easier to deal
1627 // with. These will ultimately produce 32-bit scalar shifts to extract the
1628 // parts anyway.
1629 //
1630 // For odd 16-bit element vectors, prefer to split those into pieces with
1631 // 16-bit vector parts.
1632 Actions.bitcastIf(
1633 [=](const LegalityQuery &Query) -> bool {
1634 return shouldBitcastLoadStoreType(ST, Query.Types[0],
1635 Query.MMODescrs[0].MemoryTy);
1636 }, bitcastToRegisterType(0));
1637
1638 if (!IsStore) {
1639 // Widen suitably aligned loads by loading extra bytes. The standard
1640 // legalization actions can't properly express widening memory operands.
1641 Actions.customIf([=](const LegalityQuery &Query) -> bool {
1642 return shouldWidenLoad(ST, Query, G_LOAD);
1643 });
1644 }
1645
1646 // FIXME: load/store narrowing should be moved to lower action
1647 Actions
1648 .narrowScalarIf(
1649 [=](const LegalityQuery &Query) -> bool {
1650 return !Query.Types[0].isVector() &&
1651 needToSplitMemOp(Query, Op == G_LOAD);
1652 },
1653 [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
1654 const LLT DstTy = Query.Types[0];
1655 const LLT PtrTy = Query.Types[1];
1656
1657 const unsigned DstSize = DstTy.getSizeInBits();
1658 unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
1659
1660 // Split extloads.
1661 if (DstSize > MemSize)
1662 return std::pair(0, LLT::scalar(MemSize));
1663
1664 unsigned MaxSize = maxSizeForAddrSpace(
1665 ST, PtrTy.getAddressSpace(), Op == G_LOAD,
1666 Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic);
1667 if (MemSize > MaxSize)
1668 return std::pair(0, LLT::scalar(MaxSize));
1669
1670 uint64_t Align = Query.MMODescrs[0].AlignInBits;
1671 return std::pair(0, LLT::scalar(Align));
1672 })
1673 .fewerElementsIf(
1674 [=](const LegalityQuery &Query) -> bool {
1675 return Query.Types[0].isVector() &&
1676 needToSplitMemOp(Query, Op == G_LOAD);
1677 },
1678 [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
1679 const LLT DstTy = Query.Types[0];
1680 const LLT PtrTy = Query.Types[1];
1681
1682 LLT EltTy = DstTy.getElementType();
1683 unsigned MaxSize = maxSizeForAddrSpace(
1684 ST, PtrTy.getAddressSpace(), Op == G_LOAD,
1685 Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic);
1686
1687 // FIXME: Handle widened to power of 2 results better. This ends
1688 // up scalarizing.
1689 // FIXME: 3 element stores scalarized on SI
1690
1691 // Split if it's too large for the address space.
1692 unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
1693 if (MemSize > MaxSize) {
1694 unsigned NumElts = DstTy.getNumElements();
1695 unsigned EltSize = EltTy.getSizeInBits();
1696
1697 if (MaxSize % EltSize == 0) {
1698 return std::pair(
1700 ElementCount::getFixed(MaxSize / EltSize), EltTy));
1701 }
1702
1703 unsigned NumPieces = MemSize / MaxSize;
1704
1705 // FIXME: Refine when odd breakdowns handled
1706 // The scalars will need to be re-legalized.
1707 if (NumPieces == 1 || NumPieces >= NumElts ||
1708 NumElts % NumPieces != 0)
1709 return std::pair(0, EltTy);
1710
1711 return std::pair(0,
1712 LLT::fixed_vector(NumElts / NumPieces, EltTy));
1713 }
1714
1715 // FIXME: We could probably handle weird extending loads better.
1716 if (DstTy.getSizeInBits() > MemSize)
1717 return std::pair(0, EltTy);
1718
1719 unsigned EltSize = EltTy.getSizeInBits();
1720 unsigned DstSize = DstTy.getSizeInBits();
1721 if (!isPowerOf2_32(DstSize)) {
1722 // We're probably decomposing an odd sized store. Try to split
1723 // to the widest type. TODO: Account for alignment. As-is it
1724 // should be OK, since the new parts will be further legalized.
1725 unsigned FloorSize = llvm::bit_floor(DstSize);
1726 return std::pair(
1728 ElementCount::getFixed(FloorSize / EltSize), EltTy));
1729 }
1730
1731 // May need relegalization for the scalars.
1732 return std::pair(0, EltTy);
1733 })
1734 .minScalar(0, S32)
1735 .narrowScalarIf(isTruncStoreToSizePowerOf2(0),
1737 .widenScalarToNextPow2(0)
1738 .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0))
1739 .lower();
1740 }
1741
1742 // FIXME: Unaligned accesses not lowered.
1743 auto &ExtLoads =
1744 getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD})
1745 .legalForTypesWithMemDesc({{S32, GlobalPtr, S8, 8},
1746 {S32, GlobalPtr, S16, 2 * 8},
1747 {S32, LocalPtr, S8, 8},
1748 {S32, LocalPtr, S16, 16},
1749 {S32, PrivatePtr, S8, 8},
1750 {S32, PrivatePtr, S16, 16},
1751 {S32, ConstantPtr, S8, 8},
1752 {S32, ConstantPtr, S16, 2 * 8}})
1753 .legalForTypesWithMemDesc(ST.useRealTrue16Insts(),
1754 {{S16, GlobalPtr, S8, GlobalAlign8},
1755 {S16, LocalPtr, S8, GlobalAlign8},
1756 {S16, PrivatePtr, S8, GlobalAlign8},
1757 {S16, ConstantPtr, S8, GlobalAlign8}})
1758 .legalIf([=](const LegalityQuery &Query) -> bool {
1759 return isLoadStoreLegal(ST, Query);
1760 });
1761
1762 if (ST.hasFlatAddressSpace()) {
1763 ExtLoads.legalForTypesWithMemDesc(
1764 {{S32, FlatPtr, S8, 8}, {S32, FlatPtr, S16, 16}});
1765
1766 ExtLoads.legalForTypesWithMemDesc(ST.useRealTrue16Insts(),
1767 {{S16, FlatPtr, S8, GlobalAlign8}});
1768 }
1769
1770 // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to
1771 // 64-bits.
1772 //
1773 // TODO: Should generalize bitcast action into coerce, which will also cover
1774 // inserting addrspacecasts.
1775 ExtLoads.customIf(typeIs(1, Constant32Ptr));
1776
1777 ExtLoads.narrowScalarIf(
1778 [](const LegalityQuery &Query) {
1779 LLT MemTy = Query.MMODescrs[0].MemoryTy;
1780 return MemTy.isAnyScalar() && MemTy.getSizeInBits() > 32 &&
1781 Query.Types[0].getSizeInBits() > MemTy.getSizeInBits();
1782 }, // For large MemSize, narrowscalar to MemSize (load MemSize + ext)
1784 ExtLoads.clampScalar(0, S32, S32)
1785 .widenScalarToNextPow2(0)
1786 .lower();
1787
1788 auto &Atomics = getActionDefinitionsBuilder(
1789 {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB,
1790 G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
1791 G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
1792 G_ATOMICRMW_UMIN, G_ATOMICRMW_UINC_WRAP, G_ATOMICRMW_UDEC_WRAP})
1793 .legalFor({{S32, GlobalPtr}, {S32, LocalPtr},
1794 {S64, GlobalPtr}, {S64, LocalPtr},
1795 {S32, RegionPtr}, {S64, RegionPtr}});
1796 if (ST.hasFlatAddressSpace()) {
1797 Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}});
1798 }
1799
1800 auto &Atomics32 =
1801 getActionDefinitionsBuilder({G_ATOMICRMW_USUB_COND, G_ATOMICRMW_USUB_SAT})
1802 .legalFor({{S32, GlobalPtr}, {S32, LocalPtr}, {S32, RegionPtr}});
1803 if (ST.hasFlatAddressSpace()) {
1804 Atomics32.legalFor({{S32, FlatPtr}});
1805 }
1806
1807 // TODO: v2bf16 operations, and fat buffer pointer support.
1808 auto &Atomic = getActionDefinitionsBuilder(G_ATOMICRMW_FADD);
1809 if (ST.hasLDSFPAtomicAddF32()) {
1810 Atomic.legalFor({{S32, LocalPtr}, {S32, RegionPtr}});
1811 if (ST.hasLdsAtomicAddF64())
1812 Atomic.legalFor({{S64, LocalPtr}});
1813 if (ST.hasAtomicDsPkAdd16Insts())
1814 Atomic.legalFor({{V2F16, LocalPtr}, {V2BF16, LocalPtr}});
1815 }
1816 if (ST.hasAtomicFaddInsts())
1817 Atomic.legalFor({{S32, GlobalPtr}});
1818 if (ST.hasFlatAtomicFaddF32Inst())
1819 Atomic.legalFor({{S32, FlatPtr}});
1820
1821 if (ST.hasGFX90AInsts() || ST.hasGFX1250Insts()) {
1822 // These are legal with some caveats, and should have undergone expansion in
1823 // the IR in most situations
1824 // TODO: Move atomic expansion into legalizer
1825 Atomic.legalFor({
1826 {S32, GlobalPtr},
1827 {S64, GlobalPtr},
1828 {S64, FlatPtr}
1829 });
1830 }
1831
1832 if (ST.hasAtomicBufferGlobalPkAddF16NoRtnInsts() ||
1833 ST.hasAtomicBufferGlobalPkAddF16Insts())
1834 Atomic.legalFor({{V2F16, GlobalPtr}, {V2F16, BufferFatPtr}});
1835 if (ST.hasAtomicGlobalPkAddBF16Inst())
1836 Atomic.legalFor({{V2BF16, GlobalPtr}});
1837 if (ST.hasAtomicFlatPkAdd16Insts())
1838 Atomic.legalFor({{V2F16, FlatPtr}, {V2BF16, FlatPtr}});
1839
1840
1841 // Most of the legalization work here is done by AtomicExpand. We could
1842 // probably use a simpler legality rule that just assumes anything is OK.
1843 auto &AtomicFMinFMax =
1844 getActionDefinitionsBuilder({G_ATOMICRMW_FMIN, G_ATOMICRMW_FMAX})
1845 .legalFor({{F32, LocalPtr}, {F64, LocalPtr}});
1846
1847 if (ST.hasAtomicFMinFMaxF32GlobalInsts())
1848 AtomicFMinFMax.legalFor({{F32, GlobalPtr},{F32, BufferFatPtr}});
1849 if (ST.hasAtomicFMinFMaxF64GlobalInsts())
1850 AtomicFMinFMax.legalFor({{F64, GlobalPtr}, {F64, BufferFatPtr}});
1851 if (ST.hasAtomicFMinFMaxF32FlatInsts())
1852 AtomicFMinFMax.legalFor({F32, FlatPtr});
1853 if (ST.hasAtomicFMinFMaxF64FlatInsts())
1854 AtomicFMinFMax.legalFor({F64, FlatPtr});
1855
1856 // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling, and output
1857 // demarshalling
1858 getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG)
1859 .customFor({{S32, GlobalPtr}, {S64, GlobalPtr},
1860 {S32, FlatPtr}, {S64, FlatPtr}})
1861 .legalFor({{S32, LocalPtr}, {S64, LocalPtr},
1862 {S32, RegionPtr}, {S64, RegionPtr}});
1863 // TODO: Pointer types, any 32-bit or 64-bit vector
1864
1865 // Condition should be s32 for scalar, s1 for vector.
1866 getActionDefinitionsBuilder(G_SELECT)
1867 .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16, GlobalPtr,
1868 LocalPtr, FlatPtr, PrivatePtr,
1869 LLT::fixed_vector(2, LocalPtr),
1870 LLT::fixed_vector(2, PrivatePtr)},
1871 {S1, S32})
1872 .clampScalar(0, S16, S64)
1873 .scalarize(1)
1874 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
1875 .fewerElementsIf(numElementsNotEven(0), scalarize(0))
1876 .clampMaxNumElements(0, S32, 2)
1877 .clampMaxNumElements(0, LocalPtr, 2)
1878 .clampMaxNumElements(0, PrivatePtr, 2)
1879 .scalarize(0)
1880 .widenScalarToNextPow2(0)
1881 .legalIf(all(isPointer(0), typeInSet(1, {S1, S32})));
1882
1883 // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can
1884 // be more flexible with the shift amount type.
1885 auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR})
1886 .legalFor({{S32, S32}, {S64, S32}});
1887 if (ST.has16BitInsts()) {
1888 if (ST.hasVOP3PInsts()) {
1889 Shifts.legalFor({{S16, S16}, {V2S16, V2S16}})
1890 .clampMaxNumElements(0, S16, 2);
1891 } else
1892 Shifts.legalFor({{S16, S16}});
1893
1894 // TODO: Support 16-bit shift amounts for all types
1895 Shifts.widenScalarIf(
1896 [=](const LegalityQuery &Query) {
1897 // Use 16-bit shift amounts for any 16-bit shift. Otherwise we want a
1898 // 32-bit amount.
1899 const LLT ValTy = Query.Types[0];
1900 const LLT AmountTy = Query.Types[1];
1901 return ValTy.isScalar() && ValTy.getSizeInBits() <= 16 &&
1902 AmountTy.getSizeInBits() < 16;
1903 }, changeTo(1, S16));
1904 Shifts.maxScalarIf(typeIs(0, S16), 1, S16);
1905 Shifts.clampScalar(1, S32, S32);
1906 Shifts.widenScalarToNextPow2(0, 16);
1907 Shifts.clampScalar(0, S16, S64);
1908
1909 getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
1910 .minScalar(0, S16)
1911 .scalarize(0)
1912 .lower();
1913 } else {
1914 // Make sure we legalize the shift amount type first, as the general
1915 // expansion for the shifted type will produce much worse code if it hasn't
1916 // been truncated already.
1917 Shifts.clampScalar(1, S32, S32);
1918 Shifts.widenScalarToNextPow2(0, 32);
1919 Shifts.clampScalar(0, S32, S64);
1920
1921 getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
1922 .minScalar(0, S32)
1923 .scalarize(0)
1924 .lower();
1925 }
1926 Shifts.scalarize(0);
1927
1928 for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) {
1929 unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0;
1930 unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1;
1931 unsigned IdxTypeIdx = 2;
1932
1933 getActionDefinitionsBuilder(Op)
1934 .customIf([=](const LegalityQuery &Query) {
1935 const LLT EltTy = Query.Types[EltTypeIdx];
1936 const LLT VecTy = Query.Types[VecTypeIdx];
1937 const LLT IdxTy = Query.Types[IdxTypeIdx];
1938 const unsigned EltSize = EltTy.getSizeInBits();
1939 const bool isLegalVecType =
1941 // Address space 8 pointers are 128-bit wide values, but the logic
1942 // below will try to bitcast them to 2N x s64, which will fail.
1943 // Therefore, as an intermediate step, wrap extracts/insertions by
1944 // ptrtoint-ing the vector and scalar arguments (or inttoptr-ing the
1945 // extraction result) in order to produce a vector operation that can
1946 // be handled by the logic below.
1947 if (EltTy.isPointer() && EltSize > 64)
1948 return true;
1949 return (EltSize == 32 || EltSize == 64) &&
1950 VecTy.getSizeInBits() % 32 == 0 &&
1951 VecTy.getSizeInBits() <= MaxRegisterSize &&
1952 IdxTy.getSizeInBits() == 32 &&
1953 isLegalVecType;
1954 })
1955 .bitcastIf(all(sizeIsMultipleOf32(VecTypeIdx),
1956 scalarOrEltNarrowerThan(VecTypeIdx, 32)),
1957 bitcastToVectorElement32(VecTypeIdx))
1958 //.bitcastIf(vectorSmallerThan(1, 32), bitcastToScalar(1))
1959 .bitcastIf(all(sizeIsMultipleOf32(VecTypeIdx),
1960 scalarOrEltWiderThan(VecTypeIdx, 64)),
1961 [=](const LegalityQuery &Query) {
1962 // For > 64-bit element types, try to turn this into a
1963 // 64-bit element vector since we may be able to do better
1964 // indexing if this is scalar. If not, fall back to 32.
1965 const LLT EltTy = Query.Types[EltTypeIdx];
1966 const LLT VecTy = Query.Types[VecTypeIdx];
1967 const unsigned DstEltSize = EltTy.getSizeInBits();
1968 const unsigned VecSize = VecTy.getSizeInBits();
1969
1970 const unsigned TargetEltSize =
1971 DstEltSize % 64 == 0 ? 64 : 32;
1972 return std::pair(VecTypeIdx,
1973 LLT::fixed_vector(VecSize / TargetEltSize,
1974 TargetEltSize));
1975 })
1976 .clampScalar(EltTypeIdx, S32, S64)
1977 .clampScalar(VecTypeIdx, S32, S64)
1978 .clampScalar(IdxTypeIdx, S32, S32)
1979 .clampMaxNumElements(VecTypeIdx, S32, 32)
1980 // TODO: Clamp elements for 64-bit vectors?
1981 .moreElementsIf(isIllegalRegisterType(ST, VecTypeIdx),
1983 // It should only be necessary with variable indexes.
1984 // As a last resort, lower to the stack
1985 .lower();
1986 }
1987
1988 getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
1989 .unsupportedIf([=](const LegalityQuery &Query) {
1990 const LLT &EltTy = Query.Types[1].getElementType();
1991 return Query.Types[0] != EltTy;
1992 });
1993
1994 for (unsigned Op : {G_EXTRACT, G_INSERT}) {
1995 unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0;
1996 unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1;
1997 getActionDefinitionsBuilder(Op)
1998 .widenScalarIf(
1999 [=](const LegalityQuery &Query) {
2000 const LLT BigTy = Query.Types[BigTyIdx];
2001 return (BigTy.getScalarSizeInBits() < 16);
2002 },
2004 .widenScalarIf(
2005 [=](const LegalityQuery &Query) {
2006 const LLT LitTy = Query.Types[LitTyIdx];
2007 return (LitTy.getScalarSizeInBits() < 16);
2008 },
2010 .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
2011 .widenScalarToNextPow2(BigTyIdx, 32)
2012 .customIf([=](const LegalityQuery &Query) {
2013 // Generic lower operates on the full-width value, producing
2014 // shift+trunc/mask sequences. For simple cases where extract/insert
2015 // values are 32-bit aligned, we can instead unmerge/merge and work on
2016 // the 32-bit components. However, we can't check the offset here so
2017 // custom lower function will have to call generic lowering if offset
2018 // is not 32-bit aligned.
2019 const LLT BigTy = Query.Types[BigTyIdx];
2020 const LLT LitTy = Query.Types[LitTyIdx];
2021 return !BigTy.isVector() && BigTy.getSizeInBits() % 32 == 0 &&
2022 LitTy.getSizeInBits() % 32 == 0;
2023 })
2024 .lower();
2025 }
2026
2027 auto &BuildVector =
2028 getActionDefinitionsBuilder(G_BUILD_VECTOR)
2029 .legalForCartesianProduct(AllS32Vectors, {S32})
2030 .legalForCartesianProduct(AllS64Vectors, {S64})
2031 .clampNumElements(0, V16S32, V32S32)
2032 .clampNumElements(0, V2S64, V16S64)
2033 .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16))
2034 .moreElementsIf(isIllegalRegisterType(ST, 0),
2036
2037 if (ST.hasScalarPackInsts()) {
2038 BuildVector
2039 // FIXME: Should probably widen s1 vectors straight to s32
2040 .minScalarOrElt(0, S16)
2041 .minScalar(1, S16);
2042
2043 getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
2044 .legalFor({V2S16, S32})
2045 .lower();
2046 } else {
2047 BuildVector.customFor({V2S16, S16});
2048 BuildVector.minScalarOrElt(0, S32);
2049
2050 getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
2051 .customFor({V2S16, S32})
2052 .lower();
2053 }
2054
2055 BuildVector.legalIf(isRegisterType(ST, 0));
2056
2057 // FIXME: Clamp maximum size
2058 getActionDefinitionsBuilder(G_CONCAT_VECTORS)
2059 .legalIf(all(isRegisterType(ST, 0), isRegisterType(ST, 1)))
2060 .clampMaxNumElements(0, S32, 32)
2061 .clampMaxNumElements(1, S16, 2) // TODO: Make 4?
2062 .clampMaxNumElements(0, S16, 64);
2063
2064 getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower();
2065
2066 // Merge/Unmerge
2067 for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
2068 unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
2069 unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;
2070
2071 auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
2072 const LLT Ty = Query.Types[TypeIdx];
2073 if (Ty.isVector()) {
2074 const LLT &EltTy = Ty.getElementType();
2075 if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 512)
2076 return true;
2078 return true;
2079 }
2080 return false;
2081 };
2082
2083 auto &Builder =
2084 getActionDefinitionsBuilder(Op)
2085 .legalIf(all(isRegisterType(ST, 0), isRegisterType(ST, 1)))
2086 .lowerFor({{S16, V2S16}})
2087 .lowerIf([=](const LegalityQuery &Query) {
2088 const LLT BigTy = Query.Types[BigTyIdx];
2089 return BigTy.getSizeInBits() == 32;
2090 })
2091 // Try to widen to s16 first for small types.
2092 // TODO: Only do this on targets with legal s16 shifts
2093 .minScalarOrEltIf(scalarNarrowerThan(LitTyIdx, 16), LitTyIdx, S16)
2094 .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16)
2095 .moreElementsIf(isSmallOddVector(BigTyIdx),
2096 oneMoreElement(BigTyIdx))
2097 .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32),
2098 elementTypeIs(1, S16)),
2099 changeTo(1, V2S16))
2100 // Clamp the little scalar to s32-s512 and make it a power of 2. It's
2101 // not worth considering the multiples of 64 since 2*192 and 2*384
2102 // are not valid.
2103 .clampScalar(LitTyIdx, S32, S512)
2104 .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32)
2105 // Break up vectors with weird elements into scalars
2106 .fewerElementsIf(
2107 [=](const LegalityQuery &Query) {
2108 return notValidElt(Query, LitTyIdx);
2109 },
2110 scalarize(0))
2111 .fewerElementsIf(
2112 [=](const LegalityQuery &Query) {
2113 return notValidElt(Query, BigTyIdx);
2114 },
2115 scalarize(1))
2116 .clampScalar(BigTyIdx, S32, MaxScalar);
2117
2118 if (Op == G_MERGE_VALUES) {
2119 Builder.widenScalarIf(
2120 // TODO: Use 16-bit shifts if legal for 8-bit values?
2121 [=](const LegalityQuery &Query) {
2122 const LLT Ty = Query.Types[LitTyIdx];
2123 return Ty.getSizeInBits() < 32;
2124 },
2125 changeTo(LitTyIdx, S32));
2126 }
2127
2128 Builder.widenScalarIf(
2129 [=](const LegalityQuery &Query) {
2130 const LLT Ty = Query.Types[BigTyIdx];
2131 return Ty.getSizeInBits() % 16 != 0;
2132 },
2133 [=](const LegalityQuery &Query) {
2134 // Pick the next power of 2, or a multiple of 64 over 128.
2135 // Whichever is smaller.
2136 const LLT &Ty = Query.Types[BigTyIdx];
2137 unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1);
2138 if (NewSizeInBits >= 256) {
2139 unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1);
2140 if (RoundedTo < NewSizeInBits)
2141 NewSizeInBits = RoundedTo;
2142 }
2143 return std::pair(BigTyIdx, LLT::scalar(NewSizeInBits));
2144 })
2145 // Any vectors left are the wrong size. Scalarize them.
2146 .scalarize(0)
2147 .scalarize(1);
2148 }
2149
2150 // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
2151 // RegBankSelect.
2152 auto &SextInReg = getActionDefinitionsBuilder(G_SEXT_INREG)
2153 .legalFor({{S32}, {S64}})
2154 .clampScalar(0, S32, S64);
2155
2156 if (ST.hasVOP3PInsts()) {
2157 SextInReg.lowerFor({{V2S16}})
2158 // Prefer to reduce vector widths for 16-bit vectors before lowering, to
2159 // get more vector shift opportunities, since we'll get those when
2160 // expanded.
2161 .clampMaxNumElementsStrict(0, S16, 2);
2162 } else if (ST.has16BitInsts()) {
2163 SextInReg.lowerFor({{S32}, {S64}, {S16}});
2164 } else {
2165 // Prefer to promote to s32 before lowering if we don't have 16-bit
2166 // shifts. This avoids a lot of intermediate truncate and extend operations.
2167 SextInReg.lowerFor({{S32}, {S64}});
2168 }
2169
2170 SextInReg
2171 .scalarize(0)
2172 .clampScalar(0, S32, S64)
2173 .lower();
2174
2175 getActionDefinitionsBuilder({G_ROTR, G_ROTL})
2176 .scalarize(0)
2177 .lower();
2178
2179 auto &FSHRActionDefs = getActionDefinitionsBuilder(G_FSHR);
2180 FSHRActionDefs.legalFor({{S32, S32}})
2181 .clampMaxNumElementsStrict(0, S16, 2);
2182 if (ST.hasVOP3PInsts())
2183 FSHRActionDefs.lowerFor({{V2S16, V2S16}});
2184 FSHRActionDefs.scalarize(0).lower();
2185
2186 if (ST.hasVOP3PInsts()) {
2187 getActionDefinitionsBuilder(G_FSHL)
2188 .lowerFor({{V2S16, V2S16}})
2189 .clampMaxNumElementsStrict(0, S16, 2)
2190 .scalarize(0)
2191 .lower();
2192 } else {
2193 getActionDefinitionsBuilder(G_FSHL)
2194 .scalarize(0)
2195 .lower();
2196 }
2197
2198 getActionDefinitionsBuilder(G_READCYCLECOUNTER)
2199 .legalFor({S64});
2200
2201 getActionDefinitionsBuilder(G_READSTEADYCOUNTER).legalFor({S64});
2202
2203 getActionDefinitionsBuilder(G_FENCE)
2204 .alwaysLegal();
2205
2206 getActionDefinitionsBuilder({G_SMULO, G_UMULO})
2207 .scalarize(0)
2208 .minScalar(0, S32)
2209 .lower();
2210
2211 getActionDefinitionsBuilder({G_SBFX, G_UBFX})
2212 .legalFor({{S32, S32}, {S64, S32}})
2213 .clampScalar(1, S32, S32)
2214 .clampScalar(0, S32, S64)
2215 .widenScalarToNextPow2(0)
2216 .scalarize(0);
2217
2218 getActionDefinitionsBuilder(
2219 {// TODO: Verify V_BFI_B32 is generated from expanded bit ops
2220 G_FCOPYSIGN,
2221
2222 G_ATOMIC_CMPXCHG_WITH_SUCCESS, G_ATOMICRMW_NAND, G_ATOMICRMW_FSUB,
2223 G_READ_REGISTER, G_WRITE_REGISTER,
2224
2225 G_SADDO, G_SSUBO})
2226 .lower();
2227
2228 if (ST.hasIEEEMinimumMaximumInsts()) {
2229 getActionDefinitionsBuilder({G_FMINIMUM, G_FMAXIMUM})
2230 .legalFor(FPTypesPK16)
2231 .clampMaxNumElements(0, S16, 2)
2232 .scalarize(0);
2233 } else if (ST.hasVOP3PInsts()) {
2234 getActionDefinitionsBuilder({G_FMINIMUM, G_FMAXIMUM})
2235 .lowerFor({V2S16})
2236 .clampMaxNumElementsStrict(0, S16, 2)
2237 .scalarize(0)
2238 .lower();
2239 } else {
2240 getActionDefinitionsBuilder({G_FMINIMUM, G_FMAXIMUM})
2241 .scalarize(0)
2242 .clampScalar(0, S32, S64)
2243 .lower();
2244 }
2245
2246 getActionDefinitionsBuilder({G_MEMCPY, G_MEMCPY_INLINE, G_MEMMOVE, G_MEMSET})
2247 .lower();
2248
2249 getActionDefinitionsBuilder({G_TRAP, G_DEBUGTRAP}).custom();
2250
2251 getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE,
2252 G_INDEXED_LOAD, G_INDEXED_SEXTLOAD,
2253 G_INDEXED_ZEXTLOAD, G_INDEXED_STORE})
2254 .unsupported();
2255
2256 getActionDefinitionsBuilder(G_PREFETCH).alwaysLegal();
2257
2258 getActionDefinitionsBuilder(
2259 {G_VECREDUCE_SMIN, G_VECREDUCE_SMAX, G_VECREDUCE_UMIN, G_VECREDUCE_UMAX,
2260 G_VECREDUCE_ADD, G_VECREDUCE_MUL, G_VECREDUCE_FMUL, G_VECREDUCE_FMIN,
2261 G_VECREDUCE_FMAX, G_VECREDUCE_FMINIMUM, G_VECREDUCE_FMAXIMUM,
2262 G_VECREDUCE_OR, G_VECREDUCE_AND, G_VECREDUCE_XOR})
2263 .legalFor(AllVectors)
2264 .scalarize(1)
2265 .lower();
2266
2267 getActionDefinitionsBuilder({G_INTRINSIC, G_INTRINSIC_W_SIDE_EFFECTS,
2268 G_INTRINSIC_CONVERGENT,
2269 G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS})
2270 .alwaysLegal();
2271
2272 getLegacyLegalizerInfo().computeTables();
2273 verify(*ST.getInstrInfo());
2274}
2275
2278 LostDebugLocObserver &LocObserver) const {
     // Dispatch on MI's opcode to the matching custom legalization routine and
     // forward that routine's boolean result. Opcodes without a case return
     // false.
     // NOTE(review): the first lines of this signature are not visible in this
     // listing; Helper and MI are assumed to be its leading parameters.
2279 MachineIRBuilder &B = Helper.MIRBuilder;
2280 MachineRegisterInfo &MRI = *B.getMRI();
2281
2282 switch (MI.getOpcode()) {
2283 case TargetOpcode::G_ADDRSPACE_CAST:
2284 return legalizeAddrSpaceCast(MI, MRI, B);
2285 case TargetOpcode::G_INTRINSIC_ROUNDEVEN:
2286 return legalizeFroundeven(MI, MRI, B);
2287 case TargetOpcode::G_FCEIL:
2288 return legalizeFceil(MI, MRI, B);
2289 case TargetOpcode::G_FREM:
2290 return legalizeFrem(MI, MRI, B);
2291 case TargetOpcode::G_INTRINSIC_TRUNC:
2292 return legalizeIntrinsicTrunc(MI, MRI, B);
     // The int<->fp conversions share one routine per direction; the trailing
     // bool argument selects the signed (true) or unsigned (false) variant.
2293 case TargetOpcode::G_SITOFP:
2294 return legalizeITOFP(MI, MRI, B, true);
2295 case TargetOpcode::G_UITOFP:
2296 return legalizeITOFP(MI, MRI, B, false);
2297 case TargetOpcode::G_FPTOSI:
2298 return legalizeFPTOI(MI, MRI, B, true);
2299 case TargetOpcode::G_FPTOUI:
2300 return legalizeFPTOI(MI, MRI, B, false);
     // All four min/max-num flavors go through the same handler.
2301 case TargetOpcode::G_FMINNUM:
2302 case TargetOpcode::G_FMAXNUM:
2303 case TargetOpcode::G_FMINIMUMNUM:
2304 case TargetOpcode::G_FMAXIMUMNUM:
2305 return legalizeMinNumMaxNum(Helper, MI);
2306 case TargetOpcode::G_EXTRACT:
2307 return legalizeExtract(Helper, MI);
2308 case TargetOpcode::G_INSERT:
2309 return legalizeInsert(Helper, MI);
2310 case TargetOpcode::G_EXTRACT_VECTOR_ELT:
2311 return legalizeExtractVectorElt(MI, MRI, B);
2312 case TargetOpcode::G_INSERT_VECTOR_ELT:
2313 return legalizeInsertVectorElt(MI, MRI, B);
2314 case TargetOpcode::G_FSIN:
2315 case TargetOpcode::G_FCOS:
2316 return legalizeSinCos(MI, MRI, B);
2317 case TargetOpcode::G_GLOBAL_VALUE:
2318 return legalizeGlobalValue(MI, MRI, B);
     // Plain and extending loads share one handler.
2319 case TargetOpcode::G_LOAD:
2320 case TargetOpcode::G_SEXTLOAD:
2321 case TargetOpcode::G_ZEXTLOAD:
2322 return legalizeLoad(Helper, MI);
2323 case TargetOpcode::G_STORE:
2324 return legalizeStore(Helper, MI);
2325 case TargetOpcode::G_FMAD:
2326 return legalizeFMad(MI, MRI, B);
2327 case TargetOpcode::G_FDIV:
2328 return legalizeFDIV(MI, MRI, B);
2329 case TargetOpcode::G_FFREXP:
2330 return legalizeFFREXP(MI, MRI, B);
2331 case TargetOpcode::G_FSQRT:
2332 return legalizeFSQRT(MI, MRI, B);
     // Division, remainder, and combined div/rem are grouped by signedness.
2333 case TargetOpcode::G_UDIV:
2334 case TargetOpcode::G_UREM:
2335 case TargetOpcode::G_UDIVREM:
2336 return legalizeUnsignedDIV_REM(MI, MRI, B);
2337 case TargetOpcode::G_SDIV:
2338 case TargetOpcode::G_SREM:
2339 case TargetOpcode::G_SDIVREM:
2340 return legalizeSignedDIV_REM(MI, MRI, B);
2341 case TargetOpcode::G_ATOMIC_CMPXCHG:
2342 return legalizeAtomicCmpXChg(MI, MRI, B);
2343 case TargetOpcode::G_FLOG2:
2344 return legalizeFlog2(MI, B);
2345 case TargetOpcode::G_FLOG:
2346 case TargetOpcode::G_FLOG10:
2347 return legalizeFlogCommon(MI, B);
2348 case TargetOpcode::G_FEXP2:
2349 return legalizeFExp2(MI, B);
2350 case TargetOpcode::G_FEXP:
2351 case TargetOpcode::G_FEXP10:
2352 return legalizeFExp(MI, B);
2353 case TargetOpcode::G_FPOW:
2354 return legalizeFPow(MI, B);
2355 case TargetOpcode::G_FFLOOR:
2356 return legalizeFFloor(MI, MRI, B);
2357 case TargetOpcode::G_BUILD_VECTOR:
2358 case TargetOpcode::G_BUILD_VECTOR_TRUNC:
2359 return legalizeBuildVector(MI, MRI, B);
2360 case TargetOpcode::G_MUL:
2361 return legalizeMul(Helper, MI);
2362 case TargetOpcode::G_CTLZ:
2363 case TargetOpcode::G_CTTZ:
2364 return legalizeCTLZ_CTTZ(MI, MRI, B);
2365 case TargetOpcode::G_CTLS:
2366 return legalizeCTLS(MI, MRI, B);
2367 case TargetOpcode::G_CTLZ_ZERO_POISON:
2368 return legalizeCTLZ_ZERO_POISON(MI, MRI, B);
2369 case TargetOpcode::G_STACKSAVE:
2370 return legalizeStackSave(MI, B);
2371 case TargetOpcode::G_GET_FPENV:
2372 return legalizeGetFPEnv(MI, MRI, B);
2373 case TargetOpcode::G_SET_FPENV:
2374 return legalizeSetFPEnv(MI, MRI, B);
2375 case TargetOpcode::G_TRAP:
2376 return legalizeTrap(MI, MRI, B);
2377 case TargetOpcode::G_DEBUGTRAP:
2378 return legalizeDebugTrap(MI, MRI, B);
2379 default:
     // No custom handler exists for this opcode.
2380 return false;
2381 }
2382
     // Every switch arm above returns, so control should never get here.
2383 llvm_unreachable("expected switch to return");
2384}
2385
2387 unsigned AS,
2389 MachineIRBuilder &B) const {
     // Produce a 32-bit register holding the high half of the aperture base for
     // address space AS, choosing between LOCAL_ADDRESS and the private
     // alternative. Three sources are tried in order: the hardware aperture
     // registers, an implicit kernel argument, and a load through the queue
     // pointer. An invalid Register() is returned when a required input value
     // cannot be obtained.
     // NOTE(review): several lines of this function (including the
     // declarations of Dst, Param, Offset, PtrInfo, MMO, LoadAddr and QueuePtr
     // and the conditions guarding the second and third paths) are missing
     // from this listing; comments below describe only what is visible.
2390 MachineFunction &MF = B.getMF();
2391 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
2392 const LLT S32 = LLT::scalar(32);
2393 const LLT S64 = LLT::scalar(64);
2394
2396
2397 if (ST.hasApertureRegs()) {
2398 // Note: this register is somewhat broken. When used as a 32-bit operand,
2399 // it only returns zeroes. The real value is in the upper 32 bits.
2400 // Thus, we must extract the high 32 bits.
2401 const unsigned ApertureRegNo = (AS == AMDGPUAS::LOCAL_ADDRESS)
2402 ? AMDGPU::SRC_SHARED_BASE
2403 : AMDGPU::SRC_PRIVATE_BASE;
2404 assert((ApertureRegNo != AMDGPU::SRC_PRIVATE_BASE ||
2405 !ST.hasGloballyAddressableScratch()) &&
2406 "Cannot use src_private_base with globally addressable scratch!");
     // Copy the physical aperture register into a 64-bit SGPR-class virtual
     // register, then split it and return the second (upper) 32-bit piece.
2408 MRI.setRegClass(Dst, &AMDGPU::SReg_64RegClass);
2409 B.buildCopy({Dst}, {Register(ApertureRegNo)});
2410 return B.buildUnmerge(S32, Dst).getReg(1);
2411 }
2412
2415 // For code object version 5, private_base and shared_base are passed through
2416 // implicit kernargs.
2420
     // Byte offset of the selected implicit parameter within the kernarg
     // segment.
2425 ST.getTargetLowering()->getImplicitParameterOffset(B.getMF(), Param);
2426
2427 Register KernargPtrReg = MRI.createGenericVirtualRegister(
2429
     // Give up with an invalid register if the kernarg pointer input cannot be
     // loaded.
2430 if (!loadInputValue(KernargPtrReg, B,
2432 return Register();
2433
2435 PtrInfo.getWithOffset(Offset),
2439
2440 // Pointer address
2441 B.buildObjectPtrOffset(LoadAddr, KernargPtrReg,
2442 B.buildConstant(LLT::scalar(64), Offset).getReg(0));
2443 // Load the 32-bit value stored at that address
2444 return B.buildLoad(S32, LoadAddr, *MMO).getReg(0);
2445 }
2446
2449
     // NOTE(review): the condition guarding this early return is not visible
     // here; presumably it is a failed load of the queue pointer - confirm.
2451 return Register();
2452
2453 // TODO: Use custom PseudoSourceValue
2455
2456 // Offset into amd_queue_t for group_segment_aperture_base_hi /
2457 // private_segment_aperture_base_hi.
2458 uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;
2459
     // The memory operand describes a 32-bit access whose alignment is what
     // remains of a 64-byte alignment at StructOffset.
2461 PtrInfo,
2464 LLT::scalar(32), commonAlignment(Align(64), StructOffset));
2465
     // Address = QueuePtr + StructOffset; load the 32-bit field found there.
2466 B.buildObjectPtrOffset(
2467 LoadAddr, QueuePtr,
2468 B.buildConstant(LLT::scalar(64), StructOffset).getReg(0));
2469 return B.buildLoad(S32, LoadAddr, *MMO).getReg(0);
2470}
2471
2472/// Return true if the value is a known valid address, such that a null check is
2473/// not necessary.
2475 const AMDGPUTargetMachine &TM, unsigned AddrSpace) {
     // The decision is based only on the opcode of the instruction that
     // defines Val.
     // NOTE(review): Def is dereferenced without a null check; this assumes
     // Val always has a defining instruction - confirm against callers.
2476 MachineInstr *Def = MRI.getVRegDef(Val);
2477 switch (Def->getOpcode()) {
     // Frame indexes, global values and block addresses are treated as never
     // null.
2478 case AMDGPU::G_FRAME_INDEX:
2479 case AMDGPU::G_GLOBAL_VALUE:
2480 case AMDGPU::G_BLOCK_ADDR:
2481 return true;
2482 case AMDGPU::G_CONSTANT: {
     // A constant is non-null when its sign-extended value differs from the
     // null pointer value of AddrSpace.
2483 const ConstantInt *CI = Def->getOperand(1).getCImm();
2484 return CI->getSExtValue() != AMDGPU::getNullPointerValue(AddrSpace);
2485 }
2486 default:
     // Anything else is conservatively assumed to be possibly null.
2487 return false;
2488 }
2489
     // Not reached: every switch arm above returns.
2490 return false;
2491}
2492
2495 MachineIRBuilder &B) const {
     // Custom lowering of an address-space cast. Depending on the source and
     // destination address spaces, MI is either retagged as a G_BITCAST in
     // place, or replaced by an extract/merge based sequence (with a null
     // check when the source is not known to be non-null), or - for
     // unsupported combinations - by an undef. Every visible path returns
     // true.
     // NOTE(review): the first lines of this signature are missing from this
     // listing.
2496 MachineFunction &MF = B.getMF();
2497
2498 // MI can either be a G_ADDRSPACE_CAST or a
2499 // G_INTRINSIC @llvm.amdgcn.addrspacecast.nonnull
2500 assert(MI.getOpcode() == TargetOpcode::G_ADDRSPACE_CAST ||
2501 (isa<GIntrinsic>(MI) && cast<GIntrinsic>(MI).getIntrinsicID() ==
2502 Intrinsic::amdgcn_addrspacecast_nonnull));
2503
2504 const LLT S32 = LLT::scalar(32);
2505 Register Dst = MI.getOperand(0).getReg();
     // For the intrinsic form the source pointer is operand 2; for
     // G_ADDRSPACE_CAST it is operand 1.
2506 Register Src = isa<GIntrinsic>(MI) ? MI.getOperand(2).getReg()
2507 : MI.getOperand(1).getReg();
2508 LLT DstTy = MRI.getType(Dst);
2509 LLT SrcTy = MRI.getType(Src);
2510 unsigned DestAS = DstTy.getAddressSpace();
2511 unsigned SrcAS = SrcTy.getAddressSpace();
2512
2513 // TODO: Avoid reloading from the queue ptr for each cast, or at least each
2514 // vector element.
2515 assert(!DstTy.isVector());
2516
2517 const AMDGPUTargetMachine &TM
2518 = static_cast<const AMDGPUTargetMachine &>(MF.getTarget());
2519
     // Casts the target machine reports as no-ops only need their opcode
     // changed; the operands stay as they are.
2520 if (TM.isNoopAddrSpaceCast(SrcAS, DestAS)) {
2521 MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST));
2522 return true;
2523 }
2524
     // Case 1: flat -> local or private.
2525 if (SrcAS == AMDGPUAS::FLAT_ADDRESS &&
2526 (DestAS == AMDGPUAS::LOCAL_ADDRESS ||
2527 DestAS == AMDGPUAS::PRIVATE_ADDRESS)) {
     // Emits the conversion without any null handling and returns the
     // register holding the converted pointer.
2528 auto castFlatToLocalOrPrivate = [&](const DstOp &Dst) -> Register {
2529 if (DestAS == AMDGPUAS::PRIVATE_ADDRESS &&
2530 ST.hasGloballyAddressableScratch()) {
2531 // flat -> private with globally addressable scratch: subtract
2532 // src_flat_scratch_base_lo.
2533 const LLT S32 = LLT::scalar(32);
2534 Register SrcLo = B.buildExtract(S32, Src, 0).getReg(0);
2535 Register FlatScratchBaseLo =
2536 B.buildInstr(AMDGPU::S_MOV_B32, {S32},
2537 {Register(AMDGPU::SRC_FLAT_SCRATCH_BASE_LO)})
2538 .getReg(0);
2539 MRI.setRegClass(FlatScratchBaseLo, &AMDGPU::SReg_32RegClass);
2540 Register Sub = B.buildSub(S32, SrcLo, FlatScratchBaseLo).getReg(0);
2541 return B.buildIntToPtr(Dst, Sub).getReg(0);
2542 }
2543
2544 // Extract low 32-bits of the pointer.
2545 return B.buildExtract(Dst, Src, 0).getReg(0);
2546 };
2547
2548 // For llvm.amdgcn.addrspacecast.nonnull we can always assume non-null, for
2549 // G_ADDRSPACE_CAST we need to guess.
2550 if (isa<GIntrinsic>(MI) || isKnownNonNull(Src, MRI, TM, SrcAS)) {
     // Non-null source: write the converted pointer straight into Dst.
2551 castFlatToLocalOrPrivate(Dst);
2552 MI.eraseFromParent();
2553 return true;
2554 }
2555
     // Possibly-null source: Dst = (Src != 0) ? converted : null value of the
     // destination address space.
2556 unsigned NullVal = AMDGPU::getNullPointerValue(DestAS);
2557
2558 auto SegmentNull = B.buildConstant(DstTy, NullVal);
2559 auto FlatNull = B.buildConstant(SrcTy, 0);
2560
2561 // Extract low 32-bits of the pointer.
2562 auto PtrLo32 = castFlatToLocalOrPrivate(DstTy);
2563
2564 auto CmpRes =
2565 B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, FlatNull.getReg(0));
2566 B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0));
2567
2568 MI.eraseFromParent();
2569 return true;
2570 }
2571
     // Case 2: local or private -> flat.
2572 if (DestAS == AMDGPUAS::FLAT_ADDRESS &&
2573 (SrcAS == AMDGPUAS::LOCAL_ADDRESS ||
2574 SrcAS == AMDGPUAS::PRIVATE_ADDRESS)) {
     // Emits the conversion without any null handling and returns the
     // register holding the converted pointer.
2575 auto castLocalOrPrivateToFlat = [&](const DstOp &Dst) -> Register {
2576 // Coerce the type of the low half of the result so we can use
2577 // merge_values.
2578 Register SrcAsInt = B.buildPtrToInt(S32, Src).getReg(0);
2579
2580 if (SrcAS == AMDGPUAS::PRIVATE_ADDRESS &&
2581 ST.hasGloballyAddressableScratch()) {
2582 // For wave32: Addr = (TID[4:0] << 52) + FLAT_SCRATCH_BASE + privateAddr
2583 // For wave64: Addr = (TID[5:0] << 51) + FLAT_SCRATCH_BASE + privateAddr
     // The thread ID is computed with mbcnt_lo over an all-ones mask
     // starting from 0, extended with mbcnt_hi on wave64.
2584 Register AllOnes = B.buildConstant(S32, -1).getReg(0);
2585 Register ThreadID = B.buildConstant(S32, 0).getReg(0);
2586 ThreadID = B.buildIntrinsic(Intrinsic::amdgcn_mbcnt_lo, {S32})
2587 .addUse(AllOnes)
2588 .addUse(ThreadID)
2589 .getReg(0);
2590 if (ST.isWave64()) {
2591 ThreadID = B.buildIntrinsic(Intrinsic::amdgcn_mbcnt_hi, {S32})
2592 .addUse(AllOnes)
2593 .addUse(ThreadID)
2594 .getReg(0);
2595 }
     // The shift is applied to the high 32-bit half, hence the "- 32". It
     // matches the 52/51 shifts in the formulas above when the log2 wave
     // size is 5/6 respectively.
2596 Register ShAmt =
2597 B.buildConstant(S32, 57 - 32 - ST.getWavefrontSizeLog2()).getReg(0);
2598 Register SrcHi = B.buildShl(S32, ThreadID, ShAmt).getReg(0);
2599 Register CvtPtr =
2600 B.buildMergeLikeInstr(DstTy, {SrcAsInt, SrcHi}).getReg(0);
2601 // Accessing src_flat_scratch_base_lo as a 64-bit operand gives the full
2602 // 64-bit hi:lo value.
2603 Register FlatScratchBase =
2604 B.buildInstr(AMDGPU::S_MOV_B64, {S64},
2605 {Register(AMDGPU::SRC_FLAT_SCRATCH_BASE)})
2606 .getReg(0);
2607 MRI.setRegClass(FlatScratchBase, &AMDGPU::SReg_64RegClass);
2608 return B.buildPtrAdd(Dst, CvtPtr, FlatScratchBase).getReg(0);
2609 }
2610
     // Otherwise the flat pointer is {low = source pointer bits,
     // high = segment aperture}.
2611 Register ApertureReg = getSegmentAperture(SrcAS, MRI, B);
     // NOTE(review): this lambda is declared to return Register, so `false`
     // is converted to a Register here; neither call site below checks the
     // returned register for validity - confirm this is intended.
2612 if (!ApertureReg.isValid())
2613 return false;
2614
2615 // TODO: Should we allow mismatched types but matching sizes in merges to
2616 // avoid the ptrtoint?
2617 return B.buildMergeLikeInstr(Dst, {SrcAsInt, ApertureReg}).getReg(0);
2618 };
2619
2620 // For llvm.amdgcn.addrspacecast.nonnull we can always assume non-null, for
2621 // G_ADDRSPACE_CAST we need to guess.
2622 if (isa<GIntrinsic>(MI) || isKnownNonNull(Src, MRI, TM, SrcAS)) {
     // Non-null source: write the converted pointer straight into Dst.
2623 castLocalOrPrivateToFlat(Dst);
2624 MI.eraseFromParent();
2625 return true;
2626 }
2627
     // Possibly-null source: Dst = (Src != segment null) ? converted : flat
     // null.
2628 Register BuildPtr = castLocalOrPrivateToFlat(DstTy);
2629
2630 auto SegmentNull =
2631 B.buildConstant(SrcTy, AMDGPU::getNullPointerValue(SrcAS));
2632 auto FlatNull = B.buildConstant(DstTy, AMDGPU::getNullPointerValue(DestAS));
2633
2634 auto CmpRes = B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src,
2635 SegmentNull.getReg(0));
2636
2637 B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull);
2638
2639 MI.eraseFromParent();
2640 return true;
2641 }
2642
     // Case 3: 64-bit pointer -> 32-bit constant address space keeps only the
     // low 32 bits.
2643 if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
2644 SrcTy.getSizeInBits() == 64) {
2645 // Truncate.
2646 B.buildExtract(Dst, Src, 0);
2647 MI.eraseFromParent();
2648 return true;
2649 }
2650
     // Case 4: 32-bit constant address space -> 64-bit pointer. The high half
     // comes from get32BitAddressHighBits(); when it is zero a zero-extension
     // is used instead of a merge.
     // NOTE(review): the declaration of Info is on a line missing from this
     // listing.
2651 if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
2652 DstTy.getSizeInBits() == 64) {
2654 uint32_t AddrHiVal = Info->get32BitAddressHighBits();
2655 auto PtrLo = B.buildPtrToInt(S32, Src);
2656 if (AddrHiVal == 0) {
2657 auto Zext = B.buildZExt(LLT::scalar(64), PtrLo);
2658 B.buildIntToPtr(Dst, Zext);
2659 } else {
2660 auto HighAddr = B.buildConstant(S32, AddrHiVal);
2661 B.buildMergeLikeInstr(Dst, {PtrLo, HighAddr});
2662 }
2663
2664 MI.eraseFromParent();
2665 return true;
2666 }
2667
2668 // Invalid casts are poison.
2669 // TODO: Should return poison
2670 B.buildUndef(Dst);
2671 MI.eraseFromParent();
2672 return true;
2673}
2674
2677 MachineIRBuilder &B) const {
     // Expand MI (asserted to have a 64-bit scalar source) into an add/subtract
     // of a sign-matched 2^52 constant, guarded by a magnitude check that
     // passes large inputs through unchanged. MI is erased; always returns
     // true.
     // NOTE(review): the first lines of this signature are missing from this
     // listing.
2678 Register Src = MI.getOperand(1).getReg();
2679 LLT Ty = MRI.getType(Src);
2680 assert(Ty.isScalar() && Ty.getSizeInBits() == 64);
2681
     // C1 = 2^52. C2 = 2^52 - 0.5, the threshold used in the magnitude check
     // below.
2682 APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
2683 APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");
2684
     // CopySign = 2^52 carrying the sign of Src.
2685 auto C1 = B.buildFConstant(Ty, C1Val);
2686 auto CopySign = B.buildFCopysign(Ty, C1, Src);
2687
2688 // TODO: Should this propagate fast-math-flags?
     // Tmp2 = (Src + CopySign) - CopySign.
2689 auto Tmp1 = B.buildFAdd(Ty, Src, CopySign);
2690 auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign);
2691
2692 auto C2 = B.buildFConstant(Ty, C2Val);
2693 auto Fabs = B.buildFAbs(Ty, Src);
2694
     // Result = (|Src| > C2) ? Src : Tmp2. The comparison is ordered, so a NaN
     // input selects Tmp2.
2695 auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2);
2696 B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2);
2697 MI.eraseFromParent();
2698 return true;
2699}
2700
2703 MachineIRBuilder &B) const {
     // Expand MI (asserted to have an S64 source) as trunc(src) plus a
     // conditional 1.0, following the pseudo-code below. MI is erased; always
     // returns true.
     // NOTE(review): the first lines of this signature are missing from this
     // listing.
2704
2705 const LLT S1 = LLT::scalar(1);
2706 const LLT S64 = LLT::scalar(64);
2707
2708 Register Src = MI.getOperand(1).getReg();
2709 assert(MRI.getType(Src) == S64);
2710
2711 // result = trunc(src)
2712 // if (src > 0.0 && src != result)
2713 // result += 1.0
2714
2715 auto Trunc = B.buildIntrinsicTrunc(S64, Src);
2716
2717 const auto Zero = B.buildFConstant(S64, 0.0);
2718 const auto One = B.buildFConstant(S64, 1.0);
     // NOTE: despite its name, Lt0 holds the ordered comparison Src > 0.0
     // (FCMP_OGT), matching the pseudo-code above.
2719 auto Lt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero);
2720 auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc);
2721 auto And = B.buildAnd(S1, Lt0, NeTrunc);
     // The increment is materialized as a select between 1.0 and 0.0 so that a
     // single unconditional add can follow.
2722 auto Add = B.buildSelect(S64, And, One, Zero);
2723
2724 // TODO: Should this propagate fast-math-flags?
2725 B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add);
2726 MI.eraseFromParent();
2727 return true;
2728}
2729
2732 MachineIRBuilder &B) const {
     // Expand MI as Dst = fma(-trunc(Src0 / Src1), Src1, Src0), i.e.
     // Src0 - trunc(Src0 / Src1) * Src1. MI's flags are forwarded to every
     // emitted instruction. MI is erased; always returns true.
     // NOTE(review): the first lines of this signature are missing from this
     // listing.
2733 Register DstReg = MI.getOperand(0).getReg();
2734 Register Src0Reg = MI.getOperand(1).getReg();
2735 Register Src1Reg = MI.getOperand(2).getReg();
2736 auto Flags = MI.getFlags();
2737 LLT Ty = MRI.getType(DstReg);
2738
2739 auto Div = B.buildFDiv(Ty, Src0Reg, Src1Reg, Flags);
2740 auto Trunc = B.buildIntrinsicTrunc(Ty, Div, Flags);
2741 auto Neg = B.buildFNeg(Ty, Trunc, Flags);
2742 B.buildFMA(DstReg, Neg, Src1Reg, Src0Reg, Flags);
2743 MI.eraseFromParent();
2744 return true;
2745}
2746
     // Given Hi, the upper 32 bits of a 64-bit floating-point value, emit code
     // that extracts the ExpBits-wide exponent field and subtracts 1023 (the
     // IEEE double exponent bias), returning the builder for the final
     // subtraction.
     // NOTE(review): the signature of this helper is on lines missing from
     // this listing; Hi and B are its visible inputs.
2749 const unsigned FractBits = 52;
2750 const unsigned ExpBits = 11;
2751 LLT S32 = LLT::scalar(32);
2752
     // Const0 = 20: bit position of the exponent field within the high half
     // (52 fraction bits minus the 32 bits held in the low half).
     // Const1 = 11: width of the exponent field.
2753 auto Const0 = B.buildConstant(S32, FractBits - 32);
2754 auto Const1 = B.buildConstant(S32, ExpBits);
2755
     // Unsigned bit-field extract of Hi with offset Const0 and width Const1.
2756 auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32})
2757 .addUse(Hi)
2758 .addUse(Const0.getReg(0))
2759 .addUse(Const1.getReg(0));
2760
2761 return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023));
2762}
2763
2766 MachineIRBuilder &B) const {
     // Expand MI (asserted to have an S64 source) with integer bit
     // manipulation: fraction bits selected by the unbiased exponent are
     // cleared, with special handling for exponents below 0 and above 51.
     // MI is erased; always returns true.
     // NOTE(review): the first lines of this signature are missing from this
     // listing.
2767 const LLT S1 = LLT::scalar(1);
2768 const LLT S32 = LLT::scalar(32);
2769 const LLT S64 = LLT::scalar(64);
2770
2771 Register Src = MI.getOperand(1).getReg();
2772 assert(MRI.getType(Src) == S64);
2773
2774 // TODO: Should this use extract since the low half is unused?
2775 auto Unmerge = B.buildUnmerge({S32, S32}, Src);
2776 Register Hi = Unmerge.getReg(1);
2777
2778 // Extract the upper half, since this is where we will find the sign and
2779 // exponent.
2780 auto Exp = extractF64Exponent(Hi, B);
2781
2782 const unsigned FractBits = 52;
2783
2784 // Extract the sign bit.
2785 const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31);
2786 auto SignBit = B.buildAnd(S32, Hi, SignBitMask);
2787
     // FractMask has the low 52 bits set.
2788 const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1);
2789
2790 const auto Zero32 = B.buildConstant(S32, 0);
2791
2792 // Extend back to 64-bits.
     // SignBit64 = {low: 0, high: SignBit}, i.e. only the sign bit of Src.
2793 auto SignBit64 = B.buildMergeLikeInstr(S64, {Zero32, SignBit});
2794
     // Tmp0 = Src with the bits selected by (FractMask >> Exp) cleared.
2795 auto Shr = B.buildAShr(S64, FractMask, Exp);
2796 auto Not = B.buildNot(S64, Shr);
2797 auto Tmp0 = B.buildAnd(S64, Src, Not);
2798 auto FiftyOne = B.buildConstant(S32, FractBits - 1);
2799
2800 auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32);
2801 auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne);
2802
     // Exp < 0  -> result is SignBit64 (only the sign survives).
     // Exp > 51 -> result is Src unchanged.
     // otherwise -> result is Tmp0.
2803 auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0);
2804 B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1);
2805 MI.eraseFromParent();
2806 return true;
2807}
2808
2811 MachineIRBuilder &B, bool Signed) const {
     // Expand a 64-bit integer to floating-point conversion (source asserted
     // to be S64) in terms of 32-bit conversions. Signed selects the signed or
     // unsigned interpretation of the source. The destination must be S64 or
     // S32. MI is erased; always returns true.
     // NOTE(review): the first lines of this signature are missing from this
     // listing.
2812
2813 Register Dst = MI.getOperand(0).getReg();
2814 Register Src = MI.getOperand(1).getReg();
2815
2816 const LLT S64 = LLT::scalar(64);
2817 const LLT S32 = LLT::scalar(32);
2818
2819 assert(MRI.getType(Src) == S64);
2820
     // Unmerge register 0 is the low 32 bits of Src, register 1 the high 32.
2821 auto Unmerge = B.buildUnmerge({S32, S32}, Src);
2822 auto ThirtyTwo = B.buildConstant(S32, 32);
2823
2824 if (MRI.getType(Dst) == S64) {
     // 64-bit result: Dst = ldexp(tofp(hi), 32) + uitofp(lo). Only the high
     // half uses the signed conversion when Signed is set.
2825 auto CvtHi = Signed ? B.buildSITOFP(S64, Unmerge.getReg(1))
2826 : B.buildUITOFP(S64, Unmerge.getReg(1));
2827
2828 auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0));
2829 auto LdExp = B.buildFLdexp(S64, CvtHi, ThirtyTwo);
2830
2831 // TODO: Should this propagate fast-math-flags?
2832 B.buildFAdd(Dst, LdExp, CvtLo);
2833 MI.eraseFromParent();
2834 return true;
2835 }
2836
2837 assert(MRI.getType(Dst) == S32);
2838
2839 auto One = B.buildConstant(S32, 1);
2840
     // 32-bit result: shift Src left by ShAmt, convert the high half of the
     // shifted value, then rescale with ldexp by (32 - ShAmt).
2841 MachineInstrBuilder ShAmt;
2842 if (Signed) {
     // OppositeSign is all ones when the sign bits of the low and high halves
     // differ, else zero, so MaxShAmt is 31 or 32. ShAmt is the smaller of
     // MaxShAmt and (sffbh(hi) - 1), compared as unsigned.
2843 auto ThirtyOne = B.buildConstant(S32, 31);
2844 auto X = B.buildXor(S32, Unmerge.getReg(0), Unmerge.getReg(1));
2845 auto OppositeSign = B.buildAShr(S32, X, ThirtyOne);
2846 auto MaxShAmt = B.buildAdd(S32, ThirtyTwo, OppositeSign);
2847 auto LS = B.buildIntrinsic(Intrinsic::amdgcn_sffbh, {S32})
2848 .addUse(Unmerge.getReg(1));
2849 auto LS2 = B.buildSub(S32, LS, One);
2850 ShAmt = B.buildUMin(S32, LS2, MaxShAmt);
2851 } else
     // Unsigned: shift by the number of leading zeros in the high half.
2852 ShAmt = B.buildCTLZ(S32, Unmerge.getReg(1));
2853 auto Norm = B.buildShl(S64, Src, ShAmt);
2854 auto Unmerge2 = B.buildUnmerge({S32, S32}, Norm);
     // Adjust is 1 when the low half of the shifted value is nonzero, else 0;
     // it is OR-ed into the high half so discarded low bits still influence
     // the converted value.
2855 auto Adjust = B.buildUMin(S32, One, Unmerge2.getReg(0));
2856 auto Norm2 = B.buildOr(S32, Unmerge2.getReg(1), Adjust);
2857 auto FVal = Signed ? B.buildSITOFP(S32, Norm2) : B.buildUITOFP(S32, Norm2);
2858 auto Scale = B.buildSub(S32, ThirtyTwo, ShAmt);
2859 B.buildFLdexp(Dst, FVal, Scale);
2860 MI.eraseFromParent();
2861 return true;
2862}
2863
2864// TODO: Copied from DAG implementation. Verify logic and document how this
2865// actually works.
2869 bool Signed) const {
     // Expand a floating-point (S32 or S64) to 64-bit integer conversion into
     // two 32-bit conversions whose results are merged into Dst. Signed
     // selects the signed or unsigned conversion. MI's flags are forwarded to
     // the floating-point instructions that take them. MI is erased; always
     // returns true.
     // NOTE(review): the first lines of this signature and the declaration of
     // Sign are on lines missing from this listing.
2870
2871 Register Dst = MI.getOperand(0).getReg();
2872 Register Src = MI.getOperand(1).getReg();
2873
2874 const LLT S64 = LLT::scalar(64);
2875 const LLT S32 = LLT::scalar(32);
2876
2877 const LLT SrcLT = MRI.getType(Src);
2878 assert((SrcLT == S32 || SrcLT == S64) && MRI.getType(Dst) == S64);
2879
2880 unsigned Flags = MI.getFlags();
2881
2882 // The basic idea of converting a floating point number into a pair of 32-bit
2883 // integers is illustrated as follows:
2884 //
2885 // tf := trunc(val);
2886 // hif := floor(tf * 2^-32);
2887 // lof := tf - hif * 2^32; // lof is always positive due to floor.
2888 // hi := fptoi(hif);
2889 // lo := fptoi(lof);
2890 //
2891 auto Trunc = B.buildIntrinsicTrunc(SrcLT, Src, Flags);
2893 if (Signed && SrcLT == S32) {
2894 // However, a 32-bit floating point number has only 23 bits mantissa and
2895 // it's not enough to hold all the significant bits of `lof` if val is
2896 // negative. To avoid the loss of precision, we need to take the absolute
2897 // value after truncating and flip the result back based on the original
2898 // signedness.
     // Sign is all ones when the sign bit of Src is set, else zero.
2899 Sign = B.buildAShr(S32, Src, B.buildConstant(S32, 31));
2900 Trunc = B.buildFAbs(S32, Trunc, Flags);
2901 }
     // K0 = 2^-32 and K1 = -2^32, built in the source's floating-point width.
2902 MachineInstrBuilder K0, K1;
2903 if (SrcLT == S64) {
2904 K0 = B.buildFConstant(
2905 S64, llvm::bit_cast<double>(UINT64_C(/*2^-32*/ 0x3df0000000000000)));
2906 K1 = B.buildFConstant(
2907 S64, llvm::bit_cast<double>(UINT64_C(/*-2^32*/ 0xc1f0000000000000)));
2908 } else {
2909 K0 = B.buildFConstant(
2910 S32, llvm::bit_cast<float>(UINT32_C(/*2^-32*/ 0x2f800000)));
2911 K1 = B.buildFConstant(
2912 S32, llvm::bit_cast<float>(UINT32_C(/*-2^32*/ 0xcf800000)));
2913 }
2914
     // FloorMul is `hif` and Fma is `lof` from the outline above:
     // hif = floor(tf * 2^-32), lof = hif * -2^32 + tf.
2915 auto Mul = B.buildFMul(SrcLT, Trunc, K0, Flags);
2916 auto FloorMul = B.buildFFloor(SrcLT, Mul, Flags);
2917 auto Fma = B.buildFMA(SrcLT, FloorMul, K1, Trunc, Flags);
2918
     // The high half is converted as signed only for a signed conversion from
     // S64; the signed S32 case works on the absolute value and restores the
     // sign below. The low half is always converted as unsigned.
2919 auto Hi = (Signed && SrcLT == S64) ? B.buildFPTOSI(S32, FloorMul)
2920 : B.buildFPTOUI(S32, FloorMul);
2921 auto Lo = B.buildFPTOUI(S32, Fma);
2922
2923 if (Signed && SrcLT == S32) {
2924 // Flip the result based on the signedness, which is either all 0s or 1s.
2925 Sign = B.buildMergeLikeInstr(S64, {Sign, Sign});
2926 // r := xor({lo, hi}, sign) - sign;
2927 B.buildSub(Dst, B.buildXor(S64, B.buildMergeLikeInstr(S64, {Lo, Hi}), Sign),
2928 Sign);
2929 } else
2930 B.buildMergeLikeInstr(Dst, {Lo, Hi});
2931 MI.eraseFromParent();
2932
2933 return true;
2934}
2935
2937 MachineInstr &MI) const {
     // Custom handling for the min/max-num opcodes: when the function's mode
     // has IEEE disabled, MI is left as it is and reported as legalized.
     // NOTE(review): the first line of this signature, the declaration of MFI
     // and the final return statement are on lines missing from this listing,
     // so the IEEE-enabled path is not visible here.
2938 MachineFunction &MF = Helper.MIRBuilder.getMF();
2940
2941 // With ieee_mode disabled, the instructions have the correct behavior.
2942 if (!MFI->getMode().IEEE)
2943 return true;
2944
2946}
2947
// Custom lowering for a bit-range extract: operand 0 is the destination,
// operand 1 the source and operand 2 the immediate bit offset. Offsets that
// are 0 or not a multiple of 32 are delegated to Helper.lowerExtract; all
// other cases are expressed as an unmerge of the source into 32-bit pieces.
// Returns true on success (or when the generic helper reports Legalized).
2949 MachineInstr &MI) const {
2950 MachineIRBuilder &B = Helper.MIRBuilder;
2951 MachineRegisterInfo &MRI = *B.getMRI();
2952 Register DstReg = MI.getOperand(0).getReg();
2953 Register SrcReg = MI.getOperand(1).getReg();
2954 uint64_t Offset = MI.getOperand(2).getImm();
2955
2956 // Fall back to generic lowering for offset 0 (trivial trunc) and
2957 // non-32-bit-aligned cases which require shift+trunc sequences
2958 // that generic code handles correctly.
2959 if (Offset == 0 || Offset % 32 != 0)
2960 return Helper.lowerExtract(MI) == LegalizerHelper::Legalized;
2961
2962 const LLT DstTy = MRI.getType(DstReg);
// Index of the first 32-bit piece to take, and how many pieces are needed.
// NOTE(review): DstCount assumes the destination size is a multiple of 32;
// unlike the insert lowering this is not checked here -- confirm the
// legality rules guarantee it.
2963 unsigned StartIdx = Offset / 32;
2964 unsigned DstCount = DstTy.getSizeInBits() / 32;
2965 auto Unmerge = B.buildUnmerge(LLT::scalar(32), SrcReg);
2966
2967 if (DstCount == 1) {
// A single piece needs no merge: pointers are produced with inttoptr, other
// types simply have all uses of DstReg redirected to the unmerged piece.
2968 if (DstTy.isPointer())
2969 B.buildIntToPtr(DstReg, Unmerge.getReg(StartIdx));
2970 else
2971 MRI.replaceRegWith(DstReg, Unmerge.getReg(StartIdx));
2972 } else {
// Several pieces: collect the consecutive pieces and merge them into DstReg.
2973 SmallVector<Register, 8> MergeVec;
2974 for (unsigned I = 0; I < DstCount; ++I)
2975 MergeVec.push_back(Unmerge.getReg(StartIdx + I));
2976 B.buildMergeLikeInstr(DstReg, MergeVec);
2977 }
2978
2979 MI.eraseFromParent();
2980 return true;
2981 }
2982
// Custom lowering for a bit-range insert: operand 0 is the destination,
// operand 1 the original value, operand 2 the value to insert and operand 3
// the immediate bit offset. When the offset and both sizes are multiples of
// 32 the result is rebuilt from 32-bit pieces; otherwise the work is
// delegated to Helper.lowerInsert. Returns true on success (or when the
// generic helper reports Legalized).
2984 MachineInstr &MI) const {
2985 MachineIRBuilder &B = Helper.MIRBuilder;
2986 MachineRegisterInfo &MRI = *B.getMRI();
2987 Register DstReg = MI.getOperand(0).getReg();
2988 Register SrcReg = MI.getOperand(1).getReg();
2989 Register InsertSrc = MI.getOperand(2).getReg();
2990 uint64_t Offset = MI.getOperand(3).getImm();
2991
2992 unsigned DstSize = MRI.getType(DstReg).getSizeInBits();
2993 const LLT InsertTy = MRI.getType(InsertSrc);
2994 unsigned InsertSize = InsertTy.getSizeInBits();
2995
2996 // Fall back to generic lowering for non-32-bit-aligned cases which
2997 // require shift+mask sequences that generic code handles correctly.
2998 if (Offset % 32 != 0 || DstSize % 32 != 0 || InsertSize % 32 != 0)
2999 return Helper.lowerInsert(MI) == LegalizerHelper::Legalized;
3000
3001 const LLT S32 = LLT::scalar(32);
3002 unsigned DstCount = DstSize / 32;
3003 unsigned InsertCount = InsertSize / 32;
3004 unsigned StartIdx = Offset / 32;
3005
3006 auto SrcUnmerge = B.buildUnmerge(S32, SrcReg);
3007
// Pieces below the insertion point are taken unchanged from the original.
3008 SmallVector<Register, 8> MergeVec;
3009 for (unsigned I = 0; I < StartIdx; ++I)
3010 MergeVec.push_back(SrcUnmerge.getReg(I));
3011
// Next come the piece(s) of the inserted value.
3012 if (InsertCount == 1) {
3013 // Merge-like instructions require same source types. Convert pointer
3014 // to scalar when inserting a pointer value into a scalar.
3015 if (InsertTy.isPointer())
3016 InsertSrc = B.buildPtrToInt(S32, InsertSrc).getReg(0);
3017 MergeVec.push_back(InsertSrc);
3018 } else {
3019 auto InsertUnmerge = B.buildUnmerge(S32, InsertSrc);
3020 for (unsigned I = 0; I < InsertCount; ++I)
3021 MergeVec.push_back(InsertUnmerge.getReg(I));
3022 }
3023
// Finally the pieces of the original value above the inserted range.
3024 for (unsigned I = StartIdx + InsertCount; I < DstCount; ++I)
3025 MergeVec.push_back(SrcUnmerge.getReg(I));
3026
3027 B.buildMergeLikeInstr(DstReg, MergeVec);
3028
3029 MI.eraseFromParent();
3030 return true;
3031 }
3032
// Custom lowering for extracting one vector element: operand 0 is the
// destination, operand 1 the vector and operand 2 the index.
//  - Pointer elements wider than 64 bits are rewritten to operate on an
//    integer vector (ptrtoint / extract / inttoptr).
//  - A non-constant index leaves MI untouched.
//  - A constant in-range index becomes an unmerge plus a copy; a constant
//    out-of-range index produces undef.
// Always returns true.
3035 MachineIRBuilder &B) const {
3036 // TODO: Should move some of this into LegalizerHelper.
3037
3038 // TODO: Promote dynamic indexing of s16 to s32
3039
3040 Register Dst = MI.getOperand(0).getReg();
3041 Register Vec = MI.getOperand(1).getReg();
3042
3043 LLT VecTy = MRI.getType(Vec);
3044 LLT EltTy = VecTy.getElementType();
3045 assert(EltTy == MRI.getType(Dst));
3046
3047 // Other legalization maps vector<? x [type bigger than 64 bits]> via bitcasts
3048 // but we can't go directly to that logic because you can't bitcast a vector
3049 // of pointers to a vector of integers. Therefore, introduce an intermediate
3050 // vector of integers using ptrtoint (and inttoptr on the output) in order to
3051 // drive the legalization forward.
3052 if (EltTy.isPointer() && EltTy.getSizeInBits() > 64) {
3053 LLT IntTy = LLT::scalar(EltTy.getSizeInBits());
3054 LLT IntVecTy = VecTy.changeElementType(IntTy);
3055
3056 auto IntVec = B.buildPtrToInt(IntVecTy, Vec);
3057 auto IntElt = B.buildExtractVectorElement(IntTy, IntVec, MI.getOperand(2));
3058 B.buildIntToPtr(Dst, IntElt);
3059
3060 MI.eraseFromParent();
3061 return true;
3062 }
3063
3064 // FIXME: Artifact combiner probably should have replaced the truncated
3065 // constant before this, so we shouldn't need
3066 // getIConstantVRegValWithLookThrough.
3067 std::optional<ValueAndVReg> MaybeIdxVal =
3068 getIConstantVRegValWithLookThrough(MI.getOperand(2).getReg(), MRI);
3069 if (!MaybeIdxVal) // Dynamic case will be selected to register indexing.
3070 return true;
3071 const uint64_t IdxVal = MaybeIdxVal->Value.getZExtValue();
3072
3073 if (IdxVal < VecTy.getNumElements()) {
// In range: split the vector into its elements and copy the selected one.
3074 auto Unmerge = B.buildUnmerge(EltTy, Vec);
3075 B.buildCopy(Dst, Unmerge.getReg(IdxVal));
3076 } else {
// Out-of-range constant index: the result is undef.
3077 B.buildUndef(Dst);
3078 }
3079
3080 MI.eraseFromParent();
3081 return true;
3082 }
3083
// Custom lowering for inserting one vector element: operand 0 is the
// destination, operand 1 the source vector, operand 2 the new element and
// operand 3 the index.
//  - Pointer elements wider than 64 bits are rewritten to operate on an
//    integer vector (ptrtoint / insert / inttoptr).
//  - A non-constant index leaves MI untouched.
//  - A constant in-range index becomes an unmerge, a replacement of the
//    selected element, and a merge; a constant out-of-range index produces
//    undef.
// Always returns true.
3086 MachineIRBuilder &B) const {
3087 // TODO: Should move some of this into LegalizerHelper.
3088
3089 // TODO: Promote dynamic indexing of s16 to s32
3090
3091 Register Dst = MI.getOperand(0).getReg();
3092 Register Vec = MI.getOperand(1).getReg();
3093 Register Ins = MI.getOperand(2).getReg();
3094
3095 LLT VecTy = MRI.getType(Vec);
3096 LLT EltTy = VecTy.getElementType();
3097 assert(EltTy == MRI.getType(Ins));
3098
3099 // Other legalization maps vector<? x [type bigger than 64 bits]> via bitcasts
3100 // but we can't go directly to that logic because you can't bitcast a vector
3101 // of pointers to a vector of integers. Therefore, make the pointer vector
3102 // into an equivalent vector of integers with ptrtoint, insert the ptrtoint'd
3103 // new value, and then inttoptr the result vector back. This will then allow
3104 // the rest of legalization to take over.
3105 if (EltTy.isPointer() && EltTy.getSizeInBits() > 64) {
3106 LLT IntTy = LLT::scalar(EltTy.getSizeInBits());
3107 LLT IntVecTy = VecTy.changeElementType(IntTy);
3108
3109 auto IntVecSource = B.buildPtrToInt(IntVecTy, Vec);
3110 auto IntIns = B.buildPtrToInt(IntTy, Ins);
3111 auto IntVecDest = B.buildInsertVectorElement(IntVecTy, IntVecSource, IntIns,
3112 MI.getOperand(3));
3113 B.buildIntToPtr(Dst, IntVecDest);
3114 MI.eraseFromParent();
3115 return true;
3116 }
3117
3118 // FIXME: Artifact combiner probably should have replaced the truncated
3119 // constant before this, so we shouldn't need
3120 // getIConstantVRegValWithLookThrough.
3121 std::optional<ValueAndVReg> MaybeIdxVal =
3122 getIConstantVRegValWithLookThrough(MI.getOperand(3).getReg(), MRI);
3123 if (!MaybeIdxVal) // Dynamic case will be selected to register indexing.
3124 return true;
3125
3126 const uint64_t IdxVal = MaybeIdxVal->Value.getZExtValue();
3127
3128 unsigned NumElts = VecTy.getNumElements();
3129 if (IdxVal < NumElts) {
// NOTE(review): the declaration of SrcRegs (listing line 3130) is not shown
// in this listing. One fresh element-typed register is created per vector
// element so the unmerge below has explicit destinations.
3131 for (unsigned i = 0; i < NumElts; ++i)
3132 SrcRegs.push_back(MRI.createGenericVirtualRegister(EltTy));
3133 B.buildUnmerge(SrcRegs, Vec);
3134
// Swap in the new element and rebuild the vector.
3135 SrcRegs[IdxVal] = MI.getOperand(2).getReg();
3136 B.buildMergeLikeInstr(Dst, SrcRegs);
3137 } else {
// Out-of-range constant index: the result is undef.
3138 B.buildUndef(Dst);
3139 }
3140
3141 MI.eraseFromParent();
3142 return true;
3143 }
3144
// Custom lowering for G_FSIN and any other opcode routed here (treated as
// cosine): the input is multiplied by 1/(2*pi) and fed to the amdgcn_sin or
// amdgcn_cos intrinsic. On subtargets reporting hasTrigReducedRange() the
// scaled value is additionally passed through amdgcn_fract first. The
// original instruction's flags are propagated to every new instruction.
// Always returns true.
3147 MachineIRBuilder &B) const {
3148
3149 Register DstReg = MI.getOperand(0).getReg();
3150 Register SrcReg = MI.getOperand(1).getReg();
3151 LLT Ty = MRI.getType(DstReg);
3152 unsigned Flags = MI.getFlags();
3153
3154 Register TrigVal;
3155 auto OneOver2Pi = B.buildFConstant(Ty, 0.5 * numbers::inv_pi);
3156 if (ST.hasTrigReducedRange()) {
3157 auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags);
3158 TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty})
3159 .addUse(MulVal.getReg(0))
3160 .setMIFlags(Flags)
3161 .getReg(0);
3162 } else
3163 TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0);
3164
// Anything that is not G_FSIN is lowered as a cosine.
3165 Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ?
3166 Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos;
3167 B.buildIntrinsic(TrigIntrin, ArrayRef<Register>(DstReg))
3168 .addUse(TrigVal)
3169 .setMIFlags(Flags);
3170 MI.eraseFromParent();
3171 return true;
3172 }
3173
// Materializes a pc-relative address of GV (+ Offset) by emitting
// SI_PC_ADD_REL_OFFSET, or SI_PC_ADD_REL_OFFSET64 on subtargets with 64-bit
// literals. GAFlags selects the relocation flavour attached to the symbol
// operand(s). When PtrTy is 32 bits wide the address is first built in a
// temporary register and its low part is then extracted into DstReg.
// Always returns true.
3176 const GlobalValue *GV,
3177 int64_t Offset,
3178 unsigned GAFlags) const {
3179 assert(isInt<32>(Offset + 4) && "32-bit offset is expected!");
3180 // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered
3181 // to the following code sequence:
3182 //
3183 // For constant address space:
3184 // s_getpc_b64 s[0:1]
3185 // s_add_u32 s0, s0, $symbol
3186 // s_addc_u32 s1, s1, 0
3187 //
3188 // s_getpc_b64 returns the address of the s_add_u32 instruction and then
3189 // a fixup or relocation is emitted to replace $symbol with a literal
3190 // constant, which is a pc-relative offset from the encoding of the $symbol
3191 // operand to the global variable.
3192 //
3193 // For global address space:
3194 // s_getpc_b64 s[0:1]
3195 // s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo
3196 // s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi
3197 //
3198 // s_getpc_b64 returns the address of the s_add_u32 instruction and then
3199 // fixups or relocations are emitted to replace $symbol@*@lo and
3200 // $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant,
3201 // which is a 64-bit pc-relative offset from the encoding of the $symbol
3202 // operand to the global variable.
3203
// NOTE(review): ConstPtrTy is declared on a line (3204) that is absent from
// this listing; presumably a 64-bit constant-address-space pointer -- confirm.
3205
// Write straight into DstReg unless the requested pointer is 32-bit, in which
// case a temporary of type ConstPtrTy receives the full result.
3206 Register PCReg = PtrTy.getSizeInBits() != 32 ? DstReg :
3207 B.getMRI()->createGenericVirtualRegister(ConstPtrTy);
3208
3209 if (ST.has64BitLiterals()) {
3210 assert(GAFlags != SIInstrInfo::MO_NONE);
3211
// Single symbol operand; GAFlags + 2 presumably selects the 64-bit variant of
// the relocation flag -- verify against the SIInstrInfo flag enumeration.
3213 B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET64).addDef(PCReg);
3214 MIB.addGlobalAddress(GV, Offset, GAFlags + 2);
3215 } else {
3217 B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET).addDef(PCReg);
3218
// Two operands: the first carries GAFlags, the second is either an immediate
// 0 (no relocation flag) or the symbol again with GAFlags + 1 (presumably the
// matching high-half flag -- verify against the flag enumeration).
3219 MIB.addGlobalAddress(GV, Offset, GAFlags);
3220 if (GAFlags == SIInstrInfo::MO_NONE)
3221 MIB.addImm(0);
3222 else
3223 MIB.addGlobalAddress(GV, Offset, GAFlags + 1);
3224 }
3225
// Give the result a register class only if it does not already have one.
3226 if (!B.getMRI()->getRegClassOrNull(PCReg))
3227 B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass);
3228
3229 if (PtrTy.getSizeInBits() == 32)
3230 B.buildExtract(DstReg, PCReg, 0);
3231 return true;
3232 }
3233
3234 // Emit an ABS32_LO / ABS32_HI relocation stub.
// Writes the absolute address of GV into DstReg. With 64-bit literals and a
// pointer wider than 32 bits a single S_MOV_B64 with an MO_ABS64 operand is
// used. Otherwise the low half is written with S_MOV_B32 (MO_ABS32_LO) and,
// for pointers wider than 32 bits, the high half with a second S_MOV_B32
// (MO_ABS32_HI) followed by a merge. Temporaries are introduced whenever
// DstReg cannot be written directly, and the result is then cast into DstReg.
3236 Register DstReg, LLT PtrTy, MachineIRBuilder &B, const GlobalValue *GV,
3237 MachineRegisterInfo &MRI) const {
3238 bool RequiresHighHalf = PtrTy.getSizeInBits() != 32;
3239
3240 if (RequiresHighHalf && ST.has64BitLiterals()) {
3241 if (!MRI.getRegClassOrNull(DstReg))
3242 MRI.setRegClass(DstReg, &AMDGPU::SReg_64RegClass);
3243 B.buildInstr(AMDGPU::S_MOV_B64)
3244 .addDef(DstReg)
3245 .addGlobalAddress(GV, 0, SIInstrInfo::MO_ABS64);
3246 return;
3247 }
3248
3249 LLT S32 = LLT::scalar(32);
3250
3251 // Use the destination directly, if and only if we store the lower address
3252 // part only and we don't have a register class being set.
// NOTE(review): the alternative arm of this conditional (listing line 3255)
// is not shown; presumably it creates a fresh 32-bit register -- confirm.
3253 Register AddrLo = !RequiresHighHalf && !MRI.getRegClassOrNull(DstReg)
3254 ? DstReg
3256
3257 if (!MRI.getRegClassOrNull(AddrLo))
3258 MRI.setRegClass(AddrLo, &AMDGPU::SReg_32RegClass);
3259
3260 // Write the lower half.
3261 B.buildInstr(AMDGPU::S_MOV_B32)
3262 .addDef(AddrLo)
3263 .addGlobalAddress(GV, 0, SIInstrInfo::MO_ABS32_LO);
3264
3265 // If required, write the upper half as well.
3266 if (RequiresHighHalf) {
3267 assert(PtrTy.getSizeInBits() == 64 &&
3268 "Must provide a 64-bit pointer type!");
3269
// NOTE(review): AddrHi is declared on a line (3270) absent from this listing.
3271 MRI.setRegClass(AddrHi, &AMDGPU::SReg_32RegClass);
3272
3273 B.buildInstr(AMDGPU::S_MOV_B32)
3274 .addDef(AddrHi)
3275 .addGlobalAddress(GV, 0, SIInstrInfo::MO_ABS32_HI);
3276
3277 // Use the destination directly, if and only if we don't have a register
3278 // class being set.
// NOTE(review): the alternative arm (listing line 3281) is not shown.
3279 Register AddrDst = !MRI.getRegClassOrNull(DstReg)
3280 ? DstReg
3282
3283 if (!MRI.getRegClassOrNull(AddrDst))
3284 MRI.setRegClass(AddrDst, &AMDGPU::SReg_64RegClass);
3285
3286 B.buildMergeValues(AddrDst, {AddrLo, AddrHi});
3287
3288 // If we created a new register for the destination, cast the result into
3289 // the final output.
3290 if (AddrDst != DstReg)
3291 B.buildCast(DstReg, AddrDst);
3292 } else if (AddrLo != DstReg) {
3293 // If we created a new register for the destination, cast the result into
3294 // the final output.
3295 B.buildCast(DstReg, AddrLo);
3296 }
3297 }
3298
// Custom lowering for a global-address instruction: operand 0 is the
// destination pointer and operand 1 the GlobalValue. The visible code handles,
// in order:
//  - LDS-style globals (the enclosing address-space check is on a line absent
//    from this listing): warn + trap for uses from non-entry functions, keep
//    the instruction with an MO_ABS32_LO flag, resolve dynamic shared memory
//    to the static group size, or fold to the allocated LDS offset;
//  - AMDPAL / Mesa3D targets: absolute address via buildAbsGlobalAddress;
//  - fixup and pc-relative relocation cases via buildPCRelGlobalAddress;
//  - everything else: load the address from a GOT entry.
// Every visible path returns true.
3301 MachineIRBuilder &B) const {
3302 Register DstReg = MI.getOperand(0).getReg();
3303 LLT Ty = MRI.getType(DstReg);
3304 unsigned AS = Ty.getAddressSpace();
3305
3306 const GlobalValue *GV = MI.getOperand(1).getGlobal();
3307 MachineFunction &MF = B.getMF();
3309
// NOTE(review): the declaration of MFI, the address-space condition that
// opens this region, and part of the condition/diagnostic below (listing
// lines 3308, 3310, 3313, 3315) are not shown in this listing.
3311 if (!MFI->isModuleEntryFunction() &&
3312 GV->getName() != "llvm.amdgcn.module.lds" &&
3314 const Function &Fn = MF.getFunction();
3316 Fn, "local memory global used by non-kernel function",
3317 MI.getDebugLoc(), DS_Warning));
3318
3319 // We currently don't have a way to correctly allocate LDS objects that
3320 // aren't directly associated with a kernel. We do force inlining of
3321 // functions that use local objects. However, if these dead functions are
3322 // not eliminated, we don't want a compile time error. Just emit a warning
3323 // and a trap, since there should be no callable path here.
3324 B.buildTrap();
3325 B.buildUndef(DstReg);
3326 MI.eraseFromParent();
3327 return true;
3328 }
3329
3330 // TODO: We could emit code to handle the initialization somewhere.
3331 // We ignore the initializer for now and legalize it to allow selection.
3332 // The initializer will anyway get errored out during assembly emission.
3333 const SITargetLowering *TLI = ST.getTargetLowering();
3334 if (!TLI->shouldUseLDSConstAddress(GV)) {
3335 MI.getOperand(1).setTargetFlags(SIInstrInfo::MO_ABS32_LO);
3336 return true; // Leave in place;
3337 }
3338
3339 const GlobalVariable &GVar = *cast<GlobalVariable>(GV);
3340 if (AS == AMDGPUAS::LOCAL_ADDRESS && GV->hasExternalLinkage()) {
3341 // HIP uses an unsized array `extern __shared__ T s[]` or similar
3342 // zero-sized type in other languages to declare the dynamic shared
3343 // memory which size is not known at the compile time. They will be
3344 // allocated by the runtime and placed directly after the static
3345 // allocated ones. They all share the same offset.
3346 if (GVar.getGlobalSize(GVar.getDataLayout()) == 0) {
3347 // Adjust alignment for that dynamic shared memory array.
3348 MFI->setDynLDSAlign(MF.getFunction(), GVar);
3349 LLT S32 = LLT::scalar(32);
3350 auto Sz = B.buildIntrinsic(Intrinsic::amdgcn_groupstaticsize, {S32});
3351 B.buildIntToPtr(DstReg, Sz);
3352 MI.eraseFromParent();
3353 return true;
3354 }
3355 }
3356
// Statically sized LDS global: its address is the constant offset returned by
// the allocator.
3357 B.buildConstant(DstReg, MFI->allocateLDSGlobal(B.getDataLayout(), GVar));
3358 MI.eraseFromParent();
3359 return true;
3360 }
3361
3362 if (ST.isAmdPalOS() || ST.isMesa3DOS()) {
3363 buildAbsGlobalAddress(DstReg, Ty, B, GV, MRI);
3364 MI.eraseFromParent();
3365 return true;
3366 }
3367
3368 const SITargetLowering *TLI = ST.getTargetLowering();
3369
3370 if (TLI->shouldEmitFixup(GV)) {
3371 buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0);
3372 MI.eraseFromParent();
3373 return true;
3374 }
3375
3376 if (TLI->shouldEmitPCReloc(GV)) {
3377 buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32);
3378 MI.eraseFromParent();
3379 return true;
3380 }
3381
// GOT path. NOTE(review): the declaration of PtrTy and the start of the
// GOTMMO memory-operand construction (listing lines 3382, 3386-3389) are not
// shown in this listing.
3383 Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy);
3384
3385 LLT LoadTy = Ty.getSizeInBits() == 32 ? PtrTy : Ty;
3390 LoadTy, Align(8));
3391
3392 buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32);
3393
3394 if (Ty.getSizeInBits() == 32) {
3395 // Truncate if this is a 32-bit constant address.
3396 auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO);
3397 B.buildExtract(DstReg, Load, 0);
3398 } else
3399 B.buildLoad(DstReg, GOTAddr, *GOTMMO);
3400
3401 MI.eraseFromParent();
3402 return true;
3403 }
3404
// Returns Ty widened to a power of two: for vectors the element count is
// rounded up with PowerOf2Ceil (element type unchanged); for everything else
// a scalar whose bit width is the rounded-up size of Ty is returned.
// (The header line of this helper is absent from this listing.)
3406 if (Ty.isVector())
3407 return Ty.changeElementCount(
3408 ElementCount::getFixed(PowerOf2Ceil(Ty.getNumElements())));
3409 return LLT::scalar(PowerOf2Ceil(Ty.getSizeInBits()));
3410 }
3411
// Custom legalization for load-like instructions (operand 0 = value,
// operand 1 = pointer). Visible behavior:
//  - pointers in the 32-bit constant address space get an addrspacecast and
//    the instruction is updated in place;
//  - opcodes other than G_LOAD are otherwise rejected (returns false);
//  - value types needing the buffer-resource workaround are cast in place;
//  - loads that shouldWidenLoad() accepts are widened to the next power of
//    two, either by only adjusting the memory operand or by emitting a wider
//    load followed by trunc / extract / trailing-element deletion.
// Returns true if the instruction was changed or replaced, false otherwise.
3413 MachineInstr &MI) const {
3414 MachineIRBuilder &B = Helper.MIRBuilder;
3415 MachineRegisterInfo &MRI = *B.getMRI();
3416 GISelChangeObserver &Observer = Helper.Observer;
3417
3418 Register PtrReg = MI.getOperand(1).getReg();
3419 LLT PtrTy = MRI.getType(PtrReg);
3420 unsigned AddrSpace = PtrTy.getAddressSpace();
3421
3422 if (AddrSpace == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
// NOTE(review): ConstPtr is declared on a line (3423) absent from this
// listing; it is the pointer type the address is cast to.
3424 auto Cast = B.buildAddrSpaceCast(ConstPtr, PtrReg);
3425 Observer.changingInstr(MI);
3426 MI.getOperand(1).setReg(Cast.getReg(0));
3427 Observer.changedInstr(MI);
3428 return true;
3429 }
3430
3431 if (MI.getOpcode() != AMDGPU::G_LOAD)
3432 return false;
3433
3434 Register ValReg = MI.getOperand(0).getReg();
3435 LLT ValTy = MRI.getType(ValReg);
3436
3437 if (hasBufferRsrcWorkaround(ValTy)) {
3438 Observer.changingInstr(MI);
3439 castBufferRsrcFromV4I32(MI, B, MRI, 0);
3440 Observer.changedInstr(MI);
3441 return true;
3442 }
3443
// Only the first memory operand is consulted.
3444 MachineMemOperand *MMO = *MI.memoperands_begin();
3445 const unsigned ValSize = ValTy.getSizeInBits();
3446 const LLT MemTy = MMO->getMemoryType();
3447 const Align MemAlign = MMO->getAlign();
3448 const unsigned MemSize = MemTy.getSizeInBits();
3449 const uint64_t AlignInBits = 8 * MemAlign.value();
3450
3451 // Widen non-power-of-2 loads to the alignment if needed
3452 if (shouldWidenLoad(ST, MemTy, AlignInBits, AddrSpace, MI.getOpcode())) {
3453 const unsigned WideMemSize = PowerOf2Ceil(MemSize);
3454
3455 // This was already the correct extending load result type, so just adjust
3456 // the memory type.
3457 if (WideMemSize == ValSize) {
3458 MachineFunction &MF = B.getMF();
3459
3460 MachineMemOperand *WideMMO =
3461 MF.getMachineMemOperand(MMO, 0, WideMemSize / 8);
3462 Observer.changingInstr(MI);
3463 MI.setMemRefs(MF, {WideMMO});
3464 Observer.changedInstr(MI);
3465 return true;
3466 }
3467
3468 // Don't bother handling edge case that should probably never be produced.
3469 if (ValSize > WideMemSize)
3470 return false;
3471
3472 LLT WideTy = widenToNextPowerOf2(ValTy);
3473
3474 Register WideLoad;
3475 if (!WideTy.isVector()) {
// Scalar: load the wider value and truncate back to the requested width.
3476 WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
3477 B.buildTrunc(ValReg, WideLoad).getReg(0);
3478 } else {
3479 // Extract the subvector.
3480
3481 if (isRegisterType(ST, ValTy)) {
3482 // If this is a case where G_EXTRACT is legal, use it.
3483 // (e.g. <3 x s32> -> <4 x s32>)
3484 WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
3485 B.buildExtract(ValReg, WideLoad, 0);
3486 } else {
3487 // For cases where the widened type isn't a nice register value, unmerge
3488 // from a widened register (e.g. <3 x s16> -> <4 x s16>)
3489 WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
3490 B.buildDeleteTrailingVectorElements(ValReg, WideLoad);
3491 }
3492 }
3493
3494 MI.eraseFromParent();
3495 return true;
3496 }
3497
3498 return false;
3499 }
3500
// Custom legalization for stores (operand 0 = stored value). Only value
// types that need the buffer-resource workaround are handled: the
// instruction is modified in place between the observer notifications and
// true is returned. All other stores return false.
3502 MachineInstr &MI) const {
3503 MachineIRBuilder &B = Helper.MIRBuilder;
3504 MachineRegisterInfo &MRI = *B.getMRI();
3505 GISelChangeObserver &Observer = Helper.Observer;
3506
3507 Register DataReg = MI.getOperand(0).getReg();
3508 LLT DataTy = MRI.getType(DataReg);
3509
3510 if (hasBufferRsrcWorkaround(DataTy)) {
3511 Observer.changingInstr(MI);
// NOTE(review): the statement that performs the actual cast (listing line
// 3512) is not shown in this listing.
3513 Observer.changedInstr(MI);
3514 return true;
3515 }
3516 return false;
3517 }
3518
// Custom legalization for a scalar fused multiply-add style instruction
// (operand 0 = result). For 32-bit and 16-bit results the instruction is
// kept as-is when an additional condition holds; otherwise it is expanded
// with LegalizerHelper::lowerFMad using a private builder and a dummy
// observer. Returns true if the instruction is kept or was lowered.
3521 MachineIRBuilder &B) const {
3522 LLT Ty = MRI.getType(MI.getOperand(0).getReg());
3523 assert(Ty.isScalar());
3524
3525 MachineFunction &MF = B.getMF();
3527
3528 // TODO: Always legal with future ftz flag.
3529 // TODO: Type is expected to be LLT::float32()/LLT::float16()
3530 // FIXME: Do we need just output?
// NOTE(review): the second half of each condition below (listing lines 3532
// and 3535) is not shown; presumably a per-type denormal-mode check --
// confirm.
3531 if (Ty == LLT::scalar(32) &&
3533 return true;
3534 if (Ty == LLT::scalar(16) &&
3536 return true;
3537
3538 MachineIRBuilder HelperBuilder(MI);
3539 GISelObserverWrapper DummyObserver;
3540 LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
3541 return Helper.lowerFMad(MI) == LegalizerHelper::Legalized;
3542 }
3543
// Custom lowering for an atomic compare-and-exchange: operand 0 is the
// result, operand 1 the pointer, operand 2 the compare value and operand 3
// the new value. The new and compare values are packed, in that order, into
// a two-element vector which becomes the single data operand of
// G_AMDGPU_ATOMIC_CMPXCHG; the original memory operands are carried over.
// Always returns true.
3546 Register DstReg = MI.getOperand(0).getReg();
3547 Register PtrReg = MI.getOperand(1).getReg();
3548 Register CmpVal = MI.getOperand(2).getReg();
3549 Register NewVal = MI.getOperand(3).getReg();
3550
// NOTE(review): the opening of this assertion (listing line 3551) is not
// shown in this listing.
3552 "this should not have been custom lowered");
3553
3554 LLT ValTy = MRI.getType(CmpVal);
3555 LLT VecTy = LLT::fixed_vector(2, ValTy);
3556
3557 Register PackedVal = B.buildBuildVector(VecTy, { NewVal, CmpVal }).getReg(0);
3558
3559 B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG)
3560 .addDef(DstReg)
3561 .addUse(PtrReg)
3562 .addUse(PackedVal)
3563 .setMemRefs(MI.memoperands());
3564
3565 MI.eraseFromParent();
3566 return true;
3567 }
3568
3569 /// Return true if it's known that \p Src can never be an f32 denormal value.
// The decision is based solely on the opcode (and, for intrinsics, the
// intrinsic ID) of the instruction defining Src; anything not explicitly
// listed is conservatively answered with false.
3571 Register Src) {
3572 const MachineInstr *DefMI = MRI.getVRegDef(Src);
3573 switch (DefMI->getOpcode()) {
3574 case TargetOpcode::G_INTRINSIC: {
// NOTE(review): the inner switch header (listing line 3575) is not shown;
// the cases below are intrinsic IDs.
3576 case Intrinsic::amdgcn_frexp_mant:
3577 case Intrinsic::amdgcn_log:
3578 case Intrinsic::amdgcn_log_clamp:
3579 case Intrinsic::amdgcn_exp2:
3580 case Intrinsic::amdgcn_sqrt:
3581 return true;
3582 default:
3583 break;
3584 }
3585
3586 break;
3587 }
3588 case TargetOpcode::G_FSQRT:
3589 return true;
3590 case TargetOpcode::G_FFREXP: {
// Only the first result (operand 0) of G_FFREXP qualifies.
3591 if (DefMI->getOperand(0).getReg() == Src)
3592 return true;
3593 break;
3594 }
3595 case TargetOpcode::G_FPEXT: {
// An extension qualifies only when its source is 16 bits wide.
3596 return MRI.getType(DefMI->getOperand(1).getReg()) == LLT::scalar(16);
3597 }
3598 default:
3599 return false;
3600 }
3601
3602 return false;
3603 }
3604
// Returns true when the approximate-functions fast-math flag (FmAfn) is set
// in Flags. MF is accepted for interface compatibility and is not consulted.
3605 static bool allowApproxFunc(const MachineFunction &MF, unsigned Flags) {
3606 return (Flags & MachineInstr::FmAfn) != 0;
3607 }
3608
// Decides whether an f32 operation on Src needs explicit denormal handling.
// The visible part of the condition requires that Src is not already known
// to be free of f32 denormals.
// NOTE(review): the header line and the remainder of the condition (listing
// lines 3609, 3612-3613) are not shown in this listing.
3610 unsigned Flags) {
3611 return !valueIsKnownNeverF32Denorm(MF.getRegInfo(), Src) &&
3614 }
3615
// Builds the input pre-scaling used by the f32 logarithm lowerings.
// Returns an empty (default-constructed) pair when no denormal handling is
// needed for Src. Otherwise returns {Src * (Src < SmallestNormal ? 2^32 : 1),
// the i1 result of that ordered less-than comparison} so callers can both
// feed the scaled value to the log intrinsic and undo the scaling afterwards.
3616 std::pair<Register, Register>
3618 unsigned Flags) const {
3619 if (!needsDenormHandlingF32(B.getMF(), Src, Flags))
3620 return {};
3621
3622 const LLT F32 = LLT::scalar(32);
// NOTE(review): the constant's value (listing line 3624) is not shown; from
// the variable name it is presumably the smallest normalized f32 -- confirm.
3623 auto SmallestNormal = B.buildFConstant(
3625 auto IsLtSmallestNormal =
3626 B.buildFCmp(CmpInst::FCMP_OLT, LLT::scalar(1), Src, SmallestNormal);
3627
3628 auto Scale32 = B.buildFConstant(F32, 0x1.0p+32);
3629 auto One = B.buildFConstant(F32, 1.0);
3630 auto ScaleFactor =
3631 B.buildSelect(F32, IsLtSmallestNormal, Scale32, One, Flags);
3632 auto ScaledInput = B.buildFMul(F32, Src, ScaleFactor, Flags);
3633
3634 return {ScaledInput.getReg(0), IsLtSmallestNormal.getReg(0)};
3635 }
3636
// Custom lowering for a base-2 logarithm (operand 0 = result, operand 1 =
// input) onto the amdgcn_log intrinsic.
//  - 16-bit: extend to f32, take the log, truncate back.
//  - 32-bit without denormal handling: emit the intrinsic directly.
//  - 32-bit with denormal handling: take the log of the pre-scaled input and
//    subtract 32.0 when the input was scaled.
// Always returns true.
3638 MachineIRBuilder &B) const {
3639 // v_log_f32 is good enough for OpenCL, except it doesn't handle denormals.
3640 // If we have to handle denormals, scale up the input and adjust the result.
3641
3642 // scaled = x * (is_denormal ? 0x1.0p+32 : 1.0)
3643 // log2 = amdgpu_log2 - (is_denormal ? 32.0 : 0.0)
3644
3645 Register Dst = MI.getOperand(0).getReg();
3646 Register Src = MI.getOperand(1).getReg();
3647 LLT Ty = B.getMRI()->getType(Dst);
3648 unsigned Flags = MI.getFlags();
3649
3650 if (Ty == LLT::scalar(16)) {
3651 const LLT F32 = LLT::scalar(32);
3652 // Nothing in half is a denormal when promoted to f32.
3653 auto Ext = B.buildFPExt(F32, Src, Flags);
3654 auto Log2 = B.buildIntrinsic(Intrinsic::amdgcn_log, {F32})
3655 .addUse(Ext.getReg(0))
3656 .setMIFlags(Flags);
3657 B.buildFPTrunc(Dst, Log2, Flags);
3658 MI.eraseFromParent();
3659 return true;
3660 }
3661
3662 assert(Ty == LLT::scalar(32));
3663
3664 auto [ScaledInput, IsLtSmallestNormal] = getScaledLogInput(B, Src, Flags);
3665 if (!ScaledInput) {
// No scaling required: the intrinsic writes the original destination.
3666 B.buildIntrinsic(Intrinsic::amdgcn_log, {MI.getOperand(0)})
3667 .addUse(Src)
3668 .setMIFlags(Flags);
3669 MI.eraseFromParent();
3670 return true;
3671 }
3672
3673 auto Log2 = B.buildIntrinsic(Intrinsic::amdgcn_log, {Ty})
3674 .addUse(ScaledInput)
3675 .setMIFlags(Flags);
3676
// log2(x * 2^32) == log2(x) + 32, so remove 32 again when scaling happened.
3677 auto ThirtyTwo = B.buildFConstant(Ty, 32.0);
3678 auto Zero = B.buildFConstant(Ty, 0.0);
3679 auto ResultOffset =
3680 B.buildSelect(Ty, IsLtSmallestNormal, ThirtyTwo, Zero, Flags);
3681 B.buildFSub(Dst, Log2, ResultOffset, Flags);
3682
3683 MI.eraseFromParent();
3684 return true;
3685 }
3686
// Builds X * Y + Z as a separate multiply followed by an add (not a fused
// operation), both of type Ty and carrying Flags, and returns the register
// holding the sum. (The header line is absent from this listing.)
3688 Register Z, unsigned Flags) {
3689 auto FMul = B.buildFMul(Ty, X, Y, Flags);
3690 return B.buildFAdd(Ty, FMul, Z, Flags).getReg(0);
3691 }
3692
// Custom lowering shared by the natural logarithm (G_FLOG) and the base-10
// logarithm (G_FLOG10); operand 0 is the result and operand 1 the input.
//  - 16-bit inputs, or any input with the afn flag, use the cheaper
//    legalizeFlogUnsafe expansion (optionally promoted to f32).
//  - Otherwise log2 is computed with amdgcn_log (on a pre-scaled input when
//    denormal handling is needed) and multiplied by ln(2) or ln(2)/ln(10)
//    using a two-constant split for extra precision; non-finite log2 results
//    are passed through unchanged unless a finite-only condition holds, and
//    the scaling is compensated by a final subtraction.
// Always returns true.
3694 MachineIRBuilder &B) const {
3695 const bool IsLog10 = MI.getOpcode() == TargetOpcode::G_FLOG10;
3696 assert(IsLog10 || MI.getOpcode() == TargetOpcode::G_FLOG);
3697
3698 MachineRegisterInfo &MRI = *B.getMRI();
3699 Register Dst = MI.getOperand(0).getReg();
3700 Register X = MI.getOperand(1).getReg();
3701 unsigned Flags = MI.getFlags();
3702 const LLT Ty = MRI.getType(X);
3703
3704 const LLT F32 = LLT::scalar(32);
3705 const LLT F16 = LLT::scalar(16);
3706
3707 if (Ty == F16 || MI.getFlag(MachineInstr::FmAfn)) {
3708 // TODO: The direct f16 path is 1.79 ulp for f16. This should be used
3709 // depending on !fpmath metadata.
3710 bool PromoteToF32 =
3711 Ty == F16 && (!MI.getFlag(MachineInstr::FmAfn) || !ST.has16BitInsts());
3712 if (PromoteToF32) {
// NOTE(review): LogVal is declared on a line (3713) absent from this
// listing; it receives the f32 result that is truncated into Dst below.
3714 auto PromoteSrc = B.buildFPExt(F32, X);
3715 legalizeFlogUnsafe(B, LogVal, PromoteSrc.getReg(0), IsLog10, Flags);
3716 B.buildFPTrunc(Dst, LogVal);
3717 } else {
3718 legalizeFlogUnsafe(B, Dst, X, IsLog10, Flags);
3719 }
3720
3721 MI.eraseFromParent();
3722 return true;
3723 }
3724
3725 auto [ScaledInput, IsScaled] = getScaledLogInput(B, X, Flags);
3726 if (ScaledInput)
3727 X = ScaledInput;
3728
// Y = log2(X), where X may already have been pre-scaled above.
3729 auto Y =
3730 B.buildIntrinsic(Intrinsic::amdgcn_log, {Ty}).addUse(X).setMIFlags(Flags);
3731
3732 Register R;
3733 if (ST.hasFastFMAF32()) {
3734 // c+cc are ln(2)/ln(10) to more than 49 bits
3735 const float c_log10 = 0x1.344134p-2f;
3736 const float cc_log10 = 0x1.09f79ep-26f;
3737
3738 // c + cc is ln(2) to more than 49 bits
3739 const float c_log = 0x1.62e42ep-1f;
3740 const float cc_log = 0x1.efa39ep-25f;
3741
3742 auto C = B.buildFConstant(Ty, IsLog10 ? c_log10 : c_log);
3743 auto CC = B.buildFConstant(Ty, IsLog10 ? cc_log10 : cc_log);
3744 // This adds correction terms for which contraction may lead to an increase
3745 // in the error of the approximation, so disable it.
3746 auto NewFlags = Flags & ~(MachineInstr::FmContract);
// R = Y*C, then add fma(Y, CC, fma(Y, C, -R)): the inner fma recovers the
// rounding error of the first product, the outer one adds the low-order
// constant's contribution.
3747 R = B.buildFMul(Ty, Y, C, NewFlags).getReg(0);
3748 auto NegR = B.buildFNeg(Ty, R, NewFlags);
3749 auto FMA0 = B.buildFMA(Ty, Y, C, NegR, NewFlags);
3750 auto FMA1 = B.buildFMA(Ty, Y, CC, FMA0, NewFlags);
3751 R = B.buildFAdd(Ty, R, FMA1, NewFlags).getReg(0);
3752 } else {
3753 // ch+ct is ln(2)/ln(10) to more than 36 bits
3754 const float ch_log10 = 0x1.344000p-2f;
3755 const float ct_log10 = 0x1.3509f6p-18f;
3756
3757 // ch + ct is ln(2) to more than 36 bits
3758 const float ch_log = 0x1.62e000p-1f;
3759 const float ct_log = 0x1.0bfbe8p-15f;
3760
3761 auto CH = B.buildFConstant(Ty, IsLog10 ? ch_log10 : ch_log);
3762 auto CT = B.buildFConstant(Ty, IsLog10 ? ct_log10 : ct_log);
3763
// Split Y into a high part (low 12 bits of the encoding cleared by the mask)
// and the remaining tail, then accumulate the four partial products.
3764 auto MaskConst = B.buildConstant(Ty, 0xfffff000);
3765 auto YH = B.buildAnd(Ty, Y, MaskConst);
3766 auto YT = B.buildFSub(Ty, Y, YH, Flags);
3767 // This adds correction terms for which contraction may lead to an increase
3768 // in the error of the approximation, so disable it.
3769 auto NewFlags = Flags & ~(MachineInstr::FmContract);
3770 auto YTCT = B.buildFMul(Ty, YT, CT, NewFlags);
3771
3772 Register Mad0 =
3773 getMad(B, Ty, YH.getReg(0), CT.getReg(0), YTCT.getReg(0), NewFlags);
3774 Register Mad1 = getMad(B, Ty, YT.getReg(0), CH.getReg(0), Mad0, NewFlags);
3775 R = getMad(B, Ty, YH.getReg(0), CH.getReg(0), Mad1, NewFlags);
3776 }
3777
// NOTE(review): the initializer of IsFiniteOnly (listing line 3779) is not
// shown in this listing.
3778 const bool IsFiniteOnly =
3780
3781 if (!IsFiniteOnly) {
3782 // Expand isfinite(x) => fabs(x) < inf
3783 auto Inf = B.buildFConstant(Ty, APFloat::getInf(APFloat::IEEEsingle()));
3784 auto Fabs = B.buildFAbs(Ty, Y);
3785 auto IsFinite =
3786 B.buildFCmp(CmpInst::FCMP_OLT, LLT::scalar(1), Fabs, Inf, Flags);
// When Y is not finite, keep Y itself instead of the computed product.
3787 R = B.buildSelect(Ty, IsFinite, R, Y, Flags).getReg(0);
3788 }
3789
3790 if (ScaledInput) {
// Undo the input pre-scaling: subtract a per-base constant (presumably
// 32*ln(2) resp. 32*ln(2)/ln(10) -- matches the literals numerically) only
// when the input was actually scaled.
3791 auto Zero = B.buildFConstant(Ty, 0.0);
3792 auto ShiftK =
3793 B.buildFConstant(Ty, IsLog10 ? 0x1.344136p+3f : 0x1.62e430p+4f);
3794 auto Shift = B.buildSelect(Ty, IsScaled, ShiftK, Zero, Flags);
3795 B.buildFSub(Dst, R, Shift, Flags);
3796 } else {
3797 B.buildCopy(Dst, R);
3798 }
3799
3800 MI.eraseFromParent();
3801 return true;
3802 }
3803
// Fast (reduced-accuracy) expansion of log / log10 as
//   log2(Src) * Log2BaseInverted
// written into Dst. IsLog10 selects the base; Flags are attached to every
// instruction built here. For f32 values that need denormal handling the
// input is pre-scaled by getScaledLogInput and the result is corrected:
//   log2(scaled) * K + (is_scaled ? -32 * K : 0)
// Always returns true.
3805 Register Src, bool IsLog10,
3806 unsigned Flags) const {
// NOTE(review): the initializer of Log2BaseInverted (listing line 3808) is
// not shown in this listing; it is the factor converting a base-2 logarithm
// to the requested base.
3807 const double Log2BaseInverted =
3809
3810 LLT Ty = B.getMRI()->getType(Dst);
3811
3812 if (Ty == LLT::scalar(32)) {
3813 auto [ScaledInput, IsScaled] = getScaledLogInput(B, Src, Flags);
3814 if (ScaledInput) {
// The logarithm must be taken of the *scaled* input: the offset selected
// below compensates for exactly that scaling. Feeding the unscaled Src here
// would both mishandle denormal inputs and apply a correction for a scaling
// that never happened.
3815 auto LogSrc = B.buildIntrinsic(Intrinsic::amdgcn_log, {Ty})
3816 .addUse(ScaledInput)
3817 .setMIFlags(Flags);
3818 auto ScaledResultOffset = B.buildFConstant(Ty, -32.0 * Log2BaseInverted);
3819 auto Zero = B.buildFConstant(Ty, 0.0);
3820 auto ResultOffset =
3821 B.buildSelect(Ty, IsScaled, ScaledResultOffset, Zero, Flags);
3822 auto Log2Inv = B.buildFConstant(Ty, Log2BaseInverted);
3823
// Use a fused multiply-add where it is fast, otherwise multiply then add.
3824 if (ST.hasFastFMAF32())
3825 B.buildFMA(Dst, LogSrc, Log2Inv, ResultOffset, Flags);
3826 else {
3827 auto Mul = B.buildFMul(Ty, LogSrc, Log2Inv, Flags);
3828 B.buildFAdd(Dst, Mul, ResultOffset, Flags);
3829 }
3830
3831 return true;
3832 }
3833 }
3834
// No scaling needed: 16-bit values use the generic G_FLOG2, everything else
// the amdgcn_log intrinsic, followed by the base-conversion multiply.
3835 auto Log2Operand = Ty == LLT::scalar(16)
3836 ? B.buildFLog2(Ty, Src, Flags)
3837 : B.buildIntrinsic(Intrinsic::amdgcn_log, {Ty})
3838 .addUse(Src)
3839 .setMIFlags(Flags);
3840 auto Log2BaseInvertedOperand = B.buildFConstant(Ty, Log2BaseInverted);
3841 B.buildFMul(Dst, Log2Operand, Log2BaseInvertedOperand, Flags);
3842 return true;
3843 }
3844
// Custom lowering for a base-2 exponential (operand 0 = result, operand 1 =
// input).
//  - 64-bit: delegated to legalizeFEXPF64.
//  - 16-bit: extend to f32, apply amdgcn_exp2, truncate back.
//  - 32-bit without denormal handling: emit amdgcn_exp2 directly.
//  - 32-bit with denormal handling: for inputs below the range-check
//    constant, add 64 to the input and multiply the result by 2^-64.
// Returns true on every path visible here; the 64-bit path returns whatever
// legalizeFEXPF64 returns.
3846 MachineIRBuilder &B) const {
3847 // v_exp_f32 is good enough for OpenCL, except it doesn't handle denormals.
3848 // If we have to handle denormals, scale up the input and adjust the result.
3849
3850 Register Dst = MI.getOperand(0).getReg();
3851 Register Src = MI.getOperand(1).getReg();
3852 unsigned Flags = MI.getFlags();
3853 LLT Ty = B.getMRI()->getType(Dst);
3854 const LLT F16 = LLT::scalar(16);
3855 const LLT F32 = LLT::scalar(32);
3856 const LLT F64 = LLT::scalar(64);
3857
3858 if (Ty == F64)
3859 return legalizeFEXPF64(MI, B);
3860
3861 if (Ty == F16) {
3862 // Nothing in half is a denormal when promoted to f32.
3863 auto Ext = B.buildFPExt(F32, Src, Flags);
// Despite its name, this local holds the exp2 result.
3864 auto Log2 = B.buildIntrinsic(Intrinsic::amdgcn_exp2, {F32})
3865 .addUse(Ext.getReg(0))
3866 .setMIFlags(Flags);
3867 B.buildFPTrunc(Dst, Log2, Flags);
3868 MI.eraseFromParent();
3869 return true;
3870 }
3871
3872 assert(Ty == F32);
3873
3874 if (!needsDenormHandlingF32(B.getMF(), Src, Flags)) {
3875 B.buildIntrinsic(Intrinsic::amdgcn_exp2, ArrayRef<Register>{Dst})
3876 .addUse(Src)
3877 .setMIFlags(Flags);
3878 MI.eraseFromParent();
3879 return true;
3880 }
3881
3882 // bool needs_scaling = x < -0x1.f80000p+6f;
3883 // v_exp_f32(x + (s ? 0x1.0p+6f : 0.0f)) * (s ? 0x1.0p-64f : 1.0f);
3884
3885 // -nextafter(128.0, -1)
3886 auto RangeCheckConst = B.buildFConstant(Ty, -0x1.f80000p+6f);
3887 auto NeedsScaling = B.buildFCmp(CmpInst::FCMP_OLT, LLT::scalar(1), Src,
3888 RangeCheckConst, Flags);
3889
// Shift the input up by 64 only when scaling is required.
3890 auto SixtyFour = B.buildFConstant(Ty, 0x1.0p+6f);
3891 auto Zero = B.buildFConstant(Ty, 0.0);
3892 auto AddOffset = B.buildSelect(F32, NeedsScaling, SixtyFour, Zero, Flags);
3893 auto AddInput = B.buildFAdd(F32, Src, AddOffset, Flags);
3894
3895 auto Exp2 = B.buildIntrinsic(Intrinsic::amdgcn_exp2, {Ty})
3896 .addUse(AddInput.getReg(0))
3897 .setMIFlags(Flags);
3898
// exp2(x + 64) * 2^-64 == exp2(x): undo the input shift on the result.
3899 auto TwoExpNeg64 = B.buildFConstant(Ty, 0x1.0p-64f);
3900 auto One = B.buildFConstant(Ty, 1.0);
3901 auto ResultScale = B.buildSelect(F32, NeedsScaling, TwoExpNeg64, One, Flags);
3902 B.buildFMul(Dst, Exp2, ResultScale, Flags);
3903 MI.eraseFromParent();
3904 return true;
3905 }
3906
// Builds a base-2 exponential of Src into Dst: 32-bit results use the
// amdgcn_exp2 intrinsic directly, every other type uses the generic
// buildFExp2. Flags are attached to the new instruction, whose builder is
// returned. (The header line is absent from this listing.)
3908 const SrcOp &Src, unsigned Flags) {
3909 LLT Ty = Dst.getLLTTy(*B.getMRI());
3910
3911 if (Ty == LLT::scalar(32)) {
3912 return B.buildIntrinsic(Intrinsic::amdgcn_exp2, {Dst})
3913 .addUse(Src.getReg())
3914 .setMIFlags(Flags);
3915 }
3916 return B.buildFExp2(Dst, Src, Flags);
3917 }
3918
// Fast expansion of exp(X) or exp10(X) (selected by IsExp10) into Dst as
// exp2(K * X), where K is log2(e) or the log2(10) literal below. No range or
// denormal handling is performed here. Always returns true.
3920 Register Dst, Register X,
3921 unsigned Flags,
3922 bool IsExp10) const {
3923 LLT Ty = B.getMRI()->getType(X);
3924
3925 // exp(x) -> exp2(M_LOG2E_F * x);
3926 // exp10(x) -> exp2(log2(10) * x);
3927 auto Const = B.buildFConstant(Ty, IsExp10 ? 0x1.a934f0p+1f : numbers::log2e);
3928 auto Mul = B.buildFMul(Ty, X, Const, Flags);
3929 buildExp(B, Dst, Mul, Flags);
3930 return true;
3931 }
3932
// Approximate lowering of exp(X) into Dst.
// NOTE(review): the first line of the signature is missing from this listing;
// callers below pass (B, Dst, X, Flags).
//
// For non-f32 types, or when needsDenormHandlingF32 reports that no denormal
// handling is required, this defers to legalizeFExpUnsafeImpl. Otherwise very
// negative inputs are shifted up by 64 before the exp2 and the result is
// scaled back down afterwards.
//
// \returns true unconditionally. The original instruction is not erased here.
3934 Register X, unsigned Flags) const {
3935 LLT Ty = B.getMRI()->getType(Dst);
3936 LLT F32 = LLT::scalar(32);
3937
3938 if (Ty != F32 || !needsDenormHandlingF32(B.getMF(), X, Flags)) {
3939 return legalizeFExpUnsafeImpl(B, Dst, X, Flags, /*IsExp10=*/false);
3940 }
3941
// Inputs below the threshold get the +64 offset applied before the multiply.
3942 auto Threshold = B.buildFConstant(Ty, -0x1.5d58a0p+6f);
3943 auto NeedsScaling =
3944 B.buildFCmp(CmpInst::FCMP_OLT, LLT::scalar(1), X, Threshold, Flags);
3945 auto ScaleOffset = B.buildFConstant(Ty, 0x1.0p+6f);
3946 auto ScaledX = B.buildFAdd(Ty, X, ScaleOffset, Flags);
3947 auto AdjustedX = B.buildSelect(Ty, NeedsScaling, ScaledX, X, Flags);
3948
3949 auto Log2E = B.buildFConstant(Ty, numbers::log2e);
3950 auto ExpInput = B.buildFMul(Ty, AdjustedX, Log2E, Flags);
3951
3952 auto Exp2 = B.buildIntrinsic(Intrinsic::amdgcn_exp2, {Ty})
3953 .addUse(ExpInput.getReg(0))
3954 .setMIFlags(Flags);
3955
// 0x1.969d48p-93 is approximately e^-64, which undoes the +64 input offset.
3956 auto ResultScaleFactor = B.buildFConstant(Ty, 0x1.969d48p-93f);
3957 auto AdjustedResult = B.buildFMul(Ty, Exp2, ResultScaleFactor, Flags);
3958 B.buildSelect(Dst, NeedsScaling, AdjustedResult, Exp2, Flags);
3959 return true;
3960}
3961
// Approximate lowering of exp10(X) into Dst as a product of two exp2 calls.
// NOTE(review): the first line of the signature is missing from this listing;
// callers below pass (B, Dst, X, Flags).
//
// The multiplier is split into two constants (K0 + K1 is approximately
// log2(10)), each product is passed through buildExp, and the two results are
// multiplied. When f32 denormal handling is needed, inputs below the threshold
// are shifted up by 32 first and the product is rescaled afterwards.
//
// \returns true unconditionally. The original instruction is not erased here.
3963 Register Dst, Register X,
3964 unsigned Flags) const {
3965 LLT Ty = B.getMRI()->getType(Dst);
3966 LLT F32 = LLT::scalar(32);
3967
3968 if (Ty != F32 || !needsDenormHandlingF32(B.getMF(), X, Flags)) {
3969 // exp2(x * 0x1.a92000p+1f) * exp2(x * 0x1.4f0978p-11f);
3970 auto K0 = B.buildFConstant(Ty, 0x1.a92000p+1f);
3971 auto K1 = B.buildFConstant(Ty, 0x1.4f0978p-11f);
3972
3973 auto Mul1 = B.buildFMul(Ty, X, K1, Flags);
3974 auto Exp2_1 = buildExp(B, Ty, Mul1, Flags);
3975 auto Mul0 = B.buildFMul(Ty, X, K0, Flags);
3976 auto Exp2_0 = buildExp(B, Ty, Mul0, Flags);
3977 B.buildFMul(Dst, Exp2_0, Exp2_1, Flags);
3978 return true;
3979 }
3980
3981 // bool s = x < -0x1.2f7030p+5f;
3982 // x += s ? 0x1.0p+5f : 0.0f;
3983 // exp10 = exp2(x * 0x1.a92000p+1f) *
3984 // exp2(x * 0x1.4f0978p-11f) *
3985 // (s ? 0x1.9f623ep-107f : 1.0f);
3986
// Note: the compare and both selects on this path are built without Flags.
3987 auto Threshold = B.buildFConstant(Ty, -0x1.2f7030p+5f);
3988 auto NeedsScaling =
3989 B.buildFCmp(CmpInst::FCMP_OLT, LLT::scalar(1), X, Threshold);
3990
3991 auto ScaleOffset = B.buildFConstant(Ty, 0x1.0p+5f);
3992 auto ScaledX = B.buildFAdd(Ty, X, ScaleOffset, Flags);
3993 auto AdjustedX = B.buildSelect(Ty, NeedsScaling, ScaledX, X);
3994
3995 auto K0 = B.buildFConstant(Ty, 0x1.a92000p+1f);
3996 auto K1 = B.buildFConstant(Ty, 0x1.4f0978p-11f);
3997
3998 auto Mul1 = B.buildFMul(Ty, AdjustedX, K1, Flags);
3999 auto Exp2_1 = buildExp(B, Ty, Mul1, Flags);
4000 auto Mul0 = B.buildFMul(Ty, AdjustedX, K0, Flags);
4001 auto Exp2_0 = buildExp(B, Ty, Mul0, Flags);
4002
4003 auto MulExps = B.buildFMul(Ty, Exp2_0, Exp2_1, Flags);
// 0x1.9f623ep-107 is approximately 10^-32, which undoes the +32 input offset.
4004 auto ResultScaleFactor = B.buildFConstant(Ty, 0x1.9f623ep-107f);
4005 auto AdjustedResult = B.buildFMul(Ty, MulExps, ResultScaleFactor, Flags);
4006
4007 B.buildSelect(Dst, NeedsScaling, AdjustedResult, MulExps);
4008 return true;
4009}
4010
// Expand a 64-bit G_FEXP / G_FEXP2 / G_FEXP10 (selected by MI's opcode) into
// a range reduction, a polynomial evaluation and an ldexp.
// NOTE(review): the first line of the signature is missing from this listing;
// the caller below invokes this as legalizeFEXPF64(MI, B).
//
// Steps, as built below:
//   1. Dn = rounded integer exponent, T = reduced argument (per opcode).
//   2. P  = polynomial in T evaluated as a chain of FMAs, finishing with
//           two FMAs against 1.0.
//   3. Z  = ldexp(P, (int)Dn).
//   4. Z is replaced by +inf for X above 1024.0 (unless the no-infs flag is
//      set) and by 0.0 for X below -1075.0. Both guards use unordered
//      predicates, so a NaN X keeps the computed Z.
// The reassoc flag is stripped from every instruction built here.
// Erases MI and returns true.
4011// This expansion gives a result slightly better than 1ulp.
4013 MachineIRBuilder &B) const {
4014
4015 Register X = MI.getOperand(1).getReg();
4016 LLT S64 = LLT::scalar(64);
4017 LLT S32 = LLT::scalar(32);
4018 LLT S1 = LLT::scalar(1);
4019
4020 // TODO: Check if reassoc is safe. There is an output change in exp2 and
4021 // exp10, which slightly increases ulp.
4022 unsigned Flags = MI.getFlags() & ~MachineInstr::FmReassoc;
4023
4024 Register Dn, F, T;
4025
4026 if (MI.getOpcode() == TargetOpcode::G_FEXP2) {
4027 // Dn = rint(X)
4028 Dn = B.buildFRint(S64, X, Flags).getReg(0);
4029 // F = X - Dn
4030 F = B.buildFSub(S64, X, Dn, Flags).getReg(0);
4031 // T = F*C1 + F*C2
4032 auto C1 = B.buildFConstant(S64, APFloat(0x1.62e42fefa39efp-1));
4033 auto C2 = B.buildFConstant(S64, APFloat(0x1.abc9e3b39803fp-56));
4034 auto Mul2 = B.buildFMul(S64, F, C2, Flags).getReg(0);
4035 T = B.buildFMA(S64, F, C1, Mul2, Flags).getReg(0);
4036
4037 } else if (MI.getOpcode() == TargetOpcode::G_FEXP10) {
// Dn = rint(X * C1); F = X - Dn*C3 - Dn*C2; T = F*C4 + F*C5.
4038 auto C1 = B.buildFConstant(S64, APFloat(0x1.a934f0979a371p+1));
4039 auto Mul = B.buildFMul(S64, X, C1, Flags).getReg(0);
4040 Dn = B.buildFRint(S64, Mul, Flags).getReg(0);
4041
4042 auto NegDn = B.buildFNeg(S64, Dn, Flags).getReg(0);
4043 auto C2 = B.buildFConstant(S64, APFloat(-0x1.9dc1da994fd21p-59));
4044 auto C3 = B.buildFConstant(S64, APFloat(0x1.34413509f79ffp-2));
4045 auto Inner = B.buildFMA(S64, NegDn, C3, X, Flags).getReg(0);
4046 F = B.buildFMA(S64, NegDn, C2, Inner, Flags).getReg(0);
4047
4048 auto C4 = B.buildFConstant(S64, APFloat(0x1.26bb1bbb55516p+1));
4049 auto C5 = B.buildFConstant(S64, APFloat(-0x1.f48ad494ea3e9p-53));
4050 auto MulF = B.buildFMul(S64, F, C5, Flags).getReg(0);
4051 T = B.buildFMA(S64, F, C4, MulF, Flags).getReg(0);
4052
4053 } else { // G_FEXP
// Dn = rint(X * C1); T = X - Dn*C3 - Dn*C2. F is left unset on this path.
4054 auto C1 = B.buildFConstant(S64, APFloat(0x1.71547652b82fep+0));
4055 auto Mul = B.buildFMul(S64, X, C1, Flags).getReg(0);
4056 Dn = B.buildFRint(S64, Mul, Flags).getReg(0);
4057
4058 auto NegDn = B.buildFNeg(S64, Dn, Flags).getReg(0);
4059 auto C2 = B.buildFConstant(S64, APFloat(0x1.abc9e3b39803fp-56));
4060 auto C3 = B.buildFConstant(S64, APFloat(0x1.62e42fefa39efp-1));
4061 auto Inner = B.buildFMA(S64, NegDn, C3, X, Flags).getReg(0);
4062 T = B.buildFMA(S64, NegDn, C2, Inner, Flags).getReg(0);
4063 }
4064
4065 // Polynomial chain for P
4066 auto P = B.buildFConstant(S64, 0x1.ade156a5dcb37p-26);
4067 P = B.buildFMA(S64, T, P, B.buildFConstant(S64, 0x1.28af3fca7ab0cp-22),
4068 Flags);
4069 P = B.buildFMA(S64, T, P, B.buildFConstant(S64, 0x1.71dee623fde64p-19),
4070 Flags);
4071 P = B.buildFMA(S64, T, P, B.buildFConstant(S64, 0x1.a01997c89e6b0p-16),
4072 Flags);
4073 P = B.buildFMA(S64, T, P, B.buildFConstant(S64, 0x1.a01a014761f6ep-13),
4074 Flags);
4075 P = B.buildFMA(S64, T, P, B.buildFConstant(S64, 0x1.6c16c1852b7b0p-10),
4076 Flags);
4077 P = B.buildFMA(S64, T, P, B.buildFConstant(S64, 0x1.1111111122322p-7), Flags);
4078 P = B.buildFMA(S64, T, P, B.buildFConstant(S64, 0x1.55555555502a1p-5), Flags);
4079 P = B.buildFMA(S64, T, P, B.buildFConstant(S64, 0x1.5555555555511p-3), Flags);
4080 P = B.buildFMA(S64, T, P, B.buildFConstant(S64, 0x1.000000000000bp-1), Flags);
4081
// Final two steps: P = 1 + T * (1 + T * P).
4082 auto One = B.buildFConstant(S64, 1.0);
4083 P = B.buildFMA(S64, T, P, One, Flags);
4084 P = B.buildFMA(S64, T, P, One, Flags);
4085
4086 // Z = FLDEXP(P, (int)Dn)
4087 auto DnInt = B.buildFPTOSI(S32, Dn);
4088 auto Z = B.buildFLdexp(S64, P, DnInt, Flags);
4089
4090 if (!(Flags & MachineInstr::FmNoInfs)) {
4091 // Overflow guard: if X <= 1024.0 then Z else +inf
4092 auto CondHi = B.buildFCmp(CmpInst::FCMP_ULE, S1, X,
4093 B.buildFConstant(S64, APFloat(1024.0)));
4094 auto PInf = B.buildFConstant(S64, APFloat::getInf(APFloat::IEEEdouble()));
4095 Z = B.buildSelect(S64, CondHi, Z, PInf, Flags);
4096 }
4097
4098 // Underflow guard: if X >= -1075.0 then Z else 0.0
4099 auto CondLo = B.buildFCmp(CmpInst::FCMP_UGE, S1, X,
4100 B.buildFConstant(S64, APFloat(-1075.0)));
4101 auto Zero = B.buildFConstant(S64, APFloat(0.0));
4102 B.buildSelect(MI.getOperand(0).getReg(), CondLo, Z, Zero, Flags);
4103
4104 MI.eraseFromParent();
4105 return true;
4106}
4107
// Custom lowering entry point for G_FEXP and G_FEXP10 (IsExp10 is derived
// from MI's opcode).
// NOTE(review): the first line of the signature is missing from this listing;
// presumably this is AMDGPULegalizerInfo::legalizeFExp(MI, B) - confirm.
//
// Dispatch, in order:
//   * f64: forwarded to legalizeFEXPF64.
//   * f16: the approximate expansion when allowApproxFunc permits it,
//          otherwise extend to f32, use legalizeFExpUnsafeImpl, truncate.
//   * f32 with allowApproxFunc: the approximate expansion.
//   * f32 otherwise: the extended-precision expansion below, which splits
//          X * log2(e) (or X * log2(10)) into a high part PH and a low
//          correction PL, rounds PH to an integer E, evaluates
//          exp2((PH - E) + PL) and scales by 2^E with ldexp, then clamps
//          underflow to 0 and (unless no-infs is set) overflow to +inf.
// Every path that returns from this function erases MI and returns true.
4109 MachineIRBuilder &B) const {
4110 Register Dst = MI.getOperand(0).getReg();
4111 Register X = MI.getOperand(1).getReg();
4112 const unsigned Flags = MI.getFlags();
4113 MachineFunction &MF = B.getMF();
4114 MachineRegisterInfo &MRI = *B.getMRI();
4115 LLT Ty = MRI.getType(Dst);
4116
4117 const LLT F64 = LLT::scalar(64);
4118
4119 if (Ty == F64)
4120 return legalizeFEXPF64(MI, B);
4121
4122 const LLT F16 = LLT::scalar(16);
4123 const LLT F32 = LLT::scalar(32);
4124 const bool IsExp10 = MI.getOpcode() == TargetOpcode::G_FEXP10;
4125
4126 if (Ty == F16) {
4127 // v_exp_f16 (fmul x, log2e)
4128 if (allowApproxFunc(MF, Flags)) {
4129 // TODO: Does this really require fast?
4130 IsExp10 ? legalizeFExp10Unsafe(B, Dst, X, Flags)
4131 : legalizeFExpUnsafe(B, Dst, X, Flags);
4132 MI.eraseFromParent();
4133 return true;
4134 }
4135
4136 // Nothing in half is a denormal when promoted to f32.
4137 //
4138 // exp(f16 x) ->
4139 // fptrunc (v_exp_f32 (fmul (fpext x), log2e))
4140 //
4141 // exp10(f16 x) ->
4142 // fptrunc (v_exp_f32 (fmul (fpext x), log2(10)))
4143 auto Ext = B.buildFPExt(F32, X, Flags);
// NOTE(review): listing line 4144, which declares `Lowered`, is missing here.
4145 legalizeFExpUnsafeImpl(B, Lowered, Ext.getReg(0), Flags, IsExp10);
4146 B.buildFPTrunc(Dst, Lowered, Flags);
4147 MI.eraseFromParent();
4148 return true;
4149 }
4150
4151 assert(Ty == F32);
4152
4153 // TODO: Interpret allowApproxFunc as ignoring DAZ. This is currently copying
4154 // library behavior. Also, is known-not-daz source sufficient?
4155 if (allowApproxFunc(MF, Flags)) {
4156 IsExp10 ? legalizeFExp10Unsafe(B, Dst, X, Flags)
4157 : legalizeFExpUnsafe(B, Dst, X, Flags);
4158 MI.eraseFromParent();
4159 return true;
4160 }
4161
4162 // Algorithm:
4163 //
4164 // e^x = 2^(x/ln(2)) = 2^(x*(64/ln(2))/64)
4165 //
4166 // x*(64/ln(2)) = n + f, |f| <= 0.5, n is integer
4167 // n = 64*m + j, 0 <= j < 64
4168 //
4169 // e^x = 2^((64*m + j + f)/64)
4170 // = (2^m) * (2^(j/64)) * 2^(f/64)
4171 // = (2^m) * (2^(j/64)) * e^(f*(ln(2)/64))
4172 //
4173 // f = x*(64/ln(2)) - n
4174 // r = f*(ln(2)/64) = x - n*(ln(2)/64)
4175 //
4176 // e^x = (2^m) * (2^(j/64)) * e^r
4177 //
4178 // (2^(j/64)) is precomputed
4179 //
4180 // e^r = 1 + r + (r^2)/2! + (r^3)/3! + (r^4)/4! + (r^5)/5!
4181 // e^r = 1 + q
4182 //
4183 // q = r + (r^2)/2! + (r^3)/3! + (r^4)/4! + (r^5)/5!
4184 //
4185 // e^x = (2^m) * ( (2^(j/64)) + q*(2^(j/64)) )
4186 const unsigned FlagsNoContract = Flags & ~MachineInstr::FmContract;
4187 Register PH, PL;
4188
4189 if (ST.hasFastFMAF32()) {
// FMA path: PH = X*C, and PL = fma(X, CC, fma(X, C, -PH)) recovers the
// rounding error of PH plus the contribution of the low constant CC.
4190 const float c_exp = numbers::log2ef;
4191 const float cc_exp = 0x1.4ae0bep-26f; // c+cc are 49 bits
4192 const float c_exp10 = 0x1.a934f0p+1f;
4193 const float cc_exp10 = 0x1.2f346ep-24f;
4194
4195 auto C = B.buildFConstant(Ty, IsExp10 ? c_exp10 : c_exp);
4196 PH = B.buildFMul(Ty, X, C, Flags).getReg(0);
4197 auto NegPH = B.buildFNeg(Ty, PH, Flags);
4198 auto FMA0 = B.buildFMA(Ty, X, C, NegPH, Flags);
4199
4200 auto CC = B.buildFConstant(Ty, IsExp10 ? cc_exp10 : cc_exp);
4201 PL = B.buildFMA(Ty, X, CC, FMA0, Flags).getReg(0);
4202 } else {
// No fast FMA: split X into XH (low 12 bits masked off) and XL = X - XH,
// and split the constant into CH + CL, then combine the partial products
// through getMad.
4203 const float ch_exp = 0x1.714000p+0f;
4204 const float cl_exp = 0x1.47652ap-12f; // ch + cl are 36 bits
4205
4206 const float ch_exp10 = 0x1.a92000p+1f;
4207 const float cl_exp10 = 0x1.4f0978p-11f;
4208
4209 auto MaskConst = B.buildConstant(Ty, 0xfffff000);
4210 auto XH = B.buildAnd(Ty, X, MaskConst);
4211 auto XL = B.buildFSub(Ty, X, XH, Flags);
4212
4213 auto CH = B.buildFConstant(Ty, IsExp10 ? ch_exp10 : ch_exp);
4214 PH = B.buildFMul(Ty, XH, CH, Flags).getReg(0);
4215
4216 auto CL = B.buildFConstant(Ty, IsExp10 ? cl_exp10 : cl_exp);
4217 auto XLCL = B.buildFMul(Ty, XL, CL, Flags);
4218
4219 Register Mad0 =
4220 getMad(B, Ty, XL.getReg(0), CH.getReg(0), XLCL.getReg(0), Flags);
4221 PL = getMad(B, Ty, XH.getReg(0), CL.getReg(0), Mad0, Flags);
4222 }
4223
4224 auto E = B.buildIntrinsicRoundeven(Ty, PH, Flags);
4225
4226 // It is unsafe to contract this fsub into the PH multiply.
4227 auto PHSubE = B.buildFSub(Ty, PH, E, FlagsNoContract);
4228 auto A = B.buildFAdd(Ty, PHSubE, PL, Flags);
4229 auto IntE = B.buildFPTOSI(LLT::scalar(32), E);
4230
4231 auto Exp2 = B.buildIntrinsic(Intrinsic::amdgcn_exp2, {Ty})
4232 .addUse(A.getReg(0))
4233 .setMIFlags(Flags);
4234 auto R = B.buildFLdexp(Ty, Exp2, IntE, Flags);
4235
// Inputs below the underflow bound produce exactly 0. The compare and the
// select are built without Flags.
4236 auto UnderflowCheckConst =
4237 B.buildFConstant(Ty, IsExp10 ? -0x1.66d3e8p+5f : -0x1.9d1da0p+6f);
4238 auto Zero = B.buildFConstant(Ty, 0.0);
4239 auto Underflow =
4240 B.buildFCmp(CmpInst::FCMP_OLT, LLT::scalar(1), X, UnderflowCheckConst);
4241
4242 R = B.buildSelect(Ty, Underflow, Zero, R);
4243
4244 if (!(Flags & MachineInstr::FmNoInfs)) {
4245 auto OverflowCheckConst =
4246 B.buildFConstant(Ty, IsExp10 ? 0x1.344136p+5f : 0x1.62e430p+6f);
4247
4248 auto Overflow =
4249 B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), X, OverflowCheckConst);
4250 auto Inf = B.buildFConstant(Ty, APFloat::getInf(APFloat::IEEEsingle()));
4251 R = B.buildSelect(Ty, Overflow, Inf, R, Flags);
4252 }
4253
4254 B.buildCopy(Dst, R);
4255 MI.eraseFromParent();
4256 return true;
4257}
4258
// Lower a two-source pow-style instruction as exp2(log2(Src0) * Src1), with
// the multiply done by the amdgcn_fmul_legacy intrinsic.
// NOTE(review): the first line of the signature is missing from this listing;
// presumably this is AMDGPULegalizerInfo::legalizeFPow - confirm.
//
// f32 is handled directly. f16 extends both multiply inputs to f32 and
// truncates the product back to f16 before the exp2.
//
// \returns true after erasing MI for f32/f16; false for any other type, in
// which case MI is left in place.
4260 MachineIRBuilder &B) const {
4261 Register Dst = MI.getOperand(0).getReg();
4262 Register Src0 = MI.getOperand(1).getReg();
4263 Register Src1 = MI.getOperand(2).getReg();
4264 unsigned Flags = MI.getFlags();
4265 LLT Ty = B.getMRI()->getType(Dst);
4266 const LLT F16 = LLT::scalar(16); // TODO: Expected LLT::float16()
4267 const LLT F32 = LLT::scalar(32); // TODO: Expected LLT::float32()
4268
4269 if (Ty == F32) {
4270 auto Log = B.buildFLog2(F32, Src0, Flags);
4271 auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {F32})
4272 .addUse(Log.getReg(0))
4273 .addUse(Src1)
4274 .setMIFlags(Flags);
4275 B.buildFExp2(Dst, Mul, Flags);
4276 } else if (Ty == F16) {
4277 // There's no f16 fmul_legacy, so we need to convert for it.
4278 auto Log = B.buildFLog2(F16, Src0, Flags);
4279 auto Ext0 = B.buildFPExt(F32, Log, Flags);
4280 auto Ext1 = B.buildFPExt(F32, Src1, Flags);
4281 auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {F32})
4282 .addUse(Ext0.getReg(0))
4283 .addUse(Ext1.getReg(0))
4284 .setMIFlags(Flags);
4285 B.buildFExp2(Dst, B.buildFPTrunc(F16, Mul), Flags);
4286 } else
4287 return false;
4288
4289 MI.eraseFromParent();
4290 return true;
4291}
4292
4293// Find a source register, ignoring any possible source modifiers.
// NOTE(review): the signature line is missing from this listing; the caller
// below invokes this as stripAnySourceMods(OrigSrc, MRI).
//
// Looks through a defining G_FNEG, then through a G_FABS feeding it, or
// through a lone G_FABS, and returns the innermost source register. If
// neither is found, OrigSrc is returned unchanged.
4295 Register ModSrc = OrigSrc;
4296 if (MachineInstr *SrcFNeg = getOpcodeDef(AMDGPU::G_FNEG, ModSrc, MRI)) {
4297 ModSrc = SrcFNeg->getOperand(1).getReg();
4298 if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
4299 ModSrc = SrcFAbs->getOperand(1).getReg();
4300 } else if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
4301 ModSrc = SrcFAbs->getOperand(1).getReg();
4302 return ModSrc;
4303}
4304
// Lower a 64-bit floor as x - fract(x) on subtargets with the V_FRACT bug.
// NOTE(review): the first lines of the signature are missing from this
// listing; presumably this is AMDGPULegalizerInfo::legalizeFFloor(MI, MRI, B).
//
// The emitted sequence is:
//   Fract  = amdgcn.fract(OrigSrc)
//   Min    = fminnum[_ieee](Fract, 0x3fefffffffffffff)
//   Fract' = isnan(x) ? x : Min        (skipped when MI carries nnan)
//   Dst    = OrigSrc + (-Fract')
// Erases MI and returns true.
4307 MachineIRBuilder &B) const {
4308
4309 const LLT S1 = LLT::scalar(1);
4310 const LLT F64 = LLT::scalar(64); // TODO: Expected float64
4311 Register Dst = MI.getOperand(0).getReg();
4312 Register OrigSrc = MI.getOperand(1).getReg();
4313 unsigned Flags = MI.getFlags();
4314 assert(ST.hasFractBug() && MRI.getType(Dst) == F64 &&
4315 "this should not have been custom lowered");
4316
4317 // V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x))
4318 // is used instead. However, SI doesn't have V_FLOOR_F64, so the most
4319 // efficient way to implement it is using V_FRACT_F64. The workaround for the
4320 // V_FRACT bug is:
4321 // fract(x) = isnan(x) ? x : min(V_FRACT(x), 0.99999999999999999)
4322 //
4323 // Convert floor(x) to (x - fract(x))
4324
4325 auto Fract = B.buildIntrinsic(Intrinsic::amdgcn_fract, {F64})
4326 .addUse(OrigSrc)
4327 .setMIFlags(Flags);
4328
4329 // Give source modifier matching some assistance before obscuring a foldable
4330 // pattern.
4331
4332 // TODO: We can avoid the neg on the fract? The input sign to fract
4333 // shouldn't matter?
4334 Register ModSrc = stripAnySourceMods(OrigSrc, MRI);
4335
4336 auto Const =
4337 B.buildFConstant(F64, llvm::bit_cast<double>(0x3fefffffffffffff));
4338
// NOTE(review): listing line 4339, which declares `Min`, is missing here.
4340
4341 // We don't need to concern ourselves with the snan handling difference, so
4342 // use the one which will directly select.
4343 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
4344 if (MFI->getMode().IEEE)
4345 B.buildFMinNumIEEE(Min, Fract, Const, Flags);
4346 else
4347 B.buildFMinNum(Min, Fract, Const, Flags);
4348
4349 Register CorrectedFract = Min;
4350 if (!MI.getFlag(MachineInstr::FmNoNans)) {
// The guard must be true only for NaN inputs, so compare the source with
// itself using the unordered predicate. An ordered compare is true for
// every non-NaN value and would select the source as its own fract,
// making the final result x - x.
4351 auto IsNan = B.buildFCmp(CmpInst::FCMP_UNO, S1, ModSrc, ModSrc, Flags);
4352 CorrectedFract = B.buildSelect(F64, IsNan, ModSrc, Min, Flags).getReg(0);
4353 }
4354
4355 auto NegFract = B.buildFNeg(F64, CorrectedFract, Flags);
4356 B.buildFAdd(Dst, OrigSrc, NegFract, Flags);
4357
4358 MI.eraseFromParent();
4359 return true;
4360}
4361
4362// Turn an illegal packed v2s16 build vector into bit operations.
4363// TODO: This should probably be a bitcast action in LegalizerHelper.
// NOTE(review): the signature lines are missing from this listing; presumably
// this is AMDGPULegalizerInfo::legalizeBuildVector(MI, MRI, B) - confirm.
//
// The two sources are merged into one s32 which is then bitcast to the
// <2 x s16> destination. For G_BUILD_VECTOR_TRUNC the s32 sources are
// truncated to s16 first. Erases MI and returns true.
4366 Register Dst = MI.getOperand(0).getReg();
4367 const LLT S32 = LLT::scalar(32);
4368 const LLT S16 = LLT::scalar(16);
4369 assert(MRI.getType(Dst) == LLT::fixed_vector(2, 16));
4370
4371 Register Src0 = MI.getOperand(1).getReg();
4372 Register Src1 = MI.getOperand(2).getReg();
4373
4374 if (MI.getOpcode() == AMDGPU::G_BUILD_VECTOR_TRUNC) {
4375 assert(MRI.getType(Src0) == S32);
4376 Src0 = B.buildTrunc(S16, MI.getOperand(1).getReg()).getReg(0);
4377 Src1 = B.buildTrunc(S16, MI.getOperand(2).getReg()).getReg(0);
4378 }
4379
4380 auto Merge = B.buildMergeLikeInstr(S32, {Src0, Src1});
4381 B.buildBitcast(Dst, Merge);
4382
4383 MI.eraseFromParent();
4384 return true;
4385}
4386
4387// Build a big integer multiply or multiply-add using MAD_64_32 instructions.
4388//
4389// Source and accumulation registers must all be 32-bits.
4390//
4391// TODO: When the multiply is uniform, we should produce a code sequence
4392// that is better suited to instruction selection on the SALU. Instead of
4393// the outer loop going over parts of the result, the outer loop should go
4394// over parts of one of the factors. This should result in instruction
4395// selection that makes full use of S_ADDC_U32 instructions.
//
// NOTE(review): the first signature lines are missing from this listing; the
// caller below passes (Helper, Accum, Src0, Src1, UsePartialMad64_32,
// SeparateOddAlignedProducts).
//
// Accum is updated in place, one register per 32-bit part of the result;
// entries may be null on entry. Src0 and Src1 hold the 32-bit parts of the
// two factors and are indexed in lock step, so they are expected to have the
// same number of parts.
// UsePartialMad64_32 allows the 64-bit multiply-add to be used even for the
// most significant part, whose high half is discarded.
// SeparateOddAlignedProducts accumulates the odd-aligned partial products in
// scratch registers and adds them into Accum with explicit carry chains.
4398 ArrayRef<Register> Src0,
4399 ArrayRef<Register> Src1,
4400 bool UsePartialMad64_32,
4401 bool SeparateOddAlignedProducts) const {
4402 // Use (possibly empty) vectors of S1 registers to represent the set of
4403 // carries from one pair of positions to the next.
4404 using Carry = SmallVector<Register, 2>;
4405
4406 MachineIRBuilder &B = Helper.MIRBuilder;
4407 GISelValueTracking &VT = *Helper.getValueTracking();
4408
4409 const LLT S1 = LLT::scalar(1);
4410 const LLT S32 = LLT::scalar(32);
4411 const LLT S64 = LLT::scalar(64);
4412
4413 Register Zero32;
4414 Register Zero64;
4415
// Zero constants are materialized lazily, at most once each.
4416 auto getZero32 = [&]() -> Register {
4417 if (!Zero32)
4418 Zero32 = B.buildConstant(S32, 0).getReg(0);
4419 return Zero32;
4420 };
4421 auto getZero64 = [&]() -> Register {
4422 if (!Zero64)
4423 Zero64 = B.buildConstant(S64, 0).getReg(0);
4424 return Zero64;
4425 };
4426
// Record which source parts are known to be zero so that their partial
// products can be skipped.
4427 SmallVector<bool, 2> Src0KnownZeros, Src1KnownZeros;
4428 for (unsigned i = 0; i < Src0.size(); ++i) {
4429 Src0KnownZeros.push_back(VT.getKnownBits(Src0[i]).isZero());
4430 Src1KnownZeros.push_back(VT.getKnownBits(Src1[i]).isZero());
4431 }
4432
4433 // Merge the given carries into the 32-bit LocalAccum, which is modified
4434 // in-place.
4435 //
4436 // Returns the carry-out, which is a single S1 register or null.
4437 auto mergeCarry =
4438 [&](Register &LocalAccum, const Carry &CarryIn) -> Register {
4439 if (CarryIn.empty())
4440 return Register();
4441
4442 bool HaveCarryOut = true;
4443 Register CarryAccum;
4444 if (CarryIn.size() == 1) {
4445 if (!LocalAccum) {
4446 LocalAccum = B.buildZExt(S32, CarryIn[0]).getReg(0);
4447 return Register();
4448 }
4449
4450 CarryAccum = getZero32();
4451 } else {
4452 CarryAccum = B.buildZExt(S32, CarryIn[0]).getReg(0);
4453 for (unsigned i = 1; i + 1 < CarryIn.size(); ++i) {
4454 CarryAccum =
4455 B.buildUAdde(S32, S1, CarryAccum, getZero32(), CarryIn[i])
4456 .getReg(0);
4457 }
4458
4459 if (!LocalAccum) {
4460 LocalAccum = getZero32();
4461 HaveCarryOut = false;
4462 }
4463 }
4464
4465 auto Add =
4466 B.buildUAdde(S32, S1, CarryAccum, LocalAccum, CarryIn.back());
4467 LocalAccum = Add.getReg(0);
4468 return HaveCarryOut ? Add.getReg(1) : Register();
4469 };
4470
4471 // Build a multiply-add chain to compute
4472 //
4473 // LocalAccum + (partial products at DstIndex)
4474 // + (opportunistic subset of CarryIn)
4475 //
4476 // LocalAccum is an array of one or two 32-bit registers that are updated
4477 // in-place. The incoming registers may be null.
4478 //
4479 // In some edge cases, carry-ins can be consumed "for free". In that case,
4480 // the consumed carry bits are removed from CarryIn in-place.
4481 auto buildMadChain =
4482 [&](MutableArrayRef<Register> LocalAccum, unsigned DstIndex, Carry &CarryIn)
4483 -> Carry {
4484 assert((DstIndex + 1 < Accum.size() && LocalAccum.size() == 2) ||
4485 (DstIndex + 1 >= Accum.size() && LocalAccum.size() == 1));
4486
4487 Carry CarryOut;
4488 unsigned j0 = 0;
4489
4490 // Use plain 32-bit multiplication for the most significant part of the
4491 // result by default.
4492 if (LocalAccum.size() == 1 &&
4493 (!UsePartialMad64_32 || !CarryIn.empty())) {
4494 do {
4495 // Skip multiplication if one of the operands is 0
4496 unsigned j1 = DstIndex - j0;
4497 if (Src0KnownZeros[j0] || Src1KnownZeros[j1]) {
4498 ++j0;
4499 continue;
4500 }
4501 auto Mul = B.buildMul(S32, Src0[j0], Src1[j1]);
4502 if (!LocalAccum[0] || VT.getKnownBits(LocalAccum[0]).isZero()) {
4503 LocalAccum[0] = Mul.getReg(0);
4504 } else {
4505 if (CarryIn.empty()) {
4506 LocalAccum[0] = B.buildAdd(S32, LocalAccum[0], Mul).getReg(0);
4507 } else {
4508 LocalAccum[0] =
4509 B.buildUAdde(S32, S1, LocalAccum[0], Mul, CarryIn.back())
4510 .getReg(0);
4511 CarryIn.pop_back();
4512 }
4513 }
4514 ++j0;
4515 } while (j0 <= DstIndex && (!UsePartialMad64_32 || !CarryIn.empty()));
4516 }
4517
4518 // Build full 64-bit multiplies.
4519 if (j0 <= DstIndex) {
// HaveSmallAccum is set while the running 64-bit accumulator is built from
// at most a 32-bit value; the carry-out of the first MAD is not recorded
// in that state.
4520 bool HaveSmallAccum = false;
4521 Register Tmp;
4522
4523 if (LocalAccum[0]) {
4524 if (LocalAccum.size() == 1) {
4525 Tmp = B.buildAnyExt(S64, LocalAccum[0]).getReg(0);
4526 HaveSmallAccum = true;
4527 } else if (LocalAccum[1]) {
4528 Tmp = B.buildMergeLikeInstr(S64, LocalAccum).getReg(0);
4529 HaveSmallAccum = false;
4530 } else {
4531 Tmp = B.buildZExt(S64, LocalAccum[0]).getReg(0);
4532 HaveSmallAccum = true;
4533 }
4534 } else {
4535 assert(LocalAccum.size() == 1 || !LocalAccum[1]);
4536 Tmp = getZero64();
4537 HaveSmallAccum = true;
4538 }
4539
4540 do {
4541 unsigned j1 = DstIndex - j0;
4542 if (Src0KnownZeros[j0] || Src1KnownZeros[j1]) {
4543 ++j0;
4544 continue;
4545 }
4546 auto Mad = B.buildInstr(AMDGPU::G_AMDGPU_MAD_U64_U32, {S64, S1},
4547 {Src0[j0], Src1[j1], Tmp});
4548 Tmp = Mad.getReg(0);
4549 if (!HaveSmallAccum)
4550 CarryOut.push_back(Mad.getReg(1));
4551 HaveSmallAccum = false;
4552
4553 ++j0;
4554 } while (j0 <= DstIndex);
4555
4556 auto Unmerge = B.buildUnmerge(S32, Tmp);
4557 LocalAccum[0] = Unmerge.getReg(0);
4558 if (LocalAccum.size() > 1)
4559 LocalAccum[1] = Unmerge.getReg(1);
4560 }
4561
4562 return CarryOut;
4563 };
4564
4565 // Outer multiply loop, iterating over destination parts from least
4566 // significant to most significant parts.
4567 //
4568 // The columns of the following diagram correspond to the destination parts
4569 // affected by one iteration of the outer loop (ignoring boundary
4570 // conditions).
4571 //
4572 // Dest index relative to 2 * i: 1 0 -1
4573 // ------
4574 // Carries from previous iteration: e o
4575 // Even-aligned partial product sum: E E .
4576 // Odd-aligned partial product sum: O O
4577 //
4578 // 'o' is OddCarry, 'e' is EvenCarry.
4579 // EE and OO are computed from partial products via buildMadChain and use
4580 // accumulation where possible and appropriate.
4581 //
4582 Register SeparateOddCarry;
4583 Carry EvenCarry;
4584 Carry OddCarry;
4585
4586 for (unsigned i = 0; i <= Accum.size() / 2; ++i) {
4587 Carry OddCarryIn = std::move(OddCarry);
4588 Carry EvenCarryIn = std::move(EvenCarry);
4589 OddCarry.clear();
4590 EvenCarry.clear();
4591
4592 // Partial products at offset 2 * i.
4593 if (2 * i < Accum.size()) {
4594 auto LocalAccum = Accum.drop_front(2 * i).take_front(2);
4595 EvenCarry = buildMadChain(LocalAccum, 2 * i, EvenCarryIn);
4596 }
4597
4598 // Partial products at offset 2 * i - 1.
4599 if (i > 0) {
4600 if (!SeparateOddAlignedProducts) {
4601 auto LocalAccum = Accum.drop_front(2 * i - 1).take_front(2);
4602 OddCarry = buildMadChain(LocalAccum, 2 * i - 1, OddCarryIn);
4603 } else {
// Accumulate the odd-aligned products in scratch registers, then add
// them into Accum with an explicit carry chain.
4604 bool IsHighest = 2 * i >= Accum.size();
4605 Register SeparateOddOut[2];
4606 auto LocalAccum = MutableArrayRef(SeparateOddOut)
4607 .take_front(IsHighest ? 1 : 2);
4608 OddCarry = buildMadChain(LocalAccum, 2 * i - 1, OddCarryIn);
4609
// NOTE(review): listing line 4610, which declares `Lo`, is missing here.
4611
4612 if (i == 1) {
4613 if (!IsHighest)
4614 Lo = B.buildUAddo(S32, S1, Accum[2 * i - 1], SeparateOddOut[0]);
4615 else
4616 Lo = B.buildAdd(S32, Accum[2 * i - 1], SeparateOddOut[0]);
4617 } else {
4618 Lo = B.buildUAdde(S32, S1, Accum[2 * i - 1], SeparateOddOut[0],
4619 SeparateOddCarry);
4620 }
4621 Accum[2 * i - 1] = Lo->getOperand(0).getReg();
4622
4623 if (!IsHighest) {
4624 auto Hi = B.buildUAdde(S32, S1, Accum[2 * i], SeparateOddOut[1],
4625 Lo->getOperand(1).getReg());
4626 Accum[2 * i] = Hi.getReg(0);
4627 SeparateOddCarry = Hi.getReg(1);
4628 }
4629 }
4630 }
4631
4632 // Add in the carries from the previous iteration
4633 if (i > 0) {
4634 if (Register CarryOut = mergeCarry(Accum[2 * i - 1], OddCarryIn))
4635 EvenCarryIn.push_back(CarryOut);
4636
4637 if (2 * i < Accum.size()) {
4638 if (Register CarryOut = mergeCarry(Accum[2 * i], EvenCarryIn))
4639 OddCarry.push_back(CarryOut);
4640 }
4641 }
4642 }
4643}
4644
4645// Custom narrowing of wide multiplies using wide multiply-add instructions.
4646//
4647// TODO: If the multiply is followed by an addition, we should attempt to
4648// integrate it to make better use of V_MAD_U64_U32's multiply-add capabilities.
//
// NOTE(review): the first signature line is missing from this listing;
// presumably this is AMDGPULegalizerInfo::legalizeMul(Helper, MI) - confirm.
//
// The scalar G_MUL is split into 32-bit parts, multiplied through
// buildMultiply, and the parts are merged back into the destination.
// A 64-bit multiply is left untouched when the subtarget reports
// hasVMulU64Inst(). Returns true in both cases; MI is erased only when it was
// expanded.
4650 MachineInstr &MI) const {
4651 assert(ST.hasMad64_32());
4652 assert(MI.getOpcode() == TargetOpcode::G_MUL);
4653
4654 MachineIRBuilder &B = Helper.MIRBuilder;
4655 MachineRegisterInfo &MRI = *B.getMRI();
4656
4657 Register DstReg = MI.getOperand(0).getReg();
4658 Register Src0 = MI.getOperand(1).getReg();
4659 Register Src1 = MI.getOperand(2).getReg();
4660
4661 LLT Ty = MRI.getType(DstReg);
4662 assert(Ty.isScalar());
4663
4664 unsigned Size = Ty.getSizeInBits();
4665 if (ST.hasVMulU64Inst() && Size == 64)
4666 return true;
4667
4668 unsigned NumParts = Size / 32;
4669 assert((Size % 32) == 0);
4670 assert(NumParts >= 2);
4671
4672 // Whether to use MAD_64_32 for partial products whose high half is
4673 // discarded. This avoids some ADD instructions but risks false dependency
4674 // stalls on some subtargets in some cases.
4675 const bool UsePartialMad64_32 = ST.getGeneration() < AMDGPUSubtarget::GFX10;
4676
4677 // Whether to compute odd-aligned partial products separately. This is
4678 // advisable on subtargets where the accumulator of MAD_64_32 must be placed
4679 // in an even-aligned VGPR.
4680 const bool SeparateOddAlignedProducts = ST.hasFullRate64Ops();
4681
4682 LLT S32 = LLT::scalar(32);
4683 SmallVector<Register, 2> Src0Parts, Src1Parts;
// NOTE(review): the loop body (listing lines 4685-4686) is missing here;
// presumably it fills Src0Parts/Src1Parts with NumParts registers each.
4684 for (unsigned i = 0; i < NumParts; ++i) {
4687 }
4688 B.buildUnmerge(Src0Parts, Src0);
4689 B.buildUnmerge(Src1Parts, Src1);
4690
4691 SmallVector<Register, 2> AccumRegs(NumParts);
4692 buildMultiply(Helper, AccumRegs, Src0Parts, Src1Parts, UsePartialMad64_32,
4693 SeparateOddAlignedProducts);
4694
4695 B.buildMergeLikeInstr(DstReg, AccumRegs);
4696 MI.eraseFromParent();
4697 return true;
4698}
4699
4700// Legalize ctlz/cttz to ffbh/ffbl instead of the default legalization to
4701// ctlz/cttz_zero_poison. This allows us to fix up the result for the zero input
4702// case with a single min instruction instead of a compare+select.
//
// NOTE(review): the first signature lines are missing from this listing;
// presumably this is AMDGPULegalizerInfo::legalizeCTLZ_CTTZ(MI, MRI, B).
//
// G_CTLZ maps to G_AMDGPU_FFBH_U32 and anything else to G_AMDGPU_FFBL_B32.
// The result is clamped with an unsigned min against the source bit width.
// Erases MI and returns true.
4705 MachineIRBuilder &B) const {
4706 Register Dst = MI.getOperand(0).getReg();
4707 Register Src = MI.getOperand(1).getReg();
4708 LLT DstTy = MRI.getType(Dst);
4709 LLT SrcTy = MRI.getType(Src);
4710
4711 unsigned NewOpc = MI.getOpcode() == AMDGPU::G_CTLZ
4712 ? AMDGPU::G_AMDGPU_FFBH_U32
4713 : AMDGPU::G_AMDGPU_FFBL_B32;
4714 auto Tmp = B.buildInstr(NewOpc, {DstTy}, {Src});
4715 B.buildUMin(Dst, Tmp, B.buildConstant(DstTy, SrcTy.getSizeInBits()));
4716
4717 MI.eraseFromParent();
4718 return true;
4719}
4720
// Lower a count-leading-zeros on a source narrower than 32 bits using the
// 32-bit G_AMDGPU_FFBH_U32.
// NOTE(review): the first signature lines are missing from this listing, so
// the function name is not visible; presumably this is the zero-poison ctlz
// lowering - confirm. S32 is not declared in the visible lines and presumably
// comes from an enclosing scope.
//
// The source is any-extended to 32 bits and shifted left by (32 - NumBits) so
// that its most significant bit lands in bit 31; the FFBH result is then
// truncated into Dst. Erases MI and returns true.
4723 MachineIRBuilder &B) const {
4724 Register Dst = MI.getOperand(0).getReg();
4725 Register Src = MI.getOperand(1).getReg();
4726 LLT SrcTy = MRI.getType(Src);
4727 TypeSize NumBits = SrcTy.getSizeInBits();
4728
4729 assert(NumBits < 32u);
4730
4731 auto ShiftAmt = B.buildConstant(S32, 32u - NumBits);
4732 auto Extend = B.buildAnyExt(S32, {Src}).getReg(0u);
4733 auto Shift = B.buildShl(S32, Extend, ShiftAmt);
4734 auto Ctlz = B.buildInstr(AMDGPU::G_AMDGPU_FFBH_U32, {S32}, {Shift});
4735 B.buildTrunc(Dst, Ctlz);
4736 MI.eraseFromParent();
4737 return true;
4738}
4739
// Lower a 32-bit count-leading-sign-bits style operation through the
// amdgcn_sffbh intrinsic.
// NOTE(review): the first signature lines are missing from this listing; the
// assert message below names the function legalizeCTLS.
//
// Emits Dst = umin(sffbh(Src), 32) - 1. The unsigned min clamps whatever
// sffbh returns to at most the bit width before the subtraction.
// Erases MI and returns true.
4742 MachineIRBuilder &B) const {
4743 Register Dst = MI.getOperand(0).getReg();
4744 Register Src = MI.getOperand(1).getReg();
4745 LLT SrcTy = MRI.getType(Src);
4746 const LLT S32 = LLT::scalar(32);
4747 assert(SrcTy == S32 && "legalizeCTLS only supports s32");
4748 unsigned BitWidth = SrcTy.getSizeInBits();
4749
4750 auto Sffbh = B.buildIntrinsic(Intrinsic::amdgcn_sffbh, {S32}).addUse(Src);
4751 auto Clamped = B.buildUMin(S32, Sffbh, B.buildConstant(S32, BitWidth));
4752 B.buildSub(Dst, Clamped, B.buildConstant(S32, 1));
4753 MI.eraseFromParent();
4754 return true;
4755}
4756
4757// Check that this is a G_XOR x, -1
4758static bool isNot(const MachineRegisterInfo &MRI, const MachineInstr &MI) {
4759 if (MI.getOpcode() != TargetOpcode::G_XOR)
4760 return false;
4761 auto ConstVal = getIConstantVRegSExtVal(MI.getOperand(2).getReg(), MRI);
4762 return ConstVal == -1;
4763}
4764
4765// Return the use branch instruction, otherwise null if the usage is invalid.
//
// NOTE(review): the signature line carrying the function name and leading
// parameters is missing from this listing; the body uses MI, MRI and Br, which
// presumably are those parameters.
//
// The value defined by MI must have exactly one non-debug use. A single
// intervening NOT (G_XOR x, -1) is looked through: it is erased and Negated is
// set to true. The resulting user must be a G_BRCOND in MI's own block.
// UncondBrTarget receives the target of the G_BR that follows the G_BRCOND,
// or the next block in layout when the G_BRCOND ends the block; in the latter
// case Br is not assigned.
4766static MachineInstr *
4768 MachineBasicBlock *&UncondBrTarget, bool &Negated) {
4769 Register CondDef = MI.getOperand(0).getReg();
4770 if (!MRI.hasOneNonDBGUse(CondDef))
4771 return nullptr;
4772
4773 MachineBasicBlock *Parent = MI.getParent();
4774 MachineInstr *UseMI = &*MRI.use_instr_nodbg_begin(CondDef);
4775
4776 if (isNot(MRI, *UseMI)) {
4777 Register NegatedCond = UseMI->getOperand(0).getReg();
4778 if (!MRI.hasOneNonDBGUse(NegatedCond))
4779 return nullptr;
4780
4781 // We're deleting the def of this value, so we need to remove it.
4782 eraseInstr(*UseMI, MRI);
4783
4784 UseMI = &*MRI.use_instr_nodbg_begin(NegatedCond);
4785 Negated = true;
4786 }
4787
4788 if (UseMI->getParent() != Parent || UseMI->getOpcode() != AMDGPU::G_BRCOND)
4789 return nullptr;
4790
4791 // Make sure the cond br is followed by a G_BR, or is the last instruction.
4792 MachineBasicBlock::iterator Next = std::next(UseMI->getIterator());
4793 if (Next == Parent->end()) {
4794 MachineFunction::iterator NextMBB = std::next(Parent->getIterator());
4795 if (NextMBB == Parent->getParent()->end()) // Illegal intrinsic use.
4796 return nullptr;
4797 UncondBrTarget = &*NextMBB;
4798 } else {
4799 if (Next->getOpcode() != AMDGPU::G_BR)
4800 return nullptr;
4801 Br = &*Next;
4802 UncondBrTarget = Br->getOperand(0).getMBB();
4803 }
4804
4805 return UseMI;
4806}
4807
// Materialize a preloaded function argument, described by Arg, into DstReg.
// NOTE(review): the first signature lines are missing from this listing; the
// body uses DstReg and B, which presumably are the leading parameters of a
// loadInputValue overload - confirm.
//
// The physical register named by Arg is made a function live-in. If Arg is
// masked, the field is extracted by shifting right by the number of trailing
// zero bits in the mask and ANDing with the shifted mask; otherwise the
// live-in is simply copied into DstReg.
4810 const ArgDescriptor *Arg,
4811 const TargetRegisterClass *ArgRC,
4812 LLT ArgTy) const {
4813 MCRegister SrcReg = Arg->getRegister();
4814 assert(SrcReg.isPhysical() && "Physical register expected");
4815 assert(DstReg.isVirtual() && "Virtual register expected");
4816
4817 Register LiveIn = getFunctionLiveInPhysReg(B.getMF(), B.getTII(), SrcReg,
4818 *ArgRC, B.getDebugLoc(), ArgTy);
4819 if (Arg->isMasked()) {
4820 // TODO: Should we try to emit this once in the entry block?
4821 const LLT S32 = LLT::scalar(32);
4822 const unsigned Mask = Arg->getMask();
4823 const unsigned Shift = llvm::countr_zero<unsigned>(Mask);
4824
4825 Register AndMaskSrc = LiveIn;
4826
4827 // TODO: Avoid clearing the high bits if we know workitem id y/z are always
4828 // 0.
4829 if (Shift != 0) {
4830 auto ShiftAmt = B.buildConstant(S32, Shift);
4831 AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0);
4832 }
4833
4834 B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift));
4835 } else {
4836 B.buildCopy(DstReg, LiveIn);
4837 }
4838}
4839
// NOTE(review): the function name and leading parameters are missing from
// this listing. The body uses MI, B, WorkGroupIdPV, ClusterMaxIdPV and
// ClusterWorkGroupIdPV, so those are presumably the parameters.
//
// Writes a workgroup ID into operand 0 of MI and erases MI. Returns false,
// leaving MI in place, if any required input value cannot be loaded.
//
// Without cluster support the value is just the WorkGroupIdPV input. With
// cluster support the result depends on the function's cluster-dims kind; see
// the switch at the end.
4844 AMDGPUFunctionArgInfo::PreloadedValue ClusterWorkGroupIdPV) const {
4845 Register DstReg = MI.getOperand(0).getReg();
4846 if (!ST.hasClusters()) {
4847 if (!loadInputValue(DstReg, B, WorkGroupIdPV))
4848 return false;
4849 MI.eraseFromParent();
4850 return true;
4851 }
4852
4853 // Clusters are supported. Return the global position in the grid. If clusters
4854 // are enabled, WorkGroupIdPV returns the cluster ID not the workgroup ID.
4855
4856 // WorkGroupIdXYZ = ClusterId == 0 ?
4857 // ClusterIdXYZ :
4858 // ClusterIdXYZ * (ClusterMaxIdXYZ + 1) + ClusterWorkGroupIdXYZ
4859 MachineRegisterInfo &MRI = *B.getMRI();
4860 const LLT S32 = LLT::scalar(32);
4861 Register ClusterIdXYZ = MRI.createGenericVirtualRegister(S32);
4862 Register ClusterMaxIdXYZ = MRI.createGenericVirtualRegister(S32);
4863 Register ClusterWorkGroupIdXYZ = MRI.createGenericVirtualRegister(S32);
// All three inputs are loaded up front, whichever switch arm ends up using
// them.
4864 if (!loadInputValue(ClusterIdXYZ, B, WorkGroupIdPV) ||
4865 !loadInputValue(ClusterWorkGroupIdXYZ, B, ClusterWorkGroupIdPV) ||
4866 !loadInputValue(ClusterMaxIdXYZ, B, ClusterMaxIdPV))
4867 return false;
4868
// GlobalIdXYZ = ClusterWorkGroupIdXYZ + ClusterIdXYZ * (ClusterMaxIdXYZ + 1)
4869 auto One = B.buildConstant(S32, 1);
4870 auto ClusterSizeXYZ = B.buildAdd(S32, ClusterMaxIdXYZ, One);
4871 auto GlobalIdXYZ = B.buildAdd(S32, ClusterWorkGroupIdXYZ,
4872 B.buildMul(S32, ClusterIdXYZ, ClusterSizeXYZ));
4873
4874 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
4875
// NOTE(review): the `case` label lines of this switch are missing from the
// listing. The three visible arms are, in order:
//   1. copy the combined GlobalIdXYZ;
//   2. copy the raw WorkGroupIdPV value (ClusterIdXYZ);
//   3. read a hardware register field with S_GETREG_B32_const and select
//      ClusterIdXYZ when it is zero, GlobalIdXYZ otherwise.
4876 switch (MFI->getClusterDims().getKind()) {
4879 B.buildCopy(DstReg, GlobalIdXYZ);
4880 MI.eraseFromParent();
4881 return true;
4882 }
4884 B.buildCopy(DstReg, ClusterIdXYZ);
4885 MI.eraseFromParent();
4886 return true;
4887 }
4889 using namespace AMDGPU::Hwreg;
4890 unsigned ClusterIdField = HwregEncoding::encode(ID_IB_STS2, 6, 4);
4891 Register ClusterId = MRI.createGenericVirtualRegister(S32);
// The target instruction below defines this register directly, so it is
// given a concrete register class here.
4892 MRI.setRegClass(ClusterId, &AMDGPU::SReg_32RegClass);
4893 B.buildInstr(AMDGPU::S_GETREG_B32_const)
4894 .addDef(ClusterId)
4895 .addImm(ClusterIdField);
4896 auto Zero = B.buildConstant(S32, 0);
4897 auto NoClusters =
4898 B.buildICmp(CmpInst::ICMP_EQ, LLT::scalar(1), ClusterId, Zero);
4899 B.buildSelect(DstReg, NoClusters, ClusterIdXYZ, GlobalIdXYZ);
4900 MI.eraseFromParent();
4901 return true;
4902 }
4903 }
4904
4905 llvm_unreachable("nothing should reach here");
4906}
4907
// NOTE(review): the function name, the parameter that declares ArgType, the
// `case` labels of the switch and two condition lines are missing from this
// listing. Given the call sites `loadInputValue(DstReg, B, <PreloadedValue>)`,
// this is presumably loadInputValue.
//
// Produces the preloaded input value selected by ArgType in DstReg.
// Returns true when DstReg has been defined (by a register read, a constant
// or an undef) and false when the argument descriptor is not a valid register.
//
// Resolution order:
//   1. Under the guarded condition below (architected SGPRs plus a condition
//      line that is not visible), workgroup and cluster IDs come from fixed
//      TTMP registers/bit-fields, or are folded to constants when the cluster
//      dimensions are fixed.
//   2. Otherwise the descriptor is looked up in SIMachineFunctionInfo.
//   3. If there is still no descriptor, a constant 0 or an undef is emitted.
4909 Register DstReg, MachineIRBuilder &B,
4911 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
4912 const ArgDescriptor *Arg = nullptr;
4913 const TargetRegisterClass *ArgRC;
4914 LLT ArgTy;
4915
4916 CallingConv::ID CC = B.getMF().getFunction().getCallingConv();
// Local descriptors for the TTMP-based locations. Each names a register and,
// optionally, the mask of the bit-field that holds the value.
4917 const ArgDescriptor WorkGroupIDX =
4918 ArgDescriptor::createRegister(AMDGPU::TTMP9);
4919 // If GridZ is not programmed in an entry function then the hardware will set
4920 // it to all zeros, so there is no need to mask the GridY value in the low
4921 // order bits.
4922 const ArgDescriptor WorkGroupIDY = ArgDescriptor::createRegister(
4923 AMDGPU::TTMP7,
4924 AMDGPU::isEntryFunctionCC(CC) && !MFI->hasWorkGroupIDZ() ? ~0u : 0xFFFFu);
4925 const ArgDescriptor WorkGroupIDZ =
4926 ArgDescriptor::createRegister(AMDGPU::TTMP7, 0xFFFF0000u);
// The cluster values below are consecutive 4-bit fields of TTMP6.
4927 const ArgDescriptor ClusterWorkGroupIDX =
4928 ArgDescriptor::createRegister(AMDGPU::TTMP6, 0x0000000Fu);
4929 const ArgDescriptor ClusterWorkGroupIDY =
4930 ArgDescriptor::createRegister(AMDGPU::TTMP6, 0x000000F0u);
4931 const ArgDescriptor ClusterWorkGroupIDZ =
4932 ArgDescriptor::createRegister(AMDGPU::TTMP6, 0x00000F00u);
4933 const ArgDescriptor ClusterWorkGroupMaxIDX =
4934 ArgDescriptor::createRegister(AMDGPU::TTMP6, 0x0000F000u);
4935 const ArgDescriptor ClusterWorkGroupMaxIDY =
4936 ArgDescriptor::createRegister(AMDGPU::TTMP6, 0x000F0000u);
4937 const ArgDescriptor ClusterWorkGroupMaxIDZ =
4938 ArgDescriptor::createRegister(AMDGPU::TTMP6, 0x00F00000u);
4939 const ArgDescriptor ClusterWorkGroupMaxFlatID =
4940 ArgDescriptor::createRegister(AMDGPU::TTMP6, 0x0F000000u);
4941
// Helper for the cases where the value is known at compile time.
4942 auto LoadConstant = [&](unsigned N) {
4943 B.buildConstant(DstReg, N);
4944 return true;
4945 };
4946
// NOTE(review): the second line of this condition is missing from the listing.
4947 if (ST.hasArchitectedSGPRs() &&
4949 AMDGPU::ClusterDimsAttr ClusterDims = MFI->getClusterDims();
4950 bool HasFixedDims = ClusterDims.isFixedDims();
4951
// NOTE(review): every `case` label below is missing from the listing. Each
// arm is identified here only by the descriptor it selects.
4952 switch (ArgType) {
4954 Arg = &WorkGroupIDX;
4955 ArgRC = &AMDGPU::SReg_32RegClass;
4956 ArgTy = LLT::scalar(32);
4957 break;
4959 Arg = &WorkGroupIDY;
4960 ArgRC = &AMDGPU::SReg_32RegClass;
4961 ArgTy = LLT::scalar(32);
4962 break;
4964 Arg = &WorkGroupIDZ;
4965 ArgRC = &AMDGPU::SReg_32RegClass;
4966 ArgTy = LLT::scalar(32);
4967 break;
// With fixed cluster dims, a dimension of size 1 can only have ID 0.
4969 if (HasFixedDims && ClusterDims.getDims()[0] == 1)
4970 return LoadConstant(0);
4971 Arg = &ClusterWorkGroupIDX;
4972 ArgRC = &AMDGPU::SReg_32RegClass;
4973 ArgTy = LLT::scalar(32);
4974 break;
4976 if (HasFixedDims && ClusterDims.getDims()[1] == 1)
4977 return LoadConstant(0);
4978 Arg = &ClusterWorkGroupIDY;
4979 ArgRC = &AMDGPU::SReg_32RegClass;
4980 ArgTy = LLT::scalar(32);
4981 break;
4983 if (HasFixedDims && ClusterDims.getDims()[2] == 1)
4984 return LoadConstant(0);
4985 Arg = &ClusterWorkGroupIDZ;
4986 ArgRC = &AMDGPU::SReg_32RegClass;
4987 ArgTy = LLT::scalar(32);
4988 break;
// With fixed cluster dims, the maximum ID is the dimension size minus one.
4990 if (HasFixedDims)
4991 return LoadConstant(ClusterDims.getDims()[0] - 1);
4992 Arg = &ClusterWorkGroupMaxIDX;
4993 ArgRC = &AMDGPU::SReg_32RegClass;
4994 ArgTy = LLT::scalar(32);
4995 break;
4997 if (HasFixedDims)
4998 return LoadConstant(ClusterDims.getDims()[1] - 1);
4999 Arg = &ClusterWorkGroupMaxIDY;
5000 ArgRC = &AMDGPU::SReg_32RegClass;
5001 ArgTy = LLT::scalar(32);
5002 break;
5004 if (HasFixedDims)
5005 return LoadConstant(ClusterDims.getDims()[2] - 1);
5006 Arg = &ClusterWorkGroupMaxIDZ;
5007 ArgRC = &AMDGPU::SReg_32RegClass;
5008 ArgTy = LLT::scalar(32);
5009 break;
5011 Arg = &ClusterWorkGroupMaxFlatID;
5012 ArgRC = &AMDGPU::SReg_32RegClass;
5013 ArgTy = LLT::scalar(32);
5014 break;
5015 default:
5016 break;
5017 }
5018 }
5019
// Not resolved above: fall back to the function's recorded preloaded values.
5020 if (!Arg)
5021 std::tie(Arg, ArgRC, ArgTy) = MFI->getPreloadedValue(ArgType);
5022
5023 if (!Arg) {
// NOTE(review): the condition line that opens the inner block, listing line
// 5024, is missing. The comment below suggests it tests for the kernarg
// segment pointer; confirm against the original file.
5025 // The intrinsic may appear when we have a 0 sized kernarg segment, in
5026 // which case the pointer argument may be missing and we use null.
5027 return LoadConstant(0);
5028 }
5029
5030 // It's undefined behavior if a function marked with the amdgpu-no-*
5031 // attributes uses the corresponding intrinsic.
5032 B.buildUndef(DstReg);
5033 return true;
5034 }
5035
5036 if (!Arg->isRegister() || !Arg->getRegister().isValid())
5037 return false; // TODO: Handle these
5038 buildLoadInputValue(DstReg, B, Arg, ArgRC, ArgTy);
5039 return true;
5040}
5041
// NOTE(review): the whole signature of this function is missing from the
// listing; only the body is visible.
//
// Replaces MI with the preloaded input value selected by ArgType: the value is
// loaded into MI's operand-0 register and MI is erased. Returns false, leaving
// MI in place, if loadInputValue fails.
5045 if (!loadInputValue(MI.getOperand(0).getReg(), B, ArgType))
5046 return false;
5047
5048 MI.eraseFromParent();
5049 return true;
5050}
5051
// NOTE(review): the first signature line is missing from the listing. Given
// the later call `replaceWithConstant(B, MI, 0)`, this is presumably that
// helper.
//
// Defines MI's operand-0 register as the integer constant C, erases MI and
// returns true.
5053 int64_t C) {
5054 B.buildConstant(MI.getOperand(0).getReg(), C);
5055 MI.eraseFromParent();
5056 return true;
5057}
5058
// NOTE(review): the function name and leading parameters are missing from
// this listing; the body uses MI and B.
//
// Legalizes a workitem-ID query for dimension Dim, whose value is the
// preloaded argument ArgType. Always erases MI on success. Returns false only
// when loadInputValue fails.
5061 unsigned Dim, AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
// If the largest possible ID in this dimension is 0, the ID is the constant 0.
5062 unsigned MaxID = ST.getMaxWorkitemID(B.getMF().getFunction(), Dim);
5063 if (MaxID == 0)
5064 return replaceWithConstant(B, MI, 0);
5065
5066 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
5067 const ArgDescriptor *Arg;
5068 const TargetRegisterClass *ArgRC;
5069 LLT ArgTy;
5070 std::tie(Arg, ArgRC, ArgTy) = MFI->getPreloadedValue(ArgType);
5071
5072 Register DstReg = MI.getOperand(0).getReg();
5073 if (!Arg) {
5074 // It's undefined behavior if a function marked with the amdgpu-no-*
5075 // attributes uses the corresponding intrinsic.
5076 B.buildUndef(DstReg);
5077 MI.eraseFromParent();
5078 return true;
5079 }
5080
5081 if (Arg->isMasked()) {
5082 // Don't bother inserting AssertZext for packed IDs since we're emitting the
5083 // masking operations anyway.
5084 //
5085 // TODO: We could assert the top bit is 0 for the source copy.
5086 if (!loadInputValue(DstReg, B, ArgType))
5087 return false;
5088 } else {
// NOTE(review): the declaration of TmpReg, listing line 5089, is missing.
// The value is loaded into TmpReg and then annotated as fitting in
// bit_width(MaxID) bits.
5090 if (!loadInputValue(TmpReg, B, ArgType))
5091 return false;
5092 B.buildAssertZExt(DstReg, TmpReg, llvm::bit_width(MaxID));
5093 }
5094
5095 MI.eraseFromParent();
5096 return true;
5097}
5098
// NOTE(review): only a fragment of this function survives in the listing. Its
// signature and the statements that build PtrInfo are missing. What is visible
// is that it returns the PtrInfo object it constructed.
5101 // This isn't really a constant pool but close enough.
5104 return PtrInfo;
5105}
5106
// NOTE(review): the first signature line, the declaration of PtrTy and the
// second line of the loadInputValue call are missing from this listing.
//
// Returns a register holding the kernarg segment pointer advanced by Offset
// bytes. The base pointer is obtained through loadInputValue; failing to find
// it is treated as unreachable.
5108 int64_t Offset) const {
5110 Register KernArgReg = B.getMRI()->createGenericVirtualRegister(PtrTy);
5111
5112 // TODO: If we passed in the base kernel offset we could have a better
5113 // alignment than 4, but we don't really need it.
5114 if (!loadInputValue(KernArgReg, B,
5116 llvm_unreachable("failed to find kernarg segment ptr");
5117
// The offset is materialized as a 64-bit constant and added to the base.
5118 auto COffset = B.buildConstant(LLT::scalar(64), Offset);
5119 return B.buildObjectPtrOffset(PtrTy, KernArgReg, COffset).getReg(0);
5120}
5121
5122/// Legalize a value that's loaded from kernel arguments. This is only used by
5123/// legacy intrinsics.
// NOTE(review): the function name, its leading parameters, the statements
// that define Ptr and PtrInfo, and the trailing arguments of the buildLoad
// call are missing from this listing.
//
// Replaces MI with a load of its 32-bit result from the kernarg location at
// Offset, then erases MI and returns true.
5127 Align Alignment) const {
5128 Register DstReg = MI.getOperand(0).getReg();
5129
5130 assert(B.getMRI()->getType(DstReg) == LLT::scalar(32) &&
5131 "unexpected kernarg parameter type");
5132
// NOTE(review): the visible part of the load hard-codes Align(4), and the
// Alignment parameter is not referenced in any visible line. Confirm whether
// that is intended.
5135 B.buildLoad(DstReg, Ptr, PtrInfo.getWithOffset(Offset), Align(4),
5138 MI.eraseFromParent();
5139 return true;
5140}
5141
// NOTE(review): the function name and leading parameters are missing from
// this listing; the body uses MI, MRI and B.
//
// Dispatches a floating-point division to the expansion that matches the
// scalar width of its result (16, 32 or 64 bits). Returns false for any other
// result type.
5144 MachineIRBuilder &B) const {
5145 Register Dst = MI.getOperand(0).getReg();
5146 LLT DstTy = MRI.getType(Dst);
5147 LLT S16 = LLT::scalar(16);
5148 LLT S32 = LLT::scalar(32);
5149 LLT S64 = LLT::scalar(64);
5150
5151 if (DstTy == S16)
5152 return legalizeFDIV16(MI, MRI, B);
5153 if (DstTy == S32)
5154 return legalizeFDIV32(MI, MRI, B);
5155 if (DstTy == S64)
5156 return legalizeFDIV64(MI, MRI, B);
5157
5158 return false;
5159}
5160
// NOTE(review): the first signature line is missing from this listing. Given
// the call `legalizeUnsignedDIV_REM32Impl(B, DstDivReg, DstRemReg, Num, Den)`
// elsewhere in the file, this is presumably that function.
//
// Emits a 32-bit unsigned division and/or remainder of X by Y.
// The quotient is written to DstDivReg and the remainder to DstRemReg. Either
// may be left unset, in which case the corresponding result is not produced.
//
// The sequence starts from a float reciprocal estimate of Y, refines it once
// with integer arithmetic, forms a quotient/remainder estimate, and applies
// two conditional correction steps.
5162 Register DstDivReg,
5163 Register DstRemReg,
5164 Register X,
5165 Register Y) const {
5166 const LLT S1 = LLT::scalar(1);
5167 const LLT S32 = LLT::scalar(32);
5168
5169 // See AMDGPUCodeGenPrepare::expandDivRem32 for a description of the
5170 // algorithm used here.
5171
5172 // Initial estimate of inv(y).
5173 auto FloatY = B.buildUITOFP(S32, Y);
5174 auto RcpIFlag = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {FloatY});
// The scale factor is given as raw float bits.
5175 auto Scale = B.buildFConstant(S32, llvm::bit_cast<float>(0x4f7ffffe));
5176 auto ScaledY = B.buildFMul(S32, RcpIFlag, Scale);
5177 auto Z = B.buildFPTOUI(S32, ScaledY);
5178
5179 // One round of UNR.
// Z += umulh(Z, -Y * Z)
5180 auto NegY = B.buildSub(S32, B.buildConstant(S32, 0), Y);
5181 auto NegYZ = B.buildMul(S32, NegY, Z);
5182 Z = B.buildAdd(S32, Z, B.buildUMulH(S32, Z, NegYZ));
5183
5184 // Quotient/remainder estimate.
5185 auto Q = B.buildUMulH(S32, X, Z);
5186 auto R = B.buildSub(S32, X, B.buildMul(S32, Q, Y));
5187
5188 // First quotient/remainder refinement.
// If R >= Y, bump the quotient and subtract Y from the remainder. The
// quotient update is skipped when no quotient was requested.
5189 auto One = B.buildConstant(S32, 1);
5190 auto Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y);
5191 if (DstDivReg)
5192 Q = B.buildSelect(S32, Cond, B.buildAdd(S32, Q, One), Q);
5193 R = B.buildSelect(S32, Cond, B.buildSub(S32, R, Y), R);
5194
5195 // Second quotient/remainder refinement.
// Same correction again; this time the selects define the final results.
5196 Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y);
5197 if (DstDivReg)
5198 B.buildSelect(DstDivReg, Cond, B.buildAdd(S32, Q, One), Q);
5199
5200 if (DstRemReg)
5201 B.buildSelect(DstRemReg, Cond, B.buildSub(S32, R, Y), R);
5202}
5203
5204// Build integer reciprocal sequence around V_RCP_IFLAG_F32
5205//
5206// Return lo, hi of result
5207//
5208// %cvt.lo = G_UITOFP Val.lo
5209// %cvt.hi = G_UITOFP Val.hi
5210// %mad = G_FMAD %cvt.hi, 2**32, %cvt.lo
5211// %rcp = G_AMDGPU_RCP_IFLAG %mad
5212// %mul1 = G_FMUL %rcp, 0x5f7ffffc
5213// %mul2 = G_FMUL %mul1, 2**(-32)
5214// %trunc = G_INTRINSIC_TRUNC %mul2
5215// %mad2 = G_FMAD %trunc, -(2**32), %mul1
5216// return {G_FPTOUI %mad2, G_FPTOUI %trunc}
5217static std::pair<Register, Register> emitReciprocalU64(MachineIRBuilder &B,
5218 Register Val) {
5219 const LLT S32 = LLT::scalar(32);
5220 auto Unmerge = B.buildUnmerge(S32, Val);
5221
5222 auto CvtLo = B.buildUITOFP(S32, Unmerge.getReg(0));
5223 auto CvtHi = B.buildUITOFP(S32, Unmerge.getReg(1));
5224
5225 auto Mad = B.buildFMAD(
5226 S32, CvtHi, // 2**32
5227 B.buildFConstant(S32, llvm::bit_cast<float>(0x4f800000)), CvtLo);
5228
5229 auto Rcp = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {Mad});
5230 auto Mul1 = B.buildFMul(
5231 S32, Rcp, B.buildFConstant(S32, llvm::bit_cast<float>(0x5f7ffffc)));
5232
5233 // 2**(-32)
5234 auto Mul2 = B.buildFMul(
5235 S32, Mul1, B.buildFConstant(S32, llvm::bit_cast<float>(0x2f800000)));
5236 auto Trunc = B.buildIntrinsicTrunc(S32, Mul2);
5237
5238 // -(2**32)
5239 auto Mad2 = B.buildFMAD(
5240 S32, Trunc, B.buildFConstant(S32, llvm::bit_cast<float>(0xcf800000)),
5241 Mul1);
5242
5243 auto ResultLo = B.buildFPTOUI(S32, Mad2);
5244 auto ResultHi = B.buildFPTOUI(S32, Trunc);
5245
5246 return {ResultLo.getReg(0), ResultHi.getReg(0)};
5247}
5248
// NOTE(review): the first signature line is missing from this listing. Given
// the call `legalizeUnsignedDIV_REM64Impl(B, DstDivReg, DstRemReg, Num, Den)`
// elsewhere in the file, this is presumably that function.
//
// Emits a 64-bit unsigned division and/or remainder of Numer by Denom.
// The quotient is written to DstDivReg and the remainder to DstRemReg. Either
// may be left unset, in which case that result is not produced.
//
// Outline: take a 64-bit reciprocal estimate from emitReciprocalU64, refine it
// twice, form a quotient estimate with a high multiply, then apply up to two
// corrections. Control flow is not used: both corrections are always computed
// and the final values are picked with selects.
5250 Register DstDivReg,
5251 Register DstRemReg,
5252 Register Numer,
5253 Register Denom) const {
5254 const LLT S32 = LLT::scalar(32);
5255 const LLT S64 = LLT::scalar(64);
5256 const LLT S1 = LLT::scalar(1);
5257 Register RcpLo, RcpHi;
5258
5259 std::tie(RcpLo, RcpHi) = emitReciprocalU64(B, Denom);
5260
5261 auto Rcp = B.buildMergeLikeInstr(S64, {RcpLo, RcpHi});
5262
5263 auto Zero64 = B.buildConstant(S64, 0);
5264 auto NegDenom = B.buildSub(S64, Zero64, Denom);
5265
// First refinement: Add1 = Rcp + umulh(Rcp, -Denom * Rcp), with the 64-bit
// add done as a 32-bit add plus add-with-carry.
5266 auto MulLo1 = B.buildMul(S64, NegDenom, Rcp);
5267 auto MulHi1 = B.buildUMulH(S64, Rcp, MulLo1);
5268
5269 auto UnmergeMulHi1 = B.buildUnmerge(S32, MulHi1);
5270 Register MulHi1_Lo = UnmergeMulHi1.getReg(0);
5271 Register MulHi1_Hi = UnmergeMulHi1.getReg(1);
5272
5273 auto Add1_Lo = B.buildUAddo(S32, S1, RcpLo, MulHi1_Lo);
5274 auto Add1_Hi = B.buildUAdde(S32, S1, RcpHi, MulHi1_Hi, Add1_Lo.getReg(1));
5275 auto Add1 = B.buildMergeLikeInstr(S64, {Add1_Lo, Add1_Hi});
5276
// Second refinement, same shape: Add2 = Add1 + umulh(Add1, -Denom * Add1).
5277 auto MulLo2 = B.buildMul(S64, NegDenom, Add1);
5278 auto MulHi2 = B.buildUMulH(S64, Add1, MulLo2);
5279 auto UnmergeMulHi2 = B.buildUnmerge(S32, MulHi2);
5280 Register MulHi2_Lo = UnmergeMulHi2.getReg(0);
5281 Register MulHi2_Hi = UnmergeMulHi2.getReg(1);
5282
5283 auto Zero32 = B.buildConstant(S32, 0);
5284 auto Add2_Lo = B.buildUAddo(S32, S1, Add1_Lo, MulHi2_Lo);
5285 auto Add2_Hi = B.buildUAdde(S32, S1, Add1_Hi, MulHi2_Hi, Add2_Lo.getReg(1));
5286 auto Add2 = B.buildMergeLikeInstr(S64, {Add2_Lo, Add2_Hi});
5287
5288 auto UnmergeNumer = B.buildUnmerge(S32, Numer);
5289 Register NumerLo = UnmergeNumer.getReg(0);
5290 Register NumerHi = UnmergeNumer.getReg(1);
5291
// Quotient estimate MulHi3 = umulh(Numer, Add2), and remainder estimate
// Sub1 = Numer - Denom * MulHi3, computed in 32-bit pieces with borrow.
5292 auto MulHi3 = B.buildUMulH(S64, Numer, Add2);
5293 auto Mul3 = B.buildMul(S64, Denom, MulHi3);
5294 auto UnmergeMul3 = B.buildUnmerge(S32, Mul3);
5295 Register Mul3_Lo = UnmergeMul3.getReg(0);
5296 Register Mul3_Hi = UnmergeMul3.getReg(1);
5297 auto Sub1_Lo = B.buildUSubo(S32, S1, NumerLo, Mul3_Lo);
5298 auto Sub1_Hi = B.buildUSube(S32, S1, NumerHi, Mul3_Hi, Sub1_Lo.getReg(1));
// High-half difference without the low-half borrow. It feeds the
// borrow-chained subtractions in the correction steps below.
5299 auto Sub1_Mi = B.buildSub(S32, NumerHi, Mul3_Hi);
5300 auto Sub1 = B.buildMergeLikeInstr(S64, {Sub1_Lo, Sub1_Hi});
5301
5302 auto UnmergeDenom = B.buildUnmerge(S32, Denom);
5303 Register DenomLo = UnmergeDenom.getReg(0);
5304 Register DenomHi = UnmergeDenom.getReg(1);
5305
// C3 is all-ones when Sub1 >= Denom as an unsigned 64-bit value, else 0.
// When the high halves are equal the low-half comparison decides; otherwise
// the high-half comparison does.
5306 auto CmpHi = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Hi, DenomHi);
5307 auto C1 = B.buildSExt(S32, CmpHi);
5308
5309 auto CmpLo = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Lo, DenomLo);
5310 auto C2 = B.buildSExt(S32, CmpLo);
5311
5312 auto CmpEq = B.buildICmp(CmpInst::ICMP_EQ, S1, Sub1_Hi, DenomHi);
5313 auto C3 = B.buildSelect(S32, CmpEq, C2, C1);
5314
5315 // TODO: Here and below portions of the code can be enclosed into if/endif.
5316 // Currently control flow is unconditional and we have 4 selects after
5317 // potential endif to substitute PHIs.
5318
5319 // if C3 != 0 ...
// First correction: Sub2 = Sub1 - Denom and Add3 = MulHi3 + 1.
5320 auto Sub2_Lo = B.buildUSubo(S32, S1, Sub1_Lo, DenomLo);
5321 auto Sub2_Mi = B.buildUSube(S32, S1, Sub1_Mi, DenomHi, Sub1_Lo.getReg(1));
5322 auto Sub2_Hi = B.buildUSube(S32, S1, Sub2_Mi, Zero32, Sub2_Lo.getReg(1));
5323 auto Sub2 = B.buildMergeLikeInstr(S64, {Sub2_Lo, Sub2_Hi});
5324
5325 auto One64 = B.buildConstant(S64, 1);
5326 auto Add3 = B.buildAdd(S64, MulHi3, One64);
5327
// C6 is the same 64-bit ">= Denom" test as C3, applied to Sub2.
5328 auto C4 =
5329 B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Hi, DenomHi));
5330 auto C5 =
5331 B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Lo, DenomLo));
5332 auto C6 = B.buildSelect(
5333 S32, B.buildICmp(CmpInst::ICMP_EQ, S1, Sub2_Hi, DenomHi), C5, C4);
5334
5335 // if (C6 != 0)
// Second correction: Sub3 = Sub2 - Denom and Add4 = Add3 + 1.
5336 auto Add4 = B.buildAdd(S64, Add3, One64);
5337 auto Sub3_Lo = B.buildUSubo(S32, S1, Sub2_Lo, DenomLo);
5338
5339 auto Sub3_Mi = B.buildUSube(S32, S1, Sub2_Mi, DenomHi, Sub2_Lo.getReg(1));
5340 auto Sub3_Hi = B.buildUSube(S32, S1, Sub3_Mi, Zero32, Sub3_Lo.getReg(1));
5341 auto Sub3 = B.buildMergeLikeInstr(S64, {Sub3_Lo, Sub3_Hi});
5342
5343 // endif C6
5344 // endif C3
5345
// Final selection. With no correction (C3 == 0) the results are MulHi3 and
// Sub1. With one correction they are Add3 and Sub2. With two corrections
// (C6 != 0 as well) they are Add4 and Sub3.
5346 if (DstDivReg) {
5347 auto Sel1 = B.buildSelect(
5348 S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Add4, Add3);
5349 B.buildSelect(DstDivReg, B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32),
5350 Sel1, MulHi3);
5351 }
5352
5353 if (DstRemReg) {
5354 auto Sel2 = B.buildSelect(
5355 S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Sub3, Sub2);
5356 B.buildSelect(DstRemReg, B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32),
5357 Sel2, Sub1);
5358 }
5359}
5360
// NOTE(review): the function name and leading parameters are missing from
// this listing; the body uses MI, MRI and B.
//
// Legalizes G_UDIV, G_UREM and G_UDIVREM by expanding them with the 32-bit or
// 64-bit unsigned implementation, chosen by the result type. Returns false
// for any other type. On success MI is erased.
5363 MachineIRBuilder &B) const {
// Only the result registers the opcode actually defines are set. The ones
// left unset tell the implementation which results to skip.
5364 Register DstDivReg, DstRemReg;
5365 switch (MI.getOpcode()) {
5366 default:
5367 llvm_unreachable("Unexpected opcode!");
5368 case AMDGPU::G_UDIV: {
5369 DstDivReg = MI.getOperand(0).getReg();
5370 break;
5371 }
5372 case AMDGPU::G_UREM: {
5373 DstRemReg = MI.getOperand(0).getReg();
5374 break;
5375 }
5376 case AMDGPU::G_UDIVREM: {
5377 DstDivReg = MI.getOperand(0).getReg();
5378 DstRemReg = MI.getOperand(1).getReg();
5379 break;
5380 }
5381 }
5382
5383 const LLT S64 = LLT::scalar(64);
5384 const LLT S32 = LLT::scalar(32);
// The sources follow the defs, so their index depends on how many results
// the opcode has (one, or two for G_UDIVREM).
5385 const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs();
5386 Register Num = MI.getOperand(FirstSrcOpIdx).getReg();
5387 Register Den = MI.getOperand(FirstSrcOpIdx + 1).getReg();
5388 LLT Ty = MRI.getType(MI.getOperand(0).getReg());
5389
5390 if (Ty == S32)
5391 legalizeUnsignedDIV_REM32Impl(B, DstDivReg, DstRemReg, Num, Den);
5392 else if (Ty == S64)
5393 legalizeUnsignedDIV_REM64Impl(B, DstDivReg, DstRemReg, Num, Den);
5394 else
5395 return false;
5396
5397 MI.eraseFromParent();
5398 return true;
5399}
5400
// NOTE(review): the function name and leading parameters are missing from
// this listing; the body uses MI, MRI and B.
//
// Legalizes G_SDIV, G_SREM and G_SDIVREM for 32-bit and 64-bit scalars by
// reducing them to the unsigned expansion: take absolute values of both
// operands, divide unsigned into temporaries, then re-apply the signs.
// Returns false for any other type. On success MI is erased.
5403 MachineIRBuilder &B) const {
5404 const LLT S64 = LLT::scalar(64);
5405 const LLT S32 = LLT::scalar(32);
5406
5407 LLT Ty = MRI.getType(MI.getOperand(0).getReg());
5408 if (Ty != S32 && Ty != S64)
5409 return false;
5410
5411 const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs();
5412 Register LHS = MI.getOperand(FirstSrcOpIdx).getReg();
5413 Register RHS = MI.getOperand(FirstSrcOpIdx + 1).getReg();
5414
// An arithmetic shift by (bits - 1) yields all-ones for a negative operand
// and zero otherwise.
5415 auto SignBitOffset = B.buildConstant(S32, Ty.getSizeInBits() - 1);
5416 auto LHSign = B.buildAShr(Ty, LHS, SignBitOffset);
5417 auto RHSign = B.buildAShr(Ty, RHS, SignBitOffset);
5418
// Absolute value as (x + sign) ^ sign.
5419 LHS = B.buildAdd(Ty, LHS, LHSign).getReg(0);
5420 RHS = B.buildAdd(Ty, RHS, RHSign).getReg(0);
5421
5422 LHS = B.buildXor(Ty, LHS, LHSign).getReg(0);
5423 RHS = B.buildXor(Ty, RHS, RHSign).getReg(0);
5424
// Temporaries receive the unsigned results. Only those matching the opcode's
// results are created, so the unsigned expansion skips the others.
5425 Register DstDivReg, DstRemReg, TmpDivReg, TmpRemReg;
5426 switch (MI.getOpcode()) {
5427 default:
5428 llvm_unreachable("Unexpected opcode!");
5429 case AMDGPU::G_SDIV: {
5430 DstDivReg = MI.getOperand(0).getReg();
5431 TmpDivReg = MRI.createGenericVirtualRegister(Ty);
5432 break;
5433 }
5434 case AMDGPU::G_SREM: {
5435 DstRemReg = MI.getOperand(0).getReg();
5436 TmpRemReg = MRI.createGenericVirtualRegister(Ty);
5437 break;
5438 }
5439 case AMDGPU::G_SDIVREM: {
5440 DstDivReg = MI.getOperand(0).getReg();
5441 DstRemReg = MI.getOperand(1).getReg();
5442 TmpDivReg = MRI.createGenericVirtualRegister(Ty);
5443 TmpRemReg = MRI.createGenericVirtualRegister(Ty);
5444 break;
5445 }
5446 }
5447
5448 if (Ty == S32)
5449 legalizeUnsignedDIV_REM32Impl(B, TmpDivReg, TmpRemReg, LHS, RHS);
5450 else
5451 legalizeUnsignedDIV_REM64Impl(B, TmpDivReg, TmpRemReg, LHS, RHS);
5452
// Conditional negate as (x ^ sign) - sign. The quotient's sign is the XOR of
// the two operand signs.
5453 if (DstDivReg) {
5454 auto Sign = B.buildXor(Ty, LHSign, RHSign).getReg(0);
5455 auto SignXor = B.buildXor(Ty, TmpDivReg, Sign).getReg(0);
5456 B.buildSub(DstDivReg, SignXor, Sign);
5457 }
5458
5459 if (DstRemReg) {
5460 auto Sign = LHSign.getReg(0); // Remainder sign is the same as LHS
5461 auto SignXor = B.buildXor(Ty, TmpRemReg, Sign).getReg(0);
5462 B.buildSub(DstRemReg, SignXor, Sign);
5463 }
5464
5465 MI.eraseFromParent();
5466 return true;
5467}
5468
// NOTE(review): the function name and leading parameters are missing from
// this listing. The 16-bit and 32-bit FDIV expansions call
// `legalizeFastUnsafeFDIV(MI, MRI, B)` first, so this is presumably that
// function.
//
// Tries to lower a floating-point division to a reciprocal-based form when
// the instruction's fast-math flags allow it. Returns true and erases MI if a
// replacement was emitted. Returns false, with MI untouched, if the fast path
// does not apply.
//
// Forms produced:
//    1.0 / x -> rcp(x)
//   -1.0 / x -> rcp(fneg(x))
//      x / y -> x * rcp(y)
5471 MachineIRBuilder &B) const {
5472 Register Res = MI.getOperand(0).getReg();
5473 Register LHS = MI.getOperand(1).getReg();
5474 Register RHS = MI.getOperand(2).getReg();
5475 uint16_t Flags = MI.getFlags();
5476 LLT ResTy = MRI.getType(Res);
5477
5478 bool AllowInaccurateRcp = MI.getFlag(MachineInstr::FmAfn);
5479
// Constant-numerator cases. For a non-16-bit type these need afn, and a
// constant other than +/-1.0 falls through to the general path below.
5480 if (const auto *CLHS = getConstantFPVRegVal(LHS, MRI)) {
5481 if (!AllowInaccurateRcp && ResTy != LLT::scalar(16))
5482 return false;
5483
5484 // v_rcp_f32 and v_rsq_f32 do not support denormals, and according to
5485 // the CI documentation has a worst case error of 1 ulp.
5486 // OpenCL requires <= 2.5 ulp for 1.0 / x, so it should always be OK to
5487 // use it as long as we aren't trying to use denormals.
5488 //
5489 // v_rcp_f16 and v_rsq_f16 DO support denormals and 0.51ulp.
5490
5491 // 1 / x -> RCP(x)
5492 if (CLHS->isExactlyValue(1.0)) {
5493 B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res)
5494 .addUse(RHS)
5495 .setMIFlags(Flags);
5496
5497 MI.eraseFromParent();
5498 return true;
5499 }
5500
5501 // -1 / x -> RCP( FNEG(x) )
5502 if (CLHS->isExactlyValue(-1.0)) {
5503 auto FNeg = B.buildFNeg(ResTy, RHS, Flags);
5504 B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res)
5505 .addUse(FNeg.getReg(0))
5506 .setMIFlags(Flags);
5507
5508 MI.eraseFromParent();
5509 return true;
5510 }
5511 }
5512
5513 // For f16 require afn or arcp.
5514 // For f32 require afn.
5515 if (!AllowInaccurateRcp && (ResTy != LLT::scalar(16) ||
5516 !MI.getFlag(MachineInstr::FmArcp)))
5517 return false;
5518
5519 // x / y -> x * (1.0 / y)
5520 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy})
5521 .addUse(RHS)
5522 .setMIFlags(Flags);
5523 B.buildFMul(Res, LHS, RCP, Flags);
5524
5525 MI.eraseFromParent();
5526 return true;
5527}
5528
// NOTE(review): the function name and leading parameters are missing from
// this listing. The 64-bit FDIV expansion calls
// `legalizeFastUnsafeFDIV64(MI, MRI, B)` first, so this is presumably that
// function.
//
// Fast division path for X / Y, taken only when the instruction carries the
// afn flag; otherwise returns false without emitting anything.
//
// Starts from rcp(Y), refines it with two FMA-based iterations, and, unless
// the numerator is exactly +/-1.0, applies one more FMA correction to the
// product X * R. On success MI is erased and true is returned.
5531 MachineIRBuilder &B) const {
5532 Register Res = MI.getOperand(0).getReg();
5533 Register X = MI.getOperand(1).getReg();
5534 Register Y = MI.getOperand(2).getReg();
5535 uint16_t Flags = MI.getFlags();
5536 LLT ResTy = MRI.getType(Res);
5537
5538 bool AllowInaccurateRcp = MI.getFlag(MachineInstr::FmAfn);
5539
5540 if (!AllowInaccurateRcp)
5541 return false;
5542
5543 const ConstantFP *CLHS = getConstantFPVRegVal(X, MRI);
5544 bool IsNegRcp = CLHS && CLHS->isExactlyValue(-1.0);
5545
5546 // Pull out the negation so it folds for free into the source modifiers.
5547 if (IsNegRcp)
5548 X = B.buildFConstant(ResTy, 1.0).getReg(0);
5549
// In the -1.0 / Y case the reciprocal R itself is negated below, so the
// multiplier used in the refinement FMAs is Y rather than -Y.
5550 Register NegY = IsNegRcp ? Y : B.buildFNeg(ResTy, Y).getReg(0);
5551 auto One = B.buildFConstant(ResTy, 1.0);
5552
5553 auto R = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy})
5554 .addUse(Y)
5555 .setMIFlags(Flags);
5556 if (IsNegRcp)
5557 R = B.buildFNeg(ResTy, R);
5558
// Two refinement iterations of the form: Tmp = NegY * R + 1; R = Tmp * R + R.
5559 auto Tmp0 = B.buildFMA(ResTy, NegY, R, One);
5560 R = B.buildFMA(ResTy, Tmp0, R, R);
5561
5562 auto Tmp1 = B.buildFMA(ResTy, NegY, R, One);
5563 R = B.buildFMA(ResTy, Tmp1, R, R);
5564
5565 // Skip the last 2 correction terms for reciprocal.
5566 if (IsNegRcp || (CLHS && CLHS->isExactlyValue(1.0))) {
5567 B.buildCopy(Res, R);
5568 MI.eraseFromParent();
5569 return true;
5570 }
5571
// General numerator: Ret = X * R, then Res = (NegY * Ret + X) * R + Ret.
5572 auto Ret = B.buildFMul(ResTy, X, R);
5573 auto Tmp2 = B.buildFMA(ResTy, NegY, Ret, X);
5574
5575 B.buildFMA(Res, Tmp2, R, Ret);
5576 MI.eraseFromParent();
5577 return true;
5578}
5579
// NOTE(review): the function name and leading parameters are missing from
// this listing; the body uses MI, MRI and B. Given the dispatch on 16-bit
// results elsewhere in the file, this is presumably legalizeFDIV16.
//
// Expands a 16-bit floating-point division. The fast reciprocal path is tried
// first. Otherwise both operands are extended to 32 bits, the quotient is
// computed there with a reciprocal plus error-correction steps, truncated back
// to 16 bits, and passed through the div_fixup intrinsic. Always returns true
// and erases MI.
5582 MachineIRBuilder &B) const {
5583 if (legalizeFastUnsafeFDIV(MI, MRI, B))
5584 return true;
5585
5586 Register Res = MI.getOperand(0).getReg();
5587 Register LHS = MI.getOperand(1).getReg();
5588 Register RHS = MI.getOperand(2).getReg();
5589
5590 uint16_t Flags = MI.getFlags();
5591
5592 LLT S16 = LLT::scalar(16);
5593 LLT S32 = LLT::scalar(32);
5594
5595 // a32.u = opx(V_CVT_F32_F16, a.u); // CVT to F32
5596 // b32.u = opx(V_CVT_F32_F16, b.u); // CVT to F32
5597 // r32.u = opx(V_RCP_F32, b32.u); // rcp = 1 / d
5598 // q32.u = opx(V_MUL_F32, a32.u, r32.u); // q = n * rcp
5599 // e32.u = opx(V_MAD_F32, (b32.u^_neg32), q32.u, a32.u); // err = -d * q + n
5600 // q32.u = opx(V_MAD_F32, e32.u, r32.u, q32.u); // q = n * rcp
5601 // e32.u = opx(V_MAD_F32, (b32.u^_neg32), q32.u, a32.u); // err = -d * q + n
5602 // tmp.u = opx(V_MUL_F32, e32.u, r32.u);
5603 // tmp.u = opx(V_AND_B32, tmp.u, 0xff800000)
5604 // q32.u = opx(V_ADD_F32, tmp.u, q32.u);
5605 // q16.u = opx(V_CVT_F16_F32, q32.u);
5606 // q16.u = opx(V_DIV_FIXUP_F16, q16.u, b.u, a.u); // q = touchup(q, d, n)
5607
5608 auto LHSExt = B.buildFPExt(S32, LHS, Flags);
5609 auto RHSExt = B.buildFPExt(S32, RHS, Flags);
5610 auto NegRHSExt = B.buildFNeg(S32, RHSExt);
5611 auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32})
5612 .addUse(RHSExt.getReg(0))
5613 .setMIFlags(Flags);
5614 auto Quot = B.buildFMul(S32, LHSExt, Rcp, Flags);
// NOTE(review): the declaration of Err, listing line 5615, is missing.
// The two branches emit the same error/quotient/error steps, with FMAD where
// the subtarget has mad/mac f32 instructions and FMA otherwise.
5616 if (ST.hasMadMacF32Insts()) {
5617 Err = B.buildFMAD(S32, NegRHSExt, Quot, LHSExt, Flags);
5618 Quot = B.buildFMAD(S32, Err, Rcp, Quot, Flags);
5619 Err = B.buildFMAD(S32, NegRHSExt, Quot, LHSExt, Flags);
5620 } else {
5621 Err = B.buildFMA(S32, NegRHSExt, Quot, LHSExt, Flags);
5622 Quot = B.buildFMA(S32, Err, Rcp, Quot, Flags);
5623 Err = B.buildFMA(S32, NegRHSExt, Quot, LHSExt, Flags);
5624 }
5625 auto Tmp = B.buildFMul(S32, Err, Rcp, Flags);
// Mask 0xff800000 keeps only the top nine bits of the correction term before
// it is added to the quotient.
5626 Tmp = B.buildAnd(S32, Tmp, B.buildConstant(S32, 0xff800000));
5627 Quot = B.buildFAdd(S32, Tmp, Quot, Flags);
5628 auto RDst = B.buildFPTrunc(S16, Quot, Flags);
// div_fixup takes the quotient, then the denominator, then the numerator.
5629 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res)
5630 .addUse(RDst.getReg(0))
5631 .addUse(RHS)
5632 .addUse(LHS)
5633 .setMIFlags(Flags);
5634
5635 MI.eraseFromParent();
5636 return true;
5637}
5638
// NOTE(review): the initializer of this constant, listing line 5640, is
// missing. Below it is passed as the register-field immediate of
// S_SETREG_IMM32_B32, and the 32-bit FDIV expansion uses it with S_GETREG_B32
// and S_SETREG_B32.
5639static constexpr unsigned SPDenormModeBitField =
5641
5642// Enable or disable FP32 denorm mode. When 'Enable' is true, emit instructions
5643// to enable denorm mode. When 'Enable' is false, disable denorm mode.
// NOTE(review): the first signature line and the line declaring the Mode
// parameter are missing from this listing. The body uses Enable, B, ST and
// Mode.
5645 const GCNSubtarget &ST,
5647 // Set SP denorm mode to this value.
// "Disable" here means restoring the function's default SP denorm setting
// taken from Mode, not necessarily flushing.
5648 unsigned SPDenormMode =
5649 Enable ? FP_DENORM_FLUSH_NONE : Mode.fpDenormModeSPValue();
5650
5651 if (ST.hasDenormModeInst()) {
5652 // Preserve default FP64FP16 denorm mode while updating FP32 mode.
5653 uint32_t DPDenormModeDefault = Mode.fpDenormModeDPValue();
5654
// The immediate packs the SP value in the low bits and the DP value shifted
// left by two.
5655 uint32_t NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2);
5656 B.buildInstr(AMDGPU::S_DENORM_MODE)
5657 .addImm(NewDenormModeValue);
5658
5659 } else {
// Without the dedicated instruction, only the SP field is written, through
// a set-register with immediate.
5660 B.buildInstr(AMDGPU::S_SETREG_IMM32_B32)
5661 .addImm(SPDenormMode)
5662 .addImm(SPDenormModeBitField);
5663 }
5664}
5665
// NOTE(review): the function name and leading parameters are missing from
// this listing; the body uses MI, MRI and B. Given the dispatch on 32-bit
// results elsewhere in the file, this is presumably legalizeFDIV32.
//
// Expands a 32-bit floating-point division. The fast reciprocal path is tried
// first. Otherwise the expansion is: div_scale on both operands, rcp of the
// scaled denominator, a chain of FMA refinements, then div_fmas and div_fixup.
//
// If the function's FP32 denormal mode is not IEEE, the FMA chain is bracketed
// by instructions that switch FP32 denormal handling on and then restore it.
// Always returns true and erases MI.
5668 MachineIRBuilder &B) const {
5669 if (legalizeFastUnsafeFDIV(MI, MRI, B))
5670 return true;
5671
5672 Register Res = MI.getOperand(0).getReg();
5673 Register LHS = MI.getOperand(1).getReg();
5674 Register RHS = MI.getOperand(2).getReg();
5675 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
5676 SIModeRegisterDefaults Mode = MFI->getMode();
5677
5678 uint16_t Flags = MI.getFlags();
5679
5680 LLT S32 = LLT::scalar(32);
5681 LLT S1 = LLT::scalar(1);
5682
5683 auto One = B.buildFConstant(S32, 1.0f);
5684
// The two div_scale calls differ only in the trailing immediate (0 or 1).
// Each yields a scaled value and an S1 flag.
5685 auto DenominatorScaled =
5686 B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1})
5687 .addUse(LHS)
5688 .addUse(RHS)
5689 .addImm(0)
5690 .setMIFlags(Flags);
5691 auto NumeratorScaled =
5692 B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1})
5693 .addUse(LHS)
5694 .addUse(RHS)
5695 .addImm(1)
5696 .setMIFlags(Flags);
5697
5698 auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32})
5699 .addUse(DenominatorScaled.getReg(0))
5700 .setMIFlags(Flags);
5701 auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags);
5702
5703 const bool PreservesDenormals = Mode.FP32Denormals == DenormalMode::getIEEE();
5704 const bool HasDynamicDenormals =
5705 (Mode.FP32Denormals.Input == DenormalMode::Dynamic) ||
5706 (Mode.FP32Denormals.Output == DenormalMode::Dynamic);
5707
// With a dynamic denormal mode the current setting is unknown at compile
// time, so it is read into a register here and written back afterwards.
5708 Register SavedSPDenormMode;
5709 if (!PreservesDenormals) {
5710 if (HasDynamicDenormals) {
5711 SavedSPDenormMode = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5712 B.buildInstr(AMDGPU::S_GETREG_B32)
5713 .addDef(SavedSPDenormMode)
5714 .addImm(SPDenormModeBitField);
5715 }
5716 toggleSPDenormMode(true, B, ST, Mode);
5717 }
5718
// Refinement chain, emitted while FP32 denormals are enabled:
//   Fma0 = -d * rcp + 1        Fma1 = Fma0 * rcp + rcp
//   Mul  = n * Fma1            Fma2 = -d * Mul + n
//   Fma3 = Fma2 * Fma1 + Mul   Fma4 = -d * Fma3 + n
// where d and n are the scaled denominator and numerator.
5719 auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags);
5720 auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags);
5721 auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags);
5722 auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags);
5723 auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags);
5724 auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags);
5725
// Restore the denormal mode: from the saved register in the dynamic case,
// otherwise back to the function's default.
5726 if (!PreservesDenormals) {
5727 if (HasDynamicDenormals) {
5728 assert(SavedSPDenormMode);
5729 B.buildInstr(AMDGPU::S_SETREG_B32)
5730 .addReg(SavedSPDenormMode)
5731 .addImm(SPDenormModeBitField);
5732 } else
5733 toggleSPDenormMode(false, B, ST, Mode);
5734 }
5735
// div_fmas takes the S1 flag produced by the numerator div_scale.
5736 auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32})
5737 .addUse(Fma4.getReg(0))
5738 .addUse(Fma1.getReg(0))
5739 .addUse(Fma3.getReg(0))
5740 .addUse(NumeratorScaled.getReg(1))
5741 .setMIFlags(Flags);
5742
5743 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res)
5744 .addUse(Fmas.getReg(0))
5745 .addUse(RHS)
5746 .addUse(LHS)
5747 .setMIFlags(Flags);
5748
5749 MI.eraseFromParent();
5750 return true;
5751}
5752
// NOTE(review): the function name and leading parameters are missing from
// this listing; the body uses MI, MRI and B. Given the dispatch on 64-bit
// results elsewhere in the file, this is presumably legalizeFDIV64.
//
// Expands a 64-bit floating-point division. The 64-bit fast path is tried
// first. Otherwise the expansion is: div_scale, rcp of the scaled value, FMA
// refinement of the reciprocal, a scaled quotient with one residual, then
// div_fmas and div_fixup. Always returns true and erases MI.
5755 MachineIRBuilder &B) const {
5756 if (legalizeFastUnsafeFDIV64(MI, MRI, B))
5757 return true;
5758
5759 Register Res = MI.getOperand(0).getReg();
5760 Register LHS = MI.getOperand(1).getReg();
5761 Register RHS = MI.getOperand(2).getReg();
5762
5763 uint16_t Flags = MI.getFlags();
5764
5765 LLT S64 = LLT::scalar(64);
5766 LLT S1 = LLT::scalar(1);
5767
5768 auto One = B.buildFConstant(S64, 1.0);
5769
5770 auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1})
5771 .addUse(LHS)
5772 .addUse(RHS)
5773 .addImm(0)
5774 .setMIFlags(Flags);
5775
5776 auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags);
5777
5778 auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64})
5779 .addUse(DivScale0.getReg(0))
5780 .setMIFlags(Flags);
5781
// Reciprocal refinement, where d is DivScale0:
//   Fma0 = -d * Rcp + 1    Fma1 = Rcp * Fma0 + Rcp
//   Fma2 = -d * Fma1 + 1   Fma3 = Fma1 * Fma2 + Fma1   (emitted further down)
5782 auto Fma0 = B.buildFMA(S64, NegDivScale0, Rcp, One, Flags);
5783 auto Fma1 = B.buildFMA(S64, Rcp, Fma0, Rcp, Flags);
5784 auto Fma2 = B.buildFMA(S64, NegDivScale0, Fma1, One, Flags);
5785
5786 auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1})
5787 .addUse(LHS)
5788 .addUse(RHS)
5789 .addImm(1)
5790 .setMIFlags(Flags);
5791
// Scaled quotient Mul = DivScale1 * Fma3 and its residual
// Fma4 = -d * Mul + DivScale1.
5792 auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags);
5793 auto Mul = B.buildFMul(S64, DivScale1.getReg(0), Fma3, Flags);
5794 auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags);
5795
5796 Register Scale;
5797 if (!ST.hasUsableDivScaleConditionOutput()) {
5798 // Workaround a hardware bug on SI where the condition output from div_scale
5799 // is not usable.
5800
5801 LLT S32 = LLT::scalar(32);
5802
// Rebuild the flag by hand: compare the high 32 bits of each original operand
// with the high 32 bits of a div_scale result, then XOR the two comparisons.
5803 auto NumUnmerge = B.buildUnmerge(S32, LHS);
5804 auto DenUnmerge = B.buildUnmerge(S32, RHS);
5805 auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0);
5806 auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1);
5807
5808 auto CmpNum = B.buildICmp(ICmpInst::ICMP_EQ, S1, NumUnmerge.getReg(1),
5809 Scale1Unmerge.getReg(1));
5810 auto CmpDen = B.buildICmp(ICmpInst::ICMP_EQ, S1, DenUnmerge.getReg(1),
5811 Scale0Unmerge.getReg(1));
5812 Scale = B.buildXor(S1, CmpNum, CmpDen).getReg(0);
5813 } else {
5814 Scale = DivScale1.getReg(1);
5815 }
5816
5817 auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64})
5818 .addUse(Fma4.getReg(0))
5819 .addUse(Fma3.getReg(0))
5820 .addUse(Mul.getReg(0))
5821 .addUse(Scale)
5822 .setMIFlags(Flags);
5823
5824 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, ArrayRef(Res))
5825 .addUse(Fmas.getReg(0))
5826 .addUse(RHS)
5827 .addUse(LHS)
5828 .setMIFlags(Flags);
5829
5830 MI.eraseFromParent();
5831 return true;
5832}
5833
// NOTE(review): the function name and leading parameters are missing from
// this listing; the body uses MI, MRI and B.
//
// Expands an instruction with two results (mantissa in operand 0, exponent in
// operand 1) and one source (operand 2) into the frexp_mant and frexp_exp
// intrinsics. Always returns true and erases MI.
5836 MachineIRBuilder &B) const {
5837 Register Res0 = MI.getOperand(0).getReg();
5838 Register Res1 = MI.getOperand(1).getReg();
5839 Register Val = MI.getOperand(2).getReg();
5840 uint16_t Flags = MI.getFlags();
5841
5842 LLT Ty = MRI.getType(Res0);
// The exponent intrinsic is built with a 16-bit result for 16-bit inputs and
// a 32-bit result otherwise.
5843 LLT InstrExpTy = Ty == LLT::scalar(16) ? LLT::scalar(16) : LLT::scalar(32);
5844
5845 auto Mant = B.buildIntrinsic(Intrinsic::amdgcn_frexp_mant, {Ty})
5846 .addUse(Val)
5847 .setMIFlags(Flags);
5848 auto Exp = B.buildIntrinsic(Intrinsic::amdgcn_frexp_exp, {InstrExpTy})
5849 .addUse(Val)
5850 .setMIFlags(Flags);
5851
// On subtargets with the fract bug, override the intrinsic results whenever
// |Val| is not ordered-less-than +inf: the exponent becomes 0 and the mantissa
// becomes Val itself.
5852 if (ST.hasFractBug()) {
5853 auto Fabs = B.buildFAbs(Ty, Val);
5854 auto Inf = B.buildFConstant(Ty, APFloat::getInf(getFltSemanticForLLT(Ty)));
5855 auto IsFinite =
5856 B.buildFCmp(CmpInst::FCMP_OLT, LLT::scalar(1), Fabs, Inf, Flags);
5857 auto Zero = B.buildConstant(InstrExpTy, 0);
5858 Exp = B.buildSelect(InstrExpTy, IsFinite, Exp, Zero);
5859 Mant = B.buildSelect(Ty, IsFinite, Mant, Val);
5860 }
5861
5862 B.buildCopy(Res0, Mant);
// Resize the exponent to whatever width the second result has.
5863 B.buildSExtOrTrunc(Res1, Exp);
5864
5865 MI.eraseFromParent();
5866 return true;
5867}
5868
5871 MachineIRBuilder &B) const {
// Expands MI into Res = Sel * (LHS * rcp(RHS * Sel)), where Sel is 2^-32 when
// |RHS| > 2^96 and 1.0 otherwise. LHS and RHS are read from operands 2 and 3,
// all values are 32-bit, and MI is erased afterwards. Always returns true.
// The beginning of this signature is not present in this listing.
5872 Register Res = MI.getOperand(0).getReg();
5873 Register LHS = MI.getOperand(2).getReg();
5874 Register RHS = MI.getOperand(3).getReg();
5875 uint16_t Flags = MI.getFlags();
5876
5877 LLT S32 = LLT::scalar(32);
5878 LLT S1 = LLT::scalar(1);
5879
5880 auto Abs = B.buildFAbs(S32, RHS, Flags);
// NOTE(review): C0Val is not referenced anywhere else in this function and
// looks like a leftover; confirm and remove.
5881 const APFloat C0Val(1.0f);
5882
// C0 is the magnitude threshold, C1 the scale used above it, C2 the neutral
// scale.
5883 auto C0 = B.buildFConstant(S32, 0x1p+96f);
5884 auto C1 = B.buildFConstant(S32, 0x1p-32f);
5885 auto C2 = B.buildFConstant(S32, 1.0f);
5886
5887 auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags);
5888 auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags);
5889
// The denominator is pre-scaled by Sel before the reciprocal (presumably to
// keep the rcp result out of the denormal range for huge denominators --
// TODO confirm).
5890 auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags);
5891
5892 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32})
5893 .addUse(Mul0.getReg(0))
5894 .setMIFlags(Flags);
5895
5896 auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags);
5897
// Multiply by Sel again so the scale applied to the denominator cancels out.
5898 B.buildFMul(Res, Sel, Mul1, Flags);
5899
5900 MI.eraseFromParent();
5901 return true;
5902}
5903
5906 MachineIRBuilder &B) const {
5907 // Bypass the correct expansion a standard promotion through G_FSQRT would
5908 // get. The f32 op is accurate enough for the f16 case.
//
// The source (operand 1) is extended to 32 bits, amdgcn.sqrt is applied, and
// the result is truncated into operand 0. MI is erased; always returns true.
5909 unsigned Flags = MI.getFlags();
// This path is only expected on subtargets without 16-bit instructions.
5910 assert(!ST.has16BitInsts());
5911 const LLT F32 = LLT::scalar(32);
5912 auto Ext = B.buildFPExt(F32, MI.getOperand(1), Flags);
// NOTE(review): despite its name, Log2 holds the amdgcn.sqrt result.
5913 auto Log2 = B.buildIntrinsic(Intrinsic::amdgcn_sqrt, {F32})
5914 .addUse(Ext.getReg(0))
5915 .setMIFlags(Flags);
5916 B.buildFPTrunc(MI.getOperand(0), Log2, Flags);
5917 MI.eraseFromParent();
5918 return true;
5919}
5920
5923 MachineIRBuilder &B) const {
// 32-bit square root expansion. Operand 0 is the destination and operand 1
// the input X. MI is erased and true is returned on every path.
// The beginning of this signature is not present in this listing.
5924 MachineFunction &MF = B.getMF();
5925 Register Dst = MI.getOperand(0).getReg();
5926 Register X = MI.getOperand(1).getReg();
5927 const unsigned Flags = MI.getFlags();
5928 const LLT S1 = LLT::scalar(1);
// F32 and I32 are the same 32-bit scalar LLT; the two names only document
// whether a value is being treated as a float or as its integer bit pattern.
5929 const LLT F32 = LLT::scalar(32);
5930 const LLT I32 = LLT::scalar(32);
5931
// When approximate functions are allowed, emit the bare amdgcn.sqrt intrinsic
// with no scaling or refinement.
5932 if (allowApproxFunc(MF, Flags)) {
5933 B.buildIntrinsic(Intrinsic::amdgcn_sqrt, ArrayRef<Register>({Dst}))
5934 .addUse(X)
5935 .setMIFlags(Flags);
5936 MI.eraseFromParent();
5937 return true;
5938 }
5939
// Inputs below 2^-96 are multiplied by 2^32 before taking the root; the
// result is multiplied by 2^-16 (= sqrt(2^-32)) further down to compensate.
5940 auto ScaleThreshold = B.buildFConstant(F32, 0x1.0p-96f);
5941 auto NeedScale = B.buildFCmp(CmpInst::FCMP_OGT, S1, ScaleThreshold, X, Flags);
5942 auto ScaleUpFactor = B.buildFConstant(F32, 0x1.0p+32f);
5943 auto ScaledX = B.buildFMul(F32, X, ScaleUpFactor, Flags);
5944 auto SqrtX = B.buildSelect(F32, NeedScale, ScaledX, X, Flags);
5945
// NOTE: the declaration of SqrtS (listing line 5946) is absent from this
// excerpt.
5947 if (needsDenormHandlingF32(MF, X, Flags)) {
// Start from the hardware sqrt, then test the two neighbouring values
// obtained by adding -1 / +1 to the integer bit pattern of the result.
5948 B.buildIntrinsic(Intrinsic::amdgcn_sqrt, ArrayRef<Register>({SqrtS}))
5949 .addUse(SqrtX.getReg(0))
5950 .setMIFlags(Flags);
5951
5952 auto NegOne = B.buildConstant(I32, -1);
5953 auto SqrtSNextDown = B.buildAdd(I32, SqrtS, NegOne);
5954
// SqrtVP = SqrtX - SqrtSNextDown * SqrtS
5955 auto NegSqrtSNextDown = B.buildFNeg(F32, SqrtSNextDown, Flags);
5956 auto SqrtVP = B.buildFMA(F32, NegSqrtSNextDown, SqrtS, SqrtX, Flags);
5957
5958 auto PosOne = B.buildConstant(I32, 1);
5959 auto SqrtSNextUp = B.buildAdd(I32, SqrtS, PosOne);
5960
// SqrtVS = SqrtX - SqrtSNextUp * SqrtS
5961 auto NegSqrtSNextUp = B.buildFNeg(F32, SqrtSNextUp, Flags);
5962 auto SqrtVS = B.buildFMA(F32, NegSqrtSNextUp, SqrtS, SqrtX, Flags);
5963
5964 auto Zero = B.buildFConstant(F32, 0.0f);
5965 auto SqrtVPLE0 = B.buildFCmp(CmpInst::FCMP_OLE, S1, SqrtVP, Zero, Flags);
5966
// Step down when the lower residual is <= 0 ...
5967 SqrtS =
5968 B.buildSelect(F32, SqrtVPLE0, SqrtSNextDown, SqrtS, Flags).getReg(0);
5969
// ... and step up when the upper residual is > 0.
5970 auto SqrtVPVSGT0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, SqrtVS, Zero, Flags);
5971 SqrtS =
5972 B.buildSelect(F32, SqrtVPVSGT0, SqrtSNextUp, SqrtS, Flags).getReg(0);
5973 } else {
// rsq-based refinement, with r = rsq(x):
//   s = x * r;  h = 0.5 * r
//   e = 0.5 - h * s;  h = h * e + h;  s = s * e + s
//   d = x - s * s;    s = d * h + s
5974 auto SqrtR =
5975 B.buildIntrinsic(Intrinsic::amdgcn_rsq, {F32}).addReg(SqrtX.getReg(0));
5976 B.buildFMul(SqrtS, SqrtX, SqrtR, Flags);
5977
5978 auto Half = B.buildFConstant(F32, 0.5f);
5979 auto SqrtH = B.buildFMul(F32, SqrtR, Half, Flags);
5980 auto NegSqrtH = B.buildFNeg(F32, SqrtH, Flags);
5981 auto SqrtE = B.buildFMA(F32, NegSqrtH, SqrtS, Half, Flags);
5982 SqrtH = B.buildFMA(F32, SqrtH, SqrtE, SqrtH, Flags);
5983 SqrtS = B.buildFMA(F32, SqrtS, SqrtE, SqrtS, Flags).getReg(0);
5984 auto NegSqrtS = B.buildFNeg(F32, SqrtS, Flags);
5985 auto SqrtD = B.buildFMA(F32, NegSqrtS, SqrtS, SqrtX, Flags);
5986 SqrtS = B.buildFMA(F32, SqrtD, SqrtH, SqrtS, Flags).getReg(0);
5987 }
5988
// Undo the input scaling: sqrt(x * 2^32) = sqrt(x) * 2^16.
5989 auto ScaleDownFactor = B.buildFConstant(F32, 0x1.0p-16f);
5990
5991 auto ScaledDown = B.buildFMul(F32, SqrtS, ScaleDownFactor, Flags);
5992
5993 SqrtS = B.buildSelect(F32, NeedScale, ScaledDown, SqrtS, Flags).getReg(0);
5994
// For zero (either sign) and +inf inputs the (possibly scaled) input itself is
// the result.
5995 auto IsZeroOrInf = B.buildIsFPClass(LLT::scalar(1), SqrtX, fcZero | fcPosInf);
5996 B.buildSelect(Dst, IsZeroOrInf, SqrtX, SqrtS, Flags);
5997
5998 MI.eraseFromParent();
5999 return true;
6000}
6001
6004 MachineIRBuilder &B) const {
6005 // For double type, the SQRT and RSQ instructions don't have required
6006 // precision, we apply Goldschmidt's algorithm to improve the result:
6007 //
6008 // y0 = rsq(x)
6009 // g0 = x * y0
6010 // h0 = 0.5 * y0
6011 //
6012 // r0 = 0.5 - h0 * g0
6013 // g1 = g0 * r0 + g0
6014 // h1 = h0 * r0 + h0
6015 //
6016 // r1 = 0.5 - h1 * g1 => d0 = x - g1 * g1
6017 // g2 = g1 * r1 + g1 g2 = d0 * h1 + g1
6018 // h2 = h1 * r1 + h1
6019 //
6020 // r2 = 0.5 - h2 * g2 => d1 = x - g2 * g2
6021 // g3 = g2 * r2 + g2 g3 = d1 * h1 + g2
6022 //
6023 // sqrt(x) = g3
//
// Operand 0 is the 64-bit destination and operand 1 the input. When the
// instruction carries the afn flag, the input scaling and the final (g3)
// refinement step are both skipped. MI is erased; always returns true.
6024
6025 const LLT S1 = LLT::scalar(1);
6026 const LLT S32 = LLT::scalar(32);
6027 const LLT F64 = LLT::scalar(64);
6028
6029 Register Dst = MI.getOperand(0).getReg();
6030 assert(MRI.getType(Dst) == F64 && "only expect to lower f64 sqrt");
6031
6032 Register X = MI.getOperand(1).getReg();
6033 unsigned Flags = MI.getFlags();
6034
6035 Register SqrtX = X;
// Scaling and ZeroInt are only assigned (and only read) on the non-afn path.
6036 Register Scaling, ZeroInt;
6037 if (!MI.getFlag(MachineInstr::FmAfn)) {
// Inputs below 2^-767 are scaled by 2^256 through ldexp; the result is scaled
// by 2^-128 (the square root of that factor's inverse) further down.
6038 auto ScaleConstant = B.buildFConstant(F64, 0x1.0p-767);
6039
6040 ZeroInt = B.buildConstant(S32, 0).getReg(0);
6041 Scaling = B.buildFCmp(FCmpInst::FCMP_OLT, S1, X, ScaleConstant).getReg(0);
6042
6043 // Scale up input if it is too small.
6044 auto ScaleUpFactor = B.buildConstant(S32, 256);
6045 auto ScaleUp = B.buildSelect(S32, Scaling, ScaleUpFactor, ZeroInt);
6046 SqrtX = B.buildFLdexp(F64, X, ScaleUp, Flags).getReg(0);
6047 }
6048
// y0
6049 auto SqrtY = B.buildIntrinsic(Intrinsic::amdgcn_rsq, {F64}).addReg(SqrtX);
6050
// h0 and g0
6051 auto Half = B.buildFConstant(F64, 0.5);
6052 auto SqrtH0 = B.buildFMul(F64, SqrtY, Half);
6053 auto SqrtS0 = B.buildFMul(F64, SqrtX, SqrtY);
6054
// r0
6055 auto NegSqrtH0 = B.buildFNeg(F64, SqrtH0);
6056 auto SqrtR0 = B.buildFMA(F64, NegSqrtH0, SqrtS0, Half);
6057
// g1 and h1
6058 auto SqrtS1 = B.buildFMA(F64, SqrtS0, SqrtR0, SqrtS0);
6059 auto SqrtH1 = B.buildFMA(F64, SqrtH0, SqrtR0, SqrtH0);
6060
// d0
6061 auto NegSqrtS1 = B.buildFNeg(F64, SqrtS1);
6062 auto SqrtD0 = B.buildFMA(F64, NegSqrtS1, SqrtS1, SqrtX);
6063
// g2
6064 auto SqrtS2 = B.buildFMA(F64, SqrtD0, SqrtH1, SqrtS1);
6065
6066 Register SqrtRet = SqrtS2.getReg(0);
6067 if (!MI.getFlag(MachineInstr::FmAfn)) {
// d1, then g3 (held in SqrtD2).
6068 auto NegSqrtS2 = B.buildFNeg(F64, SqrtS2);
6069 auto SqrtD1 = B.buildFMA(F64, NegSqrtS2, SqrtS2, SqrtX);
6070 auto SqrtD2 = B.buildFMA(F64, SqrtD1, SqrtH1, SqrtS2);
6071
6072 // Scale down the result.
6073 auto ScaleDownFactor = B.buildConstant(S32, -128);
6074 auto ScaleDown = B.buildSelect(S32, Scaling, ScaleDownFactor, ZeroInt);
6075 SqrtRet = B.buildFLdexp(F64, SqrtD2, ScaleDown, Flags).getReg(0);
6076 }
6077
// With the no-infs flag only zero needs the pass-through, so an ordered
// compare against 0.0 replaces the class test.
6078 Register IsZeroOrInf;
6079 if (MI.getFlag(MachineInstr::FmNoInfs)) {
6080 auto ZeroFP = B.buildFConstant(F64, 0.0);
6081 IsZeroOrInf = B.buildFCmp(FCmpInst::FCMP_OEQ, S1, SqrtX, ZeroFP).getReg(0);
6082 } else {
6083 IsZeroOrInf = B.buildIsFPClass(S1, SqrtX, fcZero | fcPosInf).getReg(0);
6084 }
6085
6086 // TODO: Check for DAZ and expand to subnormals
6087
6088 // If x is +INF, +0, or -0, use its original value
6089 B.buildSelect(Dst, IsZeroOrInf, SqrtX, SqrtRet, Flags);
6090
6091 MI.eraseFromParent();
6092 return true;
6093}
6094
6097 MachineIRBuilder &B) const {
// Dispatches on the scalar type of operand 0 to the 32-, 64- or 16-bit square
// root expansion. Any other type is rejected by returning false.
6098 LLT Ty = MRI.getType(MI.getOperand(0).getReg());
6099 if (Ty == LLT::scalar(32))
6100 return legalizeFSQRTF32(MI, MRI, B);
6101 if (Ty == LLT::scalar(64))
6102 return legalizeFSQRTF64(MI, MRI, B);
6103 if (Ty == LLT::scalar(16))
6104 return legalizeFSQRTF16(MI, MRI, B);
6105 return false;
6106}
6107
6108// Expand llvm.amdgcn.rsq.clamp on targets that don't support the instruction.
6109// FIXME: Why do we handle this one but not other removed instructions?
6110//
6111// Reciprocal square root. The clamp prevents infinite results, clamping
6112// infinities to max_float. D.f = 1.0 / sqrt(S0.f), result clamped to
6113// +-max_float.
6116 MachineIRBuilder &B) const {
// Generations before VOLCANIC_ISLANDS are left untouched (MI is kept and true
// is returned); later generations get the rsq + min/max expansion below.
6117 if (ST.getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS)
6118 return true;
6119
6120 Register Dst = MI.getOperand(0).getReg();
6121 Register Src = MI.getOperand(2).getReg();
6122 auto Flags = MI.getFlags();
6123
6124 LLT Ty = MRI.getType(Dst);
6125
// Only 32-bit and 64-bit scalars are handled; anything else fails
// legalization.
6126 const fltSemantics *FltSemantics;
6127 if (Ty == LLT::scalar(32))
6128 FltSemantics = &APFloat::IEEEsingle();
6129 else if (Ty == LLT::scalar(64))
6130 FltSemantics = &APFloat::IEEEdouble();
6131 else
6132 return false;
6133
6134 auto Rsq = B.buildIntrinsic(Intrinsic::amdgcn_rsq, {Ty})
6135 .addUse(Src)
6136 .setMIFlags(Flags);
6137
6138 // We don't need to concern ourselves with the snan handling difference: the
6139 // rsq has already quieted the value (or not), so use the min/max form which
// will directly select for the function's IEEE mode.
6140 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
6141 const bool UseIEEE = MFI->getMode().IEEE;
6142
// Clamp from above with min(rsq, +max_float) ...
6143 auto MaxFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics));
6144 auto ClampMax = UseIEEE ? B.buildFMinNumIEEE(Ty, Rsq, MaxFlt, Flags) :
6145 B.buildFMinNum(Ty, Rsq, MaxFlt, Flags);
6146
// ... then from below with max(..., -max_float).
6147 auto MinFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics, true));
6148
6149 if (UseIEEE)
6150 B.buildFMaxNumIEEE(Dst, ClampMax, MinFlt, Flags);
6151 else
6152 B.buildFMaxNum(Dst, ClampMax, MinFlt, Flags);
6153 MI.eraseFromParent();
6154 return true;
6155}
6156
6157// TODO: Fix pointer type handling
6160 Intrinsic::ID IID) const {
// Legalizes the lane intrinsic IID carried by MI. A value of exactly
// SplitSize bits is already legal. Values narrower than 32 bits are
// any-extended to 32 bits, operated on, and truncated back. Wider values
// whose size is a multiple of SplitSize are split into SplitSize-bit pieces,
// each piece gets its own intrinsic call, and the pieces are merged back.
// Any other size returns false. The beginning of this signature is not
// present in this listing.
6161
6162 MachineIRBuilder &B = Helper.MIRBuilder;
6163 MachineRegisterInfo &MRI = *B.getMRI();
6164
6165 bool IsPermLane16 = IID == Intrinsic::amdgcn_permlane16 ||
6166 IID == Intrinsic::amdgcn_permlanex16;
6167 bool IsSetInactive = IID == Intrinsic::amdgcn_set_inactive ||
6168 IID == Intrinsic::amdgcn_set_inactive_chain_arg;
6169 bool IsPermlaneShuffle = IID == Intrinsic::amdgcn_permlane_bcast ||
6170 IID == Intrinsic::amdgcn_permlane_up ||
6171 IID == Intrinsic::amdgcn_permlane_down ||
6172 IID == Intrinsic::amdgcn_permlane_xor;
6173
// Builds one call of the same intrinsic with result type VT. Src0..Src2 are
// the (possibly extended or split) register operands; any remaining register
// or immediate operands are copied unchanged from the original MI.
6174 auto createLaneOp = [&IID, &B, &MI](Register Src0, Register Src1,
6175 Register Src2, LLT VT) -> Register {
6176 auto LaneOp = B.buildIntrinsic(IID, {VT}).addUse(Src0);
6177 switch (IID) {
6178 case Intrinsic::amdgcn_readfirstlane:
6179 case Intrinsic::amdgcn_permlane64:
6180 return LaneOp.getReg(0);
6181 case Intrinsic::amdgcn_readlane:
6182 case Intrinsic::amdgcn_set_inactive:
6183 case Intrinsic::amdgcn_set_inactive_chain_arg:
6184 return LaneOp.addUse(Src1).getReg(0);
6185 case Intrinsic::amdgcn_writelane:
6186 case Intrinsic::amdgcn_permlane_bcast:
6187 case Intrinsic::amdgcn_permlane_up:
6188 case Intrinsic::amdgcn_permlane_down:
6189 case Intrinsic::amdgcn_permlane_xor:
6190 return LaneOp.addUse(Src1).addUse(Src2).getReg(0);
6191 case Intrinsic::amdgcn_permlane16:
6192 case Intrinsic::amdgcn_permlanex16: {
6193 Register Src3 = MI.getOperand(5).getReg();
6194 int64_t Src4 = MI.getOperand(6).getImm();
6195 int64_t Src5 = MI.getOperand(7).getImm();
6196 return LaneOp.addUse(Src1)
6197 .addUse(Src2)
6198 .addUse(Src3)
6199 .addImm(Src4)
6200 .addImm(Src5)
6201 .getReg(0);
6202 }
6203 case Intrinsic::amdgcn_mov_dpp8:
6204 return LaneOp.addImm(MI.getOperand(3).getImm()).getReg(0);
6205 case Intrinsic::amdgcn_update_dpp:
6206 return LaneOp.addUse(Src1)
6207 .addImm(MI.getOperand(4).getImm())
6208 .addImm(MI.getOperand(5).getImm())
6209 .addImm(MI.getOperand(6).getImm())
6210 .addImm(MI.getOperand(7).getImm())
6211 .getReg(0);
6212 default:
6213 llvm_unreachable("unhandled lane op");
6214 }
6215 };
6216
// Operand 0 is the result and operand 2 the first source. Src1 (operand 3)
// and Src2 (operand 4) are only read for the intrinsics that have them.
6217 Register DstReg = MI.getOperand(0).getReg();
6218 Register Src0 = MI.getOperand(2).getReg();
6219 Register Src1, Src2;
6220 if (IID == Intrinsic::amdgcn_readlane || IID == Intrinsic::amdgcn_writelane ||
6221 IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16 ||
6222 IsPermlaneShuffle) {
6223 Src1 = MI.getOperand(3).getReg();
6224 if (IID == Intrinsic::amdgcn_writelane || IsPermLane16 ||
6225 IsPermlaneShuffle) {
6226 Src2 = MI.getOperand(4).getReg();
6227 }
6228 }
6229
6230 LLT Ty = MRI.getType(DstReg);
6231 unsigned Size = Ty.getSizeInBits();
6232
// Pieces are 32 bits wide, except for update_dpp on a multiple-of-64-bit value
// when the subtarget has DPALU DPP and the DPP control (operand 4) is legal
// for it; then 64-bit pieces are used.
6233 unsigned SplitSize = 32;
6234 if (IID == Intrinsic::amdgcn_update_dpp && (Size % 64 == 0) &&
6235 ST.hasDPALU_DPP() &&
6236 AMDGPU::isLegalDPALU_DPPControl(ST, MI.getOperand(4).getImm()))
6237 SplitSize = 64;
6238
6239 if (Size == SplitSize) {
6240 // Already legal
6241 return true;
6242 }
6243
// Narrow case: widen the value operands to 32 bits and truncate the result.
// Only the operands that carry data of the result type are extended (Src1 for
// update_dpp / set_inactive / permlane16, Src2 for writelane).
6244 if (Size < 32) {
6245 Src0 = B.buildAnyExt(S32, Src0).getReg(0);
6246
6247 if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16)
6248 Src1 = B.buildAnyExt(LLT::scalar(32), Src1).getReg(0);
6249
6250 if (IID == Intrinsic::amdgcn_writelane)
6251 Src2 = B.buildAnyExt(LLT::scalar(32), Src2).getReg(0);
6252
6253 Register LaneOpDst = createLaneOp(Src0, Src1, Src2, S32);
6254 B.buildTrunc(DstReg, LaneOpDst);
6255 MI.eraseFromParent();
6256 return true;
6257 }
6258
6259 if (Size % SplitSize != 0)
6260 return false;
6261
// Choose the type of each piece. Vectors whose element size equals SplitSize
// are split per element; vectors of 16- or 32-bit elements are split into
// sub-vectors of SplitSize bits; everything else uses scalar pieces and
// (for other vector element sizes) a final bitcast.
6262 LLT PartialResTy = LLT::scalar(SplitSize);
6263 bool NeedsBitcast = false;
6264 if (Ty.isVector()) {
6265 LLT EltTy = Ty.getElementType();
6266 unsigned EltSize = EltTy.getSizeInBits();
6267 if (EltSize == SplitSize) {
6268 PartialResTy = EltTy;
6269 } else if (EltSize == 16 || EltSize == 32) {
6270 unsigned NElem = SplitSize / EltSize;
6271 PartialResTy = Ty.changeElementCount(ElementCount::getFixed(NElem));
6272 } else {
6273 // Handle all other cases via S32/S64 pieces
6274 NeedsBitcast = true;
6275 }
6276 }
6277
6278 SmallVector<Register, 4> PartialRes;
6279 unsigned NumParts = Size / SplitSize;
6280 MachineInstrBuilder Src0Parts = B.buildUnmerge(PartialResTy, Src0);
6281 MachineInstrBuilder Src1Parts, Src2Parts;
6282
6283 if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16)
6284 Src1Parts = B.buildUnmerge(PartialResTy, Src1);
6285
6286 if (IID == Intrinsic::amdgcn_writelane)
6287 Src2Parts = B.buildUnmerge(PartialResTy, Src2);
6288
// Emit one lane op per piece; operands that were not split keep the value
// read from MI above.
6289 for (unsigned i = 0; i < NumParts; ++i) {
6290 Src0 = Src0Parts.getReg(i);
6291
6292 if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16)
6293 Src1 = Src1Parts.getReg(i);
6294
6295 if (IID == Intrinsic::amdgcn_writelane)
6296 Src2 = Src2Parts.getReg(i);
6297
6298 PartialRes.push_back(createLaneOp(Src0, Src1, Src2, PartialResTy));
6299 }
6300
6301 if (NeedsBitcast)
6302 B.buildBitcast(DstReg, B.buildMergeLikeInstr(
6303 LLT::scalar(Ty.getSizeInBits()), PartialRes));
6304 else
6305 B.buildMergeLikeInstr(DstReg, PartialRes);
6306
6307 MI.eraseFromParent();
6308 return true;
6309}
6310
6313 MachineIRBuilder &B) const {
// Writes into DstReg a pointer formed by adding the constant Offset to a base
// pointer obtained through loadInputValue. Returns false if that input
// cannot be loaded, true otherwise.
// NOTE: listing lines 6314, 6316 and 6322 are absent from this excerpt, so the
// declaration of Offset, the argument passed to getImplicitParameterOffset,
// and the argument selecting which input is loaded are not visible here.
6315 ST.getTargetLowering()->getImplicitParameterOffset(
6317 LLT DstTy = MRI.getType(DstReg);
// The offset constant is an integer as wide as the destination pointer.
6318 LLT IdxTy = LLT::scalar(DstTy.getSizeInBits());
6319
6320 Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy);
6321 if (!loadInputValue(KernargPtrReg, B,
6323 return false;
6324
6325 B.buildObjectPtrOffset(DstReg, KernargPtrReg,
6326 B.buildConstant(IdxTy, Offset).getReg(0));
6327 return true;
6328}
6329
6330/// To create a buffer resource from a 64-bit pointer, mask off the upper 16
6331/// bits of the pointer and replace them with the stride argument, then
6332/// merge_values everything together. In the common case of a raw buffer (the
6333/// stride component is 0), we can just AND off those upper bits.
/// Subtargets with 45-bit num_records use a different packing, described in
/// the body. Operands: 0 = result, 2 = pointer, 3 = stride, 4 = num_records,
/// 5 = flags. MI is erased; always returns true.
6336 Register Result = MI.getOperand(0).getReg();
6337 Register Pointer = MI.getOperand(2).getReg();
6338 Register Stride = MI.getOperand(3).getReg();
6339 Register NumRecords = MI.getOperand(4).getReg();
6340 Register Flags = MI.getOperand(5).getReg();
6341
6342 LLT S32 = LLT::scalar(32);
6343 LLT S64 = LLT::scalar(64);
6344
// Advance the insertion point by one instruction before building
// (presumably so the new code is placed after MI -- TODO confirm).
6345 B.setInsertPt(B.getMBB(), ++B.getInsertPt());
6346
6347 auto ExtStride = B.buildAnyExt(S32, Stride);
6348
6349 if (ST.has45BitNumRecordsBufferResource()) {
6350 Register Zero = B.buildConstant(S32, 0).getReg(0);
6351 // Build the lower 64-bit value, which has a 57-bit base and the lower 7-bit
6352 // num_records.
6353 LLT PtrIntTy = LLT::scalar(MRI.getType(Pointer).getSizeInBits());
6354 auto PointerInt = B.buildPtrToInt(PtrIntTy, Pointer);
6355 auto ExtPointer = B.buildAnyExtOrTrunc(S64, PointerInt);
6356 auto NumRecordsLHS = B.buildShl(S64, NumRecords, B.buildConstant(S32, 57));
6357 Register LowHalf = B.buildOr(S64, ExtPointer, NumRecordsLHS).getReg(0);
6358
6359 // Build the higher 64-bit value, which has the higher 38-bit num_records,
6360 // 6-bit zero (omit), 16-bit stride and scale and 4-bit flag.
// The stride and flags are shifted within a 32-bit value and then placed in
// the upper dword of a 64-bit value by merging with a zero low dword, which
// puts them at bits 44 and 60 of the high half respectively.
6361 auto NumRecordsRHS = B.buildLShr(S64, NumRecords, B.buildConstant(S32, 7));
6362 auto ShiftedStride = B.buildShl(S32, ExtStride, B.buildConstant(S32, 12));
6363 auto ExtShiftedStride =
6364 B.buildMergeValues(S64, {Zero, ShiftedStride.getReg(0)});
6365 auto ShiftedFlags = B.buildShl(S32, Flags, B.buildConstant(S32, 28));
6366 auto ExtShiftedFlags =
6367 B.buildMergeValues(S64, {Zero, ShiftedFlags.getReg(0)});
6368 auto CombinedFields = B.buildOr(S64, NumRecordsRHS, ExtShiftedStride);
6369 Register HighHalf =
6370 B.buildOr(S64, CombinedFields, ExtShiftedFlags).getReg(0);
6371 B.buildMergeValues(Result, {LowHalf, HighHalf});
6372 } else {
// Four-dword layout: {ptr[31:0], ptr[47:32] | stride << 16, num_records,
// flags}.
6373 NumRecords = B.buildTrunc(S32, NumRecords).getReg(0);
6374 auto Unmerge = B.buildUnmerge(S32, Pointer);
6375 auto LowHalf = Unmerge.getReg(0);
6376 auto HighHalf = Unmerge.getReg(1);
6377
// Keep only the low 16 bits of the pointer's high dword and put the stride
// in the 16 bits above them.
6378 auto AndMask = B.buildConstant(S32, 0x0000ffff);
6379 auto Masked = B.buildAnd(S32, HighHalf, AndMask);
6380 auto ShiftConst = B.buildConstant(S32, 16);
6381 auto ShiftedStride = B.buildShl(S32, ExtStride, ShiftConst);
6382 auto NewHighHalf = B.buildOr(S32, Masked, ShiftedStride);
6383 Register NewHighHalfReg = NewHighHalf.getReg(0);
6384 B.buildMergeValues(Result, {LowHalf, NewHighHalfReg, NumRecords, Flags});
6385 }
6386
6387 MI.eraseFromParent();
6388 return true;
6389}
6390
6393 MachineIRBuilder &B) const {
// Non-entry functions defer to legalizePreloadedArgIntrin (its last argument
// is on listing line 6397, absent from this excerpt). Entry functions compute
// the pointer with getImplicitArgPtr and erase MI; failure there fails the
// legalization.
6394 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
6395 if (!MFI->isEntryFunction()) {
6396 return legalizePreloadedArgIntrin(MI, MRI, B,
6398 }
6399
6400 Register DstReg = MI.getOperand(0).getReg();
6401 if (!getImplicitArgPtr(DstReg, MRI, B))
6402 return false;
6403
6404 MI.eraseFromParent();
6405 return true;
6406}
6407
6410 MachineIRBuilder &B) const {
// Materializes a constant into DstReg when a value is available for the
// current function (the call producing KnownSize is on listing line 6413,
// absent from this excerpt).
// NOTE(review): this returns false even after the constant has been built.
// The caller in this file treats a false return as failure and returns
// without erasing MI, so the success path looks like it should return true --
// confirm.
6411 Function &F = B.getMF().getFunction();
6412 std::optional<uint32_t> KnownSize =
6414 if (KnownSize.has_value())
6415 B.buildConstant(DstReg, *KnownSize);
6416 return false;
6417}
6418
6421 MachineIRBuilder &B) const {
// Non-entry functions defer to legalizePreloadedArgIntrin (its last argument
// is on listing line 6426, absent from this excerpt). Entry functions ask
// getLDSKernelId to fill the destination register and erase MI when it
// reports success.
6422
6423 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
6424 if (!MFI->isEntryFunction()) {
6425 return legalizePreloadedArgIntrin(MI, MRI, B,
6427 }
6428
6429 Register DstReg = MI.getOperand(0).getReg();
6430 if (!getLDSKernelId(DstReg, MRI, B))
6431 return false;
6432
6433 MI.eraseFromParent();
6434 return true;
6435}
6436
6440 unsigned AddrSpace) const {
// Tests whether the pointer in operand 2 belongs to AddrSpace by examining
// the high 32 bits of the pointer; the boolean result is written to operand
// 0. MI is erased; always returns true. The beginning of this signature is
// not present in this listing.
6441 const LLT S32 = LLT::scalar(32);
6442 auto Unmerge = B.buildUnmerge(S32, MI.getOperand(2).getReg());
6443 Register Hi32 = Unmerge.getReg(1);
6444
6445 if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS &&
6446 ST.hasGloballyAddressableScratch()) {
// Read the high half of the flat scratch base into an SGPR-class register.
6447 Register FlatScratchBaseHi =
6448 B.buildInstr(AMDGPU::S_MOV_B32, {S32},
6449 {Register(AMDGPU::SRC_FLAT_SCRATCH_BASE_HI)})
6450 .getReg(0);
6451 MRI.setRegClass(FlatScratchBaseHi, &AMDGPU::SReg_32RegClass);
6452 // Test bits 63..58 against the aperture address.
// The XOR is below 2^26 exactly when the top six bits of the two high dwords
// agree.
6453 Register XOR = B.buildXor(S32, Hi32, FlatScratchBaseHi).getReg(0);
6454 B.buildICmp(ICmpInst::ICMP_ULT, MI.getOperand(0), XOR,
6455 B.buildConstant(S32, 1u << 26));
6456 } else {
// Otherwise the whole high dword must equal the segment aperture.
6457 Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B);
6458 B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg);
6459 }
6460 MI.eraseFromParent();
6461 return true;
6462}
6463
6464// The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args:
6465// offset (the offset that is included in bounds checking and swizzling, to be
6466// split between the instruction's voffset and immoffset fields) and soffset
6467// (the offset that is excluded from bounds checking and swizzling, to go in
6468// the instruction's soffset field). This function takes the first kind of
6469// offset and figures out how to split it between voffset and immoffset.
6470std::pair<Register, unsigned>
6472 Register OrigOffset) const {
// Returns {register part, immediate part} of OrigOffset. The register part
// is always a valid register on return (a zero constant is built when
// nothing else is left). The line carrying the function name is not present
// in this listing.
6473 const unsigned MaxImm = SIInstrInfo::getMaxMUBUFImmOffset(ST);
6474 Register BaseReg;
6475 unsigned ImmOffset;
6476 const LLT S32 = LLT::scalar(32);
6477 MachineRegisterInfo &MRI = *B.getMRI();
6478
6479 // On GFX1250+, voffset and immoffset are zero-extended from 32 bits before
6480 // being added, so we can only safely match a 32-bit addition with no unsigned
6481 // overflow.
6482 bool CheckNUW = ST.hasGFX1250Insts();
6483 std::tie(BaseReg, ImmOffset) = AMDGPU::getBaseWithConstantOffset(
6484 MRI, OrigOffset, /*KnownBits=*/nullptr, CheckNUW);
6485
6486 // If BaseReg is a pointer, convert it to int.
6487 if (MRI.getType(BaseReg).isPointer())
6488 BaseReg = B.buildPtrToInt(MRI.getType(OrigOffset), BaseReg).getReg(0);
6489
6490 // If the immediate value is too big for the immoffset field, put only bits
6491 // that would normally fit in the immoffset field. The remaining value that
6492 // is copied/added for the voffset field is a large power of 2, and it
6493 // stands more chance of being CSEd with the copy/add for another similar
6494 // load/store.
6495 // However, do not do that rounding down if that is a negative
6496 // number, as it appears to be illegal to have a negative offset in the
6497 // vgpr, even if adding the immediate offset makes it positive.
// Overflow holds the constant bits outside MaxImm (this assumes MaxImm is a
// low-bit mask -- TODO confirm); ImmOffset keeps the bits inside it.
6498 unsigned Overflow = ImmOffset & ~MaxImm;
6499 ImmOffset -= Overflow;
// If the overflow part is negative as a signed 32-bit value, move the whole
// constant into the register part and use an immediate of 0.
6500 if ((int32_t)Overflow < 0) {
6501 Overflow += ImmOffset;
6502 ImmOffset = 0;
6503 }
6504
// Fold any overflow into the register part, creating it if there was none.
6505 if (Overflow != 0) {
6506 if (!BaseReg) {
6507 BaseReg = B.buildConstant(S32, Overflow).getReg(0);
6508 } else {
6509 auto OverflowVal = B.buildConstant(S32, Overflow);
6510 BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0);
6511 }
6512 }
6513
6514 if (!BaseReg)
6515 BaseReg = B.buildConstant(S32, 0).getReg(0);
6516
6517 return std::pair(BaseReg, ImmOffset);
6518}
6519
6520/// Handle register layout difference for f16 images for some subtargets.
6523 Register Reg,
6524 bool ImageStore) const {
// Reg must be a vector of 16-bit elements. Returns a register holding the
// same data in the layout selected by the subtarget checks below; when none
// of them applies, Reg is returned as is (with <3 x s16> padded to
// <4 x s16>). The beginning of this signature is not present in this listing.
6525 const LLT S16 = LLT::scalar(16);
6526 const LLT S32 = LLT::scalar(32);
6527 LLT StoreVT = MRI.getType(Reg);
6528 assert(StoreVT.isVector() && StoreVT.getElementType() == S16);
6529
// Unpacked D16 memory instructions: every 16-bit element is any-extended into
// its own 32-bit element.
6530 if (ST.hasUnpackedD16VMem()) {
6531 auto Unmerge = B.buildUnmerge(S16, Reg);
6532
6533 SmallVector<Register, 4> WideRegs;
// The "- 1" skips the unmerge's final operand (presumably its source
// register), so only the results are visited.
6534 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
6535 WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0));
6536
6537 int NumElts = StoreVT.getNumElements();
6538
6539 return B.buildBuildVector(LLT::fixed_vector(NumElts, S32), WideRegs)
6540 .getReg(0);
6541 }
6542
// Image-store D16 bug workaround: keep the data packed but pad the vector
// with undef so it has as many 32-bit elements as the original had 16-bit
// elements.
6543 if (ImageStore && ST.hasImageStoreD16Bug()) {
// <2 x s16> -> <2 x s32> {packed, undef}
6544 if (StoreVT.getNumElements() == 2) {
6545 SmallVector<Register, 4> PackedRegs;
6546 Reg = B.buildBitcast(S32, Reg).getReg(0);
6547 PackedRegs.push_back(Reg);
6548 PackedRegs.resize(2, B.buildUndef(S32).getReg(0));
6549 return B.buildBuildVector(LLT::fixed_vector(2, S32), PackedRegs)
6550 .getReg(0);
6551 }
6552
// <3 x s16> -> <6 x s16> padded with undef -> bitcast to <3 x s32>
6553 if (StoreVT.getNumElements() == 3) {
6554 SmallVector<Register, 4> PackedRegs;
6555 auto Unmerge = B.buildUnmerge(S16, Reg);
6556 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
6557 PackedRegs.push_back(Unmerge.getReg(I));
6558 PackedRegs.resize(6, B.buildUndef(S16).getReg(0));
6559 Reg = B.buildBuildVector(LLT::fixed_vector(6, S16), PackedRegs).getReg(0);
6560 return B.buildBitcast(LLT::fixed_vector(3, S32), Reg).getReg(0);
6561 }
6562
// <4 x s16> -> <2 x s32> -> <4 x s32> {packed, packed, undef, undef}
6563 if (StoreVT.getNumElements() == 4) {
6564 SmallVector<Register, 4> PackedRegs;
6565 Reg = B.buildBitcast(LLT::fixed_vector(2, S32), Reg).getReg(0);
6566 auto Unmerge = B.buildUnmerge(S32, Reg);
6567 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
6568 PackedRegs.push_back(Unmerge.getReg(I));
6569 PackedRegs.resize(4, B.buildUndef(S32).getReg(0));
6570 return B.buildBuildVector(LLT::fixed_vector(4, S32), PackedRegs)
6571 .getReg(0);
6572 }
6573
6574 llvm_unreachable("invalid data type");
6575 }
6576
// Packed layout: only the three-element case needs padding.
6577 if (StoreVT == LLT::fixed_vector(3, S16)) {
6578 Reg = B.buildPadVectorWithUndefElements(LLT::fixed_vector(4, S16), Reg)
6579 .getReg(0);
6580 }
6581 return Reg;
6582}
6583
6585 Register VData, LLT MemTy,
6586 bool IsFormat) const {
// Returns a register holding VData in a register type suitable for a buffer
// store, building conversion instructions with B where needed. The beginning
// of this signature is not present in this listing.
6587 MachineRegisterInfo *MRI = B.getMRI();
6588 LLT Ty = MRI->getType(VData);
6589
6590 const LLT S16 = LLT::scalar(16);
6591
6592 // Fixup buffer resources themselves, which need to be cast to v4i32.
// NOTE: the condition guarding this return (listing line 6593) is absent from
// this excerpt.
6594 return castBufferRsrcToV4I32(VData, B);
6595
6596 if (shouldBitcastLoadStoreType(ST, Ty, MemTy)) {
6597 Ty = getBitcastRegisterType(Ty);
6598 VData = B.buildBitcast(Ty, VData).getReg(0);
6599 }
6600 // Fixup illegal register types for i8 and i16 stores.
6601 if (Ty == LLT::scalar(8) || Ty == S16) {
6602 Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0);
6603 return AnyExt;
6604 }
6605
// Vectors of up to four 16-bit elements get the D16 register layout fixup,
// but only for format stores.
6606 if (Ty.isVector()) {
6607 if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) {
6608 if (IsFormat)
6609 return handleD16VData(B, *MRI, VData);
6610 }
6611 }
6612
6613 return VData;
6614}
6615
6617 LegalizerHelper &Helper,
6618 bool IsTyped,
6619 bool IsFormat) const {
// Rewrites a buffer store intrinsic into the matching G_AMDGPU_*BUFFER_STORE*
// pseudo and erases MI. Always returns true. Operand layout read from MI:
// 1 = vdata, 2 = rsrc, [3 = vindex for struct variants], then voffset,
// soffset, [format for typed variants], auxiliary data. The beginning of this
// signature is not present in this listing.
6620 MachineIRBuilder &B = Helper.MIRBuilder;
6621 MachineRegisterInfo &MRI = *B.getMRI();
6622
6623 Register VData = MI.getOperand(1).getReg();
6624 LLT Ty = MRI.getType(VData);
6625 LLT EltTy = Ty.getScalarType();
// D16 applies only to format stores of 16-bit elements. It is computed from
// the type VData has before fixStoreSourceType adjusts it.
6626 const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
6627 const LLT S32 = LLT::scalar(32);
6628
6629 MachineMemOperand *MMO = *MI.memoperands_begin();
6630 const int MemSize = MMO->getSize().getValue();
6631 LLT MemTy = MMO->getMemoryType();
6632
6633 VData = fixStoreSourceType(B, VData, MemTy, IsFormat);
6634
// NOTE: listing line 6635 is absent from this excerpt.
6636 Register RSrc = MI.getOperand(2).getReg();
6637
6638 unsigned ImmOffset;
6639
6640 // The typed intrinsics add an immediate after the registers.
6641 const unsigned NumVIndexOps = IsTyped ? 8 : 7;
6642
6643 // The struct intrinsic variants add one additional operand over raw.
6644 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
6645 Register VIndex;
// OpOffset tracks how far the remaining operand indices are shifted by the
// optional vindex and format operands.
6646 int OpOffset = 0;
6647 if (HasVIndex) {
6648 VIndex = MI.getOperand(3).getReg();
6649 OpOffset = 1;
6650 } else {
// Raw variants have no vindex; a zero constant stands in for it.
6651 VIndex = B.buildConstant(S32, 0).getReg(0);
6652 }
6653
6654 Register VOffset = MI.getOperand(3 + OpOffset).getReg();
6655 Register SOffset = MI.getOperand(4 + OpOffset).getReg();
6656
6657 unsigned Format = 0;
6658 if (IsTyped) {
6659 Format = MI.getOperand(5 + OpOffset).getImm();
6660 ++OpOffset;
6661 }
6662
6663 unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
6664
// Split the offset operand into a register part and an immediate part.
6665 std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset);
6666
// Pick the pseudo: typed and format stores select on D16; plain stores select
// on the memory size in bytes.
6667 unsigned Opc;
6668 if (IsTyped) {
6669 Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16 :
6670 AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT;
6671 } else if (IsFormat) {
6672 Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16 :
6673 AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT;
6674 } else {
6675 switch (MemSize) {
6676 case 1:
6677 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE;
6678 break;
6679 case 2:
6680 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT;
6681 break;
6682 default:
6683 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE;
6684 break;
6685 }
6686 }
6687
6688 auto MIB = B.buildInstr(Opc)
6689 .addUse(VData) // vdata
6690 .addUse(RSrc) // rsrc
6691 .addUse(VIndex) // vindex
6692 .addUse(VOffset) // voffset
6693 .addUse(SOffset) // soffset
6694 .addImm(ImmOffset); // offset(imm)
6695
6696 if (IsTyped)
6697 MIB.addImm(Format);
6698
6699 MIB.addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm)
6700 .addImm(HasVIndex ? -1 : 0) // idxen(imm)
6701 .addMemOperand(MMO);
6702
6703 MI.eraseFromParent();
6704 return true;
6705}
6706
// Emits one buffer-load pseudo with opcode Opc defining LoadDstReg. The
// operands are added in the order rsrc, vindex, voffset, soffset, immediate
// offset, [format when IsTyped], auxiliary data, idxen, followed by the memory
// operand MMO. idxen is -1 when HasVIndex is set and 0 otherwise.
6707static void buildBufferLoad(unsigned Opc, Register LoadDstReg, Register RSrc,
6708 Register VIndex, Register VOffset, Register SOffset,
6709 unsigned ImmOffset, unsigned Format,
6710 unsigned AuxiliaryData, MachineMemOperand *MMO,
6711 bool IsTyped, bool HasVIndex, MachineIRBuilder &B) {
6712 auto MIB = B.buildInstr(Opc)
6713 .addDef(LoadDstReg) // vdata
6714 .addUse(RSrc) // rsrc
6715 .addUse(VIndex) // vindex
6716 .addUse(VOffset) // voffset
6717 .addUse(SOffset) // soffset
6718 .addImm(ImmOffset); // offset(imm)
6719
// Format is only meaningful (and only added) for typed loads.
6720 if (IsTyped)
6721 MIB.addImm(Format);
6722
6723 MIB.addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm)
6724 .addImm(HasVIndex ? -1 : 0) // idxen(imm)
6725 .addMemOperand(MMO);
6726}
6727
6729 LegalizerHelper &Helper,
6730 bool IsFormat,
6731 bool IsTyped) const {
6732 MachineIRBuilder &B = Helper.MIRBuilder;
6733 MachineRegisterInfo &MRI = *B.getMRI();
6734 GISelChangeObserver &Observer = Helper.Observer;
6735
6736 // FIXME: Verifier should enforce 1 MMO for these intrinsics.
6737 MachineMemOperand *MMO = *MI.memoperands_begin();
6738 const LLT MemTy = MMO->getMemoryType();
6739 const LLT S32 = LLT::scalar(32);
6740
6741 Register Dst = MI.getOperand(0).getReg();
6742
6743 Register StatusDst;
6744 int OpOffset = 0;
6745 assert(MI.getNumExplicitDefs() == 1 || MI.getNumExplicitDefs() == 2);
6746 bool IsTFE = MI.getNumExplicitDefs() == 2;
6747 if (IsTFE) {
6748 StatusDst = MI.getOperand(1).getReg();
6749 ++OpOffset;
6750 }
6751
6752 castBufferRsrcArgToV4I32(MI, B, 2 + OpOffset);
6753 Register RSrc = MI.getOperand(2 + OpOffset).getReg();
6754
6755 // The typed intrinsics add an immediate after the registers.
6756 const unsigned NumVIndexOps = IsTyped ? 8 : 7;
6757
6758 // The struct intrinsic variants add one additional operand over raw.
6759 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps + OpOffset;
6760 Register VIndex;
6761 if (HasVIndex) {
6762 VIndex = MI.getOperand(3 + OpOffset).getReg();
6763 ++OpOffset;
6764 } else {
6765 VIndex = B.buildConstant(S32, 0).getReg(0);
6766 }
6767
6768 Register VOffset = MI.getOperand(3 + OpOffset).getReg();
6769 Register SOffset = MI.getOperand(4 + OpOffset).getReg();
6770
6771 unsigned Format = 0;
6772 if (IsTyped) {
6773 Format = MI.getOperand(5 + OpOffset).getImm();
6774 ++OpOffset;
6775 }
6776
6777 unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
6778 unsigned ImmOffset;
6779
6780 LLT Ty = MRI.getType(Dst);
6781 // Make addrspace 8 pointers loads into 4xs32 loads here, so the rest of the
6782 // logic doesn't have to handle that case.
6783 if (hasBufferRsrcWorkaround(Ty)) {
6784 Observer.changingInstr(MI);
6785 Ty = castBufferRsrcFromV4I32(MI, B, MRI, 0);
6786 Observer.changedInstr(MI);
6787 Dst = MI.getOperand(0).getReg();
6788 B.setInsertPt(B.getMBB(), MI);
6789 }
6790 if (shouldBitcastLoadStoreType(ST, Ty, MemTy)) {
6791 Ty = getBitcastRegisterType(Ty);
6792 Observer.changingInstr(MI);
6793 Helper.bitcastDst(MI, Ty, 0);
6794 Observer.changedInstr(MI);
6795 Dst = MI.getOperand(0).getReg();
6796 B.setInsertPt(B.getMBB(), MI);
6797 }
6798
6799 LLT EltTy = Ty.getScalarType();
6800 const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
6801 const bool Unpacked = ST.hasUnpackedD16VMem();
6802
6803 std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset);
6804
6805 unsigned Opc;
6806
6807 // TODO: Support TFE for typed and narrow loads.
6808 if (IsTyped) {
6809 if (IsTFE)
6810 return false;
6811 Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 :
6812 AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT;
6813 } else if (IsFormat) {
6814 if (IsD16) {
6815 if (IsTFE)
6816 return false;
6817 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16;
6818 } else {
6819 Opc = IsTFE ? AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_TFE
6820 : AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT;
6821 }
6822 } else {
6823 switch (MemTy.getSizeInBits()) {
6824 case 8:
6825 Opc = IsTFE ? AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE_TFE
6826 : AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE;
6827 break;
6828 case 16:
6829 Opc = IsTFE ? AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT_TFE
6830 : AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT;
6831 break;
6832 default:
6833 Opc = IsTFE ? AMDGPU::G_AMDGPU_BUFFER_LOAD_TFE
6834 : AMDGPU::G_AMDGPU_BUFFER_LOAD;
6835 break;
6836 }
6837 }
6838
6839 if (IsTFE) {
6840 unsigned NumValueDWords = divideCeil(Ty.getSizeInBits(), 32);
6841 unsigned NumLoadDWords = NumValueDWords + 1;
6842 LLT LoadTy = LLT::fixed_vector(NumLoadDWords, S32);
6843 Register LoadDstReg = B.getMRI()->createGenericVirtualRegister(LoadTy);
6844 buildBufferLoad(Opc, LoadDstReg, RSrc, VIndex, VOffset, SOffset, ImmOffset,
6845 Format, AuxiliaryData, MMO, IsTyped, HasVIndex, B);
6846 if (MemTy.getSizeInBits() < 32) {
6847 Register ExtDst = B.getMRI()->createGenericVirtualRegister(S32);
6848 B.buildUnmerge({ExtDst, StatusDst}, LoadDstReg);
6849 B.buildTrunc(Dst, ExtDst);
6850 } else if (NumValueDWords == 1) {
6851 B.buildUnmerge({Dst, StatusDst}, LoadDstReg);
6852 } else {
6853 SmallVector<Register, 5> LoadElts;
6854 for (unsigned I = 0; I != NumValueDWords; ++I)
6855 LoadElts.push_back(B.getMRI()->createGenericVirtualRegister(S32));
6856 LoadElts.push_back(StatusDst);
6857 B.buildUnmerge(LoadElts, LoadDstReg);
6858 LoadElts.truncate(NumValueDWords);
6859 B.buildMergeLikeInstr(Dst, LoadElts);
6860 }
6861 } else if ((!IsD16 && MemTy.getSizeInBits() < 32) ||
6862 (IsD16 && !Ty.isVector())) {
6863 Register LoadDstReg = B.getMRI()->createGenericVirtualRegister(S32);
6864 buildBufferLoad(Opc, LoadDstReg, RSrc, VIndex, VOffset, SOffset, ImmOffset,
6865 Format, AuxiliaryData, MMO, IsTyped, HasVIndex, B);
6866 B.setInsertPt(B.getMBB(), ++B.getInsertPt());
6867 B.buildTrunc(Dst, LoadDstReg);
6868 } else if (Unpacked && IsD16 && Ty.isVector()) {
6869 LLT UnpackedTy = Ty.changeElementSize(32);
6870 Register LoadDstReg = B.getMRI()->createGenericVirtualRegister(UnpackedTy);
6871 buildBufferLoad(Opc, LoadDstReg, RSrc, VIndex, VOffset, SOffset, ImmOffset,
6872 Format, AuxiliaryData, MMO, IsTyped, HasVIndex, B);
6873 B.setInsertPt(B.getMBB(), ++B.getInsertPt());
6874 // FIXME: G_TRUNC should work, but legalization currently fails
6875 auto Unmerge = B.buildUnmerge(S32, LoadDstReg);
6877 for (unsigned I = 0, N = Unmerge->getNumOperands() - 1; I != N; ++I)
6878 Repack.push_back(B.buildTrunc(EltTy, Unmerge.getReg(I)).getReg(0));
6879 B.buildMergeLikeInstr(Dst, Repack);
6880 } else {
6881 buildBufferLoad(Opc, Dst, RSrc, VIndex, VOffset, SOffset, ImmOffset, Format,
6882 AuxiliaryData, MMO, IsTyped, HasVIndex, B);
6883 }
6884
6885 MI.eraseFromParent();
6886 return true;
6887}
6888
/// Return the G_AMDGPU_BUFFER_ATOMIC_* opcode that corresponds to the buffer
/// atomic intrinsic \p IntrID.
///
/// Each operation is listed in four spellings (raw, raw_ptr, struct and
/// struct_ptr) and all four share one opcode. An ID that is not listed below
/// reaches llvm_unreachable.
6889static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) {
6890 switch (IntrID) {
6891 case Intrinsic::amdgcn_raw_buffer_atomic_swap:
6892 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_swap:
6893 case Intrinsic::amdgcn_struct_buffer_atomic_swap:
6894 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_swap:
6895 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP;
6896 case Intrinsic::amdgcn_raw_buffer_atomic_add:
6897 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_add:
6898 case Intrinsic::amdgcn_struct_buffer_atomic_add:
6899 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_add:
6900 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD;
6901 case Intrinsic::amdgcn_raw_buffer_atomic_sub:
6902 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub:
6903 case Intrinsic::amdgcn_struct_buffer_atomic_sub:
6904 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub:
6905 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB;
6906 case Intrinsic::amdgcn_raw_buffer_atomic_smin:
6907 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smin:
6908 case Intrinsic::amdgcn_struct_buffer_atomic_smin:
6909 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smin:
6910 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN;
6911 case Intrinsic::amdgcn_raw_buffer_atomic_umin:
6912 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umin:
6913 case Intrinsic::amdgcn_struct_buffer_atomic_umin:
6914 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umin:
6915 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN;
6916 case Intrinsic::amdgcn_raw_buffer_atomic_smax:
6917 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smax:
6918 case Intrinsic::amdgcn_struct_buffer_atomic_smax:
6919 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smax:
6920 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX;
6921 case Intrinsic::amdgcn_raw_buffer_atomic_umax:
6922 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umax:
6923 case Intrinsic::amdgcn_struct_buffer_atomic_umax:
6924 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umax:
6925 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX;
6926 case Intrinsic::amdgcn_raw_buffer_atomic_and:
6927 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_and:
6928 case Intrinsic::amdgcn_struct_buffer_atomic_and:
6929 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_and:
6930 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND;
6931 case Intrinsic::amdgcn_raw_buffer_atomic_or:
6932 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_or:
6933 case Intrinsic::amdgcn_struct_buffer_atomic_or:
6934 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_or:
6935 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR;
6936 case Intrinsic::amdgcn_raw_buffer_atomic_xor:
6937 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_xor:
6938 case Intrinsic::amdgcn_struct_buffer_atomic_xor:
6939 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_xor:
6940 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR;
6941 case Intrinsic::amdgcn_raw_buffer_atomic_inc:
6942 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_inc:
6943 case Intrinsic::amdgcn_struct_buffer_atomic_inc:
6944 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_inc:
6945 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC;
6946 case Intrinsic::amdgcn_raw_buffer_atomic_dec:
6947 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_dec:
6948 case Intrinsic::amdgcn_struct_buffer_atomic_dec:
6949 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_dec:
6950 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC;
6951 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
6952 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap:
6953 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
6954 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cmpswap:
6955 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP;
6956 case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
6957 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fadd:
6958 case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
6959 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fadd:
6960 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD;
6961 case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
6962 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmin:
6963 case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
6964 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmin:
6965 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN;
6966 case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
6967 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmax:
6968 case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
6969 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmax:
6970 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX;
6971 case Intrinsic::amdgcn_raw_buffer_atomic_sub_clamp_u32:
6972 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub_clamp_u32:
6973 case Intrinsic::amdgcn_struct_buffer_atomic_sub_clamp_u32:
6974 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub_clamp_u32:
6975 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB_CLAMP_U32;
6976 case Intrinsic::amdgcn_raw_buffer_atomic_cond_sub_u32:
6977 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cond_sub_u32:
6978 case Intrinsic::amdgcn_struct_buffer_atomic_cond_sub_u32:
6979 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cond_sub_u32:
6980 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_COND_SUB_U32;
6981 default:
6982 llvm_unreachable("unhandled atomic opcode");
6983 }
6984}
6985
6988 Intrinsic::ID IID) const {
     // Rewrites the buffer atomic intrinsic instruction MI into the opcode
     // returned by getBufferAtomicPseudo(IID), built with B, and then erases
     // MI. Always returns true.
6989 const bool IsCmpSwap =
6990 IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap ||
6991 IID == Intrinsic::amdgcn_struct_buffer_atomic_cmpswap ||
6992 IID == Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap ||
6993 IID == Intrinsic::amdgcn_struct_ptr_buffer_atomic_cmpswap;
6994
6995 Register Dst = MI.getOperand(0).getReg();
6996 // Since we don't have 128-bit atomics, we don't need to handle the case of
6997 // p8 arguments to the atomic itself.
6998 Register VData = MI.getOperand(2).getReg();
6999
7000 Register CmpVal;
7001 int OpOffset = 0;
7002
     // cmpswap carries its compare value right after vdata, which shifts the
     // index of every later operand by one.
7003 if (IsCmpSwap) {
7004 CmpVal = MI.getOperand(3).getReg();
7005 ++OpOffset;
7006 }
7007
     // NOTE(review): the rsrc operand is read only after the cast call,
     // presumably because the cast may replace the operand's register; confirm
     // against castBufferRsrcArgToV4I32.
7008 castBufferRsrcArgToV4I32(MI, B, 3 + OpOffset);
7009 Register RSrc = MI.getOperand(3 + OpOffset).getReg();
     // Operand count of the variant that carries a vindex operand; cmpswap has
     // one more operand because of the compare value.
7010 const unsigned NumVIndexOps = IsCmpSwap ? 9 : 8;
7011
7012 // The struct intrinsic variants add one additional operand over raw.
7013 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
7014 Register VIndex;
7015 if (HasVIndex) {
7016 VIndex = MI.getOperand(4 + OpOffset).getReg();
7017 ++OpOffset;
7018 } else {
     // No vindex operand is present: pass a constant zero instead.
7019 VIndex = B.buildConstant(LLT::scalar(32), 0).getReg(0);
7020 }
7021
7022 Register VOffset = MI.getOperand(4 + OpOffset).getReg();
7023 Register SOffset = MI.getOperand(5 + OpOffset).getReg();
7024 unsigned AuxiliaryData = MI.getOperand(6 + OpOffset).getImm();
7025
     // The first memory operand of MI is carried over to the new instruction.
7026 MachineMemOperand *MMO = *MI.memoperands_begin();
7027
7028 unsigned ImmOffset;
     // splitBufferOffsets yields the voffset register to use and an immediate
     // offset; both are passed to the new instruction below.
7029 std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset);
7030
7031 auto MIB = B.buildInstr(getBufferAtomicPseudo(IID))
7032 .addDef(Dst)
7033 .addUse(VData); // vdata
7034
     // The compare value, when present, directly follows vdata.
7035 if (IsCmpSwap)
7036 MIB.addReg(CmpVal);
7037
7038 MIB.addUse(RSrc) // rsrc
7039 .addUse(VIndex) // vindex
7040 .addUse(VOffset) // voffset
7041 .addUse(SOffset) // soffset
7042 .addImm(ImmOffset) // offset(imm)
7043 .addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm)
7044 .addImm(HasVIndex ? -1 : 0) // idxen(imm)
7045 .addMemOperand(MMO);
7046
7047 MI.eraseFromParent();
7048 return true;
7049}
7050
7051/// Turn the address operands of \p MI in [Intr->VAddrStart, Intr->VAddrEnd)
7052/// into dword sized <2 x s16> registers and append them to \p PackedAddrs.
///
/// Operand indices are offset by \p ArgOffset, and operands that are not
/// registers are skipped. Operands that are not packed (everything before the
/// gradients, gradients when \p IsG16 is false, coordinates when \p IsA16 is
/// false) are bitcast to <2 x s16> on their own. The exception is an s16
/// operand before the gradients in A16 mode (asserted to be the bias), which
/// is paired with an undef high half. Packed operands are combined two at a
/// time, with an undef high half where no partner operand is used.
7054 SmallVectorImpl<Register> &PackedAddrs,
7055 unsigned ArgOffset,
7057 bool IsA16, bool IsG16) {
7058 const LLT S16 = LLT::scalar(16);
7059 const LLT V2S16 = LLT::fixed_vector(2, 16);
7060 auto EndIdx = Intr->VAddrEnd;
7061
7062 for (unsigned I = Intr->VAddrStart; I < EndIdx; I++) {
7063 MachineOperand &SrcOp = MI.getOperand(ArgOffset + I);
7064 if (!SrcOp.isReg())
7065 continue; // _L to _LZ may have eliminated this.
7066
7067 Register AddrReg = SrcOp.getReg();
7068
     // Operands selected by this condition occupy one dword each and are not
     // paired with a neighbouring operand.
7069 if ((I < Intr->GradientStart) ||
7070 (I >= Intr->GradientStart && I < Intr->CoordStart && !IsG16) ||
7071 (I >= Intr->CoordStart && !IsA16)) {
7072 if ((I < Intr->GradientStart) && IsA16 &&
7073 (B.getMRI()->getType(AddrReg) == S16)) {
7074 assert(I == Intr->BiasIndex && "Got unexpected 16-bit extra argument");
7075 // Special handling of bias when A16 is on. Bias is of type half but
7076 // occupies full 32-bit.
7077 PackedAddrs.push_back(
7078 B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)})
7079 .getReg(0));
7080 } else {
7081 assert((!IsA16 || Intr->NumBiasArgs == 0 || I != Intr->BiasIndex) &&
7082 "Bias needs to be converted to 16 bit in A16 mode");
7083 // Handle any gradient or coordinate operands that should not be packed
7084 AddrReg = B.buildBitcast(V2S16, AddrReg).getReg(0);
7085 PackedAddrs.push_back(AddrReg);
7086 }
7087 } else {
7088 // Dz/dh, dz/dv and the last odd coord are packed with undef. Also, in 1D,
7089 // derivatives dx/dh and dx/dv are packed with undef.
7090 if (((I + 1) >= EndIdx) ||
7091 ((Intr->NumGradients / 2) % 2 == 1 &&
7092 (I == static_cast<unsigned>(Intr->GradientStart +
7093 (Intr->NumGradients / 2) - 1) ||
7094 I == static_cast<unsigned>(Intr->GradientStart +
7095 Intr->NumGradients - 1))) ||
7096 // Check for _L to _LZ optimization
7097 !MI.getOperand(ArgOffset + I + 1).isReg()) {
7098 PackedAddrs.push_back(
7099 B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)})
7100 .getReg(0));
7101 } else {
7102 PackedAddrs.push_back(
7103 B.buildBuildVector(
7104 V2S16, {AddrReg, MI.getOperand(ArgOffset + I + 1).getReg()})
7105 .getReg(0));
7106 ++I; // The following operand was used as the high half; skip it.
7107 }
7108 }
7109 }
7110}
7111
7112/// Convert from separate vaddr components to a single vector address register,
7113/// and replace the remaining operands with $noreg.
///
/// \p DimIdx is the operand index of the first address component and
/// \p NumVAddrs is the number of operands visited from there. Operands that
/// are not registers are ignored. If exactly one register is collected, no
/// vector is built and the first operand is left as it is.
7115 int DimIdx, int NumVAddrs) {
7116 const LLT S32 = LLT::scalar(32);
7117 (void)S32; // S32 is only referenced by the assert below.
7118 SmallVector<Register, 8> AddrRegs;
     // Collect every register operand in the address range.
7119 for (int I = 0; I != NumVAddrs; ++I) {
7120 MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
7121 if (SrcOp.isReg()) {
7122 AddrRegs.push_back(SrcOp.getReg());
7123 assert(B.getMRI()->getType(SrcOp.getReg()) == S32);
7124 }
7125 }
7126
     // Combine the collected registers into one vector of 32-bit elements and
     // store it in the first address operand.
     // NOTE(review): setReg assumes operand DimIdx is a register operand;
     // confirm that callers guarantee this.
7127 int NumAddrRegs = AddrRegs.size();
7128 if (NumAddrRegs != 1) {
7129 auto VAddr =
7130 B.buildBuildVector(LLT::fixed_vector(NumAddrRegs, 32), AddrRegs);
7131 MI.getOperand(DimIdx).setReg(VAddr.getReg(0));
7132 }
7133
     // Every register operand after the first is set to NoRegister.
7134 for (int I = 1; I != NumVAddrs; ++I) {
7135 MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
7136 if (SrcOp.isReg())
7137 MI.getOperand(DimIdx + I).setReg(AMDGPU::NoRegister);
7138 }
7139}
7140
7141/// Rewrite image intrinsics to use register layouts expected by the subtarget.
7142///
7143/// Depending on the subtarget, load/store with 16-bit element data need to be
7144/// rewritten to use the low half of 32-bit registers, or directly use a packed
7145/// layout. 16-bit addresses should also sometimes be packed into 32-bit
7146/// registers.
7147///
7148/// We don't want to directly select image instructions just yet, but also want
7149/// to expose all register repacking to the legalizer/combiners. We also don't
7150/// want a selected instruction entering RegBankSelect. In order to avoid
7151/// defining a multitude of intermediate image instructions, directly hack on
7152/// the intrinsic's arguments. In cases like a16 addresses, this requires
7153/// padding now unnecessary arguments with $noreg.
7156 const AMDGPU::ImageDimIntrinsicInfo *Intr) const {
7157
7158 const MachineFunction &MF = *MI.getMF();
7159 const unsigned NumDefs = MI.getNumExplicitDefs();
7160 const unsigned ArgOffset = NumDefs + 1;
7161 bool IsTFE = NumDefs == 2;
7162 // We are only processing the operands of d16 image operations on subtargets
7163 // that use the unpacked register layout, or need to repack the TFE result.
7164
7165 // TODO: Do we need to guard against already legalized intrinsics?
7166 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
7168
7169 MachineRegisterInfo *MRI = B.getMRI();
7170 const LLT S32 = LLT::scalar(32);
7171 const LLT S16 = LLT::scalar(16);
7172 const LLT V2S16 = LLT::fixed_vector(2, 16);
7173
7174 unsigned DMask = 0;
7175 Register VData;
7176 LLT Ty;
7177
7178 if (!BaseOpcode->NoReturn || BaseOpcode->Store) {
7179 VData = MI.getOperand(NumDefs == 0 ? 1 : 0).getReg();
7180 Ty = MRI->getType(VData);
7181 }
7182
7183 const bool IsAtomicPacked16Bit =
7184 (BaseOpcode->BaseOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_F16 ||
7185 BaseOpcode->BaseOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_BF16);
7186
7187 // Check for 16 bit addresses and pack if true.
7188 LLT GradTy =
7189 MRI->getType(MI.getOperand(ArgOffset + Intr->GradientStart).getReg());
7190 LLT AddrTy =
7191 MRI->getType(MI.getOperand(ArgOffset + Intr->CoordStart).getReg());
7192 const bool IsG16 =
7193 ST.hasG16() ? (BaseOpcode->Gradients && GradTy == S16) : GradTy == S16;
7194 const bool IsA16 = AddrTy == S16;
7195 const bool IsD16 = !IsAtomicPacked16Bit && Ty.getScalarType() == S16;
7196
7197 int DMaskLanes = 0;
7198 if (!BaseOpcode->Atomic) {
7199 DMask = MI.getOperand(ArgOffset + Intr->DMaskIndex).getImm();
7200 if (BaseOpcode->Gather4) {
7201 DMaskLanes = 4;
7202 } else if (DMask != 0) {
7203 DMaskLanes = llvm::popcount(DMask);
7204 } else if (!IsTFE && !BaseOpcode->Store) {
7205 // If dmask is 0, this is a no-op load. This can be eliminated.
7206 B.buildUndef(MI.getOperand(0));
7207 MI.eraseFromParent();
7208 return true;
7209 }
7210 }
7211
7212 Observer.changingInstr(MI);
7213 scope_exit ChangedInstr([&] { Observer.changedInstr(MI); });
7214
7215 const unsigned StoreOpcode = IsD16 ? AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE_D16
7216 : AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE;
7217 const unsigned LoadOpcode = IsD16 ? AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_D16
7218 : AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD;
7219 unsigned NewOpcode = LoadOpcode;
7220 if (BaseOpcode->Store)
7221 NewOpcode = StoreOpcode;
7222 else if (BaseOpcode->NoReturn)
7223 NewOpcode = AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_NORET;
7224
7225 // Track that we legalized this
7226 MI.setDesc(B.getTII().get(NewOpcode));
7227
7228 // Expecting to get an error flag since TFC is on and dmask is 0. Force
7229 // dmask to be at least 1, otherwise the instruction will fail.
7230 if (IsTFE && DMask == 0) {
7231 DMask = 0x1;
7232 DMaskLanes = 1;
7233 MI.getOperand(ArgOffset + Intr->DMaskIndex).setImm(DMask);
7234 }
7235
7236 if (BaseOpcode->Atomic) {
7237 Register VData0 = MI.getOperand(2).getReg();
7238 LLT Ty = MRI->getType(VData0);
7239
7240 // TODO: Allow atomic swap and bit ops for v2s16/v4s16
7241 if (Ty.isVector() && !IsAtomicPacked16Bit)
7242 return false;
7243
7244 if (BaseOpcode->AtomicX2) {
7245 Register VData1 = MI.getOperand(3).getReg();
7246 // The two values are packed in one register.
7247 LLT PackedTy = LLT::fixed_vector(2, Ty);
7248 auto Concat = B.buildBuildVector(PackedTy, {VData0, VData1});
7249 MI.getOperand(2).setReg(Concat.getReg(0));
7250 MI.getOperand(3).setReg(AMDGPU::NoRegister);
7251 }
7252 }
7253
7254 unsigned CorrectedNumVAddrs = Intr->NumVAddrs;
7255
7256 // Rewrite the addressing register layout before doing anything else.
7257 if (BaseOpcode->Gradients && !ST.hasG16() && (IsA16 != IsG16)) {
7258 // 16 bit gradients are supported, but are tied to the A16 control
7259 // so both gradients and addresses must be 16 bit
7260 return false;
7261 }
7262
7263 if (IsA16 && !ST.hasA16()) {
7264 // A16 not supported
7265 return false;
7266 }
7267
7268 const unsigned NSAMaxSize = ST.getNSAMaxSize(BaseOpcode->Sampler);
7269 const unsigned HasPartialNSA = ST.hasPartialNSAEncoding();
7270
7271 if (IsA16 || IsG16) {
7272 // Even if NumVAddrs == 1 we should pack it into a 32-bit value, because the
7273 // instructions expect VGPR_32
7274 SmallVector<Register, 4> PackedRegs;
7275
7276 packImage16bitOpsToDwords(B, MI, PackedRegs, ArgOffset, Intr, IsA16, IsG16);
7277
7278 // See also below in the non-a16 branch
7279 const bool UseNSA = ST.hasNSAEncoding() &&
7280 PackedRegs.size() >= ST.getNSAThreshold(MF) &&
7281 (PackedRegs.size() <= NSAMaxSize || HasPartialNSA);
7282 const bool UsePartialNSA =
7283 UseNSA && HasPartialNSA && PackedRegs.size() > NSAMaxSize;
7284
7285 if (UsePartialNSA) {
7286 // Pack registers that would go over NSAMaxSize into last VAddr register
7287 LLT PackedAddrTy =
7288 LLT::fixed_vector(2 * (PackedRegs.size() - NSAMaxSize + 1), 16);
7289 auto Concat = B.buildConcatVectors(
7290 PackedAddrTy, ArrayRef(PackedRegs).slice(NSAMaxSize - 1));
7291 PackedRegs[NSAMaxSize - 1] = Concat.getReg(0);
7292 PackedRegs.resize(NSAMaxSize);
7293 } else if (!UseNSA && PackedRegs.size() > 1) {
7294 LLT PackedAddrTy = LLT::fixed_vector(2 * PackedRegs.size(), 16);
7295 auto Concat = B.buildConcatVectors(PackedAddrTy, PackedRegs);
7296 PackedRegs[0] = Concat.getReg(0);
7297 PackedRegs.resize(1);
7298 }
7299
7300 const unsigned NumPacked = PackedRegs.size();
7301 for (unsigned I = Intr->VAddrStart; I < Intr->VAddrEnd; I++) {
7302 MachineOperand &SrcOp = MI.getOperand(ArgOffset + I);
7303 if (!SrcOp.isReg()) {
7304 assert(SrcOp.isImm() && SrcOp.getImm() == 0);
7305 continue;
7306 }
7307
7308 assert(SrcOp.getReg() != AMDGPU::NoRegister);
7309
7310 if (I - Intr->VAddrStart < NumPacked)
7311 SrcOp.setReg(PackedRegs[I - Intr->VAddrStart]);
7312 else
7313 SrcOp.setReg(AMDGPU::NoRegister);
7314 }
7315 } else {
7316 // If the register allocator cannot place the address registers contiguously
7317 // without introducing moves, then using the non-sequential address encoding
7318 // is always preferable, since it saves VALU instructions and is usually a
7319 // wash in terms of code size or even better.
7320 //
7321 // However, we currently have no way of hinting to the register allocator
7322 // that MIMG addresses should be placed contiguously when it is possible to
7323 // do so, so force non-NSA for the common 2-address case as a heuristic.
7324 //
7325 // SIShrinkInstructions will convert NSA encodings to non-NSA after register
7326 // allocation when possible.
7327 //
7328 // Partial NSA is allowed on GFX11+ where the final register is a contiguous
7329 // set of the remaining addresses.
7330 const bool UseNSA = ST.hasNSAEncoding() &&
7331 CorrectedNumVAddrs >= ST.getNSAThreshold(MF) &&
7332 (CorrectedNumVAddrs <= NSAMaxSize || HasPartialNSA);
7333 const bool UsePartialNSA =
7334 UseNSA && HasPartialNSA && CorrectedNumVAddrs > NSAMaxSize;
7335
7336 if (UsePartialNSA) {
7338 ArgOffset + Intr->VAddrStart + NSAMaxSize - 1,
7339 Intr->NumVAddrs - NSAMaxSize + 1);
7340 } else if (!UseNSA && Intr->NumVAddrs > 1) {
7341 convertImageAddrToPacked(B, MI, ArgOffset + Intr->VAddrStart,
7342 Intr->NumVAddrs);
7343 }
7344 }
7345
7346 int Flags = 0;
7347 if (IsA16)
7348 Flags |= 1;
7349 if (IsG16)
7350 Flags |= 2;
7351 MI.addOperand(MachineOperand::CreateImm(Flags));
7352
7353 if (BaseOpcode->NoReturn) { // No TFE for stores?
7354 // TODO: Handle dmask trim
7355 if (!Ty.isVector() || !IsD16)
7356 return true;
7357
7358 Register RepackedReg = handleD16VData(B, *MRI, VData, true);
7359 if (RepackedReg != VData) {
7360 MI.getOperand(1).setReg(RepackedReg);
7361 }
7362
7363 return true;
7364 }
7365
7366 Register DstReg = MI.getOperand(0).getReg();
7367 const LLT EltTy = Ty.getScalarType();
7368 const int NumElts = Ty.isVector() ? Ty.getNumElements() : 1;
7369
7370 // Confirm that the return type is large enough for the dmask specified
7371 if (NumElts < DMaskLanes)
7372 return false;
7373
7374 if (NumElts > 4 || DMaskLanes > 4)
7375 return false;
7376
7377 // Image atomic instructions are using DMask to specify how many bits
7378 // input/output data will have. 32-bits (s32, v2s16) or 64-bits (s64, v4s16).
7379 // DMaskLanes for image atomic has default value '0'.
7380 // We must be sure that atomic variants (especially packed) will not be
7381 // truncated from v2s16 or v4s16 to s16 type.
7382 //
7383 // ChangeElementCount will be needed for image load where Ty is always scalar.
7384 const unsigned AdjustedNumElts = DMaskLanes == 0 ? 1 : DMaskLanes;
7385 const LLT AdjustedTy =
7386 DMaskLanes == 0
7387 ? Ty
7388 : Ty.changeElementCount(ElementCount::getFixed(AdjustedNumElts));
7389
7390 // The raw dword aligned data component of the load. The only legal cases
7391 // where this matters should be when using the packed D16 format, for
7392 // s16 -> <2 x s16>, and <3 x s16> -> <4 x s16>.
7393 LLT RoundedTy;
7394
7395 // S32 vector to cover all data, plus TFE result element.
7396 LLT TFETy;
7397
7398 // Register type to use for each loaded component. Will be S32 or V2S16.
7399 LLT RegTy;
7400
7401 if (IsD16 && ST.hasUnpackedD16VMem()) {
7402 RoundedTy =
7403 LLT::scalarOrVector(ElementCount::getFixed(AdjustedNumElts), 32);
7404 TFETy = LLT::fixed_vector(AdjustedNumElts + 1, 32);
7405 RegTy = S32;
7406 } else {
7407 unsigned EltSize = EltTy.getSizeInBits();
7408 unsigned RoundedElts = (AdjustedTy.getSizeInBits() + 31) / 32;
7409 unsigned RoundedSize = 32 * RoundedElts;
7410 RoundedTy = LLT::scalarOrVector(
7411 ElementCount::getFixed(RoundedSize / EltSize), EltSize);
7412 TFETy = LLT::fixed_vector(RoundedSize / 32 + 1, S32);
7413 RegTy = !IsTFE && EltSize == 16 ? V2S16 : S32;
7414 }
7415
7416 // The return type does not need adjustment.
7417 // TODO: Should we change s16 case to s32 or <2 x s16>?
7418 if (!IsTFE && (RoundedTy == Ty || !Ty.isVector()))
7419 return true;
7420
7421 Register Dst1Reg;
7422
7423 // Insert after the instruction.
7424 B.setInsertPt(*MI.getParent(), ++MI.getIterator());
7425
7426 // TODO: For TFE with d16, if we used a TFE type that was a multiple of <2 x
7427 // s16> instead of s32, we would only need 1 bitcast instead of multiple.
7428 const LLT LoadResultTy = IsTFE ? TFETy : RoundedTy;
7429 const int ResultNumRegs = LoadResultTy.getSizeInBits() / 32;
7430
7431 Register NewResultReg = MRI->createGenericVirtualRegister(LoadResultTy);
7432
7433 MI.getOperand(0).setReg(NewResultReg);
7434
7435 // In the IR, TFE is supposed to be used with a 2 element struct return
7436 // type. The instruction really returns these two values in one contiguous
7437 // register, with one additional dword beyond the loaded data. Rewrite the
7438 // return type to use a single register result.
7439
7440 if (IsTFE) {
7441 Dst1Reg = MI.getOperand(1).getReg();
7442 if (MRI->getType(Dst1Reg) != S32)
7443 return false;
7444
7445 // TODO: Make sure the TFE operand bit is set.
7446 MI.removeOperand(1);
7447
7448 // Handle the easy case that requires no repack instructions.
7449 if (Ty == S32) {
7450 B.buildUnmerge({DstReg, Dst1Reg}, NewResultReg);
7451 return true;
7452 }
7453 }
7454
7455 // Now figure out how to copy the new result register back into the old
7456 // result.
7457 SmallVector<Register, 5> ResultRegs(ResultNumRegs, Dst1Reg);
7458
7459 const int NumDataRegs = IsTFE ? ResultNumRegs - 1 : ResultNumRegs;
7460
7461 if (ResultNumRegs == 1) {
7462 assert(!IsTFE);
7463 ResultRegs[0] = NewResultReg;
7464 } else {
7465 // We have to repack into a new vector of some kind.
7466 for (int I = 0; I != NumDataRegs; ++I)
7467 ResultRegs[I] = MRI->createGenericVirtualRegister(RegTy);
7468 B.buildUnmerge(ResultRegs, NewResultReg);
7469
7470 // Drop the final TFE element to get the data part. The TFE result is
7471 // directly written to the right place already.
7472 if (IsTFE)
7473 ResultRegs.resize(NumDataRegs);
7474 }
7475
7476 // For an s16 scalar result, we form an s32 result with a truncate regardless
7477 // of packed vs. unpacked.
7478 if (IsD16 && !Ty.isVector()) {
7479 B.buildTrunc(DstReg, ResultRegs[0]);
7480 return true;
7481 }
7482
7483 // Avoid a build/concat_vector of 1 entry.
7484 if (Ty == V2S16 && NumDataRegs == 1 && !ST.hasUnpackedD16VMem()) {
7485 B.buildBitcast(DstReg, ResultRegs[0]);
7486 return true;
7487 }
7488
7489 assert(Ty.isVector());
7490
7491 if (IsD16) {
7492 // For packed D16 results with TFE enabled, all the data components are
7493 // S32. Cast back to the expected type.
7494 //
7495 // TODO: We don't really need to load s32 elements. We would only need one
7496 // cast for the TFE result if a multiple of v2s16 was used.
7497 if (RegTy != V2S16 && !ST.hasUnpackedD16VMem()) {
7498 for (Register &Reg : ResultRegs)
7499 Reg = B.buildBitcast(V2S16, Reg).getReg(0);
7500 } else if (ST.hasUnpackedD16VMem()) {
7501 for (Register &Reg : ResultRegs)
7502 Reg = B.buildTrunc(S16, Reg).getReg(0);
7503 }
7504 }
7505
7506 auto padWithUndef = [&](LLT Ty, int NumElts) {
7507 if (NumElts == 0)
7508 return;
7509 Register Undef = B.buildUndef(Ty).getReg(0);
7510 for (int I = 0; I != NumElts; ++I)
7511 ResultRegs.push_back(Undef);
7512 };
7513
7514 // Pad out any elements eliminated due to the dmask.
7515 LLT ResTy = MRI->getType(ResultRegs[0]);
7516 if (!ResTy.isVector()) {
7517 padWithUndef(ResTy, NumElts - ResultRegs.size());
7518 B.buildBuildVector(DstReg, ResultRegs);
7519 return true;
7520 }
7521
7522 assert(!ST.hasUnpackedD16VMem() && ResTy == V2S16);
7523 const int RegsToCover = (Ty.getSizeInBits() + 31) / 32;
7524
7525 // Deal with the one annoying legal case.
7526 const LLT V3S16 = LLT::fixed_vector(3, 16);
7527 if (Ty == V3S16) {
7528 if (IsTFE) {
7529 if (ResultRegs.size() == 1) {
7530 NewResultReg = ResultRegs[0];
7531 } else if (ResultRegs.size() == 2) {
7532 LLT V4S16 = LLT::fixed_vector(4, 16);
7533 NewResultReg = B.buildConcatVectors(V4S16, ResultRegs).getReg(0);
7534 } else {
7535 return false;
7536 }
7537 }
7538
7539 if (MRI->getType(DstReg).getNumElements() <
7540 MRI->getType(NewResultReg).getNumElements()) {
7541 B.buildDeleteTrailingVectorElements(DstReg, NewResultReg);
7542 } else {
7543 B.buildPadVectorWithUndefElements(DstReg, NewResultReg);
7544 }
7545 return true;
7546 }
7547
7548 padWithUndef(ResTy, RegsToCover - ResultRegs.size());
7549 B.buildConcatVectors(DstReg, ResultRegs);
7550 return true;
7551}
7552
7554 MachineInstr &MI) const {
7555 MachineIRBuilder &B = Helper.MIRBuilder;
7556 GISelChangeObserver &Observer = Helper.Observer;
7557
7558 Register OrigDst = MI.getOperand(0).getReg();
7559 Register Dst;
7560 LLT Ty = B.getMRI()->getType(OrigDst);
7561 unsigned Size = Ty.getSizeInBits();
7562 MachineFunction &MF = B.getMF();
7563 unsigned Opc = 0;
7564 if (Size < 32 && ST.hasScalarSubwordLoads()) {
7565 assert(Size == 8 || Size == 16);
7566 Opc = Size == 8 ? AMDGPU::G_AMDGPU_S_BUFFER_LOAD_UBYTE
7567 : AMDGPU::G_AMDGPU_S_BUFFER_LOAD_USHORT;
7568 // The 8-bit and 16-bit scalar buffer load instructions have 32-bit
7569 // destination register.
7570 Dst = B.getMRI()->createGenericVirtualRegister(LLT::scalar(32));
7571 } else {
7572 Opc = AMDGPU::G_AMDGPU_S_BUFFER_LOAD;
7573 Dst = OrigDst;
7574 }
7575
7576 Observer.changingInstr(MI);
7577
7578 // Handle needing to s.buffer.load() a p8 value.
7579 if (hasBufferRsrcWorkaround(Ty)) {
7580 Ty = castBufferRsrcFromV4I32(MI, B, *B.getMRI(), 0);
7581 B.setInsertPt(B.getMBB(), MI);
7582 }
7584 Ty = getBitcastRegisterType(Ty);
7585 Helper.bitcastDst(MI, Ty, 0);
7586 B.setInsertPt(B.getMBB(), MI);
7587 }
7588
7589 // FIXME: We don't really need this intermediate instruction. The intrinsic
7590 // should be fixed to have a memory operand. Since it's readnone, we're not
7591 // allowed to add one.
7592 MI.setDesc(B.getTII().get(Opc));
7593 MI.removeOperand(1); // Remove intrinsic ID
7594
7595 // FIXME: When intrinsic definition is fixed, this should have an MMO already.
7596 const unsigned MemSize = (Size + 7) / 8;
7597 const Align MemAlign = B.getDataLayout().getABITypeAlign(
7603 MemSize, MemAlign);
7604 MI.addMemOperand(MF, MMO);
7605 if (Dst != OrigDst) {
7606 MI.getOperand(0).setReg(Dst);
7607 B.setInsertPt(B.getMBB(), ++B.getInsertPt());
7608 B.buildTrunc(OrigDst, Dst);
7609 }
7610
7611 // If we don't have 96-bit result scalar loads, widening to 128-bit should
7612 // always be legal. We may need to restore this to a 96-bit result if it turns
7613 // out this needs to be converted to a vector load during RegBankSelect.
7614 if (!isPowerOf2_32(Size) && (Size != 96 || !ST.hasScalarDwordx3Loads())) {
7615 if (Ty.isVector())
7617 else
7618 Helper.widenScalarDst(MI, getPow2ScalarType(Ty), 0);
7619 }
7620
7621 Observer.changedInstr(MI);
7622 return true;
7623}
7624
// Legalizes the s.buffer.prefetch intrinsic by mutating MI in place: the
// descriptor is switched to G_AMDGPU_S_BUFFER_PREFETCH and operand 0 (the
// intrinsic ID) is dropped. The change observer is notified before and after
// the mutation. Always returns true.
// NOTE(review): the first line of the signature and listing line 7632 are not
// present in this extract.
7626 MachineInstr &MI) const {
7627 MachineIRBuilder &B = Helper.MIRBuilder;
7628 GISelChangeObserver &Observer = Helper.Observer;
7629 Observer.changingInstr(MI);
7630 MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_S_BUFFER_PREFETCH));
7631 MI.removeOperand(0); // Remove intrinsic ID
7633 Observer.changedInstr(MI);
7634 return true;
7635}
7636
// Trap lowering dispatcher. Without a trap handler, or with a trap handler
// ABI other than AMDHSA, the trap is lowered through legalizeTrapEndpgm.
// Otherwise the lowering is chosen by ST.supportsGetDoorbellID().
// NOTE(review): the signature head and the two arms of the conditional
// expression are not present in this extract.
7637// TODO: Move to selection
7640 MachineIRBuilder &B) const {
7641 if (!ST.hasTrapHandler() ||
7642 ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA)
7643 return legalizeTrapEndpgm(MI, MRI, B);
7644
7645 return ST.supportsGetDoorbellID() ?
7647}
7648
// Lowers a trap to S_ENDPGM. Presumably this is the body of the
// legalizeTrapEndpgm callee used by the trap dispatcher; the signature lines
// are not present in this extract.
// Always erases MI and returns true.
7651 const DebugLoc &DL = MI.getDebugLoc();
7652 MachineBasicBlock &BB = B.getMBB();
7653 MachineFunction *MF = BB.getParent();
7654
// Simple case: MI is the last instruction of a block without successors, so
// S_ENDPGM can be appended to this block directly.
7655 if (BB.succ_empty() && std::next(MI.getIterator()) == BB.end()) {
7656 BuildMI(BB, BB.end(), DL, B.getTII().get(AMDGPU::S_ENDPGM))
7657 .addImm(0);
7658 MI.eraseFromParent();
7659 return true;
7660 }
7661
7662 // We need a block split to make the real endpgm a terminator. We also don't
7663 // want to break phis in successor blocks, so we can't just delete to the
7664 // end of the block.
7665 BB.splitAt(MI, false /*UpdateLiveIns*/);
// NOTE(review): the line that creates TrapBB (listing line 7666) is not
// present in this extract. TrapBB is appended to the function, receives the
// S_ENDPGM, and becomes the target of an S_CBRANCH_EXECNZ placed before MI.
7667 MF->push_back(TrapBB);
7668 BuildMI(*TrapBB, TrapBB->end(), DL, B.getTII().get(AMDGPU::S_ENDPGM))
7669 .addImm(0);
7670 BuildMI(BB, &MI, DL, B.getTII().get(AMDGPU::S_CBRANCH_EXECNZ))
7671 .addMBB(TrapBB);
7672
7673 BB.addSuccessor(TrapBB);
7674 MI.eraseFromParent();
7675 return true;
7676}
7677
// Lowers a trap to S_TRAP with the LLVMAMDHSATrap ID, passing a 64-bit value
// (the queue pointer, per the comments below) to the handler in SGPR0_SGPR1,
// which is attached to the S_TRAP as an implicit use.
// Two paths are visible: one loads the value from the kernarg segment at an
// implicit-parameter offset, the other copies it from a live-in register.
// NOTE(review): the signature and several statements (the condition guarding
// the first path, and the definitions of Param, Offset, PtrInfo, MMO, LoadAddr
// and the LiveIn initializer) are not present in this extract.
7680 MachineFunction &MF = B.getMF();
7681 const LLT S64 = LLT::scalar(64);
7682
7683 Register SGPR01(AMDGPU::SGPR0_SGPR1);
7684 // For code object version 5, queue_ptr is passed through implicit kernarg.
7690 ST.getTargetLowering()->getImplicitParameterOffset(B.getMF(), Param);
7691
7692 Register KernargPtrReg = MRI.createGenericVirtualRegister(
7694
// Bail out (legalization failure) if the kernarg pointer input cannot be
// loaded.
7695 if (!loadInputValue(KernargPtrReg, B,
7697 return false;
7698
7699 // TODO: can we be smarter about machine pointer info?
7702 PtrInfo.getWithOffset(Offset),
7706
7707 // Pointer address
7710 B.buildObjectPtrOffset(LoadAddr, KernargPtrReg,
7711 B.buildConstant(LLT::scalar(64), Offset).getReg(0));
7712 // Load address
7713 Register Temp = B.buildLoad(S64, LoadAddr, *MMO).getReg(0);
7714 B.buildCopy(SGPR01, Temp);
7715 B.buildInstr(AMDGPU::S_TRAP)
7716 .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap))
7717 .addReg(SGPR01, RegState::Implicit);
7718 MI.eraseFromParent();
7719 return true;
7720 }
7721
7722 // Pass queue pointer to trap handler as input, and insert trap instruction
7723 // Reference: https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi
7724 Register LiveIn =
7727 return false;
7728
7729 B.buildCopy(SGPR01, LiveIn);
7730 B.buildInstr(AMDGPU::S_TRAP)
7731 .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap))
7732 .addReg(SGPR01, RegState::Implicit);
7733
7734 MI.eraseFromParent();
7735 return true;
7736}
7737
// Lowers a trap to a bare S_TRAP with the LLVMAMDHSATrap ID (no register
// inputs are attached). On subtargets reporting hasPrivEnabledTrap2NopBug()
// the trap is instead expanded through SIInstrInfo::insertSimulatedTrap.
// Always erases MI and returns true.
// NOTE(review): the signature head is not present in this extract.
7740 MachineIRBuilder &B) const {
7741 // We need to simulate the 's_trap 2' instruction on targets that run in
7742 // PRIV=1 (where it is treated as a nop).
7743 if (ST.hasPrivEnabledTrap2NopBug()) {
7744 ST.getInstrInfo()->insertSimulatedTrap(MRI, B.getMBB(), MI,
7745 MI.getDebugLoc());
7746 MI.eraseFromParent();
7747 return true;
7748 }
7749
7750 B.buildInstr(AMDGPU::S_TRAP)
7751 .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap));
7752 MI.eraseFromParent();
7753 return true;
7754}
7755
// Lowers a debug trap. With an AMDHSA trap handler an S_TRAP carrying the
// LLVMAMDHSADebugTrap ID is emitted; otherwise only a warning diagnostic
// ("debugtrap handler not supported") is produced and no instruction is
// emitted. In both cases MI is erased and true is returned.
// NOTE(review): the signature head and the line that opens the diagnostic
// call (listing line 7764) are not present in this extract.
7758 MachineIRBuilder &B) const {
7759 // Is non-HSA path or trap-handler disabled? Then, report a warning
7760 // accordingly
7761 if (!ST.hasTrapHandler() ||
7762 ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA) {
7763 Function &Fn = B.getMF().getFunction();
7765 Fn, "debugtrap handler not supported", MI.getDebugLoc(), DS_Warning));
7766 } else {
7767 // Insert debug-trap instruction
7768 B.buildInstr(AMDGPU::S_TRAP)
7769 .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSADebugTrap));
7770 }
7771
7772 MI.eraseFromParent();
7773 return true;
7774}
7775
// Legalizes the image_bvh_intersect_ray intrinsic into
// G_AMDGPU_BVH_INTERSECT_RAY. The new instruction carries the selected MIMG
// opcode as an immediate, followed by the repacked address operands, the
// descriptor operand, and an A16 flag immediate. Memory operands are cloned
// from MI, which is then erased.
// Returns false (after emitting a diagnostic) when the subtarget lacks
// hasGFX10_AEncoding(); returns true otherwise.
// NOTE(review): the signature head, the declaration of Ops, and the
// declarations of R1/R2/R3 are not present in this extract.
7777 MachineInstr &MI, MachineIRBuilder &B) const {
7778 MachineRegisterInfo &MRI = *B.getMRI();
7779 const LLT S16 = LLT::scalar(16);
7780 const LLT S32 = LLT::scalar(32);
7781 const LLT V2S16 = LLT::fixed_vector(2, 16);
7782 const LLT V3S32 = LLT::fixed_vector(3, 32);
7783
// Operand 1 is not read here; the inputs start at operand 2.
7784 Register DstReg = MI.getOperand(0).getReg();
7785 Register NodePtr = MI.getOperand(2).getReg();
7786 Register RayExtent = MI.getOperand(3).getReg();
7787 Register RayOrigin = MI.getOperand(4).getReg();
7788 Register RayDir = MI.getOperand(5).getReg();
7789 Register RayInvDir = MI.getOperand(6).getReg();
7790 Register TDescr = MI.getOperand(7).getReg();
7791
7792 if (!ST.hasGFX10_AEncoding()) {
7793 Function &Fn = B.getMF().getFunction();
7795 Fn, "intrinsic not supported on subtarget", MI.getDebugLoc()));
7796 return false;
7797 }
7798
// IsA16: the ray direction has 16-bit elements. Is64: the node pointer is
// 64 bits wide. Together they determine the address dword count and which
// base opcode is used.
7799 const bool IsGFX11 = AMDGPU::isGFX11(ST);
7800 const bool IsGFX11Plus = AMDGPU::isGFX11Plus(ST);
7801 const bool IsGFX12Plus = AMDGPU::isGFX12Plus(ST);
7802 const bool IsA16 = MRI.getType(RayDir).getElementType().getSizeInBits() == 16;
7803 const bool Is64 = MRI.getType(NodePtr).getSizeInBits() == 64;
7804 const unsigned NumVDataDwords = 4;
7805 const unsigned NumVAddrDwords = IsA16 ? (Is64 ? 9 : 8) : (Is64 ? 12 : 11);
7806 const unsigned NumVAddrs = IsGFX11Plus ? (IsA16 ? 4 : 5) : NumVAddrDwords;
7807 const bool UseNSA =
7808 IsGFX12Plus || (ST.hasNSAEncoding() && NumVAddrs <= ST.getNSAMaxSize());
7809
// Base opcode table indexed as [Is64][IsA16].
7810 const unsigned BaseOpcodes[2][2] = {
7811 {AMDGPU::IMAGE_BVH_INTERSECT_RAY, AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16},
7812 {AMDGPU::IMAGE_BVH64_INTERSECT_RAY,
7813 AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16}};
7814 int Opcode;
7815 if (UseNSA) {
7816 Opcode = AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16],
7817 IsGFX12Plus ? AMDGPU::MIMGEncGfx12
7818 : IsGFX11 ? AMDGPU::MIMGEncGfx11NSA
7819 : AMDGPU::MIMGEncGfx10NSA,
7820 NumVDataDwords, NumVAddrDwords);
7821 } else {
7822 assert(!IsGFX12Plus);
7823 Opcode = AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16],
7824 IsGFX11 ? AMDGPU::MIMGEncGfx11Default
7825 : AMDGPU::MIMGEncGfx10Default,
7826 NumVDataDwords, NumVAddrDwords);
7827 }
7828 assert(Opcode != -1);
7829
// NSA on GFX11+: operands stay grouped. The node pointer and extent are
// passed through as-is and each 3-component vector becomes one V3S32
// operand.
7831 if (UseNSA && IsGFX11Plus) {
7832 auto packLanes = [&Ops, &S32, &V3S32, &B](Register Src) {
7833 auto Unmerge = B.buildUnmerge({S32, S32, S32}, Src);
7834 auto Merged = B.buildMergeLikeInstr(
7835 V3S32, {Unmerge.getReg(0), Unmerge.getReg(1), Unmerge.getReg(2)});
7836 Ops.push_back(Merged.getReg(0));
7837 };
7838
7839 Ops.push_back(NodePtr);
7840 Ops.push_back(RayExtent);
7841 packLanes(RayOrigin);
7842
7843 if (IsA16) {
// With 16-bit components, direction and inverse direction share one V3S32:
// dword i is a V2S16 whose element 0 is RayInvDir[i] and element 1 is
// RayDir[i], bitcast to S32.
7844 auto UnmergeRayDir = B.buildUnmerge({S16, S16, S16}, RayDir);
7845 auto UnmergeRayInvDir = B.buildUnmerge({S16, S16, S16}, RayInvDir);
7846 auto MergedDir = B.buildMergeLikeInstr(
7847 V3S32,
7848 {B.buildBitcast(
7849 S32, B.buildMergeLikeInstr(V2S16, {UnmergeRayInvDir.getReg(0),
7850 UnmergeRayDir.getReg(0)}))
7851 .getReg(0),
7852 B.buildBitcast(
7853 S32, B.buildMergeLikeInstr(V2S16, {UnmergeRayInvDir.getReg(1),
7854 UnmergeRayDir.getReg(1)}))
7855 .getReg(0),
7856 B.buildBitcast(
7857 S32, B.buildMergeLikeInstr(V2S16, {UnmergeRayInvDir.getReg(2),
7858 UnmergeRayDir.getReg(2)}))
7859 .getReg(0)});
7860 Ops.push_back(MergedDir.getReg(0));
7861 } else {
7862 packLanes(RayDir);
7863 packLanes(RayInvDir);
7864 }
7865 } else {
// All other encodings: every address dword is pushed as its own operand. A
// 64-bit node pointer is split into two dwords.
7866 if (Is64) {
7867 auto Unmerge = B.buildUnmerge({S32, S32}, NodePtr);
7868 Ops.push_back(Unmerge.getReg(0));
7869 Ops.push_back(Unmerge.getReg(1));
7870 } else {
7871 Ops.push_back(NodePtr);
7872 }
7873 Ops.push_back(RayExtent);
7874
7875 auto packLanes = [&Ops, &S32, &B](Register Src) {
7876 auto Unmerge = B.buildUnmerge({S32, S32, S32}, Src);
7877 Ops.push_back(Unmerge.getReg(0));
7878 Ops.push_back(Unmerge.getReg(1));
7879 Ops.push_back(Unmerge.getReg(2));
7880 };
7881
7882 packLanes(RayOrigin);
7883 if (IsA16) {
// The six 16-bit components are packed pairwise in order:
// R1 = {dir[0], dir[1]}, R2 = {dir[2], invdir[0]}, R3 = {invdir[1], invdir[2]}.
7884 auto UnmergeRayDir = B.buildUnmerge({S16, S16, S16}, RayDir);
7885 auto UnmergeRayInvDir = B.buildUnmerge({S16, S16, S16}, RayInvDir);
7889 B.buildMergeLikeInstr(R1,
7890 {UnmergeRayDir.getReg(0), UnmergeRayDir.getReg(1)});
7891 B.buildMergeLikeInstr(
7892 R2, {UnmergeRayDir.getReg(2), UnmergeRayInvDir.getReg(0)});
7893 B.buildMergeLikeInstr(
7894 R3, {UnmergeRayInvDir.getReg(1), UnmergeRayInvDir.getReg(2)});
7895 Ops.push_back(R1);
7896 Ops.push_back(R2);
7897 Ops.push_back(R3);
7898 } else {
7899 packLanes(RayDir);
7900 packLanes(RayInvDir);
7901 }
7902 }
7903
7904 if (!UseNSA) {
7905 // Build a single vector containing all the operands so far prepared.
7906 LLT OpTy = LLT::fixed_vector(Ops.size(), 32);
7907 Register MergedOps = B.buildMergeLikeInstr(OpTy, Ops).getReg(0);
7908 Ops.clear();
7909 Ops.push_back(MergedOps);
7910 }
7911
7912 auto MIB = B.buildInstr(AMDGPU::G_AMDGPU_BVH_INTERSECT_RAY)
7913 .addDef(DstReg)
7914 .addImm(Opcode);
7915
7916 for (Register R : Ops) {
7917 MIB.addUse(R);
7918 }
7919
7920 MIB.addUse(TDescr)
7921 .addImm(IsA16 ? 1 : 0)
7922 .cloneMemRefs(MI);
7923
7924 MI.eraseFromParent();
7925 return true;
7926}
7927
// Legalizes the image_bvh_dual_intersect_ray / image_bvh8_intersect_ray
// intrinsics into G_AMDGPU_BVH_DUAL_INTERSECT_RAY or
// G_AMDGPU_BVH8_INTERSECT_RAY. The replacement has three defs (operands 0-2
// of MI) and carries the selected MIMG opcode as an immediate. The ray extent
// and the any-extended instance mask are merged into a single V2S32 operand.
// Returns false (after emitting a diagnostic) when the subtarget lacks
// hasBVHDualAndBVH8Insts(); otherwise erases MI and returns true.
// NOTE(review): the signature head is not present in this extract.
7929 MachineInstr &MI, MachineIRBuilder &B) const {
7930 const LLT S32 = LLT::scalar(32);
7931 const LLT V2S32 = LLT::fixed_vector(2, 32);
7932
// Operand 3 is not read as a register; the inputs start at operand 4.
7933 Register DstReg = MI.getOperand(0).getReg();
7934 Register DstOrigin = MI.getOperand(1).getReg();
7935 Register DstDir = MI.getOperand(2).getReg();
7936 Register NodePtr = MI.getOperand(4).getReg();
7937 Register RayExtent = MI.getOperand(5).getReg();
7938 Register InstanceMask = MI.getOperand(6).getReg();
7939 Register RayOrigin = MI.getOperand(7).getReg();
7940 Register RayDir = MI.getOperand(8).getReg();
7941 Register Offsets = MI.getOperand(9).getReg();
7942 Register TDescr = MI.getOperand(10).getReg();
7943
7944 if (!ST.hasBVHDualAndBVH8Insts()) {
7945 Function &Fn = B.getMF().getFunction();
7947 Fn, "intrinsic not supported on subtarget", MI.getDebugLoc()));
7948 return false;
7949 }
7950
// The two intrinsics differ only in base opcode, generic opcode and address
// dword count (11 for BVH8, 12 for the dual variant).
7951 bool IsBVH8 = cast<GIntrinsic>(MI).getIntrinsicID() ==
7952 Intrinsic::amdgcn_image_bvh8_intersect_ray;
7953 const unsigned NumVDataDwords = 10;
7954 const unsigned NumVAddrDwords = IsBVH8 ? 11 : 12;
7955 int Opcode = AMDGPU::getMIMGOpcode(
7956 IsBVH8 ? AMDGPU::IMAGE_BVH8_INTERSECT_RAY
7957 : AMDGPU::IMAGE_BVH_DUAL_INTERSECT_RAY,
7958 AMDGPU::MIMGEncGfx12, NumVDataDwords, NumVAddrDwords);
7959 assert(Opcode != -1);
7960
7961 auto RayExtentInstanceMaskVec = B.buildMergeLikeInstr(
7962 V2S32, {RayExtent, B.buildAnyExt(S32, InstanceMask)});
7963
7964 B.buildInstr(IsBVH8 ? AMDGPU::G_AMDGPU_BVH8_INTERSECT_RAY
7965 : AMDGPU::G_AMDGPU_BVH_DUAL_INTERSECT_RAY)
7966 .addDef(DstReg)
7967 .addDef(DstOrigin)
7968 .addDef(DstDir)
7969 .addImm(Opcode)
7970 .addUse(NodePtr)
7971 .addUse(RayExtentInstanceMaskVec.getReg(0))
7972 .addUse(RayOrigin)
7973 .addUse(RayDir)
7974 .addUse(Offsets)
7975 .addUse(TDescr)
7976 .cloneMemRefs(MI);
7977
7978 MI.eraseFromParent();
7979 return true;
7980}
7981
// Replaces MI with G_AMDGPU_WAVE_ADDRESS, defining MI's operand 0 from
// StackPtr. Always erases MI and returns true.
// NOTE(review): the signature head and the definition of StackPtr (listing
// line 7985, presumably obtained through TLI) are not present in this
// extract.
7983 MachineIRBuilder &B) const {
7984 const SITargetLowering *TLI = ST.getTargetLowering();
7986 Register DstReg = MI.getOperand(0).getReg();
7987 B.buildInstr(AMDGPU::G_AMDGPU_WAVE_ADDRESS, {DstReg}, {StackPtr});
7988 MI.eraseFromParent();
7989 return true;
7990}
7991
// Legalizes the wave_id intrinsic by extracting a 5-bit field starting at
// bit 25 of TTMP8 with an unsigned bit-field extract. Returns false when the
// subtarget has no architected SGPRs; otherwise erases MI and returns true.
// NOTE(review): the signature head is not present in this extract.
7993 MachineIRBuilder &B) const {
7994 // With architected SGPRs, waveIDinGroup is in TTMP8[29:25].
7995 if (!ST.hasArchitectedSGPRs())
7996 return false;
7997 LLT S32 = LLT::scalar(32);
7998 Register DstReg = MI.getOperand(0).getReg();
7999 auto TTMP8 = B.buildCopy(S32, Register(AMDGPU::TTMP8));
8000 auto LSB = B.buildConstant(S32, 25);
8001 auto Width = B.buildConstant(S32, 5);
8002 B.buildUbfx(DstReg, TTMP8, LSB, Width);
8003 MI.eraseFromParent();
8004 return true;
8005}
8006
// Replaces MI with S_GETREG_B32_const, reading the bit-field
// [LowBit, LowBit + Width) of hardware register HwReg into MI's operand 0.
// The (HwReg, LowBit, Width) triple is packed into the immediate with
// HwregEncoding::encode. Always erases MI and returns true.
// NOTE(review): the first two lines of the signature are not present in this
// extract.
8009 AMDGPU::Hwreg::Id HwReg,
8010 unsigned LowBit,
8011 unsigned Width) const {
8012 MachineRegisterInfo &MRI = *B.getMRI();
8013 Register DstReg = MI.getOperand(0).getReg();
// The result is produced by a target instruction, so give the destination a
// register class if it does not have one yet.
8014 if (!MRI.getRegClassOrNull(DstReg))
8015 MRI.setRegClass(DstReg, &AMDGPU::SReg_32RegClass);
8016 B.buildInstr(AMDGPU::S_GETREG_B32_const)
8017 .addDef(DstReg)
8018 .addImm(AMDGPU::Hwreg::HwregEncoding::encode(HwReg, LowBit, Width));
8019 MI.eraseFromParent();
8020 return true;
8021}
8022
// Immediates passed to the s_getreg / s_setreg intrinsics by the FP
// environment get/set legalizers that follow: one for the mode half and one
// for the trap half of the 64-bit FP environment value.
// NOTE(review): the initializer expressions (listing lines 8024 and 8027) are
// not present in this extract.
8023static constexpr unsigned FPEnvModeBitField =
8025
8026static constexpr unsigned FPEnvTrapBitField =
8028
// Lowers a 64-bit FP-environment read into two s_getreg intrinsic calls (one
// with FPEnvModeBitField, one with FPEnvTrapBitField) whose S32 results are
// merged into operand 0, mode value first. Returns false unless operand 0 is
// S64; otherwise erases MI and returns true.
// NOTE(review): the signature head is not present in this extract.
8031 MachineIRBuilder &B) const {
// Despite its name, Src is MI's operand 0 and is used below as the
// destination of the merge.
8032 Register Src = MI.getOperand(0).getReg();
8033 if (MRI.getType(Src) != S64)
8034 return false;
8035
8036 auto ModeReg =
8037 B.buildIntrinsic(Intrinsic::amdgcn_s_getreg, {S32},
8038 /*HasSideEffects=*/true, /*isConvergent=*/false)
8039 .addImm(FPEnvModeBitField);
8040 auto TrapReg =
8041 B.buildIntrinsic(Intrinsic::amdgcn_s_getreg, {S32},
8042 /*HasSideEffects=*/true, /*isConvergent=*/false)
8043 .addImm(FPEnvTrapBitField);
8044 B.buildMergeLikeInstr(Src, {ModeReg, TrapReg});
8045 MI.eraseFromParent();
8046 return true;
8047}
8048
// Lowers a 64-bit FP-environment write: operand 0 is split into two S32
// halves, and each half is written with an s_setreg intrinsic call (first
// half with FPEnvModeBitField, second half with FPEnvTrapBitField). Returns
// false unless operand 0 is S64; otherwise erases MI and returns true.
// NOTE(review): the signature head is not present in this extract.
8051 MachineIRBuilder &B) const {
8052 Register Src = MI.getOperand(0).getReg();
8053 if (MRI.getType(Src) != S64)
8054 return false;
8055
8056 auto Unmerge = B.buildUnmerge({S32, S32}, MI.getOperand(0));
// The bit-field immediates are narrowed to int16_t before being attached.
8057 B.buildIntrinsic(Intrinsic::amdgcn_s_setreg, ArrayRef<DstOp>(),
8058 /*HasSideEffects=*/true, /*isConvergent=*/false)
8059 .addImm(static_cast<int16_t>(FPEnvModeBitField))
8060 .addReg(Unmerge.getReg(0));
8061 B.buildIntrinsic(Intrinsic::amdgcn_s_setreg, ArrayRef<DstOp>(),
8062 /*HasSideEffects=*/true, /*isConvergent=*/false)
8063 .addImm(static_cast<int16_t>(FPEnvTrapBitField))
8064 .addReg(Unmerge.getReg(1));
8065 MI.eraseFromParent();
8066 return true;
8067}
8068
// Central dispatch for intrinsic legalization: switches on the intrinsic ID
// of MI and either rewrites it inline or forwards to a dedicated helper.
// Within this function, `return true` is used both after a successful rewrite
// and when MI is deliberately left untouched, while `return false` is used
// for forms that cannot be handled.
// NOTE(review): the signature head and many `return ...` lines (those that
// began with a call in the original listing) are not present in this extract,
// so several case labels below appear without a visible body.
8070 MachineInstr &MI) const {
8071 MachineIRBuilder &B = Helper.MIRBuilder;
8072 MachineRegisterInfo &MRI = *B.getMRI();
8073
8074 // For the control-flow intrinsics handled below (amdgcn_if/else/loop), the
// G_BRCOND user is replaced with the exec-manipulating branch pseudos.
8075 auto IntrID = cast<GIntrinsic>(MI).getIntrinsicID();
8076 switch (IntrID) {
8077 case Intrinsic::amdgcn_icmp: {
8078 // amdgcn.icmp(i1 src0, i1 0, NE) -> ballot(src0)
8079 // This is the only valid form of amdgcn.icmp with i1 inputs.
8080 Register Src0 = MI.getOperand(2).getReg();
8081 LLT SrcTy = MRI.getType(Src0);
8082 if (SrcTy != LLT::scalar(1))
8083 return true; // Not i1, leave for default handling.
8084
8085 // Check that src1 is constant 0.
8086 Register Src1 = MI.getOperand(3).getReg();
8087 auto Src1Const = getIConstantVRegValWithLookThrough(Src1, MRI);
8088 if (!Src1Const || Src1Const->Value != 0)
8089 return false; // Invalid i1 icmp form.
8090
8091 // Check that predicate is ICMP_NE.
8092 int64_t Pred = MI.getOperand(4).getImm();
8093 if (Pred != CmpInst::ICMP_NE)
8094 return false; // Invalid i1 icmp form.
8095
8096 // Convert to ballot.
8097 Register Dst = MI.getOperand(0).getReg();
8098 B.buildIntrinsic(Intrinsic::amdgcn_ballot, Dst).addUse(Src0);
8099 MI.eraseFromParent();
8100 return true;
8101 }
// sponentry: in a bottom-of-stack function emit G_AMDGPU_SPONENTRY and
// convert its result to a pointer; otherwise materialize a frame index for a
// freshly created fixed object.
8102 case Intrinsic::sponentry:
8103 if (B.getMF().getInfo<SIMachineFunctionInfo>()->isBottomOfStack()) {
8104 // FIXME: The imported pattern checks for i32 instead of p5; if we fix
8105 // that we can remove this cast.
8106 const LLT S32 = LLT::scalar(32);
8108 B.buildInstr(AMDGPU::G_AMDGPU_SPONENTRY).addDef(TmpReg);
8109
8110 Register DstReg = MI.getOperand(0).getReg();
8111 B.buildIntToPtr(DstReg, TmpReg);
8112 MI.eraseFromParent();
8113 } else {
8114 int FI = B.getMF().getFrameInfo().CreateFixedObject(
8115 1, 0, /*IsImmutable=*/false);
8116 B.buildFrameIndex(MI.getOperand(0), FI);
8117 MI.eraseFromParent();
8118 }
8119 return true;
// amdgcn_if / amdgcn_else: verifyCFIntrinsic locates the conditional branch
// using this intrinsic (plus an optional unconditional branch). The pair is
// replaced by SI_IF / SI_ELSE inserted at the conditional branch; fails if
// the expected branch structure is not found.
8120 case Intrinsic::amdgcn_if:
8121 case Intrinsic::amdgcn_else: {
8122 MachineInstr *Br = nullptr;
8123 MachineBasicBlock *UncondBrTarget = nullptr;
8124 bool Negated = false;
8125 if (MachineInstr *BrCond =
8126 verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) {
8127 const SIRegisterInfo *TRI
8128 = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
8129
8130 Register Def = MI.getOperand(1).getReg();
8131 Register Use = MI.getOperand(3).getReg();
8132
8133 MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
8134
8135 if (Negated)
8136 std::swap(CondBrTarget, UncondBrTarget);
8137
8138 B.setInsertPt(B.getMBB(), BrCond->getIterator());
8139 if (IntrID == Intrinsic::amdgcn_if) {
8140 B.buildInstr(AMDGPU::SI_IF)
8141 .addDef(Def)
8142 .addUse(Use)
8143 .addMBB(UncondBrTarget);
8144 } else {
8145 B.buildInstr(AMDGPU::SI_ELSE)
8146 .addDef(Def)
8147 .addUse(Use)
8148 .addMBB(UncondBrTarget);
8149 }
8150
8151 if (Br) {
8152 Br->getOperand(0).setMBB(CondBrTarget);
8153 } else {
8154 // The IRTranslator skips inserting the G_BR for fallthrough cases, but
8155 // since we're swapping branch targets it needs to be reinserted.
8156 // FIXME: IRTranslator should probably not do this
8157 B.buildBr(*CondBrTarget);
8158 }
8159
// Both mask registers are constrained to the wave mask register class.
8160 MRI.setRegClass(Def, TRI->getWaveMaskRegClass());
8161 MRI.setRegClass(Use, TRI->getWaveMaskRegClass());
8162 MI.eraseFromParent();
8163 BrCond->eraseFromParent();
8164 return true;
8165 }
8166
8167 return false;
8168 }
// amdgcn_loop: same scheme as if/else above, but emits SI_LOOP, which takes
// only a mask use and the branch target.
8169 case Intrinsic::amdgcn_loop: {
8170 MachineInstr *Br = nullptr;
8171 MachineBasicBlock *UncondBrTarget = nullptr;
8172 bool Negated = false;
8173 if (MachineInstr *BrCond =
8174 verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) {
8175 const SIRegisterInfo *TRI
8176 = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
8177
8178 MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
8179 Register Reg = MI.getOperand(2).getReg();
8180
8181 if (Negated)
8182 std::swap(CondBrTarget, UncondBrTarget);
8183
8184 B.setInsertPt(B.getMBB(), BrCond->getIterator());
8185 B.buildInstr(AMDGPU::SI_LOOP)
8186 .addUse(Reg)
8187 .addMBB(UncondBrTarget);
8188
8189 if (Br)
8190 Br->getOperand(0).setMBB(CondBrTarget);
8191 else
8192 B.buildBr(*CondBrTarget);
8193
8194 MI.eraseFromParent();
8195 BrCond->eraseFromParent();
8196 MRI.setRegClass(Reg, TRI->getWaveMaskRegClass());
8197 return true;
8198 }
8199
8200 return false;
8201 }
// Wave reductions: only s16 sources are rewritten here. The source is
// extended to s32 (sign-extended for min/max/add/sub, zero-extended
// otherwise), the same intrinsic is rebuilt at s32, and the result is
// truncated back into the original destination. Other source types are left
// untouched.
8202 case Intrinsic::amdgcn_wave_reduce_min:
8203 case Intrinsic::amdgcn_wave_reduce_umin:
8204 case Intrinsic::amdgcn_wave_reduce_max:
8205 case Intrinsic::amdgcn_wave_reduce_umax:
8206 case Intrinsic::amdgcn_wave_reduce_add:
8207 case Intrinsic::amdgcn_wave_reduce_sub:
8208 case Intrinsic::amdgcn_wave_reduce_and:
8209 case Intrinsic::amdgcn_wave_reduce_or:
8210 case Intrinsic::amdgcn_wave_reduce_xor: {
8211 Register SrcReg = MI.getOperand(2).getReg();
8212 if (MRI.getType(SrcReg) != LLT::scalar(16))
8213 return true;
8214 Register DstReg = MI.getOperand(0).getReg();
8215 bool NeedsSignExt = IntrID == Intrinsic::amdgcn_wave_reduce_min ||
8216 IntrID == Intrinsic::amdgcn_wave_reduce_max ||
8217 IntrID == Intrinsic::amdgcn_wave_reduce_add ||
8218 IntrID == Intrinsic::amdgcn_wave_reduce_sub;
8219 auto Ext = NeedsSignExt ? B.buildSExt(LLT::scalar(32), SrcReg)
8220 : B.buildZExt(LLT::scalar(32), SrcReg);
8221 auto NewDst = MRI.createGenericVirtualRegister(LLT::scalar(32));
8222 B.buildIntrinsic(IntrID, ArrayRef<Register>{NewDst},
8223 /*hasSideEffects=*/false, /*isConvergent=*/true)
8224 .addUse(Ext.getReg(0))
8225 .addImm(MI.getOperand(3).getImm()); // strategy
8226 B.buildTrunc(DstReg, NewDst);
8227 MI.eraseFromParent();
8228 return true;
8229 }
8230 case Intrinsic::amdgcn_addrspacecast_nonnull:
8231 return legalizeAddrSpaceCast(MI, MRI, B);
8232 case Intrinsic::amdgcn_make_buffer_rsrc:
8233 return legalizePointerAsRsrcIntrin(MI, MRI, B);
8234 case Intrinsic::amdgcn_kernarg_segment_ptr:
8235 if (!AMDGPU::isKernel(B.getMF().getFunction())) {
8236 // This only makes sense to call in a kernel, so just lower to null.
8237 B.buildConstant(MI.getOperand(0).getReg(), 0);
8238 MI.eraseFromParent();
8239 return true;
8240 }
8241
8244 case Intrinsic::amdgcn_implicitarg_ptr:
8245 return legalizeImplicitArgPtr(MI, MRI, B);
8246 case Intrinsic::amdgcn_workitem_id_x:
8247 return legalizeWorkitemIDIntrinsic(MI, MRI, B, 0,
8249 case Intrinsic::amdgcn_workitem_id_y:
8250 return legalizeWorkitemIDIntrinsic(MI, MRI, B, 1,
8252 case Intrinsic::amdgcn_workitem_id_z:
8253 return legalizeWorkitemIDIntrinsic(MI, MRI, B, 2,
8255 case Intrinsic::amdgcn_workgroup_id_x:
8256 return legalizeWorkGroupId(
8260 case Intrinsic::amdgcn_workgroup_id_y:
8261 return legalizeWorkGroupId(
8265 case Intrinsic::amdgcn_workgroup_id_z:
8266 return legalizeWorkGroupId(
// Cluster intrinsics are only legalizable when the subtarget has clusters;
// the right-hand sides of these conjunctions are not present in this extract.
8270 case Intrinsic::amdgcn_cluster_id_x:
8271 return ST.hasClusters() &&
8274 case Intrinsic::amdgcn_cluster_id_y:
8275 return ST.hasClusters() &&
8278 case Intrinsic::amdgcn_cluster_id_z:
8279 return ST.hasClusters() &&
8282 case Intrinsic::amdgcn_cluster_workgroup_id_x:
8283 return ST.hasClusters() &&
8286 case Intrinsic::amdgcn_cluster_workgroup_id_y:
8287 return ST.hasClusters() &&
8290 case Intrinsic::amdgcn_cluster_workgroup_id_z:
8291 return ST.hasClusters() &&
8294 case Intrinsic::amdgcn_cluster_workgroup_flat_id:
8295 return ST.hasClusters() &&
8297 case Intrinsic::amdgcn_cluster_workgroup_max_id_x:
8298 return ST.hasClusters() &&
8301 case Intrinsic::amdgcn_cluster_workgroup_max_id_y:
8302 return ST.hasClusters() &&
8305 case Intrinsic::amdgcn_cluster_workgroup_max_id_z:
8306 return ST.hasClusters() &&
8309 case Intrinsic::amdgcn_cluster_workgroup_max_flat_id:
8310 return ST.hasClusters() &&
8312 MI, MRI, B,
8314 case Intrinsic::amdgcn_wave_id:
8315 return legalizeWaveID(MI, B);
8316 case Intrinsic::amdgcn_lds_kernel_id:
8317 return legalizePreloadedArgIntrin(MI, MRI, B,
8319 case Intrinsic::amdgcn_dispatch_ptr:
8320 return legalizePreloadedArgIntrin(MI, MRI, B,
8322 case Intrinsic::amdgcn_queue_ptr:
8323 return legalizePreloadedArgIntrin(MI, MRI, B,
8325 case Intrinsic::amdgcn_implicit_buffer_ptr:
8328 case Intrinsic::amdgcn_dispatch_id:
8329 return legalizePreloadedArgIntrin(MI, MRI, B,
8331 case Intrinsic::r600_read_ngroups_x:
8332 // TODO: Emit error for hsa
8335 case Intrinsic::r600_read_ngroups_y:
8338 case Intrinsic::r600_read_ngroups_z:
8341 case Intrinsic::r600_read_local_size_x:
8342 // TODO: Could insert G_ASSERT_ZEXT from s16
8344 case Intrinsic::r600_read_local_size_y:
8345 // TODO: Could insert G_ASSERT_ZEXT from s16
8347 // TODO: Could insert G_ASSERT_ZEXT from s16
8348 case Intrinsic::r600_read_local_size_z:
8351 case Intrinsic::amdgcn_fdiv_fast:
8352 return legalizeFDIVFastIntrin(MI, MRI, B);
8353 case Intrinsic::amdgcn_is_shared:
8355 case Intrinsic::amdgcn_is_private:
// The wavefront size is known from the subtarget, so fold to a constant.
8357 case Intrinsic::amdgcn_wavefrontsize: {
8358 B.buildConstant(MI.getOperand(0), ST.getWavefrontSize());
8359 MI.eraseFromParent();
8360 return true;
8361 }
8362 case Intrinsic::amdgcn_s_buffer_load:
8363 return legalizeSBufferLoad(Helper, MI);
// Buffer stores. The two boolean arguments distinguish the plain, format and
// typed (tbuffer) variants.
8364 case Intrinsic::amdgcn_raw_buffer_store:
8365 case Intrinsic::amdgcn_raw_ptr_buffer_store:
8366 case Intrinsic::amdgcn_struct_buffer_store:
8367 case Intrinsic::amdgcn_struct_ptr_buffer_store:
8368 return legalizeBufferStore(MI, Helper, false, false);
8369 case Intrinsic::amdgcn_raw_buffer_store_format:
8370 case Intrinsic::amdgcn_raw_ptr_buffer_store_format:
8371 case Intrinsic::amdgcn_struct_buffer_store_format:
8372 case Intrinsic::amdgcn_struct_ptr_buffer_store_format:
8373 return legalizeBufferStore(MI, Helper, false, true);
8374 case Intrinsic::amdgcn_raw_tbuffer_store:
8375 case Intrinsic::amdgcn_raw_ptr_tbuffer_store:
8376 case Intrinsic::amdgcn_struct_tbuffer_store:
8377 case Intrinsic::amdgcn_struct_ptr_tbuffer_store:
8378 return legalizeBufferStore(MI, Helper, true, true);
// Buffer loads. Note the boolean argument pattern differs from the stores
// above: format loads pass (true, false) and tbuffer loads pass (true, true).
8379 case Intrinsic::amdgcn_raw_buffer_load:
8380 case Intrinsic::amdgcn_raw_ptr_buffer_load:
8381 case Intrinsic::amdgcn_raw_atomic_buffer_load:
8382 case Intrinsic::amdgcn_raw_ptr_atomic_buffer_load:
8383 case Intrinsic::amdgcn_struct_buffer_load:
8384 case Intrinsic::amdgcn_struct_ptr_buffer_load:
8385 case Intrinsic::amdgcn_struct_atomic_buffer_load:
8386 case Intrinsic::amdgcn_struct_ptr_atomic_buffer_load:
8387 return legalizeBufferLoad(MI, Helper, false, false);
8388 case Intrinsic::amdgcn_raw_buffer_load_format:
8389 case Intrinsic::amdgcn_raw_ptr_buffer_load_format:
8390 case Intrinsic::amdgcn_struct_buffer_load_format:
8391 case Intrinsic::amdgcn_struct_ptr_buffer_load_format:
8392 return legalizeBufferLoad(MI, Helper, true, false);
8393 case Intrinsic::amdgcn_raw_tbuffer_load:
8394 case Intrinsic::amdgcn_raw_ptr_tbuffer_load:
8395 case Intrinsic::amdgcn_struct_tbuffer_load:
8396 case Intrinsic::amdgcn_struct_ptr_tbuffer_load:
8397 return legalizeBufferLoad(MI, Helper, true, true);
// All buffer atomics (raw/struct, with and without the ptr form) share one
// helper that is told which intrinsic it is handling.
8398 case Intrinsic::amdgcn_raw_buffer_atomic_swap:
8399 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_swap:
8400 case Intrinsic::amdgcn_struct_buffer_atomic_swap:
8401 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_swap:
8402 case Intrinsic::amdgcn_raw_buffer_atomic_add:
8403 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_add:
8404 case Intrinsic::amdgcn_struct_buffer_atomic_add:
8405 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_add:
8406 case Intrinsic::amdgcn_raw_buffer_atomic_sub:
8407 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub:
8408 case Intrinsic::amdgcn_struct_buffer_atomic_sub:
8409 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub:
8410 case Intrinsic::amdgcn_raw_buffer_atomic_smin:
8411 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smin:
8412 case Intrinsic::amdgcn_struct_buffer_atomic_smin:
8413 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smin:
8414 case Intrinsic::amdgcn_raw_buffer_atomic_umin:
8415 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umin:
8416 case Intrinsic::amdgcn_struct_buffer_atomic_umin:
8417 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umin:
8418 case Intrinsic::amdgcn_raw_buffer_atomic_smax:
8419 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smax:
8420 case Intrinsic::amdgcn_struct_buffer_atomic_smax:
8421 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smax:
8422 case Intrinsic::amdgcn_raw_buffer_atomic_umax:
8423 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umax:
8424 case Intrinsic::amdgcn_struct_buffer_atomic_umax:
8425 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umax:
8426 case Intrinsic::amdgcn_raw_buffer_atomic_and:
8427 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_and:
8428 case Intrinsic::amdgcn_struct_buffer_atomic_and:
8429 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_and:
8430 case Intrinsic::amdgcn_raw_buffer_atomic_or:
8431 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_or:
8432 case Intrinsic::amdgcn_struct_buffer_atomic_or:
8433 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_or:
8434 case Intrinsic::amdgcn_raw_buffer_atomic_xor:
8435 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_xor:
8436 case Intrinsic::amdgcn_struct_buffer_atomic_xor:
8437 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_xor:
8438 case Intrinsic::amdgcn_raw_buffer_atomic_inc:
8439 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_inc:
8440 case Intrinsic::amdgcn_struct_buffer_atomic_inc:
8441 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_inc:
8442 case Intrinsic::amdgcn_raw_buffer_atomic_dec:
8443 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_dec:
8444 case Intrinsic::amdgcn_struct_buffer_atomic_dec:
8445 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_dec:
8446 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
8447 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap:
8448 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
8449 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cmpswap:
8450 case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
8451 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmin:
8452 case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
8453 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmin:
8454 case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
8455 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmax:
8456 case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
8457 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmax:
8458 case Intrinsic::amdgcn_raw_buffer_atomic_sub_clamp_u32:
8459 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub_clamp_u32:
8460 case Intrinsic::amdgcn_struct_buffer_atomic_sub_clamp_u32:
8461 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub_clamp_u32:
8462 case Intrinsic::amdgcn_raw_buffer_atomic_cond_sub_u32:
8463 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cond_sub_u32:
8464 case Intrinsic::amdgcn_struct_buffer_atomic_cond_sub_u32:
8465 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cond_sub_u32:
8466 case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
8467 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fadd:
8468 case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
8469 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fadd:
8470 return legalizeBufferAtomic(MI, B, IntrID);
8471 case Intrinsic::amdgcn_rsq_clamp:
8472 return legalizeRsqClampIntrinsic(MI, MRI, B);
8473 case Intrinsic::amdgcn_image_bvh_intersect_ray:
8475 case Intrinsic::amdgcn_image_bvh_dual_intersect_ray:
8476 case Intrinsic::amdgcn_image_bvh8_intersect_ray:
// swmmac 16x16x128 fp8/bf8: the index (operand 5) is normalized to s64,
// bitcasting vector indices and any-extending scalar ones. MI is updated in
// place.
8478 case Intrinsic::amdgcn_swmmac_f32_16x16x128_fp8_fp8:
8479 case Intrinsic::amdgcn_swmmac_f32_16x16x128_fp8_bf8:
8480 case Intrinsic::amdgcn_swmmac_f32_16x16x128_bf8_fp8:
8481 case Intrinsic::amdgcn_swmmac_f32_16x16x128_bf8_bf8:
8482 case Intrinsic::amdgcn_swmmac_f16_16x16x128_fp8_fp8:
8483 case Intrinsic::amdgcn_swmmac_f16_16x16x128_fp8_bf8:
8484 case Intrinsic::amdgcn_swmmac_f16_16x16x128_bf8_fp8:
8485 case Intrinsic::amdgcn_swmmac_f16_16x16x128_bf8_bf8: {
8486 Register Index = MI.getOperand(5).getReg();
8487 LLT S64 = LLT::scalar(64);
8488 LLT IndexArgTy = MRI.getType(Index);
8489 if (IndexArgTy != S64) {
8490 auto NewIndex = IndexArgTy.isVector() ? B.buildBitcast(S64, Index)
8491 : B.buildAnyExt(S64, Index);
8492 MI.getOperand(5).setReg(NewIndex.getReg(0));
8493 }
8494 return true;
8495 }
// swmmac 16x16x32: the index (operand 5) is any-extended to s32 when it is
// not already s32.
8496 case Intrinsic::amdgcn_swmmac_f16_16x16x32_f16:
8497 case Intrinsic::amdgcn_swmmac_bf16_16x16x32_bf16:
8498 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf16:
8499 case Intrinsic::amdgcn_swmmac_f32_16x16x32_f16:
8500 case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_fp8:
8501 case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_bf8:
8502 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_fp8:
8503 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_bf8: {
8504 Register Index = MI.getOperand(5).getReg();
8505 LLT S32 = LLT::scalar(32);
8506 if (MRI.getType(Index) != S32)
8507 MI.getOperand(5).setReg(B.buildAnyExt(S32, Index).getReg(0));
8508 return true;
8509 }
// These variants carry the index in operand 7. It is normalized to s64 for
// the 16x16x128 iu8 form and to s32 for the rest.
8510 case Intrinsic::amdgcn_swmmac_f16_16x16x64_f16:
8511 case Intrinsic::amdgcn_swmmac_bf16_16x16x64_bf16:
8512 case Intrinsic::amdgcn_swmmac_f32_16x16x64_bf16:
8513 case Intrinsic::amdgcn_swmmac_bf16f32_16x16x64_bf16:
8514 case Intrinsic::amdgcn_swmmac_f32_16x16x64_f16:
8515 case Intrinsic::amdgcn_swmmac_i32_16x16x128_iu8:
8516 case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu4:
8517 case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu8:
8518 case Intrinsic::amdgcn_swmmac_i32_16x16x64_iu4: {
8519 Register Index = MI.getOperand(7).getReg();
8520 LLT IdxTy = IntrID == Intrinsic::amdgcn_swmmac_i32_16x16x128_iu8
8521 ? LLT::scalar(64)
8522 : LLT::scalar(32);
8523 LLT IndexArgTy = MRI.getType(Index);
8524 if (IndexArgTy != IdxTy) {
8525 auto NewIndex = IndexArgTy.isVector() ? B.buildBitcast(IdxTy, Index)
8526 : B.buildAnyExt(IdxTy, Index);
8527 MI.getOperand(7).setReg(NewIndex.getReg(0));
8528 }
8529 return true;
8530 }
8531
8532 case Intrinsic::amdgcn_fmed3: {
8533 GISelChangeObserver &Observer = Helper.Observer;
8534
8535 // FIXME: This is to workaround the inability of tablegen match combiners to
8536 // match intrinsics in patterns.
8537 Observer.changingInstr(MI);
8538 MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_FMED3));
8539 MI.removeOperand(1);
8540 Observer.changedInstr(MI);
8541 return true;
8542 }
8543 case Intrinsic::amdgcn_readlane:
8544 case Intrinsic::amdgcn_writelane:
8545 case Intrinsic::amdgcn_readfirstlane:
8546 case Intrinsic::amdgcn_permlane16:
8547 case Intrinsic::amdgcn_permlanex16:
8548 case Intrinsic::amdgcn_permlane64:
8549 case Intrinsic::amdgcn_set_inactive:
8550 case Intrinsic::amdgcn_set_inactive_chain_arg:
8551 case Intrinsic::amdgcn_mov_dpp8:
8552 case Intrinsic::amdgcn_update_dpp:
8553 case Intrinsic::amdgcn_permlane_bcast:
8554 case Intrinsic::amdgcn_permlane_up:
8555 case Intrinsic::amdgcn_permlane_down:
8556 case Intrinsic::amdgcn_permlane_xor:
8557 return legalizeLaneOp(Helper, MI, IntrID);
8558 case Intrinsic::amdgcn_s_buffer_prefetch_data:
8559 return legalizeSBufferPrefetch(Helper, MI);
8560 case Intrinsic::amdgcn_dead: {
8561 // TODO: Use poison instead of undef
8562 for (const MachineOperand &Def : MI.defs())
8563 B.buildUndef(Def);
8564 MI.eraseFromParent();
8565 return true;
8566 }
// Cooperative atomic loads/stores become ordinary G_LOAD / G_STORE that reuse
// the intrinsic's single memory operand.
8567 case Intrinsic::amdgcn_cooperative_atomic_load_32x4B:
8568 case Intrinsic::amdgcn_cooperative_atomic_load_16x8B:
8569 case Intrinsic::amdgcn_cooperative_atomic_load_8x16B:
8570 assert(MI.hasOneMemOperand() && "Expected IRTranslator to set MemOp!");
8571 B.buildLoad(MI.getOperand(0), MI.getOperand(2), **MI.memoperands_begin());
8572 MI.eraseFromParent();
8573 return true;
8574 case Intrinsic::amdgcn_cooperative_atomic_store_32x4B:
8575 case Intrinsic::amdgcn_cooperative_atomic_store_16x8B:
8576 case Intrinsic::amdgcn_cooperative_atomic_store_8x16B:
8577 assert(MI.hasOneMemOperand() && "Expected IRTranslator to set MemOp!");
8578 B.buildStore(MI.getOperand(2), MI.getOperand(1), **MI.memoperands_begin());
8579 MI.eraseFromParent();
8580 return true;
// av.load/store.b128: diagnosed and rejected without flat-global
// instructions; otherwise lowered to a plain load or store.
8581 case Intrinsic::amdgcn_av_load_b128:
8582 case Intrinsic::amdgcn_av_store_b128: {
// NOTE(review): this local ST shadows the ST used elsewhere in this function.
8583 const GCNSubtarget &ST = B.getMF().getSubtarget<GCNSubtarget>();
8584 if (!ST.hasFlatGlobalInsts()) {
8585 const char *Name = IntrID == Intrinsic::amdgcn_av_load_b128
8586 ? "llvm.amdgcn.av.load.b128"
8587 : "llvm.amdgcn.av.store.b128";
8588 Function &Fn = B.getMF().getFunction();
8590 Fn, Twine(Name) + " not supported on subtarget", MI.getDebugLoc()));
8591 return false;
8592 }
8593 assert(MI.hasOneMemOperand() && "Expected IRTranslator to set MemOp!");
8594 if (IntrID == Intrinsic::amdgcn_av_load_b128)
8595 B.buildLoad(MI.getOperand(0), MI.getOperand(2), **MI.memoperands_begin());
8596 else
8597 B.buildStore(MI.getOperand(2), MI.getOperand(1),
8598 **MI.memoperands_begin());
8599 MI.eraseFromParent();
8600 return true;
8601 }
// Monitor loads are rewritten to dedicated generic opcodes, keeping the
// result, the address operand and the memory operand.
8602 case Intrinsic::amdgcn_flat_load_monitor_b32:
8603 case Intrinsic::amdgcn_flat_load_monitor_b64:
8604 case Intrinsic::amdgcn_flat_load_monitor_b128:
8605 assert(MI.hasOneMemOperand() && "Expected IRTranslator to set MemOp!");
8606 B.buildInstr(AMDGPU::G_AMDGPU_FLAT_LOAD_MONITOR)
8607 .add(MI.getOperand(0))
8608 .add(MI.getOperand(2))
8609 .addMemOperand(*MI.memoperands_begin());
8610 MI.eraseFromParent();
8611 return true;
8612 case Intrinsic::amdgcn_global_load_monitor_b32:
8613 case Intrinsic::amdgcn_global_load_monitor_b64:
8614 case Intrinsic::amdgcn_global_load_monitor_b128:
8615 assert(MI.hasOneMemOperand() && "Expected IRTranslator to set MemOp!");
8616 B.buildInstr(AMDGPU::G_AMDGPU_GLOBAL_LOAD_MONITOR)
8617 .add(MI.getOperand(0))
8618 .add(MI.getOperand(2))
8619 .addMemOperand(*MI.memoperands_begin());
8620 MI.eraseFromParent();
8621 return true;
// Anything else: image-dimension intrinsics go to legalizeImageIntrinsic;
// every other intrinsic is left as-is.
8622 default: {
8623 if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
8625 return legalizeImageIntrinsic(MI, B, Helper.Observer, ImageDimIntr);
8626 return true;
8627 }
8628 }
8629
8630 return true;
8631}
MachineInstrBuilder & UseMI
MachineInstrBuilder MachineInstrBuilder & DefMI
static unsigned getIntrinsicID(const SDNode *N)
unsigned RegSize
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
static SDValue extractF64Exponent(SDValue Hi, const SDLoc &SL, SelectionDAG &DAG)
static SDValue getMad(SelectionDAG &DAG, const SDLoc &SL, EVT VT, SDValue X, SDValue Y, SDValue C, SDNodeFlags Flags=SDNodeFlags())
static bool valueIsKnownNeverF32Denorm(SDValue Src)
Return true if it's known that Src can never be an f32 denormal value.
Contains the definition of a TargetInstrInfo class that is common to all AMD GPUs.
static void packImage16bitOpsToDwords(MachineIRBuilder &B, MachineInstr &MI, SmallVectorImpl< Register > &PackedAddrs, unsigned ArgOffset, const AMDGPU::ImageDimIntrinsicInfo *Intr, bool IsA16, bool IsG16)
Turn a set of s16 typed registers in AddrRegs into a dword sized vector with s16 typed elements.
static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID)
static LLT getBufferRsrcScalarType(const LLT Ty)
static LegalityPredicate isIllegalRegisterType(const GCNSubtarget &ST, unsigned TypeIdx)
static cl::opt< bool > EnableNewLegality("amdgpu-global-isel-new-legality", cl::desc("Use GlobalISel desired legality, rather than try to use" "rules compatible with selection patterns"), cl::init(false), cl::ReallyHidden)
static MachineInstrBuilder buildExp(MachineIRBuilder &B, const DstOp &Dst, const SrcOp &Src, unsigned Flags)
static bool needsDenormHandlingF32(const MachineFunction &MF, Register Src, unsigned Flags)
constexpr std::initializer_list< LLT > AllVectors
static LegalizeMutation bitcastToVectorElement32(unsigned TypeIdx)
static LegalityPredicate isSmallOddVector(unsigned TypeIdx)
static LegalizeMutation oneMoreElement(unsigned TypeIdx)
constexpr LLT F64
static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size)
constexpr LLT V2S8
static bool allowApproxFunc(const MachineFunction &MF, unsigned Flags)
constexpr LLT V4S128
constexpr LLT S16
constexpr LLT S1
static bool shouldBitcastLoadStoreType(const GCNSubtarget &ST, const LLT Ty, const LLT MemTy)
Return true if a load or store of the type should be lowered with a bitcast to a different type.
constexpr LLT S1024
static constexpr unsigned FPEnvModeBitField
constexpr LLT V7S64
static LegalizeMutation getScalarTypeFromMemDesc(unsigned TypeIdx)
static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size)
static bool shouldWidenLoad(const GCNSubtarget &ST, LLT MemoryTy, uint64_t AlignInBits, unsigned AddrSpace, unsigned Opcode)
Return true if we should legalize a load by widening an odd sized memory access up to the alignment.
static bool isRegisterVectorElementType(LLT EltTy)
static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx)
static LegalityPredicate isWideVec16(unsigned TypeIdx)
constexpr std::initializer_list< LLT > AllScalarTypes
static LegalityPredicate isTruncStoreToSizePowerOf2(unsigned TypeIdx)
constexpr LLT V2S16
constexpr LLT V8S16
constexpr LLT V9S32
constexpr std::initializer_list< LLT > AllS32Vectors
constexpr LLT S224
static LegalizeMutation moreElementsToNextExistingRegClass(unsigned TypeIdx)
constexpr LLT S512
constexpr LLT MaxScalar
static Register castBufferRsrcToV4I32(Register Pointer, MachineIRBuilder &B)
Cast a buffer resource (an address space 8 pointer) into a 4xi32, which is the form in which the valu...
constexpr LLT V11S32
static bool isRegisterClassType(const GCNSubtarget &ST, LLT Ty)
constexpr LLT V6S64
constexpr LLT V2S64
static std::pair< Register, Register > emitReciprocalU64(MachineIRBuilder &B, Register Val)
static LLT getBitcastRegisterType(const LLT Ty)
static LLT getBufferRsrcRegisterType(const LLT Ty)
constexpr LLT S32
constexpr LLT V2F16
static LegalizeMutation bitcastToRegisterType(unsigned TypeIdx)
static Register stripAnySourceMods(Register OrigSrc, MachineRegisterInfo &MRI)
constexpr LLT V8S32
constexpr LLT V2BF16
constexpr LLT S192
static LLT castBufferRsrcFromV4I32(MachineInstr &MI, MachineIRBuilder &B, MachineRegisterInfo &MRI, unsigned Idx)
Mutates IR (typically a load instruction) to use a <4 x s32> as the initial type of the operand idx an...
static bool replaceWithConstant(MachineIRBuilder &B, MachineInstr &MI, int64_t C)
static constexpr unsigned SPDenormModeBitField
constexpr LLT F32
static unsigned maxSizeForAddrSpace(const GCNSubtarget &ST, unsigned AS, bool IsLoad, bool IsAtomic)
constexpr LLT V6S32
static bool isLoadStoreSizeLegal(const GCNSubtarget &ST, const LegalityQuery &Query)
constexpr LLT S160
static MachineInstr * verifyCFIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI, MachineInstr *&Br, MachineBasicBlock *&UncondBrTarget, bool &Negated)
constexpr LLT V4S16
constexpr LLT V2S128
constexpr LLT V10S16
static LegalityPredicate numElementsNotEven(unsigned TypeIdx)
constexpr LLT V4S32
constexpr LLT V3S32
constexpr LLT V6S16
constexpr std::initializer_list< LLT > AllS64Vectors
constexpr LLT S256
static void castBufferRsrcArgToV4I32(MachineInstr &MI, MachineIRBuilder &B, unsigned Idx)
constexpr LLT V4S64
static constexpr unsigned FPEnvTrapBitField
constexpr LLT V10S32
constexpr LLT V16S32
static constexpr unsigned MaxRegisterSize
constexpr LLT V7S32
constexpr LLT S96
constexpr LLT V12S16
constexpr LLT V16S64
static bool isRegisterSize(const GCNSubtarget &ST, unsigned Size)
static LegalityPredicate isWideScalarExtLoadTruncStore(unsigned TypeIdx)
static bool hasBufferRsrcWorkaround(const LLT Ty)
constexpr LLT V32S32
static void toggleSPDenormMode(bool Enable, MachineIRBuilder &B, const GCNSubtarget &ST, SIModeRegisterDefaults Mode)
constexpr LLT S64
constexpr std::initializer_list< LLT > AllS16Vectors
static bool loadStoreBitcastWorkaround(const LLT Ty)
static LLT widenToNextPowerOf2(LLT Ty)
static bool isNot(const MachineRegisterInfo &MRI, const MachineInstr &MI)
constexpr LLT V16S16
static void convertImageAddrToPacked(MachineIRBuilder &B, MachineInstr &MI, int DimIdx, int NumVAddrs)
Convert from separate vaddr components to a single vector address register, and replace the remaining...
static bool isLoadStoreLegal(const GCNSubtarget &ST, const LegalityQuery &Query)
static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx)
constexpr LLT V5S32
constexpr LLT V5S64
constexpr LLT V3S64
static LLT getPow2VectorType(LLT Ty)
static void buildBufferLoad(unsigned Opc, Register LoadDstReg, Register RSrc, Register VIndex, Register VOffset, Register SOffset, unsigned ImmOffset, unsigned Format, unsigned AuxiliaryData, MachineMemOperand *MMO, bool IsTyped, bool HasVIndex, MachineIRBuilder &B)
constexpr LLT V8S64
static LLT getPow2ScalarType(LLT Ty)
static LegalityPredicate elementTypeIsLegal(unsigned TypeIdx)
constexpr LLT V2S32
static bool isRegisterVectorType(LLT Ty)
constexpr LLT V12S32
constexpr LLT S128
static LegalityPredicate sizeIsMultipleOf32(unsigned TypeIdx)
constexpr LLT S8
static bool isRegisterType(const GCNSubtarget &ST, LLT Ty)
static bool isKnownNonNull(Register Val, MachineRegisterInfo &MRI, const AMDGPUTargetMachine &TM, unsigned AddrSpace)
Return true if the value is a known valid address, such that a null check is not necessary.
This file declares the targeting of the Machinelegalizer class for AMDGPU.
Provides AMDGPU specific target descriptions.
The AMDGPU TargetMachine interface definition for hw codegen targets.
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
#define X(NUM, ENUM, NAME)
Definition ELF.h:853
static Error unsupported(const char *Str, const Triple &T)
Definition MachO.cpp:77
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
@ Enable
Declares convenience wrapper classes for interpreting MachineInstr instances as specific generic oper...
IRTranslator LLVM IR MI
const AbstractManglingParser< Derived, Alloc >::OperatorInfo AbstractManglingParser< Derived, Alloc >::Ops[]
Interface for Targets to specify which operations they can successfully select and how the others sho...
#define F(x, y, z)
Definition MD5.cpp:54
#define I(x, y, z)
Definition MD5.cpp:57
Contains matchers for matching SSA Machine Instructions.
This file declares the MachineIRBuilder class.
Register const TargetRegisterInfo * TRI
#define R2(n)
Promote Memory to Register
Definition Mem2Reg.cpp:110
#define T
static MCRegister getReg(const MCDisassembler *D, unsigned RC, unsigned RegNo)
#define P(N)
ppc ctr loops verify
R600 Clause Merge
const SmallVectorImpl< MachineOperand > & Cond
static cl::opt< RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode > Mode("regalloc-enable-advisor", cl::Hidden, cl::init(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Default), cl::desc("Enable regalloc advisor mode"), cl::values(clEnumValN(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Default, "default", "Default"), clEnumValN(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Release, "release", "precompiled"), clEnumValN(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Development, "development", "for training")))
#define CH(x, y, z)
Definition SHA256.cpp:34
#define FP_DENORM_FLUSH_NONE
Definition SIDefines.h:1276
Interface definition for SIInstrInfo.
Interface definition for SIRegisterInfo.
This file defines the make_scope_exit function, which executes user-defined cleanup logic at scope ex...
static TableGen::Emitter::Opt Y("gen-skeleton-entry", EmitSkeleton, "Generate example skeleton entry")
static constexpr int Concat[]
bool legalizeConstHwRegRead(MachineInstr &MI, MachineIRBuilder &B, AMDGPU::Hwreg::Id HwReg, unsigned LowBit, unsigned Width) const
void buildMultiply(LegalizerHelper &Helper, MutableArrayRef< Register > Accum, ArrayRef< Register > Src0, ArrayRef< Register > Src1, bool UsePartialMad64_32, bool SeparateOddAlignedProducts) const
bool legalizeGlobalValue(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFSQRTF16(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeIntrinsicTrunc(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeInsert(LegalizerHelper &Helper, MachineInstr &MI) const
std::pair< Register, unsigned > splitBufferOffsets(MachineIRBuilder &B, Register OrigOffset) const
bool legalizeBVHIntersectRayIntrinsic(MachineInstr &MI, MachineIRBuilder &B) const
bool legalizeIsAddrSpace(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B, unsigned AddrSpace) const
bool legalizeUnsignedDIV_REM(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFSQRTF32(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeCTLZ_ZERO_POISON(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeAtomicCmpXChg(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeTrapHsa(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeBufferStore(MachineInstr &MI, LegalizerHelper &Helper, bool IsTyped, bool IsFormat) const
bool legalizeMul(LegalizerHelper &Helper, MachineInstr &MI) const
bool legalizeFFREXP(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
Register getSegmentAperture(unsigned AddrSpace, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFDIV64(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizePointerAsRsrcIntrin(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
To create a buffer resource from a 64-bit pointer, mask off the upper 32 bits of the pointer and repl...
bool legalizeFlogCommon(MachineInstr &MI, MachineIRBuilder &B) const
bool getLDSKernelId(Register DstReg, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFExp2(MachineInstr &MI, MachineIRBuilder &B) const
bool legalizeTrap(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeBufferAtomic(MachineInstr &MI, MachineIRBuilder &B, Intrinsic::ID IID) const
void legalizeUnsignedDIV_REM32Impl(MachineIRBuilder &B, Register DstDivReg, Register DstRemReg, Register Num, Register Den) const
Register handleD16VData(MachineIRBuilder &B, MachineRegisterInfo &MRI, Register Reg, bool ImageStore=false) const
Handle register layout difference for f16 images for some subtargets.
bool legalizeCTLZ_CTTZ(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeBuildVector(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFFloor(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
AMDGPULegalizerInfo(const GCNSubtarget &ST, const GCNTargetMachine &TM)
bool legalizeFDIV32(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFMad(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFDIV(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeSBufferPrefetch(LegalizerHelper &Helper, MachineInstr &MI) const
bool legalizeFExp10Unsafe(MachineIRBuilder &B, Register Dst, Register Src, unsigned Flags) const
bool legalizeFExp(MachineInstr &MI, MachineIRBuilder &B) const
bool legalizeIntrinsic(LegalizerHelper &Helper, MachineInstr &MI) const override
bool legalizeFrem(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizePreloadedArgIntrin(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B, AMDGPUFunctionArgInfo::PreloadedValue ArgType) const
bool legalizeStore(LegalizerHelper &Helper, MachineInstr &MI) const
bool legalizeCustom(LegalizerHelper &Helper, MachineInstr &MI, LostDebugLocObserver &LocObserver) const override
Called for instructions with the Custom LegalizationAction.
bool buildPCRelGlobalAddress(Register DstReg, LLT PtrTy, MachineIRBuilder &B, const GlobalValue *GV, int64_t Offset, unsigned GAFlags=SIInstrInfo::MO_NONE) const
MachinePointerInfo getKernargSegmentPtrInfo(MachineFunction &MF) const
bool legalizeFDIV16(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeRsqClampIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFExpUnsafeImpl(MachineIRBuilder &B, Register Dst, Register Src, unsigned Flags, bool IsExp10) const
std::pair< Register, Register > getScaledLogInput(MachineIRBuilder &B, Register Src, unsigned Flags) const
bool legalizeFDIVFastIntrin(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool loadInputValue(Register DstReg, MachineIRBuilder &B, AMDGPUFunctionArgInfo::PreloadedValue ArgType) const
bool legalizeBVHDualOrBVH8IntersectRayIntrinsic(MachineInstr &MI, MachineIRBuilder &B) const
bool legalizeInsertVectorElt(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFExpUnsafe(MachineIRBuilder &B, Register Dst, Register Src, unsigned Flags) const
bool legalizeFEXPF64(MachineInstr &MI, MachineIRBuilder &B) const
bool legalizeAddrSpaceCast(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeExtract(LegalizerHelper &Helper, MachineInstr &MI) const
bool legalizeBufferLoad(MachineInstr &MI, LegalizerHelper &Helper, bool IsFormat, bool IsTyped) const
bool legalizeImplicitArgPtr(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeMinNumMaxNum(LegalizerHelper &Helper, MachineInstr &MI) const
void legalizeUnsignedDIV_REM64Impl(MachineIRBuilder &B, Register DstDivReg, Register DstRemReg, Register Num, Register Den) const
bool legalizeDebugTrap(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFastUnsafeFDIV(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeSinCos(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeCTLS(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeWaveID(MachineInstr &MI, MachineIRBuilder &B) const
bool legalizeFroundeven(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeLDSKernelId(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeWorkGroupId(MachineInstr &MI, MachineIRBuilder &B, AMDGPUFunctionArgInfo::PreloadedValue ClusterIdPV, AMDGPUFunctionArgInfo::PreloadedValue ClusterMaxIdPV, AMDGPUFunctionArgInfo::PreloadedValue ClusterWorkGroupIdPV) const
bool legalizeSignedDIV_REM(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeITOFP(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B, bool Signed) const
bool legalizeFPow(MachineInstr &MI, MachineIRBuilder &B) const
bool legalizeFastUnsafeFDIV64(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFPTOI(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B, bool Signed) const
bool legalizeStackSave(MachineInstr &MI, MachineIRBuilder &B) const
bool legalizeFlogUnsafe(MachineIRBuilder &B, Register Dst, Register Src, bool IsLog10, unsigned Flags) const
bool legalizeKernargMemParameter(MachineInstr &MI, MachineIRBuilder &B, uint64_t Offset, Align Alignment=Align(4)) const
Legalize a value that's loaded from kernel arguments.
bool legalizeImageIntrinsic(MachineInstr &MI, MachineIRBuilder &B, GISelChangeObserver &Observer, const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr) const
Rewrite image intrinsics to use register layouts expected by the subtarget.
void buildAbsGlobalAddress(Register DstReg, LLT PtrTy, MachineIRBuilder &B, const GlobalValue *GV, MachineRegisterInfo &MRI) const
bool legalizeGetFPEnv(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool getImplicitArgPtr(Register DstReg, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFSQRT(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
Register getKernargParameterPtr(MachineIRBuilder &B, int64_t Offset) const
bool legalizeSBufferLoad(LegalizerHelper &Helper, MachineInstr &MI) const
bool legalizeFceil(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFSQRTF64(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeExtractVectorElt(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeLoad(LegalizerHelper &Helper, MachineInstr &MI) const
Register fixStoreSourceType(MachineIRBuilder &B, Register VData, LLT MemTy, bool IsFormat) const
bool legalizeLaneOp(LegalizerHelper &Helper, MachineInstr &MI, Intrinsic::ID IID) const
bool legalizeSetFPEnv(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeWorkitemIDIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B, unsigned Dim, AMDGPUFunctionArgInfo::PreloadedValue ArgType) const
void buildLoadInputValue(Register DstReg, MachineIRBuilder &B, const ArgDescriptor *Arg, const TargetRegisterClass *ArgRC, LLT ArgTy) const
bool legalizeTrapHsaQueuePtr(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFlog2(MachineInstr &MI, MachineIRBuilder &B) const
bool legalizeTrapEndpgm(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
static std::optional< uint32_t > getLDSKernelIdMetadata(const Function &F)
void setDynLDSAlign(const Function &F, const GlobalVariable &GV)
unsigned allocateLDSGlobal(const DataLayout &DL, const GlobalVariable &GV)
bool isNoopAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const override
Returns true if a cast between SrcAS and DestAS is a noop.
const std::array< unsigned, 3 > & getDims() const
static const fltSemantics & IEEEsingle()
Definition APFloat.h:296
static const fltSemantics & IEEEdouble()
Definition APFloat.h:297
static APFloat getSmallestNormalized(const fltSemantics &Sem, bool Negative=false)
Returns the smallest (by magnitude) normalized finite number in the given semantics.
Definition APFloat.h:1217
static APFloat getLargest(const fltSemantics &Sem, bool Negative=false)
Returns the largest finite number in the given semantics.
Definition APFloat.h:1197
static APFloat getInf(const fltSemantics &Sem, bool Negative=false)
Factory for Positive and Negative Infinity.
Definition APFloat.h:1157
Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:40
size_t size() const
Get the array size.
Definition ArrayRef.h:141
@ FCMP_OEQ
0 0 0 1 True if ordered and equal
Definition InstrTypes.h:743
@ ICMP_SLT
signed less than
Definition InstrTypes.h:769
@ FCMP_OLT
0 1 0 0 True if ordered and less than
Definition InstrTypes.h:746
@ FCMP_ULE
1 1 0 1 True if unordered, less than, or equal
Definition InstrTypes.h:755
@ FCMP_OGT
0 0 1 0 True if ordered and greater than
Definition InstrTypes.h:744
@ ICMP_UGE
unsigned greater or equal
Definition InstrTypes.h:764
@ ICMP_SGT
signed greater than
Definition InstrTypes.h:767
@ FCMP_ONE
0 1 1 0 True if ordered and operands are unequal
Definition InstrTypes.h:748
@ ICMP_ULT
unsigned less than
Definition InstrTypes.h:765
@ FCMP_OLE
0 1 0 1 True if ordered and less than or equal
Definition InstrTypes.h:747
@ FCMP_ORD
0 1 1 1 True if ordered (no nans)
Definition InstrTypes.h:749
@ ICMP_NE
not equal
Definition InstrTypes.h:762
@ FCMP_UGE
1 0 1 1 True if unordered, greater than, or equal
Definition InstrTypes.h:753
ConstantFP - Floating Point Values [float, double].
Definition Constants.h:420
LLVM_ABI bool isExactlyValue(const APFloat &V) const
We don't rely on operator== working on double values, as it returns true for things that are clearly ...
This is the shared class of boolean and integer constants.
Definition Constants.h:87
int64_t getSExtValue() const
Return the constant as a 64-bit integer value after it has been sign extended as appropriate for the ...
Definition Constants.h:174
A debug info location.
Definition DebugLoc.h:124
Diagnostic information for unsupported feature in backend.
static constexpr ElementCount getFixed(ScalarTy MinVal)
Definition TypeSize.h:309
LLVMContext & getContext() const
getContext - Return a reference to the LLVMContext associated with this function.
Definition Function.cpp:354
Abstract class that contains various methods for clients to notify about changes.
virtual void changingInstr(MachineInstr &MI)=0
This instruction is about to be mutated in some way.
virtual void changedInstr(MachineInstr &MI)=0
This instruction was mutated in some way.
Simple wrapper observer that takes several observers, and calls each one for each event.
KnownBits getKnownBits(Register R)
bool hasExternalLinkage() const
Module * getParent()
Get the module that this global value is contained inside of...
LLVM_ABI const DataLayout & getDataLayout() const
Get the data layout of the module this global belongs to.
Definition Globals.cpp:141
LLVM_ABI uint64_t getGlobalSize(const DataLayout &DL) const
Get the size of this global variable in bytes.
Definition Globals.cpp:569
LLT changeElementCount(ElementCount EC) const
Return a vector or scalar with the same element type and the new element count.
constexpr unsigned getScalarSizeInBits() const
constexpr bool isScalar() const
constexpr LLT changeElementType(LLT NewEltTy) const
If this type is a vector, return a vector with the same number of elements but the new element type.
static constexpr LLT vector(ElementCount EC, unsigned ScalarSizeInBits)
Get a low-level vector of some number of elements and element width.
LLT getScalarType() const
static constexpr LLT scalar(unsigned SizeInBits)
Get a low-level scalar or aggregate "bag of bits".
constexpr uint16_t getNumElements() const
Returns the number of elements in a vector LLT.
constexpr bool isVector() const
static constexpr LLT pointer(unsigned AddressSpace, unsigned SizeInBits)
Get a low-level pointer in the given address space.
constexpr TypeSize getSizeInBits() const
Returns the total size of the type. Must only be called on sized types.
constexpr bool isPointer() const
constexpr bool isAnyScalar() const
constexpr unsigned getAddressSpace() const
static constexpr LLT fixed_vector(unsigned NumElements, unsigned ScalarSizeInBits)
Get a low-level fixed-width vector of some number of elements and element width.
LLT getElementType() const
Returns the vector's element type. Only valid for vector types.
static constexpr LLT scalarOrVector(ElementCount EC, LLT ScalarTy)
LLT changeElementSize(unsigned NewEltSize) const
If this type is a vector, return a vector with the same number of elements but the new element size.
LLVM_ABI void diagnose(const DiagnosticInfo &DI)
Report a message to the currently installed diagnostic handler.
LegalizeRuleSet & minScalar(unsigned TypeIdx, const LLT Ty)
Ensure the scalar is at least as wide as Ty.
LegalizeRuleSet & legalFor(std::initializer_list< LLT > Types)
The instruction is legal when type index 0 is any type in the given list.
LegalizeRuleSet & unsupported()
The instruction is unsupported.
LegalizeRuleSet & scalarSameSizeAs(unsigned TypeIdx, unsigned SameSizeIdx)
Change the type TypeIdx to have the same scalar size as type SameSizeIdx.
LegalizeRuleSet & fewerElementsIf(LegalityPredicate Predicate, LegalizeMutation Mutation)
Remove elements to reach the type selected by the mutation if the predicate is true.
LegalizeRuleSet & clampScalarOrElt(unsigned TypeIdx, const LLT MinTy, const LLT MaxTy)
Limit the range of scalar sizes to MinTy and MaxTy.
LegalizeRuleSet & maxScalar(unsigned TypeIdx, const LLT Ty)
Ensure the scalar is at most as wide as Ty.
LegalizeRuleSet & minScalarOrElt(unsigned TypeIdx, const LLT Ty)
Ensure the scalar or element is at least as wide as Ty.
LegalizeRuleSet & clampMaxNumElements(unsigned TypeIdx, const LLT EltTy, unsigned MaxElements)
Limit the number of elements in EltTy vectors to at most MaxElements.
LegalizeRuleSet & unsupportedFor(std::initializer_list< LLT > Types)
LegalizeRuleSet & lower()
The instruction is lowered.
LegalizeRuleSet & moreElementsIf(LegalityPredicate Predicate, LegalizeMutation Mutation)
Add more elements to reach the type selected by the mutation if the predicate is true.
LegalizeRuleSet & lowerFor(std::initializer_list< LLT > Types)
The instruction is lowered when type index 0 is any type in the given list.
LegalizeRuleSet & clampScalar(unsigned TypeIdx, const LLT MinTy, const LLT MaxTy)
Limit the range of scalar sizes to MinTy and MaxTy.
LegalizeRuleSet & custom()
Unconditionally custom lower.
LegalizeRuleSet & clampMaxNumElementsStrict(unsigned TypeIdx, const LLT EltTy, unsigned NumElts)
Express EltTy vectors strictly using vectors with NumElts elements (or scalars when NumElts equals 1)...
LegalizeRuleSet & alwaysLegal()
LegalizeRuleSet & maxScalarIf(LegalityPredicate Predicate, unsigned TypeIdx, const LLT Ty)
Conditionally limit the maximum size of the scalar.
LegalizeRuleSet & customIf(LegalityPredicate Predicate)
LegalizeRuleSet & widenScalarToNextPow2(unsigned TypeIdx, unsigned MinSize=0)
Widen the scalar to the next power of two that is at least MinSize.
LegalizeRuleSet & scalarize(unsigned TypeIdx)
LegalizeRuleSet & legalForCartesianProduct(std::initializer_list< LLT > Types)
The instruction is legal when type indexes 0 and 1 are both in the given list.
LegalizeRuleSet & minScalarIf(LegalityPredicate Predicate, unsigned TypeIdx, const LLT Ty)
Ensure the scalar is at least as wide as Ty if condition is met.
LegalizeRuleSet & legalIf(LegalityPredicate Predicate)
The instruction is legal if predicate is true.
LegalizeRuleSet & customFor(std::initializer_list< LLT > Types)
LegalizeRuleSet & widenScalarToNextMultipleOf(unsigned TypeIdx, unsigned Size)
Widen the scalar to the next multiple of Size.
LLVM_ABI LegalizeResult lowerFMinNumMaxNum(MachineInstr &MI)
LLVM_ABI void moreElementsVectorDst(MachineInstr &MI, LLT MoreTy, unsigned OpIdx)
Legalize a single operand OpIdx of the machine instruction MI as a Def by performing it with addition...
LLVM_ABI LegalizeResult lowerInsert(MachineInstr &MI)
LLVM_ABI LegalizeResult lowerExtract(MachineInstr &MI)
GISelValueTracking * getValueTracking() const
@ Legalized
Instruction has been legalized and the MachineFunction changed.
GISelChangeObserver & Observer
To keep track of changes made by the LegalizerHelper.
LLVM_ABI void bitcastDst(MachineInstr &MI, LLT CastTy, unsigned OpIdx)
Legalize a single operand OpIdx of the machine instruction MI as a def by inserting a G_BITCAST from ...
LLVM_ABI LegalizeResult lowerFMad(MachineInstr &MI)
MachineIRBuilder & MIRBuilder
Expose MIRBuilder so clients can set their own RecordInsertInstruction functions.
LLVM_ABI void widenScalarDst(MachineInstr &MI, LLT WideTy, unsigned OpIdx=0, unsigned TruncOpcode=TargetOpcode::G_TRUNC)
Legalize a single operand OpIdx of the machine instruction MI as a Def by extending the operand's typ...
LegalizeRuleSet & getActionDefinitionsBuilder(unsigned Opcode)
Get the action definition builder for the given opcode.
TypeSize getValue() const
Wrapper class representing physical registers. Should be passed by value.
Definition MCRegister.h:41
constexpr bool isPhysical() const
Return true if the specified register number is in the physical register namespace.
Definition MCRegister.h:72
LLVM_ABI void addSuccessor(MachineBasicBlock *Succ, BranchProbability Prob=BranchProbability::getUnknown())
Add Succ as a successor of this MachineBasicBlock.
LLVM_ABI MachineBasicBlock * splitAt(MachineInstr &SplitInst, bool UpdateLiveIns=true, LiveIntervals *LIS=nullptr)
Split a basic block into 2 pieces at SplitPoint.
const MachineFunction * getParent() const
Return the MachineFunction containing this basic block.
MachineInstrBundleIterator< MachineInstr > iterator
PseudoSourceValueManager & getPSVManager() const
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
MachineMemOperand * getMachineMemOperand(MachinePointerInfo PtrInfo, MachineMemOperand::Flags f, LLT MemTy, Align base_alignment, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr, SyncScope::ID SSID=SyncScope::System, AtomicOrdering Ordering=AtomicOrdering::NotAtomic, AtomicOrdering FailureOrdering=AtomicOrdering::NotAtomic)
getMachineMemOperand - Allocate a new MachineMemOperand.
DenormalMode getDenormalMode(const fltSemantics &FPType) const
Returns the denormal handling type for the default rounding mode of the function.
void push_back(MachineBasicBlock *MBB)
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
Function & getFunction()
Return the LLVM function that this machine code represents.
BasicBlockListType::iterator iterator
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do...
MachineBasicBlock * CreateMachineBasicBlock(const BasicBlock *BB=nullptr, std::optional< UniqueBBID > BBID=std::nullopt)
CreateMachineBasicBlock - Allocate a new MachineBasicBlock.
const TargetMachine & getTarget() const
getTarget - Return the target machine this machine code is compiled with
Helper class to build MachineInstr.
MachineFunction & getMF()
Getter for the function we currently build.
Register getReg(unsigned Idx) const
Get the register for the operand index.
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
const MachineInstrBuilder & addGlobalAddress(const GlobalValue *GV, int64_t Offset=0, unsigned TargetFlags=0) const
const MachineInstrBuilder & addMBB(MachineBasicBlock *MBB, unsigned TargetFlags=0) const
Representation of each machine instruction.
const MachineOperand & getOperand(unsigned i) const
A description of a memory reference used in the backend.
LocationSize getSize() const
Return the size in bytes of the memory reference.
LLT getMemoryType() const
Return the memory type of the memory reference.
@ MODereferenceable
The memory access is dereferenceable (i.e., doesn't trap).
@ MOLoad
The memory access reads data.
@ MOInvariant
The memory access always returns the same value (or traps).
LLVM_ABI Align getAlign() const
Return the minimum known alignment in bytes of the actual memory reference.
MachineOperand class - Representation of each machine instruction operand.
MachineBasicBlock * getMBB() const
LLVM_ABI void setReg(Register Reg)
Change the register this operand corresponds to.
void setMBB(MachineBasicBlock *MBB)
static MachineOperand CreateImm(int64_t Val)
Register getReg() const
getReg - Returns the register number.
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
LLVM_ABI bool hasOneNonDBGUse(Register RegNo) const
hasOneNonDBGUse - Return true if there is exactly one non-Debug use of the specified register.
LLVM_ABI MachineInstr * getVRegDef(Register Reg) const
getVRegDef - Return the machine instr that defines the specified virtual register or null if none is ...
LLVM_ABI Register createVirtualRegister(const TargetRegisterClass *RegClass, StringRef Name="")
createVirtualRegister - Create and return a new virtual register in the function with the specified r...
LLT getType(Register Reg) const
Get the low-level type of Reg or LLT{} if Reg is not a generic (target independent) virtual register.
use_instr_nodbg_iterator use_instr_nodbg_begin(Register RegNo) const
LLVM_ABI void setRegClass(Register Reg, const TargetRegisterClass *RC)
setRegClass - Set the register class of the specified virtual register.
LLVM_ABI Register createGenericVirtualRegister(LLT Ty, StringRef Name="")
Create and return a new generic virtual register with low-level type Ty.
const TargetRegisterClass * getRegClassOrNull(Register Reg) const
Return the register class of Reg, or null if Reg has not been assigned a register class yet.
const TargetRegisterInfo * getTargetRegisterInfo() const
LLVM_ABI void replaceRegWith(Register FromReg, Register ToReg)
replaceRegWith - Replace all instances of FromReg with ToReg in the machine function.
Represent a mutable reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:294
MutableArrayRef< T > drop_front(size_t N=1) const
Drop the first N elements of the array.
Definition ArrayRef.h:383
LLVM_ABI const PseudoSourceValue * getConstantPool()
Return a pseudo source value referencing the constant pool.
Wrapper class representing virtual and physical registers.
Definition Register.h:20
constexpr bool isValid() const
Definition Register.h:112
constexpr bool isVirtual() const
Return true if the specified register number is in the virtual register namespace.
Definition Register.h:79
static unsigned getMaxMUBUFImmOffset(const GCNSubtarget &ST)
This class keeps track of the SPI_SP_INPUT_ADDR config register, which tells the hardware which inter...
AMDGPU::ClusterDimsAttr getClusterDims() const
SIModeRegisterDefaults getMode() const
std::tuple< const ArgDescriptor *, const TargetRegisterClass *, LLT > getPreloadedValue(AMDGPUFunctionArgInfo::PreloadedValue Value) const
static LLVM_READONLY const TargetRegisterClass * getSGPRClassForBitWidth(unsigned BitWidth)
bool allowsMisalignedMemoryAccessesImpl(unsigned Size, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *IsFast=nullptr) const
bool shouldEmitFixup(const GlobalValue *GV) const
bool shouldUseLDSConstAddress(const GlobalValue *GV) const
bool shouldEmitPCReloc(const GlobalValue *GV) const
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
void truncate(size_type N)
Like resize, but requires that N is less than size().
void resize(size_type N)
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
int64_t getImm() const
Register getReg() const
Register getStackPointerRegisterToSaveRestore() const
If a physical register, this specifies the register that llvm.savestack/llvm.restorestack should save...
unsigned getPointerSizeInBits(unsigned AS) const
Twine - A lightweight data structure for efficiently representing the concatenation of temporary valu...
Definition Twine.h:82
A Use represents the edge between a Value definition and its users.
Definition Use.h:35
LLVM_ABI StringRef getName() const
Return a constant reference to the value's name.
Definition Value.cpp:318
self_iterator getIterator()
Definition ilist_node.h:123
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
@ CONSTANT_ADDRESS_32BIT
Address space for 32-bit constant memory.
@ BUFFER_STRIDED_POINTER
Address space for 192-bit fat buffer pointers with an additional index.
@ REGION_ADDRESS
Address space for region memory. (GDS)
@ LOCAL_ADDRESS
Address space for local memory.
@ CONSTANT_ADDRESS
Address space for constant memory (VTX2).
@ FLAT_ADDRESS
Address space for flat memory.
@ GLOBAL_ADDRESS
Address space for global memory (RAT0, VTX0).
@ BUFFER_FAT_POINTER
Address space for 160-bit buffer fat pointers.
@ PRIVATE_ADDRESS
Address space for private memory.
@ BUFFER_RESOURCE
Address space for 128-bit buffer resources.
int getMIMGOpcode(unsigned BaseOpcode, unsigned MIMGEncoding, unsigned VDataDwords, unsigned VAddrDwords)
bool isFlatGlobalAddrSpace(unsigned AS)
bool isGFX12Plus(const MCSubtargetInfo &STI)
constexpr int64_t getNullPointerValue(unsigned AS)
Get the null pointer value for the given address space.
bool isGFX11(const MCSubtargetInfo &STI)
LLVM_READNONE bool isLegalDPALU_DPPControl(const MCSubtargetInfo &ST, unsigned DC)
unsigned getAMDHSACodeObjectVersion(const Module &M)
LLVM_READNONE constexpr bool isKernel(CallingConv::ID CC)
LLVM_READNONE constexpr bool isEntryFunctionCC(CallingConv::ID CC)
LLVM_READNONE constexpr bool isCompute(CallingConv::ID CC)
TargetExtType * isNamedBarrier(const GlobalVariable &GV)
bool isGFX11Plus(const MCSubtargetInfo &STI)
LLVM_READONLY const MIMGBaseOpcodeInfo * getMIMGBaseOpcodeInfo(unsigned BaseOpcode)
std::pair< Register, unsigned > getBaseWithConstantOffset(MachineRegisterInfo &MRI, Register Reg, GISelValueTracking *ValueTracking=nullptr, bool CheckNUW=false)
Returns base register and constant offset.
const ImageDimIntrinsicInfo * getImageDimIntrinsicInfo(unsigned Intr)
unsigned ID
LLVM IR allows arbitrary numbers to be used as calling convention identifiers.
Definition CallingConv.h:24
@ MaxID
The highest possible ID. Must be some 2^k - 1.
@ AMDGPU_Gfx
Used for AMD graphics targets.
@ Fast
Attempts to make calls as fast as possible (e.g.
Definition CallingConv.h:41
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
LLVM_ABI LegalityPredicate scalarOrEltWiderThan(unsigned TypeIdx, unsigned Size)
True iff the specified type index is a scalar or a vector with an element type that's wider than the ...
LLVM_ABI LegalityPredicate isScalar(unsigned TypeIdx)
True iff the specified type index is a scalar.
LLVM_ABI LegalityPredicate isPointer(unsigned TypeIdx)
True iff the specified type index is a pointer (with any address space).
LLVM_ABI LegalityPredicate typeInSet(unsigned TypeIdx, std::initializer_list< LLT > TypesInit)
True iff the given type index is one of the specified types.
LLVM_ABI LegalityPredicate smallerThan(unsigned TypeIdx0, unsigned TypeIdx1)
True iff the first type index has a smaller total bit size than second type index.
LLVM_ABI LegalityPredicate largerThan(unsigned TypeIdx0, unsigned TypeIdx1)
True iff the first type index has a larger total bit size than second type index.
LLVM_ABI LegalityPredicate elementTypeIs(unsigned TypeIdx, LLT EltTy)
True if the type index is a vector with element type EltTy.
LLVM_ABI LegalityPredicate sameSize(unsigned TypeIdx0, unsigned TypeIdx1)
True iff the specified type indices are both the same bit size.
LLVM_ABI LegalityPredicate scalarOrEltNarrowerThan(unsigned TypeIdx, unsigned Size)
True iff the specified type index is a scalar or vector with an element type that's narrower than the...
LegalityPredicate typeIsNot(unsigned TypeIdx, LLT Type)
True iff the given type index is not the specified type.
Predicate all(Predicate P0, Predicate P1)
True iff P0 and P1 are true.
LLVM_ABI LegalityPredicate typeIs(unsigned TypeIdx, LLT TypesInit)
True iff the given type index is the specified type.
LLVM_ABI LegalityPredicate scalarNarrowerThan(unsigned TypeIdx, unsigned Size)
True iff the specified type index is a scalar that's narrower than the given size.
LLVM_ABI LegalizeMutation scalarize(unsigned TypeIdx)
Break up the vector type for the given type index into the element type.
LLVM_ABI LegalizeMutation widenScalarOrEltToNextPow2(unsigned TypeIdx, unsigned Min=0)
Widen the scalar type or vector element type for the given type index to the next power of 2.
LLVM_ABI LegalizeMutation changeTo(unsigned TypeIdx, LLT Ty)
Select this specific type for the given type index.
Invariant opcodes: All instruction sets have these as their low opcodes.
initializer< Ty > init(const Ty &Val)
constexpr double inv_pi
constexpr double ln2
constexpr double ln10
constexpr float log2ef
Definition MathExtras.h:51
constexpr double log2e
This is an optimization pass for GlobalISel generic memory operations.
LLVM_ABI Register getFunctionLiveInPhysReg(MachineFunction &MF, const TargetInstrInfo &TII, MCRegister PhysReg, const TargetRegisterClass &RC, const DebugLoc &DL, LLT RegTy=LLT())
Return a virtual register corresponding to the incoming argument register PhysReg.
Definition Utils.cpp:861
unsigned Log2_32_Ceil(uint32_t Value)
Return the ceil log base 2 of the specified value, 32 if the value is zero.
Definition MathExtras.h:344
@ Offset
Definition DWP.cpp:558
LLVM_ABI Type * getTypeForLLT(LLT Ty, LLVMContext &C)
Get the type back from LLT.
Definition Utils.cpp:1987
LLVM_ABI MachineInstr * getOpcodeDef(unsigned Opcode, Register Reg, const MachineRegisterInfo &MRI)
See if Reg is defined by a single def instruction that is Opcode.
Definition Utils.cpp:656
LLVM_ABI const ConstantFP * getConstantFPVRegVal(Register VReg, const MachineRegisterInfo &MRI)
Definition Utils.cpp:464
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
constexpr bool isInt(int64_t x)
Checks if an integer fits into the given bit width.
Definition MathExtras.h:165
@ Implicit
Not emitted register (e.g. carry, or temporary result).
@ Undef
Value of the register doesn't matter.
LLVM_ABI const llvm::fltSemantics & getFltSemanticForLLT(LLT Ty)
Get the appropriate floating point arithmetic semantic based on the bit size of the given scalar LLT.
std::function< std::pair< unsigned, LLT >(const LegalityQuery &)> LegalizeMutation
int bit_width(T Value)
Returns the number of bits needed to represent Value if Value is nonzero.
Definition bit.h:325
void * PointerTy
constexpr bool isPowerOf2_64(uint64_t Value)
Return true if the argument is a power of two > 0 (64 bit edition.)
Definition MathExtras.h:284
constexpr int popcount(T Value) noexcept
Count the number of set bits in a value.
Definition bit.h:156
uint64_t PowerOf2Ceil(uint64_t A)
Returns the power of two which is greater than or equal to the given value.
Definition MathExtras.h:385
LLVM_ABI std::optional< int64_t > getIConstantVRegSExtVal(Register VReg, const MachineRegisterInfo &MRI)
If VReg is defined by a G_CONSTANT whose value fits in int64_t, returns it.
Definition Utils.cpp:317
int countr_zero(T Val)
Count the number of 0's from the least significant bit to the most significant, stopping at the first 1.
Definition bit.h:204
constexpr bool has_single_bit(T Value) noexcept
Definition bit.h:149
std::function< bool(const LegalityQuery &)> LegalityPredicate
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
Definition MathExtras.h:279
constexpr uint64_t alignTo(uint64_t Size, Align A)
Returns a multiple of A needed to store Size bytes.
Definition Alignment.h:144
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition Casting.h:547
MutableArrayRef(T &OneElt) -> MutableArrayRef< T >
constexpr T divideCeil(U Numerator, V Denominator)
Returns the integer ceil(Numerator / Denominator).
Definition MathExtras.h:394
To bit_cast(const From &from) noexcept
Definition bit.h:90
@ Mul
Product of integers.
@ FMul
Product of floats.
@ Sub
Subtraction of integers.
@ Add
Sum of integers.
FunctionAddr VTableAddr Next
Definition InstrProf.h:141
DWARFExpression::Operation Op
ArrayRef(const T &OneElt) -> ArrayRef< T >
constexpr unsigned BitWidth
LLVM_ABI void eraseInstr(MachineInstr &MI, MachineRegisterInfo &MRI, LostDebugLocObserver *LocObserver=nullptr)
Definition Utils.cpp:1685
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:559
LLVM_ABI std::optional< ValueAndVReg > getIConstantVRegValWithLookThrough(Register VReg, const MachineRegisterInfo &MRI, bool LookThroughInstrs=true)
If VReg is defined by a statically evaluable chain of instructions rooted on a G_CONSTANT returns its...
Definition Utils.cpp:436
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Definition STLExtras.h:1946
Align commonAlignment(Align A, uint64_t Offset)
Returns the alignment that satisfies both alignments.
Definition Alignment.h:201
unsigned Log2(Align A)
Returns the log2 of the alignment.
Definition Alignment.h:197
T bit_floor(T Value)
Returns the largest integral power of two no greater than Value if Value is nonzero.
Definition bit.h:347
constexpr uint64_t NextPowerOf2(uint64_t A)
Returns the next power of two (in 64-bits) that is strictly greater than A.
Definition MathExtras.h:373
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition BitVector.h:862
#define N
static constexpr uint64_t encode(Fields... Values)
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition Alignment.h:39
constexpr uint64_t value() const
This is a hole in the type system and should not be abused.
Definition Alignment.h:77
MCRegister getRegister() const
static ArgDescriptor createRegister(Register Reg, unsigned Mask=~0u)
DenormalModeKind Input
Denormal treatment kind for floating point instruction inputs in the default floating-point environme...
@ PreserveSign
The sign of a flushed-to-zero number is preserved in the sign of 0.
@ Dynamic
Denormals have unknown treatment.
static constexpr DenormalMode getPreserveSign()
static constexpr DenormalMode getIEEE()
bool isZero() const
Returns true if value is all zero.
Definition KnownBits.h:78
The LegalityQuery object bundles together all the information that's needed to decide whether a given...
ArrayRef< MemDesc > MMODescrs
Operations which require memory can use this to place requirements on the memory type for each MMO.
ArrayRef< LLT > Types
Matching combinators.
This class contains a discriminated union of information about pointers in memory operands,...
MachinePointerInfo getWithOffset(int64_t O) const
static LLVM_ABI MachinePointerInfo getGOT(MachineFunction &MF)
Return a MachinePointerInfo record that refers to a GOT entry.
DenormalMode FP64FP16Denormals
If this is set, neither input nor output denormals are flushed for both f64 and f16/v2f16 instructions...
bool IEEE
Floating point opcodes that support exception flag gathering quiet and propagate signaling NaN inputs...
DenormalMode FP32Denormals
If this is set, neither input nor output denormals are flushed for most f32 instructions.